class TestFeatureKeyFunction:
    """Tests for the ``key_function`` option of ``Faidx`` and ``Fasta``.

    Both indexes are built over ``data/genes.fasta`` with ``get_gene_name``
    as the key function.
    """

    def __init__(self):
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta, key_function=get_gene_name)
        self.genes = Fasta(self.fasta, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys of the ``Fasta`` object match the expected names."""
        expect = ['BARD1', 'FGFR2', 'KF435149.1', 'MDM4', 'NM_000465.3',
                  'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
                  'NM_001282549.1', 'NR_104212.1', 'NR_104215.1',
                  'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
                  'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
                  'XR_241079.1', 'XR_241080.1', 'XR_241081.1']
        result = sorted(self.genes.keys())
        assert result == expect

    def test_key_function_by_dictionary_get_key(self):
        """Slicing the ``'MDM4'`` record by key returns the expected bases."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        # Python slice [99:150] selects the same bases as fetch(100, 150).
        result = self.genes['MDM4'][100 - 1:150]
        assert str(result) == expect

    def test_key_function_by_fetch(self):
        """``Faidx.fetch`` resolves the ``'MDM4'`` key to the same bases."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        result = self.faidx.fetch('MDM4', 100, 150)
        assert str(result) == expect

    @raises(ValueError)
    def test_duplicated_keys(self):
        """Building a ``Fasta`` with ``get_duplicated_gene_name`` raises."""
        # The constructor call itself is the behavior under test, so the
        # returned object is intentionally not bound to a name.
        Fasta(self.fasta, key_function=get_duplicated_gene_name)
def test_fetch_border_padded(self):
    """Fetch past the end of a gene entry with ``default_seq='N'``.

    The expected string is the two trailing bases followed by ``N`` padding.
    """
    padded_index = Faidx('data/genes.fasta.gz', default_seq='N')
    fetched = padded_index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
    print(fetched)
    assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
def test_key_function_by_fetch(self):
    """Fetch by the ``'KF435150.1'`` key from an index built with
    ``split_char='|'`` and ``duplicate_action="drop"``."""
    split_index = Faidx('data/genes.fasta', split_char='|',
                        duplicate_action="drop")
    fetched = split_index.fetch('KF435150.1', 100, 150)
    expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
    assert str(fetched) == expected
def test_fetch_border(self):
    """Fetch past the end of a gene entry.

    With default options the request 480-500 is expected to yield only
    the bases ``'TC'``.
    """
    index = Faidx('data/genes.fasta')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
    assert str(fetched) == 'TC'
def test_fetch_border_padded(self):
    """Fetch past the end of a gene entry with ``default_seq='N'``.

    The expected string is the two trailing bases followed by ``N`` padding.
    """
    padded_index = Faidx('data/genes.fasta.gz', default_seq='N')
    fetched = padded_index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
    print(fetched)
    assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
class TestFeatureBoundsCheck:
    """Bounds behavior of a ``Faidx`` built with ``default_seq='N'``."""

    def __init__(self):
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path, default_seq='N')

    def test_fetch_border_padded(self):
        """Fetch past the end of a gene entry and expect ``N`` padding."""
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
class Genome(object):
    """Thin wrapper around a ``Faidx`` index for one genome database.

    The FASTA path is ``<DATA_FOLDER>/<db>/<db>.fa``, where ``DATA_FOLDER``
    is read from ``app.config``.
    """

    def __init__(self, db):
        """Open the FASTA index for the database named ``db``."""
        from pyfaidx import Faidx

        data_folder = app.config["DATA_FOLDER"]
        fasta_path = os.path.join(data_folder, db, db + ".fa")
        self.fasta = Faidx(fasta_path)

    def get_sequence(self, chr, start, end):
        """Return ``Faidx.fetch(chr, start, end)`` for the wrapped index."""
        sequence = self.fasta.fetch(chr, start, end)
        return sequence

    def destroy(self):
        """Close the wrapped ``Faidx`` index."""
        self.fasta.close()
def test_fetch_whole_entry(self):
    """Fetch positions 1-481 of the KF435150.1 entry from the gzipped
    FASTA and compare against the full expected sequence."""
    expected = ('ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA'
                'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA'
                'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG'
                'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT'
                'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG'
                'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG'
                'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT'
                'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA'
                'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA'
                'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC')
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 1, 481)
    assert str(fetched) == expected
def load_seqs_and_annotations(protein_annotations_sqlite_db_file_path,
                              fasta_file_path, shuffle=True,
                              records_limit=None, verbose=True,
                              log_progress_every=10000):
    """Yield ``(uniprot_id, sequence, go_annotation_indices)`` per protein.

    Rows are read from the ``protein_annotations`` table of the given SQLite
    database. For each row the sequence is looked up in the FASTA file under
    the id ``'UniRef90_' + uniprot_name.split('_')[0]``.

    Args:
        protein_annotations_sqlite_db_file_path: path of the SQLite database.
        fasta_file_path: path of the FASTA file opened with ``Faidx``.
        shuffle: when true, rows are shuffled with ``random_state=0``.
        records_limit: optional ``LIMIT`` for the SQL query; ``None`` reads
            every row.
        verbose: when true, progress is reported through ``log``.
        log_progress_every: progress is logged every this many rows.

    Yields:
        Tuples ``(uniprot_id, seq, annotations)``, where ``seq`` is the
        whole FASTA record as ``str`` and ``annotations`` is the
        JSON-decoded ``complete_go_annotation_indices`` value. Rows whose
        FASTA id raises ``KeyError`` are skipped and counted as failures.
    """
    if verbose:
        log('Loading %s records...' %
            ('all' if records_limit is None else records_limit))

    query = ('SELECT uniprot_name, complete_go_annotation_indices '
             'FROM protein_annotations')
    if records_limit is not None:
        query += ' LIMIT %d' % records_limit

    # The table is read fully into memory, so the connection can (and
    # should) be released right away instead of leaking for the lifetime of
    # the generator.
    conn = sqlite3.connect(protein_annotations_sqlite_db_file_path)
    try:
        raw_proteins_and_annotations = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    if verbose:
        log('Loaded %d proteins and their GO annotations (%d columns: %s)' %
            (raw_proteins_and_annotations.shape +
             (', '.join(raw_proteins_and_annotations.columns), )))

    if shuffle:
        raw_proteins_and_annotations = raw_proteins_and_annotations.sample(
            frac=1, random_state=0)

    if verbose:
        log('Loading Faidx (%s)...' % fasta_file_path)

    seqs_faidx = Faidx(fasta_file_path)

    if verbose:
        log('Finished loading Faidx.')

    n_records = len(raw_proteins_and_annotations)
    n_failed = 0

    try:
        for i, (_, (uniprot_id, raw_go_annotation_indices)) in enumerate(
                raw_proteins_and_annotations.iterrows()):
            if verbose and i % log_progress_every == 0:
                log('%d/%d' % (i, n_records), end='\r')

            seq_fasta_id = 'UniRef90_%s' % uniprot_id.split('_')[0]

            try:
                seq = str(seqs_faidx.fetch(
                    seq_fasta_id, 1, seqs_faidx.index[seq_fasta_id].rlen))
            except KeyError:
                n_failed += 1
            else:
                # Yield outside the ``try`` so that a KeyError thrown into
                # the generator by the consumer is not mistaken for a
                # missing sequence.
                yield uniprot_id, seq, json.loads(raw_go_annotation_indices)
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the FASTA file handle is always released.
        seqs_faidx.close()

    if verbose:
        log('Finished. Failed finding the sequence for %d of %d records.' %
            (n_failed, n_records))
def test_fetch_whole_entry(self):
    """Fetch positions 1-481 of the KF435150.1 entry from the gzipped
    FASTA and compare against the full expected sequence."""
    expected = ('ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA'
                'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA'
                'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG'
                'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT'
                'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG'
                'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG'
                'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT'
                'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA'
                'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA'
                'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC')
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 1, 481)
    assert str(fetched) == expected
def test_fetch_whole_entry(self):
    """Fetch positions 1-482 of the KF435150.1 entry from the plain FASTA
    and compare against the full expected sequence."""
    expected = (
        "ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA"
        "CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA"
        "AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG"
        "TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT"
        "AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG"
        "TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG"
        "AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT"
        "AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA"
        "GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA"
        "TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC"
    )
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 1, 482)
    assert str(fetched) == expected
class TestFeatureBoundsCheck:
    """Coordinate-bounds tests for ``Faidx.fetch`` on ``data/genes.fasta``.

    Two indexes are used: one with default options and one built with
    ``strict_bounds=True``.
    """

    def __init__(self):
        self.fasta = os.path.join(path, 'data/genes.fasta')
        self.faidx = Faidx(self.fasta)
        self.faidx_strict = Faidx(self.fasta, strict_bounds=True)

    def test_fetch_whole_entry(self):
        """Fetch positions 1-482 and compare with the full sequence."""
        expect = ('ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA'
                  'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA'
                  'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG'
                  'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT'
                  'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG'
                  'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG'
                  'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT'
                  'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA'
                  'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA'
                  'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC')
        result = self.faidx.fetch('KF435150.1', 1, 482)
        assert str(result) == expect

    def test_fetch_middle(self):
        """Fetch an interior range (100-150)."""
        expect = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        result = self.faidx.fetch('KF435150.1', 100, 150)
        assert str(result) == expect

    def test_fetch_end(self):
        """Fetch the last bases of the entry (480-482)."""
        expect = 'TC'
        result = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(result) == expect

    def test_fetch_border(self):
        """ Fetch past the end of a gene entry """
        expect = 'TC'
        result = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(result) == expect

    def test_rev(self):
        """Negating the fetched ``'TC'`` is expected to give ``'GA'``."""
        expect = 'GA'
        result = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(-result) == expect, result

    @raises(FetchError)
    def test_fetch_past_bounds(self):
        """ Fetch past the end of a gene entry with strict bounds """
        # Only the raised FetchError matters here; the previously assigned
        # ``expect`` and ``result`` locals were never read.
        self.faidx_strict.fetch('KF435150.1', 480, 5000)
def test_key_function_by_fetch(self):
    """Fetch by the ``'MDM4'`` key from an index built with
    ``key_function=get_gene_name``."""
    keyed_index = Faidx('data/genes.fasta', key_function=get_gene_name)
    fetched = keyed_index.fetch('MDM4', 100, 150)
    expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
    assert str(fetched) == expected
def test_fetch_reversed_coordinates(self):
    """ Fetch with a start coordinate greater than the end coordinate

    The index uses ``strict_bounds=True``. No assertion is made in the body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx("data/genes.fasta", strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch("gi|557361099|gb|KF435150.1|", 50, 10)
def test_fetch_keyerror(self):
    """ Fetch a key that does not exist

    The index uses ``strict_bounds=True``. No assertion is made in the body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx("data/genes.fasta", strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch("gi|joe|gb|KF435150.1|", 1, 10)
def profile(self,
            input_vcf_file,
            ref_genome_file,
            output_file,
            raw_gt_format="GTR",
            sample_id=None,
            ):
    """Count single-nucleotide variant features per sample and write a table.

    The input VCF is decompressed if its name ends in ``.gz``, decomposed
    with ``vt decompose``, filtered with ``grep`` and queried with
    ``vcf-query``. For every record whose REF and ALT are single characters
    other than ``-``, the three reference bases around the position are
    fetched from ``ref_genome_file`` and used, with REF and ALT, to look up
    a feature id in ``SNV_FEATURES_HASH``. The count for that feature is
    incremented for every sample whose genotype string is not ``"0/0"``.

    Args:
        input_vcf_file: path of the input VCF (optionally gzipped).
        ref_genome_file: path of the reference FASTA opened with ``Faidx``.
        output_file: path of the tab-separated output file.
        raw_gt_format: ``vcf-query`` tag printed after ``%SAMPLE=``.
        sample_id: accepted for interface compatibility; this method never
            reads it.

    NOTE(review): file paths are interpolated into shell command strings
    without quoting, so paths containing spaces or shell metacharacters
    will break or alter the command. Confirm callers only pass trusted,
    simple paths.
    """
    # unzip, decompose and clean the vcf file
    clean_vcf_file = join_path(mkdtemp(), "clean.vcf")
    if input_vcf_file.endswith(".gz"):
        cmd = "gunzip -c " + input_vcf_file
    else:
        cmd = "cat " + input_vcf_file
    cmd += " | vt decompose -s -"
    # "\\*" spells out the backslash that the former invalid escape "\*"
    # produced implicitly; the resulting command text is unchanged.
    cmd += ' | grep -Pv "\t\\*\t"'
    cmd += ' | grep -v "\\x3b"'
    cmd += ' | grep -v "^M"'
    cmd += " > " + clean_vcf_file
    exec_sh(cmd, silent=True)

    # parse the clean vcf file for the required fields
    vcf_query_format = ("'%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%"
                        + raw_gt_format
                        + "]\n'")
    cmd = "vcf-query -f " + vcf_query_format + " " + clean_vcf_file
    _, stdout_data = exec_sh(cmd, silent=True)

    # Decode and split once; the same records drive both passes below.
    variant_records = stdout_data.decode('utf-8').split("\n")
    gt_pattern = re.compile(r"(?P<sample_id>.*)=(?P<raw_gt>.*)")

    # get the list of sample ids from the first record and prepare counters
    samples_features = {}
    for gt_data in variant_records[0].strip().split("\t")[4:]:
        sample_name = gt_pattern.match(gt_data).group("sample_id")
        samples_features[sample_name] = copy.deepcopy(SNV_FEATURES_TEMPLATE)

    # iterate over all vcf records and count variants for each sample
    fa = Faidx(ref_genome_file)
    try:
        for variant_record in variant_records:
            variant_items = variant_record.strip().split("\t")
            if len(variant_items) < 4:
                continue
            chrom, pos, ref, alt = variant_items[:4]
            # keep only single-character, non-placeholder REF and ALT
            if len(ref) > 1 or ref == "-":
                continue
            if len(alt) > 1 or alt == "-":
                continue
            pos = int(pos)
            triplet = fa.fetch(chrom, pos - 1, pos + 1).seq
            feature_id = SNV_FEATURES_HASH[ref][alt][triplet]
            # iterate over all samples in the record
            for gt_data in variant_items[4:]:
                m = gt_pattern.match(gt_data)
                if m.group("raw_gt") == "0/0":
                    continue
                sample_features = samples_features[m.group("sample_id")]
                sample_features[feature_id][FEATURE_QUANTITY] += 1
    finally:
        # release the FASTA handle even if a lookup above fails
        fa.close()

    # write output feature file
    with open(output_file, "w") as f_o:
        header_fields = [VARIANT_TYPE, VARIANT_SUBGROUP, FEATURE_ID]
        header_fields.extend(samples_features)
        f_o.write("\t".join(header_fields) + "\n")
        for feature_id in SNV_FEATURES_TEMPLATE:
            template = SNV_FEATURES_TEMPLATE[feature_id]
            row = ["{:s}\t{:s}\t{:s}".format(template[VARIANT_TYPE],
                                             template[VARIANT_SUBGROUP],
                                             feature_id,
                                             )]
            row.extend(str(features[feature_id][FEATURE_QUANTITY])
                       for features in samples_features.values())
            f_o.write("\t".join(row) + "\n")

    self.info()
    self.info("Done!! The output file is at " + output_file)
def test_fetch_past_bounds(self):
    """ Fetch past the end of a gene entry with strict bounds

    No assertion is made in the body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx("data/genes.fasta", strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch("gi|557361099|gb|KF435150.1|", 480, 5000)
def test_fetch_reversed_coordinates(self):
    """ Fetch with a start coordinate greater than the end coordinate

    The index is built over the gzipped FASTA with ``strict_bounds=True``.
    No assertion is made in the body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx('data/genes.fasta.gz', strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch('gi|557361099|gb|KF435150.1|', 50, 10)
def test_issue_74_end_faidx(self):
    """The ``end`` attribute of a fetched range must be the same whether
    the index was built with ``one_based_attributes`` False or True."""
    name = 'gi|557361099|gb|KF435150.1|'
    zero_based = Faidx('data/genes.fasta.gz', one_based_attributes=False)
    one_based = Faidx('data/genes.fasta.gz', one_based_attributes=True)
    zero_based_end = zero_based.fetch(name, 1, 90).end
    one_based_end = one_based.fetch(name, 1, 90).end
    assert zero_based_end == one_based_end
def test_fetch_end(self):
    """Fetch positions 480-481 and expect the bases ``'TC'``."""
    index = Faidx('data/genes.fasta')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
    assert str(fetched) == 'TC'
def test_rev(self):
    """Negating the range fetched at 480-481 is expected to give ``'GA'``."""
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
    # The fetched object doubles as the failure message.
    assert str(-fetched) == 'GA', fetched
def test_fetch_middle(self):
    """Fetch an interior range (100-150) from the gzipped FASTA."""
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 100, 150)
    expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
    assert str(fetched) == expected
# Reference FASTA downloaded from
# ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence
fa = Faidx(ref_fasta)

# Load the VCF as a tab-separated table and derive integer coordinates.
vcf_data = pd.read_csv(vcf_filename, sep='\t', comment='&')
vcf_data = vcf_data.assign(chromosome=vcf_data['#CHROM'].astype(int))
vcf_data = vcf_data.assign(position=vcf_data['POS'].astype(int))

# mutation_id has the form <chromosome>_<position>_<REF><ALT>.
mutation_ids = (vcf_data.chromosome.astype(str) + '_'
                + vcf_data.position.astype(str) + '_'
                + vcf_data.REF + vcf_data.ALT)
vcf_data = vcf_data.assign(mutation_id=mutation_ids)

# The second ':'-separated field of the ``tumor`` column holds two
# comma-separated integers: reference count first, variant count second.
tumor_counts = vcf_data.tumor.str.split(':').str[1].str.split(',')
vcf_data = vcf_data.assign(ref_counts=tumor_counts.str[0].astype(int))
vcf_data = vcf_data.assign(var_counts=tumor_counts.str[1].astype(int))

# Three reference bases centred on each position.
triplets = vcf_data.apply(
    lambda row: fa.fetch(str(row.chromosome),
                         row.position - 1,
                         row.position + 1).seq,
    axis=1)
vcf_data = vcf_data.assign(triplet=triplets)

# Ordered categorical of the context pattern, then its index in PAT_LIST.
patterns = pd.Categorical(vcf_data.apply(get_context, axis=1),
                          categories=PAT_LIST, ordered=True)
vcf_data = vcf_data.assign(pattern=patterns)

pattern_indices = vcf_data.apply(
    lambda row: PAT_LIST.index(row['pattern']), axis=1)
vcf_data = vcf_data.assign(
    trinucleotide=pd.Categorical(pattern_indices,
                                 categories=list(range(len(PAT_LIST))),
                                 ordered=True))

# Write the annotated table into a per-tumor / per-depth folder.
foldername = 'salcedo_dream_challenge/{}_{}'.format(tumor, seq_depth)
pathlib.Path(foldername).mkdir(parents=True, exist_ok=True)
vcf_data.to_csv('{}/unrestricted_input_mut.csv'.format(foldername),
                sep='\t')

cnv_table = pd.read_csv(cnv_filename, sep='\t')
def test_rev(self):
    """Negating the range fetched at 480-481 is expected to give ``'GA'``."""
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
    # The fetched object doubles as the failure message.
    assert str(-fetched) == 'GA', fetched
def test_fetch_end(self):
    """Fetch positions 480-482 and expect the bases ``'TC'``."""
    index = Faidx('data/genes.fasta')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 482)
    assert str(fetched) == 'TC'
def test_issue_74_end_faidx(self):
    """The ``end`` attribute of a fetched range must be the same whether
    the index was built with ``one_based_attributes`` False or True."""
    name = 'gi|557361099|gb|KF435150.1|'
    zero_based = Faidx('data/genes.fasta.gz', one_based_attributes=False)
    one_based = Faidx('data/genes.fasta.gz', one_based_attributes=True)
    zero_based_end = zero_based.fetch(name, 1, 90).end
    one_based_end = one_based.fetch(name, 1, 90).end
    assert zero_based_end == one_based_end
def test_fetch_border(self):
    """Fetch past the end of a gene entry.

    With default options the request 480-500 is expected to yield only
    the bases ``'TC'``.
    """
    index = Faidx('data/genes.fasta')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
    assert str(fetched) == 'TC'
def test_fetch_negative(self):
    """ Fetch starting with a negative coordinate

    The index is built over the gzipped FASTA with ``strict_bounds=True``.
    No assertion is made in the body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx('data/genes.fasta.gz', strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch('gi|557361099|gb|KF435150.1|', -10, 10)
class GFF(object):
    """A GFF annotation file paired with the FASTA file it annotates.

    The GFF rows are loaded into ``self.df`` on construction; sequences are
    retrieved from the target FASTA through ``self.faidx``.
    """

    def __init__(self, file, target):
        self.file = file  # this is the .gff file
        self.target = target  # this is the file that it is an annotation of
        self.read_gff_file()
        self.faidx = Faidx(target)

    def read_gff_file(self):
        """Load every line not starting with ``#`` into ``self.df``.

        Each line is stripped of trailing whitespace and split on tabs into
        the nine columns named below. All values are kept as strings.
        """
        assert self.file
        with open(self.file, 'r') as f:
            rows = [line.rstrip().split('\t') for line in f if line[0] != '#']
        self.df = pd.DataFrame(
            rows,
            columns='seqid source type start end score strand phase attributes'
            .split())

    def build_genes(self):
        """
        Build ``self.genes`` from the rows whose type is gene, exon or mRNA,
        adding ``gene``, ``geneID`` and ``biotype`` columns parsed from the
        ``attributes`` column.
        """
        # Copy the selection so the new columns are written to an
        # independent frame rather than to a slice of ``self.df``.
        self.genes = self.df[self.df['type'].isin(['gene', 'exon',
                                                   'mRNA'])].copy()
        self.genes['gene'] = self.genes['attributes'].apply(gene_name_parse)
        self.genes['geneID'] = self.genes['attributes'].apply(gene_id_parse)
        self.genes['biotype'] = self.genes['attributes'].apply(biotype_parse)

    def find_genes(self, genelist):
        """
        Input: list of genes to query
        Returns : self.genes info for those genes
        """
        return self.genes[self.genes['gene'].isin(genelist)]

    def retrieve_gene_sequence(self, gene):
        """Return the sequence of ``gene`` with 1000 flanking bases per side.

        The first row of ``self.genes`` with the given gene name and type
        ``'gene'`` supplies the coordinates. The result is reverse
        complemented when that row's strand is ``'-'``.

        Raises:
            IndexError: if no matching ``'gene'`` row exists.
        """
        is_gene_row = ((self.genes['gene'] == gene)
                       & (self.genes['type'] == 'gene'))
        record = self.genes[is_gene_row].iloc[0]
        start = int(record['start'])
        end = int(record['end'])
        source = str(record['seqid'])
        strand = str(record['strand'])
        # add buffers for sequencing primers; genes closer than 1000 bases
        # to the start of the sequence would otherwise produce a start
        # coordinate below 1, so clamp it to the first base.
        start = max(1, start - 1000)
        end += 1000
        seq = str(self.faidx.fetch(source, start, end))
        if strand == '-':
            seq = self.reverse_comp(seq)
        return seq

    def retrieve_gene_structures(self, gene):
        """
        Input - gene : str
        Returns - fasta_dict : keys are gene;transcript;segment_type;start;stop;strand
        values are sequences

        NOTE(review): no implementation is present; the method currently
        returns None.
        """

    def reverse_comp(self, seq):
        """
        Returns the reverse complement of a DNA sequence (upper-cased).

        Only A, C, G, T and N are supported, in either case; any other
        character raises KeyError.
        """
        complement = {'A': 'T', 'G': 'C', 'C': 'G', 'T': 'A', 'N': 'N'}
        return ''.join(complement[base.upper()] for base in reversed(seq))
def test_fetch_middle(self):
    """Fetch an interior range (100-150) from the plain FASTA."""
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 100, 150)
    expected = "TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA"
    assert str(fetched) == expected
def test_fetch_middle(self):
    """Fetch an interior range (100-150) from the gzipped FASTA."""
    index = Faidx('data/genes.fasta.gz')
    fetched = index.fetch('gi|557361099|gb|KF435150.1|', 100, 150)
    expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
    assert str(fetched) == expected
def test_fetch_end(self):
    """Fetch positions 480-482 and expect the bases ``"TC"``."""
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
    assert str(fetched) == "TC"
def test_fetch_past_bounds(self):
    """ Fetch past the end of a gene entry with strict bounds

    The index is built over the gzipped FASTA. No assertion is made in the
    body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx('data/genes.fasta.gz', strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch('gi|557361099|gb|KF435150.1|', 480, 5000)
def test_key_function_by_fetch(self):
    """Fetch by the ``'KF435150.1'`` key from an index built with
    ``split_char='|'``."""
    split_index = Faidx('data/genes.fasta', split_char='|')
    fetched = split_index.fetch('KF435150.1', 100, 150)
    expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
    assert str(fetched) == expected
def test_fetch_keyerror(self):
    """ Fetch a key that does not exist

    The index is built over the gzipped FASTA with ``strict_bounds=True``.
    No assertion is made in the body.
    NOTE(review): the expected error is presumably declared by a decorator
    that is outside this view -- confirm.
    """
    faidx = Faidx('data/genes.fasta.gz', strict_bounds=True)
    # Only the call matters; its return value was never used.
    faidx.fetch('gi|joe|gb|KF435150.1|', 1, 10)
def test_rev(self):
    """Negating the range fetched at 480-482 is expected to give ``"GA"``."""
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
    # The fetched object doubles as the failure message.
    assert str(-fetched) == "GA", fetched
def test_fetch_border(self):
    """Fetch past the end of a gene entry.

    With default options the request 480-500 is expected to yield only
    the bases ``"TC"``.
    """
    index = Faidx("data/genes.fasta")
    fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 500)
    assert str(fetched) == "TC"
#! /usr/bin/env python3
"""
Print the reference sequence for one region of the GRCh38 FASTA.

The coordinates below come from gff output line 26973:

NC_000001.11 BestRefSeq%2CGnomon gene 11980181 12013515 . + . ID=gene-MFN2;Dbxref=GeneID:9927,HGNC:HGNC:1687... MFN2 9927 protein_coding
"""

from pyfaidx import Faidx

# Location of the genome FASTA on the author's machine.
file = '/Users/jacob.cooper/resources/genomes/GRCh38_latest_genomic.fasta'

# Region taken from the GFF record quoted in the module docstring.
chromosome = 'NC_000001.11'
start = 11980181
end = 12013515

fa = Faidx(file)
region = fa.fetch(chromosome, start, end)
print(region)