Пример #1
0
class TestFeatureKeyFunction:
    """Exercise ``key_function`` name remapping on both ``Faidx`` and ``Fasta``."""

    def __init__(self):
        # Both readers are built from the same file with the same
        # name-mapping callable.
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path, key_function=get_gene_name)
        self.genes = Fasta(fasta_path, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys of the remapped ``Fasta`` match the expected list."""
        expected_keys = [
            'BARD1', 'FGFR2', 'KF435149.1', 'MDM4',
            'NM_000465.3', 'NM_001282543.1', 'NM_001282545.1',
            'NM_001282548.1', 'NM_001282549.1',
            'NR_104212.1', 'NR_104215.1',
            'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
            'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
            'XR_241079.1', 'XR_241080.1', 'XR_241081.1',
        ]
        assert sorted(self.genes.keys()) == expected_keys

    def test_key_function_by_dictionary_get_key(self):
        """Slice the 'MDM4' record through dictionary-style access."""
        expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        # Python slice [99:150] is expected to match fetch('MDM4', 100, 150)
        # in the sibling test below.
        sliced = self.genes['MDM4'][99:150]
        assert str(sliced) == expected

    def test_key_function_by_fetch(self):
        """Fetch the same 'MDM4' region through ``Faidx.fetch``."""
        expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        fetched = self.faidx.fetch('MDM4', 100, 150)
        assert str(fetched) == expected

    @raises(ValueError)
    def test_duplicated_keys(self):
        """Building a ``Fasta`` with ``get_duplicated_gene_name`` must raise ValueError."""
        Fasta(self.fasta, key_function=get_duplicated_gene_name)
Пример #2
0
 def test_fetch_border_padded(self):
     """Fetch past the record end on an index built with ``default_seq='N'``."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     padded_index = Faidx('data/genes.fasta.gz', default_seq='N')
     fetched = padded_index.fetch(record_name, 480, 500)
     print(fetched)
     # Expected text is 'TC' followed only by the 'N' filler character.
     assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
Пример #3
0
 def test_key_function_by_fetch(self):
     """Fetch by short accession from an index built with ``split_char='|'``."""
     index_options = {'split_char': '|', 'duplicate_action': "drop"}
     split_index = Faidx('data/genes.fasta', **index_options)
     fetched = split_index.fetch('KF435150.1', 100, 150)
     expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
     assert str(fetched) == expected
Пример #4
0
 def test_fetch_border(self):
     """Request 480..500 with a default index; only 'TC' is expected back."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     plain_index = Faidx('data/genes.fasta')
     fetched = plain_index.fetch(record_name, 480, 500)
     assert str(fetched) == 'TC'
Пример #5
0
 def test_fetch_border_padded(self):
     """With ``default_seq='N'`` the 480..500 request is expected to be N-filled."""
     expected = 'TCNNNNNNNNNNNNNNNNNNN'
     padded_index = Faidx('data/genes.fasta.gz', default_seq='N')
     fetched = padded_index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
     print(fetched)
     assert str(fetched) == expected
Пример #6
0
class TestFeatureBoundsCheck:
    """Bounds behaviour of a ``Faidx`` built with ``default_seq='N'``."""

    def __init__(self):
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path, default_seq='N')

    def test_fetch_border_padded(self):
        """Fetch 480..500 and expect 'TC' followed by 'N' filler."""
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
Пример #7
0
class Genome(object):
    """Wrapper around a ``pyfaidx.Faidx`` index for one genome database."""

    def __init__(self, db):
        """Open ``<DATA_FOLDER>/<db>/<db>.fa`` as an indexed FASTA.

        ``DATA_FOLDER`` is read from ``app.config``.
        """
        # pyfaidx is imported at construction time rather than at module load.
        from pyfaidx import Faidx

        data_root = app.config["DATA_FOLDER"]
        fasta_name = db + ".fa"
        self.fasta = Faidx(os.path.join(data_root, db, fasta_name))

    def get_sequence(self, chr, start, end):
        """Return whatever the underlying index yields for ``chr:start-end``."""
        index = self.fasta
        return index.fetch(chr, start, end)

    def destroy(self):
        """Close the underlying FASTA index."""
        index = self.fasta
        index.close()
Пример #8
0
 def test_fetch_whole_entry(self):
     """Fetch positions 1..481 of the KF435150.1 record from the gzipped FASTA."""
     # The expected sequence is kept in short chunks purely for readability.
     expected = ''.join((
         'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
         'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
         'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
         'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
         'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
         'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
         'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
         'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
         'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
         'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
     ))
     gz_index = Faidx('data/genes.fasta.gz')
     fetched = gz_index.fetch('gi|557361099|gb|KF435150.1|', 1, 481)
     assert str(fetched) == expected
Пример #9
0
def load_seqs_and_annotations(protein_annotations_sqlite_db_file_path, fasta_file_path, shuffle = True, records_limit = None, verbose = True,
        log_progress_every = 10000):
    """Yield ``(uniprot_id, sequence, go_annotations)`` for annotated proteins.

    This is a generator: nothing is read until iteration starts.

    Parameters:
        protein_annotations_sqlite_db_file_path: SQLite file containing a
            ``protein_annotations`` table with ``uniprot_name`` and
            ``complete_go_annotation_indices`` columns.
        fasta_file_path: FASTA file opened through ``Faidx``; records are
            looked up as ``UniRef90_<prefix>``, where ``<prefix>`` is the part
            of ``uniprot_name`` before the first underscore.
        shuffle: when true, rows are shuffled with a fixed ``random_state=0``.
        records_limit: optional integer ``LIMIT`` applied to the SQL query.
        verbose: when true, progress is reported through ``log``.
        log_progress_every: progress is logged every this many rows.

    Yields:
        Tuples of the UniProt name, the full sequence as ``str``, and the
        JSON-decoded annotation indices. Rows whose sequence id is missing
        from the FASTA index are skipped and counted.
    """
    from contextlib import closing

    if verbose:
        log('Loading %s records...' %
            ('all' if records_limit is None else records_limit))

    query = 'SELECT uniprot_name, complete_go_annotation_indices FROM protein_annotations'
    if records_limit is not None:
        query += ' LIMIT %d' % records_limit

    # The query result is fully materialised into a DataFrame, so the
    # connection can be released right away instead of being leaked.
    with closing(sqlite3.connect(protein_annotations_sqlite_db_file_path)) as conn:
        raw_proteins_and_annotations = pd.read_sql_query(query, conn)

    if verbose:
        log('Loaded %d proteins and their GO annotations (%d columns: %s)' %
            (raw_proteins_and_annotations.shape +
             (', '.join(raw_proteins_and_annotations.columns), )))

    if shuffle:
        raw_proteins_and_annotations = raw_proteins_and_annotations.sample(
            frac=1, random_state=0)

    if verbose:
        log('Loading Faidx (%s)...' % fasta_file_path)

    seqs_faidx = Faidx(fasta_file_path)

    # try/finally so the FASTA handle is closed when the generator is
    # exhausted, closed early by the consumer, or aborted by an error.
    try:
        if verbose:
            log('Finished loading Faidx.')

        n_records = len(raw_proteins_and_annotations)
        n_failed = 0

        for i, (_, (uniprot_id, raw_go_annotation_indices)) in enumerate(
                raw_proteins_and_annotations.iterrows()):

            if verbose and i % log_progress_every == 0:
                log('%d/%d' % (i, n_records), end='\r')

            seq_fasta_id = 'UniRef90_%s' % uniprot_id.split('_')[0]

            # Only the index lookup / fetch is guarded; the yield stays
            # outside so a KeyError from elsewhere is not miscounted as a
            # missing sequence.
            try:
                seq = str(
                    seqs_faidx.fetch(seq_fasta_id, 1,
                                     seqs_faidx.index[seq_fasta_id].rlen))
            except KeyError:
                n_failed += 1
                continue

            yield uniprot_id, seq, json.loads(raw_go_annotation_indices)

        if verbose:
            log('Finished. Failed finding the sequence for %d of %d records.' %
                (n_failed, n_records))
    finally:
        seqs_faidx.close()
Пример #10
0
 def test_fetch_whole_entry(self):
     """Positions 1..481 of KF435150.1 (gzipped FASTA) equal the expected text."""
     sequence_chunks = [
         'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
         'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
         'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
         'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
         'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
         'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
         'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
         'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
         'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
         'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
     ]
     expected = ''.join(sequence_chunks)
     record_name = 'gi|557361099|gb|KF435150.1|'
     fetched = Faidx('data/genes.fasta.gz').fetch(record_name, 1, 481)
     assert str(fetched) == expected
 def test_fetch_whole_entry(self):
     """Request 1..482 of KF435150.1 from the plain FASTA and compare the text."""
     record_name = "gi|557361099|gb|KF435150.1|"
     plain_index = Faidx("data/genes.fasta")
     fetched = plain_index.fetch(record_name, 1, 482)
     # Expected sequence, chunked only for line length.
     expected = "".join((
         "ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA",
         "CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA",
         "AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG",
         "TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT",
         "AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG",
         "TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG",
         "AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT",
         "AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA",
         "GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA",
         "TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC",
     ))
     assert str(fetched) == expected
Пример #12
0
class TestFeatureBoundsCheck:
    """Fetch behaviour at and beyond record bounds, default and strict."""

    def __init__(self):
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path)
        # Same file, opened with strict_bounds=True for the raising test.
        self.faidx_strict = Faidx(fasta_path, strict_bounds=True)

    def test_fetch_whole_entry(self):
        """Fetch 1..482 of KF435150.1 and compare against the expected text."""
        expected = ''.join((
            'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
            'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
            'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
            'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
            'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
            'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
            'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
            'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
            'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
            'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
        ))
        fetched = self.faidx.fetch('KF435150.1', 1, 482)
        assert str(fetched) == expected

    def test_fetch_middle(self):
        """Fetch an interior window (100..150)."""
        expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
        fetched = self.faidx.fetch('KF435150.1', 100, 150)
        assert str(fetched) == expected

    def test_fetch_end(self):
        """Fetch 480..482 and expect the text 'TC'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(fetched) == 'TC'

    def test_fetch_border(self):
        """Fetch 480..500 on the default index; still only 'TC' is expected."""
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TC'

    def test_rev(self):
        """Unary minus on the 480..482 fetch is expected to render as 'GA'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(-fetched) == 'GA', fetched

    @raises(FetchError)
    def test_fetch_past_bounds(self):
        """With strict_bounds=True, fetching 480..5000 must raise FetchError."""
        self.faidx_strict.fetch('KF435150.1', 480, 5000)
Пример #13
0
 def test_key_function_by_fetch(self):
     """Fetch 'MDM4' from an index whose names are remapped by ``get_gene_name``."""
     remapped_index = Faidx('data/genes.fasta', key_function=get_gene_name)
     fetched = remapped_index.fetch('MDM4', 100, 150)
     expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
     assert str(fetched) == expected
 def test_fetch_reversed_coordinates(self):
     """Fetch with a start coordinate (50) greater than the end coordinate (10)."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     strict_index = Faidx("data/genes.fasta", strict_bounds=True)
     strict_index.fetch("gi|557361099|gb|KF435150.1|", 50, 10)
 def test_fetch_keyerror(self):
     """Request a record name that does not exist."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     missing_name = "gi|joe|gb|KF435150.1|"
     strict_index = Faidx("data/genes.fasta", strict_bounds=True)
     strict_index.fetch(missing_name, 1, 10)
Пример #16
0
    def profile(self,
                input_vcf_file,
                ref_genome_file,
                output_file,
                raw_gt_format="GTR",
                sample_id=None,
                ):
        """Count single-nucleotide variant features per sample and write a table.

        Steps:
          1. Decompose and filter the input VCF into a temporary file using
             ``gunzip``/``cat``, ``vt decompose`` and ``grep``.
          2. Extract CHROM, POS, REF, ALT and one ``SAMPLE=<genotype>`` field
             per sample with ``vcf-query``.
          3. For every single-base REF/ALT record, fetch the three reference
             bases around POS and increment the matching feature counter for
             each sample whose genotype is not ``0/0``.
          4. Write a tab-separated table (one row per feature, one column per
             sample) to ``output_file``.

        Parameters:
            input_vcf_file: path of the VCF; a ``.gz`` suffix selects gunzip.
            ref_genome_file: reference FASTA opened through ``Faidx``.
            output_file: path of the feature table to write.
            raw_gt_format: VCF format tag used as the genotype field.
            sample_id: accepted for interface compatibility; the visible code
                never used its value (it was overwritten by a loop variable).

        The commands contain pipes and redirection, so ``exec_sh`` is assumed
        to run them through a shell -- TODO confirm.
        """
        import shlex
        import shutil

        gt_pattern = re.compile(r"(?P<sample_id>.*)=(?P<raw_gt>.*)")

        # The cleaned VCF is only an intermediate; remove its directory once
        # vcf-query has consumed it instead of leaking it.
        tmp_dir = mkdtemp()
        try:
            # unzip, decompose and clean the vcf file
            clean_vcf_file = join_path(tmp_dir, "clean.vcf")
            # shlex.quote leaves ordinary paths unchanged and protects paths
            # containing spaces or shell metacharacters.
            if input_vcf_file.endswith(".gz"):
                cmd = "gunzip -c " + shlex.quote(input_vcf_file)
            else:
                cmd = "cat " + shlex.quote(input_vcf_file)
            cmd += " | vt decompose -s -"
            # "\\*" spells the same backslash-star the original produced via
            # the invalid escape "\*", without the SyntaxWarning.
            cmd += " | grep -Pv \"\t\\*\t\""
            cmd += " | grep -v \"\\x3b\""
            cmd += " | grep -v \"^M\""
            cmd += " > " + shlex.quote(clean_vcf_file)
            _, stdout_data = exec_sh(cmd, silent=True)

            # parse the clean vcf file for the required fields
            vcf_query_format = (
                "'%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%" + raw_gt_format + "]\n'"
            )
            cmd = "vcf-query"
            cmd += " -f " + vcf_query_format
            cmd += " " + shlex.quote(clean_vcf_file)
            _, stdout_data = exec_sh(cmd, silent=True)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

        # Decode once; both passes below work on the same list of records.
        variant_records = stdout_data.decode('utf-8').split("\n")

        # get the list of sample ids from the first record and prepare one
        # independent counter structure per sample
        samples_features = {}
        for gt_data in variant_records[0].strip().split("\t")[4:]:
            m = gt_pattern.match(gt_data)
            samples_features[m.group("sample_id")] = copy.deepcopy(SNV_FEATURES_TEMPLATE)

        # iterate over all vcf records and count variants for each sample
        fa = Faidx(ref_genome_file)
        try:
            for variant_record in variant_records:
                variant_items = variant_record.strip().split("\t")
                if len(variant_items) < 4:
                    continue
                chrom, pos, ref, alt = variant_items[:4]
                # Only single-base substitutions are counted.
                if len(ref) > 1 or ref == "-":
                    continue
                if len(alt) > 1 or alt == "-":
                    continue
                pos = int(pos)
                triplet = fa.fetch(chrom, pos - 1, pos + 1).seq
                feature_id = SNV_FEATURES_HASH[ref][alt][triplet]
                # iterate over all samples in the record
                for gt_data in variant_items[4:]:
                    m = gt_pattern.match(gt_data)
                    if m.group("raw_gt") == "0/0":
                        continue
                    samples_features[m.group("sample_id")][feature_id][FEATURE_QUANTITY] += 1
        finally:
            # Release the FASTA handle even if a lookup above fails.
            fa.close()

        # write output feature file
        sample_names = list(samples_features)
        with open(output_file, "w") as f_o:
            header = "\t".join([VARIANT_TYPE, VARIANT_SUBGROUP, FEATURE_ID] + sample_names)
            f_o.write(header + "\n")
            for feature_id in SNV_FEATURES_TEMPLATE:
                template = SNV_FEATURES_TEMPLATE[feature_id]
                feature_info = "{:s}\t{:s}\t{:s}".format(template[VARIANT_TYPE],
                                                          template[VARIANT_SUBGROUP],
                                                          feature_id,
                                                          )
                for name in sample_names:
                    feature_info += "\t" + str(samples_features[name][feature_id][FEATURE_QUANTITY])
                f_o.write(feature_info + "\n")

        self.info()
        self.info("Done!! The output file is at " + output_file)
 def test_fetch_past_bounds(self):
     """Request 480..5000 from an index opened with ``strict_bounds=True``."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     record_name = "gi|557361099|gb|KF435150.1|"
     strict_index = Faidx("data/genes.fasta", strict_bounds=True)
     strict_index.fetch(record_name, 480, 5000)
Пример #18
0
 def test_fetch_reversed_coordinates(self):
     """Fetch with start (50) greater than end (10) from the gzipped FASTA."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     record_name = 'gi|557361099|gb|KF435150.1|'
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch(record_name, 50, 10)
Пример #19
0
 def test_issue_74_end_faidx(self):
     """``.end`` of a fetch must not depend on ``one_based_attributes``."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     zero_based = Faidx('data/genes.fasta.gz', one_based_attributes=False)
     one_based = Faidx('data/genes.fasta.gz', one_based_attributes=True)
     end_zero_based = zero_based.fetch(record_name, 1, 90).end
     end_one_based = one_based.fetch(record_name, 1, 90).end
     assert end_zero_based == end_one_based
Пример #20
0
 def test_fetch_end(self):
     """Fetch 480..481 and expect the text 'TC'."""
     plain_index = Faidx('data/genes.fasta')
     fetched = plain_index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
     assert str(fetched) == 'TC'
Пример #21
0
 def test_rev(self):
     """Unary minus on the 480..481 fetch is expected to render as 'GA'."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     fetched = Faidx('data/genes.fasta.gz').fetch(record_name, 480, 481)
     # The fetched object is used as the assertion message for easier debugging.
     assert str(-fetched) == 'GA', fetched
Пример #22
0
 def test_fetch_middle(self):
     """Fetch an interior window (100..150) from the gzipped FASTA."""
     expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
     gz_index = Faidx('data/genes.fasta.gz')
     fetched = gz_index.fetch('gi|557361099|gb|KF435150.1|', 100, 150)
     assert str(fetched) == expected
Пример #23
0
fa = Faidx(ref_fasta)
# downloaded from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence

vcf_data = pd.read_csv(vcf_filename, sep='\t', comment='&')

# Integer chromosome / position columns derived from the raw VCF fields.
vcf_data = vcf_data.assign(
    chromosome=vcf_data['#CHROM'].astype(int),
    position=vcf_data['POS'].astype(int),
)

# Mutation identifier of the form "<chromosome>_<position>_<REF><ALT>".
mutation_ids = (
    vcf_data.chromosome.astype(str) + '_'
    + vcf_data.position.astype(str) + '_'
    + vcf_data.REF + vcf_data.ALT
)
vcf_data = vcf_data.assign(mutation_id=mutation_ids)

# Second ':'-separated field of the tumor column, split on ','; the first
# item is stored as ref_counts and the second as var_counts.
tumor_counts = vcf_data.tumor.str.split(':').str[1].str.split(',')
vcf_data = vcf_data.assign(ref_counts=tumor_counts.str[0].astype(int))
vcf_data = vcf_data.assign(var_counts=tumor_counts.str[1].astype(int))

# Reference bases from position-1 to position+1 for every row.
triplets = vcf_data.apply(
    lambda row: fa.fetch(str(row.chromosome),
                         row.position - 1, row.position + 1).seq,
    axis=1)
vcf_data = vcf_data.assign(triplet=triplets)

# get_context is applied row-wise after the triplet column exists; its
# result is stored as an ordered categorical over PAT_LIST.
patterns = pd.Categorical(vcf_data.apply(get_context, axis=1),
                          categories=PAT_LIST, ordered=True)
vcf_data = vcf_data.assign(pattern=patterns)

# Index of each row's pattern within PAT_LIST, again as an ordered categorical.
pattern_indices = vcf_data.apply(
    lambda row: PAT_LIST.index(row['pattern']), axis=1)
vcf_data = vcf_data.assign(
    trinucleotide=pd.Categorical(
        pattern_indices,
        categories=list(range(len(PAT_LIST))), ordered=True))

foldername = 'salcedo_dream_challenge/{}_{}'.format(tumor, seq_depth)
pathlib.Path(foldername).mkdir(parents=True, exist_ok=True)
vcf_data.to_csv(
    '{}/unrestricted_input_mut.csv'.format(foldername), sep='\t')
cnv_table = pd.read_csv(cnv_filename, sep='\t')
Пример #24
0
 def test_rev(self):
     """Negate the 480..481 fetch and expect the text 'GA'."""
     gz_index = Faidx('data/genes.fasta.gz')
     fetched = gz_index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
     negated_text = str(-fetched)
     assert negated_text == 'GA', fetched
 def test_fetch_end(self):
     """Fetch 480..482 from the plain FASTA and expect the text 'TC'."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     fetched = Faidx('data/genes.fasta').fetch(record_name, 480, 482)
     assert str(fetched) == 'TC'
Пример #26
0
 def test_issue_74_end_faidx(self):
     """The ``.end`` attribute is the same for both ``one_based_attributes`` settings."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     # Fetch the same 1..90 window with the flag off (index 0) and on (index 1).
     ends = [
         Faidx('data/genes.fasta.gz', one_based_attributes=flag)
         .fetch(record_name, 1, 90).end
         for flag in (False, True)
     ]
     assert ends[0] == ends[1]
 def test_fetch_border(self):
     """Fetch 480..500 on a default index; only 'TC' is expected back."""
     plain_index = Faidx('data/genes.fasta')
     fetched = plain_index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
     fetched_text = str(fetched)
     assert fetched_text == 'TC'
Пример #28
0
 def test_fetch_negative(self):
     """Fetch with a negative start coordinate (-10..10) on a strict index."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     record_name = 'gi|557361099|gb|KF435150.1|'
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch(record_name, -10, 10)
Пример #29
0
class GFF(object):
    """Pair a GFF annotation table with an indexed FASTA of the annotated sequence."""

    # Complement of each upper-case nucleotide code, including the IUPAC
    # ambiguity codes that can appear in reference FASTA files.
    _COMPLEMENT = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
        'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W',
        'K': 'M', 'M': 'K', 'B': 'V', 'V': 'B',
        'D': 'H', 'H': 'D',
    }

    def __init__(self, file, target):
        """Load the GFF table from ``file`` and open ``target`` with ``Faidx``."""
        self.file = file  # this is the .gff file
        self.target = target  # this is the file that it is an annotation of
        self.read_gff_file()
        self.faidx = Faidx(target)

    def read_gff_file(self):
        """Parse ``self.file`` into ``self.df``, one row per annotation line.

        Lines starting with '#' and blank lines are skipped. Every field is
        kept as a string.
        """
        assert self.file
        rows = []
        with open(self.file, 'r') as f:
            for line in f:
                # Blank lines would otherwise become rows of empty fields.
                if not line.strip() or line[0] == '#':
                    continue
                rows.append(line.rstrip().split('\t'))
        self.df = pd.DataFrame(
            rows,
            columns='seqid source type start end score strand phase attributes'
            .split())

    def build_genes(self):
        """Build ``self.genes`` from the gene, exon and mRNA rows of ``self.df``.

        Adds ``gene``, ``geneID`` and ``biotype`` columns parsed from the
        ``attributes`` column.
        """
        wanted = self.df['type'].isin(['gene', 'exon', 'mRNA'])
        # Copy so the new columns are added to an independent frame rather
        # than to a slice of self.df (avoids SettingWithCopyWarning).
        self.genes = self.df[wanted].copy()
        self.genes['gene'] = self.genes['attributes'].apply(gene_name_parse)
        self.genes['geneID'] = self.genes['attributes'].apply(gene_id_parse)
        self.genes['biotype'] = self.genes['attributes'].apply(biotype_parse)

    def find_genes(self, genelist):
        """
        Input: list of genes to query
        Returns : self.genes info for those genes
        """

        return self.genes[self.genes['gene'].isin(genelist)]

    def retrieve_gene_sequence(self, gene):
        """Return the sequence of ``gene`` plus a 1000-base buffer on each side.

        The first row of type 'gene' whose name matches is used; IndexError
        is raised when there is none. Sequences on the '-' strand are
        returned reverse-complemented.
        """
        is_gene_row = (self.genes['gene'] == gene) & (self.genes['type'] == 'gene')
        # Filter once and read all four fields from the same row.
        record = self.genes[is_gene_row].iloc[0]
        source = str(record['seqid'])
        strand = str(record['strand'])

        # add buffers for sequencing primers; never ask for a start below 1,
        # which happens for genes within 1000 bases of the sequence start
        start = max(1, int(record['start']) - 1000)
        end = int(record['end']) + 1000

        seq = str(self.faidx.fetch(source, start, end))
        if strand == '-':
            seq = self.reverse_comp(seq)
        return seq

    def retrieve_gene_structures(self, gene):
        """
        Input
            - gene : str
        Returns
            - fasta_dict : 
                keys are gene;transcript;segment_type;start;stop;strand
                values are sequences

        Not implemented in this class body: the method currently returns None.
        """

    def reverse_comp(self, seq):
        """
        Returns the reverse complement of a DNA sequence, in upper case.

        Raises KeyError for characters that are not nucleotide codes.
        """
        complement = self._COMPLEMENT
        return ''.join(complement[base.upper()] for base in reversed(seq))
 def test_fetch_middle(self):
     """Fetch an interior window (100..150) from the plain FASTA."""
     record_name = "gi|557361099|gb|KF435150.1|"
     plain_index = Faidx("data/genes.fasta")
     fetched_text = str(plain_index.fetch(record_name, 100, 150))
     assert fetched_text == "TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA"
Пример #31
0
 def test_fetch_middle(self):
     """Window 100..150 of KF435150.1, read from the gzipped FASTA."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     fetched = Faidx('data/genes.fasta.gz').fetch(record_name, 100, 150)
     expected = 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
     assert str(fetched) == expected
 def test_fetch_end(self):
     """Fetch 480..482 and expect the text "TC"."""
     plain_index = Faidx("data/genes.fasta")
     fetched = plain_index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
     fetched_text = str(fetched)
     assert fetched_text == "TC"
Пример #33
0
 def test_fetch_past_bounds(self):
     """Request 480..5000 from a strict-bounds index on the gzipped FASTA."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch('gi|557361099|gb|KF435150.1|', 480, 5000)
 def test_key_function_by_fetch(self):
     """Fetch by short accession from an index built with ``split_char='|'``."""
     split_index = Faidx('data/genes.fasta', split_char='|')
     fetched_text = str(split_index.fetch('KF435150.1', 100, 150))
     assert fetched_text == 'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA'
Пример #35
0
 def test_fetch_keyerror(self):
     """Request a record name that does not exist (gzipped FASTA)."""
     # No assertion is made here; presumably an expected-exception decorator
     # sits outside this snippet -- TODO confirm.
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch('gi|joe|gb|KF435150.1|', 1, 10)
 def test_rev(self):
     """Unary minus on the 480..482 fetch is expected to render as "GA"."""
     record_name = "gi|557361099|gb|KF435150.1|"
     plain_index = Faidx("data/genes.fasta")
     fetched = plain_index.fetch(record_name, 480, 482)
     assert str(-fetched) == "GA", fetched
 def test_fetch_border(self):
     """Fetch 480..500 with a default index and expect only "TC"."""
     record_name = "gi|557361099|gb|KF435150.1|"
     fetched = Faidx("data/genes.fasta").fetch(record_name, 480, 500)
     assert str(fetched) == "TC"
Пример #38
0
#! /usr/bin/env python3
"""
gff output line
26973  NC_000001.11  BestRefSeq%2CGnomon  gene  11980181  12013515     .      +     .  ID=gene-MFN2;Dbxref=GeneID:9927,HGNC:HGNC:1687...  MFN2   9927  protein_coding
"""

from pyfaidx import Faidx


# Region to extract: the coordinates of the MFN2 gene line quoted in the
# module docstring.
chromosome = 'NC_000001.11'
start, end = 11980181, 12013515

# Hard-coded location of the reference FASTA on the author's machine.
file = '/Users/jacob.cooper/resources/genomes/GRCh38_latest_genomic.fasta'

fa = Faidx(file)
region = fa.fetch(chromosome, start, end)
print(region)