Пример #1
0
 def __init__(self, ref_genome="at_tair10"):
     """Set up chromosome names and lengths for a reference genome.

     Args:
         ref_genome: Either the literal ``"at_tair10"`` (built-in constants
             are used) or a path to an existing FASTA file, which is read
             through ``pyfaidx.Faidx`` to obtain chromosome names and lengths.

     Raises:
         ValueError: If ``ref_genome`` is neither ``"at_tair10"`` nor an
             existing path.
     """
     if ref_genome == "at_tair10":
         self.chrs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5']
         self.def_color = [
             "#1f78b4", "#33a02c", "#1f78b4", "#33a02c", "#1f78b4"
         ]
         self.real_chrlen = [
             34964571, 22037565, 25499034, 20862711, 31270811
         ]
         self.golden_chrlen = [
             30427671, 19698289, 23459830, 18585056, 26975502
         ]
         self.centro_start = [
             14364752, 3602775, 12674550, 2919690, 11668616
         ]
         self.centro_end = [15750321, 3735247, 13674767, 4011692, 12082583]
         # Midpoint between centro_start and centro_end for each chromosome.
         self.centro_mid = np.add(self.centro_start, self.centro_end) / 2
         # Misspelled name kept so existing readers of `cetro_mid` still work.
         self.cetro_mid = self.centro_mid
     elif os.path.exists(ref_genome):
         # A FASTA file was given: take names and lengths from its index.
         from pyfaidx import Faidx
         genome = Faidx(ref_genome).index
         # sorted() works on both Python 2 lists and Python 3 dict views;
         # wrapping a dict view in np.array() yields a 0-d object array that
         # np.sort() cannot handle.
         self.chrs = sorted(genome.keys())
         self.real_chrlen = [genome[ef].rlen for ef in self.chrs]
         self.golden_chrlen = self.real_chrlen
     else:
         raise ValueError(
             "ref_genome must be 'at_tair10' or a path to an existing "
             "FASTA file, got %r" % (ref_genome,))
     # Cumulative start offsets of each chromosome, with a leading 0.
     self.chr_inds = np.append(0, np.cumsum(self.golden_chrlen))
Пример #2
0
 def test_fetch_border(self):
     """Fetch a range that runs past the end of a gene entry."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     index = Faidx('data/genes.fasta')
     fetched = index.fetch(record_name, 480, 500)
     assert str(fetched) == 'TC'
Пример #3
0
 def test_fetch_border_padded(self):
     """Fetch past the end of a gene entry with default_seq='N' set."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     index = Faidx('data/genes.fasta.gz', default_seq='N')
     fetched = index.fetch(record_name, 480, 500)
     print(fetched)
     assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
Пример #4
0
 def test_build_issue_111(self):
     """Compare the index built with a key_function against expected text.

     The key function keeps only the part of each name before the first
     '.', so the expected names carry no version suffix.
     """
     # Field separators are written as explicit \t escapes so editors that
     # convert tabs to spaces cannot silently corrupt the expected index.
     expect_index = ("gi|563317589|dbj|AB821309\t3510\t114\t70\t71\n"
                     "gi|557361099|gb|KF435150\t481\t3789\t70\t71\n"
                     "gi|557361097|gb|KF435149\t642\t4368\t70\t71\n"
                     "gi|543583796|ref|NR_104216\t4573\t5141\t70\t71\n"
                     "gi|543583795|ref|NR_104215\t5317\t9901\t70\t71\n"
                     "gi|543583794|ref|NR_104212\t5374\t15415\t70\t71\n"
                     "gi|543583788|ref|NM_001282545\t4170\t20980\t70\t71\n"
                     "gi|543583786|ref|NM_001282543\t5466\t25324\t70\t71\n"
                     "gi|543583785|ref|NM_000465\t5523\t30980\t70\t71\n"
                     "gi|543583740|ref|NM_001282549\t3984\t36696\t70\t71\n"
                     "gi|543583738|ref|NM_001282548\t4113\t40851\t70\t71\n"
                     "gi|530384540|ref|XM_005249645\t2752\t45151\t70\t71\n"
                     "gi|530384538|ref|XM_005249644\t3004\t48071\t70\t71\n"
                     "gi|530384536|ref|XM_005249643\t3109\t51246\t70\t71\n"
                     "gi|530384534|ref|XM_005249642\t3097\t54528\t70\t71\n"
                     "gi|530373237|ref|XM_005265508\t2794\t57830\t70\t71\n"
                     "gi|530373235|ref|XM_005265507\t2848\t60824\t70\t71\n"
                     "gi|530364726|ref|XR_241081\t1009\t63849\t70\t71\n"
                     "gi|530364725|ref|XR_241080\t4884\t65009\t70\t71\n"
                     "gi|530364724|ref|XR_241079\t2819\t70099\t70\t71\n")
     index = Faidx('data/genes.fasta',
                   read_long_names=True,
                   key_function=lambda x: x.split('.')[0])
     result_index = ''.join(index._index_as_string())
     assert result_index == expect_index
Пример #5
0
    def setUpClass(cls):
        """Create a temp directory holding an indexed FASTA and an input VCF.

        The paths are also registered on ``get_options`` under 'tempdir',
        'reference', 'vcf' and 'out'.
        """
        workdir = tempfile.mkdtemp()
        cls.dir = workdir

        # create a single-record fasta file
        cls.fa = os.path.join(workdir, 'genome.fa')
        with open(cls.fa, mode='wt') as fasta_handle:
            fasta_handle.write('>chr1\n'
                               'ACTGATGCTAGCTAGTATCTGACTCAGTAGCTCGAT\n')

        # index the fasta file
        Faidx(cls.fa).close()

        # set the final args that depend on the temp directory
        get_options.set_attr('tempdir', workdir)
        get_options.set_attr('reference', cls.fa)

        vcf_out_path = os.path.join(workdir, 'out.vcf.gz')
        vcf_in_path = os.path.join(workdir, 'in.vcf.gz')
        get_options.set_attr('vcf', vcf_in_path)
        get_options.set_attr('out', vcf_out_path)

        # write a VCF to be converted. This includes one variant which cannot be
        # converted. TODO: make a unit test to check for expected log output for
        # unconvertible variant
        vcf_lines = (
            '##fileformat=VCFv4.1\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
            '1\t10\t.\tT\tG\t100\tPASS\tAC=100\n',
            '1\t1000000\t.\tT\tG\t100\tPASS\tAC=100\n',
            '1\t2000000\t.\tA\tG\t100\tPASS\tAC=100\n',
        )
        with gzip.open(vcf_in_path, 'wt') as vcf_handle:
            vcf_handle.write(''.join(vcf_lines))
Пример #6
0
 def test_key_function_by_fetch(self):
     """Fetch by short key with split_char='|' and duplicate_action='drop'."""
     index = Faidx('data/genes.fasta',
                   split_char='|',
                   duplicate_action="drop")
     fetched = index.fetch('KF435150.1', 100, 150)
     assert str(fetched) == (
         'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')
Пример #7
0
 def test_fetch_border_padded(self):
     """Fetch past the end of a gene entry with default_seq='N' set."""
     padded_index = Faidx('data/genes.fasta.gz', default_seq='N')
     fetched = padded_index.fetch('gi|557361099|gb|KF435150.1|', 480, 500)
     print(fetched)
     expected = 'TCNNNNNNNNNNNNNNNNNNN'
     assert str(fetched) == expected
Пример #8
0
 def test_reindex_on_modification(self):
     """ This test ensures that the index is regenerated when the FASTA
     modification time is newer than the index modification time.
     mdshw5/pyfaidx#50 """
     fasta_path = 'data/genes.fasta'
     first = Faidx(fasta_path)
     index_mtime = getmtime(first.indexname)
     first.close()
     # Push the FASTA's access and modification times past the index's mtime.
     newer = index_mtime + 10
     os.utime(fasta_path, (newer, newer))
     time.sleep(2)
     second = Faidx(fasta_path)
     assert getmtime(second.indexname) > index_mtime
Пример #9
0
class Genome(object):
    """Wrapper around a pyfaidx ``Faidx`` for one genome database."""

    def __init__(self, db):
        """Open ``<DATA_FOLDER>/<db>/<db>.fa`` using the app configuration."""
        from pyfaidx import Faidx
        data_folder = app.config["DATA_FOLDER"]
        fasta_path = os.path.join(data_folder, db, db + ".fa")
        self.fasta = Faidx(fasta_path)

    def get_sequence(self, chr, start, end):
        """Return the result of fetching ``chr`` from ``start`` to ``end``."""
        sequence = self.fasta.fetch(chr, start, end)
        return sequence

    def destroy(self):
        """Close the underlying Faidx handle."""
        self.fasta.close()
Пример #10
0
    def setUpClass(cls):
        """Create a temp directory with a one-record FASTA and index it."""
        cls.dir = tempfile.mkdtemp()

        # create a fasta file
        cls.fa = os.path.join(cls.dir, 'genome.fa')
        with open(cls.fa, mode='wt') as fasta_handle:
            fasta_handle.write('>chrN\n'
                               'NNTGATGCTAGCTAGTATCTG\n')

        # index the fasta file
        Faidx(cls.fa).close()
Пример #11
0
 def test_fetch_whole_entry(self):
     """Fetch positions 1-481 of one entry from the gzipped FASTA."""
     chunks = (
         'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
         'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
         'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
         'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
         'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
         'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
         'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
         'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
         'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
         'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
     )
     index = Faidx('data/genes.fasta.gz')
     fetched = index.fetch('gi|557361099|gb|KF435150.1|', 1, 481)
     assert str(fetched) == ''.join(chunks)
Пример #12
0
def load_seqs_and_annotations(protein_annotations_sqlite_db_file_path, fasta_file_path, shuffle = True, records_limit = None, verbose = True, \
        log_progress_every = 10000):
    """Yield ``(uniprot_id, sequence, go_annotation_indices)`` per record.

    Records are read from the ``protein_annotations`` table of the SQLite
    database and matched to FASTA entries named ``UniRef90_<id>``, where
    ``<id>`` is the part of ``uniprot_name`` before the first underscore.
    Records whose FASTA entry is missing from the index are counted and
    skipped.

    Args:
        protein_annotations_sqlite_db_file_path: Path to the SQLite database.
        fasta_file_path: Path to the FASTA file opened with ``Faidx``.
        shuffle: If true, shuffle the records with ``random_state=0``.
        records_limit: Optional integer used as the SQL ``LIMIT``.
        verbose: If true, report progress through ``log``.
        log_progress_every: Progress is logged every this many records.

    Yields:
        Tuples of ``(uniprot_id, seq, parsed_annotation_indices)``, the last
        item being the ``json.loads`` result of the stored annotation text.
    """
    if verbose:
        log('Loading %s records...' %
            ('all' if records_limit is None else records_limit))

    query = 'SELECT uniprot_name, complete_go_annotation_indices FROM protein_annotations'
    if records_limit is not None:
        query += ' LIMIT %d' % records_limit

    # The connection is only needed for this one query; close it right away
    # instead of leaking it for the lifetime of the generator.
    conn = sqlite3.connect(protein_annotations_sqlite_db_file_path)
    try:
        raw_proteins_and_annotations = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    if verbose:
        log('Loaded %d proteins and their GO annotations (%d columns: %s)' %
            (raw_proteins_and_annotations.shape +
             (', '.join(raw_proteins_and_annotations.columns), )))

    if shuffle:
        raw_proteins_and_annotations = raw_proteins_and_annotations.sample(
            frac=1, random_state=0)

    if verbose:
        log('Loading Faidx (%s)...' % fasta_file_path)

    seqs_faidx = Faidx(fasta_file_path)

    if verbose:
        log('Finished loading Faidx.')

    n_failed = 0
    n_records = len(raw_proteins_and_annotations)

    try:
        for i, (_, (uniprot_id, raw_go_annotation_indices)) in enumerate(
                raw_proteins_and_annotations.iterrows()):

            if verbose and i % log_progress_every == 0:
                log('%d/%d' % (i, n_records), end='\r')

            seq_fasta_id = 'UniRef90_%s' % uniprot_id.split('_')[0]

            try:
                seq = str(
                    seqs_faidx.fetch(seq_fasta_id, 1,
                                     seqs_faidx.index[seq_fasta_id].rlen))
            except KeyError:
                # No FASTA entry under this identifier.
                n_failed += 1
                continue

            # Yield outside the try block so a KeyError raised by the
            # consumer is not miscounted as a missing sequence.
            yield uniprot_id, seq, json.loads(raw_go_annotation_indices)
    finally:
        seqs_faidx.close()

    if verbose:
        log('Finished. Failed finding the sequence for %d of %d records.' %
            (n_failed, n_records))
Пример #13
0
 def test_fetch_whole_entry(self):
     """Fetch positions 1-481 of one entry from the gzipped FASTA."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     expected = ''.join([
         'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
         'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
         'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
         'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
         'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
         'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
         'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
         'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
         'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
         'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
     ])
     fetched = Faidx('data/genes.fasta.gz').fetch(record_name, 1, 481)
     assert str(fetched) == expected
Пример #14
0
 def test_build_issue_141(self):
     """Compare the index written for data/issue_141.fasta with expected text.

     The generated .fai file is removed after it has been read.
     """
     # Field separators are written as explicit \t escapes so editors that
     # convert tabs to spaces cannot silently corrupt the expected index.
     expect_index = ("gi|563317589|dbj|AB821309.1|\t3510\t115\t70\t72\n"
                     "gi|557361099|gb|KF435150.1|\t481\t3842\t70\t72\n"
                     "gi|557361097|gb|KF435149.1|\t642\t4429\t70\t72\n"
                     "gi|543583796|ref|NR_104216.1|\t4573\t5213\t70\t72\n"
                     "gi|543583795|ref|NR_104215.1|\t5317\t10040\t70\t72\n"
                     "gi|543583794|ref|NR_104212.1|\t5374\t15631\t70\t72\n"
                     "gi|543583788|ref|NM_001282545.1|\t4170\t21274\t70\t72\n"
                     "gi|543583786|ref|NM_001282543.1|\t5466\t25679\t70\t72\n"
                     "gi|543583785|ref|NM_000465.3|\t5523\t31415\t70\t72\n"
                     "gi|543583740|ref|NM_001282549.1|\t3984\t37211\t70\t72\n"
                     "gi|543583738|ref|NM_001282548.1|\t4113\t41424\t70\t72\n"
                     "gi|530384540|ref|XM_005249645.1|\t2752\t45784\t70\t72\n"
                     "gi|530384538|ref|XM_005249644.1|\t3004\t48745\t70\t72\n"
                     "gi|530384536|ref|XM_005249643.1|\t3109\t51964\t70\t72\n"
                     "gi|530384534|ref|XM_005249642.1|\t3097\t55292\t70\t72\n"
                     "gi|530373237|ref|XM_005265508.1|\t2794\t58640\t70\t72\n"
                     "gi|530373235|ref|XM_005265507.1|\t2848\t61675\t70\t72\n"
                     "gi|530364726|ref|XR_241081.1|\t1009\t64742\t70\t72\n"
                     "gi|530364725|ref|XR_241080.1|\t4884\t65918\t70\t72\n"
                     "gi|530364724|ref|XR_241079.1|\t2819\t71079\t70\t72\n")
     index_file = Faidx('data/issue_141.fasta').indexname
     # Read through a context manager so the handle is closed before the
     # index file is removed.
     with open(index_file) as index_handle:
         result_index = index_handle.read()
     os.remove('data/issue_141.fasta.fai')
     print(result_index)
     assert result_index == expect_index
 def test_fetch_whole_entry(self):
     """Fetch positions 1-482 of one entry from the plain FASTA."""
     chunks = (
         "ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA",
         "CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA",
         "AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG",
         "TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT",
         "AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG",
         "TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG",
         "AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT",
         "AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA",
         "GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA",
         "TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC",
     )
     index = Faidx("data/genes.fasta")
     fetched = index.fetch("gi|557361099|gb|KF435150.1|", 1, 482)
     assert str(fetched) == "".join(chunks)
Пример #16
0
    def test_build_issue_96_fail_read_malformed_index_duplicate_key(self):
        """ Ensure that the fasta file is closed if construction of the 'Faidx' file
        fails when attempting to read a pre-existing index. The index is malformed because
        it contains multiple occurrences of the same index.
        See mdshw5/pyfaidx#96
        """
        tmp_dir = mkdtemp()
        try:
            fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
            faidx_path = os.path.join(tmp_dir, 'issue_96.fasta.fai')
            # Write simple fasta file
            with open(fasta_path, 'w') as fasta_out:
                fasta_out.write(">seq1\nCTCCGGGCCCAT\nATAAAGCCTAAA\n")
            # Write an index that lists 'seq1' twice
            with open(faidx_path, 'w') as faidx_out:
                faidx_out.write("seq1\t24\t6\t12\t13\nseq1\t24\t6\t12\t13\n")

            builtins_open = builtins.open

            opened_files = []

            def test_open(*args, **kwargs):
                # Delegate to the real open() and remember every handle.
                f = builtins_open(*args, **kwargs)
                opened_files.append(f)
                return f

            with mock.patch('six.moves.builtins.open', side_effect=test_open):
                try:
                    Faidx(fasta_path)
                except ValueError:
                    pass
                else:
                    # TestCase has no assertFail(); fail() reports the
                    # intended message instead of an AttributeError.
                    self.fail(
                        "Faidx construction should fail with 'ValueError'.")
            self.assertTrue(all(f.closed for f in opened_files))
        finally:
            shutil.rmtree(tmp_dir)
Пример #17
0
 def test_valgrind_blank_lines(self):
     """ Makes all full-length lines blank and checks that error is raised
     in all appropriate circumstances.
     """
     # http://stackoverflow.com/a/23212515/717419
     if platform.system() == 'Windows':
         raise SkipTest
     indexed = []
     with open('data/genes.fasta') as genes:
         fasta = genes.readlines()
     for n, target in enumerate(fasta):
         # Only full-length (71 characters including the newline) sequence
         # lines are blanked; other lines never reach Faidx, so no temp file
         # is written for them.
         if target[0] == '>' or len(target) != 71:
             continue
         with NamedTemporaryFile(mode='w') as lines:
             for i, line in enumerate(fasta):
                 lines.write('\n' if i == n else line)
             # Flush once, after the whole file is written, so Faidx sees
             # the complete content on disk.
             lines.flush()
             try:
                 Faidx(lines.name)
                 indexed.append(True)
             except FastaIndexingError:
                 indexed.append(False)
     assert not any(indexed)
Пример #18
0
 def test_build_issue_126(self):
     """ Samtools BGZF index should be identical to pyfaidx BGZF index """
     # Field separators are written as explicit \t escapes so editors that
     # convert tabs to spaces cannot silently corrupt the expected index.
     expect_index = ("gi|563317589|dbj|AB821309.1|\t3510\t114\t70\t71\n"
                     "gi|557361099|gb|KF435150.1|\t481\t3789\t70\t71\n"
                     "gi|557361097|gb|KF435149.1|\t642\t4368\t70\t71\n"
                     "gi|543583796|ref|NR_104216.1|\t4573\t5141\t70\t71\n"
                     "gi|543583795|ref|NR_104215.1|\t5317\t9901\t70\t71\n"
                     "gi|543583794|ref|NR_104212.1|\t5374\t15415\t70\t71\n"
                     "gi|543583788|ref|NM_001282545.1|\t4170\t20980\t70\t71\n"
                     "gi|543583786|ref|NM_001282543.1|\t5466\t25324\t70\t71\n"
                     "gi|543583785|ref|NM_000465.3|\t5523\t30980\t70\t71\n"
                     "gi|543583740|ref|NM_001282549.1|\t3984\t36696\t70\t71\n"
                     "gi|543583738|ref|NM_001282548.1|\t4113\t40851\t70\t71\n"
                     "gi|530384540|ref|XM_005249645.1|\t2752\t45151\t70\t71\n"
                     "gi|530384538|ref|XM_005249644.1|\t3004\t48071\t70\t71\n"
                     "gi|530384536|ref|XM_005249643.1|\t3109\t51246\t70\t71\n"
                     "gi|530384534|ref|XM_005249642.1|\t3097\t54528\t70\t71\n"
                     "gi|530373237|ref|XM_005265508.1|\t2794\t57830\t70\t71\n"
                     "gi|530373235|ref|XM_005265507.1|\t2848\t60824\t70\t71\n"
                     "gi|530364726|ref|XR_241081.1|\t1009\t63849\t70\t71\n"
                     "gi|530364725|ref|XR_241080.1|\t4884\t65009\t70\t71\n"
                     "gi|530364724|ref|XR_241079.1|\t2819\t70099\t70\t71\n")
     index_file = Faidx('data/genes.fasta.gz').indexname
     # Close the index file deterministically instead of leaking the handle.
     with open(index_file) as index_handle:
         result_index = index_handle.read()
     assert result_index == expect_index
Пример #19
0
class TestFeatureKeyFunction:
    """Tests for Faidx/Fasta objects opened with a ``key_function``."""

    def __init__(self):
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path, key_function=get_gene_name)
        self.genes = Fasta(fasta_path, key_function=get_gene_name)

    def test_keys(self):
        """The sorted keys of the Fasta object match the expected names."""
        expected_keys = [
            'BARD1', 'FGFR2', 'KF435149.1', 'MDM4', 'NM_000465.3',
            'NM_001282543.1', 'NM_001282545.1', 'NM_001282548.1',
            'NM_001282549.1', 'NR_104212.1', 'NR_104215.1',
            'XM_005249642.1', 'XM_005249643.1', 'XM_005249644.1',
            'XM_005249645.1', 'XM_005265507.1', 'XM_005265508.1',
            'XR_241079.1', 'XR_241080.1', 'XR_241081.1',
        ]
        assert sorted(self.genes.keys()) == expected_keys

    def test_key_function_by_dictionary_get_key(self):
        """Slice the 'MDM4' record through dictionary-style access."""
        sliced = self.genes['MDM4'][99:150]
        assert str(sliced) == (
            'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')

    def test_key_function_by_fetch(self):
        """Fetch positions 100-150 of 'MDM4' through Faidx.fetch."""
        fetched = self.faidx.fetch('MDM4', 100, 150)
        assert str(fetched) == (
            'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')

    @raises(ValueError)
    def test_duplicated_keys(self):
        """Opening with get_duplicated_gene_name is expected to raise ValueError."""
        Fasta(self.fasta, key_function=get_duplicated_gene_name)
Пример #20
0
    def test_build_issue_96_fail_build_faidx(self):
        """ Ensure that the fasta file is closed if construction of the 'Faidx' file
        fails when attempting to build an index.
        See mdshw5/pyfaidx#96
        """
        tmp_dir = mkdtemp()
        try:
            fasta_path = os.path.join(tmp_dir, 'issue_96.fasta')
            # Write simple fasta file with inconsistent sequence line lengths,
            # so building an index raises a 'FastaIndexingError'
            with open(fasta_path, 'w') as fasta_out:
                fasta_out.write(
                    ">seq1\nCTCCGGGCCCAT\nAACACTTGGGGGTAGCTAAAGTGAA\nATAAAGCCTAAA\n"
                )

            builtins_open = builtins.open

            opened_files = []

            def test_open(*args, **kwargs):
                # Delegate to the real open() and remember every handle.
                f = builtins_open(*args, **kwargs)
                opened_files.append(f)
                return f

            with mock.patch('six.moves.builtins.open', side_effect=test_open):
                try:
                    Faidx(fasta_path)
                except FastaIndexingError:
                    pass
                else:
                    # TestCase has no assertFail(); fail() reports the
                    # intended message instead of an AttributeError.
                    self.fail(
                        "Faidx construction should fail with 'FastaIndexingError'."
                    )
            self.assertTrue(all(f.closed for f in opened_files))
        finally:
            shutil.rmtree(tmp_dir)
Пример #21
0
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    """Write each target chromosome to its own FASTA file in ``inter_files``.

    Args:
        target_chroms: Iterable of chromosome names. An entry equal to
            ``target_fasta_name`` is skipped.
        target_fasta_name: Path of the FASTA file to split.
        inter_files: Directory that receives the ``<chrom>.fa`` files.

    Returns:
        The ``Fasta`` object opened on ``target_fasta_name``, keyed by the
        first whitespace-delimited token of each header.
    """
    Faidx(target_fasta_name)
    target_fasta_dict = Fasta(target_fasta_name, key_function=lambda x: x.split()[0])
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # Context manager guarantees the file is flushed and closed; the
            # handle was previously left open.
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta_dict[chrm]))
    return target_fasta_dict
Пример #22
0
def split_target_sequence(target_chroms, target_fasta_name):
    """Write each target chromosome to ``<chrom>.fa`` in the working directory.

    Args:
        target_chroms: Iterable of chromosome names. An entry equal to
            ``target_fasta_name`` is skipped.
        target_fasta_name: Path of the FASTA file to split.
    """
    Faidx(target_fasta_name)
    target_fasta = Fasta(target_fasta_name,
                         key_function=lambda x: x.split()[0])
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # Context manager closes the file even if the lookup or the
            # write raises.
            with open(chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta[chrm]))
Пример #23
0
def get_gene_sequences(parent_dict, ref_chroms, reference_fasta_name, processes, inter_files, liftover_type):
    """Run ``get_gene_sequences_subset`` for every chromosome in a worker pool.

    A ``Faidx`` is constructed on ``reference_fasta_name`` before the work is
    dispatched. The per-chromosome results are consumed and discarded; the
    function returns ``None``.
    """
    pool = Pool(processes)
    Faidx(reference_fasta_name)
    worker = partial(get_gene_sequences_subset, parent_dict,
                     reference_fasta_name, inter_files, liftover_type)
    # Drain the iterator so every task finishes; the values are not used.
    for _ in pool.imap_unordered(worker, ref_chroms):
        pass
    pool.close()
    pool.join()
Пример #24
0
 def test_build_issue_83(self):
     """ Ensure that blank lines between entries are treated in the
     same way as samtools 1.2. See mdshw5/pyfaidx#83.
     """
     # Explicit \t escapes instead of literal tab characters in the source.
     expect_index = ("MT\t119\t4\t70\t71\nGL000207.1\t60\t187\t60\t61\n")
     index_file = Faidx('data/issue_83.fasta').indexname
     # Close the handle before the index file is removed.
     with open(index_file) as index_handle:
         result_index = index_handle.read()
     os.remove('data/issue_83.fasta.fai')
     assert result_index == expect_index
def get_batches(NUM_BATCHES, GENOME_FASTA):
    """Partition the sequences of a FASTA file into size-balanced batches.

    Sequence names and sizes are read from ``GENOME_FASTA + ".fai"``. The
    target size of one batch is ``int(total_size / NUM_BATCHES) + 1``;
    sequences are cut wherever a batch would exceed that size.

    Args:
        NUM_BATCHES: Number of batches to aim for; must be positive.
        GENOME_FASTA: Path to the genome FASTA file.

    Returns:
        A list of batches. Each batch is a one-element list wrapping a list
        of parts, and each part is
        ``[scaffold, scaffold_size, start_index, part_size, chunk_info]``.

    Raises:
        ValueError: If ``NUM_BATCHES`` is not positive. Previously this left
            the chunk size unbound and failed later with a NameError.
    """
    if NUM_BATCHES <= 0:
        raise ValueError(
            "NUM_BATCHES must be a positive number, got %r" % (NUM_BATCHES,))

    genome_name = os.path.basename(GENOME_FASTA).split(".")[0]

    # Constructed for its side effect only; the .fai file is read directly
    # below. NOTE(review): presumably this builds the index when it is
    # missing -- confirm against the pyfaidx version in use.
    Faidx(GENOME_FASTA)
    fasta_idx = GENOME_FASTA + ".fai"

    total_size = 0
    seq_sizes = {}
    with open(fasta_idx) as index_file:
        for line in index_file:
            line = line.rstrip()
            if not line:
                continue
            # Only the first two columns (name, length) are needed, so lines
            # with fewer than three columns no longer fail to unpack.
            fields = line.split("\t")
            seq_name, seq_len = fields[0], int(fields[1])
            total_size += seq_len
            seq_sizes[seq_name] = seq_len

    chunk_size = int(total_size / NUM_BATCHES) + 1

    batches = []
    parts = []
    current_batch_size = 0
    for scaffold, remaining in seq_sizes.items():
        seq_idx = 0

        while remaining > 0:
            chunk_info = str(genome_name + ":" + scaffold + ":" +
                             str(seq_idx) + "-" + str(remaining))
            if (current_batch_size + remaining) > chunk_size:
                # Fill the current batch up to chunk_size and start a new one.
                # NOTE(review): fill_size is 0 when the previous part filled
                # the batch exactly, which emits a zero-length part; kept
                # as-is to preserve the existing output.
                fill_size = chunk_size - current_batch_size
                # NOTE: For scaffold size, always refer back to the index
                # dict, not `remaining`, which shrinks as the scaffold is
                # consumed.
                parts.append(
                    [scaffold, seq_sizes[scaffold], seq_idx, fill_size,
                     chunk_info])
                batches.append([parts])
                parts = []
                seq_idx += fill_size
                remaining -= fill_size
                current_batch_size = 0
            else:
                parts.append(
                    [scaffold, seq_sizes[scaffold], seq_idx, remaining,
                     chunk_info])
                current_batch_size += remaining
                remaining = 0

    # The trailing, partially filled batch is appended once, after all
    # scaffolds have been processed.
    if parts:
        batches.append([parts])

    return batches
Пример #26
0
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    """Write each target chromosome to its own FASTA file and size the genome.

    Args:
        target_chroms: Iterable of chromosome names. An entry equal to
            ``target_fasta_name`` is skipped.
        target_fasta_name: Path of the FASTA file to split.
        inter_files: Directory that receives the ``<chrom>.fa`` files.

    Returns:
        The sum of the lengths of all records in ``target_fasta_name``.
    """
    Faidx(target_fasta_name)
    target_fasta = Fasta(target_fasta_name, key_function = lambda x: x.split()[0])
    genome_size = sum(len(value) for value in target_fasta.values())
    for chrm in target_chroms:
        if chrm != target_fasta_name:
            # Context manager guarantees the file is flushed and closed; the
            # handle was previously left open.
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta[chrm]))
    return genome_size
Пример #27
0
class TestFeatureBoundsCheck:
    """Bounds tests against a Faidx opened with ``default_seq='N'``."""

    def __init__(self):
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path, default_seq='N')

    def test_fetch_border_padded(self):
        """ Fetch past the end of a gene entry """
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TCNNNNNNNNNNNNNNNNNNN'
Пример #28
0
 def test_issue_144_no_defline(self):
     """ Ensure that an exception is raised when a file contains no deflines. See mdshw5/pyfaidx#144.
     """
     scratch_dir = mkdtemp()
     try:
         headerless_fasta = os.path.join(scratch_dir, 'issue_144.fasta')
         # Write sequence lines only, with no '>' header line
         with open(headerless_fasta, 'w') as handle:
             handle.write("CTCCGGGCCCAT\nATAAAGCCTAAA\n")
         Faidx(headerless_fasta)
     finally:
         shutil.rmtree(scratch_dir)
Пример #29
0
 def test_read_back_index(self):
     """Ensure that index files written with write_fai() can be read back"""
     import locale
     saved_numeric_locale = locale.getlocale(locale.LC_NUMERIC)
     try:
         locale.setlocale(locale.LC_NUMERIC, 'en_US.utf8')
         writer = Faidx('data/genes.fasta')
         writer.write_fai()
         # Re-open with build_index=False.
         Faidx('data/genes.fasta', build_index=False)
     finally:
         # Always restore the numeric locale that was active on entry.
         locale.setlocale(locale.LC_NUMERIC, saved_numeric_locale)
Пример #30
0
 def test_reindex_on_modification(self):
     """ This test ensures that the index is regenerated when the FASTA
     modification time is newer than the index modification time.
     mdshw5/pyfaidx#50 """
     fasta_path = 'data/genes.fasta'
     original = Faidx(fasta_path)
     index_mtime = getmtime(original.indexname)
     original.close()
     # Set the FASTA's atime and mtime 10 seconds past the index mtime.
     bumped = index_mtime + 10
     os.utime(fasta_path, (bumped, bumped))
     time.sleep(2)
     reopened = Faidx(fasta_path)
     assert getmtime(reopened.indexname) > index_mtime
Пример #31
0
class TestFeatureBoundsCheck:
    """Fetch tests against a default Faidx and a ``strict_bounds`` Faidx."""

    def __init__(self):
        fasta_path = os.path.join(path, 'data/genes.fasta')
        self.fasta = fasta_path
        self.faidx = Faidx(fasta_path)
        self.faidx_strict = Faidx(fasta_path, strict_bounds=True)

    def test_fetch_whole_entry(self):
        """Fetch positions 1-482 of 'KF435150.1'."""
        chunks = (
            'ATGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGA',
            'CAGTGCTTGCAGGATCTCTCCTGGACAAATCAATCAGGTACGACCA',
            'AAACTGCCGCTTTTGAAGATTTTGCATGCAGCAGGTGCGCAAGG',
            'TGAAATGTTCACTGTTAAAGAGGTCATGCACTATTTAGGTCAGTACAT',
            'AATGGTGAAGCAACTTTATGATCAGCAGGAGCAGCATATGGTATATTG',
            'TGGTGGAGATCTTTTGGGAGAACTACTGGGACGTCAGAGCTTCTCCGTG',
            'AAAGACCCAAGCCCTCTCTATGATATGCTAAGAAAGAATCTTGTCACTTT',
            'AGCCACTGCTACTACAGCAAAGTGCAGAGGAAAGTTCCACTTCCAGAAAAA',
            'GAACTACAGAAGACGATATCCCCACACTGCCTACCTCAGAGCATAAATGCA',
            'TACATTCTAGAGAAGGTGATTGAAGTGGGAAAAAATGATGACCTGGAGGACTC',
        )
        fetched = self.faidx.fetch('KF435150.1', 1, 482)
        assert str(fetched) == ''.join(chunks)

    def test_fetch_middle(self):
        """Fetch positions 100-150 of 'KF435150.1'."""
        fetched = self.faidx.fetch('KF435150.1', 100, 150)
        assert str(fetched) == (
            'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')

    def test_fetch_end(self):
        """Fetch positions 480-482 of 'KF435150.1'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(fetched) == 'TC'

    def test_fetch_border(self):
        """ Fetch past the end of a gene entry """
        fetched = self.faidx.fetch('KF435150.1', 480, 500)
        assert str(fetched) == 'TC'

    def test_rev(self):
        """Negating the fetched 480-482 range yields 'GA'."""
        fetched = self.faidx.fetch('KF435150.1', 480, 482)
        assert str(-fetched) == 'GA', fetched

    @raises(FetchError)
    def test_fetch_past_bounds(self):
        """ Fetch past the end of a gene entry """
        self.faidx_strict.fetch('KF435150.1', 480, 5000)
Пример #32
0
def get_transcriptome(fa_input, fa_output, fai_output, log):
    """Rewrite a transcriptome FASTA with short record names and index it.

    Every record read from ``fa_input`` is written to ``fa_output`` as
    ``>short_name`` followed by its sequence, then ``Faidx.build_index`` is
    called on the output. A timestamp and the call arguments are logged to
    ``log``. ``fai_output`` is not used by the body beyond being logged.
    Any exception is logged with its traceback and re-raised.
    """
    logging.basicConfig(filename=log, filemode="w", level=logging.INFO, format='%(message)s')
    logging.info("timestamp: %s", datetime.datetime.now())
    # locals() holds only the call arguments at this point.
    for arg_name, arg_value in locals().items():
        logging.info("\t%s: %s\n", arg_name, arg_value)

    try:
        # Parse fasta file uncompress and simplify transcript ids
        logging.info("Read input transcriptome fasta file")
        with open(fa_output, "w") as simplified_fasta:
            for record in Fasta.Reader(fa_input):
                simplified_fasta.write(
                    ">{}\n{}\n".format(record.short_name, record.seq))

        logging.info("Index fasta file")
        with Faidx(fa_output) as indexed_fasta:
            indexed_fasta.build_index()

    except BaseException:
        logging.exception('Error while running get_transcriptome')
        raise
Пример #33
0
 def test_order(self):
     """The index keys come back in the expected order."""
     expected_order = (
         "gi|563317589|dbj|AB821309.1|",
         "gi|557361099|gb|KF435150.1|",
         "gi|557361097|gb|KF435149.1|",
         "gi|543583796|ref|NR_104216.1|",
         "gi|543583795|ref|NR_104215.1|",
         "gi|543583794|ref|NR_104212.1|",
         "gi|543583788|ref|NM_001282545.1|",
         "gi|543583786|ref|NM_001282543.1|",
         "gi|543583785|ref|NM_000465.3|",
         "gi|543583740|ref|NM_001282549.1|",
         "gi|543583738|ref|NM_001282548.1|",
         "gi|530384540|ref|XM_005249645.1|",
         "gi|530384538|ref|XM_005249644.1|",
         "gi|530384536|ref|XM_005249643.1|",
         "gi|530384534|ref|XM_005249642.1|",
         "gi|530373237|ref|XM_005265508.1|",
         "gi|530373235|ref|XM_005265507.1|",
         "gi|530364726|ref|XR_241081.1|",
         "gi|530364725|ref|XR_241080.1|",
         "gi|530364724|ref|XR_241079.1|",
     )
     index = Faidx('data/genes.fasta').index
     assert tuple(index.keys()) == expected_order
 def test_fetch_keyerror(self):
     """ Fetch a key that does not exist """
     strict_index = Faidx("data/genes.fasta", strict_bounds=True)
     strict_index.fetch("gi|joe|gb|KF435150.1|", 1, 10)
Пример #35
0
 def __init__(self):
     """Open data/genes.fasta (relative to ``path``) with default_seq='N'."""
     fasta_path = os.path.join(path, 'data/genes.fasta')
     self.fasta = fasta_path
     self.faidx = Faidx(fasta_path, default_seq='N')
Пример #36
0
 def __init__(self):
     """Open data/genes.fasta as both Faidx and Fasta keyed by get_gene_name."""
     fasta_path = os.path.join(path, 'data/genes.fasta')
     self.fasta = fasta_path
     self.faidx = Faidx(fasta_path, key_function=get_gene_name)
     self.genes = Fasta(fasta_path, key_function=get_gene_name)
Пример #37
0
 def test_issue_74_end_faidx(self):
     """The fetched ``end`` is the same with one_based_attributes on or off."""
     record_name = 'gi|557361099|gb|KF435150.1|'
     zero_based = Faidx('data/genes.fasta.gz', one_based_attributes=False)
     one_based = Faidx('data/genes.fasta.gz', one_based_attributes=True)
     zero_based_end = zero_based.fetch(record_name, 1, 90).end
     one_based_end = one_based.fetch(record_name, 1, 90).end
     assert zero_based_end == one_based_end
Пример #38
0
 def test_issue_74_end_faidx(self):
     """The fetched ``end`` is the same with one_based_attributes on or off."""
     gz_path = 'data/genes.fasta.gz'
     record_name = 'gi|557361099|gb|KF435150.1|'
     index_zero = Faidx(gz_path, one_based_attributes=False)
     index_one = Faidx(gz_path, one_based_attributes=True)
     end_zero = index_zero.fetch(record_name, 1, 90).end
     end_one = index_one.fetch(record_name, 1, 90).end
     assert end_zero == end_one
 def test_key_function_by_fetch(self):
     """Fetch by short key from an index opened with split_char='|'."""
     index = Faidx('data/genes.fasta', split_char='|')
     fetched = index.fetch('KF435150.1', 100, 150)
     assert str(fetched) == (
         'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')
Пример #40
0
 def test_fetch_reversed_coordinates(self):
     """ Fetch with a start coordinate (50) greater than the end (10) """
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch('gi|557361099|gb|KF435150.1|', 50, 10)
Пример #41
0
 def test_fetch_keyerror(self):
     """ Fetch a key that does not exist """
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch('gi|joe|gb|KF435150.1|', 1, 10)
Пример #42
0
 def test_rev(self):
     """Negating the fetched 480-481 range yields 'GA'."""
     index = Faidx('data/genes.fasta.gz')
     fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
     assert str(-fetched) == 'GA', fetched
 def test_fetch_reversed_coordinates(self):
     """ Fetch with a start coordinate (50) greater than the end (10) """
     strict_index = Faidx("data/genes.fasta", strict_bounds=True)
     strict_index.fetch("gi|557361099|gb|KF435150.1|", 50, 10)
Пример #44
0
 def test_fetch_negative(self):
     """ Fetch starting with a negative coordinate """
     strict_index = Faidx('data/genes.fasta.gz', strict_bounds=True)
     strict_index.fetch('gi|557361099|gb|KF435150.1|', -10, 10)
Пример #45
0
 def __init__(self):
     """Open data/genes.fasta twice: default and with strict_bounds=True."""
     fasta_path = os.path.join(path, 'data/genes.fasta')
     self.fasta = fasta_path
     self.faidx = Faidx(fasta_path)
     self.faidx_strict = Faidx(fasta_path, strict_bounds=True)
Пример #46
0
 def test_fetch_middle(self):
     """Fetch positions 100-150 of one entry from the gzipped FASTA."""
     index = Faidx('data/genes.fasta.gz')
     fetched = index.fetch('gi|557361099|gb|KF435150.1|', 100, 150)
     assert str(fetched) == (
         'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')
 def test_rev(self):
     """Negating the fetched 480-482 range yields 'GA'."""
     index = Faidx("data/genes.fasta")
     fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
     assert str(-fetched) == "GA", fetched
 def test_fetch_border(self):
     """ Fetch past the end of a gene entry """
     index = Faidx("data/genes.fasta")
     fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 500)
     assert str(fetched) == "TC"
 def test_fetch_end(self):
     """Fetch positions 480-482 of one entry from the plain FASTA."""
     index = Faidx("data/genes.fasta")
     fetched = index.fetch("gi|557361099|gb|KF435150.1|", 480, 482)
     assert str(fetched) == "TC"
 def test_fetch_middle(self):
     """Fetch positions 100-150 of one entry from the plain FASTA."""
     index = Faidx("data/genes.fasta")
     fetched = index.fetch("gi|557361099|gb|KF435150.1|", 100, 150)
     assert str(fetched) == (
         "TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA")
Пример #51
0
 def test_key_function_by_fetch(self):
     """Fetch 'MDM4' from an index opened with key_function=get_gene_name."""
     index = Faidx('data/genes.fasta', key_function=get_gene_name)
     fetched = index.fetch('MDM4', 100, 150)
     assert str(fetched) == (
         'TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA')
 def test_fetch_past_bounds(self):
     """ Fetch past the end of a gene entry """
     strict_index = Faidx("data/genes.fasta", strict_bounds=True)
     strict_index.fetch("gi|557361099|gb|KF435150.1|", 480, 5000)
Пример #53
0
 def test_issue_134_no_build_index(self):
     """ Ensure that index file is not built when build_index=False. See mdshw5/pyfaidx#134.
     """
     Faidx('data/genes.fasta', build_index=False)
Пример #54
0
 def test_samtools_compare(self):
     """Faidx.build_fai output, joined, equals the file at self.samtools."""
     with open(self.samtools, 'r') as reference_file:
         reference_text = reference_file.read()
     built_lines = Faidx.build_fai(self.fasta)
     assert ''.join(built_lines) == reference_text
Пример #55
0
 def test_build(self):
     """Faidx.build_fai output, joined, equals the file at self.expect."""
     with open(self.expect, 'r') as expected_file:
         expected_text = expected_file.read()
     built_lines = Faidx.build_fai(self.fasta)
     assert ''.join(built_lines) == expected_text
Пример #56
0
 def test_order(self):
     """Names in Faidx.build_fai output follow the order of the index keys."""
     built_names = []
     for index_line in Faidx.build_fai(self.fasta):
         # The record name is the first whitespace-delimited field.
         built_names.append(index_line.split()[0])
     assert built_names == list(self.faidx.index.keys())
Пример #57
0
from collections import defaultdict
from pyfaidx import Faidx
import sys
sys.path.append('/data/home/xutun/mySrc/modifyPoppyPaper')
from getUse import dd,classF

# Input and output paths, all rooted at `dd` (imported from getUse).
annotDiamondF = f'{dd}/isoseqDiamondAnnot.prot.diamond'
transProtFa = f'{dd}/total.merge_corrected.faa'
annotProtFa = f'{dd}/ref/poppy_v6.proteins.final_revised.fasta'
toMergeGeneF = f'{dd}/toMergeGene.tab'

# Tab-separated table read from `classF` (imported from getUse).
# NOTE(review): `pd` is not imported in the visible part of this file --
# confirm that `import pandas as pd` exists above.
classD = pd.read_table(classF,sep='\t')

# Module-level accumulators and indexed FASTA handles used by the functions
# below.
candidateTransSet = defaultdict(list)
trans2annotDiamondSet = defaultdict(lambda:defaultdict(int))
transProtFaHandle = Faidx(transProtFa)
annotProtFaHandle = Faidx(annotProtFa)
resultTrans2protLen = defaultdict(int)


def isMerge(gene):
    """Return True when the first two '_'-separated parts both look like IDs.

    A part qualifies when it is exactly 11 characters long and contains
    the substring 'PS'. Names with fewer than two parts return False.
    """
    parts = gene.split('_')
    if len(parts) < 2:
        return False
    return all(len(part) == 11 and 'PS' in part for part in parts[:2])

def getTrans2gene():
    trans2gene = defaultdict(int)
    for ind,row in classD.iterrows():
Пример #58
0
 def test_fetch_end(self):
     """Fetch positions 480-481 of one entry from the plain FASTA."""
     index = Faidx('data/genes.fasta')
     fetched = index.fetch('gi|557361099|gb|KF435150.1|', 480, 481)
     assert str(fetched) == 'TC'