def split_data(fastaFN, partitions):
    """
    Distribute the records of a FASTA file round-robin over several buckets.

    :type fastaFN: str
    :param fastaFN: Path of the FASTA file to read.
    :type partitions: int
    :param partitions: Number of buckets to create; must be at least 1.

    :rtype: list
    :return: A list holding ``partitions`` lists. Records yielded by
             ``util.parseFASTA`` are appended to the buckets in turn
             (bucket 0, 1, ..., partitions - 1, then bucket 0 again).

    :raises ValueError: If ``partitions`` is smaller than 1.
    """
    # Without any bucket the index cycle below is empty, and the first
    # record would surface as a bare StopIteration instead of a clear error.
    if partitions < 1:
        raise ValueError(
            "partitions must be at least 1, got {0!r}".format(partitions))

    buckets = [[] for _ in range(partitions)]
    # Endless 0, 1, ..., partitions - 1, 0, 1, ... sequence of bucket indexes.
    ring = cycle(range(partitions))

    # The 'U' (universal newlines) flag is kept from the original code; note
    # that Python 3.11 removed this mode flag from open().
    with open(fastaFN, 'rU') as inF:
        for seq in util.parseFASTA(inF):
            # The next() builtin works on Python 2.6+ and Python 3, unlike
            # the Python 2-only iterator method ring.next().
            buckets[next(ring)].append(seq)

    return buckets
def split_data(fastaFN, partitions):
    """
    Distribute the records of a FASTA file round-robin over several buckets.

    :type fastaFN: str
    :param fastaFN: Path of the FASTA file to read.
    :type partitions: int
    :param partitions: Number of buckets to create; must be at least 1.

    :rtype: list
    :return: A list holding ``partitions`` lists. Records yielded by
             ``util.parseFASTA`` are appended to the buckets in turn
             (bucket 0, 1, ..., partitions - 1, then bucket 0 again).

    :raises ValueError: If ``partitions`` is smaller than 1.
    """
    # Without any bucket the index cycle below is empty, and the first
    # record would surface as a bare StopIteration instead of a clear error.
    if partitions < 1:
        raise ValueError(
            "partitions must be at least 1, got {0!r}".format(partitions))

    buckets = [[] for _ in range(partitions)]
    # Endless 0, 1, ..., partitions - 1, 0, 1, ... sequence of bucket indexes.
    ring = cycle(range(partitions))

    # The 'U' (universal newlines) flag is kept from the original code; note
    # that Python 3.11 removed this mode flag from open().
    with open(fastaFN, 'rU') as inF:
        for seq in util.parseFASTA(inF):
            # The next() builtin works on Python 2.6+ and Python 3, unlike
            # the Python 2-only iterator method ring.next().
            buckets[next(ring)].append(seq)

    return buckets
示例#3
0
    def test_parseFASTA(self):
        """
        Testing parseFASTA function.

        Compares the records parsed from ``phylotoast/test/test_FASTA.fna``
        against the hand-written expected records below, in order.

        :return: Returns OK if test goals were achieved, otherwise raises
                 error.
        """
        FASTARecord = namedtuple("FASTA_Record", "id descr data")
        parseFASTA_result = ut.parseFASTA("phylotoast/test/test_FASTA.fna")
        manually_parsed = [
            FASTARecord(
                id="PIDF154_1",
                descr=
                "HU82XDC01DBOHO orig_bc=ACAGGTCG new_bc=ACAGGTCG bc_diffs=0",
                data=
                "AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGAGATTAAGTAGCTTGCTATTTAATCTTAGTGGCGCACGGGTGAGTAATATATAGCTAATCTGCCCTACACTAGAGGACAACAGTTGGAAACGACTGCTAATACTCTATACTCCTTCTTTACATAAGTTAAGTCGGGAAAGTTTTTCGGTGTAGGATGAGGCTATATCGTATCAGCTAGTGGTAGGTAACGGCCTACCAAGGCTATGACGCGTAACTGGTCTGAGAGGATGATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTAGGGGAATATTGCTCAAATGGGGGGAAAACCCTGAAAGCAGCAACGCCGCGTGGAGGATGACACTTTTCGGA"
            ),
            FASTARecord(
                id="PIDTA158_2",
                descr=
                "HU82XDC01A3N0T orig_bc=ACCGCAGG new_bc=ACCGCAGG bc_diffs=0",
                data=
                "GATGAACGCTAGCGATAGGCTTAACACATGCAAGTCGAGGGCATCACGAATTAGCAATAGTTTGGTGGCGACCGGCGCACGGGTGCGTAACACGTATACAACCTACCTTCAATTGGGGAATAACCTGGAGAAATTTGGACTAATACCCCATAGTAAACGGGAGAGGCATTCTTTTTTGTTTAAAGATTTATTGATTGGAGATGGGTATGCGTAGGATTAGCTAGTTGGTAAGGTAACGGCTTACCAAGGCAACGATCCTTAGGGGTT"
            ),
            FASTARecord(
                id="PIDF160_3",
                descr=
                "HU82XDC01DTNIU orig_bc=ACCGTAGA new_bc=ACCGTAGA bc_diffs=0",
                data=
                "GATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGTTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTTTAGGGCATCCTAAGATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGACAATGGAACCAAAAGTCTGATCCAGCAATTCTGTGTGCACGATG"
            ),
            FASTARecord(
                id="PIDTA.TB168_4",
                descr=
                "HU82XDC01ETBU0 orig_bc=GCGCAACG new_bc=GCGCAACG bc_diffs=0",
                data=
                "GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATATGGCATCGTATAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGATCGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGGAATATTGGGACAATGGGACCGAGAGTCTGATCCAGCAACTCTGTGTGCACGAT"
            ),
            FASTARecord(
                id="PIDTA.TB140_5",
                descr=
                "HU82XDC01AVWB9 orig_bc=ACTGGAGA new_bc=ACTGGAGA bc_diffs=0",
                data=
                "GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAATTTGGGTTTTAACTTAGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTAGAAATGAATGCTAATACCTGATATTATGATTTTAAGGCATCTTAGAATTATGAAAGCTATAAGCACTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACAGCTCACCAAGGC"
            )
        ]
        # Materialise the parser output so it can be counted even if it is a
        # lazy iterable.
        parsed_records = list(parseFASTA_result)
        # zip() stops at the shorter input, so without this guard a parser
        # that returned too few records (or none) would pass unnoticed.
        self.assertGreaterEqual(
            len(parsed_records),
            len(manually_parsed),
            msg="Fewer FASTA records parsed than expected.")
        for rec1, rec2 in zip(parsed_records, manually_parsed):
            self.assertEqual(rec1,
                             rec2,
                             msg="FASTA records not parsed as expected.")
示例#4
0
def main():
    """
    Write the representative sequences whose IDs occur in a BIOM table.

    Reads the JSON document at ``args.biom_fp`` and collects the ``id`` of
    every entry under its ``rows`` key. The FASTA records parsed from
    ``args.repset_fp`` are then filtered: a record is written to
    ``args.repset_out_fp`` only if its ID is in that collection and has not
    been written before. Each record is emitted as ``>id descr`` followed by
    the sequence data on the next line.
    """
    args = handle_program_options()

    with open(args.biom_fp, 'rU') as biom_file:
        biom_rows = json.load(biom_file)['rows']
        biom_otus = set(row['id'] for row in biom_rows)

    # Parse before the output file is opened, so a parsing failure does not
    # leave a freshly truncated output file behind.
    records = util.parseFASTA(args.repset_fp)
    written_ids = set()
    record_template = ">{} {}\n{}\n"

    with open(args.repset_out_fp, 'w') as out_f:
        for record in records:
            # Skip duplicates and sequences absent from the BIOM table.
            if record.id in written_ids or record.id not in biom_otus:
                continue
            written_ids.add(record.id)
            out_f.write(
                record_template.format(record.id, record.descr, record.data))
def main():
    """
    Write the representative sequences whose IDs occur in a BIOM table.

    Reads the JSON document at ``args.biom_fp`` and collects the ``id`` of
    every entry under its ``rows`` key. The FASTA records parsed from
    ``args.repset_fp`` are then filtered: a record is written to
    ``args.repset_out_fp`` only if its ID is in that collection and has not
    been written before. Each record is emitted as ``>id descr`` followed by
    the sequence data on the next line.
    """
    args = handle_program_options()

    with open(args.biom_fp, 'rU') as biom_file:
        biom_rows = json.load(biom_file)['rows']
        biom_otus = set(row['id'] for row in biom_rows)

    # Parse before the output file is opened, so a parsing failure does not
    # leave a freshly truncated output file behind.
    records = util.parseFASTA(args.repset_fp)
    written_ids = set()
    record_template = ">{} {}\n{}\n"

    with open(args.repset_out_fp, 'w') as out_f:
        for record in records:
            # Skip duplicates and sequences absent from the BIOM table.
            if record.id in written_ids or record.id not in biom_otus:
                continue
            written_ids.add(record.id)
            out_f.write(
                record_template.format(record.id, record.descr, record.data))
示例#6
0
    def test_parseFASTA(self):
        """
        Testing parseFASTA function.

        Compares the records parsed from ``phylotoast/test/test_FASTA.fna``
        against the hand-written expected records below, in order.

        :return: Returns OK if test goals were achieved, otherwise raises
                 error.
        """
        FASTARecord = namedtuple("FASTA_Record", "id descr data")
        parseFASTA_result = ut.parseFASTA("phylotoast/test/test_FASTA.fna")
        manually_parsed = [FASTARecord(id="PIDF154_1", descr="HU82XDC01DBOHO orig_bc=ACAGGTCG new_bc=ACAGGTCG bc_diffs=0", data="AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGAGATTAAGTAGCTTGCTATTTAATCTTAGTGGCGCACGGGTGAGTAATATATAGCTAATCTGCCCTACACTAGAGGACAACAGTTGGAAACGACTGCTAATACTCTATACTCCTTCTTTACATAAGTTAAGTCGGGAAAGTTTTTCGGTGTAGGATGAGGCTATATCGTATCAGCTAGTGGTAGGTAACGGCCTACCAAGGCTATGACGCGTAACTGGTCTGAGAGGATGATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTAGGGGAATATTGCTCAAATGGGGGGAAAACCCTGAAAGCAGCAACGCCGCGTGGAGGATGACACTTTTCGGA"),
                           FASTARecord(id="PIDTA158_2", descr="HU82XDC01A3N0T orig_bc=ACCGCAGG new_bc=ACCGCAGG bc_diffs=0", data="GATGAACGCTAGCGATAGGCTTAACACATGCAAGTCGAGGGCATCACGAATTAGCAATAGTTTGGTGGCGACCGGCGCACGGGTGCGTAACACGTATACAACCTACCTTCAATTGGGGAATAACCTGGAGAAATTTGGACTAATACCCCATAGTAAACGGGAGAGGCATTCTTTTTTGTTTAAAGATTTATTGATTGGAGATGGGTATGCGTAGGATTAGCTAGTTGGTAAGGTAACGGCTTACCAAGGCAACGATCCTTAGGGGTT"),
                           FASTARecord(id="PIDF160_3", descr="HU82XDC01DTNIU orig_bc=ACCGTAGA new_bc=ACCGTAGA bc_diffs=0", data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGTTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTTTAGGGCATCCTAAGATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGACAATGGAACCAAAAGTCTGATCCAGCAATTCTGTGTGCACGATG"),
                           FASTARecord(id="PIDTA.TB168_4", descr="HU82XDC01ETBU0 orig_bc=GCGCAACG new_bc=GCGCAACG bc_diffs=0", data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATATGGCATCGTATAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGATCGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGGAATATTGGGACAATGGGACCGAGAGTCTGATCCAGCAACTCTGTGTGCACGAT"),
                           FASTARecord(id="PIDTA.TB140_5", descr="HU82XDC01AVWB9 orig_bc=ACTGGAGA new_bc=ACTGGAGA bc_diffs=0", data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAATTTGGGTTTTAACTTAGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTAGAAATGAATGCTAATACCTGATATTATGATTTTAAGGCATCTTAGAATTATGAAAGCTATAAGCACTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACAGCTCACCAAGGC")]
        # Materialise the parser output so it can be counted even if it is a
        # lazy iterable.
        parsed_records = list(parseFASTA_result)
        # zip() stops at the shorter input, so without this guard a parser
        # that returned too few records (or none) would pass unnoticed.
        self.assertGreaterEqual(
            len(parsed_records), len(manually_parsed),
            msg="Fewer FASTA records parsed than expected."
        )
        for rec1, rec2 in zip(parsed_records, manually_parsed):
            self.assertEqual(
                rec1, rec2,
                msg="FASTA records not parsed as expected."
            )