示例#1
0
 def test_dump(self):
     # Round-trip check: dump a BWT to a file object, build a second BWT
     # from that same file object, and verify that `seq`, `sa` and
     # `alphabet` come back unchanged.
     s = 'ACGTACGTAGCAT'
     bwt = BWT(s)
     # The context manager guarantees the temporary file is closed even if
     # dump() or the reload raises; the original closed it by hand and would
     # leak the handle on an error.
     with TemporaryFile() as t:
         bwt.dump(t)
         # Rewind so the second BWT reads from the start of what was dumped.
         t.seek(0)
         # Empty sequence plus a file handle: presumably tells BWT to load
         # its state from the handle instead of indexing -- confirm in BWT.
         bwt2 = BWT('', t)
     assert_array_equal(bwt.seq, bwt2.seq)
     assert_array_equal(bwt.sa, bwt2.sa)
     self.assertEqual(bwt.alphabet, bwt2.alphabet)
示例#2
0
    def test_search(self):
        # Exercises BWT.search() on three inputs: a short hand-checked
        # string, a fixed ~1 kb sequence with known hit positions, and a
        # random sequence whose hit count is cross-checked against an
        # overlapping regex scan.

        # Case 1: short string, hits compared in the order search() yields them.
        short_text = 'ACGTACGTAGCAT'
        alphabet = list(set(short_text))
        short_index = BWT(short_text)
        short_hits = list(short_index.search('ACGT'))
        self.assertEqual(short_hits, [0, 4])
        del short_text, short_index

        # Case 2: fixed longer sequence; hit positions are compared after sorting.
        long_text = (
            'TTCGGCAGTCACCGCGGTTTTCTCGAACGCTTAGGGATAATCGGATCATTCTCACGTAACAGGGTTACGGAGAGTGGAGTGGT'
            'TGCATGTTAGCCCGCTCTATGCTGGCACCGTGCGGCCACAAACTTATACGTTCTCAGCAGGTATTGTCTCCGGCAATGTTCTC'
            'TACTGAGGGATGCAATGACATTACGCCACTTATCATTTTAGAAATGCGAGCTTCGAGGGCTGGTGCCGACAGGGCCCTAGCTT'
            'CGCGCTGCTACACTCTCTAGTCACTAGACACATCTCCATGGGGGAGATAATCTTCGTTTTCCGAGCATGAAACGTACCGGTTA'
            'CACCCATCATAACGGTGAGAGTTAGTGTGGTTTTTCGGACCAACGGACTGCTGGGTTGCGTGGTTCATTGGTCCTTGACGACG'
            'AGCAATAGCATGACGCTTTAATAAATCGTTCTACGTTGGGAGGTAACGCGGAATCCCAAGGCCCTCGACATCGTCCTCTCCAT'
            'ACGAGACACGAAACATTTATATCACTTCGGGCCATTATGCATATCAGTTGGCTGGTTCCTCTTGACGTTAAAATAGGTGGGAA'
            'GTTATTCGATACCAACGTCTGCAGGTACCGAATAGTGCACGGCGACACACCACAGGGATCTATTAATAAACCTGGTGATGTGA'
            'TTGGTCCGACTTCGACTAACAACGATTCGACCAGTCTTAATTCTGATCTCGTGACCCGTGTCCTTATTCCATCTAATGAAATT'
            'CCTGTGGCGATCGGGCACTTCCCGGCGTAAAGTAACCTCCGGACGGCCTAGATTCTATTCTGAAGCTCGCTCGTTTGGAACTT'
            'GGGGCGCTGAGTCATTACGGGCGCGGTTCTACCATGCACAGCAATATTACACCACTCCTCCCACAGAATCTTCCGACGTGAAG'
            'GAATGCCCGCAGACAGACATGCTGGTGAAACTGCACCACGACCTTTCGCAACCAGCGCCGGGCGAAAGTCAGTTCAGTCTGCC'
            'GGAC'
        )
        long_index = BWT(long_text)
        long_hits = sorted(long_index.search('ACGT'))
        self.assertEqual(long_hits, [53, 130, 320, 447, 562, 595, 905])
        del long_text, long_index

        # Case 3: random 1000-character haystack and random 100-character
        # needle drawn from the alphabet of the first string. The zero-width
        # lookahead makes the regex count overlapping occurrences.
        haystack = ''.join(choice(alphabet) for _ in xrange(1000))
        random_index = BWT(haystack)
        needle = ''.join(choice(alphabet) for _ in xrange(100))
        regex_count = len(re.findall('(?=' + needle + ')', haystack))
        search_count = len(list(random_index.search(needle)))
        self.assertEqual(regex_count, search_count)
示例#3
0
if __name__ == '__main__':

    #Parse some arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('command', help="Command [index, search]")
    parser.add_argument('files', metavar='file', nargs='+', help='files to index/search (FASTA)')
    parser.add_argument('--fastq', help='fastq file containing reads to map')
    parser.add_argument('--fasta', help='fasta file containing reads to map')
    parser.add_argument('--ed', type=int, help='maximum edit distance to search for')
    args = parser.parse_args()
    print args

    #Index goes through all the input files, creates an FM-index for each, and dumps it to a JSON file
    if args.command == 'index':
        for fasta in args.files:
            bwt = BWT(SeqIO.read(fasta, 'fasta').seq.tostring())
            with open(fasta + '.index', 'w') as out:
                bwt.dump(out)
            del bwt

    #For the meat:
    elif args.command == 'search':
        #Pull all of our genomes into memory
        indices = {}
        #Try to grab existing indices, otherwise create them
        for fasta in args.files:
            try:
                with open(fasta + '.index') as f:
                    indices[fasta] = BWT('', f)
            except IOError:
                indices[fasta] = BWT(SeqIO.read(fasta, 'fasta').seq.tostring())