Example #1
def test_bloom_c_2():  # simple one
    K = 4
    HT_SIZE = 10  # use 11
    N_HT1 = 1  # hashtable size = 11
    N_HT2 = 2  # hashtable size = 11,13

    # use only 1 hashtable, no bloom filter
    ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT1)
    ht1.count("AAAA")  # 00 00 00 00 = 0
    ht1.count("ACTG")  # 00 10 01 11 =
    assert ht1.n_unique_kmers() == 2
    ht1.count("AACG")  # 00 00 10 11 = 11  # collision  with 1st kmer
    assert ht1.n_unique_kmers() == 2
    ht1.count("AGAC")  # 00  11 00 10 # collision  with 2nd kmer
    assert ht1.n_unique_kmers() == 2

    # use two hashtables with 11,13
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT2)
    ht2.count("AAAA")  # 00 00 00 00 = 0

    ht2.count("ACTG")  # 00 10 01 11 = 2*16 +4 +3 = 39
    assert ht2.n_unique_kmers() == 2
    ht2.count("AACG")  # 00 00 10 11 = 11  # collision with only 1st kmer
    assert ht2.n_unique_kmers() == 3
    ht2.count("AGAC")  # 00  11 00 10  3*16 +2 = 50
    # collision with both 2nd and 3rd kmers

    assert ht2.n_unique_kmers() == 3
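
The bucket indices in the comments above follow from khmer's 2-bit base encoding (A=0, T=1, C=2, G=3) and a table size rounded up to the next prime, so HT_SIZE = 10 becomes 11: AAAA (0) and AACG (11) land in the same bin because 11 mod 11 = 0, while ACTG (39) and AGAC (50) collide because both are 6 mod 11. A small illustrative sketch of that arithmetic, assuming this encoding; the helper below is not part of the khmer API, and khmer itself hashes the canonical (smaller of forward and reverse-complement) value, which for these four k-mers is the forward one anyway.

TWOBIT = {'A': 0, 'T': 1, 'C': 2, 'G': 3}  # assumed forward-strand encoding

def kmer_to_int(kmer):
    # pack each base into 2 bits, most significant base first
    value = 0
    for base in kmer:
        value = (value << 2) | TWOBIT[base]
    return value

for kmer in ("AAAA", "ACTG", "AACG", "AGAC"):
    h = kmer_to_int(kmer)
    print("%s -> %d -> bin %d" % (kmer, h, h % 11))  # AAAA/AACG share bin 0, ACTG/AGAC share bin 6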
Example #2
    def test_random_20_a_succ_IV_save(self):
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        savefile_ht = utils.get_temp_filename('ht')
        savefile_tags = utils.get_temp_filename('tags')
        outfile = filename + utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        ht.save(savefile_ht)
        ht.save_tagset(savefile_tags)

        del ht
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)

        ht.load(savefile_ht)
        ht.load_tagset(savefile_tags)

        divvy = ht.divide_tags_into_subsets(1)
        divvy.append(0)

        subsets = []
        for i in range(len(divvy) - 1):
            x = ht.do_subset_partition(divvy[i], divvy[i + 1])
            subsets.append(x)

        for x in reversed(subsets):
            ht.merge_subset(x)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #3
    def test_random_20_a_succ_IV_save(self):
        ht = khmer.new_hashbits(20, 4**13+1)
        filename = os.path.join(thisdir, 'test-data/random-20-a.fa')
        savefile_ht = filename + '.ht'
        savefile_tags = filename + '.tags'
        outfile = filename + '.out'

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        ht.save(savefile_ht)
        ht.save_tagset(savefile_tags)

        del ht
        ht = khmer.new_hashbits(20, 4**13+1)

        ht.load(savefile_ht)
        ht.load_tagset(savefile_tags)
        
        divvy = ht.divide_tags_into_subsets(1)
        divvy.append(0)
        
        subsets = []
        for i in range(len(divvy) - 1):
            x = ht.do_subset_partition(divvy[i], divvy[i+1])
            subsets.append(x)

        for x in reversed(subsets):
            ht.merge_subset(x)
            
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #4
def count_overlap(K, HT_SIZE, N_HT, filename, filename2, file_result, file_curve):

    if file_curve !='N':
        count = 0
        for n, record in enumerate(screed.open(filename2)):
            count = count+1
        max_count = count/100
        file3 = open(file_curve,'w')
        
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0,seq_len+1-K):
            kmer = sequence[n:n+K]
            if (not ht.get(kmer)):
                n_unique+=1
            ht.count(kmer)
    print filename,'has been consumed.'
    fpr = (1 - math.exp(-float(n_unique) / HT_SIZE)) ** Z  # Z: number of hash functions, assumed to be defined at module level
    printout1 = "%s:\n# of unique kmers: %d\n# of occupied bin: %d\n" \
                "false positive rate: %f" % (filename, n_unique, ht.n_occupied(), fpr)
# consume second dataset
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    seq_count = 0
    for n, record in enumerate(screed.open(filename2)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0,seq_len+1-K):
            kmer = sequence[n:n+K]
            if (not ht2.get(kmer)):
                n_unique+=1
                if (ht.get(kmer)):
                    n_overlap+=1
            ht2.count(kmer)
        if file_curve !='N':
            seq_count = seq_count + 1
            if seq_count == max_count:
                #n_occu = ht2.n_occupied
                string = str(n_unique)+' '+str(n_overlap)+'\n'
                file3 = open(file_curve,'a')
                file3.write(string)
                file3.close()
                seq_count = 0
    print filename2,'has been consumed.'
    fpr = (1 - math.exp(-float(n_unique) / HT_SIZE)) ** Z
    printout2 = "%s:\n# of unique k-mers: %d\n# of occupied bin: %d\n" \
                "false positive rate: %f\n===============\n" \
                "# of overlap unique k-mers: %d\n" \
                % (filename2, n_unique, ht2.n_occupied(), fpr, n_overlap)
    file_result_object = open(file_result,'w')
    file_result_object.write(printout1)
    file_result_object.write(printout2)
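
The false positive rate computed above is the standard Bloom filter estimate: with n distinct k-mers hashed into HT_SIZE bins by Z hash functions, the FPR is roughly (1 - e^(-n/HT_SIZE))**Z. In khmer each of the N_HT tables actually has its own prime size, so this is only an approximation. A minimal self-contained sketch (the name bloom_fpr is ours, not khmer's):

import math

def bloom_fpr(n_unique, table_size, n_hashes):
    # approximate Bloom filter false positive rate:
    # (1 - e^(-n/m)) ** k for n items, m bins, k hash functions
    return (1.0 - math.exp(-float(n_unique) / table_size)) ** n_hashes

# e.g. ~3960 unique 20-mers in a 100000-bin table with 3 hash functions
print(bloom_fpr(3960, 100000, 3))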
Example #5
def test_count_within_radius_big():
    inpfile = utils.get_test_data('random-20-a.fa')
    ht = khmer.new_hashbits(20, 1e5, 4)

    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
    assert n == 3960

    ht = khmer.new_hashbits(21, 1e5, 4)
    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
    assert n == 39
Example #6
def test_count_within_radius_big():
    inpfile = os.path.join(thisdir, "test-data", "random-20-a.fa")
    ht = khmer.new_hashbits(20, 1e6, 4)

    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius("CGCAGGCTGGATTCTAGAGG", 1e6)
    assert n == 3960

    ht = khmer.new_hashbits(21, 1e6, 4)
    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius("CGCAGGCTGGATTCTAGAGGC", 1e6)
    assert n == 39
Example #7
def run_no_curve(K,HT_SIZE,N_HT,filename,filename2,file_result):    
    file_result_object = open(file_result,'w')
    
    
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    
    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0,seq_len+1-K):
            kmer = sequence[n:n+K]
            if (not ht.get(kmer)):
                n_unique+=1
            ht.count(kmer)
    print filename,'has been consumed.'        
    print '# of unique kmers:',n_unique
    print '# of occupied bin:',ht.n_occupied()
    printout = filename+":"+'\n'
    printout =  printout+'# of unique kmers:'+str(n_unique)+'\n'
    printout = printout + '# of occupied bin:'+str(ht.n_occupied())+'\n'
    
    
    
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0,seq_len+1-K):
            kmer = sequence[n:n+K]
            if (not ht2.get(kmer)):
                n_unique+=1
                if (ht.get(kmer)):
                    n_overlap+=1
            ht2.count(kmer)
            
    print filename2,'has been consumed.'        
    print '# of unique kmers:',n_unique
    print '# of occupied bin:',ht2.n_occupied()
    
    print n_overlap,'unique kmers appears in both ',filename,' and ',filename2
    
    
    printout = printout+filename2+":"+'\n'
    printout =  printout+'# of unique kmers:'+str(n_unique)+'\n'
    printout = printout + '# of occupied bin:'+str(ht2.n_occupied())+'\n'
    printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n'
    
    file_result_object.write(printout)
Example #8
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
Example #9
def main():
    parser = argparse.ArgumentParser(
        description="Annotate seqs with partitions.")

    parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('graphbase')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    K = args.ksize
    ht = khmer.new_hashbits(K, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    print 'loading partition map from:', partitionmap_file
    ht.load_partitionmap(partitionmap_file)

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        n = ht.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (n, infile)
        print 'partitions are in', outfile
Example #10
def main(dir1, dir2, n_threads):
    # detect all of the relevant partitionmap files
    subset_filenames = glob.glob(os.path.join(dir1, '*.pmap'))

    # create empty hashtable structure
    ht = khmer.new_hashbits(K, 1, 1)

    # put jobs on queue
    merge_queue = Queue.Queue()
    for filename in subset_filenames:
        merge_queue.put((ht, filename))

    print 'starting threads'

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=pull_pair, args=(merge_queue,))
        threads.append(t)
        t.start()

    # wait for threads
    for t in threads:
        t.join()

    # done!

    if merge_queue.qsize() == 1:
        ht, merge_file = merge_queue.get()
        print 'copying', merge_file
        shutil.copy(merge_file, os.path.join(dir2,
                                             os.path.basename(merge_file)))

    assert merge_queue.qsize() == 0
Example #11
def diff(ht, filename):
    genome = khmer.new_hashbits(K, 4**K, 1)

    found = 0
    not_found = 0

    for n, record in enumerate(screed.fasta.fasta_iter(open(filename))):
        read = record['sequence']
        name = record['name']

        if 'N' in read:
            continue

        if len(read) < K:
            continue

        seq_len = len(read)
        for n in range(0, seq_len + 1 - K):
            kmer = read[n:n + K]

            if not genome.get(kmer):
                genome.consume(kmer)

                if ht.get(kmer):
                    found += 1
                else:
                    not_found += 1

    return found, not_found
Example #12
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)
    ht.save_tagset('/tmp/goodversion-k32.tagset')

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    fp = open(outfile, 'wb')
    fp.write(data[:26])
    fp.close()

    # try loading it...
    try:
        ht.load_tagset(outfile)
        assert 0, "this test should fail"
    except IOError:
        pass
Example #13
def test_consume_absentfasta_with_reads_parser():
    presencetable = khmer.new_hashbits(31, 1, 1)
    try:
        presencetable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
Example #14
def main():
    parser = argparse.ArgumentParser(description="Merge pmap files.")

    parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase')
    args = parser.parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    K = args.ksize
    ht = khmer.new_hashbits(K, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        ht.merge_subset_from_disk(pmap_file)

    print 'saving merged to', output_file
    ht.save_partitionmap(output_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
Example #15
def main():
    parser = build_common_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)
    open(base + '.info', 'w').write('through end: %s' % filename)
Example #16
def main():
    ht = khmer.new_hashbits(K, 1, 1)

    x = [0] * 255
    y = [0] * 255

    ht.load_stop_tags(sys.argv[1])
    for n, record in enumerate(screed.open(sys.argv[2])):
        if n % 10000 == 0:
            sys.stderr.write('... %d\n' % n)

        s, p = ht.trim_on_stoptags(record.sequence)

        if len(s) == len(record.sequence):
            continue

        if p == 0:
            p = 31
        else:
            p += 1

        x[p] += 1
        y[len(record.sequence)] += 1

    for i, (n, m) in enumerate(zip(x, y)):
        if m:
            print '%d,%d,%d' % (i, n, m)
Example #17
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_file_status(partitionmap_file)
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
Example #18
def test_extract_unique_paths_2():
    kh = khmer.new_hashbits(10, 4, 4)

    kh.consume('ATGGAGAGAC')
    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    print x
    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
Example #19
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    for i in range(len(data)):
        fp = open(outfile, 'wb')
        fp.write(data[:i])
        fp.close()

        # try loading it...
        try:
            ht.load_tagset(outfile)
            assert 0, "this test should fail"
        except IOError as err:
            print str(err), i
Example #20
def main():
    parser = build_common_args()
    parser.add_argument("output_filename")
    parser.add_argument("input_filenames", nargs="+")

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print "Saving hashtable to %s" % base
    print "Loading kmers from sequences in %s" % repr(filenames)

    ###

    print "making hashtable"
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print "consuming input", filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print "mid-save", base
            ht.save(base)
            open(base + ".info", "w").write("through %s" % filename)

    print "saving", base
    ht.save(base)
    open(base + ".info", "w").write("through end: %s" % filename)
Example #21
def test_tag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 1e4  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two partitions;
    # with, it is one partition.
    ht.add_stop_tag('CCGAATATATAACAGCGACG')

    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = ht.do_subset_partition(0, 0)
    n, _ = ht.count_partitions()
    assert n == 99                       # reads only connected by traversal...

    n, _ = ht.subset_count_partitions(subset)
    assert n == 2                        # but need main to cross stoptags.

    ht.merge_subset(subset)

    n, _ = ht.count_partitions()         # ta-da!
    assert n == 1, n
Example #22
    def test_save_load_merge_nexist(self):
        ht = khmer.new_hashbits(20, 1)
        try:
            a = ht.load_subset_partitionmap('this does not exist')
            assert 0, "this should not succeed"
        except IOError as e:
            print str(e)
Example #23
def test__get_set_tag_density():
    ht = khmer.new_hashbits(32, 1, 1)

    orig = ht._get_tag_density()
    assert orig != 2
    ht._set_tag_density(2)
    assert ht._get_tag_density() == 2
Example #24
    def test_save_merge_from_disk_2(self):
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads // 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(divvy[0], divvy[1])
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(divvy[1], 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #25
    def test_save_merge_from_disk(self):
        ht = khmer.new_hashbits(20, 4 ** 4 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(a, b)
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(b, 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #26
def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.new_hashbits(32, 8e1, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()

    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

    test_tiny_real_partitions.runme = True
Example #27
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        htable.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
Example #28
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh
    
    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([ r.sequence for r in screed.open(outfile) ])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
Example #29
def main(filename):
    global ht

    basename = os.path.basename(filename)

    print 'input file to partition: %s' % filename
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print '--'

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    ht.consume_fasta(filename)

    counting = khmer.new_counting_hash(K, COUNTING_SIZE, N_HT)
    ht.traverse_from_reads(filename, 100, 5000, 5, counting)

    print 'saving stoptags binary'
    ht.save_stop_tags(basename + '.stoptags')
    print 'saving stoptags text'
    ht.print_stop_tags(basename + '.stoptags.txt')

    sys.exit(0)
Example #30
def test_save_load_merge_on_graph():
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print(divvy)
    assert len(divvy) == 3
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_partitionmap(outfile1)  # <-- this is different
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions        # combined.
Example #31
    def test_3_merge_013(self):
        ht = khmer.new_hashbits(20, 4**14 + 1)

        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        (a, b, c) = ht.divide_tags_into_subsets(1)

        x = ht.do_subset_partition(a, a)
        ht.merge_subset(x)

        y = ht.do_subset_partition(b, 0)
        ht.merge_subset(y)

        outfile = utils.get_temp_filename('out')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #32
def test_stop_traverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 1e4  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two partitions;
    # with, it is one partition.
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    ht.consume_fasta_and_tag(filename)  # DO NOT join reads across stoptags
    subset = ht.do_subset_partition(0, 0, True)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 2, n
Example #33
def test_notag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 1e4  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # connecting k-mer at the beginning/end of a read: breaks up into two.
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    ht.consume_fasta_and_tag_with_stoptags(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 2, n
Example #34
    def test_find_all_tags_kmersize(self):
        ht = khmer.new_hashbits(20, 4 ** 4 + 1)

        a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
        b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
        c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT"

        print ht.consume(a)
        try:
            ppi = ht.find_all_tags(c[:19])
            assert False, "should raise a ValueError for wrong k-mer size"
        except ValueError:
            pass

        try:
            ppi = ht.find_all_tags(c[:21])
            assert False, "should raise a ValueError for wrong k-mer size"
        except ValueError:
            pass
Example #35
    def test_random_20_a_succ_IV(self):
        ht = khmer.new_hashbits(20, 4**13 + 1)
        filename = utils.get_test_data('random-20-a.fa')
        outfile = utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)
        subsets = []

        divvy = ht.divide_tags_into_subsets(1)
        divvy.append(0)
        for i in range(len(divvy) - 1):
            x = ht.do_subset_partition(divvy[i], divvy[i + 1])
            subsets.append(x)

        for x in reversed(subsets):
            ht.merge_subset(x)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #36
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
Example #37
def main():
    global done, worker_count
    done = False
    worker_count = 0

    infile = sys.argv[1]
    outfile = infile + '.graphsize2'

    print 'creating ht'
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, 1)
    print 'eating fa', infile
    total_reads, n_consumed = ht.consume_fasta(infile)
    outfp = open(outfile, 'w')

    inqueue = Queue.Queue(50)
    outqueue = Queue.Queue(50)

    ## worker and writer threads
    for i in range(WORKER_THREADS):
        t = threading.Thread(target=process, args=(inqueue, outqueue, ht))
        worker_count += 1
        t.start()

    threading.Thread(target=write, args=(outqueue, outfp)).start()

    ### main thread
    x = []
    i = 0
    for n, record in enumerate(screed.fasta.fasta_iter(open(infile))):
        if n % 10000 == 0:
            print '...', n

        x.append(record)
        i += 1

        if i > GROUPSIZE:
            inqueue.put(x)
            x = []
            i = 0
    inqueue.put(x)

    done = True
Example #38
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-k', default=DEFAULT_K, type=int, help='k-mer size',
                        dest='ksize')
    parser.add_argument('stoptags_file')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    K = args.ksize

    stoptags = args.stoptags_file
    infiles = args.input_filenames

    print 'loading stop tags, with K', K
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #39
def test_save_load_tagset():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile)  # implicitly => clear_tags=True
    ht.save_tagset(outfile)

    # the load cleared the in-memory tags, so the re-saved tagset holds only
    # the single tag from the file (26 bytes); without clearing it would hold
    # both tags (34 bytes).

    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()
    assert len(data) == 26, len(data)
Example #40
def test_find_unpart_fail():
    filename = utils.get_test_data('random-20-a.odd.fa')
    filename2 = utils.get_test_data('random-20-a.odd.fa')  # <- switch to odd

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 49

    ht.find_unpart(filename2, True, False)
    n, _ = ht.count_partitions()
    assert n == 49, n  # only 49 sequences worth of tags
Example #41
def test_find_unpart_notraverse():
    filename = utils.get_test_data('random-20-a.odd.fa')
    filename2 = utils.get_test_data('random-20-a.even.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 49

    ht.find_unpart(filename2, False, False)  # <-- don't traverse
    n, _ = ht.count_partitions()
    assert n == 99, n  # all sequences disconnected
Example #42
    def test_save_merge_from_disk_file_not_exist(self):
        ht = khmer.new_hashbits(20, 4**4 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')

        # fail to create file... => failure expected

        try:
            ht.merge_subset_from_disk(outfile1)
            assert 0, "this should fail"
        except IOError as e:
            print str(e)
Example #43
def test_save_load_tagset_noclear():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile, False)  # clear_tags=False: keep existing tags
    ht.save_tagset(outfile)

    # because the existing tags were kept, the re-saved tagset holds both
    # tags (34 bytes); with clearing it would hold only the one tag from
    # the file (26 bytes).

    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()
    assert len(data) == 34, len(data)
Example #44
def main():
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)

    print n_unique
    print ht.n_occupied()
    print ht.n_unique_kmers()
Example #45
    def test_ordered_connect(self):
        ht = khmer.new_hashbits(20, 4 ** 4 + 1)

        a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
        b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
        c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT"

        print ht.consume(a)
        ppi = ht.find_all_tags(a[:20])
        pid = ht.assign_partition_id(ppi)
        assert pid == 0, pid

        print ht.consume(b)
        ppi = ht.find_all_tags(b[:20])
        pid = ht.assign_partition_id(ppi)
        assert pid == 0, pid

        print ht.consume(c)
        ppi = ht.find_all_tags(c[:20])
        pid = ht.assign_partition_id(ppi)
        assert pid == 2, pid
Example #46
def deg(filename, ht):
    kmers = khmer.new_hashbits(K, 4**K, 1)

    degs = {}

    for n, record in enumerate(screed.fasta.fasta_iter(open(filename))):
        read = record['sequence']
        name = record['name']

        if len(read) < K:
            continue

        if 'N' in read:
            continue

        get_all_kmers(ht, read[0:K], K, kmers, degs)
        n_occ = kmers.n_occupied()

    del kmers

    return n_occ, degs
Example #47
def test_output_partitions():
    filename = utils.get_test_data('test-output-partitions.fa')

    ht = khmer.new_hashbits(10, 1, 1)
    ht.set_partition_id('TTAGGACTGC', 2)
    ht.set_partition_id('TGCGTTTCAA', 3)
    ht.set_partition_id('ATACTGTAAA', 4)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert parts[0] == '2'
    assert parts[1] == '3'
    assert parts[2] == '4'
Example #48
def main():
    readsfile = sys.argv[1]
    contigfile = sys.argv[2]
    outfile = os.path.basename(readsfile) + '.sweep'
    if len(sys.argv) == 4:
        outfile = sys.argv[3]

    # create a hashbits data structure
    ht = khmer.new_hashbits(K, 1, 1)

    # tag every k-mer in the contigs
    ht._set_tag_density(0)

    # load contigs, connect into N partitions
    print 'loading contigs from', contigfile
    ht.consume_fasta_and_tag(contigfile)
    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    print 'outputting contig-partitioned reads to', outfile
    ht.output_partitions(readsfile, outfile, True)
Example #49
    def test_random_20_a_succ_III(self):
        ht = khmer.new_hashbits(20, 4**7 + 1)
        filename = utils.get_test_data('random-20-a.fa')
        outfile = utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads / 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)
        assert len(divvy) == 4, len(divvy)

        x = ht.do_subset_partition(divvy[0], divvy[2])
        y = ht.do_subset_partition(divvy[2], 0)

        ht._validate_subset_partitionmap(x)
        ht._validate_subset_partitionmap(y)

        ht.merge_subset(y)
        ht.merge_subset(x)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #50
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
Example #51
def test_badget():
    hbts = khmer.new_hashbits(6, 1e6, 1)

    dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"

    hbts.consume(dna)

    assert hbts.get("AGCTTT") == 1

    assert hbts.get("GATGAG") == 0

    try:
        hbts.get(b"AGCTT")
        assert 0, "this should fail"
    except ValueError as err:
        print(str(err))

    try:
        hbts.get(u"AGCTT")
        assert 0, "this should fail"
    except ValueError as err:
        print(str(err))
Example #52
def test_small_real_partitions():
    filename = utils.get_test_data('real-partition-small.fa')

    ht = khmer.new_hashbits(32, 2e2, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 6, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])
Example #53
def test_filter_sodd():
    K = 32
    HASHTABLE_SIZE = int(8e7)
    N_HT = 4
    MAX_SODD = 3

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    filename = utils.get_test_data('../../data/high-sodd.fa')

    ht.consume_fasta(filename)

    seq = "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGTTTTGCCAAAAATTTTTACAGTTAGAAATTATTCACAAAGTTGCACCGGAATTCGGTTACAAACGTCATTCTAACTAAT"
    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
    assert trim_seq == "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGT"

    seq = "ACAAAATTCCACATATAGTCATAATTGTGGGCAATTTTCGTCCCAAATTAGTTAGAATGACGTTTGTAACCGAATTCCGGTGCAACTTTGTGAATAATTTCTAACTGTAAAAAT"
    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
    assert trim_seq == "ACAAAATTCCACATATAGTCATAATTGTGGGCAATT"

    seq = "GCACGCAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG"
    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
    assert trim_seq == seq
Example #54
def main():
    K = 8
    n = 50
    add_kmers = 50
    total_kmers = n + add_kmers

    print "\"FPR\",\"LOWER\",\"AVG\",\"UPPER\""

    for p in [x / 200.0 + .01 for x in range(59)]:
        diam_lens = []
        for j in range(500):
            seq = gen_circ_chrom(n, K)
            m = calc_m(total_kmers, p)
            k = opt_ht(m, total_kmers)
            HT_SIZE = calc_ht_size(m, k)

            ht = khmer.new_hashbits(K, HT_SIZE, k)
            ht.consume(seq)

            for i in range(add_kmers):
                ht.consume(generate_read(K))

            real_kmers = get_real_kmers(seq, K)

            out_len = []
            # step one: find the "outbranch" lengths for each real k-mer
            for kmer in real_kmers:
                out_len.append(get_level(ht, kmer, real_kmers, K))

            # step two: find the shortest longest path using the info from step 1
            diam_lens.append(max(out_len))

        #avg = numpy.mean(diam_lens)
        #se = numpy.std(diam_lens) / numpy.sqrt(len(diam_lens))
        #lim = se * 1.96
        #print str(p) + "," + str(avg-lim) + "," + str(avg) + "," + str(avg+lim)
        low, med, upp = estimate_mean(diam_lens)
        print str(p) + "," + str(low) + "," + str(med) + "," + str(upp)
Example #55
def main(filename):
    global ht
    
    basename = os.path.basename(filename)

    print 'input file to partition: %s' % filename
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print '--'

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    counting = khmer.new_counting_hash(K, COUNTING_SIZE, N_HT)
    ht.consume_fasta_and_traverse(filename, 100, 500, 5, counting)

    print 'saving stoptags binary'
    ht.save_stop_tags(basename + '.stoptags')
    print 'saving stoptags text'
    ht.print_stop_tags(basename + '.stoptags.txt')

    sys.exit(0)
Example #56
def test_combine_pe():
    inpfile = utils.get_test_data('combine_parts_1.fa')
    ht = khmer.new_hashbits(32, 1, 1)

    ht.consume_partitioned_fasta(inpfile)
    assert ht.count_partitions() == (2, 0)

    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    pid1 = ht.get_partition_id(s1)

    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
    pid2 = ht.get_partition_id(s2)

    assert pid1 == 2
    assert pid2 == 80293

    ht.join_partitions(pid1, pid2)

    pid1 = ht.get_partition_id(s1)
    pid2 = ht.get_partition_id(s2)

    assert pid1 == pid2
    assert ht.count_partitions() == (1, 0)
Example #57
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
            ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3882
    assert ht2.n_unique_kmers() == 3960  # this number equals to n_unique
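
The gap between 3960 unique k-mers and 3882 occupied bins is what hash collisions within one table would predict: throwing n items uniformly into m bins leaves about m*(1 - (1 - 1/m)**n), roughly m*(1 - e^(-n/m)), bins occupied. A quick check, assuming the first table is sized to the next prime at or above 100000:

import math

m = 100003  # assumed: HT_SIZE = 100000 rounded up to the next prime
n = 3960    # unique 20-mers in random-20-a.fa
print(round(m * (1.0 - math.exp(-float(n) / m))))  # ~3883, close to the asserted 3882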
Example #58
def main():
    filename1 = sys.argv[1]
    filename2 = sys.argv[2]
    uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')

    kh = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    for n, record in enumerate(screed.open(filename1)):
        if n % 10000 == 0:
            print '...', filename1, n
        seq = record.sequence.upper().replace('N', 'A')
        kh.consume(seq)

    path_n = 0
    for n, record in enumerate(screed.open(filename2)):
        if n % 10000 == 0:
            print '...', filename2, n
        seq = record.sequence.upper().replace('N', 'A')
        paths = kh.extract_unique_paths(seq, UNIQUE_LEN, UNIQUE_F)
        kh.consume(seq)

        for path in paths:
            path_n += 1
            print >> uniq2, '>%s from:%s\n%s' % (path_n, record.name, path)
Example #59
def main():
    info('merge-stoptags.py')
    args = get_parser().parse_args()

    stdbase = args.stdbase

    # @RamRS: This might need some more work
    infiles = []
    for _ in glob.glob(stdbase + "*/*.stoptags"):
        if os.path.exists(_):
            check_input_files(_, False)
            infiles.append(_)

    check_space(infiles, False)
    ht = khmer.new_hashbits(args.ksize, 1, 1)
    for _ in infiles:
        print >> sys.stderr, 'loading stoptags %s' % _
        ht.load_stop_tags(_, 0)

    print >> sys.stderr, 'writing file merge.stoptags'
    ht.save_stop_tags('merge.stoptags')

    print >> sys.stderr, 'done!'
Example #60
def main(subset_filenames):
    print 'K', K
    print 'MIN SIZE', MIN_PARTITION_SIZE
    print '--'

    # create an empty hashtable & load in the tags
    ht = khmer.new_hashbits(32, 1, 1)
    tagmap = ht.new_tagmap()

    # find the maximum partition size for each tag, across all subsets
    for filename in subset_filenames:
        print 'maxifying:', filename
        subset = ht.load_subset_partitionmap(filename)
        ht.subset_maxify_partition_size(subset, tagmap)
        del subset
        gc.collect()

    # filter tags based on the max partition size to which they belong
    print 'discarding'
    ht.discard_tags(tagmap, MIN_PARTITION_SIZE)

    # finally, filter each subset filename and save.
    for filename in subset_filenames:
        print 'loading x 2', filename
        subset = ht.load_subset_partitionmap(filename)
        print 'filtering', filename
        ht.subset_filter_against_tags(subset, tagmap)

        dir = os.path.dirname(filename)
        new_filename = 'filtered_' + os.path.basename(filename)
        new_filename = os.path.join(dir, new_filename)

        print 'saving', new_filename
        ht.save_subset_partitionmap(subset, new_filename)

        del subset
        gc.collect()