import khmer
from nose.tools import eq_


def test_alignerrorregion():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "AAAAAGTTCGAAAAAGGCACG"
    aligner = khmer.new_readaligner(ch, 1, 20, 11)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACTATTAAAAAAGTTCGAAAAAGGCACGGG")

    graphAlign, readAlign = aligner.align(read)

    assert readAlign == ''
    assert graphAlign == ''

def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")

    graphAlign, readAlign = aligner.align(read)

    # should be the same
    assert readAlign == 'ACCTAGGTTCGACATGTACC'
    assert graphAlign == 'ACCTAGGTTCGACATGTACC'

def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch, 0, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")

    score, graphAlign, readAlign, trunc = aligner.align(read)

    # should be the same
    eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
    eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')

def test_readalign_new():
    ch = khmer.new_counting_hash(32, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 0)
    for seq in ht_seqs:
        ch.consume(seq)

    for query in queries:
        score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
        print graphAlign
        print readAlign
        eq_(graphAlign, query["graph_aln"])
        eq_(readAlign, query["read_aln"])
        eq_(trunc, query["truncated"])

def test_readalign():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 20)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    read = "ACCTAGGTTCGACATGTACC"
    # ^^ ^ ^
    ch.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    graphAlign, readAlign = aligner.align(read)

    assert readAlign == 'ACCTAGGTTCGACATGTACC'
    assert graphAlign == 'AGCTAGGTTCGACAAGT-CC', graphAlign

def test_readalign():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    read = "ACCTAGGTTCGACATGTACC"
    # ^^ ^ ^
    ch.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    score, graphAlign, readAlign, trunc = aligner.align(read)

    eq_(readAlign, 'ACCTAGGTTCGACATGTACc')
    eq_(graphAlign, 'AGCTAGGTTCGACAAGTCC-')

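# Taken together, the two generations of these tests document the aligner's
# API change: the original align() returned a (graph_alignment,
# read_alignment) pair, with empty strings on failure, while the newer one
# returns (score, graph_alignment, read_alignment, truncated) and apparently
# marks untrusted bases in lowercase. Below is a minimal sketch of the newer
# workflow, assuming the four-tuple API and the constructor arguments used in
# the tests above (trusted coverage cutoff, then bits theta):

import khmer

ch = khmer.new_counting_hash(10, 1048576, 1)  # k, table size, n_tables
for i in range(20):  # accumulate trusted coverage in the graph
    ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

aligner = khmer.new_readaligner(ch, 1, 1.0)  # trusted cutoff, bits theta
score, graph_aln, read_aln, truncated = aligner.align("ACCTAGGTTCGACATGTACC")
if not truncated:
    corrected = graph_aln.replace('-', '')  # graph-implied sequence
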
def process_fn(record):
    # read_aligner is probably not threadsafe?
    aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

    name = record['name']
    seq = record['sequence']
    seq = seq.replace('N', 'A')

    grXreAlign, reXgrAlign = aligner.align(seq)

    if len(reXgrAlign) > 0:
        graph_seq = grXreAlign.replace('-', '')
        seq = graph_seq

    return name, seq

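# process_fn follows the record-in, (name, sequence)-out convention used by
# khmer's threaded filter scripts, which is presumably why the aligner is
# rebuilt on every call rather than shared across threads. A plausible driver
# sketch, assuming ht, C, max_error_region and the filenames are defined by
# the surrounding script and that the khmer.thread_utils API of this era is
# available:

from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader

# input_filename / output_filename assumed defined by the surrounding script
outfp = open(output_filename, 'w')
tsp = ThreadedSequenceProcessor(process_fn)       # fan records out to workers
tsp.start(verbose_loader(input_filename), outfp)  # stream reads, write output
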
import sys

import screed
import khmer
# build_counting_args and add_loadhash_args come from khmer's shared
# script-argument helpers


def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >> sys.stderr, 'file with ht: %s' % counting_ht
    print >> sys.stderr, 'loading hashtable'

    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    # counting hash, trusted k-mer coverage cutoff, bits theta (threshold
    # value for terminating unproductive alignments)
    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta)

    # the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):
            name = record['name']
            seq = record['sequence'].upper()
            print >> sys.stderr, name
            print >> sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = \
                aligner.align(seq)
            print >> sys.stderr, score
            print >> sys.stderr, graph_alignment
            print >> sys.stderr, read_alignment
            print >> sys.stderr, truncated
            print ">{0}\n{1}".format(name, graph_alignment)

import sys

import screed
import khmer


def main():
    hash_filename = sys.argv[1]
    input_filename = sys.argv[2]
    output_filename = sys.argv[3]
    max_error_region = int(sys.argv[4])

    C = 20
    corrected = 0
    uncorrected = 0

    outfp = open(output_filename, 'w')

    ht = khmer.load_counting_hash(hash_filename)
    aligner = khmer.new_readaligner(ht, 1, C, max_error_region)
    K = ht.ksize()

    for n, record in enumerate(screed.open(input_filename)):
        if n % 1000 == 0:
            print n

        seq = record.sequence
        seq_name = record.name
        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            corrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, graph_seq))
        else:
            uncorrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, seq))

    print 'corrected', corrected
    print 'uncorrected', uncorrected

    outfp.close()

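# Note how correction works with the older two-string align() API: a
# non-empty reXgrAlign signals a successful alignment, and deleting the gap
# characters from grXreAlign leaves the sequence the graph implies. A worked
# example with hypothetical alignment strings:

grXreAlign = "AGCTAGGTTCGACAAGT-CC"      # graph side; '-' marks a gap
reXgrAlign = "ACCTAGGTTCGACATGTACC"      # read side
graph_seq = grXreAlign.replace('-', '')  # -> "AGCTAGGTTCGACAAGTCC"
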
import os
import sys

import screed
import khmer
# build_counting_args and DEFAULT_MINIMUM_COVERAGE come from khmer's shared
# script-argument helpers


def main():
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument("--bits-theta",
                        help="Tuning parameter controlling trade-off of "
                             "speed vs alignment sensitivity",
                        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % \
            (args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            # note: the alignment uses the raw sequence, not the
            # N-replaced seq
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                # rebuild the sequence from the alignment, taking the
                # read's base wherever the graph alignment has a gap
                graph_seq = ""
                for i in range(len(graph_alignment)):
                    if graph_alignment[i] == "-":
                        graph_seq += read_alignment[i]
                    else:
                        graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                # if mincount < DESIRED_COVERAGE:
                #     keep = True
                #     seq = graph_seq
                # else:
                #     assert not keep

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread: {6}\ngraph_aln: {1}\n"
                    "read_aln: {2}\nstored_seq:{5}\n".format(
                        score, graph_alignment, read_alignment, truncated,
                        keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, \
            'of', total, 'or', \
            int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)

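# The inline loop above that splices read bases into graph gaps recurs in
# several of these scripts. Factored out as a helper (recover_sequence is a
# hypothetical name; the scripts themselves inline this logic), it is simply:

def recover_sequence(graph_aln, read_aln):
    # take the graph's base at each column, falling back to the read's
    # base wherever the graph alignment has a gap ('-')
    bases = []
    for g, r in zip(graph_aln, read_aln):
        bases.append(r if g == '-' else g)
    return ''.join(bases)
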
import argparse
import os
import shutil
import sys
import tempfile

import screed
import khmer
# DEFAULT_K, DEFAULT_N_HT, DEFAULT_MIN_HASHSIZE, DEFAULT_CUTOFF,
# DEFAULT_NORMALIZE_LIMIT and output_single() are defined elsewhere in this
# script (a sketch of output_single follows below)


def main():
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize', '-k', type=int, dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float,
                        default=1.0)
    parser.add_argument('--normalize-to', '-Z', type=int,
                        dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('--tempdir', '-T', type=str, dest='tempdir',
                        default='./')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    NORMALIZE_LIMIT = args.normalize_to

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.bits_theta)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print 'created temporary directory %s; use -T to change location' % \
        tempdir

    save_pass2 = 0
    n_aligned = 0
    n_corrected = 0
    total_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        corrfilename = os.path.basename(filename) + '.corr'

        pass2list.append((filename, pass2filename, corrfilename))

        pass2fp = open(pass2filename, 'w')
        corrfp = open(corrfilename, 'w')

        for n, read in enumerate(screed.open(filename)):
            total_reads += 1

            if n % 10000 == 0:
                print '...', n, filename, n_aligned, n_corrected, \
                    save_pass2, total_reads

            seq = read.sequence.replace('N', 'A')

            # build the alignment; note this aligns the raw sequence,
            # not the N-replaced seq
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            # next, decide whether or not to keep it.
            output_corrected = False
            if not truncated:
                n_aligned += 1

                # build a better sequence -- this is the corrected one.
                graph_seq = graph_alignment.replace("-", "")

                corrected = graph_seq
                if graph_seq != read.sequence:
                    n_corrected += 1

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    output_corrected = True

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if output_corrected:
                corrfp.write(output_single(read, corrected))
            else:                           # uncorrected...
                ht.consume(read.sequence)
                pass2fp.write(output_single(read, read.sequence))
                save_pass2 += 1

        pass2fp.close()
        corrfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
            (filename, save_pass2, n, filename)
        print 'aligned %d of %d reads so far' % (n_aligned, total_reads)
        print 'changed %d of %d reads so far' % (n_corrected, total_reads)

    for orig_filename, pass2filename, corrfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
            pass2filename

        corrfp = open(corrfilename, 'a')

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print '... x 2', n, pass2filename, n_aligned, \
                    n_corrected, total_reads

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            if truncated:               # no good alignment; output original
                corrected = read.sequence
            else:
                n_aligned += 1
                # build a better sequence -- this is the corrected one.
                graph_seq = graph_alignment.replace("-", "")

                corrected = graph_seq
                if corrected != read.sequence:
                    n_corrected += 1

            corrfp.write(output_single(read, corrected))

        corrfp.close()
        print 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    print 'Aligned %d of %d total' % (n_aligned, total_reads)
    print 'Changed %d of %d total' % (n_corrected, total_reads)

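# The two-pass design defers reads whose graph neighborhood is still below
# the --normalize-to saturation cutoff: they are consumed into the table and
# retried once the whole input has been seen. output_single() is called but
# not defined in this excerpt; a hypothetical reconstruction is sketched
# below, assuming it emits FASTQ when the screed record carries per-base
# quality (attribute name 'accuracy' is an assumption) and FASTA otherwise.

def output_single(read, new_sequence):
    # hypothetical sketch: FASTQ if quality info survives, else FASTA
    if hasattr(read, 'accuracy') and read.accuracy:
        return "@%s\n%s\n+\n%s\n" % (read.name, new_sequence,
                                     read.accuracy[:len(new_sequence)])
    return ">%s\n%s\n" % (read.name, new_sequence)
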