def main(): parser = argparse.ArgumentParser(description="Get reads coverage matrix") parser.add_argument('hashname1') parser.add_argument('hashname2') parser.add_argument('hashname3') parser.add_argument('file1') parser.add_argument('file2') parser.add_argument('file3') parser.add_argument('output') args = parser.parse_args() hashname1 = args.hashname1 hashname2 = args.hashname2 hashname3 = args.hashname3 output = args.output file1 = args.file1 file2 = args.file2 file3 = args.file3 outfp = open(output, 'w') print 'hashtable from', hashname1 ht1 = khmer.load_counting_hash(hashname1) print 'hashtable from', hashname2 ht2 = khmer.load_counting_hash(hashname2) print 'hashtable from', hashname3 ht3 = khmer.load_counting_hash(hashname3) matrix = {} set_x = set() set_y = set() set_z = set() for file_n in [file1,file2,file3]: print 'reading reads file ',file_n for n, record in enumerate(screed.open(file_n)): if n > 0 and n % 100000 == 0:#100000 print '...', n, file_n seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) set_x.add(med1) med2, _, _ = ht2.get_median_count(seq) set_y.add(med2) med3, _, _ = ht3.get_median_count(seq) set_z.add(med3) key = str(med1)+'-'+str(med2)+'-'+str(med3) matrix[key] = matrix.get(key,0) + 1 for x in range(max(list(set_x))): for y in range(max(list(set_y))): for z in range(max(list(set_z))): to_print = str(x)+'-'+str(y)+' '+ str(z)+ ' ' +\ str(matrix.get(str(x)+'-'+str(y)+'-'+str(z),0))+'\n' outfp.write(to_print) outfp.close()
def test_normalize_by_median_dumpfrequency():
    """Run normalize-by-median.py with ``-d 2`` over five copies of the test
    reads and check the ``backup.ht`` table and the reported backups."""
    cutoff = "1"

    first_input = utils.get_temp_filename("test-0.fq")
    work_dir = os.path.dirname(first_input)
    inputs = [first_input]
    inputs.extend(
        utils.get_temp_filename("test-{x}.fq".format(x=idx), tempdir=work_dir)
        for idx in range(1, 5))

    for path in inputs:
        shutil.copyfile(utils.get_test_data("test-fastq-reads.fq"), path)

    script = scriptpath("normalize-by-median.py")
    cmd_args = ["-d", "2", "-C", cutoff, "-k", "17"] + inputs
    _status, out, _err = runscript(script, cmd_args, work_dir)

    backup_path = os.path.join(work_dir, "backup.ht")
    backup_table = khmer.load_counting_hash(backup_path)

    read_a = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
    read_b = "TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA"
    # The first 17-mer of each known read must be present in the backup.
    assert backup_table.count(read_a[:17]) > 0
    assert backup_table.count(read_b[:17]) > 0

    assert os.path.exists(backup_path)
    assert out.count("Backup: Saving") == 2
    assert "Nothing" in out
def main():
    """Histogram the counting-hash abundance of every line of a sequence file.

    ``sys.argv[1]`` is the counting hash, ``sys.argv[2]`` a text file with one
    sequence per line and ``sys.argv[3]`` the figure name.

    * Sequences counted more than 1000 times go to ``<basename>.counts``.
    * ``<figure>.countshist`` gets one line per distinct count: the count, how
      many sequences had it, the running number of sequences and the running
      sum of ``count * sequences``.
    * ``hist``/``savefig`` (presumably pylab; they are imported elsewhere in
      the file) plot the counts and save the figure.
    """
    hashfile = sys.argv[1]
    filename = sys.argv[2]
    figure = sys.argv[3]

    ht = khmer.load_counting_hash(hashfile)

    counts = []
    d = {}
    # The original never closed any of its three file handles; context
    # managers flush and close them deterministically.
    with open(os.path.basename(filename) + '.counts', 'w') as outabund:
        with open(filename) as seqfp:
            for sequence in seqfp:
                sequence = sequence.strip()
                count = ht.get(sequence)
                counts.append(count)
                d[count] = d.get(count, 0) + 1

                if count > 1000:
                    print >> outabund, sequence, count

    with open(figure + '.countshist', 'w') as outfp:
        sofar = 0
        sofar_cumu = 0
        for k in sorted(d):
            sofar += d[k]
            sofar_cumu += k * d[k]
            print >> outfp, k, d[k], sofar, sofar_cumu

    hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000))
    savefig(figure)
def test_normalize_by_median_dumpfrequency():
    """Check that ``-d 2`` makes normalize-by-median.py write ``backup.ct``
    and print "Backup: Saving" exactly twice for five input files."""
    CUTOFF = '1'

    seed_file = utils.get_temp_filename('test-0.fq')
    in_dir = os.path.dirname(seed_file)
    infiles = [seed_file] + [
        utils.get_temp_filename('test-{x}.fq'.format(x=x), tempdir=in_dir)
        for x in range(1, 5)]

    # Every input is an identical copy of the shared FASTQ fixture.
    for target in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), target)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args += infiles
    _status, out, _err = utils.runscript(script, args, in_dir)

    backup = os.path.join(in_dir, 'backup.ct')
    test_ht = khmer.load_counting_hash(backup)

    known_reads = ('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT',
                   'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA')
    for read in known_reads:
        assert test_ht.count(read[:17]) > 0

    assert os.path.exists(backup)
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
def test_normalize_by_median_force():
    """Run normalize-by-median.py with ``-f`` on a corrupt and a good input
    and check the ``.ct.failed`` table plus the warnings on stderr."""
    cutoff = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))
    work_dir = os.path.dirname(good_infile)

    fixtures = (('test-error-reads.fq', corrupt_infile),
                ('test-fastq-reads.fq', good_infile))
    for fixture_name, destination in fixtures:
        shutil.copyfile(utils.get_test_data(fixture_name), destination)

    script = scriptpath('normalize-by-median.py')
    cmd_args = ['-f', '-C', cutoff, '-k', '17', corrupt_infile, good_infile]
    _status, _out, err = runscript(script, cmd_args, work_dir)

    failed_table_path = corrupt_infile + '.ct.failed'
    failed_table = khmer.load_counting_hash(failed_table_path)

    read_a = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    read_b = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert failed_table.count(read_a[:17]) > 0
    assert failed_table.count(read_b[:17]) > 0

    assert os.path.exists(failed_table_path)
    assert '*** Skipping' in err
    assert '** IOErrors' in err
def main(): counting_ht = sys.argv[1] infiles = sys.argv[2:] print 'file with ht: %s' % counting_ht print '-- settings:' print 'N THREADS', WORKER_THREADS print '--' print 'making hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.below' outfp = open(outfile, 'w') def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) if trim_at >= K: return name, trim_seq return None, None tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) tsp.start(verbose_fasta_iter(infile), outfp)
def main(): parser = argparse.ArgumentParser() parser.add_argument('hashname') parser.add_argument('datafiles', nargs='+') args = parser.parse_args() hashfile = args.hashname datafiles = args.datafiles print 'loading counting hash' ht = khmer.load_counting_hash(hashfile) print 'loaded.' for datafile in datafiles: print 'annotating', datafile outfile = os.path.basename(datafile) + '.kannot' outfp = open(outfile, 'w') for n, record in enumerate(screed.open(datafile)): if n % 1000 == 0: print '...', n med, _, _ = ht.get_median_count(record.sequence) outfp.write('>%s kmed=%d\n%s\n' % (record.name, med, record.sequence))
def test_normalize_by_median_dumpfrequency():
    """Exercise the ``-d`` (dump frequency) option of normalize-by-median.py:
    with five inputs and ``-d 2`` the output must mention two backups and
    ``backup.ct`` must contain k-mers from the test reads."""
    coverage_cutoff = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for number in (1, 2, 3, 4):
        temp_name = 'test-{x}.fq'.format(x=number)
        infiles.append(utils.get_temp_filename(temp_name, tempdir=in_dir))

    for infile in infiles:
        fixture = utils.get_test_data('test-fastq-reads.fq')
        shutil.copyfile(fixture, infile)

    script = scriptpath('normalize-by-median.py')
    options = ['-d', '2', '-C', coverage_cutoff, '-k', '17']
    _status, out, _err = runscript(script, options + infiles, in_dir)

    backup_file = os.path.join(in_dir, 'backup.ct')
    table = khmer.load_counting_hash(backup_file)

    good_read_1 = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    good_read_2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert table.count(good_read_1[:17]) > 0
    assert table.count(good_read_2[:17]) > 0

    assert os.path.exists(backup_file)
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
def main(): parser = argparse.ArgumentParser(description="Output k-mer abundance distribution.") parser.add_argument('hashname') parser.add_argument('seqfile') #parser.add_argument('histout') args = parser.parse_args() hashfile = args.hashname seqfile = args.seqfile #histout = args.histout fp = open(seqfile.split('.fa')[0] + '.cov.fa', 'w') print 'hashtable from', hashfile ht = khmer.load_counting_hash(hashfile) hist = {} for n, record in enumerate(screed.open(seqfile)): if n > 0 and n % 100000 == 0: print '...', n seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) print >>fp, '>%s_[cov=%f]' % (record.name, med) print >>fp, '%s' % record.sequence
def test_abund_dist_gz_bigcount():
    """Build a counting table with load-into-counting.py, gzip it, reload the
    compressed copy and require a non-zero abundance bin above 255."""
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    htfile = utils.get_temp_filename('test_ct')

    # create a bigcount table
    script = 'load-into-counting.py'
    script_args = ['-x', str(1e7), '-N', str(2), '-k', str(2),
                   htfile, infile]
    utils.runscript(script, script_args)
    assert os.path.exists(htfile)

    # compress the created bigcount table
    raw_table = open(htfile, 'rb').read()
    gz_handle = gzip.open(outfile, 'wb')
    gz_handle.write(raw_table)
    gz_handle.close()

    # load the compressed bigcount table
    try:
        counting_hash = khmer.load_counting_hash(outfile)
    except IOError as err:
        assert 0, 'Should not produce IOError: ' + str(err)

    # calculate abundance distribution for compressed bigcount table
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)

    # An abundance above 255 with a non-zero k-mer count shows that the
    # gzipped bigcount data was loaded correctly.
    found_bigcount = False
    for abundance, n_kmers in enumerate(abundances):
        print(abundance, n_kmers)
        if abundance > 255 and n_kmers > 0:
            found_bigcount = True
            break
    assert found_bigcount
def main(): info('count-median.py', ['diginorm']) args = get_parser().parse_args() htfile = args.ctfile input_filename = args.input output_filename = args.output infiles = [htfile, input_filename] for infile in infiles: check_file_status(infile) check_space(infiles) print 'loading k-mer counting table from', htfile htable = khmer.load_counting_hash(htfile) ksize = htable.ksize() print 'writing to', output_filename output = open(output_filename, 'w') for record in screed.open(input_filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') if ksize <= len(seq): medn, ave, stdev = htable.get_median_count(seq) print >> output, record.name, medn, ave, stdev, len(seq)
def main(): parser = argparse.ArgumentParser() parser.add_argument('table') parser.add_argument('ref') args = parser.parse_args() ct = khmer.load_counting_hash(args.table) aligner = khmer.ReadAligner(ct, 5, 1.0) for record in screed.open(args.ref): s = record.sequence s = s.replace('N', 'A') score, graph_alignment, read_alignment, truncated = \ aligner.align(s) assert not truncated g = graph_alignment.replace('-', '') r = read_alignment.replace('-', '') print record.name for kstart in range(0, len(g) - ct.ksize() + 1): kmer = g[kstart:kstart + ct.ksize()] print kstart, ct.get(kmer)
def test_normalize_by_median_force():
    """With ``-f``, normalize-by-median.py is expected to skip the corrupt
    input, report it on stderr and leave ``<corrupt>.ct.failed`` behind."""
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    shared_dir = os.path.dirname(corrupt_infile)
    good_infile = utils.get_temp_filename('test-good.fq', tempdir=shared_dir)
    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'),
                    corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'),
                    good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17']
    args.extend([corrupt_infile, good_infile])
    _status, _out, err = utils.runscript(script, args, in_dir)

    failed_path = corrupt_infile + '.ct.failed'
    test_ht = khmer.load_counting_hash(failed_path)

    for read in ('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT',
                 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'):
        assert test_ht.count(read[:17]) > 0

    assert os.path.exists(failed_path)
    assert '*** Skipping' in err
    assert '** IOErrors' in err
def main(): parser = argparse.ArgumentParser() parser.add_argument("hashname") parser.add_argument("datafiles", nargs="+") args = parser.parse_args() hashfile = args.hashname datafiles = args.datafiles print "loading counting hash" ht = khmer.load_counting_hash(hashfile) print "loaded." for datafile in datafiles: print "annotating", datafile outfile = os.path.basename(datafile) + ".kannot" outfp = open(outfile, "w") for n, record in enumerate(screed.open(datafile)): if n % 1000 == 0: print "...", n med, _, _ = ht.get_median_count(record.sequence) outfp.write(">%s kmed=%d\n%s\n" % (record.name, med, record.sequence))
def main(): parser = argparse.ArgumentParser() parser.add_argument('index') parser.add_argument('reads') args = parser.parse_args() print >>sys.stderr, "Loading graph & labels" cg = khmer.load_counting_hash(args.index + '.graph') lh = khmer._LabelHash(cg) lh.load_labels_and_tags(args.index + '.labels') fp = open(args.index + '.list', 'rb') names = load(fp) fp.close() print >>sys.stderr, 'loaded %d references' % (len(names),) aligner = khmer.ReadAligner(cg, 1, 1.0) # run through all the queries, align, and use alignments to look up # the label. for record in screed.open(args.reads): # build alignments against cg _, ga, ra, truncated = aligner.align(record.sequence) # now grab the tags associated with the alignment ga = ga.replace('-', '') labels = lh.sweep_label_neighborhood(ga) # retrieve the labels associated with the tags matches = set([ names[i] for i in labels ]) # print out the matches. print record.name, len(matches), ", ".join(matches)
def main(): parser = argparse.ArgumentParser( description="Output k-mer abundance distribution.") parser.add_argument('hashname') parser.add_argument('seqfile') parser.add_argument('histout') args = parser.parse_args() hashfile = args.hashname seqfile = args.seqfile histout = args.histout outfp = open(histout, 'w') print 'hashtable from', hashfile ht = khmer.load_counting_hash(hashfile) hist = {} for n, record in enumerate(screed.open(seqfile)): if n > 0 and n % 100000 == 0: print '...', n seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) hist[med] = hist.get(med, 0) + 1 maxk = max(hist.keys()) for i in range(maxk + 1): outfp.write('%d %d\n' % (i, hist.get(i, 0))) outfp.close()
def main(): parser = argparse.ArgumentParser( description='Count k-mers summary stats for sequences') parser.add_argument('htfile') parser.add_argument('input') parser.add_argument('output') args = parser.parse_args() htfile = args.htfile input_filename = args.input output_filename = args.output print 'loading counting hash from', htfile ht = khmer.load_counting_hash(htfile) K = ht.ksize() print 'writing to', output_filename output = open(output_filename, 'w') for record in screed.open(input_filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') if K <= len(seq): a, b, c = ht.get_median_count(seq) print >>output, record.name, a, b, c, len(seq)
def main():
    """Write a CSV row ``k-mer,count`` for each k-mer the first time it is
    seen in the input sequences.

    Counts come from the loaded counting table.  A second table (``tracking``)
    records which k-mers have already been written.  Output goes to
    ``args.output_file``, or stdout when that is None.
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    table_path = args.input_counting_table_filename
    print ('hashtable from', table_path, file=sys.stderr)
    counting_hash = khmer.load_counting_hash(table_path)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._Hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            n_windows = len(seq) - kmer_size + 1
            for start in range(n_windows):
                kmer = seq[start:start + kmer_size]
                if tracking.get(kmer):
                    # already reported
                    continue
                tracking.count(kmer)
                writer.writerow([kmer, str(counting_hash.get(kmer))])

    summary = 'Total number of unique k-mers: {0}'.format(
        counting_hash.n_unique_kmers())
    print (summary, file=sys.stderr)
def main():
    """Dump every distinct k-mer of the input sequences with its count.

    Each k-mer is written once, as a CSV row ``k-mer,count``, to
    ``args.output_file`` (stdout when it is None).  'N' bases are treated as
    'A'.  The table's unique k-mer total is reported on stderr at the end.
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_counting_table_filename,
          file=sys.stderr)
    counts = khmer.load_counting_hash(args.input_counting_table_filename)

    k = counts.ksize()
    # Second table used only to remember which k-mers were already emitted.
    seen = khmer._Hashbits(  # pylint: disable=protected-access
        k, counts.hashsizes())

    if args.output_file is None:
        args.output_file = sys.stdout
    rows = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            sequence = record.sequence.replace('N', 'A')
            for offset in range(len(sequence) - k + 1):
                word = sequence[offset:offset + k]
                if not seen.get(word):
                    seen.count(word)
                    rows.writerow([word, str(counts.get(word))])

    print('Total number of unique k-mers: {0}'.format(
        counts.n_unique_kmers()), file=sys.stderr)
def main(): parser = argparse.ArgumentParser(description="Reads coverage increase") parser.add_argument('hashname') parser.add_argument('output') parser.add_argument('input_filename') args = parser.parse_args() hashfile = args.hashname histout = args.output filename = args.input_filename print filename outfp = open(histout, 'w') print 'hashtable from', hashfile ht = khmer.load_counting_hash(hashfile) count = 0 for n, record in enumerate(screed.open(filename)): if n > 0 and n % 100000 == 0:#100000 print '...', n outfp.write('%d %d %f\n' % (n, count, float(count)/n)) seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med > 0: count = count + 1 outfp.close()
def main():
    """Look up every line of a sequence file in a counting hash and report.

    Usage: ``<counting hash> <sequence file> <figure name>``.

    Writes ``<basename of sequence file>.counts`` (sequences with a count
    above 1000) and ``<figure>.countshist`` (count, sequences with that
    count, running sequence total, running ``count * sequences`` total).
    It then calls ``hist``/``savefig`` (presumably pylab, imported elsewhere)
    to save the figure.
    """
    hashfile = sys.argv[1]
    filename = sys.argv[2]
    figure = sys.argv[3]

    ht = khmer.load_counting_hash(hashfile)

    counts = []
    d = {}

    # All file handles are now closed deterministically; the original left
    # the input and both outputs open.
    outabund = open(os.path.basename(filename) + '.counts', 'w')
    try:
        with open(filename) as sequences:
            for line in sequences:
                sequence = line.strip()
                count = ht.get(sequence)
                counts.append(count)
                d[count] = d.get(count, 0) + 1

                if count > 1000:
                    print >>outabund, sequence, count
    finally:
        outabund.close()

    with open(figure + '.countshist', 'w') as outfp:
        sofar = 0
        sofar_cumu = 0
        for k in sorted(d):
            sofar += d[k]
            sofar_cumu += k * d[k]
            print >>outfp, k, d[k], sofar, sofar_cumu

    hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000))
    savefig(figure)
def main():
    """Write the k-mer abundance histogram of a sequence file.

    Arguments come from ``get_parser()``.  An existing output file is only
    overwritten when ``args.squash_output`` is set; otherwise the script
    exits with status 1.  It also exits with status 1 when the distribution
    sums to zero.

    Each output line is ``abundance count cumulative_count
    cumulative_fraction``.  Zero-count abundances are skipped unless
    ``args.output_zero`` is set.  Writing stops once the cumulative count
    reaches the total.
    """
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()

    infiles = [
        args.input_counting_table_filename,
        args.input_sequence_filename
    ]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename, file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' %
              args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print(
            "ERROR: abundance distribution is uniformly zero; "
            "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    # Context manager: the histogram file was previously never closed.
    with open(args.output_histogram_filename, 'w') as hash_fp:
        sofar = 0
        for abundance, n_kmers in enumerate(abundances):
            if n_kmers == 0 and not args.output_zero:
                continue

            sofar += n_kmers
            frac = sofar / float(total)

            print(abundance, n_kmers, sofar, round(frac, 3), file=hash_fp)

            # Everything after this point would be zero-count bins.
            if sofar == total:
                break
def main(): parser = argparse.ArgumentParser( description="Output k-mer abundance distribution.") parser.add_argument('hashname') parser.add_argument('seqfile') #parser.add_argument('histout') args = parser.parse_args() hashfile = args.hashname seqfile = args.seqfile #histout = args.histout fp = open(seqfile.split('.fa')[0] + '.cov.fa', 'w') print 'hashtable from', hashfile ht = khmer.load_counting_hash(hashfile) hist = {} for n, record in enumerate(screed.open(seqfile)): if n > 0 and n % 100000 == 0: print '...', n seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) print >> fp, '>%s_[cov=%f]' % (record.name, med) print >> fp, '%s' % record.sequence
def test_abund_dist_gz_bigcount(): infile = utils.get_temp_filename('test.fa') shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile) outfile = utils.get_temp_filename('test_ct.gz') script = scriptpath('load-into-counting.py') htfile = utils.get_temp_filename('test_ct') args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile] utils.runscript(script, args) # create a bigcount table assert os.path.exists(htfile) data = open(htfile, 'rb').read() f_out = gzip.open(outfile, 'wb') # compress the created bigcount table f_out.write(data) f_out.close() # load the compressed bigcount table counting_hash = khmer.load_counting_hash(outfile) hashsizes = counting_hash.hashsizes() kmer_size = counting_hash.ksize() tracking = khmer._Hashbits(kmer_size, hashsizes) abundances = counting_hash.abundance_distribution(infile, tracking) # calculate abundance distribution for compressed bigcount table flag = False # check if abundance is > 255 # if ok gzipped bigcount was loaded correctly for _, i in enumerate(abundances): print _, i if _ > 255 and i > 0: flag = True break assert flag
def main(): parser = build_common_args() parser.add_argument('input_filenames', nargs='+') parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') args = parse_args(parser) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes DESIRED_COVERAGE = args.cutoff input_name_list = args.input_filenames if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in input_name_list: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename total += 1 if len(record.sequence) < K: continue seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med < DESIRED_COVERAGE: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += 1 print 'DONE with', input_filename, '; kept', total - discarded, 'of', \ total, 'or', int(100. - discarded / float(total) * 100.), '%' if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(os.path.basename(args.savehash))
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='coverage', default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage.") parser.add_argument('--max-error-region', '-M', dest='max_error_region', default=DEFAULT_MAX_ERROR_REGION, type=int, help="Max length of error region allowed") args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() C = args.coverage max_error_region = args.max_error_region print "K:", K print "C:", C print "max error region:", max_error_region # the filtering function. def process_fn(record): # read_aligner is probably not threadsafe? aligner = khmer.new_readaligner(ht, 1, C, max_error_region) name = record['name'] seq = record['sequence'] seq = seq.replace('N', 'A') grXreAlign, reXgrAlign = aligner.align(seq) if len(reXgrAlign) > 0: graph_seq = grXreAlign.replace('-', '') seq = graph_seq return name, seq # the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.corr' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main(): parser = build_counting_multifile_args() parser.add_argument( "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage." ) parser.add_argument( "--max-error-region", "-M", dest="max_error_region", default=DEFAULT_MAX_ERROR_REGION, type=int, help="Max length of error region allowed", ) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() C = args.coverage max_error_region = args.max_error_region print "K:", K print "C:", C print "max error region:", max_error_region # the filtering function. def process_fn(record): # read_aligner is probably not threadsafe? aligner = khmer.new_readaligner(ht, 1, C, max_error_region) name = record["name"] seq = record["sequence"] seq = seq.replace("N", "A") grXreAlign, reXgrAlign = aligner.align(seq) if len(reXgrAlign) > 0: graph_seq = grXreAlign.replace("-", "") seq = graph_seq return name, seq # the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".corr" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main(): parser = build_counting_multifile_args() parser.add_argument( "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance." ) parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False) parser.add_argument( "--normalize-to", "-Z", type=int, dest="normalize_to", help="base variable-coverage cutoff on this median k-mer abundance", default=DEFAULT_NORMALIZE_LIMIT, ) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = ht.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main(): parser = argparse.ArgumentParser() parser.add_argument('readsfile') parser.add_argument('samposfile') parser.add_argument('khfile') parser.add_argument('-V', '--variable', default=False, action='store_true') args = parser.parse_args() print >> sys.stderr, 'loading posdict' ignore_set = set() posdict = dict(read_pos_file(args.samposfile, ignore_set)) print >> sys.stderr, 'loading kh' kh = khmer.load_counting_hash(args.khfile) K = kh.ksize() count = 0 for record in screed.open(args.readsfile): if record.name in posdict: posns2 = posdict[record.name] seq = record.sequence.replace('N', 'A') posns1 = kh.find_spectral_error_positions(seq, CUTOFF) posns1 = add_n_posns(posns1, record.sequence) if posns1 != posns2: count += 1 print record.name, posns1, posns2 sys.stdout.write(record.sequence) sys.stdout.write('\n') for i in range(len(seq)): if i in posns1: sys.stdout.write('X') else: sys.stdout.write(' ') sys.stdout.write('\n') for i in range(len(seq)): if i in posns2: sys.stdout.write('Z') else: sys.stdout.write(' ') sys.stdout.write('\n') for i in range(len(seq) - K + 1): if kh.get(seq[i:i + K]) < CUTOFF: sys.stdout.write('*') else: sys.stdout.write(' ') sys.stdout.write('\n') print '' if count > 1000: break
def main():
    """Compute and save the k-mer abundance distribution of a sequence file.

    Input/output names and flags come from ``get_parser()``.  It exits with
    status 1 if the output exists and ``args.squash_output`` is not set, or
    if the distribution sums to zero.

    Output columns: abundance, number of k-mers with that abundance, running
    total, running fraction rounded to 3 places.  Zero rows are omitted
    unless ``args.output_zero`` is set.  Output ends when the running total
    equals the grand total.
    """
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    table_name = args.input_counting_table_filename
    hist_name = args.output_histogram_filename

    infiles = [table_name, args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', table_name)
    counting_hash = khmer.load_counting_hash(table_name)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', hist_name)

    if os.path.exists(hist_name):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' % hist_name,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % hist_name)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    hash_fp = open(hist_name, 'w')
    # try/finally so the histogram file is always closed; it used to be left
    # open.
    try:
        sofar = 0
        for abundance, n_kmers in enumerate(abundances):
            if n_kmers == 0 and not args.output_zero:
                continue

            sofar += n_kmers
            frac = sofar / float(total)

            print(abundance, n_kmers, sofar, round(frac, 3), file=hash_fp)

            if sofar == total:
                break
    finally:
        hash_fp.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('index') parser.add_argument('reads') args = parser.parse_args() print >>sys.stderr, "Loading graph & labels" cg = khmer.load_counting_hash(args.index + '.graph') lh = khmer._LabelHash(cg) lh.load_labels_and_tags(args.index + '.labels') fp = open(args.index + '.list', 'rb') names = load(fp) fp.close() print >>sys.stderr, 'loaded %d references' % (len(names),) aligner = khmer.ReadAligner(cg, 1, 1.0) counts = {} for k in names: counts[k] = 0 # run through all the queries, align, and use alignments to look up # the label. for n, record in enumerate(screed.open(args.reads)): if n % 1000 == 0: print >>sys.stderr, '...', n # build alignments against cg seq = record.sequence.replace('N', 'A') _, ga, ra, truncated = aligner.align(seq) if len(ga) < 0.8 * len(seq): continue # now grab the tags associated with the alignment ga = ga.replace('-', '') tags = lh.sweep_tag_neighborhood(ga) if not tags: continue intersect = set(lh.get_tag_labels(tags[0])) for tag in tags[1:]: intersect.intersection_update(lh.get_tag_labels(tag)) if not intersect: # ignore confused reads continue # retrieve the labels associated with the tags matches = list(set([ names[i] for i in intersect ])) hit = random.choice(matches) counts[hit] += 1 for k, v in counts.iteritems(): if v: print k, v
def main(): parser = argparse.ArgumentParser() parser.add_argument('readsfile') parser.add_argument('samposfile') parser.add_argument('khfile') parser.add_argument('-V', '--variable', default=False, action='store_true') args = parser.parse_args() print >>sys.stderr, 'loading posdict' ignore_set = set() posdict = dict(read_pos_file(args.samposfile, ignore_set)) print >>sys.stderr, 'loading kh' kh = khmer.load_counting_hash(args.khfile) K = kh.ksize() count = 0 for record in screed.open(args.readsfile): if record.name in posdict: posns2 = posdict[record.name] seq = record.sequence.replace('N', 'A') posns1 = kh.find_spectral_error_positions(seq, CUTOFF) posns1 = add_n_posns(posns1, record.sequence) if posns1 != posns2: count += 1 print record.name, posns1, posns2 sys.stdout.write(record.sequence) sys.stdout.write('\n') for i in range(len(seq)): if i in posns1: sys.stdout.write('X') else: sys.stdout.write(' ') sys.stdout.write('\n') for i in range(len(seq)): if i in posns2: sys.stdout.write('Z') else: sys.stdout.write(' ') sys.stdout.write('\n') for i in range(len(seq) - K + 1): if kh.get(seq[i:i+K]) < CUTOFF: sys.stdout.write('*') else: sys.stdout.write(' ') sys.stdout.write('\n') print '' if count > 1000: break
def main(): args = get_parser().parse_args() # reads counting table ct_reads = khmer.load_counting_hash(args.ct_reads) # transcripts counting table ct_exon = khmer.load_counting_hash(args.ct_exon) # transcripts themselves transcripts = args.transcripts K = ct_reads.ksize() assert ct_exon.ksize() == K # build a read aligner against, well, the reads: aligner = khmer.ReadAligner(ct_reads, 1, 1.0) # pick up a list of sequences to pay attention to searchlist = set([ x.strip() for x in open('seq-profiles.list') ]) # run through the transcripts. for record in screed.open(transcripts): if record.name.split(' ')[0] not in searchlist: continue print 'found!', record.name.split(' ')[0] counts = [] # not norm by exon count counts2 = [] # norm by exon count seq = record.sequence.replace('N', 'A') for kmer in kmers(seq, K): exon_count = ct_exon.get(kmer) if exon_count: count = ct_reads.get(kmer) counts.append(count) counts2.append(count / float(exon_count)) filename = record.name.split(' ')[0] + '.kprofile' fp = open(filename, 'w') for n, (c1, c2) in enumerate(zip(counts, counts2)): print >>fp, n, c1, c2
def main(): parser = argparse.ArgumentParser(description="Get reads coverage matrix") parser.add_argument('hashname1') parser.add_argument('hashname2') parser.add_argument('hashname3') parser.add_argument('file1') parser.add_argument('file2') parser.add_argument('file3') parser.add_argument('output') args = parser.parse_args() hashname1 = args.hashname1 hashname2 = args.hashname2 hashname3 = args.hashname3 output = args.output file1 = args.file1 file2 = args.file2 file3 = args.file3 outfp = open(output, 'w') print 'hashtable from', hashname1 ht1 = khmer.load_counting_hash(hashname1) print 'hashtable from', hashname2 ht2 = khmer.load_counting_hash(hashname2) print 'hashtable from', hashname3 ht3 = khmer.load_counting_hash(hashname3) matrix1 = {} matrix2 = {} matrix3 = {} # set_x = set() # set_y = set() for file_n in [file1, file2, file3]: print 'reading reads file ', file_n for n, record in enumerate(screed.open(file_n)): if n > 0 and n % 100000 == 0: #100000 print '...', n, file_n seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) med2, _, _ = ht2.get_median_count(seq) med3, _, _ = ht3.get_median_count(seq) to_print = str(med1) + ' ' + str(med2) + ' ' + str(med3) + '\n' outfp.write(to_print) outfp.close()
def main(): parser = argparse.ArgumentParser(description="Get reads coverage matrix") parser.add_argument('hashname1') parser.add_argument('hashname2') parser.add_argument('hashname3') parser.add_argument('file1') parser.add_argument('file2') parser.add_argument('file3') parser.add_argument('output') args = parser.parse_args() hashname1 = args.hashname1 hashname2 = args.hashname2 hashname3 = args.hashname3 output = args.output file1 = args.file1 file2 = args.file2 file3 = args.file3 outfp = open(output, 'w') print 'hashtable from', hashname1 ht1 = khmer.load_counting_hash(hashname1) print 'hashtable from', hashname2 ht2 = khmer.load_counting_hash(hashname2) print 'hashtable from', hashname3 ht3 = khmer.load_counting_hash(hashname3) matrix1 = {} matrix2 = {} matrix3 = {} # set_x = set() # set_y = set() for file_n in [file1,file2,file3]: print 'reading reads file ',file_n for n, record in enumerate(screed.open(file_n)): if n > 0 and n % 100000 == 0:#100000 print '...', n, file_n seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) med2, _, _ = ht2.get_median_count(seq) med3, _, _ = ht3.get_median_count(seq) to_print = str(med1)+' '+str(med2)+' ' +str(med3)+'\n' outfp.write(to_print) outfp.close()
def main(): parser = argparse.ArgumentParser(description="Output k-mer abundance distribution.") parser.add_argument('hashname') parser.add_argument('datafile') parser.add_argument('histout') parser.add_argument('-z', '--no-zero', dest='output_zero', default=True, action='store_false', help='Do not output 0-count bins') parser.add_argument('-s', '--squash', dest='squash_output', default=False, action='store_true', help='Overwrite output file if it exists') args = parser.parse_args() hashfile = args.hashname datafile = args.datafile histout = args.histout print 'hashtable from', hashfile ht = khmer.load_counting_hash(hashfile) K = ht.ksize() sizes = ht.hashsizes() tracking = khmer._new_hashbits(K, sizes) print 'K:', K print 'HT sizes:', sizes print 'outputting to', histout if os.path.exists(histout): if not args.squash_output: print >>sys.stderr, 'ERROR: %s exists; not squashing.' % histout sys.exit(-1) print '** squashing existing file %s' % histout print 'preparing hist...' z = ht.abundance_distribution(datafile, tracking) total = sum(z) fp = open(histout, 'w') sofar = 0 for n, i in enumerate(z): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) print >>fp, n, i, sofar, round(frac, 3) if sofar == total: break
def main(): parser = argparse.ArgumentParser() parser.add_argument('table') parser.add_argument('ref') parser.add_argument('--trusted', type=int, default=5) parser.add_argument('--variants-out', type=str, default='variants.txt', dest='variants_out') args = parser.parse_args() ct = khmer.load_counting_hash(args.table) aligner = khmer.ReadAligner(ct, args.trusted, 1.0) i = 0 for record in screed.open(args.ref): i += 1 if i > 50: break seq = record.sequence seq = seq.replace('N', 'A') try: score, alignment = align_long(ct, aligner, seq) except: traceback.print_exc() continue g = alignment.g r = alignment.r m, n = alignment.compare() print record.name, m, n, n - m, "%.3f%%" % (float(m)/ n * 100) for start in range(0, len(alignment), 60): print start print alignment[start:start+60] gidx = AlignmentIndex(alignment) fp = open(args.variants_out, 'w') for gi, a, b in alignment.variants(): kmer = '' pos = gi while len(kmer) < ct.ksize() and pos < len(alignment.g): ch = alignment.g[pos] pos += 1 if ch in '=-': continue kmer += ch if alignment.covs[gi]: print >>fp, gi, a, b, gidx.get_ri(gi), kmer, alignment.covs[gi] if 0: print len(seq), alignment.refseqlen() gidx._sanityCheck(seq)
def main():
    """Randomly down-sample reads by median count and filter by deviation.

    For every input file two files are written in the current directory:
    ``<basename>.medpctfilt.stats`` with ``med avg dev pct name`` for every
    read, and ``<basename>.medpctfilt`` with the reads that were kept (as
    ``>name`` / sequence pairs).  A read is dropped when a random integer in
    ``[1, med]`` exceeds ``--coverage`` or when ``dev / avg * 100`` exceeds
    100.
    """
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record, statsfp):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        # An average of zero used to raise ZeroDivisionError; report 0%.
        pct = dev / avg * 100 if avg else 0.0

        statsfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        # random.randint(1, 0) raises ValueError, so a zero median is never
        # down-sampled.  The random draw stays first so the sequence of
        # random numbers consumed is unchanged for all other reads.
        if (med > 0 and random.randint(1, med) > args.coverage) or pct > 100:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        statsfile = os.path.basename(infile) + '.medpctfilt.stats'
        outfile = os.path.basename(infile) + '.medpctfilt'

        # both files are closed before the next input is started
        with open(statsfile, 'w') as statsfp, open(outfile, 'w') as outfp:
            for n, record in enumerate(screed.open(infile)):
                if n % 100000 == 0:
                    print('...', n)

                name, seq = process_fn(record, statsfp)
                if name and seq:
                    print('>%s\n%s' % (name, seq), file=outfp)

        print('output in', outfile)
def main():
    """Keep only reads whose median count lies within the given bounds.

    Command line: ``[-m MIN] [-M MAX] input_counting_table input_readfile
    output_readfile``.  At least one bound is required.  Reads for which
    ``get_median_count`` raises ``ValueError`` are skipped.  Exits with
    status 1 when no bound is given or when MAX < MIN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--min-coverage', type=int, default=None)
    parser.add_argument('-M', '--max-coverage', type=int, default=None)
    parser.add_argument('input_counting_table')
    parser.add_argument('input_readfile')
    parser.add_argument('output_readfile')
    args = parser.parse_args()

    print >>sys.stderr, 'min_coverage: %s' % args.min_coverage
    print >>sys.stderr, 'max_coverage: %s' % args.max_coverage

    min_cov = args.min_coverage
    max_cov = args.max_coverage

    # Compare against None so that an explicit bound of 0 is honoured
    # instead of being mistaken for "not given".
    if min_cov is None and max_cov is None:
        print >>sys.stderr, \
            "neither min nor max coverage specified!? exiting!"
        sys.exit(1)

    if min_cov is not None and max_cov is not None and max_cov < min_cov:
        print >>sys.stderr, "min_coverage > max_coverage!? exiting!"
        sys.exit(1)

    htable = khmer.load_counting_hash(args.input_counting_table)

    n_kept = 0
    n_total = 0

    with open(args.output_readfile, 'w') as output_fp:
        for n, record in enumerate(screed.open(args.input_readfile)):
            if n % 100000 == 0:
                print >>sys.stderr, '...', n, n_kept

            # enumerate() is zero-based; the number of reads seen is n + 1
            n_total = n + 1

            seq = record.sequence.upper().replace('N', 'G')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            if min_cov is not None and med < min_cov:
                continue
            if max_cov is not None and med > max_cov:
                continue

            n_kept += 1
            output_fp.write(output_single(record))

    print >>sys.stderr, 'consumed %d reads; kept %d' % (n_total, n_kept)
def main(): parser = argparse.ArgumentParser(description="Get reads coverage matrix") parser.add_argument('hashname1') parser.add_argument('hashname2') parser.add_argument('file1') parser.add_argument('file2') parser.add_argument('output') args = parser.parse_args() hashname1 = args.hashname1 hashname2 = args.hashname2 output = args.output file1 = args.file1 file2 = args.file2 outfp = open(output, 'w') print 'hashtable from', hashname1 ht1 = khmer.load_counting_hash(hashname1) ht2 = khmer.load_counting_hash(hashname2) for n, record in enumerate(screed.open(file1)): if n > 0 and n % 100000 == 0:#100000 print '...', n, file1 seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) med2, _, _ = ht2.get_median_count(seq) to_print = record.name+' '+str(med1)+' '+str(med2)+'\n' outfp.write(to_print) for n, record in enumerate(screed.open(file2)): if n > 0 and n % 100000 == 0:#100000 print '...', n, file2 seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) med2, _, _ = ht2.get_median_count(seq) to_print = record.name+' '+str(med1)+' '+str(med2)+'\n' outfp.write(to_print) outfp.close()
def main(): parser = argparse.ArgumentParser(description="Get reads coverage matrix") parser.add_argument('hashname1') parser.add_argument('hashname2') parser.add_argument('file1') parser.add_argument('file2') parser.add_argument('output') args = parser.parse_args() hashname1 = args.hashname1 hashname2 = args.hashname2 output = args.output file1 = args.file1 file2 = args.file2 outfp = open(output, 'w') print 'hashtable from', hashname1 ht1 = khmer.load_counting_hash(hashname1) ht2 = khmer.load_counting_hash(hashname2) for n, record in enumerate(screed.open(file1)): if n > 0 and n % 100000 == 0: #100000 print '...', n, file1 seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) med2, _, _ = ht2.get_median_count(seq) to_print = record.name + ' ' + str(med1) + ' ' + str(med2) + '\n' outfp.write(to_print) for n, record in enumerate(screed.open(file2)): if n > 0 and n % 100000 == 0: #100000 print '...', n, file2 seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) med2, _, _ = ht2.get_median_count(seq) to_print = record.name + ' ' + str(med1) + ' ' + str(med2) + '\n' outfp.write(to_print) outfp.close()
def main():
    """Trim reads at low-abundance k-mers using a saved counting table.

    Each input file is filtered into ``<basename>.abundfilt`` in the current
    directory, or appended to the single output file when one is configured.
    Reads whose trimmed length is below the table's k-mer size are dropped.
    """
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading counting table:', args.input_table, file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()

    print("K:", ksize, file=sys.stderr)

    def process_fn(record):
        """Return (name, sequence) to keep, or (None, None) to drop."""
        name = record.name
        original = record.sequence
        # the table is queried with N replaced by A
        masked = original.replace('N', 'A')

        if args.variable_coverage:
            # reads below the normalization level pass through untrimmed
            median, _, _ = htable.get_median_count(masked)
            if median < args.normalize_to:
                return name, original

        _, cut = htable.trim_on_abundance(masked, args.cutoff)
        if cut < ksize:
            return None, None

        # slice the untouched sequence so any N in the kept part survives
        return name, original[:cut]

    for infile in infiles:
        print('filtering', infile, file=sys.stderr)

        if args.single_output_filename != '':
            outfile, mode = args.single_output_filename, 'a'
        else:
            outfile, mode = os.path.basename(infile) + '.abundfilt', 'w'

        # NOTE(review): the handle is not closed here; whether tsp.start()
        # returns only after all output has been written cannot be seen
        # from this function -- confirm before adding a close().
        outfp = open(outfile, mode)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    """Write a ``.freq`` file next to every input file.

    ``sys.argv[1]`` is the saved counting table; each remaining argument is
    an input file handed to ``output_fasta_kmer_pos_freq`` together with the
    output name ``<input>.freq``.
    """
    files = sys.argv[2:]

    print('loading ht')
    ht = khmer.load_counting_hash(sys.argv[1])

    for infile in files:
        print('outputting', infile + '.freq')
        ht.output_fasta_kmer_pos_freq(infile, infile + ".freq")
def main(): info('filter-abund.py', ['counting']) args = get_parser().parse_args() counting_ht = args.input_table infiles = args.input_filename for _ in infiles: check_file_status(_) check_space(infiles) print 'loading hashtable' htable = khmer.load_counting_hash(counting_ht) ksize = htable.ksize() print "K:", ksize # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = htable.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= ksize: return name, trim_seq return None, None # the filtering loop for infile in infiles: print 'filtering', infile if args.single_output_filename != '': outfile = args.single_output_filename outfp = open(outfile, 'a') else: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main():
    """List spectral error positions for every read in a sequence file.

    Command line: ``table sequences [-C CUTOFF] [--coverage N] [-V]
    [-o OUTFILE]``.  One line per read is written to the output: the read
    name followed by a comma-separated list of positions, or by ``V`` when
    ``-V`` is set and the read's median count is below ``--coverage``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('sequences')
    parser.add_argument('-C', '--cutoff', default=3, type=int)
    parser.add_argument('--coverage', default=20, type=int)
    parser.add_argument('-V', '--variable', default=False,
                        action='store_true')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()

    kh = khmer.load_counting_hash(args.table)

    n_skipped_variable = 0
    n_total = 0

    print >> sys.stderr, "K:", kh.ksize()
    print >> sys.stderr, "CUTOFF:", args.cutoff

    if not args.variable:
        print >> sys.stderr, "assuming even coverage - no -V"
    else:
        print >> sys.stderr, "variable coverage flag set;"
        print >> sys.stderr, "NORMALIZE_LIMIT:", args.coverage

    for n, record in enumerate(screed.open(args.sequences)):
        if n % 100000 == 0:
            print >> sys.stderr, '...', n

        # the table is queried with N replaced by A
        seq = record.sequence.replace('N', 'A')
        n_total += 1

        if args.variable:
            med, _, _ = kh.get_median_count(seq)
            if med < args.coverage:
                # low-coverage read: mark it and skip the position search
                n_skipped_variable += 1
                print >> args.outfile, record.name, 'V'
                continue

        posns = kh.find_spectral_error_positions(seq, args.cutoff)
        # add_n_posns receives the untouched sequence
        posns = add_n_posns(posns, record.sequence)
        print >> args.outfile, record.name, \
            ",".join(str(p) for p in posns)

    if args.variable:
        sys.stderr.write('Skipped %d reads of %d total due to -V\n' %
                         (n_skipped_variable, n_total))
def main(): parser = argparse.ArgumentParser() parser.add_argument('table') parser.add_argument('ref') parser.add_argument('--trusted', type=int, default=5) parser.add_argument('--variants-out', type=str, default='variants.txt', dest='variants_out') args = parser.parse_args() ct = khmer.load_counting_hash(args.table) aligner = khmer.ReadAligner(ct, args.trusted, 1.0) for record in screed.open(args.ref): seq = record.sequence seq = seq.replace('N', 'A') score, alignment = align_long(ct, aligner, seq) g = alignment.g r = alignment.r m, n = alignment.compare() print record.name, m, n, n - m, "%.3f%%" % (float(m) / n * 100) for start in range(0, len(alignment), 60): print start print alignment[start:start + 60] gidx = AlignmentIndex(alignment) fp = open(args.variants_out, 'w') for gi, a, b in alignment.variants(): kmer = '' pos = gi while len(kmer) < ct.ksize() and pos < len(alignment.g): ch = alignment.g[pos] pos += 1 if ch in '=-': continue kmer += ch if alignment.covs[gi]: print >> fp, gi, a, b, gidx.get_ri( gi), kmer, alignment.covs[gi] if 0: print len(seq), alignment.refseqlen() gidx._sanityCheck(seq)
def main(): hashfile = sys.argv[1] filename = sys.argv[2] outfile = os.path.basename(filename) print 'loading kh file', hashfile ht = khmer.load_counting_hash(hashfile) x = ht.fasta_count_kmers_by_position(filename, 100, 1) write_dist(x, open(outfile + '.pos.abund=1', 'w')) print 'wrote', outfile + '.pos.abund=1' y = ht.fasta_count_kmers_by_position(filename, 100, 255) write_dist(y, open(outfile + '.pos.abund=255', 'w')) print 'wrote', outfile + '.pos.abund=255'
def main():
    """Write a histogram of per-read median k-mer counts.

    Command line: ``hashname seqfile histout``.  One line is written for
    every value 0..65535: the value, the number of reads with that median,
    the cumulative number of reads, and the cumulative fraction.  Reads for
    which ``get_median_count`` raises ``ValueError`` are skipped.
    """
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")
    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')
    args = parser.parse_args()

    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    # The output file is opened (and truncated) before the table is loaded,
    # and is closed even if loading or reading fails.
    with open(histout, 'w') as outfp:
        print('hashtable from', hashfile)
        ht = khmer.load_counting_hash(hashfile)

        # pre-seed every value so that empty bins are still written
        hist = dict.fromkeys(range(65536), 0)

        for n, record in enumerate(screed.open(seqfile)):
            if n > 0 and n % 100000 == 0:
                print('...', n)

            seq = record.sequence.replace('N', 'A')

            try:
                med, _, _ = ht.get_median_count(seq)
            except ValueError:
                continue

            hist[med] += 1

        sumk = sum(hist.values())

        sofar = 0
        for n, m in sorted(hist.items()):
            sofar += m
            # With no reads counted the fraction used to divide by zero;
            # report 0.0 for every bin instead.
            percent = float(sofar) / sumk if sumk else 0.0
            outfp.write('%d %d %d %.3f\n' % (n, m, sofar, percent))
def do_test(ctfile):
    """Save a counting hash to ``ctfile`` and check it reloads unchanged.

    A table built from the ``random-20-a.fa`` test data is saved to a
    temporary file named ``ctfile``, loaded back, and the occupied-bin
    counts of both tables are compared (3966 is the expected value).
    """
    source = utils.get_test_data('random-20-a.fa')
    target = utils.get_temp_filename(ctfile)

    table = khmer.CountingHash(12, khmer.get_n_primes_above_x(1, 2**31))
    table.consume_fasta(source)
    table.save(target)

    reloaded = khmer.load_counting_hash(target)

    n_before = table.n_occupied()
    n_after = reloaded.n_occupied()
    assert n_before == 3966, n_before
    assert n_after == n_before, n_after
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('-o', '--outputpath', dest='outputpath', default='.') args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames outpath = args.outputpath print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print 'filtering', infile outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile