def test_error_create():
    """Check that building ``_GraphLabels`` from ``None`` raises ``ValueError``.

    The error message is printed; if no exception is raised at all the
    test fails with an assertion error.
    """
    from khmer import _GraphLabels

    try:
        _GraphLabels(None)
    except ValueError as exc:
        print(str(exc))
    else:
        # Construction unexpectedly succeeded.
        assert 0, "This should fail."
def test_assemble_left_double_fork(self, left_double_fork_structure):
    """Assemble the entire contig plus branch thanks to labels.

    Assembly starts from the last K bases of the contig. Without labels a
    single path is expected, running from the HDN position to the end of
    the contig; once both sequences are labeled, two full-length paths
    are expected.
    """
    graph, contig, _, HDN, _, branch = left_double_fork_structure
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)
    seed = contig[-K:]

    # First pass: no labels yet, so only the HDN-to-end stretch comes back.
    unlabeled = assembler.assemble(seed)
    assert len(unlabeled) == 1
    assert unlabeled[0] == contig[HDN.pos:]

    # Gather the high degree nodes of both sequences, then label each
    # sequence across them (contig -> 1, branch -> 2).
    hdn = graph.find_high_degree_nodes(contig)
    hdn += graph.find_high_degree_nodes(branch)
    print(list(hdn))
    for label, sequence in enumerate((contig, branch), start=1):
        labels.label_across_high_degree_nodes(sequence, hdn, label)
    print(labels.get_tag_labels(list(hdn)[0]))

    # Second pass: both full-length sequences should now be recovered.
    labeled = assembler.assemble(seed)
    assert len(labeled) == 2
    assert any(utils._equals_rc(found, contig) for found in labeled)
    assert any(utils._equals_rc(found, branch) for found in labeled)
def main():
    """Write out the reads that carry at least one tag with no label.

    The reads are consumed and tagged into a nodegraph, then the assembly
    is consumed with labels. Each read is then re-scanned: if any of its
    tags has no label attached, the read is written out.

    Output goes to the single file given with ``-o/--output``; when that
    option is absent, one ``<basename>.leftover2`` file is written per
    read file in the current directory.

    Every output file is closed when it is no longer needed, including
    the shared ``--output`` file and when an error interrupts processing.
    """
    p = argparse.ArgumentParser()
    p.add_argument('assembly')
    p.add_argument('readfiles', nargs='+')
    p.add_argument('-o', '--output', default=None)
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE, type=float)
    args = p.parse_args()

    ng = khmer.Nodegraph(args.ksize, args.tablesize, 4)

    # first, consume & tag the reads
    for readfile in args.readfiles:
        print('loading & tagging reads from:', readfile)
        ng.consume_seqfile_and_tag(readfile)

    # next, consume & label the assembly
    print('loading & tagging assembly from:', args.assembly)
    lh = khmer._GraphLabels(ng)
    lh.consume_seqfile_and_tag_with_labels(args.assembly)

    # A single shared output file is used only when --output was given.
    shared_outfp = open(args.output, 'w') if args.output else None

    # finally, walk across the reads & find those with no labels
    n = 0  # reads examined
    m = 0  # reads written out
    try:
        for readfile in args.readfiles:
            print('loading reads from:', readfile)
            if shared_outfp is not None:
                outfp = shared_outfp
            else:
                outfile = os.path.basename(readfile) + '.leftover2'
                outfp = open(outfile, 'w')
                print('writing to:', outfile, file=sys.stderr)

            try:
                for record in screed.open(readfile):
                    if n % 100000 == 0 and n:
                        print('...', readfile, n, m, file=sys.stderr)

                    tags = ng.get_tags_and_positions(record.sequence)
                    # Extract the read as soon as one unlabeled tag is seen.
                    if any(not lh.get_tag_labels(tag) for _, tag in tags):
                        khmer.utils.write_record(record, outfp)
                        m += 1
                    n += 1
            finally:
                # Per-read-file outputs are closed as soon as the file is done.
                if shared_outfp is None:
                    outfp.close()
    finally:
        if shared_outfp is not None:
            shared_outfp.close()

    print('%d left out of assembly, of %d reads' % (m, n), file=sys.stderr)
def test_assemble_left_double_fork(self, left_double_fork_structure):
    """Left double fork: labels let assembly span the branch point.

    Starting from the end of the contig, the unlabeled assembler should
    return only the part from the HDN onward; after labeling contig and
    branch, both complete sequences should be returned.
    """
    graph, contig, _left, HDN, _right, branch = left_double_fork_structure
    label_hash = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(label_hash)
    tail_kmer = contig[-K:]

    # Without the labels: one path, the HDN through the end of the contig.
    first_pass = assembler.assemble(tail_kmer)
    assert len(first_pass) == 1
    expected_suffix = contig[HDN.pos:]
    assert first_pass[0] == expected_suffix

    # Now add labels and check that we get two full length paths.
    high_degree = graph.find_high_degree_nodes(contig)
    high_degree += graph.find_high_degree_nodes(branch)
    print(list(high_degree))
    label_hash.label_across_high_degree_nodes(contig, high_degree, 1)
    label_hash.label_across_high_degree_nodes(branch, high_degree, 2)
    first_node = list(high_degree)[0]
    print(label_hash.get_tag_labels(first_node))

    second_pass = assembler.assemble(tail_kmer)
    assert len(second_pass) == 2
    assert any(utils._equals_rc(p, contig) for p in second_pass)
    assert any(utils._equals_rc(p, branch) for p in second_pass)
def test_consume_seqfile_and_tag_with_labels(Graphtype):
    """Consuming 'valid-read-testing.fq' with labels should yield 9 labels."""
    reads_path = utils.get_test_data('valid-read-testing.fq')

    # The file is read by consume_seqfile_and_tag_with_labels below.
    labels = _GraphLabels(Graphtype(15, PRIMES_1m))
    labels.consume_seqfile_and_tag_with_labels(reads_path)

    n_labels = labels.n_labels()
    assert n_labels == 9
def test_consume_partitioned_seqfile_and_label(Graphtype):
    """Partitioned consume of 'valid-read-testing.fq' should yield 9 labels."""
    reads_path = utils.get_test_data('valid-read-testing.fq')

    # The file is read by consume_partitioned_fasta_and_tag_with_labels.
    labels = _GraphLabels(Graphtype(15, PRIMES_1m))
    labels.consume_partitioned_fasta_and_tag_with_labels(reads_path)

    n_labels = labels.n_labels()
    assert n_labels == 9
def test_consume_partitioned_seqfile_and_label(Graphtype):
    """Partitioned consume of 'valid-read-testing.fq' should yield 9 labels.

    The graph is built with k=15 and the table parameters unpacked from
    ``params_1m``.
    """
    reads_path = utils.get_test_data('valid-read-testing.fq')

    # The file is read by consume_partitioned_fasta_and_tag_with_labels.
    backing_graph = Graphtype(15, *params_1m)
    labels = _GraphLabels(backing_graph)
    labels.consume_partitioned_fasta_and_tag_with_labels(reads_path)

    n_labels = labels.n_labels()
    assert n_labels == 9
def test_assemble_tandem_repeats(self, tandem_repeat_structure):
    """Assemble one copy of a tandem repeat from its first K bases."""
    graph, repeat, _tandem_repeats = tandem_repeat_structure
    assembler = khmer.SimpleLabeledAssembler(khmer._GraphLabels(graph))

    found = assembler.assemble(repeat[:K])
    assert len(found) == 1

    # There are K-1 k-mers spanning the junction between the beginning
    # and end of the repeat, so the path is that much longer than one copy.
    expected_length = len(repeat) + K - 1
    assert len(found[0]) == expected_length
def test_assemble_snp_bubble_single(self, snp_bubble_structure):
    """Assemble the entire contig along one of the two bubble paths.

    Only the wildtype sequence is labeled, so exactly one path, equal to
    the wildtype (up to reverse complement), is expected.
    """
    graph, wildtype, _mutant, _, _ = snp_bubble_structure
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)

    high_degree = graph.find_high_degree_nodes(wildtype)
    assert len(high_degree) == 2
    labels.label_across_high_degree_nodes(wildtype, high_degree, 1)

    found = assembler.assemble(wildtype[:K])
    assert len(found) == 1
    only_path = found[0]
    assert utils._equals_rc(only_path, wildtype)
def test_beginning_to_end_across_tip(self, right_tip_structure):
    """Assemble the entire contig, ignoring the branch point due to labels."""
    graph, contig, _, _, _, _tip = right_tip_structure
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)

    # L, HDN, and R will be labeled with 1.
    high_degree = graph.find_high_degree_nodes(contig)
    labels.label_across_high_degree_nodes(contig, high_degree, 1)

    found = assembler.assemble(contig[:K])
    assert len(found) == 1, "there should only be one path"

    assembled = found[0]  # @CTB
    assert len(assembled) == len(contig)
    assert utils._equals_rc(assembled, contig)
def main():
    """Stream FASTQ reads through a countgraph, printing new ORFs.

    For each read of at least K bases, the median k-mer count decides the
    action: below 20 the read is consumed; below 30 it is trimmed on
    abundance, consumed and labeled across its high degree nodes; at
    exactly 30 the labeled paths from its first K bases are assembled,
    translated, and every ORF not seen before is printed to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('fastq_files', nargs='+')
    args = parser.parse_args()

    cg = khmer.Countgraph(K, 1e8, 4)

    kept = 0
    hdn = khmer.HashSet(K)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    seen_orf_hashes = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            sequence = record.sequence
            if len(sequence) < K:
                continue

            cov, _, _ = cg.get_median_count(sequence)

            if cov < 20:
                # Low coverage: just load the read.
                kept += 1
                cg.consume(sequence)
                continue

            if cov < 30:
                # Intermediate coverage: trim, load and label the read.
                seq, pos = cg.trim_on_abundance(sequence, 3)
                if len(seq) < K:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                continue

            if cov != 30:
                continue

            # Coverage of exactly 30: assemble and report unseen ORFs.
            for contig in lh.assemble_labeled_path(sequence[:K]):
                for translation in translate(contig):
                    for orf in extract_orfs(translation):
                        orf_hash = hash(orf)
                        if orf_hash in seen_orf_hashes:
                            continue
                        seen_orf_hashes.add(orf_hash)
                        print('>orf%d\n%s' % (next_orf, orf))
                        next_orf += 1
def test_assemble_snp_bubble_both(self, snp_bubble_structure):
    """Assemble the entire contig along both paths through the bubble."""
    graph, wildtype, mutant, _, _ = snp_bubble_structure
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)

    high_degree = graph.find_high_degree_nodes(wildtype)
    high_degree += graph.find_high_degree_nodes(mutant)
    assert len(high_degree) == 2

    # Label wildtype with 1 and mutant with 2.
    for label, sequence in enumerate((wildtype, mutant), start=1):
        labels.label_across_high_degree_nodes(sequence, high_degree, label)

    found = assembler.assemble(wildtype[:K])
    assert len(found) == 2

    assert any(utils._contains_rc(wildtype, candidate) for candidate in found)
    assert any(utils._contains_rc(mutant, candidate) for candidate in found)
def test_assemble_right_double_fork(self, right_double_fork_structure):
    """Assemble two contigs from a double forked structure."""
    graph, contig, _, _, _, branch = right_double_fork_structure
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)

    high_degree = graph.find_high_degree_nodes(contig)
    high_degree += graph.find_high_degree_nodes(branch)
    print(list(high_degree))

    # Label contig with 1 and branch with 2.
    for label, sequence in enumerate((contig, branch), start=1):
        labels.label_across_high_degree_nodes(sequence, high_degree, label)
    print(labels.get_tag_labels(list(high_degree)[0]))

    found = assembler.assemble(contig[:K])
    lengths = [len(candidate) for candidate in found]
    print("Path lengths", lengths)

    assert len(found) == 2
    assert any(utils._equals_rc(candidate, contig) for candidate in found)
    assert any(utils._equals_rc(candidate, branch) for candidate in found)
def test_assemble_right_double_fork(self, right_double_fork_structure):
    """Right double fork: both the contig and the branch are assembled."""
    graph, contig, _left, _hdn_node, _right, branch = right_double_fork_structure
    label_hash = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(label_hash)

    hdn = graph.find_high_degree_nodes(contig)
    hdn += graph.find_high_degree_nodes(branch)
    print(list(hdn))

    label_hash.label_across_high_degree_nodes(contig, hdn, 1)
    label_hash.label_across_high_degree_nodes(branch, hdn, 2)
    first_node = list(hdn)[0]
    print(label_hash.get_tag_labels(first_node))

    start_kmer = contig[:K]
    results = assembler.assemble(start_kmer)
    print('Path lengths', [len(result) for result in results])

    assert len(results) == 2
    assert any(utils._equals_rc(result, contig) for result in results)
    assert any(utils._equals_rc(result, branch) for result in results)
def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure):
    """Assemble one side of a bubble while the other is blocked by stop_bf.

    Both branches are labeled, but stop_bf should trip a filter failure
    on the mutant side, negating the label spanning, so only the wildtype
    path is expected.
    """
    graph, wildtype, mutant, HDN_L, _ = snp_bubble_structure
    stop_bf = khmer.Nodegraph(K, 1e5, 4)
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)

    high_degree = graph.find_high_degree_nodes(wildtype)
    high_degree += graph.find_high_degree_nodes(mutant)
    assert len(high_degree) == 2

    # Label wildtype with 1 and mutant with 2.
    for label, sequence in enumerate((wildtype, mutant), start=1):
        labels.label_across_high_degree_nodes(sequence, high_degree, label)

    # The labeling is done; now block the mutant by counting the k-mer
    # that starts one base past the left HDN in the stop filter.
    block_start = HDN_L.pos + 1
    stop_bf.count(mutant[block_start:block_start + K])

    found = assembler.assemble(wildtype[:K], stop_bf)
    assert len(found) == 1
    assert any(utils._equals_rc(candidate, wildtype) for candidate in found)
def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure):
    """SNP bubble with labels on both branches, mutant blocked via stop_bf.

    The stop filter should trip a filter failure that negates the label
    spanning, leaving the wildtype as the only assembled path.
    """
    graph, wildtype, mutant, HDN_L, _hdn_right = snp_bubble_structure
    stop_filter = khmer.Nodegraph(K, 1e5, 4)
    label_hash = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(label_hash)

    hdn = graph.find_high_degree_nodes(wildtype)
    hdn += graph.find_high_degree_nodes(mutant)
    assert len(hdn) == 2

    label_hash.label_across_high_degree_nodes(wildtype, hdn, 1)
    label_hash.label_across_high_degree_nodes(mutant, hdn, 2)

    # Labeling is in place; block the mutant branch with the stop filter.
    blocked_kmer = mutant[HDN_L.pos + 1:HDN_L.pos + K + 1]
    stop_filter.count(blocked_kmer)

    results = assembler.assemble(wildtype[:K], stop_filter)
    assert len(results) == 1
    assert any(utils._equals_rc(result, wildtype) for result in results)
def main():
    """Load contigs, label them across high degree nodes, and re-assemble.

    Three passes are made over the contig files: consume every sequence
    (remembering its first K bases), collect high degree nodes, then label
    each sequence with its per-file record index. Finally, labeled paths
    are assembled from every remembered start and printed in FASTA form.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('contig_files', nargs='+')
    args = parser.parse_args()

    ng = khmer.Nodegraph(K, 1e8, 4)

    def indexed_records():
        # One pass over all contig files, reporting progress every 10000
        # records; the index restarts at zero for each file.
        for filename in args.contig_files:
            for index, record in enumerate(screed.open(filename)):
                if index and index % 10000 == 0:
                    print('...', index)
                yield index, record

    # Pass 1: load the sequences and remember where each one starts.
    starts = []
    for _, record in indexed_records():
        ng.consume(record.sequence)
        starts.append(record.sequence[:K])

    # Pass 2: collect the high degree nodes.
    hdn = khmer.HashSet(K)
    for _, record in indexed_records():
        hdn += ng.find_high_degree_nodes(record.sequence)

    # Pass 3: label each sequence across the high degree nodes.
    lh = khmer._GraphLabels(ng)
    for index, record in indexed_records():
        lh.label_across_high_degree_nodes(record.sequence, hdn, index)

    # Assemble from every start k-mer and print the resulting contigs.
    counter = 0
    for start_kmer in starts:
        contigs = lh.assemble_labeled_path(start_kmer)
        if not contigs:
            print('nada...')
        for contig in contigs:
            print('>%d\n%s' % (counter, contig))
            counter += 1
def test_assemble_right_triple_fork(self, right_triple_fork_structure):
    """Assemble three contigs from a triple fork."""
    graph, contig, _, _, _, top_sequence, bottom_sequence = \
        right_triple_fork_structure
    labels = khmer._GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labels)

    sequences = (contig, top_sequence, bottom_sequence)

    # Collect the high degree nodes of all three sequences, in order.
    high_degree = graph.find_high_degree_nodes(contig)
    for sequence in sequences[1:]:
        high_degree += graph.find_high_degree_nodes(sequence)
    print(list(high_degree))

    # Label contig with 1, the top sequence with 2, the bottom with 3.
    for label, sequence in enumerate(sequences, start=1):
        labels.label_across_high_degree_nodes(sequence, high_degree, label)
    print(labels.get_tag_labels(list(high_degree)[0]))

    found = assembler.assemble(contig[:K])
    print([len(candidate) for candidate in found])

    assert len(found) == 3
    assert any(utils._equals_rc(candidate, contig) for candidate in found)
    assert any(utils._equals_rc(candidate, top_sequence) for candidate in found)
    assert any(utils._equals_rc(candidate, bottom_sequence)
               for candidate in found)
def main():
    """Build a graph of high degree nodes and linear segments; save as GML.

    Steps: load all sequences into a nodegraph, find the high degree
    nodes, label every sequence across them, give each high degree node a
    segment ID, walk from each such node into its neighbors to record
    adjacencies and linear paths, and finally write the segments and
    adjacencies to ``--output`` through ``graph_writer.GmlWriter``.

    Exits with status 0 if no high degree nodes are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.
    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)

    def counted_records(progress_tag):
        # Iterate over every record of every input file with a running
        # 1-based count, reporting progress every 10000 records.
        count = 0
        for seqfile in args.seqfiles:
            for record in screed.open(seqfile):
                count += 1
                if count % 10000 == 0:
                    print(progress_tag, seqfile, count)
                yield count, record

    # load in all of the input sequences, one file at a time.
    for _, record in counted_records('...'):
        graph.consume(record.sequence)

    # complain if too small set of graphs was used.
    fp_rate = khmer.calc_expected_collisions(graph, args.force,
                                             max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    for _, record in counted_records('...2'):
        # walk across sequences, find all high degree nodes,
        # name them and cherish them. Don't do this on identical sequences.
        if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
            stop_bf2.consume(record.sequence)
            degree_nodes += graph.find_high_degree_nodes(record.sequence)
    del stop_bf2

    if len(degree_nodes) == 0:
        print('no high degree nodes; exiting.')
        sys.exit(0)

    # label every sequence across the high degree nodes, using the
    # running record count as the label.
    lh = khmer._GraphLabels(graph)
    for count, record in counted_records('...2'):
        lh.label_across_high_degree_nodes(record.sequence, degree_nodes,
                                          count)
    print('num labels:', lh.n_labels())

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree nodes into all neighboring nodes,
    # seeking adjacencies. if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path. also
    # track minhashes while we're at it.
    for index, node in enumerate(degree_nodes):
        if index % 10000 == 0:
            print('...', index, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        node_id = pathy.segments_r[node]

        # find all the neighbors of this high-degree node.
        for neighbor in graph.neighbors(node):
            if neighbor not in degree_nodes:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, neighbor, stop_bf,
                                               pathy, degree_nodes, lh)
                continue

            # neighbor is high degree? fine, mark its adjacencies.
            neighbor_id = pathy.segments_r[neighbor]
            pathy.add_adjacency(node_id, neighbor_id)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML
    if args.output:
        import graph_writer

        print('saving to', args.output)
        fp = open(args.output, 'w')
        writer = graph_writer.GmlWriter(fp, [], [])

        for segment_id, size in pathy.segments.items():
            writer.add_vertex(segment_id, size, [])

        for segment_id, neighbors in pathy.adjacencies.items():
            for other_id in neighbors:
                writer.add_edge(segment_id, other_id, [])
def main():
    """Streaming assembly that logs one CSV row per action taken.

    Reads of at least ``args.ksize`` bases are dispatched on their median
    k-mer count: below 20 they are consumed (action 'c'); below 30 they
    are trimmed, consumed and labeled (action 'l'); at exactly 30 labeled
    paths are assembled from the read's first k-mer (action 'a'), each
    contig is translated, and ORFs not seen before are printed to stdout.
    Rows are written to the file given with ``-o``.
    """
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o', type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)

    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    seen_orf_hashes = set()
    statswriter = csv.DictWriter(
        args.o,
        delimiter=',',
        fieldnames=['read_n', 'action', 'cov', 'n_hdn', 'contig_n',
                    'orf_n', 'new'])

    def log_row(read_n, action, cov, n_hdn=None, contig_n=None, orf_n=None,
                new=None):
        # Write one stats row; columns not relevant to the action stay None.
        statswriter.writerow({
            'read_n': read_n,
            'action': action,
            'cov': cov,
            'n_hdn': n_hdn,
            'contig_n': contig_n,
            'orf_n': orf_n,
            'new': new,
        })

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            sequence = record.sequence
            if len(sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(sequence)

            if cov < 20:
                # Low coverage: just load the read.
                kept += 1
                cg.consume(sequence)
                log_row(n, 'c', cov)
                continue

            if cov < 30:
                # Intermediate coverage: trim, load and label the read.
                seq, pos = cg.trim_on_abundance(sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                log_row(n, 'l', cov, n_hdn=len(hdn))
                continue

            if cov != 30:
                continue

            # Coverage of exactly 30: assemble, then report ORFs.
            contigs = lh.assemble_labeled_path(sequence[:args.ksize])
            for contig_n, contig in enumerate(contigs):
                log_row(n, 'a', cov, contig_n=contig_n)

                for translation in translate(contig):
                    for orf_n, orf in enumerate(extract_orfs(translation)):
                        orf_hash = hash(orf)
                        new = orf_hash not in seen_orf_hashes
                        if new:
                            seen_orf_hashes.add(orf_hash)
                            print('>orf%d\n%s' % (next_orf, orf))
                            next_orf += 1

                        log_row(n, 'a', cov, contig_n=contig_n,
                                orf_n=orf_n, new=new)
def main():
    """Streaming assembly with per-read tracking written as CSV rows.

    Each sufficiently long read is handled according to its median k-mer
    count: consumed when below 20 ('c' rows), trimmed/consumed/labeled
    when below 30 ('l' rows), and used as an assembly seed when exactly
    30 ('a' rows). Previously unseen ORFs from the assembled contigs are
    printed to stdout.
    """
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o', type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)

    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()

    columns = ['read_n', 'action', 'cov', 'n_hdn', 'contig_n', 'orf_n', 'new']
    statswriter = csv.DictWriter(args.o, delimiter=',', fieldnames=columns)
    # Template row: every column present, all values None until overridden.
    blank_row = dict.fromkeys(columns)

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow(
                    dict(blank_row, read_n=n, action='c', cov=cov))
            elif cov < 30:
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                statswriter.writerow(
                    dict(blank_row, read_n=n, action='l', cov=cov,
                         n_hdn=len(hdn)))
            elif cov == 30:
                seed = record.sequence[:args.ksize]
                contigs = lh.assemble_labeled_path(seed)
                for contig_n, contig in enumerate(contigs):
                    # Fields shared by the contig row and its ORF rows.
                    contig_row = dict(blank_row, read_n=n, action='a',
                                      cov=cov, contig_n=contig_n)
                    statswriter.writerow(contig_row)

                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            key = hash(o)
                            if key in output:
                                new = False
                            else:
                                new = True
                                output.add(key)
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1

                            statswriter.writerow(
                                dict(contig_row, orf_n=orf_n, new=new))