def test_assembly_round2():
    """Check that 'contig1' and a read from the round-2 fixture merge.

    The pair is computed from a shared k-mer; the read must end up as the
    tail, the contig as the head, and the merged sequence must match the
    expected string exactly.
    """
    expected_merge = (
        'TATCACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAAAGAGT'
        'TACACCCCGGTTTTTAGAAGTCTCGACTTTAAGGAAGTGGGCCTA'
        'CGGCGGAAGCCGTC'
    )
    shared_kmer = 'AAGTCTCGACTTTAAGGAAGTGGGCCTAC'

    # Populate a read graph from the augmented Fastq fixture.
    fixture = kevlar.open(data_file('var1.round2.augfastq'), 'r')
    readgraph = kevlar.ReadGraph()
    readgraph.load(kevlar.parse_augmented_fastx(fixture))

    contig = readgraph.get_record('contig1')
    read = readgraph.get_record('read22f start=5,mutations=0')
    pair = calc_offset(contig, read, shared_kmer)

    assert pair.tail == read
    assert pair.head == contig
    assert merge_pair(pair) == expected_merge
def test_assembly_contigs():
    """Check offset, overlap and merged sequence for two overlapping contigs.

    'contig6' and 'contig7' from the Alu fixture are paired via a shared
    k-mer, then merged and re-annotated into a record named 'newcontig'.
    """
    expected_sequence = (
        'TTGCCCAGGCTGGTCTCAAACTCCTGAGCTCAAAGCGATCTGT'
        'CGGCCTGGGCATCCAAAAAAAGTTTTCTTAAAAACATATATGG'
        'CCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAG'
        'GCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCC'
        'TGGCTAACACG'
    )
    shared_kmer = 'AAAGTTTTCTTAAAAACATATATGGCCGGGC'

    # Populate a read graph from the augmented Fastq fixture.
    fixture = kevlar.open(data_file('AluContigs.augfastq'), 'r')
    readgraph = kevlar.ReadGraph()
    readgraph.load(kevlar.parse_augmented_fastx(fixture))

    contig6 = readgraph.get_record('contig6')
    contig7 = readgraph.get_record('contig7')
    pair = calc_offset(contig6, contig7, shared_kmer)

    assert pair.offset == 50
    assert pair.overlap == 85
    assert pair.tail == contig6

    merged = merge_and_reannotate(pair, 'newcontig')
    assert merged.sequence == expected_sequence
def partition(readstream, strict=False, minabund=None, maxabund=None,
              dedup=True, gmlfile=None, logstream=sys.stderr):
    """Load reads into a read graph and yield its partitions one at a time.

    This is a generator: nothing runs until it is iterated, and the final
    timing messages are only written once it has been exhausted.

    Args:
        readstream: records handed to ``ReadGraph.load``.
        strict: forwarded to ``populate_edges``; also selects the mode name
            ('strict' or 'relaxed') shown in the progress message.
        minabund: forwarded to both ``load`` and ``partitions``.
        maxabund: forwarded to both ``load`` and ``partitions``.
        dedup: forwarded to ``partitions``.
        gmlfile: when truthy, the graph is passed to ``kevlar.to_gml``
            together with this value and ``logstream``.
        logstream: file-like object that receives progress messages.

    Yields:
        A list of records (looked up with ``graph.get_record``) for each
        partition.
    """
    def report(*fields):
        # Every progress message goes to the caller-supplied stream.
        print(*fields, file=logstream)

    clock = kevlar.Timer()
    clock.start()

    # Stage 1: load the reads.
    clock.start('loadreads')
    report('[kevlar::partition] Loading reads')
    readgraph = kevlar.ReadGraph()
    readgraph.load(readstream, minabund=minabund, maxabund=maxabund)
    load_secs = clock.stop('loadreads')
    report('[kevlar::partition]',
           'Reads loaded in {:.2f} sec'.format(load_secs))

    # Stage 2: connect the reads.
    clock.start('buildgraph')
    if strict:
        mode = 'strict'
    else:
        mode = 'relaxed'
    report('[kevlar::partition]',
           'Building read graph in {:s} mode'.format(mode))
    readgraph.populate_edges(strict=strict)
    build_secs = clock.stop('buildgraph')
    report('[kevlar::partition]',
           'Graph built in {:.2f} sec'.format(build_secs))

    if gmlfile:  # pragma: no cover
        kevlar.to_gml(readgraph, gmlfile, logstream)

    # Stage 3: stream out the partitions.
    clock.start('partition')
    report('[kevlar::partition] Partition readgraph')
    for members in readgraph.partitions(dedup, minabund, maxabund,
                                        abundfilt=True):
        names = list(members)
        yield [readgraph.get_record(name) for name in names]
    partition_secs = clock.stop('partition')
    report('[kevlar::partition]',
           'Partitioning done in {:.2f} sec'.format(partition_secs))

    total_secs = clock.stop()
    report('[kevlar::partition]',
           'Total time: {:.2f} seconds'.format(total_secs))
def partition(readstream, strict=False, minabund=None, maxabund=None,
              dedup=True, gmlfile=None):
    """Load reads into a read graph and yield its numbered partitions.

    This is a generator: nothing runs until it is iterated, and the final
    timing messages are only logged once it has been exhausted.

    Args:
        readstream: records handed to ``ReadGraph.load``.
        strict: forwarded to ``populate_edges``; also selects the mode name
            ('strict' or 'relaxed') shown in the progress message.
        minabund: forwarded to both ``load`` and ``partitions``.
        maxabund: forwarded to both ``load`` and ``partitions``.
        dedup: forwarded to ``partitions``.
        gmlfile: when truthy, the graph is passed to ``kevlar.to_gml``
            together with this value.

    Yields:
        ``(n, reads)`` tuples, where ``n`` is the 1-based partition number
        and ``reads`` is the list of records in that partition. Each
        record's ``name`` is modified in place by appending ' kvcc=<n>'.
    """
    timer = kevlar.Timer()
    timer.start()

    timer.start('loadreads')
    kevlar.plog('[kevlar::partition] Loading reads')
    graph = kevlar.ReadGraph()
    graph.load(readstream, minabund=minabund, maxabund=maxabund)
    elapsed = timer.stop('loadreads')
    message = 'Reads loaded in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    timer.start('buildgraph')
    mode = 'strict' if strict else 'relaxed'
    message = 'Building read graph in {:s} mode'.format(mode)
    kevlar.plog('[kevlar::partition]', message)
    graph.populate_edges(strict=strict)
    elapsed = timer.stop('buildgraph')
    message = 'Graph built in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    if gmlfile:  # pragma: no cover
        # This function has no `logstream` parameter, so referencing that
        # name here raised NameError whenever a GML file was requested.
        # Pass standard error explicitly instead.
        # NOTE(review): assumes to_gml still accepts a stream as its third
        # positional argument -- confirm against its definition.
        import sys
        kevlar.to_gml(graph, gmlfile, sys.stderr)

    timer.start('partition')
    kevlar.plog('[kevlar::partition] Partition readgraph')
    part_iter = graph.partitions(dedup, minabund, maxabund, abundfilt=True)
    for n, part in enumerate(part_iter, 1):
        reads = [graph.get_record(readname) for readname in list(part)]
        for read in reads:
            # Tag each read with the number of the partition it belongs to.
            read.name += ' kvcc={:d}'.format(n)
        yield n, reads
    elapsed = timer.stop('partition')
    message = 'Partitioning done in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    kevlar.plog('[kevlar::partition]', message)
def test_graph_init():
    """Test graph initialization."""
    fixture = kevlar.open(data_file('var1.reads.augfastq'), 'r')
    readgraph = kevlar.ReadGraph()
    readgraph.load(kevlar.parse_augmented_fastx(fixture))
    readgraph.populate_edges(strict=True)

    # 10 reads in the file, but read16f has no valid connections due to error
    assert len(readgraph.nodes()) == 10

    # The given read shares its interesting k-mer and has compatible overlaps
    # with 6 other reads (read13f and read15f have errors).
    r23name = 'read23f start=67,mutations=0'
    assert len(readgraph[r23name]) == 6

    # Inspect the attributes of one of those edges.
    r35name = 'read35f start=25,mutations=0'
    edge_23_35 = readgraph[r23name][r35name]
    assert edge_23_35['offset'] == 42
    assert edge_23_35['overlap'] == 58

    # connected_components() reports two components for this graph, while
    # partitions() yields only one.
    components = list(connected_components(readgraph))
    assert len(components) == 2
    partitions = list(readgraph.partitions())
    assert len(partitions) == 1

    # Check a second edge, then merge the two reads it connects.
    r8name = 'read8f start=8,mutations=0'
    r37name = 'read37f start=9,mutations=0'
    edge_37_8 = readgraph[r37name][r8name]
    assert edge_37_8['offset'] == 1
    assert edge_37_8['overlap'] == 99

    expected_merge = (
        'CACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAAAGAGTTAC'
        'ACCCCGGTTTTTAGAAGTCTCGACTTTAAGGAAGTGGGCCTACGG'
        'CGGAAGCCGTC'
    )
    pair = OverlappingReadPair(
        tail=readgraph.get_record(r8name),
        head=readgraph.get_record(r37name),
        offset=1,
        overlap=99,
        sameorient=True,
        swapped=False,
    )
    assert merge_pair(pair) == expected_merge
def assemble_greedy(readstream, gmlfilename=None, debug=False,
                    logstream=sys.stderr):
    """Greedily assemble reads into contigs, yielding each contig record.

    This is a generator, so no work (including the edgeless-graph check)
    happens until it is iterated.

    Args:
        readstream: records handed to ``ReadGraph.load``.
        gmlfilename: when truthy, a copy of the graph is written to this
            path with ``networkx.write_gml`` before pruning.
        debug: when truthy, ``logstream`` is also passed to
            ``assemble_with_greed`` as its debug output; otherwise ``None``.
        logstream: file-like object that receives progress messages.

    Yields:
        The record of every node that, after assembly, is not one of the
        originally loaded reads.

    Raises:
        KevlarEdgelessGraphError: if the graph has no edges once
            ``populate_edges`` has run.
    """
    tag = '[kevlar::assemble::default]'

    def report(*fields):
        # Every progress message goes to the caller-supplied stream.
        print(*fields, file=logstream)

    debugout = logstream if debug else None

    graph = kevlar.ReadGraph()
    graph.load(readstream)
    # Remember the original node names so that contigs created during
    # assembly can be told apart from input reads afterwards.
    inputreads = set(graph.nodes())
    summary = (
        'loaded {:d} reads'.format(graph.number_of_nodes())
        + ' and {:d} interesting k-mers'.format(len(graph.ikmers))
    )
    report(tag, summary)

    graph.populate_edges(strict=True)
    # If number of nodes is less than number of reads, it's probably because
    # some reads have no valid overlaps with other reads.
    summary = (
        'populated "shared interesting k-mers" graph'
        + ' with {:d} edges'.format(graph.number_of_edges())
    )
    report(tag, summary)
    if graph.number_of_edges() == 0:
        raise KevlarEdgelessGraphError('nothing to be done, aborting')

    if gmlfilename:
        # Work on a copy: the 'ikmers' edge attribute is replaced by a
        # comma-joined string before the graph is written out.
        snapshot = graph.copy()
        for left, right in snapshot.edges():
            attrs = snapshot[left][right]
            attrs['ikmers'] = ','.join(attrs['ikmers'])
        networkx.write_gml(snapshot, gmlfilename)
        report(tag, 'graph written to {:s}'.format(gmlfilename))

    edges_dropped = prune_graph(graph)
    components = networkx.connected_component_subgraphs(graph, copy=False)
    # Keep only the components that still contain at least one edge.
    ccs = [comp for comp in components if comp.number_of_edges() > 0]
    ccnodes = sum(comp.number_of_nodes() for comp in ccs)
    summary = (
        'dropped {:d} edges'.format(edges_dropped)
        + ', graph now has {:d} connected component(s)'.format(len(ccs))
        + ', {:d} nodes'.format(ccnodes)
        + ', and {:d} edges'.format(graph.number_of_edges())
    )
    report(tag, summary)
    if len(ccs) > 1:
        report('[kevlar::assemble::default] WARNING:',
               'multiple connected components designated by cc=N in output')

    contigcount = 0
    unassembledcount = 0
    for number, component in enumerate(ccs, 1):
        component = graph.full_cc(component)
        assemble_with_greed(component, number, debugout)
        for seqname in component.nodes():
            if seqname not in inputreads:
                contigcount += 1
                yield component.get_record(seqname)
            else:
                # Still present under its original name: count it as
                # unassembled.
                unassembledcount += 1

    assembledcount = ccnodes - unassembledcount
    summary = (
        '[kevlar::assemble] assembled'
        + ' {:d}/{:d} reads'.format(assembledcount, ccnodes)
        + ' from {:d} connected component(s)'.format(len(ccs))
        + ' into {:d} contig(s)'.format(contigcount)
    )
    print(summary, file=logstream)