示例#1
0
def test_assembly_round2():
    """Check offset calculation and merging of a contig with a single read.

    Loads ``var1.round2.augfastq`` into a read graph, pairs ``contig1`` with
    one read via ``calc_offset``, and verifies both the head/tail assignment
    and the sequence produced by ``merge_pair``.
    """
    stream = kevlar.open(data_file('var1.round2.augfastq'), 'r')
    readgraph = kevlar.ReadGraph()
    readgraph.load(kevlar.parse_augmented_fastx(stream))

    contig_record = readgraph.get_record('contig1')
    read_record = readgraph.get_record('read22f start=5,mutations=0')
    # The third argument is the sequence used to anchor the two records
    # (presumably a k-mer they share -- confirm against calc_offset).
    anchor = 'AAGTCTCGACTTTAAGGAAGTGGGCCTAC'
    pair = calc_offset(contig_record, read_record, anchor)

    # The read is expected to be the tail and the contig the head.
    assert pair.tail == read_record
    assert pair.head == contig_record

    expected = ('TATCACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAAAGAGT'
                'TACACCCCGGTTTTTAGAAGTCTCGACTTTAAGGAAGTGGGCCTA'
                'CGGCGGAAGCCGTC')
    assert merge_pair(pair) == expected
示例#2
0
def test_assembly_contigs():
    """Check that two overlapping contigs are paired and merged correctly.

    Loads ``AluContigs.augfastq``, pairs ``contig6`` with ``contig7`` via
    ``calc_offset``, checks the reported offset, overlap and tail, and then
    verifies the sequence of the record built by ``merge_and_reannotate``.
    """
    stream = kevlar.open(data_file('AluContigs.augfastq'), 'r')
    readgraph = kevlar.ReadGraph()
    readgraph.load(kevlar.parse_augmented_fastx(stream))

    first = readgraph.get_record('contig6')
    second = readgraph.get_record('contig7')
    # The third argument is the sequence used to anchor the two contigs
    # (presumably a k-mer they share -- confirm against calc_offset).
    anchor = 'AAAGTTTTCTTAAAAACATATATGGCCGGGC'
    pair = calc_offset(first, second, anchor)

    assert pair.offset == 50
    assert pair.overlap == 85
    assert pair.tail == first

    expected = ('TTGCCCAGGCTGGTCTCAAACTCCTGAGCTCAAAGCGATCTGT'
                'CGGCCTGGGCATCCAAAAAAAGTTTTCTTAAAAACATATATGG'
                'CCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAG'
                'GCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCC'
                'TGGCTAACACG')
    merged = merge_and_reannotate(pair, 'newcontig')
    assert merged.sequence == expected
示例#3
0
def partition(readstream,
              strict=False,
              minabund=None,
              maxabund=None,
              dedup=True,
              gmlfile=None,
              logstream=sys.stderr):
    """Load reads into a read graph and yield them one partition at a time.

    This is a generator: nothing runs until iteration starts, and the final
    timing messages are only printed once the generator is exhausted.

    :param readstream: reads to load; handed to ``ReadGraph.load``
    :param strict: passed to ``ReadGraph.populate_edges``; also selects the
                   'strict' or 'relaxed' wording in the log message
    :param minabund: passed to both ``ReadGraph.load`` and
                     ``ReadGraph.partitions``
    :param maxabund: passed to both ``ReadGraph.load`` and
                     ``ReadGraph.partitions``
    :param dedup: passed as the first argument to ``ReadGraph.partitions``
    :param gmlfile: if truthy, the graph is written out with
                    ``kevlar.to_gml`` before partitioning
    :param logstream: file-like object that receives progress messages
    :yields: a list of records (from ``graph.get_record``) per partition
    """
    tag = '[kevlar::partition]'
    clock = kevlar.Timer()
    clock.start()

    # Stage 1: load the reads.
    clock.start('loadreads')
    print('[kevlar::partition] Loading reads', file=logstream)
    readgraph = kevlar.ReadGraph()
    readgraph.load(readstream, minabund=minabund, maxabund=maxabund)
    seconds = clock.stop('loadreads')
    print(tag, 'Reads loaded in {:.2f} sec'.format(seconds), file=logstream)

    # Stage 2: connect the reads.
    clock.start('buildgraph')
    if strict:
        modename = 'strict'
    else:
        modename = 'relaxed'
    print(tag, 'Building read graph in {:s} mode'.format(modename),
          file=logstream)
    readgraph.populate_edges(strict=strict)
    seconds = clock.stop('buildgraph')
    print(tag, 'Graph built in {:.2f} sec'.format(seconds), file=logstream)

    if gmlfile:  # pragma: no cover
        kevlar.to_gml(readgraph, gmlfile, logstream)

    # Stage 3: emit the records of each partition.
    clock.start('partition')
    print('[kevlar::partition] Partition readgraph', file=logstream)
    for component in readgraph.partitions(dedup, minabund, maxabund,
                                          abundfilt=True):
        names = list(component)
        yield [readgraph.get_record(name) for name in names]
    seconds = clock.stop('partition')
    print(tag, 'Partitioning done in {:.2f} sec'.format(seconds),
          file=logstream)

    overall = clock.stop()
    print(tag, 'Total time: {:.2f} seconds'.format(overall), file=logstream)
示例#4
0
def partition(readstream, strict=False, minabund=None, maxabund=None,
              dedup=True, gmlfile=None):
    """Load reads into a read graph and yield numbered partitions.

    This is a generator: nothing runs until iteration starts, and the final
    timing messages are only logged once the generator is exhausted.
    Progress messages are emitted through ``kevlar.plog``.

    :param readstream: reads to load; handed to ``ReadGraph.load``
    :param strict: passed to ``ReadGraph.populate_edges``; also selects the
                   'strict' or 'relaxed' wording in the log message
    :param minabund: passed to both ``ReadGraph.load`` and
                     ``ReadGraph.partitions``
    :param maxabund: passed to both ``ReadGraph.load`` and
                     ``ReadGraph.partitions``
    :param dedup: passed as the first argument to ``ReadGraph.partitions``
    :param gmlfile: if truthy, the graph is written out with
                    ``kevlar.to_gml`` before partitioning
    :yields: ``(n, reads)`` tuples, where ``n`` counts partitions from 1 and
             ``reads`` is the list of records in that partition. Each
             record's ``name`` is modified in place by appending
             ``' kvcc=<n>'``.
    """
    # Local import so the block does not depend on a module-level `sys`.
    import sys

    timer = kevlar.Timer()
    timer.start()

    timer.start('loadreads')
    kevlar.plog('[kevlar::partition] Loading reads')

    graph = kevlar.ReadGraph()
    graph.load(readstream, minabund=minabund, maxabund=maxabund)
    elapsed = timer.stop('loadreads')
    message = 'Reads loaded in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    timer.start('buildgraph')
    mode = 'strict' if strict else 'relaxed'
    message = 'Building read graph in {:s} mode'.format(mode)
    kevlar.plog('[kevlar::partition]', message)
    graph.populate_edges(strict=strict)
    elapsed = timer.stop('buildgraph')
    message = 'Graph built in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    if gmlfile:  # pragma: no cover
        # This function has no `logstream` parameter, so the previous
        # reference to that name raised NameError whenever a GML file was
        # requested. Standard error is passed explicitly instead.
        # NOTE(review): if kevlar.plog is redirected elsewhere, the GML
        # message should probably follow it -- confirm the desired stream.
        kevlar.to_gml(graph, gmlfile, sys.stderr)

    timer.start('partition')
    kevlar.plog('[kevlar::partition] Partition readgraph')
    part_iter = graph.partitions(dedup, minabund, maxabund, abundfilt=True)
    for n, part in enumerate(part_iter, 1):
        reads = [graph.get_record(readname) for readname in part]
        # Tag every record with the number of the partition it belongs to.
        for read in reads:
            read.name += ' kvcc={:d}'.format(n)
        yield n, reads
    elapsed = timer.stop('partition')
    message = 'Partitioning done in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    kevlar.plog('[kevlar::partition]', message)
示例#5
0
def test_graph_init():
    """Check node, edge and component structure of a freshly built graph.

    Loads ``var1.reads.augfastq``, populates edges in strict mode, inspects
    selected nodes and edges, and finally merges one overlapping read pair.
    """
    stream = kevlar.open(data_file('var1.reads.augfastq'), 'r')
    readgraph = kevlar.ReadGraph()
    readgraph.load(kevlar.parse_augmented_fastx(stream))
    readgraph.populate_edges(strict=True)

    # The file holds 10 reads and all of them are nodes, although read16f
    # has no valid connections because of a sequencing error.
    assert len(readgraph.nodes()) == 10

    # read23f shares its interesting k-mer and overlaps compatibly with 6
    # other reads (read13f and read15f contain errors).
    name23 = 'read23f start=67,mutations=0'
    assert len(readgraph[name23]) == 6

    # Spot-check the attributes stored on a single edge.
    name35 = 'read35f start=25,mutations=0'
    edge_23_35 = readgraph[name23][name35]
    assert edge_23_35['offset'] == 42
    assert edge_23_35['overlap'] == 58

    # Two connected components are reported, but only one partition.
    # NOTE(review): presumably the isolated read16f is the second component
    # and partitions() leaves it out -- confirm.
    assert len(list(connected_components(readgraph))) == 2
    assert len(list(readgraph.partitions())) == 1

    name8 = 'read8f start=8,mutations=0'
    name37 = 'read37f start=9,mutations=0'
    edge_37_8 = readgraph[name37][name8]
    assert edge_37_8['offset'] == 1
    assert edge_37_8['overlap'] == 99

    # Build the pair by hand from the edge values checked above, then merge.
    pair = OverlappingReadPair(
        tail=readgraph.get_record(name8),
        head=readgraph.get_record(name37),
        offset=1,
        overlap=99,
        sameorient=True,
        swapped=False,
    )
    expected = ('CACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAAAGAGTTAC'
                'ACCCCGGTTTTTAGAAGTCTCGACTTTAAGGAAGTGGGCCTACGG'
                'CGGAAGCCGTC')
    assert merge_pair(pair) == expected
示例#6
0
def assemble_greedy(readstream,
                    gmlfilename=None,
                    debug=False,
                    logstream=sys.stderr):
    """Assemble reads with the greedy strategy and yield resulting contigs.

    The reads are loaded into a read graph, edges are populated in strict
    mode, the graph is pruned, and every connected component that still has
    at least one edge is handed to ``assemble_with_greed``. Afterwards each
    node that was not one of the input reads is yielded as a contig.

    This is a generator: nothing runs until iteration starts, and the final
    summary is only printed once the generator is exhausted.

    :param readstream: reads to load; handed to ``ReadGraph.load``
    :param gmlfilename: if truthy, a copy of the graph is written to this
                        path with ``networkx.write_gml`` before pruning
    :param debug: if true, ``logstream`` is forwarded to
                  ``assemble_with_greed``; otherwise ``None`` is forwarded
    :param logstream: file-like object that receives progress messages
    :raises KevlarEdgelessGraphError: if the graph has no edges after
                                      ``populate_edges``
    :yields: the record (from ``get_record``) of each node that is not one
             of the input reads
    """
    tag = '[kevlar::assemble::default]'
    debugstream = logstream if debug else None

    graph = kevlar.ReadGraph()
    graph.load(readstream)
    # Remember the original node names so that nodes created during
    # assembly can be told apart from the input reads later on.
    original_names = set(graph.nodes())
    summary = 'loaded {:d} reads and {:d} interesting k-mers'.format(
        graph.number_of_nodes(), len(graph.ikmers)
    )
    print(tag, summary, file=logstream)

    graph.populate_edges(strict=True)
    summary = (
        'populated "shared interesting k-mers" graph with {:d} edges'
    ).format(graph.number_of_edges())
    # A node count lower than the read count most likely means that some
    # reads have no valid overlaps with any other read.
    print(tag, summary, file=logstream)

    if graph.number_of_edges() == 0:
        raise KevlarEdgelessGraphError('nothing to be done, aborting')

    if gmlfilename:
        # Work on a copy: the set of k-mers on each edge is flattened into
        # a comma-separated string before the graph is written out.
        gmlgraph = graph.copy()
        for left, right in gmlgraph.edges():
            gmlgraph[left][right]['ikmers'] = ','.join(
                gmlgraph[left][right]['ikmers']
            )
        networkx.write_gml(gmlgraph, gmlfilename)
        print(tag, 'graph written to {:s}'.format(gmlfilename),
              file=logstream)

    num_dropped = prune_graph(graph)
    components = []
    for subgraph in networkx.connected_component_subgraphs(graph,
                                                           copy=False):
        # Components without any edge are ignored.
        if subgraph.number_of_edges() > 0:
            components.append(subgraph)
    ccnodes = sum(subgraph.number_of_nodes() for subgraph in components)
    summary = (
        'dropped {:d} edges'
        ', graph now has {:d} connected component(s)'
        ', {:d} nodes'
        ', and {:d} edges'
    ).format(num_dropped, len(components), ccnodes, graph.number_of_edges())
    print(tag, summary, file=logstream)
    if len(components) > 1:
        print('[kevlar::assemble::default] WARNING:',
              'multiple connected components designated by cc=N in output',
              file=logstream)

    num_contigs = 0
    num_unassembled = 0
    for index, subgraph in enumerate(components, 1):
        component = graph.full_cc(subgraph)
        assemble_with_greed(component, index, debugstream)
        for nodename in component.nodes():
            if nodename in original_names:
                # Still present under its original name: not assembled.
                num_unassembled += 1
            else:
                num_contigs += 1
                yield component.get_record(nodename)

    num_assembled = ccnodes - num_unassembled
    summary = (
        '[kevlar::assemble] assembled'
        ' {:d}/{:d} reads'
        ' from {:d} connected component(s)'
        ' into {:d} contig(s)'
    ).format(num_assembled, ccnodes, len(components), num_contigs)
    print(summary, file=logstream)