Exemplo n.º 1
0
 def _traverse_a_branch_from_queue(self):
     """Traverse the next queued branch and merge it into ``self.graph``.

     The oldest setup is popped from ``self.branch_queue``, traversed with the
     branch traverser registered for its color, composed into the graph, and
     then connected to its parent and linked to further neighbor traversals.
     """
     next_setup = self.branch_queue.popleft()
     traverser_for_color = self.branch_traverser[next_setup.traversal_color]
     traversed_branch = traverser_for_color.traverse_from(
         next_setup.start_string,
         orientation=next_setup.orientation,
         parent_graph=self.graph,
     )

     graph_interactor = Interactor.from_graph(self.graph)
     graph_interactor.compose_in_graph(traversed_branch.graph)

     self._connect_branch_to_parent_graph(traversed_branch, next_setup)
     self._link_branch_and_queue_neighbor_traversals(traversed_branch)
Exemplo n.º 2
0
 def _traverse_from_each_kmer_in(self, kmer_generator):
     """Traverse from every k-mer yielded by ``kmer_generator``.

     Each traversal result is composed into ``self.graph``. A ``KeyError``
     raised while traversing or composing is ignored and the next k-mer is
     tried.

     :param kmer_generator: iterable of start k-mers
     :return: ``self``
     :raises Exception: if ``self.max_nodes`` is truthy and the graph holds
         more than that many nodes after processing a k-mer
     """
     for start_kmer in kmer_generator:
         try:
             # The interactor is created before the traversal runs, matching
             # the evaluation order of the chained expression it replaces.
             graph_interactor = Interactor.from_graph(self.graph)
             traversed_graph = self._traverse_from(start_kmer).graph
             graph_interactor.compose_in_graph(traversed_graph)
             self.log_graph_size()
         except KeyError:
             pass

         if self.max_nodes:
             if len(self.graph) > self.max_nodes:
                 message = ("Terminating contig traversal after kmer {}"
                            " because max node limit is reached").format(start_kmer)
                 raise Exception(message)
     return self
Exemplo n.º 3
0
    def test_in_y_graph_finds_two_paths_of_revcomp(self):
        """Seeding the four-kmer graph with 'AAG' yields two simple paths."""
        # given
        graph_builder = get_cortex_builder()
        for kmer_line in ('CGC 1 .......T',
                          'AGC 1 a....CG.',
                          'AAG 1 .....C..',
                          'GCC 1 a.......'):
            graph_builder.with_kmer(kmer_line)
        raw_graph = graph_builder.build()

        consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent(['AAG']).graph

        # when
        found_paths = list(Interactor(consistent_graph).all_simple_paths())

        # then
        sequences = sorted(str(path.seq) for path in found_paths)
        assert ['AAGCC', 'AAGCG'] == sequences
Exemplo n.º 4
0
    def test_two_linked_kmers_are_jsonifiable(self):
        """A two-color, two-kmer graph serializes to JSON with colors and sample names."""
        # given
        expected_colors = [0, 1]
        sample_names = ['samp1', 'samp2']

        graph_builder = builder.Graph()
        graph_builder = graph_builder.with_kmer_size(3)
        graph_builder = graph_builder.with_num_colors(2)
        graph_builder = graph_builder.with_color_names(*sample_names)
        graph_builder = graph_builder.with_kmer('AAA 1 1 .....C.. ........')
        graph_builder = graph_builder.with_kmer('AAC 1 0 a....... ........')

        loaded_graph = load_cortex_graph(graph_builder.build())
        consistent_graph = Interactor(loaded_graph).make_graph_nodes_consistent(
            seed_kmer_strings=['GTT']).graph
        serializer = cortexpy.graph.serializer.serializer.Serializer(consistent_graph)
        kmer_json = serializer.to_json()

        # when
        expect = expectation.JsonGraph.from_string(kmer_json)

        # then
        parsed = json.loads(kmer_json)  # does not raise
        graph_section = parsed['graph']
        assert graph_section['colors'] == expected_colors
        assert graph_section['sample_names'] == sample_names

        expect.has_n_nodes(2)
        expect.has_n_edges(1)
Exemplo n.º 5
0
 def build(self):
     """Build the graph, make it consistent and wrap it in a ``UnitigFinder``.

     When ``self.seed_kmers`` is ``None`` it is set to a one-element list
     holding the first node obtained by iterating the built graph.

     :return: the object produced by ``UnitigFinder.from_graph``
     """
     built_graph = self.builder.build()
     if self.seed_kmers is None:
         first_node = next(iter(built_graph))
         self.seed_kmers = [first_node]

     consistent = Interactor.from_graph(built_graph) \
         .make_graph_nodes_consistent(self.seed_kmers)
     return UnitigFinder.from_graph(
         consistent.graph,
         colors=list(self.builder.colors),
         test_coverage=self.test_coverage,
     )
Exemplo n.º 6
0
def assemble(argv):
    """Run the ``cortexpy assemble`` command.

    Parses ``argv``, traverses the cortex graph from the start sequences
    FASTA using an ``Engine`` in both orientations, makes the traversed graph
    consistent with the k-mers returned by ``kmerize_fasta`` and writes every
    simple path as a FASTA record.

    Output goes to ``sys.stdout`` when ``args.out`` is ``'-'``, otherwise to
    the file named by ``args.out`` (``args.out`` is not defined here; it
    presumably comes from the shared parser). Files opened by this function
    are closed before it returns, including when an exception is raised.

    :param argv: list of command-line arguments to parse
    """
    import argparse
    from cortexpy.command.shared import get_shared_argparse
    shared_parser = get_shared_argparse()

    parser = argparse.ArgumentParser(prog='cortexpy assemble', parents=[shared_parser], description="""
    Assemble all possible transcripts in <graph> from all k-mers in <start-sequences> and print the
    resulting transcripts as a FASTA to stdout. All specified colors are traversed and collapsed
    before output.
    """)
    parser.add_argument('graph', help='cortex graph')
    parser.add_argument('start_sequences_fasta', help='FASTA file with sequences to start from')
    parser.add_argument('--color', type=int, help='Restrict view to single color')
    parser.add_argument('--max-nodes', type=int, default=1000,
                        help='Maximum number of nodes to traverse [default: %(default)s]')
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    # Called for its logging-configuration effect; the returned logger is not
    # needed in this command.
    configure_logging_from_args_and_get_logger(args, 'cortexpy.assemble')

    import sys
    from contextlib import ExitStack
    from Bio import SeqIO
    from cortexpy.utils import kmerize_fasta
    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.parser.random_access import RandomAccess
    from cortexpy.constants import EngineTraversalOrientation
    from cortexpy.graph.traversal.engine import Engine

    # The stack owns only the files opened here; sys.stdout is never closed.
    with ExitStack() as open_files:
        if args.out == '-':
            output = sys.stdout
        else:
            output = open_files.enter_context(open(args.out, 'wt'))

        # The graph file stays open until all paths are written, since the
        # traversal and the path generator below are consumed inside this block.
        graph_handle = open_files.enter_context(open(args.graph, 'rb'))
        random_access = RandomAccess(graph_handle)
        if args.color is None:
            colors = list(range(random_access.num_colors))
        else:
            colors = [args.color]
        traverser = Engine(
            random_access,
            traversal_colors=colors,
            orientation=EngineTraversalOrientation.both,
            max_nodes=args.max_nodes,
        )
        traverser.traverse_from_each_kmer_in_fasta(args.start_sequences_fasta)
        kmers = kmerize_fasta(args.start_sequences_fasta, traverser.ra_parser.kmer_size)
        interactor = Interactor.from_graph(traverser.graph).make_graph_nodes_consistent(
            seed_kmer_strings=kmers)

        seq_record_generator = interactor.all_simple_paths()

        SeqIO.write(seq_record_generator, output, 'fasta')
Exemplo n.º 7
0
 def run(self):
     """Execute the configured command and return the resulting graph.

     With ``self.retrieve`` set, a ``ContigRetriever`` is stored on
     ``self.retriever`` and asked for the k-mer graph of
     ``self.contig_to_retrieve``. Otherwise, with ``self.traverse`` set, the
     graph is traversed from ``self.traversal_start_kmer`` and made
     consistent with that k-mer.

     :raises Exception: if neither ``retrieve`` nor ``traverse`` is set
     """
     if self.retrieve:
         self.retriever = ContigRetriever(self.graph_builder.build())
         return self.retriever.get_kmer_graph(self.contig_to_retrieve)

     if not self.traverse:
         raise Exception("Need to load a command")

     random_access = RandomAccess(self.graph_builder.build())
     engine = Engine(random_access, traversal_colors=self.traversal_colors)
     traversed_graph = engine.traverse_from(self.traversal_start_kmer).graph
     consistent = Interactor(traversed_graph).make_graph_nodes_consistent(
         [self.traversal_start_kmer])
     return consistent.graph
Exemplo n.º 8
0
    def test_in_y_graph_finds_two_paths(self):
        """Two paths converging on 'AAA' produce two simple-path sequences."""
        # given
        graph_builder = CortexGraphBuilder()
        for first_kmer in ('CAA', 'TAA'):
            graph_builder.add_path(first_kmer, 'AAA')
        graph_builder.make_consistent('AAA')
        graph = graph_builder.build()

        # when
        found_paths = list(Interactor(graph).all_simple_paths())

        # then
        sequences = {str(path.seq) for path in found_paths}
        assert {'CAAA', 'TAAA'} == sequences
Exemplo n.º 9
0
    def test_emits_one_single_color_unitig(self):
        """A single color-0 edge AAA -> AAT yields exactly the path 'AAAT'."""
        # given
        graph_builder = CortexGraphBuilder()
        graph_builder.with_colors(0)
        graph_builder.add_edge('AAA', 'AAT', color=0)
        graph_builder.make_consistent('AAA')
        single_edge_graph = graph_builder.build()

        # when
        found_paths = list(Interactor(single_edge_graph).all_simple_paths())

        # then
        sequences = [str(path.seq) for path in found_paths]
        assert ['AAAT'] == sequences
Exemplo n.º 10
0
def test_revcomps_a_kmer():
    """Seeding a graph holding 'AAA' with 'TTT' leaves a single node 'TTT'."""
    # given
    graph_builder = get_cortex_builder()
    graph_builder.with_kmer('AAA 1 ........')
    raw_graph = graph_builder.build()

    # when
    consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent({'TTT'}).graph
    expect = KmerGraphExpectation(consistent_graph)

    # then
    expect.has_node('TTT')
    expect.has_n_nodes(1)
Exemplo n.º 11
0
    def test_follows_two_colors_with_no_color_specified(self):
        """Edges of color 0 and color 1 are joined into the one path 'AAATA'."""
        # given
        graph_builder = CortexGraphBuilder()
        graph_builder.with_colors(0, 1)
        graph_builder.add_edge('AAA', 'AAT', color=0)
        graph_builder.add_edge('AAT', 'ATA', color=1)
        graph_builder.make_consistent('AAA')
        two_color_graph = graph_builder.build()

        # when
        found_paths = list(Interactor(two_color_graph).all_simple_paths())

        # then
        sequences = {str(path.seq) for path in found_paths}
        assert {'AAATA'} == sequences
Exemplo n.º 12
0
    def test_with_link_for_y_graph_emits_one_path(self):
        """With a link on 'AAA', only the path 'AAAC' is emitted from the fork."""
        # given
        graph_builder = CortexGraphBuilder()
        graph_builder.with_kmer_size(3)
        for last_kmer in ('AAC', 'AAT'):
            graph_builder.add_path('AAA', last_kmer)
        graph_builder.make_consistent('AAA')
        forked_graph = graph_builder.build()

        links_builder = LinksBuilder().with_link_for_kmer('F 1 1 C', 'AAA')
        links = links_builder.build()

        # when
        found_paths = list(Interactor(forked_graph).all_simple_paths(links=links))

        # then
        sequences = [str(path.seq) for path in found_paths]
        assert ['AAAC'] == sequences
Exemplo n.º 13
0
def test_revcomps_path():
    """Each seed orients the two-kmer graph to the expected pair of nodes."""
    # given
    graph_builder = get_cortex_builder()
    graph_builder.with_kmer('CGC 1 .......T')
    graph_builder.with_kmer('AGC 1 ......G.')

    raw_graph = graph_builder.build()

    forward_nodes = ['CGC', 'GCT']
    reverse_nodes = ['AGC', 'GCG']
    nodes_by_seed = {
        'CGC': forward_nodes,
        'GCT': forward_nodes,
        'AGC': reverse_nodes,
        'GCG': reverse_nodes,
    }
    for seed, expected_nodes in nodes_by_seed.items():
        # when
        consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph
        expect = KmerGraphExpectation(consistent_graph)

        # then
        expect.has_nodes(*expected_nodes)
        expect.has_n_nodes(2)
Exemplo n.º 14
0
    def test_bubble_and_y_with_two_links_returns_two_transcripts(self):
        """Links on 'AAA' and 'CCC' reduce the traversal to two transcripts."""
        # given
        links_builder = LinksBuilder()
        links_builder = links_builder.with_link_for_kmer('F 2 1 CT', 'AAA')
        links_builder = links_builder.with_link_for_kmer('F 1 1 A', 'CCC')
        links = links_builder.build()

        graph_builder = CortexGraphBuilder()
        graph_builder.with_kmer_size(3)
        graph_builder.add_path('AAA', 'AAC', 'ACC', 'CCC', 'CCA')
        graph_builder.add_path('AAA', 'AAG', 'AGC', 'GCC', 'CCC', 'CCT')
        graph_builder.make_consistent('AAA')
        bubble_graph = graph_builder.build()

        # when
        found_paths = list(Interactor(bubble_graph).all_simple_paths(links=links))

        # then
        sequences = sorted(str(path.seq) for path in found_paths)
        assert ['AAACCCA', 'AAACCCT'] == sequences
Exemplo n.º 15
0
def prune(argv):
    """Run the ``cortexpy prune`` command.

    Parses ``argv``, loads a cortex graph from the given path (or from stdin
    when the path is ``'-'``), prunes tips via
    ``Interactor.prune_tips_less_than`` and dumps the resulting graph in
    cortex format.

    Output goes to ``sys.stdout.buffer`` when ``args.out`` is ``'-'``,
    otherwise to the file named by ``args.out`` (``args.out`` is not defined
    here; it presumably comes from the shared parser). Files opened by this
    function are closed, and therefore flushed, before it returns.

    :param argv: list of command-line arguments to parse
    :return: ``1`` if ``--remove-tips`` is smaller than 2, otherwise ``None``
    """
    import argparse
    from .shared import get_shared_argparse
    shared_parser = get_shared_argparse()
    parser = argparse.ArgumentParser('cortexpy prune', parents=[shared_parser])
    parser.add_argument('-t',
                        '--remove-tips',
                        required=True,
                        type=int,
                        help='Remove tips shorter than this number')
    parser.add_argument('graph',
                        help="Input cortexpy graph.  '-' reads from stdin")
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    logger = configure_logging_from_args_and_get_logger(args, 'cortexpy.prune')

    if args.remove_tips < 2:
        logger.error('--remove-tips (%s) needs to be greater than 1',
                     args.remove_tips)
        return 1

    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.parser.streaming import load_cortex_graph
    from cortexpy.graph.serializer.kmer import dump_colored_de_bruijn_graph_to_cortex

    import sys
    from contextlib import ExitStack

    # The stack owns only the files opened here; the standard streams are
    # never closed.
    with ExitStack() as open_files:
        if args.out == '-':
            output = sys.stdout.buffer
        else:
            output = open_files.enter_context(open(args.out, 'wb'))

        logger.info('Loading de Bruijn graph')
        if args.graph == '-':
            graph = load_cortex_graph(sys.stdin.buffer)
        else:
            # Kept open until the dump finishes in case the loaded graph
            # still reads from the stream.
            graph_handle = open_files.enter_context(open(args.graph, 'rb'))
            graph = load_cortex_graph(graph_handle)

        logger.info('Loaded %s kmers', len(graph))

        graph = Interactor(graph).prune_tips_less_than(args.remove_tips).graph
        dump_colored_de_bruijn_graph_to_cortex(graph, output)
Exemplo n.º 16
0
def test_keys_y_graph():
    """Every seed of a node group orients the graph to that same group."""
    # given
    graph_builder = get_cortex_builder()
    for kmer_line in ('CGC 1 .......T',
                      'AGC 1 a....CG.',
                      'AAG 1 .....C..',
                      'GCC 1 a.......'):
        graph_builder.with_kmer(kmer_line)

    node_groups = (
        ['CGC', 'GCT', 'CTT', 'GGC'],
        ['AAG', 'AGC', 'GCG', 'GCC'],
    )
    cases = [(seed, group) for group in node_groups for seed in group]
    for seed, expected_nodes in cases:
        # A fresh graph is built for every seed.
        raw_graph = graph_builder.build()

        # when
        consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph
        expect = KmerGraphExpectation(consistent_graph)

        # then
        expect.has_nodes(*expected_nodes)
Exemplo n.º 17
0
def test_revcomps_many_kmers(data, num_kmers, kmer_size):
    """Seeding with drawn k-mers makes each of them appear as a graph node."""
    # given
    drawn_by_lexlo = {}
    for _ in range(num_kmers):
        drawn = data.draw(kmer_strings(min_size=kmer_size, max_size=kmer_size))
        # Keyed by lexlo(drawn), so a later draw with the same key replaces
        # the earlier one.
        drawn_by_lexlo[lexlo(drawn)] = drawn

    graph_builder = get_cortex_builder()
    for lexlo_kmer in drawn_by_lexlo:
        graph_builder.with_kmer('{} 1 ........'.format(lexlo_kmer))
    raw_graph = graph_builder.build()

    # when
    seeds = set(drawn_by_lexlo.values())
    consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent(seeds).graph
    expect = KmerGraphExpectation(consistent_graph)

    # then
    for drawn in drawn_by_lexlo.values():
        expect.has_node(drawn)
    expect.has_n_nodes(len(drawn_by_lexlo))
    def test_single_kmer_revcomp_seed(self, seed):
        """The seed's in- and out-edges depend on which orientation it has."""
        # given
        graph_builder = get_cortex_builder()
        graph_builder.with_kmer('AAA 1 ......G.')
        graph_builder.with_kmer('AAG 1 a.......')
        raw_graph = graph_builder.build()

        # when
        consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph

        # then
        if seed == 'AAA':
            expected_in, expected_out = [], [('AAA', 'AAG')]
        else:
            expected_in, expected_out = [('CTT', 'TTT')], []
        assert expected_in == list(consistent_graph.in_edges(seed))
        assert expected_out == list(consistent_graph.out_edges(seed))
    def test_gets_correct_neighbors_of_kmer(self):
        """Adjacency, succ, pred and edge views agree for 'ACT' and 'CTG'."""
        # given
        graph_builder = get_cortex_builder()
        for kmer_line in ('AAC 1 .......T',
                          'ACT 1 a.....G.',
                          'CAG 1 .......T'):
            graph_builder.with_kmer(kmer_line)
        raw_graph = graph_builder.build()
        seed = 'AAC'

        # when
        graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph

        # then
        expectations = [
            # (node, expected successors, expected predecessors)
            ('ACT', ['CTG'], ['AAC']),
            ('CTG', [], ['ACT']),
        ]
        for node, successors, predecessors in expectations:
            assert successors == list(graph[node])
            assert successors == list(graph.succ[node])
            assert predecessors == list(graph.pred[node])
            assert [(node, nxt) for nxt in successors] == list(graph.out_edges(node))
            assert [(prev, node) for prev in predecessors] == list(graph.in_edges(node))
Exemplo n.º 20
0
def traverse(argv):
    """Run the ``cortexpy traverse`` command.

    Parses ``argv``, loads a cortex graph (from stdin when the path is
    ``'-'``) and makes it consistent: with the k-mers derived from
    ``--seed-strings`` when given, otherwise without seeds. With
    ``--to-json`` the graph is printed as JSON to stdout and the function
    returns. Otherwise all simple paths are written as FASTA records to
    ``sys.stdout`` or to the file named by ``args.out`` (``args.out`` is not
    defined here; it presumably comes from the shared parser).

    Files opened by this function are closed before it returns.

    :param argv: list of command-line arguments to parse
    :return: ``None`` on success, ``1`` if ``--extra-start-kmer`` is not in
        the loaded graph, or ``EXIT_CODES['MAX_PATH_EXCEEDED']`` when an
        ``IndexError`` is raised while writing records (presumably raised by
        ``raise_after_nth_element`` once ``--max-paths`` is exceeded)
    """
    import argparse
    from cortexpy.command.shared import get_shared_argparse
    shared_parser = get_shared_argparse()

    parser = argparse.ArgumentParser(prog='cortexpy traverse',
                                     parents=[shared_parser],
                                     description="""
        Traverse all simple paths between all sources and targets of an input graph.

        Input is a cortex graph. Output is a FASTA.

        This tool also allows the creation of a JSON representation of a CORTEX graph that is consistent 
        with seed strings by using the --to-json and --seed-strings arguments.

        If a links file is supplied, then branches consistent with the links will be preferred in
        the traversal. 
        """)
    parser.add_argument('graph',
                        help="cortex graph. Slurp graph from stdin is '-'.")
    parser.add_argument('--to-json', action='store_true')
    parser.add_argument(
        '--seed-strings',
        nargs='*',
        default=[],
        help="Strings with seed kmers from which to start contig traversal. "
        "Multiple strings can be specified.")
    parser.add_argument('--color',
                        type=int,
                        help='Restrict view to single color')
    parser.add_argument('--max-paths',
                        type=int,
                        default=0,
                        help='Return exit status 64 if more than this '
                        'number of paths are encountered. '
                        '0 turns off this check.')
    parser.add_argument(
        '--graph-index',
        type=int,
        default=0,
        help='Graph index to be added to description of all output paths')
    parser.add_argument(
        '--extra-start-kmer',
        help='Disconnect this k-mer from incoming k-mers before '
        'candidate transcript creation. '
        'This argument may fail if not used together with --seed-strings.')
    parser.add_argument('--links-file',
                        help='gzipped Mccortex-style links file for graph')
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    logger = configure_logging_from_args_and_get_logger(
        args, 'cortexpy.traverse')

    import sys
    import gzip
    from contextlib import ExitStack
    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.serializer.serializer import Serializer
    from cortexpy.graph.parser.streaming import load_cortex_graph
    from cortexpy.links import Links
    from . import get_exit_code_yaml_path
    import yaml

    with open(get_exit_code_yaml_path(), 'rt') as exit_code_handle:
        EXIT_CODES = yaml.load(exit_code_handle, Loader=yaml.FullLoader)

    # The stack owns only the files opened here; the standard streams are
    # never closed.
    with ExitStack() as open_files:
        if args.out == '-':
            output = sys.stdout
        else:
            output = open_files.enter_context(open(args.out, 'wt'))

        logger.info('Loading graph: %s', args.graph)
        if args.graph == '-':
            graph = load_cortex_graph(sys.stdin.buffer)
        else:
            # Kept open until the function finishes in case the loaded graph
            # still reads from the stream.
            graph_handle = open_files.enter_context(open(args.graph, 'rb'))
            graph = load_cortex_graph(graph_handle)
        logger.info('Loaded %s kmers', len(graph))

        consistent_graph = None
        if args.seed_strings:
            seed_kmer_strings = strings_to_kmer_strings(args.seed_strings,
                                                        graph.graph['kmer_size'])
            logger.info(
                'Making graph consistent with %s kmers from --seed-strings',
                len(seed_kmer_strings))
            consistent_graph = Interactor(graph) \
                .make_graph_nodes_consistent(seed_kmer_strings) \
                .graph

        # Compare against None rather than relying on truthiness: len(graph)
        # is used above, so an empty graph object would evaluate as false and
        # be mistaken for "not computed yet".
        if args.to_json:
            logger.info('Writing JSON representation of graph to STDOUT')
            if consistent_graph is not None:
                graph = consistent_graph
            print(Serializer(graph).to_json())
            return

        if consistent_graph is None:
            logger.info('Making graph consistent')
            consistent_graph = Interactor.from_graph(graph) \
                .make_graph_nodes_consistent() \
                .graph

        if args.extra_start_kmer:
            # NOTE(review): membership is tested against the graph as loaded,
            # not against consistent_graph, which is the one traversed below.
            # Confirm this is intended.
            if args.extra_start_kmer not in graph:
                logger.error('Could not find extra start kmer (%s) in graph',
                             args.extra_start_kmer)
                return 1

        links = None
        if args.links_file is not None:
            logger.info('Loading links file %s', args.links_file)
            links_handle = open_files.enter_context(
                gzip.open(args.links_file, 'rb'))
            links = Links.from_binary_stream(links_handle)
        seq_record_generator = Interactor(consistent_graph) \
            .all_simple_paths(args.extra_start_kmer, links=links)
        seq_record_generator = annotated_seq_records(seq_record_generator,
                                                     graph_idx=args.graph_index)
        if args.max_paths > 0:
            logger.info('Exiting after element %s', args.max_paths)
            seq_record_generator = raise_after_nth_element(seq_record_generator,
                                                           args.max_paths)
        logger.info('Writing seq records to %s', args.out)
        try:
            for record in seq_record_generator:
                output.write(record.format('fasta'))
        except IndexError:
            logger.error('Max paths (%s) exceeded', args.max_paths)
            return EXIT_CODES['MAX_PATH_EXCEEDED']
Exemplo n.º 21
0
 def build(self):
     """Return the graph, first made consistent with any configured seeds.

     When ``self.consistent_seeds`` is falsy the stored graph is returned
     unchanged; otherwise ``self.graph`` is replaced by its consistent
     version before being returned.
     """
     if not self.consistent_seeds:
         return self.graph

     interactor = Interactor.from_graph(self.graph)
     self.graph = interactor.make_graph_nodes_consistent(self.consistent_seeds).graph
     return self.graph
Exemplo n.º 22
0
 def _make_kmer_graph_consistent(self):
     """Replace ``self.graph`` with its node-consistent version.

     Only acts when ``self.graph`` is a ``CortexDiGraph``; no seed k-mers
     are passed to ``make_graph_nodes_consistent``.
     """
     # NOTE(review): this block sits at the end of the visible source and may
     # continue beyond it; only the visible lines are described.
     if isinstance(self.graph, CortexDiGraph):
         self.graph = Interactor(self.graph).make_graph_nodes_consistent().graph