def test_parses_a_graph(self, tmpdir):
    """Build a k=3 graph from 'ACGTT' and check every parsed kmer record.

    The kmer string, coverage and edges of each parsed record are compared
    against the expected records, and the number of parsed records must
    equal the number of expected records.
    """
    # given
    kmer_size = 3
    mc_builder = (builder.Mccortex()
                  .with_dna_sequence('ACGTT')
                  .with_kmer_size(kmer_size))

    expected_kmers = [
        KmerRecord('AAC', (1, ), [as_edge_set('......G.')]),
        KmerRecord('ACG', (2, ), [as_edge_set('a......T')]),
    ]

    # when
    output_graph = mc_builder.build(tmpdir)
    # Consume the generator inside the ``with`` block so the file handle is
    # closed deterministically, even when an assertion below fails.
    with open(output_graph, 'rb') as graph_handle:
        actual_kmers = list(kmer_generator_from_stream(graph_handle))

    # then
    for kmer in actual_kmers:
        logger.info(kmer)
    for expected_kmer, kmer in zip(expected_kmers, actual_kmers):
        assert kmer.kmer == expected_kmer.kmer
        assert kmer.coverage == expected_kmer.coverage
        assert kmer.edges == expected_kmer.edges
    # zip() stops at the shorter sequence, so the counts are checked too.
    assert len(actual_kmers) == len(expected_kmers)
def test_raises_on_non_lexlo_kmer(self):
    """Assigning the reverse complement to ``kmer.kmer`` must raise.

    A ``Kmer`` is built from the byte string of an 'AAA' record; setting
    its ``kmer`` attribute is expected to raise ``AttributeError``.
    """
    # when
    record = KmerRecord('AAA', [1], [edge_set.empty()])
    raw_data = KmerData(record.to_bytestring(), kmer_size=3, num_colors=1)
    kmer = Kmer.from_kmer_data(raw_data)

    # then
    with pytest.raises(AttributeError):
        kmer.kmer = reverse_complement(kmer.kmer)
def with_kmer_record(self, kmer):
    """Distribute one multi-color kmer record over the per-graph builders.

    For each entry of ``self.n_colors_per_graph`` a new ``KmerRecord`` is
    created with the same kmer string and the matching consecutive slice
    of ``kmer.coverage`` and ``kmer.edges``, and handed to the builder at
    the same index in ``self.graph_builders``.

    Returns ``self`` so calls can be chained.
    """
    first_color = 0
    for builder_idx, color_count in enumerate(self.n_colors_per_graph):
        # Colors [first_color, first_color + color_count) go to this graph.
        color_window = slice(first_color, first_color + color_count)
        record_for_graph = KmerRecord(kmer.kmer,
                                      kmer.coverage[color_window],
                                      kmer.edges[color_window])
        self.graph_builders[builder_idx].with_kmer_record(record_for_graph)
        first_color += color_count
    return self
def test_gets_aaa_for_ttt_query(self, RAClass):
    """Looking up 'AAA' or 'TTT' must both return the stored 'AAA' kmer."""
    # given
    graph_builder = builder.Graph().with_kmer_size(3).with_num_colors(1)
    expected_kmer = KmerRecord('AAA', [1], [as_edge_set('........')])
    graph_builder.with_kmer_record(expected_kmer)

    cg = RAClass(graph_builder.build())

    # when / then
    forward_hit = cg.get_kmer_for_string('AAA')
    assert expected_kmer.kmer == forward_hit.kmer

    revcomp_hit = cg.get_kmer_for_string('TTT')
    assert expected_kmer.kmer == revcomp_hit.kmer
def test_gets_aaa(self):
    """Every kmer yielded by ``cg.values()`` must equal the 'AAA' record.

    NOTE(review): the loop body never runs if ``cg.values()`` is empty, so
    the test would pass vacuously in that case.
    """
    # given
    graph_builder = builder.Graph()
    graph_builder.with_kmer_size(3)
    graph_builder.with_num_colors(1)

    expected_kmer = KmerRecord('AAA', (1, ), [as_edge_set('........')])
    graph_builder.with_kmer_record(expected_kmer)

    built_graph = graph_builder.build()
    cg = self.RAClass(built_graph)

    # when
    for actual_kmer in cg.values():
        assert expected_kmer.kmer == actual_kmer.kmer
        coverage_matches = expected_kmer.coverage == actual_kmer.coverage
        assert np.all(coverage_matches)
        assert expected_kmer.edges == actual_kmer.edges
def kmer_records(draw, kmer_size, num_colors, kmer_strings=dna_sequences):
    """Draw a random ``KmerRecord`` with the given kmer size and color count.

    Values are drawn in this order: the kmer string, the per-color
    coverage, and the per-color edge flags.

    * The kmer string comes from ``kmer_strings`` with both ``min_size``
      and ``max_size`` set to ``kmer_size``.
    * Coverage is a tuple of ``num_colors`` integers between 1 and
      ``MAX_UINT``.
    * Edges are one ``EdgeSet`` per color, built from eight 0/1 flags: the
      first four as drawn, followed by the last four in reversed order.

    NOTE(review): ``draw`` and ``s`` look like the Hypothesis composite
    ``draw`` callable and ``hypothesis.strategies`` -- confirm against the
    file's imports and decorator.
    """
    kmer_strategy = kmer_strings(min_size=kmer_size, max_size=kmer_size)
    kmer = draw(kmer_strategy)

    coverage_strategy = s.lists(
        s.integers(min_value=1, max_value=MAX_UINT),
        min_size=num_colors,
        max_size=num_colors)
    coverage = tuple(draw(coverage_strategy))

    flags_for_one_color = s.lists(
        s.integers(min_value=0, max_value=1), min_size=8, max_size=8)
    flags_strategy = s.lists(
        flags_for_one_color, min_size=num_colors, max_size=num_colors)
    flag_rows = np.array(draw(flags_strategy), dtype=np.uint8)

    edges = []
    for flag_row in flag_rows:
        leading_half = flag_row[:4]
        # Last four flags, taken in reversed order.
        trailing_half = flag_row[::-1][:4]
        edges.append(EdgeSet(np.concatenate((leading_half, trailing_half))))

    return KmerRecord(kmer, coverage, edges)
def test_retrieves_kmer_by_random_access(self, tmpdir):
    """Look up 'AAC' through ``RandomAccess`` in a graph built from 'ACGTTT'.

    The retrieved record's kmer string, coverage and edges must match the
    expected ``KmerRecord``.
    """
    # given
    kmer_size = 3
    output_graph = (builder.Mccortex()
                    .with_dna_sequence('ACGTTT')
                    .with_kmer_size(kmer_size)
                    .build(tmpdir))

    expected = KmerRecord('AAC', (1, ), [as_edge_set('A.....G.')])

    # The handle is managed by ``with`` so it is closed even if an assertion
    # fails. All access to ``cg`` and ``actual`` stays inside the block in
    # case either reads from the stream lazily.
    with open(output_graph, 'rb') as graph_handle:
        cg = RandomAccess(graph_handle)

        # when
        actual = cg['AAC']

        # then
        logger.info(actual)

        assert actual.kmer == expected.kmer
        assert actual.coverage == expected.coverage
        assert actual.edges == expected.edges
def with_kmer(self, kmer_string, color_coverage=1, edges='........',
              repeat_color_edges_n_times=None):
    """Add one kmer to the builder and return ``with_kmer_record``'s result.

    ``kmer_string`` is either a bare kmer, or a record whose words are
    separated by single spaces: the kmer, then one coverage integer per
    color, then one edge string per color. In the record form the number
    of colors is derived from the word count, and ``color_coverage`` and
    ``edges`` are taken from the record instead of the arguments.

    Args:
        kmer_string: Bare kmer or space-separated record (see above).
        color_coverage: An int, or a sequence with one entry per color.
        edges: An edge string, or a sequence with one entry per color.
        repeat_color_edges_n_times: If truthy, and ``edges`` is a single
            string and ``color_coverage`` a single int, both are repeated
            this many times and the number of colors is set to this value.

    Raises:
        AssertionError: If the record form contains consecutive spaces or
            an odd number of words after the kmer, or if the kmer length
            differs from an already-set kmer size.
        Exception: If the reverse complement of the kmer sorts before the
            kmer itself.
    """
    if ' ' in kmer_string:
        # Consecutive spaces would make split(' ') yield empty words and
        # corrupt the word counts below. The previous check tested for a
        # single space, which always failed inside this branch.
        assert '  ' not in kmer_string
        num_words = kmer_string.count(' ')
        kmer_words = kmer_string.split(' ')
        kmer_string = kmer_words.pop(0)
        # Remaining words are coverage values followed by edge strings,
        # one of each per color.
        assert num_words % 2 == 0
        num_colors = num_words // 2
        self.with_num_colors(num_colors)
        color_coverage = [int(word) for word in kmer_words[0:num_colors]]
        edges = kmer_words[num_colors:]

    revcomp = str(Seq(kmer_string).reverse_complement())
    if revcomp < kmer_string:
        raise Exception(
            "kmer_string '{}' is not lexlo. Please fix.".format(
                kmer_string))

    if (repeat_color_edges_n_times and isinstance(edges, str)
            and isinstance(color_coverage, int)):
        self.with_num_colors(repeat_color_edges_n_times)
        edges = [edges for _ in range(repeat_color_edges_n_times)]
        color_coverage = [
            color_coverage for _ in range(repeat_color_edges_n_times)
        ]

    # Normalise scalar arguments to one-element per-color lists.
    if isinstance(edges, str):
        edges = [edges]
    if isinstance(color_coverage, int):
        color_coverage = [color_coverage]

    if self.kmer_size_is_set:
        assert self.kmer_size == len(kmer_string)
    else:
        self.with_kmer_size(len(kmer_string))

    return self.with_kmer_record(
        KmerRecord(kmer_string, color_coverage,
                   tuple(as_edge_set(e) for e in edges)))
def test_parses_a_graph_with_kmer_size_32(self, tmpdir):
    """Build a graph from a poly-A contig of ``kmer_size`` bases and parse it.

    The contig is exactly one kmer long, so a single record with that kmer,
    coverage 1 and no edges is expected.

    NOTE(review): the test name says 32 but ``kmer_size`` is 33 --
    presumably the smallest odd size above 32; confirm the intent.
    """
    # given
    kmer_size = 33
    contig = 'A' * kmer_size
    mc_builder = (builder.Mccortex()
                  .with_dna_sequence(contig)
                  .with_kmer_size(kmer_size))

    expected_kmers = [
        KmerRecord(contig, (1, ), [as_edge_set('........')]),
    ]

    # when
    output_graph = mc_builder.build(tmpdir)
    # Consume the generator inside the ``with`` block so the file handle is
    # closed deterministically, even when an assertion below fails.
    with open(output_graph, 'rb') as graph_handle:
        actual_kmers = list(kmer_generator_from_stream(graph_handle))

    # then
    for kmer in actual_kmers:
        logger.info(kmer)
    for expected_kmer, kmer in zip(expected_kmers, actual_kmers):
        assert kmer.kmer == expected_kmer.kmer
        assert kmer.coverage == expected_kmer.coverage
        assert kmer.edges == expected_kmer.edges
    # zip() stops at the shorter sequence; without this check the test
    # would pass even if no kmers were parsed.
    assert len(actual_kmers) == len(expected_kmers)