def test_1_no_symbol_provided(self):
    """A provider built from a default MissingBase must return Base('?')."""
    provider = missings.ConstBaseProvider(missings.MissingBase())

    returned = provider.get_base(msa.SequenceID('s'), 0)

    self.assertEqual(graph.Base('?'), returned)
def raise_error_if_unequal(self,
                           sequence_id: msa.SequenceID,
                           expected_sequence: str,
                           fasta_provider: missings.FromFile) -> None:
    """Assert that the provider returns every symbol of expected_sequence.

    Args:
        sequence_id: ID passed to the provider for every lookup.
        expected_sequence: Symbols expected at positions 0, 1, 2, ...
        fasta_provider: Provider whose get_base results are checked.
    """
    for position, symbol in enumerate(expected_sequence):
        self.assertEqual(graph.Base(symbol),
                         fasta_provider.get_base(sequence_id, position))
def test_05_single_block_single_nucletodide(self):
    """Compare the poagraph built from the MAF with a single 'A' node.

    Each of the four sequences is expected to hold one path through node 0.
    """
    maf_path = self.maf_files_dir.joinpath(
        "test_5_single_block_single_nucletodide.maf")

    expected_nodes = [
        graph.Node(node_id=nid(0),
                   base=graph.Base('A'),
                   aligned_to=None,
                   block_id=bid(0))
    ]

    # (sequence id, group); every sequence has the single path [0].
    sequence_groups = [
        ('seq0', '1'),
        ('seq1', '1'),
        ('seq2', '2'),
        ('seq3', '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, [0])])],
            graph.SequenceMetadata({'group': group}))
        for name, group in sequence_groups
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def get_poagraph(maf: msa.Maf,
                 metadata: Optional[msa.MetadataCSV]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Convert a MAF multialignment into poagraph nodes and sequences.

    The alignment is walked block by block and column by column. Every
    distinct non-gap symbol found in a column becomes one node, and that
    node is added to each sequence carrying the symbol in the column.

    Args:
        maf: Multialignment file in MAF format.
        metadata: MetadataCSV, forwarded to the poagraph initialisation.

    Returns:
        Tuple (nodes, sequences) of poagraph elements.
    """
    alignment = [*AlignIO.parse(maf.filecontent, "maf")]
    nodes, sequences = _init_poagraph(alignment, metadata)

    # Both counters start at -1 so that the first increment yields 0.
    current_node_id = graph.NodeID(-1)
    column_id = graph.ColumnID(-1)

    for block_id, block in enumerate(alignment):
        global_logger.info(f"Processing block {block_id}...")

        for col in range(len(block[0].seq)):
            column_id += 1
            nucleotide_of = {msa.SequenceID(row.id): row[col] for row in block}

            # Distinct symbols of this column, gaps excluded, in sorted order.
            nodes_codes = sorted(set(nucleotide_of.values()) - {'-'})

            # IDs that the nodes of this column are about to receive.
            column_nodes_ids = [
                graph.NodeID(current_node_id + offset + 1)
                for offset in range(len(nodes_codes))
            ]

            for i, nucl in enumerate(nodes_codes):
                current_node_id += 1
                new_node = graph.Node(
                    node_id=current_node_id,
                    base=graph.Base(nucl),
                    aligned_to=_get_next_aligned_node_id(graph.NodeID(i),
                                                         column_nodes_ids),
                    column_id=graph.ColumnID(column_id),
                    block_id=graph.BlockID(block_id))
                nodes.append(new_node)

                for seq_id, nucleotide in nucleotide_of.items():
                    if nucleotide != nucl:
                        continue
                    sequences[seq_id] = _add_node_do_sequence(
                        sequence=sequences[seq_id],
                        node_id=current_node_id)

    return nodes, sequences
def test_08_reversed_block(self):
    """Compare the poagraph built from test_8_reversed_block.maf with the
    expected nodes and sequence paths."""
    maf_path = self.maf_files_dir.joinpath("test_8_reversed_block.maf")

    # (base, aligned_to); the position in the list is the node id.
    node_rows = [
        ('C', None),
        ('A', None),
        ('T', None),
        # next block is reversed because it was converted to dag
        ('G', None),
        ('G', None),
        ('A', 6),
        ('G', 5),
        ('A', None),
        ('G', None),
        ('T', None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [], '1'),
        ('seq1', [[0, 1, 3, 4, 5, 7, 8, 9]], '1'),
        ('seq2', [[0, 1, 2, 3, 4, 6, 7, 8, 9]], '2'),
        ('seq3', [[0, 1, 2, 3, 4, 6, 7, 9]], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_02_seq_starts_in_second_block(self):
    """Compare the poagraph built from test_2_seq_starts_in_second_block.maf
    with the expected nodes (spread over blocks 0-2) and sequence paths."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_seq_starts_in_second_block.maf")

    # (base, aligned_to, block id); the position in the list is the node id.
    node_rows = [
        ('C', None, 0),
        ('T', None, 0),
        ('G', None, 0),
        ('T', None, 1),
        ('G', 5, 2),
        ('T', 4, 2),
        ('A', None, 2),
        ('A', None, 2),
        ('C', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block))
        for idx, (symbol, partner, block) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [[1, 2, 3]], '1'),
        ('seq1', [[0, 5, 7]], '1'),
        ('seq2', [[3, 4, 6, 8]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def get_base(self, seq_id: msa.SequenceID, i: int) -> graph.Base:
    """Returns base at position i in sequence identified by seq_id.

    Args:
        seq_id: ID of sequence present in given fasta file.
        i: position of the base to check in fasta file.

    Returns:
        The base present in sequence seq_id at position i.

    Raises:
        FastaProviderException: If seq_id is not among the loaded
            sequences, or if i points past the last symbol of the
            sequence.
    """
    if seq_id not in self._sequences:
        raise FastaProviderException(f"Wrong sequence id: {seq_id}. ")

    # Valid positions are 0 .. len - 1, so i == len must be rejected here
    # as well; otherwise the indexing below fails with a bare IndexError.
    if i >= len(self._sequences[seq_id]):
        raise FastaProviderException(f"""Index {i} is too large for sequence {seq_id}.""")

    return graph.Base(self._sequences[seq_id][i])
def test_2_read_seqeunce_from_cache_instead_downloading(self):
    """Check that a base is served from a fasta file placed in the cache.

    A fresh ``.fastacache`` directory is created in the current working
    directory and a fake fasta for "seq1" is written into it before the
    provider is queried.
    """
    fasta_provider = missings.FromNCBI(use_cache=True)

    # Recreate the cache directory so that it holds only the fake file.
    cache_dir = pathtools.get_child_path(Path.cwd(), ".fastacache")
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
    cache_dir.mkdir()

    sequence_id = msa.SequenceID("seq1")
    fake_sequence = "foo"
    fasta_path = pathtools.get_child_path(cache_dir, f"{sequence_id}.fasta")
    with open(fasta_path, 'w') as fasta_file:
        fasta_file.write(f">{sequence_id} cached\n{fake_sequence}")

    # Position 2 of "foo" is "o".
    self.assertEqual(graph.Base("o"),
                     fasta_provider.get_base(sequence_id, 2))
def _process_block(build_state: _BuildState, block: DAGMaf.DAGMafNode):
    """Add the nodes of one DAGMaf block to the build state.

    Each alignment column contributes one node per distinct symbol code;
    every new node is attached to the sequences that carry that symbol in
    the column. Afterwards the block's outgoing edges and endings are
    handed to the dedicated helpers.

    Args:
        build_state: Mutable state of the poagraph being built.
        block: The DAGMaf block to process.
    """
    current_node_id = _get_max_node_id(build_state.nodes)
    block_width = len(block.alignment[0].seq)
    paths_join_info = _get_paths_join_info(block, build_state.free_edges)

    build_state.column_id = _get_max_column_id(build_state.nodes)
    for col in range(block_width):
        build_state.column_id += 1
        nucleotide_by_name = {
            MafSequenceID(row.id): row[col] for row in block.alignment
        }
        nodes_codes = _get_column_nucleotides_sorted_codes(nucleotide_by_name)

        # IDs that the nodes of this column are about to receive.
        column_nodes_ids = [
            current_node_id + offset + 1
            for offset, _ in enumerate(nodes_codes)
        ]

        for i, nucl in enumerate(nodes_codes):
            current_node_id += 1
            new_node = graph.Node(
                node_id=current_node_id,
                base=graph.Base(nucl),
                aligned_to=_get_next_aligned_node_id(i, column_nodes_ids),
                column_id=build_state.column_id,
                block_id=block.id)
            build_state.nodes += [new_node]

            for maf_seq_id, symbol in nucleotide_by_name.items():
                if symbol != nucl:
                    continue
                seq_id = msa.SequenceID(maf_seq_id)
                _add_node_to_sequence(build_state,
                                      seq_id,
                                      paths_join_info[seq_id],
                                      current_node_id)
                # Remember the node just added as the join point for seq_id.
                paths_join_info[seq_id] = current_node_id

    _add_block_out_edges_to_free_edges(build_state, block, paths_join_info)
    _manage_endings(build_state, block, paths_join_info)
def get_base(self, seq_id: msa.SequenceID, i: int) -> graph.Base:
    """Return the base at position i of the sequence known as seq_id.

    A sequence not yet held in memory is first loaded: from the disk
    cache when caching is enabled and the sequence is cached, otherwise
    by downloading it (and, with caching enabled, saving it to the cache).

    Args:
        seq_id: sequence_id handed to the cache and to the download helper.
        i: position of the base to check in the sequence.

    Returns:
        The base present in sequence seq_id at position i.
    """
    if seq_id in self._sequences.keys():
        return graph.Base(self._sequences[seq_id][i])

    sequence_is_cached = self._fasta_disk_cache.seq_is_cached(seq_id)
    if not self._use_cache:
        self._sequences[seq_id] = self._download_from_ncbi(seq_id)
    elif sequence_is_cached:
        self._sequences[seq_id] = self._fasta_disk_cache.read(seq_id)
    else:
        # Caching is on but the sequence is missing: download and store it.
        downloaded = self._download_from_ncbi(seq_id)
        self._sequences[seq_id] = downloaded
        self._fasta_disk_cache._save_to_cache(seq_id, downloaded)

    return graph.Base(self._sequences[seq_id][i])
def _get_poagraph_paths_and_nodes(po_lines: List[str],
                                  sequences_info: Dict[int, POSequenceInfo],
                                  sequences: Dict[msa.SequenceID, graph.Sequence]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Read the node lines of a PO file into nodes and sequence paths.

    The node count is taken from po_lines[3] and the path count from
    po_lines[4]; node lines start after the 5 leading lines and two lines
    per path.

    Args:
        po_lines: Lines of the PO file.
        sequences_info: Maps the sequence ids used in node lines to their
            info objects; the ``name`` attribute is used as the key into
            ``sequences``.
        sequences: Sequences whose paths are extended in place.

    Returns:
        Tuple (nodes, sequences).
    """
    nodes_count = int(_extract_line_value(po_lines[3]))
    paths_count = int(_extract_line_value(po_lines[4]))
    first_node_line = 5 + paths_count * 2

    nodes: List[graph.Node] = [None] * nodes_count
    for node_id in range(nodes_count):
        node_line = po_lines[first_node_line + node_id]
        base = graph.Base(node_line[0].upper())
        _, po_sequences_ids, aligned_to = _extract_node_parameters(node_line)
        sequences_ids = [sequences_info[po_id].name
                         for po_id in po_sequences_ids]

        nodes[node_id] = graph.Node(graph.NodeID(node_id),
                                    base,
                                    graph.NodeID(aligned_to))

        for seq_id in sequences_ids:
            sequence = sequences[seq_id]
            if len(sequence.paths) == 1:
                # Exactly one path so far: keep extending it.
                sequence.paths[0].append(graph.NodeID(node_id))
            else:
                sequence.paths.append(graph.SeqPath([graph.NodeID(node_id)]))

    return nodes, sequences
def __init__(self, missing_base: MissingBase):
    """Store the symbol of missing_base wrapped in a graph.Base.

    Args:
        missing_base: Object whose ``value`` is used as the base symbol.
    """
    symbol = missing_base.value
    self.missing_base: graph.Base = graph.Base(symbol)
def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
    """Compare the poagraph built from
    test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf with the expected
    23 nodes and four sequence paths."""
    maf_path = self.maf_files_dir.joinpath(
        "test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

    # (base, aligned_to); the position in the list is the node id.
    node_rows = [
        ('G', 1),
        ('T', 0),
        ('T', None),
        ('A', None),
        ('C', 5),
        ('G', 4),
        ('C', None),
        ('A', None),
        ('C', None),
        ('T', None),
        ('G', None),
        ('G', None),
        ('C', 13),
        ('G', 12),
        ('C', 15),
        ('G', 16),
        ('T', 14),
        ('A', 18),
        ('T', 17),
        ('A', 20),
        ('C', 19),
        ('C', 22),
        ('G', 21),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], '1'),
        ('seq1', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], '1'),
        ('seq2', [[0, 2, 3, 4, 6, 13, 16, 17, 20, 21]], '2'),
        ('seq3', [[1, 2, 3, 5, 6, 13, 14, 17, 20, 22]], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_09_inactive_edges_but_all_strands_plus(self):
    """Compare the poagraph built from
    test_9_inactive_edges_but_all_strands_plus.maf with 20 unaligned nodes,
    a two-path seq1 and a single-path seq2."""
    maf_path = self.maf_files_dir.joinpath(
        "test_9_inactive_edges_but_all_strands_plus.maf")

    # Bases of nodes 0..19 in id order; no node is aligned to another one.
    node_bases = ("ACTGG"
                  "ACTGG"
                  "ACTGG"
                  "ACTGG")
    expected_nodes = [
        graph.Node(node_id=nid(idx), base=graph.Base(symbol), aligned_to=None)
        for idx, symbol in enumerate(node_bases)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [], '1'),
        ('seq1', [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
                  [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]], '1'),
        ('seq2', [list(range(20))], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
    """Compare the poagraph built from
    test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf with the
    expected nodes and sequence paths, using the fixture's fasta provider."""
    maf_path = self.maf_files_dir.joinpath(
        "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf")

    # (base, aligned_to); the position in the list is the node id.
    node_rows = [
        # block 0
        ('A', None),
        ('C', None),
        ('T', None),
        ('A', None),
        # missing seq2
        ('C', None),
        ('A', None),
        # block 1
        ('A', 7),
        ('G', 6),
        ('C', 9),
        ('G', 8),
        ('C', 11),
        ('T', 10),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [], '1'),
        ('seq1', [[0, 1, 2, 3, 7, 9, 11]], '1'),
        ('seq2', [[0, 1, 4, 5, 6, 8, 10]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
    """Compare the poagraph built from
    test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf
    with the expected 11 nodes and sequence paths."""
    maf_path = self.maf_files_dir.joinpath(
        "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

    # (base, aligned_to); the position in the list is the node id.
    node_rows = [
        ('A', 1),
        ('C', 2),
        ('G', 0),
        ('C', None),
        ('A', 5),
        ('T', 4),
        ('G', None),
        ('G', None),
        ('C', 9),
        ('G', 10),
        ('T', 8),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [[0, 3, 4, 8]], '1'),
        ('seq1', [[1, 3, 5, 6, 7, 9]], '1'),
        ('seq2', [[2, 3, 5, 10]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def b(x):
    """Shorthand that wraps x in a graph.Base."""
    base = graph.Base(x)
    return base
def test_1_messy_sequences(self):
    """Compare the nodes and sequences returned by maf2poagraph.get_poagraph
    for test_1_messy_sequences.maf with the expected ones."""
    maf_path = self.maf_files_dir.joinpath("test_1_messy_sequences.maf")

    # (base, aligned_to, block id); the position in the list is the node id.
    node_rows = [
        ('A', None, 0),
        ('A', 2, 0),
        ('C', 1, 0),
        ('T', None, 0),
        ('C', 5, 0),
        ('G', 4, 0),
        ('A', None, 1),
        ('C', None, 1),
        ('G', None, 1),
        ('C', 10, 2),
        ('G', 9, 2),
        ('T', None, 2),
        ('C', None, 2),
        ('C', None, 2),
        ('A', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block))
        for idx, (symbol, partner, block) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [[1, 3, 4, 6, 8, 9, 11, 12]], '1'),
        ('seq1', [[2, 3, 4, 10, 11, 12, 13, 14]], '1'),
        ('seq2', [[0, 2, 5, 6, 7, 10, 11, 12, 14]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_nodes, actual_sequences = maf2poagraph.get_poagraph(
        maf, self.metadatacsv)

    self.assertEqual(expected_nodes, actual_nodes)
    self.assertEqual(expected_sequences, actual_sequences)
def test_2_missing_sequence_end(self):
    """Compare the poagraph built from test_2_missing_sequence_end.maf with
    the expected one.

    A ConstBaseProvider built from ``self.missing_n`` is passed to the
    builder, and the last two expected nodes carry that missing symbol.
    """
    maf_path = self.maf_files_dir.joinpath(
        "test_2_missing_sequence_end.maf")
    missing = self.missing_n.value

    # (base, aligned_to); the position in the list is the node id.
    node_rows = [
        ('A', 1),
        ('G', 0),
        ('C', 3),
        ('G', 2),
        ('T', None),
        ('A', 6),
        ('C', 5),
        ('A', None),
        ('G', None),
        ('G', None),
        ('T', None),
        (missing, None),
        (missing, None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [], '1'),
        ('seq1', [[0, 2, 4, 5, 8, 9, 10]], '1'),
        ('seq2', [[1, 3, 4, 6, 7, 11, 12]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf,
        missings.ConstBaseProvider(self.missing_n),
        self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
    """Compare the poagraph built from
    test_6_missing_one_reverted_sequence_middle_minus1_1.maf with the
    expected one.

    A ConstBaseProvider built from ``self.missing_n`` is passed to the
    builder; nodes 4 and 5 are expected to carry that missing symbol.
    """
    maf_path = self.maf_files_dir.joinpath(
        "test_6_missing_one_reverted_sequence_middle_minus1_1.maf")
    missing = self.missing_n.value

    # (base, aligned_to); the position in the list is the node id.
    node_rows = [
        # block 1 because it is first in DAG and reverted
        ('A', None),
        ('C', None),
        ('C', 3),
        ('T', 2),
        # missing seq2, on edge (-1,1)
        (missing, None),
        (missing, None),
        ('A', 7),
        ('C', 6),
        ('C', None),
        ('T', None),
        ('A', 11),
        ('C', 10),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # (sequence id, paths as lists of node ids, group)
    sequence_rows = [
        ('seq0', [], '1'),
        ('seq1', [[0, 1, 2], [6, 8, 9, 10]], '1'),
        ('seq2', [[0, 1, 3, 4, 5, 7, 11]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf,
        missings.ConstBaseProvider(self.missing_n),
        self.metadatacsv)
    self.assertEqual(expected_poagraph, actual_poagraph)
def bid(x):
    """Shorthand that wraps x in a graph.Base."""
    # NOTE(review): the name reads like "block id", yet a Base is built here;
    # confirm against the call sites that this is intended.
    wrapped = graph.Base(x)
    return wrapped
def get_base(self, sequence_id: msa.SequenceID, i: int) -> graph.Base:
    """Return the base at position i of the source stored for sequence_id.

    Args:
        sequence_id: Key used to look the sequence up in ``self.sources``.
        i: Position of the base within that sequence.

    Returns:
        The symbol found at position i, wrapped in a graph.Base.

    Raises:
        Exception: If a KeyError occurs during the lookup (e.g. no entry
            for sequence_id). The original KeyError is attached as the
            cause.
    """
    try:
        return graph.Base(self.sources[sequence_id][i])
    except KeyError as err:
        # Chain the KeyError so the traceback still names the missing key.
        raise Exception("No record found with given id_!") from err