def setUp(self):
    """Build the fixture poagraph (15 nodes, 4 sequences) as self.poagraph."""
    # One (base, aligned_to) entry per node; the list index is the node id.
    node_specs = [
        ('T', None),
        ('A', 2),
        ('G', 1),
        ('A', 4),
        ('C', 3),
        ('A', 6),
        ('C', 7),
        ('G', 8),
        ('T', 5),
        ('A', None),
        ('C', 11),
        ('T', 10),
        ('G', None),
        ('A', 14),
        ('C', 13),
    ]
    nodes = [
        graph.Node(node_id=nid(index),
                   base=b(letter),
                   aligned_to=None if partner is None else nid(partner))
        for index, (letter, partner) in enumerate(node_specs)
    ]

    # Node ids visited by each sequence, in path order.
    visited = [
        ('seq0', [0, 1, 3, 5, 9, 10, 13]),
        ('seq1', [1, 3, 6, 9, 11]),
        ('seq2', [2, 4, 7, 9, 11, 12]),
        ('seq3', [2, 4, 8, 9, 11, 12, 14]),
    ]
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in node_numbers])],
            graph.SequenceMetadata({'group': '1'}))
        for name, node_numbers in visited
    }

    self.poagraph = graph.Poagraph(nodes, sequences)
def _add_node_do_sequence(sequence: graph.Sequence,
                          node_id: graph.NodeID) -> graph.Sequence:
    """Return a new sequence with ``node_id`` appended to its last path.

    Args:
        sequence: Sequence to extend. Its ``seqid`` and ``seqmetadata`` are
            passed unchanged to the returned object.
        node_id: ID of the node to append.

    Returns:
        A new ``graph.Sequence``. If ``sequence`` has no paths, the result
        has a single one-node path. Otherwise every path except the last is
        kept as is and the last one is replaced by a copy extended with
        ``node_id``.
    """
    if not sequence.paths:
        new_paths = [graph.SeqPath([node_id])]
    else:
        # NOTE(review): assumes SeqPath is list-like so "+" concatenates
        # node ids - confirm against graph.SeqPath.
        extended_last = graph.SeqPath(
            sequence.paths[-1] + graph.SeqPath([node_id]))
        # Keep earlier paths as separate elements. The previous form,
        # [paths[:-1] + updated_path], spliced the node ids of the last
        # path into the list of paths and was only right for one path.
        new_paths = [*sequence.paths[:-1], extended_last]
    return graph.Sequence(sequence.seqid, new_paths, sequence.seqmetadata)
def test_02_seq_starts_in_second_block(self):
    """Poagraph built from the MAF file must equal the expected one."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_seq_starts_in_second_block.maf")

    # (base, aligned_to, block) per node; the list index is the node id.
    node_specs = [
        ('C', None, 0),
        ('T', None, 0),
        ('G', None, 0),
        ('T', None, 1),
        ('G', 5, 2),
        ('T', 4, 2),
        ('A', None, 2),
        ('A', None, 2),
        ('C', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(letter),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block))
        for index, (letter, partner, block) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node ids, metadata group)
    sequence_specs = [
        ('seq0', [[1, 2, 3]], '1'),
        ('seq1', [[0, 5, 7]], '1'),
        ('seq2', [[3, 4, 6, 8]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in path])
             for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_08_reversed_block(self):
    """Poagraph built from the MAF file must equal the expected one."""
    maf_path = self.maf_files_dir.joinpath("test_8_reversed_block.maf")

    # (base, aligned_to) per node; the list index is the node id.
    node_specs = [
        ('C', None),
        ('A', None),
        ('T', None),
        # next block is reversed because it was converted to dag
        ('G', None),
        ('G', None),
        ('A', 6),
        ('G', 5),
        ('A', None),
        ('G', None),
        ('T', None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(letter),
                   aligned_to=None if partner is None else nid(partner))
        for index, (letter, partner) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node ids, metadata group)
    sequence_specs = [
        ('seq0', [], '1'),
        ('seq1', [[0, 1, 3, 4, 5, 7, 8, 9]], '1'),
        ('seq2', [[0, 1, 2, 3, 4, 6, 7, 8, 9]], '2'),
        ('seq3', [[0, 1, 2, 3, 4, 6, 7, 9]], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in path])
             for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def _init_sequences(sequences_info: Dict[int, POSequenceInfo],
                    metadata: Optional[msa.MetadataCSV]) -> \
        Dict[msa.SequenceID, graph.Sequence]:
    """Create a path-less graph.Sequence for every known sequence ID.

    Args:
        sequences_info: Mapping whose values expose the sequence ID as
            ``name``.
        metadata: Optional metadata; when truthy it supplies additional
            sequence IDs and the per-sequence metadata.

    Returns:
        Dict keyed by sequence ID. Each value has ``paths=[]`` and either
        the metadata returned for that ID or ``{}`` when ``metadata`` is
        falsy.
    """
    if metadata:
        metadata_sequences_ids = metadata.get_all_sequences_ids()
    else:
        metadata_sequences_ids = []
    po_sequences_ids = [info.name for info in sequences_info.values()]

    initial_sequences = {}
    # The set removes IDs that appear in both the PO data and the metadata.
    for seq_id in set(po_sequences_ids + metadata_sequences_ids):
        if metadata:
            seq_metadata = metadata.get_sequence_metadata(seq_id)
        else:
            seq_metadata = {}
        initial_sequences[seq_id] = graph.Sequence(seqid=seq_id,
                                                   paths=[],
                                                   seqmetadata=seq_metadata)
    return initial_sequences
def _init_sequences(sequences_in_dagmaf: List[msa.SequenceID],
                    metadata: Optional[msa.MetadataCSV]) -> \
        Dict[msa.SequenceID, graph.Sequence]:
    """Create a path-less graph.Sequence for every known sequence ID.

    Args:
        sequences_in_dagmaf: Sequence IDs taken from the alignment.
        metadata: Optional metadata; when truthy it supplies additional
            sequence IDs and the per-sequence metadata.

    Returns:
        Dict keyed by sequence ID. Each value has ``paths=[]`` and either
        the metadata returned for that ID or ``{}`` when ``metadata`` is
        falsy.
    """
    metadata_sequences_ids = []
    if metadata:
        metadata_sequences_ids = metadata.get_all_sequences_ids()

    def empty_sequence(seq_id):
        # Sequence with no paths yet; metadata is looked up only if given.
        seq_metadata = metadata.get_sequence_metadata(seq_id) \
            if metadata else {}
        return graph.Sequence(seqid=seq_id,
                              paths=[],
                              seqmetadata=seq_metadata)

    # The set removes IDs present in both the alignment and the metadata.
    all_ids = set(sequences_in_dagmaf + metadata_sequences_ids)
    return {seq_id: empty_sequence(seq_id) for seq_id in all_ids}
def test_subpoagraph_should_omit_in_nodes_and_aligned_nodes(self):
    """PO content generated for seq2 only must match the expected text."""
    # original poagraph: (base, aligned_to) per node, index is the node id
    node_specs = [
        ('A', None),
        ('C', 2),
        ('T', 1),
        ('G', None),
    ]
    nodes = [
        graph.Node(node_id=nid(index),
                   base=b(letter),
                   aligned_to=None if partner is None else nid(partner))
        for index, (letter, partner) in enumerate(node_specs)
    ]

    visited = [
        ('seq1', [0, 1, 3]),
        ('seq2', [0, 2, 3]),
    ]
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in node_numbers])],
            graph.SequenceMetadata({'group': '1'}))
        for name, node_numbers in visited
    }
    poagraph = graph.Poagraph(nodes, sequences)

    # Only seq2 is handed to the translator.
    selected = [msa.SequenceID('seq2')]
    translator = poa._PoagraphPOTranslator(poagraph, selected)
    actual_po_content = translator.get_input_po_content()

    expected_po_content = (
        "VERSION=pangenome\n"
        "NAME=pangenome\n"
        "TITLE=pangenome\n"
        "LENGTH=3\n"
        "SOURCECOUNT=1\n"
        "SOURCENAME=seq2\n"
        "SOURCEINFO=3 0 100 -1 seq2\n"
        "a:S0\n"
        "t:L0S0\n"
        "g:L1S0"
    )
    self.assertEqual(expected_po_content, actual_po_content)
def _init_poagraph(alignment: _ParsedMaf,
                   metadata: Optional[msa.MetadataCSV]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Return an empty node list and a path-less sequence per known ID.

    Args:
        alignment: Parsed MAF from which the sequence IDs are collected.
        metadata: Optional metadata; when truthy it supplies additional
            sequence IDs and the per-sequence metadata.

    Returns:
        Tuple ``([], sequences)`` where ``sequences`` maps each sequence ID
        to a ``graph.Sequence`` with ``paths=[]`` and either the metadata
        returned for that ID or ``{}`` when ``metadata`` is falsy.
    """
    maf_sequences_ids = _get_sequences_ids(alignment)
    if metadata:
        metadata_sequences_ids = metadata.get_all_sequences_ids()
    else:
        metadata_sequences_ids = []

    initial_sequences = {}
    # The set removes IDs present in both the alignment and the metadata.
    for seq_id in set(maf_sequences_ids + metadata_sequences_ids):
        if metadata:
            seq_metadata = metadata.get_sequence_metadata(seq_id)
        else:
            seq_metadata = {}
        initial_sequences[seq_id] = graph.Sequence(seqid=seq_id,
                                                   paths=[],
                                                   seqmetadata=seq_metadata)
    return [], initial_sequences
def test_2_consensuses_and_empty_sequences(self):
    """Poagraph read from test_2.po must equal the expected one."""
    po_path = self.po_files_dir.joinpath("test_2.po")

    # (base, aligned_to) per node; the list index is the node id.
    node_specs = [
        ('C', 1),
        ('T', 0),
        ('A', 3),
        ('G', 2),
        ('C', None),
        ('T', None),
        ('A', 7),
        ('T', 6),
        ('G', None),
    ]
    # NOTE(review): bases are wrapped with bid(), as in the original test;
    # its definition is not visible here - confirm it builds a base.
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=bid(letter),
                   aligned_to=None if partner is None else nid(partner))
        for index, (letter, partner) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node ids, metadata content)
    sequence_specs = [
        ('seq0', [[0, 3, 4, 5, 6, 8]], {'group': '1'}),
        ('seq1', [[1, 2, 4, 5, 7, 8]], {'group': '1'}),
        ('seq2', [], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
        ('CONSENS0', [[0, 3, 4, 5, 7, 8]], {}),
        ('CONSENS1', [[1, 2, 4, 5, 6, 8]], {}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in path])
             for path in paths],
            graph.SequenceMetadata(metadata))
        for name, paths, metadata in sequence_specs
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    po_input = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    nodes, sequences = po2poagraph.get_poagraph(po_input, self.metadatacsv)
    actual_poagraph = graph.Poagraph(nodes, sequences)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_2_consensuses_and_empty_sequences(self):
    """PO text generated from the poagraph must equal test_2.po."""
    expected_po_content_path = self.po_files_dir.joinpath("test_2.po")

    # (base, aligned_to) per node; the list index is the node id.
    node_specs = [
        ('C', 1),
        ('T', 0),
        ('A', 3),
        ('G', 2),
        ('C', None),
        ('T', None),
        ('A', 7),
        ('T', 6),
        ('G', None),
    ]
    # NOTE(review): bases are wrapped with bid(), as in the original test;
    # its definition is not visible here - confirm it builds a base.
    poagraph_nodes = [
        graph.Node(node_id=nid(index),
                   base=bid(letter),
                   aligned_to=None if partner is None else nid(partner))
        for index, (letter, partner) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node ids, metadata group or None).
    # A None group means None is passed as the sequence metadata.
    sequence_specs = [
        ('seq0', [[0, 3, 4, 5, 6, 8]], '1'),
        ('seq1', [[1, 2, 4, 5, 7, 8]], '1'),
        ('seq2', [], '1'),
        ('seq3', [], '1'),
        ('CONSENS0', [[0, 3, 4, 5, 7, 8]], None),
        ('CONSENS1', [[1, 2, 4, 5, 6, 8]], None),
    ]
    poagraph_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in path])
             for path in paths],
            None if group is None
            else graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }
    poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)

    actual_po_content = po.poagraph_to_PangenomePO(poagraph)
    expected_po_content = pathtools.get_file_content(
        expected_po_content_path)

    self.assertEqual(expected_po_content, actual_po_content)
def test_1_missing_sequence_start(self):
    """Poagraph built with a constant missing base must equal the expected."""
    maf_path = self.maf_files_dir.joinpath(
        "test_1_missing_sequence_start.maf")

    missing = self.missing_n.value
    # (base, aligned_to) per node; the list index is the node id.
    node_specs = [
        (missing, None),
        (missing, None),
        (missing, None),
        ('A', 4),
        ('G', 3),
        ('G', None),
        ('G', 7),
        ('T', 6),
        ('C', None),
        ('A', None),
        ('G', None),
        ('T', None),
    ]
    expected_nodes = []
    for index, (letter, partner) in enumerate(node_specs):
        if partner is None:
            # Unaligned nodes carry an explicit block id of 0.
            node = graph.Node(node_id=nid(index),
                              base=graph.Base(letter),
                              aligned_to=None,
                              block_id=bid(0))
        else:
            # Aligned nodes are built without a block_id argument.
            node = graph.Node(node_id=nid(index),
                              base=graph.Base(letter),
                              aligned_to=nid(partner))
        expected_nodes.append(node)

    # (sequence name, list of paths as node ids, metadata group)
    sequence_specs = [
        ('seq0', [], '1'),
        ('seq1', [[0, 1, 2, 3, 5, 6, 11]], '1'),
        ('seq2', [[4, 5, 7, 8, 9, 10, 11]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in path])
             for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf,
        missings.ConstBaseProvider(self.missing_n),
        self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_1_messy_sequences(self):
    """Nodes and sequences parsed from the MAF file must equal the expected."""
    maf_path = self.maf_files_dir.joinpath("test_1_messy_sequences.maf")

    # (base, aligned_to, block) per node; the list index is the node id.
    node_specs = [
        ('A', None, 0),
        ('A', 2, 0),
        ('C', 1, 0),
        ('T', None, 0),
        ('C', 5, 0),
        ('G', 4, 0),
        ('A', None, 1),
        ('C', None, 1),
        ('G', None, 1),
        ('C', 10, 2),
        ('G', 9, 2),
        ('T', None, 2),
        ('C', None, 2),
        ('C', None, 2),
        ('A', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(index),
                   base=graph.Base(letter),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block))
        for index, (letter, partner, block) in enumerate(node_specs)
    ]

    # (sequence name, list of paths as node ids, metadata group)
    sequence_specs = [
        ('seq0', [[1, 3, 4, 6, 8, 9, 11, 12]], '1'),
        ('seq1', [[2, 3, 4, 10, 11, 12, 13, 14]], '1'),
        ('seq2', [[0, 2, 5, 6, 7, 10, 11, 12, 14]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(number) for number in path])
             for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_specs
    }

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_nodes, actual_sequences = maf2poagraph.get_poagraph(
        maf, self.metadatacsv)

    self.assertEqual(expected_nodes, actual_nodes)
    self.assertEqual(expected_sequences, actual_sequences)