def test_1_typical_poagraph(self):
    """Poagraph read from ``test_1.po`` must equal the hand-written expectation."""
    po_path = self.po_files_dir.joinpath("test_1.po")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('A', 1), ('G', 0),
        ('C', 3), ('G', 2),
        ('A', 5), ('T', 4),
        ('G', None),
        ('G', None),
        ('A', 9), ('C', 10), ('G', 11), ('T', 8),
        ('A', 13), ('C', 12),
        ('T', None),
        ('A', 16), ('C', 17), ('G', 15),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=bid(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [[0, 2, 4, 6, 7, 8, 12, 14, 16]], {'group': '1'}),
        ('seq1', [[1, 2, 5, 6, 7, 9]], {'group': '1'}),
        ('seq2', [[3, 4, 6, 7, 10, 12, 14, 17]], {'group': '2'}),
        ('seq3', [[11, 13, 14, 15]], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    po_input = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    nodes, sequences = po2poagraph.get_poagraph(po_input, self.metadatacsv)
    actual_poagraph = graph.Poagraph(nodes, sequences)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_04_single_block_no_nucleotides(self):
    """A MAF block without nucleotides yields no nodes and four path-less sequences."""
    maf_path = self.maf_files_dir.joinpath(
        "test_4_single_block_no_nucleotides.maf")

    expected_nodes = []

    # Sequence name -> value of its 'group' metadata entry.
    group_by_name = [('seq0', '1'), ('seq1', '1'), ('seq2', '2'), ('seq3', '2')]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(msa.SequenceID(name),
                                             [],
                                             graph.SequenceMetadata({'group': group}))
        for name, group in group_by_name
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_subpoagraph_should_omit_edges_2(self):
    """PO input generated for seq1 alone must match the two-node expectation below."""
    nodes = [
        graph.Node(node_id=nid(idx), base=b(symbol), aligned_to=None)
        for idx, symbol in enumerate(['A', 'C', 'C'])
    ]

    # (sequence name, node ids of its single path)
    path_by_name = [('seq1', [0, 2]), ('seq2', [0, 1, 2])]
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path])],
            graph.SequenceMetadata({'group': '1'}))
        for name, path in path_by_name
    }

    poagraph = graph.Poagraph(nodes, sequences)
    translator = poa._PoagraphPOTranslator(poagraph, [msa.SequenceID('seq1')])
    actual_po_content = translator.get_input_po_content()

    # Lines are joined with "\n"; there is no trailing newline.
    expected_po_content = "\n".join([
        "VERSION=pangenome",
        "NAME=pangenome",
        "TITLE=pangenome",
        "LENGTH=2",
        "SOURCECOUNT=1",
        "SOURCENAME=seq1",
        "SOURCEINFO=2 0 100 -1 seq1",
        "a:S0",
        "c:L0S0",
    ])

    self.assertEqual(expected_po_content, actual_po_content)
def test_05_single_block_single_nucletodide(self):
    """A MAF block with one nucleotide yields one node shared by all four sequences."""
    maf_path = self.maf_files_dir.joinpath(
        "test_5_single_block_single_nucletodide.maf")

    only_node = graph.Node(node_id=nid(0),
                           base=graph.Base('A'),
                           aligned_to=None,
                           block_id=bid(0))
    expected_nodes = [only_node]

    # Sequence name -> value of its 'group' metadata entry.
    group_by_name = [('seq0', '1'), ('seq1', '1'), ('seq2', '2'), ('seq3', '2')]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(0)])],
            graph.SequenceMetadata({'group': group}))
        for name, group in group_by_name
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_subpoagraph_construction_full_graph(self):
    """PO input generated for the only sequence must list all five nodes in order."""
    nodes = [
        graph.Node(node_id=nid(idx), base=b(symbol), aligned_to=None)
        for idx, symbol in enumerate(['A', 'A', 'C', 'A', 'T'])
    ]

    seq0_id = msa.SequenceID('seq0')
    sequences = {
        seq0_id: graph.Sequence(msa.SequenceID('seq0'),
                                [graph.SeqPath([nid(i) for i in [0, 1, 2, 3, 4]])],
                                graph.SequenceMetadata({'group': '1'}))
    }

    poagraph = graph.Poagraph(nodes, sequences)
    translator = poa._PoagraphPOTranslator(poagraph, [msa.SequenceID('seq0')])
    actual_po_content = translator.get_input_po_content()

    # Lines are joined with "\n"; there is no trailing newline.
    expected_po_content = "\n".join([
        "VERSION=pangenome",
        "NAME=pangenome",
        "TITLE=pangenome",
        "LENGTH=5",
        "SOURCECOUNT=1",
        "SOURCENAME=seq0",
        "SOURCEINFO=5 0 100 -1 seq0",
        "a:S0",
        "a:L0S0",
        "c:L1S0",
        "a:L2S0",
        "t:L3S0",
    ])

    self.assertEqual(expected_po_content, actual_po_content)
def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
    """Poagraph from the test_7 MAF, with missing bases filled by a constant provider."""
    maf_path = self.maf_files_dir.joinpath(
        "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        # block 0
        ('A', None),
        ('C', None),
        ('T', None),
        ('A', None),
        # missing seq2
        (self.missing_n.value, None),
        (self.missing_n.value, None),
        # block 1
        ('A', 7), ('G', 6),
        ('C', 9), ('G', 8),
        ('C', 11), ('T', 10),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [], {'group': '1'}),
        ('seq1', [[0, 1, 2, 3, 7, 9, 11]], {'group': '1'}),
        ('seq2', [[0, 1, 4, 5, 6, 8, 10]], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf,
        missings.ConstBaseProvider(self.missing_n),
        self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def build_from_dagmaf(maf: msa.Maf,
                      fasta_provider: Optional[missings.FastaProvider] = missings.ConstBaseProvider(
                          missings.MissingBase()),
                      metadata: Optional[msa.MetadataCSV] = None,
                      datatype: Optional[graph.DataType] = graph.DataType.Nucleotides) -> \
        Tuple[graph.Poagraph, DAGMaf.DAGMaf]:
    """Build a poagraph from a MAF multialignment via its DAGMaf form.

    The MAF is first converted to a DAGMaf, from which the nodes and
    sequences of the poagraph are derived.

    Args:
        maf: Multialignment as MAF file.
        fasta_provider: Source of the bases that are missing in the DAGMaf.
        metadata: Metadata of the sequences present in the MAF.
        datatype: Kind of processed data (nucleotides/proteins); stored on
            the resulting poagraph.

    Returns:
        A pair ``(poagraph, dagmaf)``: the poagraph built from the input and
        the DAGMaf created from the input MAF.
    """
    dagmaf = maf2dagmaf.get_dagmaf(maf)
    graph_nodes, graph_sequences = dagmaf2poagraph.get_poagraph(dagmaf,
                                                                fasta_provider,
                                                                metadata)
    poagraph = graph.Poagraph(graph_nodes, graph_sequences)

    # Only when metadata was supplied: delegate to Poagraph to complement the
    # metadata of sequences that are absent from it.
    if metadata:
        graph.Poagraph.complement_metadata_for_sequences_absent_in_metadata_provided(
            poagraph, metadata)

    poagraph.datatype = datatype
    return poagraph, dagmaf
def test_2_consensuses_and_empty_sequences(self):
    """Poagraph read from ``test_2.po`` includes empty sequences and consensus paths."""
    po_path = self.po_files_dir.joinpath("test_2.po")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('C', 1), ('T', 0),
        ('A', 3), ('G', 2),
        ('C', None),
        ('T', None),
        ('A', 7), ('T', 6),
        ('G', None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=bid(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [[0, 3, 4, 5, 6, 8]], {'group': '1'}),
        ('seq1', [[1, 2, 4, 5, 7, 8]], {'group': '1'}),
        ('seq2', [], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
        ('CONSENS0', [[0, 3, 4, 5, 7, 8]], {}),
        ('CONSENS1', [[1, 2, 4, 5, 6, 8]], {}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    po_input = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    nodes, sequences = po2poagraph.get_poagraph(po_input, self.metadatacsv)
    actual_poagraph = graph.Poagraph(nodes, sequences)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_09_inactive_edges_but_all_strands_plus(self):
    """Poagraph from the test_9 MAF: twenty unaligned nodes, seq1 split into two paths."""
    maf_path = self.maf_files_dir.joinpath("test_9_inactive_edges_but_all_strands_plus.maf")

    # Twenty unaligned nodes whose bases repeat the pattern A, C, T, G, G four times.
    expected_nodes = [
        graph.Node(node_id=nid(idx), base=graph.Base(symbol), aligned_to=None)
        for idx, symbol in enumerate('ACTGG' * 4)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [], {'group': '1'}),
        ('seq1',
         [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
          [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]],
         {'group': '1'}),
        ('seq2', [list(range(20))], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
    """Poagraph from the test_10 MAF must equal the hand-written expectation."""
    maf_path = self.maf_files_dir.joinpath("test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('G', 1), ('T', 0),
        ('T', None),
        ('A', None),
        ('C', 5), ('G', 4),
        ('C', None),
        ('A', None),
        ('C', None),
        ('T', None),
        ('G', None),
        ('G', None),
        ('C', 13), ('G', 12),
        ('C', 15), ('G', 16), ('T', 14),
        ('A', 18), ('T', 17),
        ('A', 20), ('C', 19),
        ('C', 22), ('G', 21),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], {'group': '1'}),
        ('seq1', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], {'group': '1'}),
        ('seq2', [[0, 2, 3, 4, 6, 13, 16, 17, 20, 21]], {'group': '2'}),
        ('seq3', [[1, 2, 3, 5, 6, 13, 14, 17, 20, 22]], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
    """Poagraph from the test_6 (missing, reverted sequence) MAF must match the expectation."""
    maf_path = self.maf_files_dir.joinpath(
        "test_6_missing_one_reverted_sequence_middle_minus1_1.maf")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        # block 1 because it is first in DAG and reverted
        ('A', None),
        ('C', None),
        ('C', 3), ('T', 2),
        # missing seq2, on edge (-1,1)
        ('T', None),
        ('C', None),
        ('A', 7), ('C', 6),
        ('C', None),
        ('T', None),
        ('A', 11), ('C', 10),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [], {'group': '1'}),
        ('seq1', [[0, 1, 2], [6, 8, 9, 10]], {'group': '1'}),
        ('seq2', [[0, 1, 3, 4, 5, 7, 11]], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_subpoagraph_unfilled_nodes(self):
    """PO input for a poagraph containing nodes with an unknown-base symbol.

    Builds a nine-node poagraph in which two nodes carry the placeholder
    symbol ``?``, translates both sequences to PO input and compares the
    result with the expected text.
    """
    unknown_symbol = '?'
    nodes = [
        graph.Node(node_id=nid(0), base=b('A'), aligned_to=nid(1)),
        graph.Node(node_id=nid(1), base=b('C'), aligned_to=nid(0)),
        graph.Node(node_id=nid(2), base=b('G'), aligned_to=None),
        graph.Node(node_id=nid(3), base=b(unknown_symbol), aligned_to=None),
        graph.Node(node_id=nid(4), base=b(unknown_symbol), aligned_to=None),
        graph.Node(node_id=nid(5), base=b('G'), aligned_to=None),
        graph.Node(node_id=nid(6), base=b('C'), aligned_to=None),
        graph.Node(node_id=nid(7), base=b('A'), aligned_to=None),
        # Was nid(5), which duplicated the id of the sixth node. Both
        # sequences below end on node 8 and the expected output has
        # LENGTH=9, so the ninth node must carry id 8.
        graph.Node(node_id=nid(8), base=b('T'), aligned_to=None),
    ]
    sequences = {
        msa.SequenceID('seq1'):
            graph.Sequence(msa.SequenceID('seq1'),
                           [graph.SeqPath([*map(nid, [0, 2, 3, 4, 7, 8])])],
                           graph.SequenceMetadata({'group': '1'})),
        msa.SequenceID('seq2'):
            graph.Sequence(msa.SequenceID('seq2'),
                           [graph.SeqPath([*map(nid, [1, 2, 5, 6, 7, 8])])],
                           graph.SequenceMetadata({'group': '1'})),
    }

    poagraph = graph.Poagraph(nodes, sequences)
    translator = poa._PoagraphPOTranslator(
        poagraph, [msa.SequenceID('seq1'), msa.SequenceID('seq2')])
    actual_po_content = translator.get_input_po_content()

    expected_po_content = "VERSION=pangenome\n" \
                          "NAME=pangenome\n" \
                          "TITLE=pangenome\n" \
                          "LENGTH=9\n" \
                          "SOURCECOUNT=2\n" \
                          "SOURCENAME=seq1\n" \
                          "SOURCEINFO=6 0 100 -1 seq1\n" \
                          "SOURCENAME=seq2\n" \
                          "SOURCEINFO=6 1 100 -1 seq2\n" \
                          "a:S0A1\n" \
                          "c:S1A0\n" \
                          "g:L0L1S0S1\n" \
                          f"{unknown_symbol}:L2S0\n" \
                          f"{unknown_symbol}:L3S0\n" \
                          "g:L2S1\n" \
                          "c:L5S1\n" \
                          "a:L4L6S0S1\n" \
                          "t:L7S0S1"

    self.assertEqual(expected_po_content, actual_po_content)
def test_1_p_parameter_influence(self, p: at_params.P, expected_cutoff: graph.Compatibility):
    """Check the node cutoff found for five sequences under a given ``p``.

    Each sequence shares a different number of node ids with the consensus
    path (ids 10..19), so the compatibilities differ per sequence.

    Args:
        p: Parameter forwarded to ``Poagraph.get_compatibilities``.
        expected_cutoff: Compatibility whose ``value`` the found cutoff must
            (almost) equal.

    NOTE(review): ``p`` and ``expected_cutoff`` are presumably supplied by a
    data-driven decorator that is outside this view - confirm.
    """
    nodes = [
        graph.Node(node_id=nid(idx), base=b(symbol), aligned_to=None)
        for idx, symbol in enumerate(['T', 'A', 'G', 'A', 'C', 'A', 'C', 'G', 'T', 'A'])
    ]

    # Sequence name -> node ids of its single path.
    path_by_name = [
        ('seq0', [10, 11, 12, 13, 14, 15, 16, 17, 18, 9]),
        ('seq1', [10, 11, 12, 13, 14, 15, 16, 17, 8, 9]),
        ('seq2', [10, 11, 12, 13, 14, 15, 16, 7, 8, 9]),
        ('seq3', [10, 11, 12, 3, 4, 5, 6, 7, 8, 9]),
        # The Sequence stored under 'seq4' used to be created with the id
        # 'seq3' (copy-paste slip); key and sequence id now agree.
        ('seq4', [10, 11, 2, 3, 4, 5, 6, 7, 8, 9]),
    ]
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([*map(nid, path)])],
            graph.SequenceMetadata({}))
        for name, path in path_by_name
    }

    poagraph = graph.Poagraph(nodes, sequences)
    consensus_path = graph.SeqPath(
        [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19])])
    compatibilities = poagraph.get_compatibilities(
        poagraph.get_sequences_ids(), consensus_path, p)

    actual_cutoff = at_builders._find_node_cutoff(
        list(compatibilities.values()), []).cutoff

    self.assertAlmostEqual(expected_cutoff.value, actual_cutoff.value)
def test_2_missing_sequence_end(self):
    """Poagraph from ``test_2_missing_sequence_end.maf`` must match the expectation."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_missing_sequence_end.maf")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('A', 1), ('G', 0),
        ('C', 3), ('G', 2),
        ('T', None),
        ('A', 6), ('C', 5),
        ('A', None),
        ('G', None),
        ('G', None),
        ('T', None),
        ('G', None),
        ('T', None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [], {'group': '1'}),
        ('seq1', [[0, 2, 4, 5, 8, 9, 10]], {'group': '1'}),
        ('seq2', [[1, 3, 4, 6, 7, 11, 12]], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_1_typical_poagraph(self):
    """PO text produced from a hand-built poagraph must equal the content of ``test_1.po``."""
    expected_po_content_path = self.po_files_dir.joinpath("test_1.po")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('A', 1), ('G', 0),
        ('C', 3), ('G', 2),
        ('A', 5), ('T', 4),
        ('G', None),
        ('G', None),
        ('A', 9), ('C', 10), ('G', 11), ('T', 8),
        ('A', 13), ('C', 12),
        ('T', None),
        ('A', 16), ('C', 17), ('G', 15),
    ]
    poagraph_nodes = [
        graph.Node(node_id=nid(idx),
                   base=bid(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # Sequence name -> node ids of its single path; all are in group '1'.
    path_by_name = [
        ('seq0', [0, 2, 4, 6, 7, 8, 12, 14, 16]),
        ('seq1', [1, 2, 5, 6, 7, 9]),
        ('seq2', [3, 4, 6, 7, 10, 12, 14, 17]),
        ('seq3', [11, 13, 14, 15]),
    ]
    poagraph_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path])],
            graph.SequenceMetadata({'group': '1'}))
        for name, path in path_by_name
    }

    poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)
    actual_po_content = po.poagraph_to_PangenomePO(poagraph)
    expected_po_content = pathtools.get_file_content(expected_po_content_path)

    self.assertEqual(expected_po_content, actual_po_content)
def test_2_consensuses_and_empty_sequences(self):
    """PO text for a poagraph with empty sequences and consensuses must equal ``test_2.po``."""
    expected_po_content_path = self.po_files_dir.joinpath("test_2.po")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('C', 1), ('T', 0),
        ('A', 3), ('G', 2),
        ('C', None),
        ('T', None),
        ('A', 7), ('T', 6),
        ('G', None),
    ]
    poagraph_nodes = [
        graph.Node(node_id=nid(idx),
                   base=bid(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    # The consensus sequences carry no metadata object at all (None).
    sequence_rows = [
        ('seq0', [[0, 3, 4, 5, 6, 8]], {'group': '1'}),
        ('seq1', [[1, 2, 4, 5, 7, 8]], {'group': '1'}),
        ('seq2', [], {'group': '1'}),
        ('seq3', [], {'group': '1'}),
        ('CONSENS0', [[0, 3, 4, 5, 7, 8]], None),
        ('CONSENS1', [[1, 2, 4, 5, 6, 8]], None),
    ]
    poagraph_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            None if meta is None else graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }

    poagraph = graph.Poagraph(poagraph_nodes, poagraph_sequences)
    actual_po_content = po.poagraph_to_PangenomePO(poagraph)
    expected_po_content = pathtools.get_file_content(expected_po_content_path)

    self.assertEqual(expected_po_content, actual_po_content)
def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
    """Poagraph from the test_6 (branching blocks) MAF must match the expectation."""
    maf_path = self.maf_files_dir.joinpath(
        "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('A', 1), ('C', 2), ('G', 0),
        ('C', None),
        ('A', 5), ('T', 4),
        ('G', None),
        ('G', None),
        ('C', 9), ('G', 10), ('T', 8),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [[0, 3, 4, 8]], {'group': '1'}),
        ('seq1', [[1, 3, 5, 6, 7, 9]], {'group': '1'}),
        ('seq2', [[2, 3, 5, 10]], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def setUp(self):
    """Create the fifteen-node, four-sequence poagraph stored as ``self.poagraph``."""
    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('T', None),
        ('A', 2), ('G', 1),
        ('A', 4), ('C', 3),
        ('A', 6), ('C', 7), ('G', 8), ('T', 5),
        ('A', None),
        ('C', 11), ('T', 10),
        ('G', None),
        ('A', 14), ('C', 13),
    ]
    nodes = [
        graph.Node(node_id=nid(idx),
                   base=b(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # Sequence name -> node ids of its single path; all are in group '1'.
    path_by_name = [
        ('seq0', [0, 1, 3, 5, 9, 10, 13]),
        ('seq1', [1, 3, 6, 9, 11]),
        ('seq2', [2, 4, 7, 9, 11, 12]),
        ('seq3', [2, 4, 8, 9, 11, 12, 14]),
    ]
    sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path])],
            graph.SequenceMetadata({'group': '1'}))
        for name, path in path_by_name
    }

    self.poagraph = graph.Poagraph(nodes, sequences)
def test_02_seq_starts_in_second_block(self):
    """Poagraph from ``test_2_seq_starts_in_second_block.maf``, including block ids."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_seq_starts_in_second_block.maf")

    # One row per node: (base symbol, id of the aligned node or None, block number).
    # The node id is the row position.
    node_rows = [
        ('C', None, 0),
        ('T', None, 0),
        ('G', None, 0),
        ('T', None, 1),
        ('G', 5, 2), ('T', 4, 2),
        ('A', None, 2),
        ('A', None, 2),
        ('C', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block))
        for idx, (symbol, partner, block) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [[1, 2, 3]], {'group': '1'}),
        ('seq1', [[0, 5, 7]], {'group': '1'}),
        ('seq2', [[3, 4, 6, 8]], {'group': '2'}),
        ('seq3', [], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_08_reversed_block(self):
    """Poagraph from ``test_8_reversed_block.maf`` must match the expectation."""
    maf_path = self.maf_files_dir.joinpath("test_8_reversed_block.maf")

    # One row per node: (base symbol, id of the aligned node or None).
    # The node id is the row position.
    node_rows = [
        ('C', None),
        ('A', None),
        ('T', None),
        # next block is reversed because it was converted to dag
        ('G', None),
        ('G', None),
        ('A', 6), ('G', 5),
        ('A', None),
        ('G', None),
        ('T', None),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(idx),
                   base=graph.Base(symbol),
                   aligned_to=None if partner is None else nid(partner))
        for idx, (symbol, partner) in enumerate(node_rows)
    ]

    # One row per sequence: (name, list of node-id paths, metadata).
    sequence_rows = [
        ('seq0', [], {'group': '1'}),
        ('seq1', [[0, 1, 3, 4, 5, 7, 8, 9]], {'group': '1'}),
        ('seq2', [[0, 1, 2, 3, 4, 6, 7, 8, 9]], {'group': '2'}),
        ('seq3', [[0, 1, 2, 3, 4, 6, 7, 9]], {'group': '2'}),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(i) for i in path]) for path in paths],
            graph.SequenceMetadata(meta))
        for name, paths, meta in sequence_rows
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(maf,
                                                   self.fasta_provider,
                                                   self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def build_from_po(po: msa.Po,
                  metadata: Optional[msa.MetadataCSV] = None,
                  datatype: Optional[graph.DataType] = graph.DataType.Nucleotides) -> graph.Poagraph:
    """Build a poagraph from a PO multialignment.

    Args:
        po: Multialignment as PO file.
        metadata: Metadata of the sequences present in the PO file.
        datatype: Kind of processed data (nucleotides/proteins); stored on
            the resulting poagraph.

    Returns:
        Poagraph based on the given input data.
    """
    graph_nodes, graph_sequences = po2poagraph.get_poagraph(po, metadata)
    poagraph = graph.Poagraph(graph_nodes, graph_sequences)

    # Only when metadata was supplied: delegate to Poagraph to complement the
    # metadata of sequences that are absent from it.
    if metadata:
        graph.Poagraph.complement_metadata_for_sequences_absent_in_metadata_provided(
            poagraph, metadata)

    poagraph.datatype = datatype
    return poagraph