def get_ebola_consensus_tree(p: float, stop: float, output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Build the Ebola whole-genome poagraph and its affinity tree.

    Input files (MAF multialignment, metadata CSV, BLOSUM matrix) are
    located relative to the directory containing this module.

    Args:
        p: value wrapped in ``P`` and passed to the tree builder.
        stop: value wrapped in ``Stop`` and passed to the tree builder.
        output_dir_name: name of the output directory requested from
            ``pathtools.get_child_dir`` under this module's directory.

    Returns:
        The poagraph and the tree returned by
        ``atree_builders.get_affinity_tree``.
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent

    output_dir_path = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_output_dir = pathtools.get_child_dir(output_dir_path, "consensus")

    multialignment_path = module_dir / "../data/Ebola/genome_whole/input/multialignment.maf"
    metadata_path = module_dir / "../data/Ebola/genome_whole/input/metadata.csv"
    blosum_path = module_dir / "../bin/blosum80.mat"

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = Maf(
        file_content=pathtools.get_file_content_stringio(multialignment_path),
        filename=multialignment_path)
    metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(metadata_path),
        filename=metadata_path)

    # The DAG-MAF returned alongside the poagraph is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment, fasta_provider, metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_path), blosum_path)

    affinity_tree = atree_builders.get_affinity_tree(
        poagraph, blosum, consensus_output_dir, Stop(stop), P(p), False)
    return poagraph, affinity_tree
def get_ebola_affinity_tree(p: float, stop: float, output_dir_name: str) -> Tuple[Poagraph, AffinityTree]:
    """Build the Ebola poagraph and its affinity tree.

    Input files (MAF multialignment, metadata CSV, BLOSUM matrix) are
    located relative to the directory containing this module.

    Args:
        p: value wrapped in ``P`` and passed to the tree builder.
        stop: value wrapped in ``Stop`` and passed to the tree builder.
        output_dir_name: name of the output directory requested from
            ``pathtools.get_child_dir`` under this module's directory.

    Returns:
        The poagraph and the tree returned by
        ``atree_builders.get_affinity_tree``.
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent

    output_dir_path = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_output_dir = pathtools.get_child_dir(output_dir_path, "consensus")

    multialignment_path = module_dir / "../data/Ebola/multialignment.maf"
    metadata_path = module_dir / "../data/Ebola/metadata.csv"
    blosum_path = module_dir / "../bin/blosum80.mat"

    # NOTE(review): this object is never read below; the call is kept in case
    # constructing TaskParameters validates its arguments - confirm and drop.
    task_parameters = TaskParameters(
        running_time="",
        multialignment_file_path=multialignment_path,
        multialignment_format="MAF",
        datatype="N",
        metadata_file_path=metadata_path,
        blosum_file_path=blosum_path,
        output_path=output_dir_path,
        output_po=False,
        output_fasta=False,
        output_with_nodes=False,
        verbose=False,
        raw_maf=False,
        fasta_provider='FromNCBI',
        cache=True,
        missing_base_symbol="",
        fasta_source_file=None,
        consensus_type="",
        hbmin=0.8,
        stop=stop,
        p=p)

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = Maf(
        file_content=pathtools.get_file_content_stringio(multialignment_path),
        filename=multialignment_path)
    metadata = MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(metadata_path),
        filename=metadata_path)

    # The DAG-MAF returned alongside the poagraph is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment, fasta_provider, metadata)

    blosum = Blosum(pathtools.get_file_content_stringio(path=blosum_path), blosum_path)

    affinity_tree = atree_builders.get_affinity_tree(
        poagraph, blosum, consensus_output_dir, Stop(stop), P(p), False)
    return poagraph, affinity_tree
def get_ebola_consensus_tree(p: float, stop: float, output_dir_name: str) -> Tuple[Poagraph, ConsensusTree]:
    """Build the Ebola whole-genome poagraph and its consensus tree.

    Input files (MAF multialignment, metadata CSV, BLOSUM matrix) are
    located relative to the directory containing this module. The tree is
    generated with the ``MAX2`` and ``NODE3`` cutoff strategies.

    Args:
        p: value wrapped in ``cinp.P`` and passed to the tree generator.
        stop: value wrapped in ``cinp.Stop`` and passed to the tree generator.
        output_dir_name: name of the output directory requested from
            ``pathtools.get_child_dir`` under this module's directory.

    Returns:
        The poagraph and the tree returned by
        ``tree_generator.get_consensus_tree``.
    """
    module_dir = Path(os.path.abspath(__file__)).resolve().parent

    output_dir_path = pathtools.get_child_dir(module_dir, output_dir_name)
    consensus_output_dir = pathtools.get_child_dir(output_dir_path, "consensus")

    multialignment_path = module_dir / "../data/Ebola/genome_whole/input/multialignment.maf"
    metadata_path = module_dir / "../data/Ebola/genome_whole/input/metadata.csv"
    blosum_path = module_dir / "../bin/blosum80.mat"

    # NOTE(review): this object is never read below; the call is kept in case
    # constructing TaskParameters validates its arguments - confirm and drop.
    task_parameters = TaskParameters(
        running_time="",
        multialignment_file_path=multialignment_path,
        multialignment_format="MAF",
        datatype="N",
        metadata_file_path=metadata_path,
        blosum_file_path=blosum_path,
        output_path=output_dir_path,
        output_po=False,
        output_fasta=False,
        output_with_nodes=False,
        verbose=False,
        raw_maf=False,
        fasta_provider='FromNCBI',
        cache=True,
        missing_base_symbol="",
        fasta_source_file=None,
        consensus_type="",
        hbmin=0.8,
        max_cutoff_option="MAX2",
        search_range=None,
        node_cutoff_option="NODE3",
        multiplier=None,
        stop=stop,
        p=p)

    fasta_provider = fp_ncbi.FromNCBI(use_cache=True)

    multialignment = inp.Maf(
        file_content=pathtools.get_file_content_stringio(multialignment_path),
        filename=multialignment_path)
    metadata = inp.MetadataCSV(
        filecontent=pathtools.get_file_content_stringio(metadata_path),
        filename=metadata_path)

    # The DAG-MAF returned alongside the poagraph is not needed here.
    poagraph, _ = Poagraph.build_from_dagmaf(multialignment, fasta_provider, metadata)

    blosum = cinp.Blosum(pathtools.get_file_content_stringio(path=blosum_path), blosum_path)

    consensus_tree = tree_generator.get_consensus_tree(
        poagraph, blosum, consensus_output_dir,
        cinp.Stop(stop), cinp.P(p), MAX2(), NODE3(), False)
    return poagraph, consensus_tree
def test_7_missing_one_reverted_sequence_middle_minus1_minus1(self):
    """Poagraph built from the fixture MAF, with a constant missing-base
    provider, must equal the hand-written expected graph."""
    maf_path = self.maf_files_dir.joinpath(
        "test_7_missing_one_reverted_sequence_middle_minus1_minus1.maf")
    missing_symbol = self.missing_n.value

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            # block 0
            (0, 'A', None),
            (1, 'C', None),
            (2, 'T', None),
            (3, 'A', None),
            # missing seq2
            (4, missing_symbol, None),
            (5, missing_symbol, None),
            # block 1
            (6, 'A', 7),
            (7, 'G', 6),
            (8, 'C', 9),
            (9, 'G', 8),
            (10, 'C', 11),
            (11, 'T', 10),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 2, 3, 7, 9, 11]], '1'),
            ('seq2', [[0, 1, 4, 5, 6, 8, 10]], '2'),
            ('seq3', [], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, missings.ConstBaseProvider(self.missing_n), self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_05_single_block_single_nucletodide(self):
    """A MAF fixture with one block and one nucleotide yields a single node
    shared by all four sequences."""
    maf_path = self.maf_files_dir.joinpath(
        "test_5_single_block_single_nucletodide.maf")

    expected_nodes = [
        graph.Node(node_id=nid(0),
                   base=graph.Base('A'),
                   aligned_to=None,
                   block_id=bid(0)),
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [[0]], '1'),
            ('seq1', [[0]], '1'),
            ('seq2', [[0]], '2'),
            ('seq3', [[0]], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_2_no_seqid(self):
    """Constructing MetadataCSV from a CSV without a 'seqid' column must
    raise, and the exception must carry the expected message."""
    csv_path = self.csv_files_dir.joinpath("test_2_no_seqid.csv")
    csv_content = pathtools.get_file_content_stringio(csv_path)

    with self.assertRaises(Exception) as err:
        _ = msa.MetadataCSV(csv_content, csv_path)

    # Verified after the ``with`` block: a statement placed after the raising
    # call inside the block would never be executed.
    # The literal needs no f-prefix because it has no placeholders.
    self.assertEqual("No 'seqid' column in metadata csv.", str(err.exception))
def test_3_empty_file(self):
    """Constructing MetadataCSV from the empty-file fixture must raise, and
    the exception must carry the expected message."""
    csv_path = self.csv_files_dir.joinpath("test_3_empty_file.csv")
    csv_content = pathtools.get_file_content_stringio(csv_path)

    with self.assertRaises(Exception) as err:
        _ = msa.MetadataCSV(csv_content, csv_path)

    # Verified after the ``with`` block: a statement placed after the raising
    # call inside the block would never be executed.
    # The literal needs no f-prefix because it has no placeholders.
    self.assertEqual("Empty csv file.", str(err.exception))
def setUp(self):
    """Load the shared sequence metadata and locate the PO fixture directory."""
    tests_dir = Path(__file__).parent

    metadata_path = (tests_dir / "../seq_metadata.csv").resolve()
    metadata_content = pathtools.get_file_content_stringio(metadata_path)
    self.metadatacsv = msa.MetadataCSV(metadata_content, metadata_path)

    self.po_files_dir = (tests_dir / "po_files").resolve()
def test_04_single_block_no_nucleotides(self):
    """A MAF fixture with one block and no nucleotides yields no nodes and
    four sequences with empty paths."""
    maf_path = self.maf_files_dir.joinpath(
        "test_4_single_block_no_nucleotides.maf")

    expected_nodes = []

    # Each row: (sequence id, metadata group); every sequence has no paths.
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [],
            graph.SequenceMetadata({'group': group}))
        for name, group in (
            ('seq0', '1'),
            ('seq1', '1'),
            ('seq2', '2'),
            ('seq3', '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def setUp(self):
    """Load the shared sequence metadata, locate the gapped-MAF fixture
    directory and create the missing-base marker used by the tests."""
    tests_dir = Path(__file__).parent

    metadata_path = (tests_dir / "../seq_metadata.csv").resolve()
    metadata_content = pathtools.get_file_content_stringio(metadata_path)
    self.metadatacsv = msa.MetadataCSV(metadata_content, metadata_path)

    self.maf_files_dir = (tests_dir / "maf_files_with_gaps").resolve()
    self.missing_n = missings.MissingBase()
def get_default_blosum():
    """Return the default Blosum object, read from ``blosum80.mat``.

    The matrix file is looked up under ``affinity_tree/bin`` relative to
    the directory two levels above this module.
    """
    package_dir = Path(__file__).parent.parent
    blosum_path = pathtools.get_child_path(
        package_dir, "affinity_tree/bin/blosum80.mat")
    return at_params.Blosum(
        pathtools.get_file_content_stringio(blosum_path), blosum_path)
def setUp(self):
    """Load the shared sequence metadata, locate the gapped-MAF fixture
    directory and create the fake FASTA provider used by the tests."""
    tests_dir = Path(__file__).parent

    metadata_path = (tests_dir / "../seq_metadata.csv").resolve()
    metadata_content = pathtools.get_file_content_stringio(metadata_path)
    self.metadatacsv = msa.MetadataCSV(metadata_content, metadata_path)

    self.maf_files_dir = (tests_dir / "maf_files_with_gaps").resolve()
    self.fasta_provider = DAGMaf2PoagraphFakeFastaProviderTests.FakeFastaProvider()
def test_7_not_unique_seqids(self):
    """Constructing MetadataCSV from a CSV with repeated seqid values must
    raise, and the exception must carry the expected message."""
    fixture_path = self.csv_files_dir.joinpath("test_7_not_unique_seqids.csv")
    fixture_content = pathtools.get_file_content_stringio(fixture_path)

    with self.assertRaises(Exception) as err:
        _ = msa.MetadataCSV(fixture_content, fixture_path)

    expected_message = "Repeated values in seqid column in metadata file. Make them unique."
    self.assertEqual(expected_message, str(err.exception))
def test_10_parallel_blocks_1st_and_2nd_merge_into_3rd(self):
    """Poagraph built from the parallel-blocks fixture MAF must equal the
    hand-written expected graph."""
    maf_path = self.maf_files_dir.joinpath(
        "test_10_parallel_blocks_1st_and_2nd_merge_into_3rd.maf")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            (0, 'G', 1),
            (1, 'T', 0),
            (2, 'T', None),
            (3, 'A', None),
            (4, 'C', 5),
            (5, 'G', 4),
            (6, 'C', None),
            (7, 'A', None),
            (8, 'C', None),
            (9, 'T', None),
            (10, 'G', None),
            (11, 'G', None),
            (12, 'C', 13),
            (13, 'G', 12),
            (14, 'C', 15),
            (15, 'G', 16),
            (16, 'T', 14),
            (17, 'A', 18),
            (18, 'T', 17),
            (19, 'A', 20),
            (20, 'C', 19),
            (21, 'C', 22),
            (22, 'G', 21),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], '1'),
            ('seq1', [[7, 8, 9, 10, 11, 12, 15, 18, 19, 21]], '1'),
            ('seq2', [[0, 2, 3, 4, 6, 13, 16, 17, 20, 21]], '2'),
            ('seq3', [[1, 2, 3, 5, 6, 13, 14, 17, 20, 22]], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_09_inactive_edges_but_all_strands_plus(self):
    """Poagraph built from the inactive-edges fixture MAF must equal the
    hand-written expected graph (twenty unaligned nodes)."""
    maf_path = self.maf_files_dir.joinpath(
        "test_9_inactive_edges_but_all_strands_plus.maf")

    # Each row: (node id, base symbol); no node is aligned to another.
    expected_nodes = [
        graph.Node(node_id=nid(node_no), base=graph.Base(symbol), aligned_to=None)
        for node_no, symbol in (
            (0, 'A'),
            (1, 'C'),
            (2, 'T'),
            (3, 'G'),
            (4, 'G'),
            (5, 'A'),
            (6, 'C'),
            (7, 'T'),
            (8, 'G'),
            (9, 'G'),
            (10, 'A'),
            (11, 'C'),
            (12, 'T'),
            (13, 'G'),
            (14, 'G'),
            (15, 'A'),
            (16, 'C'),
            (17, 'T'),
            (18, 'G'),
            (19, 'G'),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [], '1'),
            ('seq1',
             [[0, 1, 2, 3, 4, 10, 11, 12, 13, 14],
              [5, 6, 7, 8, 9, 15, 16, 17, 18, 19]],
             '1'),
            ('seq2',
             [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
               10, 11, 12, 13, 14, 15, 16, 17, 18, 19]],
             '2'),
            ('seq3', [], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_6_incorrect_commas_number(self):
    """Constructing MetadataCSV from a CSV whose row has a different column
    count than the header must raise with the expected message."""
    fixture_path = self.csv_files_dir.joinpath(
        "test_6_incorrect_commas_number.csv")
    fixture_content = pathtools.get_file_content_stringio(fixture_path)

    with self.assertRaises(Exception) as err:
        _ = msa.MetadataCSV(fixture_content, fixture_path)

    expected_message = "CSV metadata error. Different number of columns in line 0 than in header line."
    self.assertEqual(expected_message, str(err.exception))
def test_6_missing_one_reverted_sequence_middle_minus1_1(self):
    """Poagraph built from the fixture MAF must equal the hand-written
    expected graph."""
    maf_path = self.maf_files_dir.joinpath(
        "test_6_missing_one_reverted_sequence_middle_minus1_1.maf")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            # block 1 because it is first in DAG and reverted
            (0, 'A', None),
            (1, 'C', None),
            (2, 'C', 3),
            (3, 'T', 2),
            # missing seq2, on edge (-1,1)
            (4, 'T', None),
            (5, 'C', None),
            (6, 'A', 7),
            (7, 'C', 6),
            (8, 'C', None),
            (9, 'T', None),
            (10, 'A', 11),
            (11, 'C', 10),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 2], [6, 8, 9, 10]], '1'),
            ('seq2', [[0, 1, 3, 4, 5, 7, 11]], '2'),
            ('seq3', [], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def run_pangtree(maf_path: Path, fasta_path: Path, output_dir: Path, po_output: bool) -> None:
    """Build a poagraph from a MAF file and write one affinity tree per P value.

    For every value in the module-level ``p_values`` a sub-directory is
    requested under a timestamp-named child of ``output_dir``; the Newick
    tree, the JSON description and (optionally) the PO file are saved there.

    Args:
        maf_path: path of the MAF multialignment to read.
        fasta_path: path handed to ``missings.FromFile`` as the FASTA source.
        output_dir: parent directory for the timestamp-named run directory.
        po_output: when true, ``poagraph.po`` is also saved for each P value.
    """
    output_dir = pathtools.get_child_dir(output_dir, pathtools.get_current_time())
    print(f"Runing pangtree for maf: {maf_path} and fasta: {fasta_path} "
          f"Output in: {output_dir}, include po file: {po_output}.")

    fasta_provider = missings.FromFile(fasta_path)
    maf_content = pathtools.get_file_content_stringio(maf_path)
    maf = msa.Maf(maf_content, maf_path)
    poagraph, dagmaf = builder.build_from_dagmaf(maf, fasta_provider)

    for p in p_values:
        # Directory name is the P value with the decimal point replaced.
        p_dir_name = str(p).replace(".", "_")
        current_output_dir = pathtools.get_child_dir(output_dir, p_dir_name)

        stop = at_params.Stop(0.99)
        affinity_tree = at_builders.build_affinity_tree(
            poagraph, None, current_output_dir, stop, at_params.P(p), True)

        newick_path = pathtools.get_child_path(current_output_dir, "affinity_tree.newick")
        pathtools.save_to_file(
            affinity_tree.as_newick(None, separate_leaves=True), newick_path)

        if po_output:
            pangenome_po = po.poagraph_to_PangenomePO(poagraph)
            po_path = pathtools.get_child_path(current_output_dir, "poagraph.po")
            pathtools.save_to_file(pangenome_po, po_path)

        task_params = json.TaskParameters(
            multialignment_file_path=str(maf_path),
            multialignment_format="maf",
            datatype="nucleotides",
            blosum_file_path="",
            output_path=current_output_dir,
            fasta_provider=fasta_provider,
            fasta_source_file=fasta_path,
            consensus_type="tree",
            stop=str(stop),
            p=str(p),
            output_with_nodes=False)
        pangenomejson = json.to_PangenomeJSON(
            task_parameters=task_params,
            poagraph=poagraph,
            dagmaf=dagmaf,
            affinity_tree=affinity_tree)

        json_path = pathtools.get_child_path(current_output_dir, "pangenome.json")
        pathtools.save_to_file(json.to_json(pangenomejson), json_path)
def test_1_typical_poagraph(self):
    """Nodes and sequences parsed from ``test_1.po`` must form a poagraph
    equal to the hand-written expected graph."""
    po_path = self.po_files_dir.joinpath("test_1.po")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=bid(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            (0, 'A', 1),
            (1, 'G', 0),
            (2, 'C', 3),
            (3, 'G', 2),
            (4, 'A', 5),
            (5, 'T', 4),
            (6, 'G', None),
            (7, 'G', None),
            (8, 'A', 9),
            (9, 'C', 10),
            (10, 'G', 11),
            (11, 'T', 8),
            (12, 'A', 13),
            (13, 'C', 12),
            (14, 'T', None),
            (15, 'A', 16),
            (16, 'C', 17),
            (17, 'G', 15),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [[0, 2, 4, 6, 7, 8, 12, 14, 16]], '1'),
            ('seq1', [[1, 2, 5, 6, 7, 9]], '1'),
            ('seq2', [[3, 4, 6, 7, 10, 12, 14, 17]], '2'),
            ('seq3', [[11, 13, 14, 15]], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    po_input = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    nodes, sequences = po2poagraph.get_poagraph(po_input, self.metadatacsv)
    actual_poagraph = graph.Poagraph(nodes, sequences)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_2_missing_sequence_end(self):
    """Poagraph built from the missing-sequence-end fixture MAF must equal
    the hand-written expected graph."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_missing_sequence_end.maf")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            (0, 'A', 1),
            (1, 'G', 0),
            (2, 'C', 3),
            (3, 'G', 2),
            (4, 'T', None),
            (5, 'A', 6),
            (6, 'C', 5),
            (7, 'A', None),
            (8, 'G', None),
            (9, 'G', None),
            (10, 'T', None),
            (11, 'G', None),
            (12, 'T', None),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [], '1'),
            ('seq1', [[0, 2, 4, 5, 8, 9, 10]], '1'),
            ('seq2', [[1, 3, 4, 6, 7, 11, 12]], '2'),
            ('seq3', [], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_9_get_seqids(self):
    """``get_all_sequences_ids`` must return the three ids from the
    correct-CSV fixture, in order."""
    metadata_path = self.csv_files_dir.joinpath("test_1_correct.csv")
    csv_content = pathtools.get_file_content_stringio(metadata_path)

    expected_seqids = [msa.SequenceID(name) for name in ('s1', 's2', 's3')]

    actual_seqids = msa.MetadataCSV(csv_content, metadata_path).get_all_sequences_ids()

    self.assertEqual(expected_seqids, actual_seqids)
def test_2_consensuses_and_empty_sequences(self):
    """Nodes and sequences parsed from ``test_2.po`` (including consensus
    paths and sequences without paths) must match the expected graph."""
    po_path = self.po_files_dir.joinpath("test_2.po")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=bid(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            (0, 'C', 1),
            (1, 'T', 0),
            (2, 'A', 3),
            (3, 'G', 2),
            (4, 'C', None),
            (5, 'T', None),
            (6, 'A', 7),
            (7, 'T', 6),
            (8, 'G', None),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata dict).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata(metadata))
        for name, paths, metadata in (
            ('seq0', [[0, 3, 4, 5, 6, 8]], {'group': '1'}),
            ('seq1', [[1, 2, 4, 5, 7, 8]], {'group': '1'}),
            ('seq2', [], {'group': '2'}),
            ('seq3', [], {'group': '2'}),
            ('CONSENS0', [[0, 3, 4, 5, 7, 8]], {}),
            ('CONSENS1', [[1, 2, 4, 5, 6, 8]], {}),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    po_input = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    nodes, sequences = po2poagraph.get_poagraph(po_input, self.metadatacsv)
    actual_poagraph = graph.Poagraph(nodes, sequences)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_06_1st_block_separates_into_2_branches_which_connect_in_3rd_block(self):
    """Poagraph built from the branching fixture MAF must equal the
    hand-written expected graph."""
    maf_path = self.maf_files_dir.joinpath(
        "test_6_1st_block_separates_into_2_branches_which_connect_in_3rd_block.maf")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            (0, 'A', 1),
            (1, 'C', 2),
            (2, 'G', 0),
            (3, 'C', None),
            (4, 'A', 5),
            (5, 'T', 4),
            (6, 'G', None),
            (7, 'G', None),
            (8, 'C', 9),
            (9, 'G', 10),
            (10, 'T', 8),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [[0, 3, 4, 8]], '1'),
            ('seq1', [[1, 3, 5, 6, 7, 9]], '1'),
            ('seq2', [[2, 3, 5, 10]], '2'),
            ('seq3', [], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_02_seq_starts_in_second_block(self):
    """Poagraph built from the fixture MAF must equal the hand-written
    expected graph, including each node's block id."""
    maf_path = self.maf_files_dir.joinpath(
        "test_2_seq_starts_in_second_block.maf")

    # Each row: (node id, base symbol, aligned node id or None, block id).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned),
                   block_id=bid(block_no))
        for node_no, symbol, aligned, block_no in (
            (0, 'C', None, 0),
            (1, 'T', None, 0),
            (2, 'G', None, 0),
            (3, 'T', None, 1),
            (4, 'G', 5, 2),
            (5, 'T', 4, 2),
            (6, 'A', None, 2),
            (7, 'A', None, 2),
            (8, 'C', None, 2),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [[1, 2, 3]], '1'),
            ('seq1', [[0, 5, 7]], '1'),
            ('seq2', [[3, 4, 6, 8]], '2'),
            ('seq3', [], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_08_reversed_block(self):
    """Poagraph built from the reversed-block fixture MAF must equal the
    hand-written expected graph."""
    maf_path = self.maf_files_dir.joinpath("test_8_reversed_block.maf")

    # Each row: (node id, base symbol, id of the aligned node or None).
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(symbol),
                   aligned_to=None if aligned is None else nid(aligned))
        for node_no, symbol, aligned in (
            (0, 'C', None),
            (1, 'A', None),
            (2, 'T', None),
            # next block is reversed because it was converted to dag
            (3, 'G', None),
            (4, 'G', None),
            (5, 'A', 6),
            (6, 'G', 5),
            (7, 'A', None),
            (8, 'G', None),
            (9, 'T', None),
        )
    ]

    # Each row: (sequence id, list of paths as node ids, metadata group).
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in (
            ('seq0', [], '1'),
            ('seq1', [[0, 1, 3, 4, 5, 7, 8, 9]], '1'),
            ('seq2', [[0, 1, 2, 3, 4, 6, 7, 8, 9]], '2'),
            ('seq3', [[0, 1, 2, 3, 4, 6, 7, 9]], '2'),
        )
    }
    expected_poagraph = graph.Poagraph(expected_nodes, expected_sequences)

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_poagraph, _ = builder.build_from_dagmaf(
        maf, self.fasta_provider, self.metadatacsv)

    self.assertEqual(expected_poagraph, actual_poagraph)
def test_4_seqid_is_last(self):
    """Metadata parsed from the seqid-is-last fixture must map each
    sequence id to its name and group."""
    metadata_path = self.csv_files_dir.joinpath("test_4_seqid_is_last.csv")
    csv_content = pathtools.get_file_content_stringio(metadata_path)

    # Each row: (sequence id, name, group).
    expected_metadata = {
        msa.SequenceID(seq_id): {'name': name, 'group': group}
        for seq_id, name, group in (
            ('s1', 'sequence1', 'A'),
            ('s2', 'sequence2', 'B'),
            ('s3', 'sequence3', 'B'),
        )
    }

    actual_metadata = msa.MetadataCSV(csv_content, metadata_path).metadata

    self.assertEqual(expected_metadata, actual_metadata)
def test_10_metadata_feed_to_alignment_from_csv(self, test_name, maf_name, csv_name, po_name, expected_metadata):
    """Check that CSV metadata ends up on the poagraph sequences for every
    builder: DAG-ified MAF, plain MAF and PO.

    Args:
        test_name: Label of the parametrized case (not used in the body).
        maf_name: MAF file name inside ``self.alignment_files_dir``.
        csv_name: Metadata CSV file name inside ``self.csv_files_dir``.
        po_name: PO file name inside ``self.alignment_files_dir``.
        expected_metadata: Expected mapping of sequence id to the
            sequence's ``seqmetadata``.
    """
    maf_path = self.alignment_files_dir.joinpath(maf_name)
    csv_path = self.csv_files_dir.joinpath(csv_name)
    po_path = self.alignment_files_dir.joinpath(po_name)

    def load_maf():
        # A new Maf (and a new content stream) is created for each builder,
        # exactly as the original test did.
        return msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)

    def load_metadata():
        # Likewise a new MetadataCSV per builder call.
        return msa.MetadataCSV(pathtools.get_file_content_stringio(csv_path), csv_path)

    def metadata_of(poagraph):
        return {seq_id: seq.seqmetadata
                for seq_id, seq in poagraph.sequences.items()}

    poagraph, _ = builder.build_from_dagmaf(load_maf(),
                                            self.fasta_provider,
                                            load_metadata())
    self.assertEqual(expected_metadata, metadata_of(poagraph),
                     "metadata mismatch for build_from_dagmaf")

    poagraph = builder.build_from_maf(load_maf(), load_metadata())
    self.assertEqual(expected_metadata, metadata_of(poagraph),
                     "metadata mismatch for build_from_maf")

    # The PO content must be labelled with the PO file's own path; the
    # original passed maf_path here, which mislabels the input.
    po = msa.Po(pathtools.get_file_content_stringio(po_path), po_path)
    poagraph = builder.build_from_po(po, load_metadata())
    self.assertEqual(expected_metadata, metadata_of(poagraph),
                     "metadata mismatch for build_from_po")
def setUp(self):
    """Prepare the fixtures shared by the tests: the parsed metadata CSV,
    the directory with MAF files and a constant-base fasta provider."""
    tests_dir = Path(__file__).parent

    metadata_path = (tests_dir / "../seq_metadata.csv").resolve()
    metadata_stream = pathtools.get_file_content_stringio(metadata_path)
    self.metadatacsv = msa.MetadataCSV(metadata_stream, metadata_path)

    self.maf_files_dir = (tests_dir / "maf_files_with_cycles_or_reversion").resolve()

    missing_base = missings.MissingBase()
    self.fasta_provider = missings.ConstBaseProvider(missing_base)
def test_1_messy_sequences(self):
    """Convert ``test_1_messy_sequences.maf`` to poagraph nodes and
    sequences and compare them with the hand-written expectation."""
    maf_path = self.maf_files_dir.joinpath("test_1_messy_sequences.maf")

    # (node id, base, id of the aligned node or None, block id)
    node_table = [
        (0, 'A', None, 0),
        (1, 'A', 2, 0),
        (2, 'C', 1, 0),
        (3, 'T', None, 0),
        (4, 'C', 5, 0),
        (5, 'G', 4, 0),
        (6, 'A', None, 1),
        (7, 'C', None, 1),
        (8, 'G', None, 1),
        (9, 'C', 10, 2),
        (10, 'G', 9, 2),
        (11, 'T', None, 2),
        (12, 'C', None, 2),
        (13, 'C', None, 2),
        (14, 'A', None, 2),
    ]
    expected_nodes = [
        graph.Node(node_id=nid(node_no),
                   base=graph.Base(letter),
                   aligned_to=None if partner is None else nid(partner),
                   block_id=bid(block_no))
        for node_no, letter, partner, block_no in node_table
    ]

    # (sequence name, list of paths given as node ids, metadata group)
    sequence_table = [
        ('seq0', [[1, 3, 4, 6, 8, 9, 11, 12]], '1'),
        ('seq1', [[2, 3, 4, 10, 11, 12, 13, 14]], '1'),
        ('seq2', [[0, 2, 5, 6, 7, 10, 11, 12, 14]], '2'),
        ('seq3', [], '2'),
    ]
    expected_sequences = {
        msa.SequenceID(name): graph.Sequence(
            msa.SequenceID(name),
            [graph.SeqPath([nid(node_no) for node_no in path]) for path in paths],
            graph.SequenceMetadata({'group': group}))
        for name, paths, group in sequence_table
    }

    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    actual_nodes, actual_sequences = maf2poagraph.get_poagraph(maf, self.metadatacsv)

    self.assertEqual(expected_nodes, actual_nodes)
    self.assertEqual(expected_sequences, actual_sequences)
def get_default_blosum():
    """Return the default BLOSUM matrix, read from ``blosum80.mat``.

    The file is looked up at ``../../bin/blosum80.mat`` relative to the
    directory containing this module.

    Returns:
        A ``Blosum`` built from the file's content and its path.
    """
    # Derive the module directory with pathlib instead of concatenating
    # a '/' onto a string path.
    module_dir = Path(os.path.abspath(__file__)).parent
    default_blosum_path = pathtools.get_child_path(module_dir, "../../bin/blosum80.mat")
    blosum_content = pathtools.get_file_content_stringio(default_blosum_path)
    return Blosum(blosum_content, default_blosum_path)