def run_pangtree(maf_path: Path, fasta_path: Path, output_dir: Path, po_output: bool) -> None:
    output_dir = pathtools.get_child_dir(output_dir, pathtools.get_current_time())
    print(f"Running pangtree for maf: {maf_path} and fasta: {fasta_path}. "
          f"Output in: {output_dir}, include po file: {po_output}.")
    fasta_provider = missings.FromFile(fasta_path)
    maf = msa.Maf(pathtools.get_file_content_stringio(maf_path), maf_path)
    poagraph, dagmaf = builder.build_from_dagmaf(maf, fasta_provider)
    # p_values: iterable of P parameter values, assumed to be defined elsewhere in this script.
    for p in p_values:
        current_output_dir = pathtools.get_child_dir(output_dir, str(p).replace(".", "_"))
        stop = at_params.Stop(0.99)
        at = at_builders.build_affinity_tree(poagraph,
                                             None,
                                             current_output_dir,
                                             stop,
                                             at_params.P(p),
                                             True)
        at_newick = at.as_newick(None, separate_leaves=True)
        pathtools.save_to_file(at_newick,
                               pathtools.get_child_path(current_output_dir, "affinity_tree.newick"))

        if po_output:
            pangenome_po = po.poagraph_to_PangenomePO(poagraph)
            pathtools.save_to_file(pangenome_po,
                                   pathtools.get_child_path(current_output_dir, "poagraph.po"))

        task_params = json.TaskParameters(multialignment_file_path=str(maf_path),
                                          multialignment_format="maf",
                                          datatype="nucleotides",
                                          blosum_file_path="",
                                          output_path=current_output_dir,
                                          fasta_provider=fasta_provider,
                                          fasta_source_file=fasta_path,
                                          consensus_type="tree",
                                          stop=str(stop),
                                          p=str(p),
                                          output_with_nodes=False)
        pangenomejson = json.to_PangenomeJSON(task_parameters=task_params,
                                              poagraph=poagraph,
                                              dagmaf=dagmaf,
                                              affinity_tree=at)
        pangenome_json_str = json.to_json(pangenomejson)
        pathtools.save_to_file(pangenome_json_str,
                               pathtools.get_child_path(current_output_dir, "pangenome.json"))
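# A minimal usage sketch for run_pangtree (assumed paths and parameter grid,
# not taken from the source). p_values here stands in for the module-level
# constant that run_pangtree iterates over.
if __name__ == "__main__":
    p_values = [0.25, 1, 4]                                  # hypothetical P parameter grid
    run_pangtree(maf_path=Path("data/alignment.maf"),        # assumed example path
                 fasta_path=Path("data/sequences.fasta"),    # assumed example path
                 output_dir=Path("output"),
                 po_output=True)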
def get_compatibilities(self,
                        sequences_ids: List[msa.SequenceID],
                        consensus_path: SeqPath,
                        p: Optional[at_params.P] = at_params.P(1)) -> Dict[msa.SequenceID, Compatibility]:
    """Calculate compatibilities of sequences listed in sequences_ids to given consensus_path. Use P.

    Args:
        sequences_ids: Compatibilities of these sequences will be calculated.
        consensus_path: Sequences will be compared to this consensus path.
        p: Parameter P, see affinity tree algorithm parameters.

    Returns:
        Dictionary of sequences_ids and corresponding compatibility to given consensus_path.

    Raises:
        KeyError: If there is no sequence with given ID.
    """
    compatibilities = dict()
    for seq_id in sequences_ids:
        try:
            sequence_paths = self.sequences[seq_id].paths
        except KeyError:
            raise KeyError("No sequence with given ID in poagraph.")
        if len(sequence_paths) == 1:
            sequence_path = sequence_paths[0]
        else:
            # Flatten all paths of the sequence into a single list of node ids.
            sequence_path = [node_id
                             for path in sequence_paths
                             for node_id in path]
        compatibilities[seq_id] = Compatibility(
            len(set(sequence_path).intersection(set(consensus_path))) / len(sequence_path),
            p)
    return compatibilities
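# Worked sketch of the compatibility formula used above, with toy node ids
# assumed for illustration: a 10-node sequence path sharing 7 nodes with the
# consensus path gives a raw compatibility of 7/10 = 0.7; Compatibility then
# raises that value to the power p.
sequence_path = set(range(10))          # hypothetical node ids of a sequence
consensus_path = set(range(3, 13))      # hypothetical node ids of a consensus
raw = len(sequence_path & consensus_path) / len(sequence_path)   # 7 / 10 = 0.7
# With p = 0.5 the stored value becomes 0.7 ** 0.5 ~= 0.8367
# (compare the first case in test_1_p_parameter_influence below).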
def get_parser() -> argparse.ArgumentParser:
    """Create ArgumentParser for pang module."""
    p = argparse.ArgumentParser(prog='pangtreebuild',
                                description="""This software builds poagraph and generates affinitytree.""",
                                epilog="""For more information check github.com/meoke/pangtree""")
    p.add_argument('--output_dir',
                   type=_cli_dir_arg,
                   default=get_default_output_dir(),
                   help='Output directory path.')
    p.add_argument('--multialignment',
                   metavar='MULTIALIGNMENT_PATH',
                   type=_mulitalignment_file,
                   required=True,
                   help='Path to the multialignment file.')
    p.add_argument('--datatype',
                   type=_data_type,
                   default=graph.DataType.Nucleotides,
                   help='\'n\' for nucleotides, \'p\' for proteins. ' + inspect.getdoc(graph.DataType))
    p.add_argument('--metadata',
                   metavar='METADATA_PATH',
                   type=_metadata_file,
                   help='Path to the csv file with metadata. ' + inspect.getdoc(msa.MetadataCSV))
    p.add_argument('--raw_maf',
                   action='store_true',
                   default=False,
                   help="""Poagraph building from maf file parameter.
                           Set if the maf content must not be transformed to a DAG
                           before building the graph. A poagraph built this way
                           still provides an affinity tree, but the affinity tree
                           does not reflect the real-life sequences.""")
    p.add_argument('--fasta_provider',
                   metavar='FASTA_PROVIDER',
                   choices=['ncbi', 'file'],
                   help="""Maf file may not include full sequences.
                           In such case an additional data source is needed.
                           Use 'ncbi' for NCBI (activates CACHE option) or 'file'
                           for a file (then provide also FASTA_PATH).
                           MISSING_SYMBOL is used if this argument is omitted.""")
    p.add_argument('--missing_symbol',
                   metavar='MISSING_SYMBOL',
                   type=_cli_arg(missings.MissingBase),
                   help=inspect.getdoc(missings.MissingBase))
    p.add_argument('--cache',
                   action='store_true',
                   help="""Set if fastas downloaded from NCBI should be cached
                           locally in the .fastacache folder. Used if FASTA_PROVIDER
                           is 'ncbi'. Sequences downloaded from NCBI are stored
                           and reused by this program.""")
    p.add_argument('--fasta_path',
                   metavar='FASTA_PATH',
                   type=_path_if_valid,
                   help="""ZIP archive with fasta files or a fasta file
                           used as FASTA_PROVIDER.""")
    p.add_argument('--affinity',
                   choices=['poa', 'tree'],
                   help="""Generate affinity tree. Use 'poa' for the direct result
                           of the poa software, 'tree' for the Affinity Tree algorithm.""")
    p.add_argument('--blosum',
                   type=_blosum_file,
                   metavar='BLOSUM_PATH',
                   help='Path to the blosum file. ' + inspect.getdoc(at_params.Blosum))
    p.add_argument('--hbmin',
                   type=_cli_arg(at_params.Hbmin),
                   default=at_params.Hbmin(),
                   help='Simple POA algorithm parameter. Hbmin value. ' + inspect.getdoc(at_params.Hbmin))
    p.add_argument('--stop',
                   type=_cli_arg(at_params.Stop),
                   default=at_params.Stop(),
                   help='Tree POA algorithm parameter. ' + inspect.getdoc(at_params.Stop))
    p.add_argument('--p',
                   type=_cli_arg(at_params.P),
                   default=at_params.P(),
                   help='Tree consensus algorithm parameter. ' + inspect.getdoc(at_params.P))
    p.add_argument('--output_fasta',
                   action='store_true',
                   help="""Set if fasta files for sequences and affinity tree
                           must be produced.""")
    p.add_argument('--output_po',
                   action='store_true',
                   default=False,
                   help='Set if po file for poagraph must be produced.')
    p.add_argument('--output_full',
                   action='store_true',
                   default=False,
                   help='Set if the result pangenome.json should contain '
                        'the list of node ids for sequences and consensuses.')
    p.add_argument('-v', '--verbose',
                   action='store_true',
                   default=False,
                   help='Set if detailed log files must be produced.')
    p.add_argument('-q', '--quiet',
                   action='store_true',
                   default=False,
                   help='Set to turn off console logging.')
    return p
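# A minimal sketch of how the parser above might be wired into an entry point;
# the argument values are hypothetical examples, not taken from the source.
if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args(["--multialignment", "data/alignment.maf",   # assumed path
                              "--affinity", "tree",
                              "--verbose"])
    print(args.output_dir, args.datatype, args.affinity)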
def __sub__(self, other: Union["Compatibility", at_params.Stop]) -> "Compatibility":
    self._check_p_equality(other)
    return Compatibility(self.value - other.value, at_params.P(self.p))
def __init__(self, compatibility: float, p: at_params.P = at_params.P(1)):
    self.value: float = compatibility**p.value
    self.p: float = p.value
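# Small worked sketch of the Compatibility semantics defined above (toy values,
# assumed for illustration): the raw compatibility is raised to the power p at
# construction, and __sub__ feeds the difference of the stored values back
# through __init__, so the exponent is applied once more to that difference.
c1 = Compatibility(0.9, at_params.P(2))   # stored value: 0.9 ** 2 = 0.81
c2 = Compatibility(0.7, at_params.P(2))   # stored value: 0.7 ** 2 = 0.49
diff = c1 - c2                            # Compatibility(0.81 - 0.49, P(2)); diff.value == 0.32 ** 2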
class AffinityTreeGenerationTests(unittest.TestCase):

    @data((at_params.P(0.5), graph.Compatibility(0.836660026534076)),
          (at_params.P(1), graph.Compatibility(0.7)),
          (at_params.P(4), graph.Compatibility(0.6561)))
    @unpack
    def test_1_p_parameter_influence(self, p: at_params.P,
                                     expected_cutoff: graph.Compatibility):
        nodes = [graph.Node(node_id=nid(0), base=b('T'), aligned_to=None),
                 graph.Node(node_id=nid(1), base=b('A'), aligned_to=None),
                 graph.Node(node_id=nid(2), base=b('G'), aligned_to=None),
                 graph.Node(node_id=nid(3), base=b('A'), aligned_to=None),
                 graph.Node(node_id=nid(4), base=b('C'), aligned_to=None),
                 graph.Node(node_id=nid(5), base=b('A'), aligned_to=None),
                 graph.Node(node_id=nid(6), base=b('C'), aligned_to=None),
                 graph.Node(node_id=nid(7), base=b('G'), aligned_to=None),
                 graph.Node(node_id=nid(8), base=b('T'), aligned_to=None),
                 graph.Node(node_id=nid(9), base=b('A'), aligned_to=None)]

        sequences = {
            msa.SequenceID('seq0'):
                graph.Sequence(msa.SequenceID('seq0'),
                               [graph.SeqPath([*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 9])])],
                               graph.SequenceMetadata({})),
            msa.SequenceID('seq1'):
                graph.Sequence(msa.SequenceID('seq1'),
                               [graph.SeqPath([*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 8, 9])])],
                               graph.SequenceMetadata({})),
            msa.SequenceID('seq2'):
                graph.Sequence(msa.SequenceID('seq2'),
                               [graph.SeqPath([*map(nid, [10, 11, 12, 13, 14, 15, 16, 7, 8, 9])])],
                               graph.SequenceMetadata({})),
            msa.SequenceID('seq3'):
                graph.Sequence(msa.SequenceID('seq3'),
                               [graph.SeqPath([*map(nid, [10, 11, 12, 3, 4, 5, 6, 7, 8, 9])])],
                               graph.SequenceMetadata({})),
            msa.SequenceID('seq4'):
                graph.Sequence(msa.SequenceID('seq4'),
                               [graph.SeqPath([*map(nid, [10, 11, 2, 3, 4, 5, 6, 7, 8, 9])])],
                               graph.SequenceMetadata({}))
        }

        poagraph = graph.Poagraph(nodes, sequences)
        consensus_path = graph.SeqPath([*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19])])
        compatibilities = poagraph.get_compatibilities(poagraph.get_sequences_ids(),
                                                       consensus_path,
                                                       p)
        actual_cutoff = at_builders._find_node_cutoff([c for c in compatibilities.values()],
                                                      []).cutoff
        self.assertAlmostEqual(expected_cutoff.value, actual_cutoff.value)

    @data(
        # single compatibility value
        (0.5, [graph.Compatibility(0.5)]),
        # two compatibility values
        (0.7, [graph.Compatibility(0.5), graph.Compatibility(0.7)]),
        (1, [graph.Compatibility(1), graph.Compatibility(0.45)]),
        (0.9, [graph.Compatibility(0.9), graph.Compatibility(0.5)]),
        # repeated values
        (0.7, [*map(graph.Compatibility, [0.5, 0.7, 0.7])]),
        (0.9, [*map(graph.Compatibility, [0.9, 0.5, 0.5])]),
        (1, [*map(graph.Compatibility, [0.45, 1, 0.45, 0.45])]),
        # many unique compatibility values
        (.8, [*map(graph.Compatibility, [.3, .4, .8])]),
        (0.91, [*map(graph.Compatibility, [0.31, 0.32, 0.91, 0.92, 0.93, 0.97])]),
        (0.91, [*map(graph.Compatibility, [0.29, 0.3, 0.33, 0.91, 0.92, 0.93, 0.97])]),
        (1, [*map(graph.Compatibility, [0.81, 0.75, 0.8, 0.81, 1])]),
        (0.9, [*map(graph.Compatibility, [0.5, 0.9, 0.99])]),
        (0.7, [*map(graph.Compatibility, [0.2, 0.85, 0.7, 0.8])]),
        (0.99, [*map(graph.Compatibility, [0.99, 0.9, 0.99])]),
        (0.99, [*map(graph.Compatibility, [0.99])]),
        # repeated distance between values
        (.4, [*map(graph.Compatibility, [.3, .4, .5])]),
        # all the same values
        (.1, [*map(graph.Compatibility, [.1, .1, .1])]))
    @unpack
    def test_2_find_cutoff_no_so_far_values(self,
                                            expected_cutoff: float,
                                            compatibilities: List[graph.Compatibility]):
        actual_cutoff = at_builders._find_node_cutoff(compatibilities, []).cutoff
        self.assertEqual(expected_cutoff, actual_cutoff.value)
    def test_3_find_cutoff_no_compatibilities(self):
        with self.assertRaises(ValueError) as err:
            _ = at_builders._find_node_cutoff([], []).cutoff
        self.assertEqual(str(err.exception),
                         """Empty compatibilities list. Cannot find cutoff.""")

    @data(
        # guard <= all compatibilities
        (0.2, [0.2, 0.7, 0.8, 0.85], [0.1, 0.01, 0]),
        (0.7, [0.7, 0.85, 0.7, 0.8], [0.1, 0.01, 0]),
        (0.8, [0.7, 0.7, 0.85, 0.8], [0.85, 0.91, 1.0]),
        # guard > all compatibilities
        (0.6, [0.3, 0.6, 0.61, 0.61], [0.99]),   # big distance to guard
        (0.9, [0.2, 0.97, 0.98, 0.9], [0.99]),   # small distance to guard
        # guard between compatibilities
        (0.5, [0.2, 0.57, 0.58, 0.5], [0.55]),   # take smaller than guard
        (0.58, [0.2, 0.27, 0.58, 0.2], [0.55]),  # take greater than guard
        (0.55, [0.2, 0.58, 0.27, 0.55], [0.55])  # take equal to guard
    )
    @unpack
    def test_4_find_cutoff_with_so_far_values(self,
                                              expected_cutoff,
                                              compatibilities,
                                              so_far_cutoffs):
        compatibilities = [graph.Compatibility(c) for c in compatibilities]
        so_far_cutoffs = [graph.Compatibility(c) for c in so_far_cutoffs]
        actual_cutoff = at_builders._find_node_cutoff(compatibilities,
                                                      so_far_cutoffs).cutoff
        self.assertEqual(expected_cutoff, actual_cutoff.value)