def build_poa_affinity_tree(p: graph.Poagraph,
                            blosum: Optional[parameters.Blosum],
                            output_dir: Path,
                            hbmin: parameters.Hbmin,
                            verbose: bool) -> tree.AffinityTree:
    """Builds Affinity Tree coherent with poa software.

    This method builds a simple version of Affinity Tree as it uses
    a single call to poa software. Poa provides division of sequences
    in Poagraph into consistent groups with a consensus path assigned
    to each group. These groups are converted in this method to
    Affinity Tree nodes and connected with a dummy root node (id 0)
    so the result is coherent with pangtree definition of Affinity Tree.
    One extra node is always appended for the sequences that poa did
    not assign to any group (its sequence list may be empty).

    Args:
        p: Poagraph containing sequences to be divided into groups
            (Affinity Tree nodes).
        blosum: BLOSUM matrix. If None, the matrix returned by
            get_default_blosum() is used.
        output_dir: Path to a directory that can be used by poa software.
        hbmin: Parameter required by poa software. The minimum value of
            sequence compatibility to generated consensus.
        verbose: Switch to control logging intensity. Not read by this
            function's body.

    Raises:
        AffinityTreeBuildException: if poa reports that no consensus
            can be found (poa.NoConsensusError).

    Returns:
        Affinity Tree made of the dummy root, one node per poa consensus
        and one node for the unassigned sequences.
    """

    def _convert_consensus_paths_to_affinity_tree_nodes(consensus_paths):
        """Turns poa consensus groups into children of the dummy root."""
        # The list of sequence ids is the same for every consensus, so it
        # is fetched once instead of once per loop iteration.
        all_sequences_ids = p.get_sequences_ids()
        at_nodes = []
        # Sets give O(1) membership tests; ids are already used as
        # dictionary keys in the compatibilities mapping.
        assigned_sequences = set()
        for c_id, c_info in consensus_paths.items():
            group_ids = set(c_info.assigned_sequences_ids)
            assigned_sequences |= group_ids
            compatibilities = p.get_compatibilities(all_sequences_ids,
                                                    c_info.path)
            if group_ids:
                mincomp = min(c
                              for seq_id, c in compatibilities.items()
                              if seq_id in group_ids)
            else:
                # NOTE(review): a plain 0 is used here while the node for
                # unassigned sequences uses graph.Compatibility(0) -
                # confirm which type consumers of mincomp expect.
                mincomp = 0
            # Node id 0 is reserved for the root, hence the shift by one.
            new_node = tree.AffinityNode(
                id_=tree.AffinityNodeID(c_id + 1),
                parent=tree.AffinityNodeID(0),
                sequences=c_info.assigned_sequences_ids,
                mincomp=mincomp,
                compatibilities=compatibilities,
                consensus=c_info.path,
                children=[])
            at_nodes.append(new_node)

        unassigned_sequences = [seq_id
                                for seq_id in all_sequences_ids
                                if seq_id not in assigned_sequences]
        node_for_unassigned_sequences = tree.AffinityNode(
            parent=tree.AffinityNodeID(0),
            sequences=unassigned_sequences,
            id_=tree.AffinityNodeID(len(at_nodes) + 1),
            mincomp=graph.Compatibility(0),
            children=[])
        at_nodes.append(node_for_unassigned_sequences)
        return at_nodes

    global_logger.info("POA defined affinity tree generation started.")
    if blosum is None:
        blosum = get_default_blosum()
    _raise_error_if_invalid_poagraph(p)
    try:
        consensus_paths = poa.get_consensuses(p,
                                              p.get_sequences_ids(),
                                              output_dir,
                                              "poa_tree",
                                              blosum.filepath,
                                              hbmin)
    except poa.NoConsensusError as err:
        # Keep the poa error as the explicit cause of the domain error.
        raise AffinityTreeBuildException(
            "No consensus in the Affinity Tree.") from err
    consensus_nodes = _convert_consensus_paths_to_affinity_tree_nodes(
        consensus_paths)
    root_node = tree.AffinityNode(
        id_=tree.AffinityNodeID(0),
        children=[c_node.id_ for c_node in consensus_nodes])
    affinity_tree = tree.AffinityTree([root_node] + consensus_nodes)
    global_logger.info("POA defined affinity tree generation finished.")
    return affinity_tree
def main():
    """Runs the command-line pipeline from parsed arguments to output files.

    Steps, in order:
      1. Parse arguments and configure logging (file handlers when
         verbose, everything disabled when quiet).
      2. Build the poagraph from the multialignment (raw MAF, DAG-ified
         MAF or PO).
      3. If a consensus method was requested ('poa' or 'tree'), build the
         consensus tree and save it as "consensus_tree.newick".
      4. Optionally save "poagraph.po", "sequences.fasta" and (when a
         consensus tree exists) "consensuses.fasta".
      5. Save the summary as "pangenome.json", including the running time.
    """
    parser = cli.get_parser()
    args = parser.parse_args()
    start = datetime.datetime.now()
    if not args.quiet and args.verbose:
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "details",
                                              "details.log",
                                              propagate=False)
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "",
                                              "details.log",
                                              propagate=False)
    if args.quiet:
        logprocess.disable_all_loggers()

    poagraph, dagmaf, fasta_provider = None, None, None
    if isinstance(args.multialignment, Maf) and args.raw_maf:
        poagraph = Poagraph.build_from_maf(args.multialignment,
                                           args.metadata)
    elif isinstance(args.multialignment, Maf) and not args.raw_maf:
        fasta_provider = cli.resolve_fasta_provider(args)
        poagraph, dagmaf = Poagraph.build_from_dagmaf(args.multialignment,
                                                      fasta_provider,
                                                      args.metadata)
    elif isinstance(args.multialignment, Po):
        poagraph = Poagraph.build_from_po(args.multialignment,
                                          args.metadata)

    consensus_tree = None
    if args.consensus is not None:
        blosum = args.blosum if args.blosum else cli.get_default_blosum()
        if fasta_provider is not None and isinstance(fasta_provider,
                                                     ConstSymbolProvider):
            # The constant symbol used for missing nucleotides must be
            # known to the BLOSUM matrix.
            blosum.check_if_symbol_is_present(
                fasta_provider.missing_symbol.as_str())

        consensus_output_dir = pathtools.get_child_dir(args.output_dir,
                                                       "consensus")

        if args.consensus == 'poa':
            consensus_tree = simple_tree_generator.get_simple_consensus_tree(
                poagraph, blosum, consensus_output_dir, args.hbmin,
                args.verbose)
        elif args.consensus == 'tree':
            max_strategy = cli.resolve_max_strategy(args)
            node_strategy = cli.resolve_node_strategy(args)
            consensus_tree = tree_generator.get_consensus_tree(
                poagraph, blosum, consensus_output_dir, args.stop, args.p,
                max_strategy, node_strategy, args.verbose)

        # Sequence names are best-effort: when metadata is absent or has
        # no "name" entry the tree is exported without them. Only lookup
        # failures are tolerated, so KeyboardInterrupt/SystemExit and
        # unrelated bugs are no longer silently swallowed.
        try:
            seq_id_to_name = {
                seq_id: seq.seqmetadata["name"]
                for seq_id, seq in poagraph.sequences.items()
            }
        except (AttributeError, LookupError, TypeError):
            seq_id_to_name = None

        newick_consensus_tree = consensus_tree.as_newick(seq_id_to_name)

        pathtools.save_to_file(
            newick_consensus_tree,
            pathtools.get_child_path(args.output_dir,
                                     "consensus_tree.newick"))

    if args.output_po:
        pangenome_po = poagraph_to_PangenomePO(poagraph)
        pathtools.save_to_file(
            pangenome_po,
            pathtools.get_child_path(args.output_dir, "poagraph.po"))

    if args.output_fasta:
        sequences_fasta = poagraph_to_fasta(poagraph)
        pathtools.save_to_file(
            sequences_fasta,
            pathtools.get_child_path(args.output_dir, "sequences.fasta"))
        if consensus_tree:
            consensuses_fasta = consensuses_tree_to_fasta(
                poagraph, consensus_tree)
            pathtools.save_to_file(
                consensuses_fasta,
                pathtools.get_child_path(args.output_dir,
                                         "consensuses.fasta"))

    end = datetime.datetime.now()
    pangenomejson = to_PangenomeJSON(
        task_parameters=cli.get_task_parameters(
            args, running_time=f"{end-start}s"),
        poagraph=poagraph,
        dagmaf=dagmaf,
        consensuses_tree=consensus_tree)

    pangenome_json_str = to_json(pangenomejson)
    pathtools.save_to_file(
        pangenome_json_str,
        pathtools.get_child_path(args.output_dir, "pangenome.json"))
def build_affinity_tree(poagraph: graph.Poagraph,
                        blosum: Optional[parameters.Blosum],
                        output_dir: Path,
                        stop: parameters.Stop,
                        p: parameters.P,
                        verbose: bool) -> tree.AffinityTree:
    """Builds Affinity Tree.

    Affinity Tree is defined in paper 'Getting insight into the
    pan-genome structure with Pangtree'. The tree starts from a single
    root node; nodes waiting to be split are kept in a work list and each
    of them is replaced by the children computed for it, until no node
    is left to process.

    Args:
        poagraph: Poagraph containing sequences to be divided into groups
            (Affinity Tree nodes).
        blosum: BLOSUM matrix. If None, the matrix returned by
            get_default_blosum() is used.
        output_dir: Path to a directory that can be used by poa software.
        stop: Value of mincomp above which an affinity tree node is no
            more split.
        p: Value changing the linear meaning of compatibility when
            searching for cutoff.
        verbose: When true, an additional file handler writing
            "tresholds.csv" into output_dir is registered.

    Raises:
        AffinityTreeGenerationException: if consensuses cannot be found
            (per the earlier documentation; raised by helpers outside
            this function).

    Returns:
        Affinity Tree generated with Pangtree algorithm.
    """
    global_logger.info("Affinity Tree generation started.")

    if blosum is None:
        blosum = get_default_blosum()

    if verbose:
        logprocess.add_file_handler_to_logger(output_dir,
                                              "tresholdsCSV",
                                              "tresholds.csv",
                                              "%(message)s",
                                              False)

    _raise_error_if_invalid_poagraph(poagraph)

    affinity_tree = tree.AffinityTree(
        [_get_root_node(poagraph, blosum.filepath, output_dir, p)])

    pending = deque([affinity_tree.get_node(tree.AffinityNodeID(0))])
    while len(pending) > 0:
        # pop() takes from the right end, so the most recently queued
        # node is handled first.
        current = pending.pop()
        offspring = _get_children_nodes_looping(
            current,
            poagraph,
            output_dir,
            blosum.filepath,
            p,
            affinity_tree.get_max_node_id())

        # A single child means the node was not split; nothing is
        # attached to the tree in that case.
        if len(offspring) != 1:
            for descendant in offspring:
                every_sequence_id = list(poagraph.sequences.keys())
                descendant.compatibilities = poagraph.get_compatibilities(
                    sequences_ids=every_sequence_id,
                    consensus_path=descendant.consensus,
                    p=p)
                current.children.append(descendant.id_)
                affinity_tree.nodes.append(descendant)
                if not _node_is_ready(descendant, stop):
                    pending.append(descendant)

    global_logger.info("Affinity Tree generation finished.\n")
    return affinity_tree
def main():
    """Command-line entry point: builds a poagraph and writes the results.

    The multialignment given in the arguments is converted to a poagraph.
    When an affinity method is requested ('poa' or 'tree') the Affinity
    Tree is built and stored as "affinity_tree.newick" in the
    "affinitytree" child directory. Depending on the flags,
    "poagraph.po", "_sequences.fasta" and "affinitytree.fasta" are also
    written. "pangenome.json" is always written last and includes the
    running time.
    """
    args = cli.get_parser().parse_args()
    start = datetime.datetime.now()

    # Quiet mode wins over verbose: either all loggers are disabled or,
    # when verbose, the detail file handlers are attached.
    if args.quiet:
        logprocess.disable_all_loggers()
    elif args.verbose:
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "details",
                                              "details.log",
                                              propagate=False)
        logprocess.add_file_handler_to_logger(args.output_dir,
                                              "",
                                              "details.log",
                                              propagate=False)

    poagraph = None
    dagmaf = None
    fasta_provider = None
    alignment = args.multialignment
    if isinstance(alignment, msa.Maf):
        if args.raw_maf:
            poagraph = builder.build_from_maf(alignment, args.metadata)
        else:
            fasta_provider = cli.resolve_fasta_provider(args)
            poagraph, dagmaf = builder.build_from_dagmaf(alignment,
                                                         fasta_provider,
                                                         args.metadata)
    elif isinstance(alignment, msa.Po):
        poagraph = builder.build_from_po(alignment, args.metadata)

    affinity_tree = None
    if args.affinity is not None:
        blosum = args.blosum or cli.get_default_blosum()

        constant_missing_base = (
            fasta_provider is not None
            and isinstance(fasta_provider, missings.ConstBaseProvider))
        if constant_missing_base:
            # The constant base used for missing nucleotides must be
            # known to the BLOSUM matrix.
            blosum.check_if_symbol_is_present(
                fasta_provider.missing_base.as_str())

        consensus_output_dir = pathtools.get_child_dir(args.output_dir,
                                                       "affinitytree")

        if args.affinity == 'poa':
            affinity_tree = at_builders.build_poa_affinity_tree(
                poagraph, blosum, consensus_output_dir, args.hbmin,
                args.verbose)
        elif args.affinity == 'tree':
            affinity_tree = at_builders.build_affinity_tree(
                poagraph, blosum, consensus_output_dir, args.stop, args.p,
                args.verbose)

        # Metadata is attached to the Newick export only when provided.
        seq_id_to_metadata = None
        if args.metadata is not None:
            seq_id_to_metadata = {}
            for seq_id, seq in poagraph.sequences.items():
                seq_id_to_metadata[seq_id] = seq.seqmetadata

        affinity_tree_newick = affinity_tree.as_newick(seq_id_to_metadata,
                                                       separate_leaves=True)
        newick_path = pathtools.get_child_path(consensus_output_dir,
                                               "affinity_tree.newick")
        pathtools.save_to_file(affinity_tree_newick, newick_path)

    if args.output_po:
        po_content = po.poagraph_to_PangenomePO(poagraph)
        po_path = pathtools.get_child_path(args.output_dir, "poagraph.po")
        pathtools.save_to_file(po_content, po_path)

    if args.output_fasta:
        fasta_content = fasta.poagraph_to_fasta(poagraph)
        fasta_path = pathtools.get_child_path(args.output_dir,
                                              "_sequences.fasta")
        pathtools.save_to_file(fasta_content, fasta_path)
        if affinity_tree:
            tree_fasta_content = fasta.affinity_tree_to_fasta(poagraph,
                                                              affinity_tree)
            tree_fasta_path = pathtools.get_child_path(args.output_dir,
                                                       "affinitytree.fasta")
            pathtools.save_to_file(tree_fasta_content, tree_fasta_path)

    end = datetime.datetime.now()
    task_parameters = cli.get_task_parameters(args,
                                              running_time=f"{end-start}s")
    pangenomejson = json.to_PangenomeJSON(task_parameters=task_parameters,
                                          poagraph=poagraph,
                                          dagmaf=dagmaf,
                                          affinity_tree=affinity_tree)
    serialized = json.to_json(pangenomejson)
    summary_path = pathtools.get_child_path(args.output_dir,
                                            "pangenome.json")
    pathtools.save_to_file(serialized, summary_path)