def get_poagraph(maf: msa.Maf, metadata: Optional[msa.MetadataCSV]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Build poagraph nodes and sequences from a MAF multialignment.

    The alignment is walked block by block and column by column. Each
    column produces one node per distinct non-gap symbol it contains
    (symbols taken in sorted order), and every sequence showing that
    symbol in the column has the new node added through
    ``_add_node_do_sequence``.

    Args:
        maf: Multialignment file in MAF format.
        metadata: MetadataCSV, passed on to ``_init_poagraph``.

    Returns:
        Tuple of poagraph elements: the node list and the sequences
        keyed by sequence ID.
    """
    blocks = list(AlignIO.parse(maf.filecontent, "maf"))
    nodes, sequences = _init_poagraph(blocks, metadata)

    # Both counters start at -1 so the first increment yields ID 0.
    current_node_id = graph.NodeID(-1)
    column_id = graph.ColumnID(-1)

    for block_id, block in enumerate(blocks):
        global_logger.info(f"Processing block {block_id}...")

        # The first record's length is taken as the width of the block.
        for col in range(len(block[0].seq)):
            column_id += 1

            symbol_by_sequence = {
                msa.SequenceID(record.id): record[col] for record in block
            }
            # Distinct symbols of this column, gap marker excluded.
            column_symbols = sorted(set(symbol_by_sequence.values()) - {'-'})
            # IDs the nodes of this column are about to receive.
            column_nodes_ids = [
                graph.NodeID(current_node_id + shift + 1)
                for shift in range(len(column_symbols))
            ]

            for position, column_symbol in enumerate(column_symbols):
                current_node_id += 1
                node_base = graph.Base(column_symbol)
                aligned_to = _get_next_aligned_node_id(graph.NodeID(position),
                                                       column_nodes_ids)
                nodes.append(graph.Node(node_id=current_node_id,
                                        base=node_base,
                                        aligned_to=aligned_to,
                                        column_id=graph.ColumnID(column_id),
                                        block_id=graph.BlockID(block_id)))

                for seq_id, symbol in symbol_by_sequence.items():
                    if symbol != column_symbol:
                        continue
                    sequences[seq_id] = _add_node_do_sequence(
                        sequence=sequences[seq_id],
                        node_id=current_node_id)

    return nodes, sequences
def _get_poagraph_paths_and_nodes(po_lines: List[str],
                                  sequences_info: Dict[int, POSequenceInfo],
                                  sequences: Dict[msa.SequenceID, graph.Sequence]) -> \
        Tuple[List[graph.Node], Dict[msa.SequenceID, graph.Sequence]]:
    """Read the node lines of a PO file into nodes and sequence paths.

    The node count is read from ``po_lines[3]`` and the path count from
    ``po_lines[4]``; node lines start at index ``5 + 2 * paths_count``.
    Nodes are numbered from 0 in the order their lines appear.

    Args:
        po_lines: Lines of the PO file.
        sequences_info: Maps the sequence IDs used on node lines to
            objects whose ``name`` is the key into ``sequences``.
        sequences: Sequences to extend; modified in place.

    Returns:
        Tuple of the created nodes and the (same) ``sequences`` dict.
    """
    nodes_count = int(_extract_line_value(po_lines[3]))
    paths_count = int(_extract_line_value(po_lines[4]))
    first_node_line = 5 + paths_count * 2

    nodes: List[graph.Node] = [None] * nodes_count
    node_line_numbers = range(first_node_line, first_node_line + nodes_count)
    for node_id, line_number in enumerate(node_line_numbers):
        node_line = po_lines[line_number]
        # The first character of a node line is its base symbol.
        base = graph.Base(node_line[0].upper())
        _, po_sequences_ids, aligned_to = _extract_node_parameters(node_line)
        sequences_ids = [sequences_info[po_id].name
                         for po_id in po_sequences_ids]

        nodes[node_id] = graph.Node(graph.NodeID(node_id),
                                    base,
                                    graph.NodeID(aligned_to))

        for seq_id in sequences_ids:
            paths = sequences[seq_id].paths
            if len(paths) == 1:
                # Exactly one path so far: keep extending it.
                paths[0].append(graph.NodeID(node_id))
            else:
                # Otherwise start a new path holding just this node.
                paths.append(graph.SeqPath([graph.NodeID(node_id)]))

    return nodes, sequences
def nid(x):
    """Shorthand for constructing a ``graph.NodeID`` from ``x``."""
    node_id = graph.NodeID(x)
    return node_id
def _get_max_node_id(nodes: List[graph.Node]) -> graph.NodeID:
    """Return ``len(nodes) - 1`` wrapped as a ``graph.NodeID``.

    Args:
        nodes: List of poagraph nodes.

    Returns:
        NodeID equal to the index of the last element of ``nodes``
        (``-1`` for an empty list).
    """
    last_index = len(nodes) - 1
    return graph.NodeID(last_index)
def nid(x):
    """Shorthand for constructing a ``graph.NodeID`` from ``x``."""
    node_id = graph.NodeID(x)
    return node_id


def bid(x):
    """Shorthand for constructing a ``graph.BlockID`` from ``x``."""
    block_id = graph.BlockID(x)
    return block_id
def nid(x):
    """Shorthand for constructing a ``graph.NodeID`` from ``x``."""
    node_id = graph.NodeID(x)
    return node_id


def bid(x):
    """Shorthand for constructing a ``graph.Base`` from ``x``."""
    base = graph.Base(x)
    return base