def split_alt_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Split long alternate nodes

    Every ALT node whose sequence is longer than max_len is replaced by two new
    ALT nodes: one holding the first padding_len bases and one holding the last
    padding_len bases. The bases in between are dropped and the two new nodes
    are not connected to each other. Both new nodes reuse the chrom / start /
    end / sequences values of the node they replace.

    All incoming edges of the original node are re-created onto the first new
    node and all outgoing edges from the second new node (edge "sequences"
    labels are carried over); the original node is then deleted.

    :param graph: graph to work on (modified in place)
    :param max_len: ALT nodes with a sequence longer than this are split
    :param padding_len: number of bases kept at each end of a split sequence;
                        must satisfy 0 <= 2 * padding_len <= max_len
    :raises AssertionError: if the padding_len / max_len precondition is violated
    """
    assert 0 <= 2 * padding_len <= max_len
    # Snapshot the ALT nodes: nodes are added and deleted inside the loop.
    for node in list(graph.altNodes()):
        sequence = node["sequence"]
        if len(sequence) <= max_len:
            continue
        logging.info(f"Splitting long ALT node: {node['name']}")

        head = sequence[:padding_len]
        # Deliberately not sequence[-padding_len:]: for padding_len == 0 that is
        # sequence[0:], i.e. the whole sequence instead of an empty tail.
        tail = sequence[len(sequence) - padding_len:]

        n1 = graph.add_altNode(node["chrom"], node["start"], node["end"], head, node["sequences"])
        n2 = graph.add_altNode(node["chrom"], node["start"], node["end"], tail, node["sequences"])

        # Snapshot the edge iterables too, since add_edge changes the graph.
        for e in list(graph.inEdges(node)):
            graph.add_edge(graph.nodes[e["from"]], n1, e["sequences"])
        for e in list(graph.outEdges(node)):
            graph.add_edge(n2, graph.nodes[e["to"]], e["sequences"])
        graph.del_node(node)
def add_graph(graph1: GraphContainer, graph2: GraphContainer):
    """
    Copy the content of graph2 into graph1 (graph1 is modified in place)

    Reference nodes are added first, then alternate nodes, then edges; finally
    the paths of graph2 are appended to the paths of graph1.

    :param graph1: graph receiving the nodes, edges and paths
    :param graph2: graph whose nodes, edges and paths are copied
    """
    ref_fields = ("chrom", "start", "end", "sequences")
    alt_fields = ("chrom", "start", "end", "sequence", "sequences")

    # Nodes go in before edges: the edge loop resolves its endpoints through
    # graph1.nodes using the "from" / "to" keys stored on graph2's edges.
    for ref_node in graph2.refNodes():
        graph1.add_refNode(*(ref_node[field] for field in ref_fields))
    for alt_node in graph2.altNodes():
        graph1.add_altNode(*(alt_node[field] for field in alt_fields))

    for link in graph2.edges.values():
        source = graph1.nodes[link["from"]]
        target = graph1.nodes[link["to"]]
        graph1.add_edge(source, target, link["sequences"])

    graph1.paths += graph2.paths
def get_graph(self, allele_graph=False):
    """ Create the paragraph representation of nodes and edges for this graph

    Steps, in order:
      1. add one reference node per reference allele and chain consecutive
         reference nodes with an (unlabelled) edge
      2. add one alternate node per entry in self.alts
      3. for every haplotype, link consecutive nodes of that haplotype that are
         directly adjacent, labelling the edge with the haplotype
      4. connect alternate nodes to the reference (and, in allele_graph mode,
         to every node starting right after them)
      5. for nodes lacking an in/out edge labelled with one of their
         haplotypes, label all of their in/out edges with that haplotype

    :param allele_graph: create edges between any compatible allele pair (rather
                         than just following reference and given haplotypes)
    :return GraphContainer object
    :raises Exception: if two consecutive nodes of a haplotype are inconsistent
                       (see the check in step 3)
    :raises AssertionError: if consecutive reference nodes do not abut, or a
                            node ends up without an in-edge for its haplotype
    """
    logging.info("Creating output graph")
    graph = GraphContainer()
    # create ref nodes
    # Node "end" is inclusive (consecutive nodes satisfy end + 1 == start below);
    # ref.end - 1 is passed, so ref.end is presumably exclusive -- TODO confirm.
    pnode = None
    for ref in self.get_ref_alleles():
        node = graph.add_refNode(self.chrom, ref.begin, ref.end - 1, ref.data.haplotypes)
        if pnode:
            # Reference nodes must tile the region without gaps or overlaps
            assert pnode["end"] + 1 == node["start"]
            graph.add_edge(pnode, node)
        pnode = node
    # Create alt nodes
    for alt in self.alts.values():
        graph.add_altNode(self.chrom, alt.start, alt.end, alt.sequence, alt.haplotypes)

    # Create edges connecting nodes along a haplotype (or allele in alleleGraph mode)
    for haplo in self.get_haplotypes():
        nodes = graph.nodes_by_haplo(haplo)
        logging.info(
            f"Linking nodes in sequence {haplo}:\t{', '.join(n['name'] for n in nodes)}"
        )
        pnode = None
        for node in nodes:
            if pnode:
                # Only directly adjacent nodes are linked here
                if pnode["end"] == node["start"] - 1:
                    graph.add_edge(pnode, node, [haplo])
                # NOTE(review): nesting reconstructed from a whitespace-collapsed
                # listing; confirm this check runs for every consecutive pair and
                # not only for adjacent ones.
                # A pair is accepted when pnode is a zero-length node with an
                # empty sequence, or when pnode both starts and ends before node
                # starts; anything else is reported as inconsistent.
                pnode_is_ref_dummy = pnode["end"] == pnode["start"] - 1 and not pnode["sequence"]
                pnode_ends_before_node = pnode["end"] < node["start"] and pnode["start"] < node["start"]
                if not pnode_is_ref_dummy and not pnode_ends_before_node:
                    raise Exception(
                        f"Inconsistent nodes for haplotype {haplo}: {pnode['name']}, {node['name']}"
                    )
            pnode = node

    # In alleleGraph mode link each alt node to all neighboring nodes
    # In haplotype mode link nodes without in/out edges to reference
    for node in graph.altNodes():
        # Incoming side: from the reference node ending right before this node
        if allele_graph or not any(graph.inEdges(node)):
            graph.add_edge(
                graph.refNode_ending_at[node["chrom"], node["start"] - 1], node)
        # Outgoing side: to the reference node starting right after this node
        if not any(graph.outEdges(node)):
            graph.add_edge(
                node, graph.refNode_starting_at[node["chrom"], node["end"] + 1])
        if allele_graph:
            # A node whose end lies before its start covers no reference bases
            isInsertion = node["end"] < node["start"]
            for n in graph.nodes_starting_at[node["end"] + 1]:
                # Don't loop by connecting multiple insertions at the same position
                if not (isInsertion and n["end"] < n["start"]):
                    graph.add_edge(node, n)

    # For nodes that do not have determined in/out edges for a given haplotype
    # label all in/out edges as compatible with that haplotype
    # excluding edges that connect to another allele at the same vcfVariant (e.g. insertions)
    for haplo in self.get_haplotypes():
        for node in graph.nodes_by_haplo(haplo):
            if not any(graph.inEdges(node, haplo)):
                # Re-adding an existing edge with [haplo] labels it with haplo
                # (presumably add_edge merges labels -- TODO confirm)
                for e in graph.inEdges(node):
                    graph.add_edge(graph.nodes[e["from"]], node, [haplo])
            # NOTE(review): assert placement reconstructed from a
            # whitespace-collapsed listing; confirm its nesting level.
            assert any(graph.inEdges(node, haplo))
            if not any(graph.outEdges(node, haplo)):
                for e in graph.outEdges(node):
                    graph.add_edge(node, graph.nodes[e["to"]], [haplo])
    return graph