def combine_nodes(graph: GraphContainer):
    """
    Merge pairs of directly adjacent nodes that carry the same sequence labels

    A pair (first -> second) is merged when
      * first has exactly one out-edge and second has exactly one in-edge,
      * both are on the same chrom and second starts right after first ends,
      * both have equal "sequences" labels, and
      * both are reference nodes or both are alternate nodes.
    The merged node inherits the in-edges of first and the out-edges of
    second; the two original nodes are deleted afterwards.

    Nodes are visited from a snapshot taken before the first merge, so nodes
    created by a merge are not revisited by the same call.

    :param graph: graph to work on (modified in place)
    """
    for first in list(graph.nodes.values()):
        successors = list(graph.outEdges(first))
        if len(successors) != 1:
            continue
        # Pair of nodes with no other in/out edges
        second = graph.nodes[successors[0]["to"]]
        if len(list(graph.inEdges(second))) != 1:
            continue

        contiguous = (first["chrom"] == second["chrom"]
                      and first["end"] + 1 == second["start"])
        if not contiguous:
            continue  # nodes must be adjacent

        labels = first["sequences"]
        if second["sequences"] != labels:
            continue  # only collapse nodes with same haplotypes

        first_is_ref = "reference" in first
        if first_is_ref != ("reference" in second):
            continue  # nodes must be of same type

        if first_is_ref:
            merged = graph.add_refNode(
                first["chrom"], first["start"], second["end"], labels)
        else:
            merged = graph.add_altNode(
                first["chrom"], first["start"], second["end"],
                first["sequence"] + second["sequence"], labels)

        logging.info("Combinding %s and %s", first['name'], second['name'])

        # Re-attach the outer edges of the pair to the merged node
        for edge in list(graph.inEdges(first)):
            graph.add_edge(graph.nodes[edge["from"]], merged, edge["sequences"])
        for edge in list(graph.outEdges(second)):
            graph.add_edge(merged, graph.nodes[edge["to"]], edge["sequences"])

        graph.del_node(first)
        graph.del_node(second)
def split_alt_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Split long alternate nodes

    Every ALT node whose sequence is longer than max_len is replaced by two
    ALT nodes holding the first and the last padding_len bases of that
    sequence. Both replacements keep the chrom / start / end and the sequence
    labels of the original node. In-edges of the original are moved to the
    first replacement, out-edges to the second one; no edge is added between
    the two replacements. The original node is deleted.

    :param graph: graph to work on (modified in place)
    :param max_len: max length of an ALT node sequence that is left unsplit
    :param padding_len: number of bases to keep at each end of a split node
    """
    assert max_len >= 2 * padding_len
    for node in list(graph.altNodes()):
        sequence = node["sequence"]
        if len(sequence) <= max_len:
            continue
        logging.info("Splitting long ALT node: %s", node['name'])

        head = sequence[:padding_len]
        # sequence[-0:] would be the whole sequence, not an empty tail, so
        # padding_len == 0 has to be handled explicitly.
        tail = sequence[-padding_len:] if padding_len else sequence[:0]

        n1 = graph.add_altNode(node["chrom"], node["start"], node["end"],
                               head, node["sequences"])
        n2 = graph.add_altNode(node["chrom"], node["start"], node["end"],
                               tail, node["sequences"])

        # Snapshot the edges: add_edge modifies the graph while we loop
        for e in list(graph.inEdges(node)):
            graph.add_edge(graph.nodes[e["from"]], n1, e["sequences"])
        for e in list(graph.outEdges(node)):
            graph.add_edge(n2, graph.nodes[e["to"]], e["sequences"])
        graph.del_node(node)
def split_node(graph: GraphContainer, node, breakpoints):
    """
    Split a node at a set of breakpoints and link new (sub-)nodes
    Used to link to new variant nodes later
    Modifies graph and deletes node after splitting

    Breakpoints are offsets relative to node["start"]; duplicates are ignored
    and the order does not matter. For breakpoints b1 < b2 < ... < bk the node
    is cut into k + 1 consecutive pieces [0, b1), [b1, b2), ..., [bk, end].
    A breakpoint of 0 or of the full node length therefore yields a piece
    with start > end (and an empty sequence for alternate nodes).

    The first piece takes over the in-edges of the node, the last piece its
    out-edges, and consecutive pieces are linked by edges without sequence
    labels. The original node is deleted unless one of the pieces carries
    the same name.

    :param graph: graph to work on (modified in place)
    :param node: node to split
    :param breakpoints: offsets (0 .. node length) at which to cut
    :returns Created sub-nodes as a list. NOTE: when breakpoints is empty the
             unchanged node itself is returned (not wrapped in a list).
    """
    if not breakpoints:
        return node
    breakpoints = sorted(set(breakpoints))
    logging.debug("Splitting %s at %s", node['name'], breakpoints)

    is_ref = "reference" in node
    node_len = node["end"] - node["start"] + 1

    def add_piece(first_offset, last_offset, end_pos):
        # Create the sub-node covering offsets [first_offset, last_offset)
        start_pos = node["start"] + first_offset
        if is_ref:
            return graph.add_refNode(node["chrom"], start_pos, end_pos,
                                     node["sequences"])
        seq = node["sequence"][first_offset:last_offset]
        return graph.add_altNode(node["chrom"], start_pos, end_pos, seq,
                                 node["sequences"])

    nodes = []
    lEnd = 0
    for p in breakpoints:
        assert 0 <= p <= node_len
        nodes.append(add_piece(lEnd, p, node["start"] + p - 1))
        lEnd = p
    # Add last node (everything after the last breakpoint)
    nodes.append(add_piece(lEnd, None, node["end"]))

    # Connect nodes. Snapshot the edge lists first: add_edge modifies the
    # graph, and the edge iterators must not be consumed while that happens.
    for e in list(graph.inEdges(node)):
        graph.add_edge(graph.nodes[e["from"]], nodes[0], e["sequences"])
    for e in list(graph.outEdges(node)):
        graph.add_edge(nodes[-1], graph.nodes[e["to"]], e["sequences"])
    for (n1, n2) in zip(nodes[:-1], nodes[1:]):
        graph.add_edge(n1, n2)

    # Delete original node, unless identical to new node (no split)
    if all(n['name'] != node['name'] for n in nodes):
        graph.del_node(node)
    return nodes
def split_ref_nodes(graph: GraphContainer, max_len=300, padding_len=150):
    """
    Cut the middle out of long reference nodes

    Each reference node spanning more than max_len positions is replaced by
    two reference nodes covering its first and its last padding_len
    positions. The in-edges of the long node go to the leading piece, its
    out-edges to the trailing piece; the two pieces are not linked to each
    other. The long node is deleted.

    :param graph: graph to work on (modified in place)
    :param max_len: longest reference node span that is kept as is
    :param padding_len: number of positions kept at each end of a split node
    """
    assert max_len >= 2 * padding_len
    for long_ref in list(graph.refNodes()):
        span = long_ref["end"] - long_ref["start"] + 1
        if span <= max_len:
            continue
        logging.info("Splitting long REF node: %s", long_ref['name'])

        # Leading piece: the first padding_len positions
        head = graph.add_refNode(
            long_ref["chrom"],
            long_ref["start"],
            long_ref["start"] + padding_len - 1,
            long_ref["sequences"])
        # Trailing piece: the last padding_len positions
        tail = graph.add_refNode(
            long_ref["chrom"],
            long_ref["end"] - padding_len + 1,
            long_ref["end"],
            long_ref["sequences"])

        for incoming in list(graph.inEdges(long_ref)):
            source = graph.nodes[incoming["from"]]
            graph.add_edge(source, head, incoming["sequences"])
        for outgoing in list(graph.outEdges(long_ref)):
            target = graph.nodes[outgoing["to"]]
            graph.add_edge(tail, target, outgoing["sequences"])

        graph.del_node(long_ref)
def remove_empty_nodes(graph: GraphContainer):
    """
    Drop nodes that carry no sequence
    (from deletions / skipped insertions or split ref nodes)

    A node is kept when it is a reference node with start <= end, or when it
    has a non-empty "sequence". Every other node is deleted, and each of its
    (in-edge, out-edge) pairs is replaced by one direct edge so that the
    neighbours stay connected.

    :param graph: graph to work on (modified in place)
    """
    for candidate in list(graph.nodes.values()):
        covers_reference = ("reference" in candidate
                            and candidate["start"] <= candidate["end"])
        if covers_reference or candidate.get("sequence", "") != "":
            continue
        logging.info("Removing empty node %s", candidate['name'])

        incoming_edges = list(graph.inEdges(candidate))
        # All labels seen on any in-edge / any out-edge of the empty node
        labels_in = {s for e in incoming_edges for s in e["sequences"]}
        labels_out = {s for e in graph.outEdges(candidate)
                      for s in e["sequences"]}

        for incoming in incoming_edges:
            for outgoing in list(graph.outEdges(candidate)):
                # A label goes onto the bypass edge if it is present on both
                # merged edges, or if it is present on one of them and not
                # determined on the other side (absent from every edge there)
                on_both = incoming["sequences"].intersection(
                    outgoing["sequences"])
                in_only = incoming["sequences"].difference(labels_out)
                out_only = outgoing["sequences"].difference(labels_in)
                bypass_labels = on_both.union(in_only.union(out_only))
                graph.add_edge(graph.nodes[incoming["from"]],
                               graph.nodes[outgoing["to"]],
                               bypass_labels)

        graph.del_node(candidate)