def find_dangling_sinks(G, path_d, mermap):
    """
    Fold dead-end branch nodes ("dangling sinks") into a similar sibling node.

    Pattern being looked for:

        ... -> pred -> sink          (sink: exactly one incoming, no outgoing edge)
        ... -> pred -> n' -> ...     (n' is another successor of pred)

    If the sequence stored for <sink> is judged similar (by
    splice_align.node_is_similar) to the equally long prefix of the sequence
    stored for n', then <sink> is treated as a shortened copy of n':

      * every path in <path_d> that contains <sink> gets its last node swapped
        for n' (the path is asserted to end with <sink>),
      * <sink> is dropped from <mermap> and removed from <G>.

    Only the first matching sibling is used for each sink.

    :param G: directed graph; predecessors()/successors() are consumed as
              iterators here
    :param path_d: dict of key --> list of nodes (a path); modified in place
    :param mermap: dict of node --> sequence; modified in place
    """
    dangling = [node for node in G.nodes() if G.out_degree(node) == 0 and G.in_degree(node) == 1]

    for sink in dangling:
        parent = next(G.predecessors(sink))
        for sibling in G.successors(parent):
            if sibling == sink or sibling not in G:
                continue

            sink_seq = mermap[sink]
            if not splice_align.node_is_similar(sink_seq, mermap[sibling][:len(sink_seq)]):
                continue

            log.debug("tugging dangling sink: {0}->{1}(sink), {0}->{2}".format(parent, sink, sibling))

            # <sink> is a truncated version of <sibling>: reroute every path
            # that ends in <sink> so that it ends in <sibling> instead
            for key, path in path_d.items():
                if sink in path:
                    assert path[-1] == sink
                    path_d[key] = path[:-1] + [sibling]

            # nothing refers to <sink> any more, so it can go.
            # We must stop iterating right away: the successor iterator is
            # backed by the graph we just modified.
            del mermap[sink]
            G.remove_node(sink)
            break
def find_dangling_sinks(G, path_d, mermap):
    """
    For isoforms w/ 3' alt ends that have a longer last exon,
    it shows up as a *branch* from the path

    ex:  ... -> pred -> sink_node        (sink_node has only one incoming edge)
         ... -> pred -> n' -> ...other path...

    We "tuck" sink into n' when the sequence of sink is similar (according to
    splice_align.node_is_similar) to the equally long prefix of n'.

    When that is the case:
      * every path in <path_d> containing <sink> (it must be the last node of
        that path) has it replaced by n'
      * <sink> is deleted from <mermap> and removed from <G>

    Only the first matching n' is used for a given sink.

    Written so that it runs on both the list-returning (networkx 1.x) and the
    iterator-returning (networkx >= 2) graph API, on Python 2 and 3.

    :param G: directed graph (networkx DiGraph-like); modified in place
    :param path_d: dict of key --> list of nodes; modified in place
    :param mermap: dict of node --> sequence; modified in place
    """
    # The candidates must be a real list: nodes are removed from G inside the
    # loop, which would invalidate a lazy filter()/node iterator.
    cand_sinks = [n for n in G.nodes() if G.out_degree(n) == 0 and G.in_degree(n) == 1]

    for sink in cand_sinks:
        # in_degree == 1, so there is exactly one predecessor; next(iter(...))
        # works whether predecessors() returns a list or an iterator
        pred = next(iter(G.predecessors(sink)))
        # snapshot the successors since G is modified below
        for n in list(G.successors(pred)):
            if n == sink or n not in G:
                continue
            if splice_align.node_is_similar(mermap[sink], mermap[n][: len(mermap[sink])]):
                log.debug("tugging dangling sink: {0}->{1}(sink), {0}->{2}".format(pred, sink, n))
                # sink is just a shortened version of <n>
                # just update all paths with presence of <sink> to <n>
                # and safely remove <sink> from G
                for k in path_d:
                    if sink in path_d[k]:
                        assert path_d[k][-1] == sink
                        path_d[k] = path_d[k][:-1] + [n]
                del mermap[sink]
                G.remove_node(sink)
                break
def find_bubbles(G, path_d, mermap):
    """
    We find all cases where
         n' -> n1 -> n3
         n' -> n2 -> n3
    (that is, n3 has > 1 incoming) and n1, n2 each have only one incoming and one outgoing

    <i>  make sure that n1 and n2 is not used in the same path (which indicates in-gene repeat?)
    <ii> retrace n1, n2 to make sure that they are largely similar

    Also handles the case where one side of the bubble is a short path rather
    than a single node (see traceback_path / replace_path_w_node).

    Written so that it runs on both the list-returning (networkx 1.x) and the
    iterator-returning (networkx >= 2) graph API, on Python 2 and 3.

    :param G: directed graph (networkx DiGraph-like) whose edges carry a
              'weight' attribute; modified in place
    :param path_d: dict of key --> list of nodes; modified in place
    :param mermap: dict of node --> sequence; modified in place
    """

    def has_common_unique_pred(n1, n2):
        """
        Case:  pred -> n1 -> common succ
               pred -> n2 -> common succ

        True iff n1 and n2 each have exactly one predecessor and it is the same node.
        """
        preds1 = list(G.predecessors(n1))
        preds2 = list(G.predecessors(n2))
        return len(preds1) == 1 and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Find a common pred where
             pred -> n1
             pred -> some_node -> n2

        Returns whatever path_finder returns; callers treat None as "no path".
        """
        assert G.in_degree(n1) == 1
        pred = next(iter(G.predecessors(n1)))
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Replacing <n_to_del> with <n_to_replace_with>
        1. give <n_to_replace_with> an edge to every successor of <n_to_del>
           that it does not already point to; otherwise paths rewritten in
           step 3 could use an edge that no longer exists in G
        2. remove <n_to_del> from graph G (and mermap)
        3. replace the first occurrence of <n_to_del> in every path of path_d
        """
        for succ in list(G.successors(n_to_del)):
            if not G.has_edge(n_to_replace_with, succ):
                G.add_edge(n_to_replace_with, succ, weight=G.get_edge_data(n_to_del, succ)["weight"])
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i + 1:]

    def replace_path_w_node(path_to_del, n_to_replace_with, common_succ):
        """
        Collapse the node list <path_to_del> into the single node
        <n_to_replace_with>, in G, in path_d and in mermap.

        <common_succ> is accepted for the callers' sake but not used here.
        """
        # first, it's possible that the last node in <path_to_del> has other successors
        # ex: path_to_del = x1 -> x2 -> common_succ
        #     also has       x2 -> another node x3
        # so must change to n_to_replace_with -> x3
        last_n_in_path = path_to_del[-1]
        # snapshot the edges: G is modified inside the loop
        for _, t, data in list(G.out_edges(last_n_in_path, data=True)):
            if G.has_edge(n_to_replace_with, t):
                G[n_to_replace_with][t]["weight"] += data["weight"]
            else:
                G.add_edge(n_to_replace_with, t, weight=data["weight"])

        # for every predecessor of path_to_del, replace with n_to_replace_with
        # ex: pred -> x1 -> x2 -> ...
        #     becomes pred -> n_to_replace_with -> ...
        # NOTE(review): if pred -> n_to_replace_with already exists, add_edge
        # replaces its weight instead of summing (unlike the successor handling
        # above) -- confirm this is intended.
        for pred in list(G.predecessors(path_to_del[0])):
            G.add_edge(pred, n_to_replace_with, weight=G.get_edge_data(pred, path_to_del[0])["weight"])

        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                # the path may end before <path_to_del> is exhausted, so only
                # the overlapping part is compared
                m = min(i + path_len, len(path_d[k]))
                if path_d[k][i:m] == path_to_del[:(m - i)]:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i + path_len:]

        # now delete all non branching nodes in path_to_del
        # note: the selection must be fully computed BEFORE any removal because
        #       G.remove_node will dynamically change the degrees!
        nodes_in_path = set()
        for path in path_d.values():
            nodes_in_path.update(path)
        safe_to_remove = [x for x in path_to_del if G.out_degree(x) <= 1 and x not in nodes_in_path]
        for node in safe_to_remove:
            log.debug("safe to delete from G: {0}".format(node))
            G.remove_node(node)
            del mermap[node]

    in_same_path = make_in_same_path(path_d)

    # must be a real list: nodes get removed from G while we loop
    cands = [n for n in G.nodes() if G.in_degree(n) >= 2]
    for n in cands:
        if n not in G:
            continue  # deleted in loop below
        _pred = list(G.predecessors(n))
        if len(_pred) >= 2:
            for i, n1 in enumerate(_pred):
                if n1 not in G:
                    continue
                for n2 in _pred[i + 1:]:
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]:
                        continue
                    if has_common_unique_pred(n1, n2):
                        # what is known:  common pred -> n1 -> common succ
                        #                 common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        if DEBUG_FLAG:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            mermap[n1] = splice_align.get_consensus_through_voting(
                                mermap[n1],
                                G.get_edge_data(n1, n)["weight"],
                                mermap[n2],
                                G.get_edge_data(n2, n)["weight"],
                            )
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            flag, is_skipped = splice_align.node_is_skipping(
                                mermap[n1], mermap[n2], cc_settings.KMER_SIZE
                            )
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                    else:
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_voting(
                                        s1, G.get_edge_data(n1, n)["weight"], s2, G.get_edge_data(n2, n)["weight"]
                                    )
                                    replace_path_w_node(p2, n1, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        log.debug("path collapse possible {0},{1}".format(n1, p2))
                                        mermap[n1] = s1 if flag == "SEQ1" else s2
                                        replace_path_w_node(p2, n1, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        elif G.in_degree(n2) == 1:
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_voting(
                                        s1, G.get_edge_data(n1, n)["weight"], s2, G.get_edge_data(n2, n)["weight"]
                                    )
                                    log.debug("path collapse possible: {0},{1}".format(p1, n2))
                                    replace_path_w_node(p1, n2, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        mermap[n2] = s1 if flag == "SEQ1" else s2
                                        replace_path_w_node(p1, n2, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        else:
                            log.debug("should NOT collapse {0},{1}".format(n1, n2))
def find_source_bubbles(G, path_d, mermap):
    """
    Find all cases where
        src1 --> n3
        ...> path1 --> n3

    and that
    <i>  src1 and path1 each has only one outgoing edge to n3
    <ii> src1 and path1 are similar

    path1: can also be a source

    Similarity is tested on the reversed sequences truncated to the shorter
    length, i.e. on the suffixes leading into n3.

    Written so that it runs on both the list-returning (networkx 1.x) and the
    iterator-returning (networkx >= 2) graph API, on Python 2 and 3.

    :param G: directed graph (networkx DiGraph-like); modified in place
    :param path_d: dict of key --> list of nodes; modified in place
    :param mermap: dict of node --> sequence; modified in place
    """

    def traceback(cur):
        """
        Retrace path of n1 -> n2 ... -> cur
        where n1, n2....all have exactly one outgoing edge

        Walks backwards while the current node has exactly one predecessor and
        that predecessor has at most one outgoing edge. Returns the nodes in
        forward order, ending with the starting <cur>.
        """
        acc = []
        while True:
            acc.append(cur)
            preds = list(G.predecessors(cur))
            if len(preds) != 1 or G.out_degree(preds[0]) > 1:
                break
            cur = preds[0]
        return acc[::-1]

    def replace_node(n_to_del, path_to_replace_with):
        """
        Splice <path_to_replace_with> (a list of nodes) into every path in
        place of the first occurrence of <n_to_del>, then drop <n_to_del>
        from G and mermap.
        """
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + path_to_replace_with + path_d[k][i + 1:]
        G.remove_node(n_to_del)
        del mermap[n_to_del]

    in_same_path = make_in_same_path(path_d)

    # Must be a real list: it is iterated while nodes are removed from G, and
    # it is also used for membership tests (a lazy filter() would be consumed).
    sources = [n for n in G.nodes() if G.in_degree(n) == 0]
    source_set = frozenset(sources)  # O(1) membership; graph nodes are hashable

    for src1 in sources:
        if src1 not in G:
            continue  # deleted in the loop below
        succ = list(G.successors(src1))
        if len(succ) == 1:
            n3 = succ[0]
            # snapshot: predecessors of n3 may be removed inside the loop
            cands = list(G.predecessors(n3))
            for n in cands:
                if src1 not in G:
                    break  # deleted, jump out of this
                if n not in G:
                    continue  # deleted in the loop below
                if n != src1 and n not in in_same_path[src1] and (n in source_set or G.out_degree(n) == 1):
                    t = traceback(n)
                    seq1 = mermap[src1]
                    seq2 = stitch_string_from_path(t, mermap)
                    minlen = min(len(seq1), len(seq2))
                    # we know that seq1 and seq2 both have the same successor so they must share the same last (KMER_SIZE-1) suffix
                    if DEBUG_FLAG:
                        pdb.set_trace()
                    if splice_align.node_is_similar(seq1[::-1][:minlen], seq2[::-1][:minlen]):
                        # should collapse src1 into n
                        # to do so:
                        # (1) if both are sources, replace the shorter src with the longer src
                        # (2) if one is src other is path, replace the src with path
                        # -- delete the replaced node from G
                        # -- for all path in path_d that uses the deleted node, update with replacement
                        log.debug("should collapse {0},{1}".format(src1, t))
                        if len(t) == 1 and t[0] in source_set:
                            # both are sources
                            if len(mermap[t[0]]) > len(mermap[src1]):
                                replace_node(n_to_del=src1, path_to_replace_with=t)
                            else:
                                # src1 is longer, use src1
                                replace_node(t[0], [src1])
                        else:
                            # src1 is a source, <t> is not a source node but a path
                            if len(seq2) > len(seq1):
                                # let's just "tuck" src1 into <t>
                                replace_node(src1, t)
                            else:
                                # we don't know if nodes in <t> branch out so we can't collapse them
                                log.debug("should NOT collapse {0},{1}".format(src1, t))
                    else:
                        # src1 and <t> are not similar enough, DO NOT collapse
                        log.debug("should NOT collapse {0},{1}".format(src1, t))
def find_bubbles(G, path_d, mermap):
    """
    We find all cases where
         n' -> n1 -> n3
         n' -> n2 -> n3
    (that is, n3 has > 1 incoming) and n1, n2 each have only one incoming and one outgoing

    <i>  make sure that n1 and n2 is not used in the same path (which indicates in-gene repeat?)
    <ii> retrace n1, n2 to make sure that they are largely similar

    One side of the bubble may also be a short path instead of a single node
    (see traceback_path / replace_path_w_node).

    Written so that it runs on both the list-returning (networkx 1.x) and the
    iterator-returning (networkx >= 2) graph API, on Python 2 and 3.

    :param G: directed graph (networkx DiGraph-like) whose edges carry a
              'weight' attribute; modified in place
    :param path_d: dict of key --> list of nodes; modified in place
    :param mermap: dict of node --> sequence; modified in place
    """
    def has_common_unique_pred(n1, n2):
        """
        Case:  pred -> n1 -> common succ
               pred -> n2 -> common succ

        True iff n1 and n2 each have exactly one predecessor and it is the same node.
        """
        preds1 = list(G.predecessors(n1))
        preds2 = list(G.predecessors(n2))
        return len(preds1) == 1 and len(preds2) == 1 and preds1[0] == preds2[0]

    def traceback_path(n1, n2):
        """
        Find a common pred where
             pred -> n1
             pred -> some_node -> n2

        Returns whatever path_finder returns; callers treat None as "no path".
        """
        assert G.in_degree(n1) == 1
        pred = next(iter(G.predecessors(n1)))
        return path_finder(G, n2, pred, [n2], 2)

    def replace_node(n_to_del, n_to_replace_with):
        """
        Replacing <n_to_del> with <n_to_replace_with>
        1. add successors of <n_to_del> to successor of <n_to_replace_with>
           ex: n' -> n1 -> n3
               n' -> n2 -> n3
               n' -> n2 -> n4
           when n2 is deleted in favour of n1, add n1 -> n4 if not already exists
        2. remove <n_to_del> from graph G
        3. replace the first occurrence of <n_to_del> in every path of path_d
        """
        # snapshot the successors: G is modified inside the loop
        for succ in list(G.successors(n_to_del)):
            if not G.has_edge(n_to_replace_with, succ):
                G.add_edge(n_to_replace_with, succ, weight=G.get_edge_data(n_to_del, succ)['weight'])
        G.remove_node(n_to_del)
        del mermap[n_to_del]
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+1:]

    def replace_path_w_node(path_to_del, n_to_replace_with, common_succ):
        """
        Collapse the node list <path_to_del> into the single node
        <n_to_replace_with>, in G, in path_d and in mermap.

        <common_succ> is accepted for the callers' sake but not used here.
        """
        # first, it's possible that the last node in <path_to_del> has other successors
        # ex: path_to_del = x1 -> x2 -> common_succ
        #     also has       x2 -> another node x3
        # so must change to n_to_replace_with -> x3
        last_n_in_path = path_to_del[-1]
        # snapshot the edges: G is modified inside the loop
        for _, t, data in list(G.out_edges(last_n_in_path, data=True)):
            if G.has_edge(n_to_replace_with, t):
                G[n_to_replace_with][t]['weight'] += data['weight']
            else:
                G.add_edge(n_to_replace_with, t, weight=data['weight'])

        # for every predecessor of path_to_del, replace with n_to_replace_with
        # ex: pred -> x1 -> x2 -> ...
        #     becomes pred -> n_to_replace_with -> ...
        # NOTE(review): if pred -> n_to_replace_with already exists, add_edge
        # replaces its weight instead of summing (unlike the successor handling
        # above) -- confirm this is intended.
        for pred in list(G.predecessors(path_to_del[0])):
            G.add_edge(pred, n_to_replace_with, weight=G.get_edge_data(pred, path_to_del[0])['weight'])

        path_len = len(path_to_del)
        for k in path_d:
            if path_to_del[0] in path_d[k]:
                i = path_d[k].index(path_to_del[0])
                # the path may end before <path_to_del> is exhausted, so only
                # the overlapping part is compared
                m = min(i+path_len, len(path_d[k]))
                if path_d[k][i:m] == path_to_del[:(m-i)]:
                    path_d[k] = path_d[k][:i] + [n_to_replace_with] + path_d[k][i+path_len:]

        # now delete all non branching nodes in path_to_del
        # note: the selection must be fully computed BEFORE any removal because
        #       G.remove_node will dynamically change the degrees!
        nodes_in_path = set()
        for path in path_d.values():
            nodes_in_path.update(path)
        safe_to_remove = [x for x in path_to_del if G.out_degree(x) <= 1 and x not in nodes_in_path]
        for node in safe_to_remove:
            log.debug("safe to delete from G: {0}".format(node))
            G.remove_node(node)
            del mermap[node]

    in_same_path = make_in_same_path(path_d)

    # must be a real list: nodes get removed from G while we loop
    cands = [n for n in G.nodes() if G.in_degree(n) >= 2]
    for n in cands:
        if n not in G:
            continue  # deleted in loop below
        _pred = list(G.predecessors(n))
        if len(_pred) >= 2:
            for i, n1 in enumerate(_pred):
                if n1 not in G:
                    continue
                for n2 in _pred[i+1:]:
                    if n1 not in G or n2 not in G or n1 in in_same_path[n2]:
                        continue
                    if has_common_unique_pred(n1, n2):
                        # what is known:  common pred -> n1 -> common succ
                        #                 common pred -> n2 -> common succ
                        # so they must share the same first (KMER_SIZE-1) and the last (KMER_SIZE-1)
                        if DEBUG_FLAG:
                            pdb.set_trace()
                        if splice_align.node_is_similar(mermap[n1], mermap[n2]):
                            mermap[n1] = splice_align.get_consensus_through_voting(
                                mermap[n1],
                                G.get_edge_data(n1, n)['weight'],
                                mermap[n2],
                                G.get_edge_data(n2, n)['weight'])
                            replace_node(n_to_del=n2, n_to_replace_with=n1)
                        else:
                            flag, is_skipped = splice_align.node_is_skipping(mermap[n1], mermap[n2], cc_settings.KMER_SIZE)
                            if is_skipped:
                                if flag == "SEQ1":  # seq1 is the one with retained exon
                                    replace_node(n_to_del=n2, n_to_replace_with=n1)
                                else:
                                    replace_node(n_to_del=n1, n_to_replace_with=n2)
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                    else:
                        if G.in_degree(n1) == 1:
                            p2 = traceback_path(n1, n2)
                            if p2 is not None:
                                # common pred -> n1 -> common succ
                                # common pred -> another node -> n2 -> common succ
                                s1 = mermap[n1]
                                s2 = stitch_string_from_path(p2, mermap)
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n1] = splice_align.get_consensus_through_voting(
                                        s1,
                                        G.get_edge_data(n1, n)['weight'],
                                        s2,
                                        G.get_edge_data(n2, n)['weight'])
                                    replace_path_w_node(p2, n1, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        log.debug("path collapse possible {0},{1}".format(n1, p2))
                                        mermap[n1] = s1 if flag == 'SEQ1' else s2
                                        replace_path_w_node(p2, n1, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        elif G.in_degree(n2) == 1:
                            p1 = traceback_path(n2, n1)
                            if p1 is not None:
                                s1 = stitch_string_from_path(p1, mermap)
                                s2 = mermap[n2]
                                if DEBUG_FLAG:
                                    pdb.set_trace()
                                if splice_align.node_is_similar(s1, s2):
                                    mermap[n2] = splice_align.get_consensus_through_voting(
                                        s1,
                                        G.get_edge_data(n1, n)['weight'],
                                        s2,
                                        G.get_edge_data(n2, n)['weight'])
                                    log.debug("path collapse possible: {0},{1}".format(p1, n2))
                                    replace_path_w_node(p1, n2, common_succ=n)
                                else:
                                    flag, is_skipped = splice_align.node_is_skipping(s1, s2, cc_settings.KMER_SIZE)
                                    if is_skipped:
                                        mermap[n2] = s1 if flag == 'SEQ1' else s2
                                        replace_path_w_node(p1, n2, common_succ=n)
                                    else:
                                        log.debug("should NOT collapse {0},{1}".format(n1, n2))
                            else:
                                log.debug("should NOT collapse {0},{1}".format(n1, n2))
                        else:
                            log.debug("should NOT collapse {0},{1}".format(n1, n2))
def find_source_bubbles(G, path_d, mermap):
    """
    Find all cases where
        src1 --> n3
        ...> path1 --> n3

    and that
    <i>  src1 and path1 each has only one outgoing edge to n3
    <ii> src1 and path1 are similar

    path1: can also be a source

    Similarity is tested on the reversed sequences truncated to the shorter
    length, i.e. on the suffixes leading into n3.

    Written so that it runs on both the list-returning (networkx 1.x) and the
    iterator-returning (networkx >= 2) graph API, on Python 2 and 3.

    :param G: directed graph (networkx DiGraph-like); modified in place
    :param path_d: dict of key --> list of nodes; modified in place
    :param mermap: dict of node --> sequence; modified in place
    """
    def traceback(cur):
        """
        Retrace path of n1 -> n2 ... -> cur
        where n1, n2....all have exactly one outgoing edge

        Walks backwards while the current node has exactly one predecessor and
        that predecessor has at most one outgoing edge. Returns the nodes in
        forward order, ending with the starting <cur>.
        """
        acc = []
        while True:
            acc.append(cur)
            preds = list(G.predecessors(cur))
            if len(preds) != 1 or G.out_degree(preds[0]) > 1:
                break
            cur = preds[0]
        return acc[::-1]

    def replace_node(n_to_del, path_to_replace_with):
        """
        Splice <path_to_replace_with> (a list of nodes) into every path in
        place of the first occurrence of <n_to_del>, then drop <n_to_del>
        from G and mermap.
        """
        for k in path_d:
            if n_to_del in path_d[k]:
                i = path_d[k].index(n_to_del)
                path_d[k] = path_d[k][:i] + path_to_replace_with + path_d[k][i+1:]
        G.remove_node(n_to_del)
        del mermap[n_to_del]

    in_same_path = make_in_same_path(path_d)

    # Must be a real list: it is iterated while nodes are removed from G, and
    # it is also used for membership tests (a lazy filter() would be consumed).
    sources = [n for n in G.nodes() if G.in_degree(n) == 0]
    source_set = frozenset(sources)  # O(1) membership; graph nodes are hashable

    for src1 in sources:
        if src1 not in G:
            continue  # deleted in the loop below
        succ = list(G.successors(src1))
        if len(succ) == 1:
            n3 = succ[0]
            # snapshot: predecessors of n3 may be removed inside the loop
            cands = list(G.predecessors(n3))
            for n in cands:
                if src1 not in G:
                    break  # deleted, jump out of this
                if n not in G:
                    continue  # deleted in the loop below
                if n != src1 and n not in in_same_path[src1] and (n in source_set or G.out_degree(n) == 1):
                    t = traceback(n)
                    seq1 = mermap[src1]
                    seq2 = stitch_string_from_path(t, mermap)
                    minlen = min(len(seq1), len(seq2))
                    # we know that seq1 and seq2 both have the same successor so they must share the same last (KMER_SIZE-1) suffix
                    if DEBUG_FLAG:
                        pdb.set_trace()
                    if splice_align.node_is_similar(seq1[::-1][:minlen], seq2[::-1][:minlen]):
                        # should collapse src1 into n
                        # to do so:
                        # (1) if both are sources, replace the shorter src with the longer src
                        # (2) if one is src other is path, replace the src with path
                        # -- delete the replaced node from G
                        # -- for all path in path_d that uses the deleted node, update with replacement
                        log.debug("should collapse {0},{1}".format(src1, t))
                        if len(t) == 1 and t[0] in source_set:
                            # both are sources
                            if len(mermap[t[0]]) > len(mermap[src1]):
                                replace_node(n_to_del=src1, path_to_replace_with=t)
                            else:
                                # src1 is longer, use src1
                                replace_node(t[0], [src1])
                        else:
                            # src1 is a source, <t> is not a source node but a path
                            if len(seq2) > len(seq1):
                                # let's just "tuck" src1 into <t>
                                replace_node(src1, t)
                            else:
                                # we don't know if nodes in <t> branch out so we can't collapse them
                                log.debug("should NOT collapse {0},{1}".format(src1, t))
                    else:
                        # src1 and <t> are not similar enough, DO NOT collapse
                        log.debug("should NOT collapse {0},{1}".format(src1, t))