def generate_test_graph(original_graph, state_type_node_count, chosen_state, state_wise_node_list, no_of_types):
    """Sample seed nodes per (state, type) and build a test subgraph.

    For every chosen state ``i`` and type index ``j`` the function keeps
    drawing random nodes from ``state_wise_node_list[chosen_state[i]]`` until
    ``state_type_node_count[i, j]`` reaches zero.  Drawn nodes are removed from
    the per-state list (the argument is mutated).  The selected nodes are
    handed to ``expand_graph`` and the subgraph induced by
    ``test_graph_nodes`` is then taken from ``original_graph``.

    Returns a 3-tuple ``(test_graph, node_mapping, test_graph_node_state_assgn)``
    where ``node_mapping`` maps position -> node and the last element lists
    ``node_state`` of each node in the same order.

    NOTE(review): the source had lost its indentation; the nesting below
    (in particular where ``expand_graph`` is called) is a reconstruction and
    should be confirmed against the original file.
    """
    global available_type, node_latitude, node_longitude, node_state, node_type
    test_graph_nodes = list()
    node_queue = list()
    print state_type_node_count
    for i in range(len(chosen_state)):
        for j in range(no_of_types):
            print 'Running for state', chosen_state[i]
            print state_type_node_count[i, j]
            while not state_type_node_count[i, j] == 0:
                # NOTE(review): random.randint(1, 0) raises ValueError once the
                # per-state list has been exhausted.
                node_index = random.randint(1, len(state_wise_node_list[chosen_state[i]]))
                node = state_wise_node_list[chosen_state[i]][node_index-1]
                # The drawn node is consumed whether or not it is accepted below.
                state_wise_node_list[chosen_state[i]].remove(node)
                if not node_type[node] in available_type[:no_of_types] or node in node_queue or node in test_graph_nodes:
                    continue
                node_queue.append(node)
                state_type_node_count[i, j] -= 1
                # NOTE(review): placement inside the while loop is assumed
                # (test_graph_nodes is only filled by expand_graph, and it is
                # consulted during selection) -- confirm.
                expand_graph(node_queue, test_graph_nodes, original_graph, chosen_state, no_of_types, state_type_node_count)
    # The empty DiGraph is immediately replaced by the induced subgraph.
    test_graph = networkx.DiGraph()
    test_graph = original_graph.subgraph(test_graph_nodes)
    components = networkx.weakly_connected_component_subgraphs(test_graph)
    i = 1
    print 'Components Before:'
    print '******************'
    for component in components:
        print 'Component: ' + str(i) + '- ' + str(networkx.number_of_nodes(component))
        i += 1
    # Check connectivity
    # NOTE(review): i starts at 1 and is incremented once per component, so
    # this is already true with a single component; "i > 2" may be intended.
    if i > 1:
        resolve_connectivity_issue(test_graph)
        components = networkx.weakly_connected_component_subgraphs(test_graph)
        i = 1
        print 'Components After:'
        print '******************'
        for component in components:
            print 'Component: ' + str(i) + '- ' + str(networkx.number_of_nodes(component))
            i += 1
    node_mapping = dict()
    test_graph_node_state_assgn = list()
    for i in range(len(test_graph_nodes)):
        node_mapping[i] = test_graph_nodes[i]
        test_graph_node_state_assgn.append(node_state[test_graph_nodes[i]])
    print 'No of nodes: ', len(test_graph_nodes)
    return test_graph, node_mapping, test_graph_node_state_assgn
def decompose(paths, args):
    """Run the decomposition of the bundle graph and write the results.

    Parameters
    ----------
    paths.bundle_file : file
        Pickled bundle graph; read at start and rewritten at the end.
    paths.tmp1_file : file
    paths.tmp2_file : file
        Scratch files handed to ``decomp0``; deleted afterwards if present.
    paths.decomp_file : file
        Destination for the pickled decomposition.
    args.msize : integer
        Passed to ``decomp0``; when ``None`` the decomposition runs once.
    """
    # load the bundle graph.
    logging.info("loading info")
    BG = nx.read_gpickle(paths.bundle_file)

    # run decomposition until satisfied.
    BG.graph['redo'] = False
    while True:
        # decomposition.
        DC = decomp0(BG, paths.tmp1_file, paths.tmp2_file, msize=args.msize)

        # check if only once.  The flag is compared by equality, exactly as
        # before, so non-bool values keep their previous meaning.
        redo = BG.graph['redo']
        if args.msize is None or redo == False:
            break
        if redo == True:
            BG.graph['redo'] = False

    # remove temp files (best effort, like the former "rm -f").
    # NOTE(review): assumed to run once after the loop -- confirm.
    for tmp_path in (paths.tmp1_file, paths.tmp2_file):
        if os.path.isfile(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    # compact decomposition.
    _compact_outter(DC)
    for subcc in nx.weakly_connected_component_subgraphs(DC):
        # call recursive compaction.
        # NOTE(review): the whole DC (not subcc) is passed, as in the original.
        _compact_inner(DC)

    # verify decomposition.
    for subcc in nx.weakly_connected_component_subgraphs(DC):
        # check its consistency.
        _validate_comp(subcc)

    # write to disk.
    nx.write_gpickle(DC, paths.decomp_file)
    nx.write_gpickle(BG, paths.bundle_file)
def __cut(graph):
    '''Split a directed graph in two along a minimum edge cut.

    The cut is computed on the undirected view of ``graph``; the matching
    directed edges are then removed from ``graph`` IN PLACE.

    param:
        graph: a nx.DiGraph obj with more than one node
    return:
        rcs : list of the directed edges that were removed
        g1 , g2 : the two weakly connected subgraphs left after the removal
    raises:
        AssertionError if ``graph`` is not a DiGraph, has fewer than two
        nodes, or does not fall apart into exactly two components.
        Exception if the minimum edge cut is empty.
    '''
    assert isinstance(graph, nx.DiGraph), "graph class: %s " % graph.__class__
    assert graph.number_of_nodes() > 1, "Number of nodes: %d" % graph.number_of_nodes()
    unigraph = nx.Graph(graph)
    cs = nx.minimum_edge_cut(unigraph)
    if not cs:
        raise Exception("Cut Set of this graph is Empty")
    # An edge of the undirected cut set may exist in the directed graph in
    # the opposite orientation (or in both).  Collect every real directed
    # edge; removing only one of two antiparallel edges would leave the
    # graph connected.
    rcs = []
    for u, v in cs:
        if graph.has_edge(u, v):
            rcs.append((u, v))
        if u != v and graph.has_edge(v, u):
            rcs.append((v, u))
    graph.remove_edges_from(rcs)
    glist = list(nx.weakly_connected_component_subgraphs(graph, copy=False))
    assert len(glist) == 2
    return rcs, glist[0], glist[1]
def longestPath(g, dict_seq):
    """Return one longest weighted path per weakly connected component of ``g``.

    ``dict_seq`` maps a node to its sequence; a path's length is the sum of
    the sequence lengths minus the ``'weight'`` of every traversed edge.
    Trivial cases: an empty graph gives ``[]``, a single-node graph (or
    component) contributes its node list, and a linear graph is delegated to
    ``get_path_linear_graph``.
    """
    node_count = g.number_of_nodes()
    if node_count == 0:
        return []
    if node_count == 1:
        return [g.nodes()]
    if is_linear_graph(g)[0]:
        return [get_path_linear_graph(g)]

    result = []
    for comp in nx.weakly_connected_component_subgraphs(g):
        if comp.number_of_nodes() == 1:
            result.append(comp.nodes())
            continue

        # best[n] = (length of the best path ending at n, predecessor on it)
        best = {}
        for n in nx.topological_sort(comp):
            candidates = [
                (best[p][0] + len(dict_seq[n]) - g[p][n]['weight'], p)
                for p in comp.pred[n]
            ]
            if not candidates:
                best[n] = (len(dict_seq[n]), n)
            else:
                best[n] = max(candidates)

        current, (remaining, _) = max(best.items(), key=lambda item: item[1])
        backtrack = []
        while remaining > len(dict_seq[current]):
            backtrack.append(current)
            remaining, current = best[current]
        backtrack.reverse()
        result.append(backtrack)
    return result
def _remove_edges_if_present(g, edges):
    """Remove each (u, v, data) edge from ``g`` if it is still there."""
    for e in edges:
        if g.has_edge(e[0], e[1]):
            g.remove_edge(e[0], e[1])


def simplify_graph(g):
    """Simplify ``g`` in place.

    Steps, in order: drop self-loops; at every node whose incident edges are
    tagged both '+' and non-'+' (edge attribute keyed by the node itself),
    remove the minority side (both sides when tied); remove tips; finally
    call ``adjust_edge_di`` on every linear component of more than two nodes
    unless it has exactly one source and one sink.
    """
    # Snapshot the self-loops first so the graph is not mutated mid-iteration.
    for e in list(g.selfloop_edges()):
        g.remove_edge(e[0], e[1])

    for node in g.nodes():
        # list() so this also works when in_edges returns a view, not a list.
        edges = list(g.in_edges(node, data=True))
        edges.extend(g.out_edges(node, data=True))
        plus = [e for e in edges if e[2][node] == '+']
        minus = [e for e in edges if e[2][node] != '+']
        if not plus or not minus:
            continue
        # Not exclusive on purpose: with a tie both sides are removed.
        if len(plus) >= len(minus):
            _remove_edges_if_present(g, minus)
        if len(plus) <= len(minus):
            _remove_edges_if_present(g, plus)

    # NOTE(review): assumed to run once after the node loop -- confirm.
    remove_out_tips(g)
    remove_in_tips(g)

    for c in nx.weakly_connected_component_subgraphs(g):
        if c.number_of_nodes() <= 2:
            continue
        isLinear, ends, source, sink = is_linear_graph(c)
        if isLinear:
            if sink == 1 and source == 1:
                continue
            adjust_edge_di(g, c, ends[0], ends[1])
def layer_layout(g, level_attribute = "t"):
    '''Lay out a directed graph by layer, one weakly connected component at a time

    g - a NetworkX directed graph whose nodes carry their layer in the
        attribute named by level_attribute. Per the original notes the graph
        must be acyclic, which TrackObjects guarantees because edges always
        go forward in time.

    level_attribute - the key in the node attribute dictionary that holds
        the level of the node (default "t")

    Each component is passed to layer_layout_subgraph together with the
    running vertical offset; the value it returns becomes the offset for the
    next component. Per the original notes, on exit each node has a "y"
    attribute usable for vertical placement, with "t" usable horizontally.
    Nothing is returned.

    The algorithm is a partial implementation of

    Sugiyama, Kozo, Tagawa, Shojiro; Toda, Mitsuhiko (1981), "Methods for
    visual understanding of hierarchical system structures", IEEE
    Transactions on Systems, Man, and Cybernetics SMC-11 (2):109-125,
    doi:10.1109/TSMC.1981.4308636

    as described by sydney.edu.au/engineering/it/~visual/comp4048/slides03.ppt
    '''
    offset = 0
    for component in nx.weakly_connected_component_subgraphs(g):
        offset = layer_layout_subgraph(g, component, offset, level_attribute)
def prune_transcript_graph(G, strand, transcript_map, min_trim_length=0, trim_utr_fraction=0.0, trim_intron_fraction=0.0):
    '''
    Trim and collapse G, then yield one result per weakly connected component.

    trim_utr_fraction: float specifying the fraction of the average UTR
    coverage below which the ends of the UTR will be trimmed

    trim_intron_fraction: float specifying the fraction of the average
    intron coverage below which intronic nodes will be removed

    G is modified in place (trimmed nodes are removed). Each yielded item is
    a tuple (Gsub, strand, path_scores) where path_scores is the items() of
    a mapping from node-tuple to summed transcript score.
    '''
    # trim utrs and intron retentions
    G.remove_nodes_from(
        trim_graph(G, strand, min_trim_length, trim_utr_fraction, trim_intron_fraction))
    # collapse consecutive nodes in graph
    collapsed = collapse_strand_specific_graph(G, transcript_map, introns=True)
    # each weakly connected component is treated as an independent gene
    for Gsub in nx.weakly_connected_component_subgraphs(collapsed):
        scores = collections.defaultdict(int)
        # partial path data supporting this component
        for t_id, node_path in get_transcript_node_map(Gsub).iteritems():
            # negative strand transcripts are stored reversed
            if strand == NEG_STRAND:
                node_path.reverse()
            scores[tuple(node_path)] += transcript_map[t_id].score
        yield Gsub, strand, scores.items()
def get_alternative_paths(subg, path):
    """Return paths found in ``subg`` after the nodes of ``path`` are removed.

    ``subg`` itself is not modified (a copy is pruned).  For each weakly
    connected component of the remainder:

    * a single-node component contributes its node list;
    * otherwise nodes with (out-degree 1, in-degree 0) and nodes with
      (out-degree 0, in-degree 1) are collected, in that order; when exactly
      two such nodes exist, the shortest path from the first to the second
      is added.  Components where that path does not exist are skipped.
    """
    paths = []
    subg1 = subg.copy()
    for node in path:
        subg1.remove_node(node)

    for comp in nx.weakly_connected_component_subgraphs(subg1):
        nodes = list(comp.nodes())
        if len(nodes) == 1:
            paths.append(nodes)
            continue

        starts = [n for n in nodes
                  if comp.out_degree(n) == 1 and comp.in_degree(n) == 0]
        ends = [n for n in nodes
                if comp.out_degree(n) == 0 and comp.in_degree(n) == 1]
        endpoints = starts + ends
        if len(endpoints) != 2:
            continue
        try:
            paths.append(nx.shortest_path(comp, endpoints[0], endpoints[1]))
        except nx.NetworkXException:
            # No directed path between the two endpoints (e.g. both are
            # starts); only NetworkX errors are ignored, not every exception.
            continue
    return paths
def find_reach_topsort(dags, c2n):
    """Map every node to the size of the reach of its cluster.

    ``dags`` is a directed graph over clusters and ``c2n`` maps a cluster to
    its member nodes.  Components of one or two clusters are handled
    directly; larger ones are walked in reverse topological order so the
    reach of every successor is known before a cluster is processed.
    """
    node_reach = {}
    cluster_reach = {}
    for hub in nx.weakly_connected_component_subgraphs(dags):
        hub_size = len(hub)
        if hub_size == 1:
            # lone cluster: it only reaches its own members
            only = hub.nodes()[0]
            members = c2n[only]
            cluster_reach[only] = members
            node_reach.update(dict.fromkeys(members, len(members)))
        elif hub_size == 2:
            # single edge: the head reaches itself, the tail reaches both
            tail, head = hub.edges()[0]
            cluster_reach[head] = c2n[head]
            cluster_reach[tail] = c2n[tail] + c2n[head]
            node_reach.update(dict.fromkeys(c2n[tail], len(cluster_reach[tail])))
            node_reach.update(dict.fromkeys(c2n[head], len(cluster_reach[head])))
        else:
            for cluster in nx.topological_sort(hub, reverse=True):
                reachable = set()
                for _, successor in dags.out_edges(cluster):
                    reachable.update(cluster_reach[successor])
                reachable.update(c2n[cluster])
                cluster_reach[cluster] = reachable
                node_reach.update(dict.fromkeys(c2n[cluster], len(reachable)))
    return node_reach
def connect_digraph(D):
    """
    Take a DiGraph with weakly connected components, and coalesce into one
    component.

    ``D`` is modified in place.  For every component other than the largest,
    one edge (u, v) of the largest component and one edge (x, y) of the
    smaller component are removed and replaced by (u, y) and (x, v).

    Raises ``Exception`` when the largest component has fewer swappable
    edges than there are components to attach.  ``random.choice`` raises
    ``IndexError`` for a component without edges (e.g. an isolated node).
    """
    # Sort explicitly: not every NetworkX version returns the components
    # ordered by size (or as a list at all).
    components = sorted(nx.weakly_connected_component_subgraphs(D),
                        key=len, reverse=True)
    if len(components) <= 1:
        return

    largest = components[0]
    remaining = components[1:]

    # Let's filter out the one degree edges (otherwise we'll disconnect
    # the graph when we swap edges around).
    candidates = [(u, v) for u, v in largest.edges()
                  if D.degree(u) > 1 and D.degree(v) > 1]
    if len(candidates) < len(remaining):
        raise Exception("There are not enough candidates for swapping.")

    # Connect the largest subgraph to the remaining.
    for G in remaining:
        u, v = random.choice(candidates)
        x, y = random.choice(list(G.edges()))
        D.remove_edge(u, v)
        D.remove_edge(x, y)
        D.add_edge(u, y)
        D.add_edge(x, v)
        # The edge no longer exists in D, so it must not be drawn again.
        candidates.remove((u, v))
def compute_dependent_cohorts(self, objects, deletion):
    """Group ``objects`` into cohorts of mutually dependent objects.

    An edge i0 -> i1 is added between object indexes when
    ``self.concrete_path_exists`` reports a path (argument order is swapped
    when ``deletion`` is truthy).  Each weakly connected component of that
    index graph becomes one cohort, listed in reverse topological order.

    Returns a list of lists of objects.
    """
    n = len(objects)
    mG = self.model_dependency_graph[deletion]

    oG = DiGraph()
    for i in range(n):
        oG.add_node(i)

    # NOTE(review): the pairwise scan does not depend on the model edge, so
    # it is repeated once per edge of mG (and skipped entirely when mG has
    # no edges).  Kept as-is because callers may rely on that.
    for _model_edge in mG.edges():
        try:
            for i0 in range(n):
                for i1 in range(n):
                    if i0 == i1:
                        continue
                    if deletion:
                        src, dst = objects[i1], objects[i0]
                    else:
                        src, dst = objects[i0], objects[i1]
                    if self.concrete_path_exists(src, dst):
                        oG.add_edge(i0, i1)
        except KeyError:
            pass

    cohorts = []
    for component in weakly_connected_component_subgraphs(oG):
        # list() so reversed() also works when topological_sort is a generator
        order = list(topological_sort(component))
        cohorts.append([objects[i] for i in reversed(order)])
    return cohorts
def draw_graphs(G, folder_name):
    """Render every weakly connected component of ``G`` to a PNG file.

    Files are written to ``<folder_name>/<domain>/`` (created if missing)
    where ``<domain>`` is ``G.graph['domain']``.  Components are drawn in
    ascending order of node count and numbered from 1.
    """
    domain_name = G.graph['domain']
    out_dir = folder_name + '/' + domain_name
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    subgraphs = nx.weakly_connected_component_subgraphs(G)
    add_root_to_subgraphs(subgraphs)
    subgraphs.sort(key=lambda sg: sg.number_of_nodes())

    for number, subgraph in enumerate(subgraphs, 1):
        pos = nx.spring_layout(subgraph)
        node_labels = get_node_labels(subgraph)
        positive_nodes = node_labels['positive'].keys()
        negative_nodes = node_labels['negative'].keys()
        # merged label mapping: positive entries first, negative on top
        labels = dict(node_labels['positive'], **(node_labels['negative']))
        edge_labels = get_edge_labels(subgraph)

        pl.figure(figsize=(16, 12))
        nx.draw_networkx_nodes(subgraph, pos, positive_nodes,
                               alpha=0.5, node_color='w')
        nx.draw_networkx_nodes(subgraph, pos, negative_nodes,
                               alpha=0.5, node_color='b')
        nx.draw_networkx_nodes(subgraph, pos, ['root'], node_color='g')
        nx.draw_networkx_edges(subgraph, pos, color='k')
        nx.draw_networkx_labels(subgraph, pos, labels, font_size=20)
        nx.draw_networkx_edge_labels(subgraph, pos, edge_labels, font_size=20)
        pl.axis('off')
        pl.savefig('%s/%s_subgraph_%d.png' % (out_dir, domain_name, number))
def main():
    """Run HITS on the largest weakly connected component and print the top 20.

    The dataset path is read from ``sys.argv[1]``; ``make_graph`` fills the
    module-level ``user_graph``.  Hubs and authorities are printed by
    descending score; fewer than 20 entries are printed when fewer exist.
    """
    global user_graph
    file_path = sys.argv[1]

    # Constructs the graph based on the dataset
    make_graph(file_path)

    # HITS is to be run on the largest weakly connected component.  Pick it
    # explicitly instead of relying on the components being pre-sorted.
    largest_weakly_connected_graph = max(
        nx.weakly_connected_component_subgraphs(user_graph), key=len)

    (hub_score_counter, authority_score_counter) = run_hits_algorithm(
        largest_weakly_connected_graph)

    # Sort by score, best first
    sorted_hub_score_list = sorted(
        hub_score_counter.items(), key=lambda item: item[1], reverse=True)
    sorted_authority_score_list = sorted(
        authority_score_counter.items(), key=lambda item: item[1], reverse=True)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    # Slicing avoids an IndexError when there are fewer than 20 entries.
    print("Top 20 Hubs")
    print("===========")
    for entry in sorted_hub_score_list[:20]:
        print(entry[0])
    print("")

    print("Top 20 Authorities")
    print("==================")
    for entry in sorted_authority_score_list[:20]:
        print(entry[0])
def weakly_connected_subgraphs(self):
    """
    Yields (subgraph, topological order) for each weakly connected
    component of ``self.G``.
    """
    components = nx.weakly_connected_component_subgraphs(self.G)
    for component in components:
        ordering = nx.topological_sort(component)
        yield (component, ordering)
def split(self):
    '''splits into weakly connected component subgraphs

    Yields self unchanged when the graph has exactly one component;
    otherwise yields one new SpliceGraph per component that received at
    least one transfrag.
    '''
    # connected components represent independent genes
    components = list(nx.weakly_connected_component_subgraphs(self.G))
    if len(components) == 1:
        yield self
        return

    # node interval -> index of the component containing it
    interval_to_component = {}
    for index, component in enumerate(components):
        for node_id in component:
            interval_to_component[self.get_node_interval(node_id)] = index

    # a transfrag goes to the component of its first split node
    transfrags_by_component = collections.defaultdict(list)
    nothing = object()
    for t in self.itertransfrags():
        first = next(iter(split_transfrag(t, self.node_bounds)), nothing)
        if first is nothing:
            continue
        transfrags_by_component[interval_to_component[first]].append(t)

    # build a new graph from each group of transfrags
    for group in transfrags_by_component.itervalues():
        yield SpliceGraph.create(group,
                                 guided_ends=self.guided_ends,
                                 guided_assembly=self.guided_assembly)
def main(data_file="karma.txt", component_index=126):
    """Draw one weakly connected component of the karma graph.

    Components are ordered by descending node count and the one at
    ``component_index`` is drawn.  The defaults reproduce the previously
    hard-coded file name and index.

    Raises ``IndexError`` when fewer than ``component_index + 1`` components
    exist.
    """
    G, karmas = read_data(data_file)
    # sorted() instead of list.sort() so a generator result also works
    cs = sorted(nx.weakly_connected_component_subgraphs(G),
                key=lambda c: c.number_of_nodes(), reverse=True)
    plt.clf()
    draw(cs[component_index], karmas)
    plt.show()
def __init__(self, scaffold_graph):
    """Build ``self.scaffolds`` from a copy of ``scaffold_graph``.

    Steps (progress is printed along the way):
      1. split the graph into weakly connected components;
      2. drop single-node components whose node id is negative;
      3. classify multi-node components as DAG or Eulerian, exiting the
         process via ``sys.exit`` for anything else;
      4. call ``self.build_dag_scaffold`` for every DAG component;
      5. keep one scaffold of each scaffold / ``self.revc`` pair;
      6. add the ``'seq'`` attribute of every remaining single-node
         component to ``self.scaffolds``.

    NOTE(review): the source had lost its indentation; the nesting below is
    a reconstruction (notably which ``if`` the WARNING ``else`` belongs to).
    """
    print "Entering PathFinder module:", str(datetime.now())
    self.G = scaffold_graph.copy()

    #Build strandless list of sequences
    # (not referenced again in this method)
    sequences = set([n for n in self.G.nodes() if n > 0])

    #Define weakly connected components
    print "1... Defining weakly connected components"
    component_graphs = set([g for g in nx.weakly_connected_component_subgraphs(self.G)])
    single_node_graphs = set([g for g in component_graphs if len(g.nodes()) == 1])
    multi_node_graphs = set([g for g in component_graphs if len(g.nodes()) > 1])
    print "Number of single-node components:", len(single_node_graphs)
    print "Number of multi-node components:", len(multi_node_graphs)

    #Consolidate unscaffolded nodes, discard reverse strand
    print "2... Consolidating single-node components"
    unscaffolded = set([g.nodes()[0] for g in single_node_graphs])
    discard_nodes = set([n for n in unscaffolded if n < 0])
    # Iterate over a copy because the set is modified inside the loop.
    for g in iter(single_node_graphs.copy()):
        if g.nodes()[0] in discard_nodes:
            single_node_graphs.discard(g)
    print "Number of unscaffolded sequences:", len(single_node_graphs)

    #Classify multi-node graphs
    print "3... Classifying multi-node components"
    DAG = set([])
    Euler = set([])
    for g in multi_node_graphs:
        if nx.is_directed_acyclic_graph(g):
            DAG.add(g)
        elif nx.is_eulerian(g):
            Euler.add(g)
        else:
            sys.exit("FATAL ERROR: Unknown multi-node graph type!")
    print "Number of directed acyclic graphs:", len(DAG)
    print "Number of Eulerian graphs:", len(Euler)

    #Build scaffolds from DAGs
    print "4... Building scaffolds from directed acyclic graphs"
    self.scaffolds = set([])
    for g in DAG:
        self.build_dag_scaffold(g)

    #Consolidating complementary scaffolds, keep first found
    print "5... Consolidating complementary scaffolds"
    consolidated_scaff = set([])
    for seq in iter(self.scaffolds):
        comp = self.revc(seq)
        if comp in self.scaffolds:
            # Keep seq only if its complement has not been kept already.
            if comp not in consolidated_scaff:
                consolidated_scaff.add(seq)
        else:
            print "WARNING: non-complemented scaffold"
    self.scaffolds = consolidated_scaff
    print "Number of scaffolds assembled:", len(self.scaffolds)

    #Build scaffolds from Eulerian graphs
    # (no code follows this heading: the Euler set is counted above but
    # never turned into scaffolds here)

    #Add unscaffolded seqs to scaffolds list
    print "6... Adding unscaffolded sequences to output"
    for g in single_node_graphs:
        seq = self.G.node[g.nodes()[0]]['seq']
        self.scaffolds.add(seq)
    print "Leaving PathFinder module:", str(datetime.now())
def find_largest_component(self):
    """Return the weakly connected component of ``self.graph`` with most nodes.

    When several components tie, the first one produced by NetworkX wins.

    Raises ``IndexError`` when the graph has no components (empty graph).
    """
    # list() so this works whether NetworkX returns a list or a generator
    components = list(nx.weakly_connected_component_subgraphs(self.graph))
    if not components:
        raise IndexError("graph has no weakly connected components")
    return max(components, key=nx.number_of_nodes)
def keep_weakly_connected(self):
    '''Restrict self.sub_graph to the weakly connected component that
    contains self.target.

    self.sub_graph is left untouched when no component contains the target.
    '''
    components = nx.weakly_connected_component_subgraphs(self.sub_graph)
    holding_target = [comp for comp in components
                      if self.target in comp.nodes()]
    if holding_target:
        # components are disjoint, so at most one entry is expected
        self.sub_graph = holding_target[-1]
def check_connected_balanced(graph): """ :type graph: nx.DiGraph """ for v in graph.nodes(): assert graph.in_degree(v) == graph.out_degree(v) sub_graph_list = nx.weakly_connected_component_subgraphs(graph, True) for sub_graph in sub_graph_list: print 'connected component:', sub_graph.edges() print
def crgraph(g):
    """Build a graph whose nodes are flip-flop nodes and "clouds".

    Nodes whose ``'label'`` equals ``dffkeyword`` are removed from a copy of
    ``g``; each weakly connected component of what is left becomes one cloud
    node.  Every edge of ``g`` that touches a ``dffkeyword`` node is then
    mirrored between that node and the cloud holding the other endpoint; an
    edge between two ``dffkeyword`` nodes is routed through a fresh empty
    DiGraph node.  Edges with no ``dffkeyword`` endpoint are skipped.

    Returns ``(clouds, fds, cr)``; ``cr`` also carries ``fds`` and
    ``clouds`` as attributes.  Raises ``AssertionError`` when an endpoint is
    found in none of the clouds.
    """
    # Every node of the incoming graph is a string; the 'label' attribute holds its cellref.
    assert isinstance(g, nx.DiGraph)
    gg = g.copy()
    label = nx.get_node_attributes(gg, 'label')
    fds = [node for node in gg.nodes() if label[node] == dffkeyword]
    gg.remove_nodes_from( fds )
    cr = nx.DiGraph()
    cr.name = gg.name+"_crgraph"
    clouds = []
    ccnt = 0
    for cloud in nx.weakly_connected_component_subgraphs(gg):
        assert isinstance(cloud, nx.DiGraph )
        ccnt += 1
        cloud.name = "cloud%d" % ccnt
        clouds.append( cloud )
        # The cloud graph object itself is used as a node of cr.
        cr.add_node(cloud, label = cloud.name)
    cr.add_nodes_from(fds, label = dffkeyword)
    empty_cnt = 0
    for edge in g.edges_iter():
        pre = edge[0]
        succ = edge[1]
        credge = ()
        if label[pre] == dffkeyword and label[succ] == dffkeyword:
            # Two dffkeyword nodes wired directly: insert an empty graph between them.
            empty = nx.DiGraph(name = 'empty%d'% empty_cnt )
            empty_cnt += 1
            cr.add_edge( pre, empty )
            cr.add_edge( empty, succ )
            continue
        elif label[pre] != dffkeyword and label[succ] == dffkeyword:
            for cloud in clouds:
                if cloud.has_node(pre):
                    credge = (cloud, succ)
            if not credge:
                print "None of the cloud has prim: %s %s.in edge:%s" % (label[pre], pre, str(edge) )
                print "that node in originnal graph:",
                print "precs:%s succs:%s" %( str(g.predecessors(pre)), str(g.successors(pre)) )
                raise AssertionError
        elif label[pre] == dffkeyword and label[succ] != dffkeyword:
            for cloud in clouds:
                if cloud.has_node( succ):
                    credge = (pre, cloud)
            if not credge:
                print "None of the cloud has prim: %s %s.in edge:%s" % (label[succ], succ, str(edge))
                print "that node in originnal graph:"
                print "precs:%s succs:%s" %( str(g.predecessors(succ)), str(g.successors(succ)) )
                raise AssertionError
        else:
            # Neither endpoint is a dffkeyword node: nothing to add for this edge.
            continue
        cr.add_edge(credge[0], credge[1] )
    cr.fds = fds
    cr.clouds = clouds
    return clouds, fds, cr
def find_intermittent_nodes_directed(g):
    """Return ``get_ni`` applied to the tree entries of all components of ``g``.

    Tree entries are gathered with ``get_tree`` from every weakly connected
    component; entries whose node is reported by
    ``find_critical_nodes_directed`` get their third field set to ``'c'``
    before ``get_ni`` is called.
    """
    critical = find_critical_nodes_directed(g)
    entries = []
    for component in nx.weakly_connected_component_subgraphs(g):
        entries.extend(get_tree(component, []))
    for position, (node, nb, _) in enumerate(entries):
        if node in critical:
            entries[position] = (node, nb, 'c')
    return get_ni(entries)
def _handle_contained_in(ctx):
    """Build multi-instance node trees for every 'contained' tree.

    Each weakly connected component of the reversed
    ``ctx.plan_contained_graph`` is treated as one tree; its first node in
    topological order is used as the root and handed to
    ``_build_multi_instance_node_tree_rec``.  Afterwards
    ``ctx.deployment_contained_graph`` is set to a copy of
    ``ctx.deployment_node_graph``.
    """
    # for each 'contained' tree, recursively build new trees based on
    # scaling groups with generated ids
    for contained_tree in nx.weakly_connected_component_subgraphs(
            ctx.plan_contained_graph.reverse(copy=True)):
        # extract tree root node id; next(iter(...)) works whether
        # topological_sort returns a list or a generator
        node_id = next(iter(nx.topological_sort(contained_tree)))
        _build_multi_instance_node_tree_rec(
            node_id=node_id,
            contained_tree=contained_tree,
            ctx=ctx)
    ctx.deployment_contained_graph = ctx.deployment_node_graph.copy()
def output_domain_data(G, domain_read_dict, folder_name):
    """Write the data of every weakly connected component of ``G`` to one file.

    The file is ``<folder_name>/<domain>/<domain>.data`` with ``<domain>``
    taken from ``G.graph['domain']``; the directory is created if missing.
    Components are written in ascending order of node count, numbered from 1.
    """
    domain = G.graph['domain']
    out_dir = folder_name + '/' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_file_name = out_dir + '/' + domain + '.data'

    subgraphs = nx.weakly_connected_component_subgraphs(G)
    subgraphs.sort(key=lambda sg: sg.number_of_nodes())

    with open(out_file_name, 'w') as f:
        for number, subgraph in enumerate(subgraphs, 1):
            output_subgraph_data(subgraph, number, domain_read_dict, f)
def processNetwork():
    """Build a network from the JSON request and return computed data as JSON.

    Reads ``nodes``, ``edges``, ``networks``, ``commands``,
    ``commands_networks`` and ``distances`` from ``request.json``, builds a
    ``pn`` network, splits it into weakly connected components, copies node
    labels and network names onto the components, and responds with
    ``{'nodes': ..., 'networks': ...}``.  Any exception is printed and
    reported back as an ``error`` entry in the JSON response.

    NOTE(review): the source had lost its indentation and interleaves
    commented-out prints with code; the nesting below is a reconstruction.
    In particular ``fs`` / ``data['nodes']`` are assumed to follow the
    ``for net in subnetworks`` loop, in which case they use only the LAST
    component bound to ``net`` -- confirm.  ``subnetworks`` is iterated
    several times, which presumably requires it to be a list.
    """
    try:
        mydata = request.json
        nodes = mydata['nodes']
        edges = mydata['edges']
        networks = mydata['networks']
        commands = mydata['commands']
        commands_networks = mydata['commands_networks']
        commands_distances = mydata['distances']
        # Resolve each requested distance name to the attribute of that name on `distances`.
        distances_methods = {d: getattr(distances,d) for d in commands_distances}
        #print distances_methods
        net = pn()
        #print 'nodes',nodes
        #print 'networks',networks
        for node in nodes:
            #print node
            net.add_node(str(node['id']));
            if ('label' in node) and (node['label']!=None):
                #print node['id'],node['label']
                net._labels[str(node['id'])] = str(node['label'])
        for edge in edges:
            net.add_edge(str(edge['source']),str(edge['target']))
        subnetworks = nx.weakly_connected_component_subgraphs(net)
        # Copy the labels of the full network onto each component.
        for subnetwork in subnetworks:
            #print 'sbn:'+subnetwork.eNewick()
            for node in subnetwork.nodes():
                if node in net._labels:
                    subnetwork._labels[node] = net._labels[node]
            #print 'sbn.'+subnetwork.eNewick()
        #print subnetworks;
        # Name each component after the requested network whose first node it contains.
        for network in networks:
            #print network['name']
            #print network['nodes']
            onenote = network['nodes'][0]
            for subnetwork in subnetworks:
                if onenote in subnetwork.nodes():
                    subnetwork.name = network['name']
        # This loop rebinds `net`; after it, `net` is the last component.
        for net in subnetworks:
            print net.eNewick()
        fs = {command:getattr(net,str(command)) for command in commands}
        data = {}
        data['nodes'] = {u:applyAll(fs,u) for u in net.nodes()}
        #fsnets = {command_network:getattr(net,str(command_network)) for command_network in commands_networks}
        #print 'aqui'
        data['networks'] = {n.name:applyAllNets(commands_networks,n,subnetworks,distances_methods) for n in subnetworks}
        #print datanetworks
        return jsonify(response = data);
    except Exception, err:
        print err
        return jsonify(response={'error':'Some error occurred. Please chech your data. If you think this is a bug, please contact us (see About section).<br> Error message: %s' % err})
def __init__(self, graph, largest=False):
    """
    Store the graph and attach the helper components.

    When ``largest`` is truthy only the largest weakly connected component
    of ``graph`` is kept.
    """
    if largest:
        components = nx.weakly_connected_component_subgraphs(graph)
        self.graph = max(components, key=len)
    else:
        self.graph = graph
    self.io = IOComponent(self)
    self.system = DynamicalSystem(self)
    self.math = Math(self)
    self.setup()
def author_interaction_weighted_graph(discussion_graph, json_data, limit=10):
    """Draw one author-interaction graph per weakly connected component.

    For each component the smallest node id (as int) is used as the origin;
    the interaction graph built by ``add_to_weighted_graph`` is drawn to
    ``author_weighted/<origin>.png`` with the ``circo`` layout.  At most
    ``limit`` components are processed when ``limit`` is positive; a
    non-positive ``limit`` processes all of them.
    """
    components = nx.weakly_connected_component_subgraphs(discussion_graph)
    for processed, component in enumerate(components, 1):
        interaction_graph = nx.DiGraph()
        origin = min(map(int, component.nodes()))
        add_to_weighted_graph(interaction_graph, discussion_graph,
                              json_data, [origin], [])
        drawing = nx.to_agraph(interaction_graph)
        drawing.draw("author_weighted/"+str(origin)+'.png', prog='circo')
        if limit > 0 and processed == limit:
            break
def output_graph_stat(G, mapped_read_lookup_dict):
    """Write a one-line summary of ``G`` to standard output.

    Format: ``domain:mapped:aligned:components`` followed by one
    `` positive:negative`` pair per entry returned by
    ``get_subgraph_size_list``, then a newline.
    """
    domain = G.graph['domain']
    subgraphs = nx.weakly_connected_component_subgraphs(G)
    subgraph_num = len(subgraphs)
    size_pairs = get_subgraph_size_list(subgraphs)
    mapped_read_num = len(mapped_read_lookup_dict[domain].keys())
    aligned_read_num = G.number_of_nodes()

    pieces = ['%s:%d:%d:%d' % (domain, mapped_read_num,
                               aligned_read_num, subgraph_num)]
    pieces.extend(' %d:%d' % (positive_num, negative_num)
                  for positive_num, negative_num in size_pairs)
    pieces.append('\n')
    sys.stdout.write(''.join(pieces))
def read_network_file(self, networkfile):
    """Read a tab-separated edge list into a DiGraph and store it on ``self``.

    Each line of ``networkfile`` (an iterable of UTF-8 encoded bytes) holds
    ``source<TAB>target[<TAB>weight]``.  Blank lines, lines starting with
    '#' and self-edges are skipped.  When ``self.largest_connected_component``
    is set, only the largest weakly connected component is kept.

    Returns the resulting graph after passing it to ``self.set_network``.

    Raises ``PathLinkerError`` when an edge has no weight and
    ``self.page_rank`` is not set.
    """
    # Read the network from file
    net = nx.DiGraph()
    for line in networkfile:
        line = line.decode('UTF-8')

        # Skip empty lines or those beginning with '#' comments.  The line
        # still carries its newline here, so test the stripped text.
        if not line.strip() or line[0] == '#':
            continue

        items = [x.strip() for x in line.rstrip().split('\t')]
        id1 = items[0]
        id2 = items[1]

        # Ignore self-edges
        if id1 == id2:
            continue

        # Possibly use an edge weight
        eWeight = 1
        if len(items) > 2:
            eWeight = float(items[2])
        elif not self.page_rank:
            raise PathLinkerError('ERROR: All edges must have a weight, unless --PageRank is used. Edge (%s --> %s) does not have a weight entry.'%(id1, id2))

        # Assign the weight. Note in the PageRank case, "weight" is
        # interpreted as running PageRank and edgeflux on a weighted
        # graph.
        net.add_edge(id1, id2, ksp_weight=eWeight, weight=eWeight)

    # Operate on only the largest connected component
    if self.largest_connected_component:
        # Depending on the NetworkX version this is a list or a generator,
        # and it is not guaranteed to be ordered by size: select the
        # largest component explicitly.
        conn_comps = list(nx.weakly_connected_component_subgraphs(net))
        if conn_comps:
            net = max(conn_comps, key=len)
        print("\n Using only the largest weakly connected component:\n" + nx.info(net))

    self.set_network(net)
    return net
def filter_graph_for_weakly_connected_components(self, min_nodes=2):
    """
    Rebuild ``self.graph`` from the edges of its sufficiently large
    weakly connected components.

    min_nodes : int
        Components with fewer nodes than this are dropped.
    """
    kept_edges = []
    for component in nx.weakly_connected_component_subgraphs(self.graph):
        if len(component.nodes(data=True)) < min_nodes:
            continue
        kept_edges.extend(component.edges(data=True))
    self.graph = nx.DiGraph(kept_edges)
def compartmentalize_skeletongroup(skeleton_id_list, project_id, **kwargs):
    """Build a graph of skeleton compartments linked by shared connectors.

    Every skeleton is optionally filtered (``confidence_threshold`` takes
    precedence over ``edgecount`` in ``kwargs``) and split into weakly
    connected components; each component becomes a node named
    ``<skeleton_id>_<index>``.  For every connector an edge is added from
    each presynaptic compartment to each postsynaptic compartment, carrying
    a ``count`` and the set of ``connector_ids``.
    """
    skelgroup = SkeletonGroup(skeleton_id_list, project_id)
    compartment_graph_of_skeletons = {}
    resultgraph = nx.DiGraph()

    for skeleton_id, skeleton in skelgroup.skeletons.items():
        if 'confidence_threshold' in kwargs:
            confidence_filtering(skeleton, kwargs['confidence_threshold'])
        elif 'edgecount' in kwargs:
            edgecount_filtering(skeleton, kwargs['edgecount'])

        subgraphs = nx.weakly_connected_component_subgraphs(skeleton.graph)
        compartment_graph_of_skeletons[skeleton_id] = subgraphs

        for i, subg in enumerate(subgraphs):
            # tag each treenode with its compartment, on both graphs
            for nodeid, d in subg.nodes_iter(data=True):
                d['compartment_index'] = i
                skeleton.graph.node[nodeid]['compartment_index'] = i

            # long neuron names are cut to 30 characters
            name = skeleton.neuron.name
            if len(name) > 30:
                name = name[: 30] + '...'
            neuronname = name + ' [{0}]'.format(i)

            resultgraph.add_node('{0}_{1}'.format(skeleton_id, i), {
                'neuronname': neuronname,
                'skeletonid': str(skeleton_id),
                'compartment_index': i,
                'node_count': subg.number_of_nodes(),
            })

    # connector id -> compartments on either side.  One entry is appended
    # per treenode, so compartment ids may repeat (this is intended).
    sides = (('presynaptic_to', 'pre'), ('postsynaptic_to', 'post'))
    connectors = {}
    for skeleton_id, skeleton in skelgroup.skeletons.items():
        for connector_id, v in skeleton.connected_connectors.items():
            entry = connectors.setdefault(connector_id, {'pre': [], 'post': []})
            for relation, side in sides:
                if len(v[relation]):
                    for e in v[relation]:
                        entry[side].append('{0}_{1}'.format(
                            skeleton_id,
                            skeleton.graph.node[e]['compartment_index']))

    # merge connectors into graph
    for connector_id, v in connectors.items():
        for from_skeleton in v['pre']:
            for to_skeleton in v['post']:
                if not resultgraph.has_edge(from_skeleton, to_skeleton):
                    resultgraph.add_edge(from_skeleton, to_skeleton, {
                        'count': 0,
                        'connector_ids': set()
                    })
                edge_data = resultgraph.edge[from_skeleton][to_skeleton]
                edge_data['count'] += 1
                edge_data['connector_ids'].add(connector_id)
    return resultgraph
def scoreERS(codes, iterations=25):
    """Collect per-node rule statistics and importance scores for each code.

    For every ``code`` the graph ``gpickles/<code>.gpickle`` is loaded and,
    when its largest weakly connected component has at least three nodes,
    each iteration's ``pickles/<code>_<iteration>_local1.pickle`` is read.
    Importance scores come from
    ``pickles/<code>_<iteration>_scores1.pickle`` when that file exists,
    otherwise all scores are 0.

    Returns ``{code: {iteration: {node: [rule count, 'or'-term counts,
    importance score, in-degree or NaN, ruleScore6 value]}}}`` with string
    keys.  The in-degree is NaN for nodes outside the largest component.

    NOTE: the pickle files are unpickled as-is; only use this on trusted
    files, since unpickling can execute arbitrary code.
    """
    # Utility to read importance scores generated by BONITA, and calculate
    # total ancestor overlap in preparation for plotting
    allRes = {}
    for code in codes:
        print(code)
        allRes[str(code)] = {}
        originalGraph = nx.read_gpickle("gpickles/" + code + ".gpickle")
        scoreFunction6 = ruleScore6(originalGraph)
        # get around the problem of disconnected graphs
        graph = max(nx.weakly_connected_component_subgraphs(originalGraph),
                    key=len)
        if len(graph) < 3:
            continue

        for iteration in range(1, iterations + 1):
            iterRes = allRes[str(code)][str(iteration)] = {}
            prefix = 'pickles/' + code + '_' + str(iteration)

            with open(prefix + '_local1.pickle', 'rb') as handle:
                outputList = pickle.load(handle)
            # exactly six entries are expected; only two are used here
            _, _, _, storeModel3, equivalents, _ = outputList
            model1 = modelHolder(storeModel3)

            # The path was previously one mis-quoted string literal, so the
            # scores file was never found and the scores were always 0.
            scoresFile = prefix + '_scores1.pickle'
            if os.path.isfile(scoresFile):
                with open(scoresFile, 'rb') as handle:
                    pathVals = pickle.load(handle)
            else:
                print("Importance scores not found, setting all values to 0")
                pathVals = [0] * len(model1.nodeList)

            for node in range(0, len(model1.nodeList)):
                nodeName = model1.nodeList[node]
                iterRes[str(nodeName)] = []
                # Results are unused; the calls are kept in case they
                # validate the model -- TODO confirm and remove.
                findEnds2(model1, nodeName, equivalents[node])
                nodeIndex = model1.nodeList.index(nodeName)
                findInEdges(model1, nodeIndex)

                # the equivalent rules (bitstrings) for just this node
                ers = equivalents[node]
                plainRules = [writeNode(nodeIndex, rule, model1)
                              for rule in ers]
                rnAllNodes = [pr.count("or") + 1 for pr in plainRules]
                inDegree = originalGraph.in_degree(nodeName)
                # only the largest component is used for the graph
                # theoretic measure
                if nodeName not in graph.nodes():
                    inDegree = float('NaN')
                iterRes[str(nodeName)] = [
                    len(ers), rnAllNodes, pathVals[node], inDegree,
                    scoreFunction6[nodeName]
                ]
    return (allRes)
import sys
import networkx as nx

# Usage: <script> INPUT OUTPUT
# INPUT lines starting with 'S' declare a node (second field); every other
# non-blank line declares an edge from its second to its fourth field.
# OUTPUT receives, for each weakly connected component, '>' followed by the
# space-separated nodes in topological order.
# NOTE(review): no newline is written between components, as in the
# original -- confirm this is the intended format.
G = nx.DiGraph()
filename = sys.argv[1]
out = sys.argv[2]
# Context managers guarantee both files are closed (the output file was
# previously never closed explicitly).
with open(out, 'w') as output:
    with open(filename) as fp:
        for line in fp:
            var = line.split()
            if not var:
                # blank line: nothing to parse
                continue
            if var[0] == 'S':
                G.add_node(var[1])
            else:
                G.add_edge(var[1], var[3])
    for x in nx.weakly_connected_component_subgraphs(G):
        output.write(">")
        output.write(' '.join(nx.topological_sort(x)))
def trim_graph(self):
    """Replace ``self.graph`` with its largest weakly connected component."""
    components = weakly_connected_component_subgraphs(self.graph)
    largest = max(components, key=len)
    self.graph = largest
def rand_split_train_test(G, train_frac=0.51):
    """
    Splits the edges of the input graph in sets of train and test and returns
    the results. Split is performed using the random split approach (see
    Notes). The resulting train edge set spans a graph (digraph) with a single
    connected (weakly connected) component.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the
        input graph (range (0.0, 1.0]). Default is 0.51.

    Returns
    -------
    train_E : set
        The set of train edges.
    test_E : set
        The set of test edges.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].

    Notes
    -----
    The method proceeds as follows: (1) randomly remove 1-train_frac percent
    of edges from the input graph. (2) from the remaining edges compute the
    main connected component and these will be the train edges. (3) from the
    set of removed edges, those such that both end nodes exist in the train
    edge set computed in the previous step, are added to the final test set.
    A graph without edges yields two empty sets.
    """
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges()), set()

    # Create a set of all edges in G
    E = set(G.edges)
    num_E = len(E)
    if num_E == 0:
        # Nothing to split; avoids max() failing on an empty component list.
        return set(), set()

    # Compute the potential number of train and test edges which corresponds
    # to the fraction given
    num_train_E = int(np.ceil(train_frac * num_E))
    num_test_E = int(num_E - num_train_E)

    # Randomly remove 1-train_frac edges from the graph and store them as
    # potential test edges. random.sample() rejects sets on Python >= 3.11;
    # tuple(E) is exactly the conversion older versions applied internally,
    # so results for a given seed are unchanged.
    pte_edges = set(random.sample(tuple(E), num_test_E))

    # The remaining edges are potential train edges
    ptr_edges = E - pte_edges

    # Create a graph containing all ptr_edges and compute the mainCC.
    # The *_component_subgraphs helpers were removed in networkx 2.4;
    # subgraph(...).copy() over the node sets is the documented equivalent.
    if G.is_directed():
        H = nx.DiGraph()
        components = nx.weakly_connected_components
    else:
        H = nx.Graph()
        components = nx.connected_components
    H.add_edges_from(ptr_edges)
    maincc = H.subgraph(max(components(H), key=len)).copy()

    # The edges in the mainCC graph are the actual train edges
    train_E = set(maincc.edges)

    # Keep only the potential test edges whose end nodes both exist in the
    # train graph
    test_E = set()
    for (src, dst) in pte_edges:
        if src in maincc.nodes and dst in maincc.nodes:
            test_E.add((src, dst))

    # Return the sets of edges
    return train_E, test_E
donations_train["Project ID"].values.tolist())] donors = donors[donors["Donor ID"].isin( donations_train["Donor ID"].values.tolist())] #donG=nx.from_pandas_edgelist(donations_train,source="Donor ID",target="Project ID",edge_attr=True,create_using=nx.DiGraph(),) donB = nx.DiGraph() donB.add_nodes_from(donors["Donor ID"].values.tolist(), project=0) donB.add_nodes_from(projects["Project ID"].values.tolist(), project=1) donB.add_weighted_edges_from( donations_train[["Donor ID", "Project ID", "Donation Amount"]].values.tolist()) projectlabel = nx.get_node_attributes(donB, "project") donornodes = {n for n, d in donB.nodes(data=True) if d['project'] == 0} donorG = bipartite.projected_graph(donB, donornodes) #remove = [node for node,degree in list(donortodonor.degree()) if degree < 3] #donortodonor.remove_nodes_from(remove) wccs = list(nx.weakly_connected_component_subgraphs(donB)) sortedwc = sorted(wccs, key=lambda x: len(x.nodes()), reverse=True) largestwcc = sortedwc[0] len(donB.nodes()) len(list(largestwcc.nodes())) comdf = pd.DataFrame() for i in range(len(communities)): print(len(communities[i])) start = time.time() community_generator = community.girvan_newman(largestwcc) for i in range(29): communities = next(community_generator) print("number of comm:" + str(len(communities))) for j in range(len(communities)): print("size:" + str(len(donB.subgraph(communities[j]).nodes()))) with open('communities30.txt', 'w') as filehandle:
def answer_four():
    """Return the node count of the largest weakly connected component.

    The graph is obtained from ``answer_one()``.
    """
    graph = answer_one()
    components = nx.weakly_connected_component_subgraphs(graph)
    biggest = max(components, key=len)
    return len(biggest.nodes())
def get_lcc(di_graph):
    """Return the largest weakly connected component with nodes renumbered.

    Parameters
    ----------
    di_graph : networkx.DiGraph
        Input directed graph.

    Returns
    -------
    di_graph : networkx.DiGraph
        The largest weakly connected component, with its nodes relabelled to
        ``0 .. n-1`` in iteration order.
    nodeListMap : dict
        Mapping from each original node label to its new integer label.
    """
    di_graph = max(nx.weakly_connected_component_subgraphs(di_graph), key=len)
    tdl_nodes = list(di_graph.nodes())
    nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
    # Relabel into a fresh graph: in-place relabelling (copy=False) raises
    # NetworkXUnfeasible when old and new labels overlap in a cycle, which is
    # common when the input nodes are already integers.
    di_graph = nx.relabel_nodes(di_graph, nodeListMap, copy=True)
    return di_graph, nodeListMap
def generate_haplotigs_for_ctg(input_):
    """Build the phased string graph for one contig and write its tig files.

    ``input_`` is a ``(ctg_id, out_dir)`` pair. The function reads the module
    globals ``p_asm_G``, ``h_asm_G``, ``all_rid_to_phase`` and ``seqs``,
    creates ``out_dir`` and writes six files into it:
    ``p_ctg_path.<ctg_id>``, ``p_ctg.<ctg_id>.fa``, ``p_ctg_edges.<ctg_id>``,
    ``h_ctg_path.<ctg_id>``, ``h_ctg_all.<ctg_id>.fa`` and
    ``h_ctg_edges.<ctg_id>``. It returns ``None``.

    NOTE(review): the body uses Python 2 syntax (``print >>``, ``xrange``) and
    networkx 1.x accessors (``G.node``, ``G.edge``); several loops remove
    nodes/edges while iterating ``nodes()``/``edges()``, which presumably
    relies on those calls returning lists -- confirm before upgrading.
    """
    ctg_id, out_dir = input_
    global p_asm_G
    global h_asm_G
    global all_rid_to_phase
    global seqs
    arid_to_phase = all_rid_to_phase[ctg_id]

    mkdir( out_dir )

    ctg_G = p_asm_G.get_sg_for_ctg(ctg_id)

    ctg_nodes = set(ctg_G.nodes())

    sg = nx.DiGraph()

    # --- 1. Seed the graph with the contig's own edges (src="OP"). ---------
    for v, w in ctg_G.edges():

        # The first 9 characters of a node id are used as the read id.
        vrid = v[:9]
        wrid = w[:9]

        edge_data = p_asm_G.sg_edges[ (v, w) ]
        # Only edges whose last field is "G" are used.
        if edge_data[-1] != "G":
            continue

        # Reads without phase information get the default (-1, 0).
        vphase = arid_to_phase.get(vrid, (-1,0))
        wphase = arid_to_phase.get(wrid, (-1,0))
        # Same first phase component but different second one => cross-phase.
        if vphase[0] == wphase[0] and vphase[1] != wphase[1]:
            cross_phase = "Y"
        else:
            cross_phase = "N"

        sg.add_node( v, label= "%d_%d" % vphase, phase="%d_%d" % vphase, src="P" )
        sg.add_node( w, label= "%d_%d" % wphase, phase="%d_%d" % wphase, src="P" )
        sg.add_edge(v, w, src="OP", cross_phase = cross_phase)

        # we need to add the complimentary edges as the ctg_graph does not contain the dual edges
        rv = reverse_end(v)
        rw = reverse_end(w)
        sg.add_node( rv, label= "%d_%d" % vphase, phase="%d_%d" % vphase, src="P" )
        sg.add_node( rw, label= "%d_%d" % wphase, phase="%d_%d" % wphase, src="P" )
        sg.add_edge(rw, rv, src="OP", cross_phase = cross_phase)

    PG_nodes = set(sg.nodes())
    PG_edges = set(sg.edges())

    # --- 2. Add "G" edges of h_asm_G between phased reads (src="H"). -------
    for v, w in h_asm_G.sg_edges:

        vrid = v[:9]
        wrid = w[:9]

        if vrid not in arid_to_phase:
            continue
        if wrid not in arid_to_phase:
            continue

        # Skip edges already added from the primary graph above.
        if (v, w) in PG_edges:
            if p_asm_G.sg_edges[(v,w)][-1] == "G":
                continue

        edge_data = h_asm_G.sg_edges[ (v, w) ]

        if edge_data[-1] != "G":
            continue

        cross_phase = "N"
        if v not in PG_nodes:
            sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid], phase="%d_%d" % arid_to_phase[vrid], src="H" )

        if w not in PG_nodes:
            sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid], phase="%d_%d" % arid_to_phase[wrid], src="H" )

        sg.add_edge(v, w, src="H", cross_phase = cross_phase)

        # Mirror the edge on the reversed ends.
        rv = reverse_end(v)
        rw = reverse_end(w)
        if rv not in PG_nodes:
            sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid], phase="%d_%d" % arid_to_phase[vrid], src="H" )

        if rw not in PG_nodes:
            sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid], phase="%d_%d" % arid_to_phase[wrid], src="H" )

        sg.add_edge(rw, rv, src="H", cross_phase = cross_phase)

    # --- 3. Add "ext" edges into nodes that were dead ends in sg0. ---------
    # sg0 is a snapshot, so degrees are not affected by edges added below.
    sg0 = sg.copy()
    for v, w in h_asm_G.sg_edges:
        vrid = v[:9]
        wrid = w[:9]
        if vrid not in arid_to_phase:
            continue
        if wrid not in arid_to_phase:
            continue

        if (v, w) in PG_edges:
            if p_asm_G.sg_edges[(v,w)][-1] == "G":
                continue

        edge_data = h_asm_G.sg_edges[ (v, w) ]

        # w had no incoming edge in the snapshot.
        if sg0.in_degree(w) == 0:
            cross_phase = "Y"
            if v not in PG_nodes:
                sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid], phase="%d_%d" % arid_to_phase[vrid], src="H" )

            if w not in PG_nodes:
                sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid], phase="%d_%d" % arid_to_phase[wrid], src="H" )

            sg.add_edge(v, w, src="ext", cross_phase = cross_phase)

            rv = reverse_end(v)
            rw = reverse_end(w)
            if rv not in PG_nodes:
                sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid], phase="%d_%d" % arid_to_phase[vrid], src="H" )

            if rw not in PG_nodes:
                sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid], phase="%d_%d" % arid_to_phase[wrid], src="H" )

            sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)

        # v had no outgoing edge in the snapshot.
        if sg0.out_degree(v) == 0:
            cross_phase = "Y"
            if v not in PG_nodes:
                sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid], phase="%d_%d" % arid_to_phase[vrid], src="H" )

            if w not in PG_nodes:
                sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid], phase="%d_%d" % arid_to_phase[wrid], src="H" )

            sg.add_edge(v, w, src="ext", cross_phase = cross_phase)

            rv = reverse_end(v)
            rw = reverse_end(w)
            if rv not in PG_nodes:
                sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid], phase="%d_%d" % arid_to_phase[vrid], src="H" )

            if rw not in PG_nodes:
                sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid], phase="%d_%d" % arid_to_phase[wrid], src="H" )

            sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)

    # --- 4. Prune components that link the contig to its reverse copy. -----
    # sg2 is sg without the contig's own edges (and their mirrors).
    sg2 = sg.copy()
    ctg_nodes_r = set([ reverse_end(v) for v in list(ctg_nodes) ])
    for v, w in ctg_G.edges():
        sg2.remove_edge(v, w)
        rv, rw = reverse_end(v), reverse_end(w)
        sg2.remove_edge(rw, rv)
    for v in sg2.nodes():
        if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
            sg2.remove_node(v)

    nodes_to_remove = set()
    edges_to_remove = set()
    for sub_g in nx.weakly_connected_component_subgraphs(sg2):
        sub_g_nodes = set(sub_g.nodes())
        # Component touches both the contig nodes and the reversed nodes.
        if len(sub_g_nodes & ctg_nodes_r) > 0 and len(sub_g_nodes & ctg_nodes) > 0:
            # remove cross edge
            sources = [n for n in sub_g.nodes() if sub_g.in_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
            sinks = [n for n in sub_g.nodes() if sub_g.out_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
            edges_to_keep = set()
            for v in sources:
                for w in sinks:
                    path = []
                    if v in ctg_nodes and w not in ctg_nodes_r:
                        try:
                            path = nx.shortest_path( sub_g, v, w )
                        except nx.exception.NetworkXNoPath:
                            path = []
                    elif v not in ctg_nodes and w in ctg_nodes_r:
                        try:
                            path = nx.shortest_path( sub_g, v, w )
                        except nx.exception.NetworkXNoPath:
                            path = []

                    # Keep every edge on the path together with its mirror.
                    if len(path) >= 2:
                        v1 = path[0]
                        for w1 in path[1:]:
                            edges_to_keep.add( (v1, w1) )
                            rv1, rw1 = reverse_end(v1), reverse_end(w1)
                            edges_to_keep.add( (rw1, rv1) )
                            v1 = w1
            for v, w in sub_g.edges():
                if (v, w) not in edges_to_keep:
                    edges_to_remove.add( (v, w) )
                    rv, rw = reverse_end(v), reverse_end(w)
                    edges_to_remove.add( (rw, rv) )

        # Component touches neither set: drop it and its mirror entirely.
        if len(sub_g_nodes & ctg_nodes_r) == 0 and len(sub_g_nodes & ctg_nodes) == 0:
            nodes_to_remove.update( sub_g_nodes )
            nodes_to_remove.update( set( [reverse_end(v) for v in list(sub_g_nodes)] ) )

    for v, w in list(edges_to_remove):
        sg.remove_edge(v, w)

    for v in nodes_to_remove:
        sg.remove_node(v)

    for v in sg.nodes():
        if sg.out_degree(v) == 0 and sg.in_degree(v) == 0:
            sg.remove_node(v)

    #nx.write_gexf(sg, "full_g.gexf")

    # Start and end nodes of the primary path, taken from the contig record.
    s_node = p_asm_G.ctg_data[ctg_id][5][0][0]
    t_node = p_asm_G.ctg_data[ctg_id][5][-1][-1]

    # --- 5. Score edges by how the phases of their endpoints relate. -------
    for v, w in sg.edges():
        phase0 = sg.node[v]["phase"].split("_")
        phase1 = sg.node[w]["phase"].split("_")
        if phase0 == phase1:
            # identical phase label
            sg[v][w]["weight"] = 10
            sg[v][w]["score"] = 1
            sg[v][w]["label"] = "type0"
        else:
            if phase0[0] == phase1[0]:
                # same first component, different second: most expensive
                sg[v][w]["weight"] = 1
                sg[v][w]["score"] = 100000
                sg[v][w]["label"] = "type1"
            else:
                sg[v][w]["weight"] = 5
                sg[v][w]["score"] = 50
                sg[v][w]["label"] = "type2"

    # --- 6. Find the primary path on a filtered copy of the graph. ---------
    sg2 = sg.copy()
    edge_to_remove = set()
    for v, w in sg2.edges():
        if sg2[v][w]["src"] == "ext":
            edge_to_remove.add( (v, w) )
            rv, rw = reverse_end(v), reverse_end(w)
            edge_to_remove.add( (rw, rv) )

        if sg2.node[v]["phase"] == sg2.node[w]["phase"]:
            continue
        # Phase-changing edge: drop it when v has a same-phase way out and
        # w has a same-phase way in.
        flag1 = 0
        flag2 = 0
        for e in sg2.out_edges(v):
            if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
                flag1 = 1
                break
        if flag1 == 1:
            for e in sg2.in_edges(w):
                if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
                    flag2 = 1
                    break
        if flag2 == 1:
            edge_to_remove.add( (v, w) )
            rv, rw = reverse_end(v), reverse_end(w)
            edge_to_remove.add( (rw, rv) )

    for v, w in list(edge_to_remove):
        sg2.remove_edge(v, w)

    # Fall back to the unfiltered graph when filtering disconnected s from t.
    try:
        s_path = nx.shortest_path(sg2, source=s_node, target=t_node, weight="score")
    except nx.exception.NetworkXNoPath:
        s_path = nx.shortest_path(sg, source=s_node, target=t_node, weight="score")

    s_path_edges = []
    for i in xrange(len(s_path)-1):
        v = s_path[i]
        w = s_path[i+1]
        sg[v][w]["weight"] = 15
        s_path_edges.append( (v,w) )

    s_path_edge_set = set(s_path_edges)

    #output the updated primary contig
    p_tig_path = open(os.path.join(out_dir, "p_ctg_path.%s" % ctg_id),"w")
    p_tig_fa = open(os.path.join(out_dir, "p_ctg.%s.fa" % ctg_id),"w")
    edges_to_remove1 = set()
    edges_to_remove2 = set()
    with open(os.path.join(out_dir, "p_ctg_edges.%s" % ctg_id), "w") as f:
        seq = []
        for v, w in s_path_edges:
            sg[v][w]["h_edge"] = 1
            # Here the read id is the part of the node id before ":".
            vrid = v.split(":")[0]
            wrid = w.split(":")[0]
            vphase = arid_to_phase.get(vrid, (-1,0))
            wphase = arid_to_phase.get(wrid, (-1,0))
            print >>f, "%s" % ctg_id, v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]

            if sg.edge[v][w]["src"] == "OP":
                edge_data = p_asm_G.sg_edges[ (v,w) ]
            else:
                edge_data = h_asm_G.sg_edges[ (v,w) ]

            seq_id, s, t = edge_data[0]
            if s < t:
                seq.append(seqs[ seq_id ][ s:t ])
            else:
                # s >= t: take the slice backwards and map each base through
                # RCMAP (presumably the complement table -- defined elsewhere).
                seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ]))
            print >>p_tig_path, "%s" % ctg_id, v, w, seq_id, s, t, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
            sg[v][w]["tig_id"] = "%s" % ctg_id

            rv, rw = reverse_end(v), reverse_end(w)
            edges_to_remove1.add( (v, w) )
            edges_to_remove2.add( (rw, rv) )

        print >> p_tig_fa, ">%s" % ctg_id
        print >> p_tig_fa, "".join(seq)

    p_tig_fa.close()
    p_tig_path.close()

    # --- 7. Isolate what is left after removing the primary path. ----------
    sg2 = sg.copy()
    # Reachability is computed before the primary-path edges are removed.
    reachable1 = nx.descendants(sg2, s_node)
    sg2_r = sg2.reverse()
    reachable2 = nx.descendants(sg2_r, t_node)

    reachable_all = reachable1 | reachable2
    reachable_both = reachable1 & reachable2

    for v, w in list(edges_to_remove2 | edges_to_remove1):
        sg2.remove_edge( v, w )

    for v, w in sg2.edges():
        if sg2[v][w]["cross_phase"] == "Y":
            sg2.remove_edge( v, w )

    for v in sg2.nodes():
        if v not in reachable_all:
            sg2.remove_node(v)

    for v in sg2.nodes():
        if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
            sg2.remove_node(v)
            continue
        if v in reachable_both:
            sg2.node[v]["reachable"] = 1
        else:
            sg2.node[v]["reachable"] = 0

    dump_graph = False # the code segment below is useful for showing the graph
    if dump_graph == True:
        nx.write_gexf(sg2, "%s_1.gexf" % ctg_id)

    p_path_nodes = set(s_path)
    p_path_rc_nodes = set( [reverse_end(v) for v in s_path] )

    # Remove the reversed ends of all of the contig's nodes from sg2.
    sg2_nodes = set(sg2.nodes())
    for v in p_asm_G.get_sg_for_ctg(ctg_id).nodes():
        rv = reverse_end(v)
        p_path_rc_nodes.add( rv )
        if rv in sg2_nodes:
            sg2.remove_node(rv)

    # --- 8. Peel paths off each remaining component and write them. --------
    h_tig_path = open(os.path.join(out_dir, "h_ctg_path.%s" % ctg_id),"w")
    h_tig_fa = open(os.path.join(out_dir, "h_ctg_all.%s.fa" % ctg_id),"w")
    edges_to_remove = set()

    labelled_node = set()
    with open(os.path.join(out_dir, "h_ctg_edges.%s" % ctg_id),"w") as f:
        h_tig_id = 1
        h_paths = {}
        #print "number of components:", len([tmp for tmp in nx.weakly_connected_component_subgraphs(sg2)])
        for sub_hg_0 in nx.weakly_connected_component_subgraphs(sg2):
            sub_hg = sub_hg_0.copy()
            # Repeatedly take the longest path until at most 5 edges remain.
            while sub_hg.size() > 5:
                #print "sub_hg size:", len(sub_hg.nodes())
                sources = [n for n in sub_hg.nodes() if sub_hg.in_degree(n) != 1 ]
                sinks = [n for n in sub_hg.nodes() if sub_hg.out_degree(n) != 1 ]

                #print "number of sources", len(sources), sources
                #print "number of sinks", len(sinks), sinks
                if len(sources) == 0 and len(sinks) == 0: #TODO, the rest of the sub-graph are circles, we need to break and print a warning message
                    break

                longest = []

                eliminated_sinks = set()
                s_longest = {}
                for s in sources:
                    #print "test source",s, len(eliminated_sinks)
                    if s in labelled_node:
                        continue
                    s_path = []
                    for t in sinks:
                        if t in eliminated_sinks:
                            continue
                        try:
                            path = nx.shortest_path(sub_hg, s, t, weight="score")
                            #print "test path len:", len(path), s, t
                        except nx.exception.NetworkXNoPath:
                            path = []
                            continue
                        s_path.append( [ path, t ] )
                    # Longest (by node count) path first.
                    s_path.sort(key = lambda x: -len(x[0]))
                    if len(s_path) == 0:
                        continue
                    s_longest[s] = s_path[0][0]
                    if len(s_longest[s]) > len(longest):
                        longest = s_longest[s]
                        #print "s longest", longest[0], longest[-1], len(longest)
                    # Sinks of this source's shorter paths are not retried.
                    for path, t in s_path[1:]:
                        eliminated_sinks.add(t)
                        #print "eliminated t", t

                if len(longest) == 0:
                    break
                s = longest[0]
                t = longest[-1]
                h_paths[ ( s, t ) ] = longest

                labelled_node.add(s)
                rs = reverse_end(s)
                labelled_node.add(rs)

                for v in longest:
                    sub_hg.remove_node(v)

        for s, t in h_paths:
            longest = h_paths[ (s, t) ]
            #print "number of node in path", s,t,len(longest)
            seq = []
            for v, w in zip(longest[:-1], longest[1:]):
                sg[v][w]["h_edge"] = 1
                if sg.edge[v][w]["src"] == "OP":
                    edge_data = p_asm_G.sg_edges[ (v,w) ]
                else:
                    edge_data = h_asm_G.sg_edges[ (v,w) ]
                vrid = v.split(":")[0]
                wrid = w.split(":")[0]
                vphase = arid_to_phase.get(vrid, (-1,0))
                wphase = arid_to_phase.get(wrid, (-1,0))
                print >>f, "%s_%03d" % (ctg_id, h_tig_id), v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]

                # NOTE(review): same lookup as a few lines above; redundant.
                if sg.edge[v][w]["src"] == "OP":
                    edge_data = p_asm_G.sg_edges[ (v,w) ]
                else:
                    edge_data = h_asm_G.sg_edges[ (v,w) ]

                seq_id, sp, tp = edge_data[0]
                if sp < tp:
                    seq.append(seqs[ seq_id ][ sp:tp ])
                else:
                    seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ sp:tp:-1 ] ]))
                print >> h_tig_path, "%s_%03d" % (ctg_id, h_tig_id), v, w, seq_id, sp, tp, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
                sg[v][w]["tig_id"] = "%s_%03d" % (ctg_id, h_tig_id)

                rv, rw = reverse_end(v), reverse_end(w)
                edges_to_remove.add( (v, w) )
                edges_to_remove.add( (rw, rv) )

            print >> h_tig_fa, ">%s_%03d" % (ctg_id, h_tig_id)
            print >> h_tig_fa, "".join(seq)
            h_tig_id += 1

    h_tig_fa.close()
    h_tig_path.close()

    dump_graph = False # the code segment below is useful for showing the graph
    if dump_graph == True:
        for v, w in sg.edges():
            if "h_edge" not in sg[v][w]:
                sg[v][w]["h_edge"] = 0
            if v in reachable_all:
                sg.node[v]["reachable"] = 1
            else:
                sg.node[v]["reachable"] = 0
            if w in reachable_all:
                sg.node[w]["reachable"] = 1
            else:
                sg.node[w]["reachable"] = 0

        nx.write_gexf(sg, "%s_0.gexf" % ctg_id)
def main():
    """Collapse validated bubbles, trace scaffold paths and write the outputs.

    Command-line driven. Reads the oriented contig graph (GML), the
    separation pairs and the contig assembly; writes the graph in GFA format,
    the members of each validated bubble, the scaffold sequences (FASTA,
    80 columns) and an AGP file.

    NOTE(review): uses Python 2 syntax (``xrange``) and networkx 1.x calls
    (``add_edge(u, v, data)`` with a positional dict, ``G.node``).
    NOTE(review): ``bub_output``, ``agpfile`` and ``assembly`` are never
    closed explicitly; only ``ofile`` is.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--assembly', help='Contig assembly', required=True)
    parser.add_argument('-g', '--oriented_graph', help='Oriented Graph of Contigs', required=True)
    parser.add_argument('-s', '--seppairs', help='Separation pairs detected in the graph', required=True)
    parser.add_argument('-o', '--output', help='Output file for scaffold sequences', required=True)
    parser.add_argument('-e', '--gfa', help='Output file for graph in GFA format', required=True)
    parser.add_argument('-f', '--agp', help='Output agp file for scaffolds', required=True)
    parser.add_argument('-b', '--bub', help='Output bubbles', required=True)
    args = parser.parse_args()
    bub_output = open(args.bub, 'w')
    G = nx.read_gml(args.oriented_graph)
    write_GFA(G, args.gfa)

    # Each line of the separation-pair file: two contig ids followed by the
    # members of the component they delimit. The pair is stored under a key
    # "<smaller>$<larger>" (string comparison).
    pairmap = {}
    pair_list = []
    with open(args.seppairs, 'r') as f:
        for line in f:
            attrs = line.split()
            if attrs[0] <= attrs[1]:
                key = attrs[0] + '$' + attrs[1]
            else:
                key = attrs[1] + '$' + attrs[0]
            pairmap[key] = attrs[2:]
            pair_list.append(key)

    validated = {}
    contig2id = {}
    cnt = 0
    '''
    OK. Lets fix this now.
    1. Validate the bubbles first and store them in a map, keep track of source and sink for each bubble
    '''
    valid_sources = {}  #valid source nodes
    valid_sink = {}  #valid sink nodes
    valid_bubble_id = 1  #valid bubble number, to be used in the new graph
    members = {}  #members of all the bubbles
    component_id_counter = 1
    valid_bubbles = {}  #store the subgraphs for the bubbles
    bubble_id_to_source = {}  #bubble to its source
    bubble_id_to_sink = {}  #bubble to its sink
    source_to_bubble = {}
    sink_to_bubble = {}
    member_to_bubble = {}
    bubble_to_graph = {}
    for key in pair_list:
        comp = pairmap[key]
        subg = G.subgraph(comp)
        contigs = key.split('$')
        # A component is only tested when none of its contigs already belongs
        # to a previously validated bubble.
        to_check = True
        for each in comp:
            if each in members:
                to_check = False
                break
        if to_check:
            res = test_pair(subg, contigs[0], contigs[1], comp)
            #component is a valid bubble
            if res:
                #add valid members to the members:
                for each in comp:
                    members[each] = 1
                    member_to_bubble[each] = str(valid_bubble_id)
                #store the source and sink of the bubble
                valid_sources[contigs[0]] = 1
                valid_sink[contigs[1]] = 1
                valid_bubbles[valid_bubble_id] = subg
                bubble_id_to_sink[valid_bubble_id] = contigs[1]
                bubble_id_to_source[valid_bubble_id] = contigs[0]
                source_to_bubble[contigs[0]] = str(valid_bubble_id)
                sink_to_bubble[contigs[1]] = str(valid_bubble_id)
                bubble_to_graph[str(valid_bubble_id)] = subg
                valid_bubble_id += 1
                line = ''
                for each in subg.nodes():
                    line += str(each) + '\t'
                bub_output.write(line + '\n')
            else:
                # Retry with source and sink swapped.
                res = test_pair(subg, contigs[1], contigs[0], comp)
                if res:
                    #add valid members to the members:
                    for each in comp:
                        members[each] = 1
                        member_to_bubble[each] = str(valid_bubble_id)
                    #store the source and sink of the bubble
                    valid_sources[contigs[1]] = 1
                    valid_sink[contigs[0]] = 1
                    valid_bubbles[valid_bubble_id] = subg
                    bubble_id_to_sink[valid_bubble_id] = contigs[0]
                    bubble_id_to_source[valid_bubble_id] = contigs[1]
                    source_to_bubble[contigs[1]] = str(valid_bubble_id)
                    sink_to_bubble[contigs[0]] = str(valid_bubble_id)
                    bubble_to_graph[str(valid_bubble_id)] = subg
                    valid_bubble_id += 1
                    line = ''
                    for each in subg.nodes():
                        line += str(each) + '\t'
                    bub_output.write(line + '\n')
    '''
    2. okay now we have all the valid bubbles. Create a new graph and add the edges which are not in the bubbles first,
    Then deal with other things.
    '''
    G_new = nx.DiGraph()
    '''
    Now add nodes for the collapsed bubbles
    '''
    for key in valid_bubbles:
        G_new.add_node(str(key))

    # Copy edges, redirecting an endpoint inside a bubble to the bubble node;
    # edges with both endpoints inside bubbles are not copied here.
    for u, v, data in G.edges(data=True):
        if u not in members and v not in members:
            G_new.add_edge(u, v, data)
        if u not in members and v in members:
            G_new.add_edge(u, member_to_bubble[v], data)
        if v not in members and u in members:
            G_new.add_edge(member_to_bubble[u], v, data)
    '''
    Now add edges from all other nodes to sources and sinks if exist
    '''
    for node in G.nodes():
        if node not in valid_sources and node not in valid_sink:
            for source in valid_sources:
                if G.has_edge(node, source):
                    data = G.get_edge_data(node, source)
                    # NOTE(review): get_edge_data appears to return the live
                    # attribute dict, so this also rewrites 'orientation' on
                    # G itself -- confirm that is intended.
                    data['orientation'] = data['orientation'][0] + 'B'
                    G_new.add_edge(node, source_to_bubble[source], data)

            for sink in valid_sink:
                if G.has_edge(sink, node):
                    data = G.get_edge_data(sink, node)
                    data['orientation'] = 'E' + data['orientation'][1]
                    G_new.add_edge(sink_to_bubble[sink], node, data)
    '''
    Now finally add edges between sources and sinks if they are in original graphs
    '''
    for source in source_to_bubble:
        for sink in sink_to_bubble:
            if source_to_bubble[source] != sink_to_bubble[sink]:
                if G.has_edge(source, sink):
                    data = G.get_edge_data(source, sink)
                    data['orientation'] = 'BE'
                    G_new.add_edge(source_to_bubble[source], sink_to_bubble[sink], data)
                if G.has_edge(sink, source):
                    data = G.get_edge_data(sink, source)
                    data['orientation'] = 'EB'
                    G_new.add_edge(sink_to_bubble[sink], source_to_bubble[source], data)
    '''
    Add node attributes now
    '''
    node_info = {}
    for node in G.nodes(data=True):
        node_info[node[0]] = node[1]

    # Nodes known to G are contigs (attributes copied over); all remaining
    # nodes of G_new are the collapsed bubbles.
    for node in G_new.nodes(data=True):
        if node[0] in node_info:
            info = node_info[node[0]]
            for each in info:
                node[1][each] = info[each]
            node[1]['type'] = 'contig'
        else:
            node[1]['type'] = 'bubble'

    # (A large commented-out earlier implementation of the bubble validation
    # and collapsing steps used to live here; see version control history.)
    '''
    In this simplified, for each weakly connected component, find out the heaviest linear path. If path
    goes through the bubble, choose the heaviest path in the bubble and continue
    '''
    alternative_contigs = [
    ]  #this stores all variants. Tag these as variants while writing to file
    primary_contigs = []
    for subg in nx.weakly_connected_component_subgraphs(G_new):
        #First get all edges
        edges = subg.edges(data=True)
        #sort edges by weights ('bsize'), heaviest first
        sorted_edges = sorted(edges, key=lambda tup: tup[2]['bsize'], reverse=True)
        #create a new graph
        G_sorted = nx.Graph()
        #Greedily add edges, heaviest first, only while both endpoint ends are
        #still unused. This is an undirected graph whose nodes are contig ends
        #named '<contig>$B' and '<contig>$E'.
        nodes = set()
        for edge in sorted_edges:
            u = edge[0]
            v = edge[1]
            data = edge[2]
            orientation = data['orientation']
            u = u + '$' + orientation[0]
            v = v + '$' + orientation[1]
            if u not in G_sorted.nodes() and v not in G_sorted.nodes():
                G_sorted.add_edge(u, v, data)
                nodes.add(u.split('$')[0])
                nodes.add(v.split('$')[0])

        #add edges between B and E nodes of same contig
        for node in nodes:
            G_sorted.add_edge(node + '$B', node + '$E')

        #now trace out all linear paths in this, each will be a scaffold
        for small_subg in nx.connected_component_subgraphs(G_sorted):
            # The two degree-1 nodes are the ends of a linear path; components
            # without exactly two such nodes are skipped.
            p = []
            for node in small_subg.nodes():
                if small_subg.degree(node) == 1:
                    p.append(node)

            if len(p) == 2:
                path = nx.shortest_path(small_subg, p[0], p[1])
                #if path has a bubble node, insert the contigs on the heaviest path on the bubble
                new_path = []
                new_path_ind = 0
                # Walk the path two ends (one contig or bubble) at a time.
                for i in xrange(1, len(path), 2):
                    node = path[i].split('$')[0]
                    if node not in bubble_to_graph:
                        new_path.append(path[i - 1])
                        new_path.append(path[i])
                        new_path_ind += 2
                        continue
                    bubble_graph = bubble_to_graph[node]
                    curr_source = ''
                    curr_sink = ''
                    for node1 in bubble_graph.nodes():
                        if node1 in source_to_bubble:
                            curr_source = node1
                        if node1 in sink_to_bubble:
                            curr_sink = node1
                    # NOTE(review): the bare except hides every failure of
                    # get_variants and silently drops the bubble from the
                    # scaffold.
                    try:
                        bubble_paths = get_variants(bubble_graph, curr_source, curr_sink)
                    except:
                        continue
                    heaviest = bubble_paths[0]
                    # Bubble entered through its 'E' end: traverse it reversed.
                    ori = path[i - 1].split('$')[1] + path[i].split('$')[1]
                    if ori == "EB":
                        heaviest.reverse()
                    for each in heaviest:
                        orient = G.node[each]['orientation']
                        if orient == 'FOW':
                            new_path.append(each + '$B')
                            new_path.append(each + '$E')
                            new_path_ind += 2
                        if orient == 'REV':
                            new_path.append(each + '$E')
                            new_path.append(each + '$B')
                            new_path_ind += 2

                    alt_paths = get_alternative_paths(bubble_graph, heaviest)
                    if len(alt_paths) > 0:
                        # NOTE(review): this loop reuses the name `i` of the
                        # enclosing loop; nothing reads the outer `i` after
                        # it within the same iteration.
                        for i in xrange(0, len(alt_paths)):
                            alt_path = []
                            curr_path = alt_paths[i]
                            for each in curr_path:
                                o_node = G.node
                                if G.node[each]['orientation'] == 'FOW':
                                    alt_path.append(each + '$B')
                                    alt_path.append(each + '$E')
                                if G.node[each]['orientation'] == 'REV':
                                    alt_path.append(each + '$E')
                                    alt_path.append(each + '$B')

                            alternative_contigs.append(alt_path)

                primary_contigs.append(new_path)

    assembly = open(args.assembly, 'r')
    sequences = parse_fasta(assembly.readlines())
    ofile = open(args.output, 'w')
    scaffolded = {}
    agpfile = open(args.agp, 'w')
    scaffold_id = 1
    # Primary scaffolds: one AGP line per contig, contigs joined by 100 'N'.
    for scaffold in primary_contigs:
        scaff_string = ''
        line = ''
        scaff_len = 0
        begin = 1
        local_comp = 0
        curr_contig = ''
        for i in xrange(0, len(scaffold) - 1, 2):
            line += 'scaffold_' + str(scaffold_id)
            line += '\t'
            line += str(begin) + '\t'
            curr = scaffold[i]
            next = scaffold[i + 1]
            curr_len = len(sequences[curr.split('$')[0]])
            scaff_len += curr_len
            last = curr_len + begin - 1
            line += str(last) + '\t'
            # NOTE(review): `begin` advances by the contig length only, while
            # 100 'N' are inserted between contigs in the sequence below, so
            # AGP coordinates do not include the gaps -- confirm intended.
            begin = last + 1
            line += str(local_comp) + '\t'
            local_comp += 1
            scaffolded[curr.split('$')[0]] = True
            scaffolded[next.split('$')[0]] = True
            contig = curr.split('$')[0]
            line += ('W\t' + contig + '\t1\t' + str(curr_len) + '\t')
            start = curr.split('$')[1]
            end = next.split('$')[1]
            # B -> E is forward; anything else is reverse-complemented.
            if start == 'B' and end == 'E':
                scaff_string += sequences[contig]
                line += '+'
            else:
                scaff_string += revcompl(sequences[contig])
                line += '-'
            agpfile.write(line + '\n')
            line = ''
            if i != len(scaffold) - 2:
                for j in xrange(0, 100):
                    scaff_string += 'N'

        chunks = [
            scaff_string[i:i + 80] for i in xrange(0, len(scaff_string), 80)
        ]
        ofile.write('>scaffold_' + str(scaffold_id) + '\n')
        for chunk in chunks:
            ofile.write(chunk + '\n')
        scaffold_id += 1

    # Alternative bubble paths: same layout, names suffixed with '_variant'.
    for scaffold in alternative_contigs:
        scaff_string = ''
        line = ''
        scaff_len = 0
        begin = 1
        local_comp = 0
        curr_contig = ''
        for i in xrange(0, len(scaffold) - 1, 2):
            line += 'scaffold_' + str(scaffold_id) + '_variant'
            line += '\t'
            line += str(begin) + '\t'
            curr = scaffold[i]
            next = scaffold[i + 1]
            curr_len = len(sequences[curr.split('$')[0]])
            scaff_len += curr_len
            last = curr_len + begin - 1
            line += str(last) + '\t'
            begin = last + 1
            line += str(local_comp) + '\t'
            local_comp += 1
            scaffolded[curr.split('$')[0]] = True
            scaffolded[next.split('$')[0]] = True
            contig = curr.split('$')[0]
            line += ('W\t' + contig + '\t1\t' + str(curr_len) + '\t')
            start = curr.split('$')[1]
            end = next.split('$')[1]
            if start == 'B' and end == 'E':
                scaff_string += sequences[contig]
                line += '+'
            else:
                scaff_string += revcompl(sequences[contig])
                line += '-'
            agpfile.write(line + '\n')
            line = ''
            if i != len(scaffold) - 2:
                for j in xrange(0, 100):
                    scaff_string += 'N'

        chunks = [
            scaff_string[i:i + 80] for i in xrange(0, len(scaff_string), 80)
        ]
        ofile.write('>scaffold_' + str(scaffold_id) + '_variant\n')
        for chunk in chunks:
            ofile.write(chunk + '\n')
        scaffold_id += 1

    # Contigs that were not placed in any scaffold are emitted on their own.
    for contig in sequences:
        if contig not in scaffolded:
            scaff_string = sequences[contig]
            chunks = [
                scaff_string[i:i + 80] for i in xrange(0, len(scaff_string), 80)
            ]
            line = ''
            line += 'scaffold_' + str(scaffold_id) + '\t'
            line += '0\t'
            line += str(len(scaff_string)) + '\t'
            line += '1\t'
            line += 'W\t' + contig + '\t1\t' + str(len(scaff_string)) + '\t+'
            agpfile.write(line + '\n')
            ofile.write('>scaffold_' + str(scaffold_id) + '\n')
            for chunk in chunks:
                ofile.write(chunk + '\n')
            scaffold_id += 1

    ofile.close()
def get_largest_wcc(G):
    """Return the largest weakly connected component of ``G`` as a new graph.

    The component with the most nodes is selected; on ties the first one
    yielded by networkx wins. The result is an independent copy, so
    modifying it does not touch ``G``.

    ``nx.weakly_connected_component_subgraphs`` was deprecated in networkx
    2.1 and removed in 2.4, so the component is built from
    ``nx.weakly_connected_components`` instead, which is available in both
    old and new releases and yields the same subgraph.

    Parameters
    ----------
    G : directed networkx graph

    Raises
    ------
    ValueError
        If ``G`` has no nodes (there is no component to pick).
    """
    print("Getting largest WCC...")
    largest_nodes = max(nx.weakly_connected_components(G), key=len)
    # .copy() keeps the historical behaviour of returning a standalone graph
    # instead of a read-only view onto G.
    return G.subgraph(largest_nodes).copy()
def compartmentalize_skeletongroup( skeleton_id_list, project_id, **kwargs ):
    """Build a directed graph between the compartments of a group of skeletons.

    Every skeleton in the group is optionally filtered, then split into its
    weakly connected components ("compartments"). Each compartment becomes a
    node named ``<skeleton_id>_<compartment_index>`` in the result graph, and
    the compartment index is also stored on every node of the skeleton graph.
    Connectors are then merged in: for each connector, an edge is added from
    every presynaptic compartment to every postsynaptic compartment, carrying
    a ``count`` and the set of contributing ``connector_ids``.

    Parameters
    ----------
    skeleton_id_list : passed through to ``SkeletonGroup``
    project_id : passed through to ``SkeletonGroup``
    **kwargs :
        ``confidence_threshold`` -> ``confidence_filtering`` is applied to each
        skeleton; otherwise ``edgecount`` -> ``edgecount_filtering``. If both
        are given only ``confidence_threshold`` is used.

    Returns
    -------
    networkx.DiGraph
        Nodes carry ``neuronname``, ``skeletonid``, ``compartment_index`` and
        ``node_count``; edges carry ``count`` and ``connector_ids``.
    """
    skelgroup = SkeletonGroup( skeleton_id_list, project_id )
    resultgraph = nx.DiGraph()

    for skeleton_id, skeleton in skelgroup.skeletons.items():
        if 'confidence_threshold' in kwargs:
            confidence_filtering( skeleton, kwargs['confidence_threshold'] )
        elif 'edgecount' in kwargs:
            edgecount_filtering( skeleton, kwargs['edgecount'] )

        # nx.weakly_connected_component_subgraphs was removed in networkx 2.4;
        # subgraph(...).copy() reproduces what it returned (independent copies).
        subgraphs = [
            skeleton.graph.subgraph(component).copy()
            for component in nx.weakly_connected_components(skeleton.graph)
        ]

        for i, subg in enumerate(subgraphs):
            for nodeid, d in subg.nodes(data=True):
                d['compartment_index'] = i
                # Also tag the original graph so connectors can be mapped to
                # compartments by treenode id below.
                skeleton.graph.nodes[nodeid]['compartment_index'] = i

            # Long neuron names are truncated to keep node labels readable.
            if len(skeleton.neuron.name) > 30:
                neuronname = f'{skeleton.neuron.name[:30]}... [{i}]'
            else:
                neuronname = f'{skeleton.neuron.name} [{i}]'

            resultgraph.add_node(f'{skeleton_id}_{i}', **{
                'neuronname': neuronname,
                'skeletonid': str(skeleton_id),
                'compartment_index': i,
                'node_count': subg.number_of_nodes(),
            })

    connectors:Dict = {}
    for skeleton_id, skeleton in skelgroup.skeletons.items():
        for connector_id, v in skeleton.connected_connectors.items():
            if connector_id not in connectors:
                connectors[connector_id] = {
                    'pre': [], 'post': []
                }

            # Add the compartment id for each treenode listed for the
            # connector. This can duplicate entries, which is correct: each
            # treenode contributes once to the edge count.
            for e in v['presynaptic_to']:
                skeleton_compartment_id = f'{skeleton_id}_{skeleton.graph.nodes[e]["compartment_index"]}'
                connectors[connector_id]['pre'].append( skeleton_compartment_id )

            for e in v['postsynaptic_to']:
                skeleton_compartment_id = f'{skeleton_id}_{skeleton.graph.nodes[e]["compartment_index"]}'
                connectors[connector_id]['post'].append( skeleton_compartment_id )

    # merge connectors into graph
    for connector_id, v in connectors.items():
        for from_skeleton in v['pre']:
            for to_skeleton in v['post']:
                if not resultgraph.has_edge( from_skeleton, to_skeleton ):
                    resultgraph.add_edge(from_skeleton, to_skeleton, **{
                        'count': 0,
                        'connector_ids': set(),
                    })
                # Graph.edge was removed in networkx 2.0; use the edge view,
                # matching the graph.nodes[...] access used above.
                edge_data = resultgraph.edges[from_skeleton, to_skeleton]
                edge_data['count'] += 1
                edge_data['connector_ids'].add( connector_id )

    return resultgraph
######################################################################
# Store figures of call graph
######################################################################
# Nothing is drawn unless an output file name was supplied.
if options['output']:
    def name2URL(name):
        # Bind the file list so callers only have to pass the name.
        return name2URLheader(name, file_list)

    # Draw the whole call graph
    A = graphBrownie.to_abigraph(
        options['styles'],
        type=defs_type,
        max_label=40,
        name2URL=name2URL,
    )
    A.layout("dot")
    # The output path is split so a running index can be inserted between
    # the base name and the extension for the per-component figures.
    nameOut, extOut = os.path.splitext(options['output'])
    A.draw("%s%s" % (nameOut, extOut))

    # Draw subgraphs if needed
    if options['subgraphs']:
        from networkx import to_agraph, weakly_connected_component_subgraphs
        i = 0
        typed_graph = graphBrownie.clone(type=defs_type)
        for subgraphBrownie in weakly_connected_component_subgraphs(typed_graph):
            # Components with a single node get no figure and no index.
            if len(subgraphBrownie.nodes()) <= 1:
                continue
            i += 1
            # A = subgraphBrownie.to_agraph()
            A = subgraphBrownie.to_abigraph(options['styles'], max_label=40)
            A.layout('dot')
            A.draw("%s%i%s" % (nameOut, i, extOut))
def _parse_loop_graph(self, subg, bigg):
    """
    Create a Loop object for a strongly connected graph, and any strongly connected subgraphs, if possible.

    Edges that enter the loop body from outside (entry edges) and edges that leave it (break edges) are
    collected from ``bigg`` and also added to ``subg``. Edges from a body node back to the entry node are
    recorded as continue edges and removed from a copy of ``subg`` before that copy is searched for
    nested loops with ``self._parse_loops_from_graph``.

    :param subg:    A strongly connected subgraph. It is modified in place (edges are added and removed).
    :param bigg:    The graph which subg is a subgraph of.

    :return:        A tuple ``(loop, loops)``: the Loop built for ``subg`` and a list starting with that Loop
                    followed by the loops reported for the nested search. ``(None, [])`` is returned when
                    more than one body node has a predecessor outside the loop body.
    """
    # NOTE(review): slicing the result of nodes() requires it to be a list (networkx 1.x behaviour) -- confirm
    # the networkx version this file targets.
    loop_body_nodes = subg.nodes()[:]
    entry_edges = []
    break_edges = []
    continue_edges = []
    entry_node = None
    for node in loop_body_nodes:
        for pred_node in bigg.predecessors(node):
            if pred_node not in loop_body_nodes:
                # A second, different node reached from outside the body means the loop has several entries.
                if entry_node is not None and entry_node != node:
                    l.warning("Bad loop: more than one entry point (%s, %s)", entry_node, node)
                    return None, []
                entry_node = node
                entry_edges.append((pred_node, node))
                subg.add_edge(pred_node, node)
        for succ_node in bigg.successors(node):
            if succ_node not in loop_body_nodes:
                break_edges.append((node, succ_node))
                subg.add_edge(node, succ_node)
    if entry_node is None:
        # No body node has an outside predecessor: fall back to the node with the smallest address.
        entry_node = min(loop_body_nodes, key=lambda n: n.addr)
        l.info("Couldn't find entry point, assuming it's the first by address (%s)", entry_node)

    # Work on a copy with the back edges to the entry node removed, so the nested search does not see the
    # cycle formed through the entry node.
    acyclic_subg = subg.copy()
    for pred_node in subg.predecessors(entry_node):
        if pred_node in loop_body_nodes:
            continue_edge = (pred_node, entry_node)
            acyclic_subg.remove_edge(*continue_edge)
            continue_edges.append(continue_edge)

    # NOTE(review): removed_exits is only ever written in this method; both fallback branches below consult
    # removed_entries -- confirm this is intended.
    removed_exits = {}
    removed_entries = {}
    # presumably tops are the outermost nested loops and alls every nested loop -- verify against
    # _parse_loops_from_graph
    tops, alls = self._parse_loops_from_graph(acyclic_subg)
    for subloop in tops:
        if subloop.entry in loop_body_nodes:
            # break existing entry edges, exit edges
            # re-link in loop object
            # the exception logic is to handle when you have two loops adjacent to each other
            # you gotta link the two loops together and remove the dangling edge
            for entry_edge in subloop.entry_edges:
                try:
                    subg.remove_edge(*entry_edge)
                except networkx.NetworkXError:
                    # The edge is already gone; only tolerated when an earlier subloop recorded it.
                    if entry_edge in removed_entries:
                        subg.add_edge(removed_entries[entry_edge], subloop)
                        try:
                            subg.remove_edge(removed_entries[entry_edge], entry_edge[1])
                        except networkx.NetworkXError:
                            pass
                    else:
                        raise
                else:
                    # Edge removed normally: route its source to the subloop object instead.
                    subg.add_edge(entry_edge[0], subloop)
                    removed_entries[entry_edge] = subloop

            for exit_edge in subloop.break_edges:
                try:
                    subg.remove_edge(*exit_edge)
                except networkx.NetworkXError:
                    if exit_edge in removed_entries:
                        subg.add_edge(subloop, removed_entries[exit_edge])
                        try:
                            subg.remove_edge(exit_edge[0], removed_entries[exit_edge])
                        except networkx.NetworkXError:
                            pass
                    else:
                        raise
                else:
                    # Edge removed normally: route the subloop object to the edge's target instead.
                    subg.add_edge(subloop, exit_edge[1])
                    removed_exits[exit_edge] = subloop
    # Keep only the weakly connected component that contains the entry node.
    # NOTE(review): indexing the result of filter() requires it to return a list (Python 2) -- confirm the
    # interpreter this file targets.
    subg = filter(lambda g: entry_node in g.nodes(), networkx.weakly_connected_component_subgraphs(subg))[0]
    me = Loop(entry_node, entry_edges, break_edges, continue_edges, loop_body_nodes, subg, tops[:])
    return me, [me] + alls
# NOTE(review): `line_type` and `line` are set outside this excerpt
# (presumably once per tab-split GFA record) -- confirm the nesting of the
# two checks below against the enclosing loop.

# Segment record, e.g.:
# S 238024 ACCAATTAT KC:i:37210
if line_type == "S":
    v_name = int(line[1])
    v_length = len(line[2])
    G.add_node(v_name, length=v_length)

# Link record, e.g.:
# L 238322 + 19590 - 55M
if line_type == "L":
    v1 = int(line[1])
    v2 = int(line[3])
    G.add_edge(v1, v2)

# Keep only the largest weakly connected component.
# nx.weakly_connected_component_subgraphs was removed in networkx 2.4, so
# the component is built from the node sets instead; an empty graph yields
# an empty graph, as before.
largest_nodes = max(nx.weakly_connected_components(G), key=len, default=())
new_G = G.subgraph(largest_nodes).copy()
G = new_G.copy()

# Table with references.
# Read the answer file as it is: one edge per line, the edge id followed by
# a tab and the strains.
df_ref = pd.read_csv("refs/refs_edges.txt", header=None, names=["e"])
# `n` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
df_ref = df_ref["e"].str.split('\t', n=1, expand=True)
df_ref.columns = ["e_id", "strains"]
df_ref = df_ref.set_index("e_id")
df_ref.index = df_ref.index.astype("int")

# Keep only the edges that belong to the large component.
df_ref = df_ref.loc[list(G.nodes)]
def supplementary4():
    """Correlate BONITA's node impact score with graph theoretical measures.

    For every pathway code returned by ``getCodes()`` the pathway graph is
    read from ``gpickles/<code>.gpickle`` and centrality measures are computed
    on its largest weakly connected component. Pathways whose largest
    component has fewer than three nodes are skipped.

    For iterations ``1 .. maxReps - 1`` the model is rebuilt from
    ``pickles/<code>_<iteration>_local1.pickle`` and the node scores are read
    from ``pickles/<code>_<iteration>_scores1.pickle``; when that file is
    missing all scores for the iteration are set to 0.

    The per-node values are averaged over iterations, and the Pearson and
    Spearman correlation matrices between the measures are written as
    heatmaps to ``Overall_Pearson_correlation.svg`` and
    ``Overall_Spearman_correlation.svg`` in the working directory.

    Returns nothing; the figures are the output.
    """
    codes = getCodes()
    maxReps = 6

    # Order of the values stored for every node.
    measureColumns = [
        "ImportanceVals", "degreeCentrality", "eigenCentrality", "hubs",
        "auth", "inDegree", "cfCentrality", "eccentCentrality",
        "betweenCentrality"
    ]
    # inDegree is stored per node but is not aggregated or correlated.
    correlColumns = [col for col in measureColumns if col != "inDegree"]
    tickLabels = [
        "BONITA Score", "Degree Centrality", "Eigenvector Centrality",
        "Hub Score", "Authority", "Current Flow Centrality",
        "Eccentricity Centrality", "Betweenness Centrality"
    ]

    allRes = {}  # three-layered dictionary to store results: pathway -> iteration -> node
    for code in codes:
        allRes[str(code)] = {}
        originalGraph = nx.read_gpickle("gpickles/" + code + ".gpickle")

        # Graph theoretic measures.
        # Use the largest weakly connected component to get around the
        # problem of disconnected graphs. Refs:
        # https://stackoverflow.com/questions/26637644/in-r-how-do-igraph-and-statnet-handle-disconnected-graphs-in-measuring-network,
        # http://reports-archive.adm.cs.cmu.edu/anon/isr2011/CMU-ISR-11-113.pdf
        # (nx.weakly_connected_component_subgraphs was removed in networkx
        # 2.4; subgraph(...).copy() yields the same graph.)
        graph = originalGraph.subgraph(
            max(nx.weakly_connected_components(originalGraph),
                key=len)).copy()
        if len(graph) < 3:
            continue

        # The numpy variant gets around the failure when there are multiple
        # eigenvalues with the same (largest) magnitude, perhaps when there
        # are few peripheral nodes (star-like graph). See:
        # https://stackoverflow.com/questions/43208737/using-networkx-to-calculate-eigenvector-centrality?rq=1
        eigenCentrality = nx.eigenvector_centrality_numpy(graph)
        # Parameters changed so that the calculation converges; max_iter was
        # raised to 10000 after running into a divide-by-zero error. For an
        # alternative approach see:
        # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.415.843&rep=rep1&type=pdf
        # NB: the original publication suggests a max_iter of 20.
        hubs, authorities = nx.hits(
            graph, max_iter=10000, tol=1.0e-7, normalized=True)
        degreeCentrality = nx.degree_centrality(graph)
        undirectedGraph = graph.to_undirected()
        cfCentrality = nx.current_flow_closeness_centrality(undirectedGraph)
        eccentCentrality = nx.eccentricity(undirectedGraph)
        betweenCentrality = nx.betweenness_centrality(graph)

        for iteration in range(1, maxReps):
            allRes[str(code)][str(iteration)] = {}
            pickleFile = str('pickles/' + code + '_' + str(iteration) +
                             '_local1.pickle')
            with open(pickleFile, 'rb') as handle:
                outputList = pickle.load(handle)  # python2 version
                # outputList = pickle.load(handle, encoding='latin1')  # python3 version
            # The pickle holds exactly six entries; only the fourth is used.
            bruteOut1, dev, storeModel, storeModel3, equivalents, dev2 = outputList
            model1 = modelHolder(storeModel3)

            # The path has to be built exactly like it is opened below; the
            # previous check tested a mis-quoted literal that never exists,
            # so the scores were always replaced by zeros.
            scoresFile = ('pickles/' + code + '_' + str(iteration) +
                          '_scores1.pickle')
            if os.path.isfile(scoresFile):
                with open(scoresFile, "rb") as handle:
                    # assumes the stored scores are indexable by position in
                    # model1.nodeList -- TODO confirm
                    pathVals = pickle.Unpickler(handle).load()
            else:
                print(
                    "Importance scores not found, setting all values to 0")
                pathVals = [0] * len(model1.nodeList)

            iterationRes = allRes[str(code)][str(iteration)]
            for index, nodeName in enumerate(model1.nodeList):
                ImportanceVals = pathVals[index]
                inDegree = originalGraph.in_degree(nodeName)
                if nodeName in graph:
                    # remember that we have just selected the largest
                    # component of the graph for graph theoretic analysis
                    iterationRes[str(nodeName)] = [
                        ImportanceVals, degreeCentrality[nodeName],
                        eigenCentrality[nodeName], hubs[nodeName],
                        authorities[nodeName], inDegree,
                        cfCentrality[nodeName], eccentCentrality[nodeName],
                        betweenCentrality[nodeName]
                    ]
                else:
                    # Nodes outside the largest component have no measures.
                    iterationRes[str(nodeName)] = [
                        ImportanceVals,
                        float('NaN'),
                        float('NaN'),
                        float('NaN'),
                        float('NaN'), inDegree,
                        float('NaN'),
                        float('NaN'),
                        float('NaN')
                    ]

    # Flatten to one row per "pathway:iteration:node" key; column 0 holds the
    # key and column 1 the list of measures.
    allRes_flat = flatdict.FlatDict(allRes)
    allRes_df = pd.DataFrame(allRes_flat.iteritems())
    allRes_df[measureColumns] = pd.DataFrame(
        [item for sublist in allRes_df[[1]].values for item in sublist],
        index=allRes_df.index)
    allRes_df[["Pathway", "Iteration", "Node"]] = pd.DataFrame(
        [x[0].split(":", 2) for x in allRes_df[[0]].values],
        index=allRes_df.index)
    allRes_df[measureColumns] = allRes_df[measureColumns].apply(
        pd.to_numeric, axis=1)

    # Aggregate results by iteration
    allRes_df = allRes_df.groupby(['Node', 'Pathway'])
    # Select the columns with a list: indexing a groupby with a bare tuple of
    # column names is no longer supported by pandas.
    allRes_df = allRes_df[correlColumns].agg(np.mean)

    # Overall Pearson and Spearman correlation between importance metrics
    for method, label in (('pearson', 'Pearson'), ('spearman', 'Spearman')):
        sns.set_context(context='paper', font_scale=1.1)
        sns.set_style("ticks")
        temp_correl = allRes_df.loc[:, correlColumns].corr(method=method)
        # Hide the upper triangle; the matrix is symmetric.
        mask = np.triu(temp_correl, k=1)
        figTemp = sns.heatmap(
            temp_correl,
            xticklabels=tickLabels,
            yticklabels=tickLabels,
            mask=mask,
            square=True,
            vmax=1,
            vmin=-1,
            center=0,
            cmap='RdBu_r',
            linewidths=.5,
            cbar_kws={
                "shrink": .5,
                'label': label + ' Correlation'
            },
            annot=True)
        plt.xticks(rotation=90)
        figTemp.figure.tight_layout()
        figTemp = figTemp.get_figure()
        figTemp.savefig("Overall_" + label + "_correlation.svg")
        plt.close()