def centralities(g, user_map):
    """Use graph_tool to calculate 7 centralities."""
    # in degree
    ki = g.degree_property_map('in')
    # out degree
    ko = g.degree_property_map('out')
    # weighted in degree
    si = g.degree_property_map('in', weight=g.ep['weight'])
    # weighted out degree
    so = g.degree_property_map('out', weight=g.ep['weight'])
    # pagerank
    pr = gt.pagerank(g)
    # betweenness
    vb, eb = gt.betweenness(g)
    # eigenvector
    e, ev = gt.eigenvector(g)
    # screen_name
    screen_name = user_map.loc[g.vp['raw_id'].a.copy()].values
    df = pd.DataFrame(
        dict(screen_name=screen_name,
             in_degree=ki.a,
             out_degree=ko.a,
             weighted_in_degree=si.a,
             weighted_out_degree=so.a,
             page_rank=pr.a,
             betweenness=vb.a,
             eigenvector=ev.a))
    df.to_csv('centralities.raw.csv')
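# Hedged aside: the snippet above weights the degree maps but leaves PageRank
# unweighted; gt.pagerank accepts the same edge weight map, so a weighted
# variant is a one-liner (a sketch, assuming the g and g.ep['weight'] used above):
pr_weighted = gt.pagerank(g, weight=g.ep['weight'])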
def graph_measures(graph: gt.Graph) -> pd.DataFrame:
    def get_attrs(attrs):
        return (attrs[1][0], attrs[1][1][1], attrs[0])

    def append_val(key, prop, v):
        measures[key][0].append(prop[v])

    _, vp_authority, vp_hub = gt.hits(graph)
    measures = {
        key: ([], prop)
        for key, prop in {
            'tp_group': graph.vp.group_name,
            'tp_author': graph.vp.username,
            'tn_degree_in': graph.degree_property_map('in'),
            'tn_degree_out': graph.degree_property_map('out'),
            'tn_degree_total': graph.degree_property_map('total'),
            'tn_pagerank': gt.pagerank(graph),
            'tn_betweenness': gt.betweenness(graph)[0],
            'tn_closeness': gt.closeness(graph),
            'tn_eigenvector': gt.eigenvector(graph)[1],
            'tn_authority': vp_authority,
            'tn_hub': vp_hub,
            'tn_lcc': gt.local_clustering(graph)
        }.items()
    }
    for attrs in product(graph.vertices(), measures.items()):
        append_val(*get_attrs(attrs))
    return pd.DataFrame(
        dict(map(lambda item: (item[0], item[1][0]),
                 measures.items()))).fillna(0)
def findBestChilds(self, nodes, k=4):
    stateGraph = gt.Graph()
    node_list = stateGraph.new_vertex_property("string")
    node_parents = stateGraph.new_edge_property("object")
    for node in nodes:
        self.addDirectedLink(node, nodes, stateGraph, node_list, node_parents)
    important_v = set()  # initialized here so the lookup below cannot hit an unbound name
    try:
        #self.logger.debug(len(stateGraph))
        h = gt.pagerank(stateGraph)
        vertices = dict()
        for vertex in stateGraph.vertices():
            vertices[stateGraph.vertex_index[vertex]] = h[vertex]
        res = list(sorted(vertices, key=vertices.__getitem__, reverse=True))
        important = res[:k]
        # vertices 0 and 1 are the fixed start nodes; never report them
        for fixed in (0, 1):
            if fixed in important:
                important.remove(fixed)
        important_v = {stateGraph.vertex(i) for i in important}
    except Exception:
        self.logger.error('Graph is empty')
        self.logger.error(sys.exc_info())
    dereffed_list = set(node_list[i] for i in important_v)
    dereffed_list.discard(0)
    dereffed_list.discard(1)
    return list(dereffed_list)
def calc_pagerank(g: gt.Graph) -> List[Tuple[int, str, float]]:
    """
    Return: sorted list of tuples, [(vertex_idx, wk_title, pagerank_value), ...]
    """
    vp_label = g.vp['_graphml_vertex_id']  # same as wktitle
    pr = gt.pagerank(g)
    ranks = [(g.vertex_index[v], vp_label[v], pr[v]) for v in g.vertices()]
    ranks = sorted(ranks, key=lambda e: -e[-1])
    return ranks
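# A minimal usage sketch for calc_pagerank, assuming a GraphML file (the path
# 'wiki.graphml' is hypothetical) whose vertices carry the
# '_graphml_vertex_id' property the function reads:
import graph_tool.all as gt

g = gt.load_graph('wiki.graphml')
for idx, title, score in calc_pagerank(g)[:10]:
    print(idx, title, score)  # top-10 vertices by PageRank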
def add_pagerank():
    pr = gt.pagerank(g, damping=0.5)
    with open("all_AAN_with_fellows.csv", "r") as old, \
         open("all_AAN_with_fellows_and_pagerank.csv", "w+") as new:
        header = next(old)  # Python 3: file objects have no .next() method
        new.write(header.strip() + ",pagerank\n")
        for line in old:
            old_data = line.strip().split(",")
            gt_index = int(old_data[0])
            new_data = line.strip() + "," + str(pr.a[gt_index]) + "\n"
            new.write(new_data)
def add_pagerank():
    g = gt.load_graph(
        "/home/mrunelov/KTH/exjobb/SICS-cite/APS/data/APS.graphml")
    pr = gt.pagerank(g, damping=0.5)
    with open("all_APS_with_fellows.csv", "r") as old, \
         open("all_APS_with_fellows_and_pagerank.csv", "w+") as new:
        header = next(old)  # Python 3 replacement for old.next()
        new.write(header.strip() + ",pagerank\n")
        for line in old:
            old_data = line.strip().split(",")
            gt_index = int(old_data[0])
            new_data = line.strip() + "," + str(pr.a[gt_index]) + "\n"
            new.write(new_data)
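# A pandas-based sketch of the same CSV join done line-by-line in the two
# add_pagerank variants above. It assumes, as they do, that the first CSV
# column is the graph-tool vertex index; paths are taken from the APS variant:
import pandas as pd
import graph_tool.all as gt

g = gt.load_graph("/home/mrunelov/KTH/exjobb/SICS-cite/APS/data/APS.graphml")
pr = gt.pagerank(g, damping=0.5)
df = pd.read_csv("all_APS_with_fellows.csv")
df["pagerank"] = pr.a[df.iloc[:, 0].astype(int).values]  # index into the pagerank array
df.to_csv("all_APS_with_fellows_and_pagerank.csv", index=False)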
def get_pagerank_values(self):
    start = time.time()
    logger.info('Started call to get_pagerank')
    g = Graph()
    vp = g.add_edge_list(self.__v.get_graph_edges(),
                         hashed=True, hash_type='int')
    logger.info('Delta time to build graph: {}s'.format(
        timedelta(seconds=(time.time() - start))))
    start = time.time()
    ranks = pagerank(g)
    logger.info('Delta time to compute pagerank: {}s'.format(
        timedelta(seconds=(time.time() - start))))
    for vertex in g.vertices():
        qid = vp[vertex]
        r = ranks[vertex]
        yield qid, r
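# Hedged usage sketch: get_pagerank_values is a generator, so the (qid, rank)
# pairs can be streamed straight into a DataFrame. 'miner' is a hypothetical
# instance of whatever class defines the method above:
import pandas as pd

ranks_df = pd.DataFrame(miner.get_pagerank_values(),
                        columns=['qid', 'pagerank'])
ranks_df = ranks_df.sort_values('pagerank', ascending=False)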
def pr_curves():
    plt.figure().set_facecolor('white')
    lra = logit()
    in_degs_gt = g.degree_property_map("in")
    in_degs = in_degs_gt.a.astype("float")
    in_degs = in_degs / in_degs.max()
    with open("burst_list.pickle", "rb") as f:
        bla = np.asarray(pickle.load(f)).astype("float")
    bla /= bla.max()
    with open("vpa-between2.pickle", "rb") as f:
        ba = np.asarray(pickle.load(f))
    ba /= ba.max()
    geometric_mean = bla
    geometric_mean *= ba
    geometric_mean = np.sqrt(geometric_mean)
    with open("Px_list.pickle", "rb") as f:
        pxa = np.asarray(pickle.load(f)).astype("float")
    pxa_n = pxa / pxa.max()
    with open("Px_list_weighted.pickle", "rb") as f:
        pxwa = np.asarray(pickle.load(f)).astype("float")
    pxwa_n = pxwa / pxwa.max()
    pr = gt.pagerank(g, damping=0.5)
    plots = [[] for _ in range(8)]
    print("Calculating for num_top = " + str(num_top))  # Python 3 print
    plots[0] = find_fellows_in_top_scores(in_degs, "indegree", num_top,
                                          printstuff=False)
    plots[1] = find_fellows_in_top_scores(pr.a, "PageRank alpha 0.5",
                                          num_top, printstuff=False)
    plots[2] = find_fellows_in_top_scores(ba, "betweenness", num_top,
                                          printstuff=False)
    plots[3] = find_fellows_in_top_scores(pxa_n, "progeny size", num_top,
                                          printstuff=False)
    plots[4] = find_fellows_in_top_scores(pxwa_n, "Weighted progeny size",
                                          num_top, printstuff=False)
    plots[5] = find_fellows_in_top_scores(lra, "All with logit coefficients",
                                          num_top, printstuff=False)
    plots[6] = find_fellows_in_top_scores(geometric_mean,
                                          "sqrt(between*burst)", num_top,
                                          printstuff=False)
    geometric_mean *= np.sqrt(in_degs)
    plots[7] = find_fellows_in_top_scores(geometric_mean,
                                          "sqrt(between*burst*indegs)",
                                          num_top, printstuff=False)

    #print("Plotting...")
    # Precision-Recall (disabled)
    #for p in plots:
    #    plt.plot(*zip(*p), linewidth=2.0)
    #ax = plt.subplot()
    #total_fellow_articles = len(fellow_indexes)
    ##total_fellow_articles = 139755  # manual total within 1980 filter
    #y_random = float(total_fellow_articles) / num_top  # 527130
    #ax.plot([0.0, 1.0], [y_random, y_random], ls="--", c="0.5", linewidth=2.0)
    #leg = plt.legend([r'$\mathrm{Indegree}$', r'$\mathrm{Betweenness}$',
    #                  r'$\mathrm{Backbone\/ progeny\/ size}$', r'$\mathrm{Logit}$',
    #                  r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}}$',
    #                  r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}\times\/\mathrm{indegree}}$',
    #                  r'$\mathrm{PageRank,\/} \alpha=0.5$',
    #                  r'$\mathrm{Random\/ retrieval}$'], loc='best', fontsize=18)
    #for obj in leg.legendHandles:
    #    obj.set_linewidth(4.0)
    #plt.xlabel(r'$\mathrm{Recall}$', fontsize=24)
    #plt.ylabel(r'$\mathrm{Precision}$', fontsize=24)
    #plt.show()

    # Precision @ n
    lss = ["-"] * 8
    lss[0] = "--"
    lss[1] = "--"
    clr = ['b', 'k', 'g', 'r', 'r', 'c', 'm', 'y']
    if num_top <= 1000:
        xs = range(10, num_top + 1, 10)
    #plt.tick_params(labelsize=18)
    #plt.figure(2)
    #plt.title(r"$\mathrm{Precision\/ @\/ X}$")
    plt.title(r"$\mathrm{DCG\/ @\/ X}$")
    plt.figure().set_facecolor('white')
    if num_top <= 1000:
        plt.xlabel(r'$\mathrm{@}$', fontsize=32)
    else:
        plt.xlabel(r'$\mathrm{Recall}$', fontsize=32)
    plt.ylabel(r'$\mathrm{Precision}$', fontsize=32)
    #plt.ylabel(r'$\mathrm{DCG}$', fontsize=24)
    if use_cutoff:
        total_fellow_articles = 139755  # manual total within 1980 filter
        y_random = float(total_fellow_articles) / 427735  # 427735 is for cutoff (527129 without)
    else:
        total_fellow_articles = len(fellow_indexes)
        y_random = float(total_fellow_articles) / 527129
    i = 0
    for p in plots[:2]:
        if num_top > 1000:
            xs = [point[0] for point in p]
        #ys = [point[1] for point in p]
        ys = p  # for DCGs
        plt.plot(xs, ys, linewidth=2.0, ls=lss[i], color=clr[i])
        i += 1
    ax = plt.subplot()
    ax.tick_params(labelsize=18)
    if num_top <= 1000:
        ax.plot([0.0, num_top], [y_random, y_random], ls="--", c="0.5",
                linewidth=2.0)
    else:
        ax.plot([0.0, 1], [y_random, y_random], ls="--", c="0.5",
                linewidth=2.0)
    for p in plots[2:]:
        if num_top > 1000:
            xs = [point[0] for point in p]
        #ys = [point[1] for point in p]
        ys = p  # for DCGs
        if i == 4:
            plt.plot(xs, ys, linewidth=2.0, ls=lss[i], color=clr[i],
                     marker='D')
        else:
            plt.plot(xs, ys, linewidth=2.0, ls=lss[i], color=clr[i])
        i += 1
    leg = plt.legend([
        r'$\mathrm{Indegree}$',
        r'$\mathrm{PageRank,\/} \alpha=0.5$',
        r'$\mathrm{Random\/ retrieval}$',
        r'$\mathrm{Betweenness}$',
        r'$\mathrm{Backbone\/ progeny\/ size}$',
        r'$\mathrm{Weighted\/ backbone\/ progeny\/ size}$',
        r'$\mathrm{Logit}$',
        r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}}$',
        r'$\sqrt{\mathrm{betweenness}\times\/\mathrm{burstness}\times\/\mathrm{indegree}}$'
    ], loc='upper right', fontsize=24)
    for obj in leg.legendHandles:
        obj.set_linewidth(2.0)
    plt.show()
def process(name, g):
    # Properties
    vp_pos = gt.sfdp_layout(g)
    vp_deg = g.degree_property_map('total')
    vp_deg_log = g.new_vp('double')
    vp_deg_log.a = np.log10(vp_deg.a)
    vp_cls = gt.closeness(g)
    vp_page = gt.pagerank(g)
    vp_btw, ep_btw = gt.betweenness(g, norm=False)

    # Colormaps
    for cmap in ['viridis', 'plasma', 'inferno', 'YlGnBu', 'Blues', 'Greys',
                 'Greens', 'Oranges']:
        draw_graph(g, vp_pos, f'{name}.prop=deg.color={cmap}.png',
                   vp_color=vp_deg, vcmap=cmap)
        draw_graph(g, vp_pos, f'{name}.prop=deg_log.color={cmap}.png',
                   vp_color=vp_deg_log, vcmap=cmap)
        draw_graph(g, vp_pos, f'{name}.prop=cls.color={cmap}.png',
                   vp_color=vp_cls, vcmap=cmap)
        draw_graph(g, vp_pos, f'{name}.prop=page.color={cmap}.png',
                   vp_color=vp_page, vcmap=cmap)
        draw_graph(g, vp_pos, f'{name}.prop=btw.color={cmap}.png',
                   vp_color=vp_btw, vcmap=cmap)

    # Construct dicts for D3-style JSON
    nodes = []
    for u in g.vertices():
        p = vp_pos[u]
        nodes.append({
            'x': p[0],
            'y': p[1],
            'deg': vp_deg[u],
            'deg_log': vp_deg_log[u],
            'cls': vp_cls[u],
            'page': vp_page[u],
            'btw': vp_btw[u],
        })
    vp_idx = g.vertex_index
    links = [{
        'source': vp_idx[e.source()],
        'target': vp_idx[e.target()],
    } for e in g.edges()]

    # Save D3-style JSON
    d = {'nodes': nodes, 'links': links}
    with open(f'{name}.json', 'w') as f:
        json.dump(d, f)
# import packages
import graph_tool.all as gt

# ------------------------------------------------------
# Variables:
NETWORK_FILE = ""
NETWORK_FEATURE_FILE = ""
# ------------------------------------------------------

# import the graph
g = gt.load_graph(NETWORK_FILE)

# calculate the features using built-in graph_tool functions
# PageRank
rank = gt.pagerank(g)
print("pagerank has been calculated")

# HITS y-hubs and x-authorities
eigenvalue, xauthorities, yhubs = gt.hits(g)
print("HITS values have been calculated")

# betweenness centrality
between_vp, between_ep = gt.betweenness(g)
print("betweenness centrality has been calculated")

# save external property maps as internal ones; this makes the features
# accessible in the future when loading the graph
g.vertex_properties["page_rank"] = rank
g.vertex_properties["x_authorities"] = xauthorities
g.vertex_properties["y_hubs"] = yhubs
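# The closing comment above says the internal property maps make the features
# available when the graph is reloaded; a minimal sketch of that round trip
# (NETWORK_FEATURE_FILE is left empty above, so a concrete path is an assumption):
g.save(NETWORK_FEATURE_FILE)  # e.g. "network_with_features.gt"
g2 = gt.load_graph(NETWORK_FEATURE_FILE)
print(g2.vertex_properties["page_rank"].a[:10])  # the stored PageRank values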
def iterateMatrix(self, blacklist=set(), additionalRes=set(), kp=75):
    """Iteration phase.

    During this phase the children of the current bottom-level nodes are
    fetched and added to the hashed set.

    **Parameters**

    blacklist : set, optional (default = empty)
        set of resource predicates to exclude from the pathfinding algorithm

    additionalRes : set, optional (default = empty)
        set of resources to include anyway in the next iteration

    **Returns**

    response : stateGraph
        contains the updated state graph after fetching new resources
    """
    self.logger.info('--- NEW ITERATION ---')
    #self.logger.info('Existing resources {0}'.format(str(len(self.resources.property_list()))))
    #self.logger.info('Indexed resources by parents {0}'.format(str(len(self.resources_by_parent))))
    self.logger.info('Grandmother: {0}'.format(
        self.resources[self.stateGraph.vertex(0)]))
    self.logger.info('Grandfather: {0}'.format(
        self.resources[self.stateGraph.vertex(1)]))
    self.logger.info('--- --- ---')
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    prevResources = set()
    additionalResources = dict()
    i = 0
    for v in self.stateGraph.vertices():
        i += 1
        if v not in self.added and v not in self.unimportant:
            prevResources.add(self.resources[v])

    reqs = list()
    if len(additionalRes) == 0:
        for resource in prevResources:
            self.added.add(resource)
        for url in self.resourceretriever.genMultiUrls(prevResources):
            reqs.append(url)
    else:
        self.logger.info('Special search iteration: Deep search')
        for resource in additionalRes:
            self.added.add(resource)
        for url in self.resourceretriever.genMultiUrls(additionalRes):
            reqs.append(url)

    if len(reqs) > 0:
        resps = list()
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            for res in reqs:
                # Start the load operations and mark each future with its URL
                future_to_url = {
                    executor.submit(requests.get, url): url
                    for url in res['urls']
                }
                for future in concurrent.futures.as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        response = dict()
                        response['resources'] = res['resources']
                        response['results'] = future.result()
                        resps.append(response)
                    except Exception as exc:
                        self.logger.error('%r generated an exception: %s'
                                          % (url, exc))
                    else:
                        self.logger.debug('retrieved results for %r' % (url,))
        #todo move http gets in threads vs async grequests
        self.worker.startQueue(self.resourceretriever.processMultiResource,
                               num_of_threads=64)
        for rp in resps:
            item = [rp['resources'], rp['results'], self.resources_by_parent,
                    additionalResources, blacklist]
            self.worker.queueFunction(
                self.resourceretriever.processMultiResource, item)
        self.worker.waitforFunctionsFinish(
            self.resourceretriever.processMultiResource)

    gc.collect()
    self.logger.info('Total resources: %s' % str(len(prevResources)))
    self.checked_resources += len(additionalResources)
    halt1 = time.perf_counter()
    self.logger.info('resource gathering: %s' % str(halt1 - start))
    [self.addDirectedLink(res, additionalResources, self.stateGraph)
     for res in prevResources]
    halt2 = time.perf_counter()
    self.logger.info('graph construction: %s' % str(halt2 - halt1))

    # For the next iteration (e.g. if no path was found), reduce the
    # dimensions of the existing resources by keeping only the top-ranked ones
    self.logger.debug('current iteration: %s' % self.iteration)
    if self.iteration > 1:
        try:
            self.logger.info('reducing matrix')
            #k = np.int((1-np.divide(1,self.iteration))*kp)
            k = int(kp * math.pow(1.2, self.iteration))
            h = gt.pagerank(self.stateGraph)
            #h = (nx.pagerank_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))
            #h = (nx.hits_scipy(nx.Graph(self.stateGraph), max_iter=100, tol=1e-07))
            vertices = dict()
            for vertex in self.stateGraph.vertices():
                vertices[self.stateGraph.vertex_index[vertex]] = h[vertex]
            res = list(sorted(vertices, key=vertices.__getitem__,
                              reverse=True))
            self.logger.debug(k)
            unimportant = res[k:]
            self.unimportant = set()
            for u in unimportant:
                # Never delete grandmother and grandfather (vertices 0 and 1),
                # even if they become insignificant
                if u > 1:
                    self.unimportant.add(self.stateGraph.vertex(u))
            #self.stateGraph = resourceretriever_gt.removeUnimportantResources(unimportant, self.resources, self.stateGraph)
            halt3 = time.perf_counter()
            self.logger.info('rank reducing: %s' % str(halt3 - halt2))
        except Exception:
            self.logger.error('Pathfinding reduction error')
            self.logger.error(sys.exc_info())
    else:
        self.logger.info('no rank reducing')
    self.logger.info('total %s' % str(time.perf_counter() - start))
    self.logger.info('=== === ===')
    self.iteration += 1
    return self.stateGraph
def page_rank(g):
    # return gt_stats.vertex_hist(g, gt.pagerank(g))
    return gt.pagerank(g).get_array()
for i in range(1000):
    g.add_edge(g.vertex_index[random.randint(50, 90)],
               g.vertex_index[random.randint(50, 90)])

# v_prop = g.new_vertex_property("string")
# v_prop[g.vertex_index[0]] = 'fooxxx'
# v_prop[g.vertex_index[1]] = 'bar'
# v_prop[g.vertex_index[2]] = 'bazxxx'

# e_prop = g.new_edge_property("double")
# e_prop[g.vertex_index[0]] = 200
# e_prop[g.vertex_index[2]] = 0.04

# e_len = g.new_edge_property("double")
# e_len[e1] = 10
# e_len[e2] = 20

pr = gt.pagerank(g)
# for i in pr:
#     print(i)
graph_draw(g, vertex_fill_color=pr, vertex_font_size=2, vorder=pr,
           output_size=(800, 800), output="two-nodes.png")
vp, ep = gt.betweenness(g)
print(gt.central_point_dominance(g, vp))
        weight_map[e] = 1. * common_votes / len(dep1[5:])
        edges[(dep1[4], dep2[4])] = [weight_map[e], dep1, dep2]  # kept for debugging
    except Exception as e:  # Python 3 "except ... as" syntax
        print(str(e))

# conventional centrality analysis
# degree
degree = g.degree_property_map('total', weight=weight_map)
# vertex betweenness
betweenness = gt.betweenness(g, weight=weight_map)
# closeness
closeness = gt.closeness(g, weight=weight_map)
# Katz
katz = gt.katz(g, weight=weight_map)
# PageRank
pagerank = gt.pagerank(g, weight=weight_map)

# column order must match the zip() order below
metrics = ['name', 'diap', 'degree', 'betweenness', 'closeness', 'katz',
           'pagerank']
df = pd.DataFrame(list(zip(vertex_to_name.values(), diap, degree.a.tolist(),
                           betweenness[0].a.tolist(), closeness.a.tolist(),
                           katz.a.tolist(), pagerank.a.tolist())),
                  columns=metrics)
df.sort_values('pagerank', ascending=True)[:30]  # DataFrame.sort was removed in newer pandas
def plot_random_metrics(metric):
    """Random mean + std plotting for indegree and PageRank."""
    global g
    plots = []
    for i in range(1, 11):
        num = str(i)
        first = get_first(num)
        second = get_second(num)
        g1, g2 = get_gt_graphs(g, first, second)
        if metric == "indeg":
            m1 = g1.degree_property_map("in")
            m2 = g2.degree_property_map("in")
        elif metric == "pagerank":
            m1 = gt.pagerank(g1, damping=0.5)
            m2 = gt.pagerank(g2, damping=0.5)
        ys1 = find_fellows_in_top_scores(m1.a, metric, num_top,
                                         printstuff=False)
        ys1 = [p[1] for p in ys1]
        plots.append(ys1)
        ys2 = find_fellows_in_top_scores(m2.a, metric, num_top,
                                         printstuff=False)
        ys2 = [p[1] for p in ys2]
        plots.append(ys2)
    if metric == "pagerank":
        original = gt.pagerank(g, damping=0.5)
    elif metric == "indeg":
        original = g.degree_property_map("in")
    g_plot = find_fellows_in_top_scores(original.a, metric, num_top,
                                        printstuff=False)
    g_plot = [p[1] for p in g_plot]
    plt.figure()
    plt.title(r"$\mathrm{Precision\/ @\/ X}$")
    plt.figure().set_facecolor('white')
    #plt.xlabel(r'$\mathrm{Recall}$', fontsize=24)
    plt.xlabel(r'$\mathrm{@}$', fontsize=24)
    plt.ylabel(r'$\mathrm{Precision}$', fontsize=24)
    xs = range(10, num_top + 1, 10)
    #for i, p in enumerate(plots):
    #    ys = p
    #    plt.plot(xs, ys, linewidth=1.0, color='b', ls=":")
    if metric == "indeg":
        clr = 'b'
    elif metric == "pagerank":
        clr = 'k'
    plt.plot(xs, g_plot, color=clr, linewidth=4.0)
    plot_mean_std(xs, plots, color=clr, ls=':')
    #leg = plt.legend([r'$\mathrm{\/ progeny\/ size}$',
    #                  r'$\mathrm{Backbone\/ progeny\/ size}$',
    #                  r'$\mathrm{Random\/ retrieval}$'],
    #                 fontsize=18, loc='best')
    ax = plt.subplot()
    if use_cutoff:
        total_fellow_articles = 139755  # manual total within 1980 filter
        y_random = float(total_fellow_articles) / 427735
    else:
        total_fellow_articles = len(fellow_indexes)
        y_random = float(total_fellow_articles) / 527129
    ax.plot([0.0, num_top], [y_random, y_random], ls="--", c="0.5",
            linewidth=4.0)
    if metric == "pagerank":
        leg = plt.legend([
            r'$\mathrm{PageRank,\/} \alpha=0.5$',
            r'$\mathrm{PageRank\/ random\/ mean,\/} \alpha=0.5$',
            r'$\mathrm{Random\/ retrieval}$'
        ], fontsize=32, loc='lower right')
    elif metric == "indeg":
        leg = plt.legend([
            r'$\mathrm{Indegree}$',
            r'$\mathrm{Indegree\/ random\/ mean}$',
            r'$\mathrm{Random\/ retrieval}$'
        ], fontsize=32, loc='lower right')
    for obj in leg.legendHandles:
        obj.set_linewidth(4.0)
    plt.show()
             element='step',
             stat='probability',
             common_norm=False,
             ax=axarr[0])
axarr[0].set_title('Betweenness centrality of generic/non-generic genes')
axarr[0].set_xscale('log')
sns.boxplot(data=bw_df, y='betweenness', x='is_generic', ax=axarr[1])
axarr[1].set_title('Betweenness centrality of generic/non-generic genes')
axarr[1].set_yscale('log')

# In[11]:

# analyze pagerank centrality for generic vs. other genes
# pagerank treats edge weights as "importance"/"confidence" rather than cost,
# so we can use the original correlations as edge weights here
pr = gt.pagerank(G, weight=G.ep['weight'])
pr_df = (betweenness_to_df(G, pr)
         .rename(columns={'betweenness': 'pagerank'}))
pr_df.head()

# In[12]:

sns.set({'figure.figsize': (12, 4)})
sns.set_style('whitegrid')
fig, axarr = plt.subplots(1, 2)
sns.histplot(data=pr_df,
             x='pagerank',
             hue='is_generic',
             element='step',
             stat='probability',
             common_norm=False,
             ax=axarr[0])
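# Hedged sketch of the weight convention noted above: distance-based measures
# such as betweenness read weights as costs, so correlations would be inverted
# first, while pagerank consumes them directly as importance. G and
# G.ep['weight'] are as in the notebook; the inverted map is an assumption
# and presumes strictly positive weights:
inv_weight = G.new_edge_property('double')
inv_weight.a = 1.0 / G.ep['weight'].a
vb, eb = gt.betweenness(G, weight=inv_weight)   # weights as costs
pr = gt.pagerank(G, weight=G.ep['weight'])      # weights as importance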
def compute_and_save_pagerank(g, filename):
    pr = gt.pagerank(g, epsilon=1e-12)
    g.vertex_properties["pagerank"] = pr
    g.save(filename)
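# Minimal usage sketch for compute_and_save_pagerank on a small random
# directed graph; gt.random_graph and the file name are assumptions, not
# from the source:
import graph_tool.all as gt

g = gt.random_graph(100, lambda: (3, 3))  # sampler returns (in, out) degrees
compute_and_save_pagerank(g, "ranked.gt")
print(gt.load_graph("ranked.gt").vertex_properties["pagerank"].a.max())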
from graph_tool.all import *
from graph_tool.draw import graphviz_draw as gd
import graph_tool.all as gt
import gc
import math

G = load_graph("graphDumpFor25BlocksTopPointFivePercent_1.2_16122016.dot")
pageRank = gt.pagerank(G)
# store the computed values as an internal property map so they are kept
# if the graph is saved (assigning before computing would store an empty map)
G.vertex_properties["pageRank"] = pageRank
valuation = G.ep.TransactionValues
widthTrans = valuation.copy()
#gt.remove_parallel_edges(G)
gt.remove_self_loops(G)
pos = gt.sfdp_layout(G)
for e in G.edges():
    if widthTrans[e] != 0:
        widthTrans[e] = int(math.log10(float(widthTrans[e])) / 7)
graph_draw(G, pos=pos, vertex_size=pageRank, edge_pen_width=widthTrans,
           output="graphDumpFor25BlocksTopPointFivePercent_1.2_16122016.pdf")
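# Hedged note: raw PageRank values sum to 1 and are tiny, so passing them
# directly as vertex_size (as above) yields near-invisible vertices;
# graph-tool's prop_to_size rescales a property map into a pixel range
# (the output name here is an assumption):
size = gt.prop_to_size(pageRank, mi=2, ma=20)
graph_draw(G, pos=pos, vertex_size=size, edge_pen_width=widthTrans,
           output="rescaled_sizes.pdf")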
## out-degree
degrees = g.get_out_degrees(g.get_vertices())
plt.title("out-degree distribution")
plt.ylabel('#Nodes')
plt.xlabel('#Connections')
plt.plot(distribution(degrees))
plt.savefig(f"img/out_degree_dist.png", format='png')
plt.close()
del degrees

########
# Rank #
########
rank = GT.pagerank(g).get_array()
plt.title("Rank distribution")
plt.ylabel('#Nodes')
plt.xlabel('Rank')
plt.bar(*float_distribution(rank, 40), width=(max(rank) - min(rank)) / 50)
plt.savefig(f"img/rank_dist.png", format='png')
plt.close()
print(f"top {TOP} rank nodes: {get_top(rank, TOP)}")
del rank

###############
# Betweenness #
###############
betweenness = GT.betweenness(g)[0].get_array()
def central():
    g = gt.load_graph(filename)
    print('Graph loaded, now calculating centrality')  # Python 3 print
    pr = gt.pagerank(g)
    g.vp['rank'] = pr
    g.save(filename)