def gen_pagerank(graph):
    if graph.num_edges():
        pr = pagerank(graph, weight=graph.edge_properties['weights_on_edges'])
    else:
        pr = pagerank(graph)
    pr.a /= pr.a.min()  # rescale so the smallest score is 1.0
    graph.vertex_properties['pagerank'] = pr
    return graph
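# A minimal usage sketch for gen_pagerank above (an illustration, not part of
# the original): the toy graph and weight values are invented, and the
# 'weights_on_edges' property name follows the snippet's convention.
from graph_tool import Graph
from graph_tool.centrality import pagerank

g = Graph(directed=True)
g.add_edge_list([(0, 1), (1, 2), (2, 0), (0, 2)])
w = g.new_edge_property('double')
w.a = [1.0, 2.0, 1.0, 0.5]  # example weights
g.edge_properties['weights_on_edges'] = w

g = gen_pagerank(g)
print(list(g.vertex_properties['pagerank']))  # rescaled so the minimum is 1.0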
def pageRankBiDi(g):
    # calculate the product of the PageRank and reverse PageRank for each vertex
    pr = centr.pagerank(g)
    g.set_reversed(True)
    rpr = centr.pagerank(g)
    g.set_reversed(False)
    for v in g.vertices():
        pr[v] = pr[v] * rpr[v]
    return pr
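# Hedged usage sketch for pageRankBiDi (not from the original source): 'centr'
# is assumed to be graph_tool.centrality. Vertices that rank high under both
# edge directions get the largest products.
from graph_tool import Graph
import graph_tool.centrality as centr

g = Graph(directed=True)
g.add_edge_list([(0, 1), (1, 2), (2, 1), (1, 0)])
print(list(pageRankBiDi(g)))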
def get_metric(ggt, metric, n_nodes, n_edges):
    if "d" == metric:  # Density
        if n_nodes <= 1:
            value = 0.0
        else:
            value = (2.0 * n_edges) / (n_nodes * (n_nodes - 1.0))
        ggt.gp[metric] = ggt.new_gp("float", val=value)
    elif "dg" == metric:  # Degree
        if n_nodes <= 1:
            value = np.zeros(n_nodes, dtype=np.float32)
        else:
            value = ggt.degree_property_map('total').get_array()
        ggt.vp[metric] = ggt.new_vp("double", vals=value)
    elif "dgc" == metric:  # Degree centrality
        if n_nodes <= 1:
            value = np.zeros(n_nodes, dtype=np.float32)
        else:
            value = ggt.degree_property_map('total').get_array() / (n_nodes - 1.0)
        ggt.vp[metric] = ggt.new_vp("double", vals=value)
    elif "cnw" == metric:  # Clustering coefficient (non-weighted)
        value = local_clustering(ggt).get_array()
        ggt.vp[metric] = ggt.new_vp("double", vals=value)
    elif "cw" == metric:  # Clustering coefficient (weighted)
        value = local_clustering(ggt, weight=ggt.ep.weight).get_array()
        ggt.vp[metric] = ggt.new_vp("double", vals=value)
    elif "pgr" == metric:  # PageRank
        value = pagerank(ggt).get_array()
        ggt.vp[metric] = ggt.new_vp("double", vals=value)
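# Sketch of driving get_metric above (illustrative only): the metric codes
# come from the elif chain; "cw" is skipped here because it needs a
# ggt.ep.weight edge property.
import numpy as np
from graph_tool import Graph
from graph_tool.centrality import pagerank
from graph_tool.clustering import local_clustering

ggt = Graph(directed=False)
ggt.add_edge_list([(0, 1), (1, 2), (2, 0)])
for code in ("d", "dg", "dgc", "cnw", "pgr"):
    get_metric(ggt, code, ggt.num_vertices(), ggt.num_edges())
print(ggt.gp["d"], list(ggt.vp["pgr"]))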
def pagerank(self):
    pr = {}
    dates = sorted(self.graphs)  # iterkeys() is Python 2; sort the keys directly
    for date in dates:
        if self.graphs[date].num_vertices() > 0:
            pr[date] = gtc.pagerank(self.graphs[date])
    self.pr = pr
    return pr
def calculate_pageranks(self):
    pagerank_dict = ct.pagerank(self.graph, weight=self.ew)
    result_dict = dict()
    for anchor in self.anchor_dictionary:
        result_dict[anchor] = [
            (concept, pagerank_dict[self.massive_dict[concept]])
            for concept in self.anchor_dictionary[anchor]
        ]
        result_dict[anchor].sort(key=lambda x: x[1], reverse=True)
    self.pagerank = result_dict
    self.pr_result = [
        (anchor, concept, score)
        for anchor in self.pagerank
        for concept, score in self.pagerank[anchor]
    ]
def _pagerank_centrality(weighted_projection, **kwargs):
    if isinstance(weighted_projection, nx.DiGraph):
        return nx.pagerank(weighted_projection, **kwargs)
    else:
        from graph_tool.centrality import pagerank
        G = weighted_projection
        pr = pagerank(G, weight=G.ep.weights)
        pr = {G.vp.node_labels[v]: pr[v] for v in G.vertices()}
        return pr
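# The dispatcher above accepts either backend; a hedged NetworkX-side example
# (graph and alpha value are invented for illustration):
import networkx as nx

dg = nx.DiGraph()
dg.add_weighted_edges_from([("a", "b", 1.0), ("b", "c", 2.0), ("c", "a", 1.0)])
print(_pagerank_centrality(dg, alpha=0.85))  # kwargs pass straight to nx.pagerank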
def get_dataframe_all_topolog_metrics(self):
    graph = self.get_graph()
    eprop_trust = graph.new_edge_property('double')
    start_time = time.time()
    for e in graph.edges():
        v_name_s = graph.vertex_properties['name_proteins'][e.source()]
        v_number_s = self.dict_genes[v_name_s]
        v_name_t = graph.vertex_properties['name_proteins'][e.target()]
        v_number_t = self.dict_genes[v_name_t]
        eprop_trust[e] = self.adjacency_matrix[v_number_s, v_number_t]
    graph.edge_properties['trust'] = eprop_trust
    print('confidence scores computed in:', '--- %s seconds ---' % (time.time() - start_time))
    list_metrics = [
        'betweenness', 'pagerank', 'closeness', 'katz',
        'hits_authority', 'hits_hub', 'eigenvector', 'eigentrust'
    ]  # 'trust_transitivity'
    dict_map = {}
    start_time = time.time()
    dict_map['betweenness'] = ct.betweenness(graph)[0]
    dict_map['pagerank'] = ct.pagerank(graph)
    dict_map['closeness'] = ct.closeness(graph)
    dict_map['katz'] = ct.katz(graph)
    dict_map['hits_authority'] = ct.hits(graph)[1]
    dict_map['hits_hub'] = ct.hits(graph)[2]
    dict_map['eigenvector'] = ct.eigenvector(graph)[1]
    # dict_map['trust_transitivity'] = ct.trust_transitivity(graph, graph.edge_properties['trust'])
    print('all metrics except eigentrust computed in:', '--- %s seconds ---' % (time.time() - start_time))
    start_time = time.time()
    dict_map['eigentrust'] = ct.eigentrust(graph, graph.edge_properties['trust'], max_iter=10**6)
    print('eigentrust computed in:', '--- %s seconds ---' % (time.time() - start_time))
    start_time = time.time()
    dict_metrics = {}
    for key in list_metrics:
        dict_metrics[key] = []
    for v in graph.vertices():
        for metric in list_metrics:
            dict_metrics[metric].append(dict_map[metric][v])
    dataframe_all_topolog_metrics = pd.DataFrame(dict_metrics)
    dataframe_all_topolog_metrics.index = graph.vertex_properties['name_proteins']
    print('built the metrics dataframe in:', '--- %s seconds ---' % (time.time() - start_time))
    return dataframe_all_topolog_metrics
def pagerank_scores(g, obs, weight=None, eps=0.0):
    pers = g.new_vertex_property('float')
    pers.a += eps  # add some noise
    for o in obs:
        pers.a[o] = 1
    pers.a /= pers.a.sum()
    rank = pagerank(g, pers=pers, weight=weight)
    if rank.a.sum() == 0:
        raise ValueError('PageRank score all zero')
    p = rank.a / rank.a.sum()
    return p
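# Sketch of the personalized-PageRank behaviour of pagerank_scores above:
# random-walk mass restarts at the observed vertices, so scores concentrate
# near the seeds (toy graph invented for illustration).
from graph_tool import Graph
from graph_tool.centrality import pagerank

g = Graph(directed=False)
g.add_edge_list([(0, 1), (1, 2), (2, 3), (3, 4)])
print(pagerank_scores(g, obs=[0]))  # normalized scores, highest at/near vertex 0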
def PR_subgraph(graph, subgraph, eps, threshold):
    # note: threshold is currently unused
    pr = gc.pagerank(subgraph, epsilon=eps)
    vec = pr.a
    vec_dict = {index: value for index, value in enumerate(vec)}
    pr_list = []
    norm_dict = normalize_dictionary(vec_dict)
    for poz in norm_dict:
        poz_initial = subgraph.vertex_properties["name"][poz]
        pr_list.append((poz_initial, norm_dict[poz]))
    pr_list = sorted(pr_list, key=lambda tup: tup[1], reverse=True)
    return pr_list
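# PR_subgraph relies on an external normalize_dictionary helper that is not
# shown. A plausible stand-in (an assumption, not the original): rescale the
# values so they sum to 1.
def normalize_dictionary(d):
    total = sum(d.values())
    return {k: (v / total if total else 0.0) for k, v in d.items()}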
def f_pagerank(D, stats, options={'features': [], 'skip_features': []}):
    """Compute PageRank statistics for graph D and optionally plot the distribution."""
    if 'pagerank' not in options['features']:
        log.debug('Skipping pagerank')
        return
    pagerank_list = pagerank(D).get_array()
    pr_max = (0.0, 0)
    # iterate and collect max value and idx
    for idx, pr_val in enumerate(pagerank_list):
        pr_max = (pr_val, idx) if pr_val >= pr_max[0] else pr_max
    stats['max_pagerank'], stats['max_pagerank_vertex'] = pr_max[0], str(
        D.vertex_properties['name'][pr_max[1]])
    # plot PageRank distribution
    if 'plots' in options['features'] and (
            'skip_features' not in options
            or 'plots' not in options['skip_features']):
        pagerank_list[::-1].sort()  # sort descending in place via reversed view
        values_counted = collections.Counter(pagerank_list)
        values, counted = zip(*values_counted.items())
        with lock:
            fig, ax = plt.subplots()
            plt.plot(values, counted)
            plt.title('PageRank Histogram')
            plt.ylabel('Frequency')
            plt.xlabel('PageRank Value')
            ax.set_xticklabels(values)
            ax.set_xscale('log')
            ax.set_yscale('log')
            plt.tight_layout()
            plt.savefig('/'.join([
                os.path.dirname(stats['path_edgelist']),
                'distribution_pagerank.pdf'
            ]))
            log.debug('done plotting pagerank distribution')
def pagerank_scores(g, obs, eps=0.0, weights=None):
    pers = g.new_vertex_property('float')
    pers.a += eps  # add some noise
    for o in obs:
        pers.a[o] += 1
    pers.a /= pers.a.sum()
    rank = pagerank(g, pers=pers, weight=weights)
    for o in obs:
        rank[o] = 0  # cannot select obs nodes
    if rank.a.sum() == 0:
        raise ValueError('PageRank score all zero')
    p = rank.a / rank.a.sum()
    return p
def __init__(self, nodes_info=None, links_info=None, file_name=None):
    self.g = Graph()
    if nodes_info and links_info:
        self.nodes_info = nodes_info
        self.links_info = links_info
        self.g.vertex_properties["name"] = self.g.new_vertex_property('string')
        self.g.vertex_properties["id"] = self.g.new_vertex_property('int32_t')
        self.g.edge_properties["weight"] = self.g.new_edge_property('int32_t')
        self.create_network()
        self.g.vertex_properties["pagerank"] = pagerank(
            self.g, weight=self.g.edge_properties["weight"])
        self.g.vertex_properties["degree_centrality"] = self.degree_centrality()
    elif file_name:
        self.load_network(file_name)
# KGTKArgumentParser and KGTKFiles are assumed to be imported at module level,
# e.g.: from kgtk.cli_argparse import KGTKArgumentParser, KGTKFiles
def run(input_file: KGTKFiles, directed, compute_degrees, compute_pagerank,
        compute_hits, log_file, output_stats, vertex_in_degree,
        vertex_out_degree, vertex_pagerank, vertex_auth, vertex_hubs):
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=[]):
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        for o in options:
            if o in h:
                return o
        return ''

    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # import modules locally
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        import kgtk.gt.analysis_utils as gtanalysis
        from pathlib import Path
        import sys
        import csv
        csv.field_size_limit(sys.maxsize)

        filename: Path = KGTKArgumentParser.get_input_file(input_file)

        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        directions = ['in', 'out', 'total']
        id_col = 'name'

        with open(filename, 'r') as f:
            header = next(f).split('\t')
            header = [h.strip() for h in header]
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header, options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['label', 'predicate', 'relation', 'relationship'])
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]:
                    continue
                p.append(header_col)

        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(str(filename),
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})
            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if compute_degrees:
                writer.write('\n###Degrees:\n')
                for direction in directions:
                    degree_data = gtanalysis.compute_node_degree_hist(G2, direction)
                    max_degree = len(degree_data) - 1
                    mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction)
                    writer.write('%s degree stats: mean=%f, std=%f, max=%d\n' %
                                 (direction, mean_degree, std_degree, max_degree))

            if compute_pagerank:
                writer.write('\n###PageRank\n')
                v_pr = G2.new_vertex_property('float')
                centrality.pagerank(G2, prop=v_pr)
                G2.properties[('v', 'vertex_pagerank')] = v_pr
                writer.write('Max pageranks\n')
                result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank', 5, id_col)
                for n_id, n_label, pr in result:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

            if compute_hits:
                writer.write('\n###HITS\n')
                hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = \
                    gtanalysis.compute_hits(G2)
                writer.write('HITS hubs\n')
                main_hubs = gtanalysis.get_topn_indices(G2, 'vertex_hubs', 5, id_col)
                for n_id, n_label, hubness in main_hubs:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                writer.write('HITS auth\n')
                main_auth = gtanalysis.get_topn_indices(G2, 'vertex_auth', 5, id_col)
                for n_id, n_label, authority in main_auth:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))

        sys.stdout.write('node1\tlabel\tnode2\tid\n')
        id_count = 0
        if not output_stats:
            for e in G2.edges():
                sid, oid = e
                lbl = G2.ep[predicate][e]
                sys.stdout.write('%s\t%s\t%s\t%s\n' %
                                 (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                                  '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)))
                id_count += 1

        id_count = 0
        for v in G2.vertices():
            v_id = G2.vp[id_col][v]
            sys.stdout.write('{}\t{}\t{}\t{}\n'.format(
                v_id, vertex_in_degree, v.in_degree(),
                '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)))
            id_count += 1
            sys.stdout.write('{}\t{}\t{}\t{}\n'.format(
                v_id, vertex_out_degree, v.out_degree(),
                '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)))
            id_count += 1
            for vprop in G2.vertex_properties.keys():
                if vprop == id_col:
                    continue
                sys.stdout.write('%s\t%s\t%s\t%s\n' %
                                 (v_id, v_prop_dict[vprop], G2.vp[vprop][v],
                                  '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)))
                id_count += 1
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
# KGTKArgumentParser and KGTKFiles are assumed to be imported at module level,
# e.g.: from kgtk.cli_argparse import KGTKArgumentParser, KGTKFiles
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    undirected: bool,
    compute_degrees: bool,
    compute_pagerank: bool,
    compute_hits: bool,
    log_file: str,
    statistics_only: bool,
    vertex_in_degree: str,
    vertex_out_degree: str,
    vertex_pagerank: str,
    vertex_auth: str,
    vertex_hubs: str,
    top_n: int,
    errors_to_stdout: bool,
    errors_to_stderr: bool,
    show_options: bool,
    verbose: bool,
    very_verbose: bool,
    **kwargs,  # Whatever KgtkFileOptions and KgtkValueOptions want.
):
    # import modules locally
    from pathlib import Path
    import sys
    import typing
    from graph_tool import centrality
    from kgtk.exceptions import KGTKException
    import kgtk.gt.analysis_utils as gtanalysis
    from kgtk.gt.gt_load import load_graph_from_kgtk
    from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions
    from kgtk.io.kgtkwriter import KgtkWriter
    from kgtk.value.kgtkvalueoptions import KgtkValueOptions

    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # Select where to send error messages, defaulting to stderr.
        error_file: typing.TextIO = sys.stdout if errors_to_stdout else sys.stderr

        # Build the option structures.
        reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs)
        value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs)

        input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file)
        output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file)

        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        directions = ['in', 'out', 'total']
        id_col = 'name'
        output_columns = ["node1", "label", "node2", "id"]

        if verbose:
            print('loading the KGTK input file...\n', file=error_file, flush=True)
        kr: KgtkReader = KgtkReader.open(
            input_kgtk_file,
            error_file=error_file,
            options=reader_options,
            value_options=value_options,
            verbose=verbose,
            very_verbose=very_verbose,
        )
        sub: int = kr.get_node1_column_index()
        if sub < 0:
            print("Missing node1 (subject) column.", file=error_file, flush=True)
        pred: int = kr.get_label_column_index()
        if pred < 0:
            print("Missing label (predicate) column.", file=error_file, flush=True)
        obj: int = kr.get_node2_column_index()
        if obj < 0:
            print("Missing node2 (object) column.", file=error_file, flush=True)
        if sub < 0 or pred < 0 or obj < 0:
            kr.close()
            raise KGTKException("Exiting due to missing columns.")

        predicate: str = kr.column_names[pred]

        G2 = load_graph_from_kgtk(kr,
                                  directed=not undirected,
                                  ecols=(sub, obj),
                                  verbose=verbose,
                                  out=error_file)
        if verbose:
            print('graph loaded! It has %d nodes and %d edges.' %
                  (G2.num_vertices(), G2.num_edges()),
                  file=error_file,
                  flush=True)

        kw: KgtkWriter = KgtkWriter.open(output_columns,
                                         output_kgtk_file,
                                         mode=KgtkWriter.Mode.EDGE,
                                         require_all_columns=True,
                                         prohibit_extra_columns=True,
                                         fill_missing_columns=False,
                                         verbose=verbose,
                                         very_verbose=very_verbose)

        with open(log_file, 'w') as writer:
            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))

            if compute_degrees:
                writer.write('\n###Degrees:\n')
                for direction in directions:
                    degree_data = gtanalysis.compute_node_degree_hist(G2, direction)
                    max_degree = len(degree_data) - 1
                    mean_degree, std_degree = gtanalysis.compute_avg_node_degree(G2, direction)
                    writer.write('%s degree stats: mean=%f, std=%f, max=%d\n' %
                                 (direction, mean_degree, std_degree, max_degree))

            if compute_pagerank:
                writer.write('\n###PageRank\n')
                v_pr = G2.new_vertex_property('float')
                centrality.pagerank(G2, prop=v_pr)
                G2.properties[('v', 'vertex_pagerank')] = v_pr
                writer.write('Max pageranks\n')
                result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank', top_n, id_col)
                for n_id, n_label, pr in result:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))

            if compute_hits:
                writer.write('\n###HITS\n')
                hits_eig, G2.vp['vertex_hubs'], G2.vp['vertex_auth'] = \
                    gtanalysis.compute_hits(G2)
                writer.write('HITS hubs\n')
                main_hubs = gtanalysis.get_topn_indices(G2, 'vertex_hubs', top_n, id_col)
                for n_id, n_label, hubness in main_hubs:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                writer.write('HITS auth\n')
                main_auth = gtanalysis.get_topn_indices(G2, 'vertex_auth', top_n, id_col)
                for n_id, n_label, authority in main_auth:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))

        id_count = 0
        if not statistics_only:
            for e in G2.edges():
                sid, oid = e
                lbl = G2.ep[predicate][e]
                kw.write([
                    G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                    '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)
                ])
                id_count += 1

        id_count = 0
        for v in G2.vertices():
            v_id = G2.vp[id_col][v]
            kw.write([
                v_id, vertex_in_degree, str(v.in_degree()),
                '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)
            ])
            id_count += 1
            kw.write([
                v_id, vertex_out_degree, str(v.out_degree()),
                '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)
            ])
            id_count += 1
            for vprop in G2.vertex_properties.keys():
                if vprop == id_col:
                    continue
                kw.write([
                    v_id, v_prop_dict[vprop], str(G2.vp[vprop][v]),
                    '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)
                ])
                id_count += 1
        kw.close()
        kr.close()
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def rank_protection(g: GT.Graph, nodes, n_protected):
    return {
        nodes.id_of(n)
        for n in i_of_bests(IdNodes(list(pagerank(g))), n_protected)
    }
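# An equivalent direct formulation of rank_protection (a sketch, assuming the
# unshown i_of_bests/IdNodes helpers select the n_protected highest-PageRank
# vertices):
import numpy as np
from graph_tool.centrality import pagerank

def rank_protection_direct(g, nodes, n_protected):
    best = np.argsort(pagerank(g).a)[-n_protected:]  # indices of the top scores
    return {nodes.id_of(int(i)) for i in best}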
def pagerank_centrality(g: Graph):
    return centrality.pagerank(g, weight=g.edge_properties['weight'])
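# Minimal weighted call matching pagerank_centrality above; the 'weight' edge
# property name is the snippet's convention (toy graph invented).
from graph_tool import Graph, centrality

g = Graph(directed=True)
g.add_edge_list([(0, 1), (1, 0), (1, 2)])
w = g.new_edge_property('double')
w.a = [3.0, 1.0, 1.0]
g.edge_properties['weight'] = w
print(list(pagerank_centrality(g)))  # heavier edges attract more rank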
def save_centrality(g, node_out_fname, edge_out_fname, weight=None):
    """Compute degree, closeness, PageRank, and betweenness centralities and
    write them to CSV.

    :param g: `Graph` instance
    :param node_out_fname: output CSV path for vertex centralities
    :param edge_out_fname: output CSV path for edge betweenness
    :param weight: optional edge weight property map
    :return: None
    """
    df = pd.DataFrame()
    df['node'] = pd.Series(np.array([int(v) for v in g.vertices()]))

    # degree
    print('Degree')
    num_nodes = len(g.get_vertices())
    denom = num_nodes - 1
    if g.is_directed():
        unnormalized_in_degree = np.array([v.in_degree() for v in g.vertices()])
        unnormalized_out_degree = np.array([v.out_degree() for v in g.vertices()])
        df['unnormalized_in_degree'] = unnormalized_in_degree
        df['unnormalized_out_degree'] = unnormalized_out_degree
        df['in_degree'] = unnormalized_in_degree / denom
        df['out_degree'] = unnormalized_out_degree / denom
    else:
        # check whether weighted graph or not
        if weight is not None:
            unnormalized_degree = np.zeros(num_nodes)
            edge_weights = np.array(weight.get_array())
            for edge, w in zip(g.get_edges(), edge_weights):
                for node in edge[:2]:
                    unnormalized_degree[node] += w
            df['unnormalized_degree'] = unnormalized_degree
            df['degree'] = unnormalized_degree / denom
        else:
            unnormalized_degree = np.array([v.out_degree() for v in g.vertices()])
            df['unnormalized_degree'] = unnormalized_degree
            df['degree'] = unnormalized_degree / denom

    # closeness
    print('Closeness')
    df['unnormalized_closeness'] = np.array(closeness(g, weight=weight, norm=False).get_array())
    df['closeness'] = np.array(closeness(g, weight=weight, norm=True).get_array())

    # PageRank
    print('PageRank')
    df['pagerank'] = np.array(pagerank(g, weight=weight).get_array())

    # betweenness
    print('Betweenness')
    un_node_between, un_edge_between = betweenness(g, weight=weight, norm=False)
    node_between, edge_between = betweenness(g, weight=weight, norm=True)
    df['unnormalized_betweenness'] = np.array(un_node_between.get_array())
    df['betweenness'] = np.array(node_between.get_array())
    df.to_csv(node_out_fname, index=False)

    # edge table
    sources = []
    targets = []
    for e in g.edges():
        source, target = list(map(int, [e.source(), e.target()]))
        sources.append(source)
        targets.append(target)
    df = pd.DataFrame()
    df['source'] = pd.Series(np.array(sources))
    df['target'] = np.array(targets)

    # edge betweenness
    df['unnormalized_betweenness'] = np.array(un_edge_between.get_array())
    df['betweenness'] = np.array(edge_between.get_array())
    df.to_csv(edge_out_fname, index=False)
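# Usage sketch for save_centrality above; file names are placeholders and the
# toy weighted graph is invented for illustration.
from graph_tool import Graph

g = Graph(directed=False)
g.add_edge_list([(0, 1), (1, 2), (2, 0)])
w = g.new_edge_property('double')
w.a = [1.0, 2.0, 0.5]
save_centrality(g, 'nodes.csv', 'edges.csv', weight=w)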