def computeClusteringCoeff(G, NodeAttributes):
    NIdCCfH = snap.TIntFltH()
    snap.GetNodeClustCf(G, NIdCCfH)
    ClusterCoeffList = list()
    for nodeId in NIdCCfH:
        NodeAttributes[nodeId]['ClusterCoeff'] = NIdCCfH[nodeId]
        ClusterCoeffList.append((nodeId, NIdCCfH[nodeId]))
    ClusterCoeffList.sort(key=lambda x: x[1], reverse=True)
    minClusterCoeff = min(ClusterCoeffList, key=lambda x: x[1])[1]
    maxClusterCoeff = max(ClusterCoeffList, key=lambda x: x[1])[1]
    #
    # Sanity check: the list is sorted in descending order, so the first entry
    # should match the maximum and the last entry the minimum.
    #
    print(ClusterCoeffList[0], maxClusterCoeff, ClusterCoeffList[-1], minClusterCoeff)

    # Min-max normalize the coefficients, reusing the values computed above.
    for nodeId in NIdCCfH:
        clusterCoeff = NIdCCfH[nodeId]
        normClusterCoeff = (clusterCoeff - minClusterCoeff) / (maxClusterCoeff - minClusterCoeff)
        NodeAttributes[nodeId]['NormClusterCoeff'] = normClusterCoeff

    #print(NodeAttributes[2012])
    return NodeAttributes
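# Hypothetical setup sketch (not part of the original snippet): computeClusteringCoeff
# assumes NodeAttributes is a dict of per-node dicts keyed by node id. The random
# graph below is only an illustration.
G = snap.GenRndGnm(snap.PUNGraph, 100, 500)
NodeAttributes = {NI.GetId(): {} for NI in G.Nodes()}
NodeAttributes = computeClusteringCoeff(G, NodeAttributes)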
def get_hits_venues():
    mapping = snap.TStrIntSH()
    t0 = time()
    file_output_1 = open("paper_venues_hits_hub.txt", 'w')
    file_output_2 = open("paper_venues_hits_auth.txt", 'w')
    G0 = snap.LoadEdgeListStr(snap.PNGraph, "paperid_venueid_ref.txt", 0, 1, mapping)
    NIdHubH = snap.TIntFltH()
    NIdAuthH = snap.TIntFltH()
    snap.GetHits(G0, NIdHubH, NIdAuthH, 1000)
    print("HITS time:", round(time() - t0, 3), "s")
    for item in NIdHubH:
        file_output_1.write(str(mapping.GetKey(item)) + "," + str(NIdHubH[item]) + '\n')
    for item in NIdAuthH:
        file_output_2.write(str(mapping.GetKey(item)) + "," + str(NIdAuthH[item]) + '\n')
    # convert input string to node id
    # NodeId = mapping.GetKeyId("814DF491")
    # convert node id to input string
    # NodeName = mapping.GetKey(NodeId)
    # print("name", NodeName)
    # print("id  ", NodeId)
    print("finish hits!")
    file_output_1.close()
    file_output_2.close()
def quick_properties(graph, name, dic_path):
    """Get quick properties of the graph "name". dic_path is the path of the dict {players: id}."""
    n_edges = graph.GetEdges()
    n_nodes = graph.GetNodes()
    print("##########")
    print("Quick overview of {} Network".format(name))
    print("##########")
    print("{} Nodes, {} Edges".format(n_nodes, n_edges))
    print("{} Self-edges".format(snap.CntSelfEdges(graph)))
    print("{} Directed edges, {} Undirected edges".format(
        snap.CntUniqDirEdges(graph), snap.CntUniqUndirEdges(graph)))
    print("{} Reciprocated edges".format(snap.CntUniqBiDirEdges(graph)))
    print("{} 0-out-degree nodes, {} 0-in-degree nodes".format(
        snap.CntOutDegNodes(graph, 0), snap.CntInDegNodes(graph, 0)))
    node_in = graph.GetNI(snap.GetMxInDegNId(graph))
    node_out = graph.GetNI(snap.GetMxOutDegNId(graph))
    print("Maximum node in-degree: {}, maximum node out-degree: {}".format(
        node_in.GetInDeg(), node_out.GetOutDeg()))
    print("###")
    components = snap.TCnComV()
    snap.GetWccs(graph, components)
    max_wcc = snap.GetMxWcc(graph)
    print("{} Weakly connected components".format(components.Len()))
    print("Largest Wcc: {} Nodes, {} Edges".format(max_wcc.GetNodes(), max_wcc.GetEdges()))

    prankH = snap.TIntFltH()
    snap.GetPageRank(graph, prankH)
    sorted_prankH = sorted(prankH, key=lambda key: prankH[key], reverse=True)
    NIdHubH = snap.TIntFltH()
    NIdAuthH = snap.TIntFltH()
    snap.GetHits(graph, NIdHubH, NIdAuthH)
    sorted_NIdHubH = sorted(NIdHubH, key=lambda key: NIdHubH[key], reverse=True)
    sorted_NIdAuthH = sorted(NIdAuthH, key=lambda key: NIdAuthH[key], reverse=True)

    with open(dic_path, 'rb') as dic_id:
        mydict = pickle.load(dic_id)
    # mydict maps player names to node ids, so look up the name whose id matches each top node.
    print("3 most central players by PageRank scores: {}, {}, {}".format(
        list(mydict.keys())[list(mydict.values()).index(sorted_prankH[0])],
        list(mydict.keys())[list(mydict.values()).index(sorted_prankH[1])],
        list(mydict.keys())[list(mydict.values()).index(sorted_prankH[2])]))
    print("Top 3 hubs: {}, {}, {}".format(
        list(mydict.keys())[list(mydict.values()).index(sorted_NIdHubH[0])],
        list(mydict.keys())[list(mydict.values()).index(sorted_NIdHubH[1])],
        list(mydict.keys())[list(mydict.values()).index(sorted_NIdHubH[2])]))
    print("Top 3 authorities: {}, {}, {}".format(
        list(mydict.keys())[list(mydict.values()).index(sorted_NIdAuthH[0])],
        list(mydict.keys())[list(mydict.values()).index(sorted_NIdAuthH[1])],
        list(mydict.keys())[list(mydict.values()).index(sorted_NIdAuthH[2])]))
def getNodeAttributes(self, UGraph):
    # One attribute list per node; this assumes node ids are 0..N-1 so that the
    # list position and the hash iteration order line up.
    attriList = []
    for index in range(UGraph.GetNodes()):
        nodelist = []
        attriList.append(nodelist)

    # PageRank
    PRankH = snap.TIntFltH()
    snap.GetPageRank(UGraph, PRankH)
    counter = 0
    for item in PRankH:
        attriList[counter].append(PRankH[item])
        counter += 1

    # HITS (hub and authority scores)
    counter = 0
    NIdHubH = snap.TIntFltH()
    NIdAuthH = snap.TIntFltH()
    snap.GetHits(UGraph, NIdHubH, NIdAuthH)
    for item in NIdHubH:
        attriList[counter].append(NIdHubH[item])
        attriList[counter].append(NIdAuthH[item])
        counter += 1

    # Betweenness centrality
    counter = 0
    Nodes = snap.TIntFltH()
    Edges = snap.TIntPrFltH()
    snap.GetBetweennessCentr(UGraph, Nodes, Edges, 1.0)
    for node in Nodes:
        attriList[counter].append(Nodes[node])
        counter += 1

    # Closeness centrality
    counter = 0
    for NI in UGraph.Nodes():
        CloseCentr = snap.GetClosenessCentr(UGraph, NI.GetId())
        attriList[counter].append(CloseCentr)
        counter += 1

    # Farness centrality
    counter = 0
    for NI in UGraph.Nodes():
        FarCentr = snap.GetFarnessCentr(UGraph, NI.GetId())
        attriList[counter].append(FarCentr)
        counter += 1

    # Node eccentricity
    counter = 0
    for NI in UGraph.Nodes():
        attriList[counter].append(snap.GetNodeEcc(UGraph, NI.GetId(), True))
        counter += 1

    attriMatrix = np.array(attriList)
    return attriMatrix
def hits(graph_filename):
    # create graph
    name_id_map = snap.TStrIntSH()
    graph = snap.LoadEdgeListStr(snap.PNGraph, graph_filename, 0, 1, name_id_map)

    # run HITS algorithm
    id_hub_map = snap.TIntFltH()
    id_auth_map = snap.TIntFltH()
    snap.GetHits(graph, id_hub_map, id_auth_map, 1000)  # iterate up to 1000 times

    return name_id_map, id_hub_map, id_auth_map
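# Hypothetical usage sketch (not from the original snippet): the file name and the
# top-5 report are assumptions. The edge list is expected to have string node ids
# in columns 0 and 1; the mapping converts ids back to the original strings.
name_id_map, id_hub_map, id_auth_map = hits("papers.txt")
for nid in sorted(id_hub_map, key=lambda k: id_hub_map[k], reverse=True)[:5]:
    print(name_id_map.GetKey(nid), id_hub_map[nid])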
def getUndirAttribute(filename, node_num, weighted=None, param=1.0):
    UGraph = snap.LoadEdgeList(snap.PUNGraph, filename, 0, 1)

    attributeNames = [
        'Graph', 'Id', 'Degree', 'NodeBetweennessCentrality', 'PageRank',
        'EgonetDegree', 'AvgNeighborDeg', 'EgonetConnectivity'
    ]
    if weighted:
        attributeNames += [
            'WeightedDegree', 'EgoWeightedDegree', 'AvgWeightedNeighborDeg',
            'EgonetWeightedConnectivity'
        ]

    attributes = pd.DataFrame(np.zeros((node_num, len(attributeNames))),
                              columns=attributeNames)
    attributes['Graph'] = [filename.split('/')[-1].split('.')[0]] * node_num
    attributes['Id'] = range(0, node_num)

    # Degree
    degree = np.zeros((node_num,))
    OutDegV = snap.TIntPrV()
    snap.GetNodeOutDegV(UGraph, OutDegV)
    for item in OutDegV:
        degree[item.GetVal1()] = item.GetVal2()
    attributes['Degree'] = degree

    getEgoAttr(UGraph, node_num, attributes, directed=False)

    if weighted:
        df = getWeightedDegree(filename, node_num, attributes, directed=False)
        getWeightedEgoAttr(UGraph, node_num, attributes, df, directed=False)

    # Betweenness centrality
    betCentr = np.zeros((node_num,))
    Nodes = snap.TIntFltH()
    Edges = snap.TIntPrFltH()
    snap.GetBetweennessCentr(UGraph, Nodes, Edges, param)
    for node in Nodes:
        betCentr[node] = Nodes[node]
    attributes['NodeBetweennessCentrality'] = betCentr

    # PageRank
    pgRank = np.zeros((node_num,))
    PRankH = snap.TIntFltH()
    snap.GetPageRank(UGraph, PRankH)
    for item in PRankH:
        pgRank[item] = PRankH[item]
    attributes['PageRank'] = pgRank

    return attributes
def compute_hub_authority_score(self, graph):
    # A hash table of int keys and float values (output).
    # The keys are the node ids and the values are the hub scores output by the HITS algorithm.
    # Type: snap.TIntFltH
    hub_scores = snap.TIntFltH()

    # A hash table of int keys and float values (output).
    # The keys are the node ids and the values are the authority scores output by the HITS algorithm.
    # Type: snap.TIntFltH
    authority_scores = snap.TIntFltH()

    snap.GetHits(graph, hub_scores, authority_scores)
    return hub_scores, authority_scores
def GetOverlap(filePathName, Graph, t):
    # l is the final ranking of the nodes: initially it just holds all the node
    # ids, and it is sorted afterwards.
    l = [i for i in range(Graph.GetNodes())]

    # The reference vector whose information is used to sort l
    ref_vect = [0 for i in range(Graph.GetNodes())]

    # if Type 1, then fill ref_vect with the closeness centrality measure
    if (t == 1):
        for NI in Graph.Nodes():
            ref_vect[NI.GetId()] = snap.GetClosenessCentr(Graph, NI.GetId())

    # if Type 2, then fill ref_vect with the betweenness centrality measure
    if (t == 2):
        Nodes = snap.TIntFltH()
        Edges = snap.TIntPrFltH()
        # Setting the NodeFrac parameter to 0.8 as instructed
        snap.GetBetweennessCentr(Graph, Nodes, Edges, 0.8)
        for node in Nodes:
            ref_vect[node] = Nodes[node]

    # if Type 3, then fill ref_vect with PageRank scores
    if (t == 3):
        PRankH = snap.TIntFltH()
        # Taking the limit as 1e-6 as used in gen_centrality.py
        snap.GetPageRank(Graph, PRankH, 0.8, 1e-6, 100)
        for item in PRankH:
            ref_vect[item] = PRankH[item]

    # Now sort l in descending order of the reference values
    l.sort(key=cmp_to_key(lambda item1, item2: ref_vect[item2] - ref_vect[item1]))

    # make a set containing the top 100 nodes of l
    S1 = set(l[:100])

    # make another set containing the top 100 nodes from the text file
    S2 = set()
    with open(filePathName, 'r') as f:
        for _ in range(100):
            s = f.readline()
            a, b = s.split()
            S2.add(int(a))

    # return the number of overlaps between S1 and S2
    return len(S1.intersection(S2))
def calc_HubAndAuthorityScores(Graph, node_to_g):
    ## calculate hub and authority scores for nodes in the graph.
    prot_to_hub = {}
    prot_to_authority = {}
    NIdHubH = snap.TIntFltH()
    NIdAuthH = snap.TIntFltH()
    snap.GetHits(Graph, NIdHubH, NIdAuthH)
    for node in NIdHubH:
        my_prot = node_to_g[node]
        prot_to_hub[my_prot] = NIdHubH[node]
    for node in NIdAuthH:
        my_prot = node_to_g[node]
        prot_to_authority[my_prot] = NIdAuthH[node]
    return (prot_to_hub, prot_to_authority)
def HITS(G):
    NIdHubH = snap.TIntFltH()
    NIdAuthH = snap.TIntFltH()
    snap.GetHits(G, NIdHubH, NIdAuthH)

    # Track the running maximum hub score and print each new maximum.
    maxHub = 0.0
    for item in NIdHubH:
        if NIdHubH[item] > maxHub:
            maxHub = NIdHubH[item]
            print(item, NIdHubH[item])

    # Track the running maximum authority score and print each new maximum.
    maxAuth = 0.0
    for item in NIdAuthH:
        if NIdAuthH[item] > maxAuth:
            maxAuth = NIdAuthH[item]
            print(item, NIdAuthH[item])
def pageRank(rankCommands, Graph, conn, cur):
    PRankH = snap.TIntFltH()
    before_time = time.time()
    snap.GetPageRank(Graph, PRankH)
    print("Total handling time is: ", (time.time() - before_time))
    slist = sorted(PRankH, key=lambda key: PRankH[key], reverse=True)
    createTable(rankCommands, slist, PRankH, conn, cur)
def get_ev_centr_sum(G, n1, n2, reset=False):
    global NIdEigenH
    if reset or NIdEigenH is None:
        print('Initializing EV Centrality...')
        NIdEigenH = snap.TIntFltH()
        snap.GetEigenVectorCentr(G, NIdEigenH, 1e-2, 50)
    return NIdEigenH[n1] + NIdEigenH[n2]
def generate_scores(self):
    scores = {}
    common_neighbor_scores = {}
    for e in self.g.Edges():
        # common_neighbor_scores[(e.GetSrcNId(), e.GetDstNId())] = snap.GetCmnNbrs(self.g, e.GetSrcNId(), e.GetDstNId())
        n1 = snap.TIntV()
        n2 = snap.TIntV()
        snap.GetNodesAtHop(self.g, e.GetSrcNId(), 1, n1, True)
        snap.GetNodesAtHop(self.g, e.GetDstNId(), 1, n2, True)
        common_neighbor_scores[(e.GetSrcNId(), e.GetDstNId())] = len(set(n1) & set(n2))

    Nodes = snap.TIntFltH()
    Edges = snap.TIntPrFltH()
    snap.GetBetweennessCentr(self.g, Nodes, Edges, self.node_frac, True)
    edge_betweenness_scores = {}
    for e in Edges:
        edge_betweenness_scores[(e.GetVal1(), e.GetVal2())] = Edges[e]

    max_cn = max(common_neighbor_scores.values())
    max_eb = max(edge_betweenness_scores.values())
    print(common_neighbor_scores)
    print(edge_betweenness_scores)

    # Combine the two normalized scores with mixing weight self.l
    for e in self.g.Edges():
        src = e.GetSrcNId()
        dst = e.GetDstNId()
        scores[(src, dst)] = (self.l * common_neighbor_scores[(src, dst)] / max_cn
                              + (1 - self.l) * edge_betweenness_scores[(src, dst)] / max_eb)
    return scores
def computeBetweenessCentrality(G, NodeAttributes):
    Nodes = snap.TIntFltH()
    Edges = snap.TIntPrFltH()
    BetweenessNodeList = list()
    BetweenessEdgeList = list()

    snap.GetBetweennessCentr(G, Nodes, Edges, 1.0)
    for node in Nodes:
        NodeAttributes[node]['Betweeness'] = Nodes[node]
        BetweenessNodeList.append((node, Nodes[node]))
    for edge in Edges:
        #print("edge: (%d, %d) centrality: %f" % (edge.GetVal1(), edge.GetVal2(), Edges[edge]))
        BetweenessEdgeList.append((edge.GetVal1(), edge.GetVal2(), Edges[edge]))

    BetweenessNodeList.sort(key=lambda x: x[1], reverse=True)
    BetweenessEdgeList.sort(key=lambda x: x[2], reverse=True)
    #print(BetweenessNodeList[0], BetweenessNodeList[-1])

    # Min-max normalize the node betweenness values.
    minBetweeness = BetweenessNodeList[-1][1]
    maxBetweeness = BetweenessNodeList[0][1]
    for (node, betweeness) in BetweenessNodeList:
        normBetweeness = (betweeness - minBetweeness) / (maxBetweeness - minBetweeness)
        NodeAttributes[node]['normBetweeness'] = normBetweeness

    #print(NodeAttributes[1669])
    #print(NodeAttributes[884])
    return NodeAttributes
def get_page_rank_sum(G, n1, n2, reset=False):
    global PRankH
    if reset or PRankH is None:
        print('Initializing Page Rank')
        PRankH = snap.TIntFltH()
        # GetPageRank takes (Graph, PRankH, C, Eps, MaxIter): keep the default
        # damping factor and pass the tolerance and iteration cap explicitly.
        snap.GetPageRank(G, PRankH, 0.85, 1e-2, 50)
    return PRankH[n1] + PRankH[n2]
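# Assumed module-level caches for get_ev_centr_sum and get_page_rank_sum above
# (not shown in the original snippets); without these definitions the
# `is None` checks raise a NameError on first use.
NIdEigenH = None
PRankH = None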
def nodes_centrality_page_rank():
    PRankH = snap.TIntFltH()
    snap.GetPageRank(G, PRankH)
    sorted_PRankH = sorted(PRankH, key=lambda key: PRankH[key], reverse=True)
    # print the top n nodes with the highest PageRank
    for item in sorted_PRankH[0:5]:  # top 5
        print(item, PRankH[item])
def CalculateClusteringCoefficient(graph):
    #output = {}
    NIdCCfH = snap.TIntFltH()
    snap.GetNodeClustCf(graph, NIdCCfH)
    print("CLUSTERING COEFFICIENT")
    for item in NIdCCfH:
        print("Node %d has coefficient %f" % (item, NIdCCfH[item]))
def RW_iteration(graph, PRankH, C=0.85):
    # Performs one random-walk (power-iteration) step of PageRank.
    PRankH_temp = snap.TIntFltH()

    # Step 1: calculate new page ranks from in-nodes.
    # The new page rank is 'dampened' by a factor C, usually about 0.85.
    for node_id in PRankH:
        PR = get_in_node_PR_weight(graph, node_id, PRankH)
        PRankH_temp.AddDat(node_id, PR * C)

    # Step 2: the total rank lost to leakage is calculated, and the leaked value
    # is then apportioned to all nodes by adding leakage / |N| to each node.
    total = diff = 0.0
    for node_id in PRankH_temp:
        total += PRankH_temp[node_id]
    leaked = (1 - total) / float(graph.GetNodes())
    for node_id in PRankH:
        NewVal = PRankH_temp[node_id] + leaked
        diff += abs(PRankH[node_id] - NewVal)
        PRankH.AddDat(node_id, NewVal)
    print(diff)

    # The return value is the total absolute 'difference' between the new PRs and the old ones.
    # Once it drops below some threshold, we stop iterating random walks.
    return diff
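# Hypothetical driver loop (not part of the original snippet; it still requires the
# external get_in_node_PR_weight helper): start every node at 1/N and repeat
# RW_iteration until the total change drops below a tolerance.
graph = snap.GenRndGnm(snap.PNGraph, 100, 400)
PRankH = snap.TIntFltH()
for NI in graph.Nodes():
    PRankH.AddDat(NI.GetId(), 1.0 / graph.GetNodes())
while RW_iteration(graph, PRankH) > 1e-4:
    pass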
def PageRank(d, e):
    f = open(d)
    s = f.read()
    f.close()
    s1 = re.split('\n', s)
    G1 = snap.PNGraph.New()
    PRankH = snap.TIntFltH()

    # The first line holds the node and edge counts; each following line holds one edge.
    a = re.split(' ', s1[0])
    for i in range(0, int(a[0])):
        G1.AddNode(i)
    for i in range(1, int(a[1]) + 1):
        b = re.split(' ', s1[i])
        b0 = re.sub(r"\D", "", b[0])
        b1 = re.sub(r"\D", "", b[1])
        G1.AddEdge(int(b0), int(b1))

    snap.GetPageRank(G1, PRankH)

    # Weight each directed edge by the PageRank of its source relative to both endpoints.
    EdgePara = dict()
    for i in range(1, int(a[1]) + 1):
        c = re.split(' ', s1[i])
        if PRankH[int(c[0])] == 0 and PRankH[int(c[1])] == 0:
            EdgePara[(int(c[0]), int(c[1]))] = 0
            EdgePara[(int(c[1]), int(c[0]))] = 0
        else:
            EdgePara[(int(c[0]), int(c[1]))] = e * PRankH[int(c[0])] / (PRankH[int(c[0])] + PRankH[int(c[1])])
            EdgePara[(int(c[1]), int(c[0]))] = e * PRankH[int(c[1])] / (PRankH[int(c[0])] + PRankH[int(c[1])])
    return EdgePara
def get_betweenness_centr(net, label, outpath):
    """
    get betweenness centrality.
    :param net:
    :param label:
    :param outpath:
    :return:
    """
    Nodes = snap.TIntFltH()
    Edges = snap.TIntPrFltH()
    snap.GetBetweennessCentr(net, Nodes, Edges, 1.0)

    node_betweenness_centr_file = open(outpath + label + '-node_betweenness_centr', 'w')
    node_betweenness_centr_top_file = open(outpath + label + '-node_betweenness_centr-top100', 'w')

    node_betweenness_centr = {}
    for item in Nodes:
        node_betweenness_centr[item] = Nodes[item]
    node_betweenness_centr = sorted(node_betweenness_centr.items(),
                                    key=operator.itemgetter(1), reverse=True)
    id, value = zip(*node_betweenness_centr)

    for i in range(len(id)):
        node_betweenness_centr_file.write(str(id[i]) + '\t' + str(value[i]) + '\n')
    for i in range(100):
        node_betweenness_centr_top_file.write(str(id[i]) + '\t' + str(value[i]) + '\n')

    node_betweenness_centr_file.close()
    node_betweenness_centr_top_file.close()
    return id, value
def CalculatePageRank(graph, alpha, number_iteration):
    PRankH = snap.TIntFltH()
    snap.GetPageRank(graph, PRankH, alpha, 1e-4, number_iteration)
    output = {}
    for item in PRankH:
        output[item] = PRankH[item]
    return output
def compute_betwenness_centrality(self, graph):
    nodes_betweenness_centrality = snap.TIntFltH()
    edges_betweenness_centrality = snap.TIntPrFltH()
    snap.GetBetweennessCentr(graph, nodes_betweenness_centrality,
                             edges_betweenness_centrality, 1.0)
    return nodes_betweenness_centrality, edges_betweenness_centrality
def getCentr(self, centrMethod):
    nodesKeysCentrVals = snap.TIntFltH()
    for node in self.G.Nodes():
        centrValue = centrMethod(self.G, node.GetId())
        nodesKeysCentrVals[node.GetId()] = centrValue
    return nodesKeysCentrVals
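# Hypothetical usage (not from the original snippet), assuming `analyzer` is an
# instance of the enclosing class: any SNAP per-node centrality function with a
# (graph, node id) signature can be passed in as centrMethod.
closeness = analyzer.getCentr(snap.GetClosenessCentr)
farness = analyzer.getCentr(snap.GetFarnessCentr)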
def rank_eigvec(self):
    """
    Return dictionary of node ID and its eigenvector centrality score, in score order
    """
    NIdEigenH = snap.TIntFltH()
    snap.GetEigenVectorCentr(self._graph, NIdEigenH)
    assert len(NIdEigenH) == self._num_nodes, 'Number of nodes in centrality result must match number of nodes in graph'
    return snap_hash_to_dict(NIdEigenH)
def compute_page_rank(graph):
    logging.info("compute pagerank")
    PRankH = snap.TIntFltH()
    snap.GetPageRank(graph, PRankH)
    for item in PRankH:
        print(item, PRankH[item])
    return PRankH
def rank_pagerank(self, C=0.85, Eps=1e-4, MaxIter=100):
    """
    Return dictionary of node ID and its PageRank centrality score, in score order
    """
    PRankH = snap.TIntFltH()
    snap.GetPageRank(self._graph, PRankH, C, Eps, MaxIter)
    assert len(PRankH) == self._num_nodes, 'Number of nodes in centrality result must match number of nodes in graph'
    return snap_hash_to_dict(PRankH)
def getBetweennessCentr(self):
    nodesKeyCentrVals = snap.TIntFltH()
    edgesKeyCentrVals = snap.TIntPrFltH()
    snap.GetBetweennessCentr(self.G, nodesKeyCentrVals, edgesKeyCentrVals, 1.0)
    return nodesKeyCentrVals
def betweenness_test(name):
    if os.path.isfile(DATA_PATH + name + ".between"):
        print("Skipping", name)
        return
    start = time.time()

    G, coords = osmParser.simpleLoadFromFile(name)

    print("Calculating betweenness", name)
    nodeToBetweenness = snap.TIntFltH()
    edgeToBetweenness = snap.TIntPrFltH()
    snap.GetBetweennessCentr(G, nodeToBetweenness, edgeToBetweenness, 0.25)

    betweenness = {}
    for node in nodeToBetweenness:
        betweenness[node] = nodeToBetweenness[node]

    # Pickle in binary mode (protocol 1) so the dump works under Python 3 as well.
    betweenOut = open(DATA_PATH + name + ".between", 'wb')
    pickle.dump(betweenness, betweenOut, 1)
    betweenOut.close()

    plotTopK(name, betweenness, coords, "GnBu")

    end = time.time()
    print("took", end - start, "seconds")
def pageRank_components(g):
    print('executing pagerank components ---- getting components for page rank')
    Components = snap.TCnComV()
    snap.GetWccs(g, Components)
    f = open('component_pr.txt', 'w')
    cgraphs = []
    for com in Components:
        v = snap.TIntV()
        for ni in com:
            v.Add(ni)
        cgraphs.append(snap.GetSubGraph_PNGraph(g, v))

    print('components retrieved for pagerank')
    f.write('Total components:' + str(len(cgraphs)) + '\n')
    for graph in cgraphs:
        if graph.GetNodes() == 2:
            continue
        sprank = snap.TIntFltH()
        snap.GetPageRank_PNGraph(graph, sprank)
        sprank.SortByDat(False)
        f.write(str(graph.GetNodes()) + ' ' + str(sprank[sprank.BegI().GetKey()]) + '\n')
    f.close()
    print('finished writing pagerank components values')
def get_top_packages(graph_path, n):
    graph_abs_path = os.path.abspath(graph_path)
    graph_name = os.path.basename(graph_abs_path).replace(".graph", "")
    fin = snap.TFIn(graph_abs_path)
    graph = snap.TNEANet.Load(fin)

    # rebuild the id => pkg dictionary
    id_pkg_dict = {}
    for node in graph.Nodes():
        id_pkg_dict[node.GetId()] = graph.GetStrAttrDatN(node.GetId(), "pkg")

    directory = os.path.dirname(os.path.abspath(graph_path))
    # snap.py doesn't support absolute paths for some operations. Let's cd to the directory
    os.chdir(directory)

    data_file = graph_name + "_pageranks"
    prank_hashtable = snap.TIntFltH()
    if not os.path.isfile(data_file):
        # Damping factor: 0.85, convergence difference: 1e-4, MaxIter: 100
        snap.GetPageRank(graph, prank_hashtable, 0.85)
        fout = snap.TFOut(data_file)
        prank_hashtable.Save(fout)
    else:
        fin = snap.TFIn(data_file)
        prank_hashtable.Load(fin)

    top_n = get_top_nodes_from_hashtable(prank_hashtable, n)
    top_n.sort(key=itemgetter(1))
    top_packages = []
    for pair in top_n:
        top_packages.append(id_pkg_dict[pair[0]])
    return top_packages