def test_scipy_pagerank(self):
    G = self.G
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
    for n in G:
        assert_almost_equal(p[n], G.pagerank[n], places=4)
    personalize = dict((n, random.random()) for n in G)
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08,
                                personalization=personalize)
def test_scipy_pagerank(self):
    G = self.G
    p = nx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
    for n in G:
        assert p[n] == pytest.approx(G.pagerank[n], abs=1e-4)
    personalize = {n: random.random() for n in G}
    p = nx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08,
                          personalization=personalize)
    nstart = {n: random.random() for n in G}
    p = nx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, nstart=nstart)
    for n in G:
        assert p[n] == pytest.approx(G.pagerank[n], abs=1e-4)
def test_scipy_pagerank(self):
    G = self.G
    try:
        import scipy
    except ImportError:
        raise SkipTest("scipy not available.")
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
    for n in G:
        assert_almost_equal(p[n], G.pagerank[n], places=4)
    personalize = dict((n, random.random()) for n in G)
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08,
                                personalization=personalize)
    assert_raises(networkx.NetworkXError, networkx.pagerank_scipy, G,
                  max_iter=0)
def test_scipy_pagerank(self):
    G = self.G
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
    for n in G:
        assert almost_equal(p[n], G.pagerank[n], places=4)
    personalize = {n: random.random() for n in G}
    p = networkx.pagerank_scipy(
        G, alpha=0.9, tol=1.0e-08, personalization=personalize
    )
    nstart = {n: random.random() for n in G}
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, nstart=nstart)
    for n in G:
        assert almost_equal(p[n], G.pagerank[n], places=4)
def test_scipy_pagerank(self):
    G = self.G
    try:
        import scipy
    except ImportError:
        raise SkipTest('scipy not available.')
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
    for n in G:
        assert_almost_equal(p[n], G.pagerank[n], places=4)
    personalize = dict((n, random.random()) for n in G)
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08,
                                personalization=personalize)
    assert_raises(networkx.NetworkXError, networkx.pagerank_scipy, G,
                  max_iter=0)
def ExtractSentence(text, k):
    """Rank the sentences of the text by importance."""
    # note: k is currently unused; the cut-off comes from get_sum_sents
    print('starting sentence importance ranking')
    sent_tokens = nlp.sent_tokenize(text)
    # possible extension: drop sentences whose entity count is below a
    # threshold, etc.
    sent_tokens = filter_sent(sent_tokens, 1)
    # build the graph structure
    text_graph = graph_construct(sent_tokens)
    # three pagerank variants are available: the plain implementation, a
    # numpy one, and the scipy sparse-matrix version used below
    print('start to calculate')
    # cal_gr_page_rank = nx.pagerank(text_graph, weight='weight')
    cal_gr_page_rank = nx.pagerank_scipy(text_graph)
    print('ended')
    # sort by final score and keep the top K; could be extended to take
    # sentences totalling no more than 250 words
    sents = sorted(cal_gr_page_rank, key=cal_gr_page_rank.get, reverse=True)
    kth = get_sum_sents(sents, 250)
    # topK
    str_tmp_list = []
    for sidx in range(kth):
        str_tmp = sents[sidx]
        str_tmp += '[%.4f]' % (cal_gr_page_rank[sents[sidx]])
        str_tmp_list.append(str_tmp)
    print_score(str_tmp_list)
    return ' '.join(sents[:kth])
def candidate_weighting(self, threshold=0.25, method='average'):
    """ Candidate weight calculation using random walk.

        Args:
            threshold (float): the minimum similarity for clustering,
                defaults to 0.25.
            method (str): the linkage method, defaults to average.
    """
    # cluster the candidates
    self.topic_clustering(threshold=threshold, method=method)

    # build the topic graph
    self.build_topic_graph()

    # compute the word scores using random walk
    w = nx.pagerank_scipy(self.graph)

    # loop through the topics
    for i, topic in enumerate(self.topics):
        # get the first occurring candidate from the topic
        offsets = [self.candidates[t].offsets[0] for t in topic]
        first = offsets.index(min(offsets))
        self.weights[topic[first]] = w[i]
def compute_centrality(star_dict, edge_dict):
    # build up an nx graph
    galaxy = networkx.Graph()
    for v, vertex in star_dict.items():
        galaxy.add_node(v)

    for v, neighbors in edge_dict.items():
        for n in neighbors:
            galaxy.add_edge(v, n)

    print("betweenness")
    betweenness_map = networkx.current_flow_betweenness_centrality(galaxy)
    betweenness_map = normalize(betweenness_map)
    for key, value in betweenness_map.items():
        star_dict[key]['betweenness'] = value

    print("closeness")
    closeness_map = networkx.current_flow_closeness_centrality(galaxy)
    closeness_map = normalize(closeness_map)
    for key, value in closeness_map.items():
        star_dict[key]['closeness'] = value

    print("pagerank")
    pagerank_map = networkx.pagerank_scipy(galaxy)
    pagerank_map = normalize(pagerank_map)
    for key, value in pagerank_map.items():
        star_dict[key]['pagerank'] = value
def main():
    print('- updating pagerank :')

    # DB-CONNECT
    conn = sqlite3.connect(db_path)
    c = conn.cursor()

    # DB-EXECUTE
    # get subgraph
    r = c.execute("SELECT blog_name, source_title FROM subgraph")
    graph = {key: value.split() for (key, value) in r}
    G = nx.DiGraph(graph)
    pr = nx.pagerank_scipy(G, alpha=0.85)

    # normalise
    ranks = list(pr.values())
    rank_min, rank_max = min(ranks), max(ranks)
    for k in pr:
        pr[k] = round(((pr[k] - rank_min) / (rank_max - rank_min)), 4)

    # update table
    for blog in pr:
        c.execute("UPDATE tumblr_model SET pagerank=? WHERE blog_name=?",
                  [pr[blog], blog])

    # DB-COMMIT AND CLOSE
    conn.commit()
    conn.close()

    # sorting, optional
    pr_sorted = sorted(pr.items(), key=operator.itemgetter(1))
    print(" %s is the most popular domain in the network" % pr_sorted[-1][0])
    print('')
def candidate_weighting(self, window=10, pos=None, normalized=False):
    """ Candidate weight calculation using random walk.

        Args:
            window (int): the window within the sentence for connecting two
                words in the graph, defaults to 10.
            pos (set): the set of valid pos for words to be considered as
                nodes in the graph, defaults to (NN, NNS, NNP, NNPS, JJ,
                JJR, JJS).
            normalized (False): normalize keyphrase score by their length,
                defaults to False.
    """
    # define default pos tags set
    if pos is None:
        pos = set(['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

    # build the word graph
    self.build_word_graph(window=window, pos=pos)

    # compute the word scores using random walk
    w = nx.pagerank_scipy(self.graph)

    # loop through the candidates
    for k in self.candidates.keys():
        tokens = self.candidates[k].lexical_form
        self.weights[k] = sum([w[t] for t in tokens])
        if normalized:
            self.weights[k] /= len(tokens)
def get_keyphrases(self, document, speakers=None, use_main=False, topX=5,
                   maxlen=50, include_scores=False):
    """
    Get keyphrases from a document using LexRank
    Speakers, use_main use case similar to in keynet.py
    """
    self.document = document
    if speakers:
        main, others = parse_doc_speakers(self.document, speakers)
        if use_main:
            self.document = main
        else:
            self.document = others
    self.init_counts()
    network_graph = self._build_graph()
    # pagerank returns {sentence: score}
    ranked = nx.pagerank_scipy(network_graph)
    ranked = [(text, score) for text, score in ranked.items()]
    sort_ranked = sorted(ranked, key=lambda t: t[1], reverse=True)
    if maxlen:
        sort_ranked = [t for t in sort_ranked if len(t[0].split()) < maxlen]
    if not include_scores:
        sort_ranked = [s[0] for s in sort_ranked]
    return sort_ranked[:topX]
def candidate_weighting(self, window=10, pos=None, normalized=False):
    """ Candidate weight calculation using random walk.

        Args:
            window (int): the window within the sentence for connecting two
                words in the graph, defaults to 10.
            pos (set): the set of valid pos for words to be considered as
                nodes in the graph, defaults to (NN, NNS, NNP, NNPS, JJ,
                JJR, JJS).
            normalized (False): normalize keyphrase score by their length,
                defaults to False.
    """
    # define default pos tags set
    if pos is None:
        pos = set(['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

    # build the word graph
    self.build_word_graph(window=window, pos=pos)

    # compute the word scores using random walk
    w = nx.pagerank_scipy(self.graph, alpha=0.85, weight='weight')

    # loop through the candidates
    for k in self.candidates.keys():
        tokens = self.candidates[k].lexical_form
        self.weights[k] = sum([w[t] for t in tokens])
        if normalized:
            self.weights[k] /= len(tokens)
def findBestChilds(self, nodes, k=4):
    n = len(nodes)
    node_list = dict()
    i = 0
    for node in nodes:
        node_list[i] = node
        i += 1
    self.stateGraph = np.zeros(shape=(n, n), dtype=np.byte)
    [self.buildSubGraph(i, n, node_list) for i in range(n)]
    important = []  # keep defined even if the ranking below fails
    try:
        self.logger.debug(len(self.stateGraph))
        h = nx.pagerank_scipy(nx.Graph(self.stateGraph), max_iter=100,
                              tol=1e-07)
        res = list(sorted(h, key=h.__getitem__, reverse=True))
        important = res[:k]
    except Exception:
        self.logger.error('Graph is empty')
        self.logger.error(sys.exc_info())
    dereffed_list = set([self.sub(i, node_list) for i in important])
    dereffed_list.discard(0)
    dereffed_list.discard(1)
    return list(dereffed_list)
def textrank_tagger(tokenized, w2v_model):
    """
    TextRank based on cosine similarity between TF-IDF-reweighted w2v
    sentence sums.
    """
    idf_weights = idf_weight_dict(tokenized)
    original_indices, sent_representations = w2v_sentence_sums_tfidf(
        tokenized, w2v_model, idf_weights)
    distance_matrix = pairwise_distances(sent_representations,
                                         metric='cosine')
    similarity_matrix = np.subtract(1, distance_matrix)

    # Use PageRank algorithm on similarity matrix
    nx_graph = nx.from_numpy_matrix(similarity_matrix)
    # Convergence of graph (tolerance from TextRank paper)
    scores = nx.pagerank_scipy(nx_graph, max_iter=100, tol=1e-04)

    # For now, the number of summary-worthy sentences is set to ~33% of
    # the sentences.
    cutoff = len(tokenized) // 3
    sorted_sentences = sorted([(scores[i], original_indices[i])
                               for i, s in enumerate(tokenized)],
                              reverse=True)
    summary_indices = [index for score, index in sorted_sentences[:cutoff]]
    labels = [
        1 if i in summary_indices else 0 for i, _ in enumerate(tokenized)
    ]
    return labels
def text_summary(doc, sent_count):
    """
    Summarizes given text using word vectors and graph-based ranking.

    Args:
        doc: a spacy.Doc object
        sent_count: number (/ratio) of sentences in the summary

    Returns:
        Text summary
    """
    sents = list(doc.sents)
    sent_graph = networkx.Graph()
    sent_graph.add_nodes_from(idx for idx, sent in enumerate(sents))

    # nodes() replaces the nodes_iter() of networkx 1.x
    for i, j in it.combinations(sent_graph.nodes(), 2):
        # Calculate cosine similarity of two sentences transformed to the
        # interval [0,1]
        similarity = (sents[i].similarity(sents[j]) + 1) / 2
        if similarity != 0:
            sent_graph.add_edge(i, j, weight=similarity)

    sent_ranks = networkx.pagerank_scipy(sent_graph)

    if 0 < sent_count < 1:
        sent_count = round(sent_count * len(sent_ranks))
    sent_count = int(sent_count)
    top_indices = top_keys(sent_count, sent_ranks)

    # Return the key sentences in chronological order
    top_sents = map(lambda i: sents[i], sorted(top_indices))
    return format_output(doc, list(top_sents))
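# Usage sketch for text_summary (illustrative, not from the source; assumes
# the module's imports (`networkx`, `itertools as it`) and helpers
# (top_keys, format_output) are in scope, and that a spaCy model with word
# vectors such as 'en_core_web_md' is installed).
def _demo_text_summary():
    import spacy
    nlp = spacy.load('en_core_web_md')
    doc = nlp('Graphs model pairwise relations. '
              'PageRank ranks nodes by importance. '
              'Cats sleep most of the day.')
    print(text_summary(doc, sent_count=1))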
def get_keyphrases(self, document, include_scores=False, maxlen=None):
    """
    Get keyphrases from a document using LexRank
    """
    # Incorporate the document being considered
    self.document = document
    # Initialize document counts, tf-idf scores
    self.init_counts()
    # Build graph of sentences, edges are cosine similarities
    network_graph = self._build_graph()
    # Run PageRank on the graph; pagerank returns {sentence: score}
    ranked = nx.pagerank_scipy(network_graph)
    ranked = [(text, score) for text, score in ranked.items()]
    # Sort results by score
    sort_ranked = sorted(ranked, key=lambda t: t[1], reverse=True)
    # Keep only results up to some maximum length in tokens
    if maxlen:
        sort_ranked = [
            t for t in sort_ranked if len(t[0].split()) < maxlen
        ]
    # For outputting without scores
    if not include_scores:
        sort_ranked = [s[0] for s in sort_ranked]
    return sort_ranked
def candidate_weighting(self, doc: Doc) -> List[Tuple[Candidate, float]]:
    """Compute the weighted score of each keyword candidate.

    Args:
        doc (Doc): doc.

    Returns:
        list of tuples, candidate with a score.
    """
    res = []
    C = doc._.kw_candidates
    G = self.build_graph(doc)
    W = nx.pagerank_scipy(G, alpha=self.cfg["alpha"], tol=self.cfg["tol"],
                          weight="weight")
    for i, topic in nx.get_node_attributes(G, "C").items():
        offsets = [C[t].offsets[0] for t in topic]
        if self.cfg["heuristic"] == "frequent":
            freq = [len(C[t].surface_forms) for t in topic]
            indexes = [j for j, f in enumerate(freq) if f == max(freq)]
            indexes_offsets = [offsets[j] for j in indexes]
            most_frequent = offsets.index(min(indexes_offsets))
            res.append((C[topic[most_frequent]], W[i]))
        else:
            first = offsets.index(min(offsets))
            res.append((C[topic[first]], W[i]))
    res.sort(key=lambda x: x[1], reverse=True)
    return res
def test_rank_time(self):
    from pygrank.algorithms.pagerank import PageRank as ranker
    from pygrank.algorithms.utils import preprocessor
    import scipy.stats
    nx_time = list()
    test_time = list()
    repeats = 50
    for _ in range(repeats):
        G = create_test_graph()
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # portable replacement
        tic = time.perf_counter()
        ranker(to_scipy=preprocessor('col')).rank(G)
        test_time.append(time.perf_counter() - tic)
        tic = time.perf_counter()
        nx.pagerank_scipy(G)
        nx_time.append(time.perf_counter() - tic)
    self.assertLessEqual(
        scipy.stats.ttest_ind(nx_time, test_time)[1], 0.001,
        msg="PageRank time comparable to nx with p-value<0.001")
def pre_calculate(X, k=100, ntop=50, calculate_important=None):
    """ Calculate the k-nearest neighbors matrix
        Calculate Hubs or Pagerank for each point
    """
    from sklearn.neighbors import NearestNeighbors
    model = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
    model.fit(X)
    distances, indices = model.kneighbors()

    if calculate_important is None:
        top_important = []
    else:
        nn = model.kneighbors_graph(mode='distance')
        g = nx.from_scipy_sparse_matrix(nn)
        if calculate_important == 'pagerank':
            pageranks = nx.pagerank_scipy(g)
            top_important = sorted(pageranks, key=pageranks.get,
                                   reverse=True)
        elif calculate_important == 'hubs':
            hubs, authorities = nx.hits_scipy(g)
            top_important = sorted(hubs, key=hubs.get, reverse=True)

    return {
        'distances': distances.tolist(),
        'neighbors': list(map(lambda s: list(map(str, s)), indices)),
        'importantPoints': list(map(str, top_important[:ntop])),
        'infoMsg': 'Dataset size: {}'.format(X.shape)
    }
def candidate_weighting(self, window=10, pos=None, normalized=False):
    """Keyphrase candidate ranking using the weighted variant of the
    TextRank formulae. Candidates are scored by the sum of the scores of
    their words.

    Args:
        window (int): the window within the sentence for connecting two
            words in the graph, defaults to 10.
        pos (set): the set of valid pos for words to be considered as
            nodes in the graph, defaults to ('NOUN', 'PROPN', 'ADJ').
        normalized (False): normalize keyphrase score by their length,
            defaults to False.
    """
    if pos is None:
        pos = {'NOUN', 'PROPN', 'ADJ'}

    # build the word graph
    self.build_word_graph(window=window, pos=pos)

    # compute the word scores using random walk
    w = nx.pagerank_scipy(self.graph, alpha=0.85, tol=0.0001,
                          weight='weight')

    # loop through the candidates
    for k in self.candidates.keys():
        tokens = self.candidates[k].lexical_form
        self.weights[k] = sum([w[t] for t in tokens])
        if normalized:
            self.weights[k] /= len(tokens)
        # use position to break ties
        self.weights[k] += self.candidates[k].offsets[0] * 1e-8
def run_pagerank(self):
    self.extract_filenamedict()
    self.get_corr_matrix(self.corr_method)
    pid_filter = self.get_pid_filter()
    adj_mat = np.multiply(self.corr_mat, pid_filter)
    adj_mat_values = adj_mat.values

    st = time.time()
    g = nx.DiGraph()
    for i, ni in enumerate(adj_mat.index):
        # print(i)
        for j, nj in enumerate(adj_mat.columns):
            if adj_mat_values[i][j] != 0:
                g.add_edge(ni, nj, weight=adj_mat_values[i][j])
    cost = time.time() - st
    print('Time Cost to build the graph:', cost)

    # run pagerank
    st = time.time()
    self.pr = nx.pagerank_scipy(g, alpha=self.alpha,
                                personalization=self.filename2score,
                                max_iter=300, tol=1.0e-12)
    # print(pr)
    print('Time Cost to run pagerank:', time.time() - st)
def get_summary(self, docs, topK=5, stopwords=None, with_importance=False,
                standard_name=True):
    import networkx as nx

    def sent_sim1(words1, words2):
        if len(words1) <= 1 or len(words2) <= 1:
            return 0.0
        return (len(set(words1) & set(words2))) / (np.log2(len(words1)) +
                                                   np.log2(len(words2)))

    # with standard_name, similarities can be computed on the
    # entity-linking results, which is more accurate
    sents = [
        self.seg(doc.strip(), standard_name=standard_name,
                 stopwords=stopwords)
        for doc in docs
    ]
    sents = [sent for sent in sents if len(sent) > 0]
    G = nx.Graph()
    for u, v in combinations(range(len(sents)), 2):
        G.add_edge(u, v, weight=sent_sim1(sents[u], sents[v]))

    pr = nx.pagerank_scipy(G)
    pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True)
    if with_importance:
        return [(docs[i], imp) for i, imp in pr_sorted[:topK]]
    else:
        return [docs[i] for i, rank in pr_sorted[:topK]]
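# Worked example for sent_sim1 above (illustrative, standalone): two
# sentences of 4 and 8 tokens sharing 3 tokens score
# 3 / (log2(4) + log2(8)) = 3 / 5 = 0.6, the classic TextRank
# sentence-similarity normalization.
import numpy as np

def _demo_sent_sim1():
    words1 = ['a', 'b', 'c', 'd']
    words2 = ['a', 'b', 'c', 'e', 'f', 'g', 'h', 'i']
    sim = len(set(words1) & set(words2)) / (np.log2(len(words1))
                                            + np.log2(len(words2)))
    assert abs(sim - 0.6) < 1e-12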
def rooted_pagerank(G, root, alpha=0.85, beta=0, weight='weight'):
    """Return the rooted PageRank of all nodes with respect to node `root`

    Parameters
    ----------
    G : a networkx.(Di)Graph
        network to compute PR on

    root : a node from the network
        the node that will be the starting point of all random walks

    alpha : float
        PageRank probability that we will advance to a neighbour of the
        current node in a random walk

    beta : float or int
        Normally, we return to the root node with probability 1 - alpha.
        With this parameter, we can also advance to a random other node
        in the network with probability beta. Thus, we get back to the
        root node with probability 1 - alpha - beta. This is off (0) by
        default.

    weight : string or None
        The edge attribute that holds the numerical value used for the
        edge weight. If None then treat as unweighted.

    """
    personalization = dict.fromkeys(G, beta)
    personalization[root] = 1 - beta

    return networkx.pagerank_scipy(G, alpha, personalization, weight=weight)
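# Quick sanity check for rooted_pagerank on a toy graph (illustrative, not
# part of the original module; assumes the module's `import networkx`).
def _demo_rooted_pagerank():
    G = networkx.karate_club_graph()
    pr = rooted_pagerank(G, root=0)
    assert abs(sum(pr.values()) - 1.0) < 1e-6  # scores form a distribution
    print(sorted(pr, key=pr.get, reverse=True)[:5])  # nodes nearest root 0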
def candidate_weighting(self, threshold=0.74, method='average', alpha=1.1):
    """ Candidate weight calculation using random walk.

        Args:
            threshold (float): the minimum similarity for clustering,
                defaults to 0.74.
            method (str): the linkage method, defaults to average.
            alpha (float): hyper-parameter that controls the strength of
                the weight adjustment, defaults to 1.1.
    """
    # cluster the candidates
    self.topic_clustering(threshold=threshold, method=method)

    # build the topic graph
    self.build_topic_graph()

    if alpha > 0.0:
        self.weight_adjustment(alpha)

    # compute the word scores using random walk
    self.weights = nx.pagerank_scipy(self.graph)
def findBestChilds(self, nodes, k=4):
    n = len(nodes)
    node_list = dict()
    i = 0
    for node in nodes:
        node_list[i] = node
        i += 1
    self.stateGraph = np.zeros(shape=(n, n), dtype=np.byte)
    [self.buildSubGraph(i, n, node_list) for i in range(n)]
    important = []  # keep defined even if the ranking below fails
    try:
        self.logger.debug(len(self.stateGraph))
        h = nx.pagerank_scipy(nx.Graph(self.stateGraph), max_iter=100,
                              tol=1e-07)
        res = list(sorted(h, key=h.__getitem__, reverse=True))
        important = res[:k]
    except Exception:
        self.logger.error('Graph is empty')
        self.logger.error(sys.exc_info())
    dereffed_list = set([self.sub(i, node_list) for i in important])
    if len(dereffed_list) > 1:
        dereffed_list.discard(0)
        dereffed_list.discard(1)
    return list(dereffed_list)
def create_node_embeddings(graph, neighborhoods):
    """
    Creates node "embeddings" based on the degrees of neighboring vertices
    and approximate centrality measures
    """
    num_nodes = graph.number_of_nodes()
    embeddings = np.zeros(shape=(num_nodes, 2 * (len(neighborhoods) - 1) + 2),
                          dtype=float)

    eigen = nx.eigenvector_centrality_numpy(graph)
    pagerank = nx.pagerank_scipy(graph, alpha=0.85)

    out_neighbors = []
    in_neighbors = []
    for i in range(1, len(neighborhoods)):
        out_neighbors.append(neighborhoods[i].sum(axis=1, dtype=float))
        in_neighbors.append(neighborhoods[i].sum(axis=0, dtype=float))

    for i, node in enumerate(graph.nodes()):
        for j in range(len(out_neighbors)):
            embeddings[i][2*j] = out_neighbors[j][i, 0] / graph.number_of_nodes()
            embeddings[i][2*j+1] = in_neighbors[j][0, i] / graph.number_of_nodes()
        embeddings[i][-2] = eigen[node]
        embeddings[i][-1] = pagerank[node]

    return np.array(embeddings)
def embed(self, network):
    """
    Create an embedding of the network

    Args:
        network (scipy sparse matrix): Sparse network adjacency matrix.

    Returns:
        scipy sparse matrix: Symbolic node embedding.
    """
    logging.info("Generating and hashing random walks")
    hashes = self.generate_walk_hashes(network)

    # Rank nodes
    pagerank_scores = nx.pagerank_scipy(
        nx.from_scipy_sparse_matrix(network))
    ranked_features = np.argsort(
        [pagerank_scores[i] for i in range(len(pagerank_scores))])[::-1]

    logging.info("Generating similarity matrix")
    if self.fixed_dimension:
        embedding = self.generate_similarity_fixed(
            hashes, ranked_features[:min(self.dimension, network.shape[0])])
    else:
        embedding = self.generate_similarity_matrix(
            hashes, ranked_features).tocsr()

    # Check that the embedding size is less than tau
    assert (not sparse.issparse(embedding)) or len(
        embedding.data) <= (self.dimension * network.shape[0])
    logging.info("Embedding done")
    return embedding
def documentPagerankPrediction(G, dataset, synsets_dictionary):
    predicted = []
    for d in dataset:
        pre = []
        near = set()
        to_add = {}
        for l in d:
            near.update(synsets_dictionary[l].keys())
            to_add.update({l: synsets_dictionary[l]})
        TG = extendGraph(G, to_add, document_graph=False)
        pr = nx.pagerank_scipy(TG, personalization={n: 1 for n in near})
        for l in d:
            max_prob = 0
            best_syn = 0
            for synsets in synsets_dictionary[l].keys():
                rank = pr[synsets]
                if rank > max_prob:
                    max_prob = rank
                    best_syn = synsets
            if best_syn == 0:
                best_syn = np.random.choice(list(near))
            assert best_syn != 0
            pre.append(best_syn)
        predicted.append(pre)
    return predicted
def test_pagerank_by_hand():
    graph = Graph('gr', 'gr.xml', 4)
    graph.add_node(Node(0, LabelNodeLetter(0, 0)))
    graph.add_node(Node(1, LabelNodeLetter(0, 0)))
    graph.add_node(Node(2, LabelNodeLetter(0, 0)))
    graph.add_node(Node(3, LabelNodeLetter(0, 0)))

    graph.add_edge(Edge(0, 1, LabelEdge(0)))
    graph.add_edge(Edge(1, 2, LabelEdge(0)))
    graph.add_edge(Edge(2, 3, LabelEdge(0)))

    pagerank = PageRank()
    results = pagerank.calc_centrality_score(graph)
    results = np.asarray(results)

    graph2 = nx.Graph()
    graph2.add_node(1)
    graph2.add_node(2)
    graph2.add_node(3)
    graph2.add_node(4)
    graph2.add_edge(1, 2)
    graph2.add_edge(2, 3)
    graph2.add_edge(3, 4)

    expected = np.array([val for _, val in nx.pagerank_scipy(graph2).items()])

    print(results)
    assert np.linalg.norm(results - expected) < 1e-6
def get_summary(self, docs, topK=5, stopwords=None, with_importance=False,
                standard_name=True, maxlen=None, avoid_repeat=False):
    """
    Extract the key sentences of a text with the TextRank algorithm.

    :param docs: list of str sentences
    :param topK: number of sentences to pick; if maxlen is set, the length
        limit takes precedence
    :param stopwords: stopwords used by the algorithm
    :param with_importance: whether to also return the sentence importance
        computed by the algorithm
    :param standard_name: if an entity_mention_list is available, normalize
        entity names in the algorithm, which usually helps accuracy
    :param maxlen: upper bound on the summary length in characters; the
        algorithm stops once the limit is reached, even before topK
        sentences are selected
    :param avoid_repeat: use the MMR principle to penalize sentences that
        overlap with the already extracted summary, avoiding repetition
    :return: list of sentences, or list of (sentence, score) pairs when
        with_importance=True
    """
    assert topK > 0
    import networkx as nx
    maxlen = float('inf') if maxlen is None else maxlen
    # with standard_name, similarities can be computed on the
    # entity-linking results, which is more accurate
    sents = [self.seg(doc.strip(), standard_name=standard_name,
                      stopwords=stopwords) for doc in docs]
    sents = [sent for sent in sents if len(sent) > 0]
    G = nx.Graph()
    for u, v in combinations(range(len(sents)), 2):
        G.add_edge(u, v, weight=sent_sim_textrank(sents[u], sents[v]))

    pr = nx.pagerank_scipy(G)
    pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True)
    if not avoid_repeat:
        ret = []
        curr_len = 0
        for i, imp in pr_sorted[:topK]:
            curr_len += len(docs[i])
            if curr_len > maxlen:
                break
            ret.append((docs[i], imp) if with_importance else docs[i])
        return ret
    else:
        assert topK <= len(sents)
        ret = []
        curr_len = 0
        curr_sumy_words = []
        candidate_ids = list(range(len(sents)))
        i, imp = pr_sorted[0]
        curr_len += len(docs[i])
        if curr_len > maxlen:
            return ret
        ret.append((docs[i], imp) if with_importance else docs[i])
        curr_sumy_words.extend(sents[i])
        candidate_ids.remove(i)
        for _ in range(topK - 1):
            importance = [pr[i] for i in candidate_ids]
            norm_importance = scipy.special.softmax(importance)
            redundancy = np.array([sent_sim_cos(curr_sumy_words, sents[i])
                                   for i in candidate_ids])
            scores = 0.6 * norm_importance - 0.4 * redundancy
            id_in_cands = np.argmax(scores)
            i, imp = candidate_ids[id_in_cands], importance[id_in_cands]
            curr_len += len(docs[i])
            if curr_len > maxlen:
                return ret
            ret.append((docs[i], imp) if with_importance else docs[i])
            curr_sumy_words.extend(sents[i])
            del candidate_ids[id_in_cands]
        return ret
def create_features(self, G_train, edge_bunch):
    i = 0
    X = []
    page_rank = nx.pagerank_scipy(G_train)
    for pair in edge_bunch:
        common_neighbors = len(
            list(nx.common_neighbors(G_train, pair[0], pair[1])))
        jaccard_coefficient = next(
            nx.jaccard_coefficient(G_train, [pair]))[2]
        adamic_adar = next(nx.adamic_adar_index(G_train, [pair]))[2]
        degree_0 = nx.degree(G_train, pair[0])
        degree_1 = nx.degree(G_train, pair[1])
        prod = degree_0 * degree_1
        page_rank_0 = page_rank[pair[0]]
        page_rank_1 = page_rank[pair[1]]

        f = [
            degree_0,
            degree_1,
            prod,
            common_neighbors,
            jaccard_coefficient,
            adamic_adar,
            page_rank_0,
            page_rank_1,
        ]
        X.append(f)

        i += 1
        if i % 1000000 == 0:
            print(i)

    return np.array(X)
def random_walk_word_scoring(self):
    """Compute a random walk ranking on the words using the power method.
    """
    G = nx.Graph()

    # loop through the sentences to build the graph
    for i, sentence in enumerate(self.sentences):
        nodes = set([])
        for words, offset in sentence.candidates:
            for w in words:
                nodes.add(w)

        # add the missing nodes to the graph
        for node in nodes:
            if node not in G:
                G.add_node(node)

        # add the edges to the graph
        for n1, n2 in combinations(nodes, 2):
            if not G.has_edge(n1, n2):
                G.add_edge(n1, n2, weight=0)
            G[n1][n2]['weight'] += 1.0

    # return the random walk scores
    return self.normalize(nx.pagerank_scipy(G))
def calculate_weighted_page_rank(
    graph: Union[nx.MultiDiGraph, nx.MultiGraph, nx.Graph, nx.DiGraph],
    weight: str = "weight",
) -> OrderedDict:
    """
    Calculate Page Rank for ARRG.

    Parameters
    ----------
    graph : networkx.DiGraph
        Graph of aspect-aspect relation ARRG.

    weight : str, optional
        Name of the edge attribute that holds the weight of an edge. It is
        used to calculate the weighted version of Page Rank.

    Returns
    -------
    page_ranks : OrderedDict
        Page Rank values for ARRG.
    """
    logger.info("Weighted Page Rank calculation starts.")
    page_ranks = nx.pagerank_scipy(graph, weight=weight)
    logger.info("Weighted Page Rank calculation ended.")
    return OrderedDict(
        sorted(page_ranks.items(), key=itemgetter(1), reverse=True))
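# Illustrative call of calculate_weighted_page_rank on a toy aspect graph
# (not from the source; aspect names and weights are invented, and the
# module-level imports above are assumed).
def _demo_weighted_page_rank():
    g = nx.DiGraph()
    g.add_edge('screen', 'phone', weight=3.0)
    g.add_edge('battery', 'phone', weight=5.0)
    ranks = calculate_weighted_page_rank(g, weight='weight')
    print(next(iter(ranks)))  # 'phone': both weighted edges point at it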
def test_empty_scipy(self):
    try:
        import scipy
    except ImportError:
        raise SkipTest('scipy not available.')
    G = networkx.Graph()
    assert_equal(networkx.pagerank_scipy(G), {})
def test_empty_scipy(self):
    try:
        import scipy
    except ImportError:
        raise SkipTest("scipy not available.")
    G = networkx.Graph()
    assert_equal(networkx.pagerank_scipy(G), {})
def candidate_weighting(self, doc: Doc) -> List[Tuple[Candidate, float]]:
    """Compute the weighted score of each keyword candidate.

    Args:
        doc (Doc): doc.

    Returns:
        list of tuples, candidate with a score.
    """
    res = []
    G = self.build_graph(doc)
    W = nx.pagerank_scipy(G, alpha=self.cfg["alpha"], tol=self.cfg["tol"])
    for candidate in doc._.kw_candidates:
        chunk_len = len(candidate.lexical_form)
        non_lemma = 0
        rank = 0.0
        for t in candidate.lexical_form:
            if t in W:
                rank += W[t]
            else:
                non_lemma += 1
        non_lemma_discount = chunk_len / (chunk_len + (2.0 * non_lemma) + 1.0)
        candidate_w = np.sqrt(rank / (chunk_len + non_lemma)) * non_lemma_discount
        candidate_w += candidate.offsets[0] * 1e-8  # break ties according to position in text
        res.append((candidate, candidate_w))
    res.sort(key=lambda x: x[1], reverse=True)
    return res
def generate_global_zscore(full_graph: pd.DataFrame, edgelist: pd.DataFrame,
                           path: str, flag=False):
    """
    Generate a dictionary with the z-scores of all movies. If flag is True,
    generate the file; otherwise only read it.

    :param full_graph: full graph of the movie dataset
    :param edgelist: edge list used, together with the movie-object edges,
        to build the graph
    :param path: path to save the generated DataFrame
    :param flag: True to generate the file of the DataFrame with global
        z-scores, False to read it
    :return: dictionary with prop and obj keys and count and z-scores as
        columns
    """
    if flag:
        full_slice = full_graph[['prop', 'obj']]
        full_split_dfs = pd.DataFrame()
        copy = full_graph.copy()
        copy['origin'] = ['M' + x for x in copy.index.astype(str)]
        copy['destination'] = copy['obj']
        full_edgelist = pd.concat([edgelist,
                                   copy[['origin', 'destination']]])

        # create graph
        G = nx.from_pandas_edgelist(full_edgelist, 'origin', 'destination')
        pr_np = nx.pagerank_scipy(G, max_iter=1000)

        for prop in full_slice['prop'].unique():
            df_prop = full_slice[full_slice['prop'] == prop]
            df_gzscore = df_prop.copy()
            df_gzscore['count'] = \
                df_prop.groupby(by='obj')['obj'].transform('count')
            df_gzscore['global_zscore'] = \
                (df_gzscore['count'] - df_gzscore['count'].mean()) \
                / df_gzscore['count'].std()
            full_split_dfs = pd.concat([full_split_dfs, df_gzscore])

        full_split_dfs['pr'] = full_split_dfs.apply(
            lambda x: pr_np[x['obj']], axis=1)
        full_split_dfs['pr_zscore'] = \
            (full_split_dfs['pr'] - full_split_dfs['pr'].mean()) \
            / full_split_dfs['pr'].std()
        full_split_dfs.to_csv(path, mode='w', header=True, index=False)

    return pd.read_csv(
        path,
        usecols=['prop', 'obj', 'count', 'global_zscore', 'pr', 'pr_zscore']
    ).set_index(['prop', 'obj']).to_dict()
def graph_stats(self, n):
    stats = {}
    stats['Top'] = self.top_nodes(n + 1)
    stats['Pagerank'] = nx.pagerank_scipy(self.G)
    stats['Pagerank'] = sorted(stats['Pagerank'].items(),
                               key=itemgetter(1), reverse=True)[0:n + 1]
    stats['Articulation Points'] = list(
        nx.articulation_points(self.G.to_undirected()))
    stats['Histogram'] = self.degree_histogram()[1:26]
    return stats
def networkx_algo():
    import networkx as nx
    beta = GlobalPara.beta
    edges = LoadEdges()
    G = nx.DiGraph(edges)
    # print(G.edges())
    pagerank_dict = nx.pagerank_scipy(G, alpha=beta)
    print(pagerank_dict[99])
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9):
    """
    compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )

    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    """
    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric="cosine")

    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        weight = sim_mat[i, j] if continuous else 1.0
        # keyword form; the positional attr_dict argument was removed in
        # networkx 2.x
        graph.add_edge(i, j, weight=weight)

    scores = networkx.pagerank_scipy(graph, alpha=alpha, max_iter=1000)
    return scores, sim_mat
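# Usage sketch for lexrank (illustrative; assumes the module-level imports
# above and that tools.word_segmenter_ja can segment the sample sentences).
def _demo_lexrank():
    sentences = [u'こんにちは。', u'私の名前は飯沼です。', u'はじめまして。']
    scores, sim_mat = lexrank(sentences, continuous=True, alpha=0.9)
    best = max(scores, key=scores.get)  # index of the most central sentence
    print(sentences[best], scores[best])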
def test_scipy_pagerank(self):
    G = self.G
    try:
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
        for (a, b) in zip(p, self.G.pagerank):
            assert_almost_equal(a, b)
    except ImportError:
        print("Skipping pagerank_scipy test")
def test_scipy_pagerank(self):
    G = self.G
    try:
        import scipy
    except ImportError:
        raise SkipTest('scipy not available.')
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
    for n in G:
        assert_almost_equal(p[n], G.pagerank[n], places=4)
def test_scipy_pagerank(self):
    G = self.G
    try:
        import scipy
    except ImportError:
        raise SkipTest('scipy not available.')
    p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.e-08)
    for (a, b) in zip(p, self.G.pagerank):
        assert_almost_equal(a, b)
def compute(self, own_public_key):
    """
    Compute the reputation based on the data in the TrustChain database
    using the Temporal PageRank algorithm.
    """
    nodes = set()
    G = nx.DiGraph()

    for block in self.blocks:
        if block.link_sequence_number == UNKNOWN_SEQ \
                or block.type != 'tx_done' \
                or 'tx' not in block.transaction:
            continue  # Don't consider half interactions

        pubkey_requester = block.link_public_key
        pubkey_responder = block.public_key

        sequence_number_requester = block.link_sequence_number
        sequence_number_responder = block.sequence_number

        # In our market, we consider the amount of Bitcoin that have been
        # transferred from A -> B. For now, we assume that the value from
        # B -> A is of equal worth.
        value_exchange = block.transaction["tx"]["transferred"]["first"]["amount"]

        G.add_edge((pubkey_requester, sequence_number_requester),
                   (pubkey_requester, sequence_number_requester + 1),
                   contribution=value_exchange)
        G.add_edge((pubkey_requester, sequence_number_requester),
                   (pubkey_responder, sequence_number_responder + 1),
                   contribution=value_exchange)

        G.add_edge((pubkey_responder, sequence_number_responder),
                   (pubkey_responder, sequence_number_responder + 1),
                   contribution=value_exchange)
        G.add_edge((pubkey_responder, sequence_number_responder),
                   (pubkey_requester, sequence_number_requester + 1),
                   contribution=value_exchange)

        nodes.add(pubkey_requester)
        nodes.add(pubkey_responder)

    personal_nodes = [node1 for node1 in G.nodes()
                      if node1[0] == own_public_key]
    number_of_nodes = len(personal_nodes)
    if number_of_nodes == 0:
        return {}
    personalisation = {node_name: 1.0 / number_of_nodes
                       if node_name in personal_nodes else 0
                       for node_name in G.nodes()}

    try:
        result = nx.pagerank_scipy(G, personalization=personalisation,
                                   weight='contribution')
    except nx.NetworkXException:
        self._logger.info("Empty Temporal PageRank, returning empty scores")
        return {}

    sums = {}
    for interaction in result.keys():
        sums[interaction[0]] = sums.get(interaction[0], 0) + result[interaction]

    return sums
def test_empty(self):
    try:
        import numpy
    except ImportError:
        raise SkipTest('numpy not available.')
    G = networkx.Graph()
    assert_equal(networkx.pagerank(G), {})
    assert_equal(networkx.pagerank_numpy(G), {})
    assert_equal(networkx.pagerank_scipy(G), {})
    assert_equal(networkx.google_matrix(G).shape, (0, 0))
def OrigPagerank(self):
    """
    returns a 2d array containing the pagerank of the origin node for all
    edges

    probas = np.dot(
        np.array(nx.pagerank_scipy(self).values(), dtype=float).reshape(-1, 1),
        np.ones((1, self.number_of_nodes())))
    """
    try:
        return self.Orig(nx.pagerank_scipy(self))
    except Exception:
        # fall back to a uniform distribution if pagerank fails
        return self.Orig(np.ones(self.number_of_nodes(), dtype=float)
                         / self.number_of_nodes())
def TargPagerank(self):
    """
    returns a 2d array containing the pagerank of the target node for all
    edges

    probas = np.dot(
        np.ones((self.number_of_nodes(), 1)),
        np.array(nx.pagerank_scipy(self).values(), dtype=float).reshape(1, -1))
    """
    try:
        return self.Targ(nx.pagerank_scipy(self))
    except Exception:
        # fall back to a uniform distribution if pagerank fails
        return self.Targ(np.ones(self.number_of_nodes(), dtype=float)
                         / self.number_of_nodes())
def save_data(self, filename):
    """Output authors data to a CSV file."""
    logger = logging.getLogger("twitter.compute")
    with timed(logger.info, "computing pageranks"):
        npr = nx.pagerank_scipy(self.g_authors)
        wpr = nx.pagerank_scipy(self.g_weighted_authors)
        rt_npr = nx.pagerank_scipy(self.g_rt_authors)
        rt_wpr = nx.pagerank_scipy(self.g_rt_weighted_authors)

    logger = logging.getLogger("twitter.save")
    conn = self.engine.connect()
    with timed(logger.info, "saving authors to %r", filename):
        # text mode with newline="" is what csv.writer expects on Python 3
        with open(filename, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([
                "screen_name", "followers", "listed", "friends",
                "total_tweets", "tweets", "rtrank", "rtpercentile",
                "in_degree", "out_degree", "rt_in_degree", "rt_out_degree",
                "unweighted_pr", "weighted_pr",
                "rt_unweighted_pr", "rt_weighted_pr",
            ])

            def write_row(row):
                writer.writerow([str(x) if x is not None else ""
                                 for x in row])

            def format_pr(pr):
                return "%f" % pr

            for author_id in sorted(self.g_authors, key=lambda x: wpr[x],
                                    reverse=True):
                a = self.authors[author_id]
                write_row([
                    author_id, a.followers, a.listed, a.friends,
                    a.statuses, a.ntweets, a.rtrank, a.rtpercentile,
                    a.ninrefs, a.noutrefs, a.ninrts, a.noutrts,
                    format_pr(npr[author_id]), format_pr(wpr[author_id]),
                    format_pr(rt_npr[author_id]),
                    format_pr(rt_wpr[author_id]),
                ])
    conn.close()
def personalizedPageRank(self, rootID):
    personalize = dict((n, 0) for n in self.graph)
    personalize[rootID] = 1
    x = nx.pagerank_scipy(self.graph, alpha=0.15, tol=1.e-05,
                          personalization=personalize)
    sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
    count = 0
    result = ''
    for key in sorted_x:
        if not self.graph.has_edge(rootID, key[0]) and rootID != key[0]:
            count += 1
            result += rootID + ',' + key[0] + '\n'
            if count == 5:
                break
    return result
def top_k_with_score(k, g, p=None, alpha=0.85):
    '''
    k: the top-k
    g: networkx instance
    p: personalization dictionary
    '''
    pr = nx.pagerank_scipy(g, alpha=alpha, personalization=p)
    sorted_pr = sorted(pr.items(), key=itemgetter(1), reverse=True)
    if k:
        top_nodes = sorted_pr[:k]
    else:
        top_nodes = sorted_pr
    return top_nodes
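# Minimal usage sketch for top_k_with_score (illustrative; assumes the
# module's `import networkx as nx` and `from operator import itemgetter`).
def _demo_top_k_with_score():
    g = nx.path_graph(5)
    p = {n: (1.0 if n == 0 else 0.0) for n in g}  # bias the walk toward node 0
    print(top_k_with_score(3, g, p=p))  # top-3 (node, score) pairs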
def top_k(k, g, p=None):
    '''
    k: the top-k
    g: networkx instance
    p: personalization dictionary
    '''
    pr = nx.pagerank_scipy(g, personalization=p)
    sorted_pr = sorted(pr.items(), key=itemgetter(1), reverse=True)
    # materialize the map so the result can be sliced under Python 3
    sorted_nodes = list(map(itemgetter(0), sorted_pr))
    if k:
        top_nodes = sorted_nodes[:k]
    else:
        top_nodes = sorted_nodes
    return top_nodes
def candidate_weighting(self, threshold=0.74, method='average',
                        heuristic=None):
    """ Candidate weight calculation using random walk.

        Args:
            threshold (float): the minimum similarity for clustering,
                defaults to 0.74.
            method (str): the linkage method, defaults to average.
            heuristic (str): the heuristic for selecting the best candidate
                for each topic, defaults to first occurring candidate.
                Other options are 'frequent' (most frequent candidate,
                position is used for ties).
    """
    # cluster the candidates
    self.topic_clustering(threshold=threshold, method=method)

    # build the topic graph
    self.build_topic_graph()

    # compute the word scores using random walk
    w = nx.pagerank_scipy(self.graph)

    # loop through the topics
    for i, topic in enumerate(self.topics):
        # get the offsets of the topic candidates
        offsets = [self.candidates[t].offsets[0] for t in topic]

        # get first candidate from topic
        if heuristic == 'frequent':
            # get frequencies for each candidate within the topic
            freq = [len(self.candidates[t].surface_forms) for t in topic]

            # get the indexes of the most frequent candidates
            indexes = [j for j, f in enumerate(freq) if f == max(freq)]

            # offsets of the indexes; index into offsets so the position
            # of the earliest most frequent candidate maps back onto topic
            indexes_offsets = [offsets[j] for j in indexes]
            most_frequent = offsets.index(min(indexes_offsets))
            self.weights[topic[most_frequent]] = w[i]

        else:
            first = offsets.index(min(offsets))
            self.weights[topic[first]] = w[i]
def pagerank(self):
    """Compute pagerank centrality for words coded by Free Association.

    Returns
    -------
    pagerank : dict
        The association of each word to its pagerank. FA link weights are
        taken into account in the computation. Words with pagerank zero
        are removed from the dict.

    """
    # Assumes a directed weighted graph.
    logger.info('Computing FreeAssociation pagerank')
    pagerank = nx.pagerank_scipy(self._norms_graph, max_iter=10000,
                                 tol=1e-15, weight='weight')
    self._remove_zeros(pagerank)
    logger.info('Done computing FreeAssociation pagerank')
    return pagerank
def report(self):
    """Generate a LaTeX report, return as `str`."""
    metrics = pandas.DataFrame({
        # dict() materializes the networkx 2.x degree view
        'indegree': dict(self.network.in_degree()),
        'pagerank': networkx.pagerank_scipy(self.network, max_iter=200),
    })
    mean = metrics.mean()
    std = metrics.std()
    for field, series in self.metadata.items():
        successes = 0
        for node in random.sample(series.index.tolist(), 100):
            value = self.classify(node, field)
            try:
                if value == series.loc[node] or value in series.loc[node]:
                    successes += 1
            except Exception:
                pass
        print(field, successes)
def analyze_pagerank(graph, show_table=False, show_plot=False):
    """Run analysis on pagerank."""
    if not (show_table or show_plot):
        return  # expensive computation, skip if unnecessary
    indegrees = pandas.Series(dict(graph.in_degree()), name='indegree')
    pagerank = pandas.Series(networkx.pagerank_scipy(graph, max_iter=200),
                             name='pagescore')
    # sort_values replaces the long-removed DataFrame.sort
    table = (pandas.DataFrame({'indegree': indegrees})
             .sort_values(by='indegree', ascending=False))
    table['indegree_rank'] = pandas.Series(range(1, len(table) + 1),
                                           index=table.index)
    table = table.join(pagerank).sort_values(by='pagescore',
                                             ascending=False)
    table['page_rank'] = pandas.Series(range(1, len(table) + 1),
                                       index=table.index)
    slope, intercept, r_val, p_val, stderr = scipy.stats.linregress(
        table['pagescore'], table['indegree'])
    if show_table:
        print('pagescore and indegree have r == {}'.format(r_val))
        print(table.head(10))
def _gen_sim_scores(self, term_matrix, LR_method, pos_seed_vector,
                    neg_seed_vector, pos_weight, neg_weight):
    if LR_method == 'unbiased':
        # switch from distance to similarity measures here
        weights = -1 * (scipy.spatial.distance.pdist(
            term_matrix.toarray(), 'cosine') - 1)
        # check weights here and threshold them
        weights[weights < .2] = 0
        weights[numpy.isnan(weights)] = 0
        graph = networkx.from_numpy_matrix(
            scipy.spatial.distance.squareform(weights))
        scores = networkx.pagerank_scipy(graph, max_iter=5000, alpha=.85)
    elif LR_method == 'biased':
        weights = -1 * (scipy.spatial.distance.pdist(
            term_matrix.toarray(), 'cosine') - 1)
        # check weights here and threshold them
        weights[weights < .2] = 0
        nan2zero(weights)
        graph = networkx.from_numpy_matrix(
            scipy.spatial.distance.squareform(weights))
        # treat seeds with no nonzero entries as empty and return something
        # with the correct format
        if pos_seed_vector.nonzero()[0].size == 0:
            pos_seed_scores = numpy.zeros_like(neg_seed_vector)
        else:
            pos_seed_scores = baseline_scorer(term_matrix, pos_seed_vector)
        if neg_seed_vector.nonzero()[0].size == 0:
            neg_seed_scores = numpy.zeros_like(pos_seed_scores)
        else:
            neg_seed_scores = baseline_scorer(term_matrix, neg_seed_vector)
        # add a ballast to act against neg seed scores
        ballast = numpy.zeros_like(neg_seed_scores)
        ballast[neg_seed_scores == 0] = neg_weight
        seed_scores = (pos_seed_scores * pos_weight +
                       neg_seed_scores * neg_weight + ballast)
        scores = biased_lexrank.b_lexrank(graph, seed_scores,
                                          personalization='biased',
                                          alpha=.85, max_iter=5000,
                                          seed_weight=pos_weight)
    return scores