def get_random_walk_candidate_hashtags(train_df):
    # Assumes module-level imports (pickle, numpy as np, fast_pagerank's
    # pagerank) and a data_path string defined elsewhere in the module.
    ent_id = pickle.load(open(data_path + "ent_id.pkl", "rb"))
    id_hash = pickle.load(open(data_path + "id_hash.pkl", "rb"))
    G = pickle.load(open(data_path + "graph.pkl", "rb"))
    start = len(ent_id)
    count = len(ent_id) + len(id_hash)
    train_hashtags = {}
    for ind, row in train_df.iterrows():
        print(ind)
        entities = []
        if isinstance(row["news_entity"], list):
            entities += list(set(row["news_entity"]))
        if isinstance(row["tweet_entity"], list):
            entities += list(set(row["tweet_entity"]))
        # Personalized PageRank seeded on the row's entities; entity nodes
        # occupy indices 0..len(ent_id)-1, hashtag nodes come after.
        personalized = np.zeros((count, ))
        for e in entities:
            personalized[ent_id[e]] = 1
        pr = pagerank(G, p=0.85, personalize=personalized)
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1][:20]
        top_hashtags = [id_hash[start + i] for i in args]
        positive_hashtags = row["hashtag"].split(";")
        train_hashtags[ind] = {}
        train_hashtags[ind]["positive"] = list(
            set(positive_hashtags).intersection(set(top_hashtags)))
        train_hashtags[ind]["negative"] = list(
            set(top_hashtags) - set(positive_hashtags))
    pickle.dump(train_hashtags,
                open(data_path + "random_walk_train_hashtags.pkl", "wb"))
def rank():
    # Assumes module-level imports: json, numpy as np, Counter from
    # collections, sparse from scipy, and pagerank from fast_pagerank.
    with open('Crawler/result.txt', 'r') as file1:
        lines = file1.readlines()
    list_r = []
    dict_id = {}
    num_id = 0
    for line in lines:
        json_data = json.loads(line)
        if int(json_data['id']) not in dict_id:
            dict_id[int(json_data['id'])] = num_id
            num_id += 1
        for r in json_data['references']:
            r = r.replace("https://academic.microsoft.com/paper/", "")
            if int(r) not in dict_id:
                dict_id[int(r)] = num_id
                num_id += 1
            list_r.append([dict_id[int(json_data['id'])], dict_id[int(r)]])
    weights = np.ones(len(list_r))
    list_r = np.array(list_r)
    G = sparse.csr_matrix((weights, (list_r[:, 0], list_r[:, 1])),
                          shape=(num_id, num_id))
    alpha = float(input("Enter alpha for page rank:"))
    pr = pagerank(G, p=alpha)
    # dict preserves insertion order, so pr[i] lines up with the i-th id.
    for i, paper_id in enumerate(dict_id.keys()):
        dict_id[paper_id] = pr[i]
    high = Counter(dict_id).most_common(10)
    print("Ids with 10 highest pageranks:")
    for paper_id, score in high:
        print(paper_id, " :", score)
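# Hypothetical sketch of one line of Crawler/result.txt as rank() parses it:
# a JSON object per line with an integer-like 'id' and 'references' given as
# Microsoft Academic paper URLs (the ids below are invented for illustration):
#
#   {"id": "2100837084",
#    "references": ["https://academic.microsoft.com/paper/2018027577",
#                   "https://academic.microsoft.com/paper/1986931325"]}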
def pagerank_fast(G, sim_mat, personal_vec, alpha, beta):
    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight='weight',
                                  dtype=float)
    # Row-normalize M into a transition matrix (scipy.array is deprecated;
    # np.array does the same thing here).
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M
    # Blend the transition matrix with the similarity matrix sim_mat.
    #sim_mat = mk_sparse_sim_mat(G, item_mat)
    M = beta * M + (1 - beta) * sim_mat
    print('check')
    ppr_mat = []
    print_every = 1
    s = time.time()
    for i in range(personal_vec.shape[1]):
        #pr = pagerank_power(M, p=alpha, personalize=personal_vec[:, i])
        pr = pagerank(M, p=alpha, personalize=personal_vec[:, i])
        ppr_mat.append(pr)
        if (i + 1) % print_every == 0:
            print('{}% {}sec'.format(i / personal_vec.shape[1] * 100,
                                     time.time() - s))
    return ppr_mat
def process_pagerank():
    global BASE_DATA
    if not len(BASE_DATA):
        with open('result.json', encoding='UTF-8') as f:
            BASE_DATA = json.load(f)
    import numpy as np
    from fast_pagerank import pagerank
    from scipy import sparse
    base_urls = tuple(map(lambda info: info['url'], BASE_DATA))
    page_counts = len(base_urls)
    result = list()
    for info_url in BASE_DATA:
        main_i = base_urls.index(info_url['url'])
        for i_block in range(1, 5):
            for tag_url in info_url['name_bloks'][f'block{i_block}']["tag_url"]:
                result.append([main_i, base_urls.index(tag_url[1])])
    if not len(result):
        return
    A = np.array(result)
    G = sparse.csr_matrix(([1] * len(A), (A[:, 0], A[:, 1])),
                          shape=(page_counts, page_counts))
    pr = pagerank(G, p=0.85)
    for info in zip(BASE_DATA, pr):
        info[0].update({'PR': info[1]})
    with open('result.json', "w", encoding='UTF-8') as f:
        f.write(json.dumps(BASE_DATA, indent=4, ensure_ascii=False))
def find_pagerank(alpha=0.85):
    # Relies on module-level nodes_name (name -> index), edges (index
    # pairs), and edge_weight ((src, dst) -> weight).
    n = len(nodes_name)
    A = np.array(edges)
    weights = [edge_weight[(A[i][0], A[i][1])] for i in range(len(edges))]
    print(A)
    print(weights)
    G = sparse.csr_matrix((weights, (A[:, 0], A[:, 1])), shape=(n, n))
    pr = pagerank(G, p=alpha)
    return [(node, pr[node_num]) for node, node_num in nodes_name.items()]
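# Minimal usage sketch for find_pagerank; the globals below are invented for
# illustration and only mirror the shapes the function expects:
import numpy as np
from scipy import sparse
from fast_pagerank import pagerank

nodes_name = {'a': 0, 'b': 1, 'c': 2}                   # name -> index
edges = [[0, 1], [1, 2], [2, 0]]                        # directed pairs
edge_weight = {(0, 1): 1.0, (1, 2): 2.0, (2, 0): 1.0}   # pair -> weight

print(find_pagerank(alpha=0.85))   # e.g. [('a', 0.33...), ('b', ...), ...]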
def rank(probs, df, G, entity_node_id, node_id_entity, label_to_index):
    def get_scaling_factor(key, label_entity_dict):
        total_sum = 0
        for l in label_entity_dict:
            total_sum += label_entity_dict[l][key]
        return total_sum

    def scale(label_entity_dict):
        scaling_factor = {}
        for l in label_entity_dict:
            for key in label_entity_dict[l]:
                try:
                    factor = scaling_factor[key]
                except KeyError:
                    factor = get_scaling_factor(key, label_entity_dict)
                    scaling_factor[key] = factor
                label_entity_dict[l][key] = label_entity_dict[l][key] / factor
        for l in label_entity_dict:
            label_entity_dict[l] = {
                k: v
                for k, v in sorted(label_entity_dict[l].items(),
                                   key=lambda item: -item[1])
            }
        return label_entity_dict

    # Documents occupy indices 0..len(df)-1 of the combined graph; entity
    # nodes follow, so only the entity block of the PageRank vector is ranked.
    label_entity_dict = {}
    start = len(df)
    count = len(df) + len(entity_node_id)
    for l in label_to_index:
        print("Pagerank running for: ", l, flush=True)
        personalized = np.zeros((count, ))
        personalized[:len(df)] = probs[:, label_to_index[l]]
        pr = pagerank(G, p=0.85, personalize=personalized)
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_ents = {}
        for i in args:
            top_ents[node_id_entity[start + i]] = temp_list[i]
        label_entity_dict[l] = top_ents
    label_entity_dict = scale(label_entity_dict)
    return label_entity_dict
def hub_rank(page):
    '''Calculates PageRank of the nodes'''
    page.reset_index(inplace=True)
    page.set_index(['route', 'endpoint'], inplace=True)
    page.sort_index(ascending=True, inplace=True)
    page['route_cluster'] = page['route_cluster'].astype(int)
    orig = page.loc[idx[:, 'orig'], ['hub']].values
    dest = page.loc[idx[:, 'dest'], ['hub']].values
    graph = np.hstack((orig, dest))
    weights = np.ones(graph.shape[0])
    nodes = np.unique(graph.flatten())
    shape = [len(nodes)] * 2
    # Assumes hub values are already integer indices in [0, len(nodes)),
    # since they are used directly as sparse-matrix coordinates.
    position_tuple = graph[:, 0], graph[:, 1]
    graph_sparse = sparse.csr_matrix((weights, position_tuple), shape=shape)
    graph = pd.DataFrame(graph, columns=['orig', 'dest'])
    page_rank = pagerank(graph_sparse, p=0.85)
    rank_dict = dict(zip(nodes, page_rank))
    page['rank'] = page['hub'].map(rank_dict)
    rank = page.groupby('hub')['rank'].first().to_frame('rank')
    return page, rank, graph
def score(self, data):
    node_dict = {node['name']: i for i, node in enumerate(data['node'])}
    node_count = len(node_dict)
    edges = np.array([(node_dict[edge['node'][0]], node_dict[edge['node'][1]])
                      for edge in data['edge']])
    weights = np.array([edge['weight'] for edge in data['edge']])
    G = sparse.csr_matrix((weights, (edges[:, 0], edges[:, 1])),
                          shape=(node_count, node_count))
    if data['node'][0].get('weight'):
        personalize = np.array([node['weight'] for node in data['node']])
    else:
        personalize = None
    if self.solver == 'power':
        pr = pagerank_power(G, p=self.damping_factor,
                            personalize=personalize, tol=self.tol)
    else:
        pr = pagerank(G, p=self.damping_factor, personalize=personalize)
    return {k: pr[v] for k, v in node_dict.items()}
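# Hedged usage sketch for score(): the data layout is inferred from the code
# above (node dicts with 'name', edge dicts with 'node' pairs and 'weight');
# the scorer class name and constructor are hypothetical.
data = {
    'node': [{'name': 'a'}, {'name': 'b'}, {'name': 'c'}],
    'edge': [{'node': ('a', 'b'), 'weight': 1.0},
             {'node': ('b', 'c'), 'weight': 2.0}],
}
# scorer = PageRankScorer(damping_factor=0.85, solver='iterative', tol=1e-6)
# scores = scorer.score(data)   # -> {'a': ..., 'b': ..., 'c': ...}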
def run_pagerank(probs, df, G, entity_id, id_entity, label_to_index,
                 dump_dir, plot=False):
    label_entity_dict = {}
    start = len(df)
    count = len(df) + len(entity_id)
    for l in label_to_index:
        print("Pagerank running for: ", l)
        # Seed only the document block (indices 0..len(df)-1) with the
        # per-label probabilities.
        personalized = np.zeros((count,))
        personalized[:len(df)] = probs[:, label_to_index[l]]
        pr = pagerank(G, p=0.85, personalize=personalized)
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_auths = {}
        for i in args:
            top_auths[id_entity[start + i]] = temp_list[i]
        label_entity_dict[l] = top_auths
    label_entity_dict = scale(label_entity_dict)
    if plot:
        for l in label_entity_dict:
            temp_list = list(label_entity_dict[l].values())
            plot_histogram(temp_list, dump_dir, l)
    return label_entity_dict
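# Self-contained toy illustration of the personalized-PageRank pattern the
# functions above share: documents occupy indices 0..1, entities 2..3, and
# the personalization vector seeds only the document block. All values are
# invented for illustration.
import numpy as np
from scipy import sparse
from fast_pagerank import pagerank

count, start = 4, 2                       # 2 documents + 2 entities
rows = np.array([0, 0, 1])                # doc -> entity edges
cols = np.array([2, 3, 3])
G = sparse.csr_matrix((np.ones(3), (rows, cols)), shape=(count, count))

personalized = np.zeros((count,))
personalized[:start] = [0.9, 0.1]         # per-document label probabilities
pr = pagerank(G, p=0.85, personalize=personalized)
entity_scores = list(pr)[start:]          # rank only the entity block
print(np.argsort(entity_scores)[::-1])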
def run_pagerank_single_graph(probs, df, G, entity_id_list, id_entity_list,
                              label_to_index, dump_dir, plot=False):
    label_all_entity_dict = {}
    start = len(df)
    count = len(df)
    for entity_id in entity_id_list:
        count += len(entity_id)
    for l in label_to_index:
        print("Pagerank running for: ", l)
        personalized = np.zeros((count,))
        personalized[:len(df)] = probs[:, label_to_index[l]]
        pr = pagerank(G, p=0.85, personalize=personalized)
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_auths = {}
        for i in args:
            # Each id_entity map covers a different entity type; only the
            # one containing this node id contributes.
            for id_entity in id_entity_list:
                try:
                    top_auths[id_entity[start + i]] = temp_list[i]
                except (KeyError, IndexError):
                    pass
        label_all_entity_dict[l] = top_auths
    label_all_entity_dict = scale(label_all_entity_dict)
    # Split the combined ranking back into one dict per entity type
    # (assumes each entity_id map is a dict keyed by entity name).
    label_entity_dict_list = [{} for _ in entity_id_list]
    for l in label_all_entity_dict:
        for key in label_all_entity_dict[l]:
            for i, entity_id in enumerate(entity_id_list):
                if key in entity_id:
                    label_entity_dict_list[i].setdefault(l, {})[key] = \
                        label_all_entity_dict[l][key]
    return label_entity_dict_list
def process_pagerank():
    import numpy as np
    from fast_pagerank import pagerank
    from scipy import sparse
    base_urls = tuple(map(lambda info: info['url'], BASE_DATA))
    page_counts = len(base_urls)
    result = list()
    for info_url in BASE_DATA:
        main_i = base_urls.index(info_url['url'])
        for i_block in range(1, 4):
            for tag_url in info_url['name_bloks'][f'block{i_block}']["tag_url"]:
                result.append([main_i, base_urls.index(tag_url[1])])
    if not len(result):
        return
    A = np.array(result)
    G = sparse.csr_matrix(([1] * len(A), (A[:, 0], A[:, 1])),
                          shape=(page_counts, page_counts))
    pr = pagerank(G, p=0.85)
    for info in zip(BASE_DATA, pr):
        info[0].update({'PR': info[1]})
def ego_pagerank(adjacency):
    '''
    Given the adjacency matrix, returns the PageRank of every node as a
    numpy array.
    '''
    return pagerank(csr_matrix(adjacency))
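# Minimal, self-contained usage sketch for ego_pagerank with a toy 3-node
# adjacency matrix (values invented for illustration):
import numpy as np

adjacency = np.array([[0, 1, 1],
                      [0, 0, 1],
                      [1, 0, 0]])
print(ego_pagerank(adjacency))   # one PageRank score per node, sums to 1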
G_conf = sparse.load_npz(data_path + "G_conf.npz")
df = pickle.load(
    open(data_path + "df_mapped_labels_phrase_removed_stopwords.pkl", "rb"))
venue_id = pickle.load(open(data_path + "venue_id.pkl", "rb"))
id_venue = pickle.load(open(data_path + "id_venue.pkl", "rb"))
count = len(venue_id) + len(df)
start = len(df)
labels = list(set(df.label))
categories = list(df.label)
top_conf_map = {}
for l in labels:
    print("Pagerank running for: ", l)
    # One-hot personalization over the documents carrying label l.
    personalized = np.zeros((count, ))
    for i, cat in enumerate(categories):
        if cat == l:
            personalized[i] = 1
    pr = pagerank(G_conf, p=0.85, personalize=personalized)
    temp_list = list(pr)[start:]
    sorted_temp_list = sorted(temp_list, reverse=True)
    args = np.argsort(temp_list)[::-1]
    top_auths = []
    for i in args:
        top_auths.append(id_venue[start + i])
    top_conf_map[l] = top_auths
def test_zero_edge(self):
    calculated_pagerank = pagerank(self.G4,
                                   p=self.p4,
                                   personalize=self.personalize4)
    assert_allclose(calculated_pagerank, self.pr4, rtol=0, atol=1e-04)
def test_empty_graph(self):
    calculated_pagerank = pagerank(self.G5,
                                   p=self.p5,
                                   personalize=self.personalize5)
    self.assertEqual(calculated_pagerank.size, 0)
start = len(df)
auth_num_map, num_auth_map, count = make_auth_pair_map(df)
edges = []
weights = []
for i, row in df.iterrows():
    for pair in row["author pairs"]:
        edges.append([i, auth_num_map[pair]])
        weights.append(1)
edges = np.array(edges)
G = sparse.csr_matrix((weights, (edges[:, 0], edges[:, 1])),
                      shape=(count, count))
labels = list(set(df.categories))
categories = list(df.categories)
for l in labels:
    print("Pagerank running for: ", l)
    personalized = np.zeros((count,))
    for i, cat in enumerate(categories):
        if cat == l:
            personalized[i] = 1
    pr = pagerank(G, p=0.85, personalize=personalized)
    temp_list = list(pr)[start:]
    sorted_temp_list = sorted(temp_list, reverse=True)
    args = np.argsort(temp_list)[::-1]
    top_auths = []
    for i in args:
        top_auths.append(num_auth_map[start + i])
    pickle.dump(top_auths,
                open(data_path + "top_auths/" + l + "_top_auths.pkl", "wb"))
def test_pagerank_1(self):
    calculated_pagerank = pagerank(self.G1,
                                   p=self.p1,
                                   personalize=self.personalize1)
    assert_allclose(calculated_pagerank, self.pr1, rtol=0, atol=1e-04)
def test_single_edge(self):
    calculated_pagerank = pagerank(self.G3,
                                   p=self.p3,
                                   personalize=self.personalize3)
    assert_allclose(calculated_pagerank, self.pr3, rtol=0, atol=1e-04)
def test_pagerank_2(self):
    calculated_pagerank = pagerank(self.G2,
                                   p=self.p2,
                                   personalize=self.personalize2)
    assert_allclose(calculated_pagerank, self.pr2, rtol=0, atol=1e-04)