Example #1
def get_random_walk_candidate_hashtags(train_df):
    # Load the entity -> node-id map, the node-id -> hashtag map, and the graph.
    ent_id = pickle.load(open(data_path + "ent_id.pkl", "rb"))
    id_hash = pickle.load(open(data_path + "id_hash.pkl", "rb"))
    G = pickle.load(open(data_path + "graph.pkl", "rb"))

    # Nodes are laid out as [entities | hashtags]; hashtag ids start at len(ent_id).
    start = len(ent_id)
    count = len(ent_id) + len(id_hash)
    train_hashtags = {}
    for ind, row in train_df.iterrows():
        print(ind)
        entities = []
        if isinstance(row["news_entity"], list):
            entities += list(set(row["news_entity"]))
        if isinstance(row["tweet_entity"], list):
            entities += list(set(row["tweet_entity"]))
        # Personalized PageRank restarting only from this row's entities.
        personalized = np.zeros((count, ))
        for e in entities:
            personalized[ent_id[e]] = 1
        pr = pagerank(G, p=0.85, personalize=personalized)
        # Keep the hashtag portion of the score vector and take the top 20.
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1][:20]
        top_hashtags = []
        for i in args:
            top_hashtags.append(id_hash[start + i])

        positive_hashtags = row["hashtag"].split(";")

        # Candidates that match true hashtags are positives; the rest are negatives.
        train_hashtags[ind] = {}
        train_hashtags[ind]["positive"] = list(
            set(positive_hashtags).intersection(set(top_hashtags)))
        train_hashtags[ind]["negative"] = list(
            set(top_hashtags) - set(positive_hashtags))

    pickle.dump(train_hashtags,
                open(data_path + "random_walk_train_hashtags.pkl", "wb"))
Example #2
def rank():
    # Assign a contiguous integer id to every paper seen as citing or cited.
    with open('Crawler/result.txt', 'r') as file1:
        lines = file1.readlines()
    list_r = []
    dict_id = {}
    num_id = 0
    for line in lines:
        json_data = json.loads(line)
        if int(json_data['id']) not in dict_id:
            dict_id[int(json_data['id'])] = num_id
            num_id += 1
        for r in json_data['references']:
            r = r.replace("https://academic.microsoft.com/paper/", "")
            if int(r) not in dict_id:
                dict_id[int(r)] = num_id
                num_id += 1
            # One directed citation edge per reference.
            list_r.append([dict_id[int(json_data['id'])], dict_id[int(r)]])

    # Unweighted graph: every edge has weight 1.
    weights = np.ones(len(list_r))
    list_r = np.array(list_r)
    G = sparse.csr_matrix((weights, (list_r[:, 0], list_r[:, 1])),
                          shape=(num_id, num_id))
    alpha = float(input("Enter alpha for page rank:"))
    pr = pagerank(G, p=alpha)
    # dict_id preserves insertion order (Python 3.7+), so pr[i] lines up with it.
    for i, paper_id in enumerate(dict_id.keys()):
        dict_id[paper_id] = pr[i]

    k = Counter(dict_id)
    high = k.most_common(10)
    print("Ids with 10 highest pageranks:")
    for i in high:
        print(i[0], " :", i[1], " ")
Example #3
def pagerank_fast(G, sim_mat, personal_vec, alpha, beta):
    # Row-normalize the weighted adjacency matrix into a transition matrix.
    # (networkx >= 3.0 renamed this helper to nx.to_scipy_sparse_array.)
    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G,
                                  nodelist=nodelist,
                                  weight='weight',
                                  dtype=float)
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S, 0, *M.shape, format='csr')
    M = Q * M

    # Blend the transition matrix with sim_mat.
    #sim_mat = mk_sparse_sim_mat(G, item_mat)
    M = beta * M + (1 - beta) * sim_mat

    print('check')
    ppr_mat = []
    print_every = 1
    s = time.time()
    # One personalized PageRank run per column of personal_vec.
    for i in range(personal_vec.shape[1]):
        #pr = pagerank_power(M, p=alpha, personalize=personal_vec[:, i])
        pr = pagerank(M, p=alpha, personalize=personal_vec[:, i])
        ppr_mat.append(pr)
        if (i + 1) % print_every == 0:
            print('{}% {}sec'.format(i / personal_vec.shape[1] * 100,
                                     time.time() - s))

    return ppr_mat
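For beta * M + (1 - beta) * sim_mat to stay a valid transition matrix, sim_mat must be row-stochastic like M. The real mk_sparse_sim_mat is not shown in this excerpt; a hypothetical sketch along those lines:

import numpy as np
import scipy.sparse

def mk_sparse_sim_mat(G, item_mat):
    # Hypothetical: inner-product similarities between item vectors,
    # row-normalized so every row sums to 1. G is unused here and kept
    # only to match the commented-out call signature above.
    sim = scipy.sparse.csr_matrix(item_mat @ item_mat.T)
    row_sums = np.asarray(sim.sum(axis=1)).flatten()
    row_sums[row_sums != 0] = 1.0 / row_sums[row_sums != 0]
    D = scipy.sparse.spdiags(row_sums, 0, *sim.shape, format='csr')
    return D @ sim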
Example #4
def process_pagerank():
    global BASE_DATA
    if not BASE_DATA:
        with open('result.json', encoding='UTF-8') as f:
            BASE_DATA = json.load(f)
    import numpy as np
    from fast_pagerank import pagerank
    from scipy import sparse
    base_urls = tuple(map(lambda info: info['url'], BASE_DATA))

    page_counts = len(base_urls)

    result = list()

    # One directed edge per outgoing tag link in each page's four link blocks.
    for info_url in BASE_DATA:
        main_i = base_urls.index(info_url['url'])
        for i_block in range(1, 5):
            for tag_url in info_url['name_bloks'][f'block{i_block}'][
                    "tag_url"]:
                result.append([main_i, base_urls.index(tag_url[1])])
    if not result:
        return
    A = np.array(result)
    G = sparse.csr_matrix(([1] * len(A), (A[:, 0], A[:, 1])),
                          shape=(page_counts, page_counts))
    pr = pagerank(G, p=0.85)
    # Cast to plain float so json.dumps can serialize the score.
    for info, score in zip(BASE_DATA, pr):
        info.update({'PR': float(score)})
    with open('result.json', "w", encoding='UTF-8') as f:
        f.write(json.dumps(BASE_DATA, indent=4, ensure_ascii=False))
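The BASE_DATA structure the loop above expects, as a hypothetical single-entry example (URLs and link text invented):

BASE_DATA = [
    {
        'url': 'https://example.com/page1',
        'name_bloks': {
            # Each block holds (text, url) link pairs; only the url is used.
            'block1': {'tag_url': [('home', 'https://example.com/page1')]},
            'block2': {'tag_url': []},
            'block3': {'tag_url': []},
            'block4': {'tag_url': []},
        },
    },
]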
Example #5
def find_pagerank(alpha=0.85):
    # Module-level globals: nodes_name maps node name -> index, edges is a
    # list of (src, dst) index pairs, and edge_weight maps each pair to its weight.
    n = len(nodes_name)
    A = np.array(edges)
    weights = [edge_weight[(A[i][0], A[i][1])] for i in range(len(edges))]
    print(A)
    print(weights)
    G = sparse.csr_matrix((weights, (A[:, 0], A[:, 1])), shape=(n, n))
    pr = pagerank(G, p=alpha)
    return [(node, pr[node_num]) for node, node_num in nodes_name.items()]
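A minimal sketch of the globals this function assumes (names and values hypothetical):

import numpy as np
from scipy import sparse
from fast_pagerank import pagerank

nodes_name = {'a': 0, 'b': 1, 'c': 2}                    # node name -> index
edges = [(0, 1), (1, 2), (2, 0)]                         # directed (src, dst) pairs
edge_weight = {(0, 1): 1.0, (1, 2): 2.0, (2, 0): 1.0}    # (src, dst) -> weight

print(find_pagerank(alpha=0.85))  # e.g. [('a', 0.33...), ('b', ...), ('c', ...)]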
Example #6
File: train.py Project: dheeraj7596/META
    def rank(probs, df, G, entity_node_id, node_id_entity, label_to_index):
        def get_scaling_factor(key, label_entity_dict):
            # Sum an entity's score across all labels.
            total_sum = 0
            for l in label_entity_dict:
                total_sum += label_entity_dict[l][key]
            return total_sum

        def scale(label_entity_dict):
            # Normalize each entity's score by its total across labels,
            # then sort each label's entities by descending score.
            scaling_factor = {}
            for l in label_entity_dict:
                for key in label_entity_dict[l]:
                    try:
                        factor = scaling_factor[key]
                    except KeyError:
                        factor = get_scaling_factor(key, label_entity_dict)
                        scaling_factor[key] = factor
                    label_entity_dict[l][key] = label_entity_dict[l][key] / factor

            for l in label_entity_dict:
                label_entity_dict[l] = {
                    k: v
                    for k, v in sorted(label_entity_dict[l].items(),
                                       key=lambda item: -item[1])
                }
            return label_entity_dict

        # Nodes are laid out as [documents | entities]; entity ids start at len(df).
        label_entity_dict = {}
        start = len(df)
        count = len(df) + len(entity_node_id)
        for l in label_to_index:
            print("Pagerank running for: ", l, flush=True)
            # Restart from documents, weighted by their probability of label l.
            personalized = np.zeros((count, ))
            personalized[:len(df)] = probs[:, label_to_index[l]]
            pr = pagerank(G, p=0.85, personalize=personalized)
            # Rank only the entity portion of the score vector.
            temp_list = list(pr)[start:]
            args = np.argsort(temp_list)[::-1]
            top_ents = {}
            for i in args:
                top_ents[node_id_entity[start + i]] = temp_list[i]
            label_entity_dict[l] = top_ents
        label_entity_dict = scale(label_entity_dict)
        return label_entity_dict
Example #7
def hub_rank(page):
    '''Calculates PageRank of the nodes'''
    # idx is presumably pd.IndexSlice, defined at module level.
    page.reset_index(inplace=True)
    page.set_index(['route', 'endpoint'], inplace=True)
    page.sort_index(ascending=True, inplace=True)
    page['route_cluster'] = page['route_cluster'].astype(int)
    # Pair each route's origin hub with its destination hub.
    orig = page.loc[idx[:, 'orig'], ['hub']].values
    dest = page.loc[idx[:, 'dest'], ['hub']].values
    graph = np.hstack((orig, dest))
    weights = np.ones(graph.shape[0])
    nodes = np.unique(graph.flatten())
    # Hub ids are used directly as row/column positions, so they are assumed
    # to be contiguous integers 0..len(nodes)-1.
    shape = [len(nodes)] * 2
    position_tuple = graph[:, 0], graph[:, 1]
    graph_sparse = sparse.csr_matrix((weights, position_tuple), shape=shape)
    graph = pd.DataFrame(graph, columns=['orig', 'dest'])
    page_rank = pagerank(graph_sparse, p=0.85)
    rank_dict = dict(zip(nodes, page_rank))
    page['rank'] = page['hub'].map(rank_dict)
    rank = page.groupby('hub')['rank'].first().to_frame('rank')
    return page, rank, graph
Example #8
 def score(self, data):
     # Map node names to contiguous integer ids.
     node_dict = {node['name']: i for i, node in enumerate(data['node'])}
     node_count = len(node_dict)
     edges = np.array([(node_dict[edge['node'][0]],
                        node_dict[edge['node'][1]])
                       for edge in data['edge']])
     weights = np.array([edge['weight'] for edge in data['edge']])
     G = sparse.csr_matrix((weights, (edges[:, 0], edges[:, 1])),
                           shape=(node_count, node_count))
     # Use node weights as the personalization vector when present.
     if data['node'][0].get('weight'):
         personalize = np.array([node['weight'] for node in data['node']])
     else:
         personalize = None
     if self.solver == 'power':
         pr = pagerank_power(G,
                             p=self.damping_factor,
                             personalize=personalize,
                             tol=self.tol)
     else:
         pr = pagerank(G, p=self.damping_factor, personalize=personalize)
     return {k: pr[v] for k, v in node_dict.items()}
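The shape of data can be read off the indexing above; a hypothetical payload (names and weights invented), assuming the surrounding class exposes solver, damping_factor, and tol:

data = {
    'node': [
        {'name': 'a', 'weight': 0.5},   # optional 'weight' feeds personalization
        {'name': 'b', 'weight': 0.3},
        {'name': 'c', 'weight': 0.2},
    ],
    'edge': [
        {'node': ['a', 'b'], 'weight': 1.0},
        {'node': ['b', 'c'], 'weight': 2.0},
    ],
}
# scores = ranker.score(data)  ->  {'a': ..., 'b': ..., 'c': ...}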
Example #9
def run_pagerank(probs, df, G, entity_id, id_entity, label_to_index, dump_dir, plot=False):
    # Nodes are laid out as [documents | entities]; entity ids start at len(df).
    label_entity_dict = {}
    start = len(df)
    count = len(df) + len(entity_id)
    for l in label_to_index:
        print("Pagerank running for: ", l)
        # Restart from documents, weighted by their probability of label l.
        personalized = np.zeros((count,))
        personalized[:len(df)] = probs[:, label_to_index[l]]
        pr = pagerank(G, p=0.85, personalize=personalized)
        # Rank only the entity portion of the score vector.
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_auths = {}
        for i in args:
            top_auths[id_entity[start + i]] = temp_list[i]
        label_entity_dict[l] = top_auths
    label_entity_dict = scale(label_entity_dict)
    if plot:
        for l in label_entity_dict:
            temp_list = list(label_entity_dict[l].values())
            plot_histogram(temp_list, dump_dir, l)
    return label_entity_dict
Example #10
def run_pagerank_single_graph(probs, df, G, entity_id_list, id_entity_list, label_to_index, dump_dir, plot=False):
    # Nodes are laid out as [documents | entity type 1 | entity type 2 | ...].
    label_all_entity_dict = {}
    start = len(df)
    count = len(df)
    for entity_id in entity_id_list:
        count += len(entity_id)
    for l in label_to_index:
        print("Pagerank running for: ", l)
        personalized = np.zeros((count,))
        personalized[:len(df)] = probs[:, label_to_index[l]]
        pr = pagerank(G, p=0.85, personalize=personalized)
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_auths = {}
        for i in args:
            # Each id_entity map covers a different slice of node ids;
            # only the map containing start + i will hit.
            for id_entity in id_entity_list:
                try:
                    top_auths[id_entity[start + i]] = temp_list[i]
                except KeyError:
                    pass
        label_all_entity_dict[l] = top_auths
    label_all_entity_dict = scale(label_all_entity_dict)

    # Split the combined ranking back into one dict per entity type.
    label_entity_dict_list = [{} for _ in entity_id_list]

    for l in label_all_entity_dict:
        for key in label_all_entity_dict[l]:
            for i, entity_id in enumerate(entity_id_list):
                if key in entity_id:
                    label_entity_dict_list[i].setdefault(l, {})[key] = \
                        label_all_entity_dict[l][key]
    return label_entity_dict_list
Example #11
File: test.py Project: Apostolvlad/task-2
def process_pagerank():
    import numpy as np
    from fast_pagerank import pagerank
    from scipy import sparse
    base_urls = tuple(map(lambda info: info['url'], BASE_DATA))

    page_counts = len(base_urls)

    result = list()

    # One directed edge per outgoing tag link in each page's three link blocks.
    for info_url in BASE_DATA:
        main_i = base_urls.index(info_url['url'])
        for i_block in range(1, 4):
            for tag_url in info_url['name_bloks'][f'block{i_block}'][
                    "tag_url"]:
                result.append([main_i, base_urls.index(tag_url[1])])
    if not result:
        return
    A = np.array(result)
    G = sparse.csr_matrix(([1] * len(A), (A[:, 0], A[:, 1])),
                          shape=(page_counts, page_counts))
    pr = pagerank(G, p=0.85)
    # Attach each page's score to its record.
    for info, score in zip(BASE_DATA, pr):
        info.update({'PR': score})
Example #12
def ego_pagerank(adjacency):
    '''
    Given the adjacency matrix, returns the PageRank of every node as a numpy array.
    '''
    return pagerank(csr_matrix(adjacency))
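A minimal usage sketch (the 3-node adjacency matrix is invented for illustration):

import numpy as np

# Directed cycle a -> b -> c -> a.
adjacency = np.array([[0, 1, 0],
                      [0, 0, 1],
                      [1, 0, 0]], dtype=float)
scores = ego_pagerank(adjacency)
print(scores)  # sums to 1; the symmetric cycle gives roughly 1/3 per node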
Example #13
    # Nodes are laid out as [documents | venues]; venue ids start at len(df).
    G_conf = sparse.load_npz(data_path + "G_conf.npz")
    df = pickle.load(
        open(data_path + "df_mapped_labels_phrase_removed_stopwords.pkl",
             "rb"))
    venue_id = pickle.load(open(data_path + "venue_id.pkl", "rb"))
    id_venue = pickle.load(open(data_path + "id_venue.pkl", "rb"))

    count = len(venue_id) + len(df)
    start = len(df)
    labels = list(set(df.label))
    categories = list(df.label)

    top_conf_map = {}
    for l in labels:
        print("Pagerank running for: ", l)
        # Restart uniformly from the documents carrying label l.
        personalized = np.zeros((count, ))
        for i, cat in enumerate(categories):
            if cat == l:
                personalized[i] = 1
        pr = pagerank(G_conf, p=0.85, personalize=personalized)
        # Rank only the venue portion of the score vector.
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_auths = []
        for i in args:
            top_auths.append(id_venue[start + i])
        top_conf_map[l] = top_auths

    pass
Example #14
 def test_zero_edge(self):
     calculated_pagerank = pagerank(self.G4,
                                    p=self.p4,
                                    personalize=self.personalize4)
     assert_allclose(calculated_pagerank, self.pr4, rtol=0, atol=1e-04)
Example #15
 def test_empty_graph(self):
     calculated_pagerank = pagerank(self.G5,
                                    p=self.p5,
                                    personalize=self.personalize5)
     self.assertEqual(calculated_pagerank.size, 0)
Example #16
    # Nodes are laid out as [papers | authors]; author ids start at len(df).
    start = len(df)
    auth_num_map, num_auth_map, count = make_auth_pair_map(df)

    # One unit-weight edge per (paper, author-pair) occurrence.
    edges = []
    weights = []
    for i, row in df.iterrows():
        for pair in row["author pairs"]:
            edges.append([i, auth_num_map[pair]])
            weights.append(1)

    edges = np.array(edges)
    G = sparse.csr_matrix((weights, (edges[:, 0], edges[:, 1])), shape=(count, count))

    labels = list(set(df.categories))
    categories = list(df.categories)
    for l in labels:
        print("Pagerank running for: ", l)
        # Restart uniformly from the papers in category l.
        personalized = np.zeros((count,))
        for i, cat in enumerate(categories):
            if cat == l:
                personalized[i] = 1
        pr = pagerank(G, p=0.85, personalize=personalized)
        temp_list = list(pr)[start:]
        args = np.argsort(temp_list)[::-1]
        top_auths = []
        for i in args:
            top_auths.append(num_auth_map[start + i])
        pickle.dump(top_auths, open(data_path + "top_auths/" + l + "_top_auths.pkl", "wb"))
Example #17
 def test_pagerank_1(self):
     calculated_pagerank = pagerank(self.G1,
                                    p=self.p1,
                                    personalize=self.personalize1)
     assert_allclose(calculated_pagerank, self.pr1, rtol=0, atol=1e-04)
Example #18
 def test_single_edge(self):
     calculated_pagerank = pagerank(self.G3,
                                    p=self.p3,
                                    personalize=self.personalize3)
     assert_allclose(calculated_pagerank, self.pr3, rtol=0, atol=1e-04)
Example #19
 def test_pagerank_2(self):
     calculated_pagerank = pagerank(self.G2,
                                    p=self.p2,
                                    personalize=self.personalize2)
     assert_allclose(calculated_pagerank, self.pr2, rtol=0, atol=1e-04)
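The fixtures these tests rely on (self.G1, self.p1, self.personalize1, self.pr1, and so on) are defined outside the excerpts. A minimal self-contained sketch of a similar test, asserting only properties that hold for any graph rather than precomputed scores:

import unittest
import numpy as np
from scipy import sparse
from fast_pagerank import pagerank


class TestPagerankSketch(unittest.TestCase):
    def setUp(self):
        # Hypothetical 3-node cycle: 0 -> 1 -> 2 -> 0.
        edges = np.array([[0, 1], [1, 2], [2, 0]])
        weights = np.ones(len(edges))
        self.G = sparse.csr_matrix((weights, (edges[:, 0], edges[:, 1])),
                                   shape=(3, 3))

    def test_scores_form_distribution(self):
        pr = pagerank(self.G, p=0.85)
        self.assertAlmostEqual(pr.sum(), 1.0, places=4)  # scores sum to 1
        self.assertTrue((pr >= 0).all())                 # and are non-negative


if __name__ == '__main__':
    unittest.main()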