Example #1
    def do_DBSCAN_eval(self):
        for w in np.arange(0.8, 1.1, 0.01):
            for eps in np.arange(0, 1, 1):  # single pass (eps=0); eps only feeds the commented-out DBSCAN below
                result = {}
                for name in self.val_author_data:
                    real_counter = len(self.val_labels_data[name])

                    # print(real_counter)
                    # collect the author's paper ids
                    pubs = list(self.val_author_data[name])

                    local_features = []
                    global_features = []
                    cp = set()  # indices of papers whose local feature vector is all zeros
                    for i, pid in enumerate(pubs):
                        if np.sum(self.val_local_features[pid]) == 0:
                            cp.add(i)
                        local_features.append(self.val_local_features[pid])
                        global_features.append(self.val_global_features[pid])

                    global_features = np.array(global_features)
                    local_features = np.array(local_features)

                    local_features = pairwise_distances(local_features,
                                                        metric="cosine")
                    global_features = pairwise_distances(global_features,
                                                         metric="cosine")
                    # weighted blend of the two cosine-distance matrices
                    sim = (local_features + w * global_features) / (1 + w)

                    # pre = DBSCAN(eps=eps, min_samples=3, metric="precomputed").fit_predict(sim)
                    pre = AgglomerativeClustering(
                        n_clusters=real_counter,
                        affinity="precomputed",
                        linkage="average").fit_predict(sim)
                    # pre = KMeans(n_clusters=real_counter, precompute_distances=True).fit_predict(sim)
                    pre = np.array(pre)

                    # clusterer noise (-1) plus zero-feature papers are treated as outliers
                    outlier = {i for i, label in enumerate(pre) if label == -1}
                    outlier.update(cp)

                    ## threshold-based similarity matching
                    paper_pair = generate_pair(pubs, outlier)
                    paper_pair1 = paper_pair.copy()  # keep the unmasked scores for the merge pass below
                    K = len(set(pre))  # next fresh cluster label
                    for i in range(len(pre)):
                        if i not in outlier:
                            continue
                        # attach the outlier to its best-scoring non-outlier paper,
                        # or open a fresh cluster if the score is below the threshold
                        j = np.argmax(paper_pair[i])
                        while j in outlier:
                            paper_pair[i][j] = -1
                            j = np.argmax(paper_pair[i])
                        if paper_pair[i][j] >= 1.5:
                            pre[i] = pre[j]
                        else:
                            pre[i] = K
                            K += 1

                    # merge outlier pairs whose original score clears the threshold
                    for ii, i in enumerate(outlier):
                        for jj, j in enumerate(outlier):
                            if jj <= ii:
                                continue
                            if paper_pair1[i][j] >= 1.5:
                                pre[j] = pre[i]

                    # print(pre, len(set(pre)))

                    # group paper ids by predicted cluster label
                    result[name] = []
                    for label in set(pre):
                        oneauthor = [pubs[idx] for idx, p in enumerate(pre) if p == label]
                        result[name].append(oneauthor)

                # json.dump(result, open(self.args['val_result'], 'w', encoding='utf-8'), indent=4)
                f1 = f1_score(result, self.args)
                print("w:%.2f eps:%.2f f1:%.4f " % (w, eps, f1))
Example #2
    def train_val(self):
        result = {}

        for name in tqdm(self.val_author_data):

            # collect all of the author's paper ids
            pubs = list(self.val_author_data[name])

            name_pubs_raw = {pid: self.val_pub_data[pid] for pid in pubs}
            # write the relation files read by the meta-path generator below
            save_relation(name_pubs_raw, name)

            mpg = MetaPathGenerator()
            mpg.read_data("gene")

            all_embs = []
            rw_num = 10  # number of random-walk + word2vec runs to average
            cp = set()   # indices of papers absent from the walk vocabulary
            # regenerate the meta-path random walks and retrain on each pass
            for k in range(rw_num):
                mpg.generate_WMRW("gene/RW.txt", 5, 20)
                sentences = word2vec.Text8Corpus(r'gene/RW.txt')
                # train paper embeddings on the walks with word2vec
                # (pre-4.0 gensim API: 'size' kwarg and 'model[pid]' access)
                model = word2vec.Word2Vec(sentences,
                                          size=128,
                                          negative=25,
                                          min_count=1,
                                          window=10)
                embs = []
                for i, pid in enumerate(pubs):
                    if pid in model:
                        embs.append(model[pid])
                    else:
                        cp.add(i)
                        embs.append(np.zeros(128))
                all_embs.append(embs)
            all_embs = np.array(all_embs)

            # load the semantic features
            ptext_emb = load_data('gene', 'ptext_emb.pkl')  # alternative text embeddings (unused below)
            tcp = load_data('gene', 'tcp.pkl')

            # one semantic feature vector per paper
            tembs = [self.val_features[pid] for pid in pubs]

            # average the cosine-distance matrices over the walk-embedding runs
            sk_sim = np.zeros((len(pubs), len(pubs)))
            for k in range(rw_num):
                sk_sim = sk_sim + pairwise_distances(all_embs[k],
                                                     metric="cosine")
            sk_sim = sk_sim / rw_num

            # cosine-distance matrix of the semantic features
            tembs = pairwise_distances(tembs, metric="cosine")

            # weighted blend of relational (walk) and semantic distances
            w = 0.25
            sim = (np.array(sk_sim) + w * np.array(tembs)) / (1 + w)

            pre = DBSCAN(eps=0.2, min_samples=3,
                         metric="precomputed").fit_predict(sim)
            pre = np.array(pre)

            ## set of outlier papers
            # DBSCAN noise (-1) plus papers flagged during feature extraction
            outlier = {i for i, label in enumerate(pre) if label == -1}
            outlier.update(cp)
            outlier.update(tcp)

            ## threshold-based similarity matching
            paper_pair = generate_pair(pubs, outlier)
            paper_pair1 = paper_pair.copy()  # keep the unmasked scores for the merge pass below
            K = len(set(pre))  # next fresh cluster label
            for i in range(len(pre)):
                if i not in outlier:
                    continue
                # attach the outlier to its best-scoring non-outlier paper,
                # or open a fresh cluster if the score is below the threshold
                j = np.argmax(paper_pair[i])
                while j in outlier:
                    paper_pair[i][j] = -1
                    j = np.argmax(paper_pair[i])
                if paper_pair[i][j] >= 1.5:
                    pre[i] = pre[j]
                else:
                    pre[i] = K
                    K += 1

            # merge outlier pairs whose original score clears the threshold
            for ii, i in enumerate(outlier):
                for jj, j in enumerate(outlier):
                    if jj <= ii:
                        continue
                    if paper_pair1[i][j] >= 1.5:
                        pre[j] = pre[i]

            # print(pre, len(set(pre)))

            # group paper ids by predicted cluster label
            result[name] = []
            for label in set(pre):
                oneauthor = [pubs[idx] for idx, p in enumerate(pre) if p == label]
                result[name].append(oneauthor)

        with open(self.args['val_result'], 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)
        f1 = f1_score(result, self.args)
        print("f1:", f1)
Example #3
    def do_DBSCAN_test(self):
        result = {}
        for name in tqdm(self.test_author_data):
            # collect the author's paper ids
            pubs = list(self.test_author_data[name])

            local_features = []
            global_features = []
            cp = set()  # indices of papers whose local feature vector is all zeros
            for i, pid in enumerate(pubs):
                if np.sum(self.test_local_features[pid]) == 0:
                    cp.add(i)
                local_features.append(self.test_local_features[pid])
                global_features.append(self.test_global_features[pid])

            local_features = pairwise_distances(local_features,
                                                metric="cosine")
            global_features = pairwise_distances(global_features,
                                                 metric="cosine")
            # weighted blend of the two cosine-distance matrices
            w = 0.3
            sim = (local_features + w * global_features) / (1 + w)

            pre = DBSCAN(eps=0.15, min_samples=3,
                         metric="precomputed").fit_predict(sim)
            pre = np.array(pre)

            # DBSCAN noise (-1) plus zero-feature papers are treated as outliers
            outlier = {i for i, label in enumerate(pre) if label == -1}
            outlier.update(cp)

            ## threshold-based similarity matching
            paper_pair = generate_pair(pubs, outlier)
            paper_pair1 = paper_pair.copy()  # keep the unmasked scores for the merge pass below
            K = len(set(pre))  # next fresh cluster label
            for i in range(len(pre)):
                if i not in outlier:
                    continue
                # attach the outlier to its best-scoring non-outlier paper,
                # or open a fresh cluster if the score is below the threshold
                j = np.argmax(paper_pair[i])
                while j in outlier:
                    paper_pair[i][j] = -1
                    j = np.argmax(paper_pair[i])
                if paper_pair[i][j] >= 1.5:
                    pre[i] = pre[j]
                else:
                    pre[i] = K
                    K += 1

            # merge outlier pairs whose original score clears the threshold
            for ii, i in enumerate(outlier):
                for jj, j in enumerate(outlier):
                    if jj <= ii:
                        continue
                    if paper_pair1[i][j] >= 1.5:
                        pre[j] = pre[i]

            # print(pre, len(set(pre)))

            # group paper ids by predicted cluster label
            result[name] = []
            for label in set(pre):
                oneauthor = [pubs[idx] for idx, p in enumerate(pre) if p == label]
                result[name].append(oneauthor)

        with open(self.args['test_result'], 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)
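
All three methods finish with the same threshold-based repair: each outlier is attached to its best-scoring non-outlier paper when the generate_pair score reaches 1.5, otherwise it opens a fresh cluster, and outlier pairs that clear the threshold are then merged. Below is a sketch of that logic lifted into a reusable function; since generate_pair is not shown in the source, the pair-score matrix is taken as an argument here.

import numpy as np

def reassign_outliers(pre, paper_pair, outlier, threshold=1.5):
    """Repair outlier labels, mirroring the post-processing above.

    Assumes at least one non-outlier index exists, as the original loop does.
    """
    pre = np.array(pre, copy=True)
    scores = np.array(paper_pair, dtype=float, copy=True)
    snapshot = scores.copy()           # unmasked scores for the merge pass
    K = len(set(pre.tolist()))         # next fresh cluster label
    out = sorted(outlier)
    for i in out:
        j = int(np.argmax(scores[i]))  # best-scoring partner for paper i
        while j in outlier:            # mask other outliers until a
            scores[i][j] = -1          # non-outlier partner surfaces
            j = int(np.argmax(scores[i]))
        if scores[i][j] >= threshold:
            pre[i] = pre[j]            # adopt the partner's cluster
        else:
            pre[i] = K                 # open a new singleton cluster
            K += 1
    for ii, i in enumerate(out):       # finally, merge high-scoring
        for j in out[ii + 1:]:         # outlier pairs
            if snapshot[i][j] >= threshold:
                pre[j] = pre[i]
    return pre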