Example #1
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
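Note: all of the examples on this page rely on a read_node_label helper (some later examples import it via "from classify import read_node_label"). For reference only, here is a minimal sketch of what such a helper might look like, assuming each line of the label file reads "node_id label_1 [label_2 ...]"; the helper actually shipped with each project may differ.

def read_node_label(filename):
    """Sketch: read node ids and labels from a whitespace-separated label file."""
    X, Y = [], []
    with open(filename) as fin:
        for line in fin:
            parts = line.strip().split(' ')
            if len(parts) < 2:
                continue
            X.append(parts[0])   # node id, kept as a string
            Y.append(parts[1:])  # one or more labels per node
    return X, Y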
Example #2
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')

    #     print("Training classifier using {:.2f}% nodes...".format(args.train_percent * 100))
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    #     scores = clf.split_train_evaluate(X, Y, args.train_percent)
    features, labels, graph, idx_train, idx_val, idx_test = load_dataset(
        str(args.classifydir.split("/")[-1]))
    #     print(idx_train)
    #     print(type(idx_train))
    idx_train = list(idx_train)

    #     idx_val = list(idx_val)
    #     idx_val += list(idx_test)[:600]

    idx_test = list(idx_test)  #[600:]

    #     for i in idx_val:
    #         idx_train.append(i)

    #     idx_val = idx_val[400:]

    print("TRAINING SIZE", len(idx_train), "VALIDATION SIZE", len(idx_val),
          "TESTING SIZE: ", len(list(idx_test)))
    scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_val)

    # scores = clf.split_train_evaluate(X, Y, args.train_percent)
    test_scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_test)
    test_x.append(test_scores['macro'])
    print("micro:", test_scores['micro'], "macro:", test_scores['macro'])

    return scores
Example #3
def main(args):
    t1 = time.time()
    g = Graph()
    print("Reading...")

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio *
                                                              100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
Example #4
File: prre.py Project: ttklm20/AliGraph
def output(self, task):
    X = self.embedding_mat
    node_num = self.node_num
    if task == 'class':
        Y = read_node_label(self.label_path, node_num)
        eval(X, Y)
    else:
        link_prediction(X, test_path)
Example #5
    def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file = None, clf_ratio = 0.5, auto_stop = True):
        self.rep_size = rep_size
        self.order = order
        self.best_result = 0
        self.vectors = {}
        if order == 3:
            self.model1 = _LINE(graph, rep_size // 2, batch_size, negative_ratio, order=1)
            self.model2 = _LINE(graph, rep_size // 2, batch_size, negative_ratio, order=2)
            for i in range(epoch):
                self.model1.train_one_epoch()
                self.model2.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    # print "Training classifier using {:.2f}% nodes...".format(clf_ratio*100)
                    clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['micro'] < self.best_result and auto_stop:
                        self.vectors = self.last_vectors
                        print('Auto stop!')
                        return
                    elif result['micro'] > self.best_result:
                        self.best_result = result['micro']

        else:
            self.model = _LINE(graph, rep_size, batch_size, negative_ratio, order=self.order)
            for i in range(epoch):
                self.model.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    # print "Training classifier using {:.2f}% nodes...".format(clf_ratio*100)
                    clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['micro'] < self.best_result and auto_stop:
                        self.vectors = self.last_vectors
                        print('Auto stop!')
                        return
                    elif result['micro'] > self.best_result:
                        self.best_result = result['micro']

        self.get_embeddings()
Example #6
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')
    print("Training classifier using {:.2f}% nodes...".format(
        args.train_percent * 100))
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    scores = clf.split_train_evaluate(X, Y, args.train_percent)
    return scores
Example #7
def main(args):
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)

    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)

    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)

    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)

    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)

    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # clf_ratio_list:
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test

        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d:   ' % (index + 1), result)

            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
Example #8
def node_classification(embeddings, label_path, name, size):

    X, Y = read_node_label(embeddings, label_path)

    f_c = open('results/%s_classification_%d.txt' % (name, size), 'w')

    all_ratio = []

    for tr_frac in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:

        print(" Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
        clf = Classifier(embeddings=embeddings, clf=LogisticRegression(), name=name)
        results = clf.split_train_evaluate(X, Y, tr_frac)

        avg = 'macro'
        f_c.write(name + ' train percentage: ' + str(tr_frac) + ' F1-' + avg + ' ' + str('%0.5f' % results[avg]))
        all_ratio.append(results[avg])
        f_c.write('\n')
Example #9
def plot_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()
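A minimal usage sketch for the function above (for reference only), assuming embeddings is a dict keyed by node id, e.g. the result of model.get_embeddings() from the DeepWalk pipeline in Example #19:

embeddings = model.get_embeddings()  # dict: node id -> embedding vector
plot_embeddings(embeddings)          # t-SNE projection, colored by node label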
Example #10
def plot_embeddings(embeddings, label_path, name):
    X, Y = read_node_label(embeddings, label_path)

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors

    plt.axis('off')
    plt.legend(loc='upper right', prop={'size': 15}, bbox_to_anchor=(1.15, 1), ncol=1)
    # plt.title('%s graph ' % name)
    plt.savefig('%s_vis.pdf' % name, bbox_inches='tight', dpi=100)
Example #11
def main(args):
    print("xnetmf", "begin...")
    t1 = time.time()
    print("Reading...")
    nx_graph = nx.read_edgelist(args.input, nodetype=int, comments="%")
    adj_matrix = nx.adjacency_matrix(nx_graph).todense()
    print(adj_matrix)
    g = Graph(adj_matrix)
    rep_method = RepMethod(
        max_layer=2
    )  # Learn representations with xNetMF.  Can adjust parameters (e.g. as in REGAL)
    representations = src.xnetmf.get_representations(g, rep_method)
    print(representations)
    print(representations.shape)
    print("TAWD", "begin...")
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = xtadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio *
                                                              100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
Example #12
def train():

    folder = '../data/wiki/'
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'wiki.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_seq = read_bag_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_list = read_bag_node_list(folder + 'node_sequences_10_10.txt')

    all_sentence_ebd = np.load('../model/wiki/all_sentence_ebd.npy')
    print(all_sentence_ebd.shape)
    #print("sentence length", len(all_sentence_ebd[0][0]))
    all_reward = np.load('../model/wiki/all_reward.npy')
    average_reward = np.load('../model/wiki/average_reward.npy')

    g_rl = tf.Graph()
    sess2 = tf.Session(graph=g_rl)
    env = environment(500)

    with g_rl.as_default():
        with sess2.as_default():

            myAgent = agent(0.03, 500)
            updaterate = 1
            num_epoch = 5
            sampletimes = 3
            best_reward = -100000

            init = tf.global_variables_initializer()
            sess2.run(init)
            saver = tf.train.Saver()
            #saver.restore(sess2, save_path='rlmodel/rl.ckpt')

            # Zero out the trainable variables
            tvars_best = sess2.run(myAgent.tvars)
            for index, var in enumerate(tvars_best):
                tvars_best[index] = var * 0

            # Save the previous values of the trainable variables
            tvars_old = sess2.run(myAgent.tvars)

            # Zero out the gradient buffer
            gradBuffer = sess2.run(myAgent.tvars)
            for index, grad in enumerate(gradBuffer):
                gradBuffer[index] = grad * 0

            g_rl.finalize()

            for epoch in range(num_epoch):

                all_list = list(range(len(all_sentence_ebd)))
                total_reward = []

                # shuffle the bags
                random.shuffle(all_list)

                for batch in tqdm.tqdm(all_list):
                    #print("batch", batch)
                    #for batch in tqdm.tqdm(range(10000)):

                    # Fetch this bag's node, its sentence embeddings, and the corresponding rewards
                    batch_node = node_bag_list[batch]
                    batch_sentence_ebd = all_sentence_ebd[batch]
                    #print("batch_sentence_ebd", batch_sentence_ebd.shape)
                    batch_reward = all_reward[batch]
                    batch_len = len(batch_sentence_ebd)

                    list_list_state = []
                    list_list_action = []
                    list_list_reward = []
                    avg_reward = 0

                    # add sample times
                    for j in range(sampletimes):
                        #reset environment
                        # Reset the environment; it returns the current state and the average of past states
                        state = env.reset(batch_sentence_ebd, batch_reward)
                        #print('state shape' ,state[0].shape, state[1].shape)
                        list_action = []
                        list_state = []
                        old_prob = []

                        #get action
                        #start = time.time()
                        for i in range(batch_len):

                            state_in = np.append(state[0], state[1])
                            # print("state num", i)
                            # print("state_in.shape", state_in.shape)
                            feed_dict = {}
                            #feed_dict[myAgent.node_seq] = [state[1]]
                            feed_dict[myAgent.state_in] = [state_in]
                            # Compute action probabilities from the state and sample an action
                            prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                            # print("prob", prob)
                            old_prob.append(prob[0])
                            action = get_action(prob)
                            #add produce data for training cnn model
                            # Record the action and state for the later CNN-model update
                            list_action.append(action)
                            list_state.append(state)
                            # Update the state according to the chosen action
                            state = env.step(action)
                        #end = time.time()
                        #print ('get action:',end - start)

                        if env.num_selected == 0:
                            tmp_reward = average_reward
                        else:
                            tmp_reward = env.reward()
                        # Accumulate the reward
                        avg_reward += tmp_reward
                        # Record the actions taken and the reward obtained
                        list_list_state.append(list_state)
                        list_list_action.append(list_action)
                        list_list_reward.append(tmp_reward)

                    avg_reward = avg_reward / sampletimes
                    # add sample times
                    for j in range(sampletimes):

                        # Retrieve the states, actions, and reward recorded during the sampling above
                        list_state = list_list_state[j]
                        list_action = list_list_action[j]
                        reward = list_list_reward[j]

                        # compute gradient
                        # start = time.time()
                        list_reward = [
                            reward - avg_reward for x in range(batch_len)
                        ]
                        list_state_in = [
                            np.append(state[0], state[1])
                            for state in list_state
                        ]

                        feed_dict = {}
                        feed_dict[myAgent.state_in] = list_state_in
                        feed_dict[myAgent.reward_holder] = list_reward
                        feed_dict[myAgent.action_holder] = list_action
                        '''
                        loss =sess2.run(myAgent.loss, feed_dict=feed_dict)
                        if loss == float("-inf"):
                            probs,pis = sess2.run([myAgent.prob,myAgent.pi], feed_dict=feed_dict)
                            print(' ')
                            print ('batch:',batch)
                            print (old_prob)
                            print (list_action)
                            print(probs)
                            print (pis)
                            print('error!')
                            return 0
                        '''
                        # Compute the gradients
                        grads = sess2.run(myAgent.gradients,
                                          feed_dict=feed_dict)
                        for index, grad in enumerate(grads):
                            gradBuffer[index] += grad
                        #end = time.time()
                        #print('get loss and update:', end - start)
                        '''
                        print (len(list_state),len(list_action),len(list_reward),len(list_entity1),len(list_entity2))
                        print (list_action)
                        print (list_reward)
                        print (list_entity1)
                        print (list_entity2)
                        break
                        '''
                    #decide action and compute reward
                    # Reset the environment
                    state = env.reset(batch_sentence_ebd, batch_reward)
                    old_prob = []
                    for i in range(batch_len):
                        # Decide the action and compute the reward
                        state_in = np.append(state[0], state[1])
                        feed_dict = {}
                        #feed_dict[myAgent.node_seq] = [state[1]]
                        feed_dict[myAgent.state_in] = [state_in]
                        prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                        old_prob.append(prob[0])
                        action = decide_action(prob)
                        state = env.step(action)
                    chosen_reward = [
                        batch_reward[x] for x in env.list_selected
                    ]
                    total_reward += chosen_reward

                # apply the accumulated gradients
                feed_dict = dictionary = dict(
                    zip(myAgent.gradient_holders, gradBuffer))
                sess2.run(myAgent.update_batch, feed_dict=feed_dict)
                for index, grad in enumerate(gradBuffer):
                    gradBuffer[index] = grad * 0

                # get tvars_new: fetch the latest trainable variables
                tvars_new = sess2.run(myAgent.tvars)

                # update old variables of the target network
                tvars_update = sess2.run(myAgent.tvars)
                for index, var in enumerate(tvars_update):
                    tvars_update[index] = updaterate * tvars_new[index] + (
                        1 - updaterate) * tvars_old[index]

                feed_dict = dictionary = dict(
                    zip(myAgent.tvars_holders, tvars_update))
                sess2.run(myAgent.update_tvar_holder, feed_dict)
                tvars_old = sess2.run(myAgent.tvars)
                #break

                #find the best parameters
                chosen_size = len(total_reward)
                total_reward = np.mean(np.array(total_reward))

                if (total_reward > best_reward):
                    best_reward = total_reward
                    tvars_best = tvars_old
                #print ('chosen sentence size:',chosen_size)
                #print ('total_reward:',total_reward)
                #print ('best_reward',best_reward)

            #set parameters = best_tvars
            feed_dict = dictionary = dict(
                zip(myAgent.tvars_holders, tvars_best))
            sess2.run(myAgent.update_tvar_holder, feed_dict)
            #save model
            saver.save(
                sess2,
                save_path='../model/wiki/stne_transformer_model_rl_model.ckpt')
Example #13
def select(save_path):

    folder = '../data/cora/'
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'cora.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_seq = read_bag_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_list = read_bag_node_list(folder + 'node_sequences_10_10.txt')

    all_sentence_ebd = np.load('../model/cora/all_sentence_ebd.npy')
    all_reward = np.load('../model/cora/all_reward.npy')
    average_reward = np.load('../model/cora/average_reward.npy')

    selected_seq = []
    print("selected_seq")

    g_rl = tf.Graph()
    sess2 = tf.Session(graph=g_rl)
    env = environment(500)

    with g_rl.as_default():
        with sess2.as_default():

            myAgent = agent(0.02, 500)
            init = tf.global_variables_initializer()
            sess2.run(init)
            saver = tf.train.Saver()
            saver.restore(sess2, save_path=save_path)
            g_rl.finalize()

            for epoch in range(1):

                total_reward = []
                num_chosen = 0

                all_list = list(range(len(all_sentence_ebd)))

                for batch in tqdm.tqdm(all_list):

                    batch_node = node_bag_list[batch]
                    batch_sentence_ebd = all_sentence_ebd[batch]
                    batch_reward = all_reward[batch]
                    batch_len = len(batch_sentence_ebd)

                    batch_seq = node_bag_seq[batch_node]

                    # reset environment
                    state = env.reset(batch_sentence_ebd, batch_reward)
                    old_prob = []

                    # get action
                    # start = time.time()
                    for i in range(batch_len):
                        state_in = np.append(state[0], state[1])
                        feed_dict = {}
                        feed_dict[myAgent.state_in] = [state_in]
                        prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                        old_prob.append(prob[0])
                        action = decide_action(prob)
                        # produce data for training cnn model
                        state = env.step(action)
                        if action == 1:
                            num_chosen += 1
                    #print (old_prob)
                    chosen_reward = [
                        batch_reward[x] for x in env.list_selected
                    ]
                    total_reward += chosen_reward

                    selected_seq += [batch_seq[x] for x in env.list_selected]
                print(num_chosen)
    selected_seq = np.array(selected_seq)

    np.save('../model/cora/selected_seq.npy', selected_seq)
Example #14
                                         X_train_idx,
                                         Y_train,
                                         X_test_idx,
                                         Y_test,
                                         Y_all,
                                         testnum=5)
    print(res)


if __name__ == '__main__':
    import os
    datafile = 'citeseer'
    edge_file = os.path.join("datasets", datafile, "graph.txt")
    label_file = os.path.join("datasets", datafile, "group.txt")
    single_label = True

    X, Y = read_node_label(label_file)
    G = read_graph_as_matrix(nodeids=X, edge_file=edge_file)

    removed_class = ['0', '1']
    X_train_idx, X_test_idx, Y_train, Y_test, X_train_cid_idx, Y_train_cid = completely_imbalanced_split_train(
        X, Y, train_precent=0.5, removed_class=removed_class)
    print('completely-imbalanced train number', len(X_train_cid_idx))

    vectors = run_RSDNE(G, X_train_cid_idx, Y_train_cid)
    res = evaluate_RSNDE(vectors,
                         X_train_idx,
                         X_test_idx,
                         Y_train,
                         Y_test,
                         Y_all=Y)
Example #15
def train(num_epoch, full_seq_name):

    folder = '../data/wiki/'
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'wiki.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_seq = read_bag_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_list = read_bag_node_list(folder + 'node_sequences_10_10.txt')
    node_degree = read_node_degree(folder + 'node_degree.txt')

    all_sentence_ebd = np.load('../model/wiki/all_sentence_ebd.npy')
    print(all_sentence_ebd.shape)
    #print("sentence length", len(all_sentence_ebd[0][0]))
    all_reward = np.load('../model/wiki/all_reward.npy')
    average_reward = np.load('../model/wiki/average_reward.npy')

    g_stne = tf.Graph()
    g_rl = tf.Graph()
    sess1 = tf.Session(graph=g_stne)
    sess2 = tf.Session(graph=g_rl)

    with g_stne.as_default():
        with sess1.as_default():
            interact = stne_transformer_new_reward.interaction(
                sess1, save_path='../model/wiki/stne_model_transformer.ckpt')
            tvars_best_cnn = interact.tvars()
            for index, var in enumerate(tvars_best_cnn):
                tvars_best_cnn[index] = var * 0

    g_stne.finalize()
    env = environment(500)
    best_score = -100000

    with g_rl.as_default():
        with sess2.as_default():

            myAgent = agent(0.02, 500)
            updaterate = 0.01
            #num_epoch = 10
            sampletimes = 3
            best_reward = -100000

            init = tf.global_variables_initializer()
            sess2.run(init)
            saver = tf.train.Saver()
            saver.restore(
                sess2,
                save_path='../model/wiki/stne_transformer_model_rl_model.ckpt')

            tvars_best_rl = sess2.run(myAgent.tvars)
            for index, var in enumerate(tvars_best_rl):
                tvars_best_rl[index] = var * 0

            tvars_old = sess2.run(myAgent.tvars)

            gradBuffer = sess2.run(myAgent.tvars)
            for index, grad in enumerate(gradBuffer):
                gradBuffer[index] = grad * 0

            g_rl.finalize()

            trained_node_set = set()
            update_full_seq = []
            for epoch in range(num_epoch):

                update_seq = []

                all_list = list(range(len(all_sentence_ebd)))
                total_reward = []

                # shuffle bags
                random.shuffle(all_list)

                print('update the rlmodel')
                for batch in tqdm.tqdm(all_list):
                    #for batch in tqdm.tqdm(range(10000)):

                    batch_node = node_bag_list[batch]
                    batch_sentence_ebd = all_sentence_ebd[batch]
                    batch_reward = all_reward[batch]
                    batch_len = len(batch_sentence_ebd)

                    batch_seq = node_bag_seq[batch_node]

                    list_list_state = []
                    list_list_action = []
                    list_list_reward = []
                    avg_reward = 0

                    # add sample times
                    for j in range(sampletimes):
                        #reset environment
                        state = env.reset(batch_sentence_ebd, batch_reward)
                        list_action = []
                        list_state = []
                        old_prob = []

                        #get action
                        #start = time.time()
                        for i in range(batch_len):

                            state_in = np.append(state[0], state[1])
                            feed_dict = {}
                            feed_dict[myAgent.state_in] = [state_in]
                            prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                            old_prob.append(prob[0])
                            action = get_action(prob)
                            '''
                            if action == None:
                                print (123)
                            action = 1
                            '''
                            #add produce data for training cnn model
                            list_action.append(action)
                            list_state.append(state)
                            state = env.step(action)
                        #end = time.time()
                        #print ('get action:',end - start)

                        if env.num_selected == 0:
                            tmp_reward = average_reward
                        else:
                            tmp_reward = env.reward()

                        avg_reward += tmp_reward
                        list_list_state.append(list_state)
                        list_list_action.append(list_action)
                        list_list_reward.append(tmp_reward)

                    avg_reward = avg_reward / sampletimes
                    # add sample times
                    for j in range(sampletimes):

                        list_state = list_list_state[j]
                        list_action = list_list_action[j]
                        reward = list_list_reward[j]

                        # compute gradient
                        # start = time.time()
                        list_reward = [
                            reward - avg_reward for x in range(batch_len)
                        ]
                        list_state_in = [
                            np.append(state[0], state[1])
                            for state in list_state
                        ]

                        feed_dict = {}
                        feed_dict[myAgent.state_in] = list_state_in
                        feed_dict[myAgent.reward_holder] = list_reward
                        feed_dict[myAgent.action_holder] = list_action

                        grads = sess2.run(myAgent.gradients,
                                          feed_dict=feed_dict)
                        for index, grad in enumerate(grads):
                            gradBuffer[index] += grad
                        #end = time.time()
                        #print('get loss and update:', end - start)

                    #decide action and compute reward
                    state = env.reset(batch_sentence_ebd, batch_reward)
                    old_prob = []
                    for i in range(batch_len):
                        state_in = np.append(state[0], state[1])
                        feed_dict = {}
                        feed_dict[myAgent.state_in] = [state_in]
                        prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                        old_prob.append(prob[0])
                        action = decide_action(prob)
                        state = env.step(action)
                    chosen_reward = [
                        batch_reward[x] for x in env.list_selected
                    ]
                    total_reward += chosen_reward

                    update_seq += [batch_seq[x] for x in env.list_selected]
                    # if epoch == 0:
                    #     pass
                    # else:
                    #     update_full_seq += [batch_seq[x] for x in env.list_selected]
                    update_full_seq += [
                        batch_seq[x] for x in env.list_selected
                    ]
                print('finished')

                #print (len(update_word),len(update_pos1),len(update_pos2),len(update_y),updaterate)

                #train and update cnnmodel
                print('update the stnemodel')
                interact.update_stne(update_seq, updaterate)
                print('finished')

                # classification result
                print('classification result')
                f1_mi = interact.classification_result()
                for f1 in f1_mi:
                    print(f1)
                # classification new result
                # print('classification new result')
                # f1_mi_new = interact.classification_selected_result(trained_node_set, np.array(update_seq))
                # for f1_new in f1_mi_new:
                #     print(f1_new)
                # print('finished')

                #produce new embedding
                print('produce new embedding')
                average_reward, all_sentence_ebd, all_reward = interact.produce_new_embedding(
                )
                np.save('../model/wiki/average_reward_new.npy', average_reward)
                np.save('../model/wiki/all_sentence_ebd_new.npy',
                        all_sentence_ebd)
                np.save('../model/wiki/all_reward_new.npy', all_reward)
                average_score = average_reward
                print('finished')

                #update the rlmodel
                #apply gradient
                feed_dict = dictionary = dict(
                    zip(myAgent.gradient_holders, gradBuffer))
                sess2.run(myAgent.update_batch, feed_dict=feed_dict)
                for index, grad in enumerate(gradBuffer):
                    gradBuffer[index] = grad * 0

                #get tvars_new
                tvars_new = sess2.run(myAgent.tvars)

                # update old variables of the target network
                tvars_update = sess2.run(myAgent.tvars)
                for index, var in enumerate(tvars_update):
                    tvars_update[index] = updaterate * tvars_new[index] + (
                        1 - updaterate) * tvars_old[index]

                feed_dict = dictionary = dict(
                    zip(myAgent.tvars_holders, tvars_update))
                sess2.run(myAgent.update_tvar_holder, feed_dict)
                tvars_old = sess2.run(myAgent.tvars)
                #break

                #find the best parameters
                chosen_size = len(total_reward)
                total_reward = np.mean(np.array(total_reward))

                if (total_reward > best_reward):
                    best_reward = total_reward
                    tvars_best_rl = tvars_old

                if average_score > best_score:
                    best_score = average_score
                    #tvars_best_rl = tvars_old
                print('epoch:', epoch)
                print('chosen seq size:', chosen_size)
                print('total_reward:', total_reward)
                print('best_reward', best_reward)
                print('average score', average_score)
                print('best score', best_score)

            #set parameters = best_tvars
            feed_dict = dictionary = dict(
                zip(myAgent.tvars_holders, tvars_best_rl))
            sess2.run(myAgent.update_tvar_holder, feed_dict)
            #save model
            saver.save(sess2, save_path='../model/wiki/union_rl_model.ckpt')
            update_full_seq = np.array(update_full_seq)
            np.save(full_seq_name, update_full_seq)

    #interact.update_tvars(tvars_best_cnn)
    interact.save_stnemodel(save_path='../model/wiki/union_cnn_model.ckpt')
Example #16
                W[row, col] = 0
                W[col, row] = 0
    print('dissimilar type G', type(W), np.count_nonzero(W))
    temp = (W + W.T) / 2.0
    Lw = csgraph.laplacian(temp, normed=False)  # L = D - temp

    return Lw


if __name__ == '__main__':
    import os
    from classify import read_node_label
    from label_utils_functions import completely_imbalanced_split_train
    from basic_functions import get_data_file, read_graph_as_matrix
    edge_file, label_file, feature_file = get_data_file()
    X, Y = read_node_label(label_file)

    removed_class = ['0', '1']
    X_train_idx, X_test_idx, Y_train, Y_test, X_train_cid_idx, Y_train_cid = completely_imbalanced_split_train(
        X, Y, 0.1, removed_class)
    print('train number', len(X_train_cid_idx))
    '''node_num = len(X_train_idx) + len(X_test_idx)
    print('node num:', node_num)
    Ls = build_Ls_matrix(node_num, X_train_cid_idx, Y_train_cid, orgk=5)
    print(Ls)'''

    nodeids, nouse = read_node_label(label_file)
    G = read_graph_as_matrix(nodeids, edge_file)
    Lw = build_Lw_matrix(G, X_train_cid_idx, Y_train_cid)
    print(Lw)
    '''
Example #17
File: main.py Project: esraabil/NECL
def main(args):
    print("number-walks " + str(args.number_walks))
    print("representation-size " + str(args.representation_size))
    print("walk-length " + str(args.walk_length))
    print("inout_fle " + str(args.input))
    print("******")
    g = Graph()
    deepw = False
    # similarity thresholds for compression
    trsl = [0.45, 0.495, 0.5, 0.55, 0.6, 0.7, 0.8, 1]
    #    trsl=[ 0.5 ]
    learn = True
    X, Y = read_node_label(args.label_file)
    seed = 0
    clsratio = [0.01, 0.05, 0.07, 0.1, 0.25, 0.5, 0.7, 0.8]
    #    clsratio=[ 0.1,0.2,0.4, 0.6,0.7,0.8,0.9]#,0.7,0.8]# use for blogcatalog

    np.random.seed(seed)
    shuffle_indices = np.random.permutation(np.arange(len(X)))
    f = open(args.input + "shu.txt", "w")
    f.writelines(str(item) + "\n" for item in shuffle_indices)
    f.close()
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input, directed=args.directed)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)
    G = g.G
    print("before spar, n: " + str(len(G.nodes())) + " m: " +
          str(len(G.edges())))
    #compute similarity score for compression
    t1 = time.time()
    p = pC(G, 0.45)
    scoreNode = p.ScoreCompute()
    t3 = time.time()
    f = open(args.input + "score.txt", "w")
    f.writelines(
        str(n[0]) + " " + str(n[1]) + " " + str(scoreNode[n]) + "\n"
        for n in scoreNode)
    f.close()
    print("total scorecom time: " + str(t3 - t1))
    #   read similarity scores from file
    #    f=open(args.input+"score.txt","r")
    #    scoreNode=dict()
    #    for x in f:
    #        l=x.split()
    #        scoreNode[((l[0]),(l[1]))] = float(l[2])

    for kk in range(0, len(trsl)):
        if learn:  # do embedding
            ths = trsl[kk]  #args.trs
            print("threshold is ...", ths)
            if args.graph_format == 'adjlist':
                g.read_adjlist(filename=args.input, directed=args.directed)
            elif args.graph_format == 'edgelist':
                g.read_edgelist(filename=args.input,
                                weighted=args.weighted,
                                directed=args.directed)
            if ths != 1:  #compression
                t1 = time.time()
                G = g.G
                G, nl2 = makeCompression(G, scoreNode, ths)
                f = open(args.input + "af_spar.txt", "w")
                f.writelines(str(n) + " " + str(nl2[n]) + "\n" for n in nl2)
                f.close()
                writeg(G, args)
                t2 = time.time()
                print("total_sparc_time: " + str(t2 - t1))
            #embedding
            t1 = time.time()
            print("After_compresing,n,m " + str(len(g.G.nodes())) + " " +
                  str(len(g.G.edges())))
            model = node2vec.Node2vec(graph=g,
                                      path_length=args.walk_length,
                                      num_paths=args.number_walks,
                                      dim=args.representation_size,
                                      workers=args.workers,
                                      p=args.p,
                                      q=args.q,
                                      window=args.window_size,
                                      dw=deepw)
            t2 = time.time()
            print("total_embeding_time " + str(t2 - t1))
            vectors = model.vectors
            if ths != 1:  #add embedding of removed nodes in compression
                addBack(nl2, vectors)
            np.save(args.output + "_" + str(ths) + ".npy", vectors)
        else:  #load embeddings
            vectors = np.load(args.output + "_" + str(ths) + ".npy")
            vectors = vectors.item(0)
            print("file_loaded")

    #print("Training classifier")
    #split_train_evaluate2 for single label (cora and wiki)
    #split_train_evaluate for multi label (dblp and blogcatalog)
        for r in clsratio:
            clfa = Classifier(vectors,
                              clf=LogisticRegression(solver='liblinear'))
            res = clfa.split_train_evaluate2(
                X, Y, r, shuffle_indices)  # args.clf_ratio)
            print(str(r) + " " + str(res["macro"]) + " " + str(res["micro"]))
Example #18
    dropout = 0.2  # Dropout ratio
    clf_ratio = [0.1, 0.2, 0.3, 0.4, 0.5]  # Ratio of training samples in subsequent classification
    # b_s = 128  # Size of batches
    lr = 0.001  # Learning rate of RMSProp
    keep_prob = 0.5
    attention_size = 1000
    # max_iters = 20000
    print_every_k_iterations = 100
    idx = 0

    loss_1 = 0
    loss_2 = 0
    config = Config()
    start = time.time()
    fobj = open(fn, 'w')
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'wiki.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    nx_G = read_graph(folder + 'wiki.edgelist')
    nodes = nx_G.nodes()
    N = len(nodes)
    X_1 = read_node_features(folder + 'wiki.features')
    print(node_fea.shape[1], 'CCCCCCCCCCCCCCCCCCCC')
    # with tf.Session() as sess:
    model = STNE(config, hidden_dim=h_dim, nx_G=nx_G, X_1=X_1, seq_len=s_len, attention_size=100, depth=dpt,
                 node_fea=node_fea, node_fea_trainable=trainable,
                 node_num=node_fea.shape[0], fea_dim=node_fea.shape[1])

    init = tf.global_variables_initializer()
    sess = tf.Session(config=config_tf)
    sess.run(init)
Example #19
   Author :       haxu
   date:          2019/4/3
-------------------------------------------------
   Change Activity:
                   2019/4/3:
-------------------------------------------------
"""
__author__ = 'haxu'

import networkx as nx
from deepwalk import DeepWalk
from classify import read_node_label, Classifier
from sklearn.linear_model import LogisticRegression

if __name__ == '__main__':
    G = nx.read_edgelist('../data/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    model = DeepWalk(G, walk_length=30, num_walks=80, workers=4)

    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()

    X, Y = read_node_label('../data/wiki_labels.txt')

    tr_frac = 0.8
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
Example #20
def train_stne(node_seq):
    folder = '../data/cora/'
    fn = '../data/cora/result.txt'

    dpt = 1  # Depth of both the encoder and the decoder layers (MultiCell RNN)
    h_dim = 500  # Hidden dimension of encoder LSTMs
    s_len = 10  # Length of input node sequence
    epc = 20  # Number of training epochs
    trainable = False  # Node features trainable or not
    dropout = 0.2  # Dropout ratio
    clf_ratio = [0.1, 0.2, 0.3, 0.4, 0.5]  # Ratio of training samples in subsequent classification
    b_s = 128  # Size of batches
    lr = 0.001  # Learning rate of RMSProp

    start = time.time()
    fobj = open(fn, 'w')
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'cora.features')

    with tf.Session() as sess:
        model = STNE(hidden_dim=h_dim,
                     node_fea_trainable=trainable,
                     seq_len=s_len,
                     depth=dpt,
                     node_fea=node_fea,
                     node_num=node_fea.shape[0],
                     fea_dim=node_fea.shape[1],
                     lr=0.001)
        #train_op = tf.train.RMSPropOptimizer(lr).minimize(model.loss_ce, global_step=model.global_step)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        trained_node_set = set()
        all_trained = False
        for epoch in range(epc):
            start_idx, end_idx = 0, b_s
            print('Epoch,\tStep,\tLoss,\t#Trained Nodes')
            while end_idx < len(node_seq):
                _, loss, step = sess.run(
                    [model.train_op, model.loss_ce, model.global_step],
                    feed_dict={
                        model.input_seqs: node_seq[start_idx:end_idx],
                        model.dropout: dropout
                    })
                if not all_trained:
                    all_trained = check_all_node_trained(
                        trained_node_set, node_seq[start_idx:end_idx],
                        node_fea.shape[0])

                if step % 10 == 0:
                    print(epoch, '\t', step, '\t', loss, '\t',
                          len(trained_node_set))
                    # if all_trained:
                    #     f1_mi = []
                    #     for ratio in clf_ratio:
                    #         f1_mi.append(node_classification(session=sess, bs=b_s, seqne=model, sequences=node_seq,
                    #                                          seq_len=s_len, node_n=node_fea.shape[0], samp_idx=X,
                    #                                          label=Y, ratio=ratio))

                    #     print('step ', step)
                    #     fobj.write('step ' + str(step) + ' ')
                    #     for f1 in f1_mi:
                    #         print(f1)
                    #         fobj.write(str(f1) + ' ')
                    #     fobj.write('\n')
                start_idx, end_idx = end_idx, end_idx + b_s

            if start_idx < len(node_seq):
                sess.run(
                    [model.train_op, model.loss_ce, model.global_step],
                    feed_dict={
                        model.input_seqs: node_seq[start_idx:len(node_seq)],
                        model.dropout: dropout
                    })

            minute = np.around((time.time() - start) / 60)
            print('\nepoch ', epoch, ' finished in ', str(minute),
                  ' minutes\n')

            saver.save(sess, save_path="../model/cora/select_stne_model.ckpt")
Example #21
                    dest="neurons_hiddenlayer",
                    default=[128, 64, 32],
                    type=list,
                    help='Number of Neurons AE.')

args = parser.parse_args()
assert args.algorithm in {
    'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
}
assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)
###############################################################################################################################
## Read Data
X, Y = read_node_label(args.label_file)
node_size = len(X)

###############################################################################################################################
## Building model

sentence_input = Input(shape=(args.kstep, node_size),
                       dtype='float32',
                       name='sentence_inputt')
neg_input = Input(shape=(args.neg_size, args.kstep, node_size),
                  dtype='float32',
                  name='neg_inputt')
predict = Input(shape=(17, ), dtype='int32', name='predictt')

e_w = sentence_input
y_s = Average()(sentence_input)
Example #22
def main(args):
    t1 = time.time()
    g = Graph()
    print("Reading...")

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)
    # if args.method == 'node2vec':
    #     model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
    #                               num_paths=args.number_walks, dim=args.representation_size,
    #                               workers=args.workers, p=args.p, q=args.q, window=args.window_size)
    # elif args.method == 'line':
    #     if args.label_file and not args.no_auto_save:
    #         model = line.LINE(g, epoch=args.epochs, rep_size=args.representation_size, order=args.order,
    #                           label_file=args.label_file, clf_ratio=args.clf_ratio)
    #     else:
    #         model = line.LINE(g, epoch=args.epochs,
    #                           rep_size=args.representation_size, order=args.order)
    # elif args.method == 'deepWalk':
    #     model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
    #                               num_paths=args.number_walks, dim=args.representation_size,
    #                               workers=args.workers, window=args.window_size, dw=True)
    # elif args.method == 'tadw':
    #     # assert args.label_file != ''
    #     assert args.feature_file != ''
    #     g.read_node_label(args.label_file)
    #     g.read_node_features(args.feature_file)
    #     model = tadw.TADW(
    #         graph=g, dim=args.representation_size, lamb=args.lamb)
    # elif args.method == 'gcn':
    #     assert args.label_file != ''
    #     assert args.feature_file != ''
    #     g.read_node_label(args.label_file)
    #     g.read_node_features(args.feature_file)
    #     model = gcnAPI.GCN(graph=g, dropout=args.dropout,
    #                        weight_decay=args.weight_decay, hidden1=args.hidden,
    #                        epochs=args.epochs, clf_ratio=args.clf_ratio)
    # elif args.method == 'grarep':
    #     model = GraRep(graph=g, Kstep=args.kstep, dim=args.representation_size)
    # elif args.method == 'lle':
    #     model = lle.LLE(graph=g, d=args.representation_size)
    # elif args.method == 'hope':
    #     model = hope.HOPE(graph=g, d=args.representation_size)
    if args.method == 'sdne':
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)
    elif args.method == 'sdne_binary_loss':
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)
    elif args.method == 'sdne_meta_path':
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)
    # elif args.method == 'lap':
    #     model = lap.LaplacianEigenmaps(g, rep_size=args.representation_size)
    # elif args.method == 'gf':
    #     model = gf.GraphFactorization(g, rep_size=args.representation_size,
    #                                   epoch=args.epochs, learning_rate=args.lr, weight_decay=args.weight_decay)
    t2 = time.time()
    print('cost time is : {}'.format(t2 - t1))
    if args.method != 'gcn':
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(
            args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)