def hybrid_logistic(dataset, k, embeddings, model):
    """Train a logistic-regression link-sign classifier on node embeddings
    concatenated with hand-crafted pair features, then print and return the
    evaluation metrics.

    Returns:
        (pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score)
    """
    print("Hybrid model: Using features..")
    train_X, train_y, test_X, test_y = read_train_test_data(dataset, k)
    fea = FeaExtra(k=k, dataset=dataset)

    def encode(u, v):
        # Endpoint embeddings followed by the extracted pair features.
        pair_features = np.array(fea.get_features(u, v))
        return np.concatenate([embeddings[u], embeddings[v], pair_features])

    train_X1 = [encode(u, v) for u, v in train_X]
    test_X1 = [encode(u, v) for u, v in test_X]

    clf = linear_model.LogisticRegression()
    clf.fit(train_X1, train_y)
    pred = clf.predict(test_X1)
    pred_p = clf.predict_proba(test_X1)

    pos_ratio = np.sum(test_y) / test_y.shape[0]
    accuracy = metrics.accuracy_score(test_y, pred)
    f1_score0 = metrics.f1_score(test_y, pred)
    f1_score1 = metrics.f1_score(test_y, pred, average='macro')
    f1_score2 = metrics.f1_score(test_y, pred, average='micro')
    auc_score = metrics.roc_auc_score(test_y, pred_p[:, 1])

    print("pos_ratio:", pos_ratio)
    print('accuracy:', accuracy)
    print("f1_score:", f1_score0)
    print("macro f1_score:", f1_score1)
    print("micro f1_score:", f1_score2)
    print("auc score:", auc_score)
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
def logistic_embedding8(k=1, dataset='epinions'):
    """Train a logistic-regression link-sign classifier on the hand-crafted
    pair features alone (no embeddings), then print and return the metrics.

    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    print(dataset, k, 'fea')
    train_X, train_y, test_X, test_y = read_train_test_data(dataset, k)
    extractor = FeaExtra(k=k, dataset=dataset)

    train_X1 = [extractor.get_features(u, v) for u, v in train_X]
    test_X1 = [extractor.get_features(u, v) for u, v in test_X]

    clf = linear_model.LogisticRegression()
    clf.fit(train_X1, train_y)
    pred = clf.predict(test_X1)
    pred_p = clf.predict_proba(test_X1)

    pos_ratio = np.sum(test_y) / test_y.shape[0]
    accuracy = metrics.accuracy_score(test_y, pred)
    f1_score0 = metrics.f1_score(test_y, pred)
    f1_score1 = metrics.f1_score(test_y, pred, average='macro')
    f1_score2 = metrics.f1_score(test_y, pred, average='micro')
    auc_score = metrics.roc_auc_score(test_y, pred_p[:, 1])

    print("pos_ratio:", pos_ratio)
    print('accuracy:', accuracy)
    print("f1_score:", f1_score0)
    print("macro f1_score:", f1_score1)
    print("micro f1_score:", f1_score2)
    print("auc score:", auc_score)
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
def run(dataset='bitcoin_alpha', k=2):
    """Train a SiGAT model on split ``k`` of ``dataset``.

    Loads the signed edgelist, augments the six base adjacency relations
    with motif-derived neighbor sets, trains with Adam, and periodically
    saves all node embeddings to ``OUTPUT_DIR`` as ``.npy`` files.
    """
    # NOTE(review): +3 extra node slots beyond the dataset size — confirm why.
    num_nodes = DATASET_NUM_DIC[dataset] + 3
    filename = './experiment-data/{}-train-{}.edgelist'.format(dataset, k)
    adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1, adj_lists2_2, adj_lists3 = load_data2(
        filename, add_public_foe=False)
    print(k, dataset, 'data load!')

    # Learnable node feature table.
    features = nn.Embedding(num_nodes, NODE_FEAT_SIZE)
    features.weight.requires_grad = True
    features.to(DEVICES)

    # Six base signed/directed relations.
    adj_lists = [
        adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1,
        adj_lists2_2
    ]

    #######
    fea_model = FeaExtra(dataset=dataset, k=k)
    # One defaultdict(set) per motif index (feature_part2 yields 16 values).
    adj_additions1 = [defaultdict(set) for _ in range(16)]
    adj_additions2 = [defaultdict(set) for _ in range(16)]
    adj_additions0 = [defaultdict(set) for _ in range(16)]
    a, b = 0, 0
    # NOTE(review): adj_additions0 is filled below but never merged into
    # adj_lists — looks like dead work; confirm before removing.
    for i in adj_lists3:
        for j in adj_lists3[i]:
            v_list = fea_model.feature_part2(i, j)
            for index, v in enumerate(v_list):
                if v > 0:
                    adj_additions0[index][i].add(j)
    # Motif neighbors derived from positive edges; `a` counts the hits.
    for i in adj_lists1_1:
        for j in adj_lists1_1[i]:
            v_list = fea_model.feature_part2(i, j)
            for index, v in enumerate(v_list):
                if v > 0:
                    adj_additions1[index][i].add(j)
                    a += 1
    # Motif neighbors derived from negative edges; `b` counts the hits.
    for i in adj_lists2_1:
        for j in adj_lists2_1[i]:
            v_list = fea_model.feature_part2(i, j)
            for index, v in enumerate(v_list):
                if v > 0:
                    adj_additions2[index][i].add(j)
                    b += 1
    # Sanity checks: at least one motif hit on each side.
    assert a > 0, 'positive something wrong'
    assert b > 0, 'negative something wrong'
    # 6 base relations + 16 positive-motif + 16 negative-motif lists.
    adj_lists = adj_lists + adj_additions1 + adj_additions2
    print(len(adj_lists), 'motifs')

    def func(adj_list):
        # Convert a {node: set(neighbors)} mapping into a sparse CSR
        # adjacency matrix of shape (num_nodes, num_nodes).
        edges = []
        for a in adj_list:
            for b in adj_list[a]:
                edges.append((a, b))
        edges = np.array(edges)
        adj = sp.csr_matrix(
            (np.ones(len(edges)), (edges[:, 0], edges[:, 1])),
            shape=(num_nodes, num_nodes))
        return adj

    adj_lists = list(map(func, adj_lists))

    features_lists = [features for _ in range(len(adj_lists))]
    # One attention aggregator per relation/motif adjacency.
    # NOTE(review): the comprehension's `features` target shadows the outer
    # `features` embedding (they are the same object here anyway).
    aggs = [
        AttentionAggregator(features, NODE_FEAT_SIZE, NODE_FEAT_SIZE,
                            num_nodes)
        for features, adj in zip(features_lists, adj_lists)
    ]

    enc1 = Encoder(features_lists, NODE_FEAT_SIZE, EMBEDDING_SIZE1, adj_lists,
                   aggs)
    model = SiGAT(enc1)
    model.to(DEVICES)
    # print(model.train())
    model.train()
    # Optimize the model, the encoder and the feature table jointly.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad,
               list(model.parameters()) + list(enc1.parameters()) \
               + list(features.parameters())),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )

    for epoch in range(EPOCHS + 2):
        total_loss = []
        if epoch % INTERVAL_PRINT == 0:
            # Evaluation pass: compute embeddings for all nodes in batches.
            model.eval()
            all_embedding = np.zeros((NUM_NODE, EMBEDDING_SIZE1))
            for i in range(0, NUM_NODE, BATCH_SIZE):
                begin_index = i
                end_index = i + BATCH_SIZE if i + BATCH_SIZE < NUM_NODE else NUM_NODE
                values = np.arange(begin_index, end_index)
                embed = model.forward(values.tolist())
                embed = embed.data.cpu().numpy()
                all_embedding[begin_index:end_index] = embed
            model.train()
        time1 = time.time()
        # Shuffle node ids each epoch for mini-batch training.
        nodes_pku = np.random.permutation(NUM_NODE).tolist()
        for batch in range(NUM_NODE // BATCH_SIZE):
            optimizer.zero_grad()
            b_index = batch * BATCH_SIZE
            e_index = (batch + 1) * BATCH_SIZE
            nodes = nodes_pku[b_index:e_index]
            loss = model.criterion(nodes, adj_lists1, adj_lists2)
            total_loss.append(loss.data.cpu().numpy())
            loss.backward()
            optimizer.step()
        print(
            f'epoch: {epoch}, loss: {np.sum(total_loss)}, time: {time.time()-time1}'
        )
        # NOTE(review): all_embedding is refreshed only on eval epochs, so
        # this can save a stale snapshot when INTERVAL_PRINT > 1 — confirm.
        fpath = os.path.join(
            OUTPUT_DIR,
            'embedding-{}-{}-{}_{}.npy'.format(dataset, k,
                                               str(EMBEDDING_SIZE1),
                                               str(epoch - 1)))
        np.save(fpath, all_embedding)
def run(dataset, k):
    """Train an SDGNN model on split ``k`` of ``dataset``.

    Builds the four directed signed relations, precomputes per-edge
    balanced-triangle counts used by the loss, trains with Adam, and
    periodically saves embeddings and probes them with logistic regression.
    """
    # NOTE(review): +3 extra node slots beyond the dataset size — confirm why.
    num_nodes = DATASET_NUM_DIC[dataset] + 3
    # adj_lists1, adj_lists2, adj_lists3 = load_data(k, dataset)
    filename = './experiment-data/{}-train-{}.edgelist'.format(dataset, k)
    adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1, adj_lists2_2, adj_lists3 = load_data2(
        filename)
    print(k, dataset, 'data load!')

    # Learnable node feature table.
    features = nn.Embedding(num_nodes, NODE_FEAT_SIZE)
    features.weight.requires_grad = True
    features = features.to(DEVICES)

    # The four directed edge relations; 4 motifs.
    adj_lists = [adj_lists1_1, adj_lists1_2, adj_lists2_1, adj_lists2_2]

    weight_dict = defaultdict(dict)
    fea_model = FeaExtra(dataset=dataset, k=k)

    # u -> v: count balanced triangles through each positive edge.
    for i in adj_lists1_1:
        for j in adj_lists1_1[i]:
            v_list1 = fea_model.feature_part2(i, j)
            # Mask selecting, from the 16 motif counts, the triads that are
            # balanced when the (i, j) edge is positive.
            mask = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]
            counts1 = np.dot(v_list1, mask)
            # Balanced-triangle count for (i, j); used later in the loss.
            weight_dict[i][j] = counts1
    # Same for negative edges, with the complementary balanced-triad mask.
    for i in adj_lists2_1:
        for j in adj_lists2_1[i]:
            v_list1 = fea_model.feature_part2(i, j)
            mask = [0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0]
            counts1 = np.dot(v_list1, mask)
            weight_dict[i][j] = counts1

    adj_lists = adj_lists  # no-op self-assignment, kept as-is
    print(len(adj_lists), 'motifs')

    def func(adj_list):
        # Convert a {node: set(neighbors)} mapping into a sparse CSR
        # adjacency matrix of shape (num_nodes, num_nodes).
        edges = []
        for a in adj_list:
            for b in adj_list[a]:
                edges.append((a, b))
        edges = np.array(edges)
        adj = sp.csr_matrix(
            (np.ones(len(edges)), (edges[:, 0], edges[:, 1])),
            shape=(num_nodes, num_nodes))
        return adj

    # Aggregator type is selected from the command line.
    if args.agg == 'mean':
        aggregator = MeanAggregator
    else:
        aggregator = AttentionAggregator

    adj_lists = list(map(func, adj_lists))

    # Neighbor aggregation per motif (first layer).
    aggs = [
        aggregator(features, NODE_FEAT_SIZE, NODE_FEAT_SIZE, num_nodes)
        for adj in adj_lists
    ]
    # 4 motifs + the node itself + cls => embedding.
    enc1 = Encoder(features, NODE_FEAT_SIZE, EMBEDDING_SIZE1, adj_lists, aggs)
    enc1 = enc1.to(DEVICES)

    # Second layer stacks on top of the first encoder's output.
    aggs2 = [
        aggregator(lambda n: enc1(n), EMBEDDING_SIZE1, EMBEDDING_SIZE1,
                   num_nodes) for _ in adj_lists
    ]
    enc2 = Encoder(lambda n: enc1(n), EMBEDDING_SIZE1, EMBEDDING_SIZE1,
                   adj_lists, aggs2)

    model = SDGNN(enc2)
    model = model.to(DEVICES)
    print(model.train())
    # Optimize the model, the first-layer encoder and the feature table
    # jointly.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad,
               list(model.parameters()) + list(enc1.parameters()) \
               + list(features.parameters())),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )

    for epoch in range(EPOCHS + 2):
        total_loss = []
        if epoch % INTERVAL_PRINT == 1:
            # Evaluation pass: compute, save and probe all node embeddings.
            model.eval()
            all_embedding = np.zeros((NUM_NODE, EMBEDDING_SIZE1))
            for i in range(0, NUM_NODE, BATCH_SIZE):
                begin_index = i
                end_index = i + BATCH_SIZE if i + BATCH_SIZE < NUM_NODE else NUM_NODE
                values = np.arange(begin_index, end_index)
                embed = model.forward(values.tolist())
                embed = embed.data.cpu().numpy()
                all_embedding[begin_index:end_index] = embed
            fpath = os.path.join(
                OUTPUT_DIR,
                'embedding-{}-{}-{}.npy'.format(dataset, k, str(epoch)))
            np.save(fpath, all_embedding)
            # Probe the saved embeddings with a logistic-regression classifier.
            pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = logistic_embedding(
                k=k, dataset=dataset, epoch=epoch, dirname=OUTPUT_DIR)
            model.train()
        time1 = time.time()
        # Shuffle node ids each epoch for mini-batch training.
        nodes_pku = np.random.permutation(NUM_NODE).tolist()
        for batch in range(NUM_NODE // BATCH_SIZE):
            optimizer.zero_grad()
            b_index = batch * BATCH_SIZE
            e_index = (batch + 1) * BATCH_SIZE
            nodes = nodes_pku[b_index:e_index]
            loss = model.criterion(
                nodes,
                adj_lists1,
                adj_lists2,
                adj_lists1_1,
                adj_lists2_1,
                weight_dict  # balanced-triangle counts per node pair (i, j)
            )
            total_loss.append(loss.data.cpu().numpy())
            loss.backward()
            optimizer.step()
        print(
            f'epoch: {epoch}, loss: {np.mean(total_loss)}, time: {time.time()-time1}'
        )