Example #1
# FeaExtra and read_train_test_data come from the repository's own modules;
# numpy and scikit-learn are the only external imports this snippet needs.
import numpy as np
from sklearn import linear_model, metrics


def hybrid_logistic(dataset, k, embeddings, model):
    print("Hybrid model: using node embeddings plus hand-crafted features...")
    train_X, train_y, test_X, test_y = read_train_test_data(dataset, k)
    fea = FeaExtra(k=k, dataset=dataset)
    train_X1 = []
    test_X1 = []

    for i, j in train_X:
        features_i_j = np.array(fea.get_features(i, j))
        train_X1.append(
            np.concatenate([embeddings[i], embeddings[j], features_i_j]))

    for i, j in test_X:
        features_i_j = np.array(fea.get_features(i, j))
        test_X1.append(
            np.concatenate([embeddings[i], embeddings[j], features_i_j]))

    logistic_function = linear_model.LogisticRegression()
    logistic_function.fit(train_X1, train_y)
    pred = logistic_function.predict(test_X1)
    pred_p = logistic_function.predict_proba(test_X1)

    pos_ratio = np.sum(test_y) / test_y.shape[0]
    accuracy = metrics.accuracy_score(test_y, pred)
    f1_score0 = metrics.f1_score(test_y, pred)
    f1_score1 = metrics.f1_score(test_y, pred, average='macro')
    f1_score2 = metrics.f1_score(test_y, pred, average='micro')

    auc_score = metrics.roc_auc_score(test_y, pred_p[:, 1])
    print("pos_ratio:", pos_ratio)
    print('accuracy:', accuracy)
    print("f1_score:", f1_score0)
    print("macro f1_score:", f1_score1)
    print("micro f1_score:", f1_score2)
    print("auc score:", auc_score)

    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
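A minimal usage sketch (the embedding path below is hypothetical, and note that the `model` argument is accepted but never used inside `hybrid_logistic`):

import numpy as np

# Hypothetical file produced by a prior training run (see Examples #3 and #4).
embeddings = np.load('./embeddings/embedding-epinions-1.npy')
pos_ratio, acc, f1, macro_f1, micro_f1, auc = hybrid_logistic(
    'epinions', 1, embeddings, model=None)  # model is unused in the body above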
Example #2
# Uses the same imports and repository helpers as Example #1.
def logistic_embedding8(k=1, dataset='epinions'):
    """Train a logistic regression on hand-crafted features only (no embeddings).

    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    print(dataset, k, 'fea')
    train_X, train_y, test_X, test_y = read_train_test_data(dataset, k)
    fea = FeaExtra(k=k, dataset=dataset)
    train_X1 = []
    test_X1 = []

    for i, j in train_X:
        train_X1.append(fea.get_features(i, j))

    for i, j in test_X:
        test_X1.append(fea.get_features(i, j))

    logistic = linear_model.LogisticRegression()
    logistic.fit(train_X1, train_y)

    pred = logistic.predict(test_X1)
    pred_p = logistic.predict_proba(test_X1)
    pos_ratio = np.sum(test_y) / test_y.shape[0]
    accuracy = metrics.accuracy_score(test_y, pred)
    f1_score0 = metrics.f1_score(test_y, pred)
    f1_score1 = metrics.f1_score(test_y, pred, average='macro')
    f1_score2 = metrics.f1_score(test_y, pred, average='micro')

    auc_score = metrics.roc_auc_score(test_y, pred_p[:, 1])
    print("pos_ratio:", pos_ratio)
    print('accuracy:', accuracy)
    print("f1_score:", f1_score0)
    print("macro f1_score:", f1_score1)
    print("micro f1_score:", f1_score2)
    print("auc score:", auc_score)

    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
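One detail worth knowing when reading these metrics: for single-label classification, micro-averaged F1 equals plain accuracy, so `f1_score2` above will always match `accuracy`. A minimal demonstration:

from sklearn import metrics

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
# Micro-averaging pools TP/FP/FN over all classes, which for single-label
# problems reduces to accuracy (0.8 here).
assert metrics.f1_score(y_true, y_pred, average='micro') == \
       metrics.accuracy_score(y_true, y_pred)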
Example #3
def run(dataset='bitcoin_alpha', k=2):
    # Upper-case names (DATASET_NUM_DIC, NODE_FEAT_SIZE, DEVICES, EPOCHS, ...)
    # are module-level constants defined elsewhere in the training script.
    num_nodes = DATASET_NUM_DIC[dataset] + 3

    filename = './experiment-data/{}-train-{}.edgelist'.format(dataset, k)
    adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1, adj_lists2_2, adj_lists3 = load_data2(
        filename, add_public_foe=False)
    print(k, dataset, 'data loaded!')
    features = nn.Embedding(num_nodes, NODE_FEAT_SIZE)
    features.weight.requires_grad = True

    features.to(DEVICES)

    adj_lists = [
        adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1,
        adj_lists2_2
    ]

    # ---- Motif-based adjacency augmentation ----
    fea_model = FeaExtra(dataset=dataset, k=k)
    # One adjacency dict for each of the 16 triad types; adj_additions0 is
    # filled from adj_lists3 below but not used afterwards.
    adj_additions1 = [defaultdict(set) for _ in range(16)]
    adj_additions2 = [defaultdict(set) for _ in range(16)]
    adj_additions0 = [defaultdict(set) for _ in range(16)]
    a, b = 0, 0  # counters of added positive / negative motif edges

    for i in adj_lists3:
        for j in adj_lists3[i]:
            v_list = fea_model.feature_part2(i, j)
            for index, v in enumerate(v_list):
                if v > 0:
                    adj_additions0[index][i].add(j)

    for i in adj_lists1_1:
        for j in adj_lists1_1[i]:
            v_list = fea_model.feature_part2(i, j)
            for index, v in enumerate(v_list):
                if v > 0:
                    adj_additions1[index][i].add(j)
                    a += 1

    for i in adj_lists2_1:
        for j in adj_lists2_1[i]:
            v_list = fea_model.feature_part2(i, j)
            for index, v in enumerate(v_list):
                if v > 0:
                    adj_additions2[index][i].add(j)
                    b += 1
    assert a > 0, 'no positive motif edges were added'
    assert b > 0, 'no negative motif edges were added'

    adj_lists = adj_lists + adj_additions1 + adj_additions2

    print(len(adj_lists), 'motifs')

    def func(adj_list):
        """Convert an adjacency dict {node: set(neighbors)} into a sparse CSR matrix."""
        edges = []
        for a in adj_list:
            for b in adj_list[a]:
                edges.append((a, b))
        edges = np.array(edges)
        adj = sp.csr_matrix((np.ones(len(edges)), (edges[:, 0], edges[:, 1])),
                            shape=(num_nodes, num_nodes))
        return adj

    adj_lists = list(map(func, adj_lists))
    features_lists = [features for _ in range(len(adj_lists))]
    aggs = [
        AttentionAggregator(features, NODE_FEAT_SIZE, NODE_FEAT_SIZE,
                            num_nodes)
        for features, adj in zip(features_lists, adj_lists)
    ]

    enc1 = Encoder(features_lists, NODE_FEAT_SIZE, EMBEDDING_SIZE1, adj_lists,
                   aggs)

    model = SiGAT(enc1)
    model.to(DEVICES)
    model.train()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        list(model.parameters()) + list(enc1.parameters()) \
                                        + list(features.parameters())),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY
                                 )

    for epoch in range(EPOCHS + 2):
        total_loss = []
        if epoch % INTERVAL_PRINT == 0:
            model.eval()
            # Materialize every node's embedding with the current weights.
            all_embedding = np.zeros((NUM_NODE, EMBEDDING_SIZE1))
            for i in range(0, NUM_NODE, BATCH_SIZE):
                begin_index = i
                end_index = min(i + BATCH_SIZE, NUM_NODE)
                values = np.arange(begin_index, end_index)
                embed = model(values.tolist())
                embed = embed.data.cpu().numpy()
                all_embedding[begin_index:end_index] = embed

            model.train()

        time1 = time.time()
        nodes_pku = np.random.permutation(NUM_NODE).tolist()
        for batch in range(NUM_NODE // BATCH_SIZE):
            optimizer.zero_grad()
            b_index = batch * BATCH_SIZE
            e_index = (batch + 1) * BATCH_SIZE
            nodes = nodes_pku[b_index:e_index]

            loss = model.criterion(nodes, adj_lists1, adj_lists2)
            total_loss.append(loss.data.cpu().numpy())

            loss.backward()
            optimizer.step()
        print(
            f'epoch: {epoch}, loss: {np.sum(total_loss)}, time: {time.time()-time1}'
        )

    # Save the embeddings from the most recent evaluation pass.
    fpath = os.path.join(
        OUTPUT_DIR, 'embedding-{}-{}-{}_{}.npy'.format(dataset, k,
                                                       str(EMBEDDING_SIZE1),
                                                       str(epoch - 1)))
    np.save(fpath, all_embedding)
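A minimal driver sketch (the upper-case names used throughout `run` are module-level globals in the original script, so this assumes the script has already defined them; the call itself mirrors the default arguments):

# Hypothetical entry point; assumes DEVICES, EPOCHS, OUTPUT_DIR, BATCH_SIZE,
# NUM_NODE, NODE_FEAT_SIZE, EMBEDDING_SIZE1, etc. exist at module level.
if __name__ == '__main__':
    run(dataset='bitcoin_alpha', k=2)
    # The saved .npy embeddings can then be scored with hybrid_logistic
    # from Example #1.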
Example #4
def run(dataset, k):
    num_nodes = DATASET_NUM_DIC[dataset] + 3

    filename = './experiment-data/{}-train-{}.edgelist'.format(dataset, k)
    adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1, adj_lists2_2, adj_lists3 = load_data2(
        filename)
    print(k, dataset, 'data load!')

    features = nn.Embedding(num_nodes, NODE_FEAT_SIZE)
    features.weight.requires_grad = True

    features = features.to(DEVICES)
    # Four directed edge lists (the 4 motifs).
    adj_lists = [adj_lists1_1, adj_lists1_2, adj_lists2_1, adj_lists2_2]

    weight_dict = defaultdict(dict)
    fea_model = FeaExtra(dataset=dataset, k=k)
    # For each positive edge u -> v, count the balanced triangles it takes part in.
    for i in adj_lists1_1:
        for j in adj_lists1_1[i]:
            v_list1 = fea_model.feature_part2(i, j)
            mask = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]
            counts1 = np.dot(v_list1, mask)  # balanced-triangle count, used later in the loss
            weight_dict[i][j] = counts1

    # Likewise for each negative edge u -> v.
    for i in adj_lists2_1:
        for j in adj_lists2_1[i]:
            v_list1 = fea_model.feature_part2(i, j)
            mask = [0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0]
            counts1 = np.dot(v_list1, mask)  # balanced-triangle count for a negative edge
            weight_dict[i][j] = counts1

    print(len(adj_lists), 'motifs')

    def func(adj_list):
        """Convert an adjacency dict {node: set(neighbors)} into a sparse CSR matrix."""
        edges = []
        for a in adj_list:
            for b in adj_list[a]:
                edges.append((a, b))
        edges = np.array(edges)
        adj = sp.csr_matrix((np.ones(len(edges)), (edges[:, 0], edges[:, 1])),
                            shape=(num_nodes, num_nodes))
        return adj

    # `args` comes from the enclosing script (presumably an argparse namespace);
    # `agg` selects the neighbor aggregator.
    if args.agg == 'mean':
        aggregator = MeanAggregator
    else:
        aggregator = AttentionAggregator

    adj_lists = list(map(func, adj_lists))
    aggs = [
        aggregator(features, NODE_FEAT_SIZE, NODE_FEAT_SIZE, num_nodes)
        for adj in adj_lists
    ]  # build one neighbor aggregator per motif
    enc1 = Encoder(features, NODE_FEAT_SIZE, EMBEDDING_SIZE1, adj_lists,
                   aggs)  # 4 motifs + the node itself + cls => embedding
    enc1 = enc1.to(DEVICES)

    aggs2 = [
        aggregator(lambda n: enc1(n), EMBEDDING_SIZE1, EMBEDDING_SIZE1,
                   num_nodes) for _ in adj_lists
    ]
    enc2 = Encoder(lambda n: enc1(n), EMBEDDING_SIZE1, EMBEDDING_SIZE1,
                   adj_lists, aggs2)

    model = SDGNN(enc2)
    model = model.to(DEVICES)

    model.train()
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        list(model.parameters()) + list(enc1.parameters()) \
                                        + list(features.parameters())),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY
                                 )

    for epoch in range(EPOCHS + 2):
        total_loss = []
        if epoch % INTERVAL_PRINT == 1:
            model.eval()
            all_embedding = np.zeros((NUM_NODE, EMBEDDING_SIZE1))
            for i in range(0, NUM_NODE, BATCH_SIZE):
                begin_index = i
                end_index = min(i + BATCH_SIZE, NUM_NODE)
                values = np.arange(begin_index, end_index)
                embed = model(values.tolist())
                embed = embed.data.cpu().numpy()
                all_embedding[begin_index:end_index] = embed

            fpath = os.path.join(
                OUTPUT_DIR,
                'embedding-{}-{}-{}.npy'.format(dataset, k, str(epoch)))
            np.save(fpath, all_embedding)
            # Evaluate link-sign prediction with the embeddings just saved.
            pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = logistic_embedding(
                k=k, dataset=dataset, epoch=epoch, dirname=OUTPUT_DIR)
            model.train()

        time1 = time.time()
        nodes_pku = np.random.permutation(NUM_NODE).tolist()
        for batch in range(NUM_NODE // BATCH_SIZE):
            optimizer.zero_grad()
            b_index = batch * BATCH_SIZE
            e_index = (batch + 1) * BATCH_SIZE
            nodes = nodes_pku[b_index:e_index]

            loss = model.criterion(
                nodes,
                adj_lists1,
                adj_lists2,
                adj_lists1_1,
                adj_lists2_1,
                weight_dict  # balanced-triangle counts through each edge (i, j)
            )
            total_loss.append(loss.data.cpu().numpy())

            loss.backward()
            optimizer.step()
        print(
            f'epoch: {epoch}, loss: {np.mean(total_loss)}, time: {time.time()-time1}'
        )
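The two 16-element masks in this example select, from the 16 triad-type counts returned by `FeaExtra.feature_part2` (compare the `range(16)` lists in Example #3), the configurations that form balanced triangles for a positive or a negative edge; the dot product is therefore a balanced-triangle count for the edge (i, j). A toy illustration of that weighting step, with made-up counts:

import numpy as np

# Hypothetical triad-type counts for one positive edge (i, j).
v_list1 = np.array([2, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 4])
mask = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]  # balanced types for a positive edge
weight = int(np.dot(v_list1, mask))  # 2 + 3 + 1 + 2 + 1 + 4 = 13 balanced triangles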