Example #1
from node2vec.edges import HadamardEmbedder
from sklearn.neural_network import MLPClassifier


def classify_embeddings(model, training_set, test_set, training_labels):
    """
    Calculate scores using edge embeddings and a binary classifier
    :param model: fitted node2vec model (a gensim Word2Vec with a .wv attribute)
    :param training_set: list of node pairs, the whole training set
    :param test_set: list of node pairs, the whole test set
    :param training_labels: list, label for each pair in the training set
    :return: dict, key='node1 node2', value=probability of being labelled 1
    """
    print("Embedding...")
    # Using Hadamard product for the embedding vectors
    edge_embeddings = HadamardEmbedder(keyed_vectors=model.wv)
    x_train_embedded = [edge_embeddings[pair] for pair in training_set]
    x_test = [edge_embeddings[pair] for pair in test_set]

    # Using MLPClassifier from sklearn as binary classifier
    classifier = MLPClassifier(random_state=1)
    print("Classifying...")
    classify = classifier.fit(x_train_embedded, training_labels)

    predict = classify.predict_proba(x_test)
    score = dict()
    for i in range(len(test_set)):
        node1 = test_set[i][0]
        node2 = test_set[i][1]
        # node ids are assumed to be strings; scores are keyed "node1 node2"
        score[node1 + ' ' + node2] = predict[i][1]
    return score
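A minimal driver for the function above might look like the following sketch. The graph, the pair lists, and the labels are illustrative assumptions, not part of the original example.

# Hypothetical usage sketch for classify_embeddings (all data is illustrative)
import networkx as nx
from node2vec import Node2Vec

G = nx.relabel_nodes(nx.karate_club_graph(), str)  # string node ids for the lookups

model = Node2Vec(G, dimensions=32, walk_length=20, num_walks=50,
                 workers=2).fit(window=5, min_count=1)

train_pairs = [('0', '1'), ('0', '2'), ('15', '25')]  # assumed labelled pairs
train_labels = [1, 1, 0]
test_pairs = [('2', '3'), ('5', '30')]

scores = classify_embeddings(model, train_pairs, test_pairs, train_labels)
print(scores)  # {'2 3': ..., '5 30': ...}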
Example #2
from typing import Union

from networkx import MultiDiGraph, MultiGraph
from node2vec.edges import HadamardEmbedder


def get_edge_embeddings(G: Union[MultiDiGraph, MultiGraph],
                        debug: bool = False) -> HadamardEmbedder:
    if G is None:
        raise TypeError('G must be a MultiDiGraph or MultiGraph')

    # get_node_embedings() is a project-local helper (not shown here) that
    # presumably returns the fitted model's KeyedVectors
    wv = HadamardEmbedder(keyed_vectors=get_node_embedings(G))

    if debug:
        # most_similar() lives on the KeyedVectors view; per the node2vec
        # README its keys are sorted node-pair tuples cast to str
        edges_kv = wv.as_keyed_vectors()
        nodes = [str(n) for n in G.nodes()]
        for i in range(len(nodes)):
            for j in range(i):
                pair = tuple(sorted((nodes[i], nodes[j])))
                print('edge:', pair)
                print('most similar:', edges_kv.most_similar(str(pair)))
                print('')

    return wv
Example #3
import networkx as nx
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder


def graph_embedding(train_data):
    # Graph embedding with node2vec: fit node vectors on the training graph,
    # then derive edge vectors via the Hadamard product
    train_G = nx.from_pandas_edgelist(train_data, 'node1', 'node2')
    n2v = Node2Vec(train_G,
                   dimensions=32,
                   walk_length=150,
                   num_walks=250,
                   workers=4)
    model = n2v.fit(window=10, min_count=1, batch_words=4)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    return edges_embs
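The returned HadamardEmbedder computes edge vectors lazily on lookup. A short usage sketch (the DataFrame is illustrative; the column names match what the function expects):

import pandas as pd

train_data = pd.DataFrame({'node1': ['a', 'a', 'b'], 'node2': ['b', 'c', 'c']})
edges_embs = graph_embedding(train_data)

# edge vectors are computed on the fly from the fitted node vectors
print(edges_embs[('a', 'b')].shape)  # (32,), per dimensions=32 above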
Example #4
import numpy as np
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder


def embed_graph(G, dimensions=3, walk_length=30, num_walks=200, workers=4,
                window=10, min_count=1, batch_words=4):
    # Assumes integer node labels 0..n-1, since they double as array indices
    node_embedding = np.zeros((len(G.nodes()), dimensions))
    edge_embedding = np.zeros((len(G.nodes()), len(G.nodes()), dimensions))
    # Keyword arguments matter here: positionally, Node2Vec's fifth parameter
    # is p (the return parameter), not workers. Use temp_folder for big graphs.
    node2vec = Node2Vec(G, dimensions=dimensions, walk_length=walk_length,
                        num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=window, min_count=min_count,
                         batch_words=batch_words)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    for node in G.nodes():
        node_embedding[node, :] = model.wv.get_vector(str(node))
        for node2 in G.nodes():
            edge_embedding[node, node2] = edges_embs[(str(node), str(node2))]
    # return model.wv.get_normed_vectors()
    return node_embedding, edge_embedding, model
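Note that the dense edge_embedding tensor costs O(n² · d) floats, so this only scales to small graphs. The Hadamard edge vector is simply the elementwise product of the two node vectors, which a quick check makes concrete (a sketch; the path graph is illustrative, and its integer labels 0..n-1 match what the function assumes):

import networkx as nx
import numpy as np

G = nx.path_graph(5)  # nodes labelled 0..4
node_emb, edge_emb, model = embed_graph(G, dimensions=3)

u, v = 0, 1
assert np.allclose(edge_emb[u, v], node_emb[u] * node_emb[v])  # Hadamard product
print(edge_emb.shape)  # (5, 5, 3)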
Example #5
    def compute_similarity(self, first_node, second_node):
        edges_embs = HadamardEmbedder(keyed_vectors=self.model.wv)

        # Look up an edge embedding on the fly - plain tuples of node ids work here
        edge_vector = edges_embs[(first_node, second_node)]
        ''' OUTPUT
        array([ 5.75068220e-03, -1.10937878e-02,  3.76693785e-01,  2.69105062e-02,
               ... ... ....
               ..................................................................],
              dtype=float32)
        '''

        # Materialize all edges in a separate KeyedVectors instance - use with
        # caution, as this can be huge for big networks
        edges_kv = edges_embs.as_keyed_vectors()

        # Look up the most similar edges - keys are sorted node-pair tuples cast to str
        results = edges_kv.most_similar(str(tuple(sorted((first_node, second_node)))))

        # Save embeddings for later use
        # edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)

        return results
Example #6
    def edges(self):  # Edge embedding (generally optional): get the edge embedder
        if self.embedder_type == "Edge":
            # EdgeEmbedder is the abstract base class in node2vec.edges and
            # cannot be instantiated directly; the concrete AverageEmbedder is
            # substituted here
            edges_embs = AverageEmbedder(keyed_vectors=self.model.wv)

        elif self.embedder_type == "Wl1":
            edges_embs = WeightedL1Embedder(keyed_vectors=self.model.wv)

        elif self.embedder_type == "Wl2":
            edges_embs = WeightedL2Embedder(keyed_vectors=self.model.wv)

        else:
            edges_embs = HadamardEmbedder(keyed_vectors=self.model.wv)
        return edges_embs
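In the node2vec reference implementation these embedders differ only in the elementwise operator applied to the pair of node vectors: Hadamard is a * b, WeightedL1 is |a - b|, WeightedL2 is (a - b)², and AverageEmbedder is (a + b) / 2. A small check of those identities (a sketch; model stands in for any fitted gensim model whose vocabulary contains nodes '1' and '2'):

import numpy as np
from node2vec.edges import (AverageEmbedder, HadamardEmbedder,
                            WeightedL1Embedder, WeightedL2Embedder)

a, b = model.wv['1'], model.wv['2']
pair = ('1', '2')

assert np.allclose(HadamardEmbedder(keyed_vectors=model.wv)[pair], a * b)
assert np.allclose(WeightedL1Embedder(keyed_vectors=model.wv)[pair], np.abs(a - b))
assert np.allclose(WeightedL2Embedder(keyed_vectors=model.wv)[pair], (a - b) ** 2)
assert np.allclose(AverageEmbedder(keyed_vectors=model.wv)[pair], (a + b) / 2)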
Example #7
    def _node2vec_encode(self, save=False):
        if self.all_connectivity is None:
            print("Need to run _find_all_connectivityd() first.")
            return
        # NetworkX >= 3.0 removed from_numpy_matrix; use from_numpy_array there
        graph = nx.from_numpy_matrix(self.all_connectivity,
                                     create_using=nx.DiGraph)
        node2vec = Node2Vec(graph,
                            dimensions=64,
                            walk_length=30,
                            num_walks=200,
                            workers=4)
        model = node2vec.fit(window=10, min_count=1, batch_words=4)

        # Save Node2vec results
        if save:
            model.wv.save_word2vec_format(NODE2VEC_EMBEDDING_FILENAME)
            model.save(NODE2VEC_EMBEDDING_MODEL_FILENAME)
            edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
            edges_kv = edges_embs.as_keyed_vectors()
            edges_kv.save_word2vec_format(NODE2VEC_EDGES_EMBEDDING_FILENAME)

        # Slicing by position assumes URL nodes occupy the first rows of
        # wv.vectors; note gensim orders vectors by frequency, not insertion
        url_embedding = model.wv.vectors[0:len(self.url_vocab_list)]
        nav_url_embedding = model.wv.vectors[len(self.url_vocab_list):]
        return url_embedding, nav_url_embedding
Example #8
    G_validation = get_graph(lists=links_validation)
    links_validation = list(G_validation.edges())

    # Use positive validation set to evaluate score
    G_valid_pos = get_graph(VAL_POS)
    X_test = list(G_valid_pos.edges())
    '''NODE2VEC'''
    n2v = Node2Vec(G_train,
                   dimensions=128,
                   walk_length=16,
                   num_walks=10,
                   workers=6)  # Windows OS might be limited to 1
    print("Fitting model...")
    model = n2v.fit(window=5, min_count=1, sg=1, hs=0)
    print("Embedding nodes...")
    hadamard_embedded_links = HadamardEmbedder(keyed_vectors=model.wv)
    X_validation = preprocess_lists(links_validation)
    X_validation_embedded = [hadamard_embedded_links[x] for x in X_validation]
    X_train = preprocess_lists(X_train)
    X_train_embedded = [hadamard_embedded_links[x] for x in X_train]
    '''CLASSIFICATION'''
    # Note: no standardization is actually applied here; see the pipeline
    # sketch after this example for zero-mean / unit-variance scaling
    logit = LogisticRegression()
    clf = logit.fit(X_train_embedded, y_train)
    probabilities = clf.predict_proba(X_validation_embedded)
    '''EVALUATION'''
    score = dict()
    X_validation = preprocess_lists(X_validation, cast=int)
    for i, prob in enumerate(probabilities):
        score[links_validation[i]] = prob[1]
    top100 = get_top_n_links(score, out=1)
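The snippet above never actually standardizes the embedded features. One way to wire that in, sketched with a scikit-learn Pipeline (an assumption about the intended preprocessing, not part of the original):

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# scale the edge embeddings to zero mean / unit variance before the logit
clf = make_pipeline(StandardScaler(), LogisticRegression())
clf = clf.fit(X_train_embedded, y_train)
probabilities = clf.predict_proba(X_validation_embedded)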
Example #9
import csv

from node2vec import Node2Vec

# pre-compute the transition probabilities and generate the walks
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=10, workers=1)


# embed the nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)


from node2vec.edges import HadamardEmbedder

# embed the edges
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)


# NOTE: everything I tried is collected in this section; comment parts out for
# readability. Running it all at once will NOT work, since each variant was run
# separately. Uncomment/comment each one to get the valid/invalid edge
# prediction as 1 or 0.

with open('comp2_submission.csv', 'w') as csvfile:
    fieldnames = ['edge', 'label']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    with open('sample_submission.csv') as csvfile2:
        reader = csv.reader(csvfile2, delimiter=',')
        for row in reader:
Example #10
import csv
import os
import pickle
import random

import lightgbm
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def main():

    with open(r"training.txt", "r") as f:
        reader = csv.reader(f)
        training = list(reader)
    # in order of training examples
    training = [element[0].split(" ") for element in training]
    training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    with open(r"testing.txt", "r") as f:
        reader = csv.reader(f)
        testing = list(reader)
    # in order of testing examples
    testing = [element[0].split(" ") for element in testing]
    testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))

    train_graph_split_path = 'pickles/train_graph_split.PICKLE'

    if os.path.exists(train_graph_split_path):
        with open(train_graph_split_path, 'rb') as f:
            keep_indices = pickle.load(f)
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        with open(train_graph_split_path, 'wb') as f:
            pickle.dump(keep_indices, f)

    data_train_val = training.iloc[keep_indices]
    data_train = training.loc[~training.index.isin(keep_indices)]

    linked_nodes = data_train.loc[data_train['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    linked_nodes.to_csv('linked_nodes.txt', sep=' ', index=False, header=False)
    graph = nx.read_edgelist('linked_nodes.txt',
                             create_using=nx.Graph(),
                             nodetype=str)

    from node2vec import Node2Vec
    from node2vec.edges import HadamardEmbedder
    dicti = {}
    p_d, p_wl, p_nw, p_w, p_mc, p_bw = 64, 30, 200, 10, 1, 4

    node2vec = Node2Vec(graph,
                        dimensions=p_d,
                        walk_length=p_wl,
                        num_walks=p_nw,
                        workers=15)
    model = node2vec.fit(
        window=p_w, min_count=p_mc, batch_words=p_bw
    )  # Any keywords acceptable by gensim.Word2Vec can be passed
    # Embed edges using Hadamard method
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

    # probe one pair to get the embedding size (assumes nodes '0' and '1' exist)
    embed_size = len(edges_embs[('0', '1')])
    df_train = pd.DataFrame(0,
                            index=np.arange(len(data_train_val)),
                            columns=range(embed_size))
    for j, i in tqdm(enumerate(data_train_val.index),
                     position=0,
                     leave=True,
                     total=len(data_train_val)):
        try:
            df_train.loc[j] = edges_embs[(data_train_val.loc[i]['Node1'],
                                          data_train_val.loc[i]['Node2'])]
        except KeyError:
            # pairs with a node missing from the training graph get zero vectors
            df_train.loc[j] = np.zeros(embed_size)

    X = df_train
    y = [int(label) for label in data_train_val['Link']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.15,
                                                        random_state=1)

    lgbm = lightgbm.LGBMClassifier()
    model_lgbm = lgbm.fit(X_train, y_train)
    predictions = model_lgbm.predict(X_test)
    params_key = 'params:{}:{}:{}:{}:{}:{}'.format(p_d, p_wl, p_nw, p_w, p_mc, p_bw)
    dicti[params_key] = f1_score(y_test, predictions)
    print(dicti[params_key])
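To score the held-out testing pairs loaded at the top of main(), the same lookup-with-fallback pattern can be reused; a hypothetical continuation (indented to sit inside main(); the zero-vector fallback mirrors the training loop above):

    # Hypothetical continuation: embed the testing pairs the same way
    df_test = pd.DataFrame(0, index=np.arange(len(testing)), columns=range(embed_size))
    for j, i in enumerate(testing.index):
        try:
            df_test.loc[j] = edges_embs[(testing.loc[i]['Node1'],
                                         testing.loc[i]['Node2'])]
        except KeyError:
            df_test.loc[j] = np.zeros(embed_size)
    test_probs = model_lgbm.predict_proba(df_test)[:, 1]  # P(link == 1)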
Example #11
data['link'] = 0
# fb_df_partial = edge_list.drop(index=edge_list_ghost.index.values)
# sample roughly twice as many candidate non-links as there are positive edges
data = data.sample(frac=2 * (len(edge_list) / len(data)), random_state=SEED)
fb_df_partial = edge_list.copy()
G_data = nx.from_pandas_edgelist(fb_df_partial, "node_1", "node_2", create_using=nx.Graph())
fb_df_partial["link"] = 1
# data = data.append(edge_list_ghost[['node_1', 'node_2', 'link']], ignore_index=True)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
data = pd.concat([data, fb_df_partial], ignore_index=True)
if not os.path.exists(os.path.join(os.curdir, "author.embedding")):
    node2vec = Node2Vec(G_data, dimensions=NODE2VEC_DIMENSION)
    n2w_model = node2vec.fit(window=7, min_count=1, workers=12)
    n2w_model.save("author.embedding")
else:
    n2w_model = Word2Vec.load("author.embedding")

edge_embeddings = HadamardEmbedder(keyed_vectors=n2w_model.wv)
if not os.path.exists(os.path.join(os.curdir, "mapping.npy")):
    x = list()
    for i, j in zip(data['node_1'], data['node_2']):
        try:
            x.append(edge_embeddings[(str(i), str(j))])
        except KeyError:
            x.append(np.zeros(NODE2VEC_DIMENSION))
    x = np.array(x)
    np.save("mapping", x)
else:
    x = np.load("mapping.npy")

# y = list()
# for i, j in zip(data['node_1'], data['node_2']):
#     y.append(list(set(nx.neighbors(author_graph, i)).intersection(set(nx.neighbors(author_graph, j)))))