Example #1
def Node2Vec_model(g):

    large_graph = False
    temp_folder = "/tmp/node2vec"

    # embedding parameters
    dimensions = 128
    window_size = 10
    # performance parameters
    num_threads = 4
    num_walks = 200

    
    # Precompute probabilities and generate walks
    print("Generating walks on the network ...")
    ## If the graph is too big to fit in memory, pass temp_folder pointing to a location with enough disk space
    if large_graph:
        # Note: It will trigger "sharedmem" in Parallel, which will be slow on smaller graphs
        node2vec = Node2Vec(g, dimensions=dimensions, walk_length=30, num_walks=num_walks, workers=num_threads, temp_folder=temp_folder)
    else:
        node2vec = Node2Vec(g, dimensions=dimensions, walk_length=30, num_walks=num_walks, workers=num_threads)

    # Embed
    print("Building the embedding (%d dimensions) ..." % dimensions)
    model = node2vec.fit(window=window_size, min_count=1, batch_words=4)

    return model
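A minimal usage sketch for the function above, assuming `networkx` and the `node2vec` package are importable as in the snippet; the random toy graph is a hypothetical stand-in for a real network:

import networkx as nx

toy_graph = nx.fast_gnp_random_graph(n=50, p=0.2, seed=42)  # hypothetical toy graph
model = Node2Vec_model(toy_graph)
print(model.wv.most_similar('0'))  # output node names are always strings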
Example #2
    def __init__(self,
                 dataset,
                 p=1,
                 q=4,
                 walk_length=100,
                 num_walks=50,
                 dimensions=200,
                 window_size=30,
                 workers=8,
                 iterations=5):

        Node2Vec.__init__(self, False, True, False, p, q, walk_length,
                          num_walks, dimensions, window_size, workers,
                          iterations)

        self.dataset = dataset

        file = 'num%d_p%d_q%d_l%d_d%d_iter%d_winsize%d.emd' % (
            num_walks, p, q, walk_length, dimensions, iterations, window_size)

        self.path = 'datasets/%s/node2vec/' % self.dataset + file

        if file not in os.listdir('datasets/%s/node2vec/' % self.dataset):

            self.run('datasets/%s/node2vec/altogether.edgelist' % self.dataset,
                     self.path)

        self.node2vec_model = KeyedVectors.load_word2vec_format(self.path,
                                                                binary=True)
Example #3
    def __init__(self, is_directed, preprocessing, is_weighted, p, q, walk_length, num_walks, dimensions, window_size,
                 workers, iterations, feedback_file):

        Node2Vec.__init__(self, is_directed, preprocessing, is_weighted, p, q, walk_length, num_walks, dimensions,
                          window_size, workers, iterations)

        self.feedback_file = feedback_file
Example #4
    def fit(self, dataframe):
        edges = dataframe.groupby(
            self.categorical_columns, ).size().reset_index().dropna()

        G = nx.DiGraph()
        G.add_weighted_edges_from(edges.values)

        node2vec = Node2Vec(
            G,
            dimensions=self.n_components,
            walk_length=self.walk_length,
            num_walks=self.num_walks,
            workers=self.workers,
        )
        self.model = node2vec.fit(
            window=self.window,
            min_count=self.min_count,
            batch_words=self.batch_words,
        )
        self.feature = pd.DataFrame(
            {key: self.model.wv[key]
             for key in self.model.wv.vocab}).T.reset_index()
        self.feature.columns = self.categorical_columns[:1] + [
            f'{self.name}_{i:03}' for i in range(self.n_components)
        ]
        self.features = [self.feature]
        return self
Example #5
def naiveGraphEmbeddingAllStudentsInAWeek(
        transitionDataMatrix_directFollow_week, activityCodeList, w):
    result = []
    dimensions = 64
    for i in transitionDataMatrix_directFollow_week.index:
        print(f'Week {w} - student: {i}')
        b = transitionDataMatrix_directFollow_week.loc[i, :]
        graph = graphCreationForSingleStudent(b, activityCodeList)
        if graph.number_of_nodes() > 0:
            node2vec = Node2Vec(graph,
                                dimensions=dimensions,
                                walk_length=10,
                                num_walks=100)
            model = node2vec.fit(window=10, min_count=1)

            node_embeddings = (
                model.wv.vectors
            )  # numpy.ndarray of size number of nodes times embeddings dimensionality
            result.append(node_embeddings.sum(axis=0))

        else:
            result.append(np.zeros(dimensions))  # match the embedding dimensionality
    return pd.DataFrame(result,
                        index=transitionDataMatrix_directFollow_week.index)
Example #6
def node2vec_classification(G,
                            clusters,
                            dim=128,
                            walk_length=80,
                            num_walks=10,
                            return_=1,
                            inout=1):

    node2vec = Node2Vec(G,
                        dimensions=dim,
                        walk_length=walk_length,
                        num_walks=num_walks,
                        p=return_,
                        q=inout)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    word_vector_matrix = np.vstack([model.wv[str(node)] for node in list(G)])  # model keys are strings
    kmeans = KMeans(n_clusters=clusters,
                    random_state=0).fit(word_vector_matrix)
    labels = kmeans.labels_
    node_labels = zip(list(G), labels)
    cluster_map = {}  # cluster label -> list of member nodes
    for node, label in node_labels:
        cluster_map.setdefault(label, []).append(node)

    return cluster_map, word_vector_matrix
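A quick sketch of calling the function above, assuming its imports (networkx, numpy, scikit-learn's KMeans, node2vec) are in scope; the karate club graph and clusters=2 are arbitrary illustration choices, and node labels are stringified because the output model keys nodes by string:

import networkx as nx

G = nx.relabel_nodes(nx.karate_club_graph(), str)
cluster_map, vectors = node2vec_classification(G, clusters=2)
for label, members in sorted(cluster_map.items()):
    print(label, members)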
Example #7
 def create_node2vec_embeddings(graph):
     multi_gnx = graph
     if os.path.exists('pkl/Node2Vec_embedding_new.pickle'):
         with open('pkl/Node2Vec_embedding_new.pickle', 'rb') as handle:
             dict_embeddings = pickle.load(handle)
     elif os.path.exists('pkl/Node2Vec_embedding_new.csv'):
         embedding_df = pd.read_csv('pkl/Node2Vec_embedding_new.csv')
         dict_embeddings = embedding_df.to_dict(orient='list')
         with open('pkl/Node2Vec_embedding_new.pickle', 'wb') as handle:
             pickle.dump(dict_embeddings,
                         handle,
                         protocol=pickle.HIGHEST_PROTOCOL)
     else:
         node2vec = Node2Vec(multi_gnx,
                             dimensions=16,
                             walk_length=30,
                             num_walks=200,
                             workers=1)
         model = node2vec.fit()
         nodes = list(multi_gnx.nodes())
         dict_embeddings = {}
         for i in range(len(nodes)):
             dict_embeddings.update(
                 {nodes[i]: np.asarray(model.wv.get_vector(str(nodes[i])))})  # model keys are strings
         with open('pkl/Node2Vec_embedding_new.pickle', 'wb') as handle:
             pickle.dump(dict_embeddings,
                         handle,
                         protocol=pickle.HIGHEST_PROTOCOL)
     return dict_embeddings
Example #8
    def test_precompute_probs(self):
        """Test the pre-compute_probs function."""
        g1 = nx.read_weighted_edgelist(path=WEIGHTED_NETWORK_PATH,
                                       nodetype=int)

        n1 = Node2Vec(g1)
        d1 = n1._precompute_probabilities()

        g2 = get_test_network(WEIGHTED_NETWORK_PATH)
        random_walk_parameters = WalkerParameters(
            number_paths=5,
            max_path_length=10,
        )
        word2vec_parameters = Word2VecParameters()

        n2 = Node2VecModel(g2, random_walk_parameters, word2vec_parameters)

        for key in d1.keys():
            vertex1 = d1[key]
            vertex2 = n2.graph.vs.find(name=str(key))
            self.assertListEqual(
                sorted(vertex1['neighbors']),
                sorted([int(nbr['name']) for nbr in vertex2.neighbors()]))
            self.assertListEqual(list(vertex1['first_travel_key']),
                                 list(vertex2['first_travel_key']))
            for inner_key in vertex1['probabilities'].keys():
                self.assertListEqual(
                    list(vertex1['probabilities'][inner_key]),
                    list(vertex2['probabilities'][str(inner_key)]))
Example #9
def node_embeddings(G, f, dim=20, walk_length=16, num_walks=100, workers=2):
    """
    Adds the embeddings of the nodes to the dataframe f.
    G: a networkx graph.
    f: a pandas dataframe.
    dim: the dimension of the embedding.

    Grover, A., & Leskovec, J. (2016, August). node2vec: Scalable feature learning for networks. In Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 855-864). ACM.
    """
    if not (set(f.name) == set(G.nodes()) and len(f.name) == len(G.nodes())):
        raise ValueError(
            'The number of nodes and the length of the dataframe should be the same.'
        )
    from node2vec import Node2Vec
    node2vec = Node2Vec(G,
                        dimensions=dim,
                        walk_length=walk_length,
                        num_walks=num_walks,
                        workers=workers)
    model = node2vec.fit(window=10, min_count=1)

    embeddings_df = pd.DataFrame(
        columns=['name'] + ['node_embeddings_' + str(i) for i in range(dim)])
    embeddings_df['name'] = f['name']
    for name in embeddings_df['name']:
        embeddings_df[embeddings_df['name'] == name] = [name] + list(
            model.wv[str(name)])
    f = pd.merge(f, embeddings_df, on='name')
    return f
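A usage sketch, assuming the same imports; the toy graph and dim=8 are arbitrary, and the dataframe's name column must list exactly the graph's nodes:

import networkx as nx
import pandas as pd

G = nx.karate_club_graph()
f = pd.DataFrame({'name': list(G.nodes())})
f = node_embeddings(G, f, dim=8)
print(f.head())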
Example #10
def main():

    workflow = 'workflows/workflow_one.ros'
    args = {
        "disease_name": "type 2 diabetes mellitus",
    }
    libpath = ['workflows']
    """ general. """
    ros = Client(url="http://localhost:5002")
    response = ros.run(workflow=workflow, args=args, library_path=libpath)

    print(json.dumps(response.result, indent=2))

    graph = response.to_nx()
    for n in graph.nodes(data=True):
        print(n)

    n2v = Node2Vec(graph,
                   dimensions=128,
                   walk_length=80,
                   num_walks=10,
                   p=1,
                   q=1,
                   weight_key='weight',
                   workers=1,
                   sampling_strategy=None,
                   quiet=False)
    model = n2v.fit()
Example #11
def train_embeddings(edgelist_path, embedding_path):
    # Create graph from the weighted edge list
    graph = nx.read_weighted_edgelist(edgelist_path)
    logger.info('Graph created!')
    assert graph.get_edge_data(
        '0000013714',
        '0005064295')['weight'] == 3.2, 'Expected edge weight of 3.2'

    # Precompute probabilities and generate walks
    node2vec = Node2Vec(graph,
                        dimensions=128,
                        walk_length=30,
                        num_walks=10,
                        workers=10,
                        temp_folder=DATA_PATH)
    logger.info('Computed probabilities and generated walks')
    graph = None  # We don't need graph anymore since probabilities have been precomputed

    # Embed nodes
    model = node2vec.fit(window=5, min_count=1, batch_words=128)
    logger.info('Nodes embedded')

    # Save embeddings for later use
    model.wv.save_word2vec_format(embedding_path)
    logger.info('Embedding saved')
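The saved embeddings can be reloaded later with gensim's KeyedVectors; a sketch, where embedding_path is the same path passed to train_embeddings and the node id comes from the assert above:

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(embedding_path)
print(wv.most_similar('0000013714'))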
Example #12
def train(filename):
    # Create a graph
    graph = make_graph(filename)
    print('number of nodes: ', nx.number_of_nodes(graph))
    # H=nx.DiGraph(G)   # create a DiGraph using the connections from G
    # H.edges()
    # edgelist=[(0,1),(1,2),(2,3)]
    # H=nx.Graph(edgelist)

    # Precompute probabilities and generate walks
    node2vec = Node2Vec(graph,
                        dimensions=64,
                        walk_length=30,
                        num_walks=200,
                        workers=4)
    # Embed
    model = node2vec.fit(
        window=10, min_count=1, batch_words=4
    )  # Any keywords acceptable by gensim.Word2Vec can be passed; `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
    # Look for most similar nodes
    model.wv.most_similar('2')  # Output node names are always strings
    # Save embeddings for later use
    model.wv.save('embeddings2.bin')

    # Save model for later use
    model.save('ex_model')
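Both artifacts written above can be loaded back with gensim; a sketch assuming the same working directory:

from gensim.models import KeyedVectors, Word2Vec

model = Word2Vec.load('ex_model')           # full model saved via model.save
wv = KeyedVectors.load('embeddings2.bin')   # vectors saved via model.wv.save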
Example #13
def adjacency_matrix_to_train_set(g: nx.Graph, depth: int = 3) -> pd.DataFrame:
    """
    Transforms adjacency matrix of a graph into a training set for ML model

    :param g: input graph
    :param depth: max length of paths considered when generating training set

    :return dataframe with nodes, their embeddings, and their similarity
    """

    alpha = 10
    result = []

    model = Node2Vec(g).fit()

    A = nx.adjacency_matrix(g).todense()
    AA = A

    for i in range(depth):

        for (x, y), val in np.ndenumerate(AA):
            result.append((x, y, val * (1 / alpha**(i))))

        AA = AA @ A

    df = pd.DataFrame(np.array(result), columns=['x', 'y', 'val'])

    dfg = df.groupby(['x', 'y'], as_index=False).sum()
    dfg['emb_x'] = dfg['x'].apply(lambda x: model.wv[str(int(x))])
    dfg['emb_y'] = dfg['y'].apply(lambda y: model.wv[str(int(y))])

    return dfg
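A usage sketch on a small built-in graph; depth=2 keeps the matrix powers cheap, and node labels must be integers since they are stringified for the embedding lookup:

import networkx as nx

g = nx.karate_club_graph()
train_df = adjacency_matrix_to_train_set(g, depth=2)
print(train_df.head())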
Example #14
def train(input_file, output_file):
    print('input file:', input_file, '\noutput file:', output_file)
    # Create a graph
    print('reading edges...')
    g = nx.Graph()
    with open(input_file, 'r') as f:
        for line in f:
            line = line.split(' ')
            g.add_edge(line[0], line[1], weight=float(line[2]))
            if 'str' in line:
                break

    sub = g.subgraph(max(nx.connected_components(g), key=len))
    print('number of nodes: ', nx.number_of_nodes(sub))
    print('number of edges: ', nx.number_of_edges(sub))

    # Precompute probabilities and generate walks
    node2vec = Node2Vec(sub,
                        dimensions=256,
                        walk_length=100,
                        num_walks=20,
                        workers=8,
                        p=0.25,
                        q=1)
    # np.save(output_file + 'walks', node2vec.walks)
    model = node2vec.fit(
        window=15, min_count=1, batch_words=4
    )  # Any keywords acceptable by gensim.Word2Vec can be passed; `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
    model.wv.save_word2vec_format(output_file)
Example #15
def emb_node2vec(g,
                 s,
                 dimension=32,
                 walk_length=15,
                 num_walks=100,
                 window=10,
                 save=False):
    """
    Compute the node embedding using Node2Vec
    :param g: a graph
    :param s: protected attribute (vector)
    :param dimension: dimension of the embedding
    :param walk_length: length of the random walk
    :param num_walks: number of walks
    :param window: window
    :param save: if true save the node2vec model
    :return: the embedding matrix and the associate protected attribute
    """

    node2vec = Node2Vec(g,
                        dimensions=dimension,
                        walk_length=walk_length,
                        num_walks=num_walks)
    model = node2vec.fit(window=window, min_count=1)
    idx = list(map(int, model.wv.index_to_key))
    emb_x = model.wv.vectors
    new_s = s[idx]
    if save: model.save('node2vec_model')
    return emb_x, new_s, model
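A sketch of calling the function above with a hypothetical binary protected attribute indexed by node id:

import networkx as nx
import numpy as np

g = nx.karate_club_graph()
s = np.random.randint(0, 2, size=g.number_of_nodes())  # hypothetical protected attribute
emb_x, new_s, model = emb_node2vec(g, s, dimension=16)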
Example #16
File: compare.py  Project: dmonojit/xc
def main(args):
    _, _, _, X_tr, Y_tr, V_tr, E_tr = get_x_y_v_e(args.filepath)

    print('Learning embeddings with {}...'.format(args.embedding))

    if args.embedding == 'deepwalk':
        ## DeepWalk default values: number_walks=10,representation_size=64,seed=0,walk_length=40,window_size=5,workers=1
        label_emb = DeepWalk().transform(E_tr, 'edgedict')
    elif args.embedding == 'node2vec':
        ## Node2Vec default values: num_walks=10,dimensions=64,walk_length=40,window_size=5,workers=1,p=1,q=1,
        ## weighted=False,directed=False,iter=1
        label_emb = Node2Vec().transform(E_tr, 'edgedict')
    else:
        raise NotImplementedError

    label_emb_wv = label_emb.wv

    print('Calling compare...')

    compare = Compare(args.labelfile, label_emb_wv)
    compare.invoke()

    import pdb
    pdb.set_trace()

    print('end')
Example #17
def get_node2vec_embeddings(features: List[UserFeatures], hyper_params: dict):
    input_edge_list_file = '{}/node2vec_relations_edge_list_{}.txt'.format(
        TEMP_DIR, get_random_id())
    output_embeddings_file = '{}/node2vec_relations_node2vec_embeddings_{}.txt'.format(
        TEMP_DIR, get_random_id())
    if not os.path.exists(TEMP_DIR):
        os.makedirs(TEMP_DIR)

    create_edge_list_file(input_edge_list_file, features)
    graph = nx.Graph()
    with open(input_edge_list_file, 'r') as file_handler:
        for line in file_handler:
            if line:
                node1, node2 = line.split()
                graph.add_edge(node1, node2)

    node2vec = Node2Vec(graph, workers=4, temp_folder=TEMP_DIR, **hyper_params)
    model = node2vec.fit(window=5, min_count=1, batch_words=4)
    model.wv.save_word2vec_format(output_embeddings_file)

    embeddings = np.genfromtxt(output_embeddings_file,
                               delimiter=' ',
                               skip_header=1)
    embeddings_sorted = embeddings[embeddings[:, 0].argsort()]
    os.remove(input_edge_list_file)
    os.remove(output_embeddings_file)

    return embeddings_sorted[:len(features), 1:]
Example #18
    def run_node2vec(self):
        # if we already have a model, skip rerunning
        if self.model is not None:
            print("Reusing existing model")
            return True

        # verify that we have a graph
        if self.graph is None:
            print(
                "Must build graph and compute pagerank before inferring parameters"
            )
            return False

        #precompute probabilities and generate walks
        print("Running node2vec...")
        node2vec = Node2Vec(
            self.graph,
            dimensions=16,
            walk_length=10,
            num_walks=200,
            workers=4,
            quiet=True
        )  #example uses 64 dimensions and walk_length 10, let's go smaller

        #compute embeddings - dimensions and workers automatically passed from the Node2Vec constructor
        self.model = node2vec.fit(window=10, min_count=1, batch_words=4)

        print("Done")
        return True
Example #19
def run_node2vec_emb(data,
                     G,
                     embedding_model_file_path,
                     enforce_end2end,
                     add_qualified_edges,
                     use_weighted_edges,
                     edges_percent,
                     edges_number,
                     dim,
                     walk_len,
                     num_walks,
                     window,
                     added_edges_percent_of,
                     emb_type,
                     save_path):
    # dim = 1
    # walk_len = 1
    # num_walks =1
    # window =1

    node2vec = Node2Vec(G, weight_key='weight', dimensions=dim,
                        walk_length=walk_len, num_walks=num_walks, workers=4)
    model = node2vec.fit(window=window, min_count=1, batch_words=4)

    # save the embeddings to save_path
    model.wv.save_word2vec_format(save_path)
Example #20
def main2():

    args_list = CSVArgs('test.csv')
    workflow = 'workflows/m2m_models_v1.ros'
    libpath = ['workflows']
    """ Build graph. """
    g = nx.MultiDiGraph()
    for args in args_list.vals:
        ros = Client(url="http://localhost:5002")
        response = ros.run(workflow=workflow, args=args, library_path=libpath)

        print(json.dumps(response.result, indent=2))
        response_nx = response.to_nx()
        print(
            f"read {len(response_nx.nodes())} nodes and {len(response_nx.edges())} edges."
        )
        g = nx.compose(g, response.to_nx())
    """ Calulate node embeddings. """
    n2v = Node2Vec(g,
                   dimensions=128,
                   walk_length=80,
                   num_walks=10,
                   p=1,
                   q=1,
                   weight_key='weight',
                   workers=1,
                   sampling_strategy=None,
                   quiet=False)
    model = n2v.fit()
Example #21
 def cluster(self, nodes, k=4):
     print('ori_node', len(nodes))
     graph = self.sub_graph(nodes)
     print('graph_nodes', len(graph))
     print('graph_edges', graph.size())
     node2vec = Node2Vec(graph,
                         dimensions=64,
                         walk_length=80,
                         num_walks=len(graph) * 2,
                         workers=4)
     print('fitting')
     t1 = time.time()
     model = node2vec.fit(window=5, min_count=1, batch_words=4)
     print('fitting done, time:', time.time() - t1)
     num_clusters = k
     vecs = []
     for node in nodes:
         vecs.append(model.wv[node])
     km_cluster = KMeans(n_clusters=num_clusters)
     result = km_cluster.fit_predict(vecs)
     lists = []
     for id, node in zip(result, nodes):
         lists.append([id, self.titles[int(node)]])
     lists = sorted(lists, key=lambda x: x[0])
     with open('result.txt', 'w') as f:
         for x in lists:
             print(x, file=f)
Example #22
def save_node2vec_emb(G,
                      save_path=f'data/gene_disease/{args.time_stamp}/processed/embedding/node2vec/',
                      EMBEDDING_FILENAME='node2vec_emb.txt',
                      log=True):

    print(f"save node2vec emb to {save_path + EMBEDDING_FILENAME}")
    os.makedirs(save_path, exist_ok=True)  # ensure the output directory exists
    # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
    node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200,
                        workers=4)  # Use temp_folder for big graphs # todo undirected_edges
    s = time.time()

    # Embed nodes
    model = node2vec.fit(window=10, min_count=1,
                         batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed; `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
    f = time.time()
    total = f-s
    print(f'total = {total}')
    output_path = save_path + EMBEDDING_FILENAME

    # Save embeddings for later use
    model.wv.save_word2vec_format(output_path)

    # # Save model for later use
    # model.save(output_path)

    if log:
        with open(f'./log/gene_disease/{EMBEDDING_FILENAME}', 'w') as f:
            f.write(f' --{save_path}{EMBEDDING_FILENAME}\n')
            f.write(f'total running time {total}')
Example #23
def get_node_random_walk(x_list, adj_list, node2vec_hidden, walk_length,
                         num_walks, p, q, workers):
    node_random_walk_list = []

    for i, adj in enumerate(adj_list):

        if i % 15 == 0:
            print('node random walk ...', i, '/', len(adj_list))

        walk_dic = {}
        if type(adj).__module__ == np.__name__:
            G = nx.Graph(adj)
        else:
            G = nx.Graph(adj.to('cpu').numpy())

        node2vec = Node2Vec(
            graph=G,  # The first positional argument has to be a networkx graph. Node names must be all integers or all strings; on the output model they will always be strings.
            dimensions=node2vec_hidden,  # Embedding dimensions (default: 128)
            walk_length=walk_length,  # Number of nodes in each walk
            num_walks=num_walks,  # Number of walks per node (default: 10)
            p=p,  # Return parameter: likelihood of revisiting the previous node, i.e. how closely the walk explores the local neighborhood
            q=q,  # In-out parameter: likelihood of moving away from the previous node, i.e. how readily the walk explores new regions
            weight_key=None,  # On weighted graphs, this is the key for the weight attribute (default: 'weight')
            workers=workers,  # Number of workers for parallel execution (default: 1)
            quiet=True)

        # Dic key: target node number, dic value: random walks of target node
        for random_walk in node2vec.walks:
            if not int(random_walk[0]) in walk_dic:
                walk_dic[int(random_walk[0])] = []
            walk_dic[int(random_walk[0])].append(random_walk)

        # Get index of one value in one-hot vector
        if type(x_list[i]).__module__ == np.__name__:
            hot_index = np.where(x_list[i] == 1.0)[1]
        else:
            hot_index = np.where(x_list[i].to('cpu').numpy() == 1.0)[1]

        # Unify to Node Feature
        node_random_walk_list2 = []

        for a in range(len(adj)):
            walks = walk_dic[a]
            walks_list = []
            for walk in walks:
                walk2 = []
                for node in walk:
                    if not int(node) >= len(hot_index):
                        walk2.append(float(hot_index[int(node)]))

                # Padding and append
                walks_list.append([0.0] * (walk_length - len(walk2)) + walk2)

            node_random_walk_list2.append(np.array(walks_list))
        node_random_walk_list.append(np.array(node_random_walk_list2))

    return node_random_walk_list
Example #24
def node_embeddings(G, f, dim=20, walk_length=16, num_walks=100, workers=2):
    """
    Adds the embeddings of the nodes to the dataframe f.
    G: a networkx graph.
    f: a pandas dataframe.
    dim: the dimension of the embedding.
    """
    # https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
    # https://github.com/eliorc/Medium/blob/master/Nod2Vec-FIFA17-Example.ipynb
    # works with node2vec
    if not (set(f.name) == set(G.nodes()) and len(f.name) == len(G.nodes())):
        raise ValueError(
            'The graph and the dataframe must have the same number of nodes.')
    from node2vec import Node2Vec
    node2vec = Node2Vec(G,
                        dimensions=dim,
                        walk_length=walk_length,
                        num_walks=num_walks,
                        workers=workers)
    model = node2vec.fit(window=10, min_count=1)

    embeddings_df = pd.DataFrame(
        columns=['name'] + ['node_embeddings_' + str(i) for i in range(dim)])
    embeddings_df['name'] = f['name']
    for name in embeddings_df['name']:
        embeddings_df[embeddings_df['name'] == name] = [name] + list(
            model.wv[str(name)])
    f = pd.merge(f, embeddings_df, on='name')
    return f
Example #25
 def create_node2vec_embeddings(self):
     # path1 = os.path.join(self.dataset, 'Node2Vec_embedding.pickle')
     # path2 = os.path.join(self.dataset, 'Node2Vec_embedding.csv')
     # if os.path.exists(path1):
     #     with open(path1, 'rb') as handle:
     #         dict_embeddings = pickle.load(handle)
     # elif os.path.exists(path2):
     #     embedding_df = pd.read_csv(path2)
     #     dict_embeddings = embedding_df.to_dict(orient='list')
     #     with open(path2, 'wb') as handle:
     #         pickle.dump(dict_embeddings, handle, protocol=3)
     # else:
     #     node2vec = Node2Vec(self.graph, dimensions=16, walk_length=30, num_walks=200, workers=1)
     #     model = node2vec.fit()
     #     nodes = list(self.graph.nodes())
     #     dict_embeddings = {}
     #     for i in range(len(nodes)):
     #         dict_embeddings.update({nodes[i]: np.asarray(model.wv.get_vector(nodes[i]))})
     #     with open(path1, 'wb') as handle:
     #         pickle.dump(dict_embeddings, handle, protocol=3)
      node2vec = Node2Vec(self.graph,
                          dimensions=self.dim,
                          walk_length=80,
                          num_walks=16,
                          workers=2)
     model = node2vec.fit()
     nodes = list(self.graph.nodes())
     dict_embeddings = {}
     for i in range(len(nodes)):
         dict_embeddings.update({nodes[i]: np.asarray(model.wv.get_vector(str(nodes[i])))})
     return dict_embeddings
Example #26
 def node2vec(self):
     if self.nodeGraph == '':
         return
     # self.graph = self.nodeGraph
     beginTime = time.time()
     print('1: node2vec Begin')
     # Precompute probabilities and generate walks
     # g = self.graph
     # g = deepcopy(self.graph)
     # if self.attriNode:
     #    g.remove_nodes_from(self.attriNode)
     node2vec = Node2Vec(self.nodeGraph,
                         dimensions=16,
                         walk_length=30,
                         num_walks=200,
                         p=self.P,
                         q=self.Q,
                         workers=4)
     print('Time of Node2Vec', time.time() - beginTime)
     beginTime = time.time()
     # Embed
     # Any keywords acceptable by gensim.Word2Vec can be passed,
     # `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
     self.model = node2vec.fit(window=10, min_count=1, batch_words=4)
     print('Time of FIT', time.time() - beginTime)
Example #27
def main(argv):
    inputfile, outputfile = read_args(argv)

    if not path.exists(outputfile):
        wan = WAN(inputfile)
        print(wan.graph.number_of_edges())
        wan.prune_edge(min_weight=1)
        print(wan.graph.number_of_edges())
        wan.prune_node(min_freq=2)
        print(wan.graph.number_of_edges())
        wan.reverse_weight()

        node2vec = Node2Vec(wan.graph, dimensions=100, walk_length=80, num_walks=200, workers=4)
        model = node2vec.fit(window=10, min_count=1, batch_words=4)

        model.wv.save_word2vec_format(outputfile)

        word_embeddings = model.wv.vectors
        word_embeddings_labels = model.wv.index2word
    else:
        word_embeddings, word_embeddings_labels = read_word2vec(outputfile)

    X_embedded = TSNE(n_components=2).fit_transform(word_embeddings)
    viz = Visualizer()
    viz.scatter_plot(X_embedded[:, 0], X_embedded[:, 1], word_embeddings_labels)
Example #28
def emb_graph_2vec(inputpath, dim):
    print("input name will be ", inputpath)
    emb_name = inputpath.replace("weighted_edglist_filytypeTxt.edgelist", "")
    print("emb_name will be ", emb_name)

    savename = inputpath.replace("weighted_edglist_filytypeTxt.edgelist",
                                 ".emb")
    print("emb outfile name will be ", savename)
    if os.path.exists(savename):
        print("file alread exists in cache, please rename")
        sys.exit(1)

    graph = nx.read_edgelist(inputpath, create_using=nx.DiGraph())
    # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
    node2vec = Node2Vec(graph,
                        dimensions=dim,
                        walk_length=30,
                        num_walks=200,
                        workers=10)
    # Embed nodes
    print("training .... ")
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    print("training finished saving result... ")

    print("saving %s file to disk " % savename)
    # Save embeddings for later use
    model.wv.save_word2vec_format(savename)
    print("done")
Example #29
 def train(self,
           graph,
           dimensions=64,
           walk_length=30,
           num_walks=200,
           workers=1,
           window=10,
           min_count=1,
           batch_words=4):
      if os.path.exists('../Result/EMBEDDING_MODEL'):
          model = KeyedVectors.load_word2vec_format(
              '../Result/EMBEDDING_MODEL')
          Embedding.model = model
          return model  # reuse the cached embedding instead of retraining
     node2vec = Node2Vec(graph,
                         dimensions=dimensions,
                         walk_length=walk_length,
                         num_walks=num_walks,
                         workers=workers)
     self.saveWalk(node2vec)
     model = node2vec.fit(window=window,
                          min_count=min_count,
                          batch_words=batch_words)
     model.wv.save_word2vec_format('../Result/EMBEDDING_MODEL')
     Embedding.model = model
     return model
Example #30
 def create_node2vec_embeddings(self):
     path1 = os.path.join(self.data_name, 'Node2Vec_embedding_old.pickle')
     path2 = os.path.join(self.data_name, 'Node2Vec_embedding_old.csv')
     if os.path.exists(path1):
         with open(path1, 'rb') as handle:
             dict_embeddings = pickle.load(handle)
     elif os.path.exists(path2):
         embedding_df = pd.read_csv(path2)
         dict_embeddings = embedding_df.to_dict(orient='list')
         with open(path2, 'wb') as handle:
             pickle.dump(dict_embeddings, handle, protocol=3)
     else:
         node2vec = Node2Vec(self.graph,
                             dimensions=16,
                             walk_length=30,
                             num_walks=200,
                             workers=1)
         model = node2vec.fit()
         nodes = list(self.graph.nodes())
         dict_embeddings = {}
         for i in range(len(nodes)):
              dict_embeddings.update(
                  {nodes[i]: np.asarray(model.wv.get_vector(str(nodes[i])))})  # model keys are strings
         with open(path1, 'wb') as handle:
             pickle.dump(dict_embeddings, handle, protocol=3)
     return dict_embeddings
Example #31
    def __init__(self, is_directed, preprocessing, is_weighted, p, q, walk_length, num_walks, dimensions, window_size,
                 workers, iterations, config, sparql, dataset, entities, default_graph, entity_class, feedback_file):

        Node2Vec.__init__(self, is_directed, preprocessing, is_weighted, p, q, walk_length, num_walks, dimensions,
                          window_size, workers, iterations)

        self.config_file = config

        self.sparql = sparql

        self.default_graph = default_graph

        self.dataset = dataset

        self.entities = entities

        self.entity_class = entity_class

        self.feedback_file = feedback_file

        self._define_properties()