Example No. 1
def get_embeddings(G, organizations):
    # Fit node2vec to the graph (NVVV is assumed to be an alias for
    # nodevectors.Node2Vec imported elsewhere in the source file)
    g2v = NVVV()
    g2v.fit(G)

    # Reduce the raw embeddings to 3 principal components
    embeddings = g2v.model.wv.vectors
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(embeddings)
    d_e = pd.DataFrame(principalComponents)
    # Note: this assumes the row order of wv.vectors matches `organizations`
    d_e["company"] = organizations
    return d_e, g2v
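A minimal sketch of how this helper might be called, assuming NVVV aliases nodevectors.Node2Vec and using hypothetical organization names as node IDs:

import networkx as nx
from nodevectors import Node2Vec as NVVV  # assumed alias used above

# Hypothetical toy graph: one node per organization
orgs = ["acme", "globex", "initech"]
G = nx.complete_graph(orgs)

d_e, g2v = get_embeddings(G, orgs)
print(d_e)  # three principal components per row, plus the "company" column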
Example No. 2
    def train(self):
        graph = self.__get_csr_graph()
        Logger.info("Training node2vec embeddings...")
        g2v = Node2Vec(n_components=32, walklen=8, epochs=25)
        g2v.fit(graph)
        Logger.info(f"Training done. Saving embeddings to {self.model_path}")
        g2v.save_vectors(self.model_path)
Example No. 3
def main(args):
    print(args)
    edgelists = [qf for qf in os.listdir(args.input)
                 if qf.endswith('.edgelist') and not qf.startswith('_')]
    g = None

    exclude = args.exclude or []

    print('loading edgelists...')
    for eg in edgelists:
        if eg in exclude or eg.rsplit('.', 1)[0] in exclude:
            continue
        print('- ' + eg)
        h = nx.read_edgelist(os.path.join(args.input, eg), nodetype=str, create_using=nx.DiGraph(), delimiter=' ')
        for edge in h.edges():
            h[edge[0]][edge[1]]['weight'] = 1

        g = h if g is None else nx.compose(g, h)

    g = g.to_undirected()

    print('Nodes: %d' % nx.number_of_nodes(g))
    print('Edges: %d' % nx.number_of_edges(g))

    print('Start learning at %s' % time.asctime())
    g2v = Node2Vec(
        walklen=args.walk_length,
        epochs=args.num_walks,
        n_components=args.dimensions,
        return_weight=1 / args.p,    # nodevectors takes 1/p rather than p
        neighbor_weight=1 / args.q,  # and 1/q rather than q
        threads=args.workers,
        w2vparams={
            'window': args.window_size,
            'iter': args.iter,  # gensim < 4.0 name for training epochs
            'batch_words': 128,
            'min_count': 0,
            'negative': 25,
            'sg': 1  # skip-gram
        },
        verbose=True
    )
    g2v.fit(g)
    print('End learning at %s' % time.asctime())

    # Save model to gensim.KeyedVector format
    g2v.save_vectors(args.output)
Example No. 4
def learn_embeddings(graph):
    """
    input:-
        graph: nx.Graph()
    output:-
        n2v: fitted nodevectors.Node2Vec model
    """
    # Earlier version used the `node2vec` package instead:
    # n2v = Node2Vec(graph, dimensions=30, walk_length=5, num_walks=200, workers=2)
    # model = n2v.fit(window=10, min_count=1)
    # return model
    n2v = Node2Vec()
    n2v.fit(graph)

    return n2v
Example No. 5
def nodevec(graph: str, output_dir: str, directed: bool, tag: str,
            params: dict) -> None:

    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    w2vparams = get_w2vparams(**params)
    node2vec_init = get_n2vparams(w2vparams=w2vparams, **params)

    print("Beginning node2vec script")
    print("File: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in w2vparams.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)
    G = uri_to_str(G)

    if not directed:
        G = G.to_undirected()

    n2v_model = Node2Vec(**node2vec_init)
    n2v_model.fit(G)

    embedding_file = generate_out_file("embeddings.pkl",
                                       output_dir + "/embeddings/", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models/", tag)

    # Save embeddings
    n2v_model.model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    n2v_model.model.save(model_file)
    print("Model saved to %s" % model_file)

    print("Completed nodevectors.py")
Example No. 6
def learn_embeddings(df, cids, show_graph=False):
    """
    input:-
        df: pd.DataFrame
        cids: list
        show_graph: bool

    output:-
        graph: nx.Graph()
        n2v: node2vec.Node2Vec
        model: fitted gensim Word2Vec model returned by n2v.fit()
    """
    df = df[["cid", "pbdid", "min"]]
    cid_nodes = []
    pbdid = df["pbdid"].values
    pbdid = set(pbdid)
    for each_id in pbdid:
        cid = df[df["pbdid"] == each_id]["cid"].iloc[0]
        cid_nodes.append(cid)
    graph = nx.Graph()
    print("Building Graph\n", "="*32, "\n")
    print("Adding CID to Target PBDIDs...")
    for row in df.values:
        graph.add_edge(row[0], row[1])

    cid_pairs = [[node, cid_] for cid_ in cids for node in cid_nodes]
    print("Generated Structurally related pairs")
    for node1, node2 in cid_pairs:
        graph.add_edge(node1, node2)
    print("Added Structurally related CIDs")

    if show_graph:
        draw_graph(graph)

    n2v = Node2Vec(graph, dimensions=20, walk_length=5, num_walks=200, workers=2)
    model = n2v.fit(window=10, min_count=1)
    return graph, n2v, model
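Since fit() in the node2vec package returns a gensim Word2Vec model, the embeddings can be queried directly once learn_embeddings() has run. A hedged sketch, reusing the df and cids from the surrounding script (the package stringifies node IDs for word2vec, hence str()):

graph, n2v, model = learn_embeddings(df, cids)
# Nearest neighbours of the first structurally related CID in embedding space
print(model.wv.most_similar(str(cids[0]), topn=5))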
Example No. 7
            pickle.dump(n2v, open("./graphs/n2v_sub_small1.pkl", "wb"))

        train_comp_expensive = False
        if train_comp_expensive:
            graphs_subset = [
                x for x in graphs if len(related[graphs.index(x)]) > 50
            ]
            for i, graph in tqdm(enumerate(graphs_subset),
                                 total=len(graphs_subset),
                                 leave=False):
                n2v = learn_embeddings(graph)
                n2v.save(f"./graphs/n2v_sub_huge-{i+1}.pckl")
                # pickle.dump(n2v, open(f"./graphs/n2v_sub_huge-{i+1}.pkl", "wb"))

save_huge_vecs = False
if save_huge_vecs:
    graphs = [
        Node2Vec.load(f"./graphs/huge_graphs/n2v_sub_huge-{i}.pckl.zip")
        for i in range(1, 6)
    ]
    for i, graph in enumerate(graphs):
        graph.save_vectors(f"./vectors/wheel_mode_graph-{i}.bin")

save_small_vecs = False
if save_small_vecs:
    if not save_huge_vecs:
        i = 5
    small_graphs = pickle.load(open("./graphs/n2v_sub_small1.pkl", "rb"))
    for j, graph in enumerate(small_graphs):
        graph.save_vectors(f"./vectors/wheel_mode_graph-{i+j}.bin")
Example No. 8
                        '--output',
                        help='The output folder',
                        required=True)

    args = vars(parser.parse_args())

    G = nx.read_edgelist(args["input"], delimiter='\t')
    embedding_size = 64

    # Fit embedding model to graph
    g2v = Node2Vec(walklen=5,
                   epochs=10,
                   threads=4,
                   n_components=embedding_size,
                   keep_walks=False,
                   w2vparams={
                       "window": 3,
                       "negative": 3,
                       "iter": 3,
                       "batch_words": 64,
                       "workers": 2
                   })
    # way faster than other node2vec implementations
    # Graph edge weights are handled automatically
    g2v.fit(G)

    # query embeddings for node "42" (read_edgelist loads node IDs as strings)
    print(g2v.predict("42"))

    g2v.save(os.path.join(args["output"], 'node2vec.pckl'))
    # Save model to gensim.KeyedVector format
    g2v.save_vectors(os.path.join(args["output"], "wheel_model.bin"))
Example No. 9
import networkx as nx
from nodevectors import Node2Vec
import time

start = time.time()
# ~20,000 nodes: takes about 1 minute
G = nx.read_gml("C:/Users/DI_Lab/Desktop/연구실 자료/국보연/전지원/Full_G.gml")
print("Graph read time:", time.time() - start)  # current time - start time = elapsed time

start = time.time()
node2vec_model = Node2Vec(n_components=32, walklen=10)
node2vec_model.fit(G)
print("Model training time:", time.time() - start)  # current time - start time = elapsed time
start = time.time()
# nodevectors' save() appends .zip, so this is written as word2vec_Full.model.zip
node2vec_model.save("C:/Users/DI_Lab/Desktop/연구실 자료/국보연/전지원/word2vec_Full.model")
print("Model save time:", time.time() - start)  # current time - start time = elapsed time
Example No. 10
import networkx as nx
from nodevectors import Node2Vec

# Test Graph
model_name = "structwords"
graph_file = "data/keywords.edgelist"

G = nx.read_weighted_edgelist(graph_file)

# Fit embedding model to graph
g2v = Node2Vec(neighbor_weight=3)

# way faster than other node2vec implementations
# Graph edge weights are handled automatically
g2v.fit(G)

# Save and load whole node2vec model
# Uses a smart pickling method to avoid serialization errors
# Don't put a file extension after the `.save()` filename, `.zip` is automatically added
g2v.save(model_name)
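The saved model can be reloaded later with Node2Vec.load(), passing the full filename including the automatic .zip suffix (the same pattern appears in the examples below). A minimal sketch; the keyword node ID queried here is hypothetical:

g2v = Node2Vec.load(model_name + ".zip")
print(g2v.predict("graph"))  # embedding of one keyword node from the edgelist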
Example No. 11
    def train(self, data):
        # index data
        self._setup_db()

        copy_input = io.StringIO()
        error_counter = 0
        for i, r in enumerate(
                tqdm(data.iterrows(),
                     total=len(data.index),
                     desc="Building index")):
            row = []
            key, x, y = get_key_x_y(r[1])

            row.append(key)

            if isnan(x) or isnan(y):
                error_counter += 1
                continue

            row.append("SRID=4326;POINT(" + str(x) + " " + str(y) + ")")
            copy_input.write(",".join(row) + "\n")

        if error_counter > 0:
            print("Warning: could not encode " + str(error_counter) +
                  " instances")

        print("Executing copy to db")
        copy_input.seek(0)

        with self.db.get_connection() as conn:
            with conn.cursor() as cur:
                cur.copy_from(copy_input,
                              self._table_name(),
                              sep=",",
                              null="\"\"")

                print("Creating index")
                idx_query = "create index " + self._table_name(
                ) + "_loc_idx on " + self._table_name(
                ) + " using gist(location);"
                cur.execute(idx_query)
        print("Indexing done")
        # Build a weighted edge list with sliding windows and train node2vec

        rows = []
        data.sort_values("lat", inplace=True)
        apply_sliding_window(data, 1, self.njobs, rows)

        data.sort_values("lon", inplace=True)
        apply_sliding_window(data, 2, self.njobs, rows)

        print("Created " + str(len(rows)) + " edges")

        elist = pd.DataFrame(rows, columns=["src", "dst", "weight"])
        elist.weight = pd.to_numeric(elist.weight)

        # Create name mapping to normalize node IDs
        allnodes = list(set(elist.src.unique()).union(set(elist.dst.unique())))

        # This factors all the unique nodes to unique IDs
        names = (np.array(
            pd.Series(allnodes).astype('category').cat.categories))
        name_dict = dict(zip(names, np.arange(names.shape[0])))

        elist.src = elist.src.map(name_dict).astype(np.uint32)
        elist.dst = elist.dst.map(name_dict).astype(np.uint32)
        elist.sort_values(by='src', inplace=True, ignore_index=True)

        nnodes = names.shape[0]
        G = _edgelist_to_wdw_graph(elist, nnodes, nodenames=names)

        elist = None
        rows = None
        gc.collect()

        # train node2vec
        print("Training node2vec")

        wdw = Node2Vec(threads=self.njobs, walklen=10, n_components=100)
        wdw.fit(G)

        print("Training complete")

        self.wdw = wdw
Example No. 12
from nodevectors import Node2Vec
from gensim.models import KeyedVectors

g2v = Node2Vec.load(
    'C:/Users/DI_Lab/Desktop/연구실 자료/국보연/전지원/word2vec_20000test.model.zip')

# Save model to gensim.KeyedVector format
g2v.save_vectors("wheel_model.bin")

# load in gensim
print(g2v)
model = KeyedVectors.load_word2vec_format("wheel_model.bin")
print(model)
print(model["cve-2019-1020019"])  # embedding vector for one node
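Once the vectors are in KeyedVectors format, the usual gensim similarity queries are also available. A short sketch using the node queried above; the second CVE ID is hypothetical:

# Nearest neighbours in embedding space
print(model.most_similar("cve-2019-1020019", topn=5))
# Cosine similarity between two nodes (second ID is hypothetical)
print(model.similarity("cve-2019-1020019", "cve-2019-1020020"))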
Example No. 13
# nodevectors and csrgraph must be installed: run 'pip install nodevectors csrgraph'
# The nodevectors module works on sparse matrices, so it is much faster to fit.
# GitHub link to this repository: https://github.com/VHRanger/nodevectors
import pandas as pd
import networkx as nx
from nodevectors import Node2Vec

edges = pd.read_csv('finnet_data/edges.csv')
G = nx.from_pandas_edgelist(edges, 'id_1', 'id_2')  # from_pandas_dataframe was removed in networkx 2.x

g2v = Node2Vec(n_components=100, walklen=4)
g2v.fit(G)

# File size is about 2GB
g2v.save_vectors("n2v.bin")
Example No. 14
import pandas as pd
import networkx as nx
from nodevectors import Node2Vec, ProNE  # import Node2Vec model to encode the graph


# load the graph
G = nx.read_edgelist('data/collaboration_network.edgelist', delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

# Create an embedding of the graph
g2v = Node2Vec(n_components=32, walklen=10)
# g2v = ProNE(step=6, n_components=32)  # other model that we tried
# Fit the model
g2v.fit(G)

# Get the embedding of each node
Embeddings = {}
for u in G.nodes:
    Embeddings[u] = g2v.predict(u)

# transform the embedding to pandas array and save it
df = pd.DataFrame.from_dict(Embeddings)
df.to_csv('data/graph_embedding.csv', index=False)
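Note that from_dict with the default orientation produces one column per node (the rows are the 32 embedding dimensions). If one row per node is preferred, a small hedged variation; the output path is hypothetical:

df = pd.DataFrame.from_dict(Embeddings, orient="index")  # one row per node ID
df.to_csv('data/graph_embedding_by_node.csv')  # hypothetical output path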
Example No. 15
import pickle

import pandas as pd
from nodevectors import Node2Vec

model_name = "keywords_deep"
idx_file = "data/word_index.pickle"
keywords_file = "data/mag_cs_keywords.csv"

# Load in relevant data and modules
keywords_full_data = pd.read_csv(keywords_file)
keywords_full_data['normalizedName'] = keywords_full_data[
    'normalizedName'].fillna('nan')
keywords_data = keywords_full_data['normalizedName']

with open(idx_file, 'rb') as f:
    word_to_idx = pickle.load(f)

keyword_embs = Node2Vec.load(model_name + ".zip")

# Process word queries
while True:
    print("Please enter a word to search: ")
    query_word = input()

    query_node_idx = -1
    query_node = None

    while query_node_idx < 0:
        try:
            query_node_idx = word_to_idx[query_word.lower()]
            query_node = keyword_embs.predict(query_node_idx)
        except KeyError:
            print(
Example No. 16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Date    : 2021-01-24
# @Contact    : [email protected]
import csrgraph as cg
from joblib import dump
from nodevectors import Node2Vec

G = cg.read_edgelist("data/graph_data.csv", directed=False, sep=',')
node2vec = Node2Vec(threads=6, n_components=100, w2vparams=dict(workers=12))
node2vec.fit(G)
print(node2vec)
dump(node2vec, "data/node2vec.pkl")
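A model persisted with joblib can be restored the same way. A minimal round-trip sketch; the node ID queried is hypothetical and must actually appear in graph_data.csv:

from joblib import load

node2vec = load("data/node2vec.pkl")
print(node2vec.predict("0"))  # embedding for one node of the graph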