Пример #1
0
def main(args):
    """Read a graph per ``args.graph_format``, train a node2vec/DeepWalk
    embedding model, and save the embeddings to ``args.output``.

    Expects ``args`` to provide: input, graph_format, weighted, directed,
    method, walk_length, number_walks, representation_size, workers,
    p, q, window_size, output.
    """
    t1 = time.time()
    g = Graph()
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)
    elif args.graph_format == 'tem_edgelist':
        g.read_tem_edgelist(filename=args.input)
    else:
        # Fail fast instead of silently continuing with an empty graph.
        raise ValueError('unknown graph_format: {}'.format(args.graph_format))

    if args.method == 'node2vec':
        model = node2vec.Node2vec(graph=g,
                                  path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers,
                                  p=args.p,
                                  q=args.q,
                                  window=args.window_size)
    elif args.method == 'deepWalk':
        # DeepWalk is node2vec with uniform walks (dw=True); p/q unused.
        model = node2vec.Node2vec(graph=g,
                                  path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers,
                                  window=args.window_size,
                                  dw=True)
    else:
        # Previously an unknown method fell through and raised a NameError
        # on `model` below; raise a clear error instead.
        raise ValueError('unknown method: {}'.format(args.method))

    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
Пример #2
0
def main(args):
    """Build a graph from ``args.input``, train the embedding model selected
    by ``args.method``, save embeddings, and (when labels are available)
    evaluate a logistic-regression node classifier on them.
    """
    t1 = time.time()
    g = Graph()
    # NOTE(review): the filename typo ("singluar") is kept byte-identical so
    # any downstream consumer of this file keeps working.
    singular_node_file = "singluar_nodes.txt"

    print("Reading...")

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed)

    if args.method == 'node2vec':
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks, dim=args.representation_size,
                                  workers=args.workers, p=args.p, q=args.q, window=args.window_size)
    elif args.method == 'line':
        # With labels and auto-save enabled, LINE keeps the best embeddings
        # found during training (judged by classification on label_file).
        if args.label_file and not args.no_auto_save:
            model = line.LINE(g, epoch=args.epochs, rep_size=args.representation_size,
                              order=args.order, label_file=args.label_file,
                              clf_ratio=args.clf_ratio)
        else:
            model = line.LINE(g, epoch=args.epochs, rep_size=args.representation_size,
                              order=args.order)
    elif args.method == 'deepWalk':
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                  num_paths=args.number_walks, dim=args.representation_size,
                                  workers=args.workers, window=args.window_size, dw=True)
    elif args.method == 'tadw':
        assert args.feature_file != ''
        g.read_node_features(args.feature_file)
        # Dump "singular" nodes for later inspection; g.sgl_node_list is
        # presumably populated by read_node_features — TODO confirm.
        with open(singular_node_file, "w") as fout:
            for node_idx in g.sgl_node_list:
                fout.write("{}\n".format(node_idx))

        model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    elif args.method == 'gcn':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        model = gcnAPI.GCN(graph=g, dropout=args.dropout,
                           weight_decay=args.weight_decay, hidden1=args.hidden,
                           epochs=args.epochs, clf_ratio=args.clf_ratio)
    elif args.method == 'grarep':
        model = GraRep(graph=g, Kstep=args.kstep, dim=args.representation_size)
    else:
        # Previously an unknown method left `model` unbound (NameError below).
        raise ValueError('unknown method: {}'.format(args.method))
    t2 = time.time()
    print("time: ", t2 - t1)
    if args.method != 'gcn':
        # GCN is trained end-to-end for classification and exposes no
        # standalone embeddings to save.
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio*100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)
Пример #3
0
def deepwalk_embedding(time=None,
                       path_length=10,
                       num_paths=5,
                       dim=10,
                       walkers=4,
                       window=5,
                       aggrateNum=3):
    """Aggregate the last ``aggrateNum`` interaction snapshots ending at
    timestamp ``time`` into one graph and train DeepWalk embeddings on it.

    Returns the trained node2vec model, or a zero vector of length ``dim``
    when no timestamp is given.

    NOTE(review): the parameter name ``time`` shadows the stdlib ``time``
    module inside this function; kept for interface compatibility.
    """
    if time is None:
        return np.zeros(dim)

    # Sum the adjacency matrices of existing snapshots. 600000 is presumably
    # the snapshot interval in milliseconds (10 minutes) — TODO confirm.
    interaction = sp.coo_matrix((10001, 10001))
    for i in range(aggrateNum):
        t = time - i * 600000
        fp_ = '{}/{}.npz'.format(graphPath, t)
        if os.path.exists(fp_):
            interaction += load_npz(fp_).tocoo()
    interaction = interaction.tocoo()

    # Write the aggregated edges to a temporary weighted edge-list file in
    # the "src dst weight" format the Graph reader expects.
    tmp = 'tmp.txt'
    with open(tmp, "w") as f:
        for r, c, v in zip(interaction.row, interaction.col, interaction.data):
            f.write('{} {} {}\n'.format(r, c, v))

    # Build the graph and train. The original hard-coded the default
    # hyper-parameter values here, silently ignoring the caller's arguments;
    # forward them instead.
    g = Graph()
    g.read_edgelist(filename=tmp, weighted=True, directed=True)
    model = node2vec.Node2vec(graph=g,
                              path_length=path_length,
                              num_paths=num_paths,
                              dim=dim,
                              workers=walkers,
                              window=window,
                              dw=True)
    return model
Пример #4
0
def main(args):
    """Run the full pipeline: load graph (+ attributes), optionally hold out
    edges for link prediction, learn node embeddings with the method chosen
    by ``args.method``, then evaluate on the downstream task(s) in
    ``args.task`` ('lp', 'nc', or 'lp_and_nc').
    """
    g = Graph(
    )  # see graph.py for commonly-used APIs and use g.G to access NetworkX APIs
    print(f'Summary of all settings: {args}')

    # ---------------------------------------STEP1: load data-----------------------------------------------------
    print('\nSTEP1: start loading data......')
    t1 = time.time()
    # load graph structure info; by default, treat as undirected and unweighted graph ------
    if args.graph_format == 'adjlist':
        g.read_adjlist(path=args.graph_file, directed=args.directed)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(path=args.graph_file,
                        weighted=args.weighted,
                        directed=args.directed)
    # load node attribute info ------
    # Attributed network embedding (ANE) methods additionally need node
    # attributes; membership test replaces the original nine-way `or` chain.
    is_ane = args.method in {'abrw', 'tadw', 'gcn', 'sagemean', 'sagegcn',
                             'attrpure', 'attrcomb', 'asne', 'aane'}
    if is_ane:
        assert args.attribute_file != ''
        g.read_node_attr(args.attribute_file)
    # load node label info------
    t2 = time.time()
    print(f'STEP1: end loading data; time cost: {(t2-t1):.2f}s')

    # ---------------------------------------STEP2: prepare data----------------------------------------------------
    print('\nSTEP2: start preparing data for link pred task......')
    t1 = time.time()
    test_node_pairs = []
    test_edge_labels = []
    if args.task == 'lp' or args.task == 'lp_and_nc':
        edges_removed = g.remove_edge(ratio=args.link_remove)
        limit_percentage = 0.2  # at most, use 0.2 randomly removed links for testing
        num_test_links = int(
            min(len(edges_removed),
                len(edges_removed) / args.link_remove * limit_percentage))
        edges_removed = random.sample(edges_removed, num_test_links)
        test_node_pairs, test_edge_labels = generate_edges_for_linkpred(
            graph=g, edges_removed=edges_removed, balance_ratio=1.0)
    t2 = time.time()
    print(f'STEP2: end preparing data; time cost: {(t2-t1):.2f}s')

    # -----------------------------------STEP3: upstream embedding task-------------------------------------------------
    print('\nSTEP3: start learning embeddings......')
    print(f'the graph: {args.graph_file}; \nthe model used: {args.method}; \
            \nthe # of edges used during embedding (edges maybe removed if lp task): {g.get_num_edges()}; \
            \nthe # of nodes: {g.get_num_nodes()}; \nthe # of isolated nodes: {g.get_num_isolates()}; \nis directed graph: {g.get_isdirected()}'
          )
    t1 = time.time()
    model = None
    if args.method == 'abrw':
        from libnrl import abrw  # ANE method; (Adaptive) Attributed Biased Random Walk
        model = abrw.ABRW(graph=g,
                          dim=args.dim,
                          topk=args.ABRW_topk,
                          beta=args.ABRW_beta,
                          beta_mode=args.ABRW_beta_mode,
                          alpha=args.ABRW_alpha,
                          number_walks=args.number_walks,
                          walk_length=args.walk_length,
                          window=args.window_size,
                          workers=args.workers)
    elif args.method == 'aane':
        from libnrl import aane  # ANE method
        model = aane.AANE(
            graph=g,
            dim=args.dim,
            lambd=args.AANE_lamb,
            rho=args.AANE_rho,
            maxiter=args.AANE_maxiter,
            mode='comb')  # mode: 'comb' struc and attri or 'pure' struc
    elif args.method == 'tadw':
        from libnrl import tadw  # ANE method
        model = tadw.TADW(graph=g,
                          dim=args.dim,
                          lamb=args.TADW_lamb,
                          maxiter=args.TADW_maxiter)
    elif args.method == 'attrpure':
        from libnrl import attrpure  # NE method simply use svd or pca for dim reduction
        model = attrpure.ATTRPURE(graph=g, dim=args.dim,
                                  mode='pca')  # mode: pca or svd
    elif args.method == 'attrcomb':
        from libnrl import attrcomb  # ANE method
        model = attrcomb.ATTRCOMB(
            graph=g,
            dim=args.dim,
            comb_with='deepwalk',
            number_walks=args.number_walks,
            walk_length=args.walk_length,
            window=args.window_size,
            workers=args.workers,
            comb_method=args.AttrComb_mode
        )  # comb_method: concat, elementwise-mean, elementwise-max
    elif args.method == 'deepwalk':
        from libnrl import node2vec  # PNE method; including deepwalk and node2vec
        model = node2vec.Node2vec(graph=g,
                                  path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.dim,
                                  workers=args.workers,
                                  window=args.window_size,
                                  dw=True)
    elif args.method == 'node2vec':
        from libnrl import node2vec  # PNE method; including deepwalk and node2vec
        model = node2vec.Node2vec(graph=g,
                                  path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.dim,
                                  workers=args.workers,
                                  window=args.window_size,
                                  p=args.Node2Vec_p,
                                  q=args.Node2Vec_q)
    elif args.method == 'grarep':
        from libnrl import grarep  # PNE method
        model = grarep.GraRep(graph=g, Kstep=args.GraRep_kstep, dim=args.dim)
    elif args.method == 'line':  # if auto_save, use label to justifiy the best embeddings by looking at micro / macro-F1 score
        from libnrl import line  # PNE method
        model = line.LINE(graph=g,
                          epoch=args.epochs,
                          rep_size=args.dim,
                          order=args.LINE_order,
                          batch_size=args.batch_size,
                          negative_ratio=args.LINE_negative_ratio,
                          label_file=args.label_file,
                          clf_ratio=args.label_reserved,
                          auto_save=True,
                          best='micro')
    elif args.method == 'asne':
        from libnrl import asne  # ANE method
        model = asne.ASNE(graph=g,
                          dim=args.dim,
                          alpha=args.ASNE_lamb,
                          learning_rate=args.learning_rate,
                          batch_size=args.batch_size,
                          epoch=args.epochs,
                          n_neg_samples=10)
    elif args.method == 'sagemean':  # parameters for graphsage models are in 'graphsage' -> '__init__.py'
        from libnrl.graphsage import graphsageAPI  # ANE method
        model = graphsageAPI.graphSAGE(graph=g,
                                       sage_model='mean',
                                       is_supervised=False)
    elif args.method == 'sagegcn':  # other choices: graphsage_seq, graphsage_maxpool, graphsage_meanpool, n2v
        from libnrl.graphsage import graphsageAPI  # ANE method
        model = graphsageAPI.graphSAGE(graph=g,
                                       sage_model='gcn',
                                       is_supervised=False)
    else:
        print('method not found...')
        exit(1)  # non-zero: an unrecognized method is a failure, not success
    t2 = time.time()
    print(f'STEP3: end learning embeddings; time cost: {(t2-t1):.2f}s')

    if args.save_emb:
        model.save_embeddings(args.emb_file)
        print(f'Save node embeddings in file: {args.emb_file}')

    # ---------------------------------------STEP4: downstream task-----------------------------------------------
    print('\nSTEP4: start evaluating ......: ')
    t1 = time.time()
    vectors = model.vectors
    del model, g  # free the large objects before evaluation
    # ------lp task
    if args.task == 'lp' or args.task == 'lp_and_nc':
        print(
            f'Link Prediction task; the number of testing links {len(test_edge_labels)} i.e. at most 2*0.2*all_positive_links)'
        )
        ds_task = lpClassifier(
            vectors=vectors
        )  # similarity/distance metric as clf; basically, lp is a binary clf probelm
        ds_task.evaluate(test_node_pairs, test_edge_labels)
    # ------nc task
    if args.task == 'nc' or args.task == 'lp_and_nc':
        X, Y = read_node_label_downstream(args.label_file)
        print(
            f'Node Classification task; the percentage of labels for testing: {((1-args.label_reserved)*100):.2f}%'
        )
        ds_task = ncClassifier(
            vectors=vectors, clf=LogisticRegression()
        )  # use Logistic Regression as clf; we may choose SVM or more advanced ones
        ds_task.split_train_evaluate(X, Y, args.label_reserved)
    t2 = time.time()
    print(f'STEP4: end evaluating; time cost: {(t2-t1):.2f}s')
Пример #5
0
    args.graph_format = 'edgelist'
    args.method = 'node2vec'
    args.p = 1.0
    args.q = 0.5
    args.input = os.path.join(os.getcwd(), 'data', dataset, '{}_{}_edges.txt'.format(dataset, train_fts_ratio))
    args.weighted = False
    args.directed = False
    args.epochs = 1000

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed)
    if args.method == 'node2vec':
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                     num_paths=args.number_walks, dim=args.representation_size,
                                     workers=args.workers, p=args.p, q=args.q, window=args.window_size)
    elif args.method == 'line':
        if args.label_file and not args.no_auto_save:
            model = line.LINE(g, epoch = args.epochs, rep_size=args.representation_size, order=args.order,
                    label_file=args.label_file, clf_ratio=args.clf_ratio)
        else:
            model = line.LINE(g, epoch = args.epochs, rep_size=args.representation_size, order=args.order)
    elif args.method == 'deepWalk':
        model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                     num_paths=args.number_walks, dim=args.representation_size,
                                     workers=args.workers, window=args.window_size, dw=True)
    elif args.method == 'tadw':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
Пример #6
0
                directed=True)
# Hyper-parameter tuning: grid-search node2vec's p and q, scoring each model
# by micro-F1 of a logistic-regression classifier on a 70/30 label split.
# NOTE(review): only 'p' and 'q' are actually searched; path_length,
# num_paths and dim below are listed but fixed at 80/10/30.
X, Y = read_node_label('../data/load_label.csv')
tuned_parameters = {
    'path_length': [20, 100],
    'num_paths': [10, 20, 50],
    'dim': [30, 80, 200],
    'p': [0.25, 0.5, 1, 2, 4],
    'q': [0.25, 0.5, 1, 2, 4]
}
test_scores = {}
best_model = None
best_score = float('-inf')
for p in tuned_parameters['p']:
    for q in tuned_parameters['q']:
        model = node2vec.Node2vec(graph=g,
                                  path_length=80,
                                  num_paths=10,
                                  dim=30,
                                  p=p,
                                  q=q,
                                  window=20)
        vectors = model.vectors
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        score = clf.split_train_evaluate(X, Y, 0.7)
        test_scores[(p, q)] = score['micro']
        # Keep the best-scoring model; the original saved whatever model was
        # trained LAST, not the winner of the search.
        if score['micro'] > best_score:
            best_score = score['micro']
            best_model = model
print(test_scores)
scorebest = max(test_scores.keys(), key=lambda s: test_scores[s])
print(scorebest)
# Persist the embeddings of the best (p, q) configuration.
best_model.save_embeddings('../outdata/load_embed.txt')
Пример #7
0
def main(args):
    """Read a graph, train the embedding model chosen by ``args.method``,
    save the embeddings, and (when labels are given) evaluate a
    logistic-regression node classifier.

    Single-argument print statements are parenthesized so the code runs
    unchanged under both Python 2 and Python 3.
    """
    t1 = time.time()
    g = Graph()
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)
    if args.method == 'node2vec':
        model = node2vec.Node2vec(graph=g,
                                  path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers,
                                  p=args.p,
                                  q=args.q,
                                  window=args.window_size)
    elif args.method == 'line':
        if args.label_file:
            # BUGFIX: original read `auto_stop=args.no - auto_stop`, which is
            # a subtraction of two undefined names (NameError at runtime);
            # presumably the argparse dest is `no_auto_stop` — verify whether
            # the intended value is `args.no_auto_stop` or its negation.
            model = line.LINE(g,
                              lr=args.lr,
                              batch_size=args.batch_size,
                              epoch=args.epochs,
                              rep_size=args.representation_size,
                              order=args.order,
                              label_file=args.label_file,
                              clf_ratio=args.clf_ratio,
                              auto_stop=args.no_auto_stop)
        else:
            model = line.LINE(g,
                              lr=args.lr,
                              batch_size=args.batch_size,
                              epoch=args.epochs,
                              rep_size=args.representation_size,
                              order=args.order)
    elif args.method == 'deepWalk':
        model = node2vec.Node2vec(graph=g,
                                  path_length=args.walk_length,
                                  num_paths=args.number_walks,
                                  dim=args.representation_size,
                                  workers=args.workers,
                                  window=args.window_size,
                                  dw=True)
    elif args.method == 'tadw':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        model = tadw.TADW(graph=g,
                          dim=args.representation_size,
                          lamb=args.lamb)
    elif args.method == 'gcn':
        assert args.label_file != ''
        assert args.feature_file != ''
        g.read_node_label(args.label_file)
        g.read_node_features(args.feature_file)
        model = gcnAPI.GCN(graph=g,
                           dropout=args.dropout,
                           weight_decay=args.weight_decay,
                           hidden1=args.hidden,
                           epochs=args.epochs,
                           clf_ratio=args.clf_ratio)
    elif args.method == 'grarep':
        model = GraRep(graph=g, Kstep=args.kstep, dim=args.representation_size)
    else:
        # Previously an unknown method left `model` unbound (NameError below).
        raise ValueError('unknown method: {}'.format(args.method))
    t2 = time.time()
    print(t2 - t1)
    if args.method != 'gcn':
        # GCN is trained end-to-end and exposes no standalone embeddings.
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(
            args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)