def main():
    """Train a node2vec model on the SCOTUS citation network and save it.

    Command-line arguments (after the script path):
        1: p -- node2vec return parameter
        2: q -- node2vec in-out parameter

    Side effects: writes the node -> issue-area map to
    ../data/name_to_ia.json and the trained model to
    ../data/scotus_n2v_<p>_<q>.node2vec.
    """
    import networkx as nx
    import json
    import sys
    if len(sys.argv) < 3:
        # BUG FIX: message was missing its closing parenthesis
        raise ValueError(
            "Must specify both p and q for node2vec (2nd & 3rd args after filepath)"
        )
    from utilities import load_scotus_network
    from node2vec.node2vec import node2vec

    G, issueAreas = load_scotus_network('../data/scotus_network.graphml')
    # Persist the node -> issue-area mapping for later evaluation runs.
    with open('../data/name_to_ia.json', 'w') as fp:
        json.dump(issueAreas, fp)
    del issueAreas  # free memory before the expensive embedding step
    print('P equals: {0}'.format(sys.argv[1]))
    print('Q equals: {0}'.format(sys.argv[2]))
    n2v = node2vec(G=G,
                   p=float(sys.argv[1]),
                   # BUG FIX: was float(sys.argv[1]) -- the supplied q was
                   # silently ignored even though it names the output file.
                   q=float(sys.argv[2]),
                   walk_length=100,
                   num_walks=100,
                   window_size=10,
                   embedding_size=300,
                   num_iter=10,
                   min_count=0,
                   sg=1,
                   workers=12)
    model = n2v.run_node2vec()
    model.save('../data/scotus_n2v_{0}_{1}.node2vec'.format(
        sys.argv[1], sys.argv[2]))
def main():
    """Load a saved (tiny) SCOTUS node2vec model and run clustering on a
    random permutation of the graph's nodes.

    Command-line arguments (after the script path):
        1: p, 2: q -- used to locate the saved model file
        3: n_clusters -- number of clusters for k-means
    """
    from gensim import models
    import numpy as np
    import sys
    from node2vec.node2vec import node2vec
    from utilities import load_scotus_network
    from utilities import get_name_to_date
    from utilities import get_list_of_docs

    p = sys.argv[1]
    q = sys.argv[2]
    print('n_clusters equals: {0}'.format(sys.argv[3]))
    # BUG FIX: template used {0} twice ("..._{0}_{0}_tiny..."), so q never
    # appeared in the path and the wrong model file could be loaded.
    n2v_model = node2vec(model=models.Word2Vec.load(
        "../data/scotus_n2v_{0}_{1}_tiny.node2vec".format(p, q)))
    n2v_model.p = float(p)
    n2v_model.q = float(q)
    G, issue_areas = load_scotus_network(
        file_path="../data/scotus_network.graphml")
    nodes = np.random.permutation([n for n in G.nodes()])
    # Release the graph before clustering to reduce peak memory.
    del G
    del issue_areas
    n2v_model.run_clustering(n_clusters=int(float(sys.argv[3])),
                             nodes=nodes,
                             evaluate=False)
def eval_multiple_walks(sbm, w_length=50, n_classes=2, num_walks=25, p=1, q=1,
                        iterations=5):
    '''
    Return the bhamidi, purity, and agreement scores after sampling node2vec
    walks for the specified number of iterations

    Parameters
    ------------
    sbm : stochastic block matrix from which the graph object should be defined
    w_length : length of node2vec walk
    n_classes : number of classes; also number of clusters because we will
        use kmeans to evaluate
    num_walks : number of node2vec walks to generate per node in graph object
    p : Return parameter; lower values result in more "local" walks
    q : In-out parameter; lower values result in more Depth-First Search
        behaving walks
    iterations : number of times the node2vec walks should be regenerated,
        understanding that the node embeddings must be recalculated every
        time the walks are regenerated

    Output
    ------
    bhamidi_scores, purity_scores, agreement_scores : lists, one entry per
        iteration
    '''
    print('At eval_multiple_walks(...)')
    # BUG FIX: time.clock() was deprecated in Python 3.3 and removed in 3.8;
    # time.perf_counter() is the documented replacement for elapsed timing.
    start_time = time.perf_counter()
    bhamidi_scores = []
    purity_scores = []
    agreement_scores = []
    for i in range(iterations):
        node_embeds = node2vec(
            G=None,
            Adj_M=sbm.A,
            labels=sbm.memberships,
            n_classes=n_classes,
            evaluate=True,  # evaluation populates the *_score attributes
            p=p,
            q=q,
            walk_length=w_length,
            num_walks=num_walks,
            window_size=10,
            embedding_size=128,
            num_iter=4,
            min_count=0,
            sg=1,
            workers=8,
        )
        bhamidi_scores.append(node_embeds.bhamidi_score)
        purity_scores.append(node_embeds.purity_score)
        agreement_scores.append(node_embeds.agreement_score)
    print("Time elapsed while running 'eval_multiple_walks' function: {0}".
          format(round(time.perf_counter() - start_time, 8)))
    # all are of type : list
    return bhamidi_scores, purity_scores, agreement_scores
def eval_multiple_walks(sbm, w_length=50, n_classes=2, num_walks=25, p=1, q=1,
                        iterations=5):
    '''
    Regenerate node2vec walks `iterations` times and collect the bhamidi,
    purity, and agreement scores produced by each round.

    Parameters
    ------------
    sbm : stochastic block matrix from which the graph object should be defined
    w_length : length of node2vec walk
    n_classes : number of classes; also number of clusters because kmeans
        is used for evaluation
    num_walks : number of node2vec walks to generate per node in graph object
    p : Return parameter; lower values result in more "local" walks
    q : In-out parameter; lower values result in more Depth-First Search
        behaving walks

    Output
    ------
    bhamidi_scores: a list of floats
    purity_scores: a list of floats
    agreement_scores: a list of floats
    '''
    collected = []
    for _ in range(iterations):
        embeds = node2vec(G=None,
                          Adj_M=sbm.A,
                          labels=sbm.memberships,
                          n_classes=n_classes,
                          evaluate=True,
                          p=p,
                          q=q,
                          walk_length=w_length,
                          num_walks=num_walks,
                          window_size=10,
                          embedding_size=128,
                          num_iter=4,
                          min_count=0,
                          sg=1,
                          workers=8,
                          )
        collected.append((embeds.bhamidi_score,
                          embeds.purity_score,
                          embeds.agreement_score))
    # split the per-iteration triples back into three parallel lists
    bhamidi_scores = [scores[0] for scores in collected]
    purity_scores = [scores[1] for scores in collected]
    agreement_scores = [scores[2] for scores in collected]
    # all are of type : list
    return bhamidi_scores, purity_scores, agreement_scores
def main():
    """Cluster saved SCOTUS node2vec embeddings and evaluate the clusters
    against the issue-area labels stored on the graph nodes.

    Command-line arguments (after the script path):
        1: p, 2: q -- used to locate the saved model file
    """
    from gensim import models
    import numpy as np
    import sys
    from node2vec.node2vec import node2vec
    from utilities import load_scotus_network
    from utilities import get_name_to_date
    from utilities import get_list_of_docs

    p = float(sys.argv[1])
    q = float(sys.argv[2])
    # BUG FIX: template used {0} twice ("..._{0}_{0}_mini..."), so q never
    # appeared in the path and the wrong model file could be loaded.
    n2v_model = node2vec(model=models.Word2Vec.load(
        "../data/scotus_n2v_{0}_{1}_mini.node2vec".format(p, q)))
    n2v_model.p = p
    n2v_model.q = q
    G, issue_areas = load_scotus_network(
        file_path="../data/scotus_network.graphml")
    IA = 15  # number of distinct issueArea codes expected (0..14)
    # Build issue-area <-> node-name lookups from the node attributes.
    # NOTE(review): nodes_iter(data=True) is the networkx 1.x API -- confirm
    # the pinned networkx version before upgrading (2.x uses nodes(data=True)).
    ia_to_name = {i: [] for i in range(IA)}
    name_to_ia = {}
    for n, d in G.nodes_iter(data=True):
        ia = int(float(d['issueArea']))
        ia_to_name[ia].append(n)
        name_to_ia[n] = ia
    # Sanity report: the per-key counts must sum to the number of nodes.
    total = 0
    for k in list(ia_to_name.keys()):
        print('Key : ', k, " " * (3 - len(str(k))),
              'Length(list at key): ', len(ia_to_name[k]))
        total += len(ia_to_name[k])
    print('Total: ', total)
    print('Number of keys of name_to_ia: ', len(name_to_ia.keys()))
    print('The above two numbers should be equal.')
    n2v_model.run_clustering(n_clusters=len(set(ia_to_name.keys())),
                             labels_dict=name_to_ia,
                             evaluate=True)
def main():
    """Run R repetitions of SBM/node2vec simulations over a grid of
    out-of-class edge probabilities and save true/predicted labels to JSON.

    Command-line arguments (after the script path):
        1: p, 2: q (floats), 3: walk_length, 4: num_walk,
        5: embedding_size, 6: num_iter (ints), last: R (repetitions).

    Raises ValueError if the output file already exists (the simulation
    has already been performed for these parameters).
    """
    ROUND_TO = 2  # decimal places for float parameters / probabilities
    import sys
    import os
    from utilities import make_block_probs, save_current_status
    from sbm.sbm import stochastic_block_model
    from node2vec.node2vec import node2vec
    import warnings
    warnings.filterwarnings('ignore')
    warnings.simplefilter('ignore')
    import numpy as np
    # first two args are floats (p, q); the rest (except trailing R) are ints
    parameters = [round(float(arg), ROUND_TO) for arg in sys.argv[1:3]
                  ] + [int(arg) for arg in sys.argv[3:-1]]
    # BUG FIX: validate before the print below uses six placeholders; a short
    # argv previously died with IndexError instead of this message. R is
    # intentionally NOT in `parameters` (it is parsed separately below).
    assert len(parameters) == 6, (
        'Parameters to script must be: p, q, walk_length, num_walk, '
        'embedding_size, num_iter (then a trailing R)')
    print('Parameters are: p={0}, q={1}, walk_length={2}, num_walk={3}, '
          'embedding_size={4}, num_iter={5}'.format(*parameters))
    R = int(sys.argv[-1])
    print('R equals: {0}'.format(R))
    # grid of out-of-class edge probabilities to be iterated over
    out_class_probs = [round(i * 0.01, ROUND_TO) for i in range(1, 80)]
    data_to_save = {}
    ## PARAMETERS
    p = parameters[0]
    data_to_save['p'] = p
    q = parameters[1]
    data_to_save['q'] = q
    walk_length = parameters[2]
    data_to_save['walk_length'] = walk_length
    num_walk = parameters[3]
    data_to_save['num_walk'] = num_walk
    embedding_size = parameters[4]
    data_to_save['embedding_size'] = embedding_size
    num_iter = parameters[5]
    data_to_save['num_iter'] = num_iter
    num_nodes = 400
    data_to_save['num_nodes'] = num_nodes
    n_classes = 2
    data_to_save['n_classes'] = n_classes
    in_class_prob = 0.8
    data_to_save['in_class_prob'] = in_class_prob
    iterations = 1
    data_to_save['iterations'] = iterations
    samples = 1
    data_to_save['samples'] = samples
    # store the labels from the following loop
    data_to_save['data'] = []
    # for saving purposes; locate data/ relative to the launch directory
    data_dir = 'data/'
    if os.path.isdir(data_dir):
        pass
    elif os.path.isdir('../' + data_dir):
        data_dir = '../' + data_dir
    elif os.path.isdir('../' + '../' + data_dir):
        data_dir = '../' + '../' + data_dir
    else:
        data_dir = '../' + data_dir
    # dont do the same thing twice
    # BUG FIX: dropped a chained second .format(in_class_prob, walk_length) --
    # the first format already consumed every placeholder, so it was a no-op.
    file_name = 'p{0}_q{1}_wl{2}_nw{3}_es{4}_ni{5}_R{6}.json'.format(
        p, q, walk_length, num_walk, embedding_size, num_iter, R)
    if os.path.isfile(data_dir + file_name):
        # BUG FIX: the message referenced {6} but format(*parameters) supplied
        # only six values, raising IndexError instead of printing; R is now
        # passed explicitly.
        print('SIMULATION HAS BEEN PERFORMED. SKIPPING: '
              'p{0}_q{1}_wl{2}_nw{3}_es{4}_ni{5}_R{6}'.format(*parameters, R))
        raise ValueError(
            'Simulation has been run before. Change file_name if wanting to run again.'
        )
    for r in range(R):
        tmp_statuses = []
        for out_class_prob in out_class_probs:
            tmp_status = {}
            tmp_status['out_class_prob'] = out_class_prob
            block_probs = make_block_probs(in_class_prob=in_class_prob,
                                           out_class_prob=out_class_prob)
            sbm = stochastic_block_model(size=num_nodes,
                                         block_probabilities=block_probs,
                                         num_classes=n_classes)
            node_embeds = node2vec(G=None,
                                   Adj_M=sbm.A,
                                   labels=sbm.memberships,
                                   n_classes=n_classes,
                                   evaluate=True,
                                   p=p,
                                   q=q,
                                   walk_length=walk_length,
                                   num_walks=num_walk,
                                   window_size=5,
                                   embedding_size=embedding_size,
                                   num_iter=num_iter,
                                   min_count=0,
                                   sg=1,
                                   workers=8)
            true_labels = node_embeds.labels
            tmp_status['true_labels'] = true_labels
            predicted_labels = node_embeds.predicted_labels
            tmp_status['predicted_labels'] = predicted_labels
            tmp_statuses.append(tmp_status)
        # save data from this repetition
        data_to_save['data'].append(tmp_statuses)
        # save labels after each repetition so partial progress is preserved
        current_status = save_current_status(file_name=file_name,
                                             data=data_to_save)