Пример #1
0
def main():
    '''
    Train a node2vec model on the SCOTUS network and save it to disk.

    Command-line arguments (read from sys.argv)
    ------------
    sys.argv[1] : p, the node2vec return parameter
    sys.argv[2] : q, the node2vec in-out parameter

    Side effects
    ------------
    - Dumps the second value returned by load_scotus_network (the issue areas)
      to '../data/name_to_ia.json'.
    - Saves the trained model to '../data/scotus_n2v_<p>_<q>.node2vec', where
      <p> and <q> are the raw command-line strings.

    Raises
    ------
    ValueError : if p and q are not both given on the command line
    '''
    import json
    import sys

    if len(sys.argv) < 3:
        raise ValueError(
            "Must specify both p and q for node2vec (2nd & 3rd args after filepath)"
        )
    p_arg, q_arg = sys.argv[1], sys.argv[2]

    from utilities import load_scotus_network
    from node2vec.node2vec import node2vec

    G, issueAreas = load_scotus_network('../data/scotus_network.graphml')

    with open('../data/name_to_ia.json', 'w') as fp:
        json.dump(issueAreas, fp)
    # Not needed after the dump; presumably released early to save memory
    # during training.
    del issueAreas
    print('P equals: {0}'.format(p_arg))
    print('Q equals: {0}'.format(q_arg))

    n2v = node2vec(G=G,
                   p=float(p_arg),
                   # q must come from the second argument; it was previously
                   # read from sys.argv[1], silently training with q == p.
                   q=float(q_arg),
                   walk_length=100,
                   num_walks=100,
                   window_size=10,
                   embedding_size=300,
                   num_iter=10,
                   min_count=0,
                   sg=1,
                   workers=12)

    model = n2v.run_node2vec()
    model.save('../data/scotus_n2v_{0}_{1}.node2vec'.format(p_arg, q_arg))
Пример #2
0
def main():
    '''
    Load a saved "tiny" node2vec model for the SCOTUS network and cluster its nodes.

    Command-line arguments (read from sys.argv)
    ------------
    sys.argv[1] : p, used in the model file name and stored on the model as a float
    sys.argv[2] : q, used in the model file name and stored on the model as a float
    sys.argv[3] : number of clusters (parsed with int(float(...)), so '5.0' is accepted)

    Raises
    ------
    ValueError : if fewer than three arguments are given, or n_clusters is not numeric
    '''
    import sys

    # Validate before the heavy third-party imports so a bad call fails fast
    # with a readable message instead of an IndexError.
    if len(sys.argv) < 4:
        raise ValueError(
            "Must specify p, q and n_clusters (1st, 2nd & 3rd args after filepath)"
        )
    p = sys.argv[1]
    q = sys.argv[2]
    n_clusters = int(float(sys.argv[3]))
    print('n_clusters equals: {0}'.format(sys.argv[3]))

    from gensim import models
    import numpy as np

    from node2vec.node2vec import node2vec
    from utilities import load_scotus_network

    # The file name must include both p and q; the previous "{0}_{0}" pattern
    # ignored q and always loaded the p_p model.
    n2v_model = node2vec(model=models.Word2Vec.load(
        "../data/scotus_n2v_{0}_{1}_tiny.node2vec".format(p, q)))
    n2v_model.p = float(p)
    n2v_model.q = float(q)
    G, issue_areas = load_scotus_network(
        file_path="../data/scotus_network.graphml")

    # Only the (shuffled) node names are needed; drop the graph and the issue
    # areas straight away.
    nodes = np.random.permutation(list(G.nodes()))
    del G
    del issue_areas

    n2v_model.run_clustering(n_clusters=n_clusters,
                             nodes=nodes,
                             evaluate=False)
Пример #3
0
def eval_multiple_walks(sbm,
                        w_length=50,
                        n_classes=2,
                        num_walks=25,
                        p=1,
                        q=1,
                        iterations=5):
    '''
    Return the bhamidi, purity, and agreement scores after sampling node2vec walks for the specified number of iterations

    Parameters
    ------------
    sbm : stochastic block matrix from which the graph object should be defined
          (its `A` attribute is passed as the adjacency matrix and `memberships` as the labels)
    w_length : length of node2vec walk
    n_classes : number of classes; also number of clusters because we will use kmeans to evaluate
    num_walks : number of node2vec walks to generate per node in graph object
    p : Return parameter; lower values result in more "local" walks
    q : In-out parameter; lower values result in more Depth-First Search behaving walks
    iterations : number of times the node2vec walks should be regenerated, understanding that the node embeddings must
                    be recalculated every time the walks are regenerated

    Output
    ------
    bhamidi_scores : list with one entry per iteration
    purity_scores : list with one entry per iteration
    agreement_scores : list with one entry per iteration
    '''
    print('At eval_multiple_walks(...)')
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended clock for measuring elapsed time.
    start_time = time.perf_counter()
    bhamidi_scores = []
    purity_scores = []
    agreement_scores = []
    for _ in range(iterations):
        # A fresh node2vec object per iteration regenerates the walks and
        # recomputes the embeddings and their evaluation scores.
        node_embeds = node2vec(
            G=None,
            Adj_M=sbm.A,
            labels=sbm.memberships,
            n_classes=n_classes,
            evaluate=True,
            p=p,
            q=q,
            walk_length=w_length,
            num_walks=num_walks,
            window_size=10,
            embedding_size=128,
            num_iter=4,
            min_count=0,
            sg=1,
            workers=8,
        )
        bhamidi_scores.append(node_embeds.bhamidi_score)
        purity_scores.append(node_embeds.purity_score)
        agreement_scores.append(node_embeds.agreement_score)
    print("Time elapsed while running 'eval_multiple_walks' function: {0}".
          format(round(time.perf_counter() - start_time, 8)))
    # all three are of type : list
    return bhamidi_scores, purity_scores, agreement_scores
Пример #4
0
def eval_multiple_walks(sbm, w_length=50, n_classes=2, num_walks=25, p=1, q=1, iterations=5):
    '''
    Fit node2vec on the given block model `iterations` times and gather the
    bhamidi, purity, and agreement scores reported by each fit.

    Parameters
    ------------
    sbm : stochastic block model object; `sbm.A` is handed to node2vec as the adjacency
          matrix and `sbm.memberships` as the labels
    w_length : length of each node2vec walk
    n_classes : number of classes; also number of clusters because we will use kmeans to evaluate
    num_walks : number of node2vec walks to generate per node
    p : Return parameter; lower values result in more "local" walks
    q : In-out parameter; lower values result in more Depth-First Search behaving walks
    iterations : how many independent node2vec fits to run

    Output
    ------
    A 3-tuple of lists, each holding one entry per iteration:
    (bhamidi_scores, purity_scores, agreement_scores)
    '''
    # One accumulator per score, in return order: bhamidi, purity, agreement.
    collected = ([], [], [])
    for _ in range(iterations):
        fitted = node2vec(
            G=None,
            Adj_M=sbm.A,
            labels=sbm.memberships,
            n_classes=n_classes,
            evaluate=True,
            p=p,
            q=q,
            walk_length=w_length,
            num_walks=num_walks,
            window_size=10,
            embedding_size=128,
            num_iter=4,
            min_count=0,
            sg=1,
            workers=8,
        )
        per_run = (
            fitted.bhamidi_score,
            fitted.purity_score,
            fitted.agreement_score,
        )
        for bucket, value in zip(collected, per_run):
            bucket.append(value)
    return collected
Пример #5
0
def main():
    '''
    Load a saved "mini" node2vec model for the SCOTUS network and run an
    evaluated clustering against the nodes' issue areas.

    Command-line arguments (read from sys.argv)
    ------------
    sys.argv[1] : p, parsed as float; used in the model file name and stored on the model
    sys.argv[2] : q, parsed as float; used in the model file name and stored on the model

    Raises
    ------
    ValueError : if p and q are not both given, or are not numeric
    '''
    import sys

    # Validate before the heavy third-party imports so a bad call fails fast.
    if len(sys.argv) < 3:
        raise ValueError(
            "Must specify both p and q for node2vec (1st & 2nd args after filepath)"
        )
    p = float(sys.argv[1])
    q = float(sys.argv[2])

    from gensim import models

    from node2vec.node2vec import node2vec
    from utilities import load_scotus_network

    # The file name must include both p and q; the previous "{0}_{0}" pattern
    # ignored q and always loaded the p_p model.
    n2v_model = node2vec(model=models.Word2Vec.load(
        "../data/scotus_n2v_{0}_{1}_mini.node2vec".format(p, q)))
    n2v_model.p = p
    n2v_model.q = q
    G, issue_areas = load_scotus_network(file_path="../data/scotus_network.graphml")

    # Number of issue-area buckets created up front (ids 0..IA-1). Empty
    # buckets still count towards n_clusters below.
    IA = 15

    ia_to_name = {i: [] for i in range(IA)}
    name_to_ia = {}
    # G.nodes(data=True) works on both NetworkX 1.x and 2.x; nodes_iter()
    # was removed in NetworkX 2.0.
    for n, d in G.nodes(data=True):
        ia = int(float(d['issueArea']))
        ia_to_name[ia].append(n)
        name_to_ia[n] = ia

    # Sanity report: the per-issue-area counts must add up to the node count.
    total = 0
    for k, names in ia_to_name.items():
        print('Key : ',k," "*(3-len(str(k))),'Length(list at key): ',len(names))
        total += len(names)

    print('Total: ',total)

    print('Number of keys of name_to_ia: ',len(name_to_ia.keys()))
    print('The above two numbers should be equal.')

    n2v_model.run_clustering(n_clusters=len(ia_to_name),labels_dict=name_to_ia,evaluate=True)
def main():
    '''
    Run node2vec on stochastic-block-model graphs over a sweep of out-class
    probabilities and save the true and predicted labels of every run.

    Command-line arguments (read from sys.argv, exactly seven)
    ------------
    p, q : node2vec parameters, parsed as floats and rounded to ROUND_TO decimals
    walk_length, num_walk, embedding_size, num_iter : node2vec settings, parsed as ints
    R : number of times the whole out-class-probability sweep is repeated

    Raises
    ------
    ValueError : if the argument count is wrong, an argument is not numeric,
                 or a result file for these parameters already exists
    '''
    ROUND_TO = 2
    import sys
    import os
    import warnings

    # Validate up front (and with a real exception rather than an assert,
    # which is stripped under -O) so a bad call reports the expected usage.
    if len(sys.argv) != 8:
        raise ValueError(
            'Parameters to script must be: p, q, walk_length, num_walk, embedding_size, num_iter, R'
        )
    parameters = [round(float(arg), ROUND_TO) for arg in sys.argv[1:3]
                  ] + [int(arg) for arg in sys.argv[3:-1]]
    R = int(sys.argv[-1])
    print(
        'Parameters are: p={0}, q={1}, walk_length={2}, num_walk={3}, embedding_size={4}, num_iter={5}'
        .format(*parameters))
    print('R equals: {0}'.format(R))

    from utilities import make_block_probs, save_current_status
    from sbm.sbm import stochastic_block_model
    from node2vec.node2vec import node2vec
    warnings.filterwarnings('ignore')
    warnings.simplefilter('ignore')

    # to be iterated over: 0.01, 0.02, ..., 0.79
    out_class_probs = [round(i * 0.01, ROUND_TO) for i in range(1, 80)]

    ## PARAMETERS
    p, q, walk_length, num_walk, embedding_size, num_iter = parameters
    num_nodes = 400
    n_classes = 2
    in_class_prob = 0.8
    # iterations and samples are only recorded in the output, not used below.
    iterations = 1
    samples = 1

    # Key order is kept as before so the saved output layout does not change.
    data_to_save = {
        'p': p,
        'q': q,
        'walk_length': walk_length,
        'num_walk': num_walk,
        'embedding_size': embedding_size,
        'num_iter': num_iter,
        'num_nodes': num_nodes,
        'n_classes': n_classes,
        'in_class_prob': in_class_prob,
        'iterations': iterations,
        'samples': samples,
        # store the labels from the sweep loop below
        'data': [],
    }

    # for saving purposes; where we store data: first existing candidate,
    # falling back to '../data/' when none exists
    data_dir = next(
        (candidate for candidate in ('data/', '../data/', '../../data/')
         if os.path.isdir(candidate)),
        '../data/',
    )

    # dont do the same thing twice
    file_name = 'p{0}_q{1}_wl{2}_nw{3}_es{4}_ni{5}_R{6}.json'.format(
        p, q, walk_length, num_walk, embedding_size, num_iter, R)
    if os.path.isfile(data_dir + file_name):
        # R is supplied as the seventh field; the old message referenced {6}
        # with only six arguments and crashed with IndexError.
        print(
            'SIMULATION HAS BEEN PERFORMED. SKIPPING: p{0}_q{1}_wl{2}_nw{3}_es{4}_ni{5}_R{6}'
            .format(*(parameters + [R])))
        raise ValueError(
            'Simulation has been run before. Change file_name if wanting to run again.'
        )

    for _ in range(R):
        tmp_statuses = []
        for out_class_prob in out_class_probs:
            block_probs = make_block_probs(in_class_prob=in_class_prob,
                                           out_class_prob=out_class_prob)

            sbm = stochastic_block_model(size=num_nodes,
                                         block_probabilities=block_probs,
                                         num_classes=n_classes)
            node_embeds = node2vec(G=None,
                                   Adj_M=sbm.A,
                                   labels=sbm.memberships,
                                   n_classes=n_classes,
                                   evaluate=True,
                                   p=p,
                                   q=q,
                                   walk_length=walk_length,
                                   num_walks=num_walk,
                                   window_size=5,
                                   embedding_size=embedding_size,
                                   num_iter=num_iter,
                                   min_count=0,
                                   sg=1,
                                   workers=8)
            tmp_statuses.append({
                'out_class_prob': out_class_prob,
                'true_labels': node_embeds.labels,
                'predicted_labels': node_embeds.predicted_labels,
            })
        # save data from this repetition of the sweep
        data_to_save['data'].append(tmp_statuses)

    # save labels
    # NOTE(review): data_dir is not passed here; presumably
    # save_current_status resolves the output directory itself. Confirm it
    # writes to the same place the existence check above looks at.
    save_current_status(file_name=file_name, data=data_to_save)