示例#1
0
def process(args):
    """Generate DeepWalk random walks to disk and train a skip-gram model.

    Loads the graph named by ``args.input`` (adjacency-list or edge-list
    format), writes the walk corpus to ``args.output``, then trains the
    custom Word2Vec implementation on the walk file.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        # FIX: this branch previously called load_adjacencylist, which
        # mis-parses edge-list input; use the edge-list loader instead.
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception(
            "unknown file format: '%s'. valid formats: 'adjlist', 'edgelist'" %
            args.format)

    print("number of nodes: {}".format(len(G.nodes())))

    # Every node is the start of args.number_walks walks.
    num_walks = len(G.nodes()) * args.number_walks
    print("number of walks: {}".format(num_walks))

    # Total corpus size: number of walks times steps per walk.
    data_size = num_walks * args.walk_length
    print("data size (walk*length): {}".format(data_size))

    print("walking...")
    walk_file = walks.write_walks_to_disk(G,
                                          args.output,
                                          num_paths=args.number_walks,
                                          path_length=args.walk_length,
                                          alpha=0,
                                          rand=random.Random(args.seed))
    model = Word2Vec(walk_file,
                     args.output,
                     emb_dimension=args.representation_size,
                     window_size=args.window_size,
                     min_count=0)
    print("Training...")

    model.skip_gram_train()
示例#2
0
def process(args):
    """Build DeepWalk embeddings for the ratings graph and report metrics.

    Trains Word2Vec on random walks over ``out.adj``, predicts a rating for
    every (user, movie) pair in ``./data/test_user_ratings.dat``, and prints
    MSE, accuracy and the confusion matrix.
    """
    # Graph built from the training records; maps record ids to graph nodes.
    nodedict = graph.records_to_graph()

    # DeepWalk: random-walk corpus over the adjacency list, then skip-gram.
    G = graph.load_adjacencylist("out.adj", undirected=True)
    corpus = graph.build_deepwalk_corpus(G,
                                         num_paths=args.number_walks,
                                         path_length=args.walk_length,
                                         alpha=0,
                                         rand=random.Random(args.seed))
    model = Word2Vec(corpus,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)

    # Held-out ratings: each data line is user \t movie \t rating, with a
    # header line that must be skipped.
    with open("./data/test_user_ratings.dat") as fin:
        fin.readline()
        groundtruth = []
        for line in fin:
            groundtruth.append(line.strip().split("\t")[:3])

    # True ratings (rounded to ints) versus model predictions.
    tr = [int(round(float(row[2]))) for row in groundtruth]
    pr = [predict_rating(model, nodedict, "u" + row[0], "m" + row[1])
          for row in groundtruth]

    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
def process(args):
    """Build DeepWalk embeddings for the ratings graph and report metrics.

    Trains Word2Vec on random walks over ``out.adj``, predicts ratings for
    the test set and prints MSE, accuracy and the confusion matrix.
    """
    # Create a graph from the training set
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # FIX: honour args.seed — the walk generator previously used a
    # hard-coded random.Random(0), silently ignoring the --seed option.
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length,
                                       alpha=0, rand=random.Random(args.seed))
    # FIX: Python 3 print()/next() — the original mixed Python 2 print
    # statements and file.next() into otherwise Python 3 style code.
    print(len(walk))
    model = Word2Vec(walk, size=args.representation_size,
                     window=args.window_size, min_count=0,
                     workers=args.workers)
    print(model)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the header line
        groundtruth = [line.strip().split("\t")[:3] for line in fin]    # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
          for g in groundtruth]
    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
示例#4
0
def process(args):
    """Load a graph, generate DeepWalk random walks, and train embeddings.

    Corpora smaller than ``args.max_memory_data_size`` are built in memory
    and fed straight to Word2Vec; larger corpora are streamed to disk and
    trained through the Skipgram wrapper.  The trained vectors are saved to
    ``args.output`` in word2vec text format.
    """
    # Select the graph loader matching the declared input format.
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # Every node starts args.number_walks random walks.
    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    # Total corpus size in tokens: walks times steps per walk.
    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # In-memory path: build the whole corpus, then train skip-gram
        # (sg=1) with hierarchical softmax (hs=1).
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size,
                                                                                                             args.max_memory_data_size))
        print("Walking...")

        # Disk-backed path: serialize walks to files under args.output.
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            # Exact vertex frequencies counted from the serialized walk files.
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        # Stream the on-disk corpus through the Skipgram wrapper with the
        # precomputed vocabulary counts.
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
示例#5
0
def process(args):
  """Load a graph, generate DeepWalk random walks, and train embeddings.

  Corpora smaller than ``args.max_memory_data_size`` are built in memory and
  fed to Word2Vec; larger corpora are streamed to disk and trained through
  the Skipgram wrapper.  Vectors are saved to ``args.output`` in word2vec
  format.
  """
  # Select the graph loader matching the declared input format.
  if args.format == "adjlist":
      G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
      G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
      G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
      raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

#   G = graphConstruction.buildGraphAPA()


  print("Number of nodes: {}".format(len(G.nodes())))

  # Every node starts args.number_walks random walks.
  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  # Total corpus size in tokens: walks times steps per walk.
  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    # In-memory path: build the whole corpus, then train Word2Vec directly.
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    # Disk-backed path: serialize walks to files under args.output.
    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      # Exact vertex frequencies counted from the serialized walk files.
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    # Stream the on-disk corpus through the Skipgram wrapper.
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  # NOTE(review): older gensim API (model-level save_word2vec_format, no
  # .wv) — confirm the pinned gensim version supports this call.
  model.save_word2vec_format(args.output)
示例#6
0
def process(args):
    """Load a graph, write DeepWalk random walks to disk, and time the walk.

    Only the walk-generation phase is executed here; training and vertex
    frequency counting are intentionally left out.
    """
    # Select the graph loader matching the declared input format.
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    node_count = len(G.nodes())
    print("Number of nodes: {}".format(node_count))

    # Every node starts args.number_walks random walks.
    num_walks = node_count * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")

    # Time only the serialization of walks to disk.
    start = time.time()
    walks_filebase = args.output + ".txt"
    walk_files = serialized_walks.write_walks_to_disk(
        G,
        walks_filebase,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
        num_workers=args.workers)
    end = time.time()

    exe_time = end - start
    print("--------- walking time: {:.5f} -----------".format(exe_time))
示例#7
0
def process(args):
    """Build DeepWalk embeddings for the ratings graph and report metrics.

    Trains Word2Vec on random walks over ``out.adj``, predicts ratings for
    the test set and prints MSE, accuracy and the confusion matrix.
    """
    # Create a graph from the training set
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # FIX: honour args.seed — the walk generator previously used a
    # hard-coded random.Random(0), silently ignoring the --seed option.
    walk = graph.build_deepwalk_corpus(G,
                                       args.number_walks,
                                       args.walk_length,
                                       alpha=0,
                                       rand=random.Random(args.seed))
    # FIX: Python 3 print()/next() — the original mixed Python 2 print
    # statements and file.next() into otherwise Python 3 style code.
    print(len(walk))
    model = Word2Vec(walk,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)
    print(model)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the header line
        groundtruth = [line.strip().split("\t")[:3]
                       for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [
        predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
        for g in groundtruth
    ]

    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
示例#8
0
def process(args):
    """Run (weighted) DeepWalk and save the embeddings to ``args.output``.

    Supports several early-exit utility modes: computing a heuristic ``wrb``
    value, or just dumping the weighted graph to ``wgraph.out``.  Otherwise
    generates random walks (in memory or on disk depending on
    ``args.max_memory_data_size``) and trains skip-gram embeddings.
    """
    # Select the graph loader matching the declared input format.  The
    # edge-list loader additionally receives sensitive-attribute and
    # train/test link-split files.
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected, attr_file_name=args.sensitive_attr_file, 
                test_links_ratio=args.test_links, test_links_file=args.test_links_file,
                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    # Utility mode: compute and print the heuristic wrb value, then stop.
    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return


    # Apply edge re-weighting unless the method is 'unweighted'.
    if (args.weighted is not None) and (args.weighted != 'unweighted'):
      G = graph.set_weights(G, args.weighted)

    # Utility mode: dump the (possibly weighted) graph as
    # "<src> <dst> <weight>" lines and stop.
    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                # Uniform transition probability 1/degree per neighbour.
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1/s) + '\n')
            elif args.weighted.startswith('random_walk'):
                # Use the precomputed per-edge weights stored on the graph.
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None




    # Every node starts args.number_walks random walks.
    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # In-memory path: build the corpus, then train skip-gram (sg=1)
        # with hierarchical softmax (hs=1).
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        # Disk-backed path: serialize walks to files under args.output.
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                             path_length=args.walk_length, p_modified=args.pmodified,
                                             alpha=0, rand=random.Random(args.seed),
                                             num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
          # Exact vertex frequencies counted from the serialized walk files.
          vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
          # use degree distribution for frequency in tree
          vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        # Stream the on-disk corpus through the Skipgram wrapper.
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
示例#9
0
# Compare two adjacency-list graphs: print each graph's binary adjacency
# matrix and its diagonal degree matrix.
import argparse  # FIX: argparse was used below but never imported

import numpy as np
from numpy import linalg as LA

parser = argparse.ArgumentParser(description='Criar bolas por vértices.')
parser.add_argument('--grafoModel',
                    nargs='?',
                    required=True,
                    help='Input graph file')
parser.add_argument('--grafoData',
                    nargs='?',
                    required=True,
                    help='Input graph file')

args = parser.parse_args()

# Load both graphs as undirected adjacency lists.
Gm = graph.load_adjacencylist(args.grafoModel, undirected=True)
Gd = graph.load_adjacencylist(args.grafoData, undirected=True)

# Binary adjacency matrices and the diagonal degree matrix of each.
Am = editDistance.binaryMatrix(Gm)
Ad = editDistance.binaryMatrix(Gd)
Dm = editDistance.diagonalDegreeMatrixFromBinaryMatrix(Am)
Dd = editDistance.diagonalDegreeMatrixFromBinaryMatrix(Ad)
# FIX: Python 3 print() calls (these were Python 2 print statements).
print("Matriz binária (Am)")
print(Am)
print("Matriz binária (Ad)")
print(Ad)
print("DiagonalDegreeMatrix (Dm)")
print(Dm)
print("DiagonalDegreeMatrix (Dd)")
print(Dd)
示例#10
0
# Compare two graphs: load both adjacency lists, convert them to dicts and
# build/print per-vertex lists for each.
import graph
import algoritmos
import calculos

import argparse

# Command line: two adjacency-list graph files to process.
parser = argparse.ArgumentParser(description='Executa testes arvores.')
parser.add_argument('--grafo1', nargs='?', required=True,
                      help='Input graph file')
parser.add_argument('--grafo2', nargs='?', required=True,
                      help='Input graph file')

args = parser.parse_args()

# Load both graphs as undirected adjacency lists, then convert to dicts.
print (" - Carregando matriz de adjacência para Grafo (na memória)...")
G1 = graph.load_adjacencylist(args.grafo1,undirected=True)
print (" - Carregando matriz de adjacência para Grafo (na memória)...")
G2 = graph.load_adjacencylist(args.grafo2,undirected=True)
print (" - Convertendo grafo para Dict (na memória)...")
dictG1 = G1.gToDict()
dictG2 = G2.gToDict()

# Build per-vertex lists for each graph.
# NOTE(review): the meaning of geraListas' second argument (1) is not
# visible here — confirm against calculos.geraListas.
print ("Criando listas...")
l1,v1 = calculos.geraListas(dictG1,1)
l2,v2 = calculos.geraListas(dictG2,1)


print ("Listas v1:")
calculos.printDataVertice(l1)

print ("Listas v2:")
# NOTE(review): snippet appears truncated here — the matching
# printDataVertice(l2) call is not visible in this chunk.
示例#11
0
    #resultadosConsolidados = []
    #for r in resultados:
    #	resultadosConsolidados.append({'label': r[1], 'arvore': r[0]})

    #return resultadosConsolidados


# Load an adjacency-list graph, convert it to a dict and build its trees,
# reporting the elapsed time in minutes.
rand = random.Random()

parser = argparse.ArgumentParser(description='Criar bolas por vértices.')
parser.add_argument('--grafo',
                    nargs='?',
                    required=True,
                    help='Input graph file')

args = parser.parse_args()

# FIX: Python 3 print() calls — the script already used print() on its
# final line, so the Python 2 print statements above it could not run
# under the same interpreter.
print(" - Carregando matriz de adjacência para Grafo (na memória)...")
G = graph.load_adjacencylist(args.grafo, undirected=True)
print(" - Convertendo grafo para Dict (na memória)...")
dictG = G.gToDict()

print(" - Gerando árvore...")
t0 = time()

montaArvores(dictG)

t1 = time()

print('Árvores geradas em {}m'.format((t1 - t0) / 60))
示例#12
0
File: __main__.py  Project: shaoyx/kbc
def process(args):
    """Run DeepWalk with an optional node-exclusion list and save embeddings.

    Nodes listed (one integer id per line) in the file named by
    ``args.excludlist`` are excluded from walk generation.  Small corpora
    are built in memory; corpora exceeding ``args.max_memory_data_size``
    are streamed to disk.  Vectors are saved to ``args.output``.
    """
    # Select the graph loader matching the declared input format.
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # Read the exclusion list if the file exists: one integer node id per
    # line, de-duplicated into a set.
    if (os.path.isfile(format(args.excludlist))):
        #num_exlud = number_excluded_nodes(args.excludlist)
        list_exclud = open(args.excludlist).readlines()
        list_exclud = [int(x) for x in list_exclud]
        list_exclud = set(list_exclud)
        num_exlud = len(set(list_exclud))
    else:
        num_exlud = 0
        list_exclud = []
    if (num_exlud > 0):
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    # Walks start only from non-excluded nodes.
    #num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # In-memory path: build the corpus (honouring the exclusion list),
        # then train Word2Vec directly.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        # Disk-backed path: serialize walks (honouring the exclusion list).
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            list_exclud,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            # Exact vertex frequencies counted from the serialized walks.
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        # Stream the on-disk corpus through the Skipgram wrapper.
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)

    model.wv.save_word2vec_format(args.output)
示例#13
0
# Load an adjacency-list graph and print its adjacency list.
import graph
import algoritmos
import argparse
import numpy as np
from numpy import linalg as LA

parser = argparse.ArgumentParser(description='Criar bolas por vértices.')
parser.add_argument('--input',
                    nargs='?',
                    required=True,
                    help='Input graph file')
parser.add_argument('--deepth',
                    nargs='?',
                    required=True,
                    type=int,
                    help='Deepth')

args = parser.parse_args()

G = graph.load_adjacencylist(args.input, undirected=True)

# FIX: Python 3 print() call (was a Python 2 print statement).
# NOTE(review): this also prints printAdjList()'s return value, exactly as
# the original Python 2 statement did — confirm whether the return value
# (possibly None) is meant to be shown.
print(G.printAdjList())