예제 #1
0
파일: input_data.py 프로젝트: SongFGH/MLSG
 def read_graph(self):
     """Build an undirected NetworkX graph from 'blogcatalog.mat'.

     The 'network' variable is loaded through ``graph.load_matfile``. When the
     loaded object is a sparse matrix, every stored (row, col) coordinate
     becomes an edge; the stored values themselves are ignored. Every edge
     ends up with ``weight`` set to 1. A non-sparse result yields an empty
     graph.

     Returns:
         The populated ``nx.Graph``.
     """
     print("read_graph\n")
     G = nx.Graph()
     adjacency = graph.load_matfile('blogcatalog.mat', variable_name='network')
     if issparse(adjacency):
         coo = adjacency.tocoo()
         # Only the coordinates matter; the matrix values are not used.
         G.add_edges_from(zip(coo.row, coo.col))
     # Give every edge the same unit weight.
     for u, v in G.edges():
         G[u][v]['weight'] = 1
     return G
예제 #2
0
def process(args):
    """Load a graph, generate random walks, train embeddings and save them.

    The graph is read with the ``graph`` loader that matches ``args.format``
    ('adjlist', 'edgelist' or 'mat'). The corpus size is estimated as
    nodes * ``args.number_walks`` * ``args.walk_length``:

    * below ``args.max_memory_data_size`` the walks are built in memory and
      passed to ``Word2Vec`` (``sg=1``, ``hs=1``);
    * otherwise the walks are written to disk with the file base
      ``args.output + ".walks"`` and streamed into ``Skipgram``.

    The vectors are finally written to ``args.output`` in text word2vec
    format.

    Raises:
        Exception: if ``args.format`` is not one of the supported formats.
    """
    # Loaders are wrapped in lambdas so only the selected one is evaluated.
    loaders = {
        "adjlist": lambda: graph.load_adjacencylist(args.input, undirected=args.undirected),
        "edgelist": lambda: graph.load_edgelist(args.input, undirected=args.undirected),
        "mat": lambda: graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                                          undirected=args.undirected),
    }
    load = loaders.get(args.format)
    if load is None:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
    G = load()

    node_total = len(G.nodes())
    print("Number of nodes: {}".format(node_total))

    num_walks = node_total * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    fits_in_memory = data_size < args.max_memory_data_size
    if not fits_in_memory:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
            data_size, args.max_memory_data_size))
        print("Walking...")
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            args.output + ".walks",
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers,
        )

        print("Counting vertex frequency...")
        if args.vertex_freq_degree:
            # Use the degree distribution as the frequency estimate.
            # NOTE(review): iterkeys() is the Python 2 dict API - confirm the
            # Graph type provides it on the interpreter in use.
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.WalksCorpus(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            trim_rule=None,
            workers=args.workers,
        )
    else:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
        )
        print("Training...")
        model = Word2Vec(
            walks,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            sg=1,
            hs=1,
            workers=args.workers,
        )

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
예제 #3
0
def process(args):
    """Load a graph, generate random walks, train embeddings and save them.

    ``args.format`` selects the ``graph`` loader ('adjlist', 'edgelist' or
    'mat'). If nodes * ``args.number_walks`` * ``args.walk_length`` is below
    ``args.max_memory_data_size`` the walks are built in memory and passed to
    ``Word2Vec``; otherwise they are written to disk with the file base
    ``args.output + ".walks"`` and fed to ``Skipgram`` through
    ``serialized_walks.combine_files_iter``. The model is saved with
    ``model.save_word2vec_format(args.output)``.

    Raises:
        Exception: if ``args.format`` is not one of the supported formats.
    """
    fmt = args.format
    # Reject unsupported formats before touching any loader.
    if fmt not in ("adjlist", "edgelist", "mat"):
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if fmt == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    elif fmt == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)

    node_total = len(G.nodes())
    print("Number of nodes: {}".format(node_total))

    num_walks = node_total * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # Small corpus: keep all walks in memory.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
        )
        print("Training...")
        model = Word2Vec(
            walks,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers,
        )
    else:
        # Large corpus: serialize the walks and stream them back for training.
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        walk_files = serialized_walks.write_walks_to_disk(
            G,
            args.output + ".walks",
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers,
        )

        print("Counting vertex frequency...")
        if args.vertex_freq_degree:
            # Use the degree distribution as the frequency estimate.
            vertex_counts = G.degree(nodes=G.iterkeys())
        else:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers,
        )

    model.save_word2vec_format(args.output)
예제 #4
0
def process(args):
    """Load a graph, write random walks to disk and report the elapsed time.

    ``args.format`` selects the ``graph`` loader ('adjlist', 'edgelist' or
    'mat'). Node count, walk count and corpus size are printed, then
    ``serialized_walks.write_walks_to_disk`` is called with the file base
    ``args.output + ".txt"`` and the wall-clock duration is printed. No model
    is trained in this variant; vertex-frequency counting is disabled.

    Raises:
        Exception: if ``args.format`` is not one of the supported formats.
    """
    # Loaders are wrapped in lambdas so only the selected one is evaluated.
    loaders = {
        "adjlist": lambda: graph.load_adjacencylist(
            args.input, undirected=args.undirected),
        "edgelist": lambda: graph.load_edgelist(
            args.input, undirected=args.undirected),
        "mat": lambda: graph.load_matfile(
            args.input,
            variable_name=args.matfile_variable_name,
            undirected=args.undirected),
    }
    load = loaders.get(args.format)
    if load is None:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)
    G = load()

    node_total = len(G.nodes())
    print("Number of nodes: {}".format(node_total))

    num_walks = node_total * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")

    started = time.time()
    walks_filebase = args.output + ".txt"
    serialized_walks.write_walks_to_disk(
        G,
        walks_filebase,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
        num_workers=args.workers)
    elapsed = time.time() - started

    print("--------- walking time: {:.5f} -----------".format(elapsed))
예제 #5
0
def process(args):
    """Load a graph, optionally weight or dump it, then train and save embeddings.

    Steps, in order:

    1. Load the graph with the ``graph`` loader matching ``args.format``
       ('adjlist', 'edgelist' or 'mat'); the edgelist loader additionally
       receives the sensitive-attribute and train/test link options.
    2. If ``args.heuristic_wrb_for_wbr`` is set, print the result of
       ``graph.compute_heuristic_wrb`` and return.
    3. Unless ``args.weighted`` is None or 'unweighted', apply
       ``graph.set_weights``.
    4. If ``args.just_write_graph`` is set, write "v u weight" lines to
       'wgraph.out' and return None.
    5. Otherwise generate walks (in memory, or on disk when the corpus size
       reaches ``args.max_memory_data_size``), train the model and save it
       to ``args.output`` in word2vec format.

    Raises:
        Exception: if ``args.format`` is unknown, or if ``just_write_graph``
            is requested for a weighting method that is neither 'unweighted'
            nor starts with 'random_walk'.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected,
                                attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links,
                                test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    if (args.weighted is not None) and (args.weighted != 'unweighted'):
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        # A missing weighting option is handled like 'unweighted', matching
        # the set_weights check above; None has no .startswith().
        weighting = 'unweighted' if args.weighted is None else args.weighted
        uniform = weighting == 'unweighted'
        # Validate before opening so an unsupported mode does not truncate
        # an existing wgraph.out.
        if not uniform and not weighting.startswith('random_walk'):
            raise Exception('just-write-graph is not supported for this weighting method')

        with open('wgraph.out', 'w') as fout:
            for v in G:
                if uniform:
                    degree = len(G[v])
                    for u in G[v]:
                        # 1.0 forces true division so the weight is not
                        # truncated to 0 under Python 2 integer division.
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1.0 / degree) + '\n')
                else:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
        return None

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, p_modified=args.pmodified,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # Use the degree distribution as the frequency estimate.
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
예제 #6
0
def main():
    """Command-line entry point: run link-prediction validation and save results.

    Parses ``--input``, ``--output``, ``--name``, ``--round``, ``--u``,
    ``--v`` and ``--all``, loads the graph with ``graph.load_matfile``, makes
    it undirected and consistent, then calls ``evaluatePrediction`` once per
    round with a train ratio of 0.8 and 1024 sampled nodes.

    Output written to ``args.output``:

    * without ``--all``: one line per round holding the MAP followed by the
      curve values of method ``--name``, then a final entry with the mean and
      standard deviation of the MAP over all rounds;
    * with ``--all``: for every round, one line per method ('manela',
      'deepwalk', 'node2vec') holding MAP, AUC and the curve values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="input graph")
    parser.add_argument('--output', help="output result")
    parser.add_argument('--name', help="name of the method")
    parser.add_argument('--round', type=int, default=1, help="round")
    parser.add_argument('--u', type=float, help='hyper parameter 1')
    parser.add_argument('--v', type=float, help='hyper parameter 2')
    parser.add_argument('--all', action='store_true', help='validate all')

    args = parser.parse_args()

    num_shuffle = args.round
    ori_graph = graph.load_matfile(file_=args.input)
    ori_graph.make_undirected()
    ori_graph.make_consistent()
    train_ratio = 0.8
    sample_node = 1024
    map_round = [None] * num_shuffle
    curve_round = [None] * num_shuffle
    auc_round = [None] * num_shuffle
    print('start validating link prediction...')

    # A context manager closes the file on every path and lets a failed
    # open() surface its own error instead of an unbound-variable error
    # raised from a finally clause.
    with open(args.output, 'w') as out:
        if not args.all:
            for round_id in range(num_shuffle):
                maps, curves, aucs = evaluatePrediction(
                    ori_graph, [args.name], train_ratio, sample_node,
                    [args.u], [args.v])
                # Only one method was evaluated, so keep its single entry.
                map_round[round_id] = maps[0]
                curve_round[round_id] = curves[0]
                auc_round[round_id] = aucs[0]
                out.write(str(map_round[round_id]))
                print('MAP:{} AUC:{}'.format(map_round[round_id],
                                             auc_round[round_id]))
                for point in curve_round[round_id]:
                    out.write(" {}".format(point))
                out.write("\n")
            out.write(
                str(numpy.mean(map_round)) + ' ' + str(numpy.std(map_round)))
        else:
            # Fixed hyper-parameters, one entry per compared method.
            uargs = [190, 1, 4]
            vargs = [0.4, 1, 4]
            for round_id in range(num_shuffle):
                (map_round[round_id], curve_round[round_id],
                 auc_round[round_id]) = evaluatePrediction(
                     ori_graph, ['manela', 'deepwalk', 'node2vec'],
                     train_ratio, sample_node, uargs, vargs)
                for m, a, curve in zip(map_round[round_id],
                                       auc_round[round_id],
                                       curve_round[round_id]):
                    out.write(str(m))
                    out.write(" " + str(a))
                    for point in curve:
                        out.write(" {}".format(point))
                    out.write("\n")

    print("saved to file: {}".format(args.output))
예제 #7
0
파일: __main__.py 프로젝트: hanzh015/MANELA
def main():
    """Command-line entry point of the pre_experiment program.

    Parses the training options, loads the graph at ``--path`` with
    ``graph.load_matfile`` and wraps it in a ``ds.Distributed`` instance.
    With ``-d`` the instance uses ``defaultArgs()``; otherwise every parsed
    option is forwarded to ``setArgs``. The instance is then processed and
    saved via ``process()`` and ``save2File()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        help="whether to use default values",
                        action="store_true")
    parser.add_argument('-c',
                        help="continued training path",
                        action="store_true")
    parser.add_argument('--embpath',
                        help="the embedding path for continued training")
    parser.add_argument('--path', help="path of the graph")
    parser.add_argument('--output', help="output path of the embeddings")
    parser.add_argument('--dimension',
                        type=int,
                        default=128,
                        help="dimension of embeddings")
    parser.add_argument('--updates', type=int, help="number of updates")
    parser.add_argument('--alpha',
                        type=float,
                        default=0.025,
                        help="initial learning rate")
    parser.add_argument('--negative',
                        type=int,
                        default=5,
                        help="negative sampling number")
    parser.add_argument('--neglen',
                        type=int,
                        default=10**8,
                        help="the maximum number used for negative sampling")
    parser.add_argument(
        '--ratio',
        type=float,
        help="the ratio of numbers of 1st and 2nd degree nodes")
    parser.add_argument(
        '--fmax',
        default=1,
        type=float,
        help=
        "the maximum ratio of # of 1st deg nodes participating updates to the total # of 1st deg nodes, when ratio=0.5"
    )
    parser.add_argument('--window',
                        type=int,
                        help="the update window in poisson update mode")
    parser.add_argument(
        '--timeslot',
        default=1000,
        type=int,
        help="the number of timeslots in order to simulate poisson process")
    parser.add_argument('--seed',
                        default=1,
                        type=int,
                        help="the random seed of a Distributed instance")

    args = parser.parse_args()

    G = graph.load_matfile(file_=args.path)
    d = ds.Distributed(G)

    if args.d:
        print('using default settings')
        d.defaultArgs()
    else:
        d.setArgs(alpha=args.alpha,
                  numUpdates=args.updates,
                  numNegSampling=args.negative,
                  # Honour --neglen; its default equals the 10**8 that was
                  # previously hard-coded here.
                  maxNeglen=args.neglen,
                  representSize=args.dimension,
                  outputPath=args.output,
                  ratio=args.ratio,
                  fmax=args.fmax,
                  c=args.c,
                  cpath=args.embpath,
                  # Poisson mode is always enabled; there is no CLI switch.
                  poisson=True,
                  window=args.window,
                  timeslot=args.timeslot,
                  seed=args.seed)

    d.process()
    d.save2File()
예제 #8
0
파일: __main__.py 프로젝트: shaoyx/kbc
def _read_excluded_nodes(path):
    """Return the set of integer node ids listed one per line in *path*.

    Blank lines are skipped, and the file is closed before returning.
    """
    with open(path) as handle:
        return {int(line) for line in handle if line.strip()}


def process(args):
    """Load a graph, walk it while skipping excluded nodes, train and save.

    ``args.format`` selects the ``graph`` loader ('adjlist', 'edgelist' or
    'mat'). If ``args.excludlist`` names an existing file, the integer node
    ids it contains are excluded from the walks and from the walk count.
    When (nodes - excluded) * ``args.number_walks`` * ``args.walk_length`` is
    below ``args.max_memory_data_size`` the walks are built in memory and
    passed to ``Word2Vec``; otherwise they are written to disk with the file
    base ``args.output + ".walks"`` and fed to ``Skipgram``. The vectors are
    saved to ``args.output`` in word2vec format.

    Raises:
        Exception: if ``args.format`` is not one of the supported formats.
        ValueError: if the exclusion file has a non-blank line that is not
            an integer.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # format() stringifies the option before the existence test; presumably
    # this lets an unset (None) value simply fail the isfile check.
    if os.path.isfile(format(args.excludlist)):
        list_exclud = _read_excluded_nodes(args.excludlist)
    else:
        list_exclud = []
    num_exlud = len(list_exclud)
    if num_exlud > 0:
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            list_exclud,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # Use the degree distribution as the frequency estimate.
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)

    model.wv.save_word2vec_format(args.output)