Example #1
def process(args):

  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
  print("Walking...")

  walks_filebase = args.output + ".walks"
  walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)
Example #2
def process(args):

    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(
        G,
        walks_filebase,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
        num_workers=args.workers)
Example #3
def process(args):

  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  elif args.format == 'weighted_edgelist':
    G = nx.read_weighted_edgelist(args.input)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    print("Walking...")
    if args.format == 'weighted_edgelist':
      # only changed this part -- shun
      walks = weighted_random_walk.random_walk(G, num_paths=args.number_walks,path_length=args.walk_length, alpha=0)
    else:
      walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
 
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
Example #4
    def load_graph(self, input_address, output_name="g1_out.embeddings", number_walks=10, walk_length=40,
                   max_memory_data_size=1000000000, matfile_variable_name="network", format='adjlist', undirected=True,
                   representation_size=16, workers=1, window_size=5, vertex_freq_degree=False, seed=0):
        if format == "adjlist":
            G = graph.load_adjacencylist(input_address, undirected=undirected)
        elif format == "edgelist":
            G = graph.load_edgelist(input_address, undirected=undirected)
        elif format == "mat":
            G = graph.load_matfile(input_address, variable_name=matfile_variable_name, undirected=undirected)
        else:
            raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % format)

        print("Number of nodes: {}".format(len(G.nodes())))

        num_walks = len(G.nodes()) * number_walks

        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * walk_length

        print("Data size (walks*length): {}".format(data_size))

        if data_size < max_memory_data_size:
            print("Walking...")
            walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                                path_length=walk_length, alpha=0, rand=random.Random(seed))
            print("Training...")
            model = Word2Vec(walks, size=representation_size, window=window_size, min_count=0, sg=1, hs=1,
                             workers=workers)
        else:
            print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
                data_size,
                max_memory_data_size))
            print("Walking...")

            walks_filebase = output_name + ".walks"
            walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
                                                              path_length=walk_length, alpha=0,
                                                              rand=random.Random(seed),
                                                              num_workers=workers)

            print("Counting vertex frequency...")
            if not vertex_freq_degree:
                vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
            else:
                # use degree distribution for frequency in tree
                vertex_counts = G.degree(nodes=G.iterkeys())

            print("Training...")
            walks_corpus = serialized_walks.WalksCorpus(walk_files)
            model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                             size=representation_size,
                             window=window_size, min_count=0, trim_rule=None, workers=workers)

        model.wv.save_word2vec_format("./dataset/{}".format(output_name))
Example #5
def process(args):

    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(
            args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
            data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
Example #6
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    nxG = nx.Graph(G)
    Gmat = nx.adjacency_matrix(nxG).tocoo()

    num_iters = args.num_iters

    tf.logging.info('Train Start: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))

    # generate model
    input_tensor, row_factor, col_factor, model = wals.wals_model(
        Gmat, args.dim // 2, args.reg, args.unobs)

    # factorize matrix
    session = wals.simple_train(model, input_tensor, num_iters)

    tf.logging.info('Train Finish: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))

    # evaluate output factor matrices
    output_row = row_factor.eval(session=session)
    output_col = col_factor.eval(session=session)

    # close the training session now that we've evaluated the output

    session.close()

    embedding = np.concatenate((output_row, output_col), axis=1)
    print(embedding)
    # save trained model to job directory
    np.savetxt(args.output, embedding)
    # log results
    train_rmse = wals.get_rmse(output_row, output_col, Gmat)

    log_info = 'train RMSE = %.2f' % train_rmse
    tf.logging.info(log_info)
    print(log_info)
Example #7
def load_blogcat():
    "Load BlogCatalog labels from mat file."
    matfile = "/home/jimmie/git/deepwalk/example_graphs/blogcatalog.mat"
    print('\nLoading BlogCatalog from mat file: %s' % matfile)
    G = graph.load_matfile(matfile, variable_name='network', undirected=1)
    G = G.stringify()

    mat = loadmat(matfile)
    #A = mat['network']
    #graph = sparse2graph(A)

    labels_matrix = mat['group']
    labels_matrix = labels_matrix.todense().astype(np.int32)
    return G, labels_matrix
Example #8
def load_blogcat():
    "Load BlogCatalog labels from mat file."
    matfile = "/home/jimmie/git/deepwalk/example_graphs/blogcatalog.mat"
    print('\nLoading BlogCatalog from mat file: %s' % matfile)
    G = graph.load_matfile(matfile, variable_name='network', undirected=1)
    G = G.stringify()

    mat = loadmat(matfile)
    #A = mat['network']
    #graph = sparse2graph(A)
    
    labels_matrix = mat['group']
    labels_matrix = labels_matrix.todense().astype(np.int32)
    return G, labels_matrix   
Example #9
def process(args):

  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
Example #10
def process(params, save=True):
    """
    :param params:  传入参数用于训练
    :param save:   是否保存 训练的数据
    :return:
    """
    if params["format"] == "adjlist":
        G = graph.load_adjacencylist(params["input"],
                                     undirected=params["undirected"])
    elif params["format"] == "edgelist":
        G = graph.load_edgelist(params["input"],
                                undirected=params["undirected"])
    elif params["format"] == "mat":
        G = graph.load_matfile(params["input"],
                               variable_name=params["matfile_variable_name"],
                               undirected=params["undirected"])
    else:
        print("输入格式有误,当前输入格式为 %s" % (params["format"]))
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', "
            "mat" % params["format"])
    print("Number of node :{}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * params["number_walks"]

    print("Number of walks:{}".format(num_walks))

    data_size = num_walks * params["walk_length"]

    print("Data size (walks*length):{}".format(data_size))

    if data_size < params["max_memory_data_size"]:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G,
            num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)))

        print("Training...")
        model = Word2Vec(walks,
                         size=params.get("representation_size", 64),
                         window=params.get("window_siz", 5),
                         min_count=params.get("min_count", 0),
                         sg=params.get("sg", 1),
                         hs=params.get("hs", 1),
                         workers=params.get("workers", 1))
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, params.get("max_memory_data_size")))

        print("Walking...")

        walks_filebase = params["output"] + ".walks"
        walks_files = wk.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)),
            num_workers=params.get("workers", 1))

        print("Counting vertex frequecy...")  # 统计节点频次

        if params["vertex_freq_degree"]:
            vertex_counts = wk.count_textfiles(walks_files, params["workers"])

        else:
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")

        walks_corpus = wk.WalksCorpus(walks_files)  # corpus of walks

        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=params.get("representation_size"),
                         window=params.get("windows_size", 80),
                         min_count=params.get("min_count", 0),
                         trim_rule=params.get("trim_rule", None),
                         workers=params.get("workers", 8))
    if save:
        model.wv.save_word2vec_format(params["output"])  # save the trained vectors
    else:
        models = model.wv.load_word2vec_format(params["output"])  # load the saved vectors
        return models
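
# A hypothetical call of the function above (the dict keys mirror the ones the
# function reads; the values are only illustrative):
#
#   params = {"format": "adjlist", "input": "karate.adjlist", "undirected": True,
#             "number_walks": 10, "walk_length": 40,
#             "max_memory_data_size": 1000000000, "output": "karate.embeddings",
#             "vertex_freq_degree": False, "workers": 1}
#   process(params, save=True)
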
def process(args):
    """Section1:
      deepwalk的输入可以是三种格式的数据,因此针对这三种格式的数据分别构建图类对象G
      我们这里主要关心的是karae.adjlist和p2p-Gnutella08.edgelist两个文件对应的格式,
      这两个文件都在文件夹example_graphs里面
      对于.mat格式我们不关心,也不使用
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)
    """Section2:
    下面打印的都是一些基本信息
    """
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))
    """Section3:
      这里的data_size不是实际的内存大小,只是节点使用总次数的计数。
      在这个if-else代码段中,更常出现的情况是if,而else里的代码几乎从来不会出现,除非data_size超过了max_memory_data_size
      max_memory_data_size默认的是10亿次。
    """
    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        """
        这里直接调用图对象里的build_deepwalk_corpus,它的主要作用是,对每一个点进行多次随机游走,最后的到全部的walks。
        相当于deepwalk论文里两层for循环randomwalk,
        也相当于node2vec里两层for循环node2vecWalk,
        也相当于你算法里的两层for循环下的adawalk
        num_paths=args.number_walks:是采样的轮数,即每一个起点采样的次数,相当于你论文里的$\gamma$
        path_length=args.walk_length, 
        alpha=0, 指的是以概率1-alpha从当前节点继续走下去(可以回到上一个节点),或者以alpha的概率停止继续走,回到路径的起点重新走。
                 请注意:这里的随机游走路径未必是连续的,有可能是走着走着突然回到起点接着走
                 即便alpha=0,也不能保证以后的路径不会出现起点,因为有可能从起点开始走到第二个点时,这第二个点以1的概率继续走下去,
                 这时候它可能会回到起点。
                 但是如果alpha=1,那么就意味着它不可能继续走下去,只能每次都从起点开始重新走,所以这个时候打印出来的路径序列是一连串的起点。
        rand=random.Random(args.seed)
        """

        print("Training...")

        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
        """
        这段代码几乎不需要改动,它就是你算法里的SkipGram,你算发里有三个输入:
        random walks---->walks
        context size---->window=args.window_size
        embedding size---->size=args.representation_size
        如果你想对这个模型了解的更多,或者是做更加人性化的定制,需要继续了解from gensim.models import Word2Vec下的源码。
        我们会在下一篇博文中专门解读这个源码
        Word2Vec:

        Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.

        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use these many worker threads to train the model (=faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines).(Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        compute_loss: bool
            If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `Word2Vec` model

        >>> from gensim.models import Word2Vec
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = Word2Vec(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        """

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")
        """
        在这种情况下,游走路径会被存入到一系列文件output.walks.x中,即如果你在命令行指定了--output file_path
        那么程序结束后会出现两个文件,一个是file_path,一个是file_path.walks.0,file_path.walks.1,...file_path.walks.n。
        file_path存的是各个节点的embedding,
        output.walks.x存的是采样的路径,x表示这个文件是第x个处理器存入的
        x的个数与处理器个数相同
        """
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)
        """
        在目前的层次上,你只需要知道serialized_walks.write_walks_to_disk本质上也是在调用graph里的randwalk,只不过包装一下
        加入了并行化代码和写到磁盘的程序。
        这个函数具体的细节需要稍后剖析另一个文件walk.py
        函数里各个参数的含义可以参考if语句里面的代码
        """

        # print("Counting vertex frequency...")
        # if not args.vertex_freq_degree:
        #     """Use vertex degree to estimate the frequency of nodes
        #     in the random walks. This option is faster than
        #     calculating the vocabulary."""
        #     vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        # else:
        #     # use degree distribution for frequency in tree
        #     vertex_counts = G.degree(nodes=G.iterkeys())
        #
        # print("Training...")
        # walks_corpus = serialized_walks.WalksCorpus(walk_files)
        # model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
        #                  size=args.representation_size,
        #                  window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
        """
        上面注释掉的代码是原来的代码,这里没有使用Word2Vec而是使用了自己编写的Skipgram,
        但是Skipgram仅仅是对Word2Vec的输入参数做了一些保装,里面仍然在调用word2vec,
        word2vec本身也可以实现并行化运算,不知道这里为什么要这么做。
        需要说明的是:从现有的情况类看vertex_counts似乎并没什么卵用
        直接向下面这样写就可以:         
        """
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Word2Vec(sentences=walks_corpus,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    """Section?:
      训练完成之后,将模型输出
    """
    model.wv.save_word2vec_format(args.output)
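    # The saved file is in the word2vec text format, so the embeddings can be read
    # back later, e.g. with gensim's KeyedVectors (node ids are stored as strings):
    #
    #   from gensim.models import KeyedVectors
    #   vectors = KeyedVectors.load_word2vec_format(args.output)
    #   vec = vectors["1"]   # embedding of the node whose id is "1"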
def process(args):

  # Build "(Node, Layer)" map
  if args.floor != "":
    floorFile = open(args.floor, 'r')
    for line in floorFile:
      nd, layer = line.strip().split()[:2]
      nd = int(nd)
      layer = int(layer)
      #print nd, layer
      if nd not in graph.Graph.nodePos:
        graph.Graph.nodeList.append(graph.NodeType(nd,layer))
        graph.Graph.nodePos[nd] = len(graph.Graph.nodeList)-1

  # read input Graph
  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
  
  timelog = ""

  print("Number of nodes: {}".format(len(G.nodes())))
  num_walks = len(G.nodes()) * args.number_walks
  print("Number of walks: {}".format(num_walks))
  data_size = num_walks * args.walk_length
  print("Data size (walks*length): {}".format(data_size))

  # Centrality calculation >> store in File
  '''
  centrality = nxGraph(args.input)
  print centrality
  fo = open("closeness.txt","wb")
  for k in centrality.keys():
    fo.write("{} {}\n".format(k,centrality[k]))
  fo.close()
  '''
  #exit()
  lsfile = open(args.LSfile, 'r')
  calculateBC(lsfile)
  #exit()

  #building (Unit)Metapath Table
  MPList = []
  graph.Graph.mpath = []
  if args.metapath != "":
    mpfile = open(args.metapath, 'r')
    for line in mpfile:
      MPList.append(int(line.strip().split()[0]))
  print "(Unit)Metapath: {}".format(MPList)
  while len(graph.Graph.mpath) < args.walk_length:
    graph.Graph.mpath.extend(MPList)
  args.walk_length = len(graph.Graph.mpath)
  print "(Full)Metapath: {}\nargs.walk_length: {}".format(graph.Graph.mpath, args.walk_length)
  
  tStart = time.time()

  if data_size < args.max_memory_data_size:
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random())
    tEnd = time.time()
    print "Walking takes {} seconds".format(round(tEnd - tStart, 3))
    timelog = "{}, {}".format( timelog, round(tEnd-tStart, 3) )
    print "Number of walks generated: {}".format(len(walks))

    tStart = time.time()
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    tEnd = time.time()

    print "Training takes {} seconds".format(round(tEnd - tStart, 3))
    timelog = "{}, {}, ,{}".format( timelog, round(tEnd-tStart, 3), len(walks) )
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
  with open(args.output, 'r') as f:
    timelog = "{}, {}\n".format( timelog, f.readline().split()[0] )
  with open(args.timelog, 'ab') as tl:
    tl.write(timelog)
Example #13
import random
from deepwalk import graph
from gensim.models import Word2Vec
# 1. Load the graph
G = graph.load_matfile("./blogcatalog.mat")
print(G.values())
print("Number of nodes: {}".format(len(G.nodes())))
num_walks = len(G.nodes()) * 10  # number of random walks started from each node
print("Number of walks: {}".format(num_walks))
data_size = num_walks * 40  # walk length
print("Data size (walks*length): {}".format(data_size))
# 2. Run the random walks
walks = graph.build_deepwalk_corpus(G,
                                    num_paths=10,
                                    path_length=40,
                                    alpha=0,
                                    rand=random.Random(0))
# 3. Turn the walks into embedding vectors
model = Word2Vec(walks, size=64, window=5, min_count=0, sg=1, hs=1, workers=1)
# The approach above suits small graphs;
# for larger data the walks have to be written to disk, as sketched below.
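
# For graphs whose walks do not fit in memory, a rough sketch of the disk-based
# variant used elsewhere in these examples (it assumes the deepwalk package's
# walks module and Skipgram wrapper are importable; all parameter values are
# purely illustrative):
from deepwalk import walks as serialized_walks
from deepwalk.skipgram import Skipgram

walk_files = serialized_walks.write_walks_to_disk(G, "blogcatalog.walks", num_paths=10,
                                                  path_length=40, alpha=0,
                                                  rand=random.Random(0), num_workers=4)
vertex_counts = serialized_walks.count_textfiles(walk_files, 4)
model = Skipgram(sentences=serialized_walks.WalksCorpus(walk_files),
                 vocabulary_counts=vertex_counts, size=64, window=5,
                 min_count=0, trim_rule=None, workers=4)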
Example #14
def process(args):

  print "Loading graph..."
  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    #print("Walking...")
    #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
    #                                    path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    max_features = len(G.nodes())  # vocabulary size
    dim_proj = args.representation_size  # embedding space dimension
    nb_epoch = 1   # number of training epochs

    # Neural network ( in Keras )
    model = Sequential()
    model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
    model.compile(loss='mse', optimizer='rmsprop')
    sampling_table = sequence.make_sampling_table(max_features)

    print("Fitting tokenizer on walks...")
    tokenizer = text.Tokenizer(nb_words=max_features)

    print "Epochs: %d" % nb_epoch
    #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        #progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

#        for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )):

        for i, seq in enumerate( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) ):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_features, window_size=5, negative_samples=1., sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                print "Started fitting..."
                loss = model.fit(X, labels)

                print "Dumping..."

                # Dump weights to a temp file
                weights = model.layers[0].get_weights()[0]

                norm_weights = np_utils.normalize(weights)

                # TODO: save weights with indices
                np.savetxt( args.output, norm_weights )

                losses.append(loss)
                if len(losses) % 100 == 0:
    #                progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")

  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    #TODO: IMPLEMENT THAT
    print "Not implemented yet..."
    sys.exit(1)

  print "Optimization done. Saving..."
  # recover the embedding weights trained with skipgram:
  weights = model.layers[0].get_weights()[0]

  # we no longer need this
  del model

  norm_weights = np_utils.normalize(weights)

  # TODO: save weights with indices
  np.savetxt( args.output, norm_weights )
  print "Saved!"
Example #15
def process(args):

    print "Loading graph..."
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        #print("Walking...")
        #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
        #                                    path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        max_features = len(G.nodes())  # vocabulary size
        dim_proj = args.representation_size  # embedding space dimension
        nb_epoch = 1  # number of training epochs

        # Neural network ( in Keras )
        model = Sequential()
        model.add(
            WordContextProduct(max_features, proj_dim=dim_proj,
                               init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')
        sampling_table = sequence.make_sampling_table(max_features)

        print("Fitting tokenizer on walks...")
        tokenizer = text.Tokenizer(nb_words=max_features)

        print "Epochs: %d" % nb_epoch
        #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

        for e in range(nb_epoch):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)

            #progbar = generic_utils.Progbar(tokenizer.document_count)
            samples_seen = 0
            losses = []

            #        for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )):

            for i, seq in enumerate(
                    build_deepwalk_corpus_minibatch_iter(
                        G, args.number_walks, args.walk_length)):
                # get skipgram couples for one text in the dataset
                couples, labels = sequence.skipgrams(
                    seq,
                    max_features,
                    window_size=5,
                    negative_samples=1.,
                    sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    print "Started fitting..."
                    loss = model.fit(X, labels)

                    print "Dumping..."

                    # Dump weights to a temp file
                    weights = model.layers[0].get_weights()[0]

                    norm_weights = np_utils.normalize(weights)

                    # TODO: save weights with indices
                    np.savetxt(args.output, norm_weights)

                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        #                progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)
            print('Samples seen:', samples_seen)
        print("Training completed!")

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        #TODO: IMPLEMENT THAT
        print "Not implemented yet..."
        sys.exit(1)

    print "Optimization done. Saving..."
    # recover the embedding weights trained with skipgram:
    weights = model.layers[0].get_weights()[0]

    # we no longer need this
    del model

    norm_weights = np_utils.normalize(weights)

    # TODO: save weights with indices
    np.savetxt(args.output, norm_weights)
    print "Saved!"