Exemplo n.º 1
0
def generate_vocab(walks):
    """Build a frequency-ranked vocabulary from a collection of walks.

    Args:
        walks: Iterable of walks; each walk is an iterable of hashable
            words.

    Returns:
        A ``(vocab, index2word)`` tuple. ``vocab`` maps every word to a
        ``Vocab`` entry created with its occurrence ``count`` and whose
        ``index`` is the word's rank. ``index2word`` lists the words by
        descending count, so ``index2word[vocab[w].index] == w``.
    """
    frequencies = defaultdict(int)
    for token in (w for walk in walks for w in walk):
        frequencies[token] += 1

    vocab = {}
    discovered = []
    # The index given at construction is only provisional (the position in
    # which the word was yielded); it is overwritten with the rank below.
    for position, (token, occurrences) in enumerate(iteritems(frequencies)):
        vocab[token] = Vocab(count=occurrences, index=position)
        discovered.append(token)

    # sorted() is stable, so words with equal counts keep the relative
    # order in which they were appended above.
    index2word = sorted(
        discovered, key=lambda token: vocab[token].count, reverse=True
    )
    for rank, token in enumerate(index2word):
        vocab[token].index = rank

    return vocab, index2word
Exemplo n.º 2
0
def generate_vocab(all_walks):
    """Count node occurrences over every layer of walks and rank them.

    Args:
        all_walks: Iterable of layers; each layer is an iterable of walks
            and each walk is an iterable of hashable words (nodes).

    Returns:
        A ``(vocab, index2word)`` tuple. ``vocab`` maps each word to a
        ``Vocab`` entry carrying its occurrence ``count`` and its rank as
        ``index``; ``index2word`` is the list of words ordered from most
        to least frequent.
    """
    node_counts = defaultdict(int)
    # Flatten layers -> walks -> nodes and tally how often each node occurs.
    flattened = (
        node for layer in all_walks for walk in layer for node in walk
    )
    for node in flattened:
        node_counts[node] += 1

    # Give every node a vocabulary entry; the index assigned here is a
    # placeholder that is replaced once the nodes have been ranked.
    vocab = {}
    index2word = []
    for slot, (node, total) in enumerate(iteritems(node_counts)):
        vocab[node] = Vocab(count=total, index=slot)
        index2word.append(node)

    def frequency(node):
        return vocab[node].count

    # Order the nodes by descending occurrence count (stable for ties).
    index2word = sorted(index2word, key=frequency, reverse=True)

    # Re-number each entry so that its index equals its rank.
    for rank, node in enumerate(index2word):
        vocab[node].index = rank

    return vocab, index2word
Exemplo n.º 3
0
def generate_vocab(all_walks):
    """Create a vocabulary of walk nodes ranked by how often they occur.

    Args:
        all_walks: Iterable of walk collections; each collection is an
            iterable of walks and each walk an iterable of hashable words.

    Returns:
        A ``(vocab, index2word)`` tuple. ``vocab`` is a dict from word to
        its ``Vocab`` entry (occurrence ``count`` plus rank ``index``).
        ``index2word`` is the list of all distinct words sorted by
        descending count; a word's position in it equals its ``index``.
    """
    tally = defaultdict(int)
    for group in all_walks:
        for path in group:
            for node in path:
                tally[node] += 1

    # Snapshot the (word, count) pairs once so the entries and the word
    # list are built from the same ordering.
    entries = list(iteritems(tally))

    vocab = {}
    for provisional, (node, hits) in enumerate(entries):
        vocab[node] = Vocab(count=hits, index=provisional)

    # Sort words from most to least frequent; ties keep their prior order
    # because sorted() is stable.
    index2word = sorted(
        [node for node, _ in entries],
        key=lambda node: vocab[node].count,
        reverse=True,
    )

    # Replace the provisional indices with the final ranks.
    for rank, node in enumerate(index2word):
        vocab[node].index = rank

    return vocab, index2word
Exemplo n.º 4
0
def generate_vocab(all_walks):
    """Derive a count-sorted vocabulary from nested walk collections.

    The work happens in three steps: count every word, sort the words by
    count in descending order, then store each word's sorted position as
    its index.

    Args:
        all_walks: Iterable of walk collections; each collection is an
            iterable of walks and each walk an iterable of hashable words.

    Returns:
        A ``(vocab, index2word)`` tuple where ``vocab[word]`` is a
        ``Vocab`` entry with ``count`` and ``index`` set, and
        ``index2word[index]`` is the word holding that index.
    """
    # Number of times each word appears across all walks.
    occurrences = defaultdict(int)
    every_walk = (walk for walks in all_walks for walk in walks)
    for walk in every_walk:
        for word in walk:
            occurrences[word] += 1

    vocab = {}
    ordering = []
    for slot, (word, seen) in enumerate(iteritems(occurrences)):
        # Temporary index; overwritten after sorting.
        vocab[word] = Vocab(count=seen, index=slot)
        ordering.append(word)

    def by_count(word):
        return vocab[word].count

    # Descending order; the sort is stable, so ties keep their relative
    # order.
    index2word = sorted(ordering, key=by_count, reverse=True)

    # Word -> index mapping now mirrors index2word.
    for final_index, word in enumerate(index2word):
        vocab[word].index = final_index

    return vocab, index2word