from itertools import chain


def build_doc_dataset(docs, vocabulary_size=50000):
    '''Build the dictionary and replace rare words with the UNK token.

    Parameters
    ----------
    docs: list of token lists; each token list represents a sentence
    vocabulary_size: maximum number of top-occurring tokens to keep;
        rarer tokens are replaced by 'UNK'
    '''
    # Flatten the documents into a single token stream.
    words = list(chain.from_iterable(docs))

    # Collect a document (sentence) index for every token.
    doc_ids = []
    for i, doc in enumerate(docs):
        doc_ids.extend([i] * len(doc))

    word_ids, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size=vocabulary_size)
    return doc_ids, word_ids, count, dictionary, reverse_dictionary
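# --- Usage sketch (not from the original source): a minimal, hypothetical
# example of calling build_doc_dataset on a toy corpus, assuming build_dataset
# behaves as in the call above. The corpus and vocabulary size are made up.
docs = [['the', 'cat', 'sat'],
        ['the', 'dog', 'barked', 'loudly']]

doc_ids, word_ids, count, dictionary, reverse_dictionary = build_doc_dataset(
    docs, vocabulary_size=10)

# doc_ids pairs one document index with every token: [0, 0, 0, 1, 1, 1, 1]
print(doc_ids)
# word_ids holds each token's integer id; tokens outside the top
# vocabulary_size map back to 'UNK' through reverse_dictionary.
print([reverse_dictionary[i] for i in word_ids])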
from itertools import chain

import tensorflow as tf
from six.moves import xrange
from sklearn.manifold import TSNE

from word2vec import (read_data, build_dataset, plot_with_labels,
                      cosine_similarity)
from word2vec.skip_gram_model import generate_batch, generate_graph

if __name__ == '__main__':
    vocabulary_size = 10000

    # Read the corpus and flatten the sentences into one token list.
    words = read_data()
    words = [x for sublist in words for x in sublist]
    data, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size=vocabulary_size)
    del words  # free memory

    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1       # How many words to consider left and right.
    num_skips = 2         # How many times to reuse an input to generate a label.
    valid_size = 9        # Number of validation words to evaluate similarity on.
    valid_window = 100    # Only pick dev samples in the head of the distribution.
    num_sampled = 64      # Number of negative examples to sample.

    # Validation words: hand-picked terms (from the Chinese corpus) whose
    # nearest neighbors should look sensible after training.
    valid_word = ['主任', '语言表达', '观察', '床头', '护理', '病人',
                  '研究成果', '省部级', '引进']
    valid_examples = [dictionary[li] for li in valid_word]
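# --- Illustration (not from the original source): the hyperparameters above
# follow the classic TensorFlow word2vec tutorial. This self-contained sketch
# shows what skip-gram (center, context) pairs look like for skip_window=1
# and num_skips=2 on a toy id sequence; it assumes nothing about this repo's
# generate_batch, whose exact signature is not shown here.
import random


def skipgram_pairs(data, num_skips, skip_window):
    '''Yield (center, context) id pairs, reusing each center word up to
    num_skips times within a +/- skip_window window.'''
    for pos, center in enumerate(data):
        lo = max(0, pos - skip_window)
        hi = min(len(data), pos + skip_window + 1)
        window = [data[j] for j in range(lo, hi) if j != pos]
        for ctx in random.sample(window, min(num_skips, len(window))):
            yield center, ctx


toy_ids = [5, 12, 7, 3, 9]
for center, ctx in skipgram_pairs(toy_ids, num_skips=2, skip_window=1):
    print(center, '->', ctx)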