def build_model(corpus):
    model = tf_glove.GloVeModel(embedding_size=300, context_size=10)
    model.fit_to_corpus(corpus)
    model.train(num_epochs=100)
    model.generate_tsne()
    return model
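# Usage sketch (not from the original source): fit_to_corpus expects an iterable of
# tokenized sentences, i.e. a list of token lists. The tiny sample corpus below is
# illustrative only; a real corpus is needed for meaningful embeddings.
sample_corpus = [
    ["the", "king", "rules", "the", "kingdom"],
    ["the", "queen", "rules", "the", "kingdom"],
]
model = build_model(sample_corpus)
print(model.embedding_for("king"))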
def glove(year, month, documents, preloadEmbeddings, preloadW2c):
    """
    Update the GloVe embeddings using the tokens of the current month and the
    embeddings from last month as initialization.
    :param year:
    :param month:
    :param documents: {post_id : list of tokens}
    :param preloadEmbeddings: embeddings matrix
    :param preloadW2c: {word : one-hot index}
    :return: updated embeddings and word_2_code indices
    """
    import tf_glove
    embPath = "intermed/embeddings/embeddings-{}-{}.p".format(year, month)
    w2cPath = "intermed/w2c/w2c-{}-{}.p".format(year, month)
    if not os.path.isfile(embPath):
        wordlist = []
        for k, v in documents.items():
            wordlist.append(v)
        model = tf_glove.GloVeModel(embedding_size=300, context_size=10,
                                    pre_load_weights=preloadEmbeddings,
                                    pre_load_w2c=preloadW2c)
        model.fit_to_corpus(wordlist)
        model.train(num_epochs=100)
        embeddings = model.embeddings
        pickle.dump(embeddings, open(embPath, "wb+"))
        w2c = model.word_to_id()
        pickle.dump(w2c, open(w2cPath, "wb+"))
    else:
        embeddings = pickle.load(open(embPath, 'rb+'))
        w2c = pickle.load(open(w2cPath, 'rb+'))
    return embeddings, w2c
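# Usage sketch (an assumption, not from the original source): chain the monthly updates
# so each month is initialized from the previous month's embeddings. Passing None for the
# first month is assumed to mean "train from scratch"; load_documents is a hypothetical
# helper returning {post_id: list of tokens}.
embeddings, w2c = None, None
for year, month in [(2016, 1), (2016, 2), (2016, 3)]:
    documents = load_documents(year, month)
    embeddings, w2c = glove(year, month, documents, embeddings, w2c)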
import tf_glove
import matplotlib

embSize = 50
minOccur = 5
model = tf_glove.GloVeModel(embedding_size=embSize, context_size=10, min_occurrences=minOccur,
                            learning_rate=0.5, batch_size=512)  # min_occurrences=25,

import re
import nltk

def extract_reddit_comments(path):
    # A regex for extracting the comment body from one line of JSON (faster than parsing)
    body_snatcher = re.compile(r"\{.*?(?<!\\)\"body(?<!\\)\":(?<!\\)\"(.*?)(?<!\\)\".*}")
    with open(path) as file_:
        for line in file_:
            yield line
            # match = body_snatcher.match(line)
            # if match:
            #     body = match.group(1)
            #     # Ignore deleted comments
            #     if not body == '[deleted]':
            #         # Return the comment as a string (not yet tokenized)
            #         yield body

def tokenize_comment(comment_str):
    # Use the excellent NLTK to tokenize the comment body
    return nltk.wordpunct_tokenize(comment_str)
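# Wiring sketch (an assumption about intended use, not part of the original script): stream
# the extracted comments through the tokenizer and feed the generator to fit_to_corpus.
# The dump path below is hypothetical.
corpus = (tokenize_comment(comment)
          for comment in extract_reddit_comments("data/RC_2015-01.json"))
model.fit_to_corpus(corpus)
model.train(num_epochs=50)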
# 1. Estimate the complexity of the model for one iteration. Choose an appropriate $c$ for yourself.
# 2. Train GloVe word embeddings ($d$=256).
# 3. Check that v(king) - v(man) + v(woman) is approximately equal to v(queen).
# 4. Read about [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding).
# 5. Use t-SNE to reduce the dimensionality of the embeddings to 3. Make sure that the following groups of vectors are collinear (use visualization):
#    * [man, woman], [Mr., Ms], [king, queen], etc.
#    * [CEO, company]
#    * [adjective, its comparative form]

# For each pair of words in the vocabulary, a loss term is computed (multiplication and addition of
# vectors of length $d$), so in the worst case one pass takes $O(W^2 d)$ time. Note, however, that the
# weighting function $f$ proposed in the paper is zero on zero elements $X_{ij}$, so for most word pairs
# $(i, j)$ the contribution to the total loss is zero. The actual running time therefore depends heavily
# on the input corpus and on the vocabulary size.

# In[5]:
glove_model = tf_glove.GloVeModel(embedding_size=256, context_size=10, min_occurrences=12,
                                  max_vocab_size=10000, learning_rate=0.05, batch_size=512)

# In[6]:
glove_model.fit_to_corpus(sentences)

# In[8]:
glove_model.train(num_epochs=25, summary_batch_interval=1000)

# In[56]:
target = glove_model.embedding_for("king") - glove_model.embedding_for("man") + glove_model.embedding_for("woman")
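# A minimal sketch (not from the original notebook) of completing step 3: rank the vocabulary
# by cosine similarity to `target` and check that "queen" appears near the top. It assumes the
# model exposes `words` and `embedding_for`, as in the other snippets in this collection.
import numpy as np

def most_similar(model, target_vec, top_n=5):
    # Cosine similarity between the target vector and every vocabulary vector.
    sims = []
    for w in model.words:
        v = model.embedding_for(w)
        sim = float(np.dot(v, target_vec) / (np.linalg.norm(v) * np.linalg.norm(target_vec) + 1e-8))
        sims.append((w, sim))
    return sorted(sims, key=lambda x: -x[1])[:top_n]

print(most_similar(glove_model, target))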
import tf_glove
import os
import sys
import pickle
import numpy as np

pickle_path = sys.argv[1]
with open(pickle_path, 'rb') as f:  # pickle files must be opened in binary mode
    corpus = pickle.load(f)
path = sys.argv[2]
data = sys.argv[3]

model = tf_glove.GloVeModel(embedding_size=50, context_size=1)
model.fit_to_corpus(corpus)
model.train(num_epochs=100)

embed = []
embedstring = []
file = open(path, 'r')
vector = np.zeros(50)  # accumulate the element-wise sum of the 50-d word vectors
for line in file:
    line = line.strip("\n")
    s = line.split(" ")
    count = 0
    for si in s:
        if si == '\n':
            continue
        try:
            vector += model.embedding_for(si)
    return df['message'].tolist()


if __name__ == '__main__':
    file_scan = File_scan("./Cleaned_database/")
    all_file_paths = file_scan.path_gen(extension='.pkl')
    all_documents = []
    for single_file_path in tqdm(all_file_paths):
        all_documents.extend(process_single_data(single_file_path))

    start_time = time.time()
    print('start training at', start_time)
    embedding_size = 256
    model = tf_glove.GloVeModel(embedding_size=embedding_size, context_size=500,
                                min_occurrences=2000, learning_rate=0.05, batch_size=4096)
    model.fit_to_corpus(all_documents)
    model.train(num_epochs=400)
    print('finish training, took', time.time() - start_time, 's')

    vocab = model.words
    corresponding_dict = model.get_word_to_id()
    with open('corresponding_dict.pickle', 'wb') as handle:
        pickle.dump(corresponding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    embeddings = model.embeddings
)
args = parser.parse_args()
corpus_file = args.corpus
output = args.output
log_dir = args.log_dir
if path.exists(log_dir):
    rmtree(log_dir)
mkdir(log_dir)
emb_size = int(args.emb_size)
context_size = int(args.context_size)
epoch = int(args.epoch)

model = tf_glove.GloVeModel(embedding_size=emb_size, context_size=context_size)

text8 = []
with open(corpus_file) as fin:
    for line in fin:
        text8 = line.rstrip().split()

# Split the single long token stream into regions of 1000 words each.
corpus, sent = [], []
for w in text8:
    sent.append(w)
    if len(sent) == 1000:
        corpus.append(sent)
        sent = []

model.fit_to_corpus(corpus)
model.train(num_epochs=int(epoch), log_dir=log_dir)

with open(output, "w") as fout:
import argparse
import tf_glove

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('corpus_path')
arg_parser.add_argument('output_path')
args = arg_parser.parse_args()

corpus = open(args.corpus_path).read().split()
model = tf_glove.GloVeModel(embedding_size=100, context_size=10, min_occurrences=5)
model.fit_to_corpus(corpus)
# model.train(50, log_dir=args.output_path, save_embed_interval=5)
model.train_concurrent(100)
model.save_embeddings(args.output_path)
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import tf_glove

corpusdir = 'abstract/'
corpus = PlaintextCorpusReader(corpusdir, '.*')
model = tf_glove.GloVeModel(embedding_size=200, context_size=10, min_occurrences=25,
                            learning_rate=0.05, batch_size=512)
model.fit_to_corpus(corpus.sents())
model.train(num_epochs=50, log_dir="log/example", summary_batch_interval=1000)
text_corpus = 'data/belling_the_cat.txt'
num_glove_epoch = 100
tsne_path = 'data/demo_' + str(num_glove_epoch) + '.png'

print("Loading training data...")
# tf_glove accepts a list, while an array is needed to create the dictionary
corpus, training_data = read_data(text_corpus)

print('Training GloVe vectors...')
glove_dim = 25
glove_context = 20
# 25-dimensional vectors, context taken 20 steps from the word
glove_model = tf_glove.GloVeModel(glove_dim, glove_context)
glove_model.fit_to_corpus(corpus)
glove_model.train(num_glove_epoch)


def build_dataset(words):
    count = collections.Counter(words).most_common()
    dictionary = dict()
    glove_dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
        glove_dictionary[word] = glove_model.embeddings[glove_model.id_for_word(word)]
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary, glove_dictionary
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


filename = maybe_download('corpus/text8.zip', 31344016)
data = read_data(filename)

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

print("========init GloveModel========")
model = glove.GloVeModel(embedding_size=100, context_size=10, learning_rate=0.1)

print("=======fit to corpus========")
corpus = []
corpus.append(data)
model.fit_to_corpus(corpus)

print("=======start training========")
model.train(num_epochs=50)
print("=======finish training========")

words = model.words
print("=======write file========")
with open("tmp/glove", "w+") as f:
    for w in words:
        s = w + " " + ' '.join([str(x) for x in model.embedding_for(w)]) + "\n"
        # print(s)
        f.write(s)
print("========finish========")
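# A minimal loader sketch (not part of the original script) for reading "tmp/glove" back in:
# each line is "<word> <v1> <v2> ... <v100>", so take the first token as the word and parse
# the remaining tokens as floats.
import numpy as np

def load_embeddings(path):
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip().split(" ")
            vectors[parts[0]] = np.array([float(x) for x in parts[1:]])
    return vectors

embeddings = load_embeddings("tmp/glove")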
# Where to write out summaries.
save_path = FLAGS.save_path
# The text file for eval.
# eval_data = FLAGS.eval_data

embedding_size_p = 200
context_size_p = 10
max_vocab_size_p = 100000000000000000000
min_occurences_p = 5
epochs = 150

model = tf_glove.GloVeModel(embedding_size=embedding_size_p, context_size=context_size_p,
                            max_vocab_size=max_vocab_size_p, min_occurrences=min_occurences_p)

# text8 preprocessing
data = open(train_data, "r").read()
data = data.split(" ")
corpus = []
corpus.append(data)
corpus_set = set(data)

model.fit_to_corpus(corpus)
model.train(num_epochs=epochs)

# save the answer
embedding_file_txt = "glove_result.txt"
vocab_size = model.vocab_size