Example #1
import tf_glove


def build_model(corpus):
    # Fit a 300-dimensional GloVe model with a 10-word context window.
    model = tf_glove.GloVeModel(embedding_size=300, context_size=10)
    model.fit_to_corpus(corpus)   # build the co-occurrence matrix
    model.train(num_epochs=100)   # learn the embeddings
    model.generate_tsne()         # visualize the embeddings with t-SNE

    return model
Example #2
def glove(year, month, documents, preloadEmbeddings, preloadW2c):
    """
    Update the GloVe embeddings using the tokens of the current month,
    with the embeddings from last month as initialization.

    :param year: year of the data being processed
    :param month: month of the data being processed
    :param documents: {post_id : list of tokens}
    :param preloadEmbeddings: embeddings matrix from the previous month
    :param preloadW2c: {word : onehot index}
    :return: updated embeddings and word_2_code indices
    """

    import os
    import pickle

    import tf_glove

    embPath = "intermed/embeddings/embeddings-{}-{}.p".format(year, month)
    w2cPath = "intermed/w2c/w2c-{}-{}.p".format(year, month)

    if not os.path.isfile(embPath):
        # Flatten the documents dict into a list of token lists.
        wordlist = []
        for k, v in documents.items():
            wordlist.append(v)

        model = tf_glove.GloVeModel(embedding_size=300,
                                    context_size=10,
                                    pre_load_weights=preloadEmbeddings,
                                    pre_load_w2c=preloadW2c)
        model.fit_to_corpus(wordlist)
        model.train(num_epochs=100)

        embeddings = model.embeddings
        with open(embPath, "wb") as f:
            pickle.dump(embeddings, f)

        w2c = model.word_to_id()
        with open(w2cPath, "wb") as f:
            pickle.dump(w2c, f)

    else:
        # Results for this month are already cached on disk.
        with open(embPath, "rb") as f:
            embeddings = pickle.load(f)
        with open(w2cPath, "rb") as f:
            w2c = pickle.load(f)

    return embeddings, w2c
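# A hypothetical driver for the monthly-update protocol described in the
# docstring above: each call is seeded with the previous month's output.
# load_documents is an assumed helper returning {post_id : list of tokens}.
embeddings, w2c = None, None
for year, month in [(2016, 1), (2016, 2), (2016, 3)]:
    docs = load_documents(year, month)
    embeddings, w2c = glove(year, month, docs, embeddings, w2c)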
Example #3
import re

import matplotlib
import nltk
import tf_glove

embSize = 50
minOccur = 5

model = tf_glove.GloVeModel(embedding_size=embSize, context_size=10, min_occurrences=minOccur,
                            learning_rate=0.5, batch_size=512)
# min_occurrences=25,


def extract_reddit_comments(path):
    # A regex for extracting the comment body from one line of JSON (faster than parsing)
    body_snatcher = re.compile(r"\{.*?(?<!\\)\"body(?<!\\)\":(?<!\\)\"(.*?)(?<!\\)\".*}")
    with open(path) as file_:
        for line in file_:
            match = body_snatcher.match(line)
            if match:
                body = match.group(1)
                # Ignore deleted comments
                if not body == '[deleted]':
                    # Return the comment as a string (not yet tokenized)
                    yield body


def tokenize_comment(comment_str):
    # Use the excellent NLTK to tokenize the comment body.
    # Note: the comment is lower-cased here, since tf_glove is case-sensitive.
    return nltk.wordpunct_tokenize(comment_str.lower())
Example #4
# 1. Estimate the complexity of the model for one iteration. Choose an appropriate $c$.
# 2. Train GloVe word embeddings ($d$=256).
# 3. Check that v(king) - v(man) + v(woman) is approximately equal to v(queen).
# 4. Read about [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding).
# 5. Use t-SNE to reduce the dimensionality of the embeddings to 3. Make sure that the following groups of vectors are collinear (use visualization):
#   * [man, woman], [Mr., Ms.], [king, queen], etc.
#   * [CEO, company]
#   * [adjective, its comparative form]

# For each pair of words in the vocabulary, a loss term is computed (multiplication and addition of vectors of length $d$), so in the worst case one pass takes $O(W^2 d)$ time. However, the weighting function $f$ proposed in the paper is zero on zero entries $X_{ij}$, so for most word pairs $(i, j)$ the contribution to the total loss is zero. The actual running time therefore depends heavily on the input corpus and on the vocabulary size.
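# For reference, the standard GloVe objective from Pennington et al. is
# $J = \sum_{i,j} f(X_{ij}) \, (w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij})^2$,
# and since $f(0) = 0$, only pairs that actually co-occur contribute: one pass
# costs $O(\mathrm{nnz}(X) \cdot d)$ rather than $O(W^2 d)$.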

# In[5]:

glove_model = tf_glove.GloVeModel(embedding_size=256,
                                  context_size=10,
                                  min_occurrences=12,
                                  max_vocab_size=10000,
                                  learning_rate=0.05,
                                  batch_size=512)

# In[6]:

glove_model.fit_to_corpus(sentences)

# In[8]:

glove_model.train(num_epochs=25, summary_batch_interval=1000)

# In[56]:

target = glove_model.embedding_for("king") - glove_model.embedding_for(
    "man") + glove_model.embedding_for("woman")
Example #5
import os
import sys
import pickle

import numpy as np
import tf_glove

pickle_path = sys.argv[1]

with open(pickle_path, 'rb') as f:  # pickle files must be opened in binary mode
    corpus = pickle.load(f)

path = sys.argv[2]
data = sys.argv[3]

model = tf_glove.GloVeModel(embedding_size=50, context_size=1)
model.fit_to_corpus(corpus)
model.train(num_epochs=100)

embed = []
embedstring = []
file = open(path, 'r')
vector = np.zeros(50)  # accumulator; a plain list would be extended, not summed, by +=
for line in file:
    line = line.strip("\n")
    s = line.split(" ")
    count = 0
    for si in s:
        if si == '':  # skip empty tokens produced by repeated spaces
            continue
        try:
            vector += model.embedding_for(si)
            count += 1
        # The original snippet is truncated here; skipping out-of-vocabulary
        # tokens is an assumed completion.
        except KeyError:
            continue
Example #6
import pickle
import time

import pandas as pd
import tf_glove
from tqdm import tqdm

# File_scan is a project-specific helper (not shown) that lists files under a
# directory. The body of process_single_data was truncated in the original;
# the def line and the pandas load below are reconstructed from its use in main.


def process_single_data(file_path):
    df = pd.read_pickle(file_path)  # assumed: each .pkl holds a DataFrame
    return df['message'].tolist()


if __name__ == '__main__':
    file_scan = File_scan("./Cleaned_database/")
    all_file_paths = file_scan.path_gen(extension='.pkl')
    all_documents = []
    for single_file_path in tqdm(all_file_paths):
        all_documents.extend(process_single_data(single_file_path))

    start_time = time.time()
    print('start training at', start_time)
    embedding_size = 256
    model = tf_glove.GloVeModel(embedding_size=embedding_size,
                                context_size=500,
                                min_occurrences=2000,
                                learning_rate=0.05,
                                batch_size=4096)
    model.fit_to_corpus(all_documents)
    model.train(num_epochs=400)
    print('finish training, took', time.time() - start_time, 's')
    vocab = model.words

    corresponding_dict = model.get_word_to_id()

    with open('corresponding_dict.pickle', 'wb') as handle:
        pickle.dump(corresponding_dict,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    embeddings = model.embeddings
Example #7
import argparse
from os import path, mkdir
from shutil import rmtree

import tf_glove

# The argparse setup was truncated in the original; the arguments below are
# reconstructed from how `args` is used.
parser = argparse.ArgumentParser()
parser.add_argument("--corpus")
parser.add_argument("--output")
parser.add_argument("--log_dir")
parser.add_argument("--emb_size")
parser.add_argument("--context_size")
parser.add_argument("--epoch")
args = parser.parse_args()

corpus_file = args.corpus
output = args.output
log_dir = args.log_dir

# Start each run with a fresh log directory.
if path.exists(log_dir):
    rmtree(log_dir)
mkdir(log_dir)

emb_size = int(args.emb_size)
context_size = int(args.context_size)
epoch = int(args.epoch)

model = tf_glove.GloVeModel(embedding_size=emb_size, context_size=context_size)

# text8 is a single long line of space-separated words.
text8 = []
with open(corpus_file) as fin:
    for line in fin:
        text8 = line.rstrip().split()

# Chunk the word stream into pseudo-sentences of 1000 tokens each, since
# fit_to_corpus expects a sequence of token lists.
corpus, sent = [], []
for w in text8:
    sent.append(w)
    if len(sent) == 1000:
        corpus.append(sent)
        sent = []

model.fit_to_corpus(corpus)
model.train(num_epochs=epoch, log_dir=log_dir)

# The write-out below was truncated in the original; writing one
# "word v1 ... vd" line per vocabulary word is an assumed completion.
with open(output, "w") as fout:
    for w in model.words:
        fout.write(w + " " + " ".join(str(x) for x in model.embedding_for(w)) + "\n")
Example #8
import argparse
import tf_glove

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('corpus_path')
arg_parser.add_argument('output_path')
args = arg_parser.parse_args()

# fit_to_corpus expects a sequence of token lists, so wrap the flat word list
# in an outer list (one "region" covering the whole file); passing the flat
# list directly would treat each word as a region of single characters.
corpus = [open(args.corpus_path).read().split()]

model = tf_glove.GloVeModel(embedding_size=100,
                            context_size=10,
                            min_occurrences=5)
model.fit_to_corpus(corpus)
# model.train(50, log_dir=args.output_path, save_embed_interval=5)
model.train_concurrent(100)
model.save_embeddings(args.output_path)
Example #9
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import tf_glove

corpusdir = 'abstract/'
corpus = PlaintextCorpusReader(corpusdir, '.*')

model = tf_glove.GloVeModel(embedding_size=200,
                            context_size=10,
                            min_occurrences=25,
                            learning_rate=0.05,
                            batch_size=512)
model.fit_to_corpus(corpus.sents())
model.train(num_epochs=50, log_dir="log/example", summary_batch_interval=1000)

Example #10
import collections

import tf_glove

text_corpus = 'data/belling_the_cat.txt'
num_glove_epoch = 100
tsne_path = 'data/demo_' + str(num_glove_epoch) + '.png'

print("Loading training data...")
corpus, training_data = read_data(
    text_corpus
)  #tf_glove accepts a list while array needed to create dictionary
print('Training GloVe vectors...')

glove_dim = 25
glove_context = 20
glove_model = tf_glove.GloVeModel(
    glove_dim,
    glove_context)  #25 dimension vectors, context taken 10 steps from word
glove_model.fit_to_corpus(corpus)
glove_model.train(num_glove_epoch)


def build_dataset(words):
    count = collections.Counter(words).most_common()
    dictionary = dict()
    glove_dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
        glove_dictionary[word] = glove_model.embeddings[
            glove_model.id_for_word(word)]
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary, glove_dictionary
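# A hypothetical call, assuming training_data is the flat word list returned
# by read_data above:
dictionary, reverse_dictionary, glove_dictionary = build_dataset(training_data)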
Example #11
import zipfile

import tensorflow as tf
import tf_glove as glove  # assumed alias: the snippet calls glove.GloVeModel


# Read the data into a list of strings.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

# maybe_download (not shown) fetches the corpus file if it is missing.
filename = maybe_download('corpus/text8.zip', 31344016)
data = read_data(filename)

# Cap TensorFlow's GPU memory use at 20% of the device.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

print("========init GloveModel========")
model = glove.GloVeModel(embedding_size=100, context_size=10, learning_rate=0.1)
print("=======fit to corpus========")
corpus = [data]  # one region containing the whole word stream
model.fit_to_corpus(corpus)
print("=======start training========")
model.train(num_epochs=50)
print("=======finish training========")
words = model.words
print("=======write file========")
with open("tmp/glove", "w+") as f:
  for w in words:
    s = w + " " + ' '.join([str(x) for x in model.embedding_for(w)]) + "\n"
    #print(s)
    f.write(s)
print("========finish========")
Example #12
# Where to write out summaries.
save_path = FLAGS.save_path

# The text file for eval.
#eval_data = FLAGS.eval_data


embedding_size_p = 200
context_size_p = 10
max_vocab_size_p = 10 ** 20  # effectively unlimited
min_occurrences_p = 5
epochs = 150

model = tf_glove.GloVeModel(embedding_size=embedding_size_p,
                            context_size=context_size_p,
                            max_vocab_size=max_vocab_size_p,
                            min_occurrences=min_occurrences_p)

# text8 preprocessing: the file is one long line of space-separated words.
data = open(train_data, "r").read()
data = data.split(" ")
corpus = [data]  # fit_to_corpus expects a sequence of token lists
corpus_set = set(data)

model.fit_to_corpus(corpus)
model.train(num_epochs=epochs)

# Save the result.
embedding_file_txt = "glove_result.txt"

vocab_size = model.vocab_size
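# A plausible completion (an assumption; the snippet is truncated here): write
# one "word v1 ... v200" line per vocabulary word, mirroring Example #11.
with open(embedding_file_txt, "w") as fout:
    for w in model.words:
        fout.write(w + " " + " ".join(str(x) for x in model.embedding_for(w)) + "\n")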