def add(self, document):
    words = helpers.get_words(document)
    for word in words:
        if word in self.tokens:
            self.tokens[word] += 1  # bump the count for a word we've already seen
        else:
            self.tokens[word] = 1  # first occurrence: start its count in the tokens dict
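For comparison, the same accumulation is a one-liner with the standard library; a minimal sketch, assuming a class shaped like the one above (the BagOfWords name and the Counter substitution are mine, not the original's):

import helpers
from collections import Counter

class BagOfWords:
    def __init__(self):
        self.tokens = Counter()  # a dict subclass whose missing keys default to 0

    def add(self, document):
        # Counter.update increments the count of every word in the iterable
        self.tokens.update(helpers.get_words(document))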
Example #2
    def test_get_words(self):
        bags_of_words = [{'aa': 2, 'bb': 1, 'cc': 3}, {'cc': 2, 'dd': 1}]
        words = helpers.get_words(bags_of_words)
        print(words)
        assert len(words) == 4
        assert 'aa' in words
        assert 'bb' in words
        assert 'cc' in words
        assert 'dd' in words
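The test pins down the contract of helpers.get_words in this project: given a list of bag-of-words dicts, it returns the distinct words across all of them (4 here, since 'cc' appears in both bags). One implementation consistent with the test, offered as a sketch rather than the project's actual helper:

def get_words(bags_of_words):
    words = set()
    for bag in bags_of_words:
        words.update(bag.keys())  # the set drops words shared between bags
    return list(words)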
Example #3
"""
Let's examine the closest neighbors for a word over time
"""
import sys
import collections

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import helpers
from representations.sequentialembedding import SequentialEmbedding

WORDS = helpers.get_words()
if __name__ == "__main__":
    embeddings = helpers.load_embeddings()

    for word1 in WORDS:
        time_sims, lookups, nearests, sims = helpers.get_time_sims(
            embeddings, word1)

        helpers.clear_figure()

        # drop word1 itself from the lookups: we only want to plot the related words
        words = [word for word in lookups if word.split("|")[0] != word1]

        values = [lookups[word] for word in words]
        fitted = helpers.fit_tsne(values)
        if not len(fitted):
            continue  # the original is truncated here; skipping words t-SNE couldn't fit is an assumption
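The listing cuts off at this point. The natural continuation is to scatter the fitted 2-D points and label each with its word; the following is a hedged sketch of that step, reusing fitted, words, and word1 from above (styling and save path are assumptions, not recovered code):

        # plot each related word at its t-SNE coordinates ("word|decade" labels)
        plt.figure(figsize=(10, 10))
        for (x, y), word in zip(fitted, words):
            plt.scatter(x, y, s=10)
            plt.annotate(word, (x, y))
        plt.title(word1)
        plt.savefig('{}.png'.format(word1))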
Example #4
import torch
import torch.nn as nn
import torch.optim as optim

import helpers
from hyperparameters import hps
import random

class NearbyWords(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(NearbyWords, self).__init__()
        # hidden: the embedding matrix, applied to a one-hot input (no bias)
        self.hidden = nn.Linear(vocab_size, embed_size, bias=False)
        # output: scores over the vocabulary for the nearby-word prediction
        self.output = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        hidden = self.hidden(x)
        output = self.output(hidden)
        return output

words = helpers.get_words(hps['filename'])
vocab = set(words)
word_to_int, int_to_word = helpers.get_word_mappings(words)
pairs = helpers.get_training_pairs(words, 3)

vocab_size = len(vocab)
embed_size = hps['embed_size']

model = NearbyWords(vocab_size, embed_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=hps['learning_rate'])

for t in range(hps['epochs']):
    pair_sample = random.sample(pairs, 1000)

    losses = []
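The loop is truncated at this line. Assuming helpers.get_training_pairs returns (center, context) integer index pairs (an assumption; the helper isn't shown), a typical skip-gram-style step would continue roughly like this:

    for center, context in pair_sample:
        x = torch.zeros(1, vocab_size)    # one-hot input for the center word
        x[0][center] = 1.0
        target = torch.tensor([context])  # nearby word the model should predict

        optimizer.zero_grad()
        loss = criterion(model(x), target)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    print('epoch {}: mean loss {:.4f}'.format(t, sum(losses) / len(losses)))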
Example #6
import json
import math
from pprint import pprint
from helpers import pretty_date, get_board, get_words

frequencies = {}
puzzle_dates = {}

with open("data.json", 'r') as f:
    data = json.loads(f.read())

    for board in data:
        words = get_words(board['board']['cells'])

        for word_tup in words['across']:
            word, solve_time = word_tup[0], word_tup[1]
            if word not in frequencies:
                frequencies[word] = {
                    "average_time": 0,
                    "freq": 0,
                    "puzzles": []
                }

            entry = frequencies[word]
            entry['freq'] += 1
            # incremental running mean: avg += (new_value - avg) / n
            entry['average_time'] += (solve_time - entry['average_time']) / entry['freq']
            entry['puzzles'].append(board['puzzle_id'])

        for word_tup in words['down']:
            if word_tup[0] not in frequencies:
                frequencies[word_tup[0]] = {
                    "average_time": 0,
                    "freq": 0,
                    "puzzles": []
                }
            # the original listing truncates here; the updates mirror the across loop
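The unused math and pprint imports suggest the truncated remainder summarizes the collected stats; purely as an illustrative sketch (not recovered from the original):

top = sorted(frequencies.items(), key=lambda kv: kv[1]['freq'], reverse=True)[:10]
pprint(top)  # hypothetical reporting step: the ten most frequent answers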
Example #7
import sys
import pickle
import os
import helpers

""" Train data and store model to file """
args = sys.argv
train_dataset_path = os.path.abspath(args[1])
print('Dataset path: {}'.format(train_dataset_path))

print('Loading dataset...')
train_target, train_data = helpers.load_dataset(train_dataset_path)

bags_of_words = helpers.create_bags_of_words(train_data)
words = helpers.get_words(bags_of_words)
labels = helpers.get_labels(train_target)

model_path = os.path.join(os.getcwd(), 'model')
if not os.path.exists(model_path):
    os.mkdir(model_path)

print('Training data...')
label_probs, probs_per_label = helpers.train(
    bags_of_words, words, train_target, labels)

print('Storing model...')
with open(os.path.join(model_path, 'train.pickle'), 'wb') as f:
    pickle.dump((label_probs, probs_per_label, words, labels), f)

print('Training done.')
print('============== INFO ===============')
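Loading the stored model back elsewhere is just the reverse pickle call; a minimal usage sketch (the prediction step isn't part of this listing):

import os
import pickle

with open(os.path.join('model', 'train.pickle'), 'rb') as f:
    label_probs, probs_per_label, words, labels = pickle.load(f)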