def add(self, document):
    """Tokenize *document* and accumulate per-word counts into ``self.tokens``.

    Assumes ``self.tokens`` is a dict mapping word -> occurrence count and that
    ``helpers.get_words(document)`` yields the document's tokens (semantics
    defined in the project's helpers module — not visible here).
    """
    for word in helpers.get_words(document):
        # dict.get collapses the original membership-test-plus-two-branches
        # into a single update; missing words start from 0.
        self.tokens[word] = self.tokens.get(word, 0) + 1
def test_get_words(self):
    """get_words should return exactly the 4 distinct keys across both bags."""
    bags_of_words = [{'aa': 2, 'bb': 1, 'cc': 3}, {'cc': 2, 'dd': 1}]
    words = helpers.get_words(bags_of_words)
    print(words)
    # one distinct word per key, duplicates ('cc') collapsed
    assert len(words) == 4
    for expected in ('aa', 'bb', 'cc', 'dd'):
        assert expected in words
import helpers
import sys
from representations.sequentialembedding import SequentialEmbedding
""" Let's examine the closest neighbors for a word over time """
import collections
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

# Words whose neighborhoods will be plotted; the unqualified call suggests a
# default word list inside helpers — TODO confirm.
WORDS = helpers.get_words()

if __name__ == "__main__":
    embeddings = helpers.load_embeddings()
    for word1 in WORDS:
        # Per-period similarity data for word1; exact structure is defined by
        # helpers.get_time_sims — not visible here.
        time_sims, lookups, nearests, sims = helpers.get_time_sims(embeddings, word1)
        helpers.clear_figure()
        # we remove word1 from our words because we just want to plot the different
        # related words
        # NOTE(review): keys appear to look like "word|period", hence the
        # split on "|" — verify against helpers.get_time_sims.
        words = filter(lambda word: word.split("|")[0] != word1, lookups.keys())
        values = [lookups[word] for word in words]
        # Project the collected vectors to 2-D for plotting.
        fitted = helpers.fit_tsne(values)
        # NOTE(review): the source chunk is truncated here — the body of this
        # `if` is not visible in this view.
        if not len(fitted):
"""Skip-gram-style trainer: learn word embeddings by predicting nearby words."""
import random

# BUG FIX: `nn` and `optim` were used below but never imported in the
# original, which would raise NameError at class-definition time.
import torch.nn as nn
import torch.optim as optim

import helpers
from hyperparameters import hps


class NearbyWords(nn.Module):
    """Two linear layers: vocab one-hot -> embedding -> vocab logits.

    The bias-free hidden layer's weight matrix serves as the embedding table.
    """

    def __init__(self, vocab_size, embed_size):
        super(NearbyWords, self).__init__()
        self.hidden = nn.Linear(vocab_size, embed_size, bias=False)
        self.output = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        hidden = self.hidden(x)
        output = self.output(hidden)
        return output


# Build vocabulary and (word, context) training pairs via project helpers;
# pair window of 3 — semantics defined in helpers.get_training_pairs.
words = helpers.get_words(hps['filename'])
vocab = set(words)
word_to_int, int_to_word = helpers.get_word_mappings(words)
pairs = helpers.get_training_pairs(words, 3)

vocab_size = len(vocab)
embed_size = hps['embed_size']

model = NearbyWords(vocab_size, embed_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=hps['learning_rate'])

for t in range(hps['epochs']):
    # Train on a fresh random subset of pairs each epoch.
    pair_sample = random.sample(pairs, 1000)
    losses = []
    # NOTE(review): the source chunk is truncated here — the rest of the
    # epoch loop body is not visible in this view.
import helpers
import sys
from representations.sequentialembedding import SequentialEmbedding
""" Let's examine the closest neighbors for a word over time """
import collections
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

# Words whose neighborhoods will be plotted; the unqualified call suggests a
# default word list inside helpers — TODO confirm.
WORDS = helpers.get_words()

if __name__ == "__main__":
    embeddings = helpers.load_embeddings()
    for word1 in WORDS:
        # Per-period similarity data for word1; exact structure is defined by
        # helpers.get_time_sims — not visible here.
        time_sims, lookups, nearests, sims = helpers.get_time_sims(embeddings, word1)
        helpers.clear_figure()
        # we remove word1 from our words because we just want to plot the different
        # related words
        # NOTE(review): keys appear to look like "word|period", hence the
        # split on "|" — verify against helpers.get_time_sims.
        words = filter(lambda word: word.split("|")[0] != word1, lookups.keys())
        values = [ lookups[word] for word in words ]
        # Project the collected vectors to 2-D for plotting.
        fitted = helpers.fit_tsne(values)
        # NOTE(review): the source chunk is truncated here — the body of this
        # `if` is not visible in this view.
        if not len(fitted):
import json
import math
from pprint import pprint
from helpers import pretty_date, get_board, get_words

# word -> {"average_time": float, "freq": int, "puzzles": [puzzle ids]}
frequencies = {}
puzzle_dates = {}

with open("data.json", 'r') as f:
    data = json.loads(f.read())

for board in data:
    # Judging by the indexing below, get_words returns a dict with 'across'
    # and 'down' lists of (word, time) tuples — TODO confirm against helpers.
    words = get_words(board['board']['cells'])
    for word_tup in words['across']:
        if not word_tup[0] in frequencies:
            frequencies[word_tup[0]] = {
                "average_time": 0,
                "freq": 0,
                "puzzles": []
            }
        frequencies[word_tup[0]]['freq'] += 1
        # NOTE(review): this incremental average looks wrong for freq >= 3.
        # A correct running mean is (avg * (freq - 1) + new) / freq, but this
        # divides (avg + new) by freq — it happens to be right only for the
        # first two occurrences. Verify intent before relying on it.
        frequencies[word_tup[0]]['average_time'] = (
            frequencies[word_tup[0]]['average_time'] +
            word_tup[1]) / frequencies[word_tup[0]]['freq']
        frequencies[word_tup[0]]['puzzles'].append(board['puzzle_id'])
    for word_tup in words['down']:
        if not word_tup[0] in frequencies:
            # NOTE(review): the source chunk is truncated here, mid dict
            # literal — the remainder mirrors the 'across' branch above.
            frequencies[word_tup[0]] = {
import sys
import pickle
import os
import helpers

""" Train data and store model to file """

# Dataset location comes from the first CLI argument.
train_dataset_path = os.path.abspath(sys.argv[1])
print('Dataset path: {}'.format(train_dataset_path))

print('Loading dataset...')
train_target, train_data = helpers.load_dataset(train_dataset_path)

# Derive the training inputs: per-document word bags, the vocabulary,
# and the label set.
bags_of_words = helpers.create_bags_of_words(train_data)
words = helpers.get_words(bags_of_words)
labels = helpers.get_labels(train_target)

# Ensure the output directory exists before training starts.
model_path = os.path.join(os.getcwd(), 'model')
if not os.path.exists(model_path):
    os.mkdir(model_path)

print('Training data...')
label_probs, probs_per_label = helpers.train(
    bags_of_words, words, train_target, labels)

print('Storing model...')
pickle_path = os.path.join(model_path, 'train.pickle')
with open(pickle_path, 'wb') as out:
    pickle.dump((label_probs, probs_per_label, words, labels), out)

print('Training done.')
print('============== INFO ===============')