def do_search(word1):
    """Return the search results for ``word1``, computing them on a cache miss.

    ``word1`` may hold several query terms joined by ``:``. Each term's
    neighbours come from ``helpers.get_time_sims`` and are memoised in the
    module-level ``term_cache``; the assembled result list is memoised in
    ``search_cache`` under the full ``word1`` string.

    Lookup keys are expected to look like ``"<word>|<decade>"`` (they are
    split on ``"|"`` and the second part is converted with ``int``).

    Returns a dict ``{"term": word1, "results": [...]}`` where every result
    carries the word, the query terms that produced it, the year, the
    per-term similarities, their average and sum, and a 2-D position taken
    from ``helpers.fit_tsne`` rounded to three decimals.
    """
    if word1 not in search_cache:
        embeddings = helpers.load_embeddings()

        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)

        for term in word1.split(":"):
            if term not in term_cache:
                term_cache[term] = helpers.get_time_sims(embeddings, term)
            else:
                # Pre-formatted so the output is the same under the Python 2
                # print statement and the Python 3 print function.
                print("USING CACHED NEIGHBORS FOR %s" % (term,))

            time_sims, lookups, nearests, sims = term_cache[term]
            for key in lookups:
                all_terms[key].append(term)
                all_sims[key].append(sims[key])
            all_lookups.update(lookups)

        # Materialise the keys: a Python 3 dict view cannot be indexed, and
        # the order must stay aligned with the rows handed to fit_tsne.
        keys = list(all_lookups)
        fitted = helpers.fit_tsne([all_lookups[key] for key in keys])

        # Stitch the parallel arrays together into one object per key.
        objs = []
        for i, key in enumerate(keys):
            ww, decade = key.split("|")
            key_sims = all_sims[key]
            total = sum(key_sims)
            objs.append({
                "word": ww,
                "query": all_terms[key],
                "year": int(decade),
                "similarity": key_sims,
                "avg_similarity": total / len(key_sims),
                "sum_similarity": total,
                "position": {
                    "x": round(fitted[i][0], 3),
                    "y": round(fitted[i][1], 3),
                },
            })

        search_cache[word1] = objs

    return {"term": word1, "results": search_cache[word1]}
def do_search(word1):
    """Look up ``word1`` and return ``{"term": word1, "results": [...]}``.

    A previously seen ``word1`` is answered straight from ``search_cache``.
    Otherwise the string is split on ``:`` into query terms, each term's
    neighbours are fetched through ``helpers.get_time_sims`` (memoised in
    ``term_cache``), all neighbours are projected with ``helpers.fit_tsne``,
    and one result object per ``"<word>|<decade>"`` key is built, cached and
    returned.
    """
    if word1 in search_cache:
        return {"term": word1, "results": search_cache[word1]}

    embeddings = helpers.load_embeddings()

    merged_lookups = {}
    sims_by_key = defaultdict(list)
    terms_by_key = defaultdict(list)

    for query_term in word1.split(":"):
        if query_term in term_cache:
            print("USING CACHED NEIGHBORS FOR", query_term)
        else:
            term_cache[query_term] = helpers.get_time_sims(embeddings, query_term)

        _, term_lookups, _, term_sims = term_cache[query_term]
        for key in term_lookups:
            terms_by_key[key].append(query_term)
            sims_by_key[key].append(term_sims[key])
        merged_lookups.update(term_lookups)

    ordered_keys = list(merged_lookups.keys())
    coords = helpers.fit_tsne([merged_lookups[key] for key in ordered_keys])

    # Combine the per-key pieces into the objects handed back to the caller.
    results = []
    for idx, key in enumerate(ordered_keys):
        token, decade = key.split("|")
        similarities = sims_by_key[key]
        total = sum(similarities)
        point = coords[idx]
        results.append({
            "word": token,
            "query": terms_by_key[key],
            "year": int(decade),
            "similarity": similarities,
            "avg_similarity": total / len(similarities),
            "sum_similarity": total,
            "position": {"x": round(point[0], 3), "y": round(point[1], 3)},
        })

    search_cache[word1] = results
    return {"term": word1, "results": search_cache[word1]}
def __init__(self):
    """Load the embeddings and split the loaded data into parallel lists.

    Each item returned by ``helpers.load_data()`` contributes its first
    three elements to ``questions``, ``responses`` and ``labels``
    respectively. ``batch_idx`` starts at 0.
    """
    self.batch_idx = 0
    self.questions, self.responses, self.labels = [], [], []
    self.embeddings = helpers.load_embeddings()

    records = helpers.load_data()
    for record in records:
        question, response, label = record[0], record[1], record[2]
        self.questions.append(question)
        self.responses.append(response)
        self.labels.append(label)
    # Drop the local reference to the raw data once it has been split up.
    del records
import helpers import sys from representations.sequentialembedding import SequentialEmbedding """ Let's examine the closest neighbors for a word over time """ import collections from sklearn.manifold import TSNE import numpy as np import matplotlib.pyplot as plt WORDS = helpers.get_words() if __name__ == "__main__": embeddings = helpers.load_embeddings() for word1 in WORDS: time_sims, lookups, nearests, sims = helpers.get_time_sims( embeddings, word1) helpers.clear_figure() # we remove word1 from our words because we just want to plot the different # related words words = filter(lambda word: word.split("|")[0] != word1, lookups.keys()) values = [lookups[word] for word in words] fitted = helpers.fit_tsne(values) if not len(fitted):
#x2 = x[len(q1):] # The models are not perfectly symmetric in the combination layer, so we can flip the order of the # questions to synthesize additional training examples # x1 = np.concatenate((x1_sliced, x2_sliced), axis=0) # x2 = np.concatenate((x2_sliced, x1_sliced), axis=0) # y = np.concatenate((y, y), axis=0) # x1_lengths = np.concatenate((q1_lengths, q2_lengths), axis=0) # x2_lengths = np.concatenate((q2_lengths, q1_lengths), axis=0) # Create word embeddings print "Loading word embeddings..." vocab_dict = vocab_processor.vocabulary_._mapping #print vocab_dict pretrained_embeddings = helpers.load_embeddings(FLAGS.embeddings_file, vocab_dict, FLAGS.embedding_dim, FLAGS.use_cached_embeddings) # Randomly shuffle data print "Shuffling data..." np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x1_shuffled = x1[shuffle_indices] x2_shuffled = x2[shuffle_indices] y_shuffled = y[shuffle_indices] q1_lengths_shuffled = x1_lengths[shuffle_indices] q2_lengths_shuffled = x2_lengths[shuffle_indices] # Split train/test set print "Splitting training/dev..."
from representations.sequentialembedding import SequentialEmbedding """ Let's examine the closest neighbors for a word over time """ import collections from sklearn.manifold import TSNE import numpy as np import matplotlib.pyplot as plt WORDS = helpers.get_words() if __name__ == "__main__": embeddings = helpers.load_embeddings() for word1 in WORDS: time_sims, lookups, nearests, sims = helpers.get_time_sims(embeddings, word1) helpers.clear_figure() # we remove word1 from our words because we just want to plot the different # related words words = filter(lambda word: word.split("|")[0] != word1, lookups.keys()) values = [ lookups[word] for word in words ] fitted = helpers.fit_tsne(values) if not len(fitted): print "Couldn't model word", word1 continue