def load_model(embedding_size, vocab_size, num_steps, dataset):
    reverse_filename = get_reverse_filename(embedding_size, num_steps, vocab_size, dataset)
    embeddings_filename = get_embeddin_filename(embedding_size, num_steps, vocab_size, dataset)
    reverse_dictionary = load_pickle_file(reverse_filename)
    final_embeddings = load_pickle_file(embeddings_filename)
    dict_filename = get_dict_filename(embedding_size, num_steps, vocab_size, dataset)
    dictionary = load_pickle_file(dict_filename)
    return reverse_dictionary, final_embeddings, dictionary

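
# Hypothetical usage sketch (not part of the original pipeline): the argument
# values below mirror the 50-dimensional, 1000-word, 100001-step flowers model
# referenced by file paths elsewhere in this project.
def _demo_load_model():
    reverse_dictionary, final_embeddings, dictionary = load_model(50, 1000, 100001, "flowers")
    print "Vocabulary size:", len(dictionary)
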
def hist_plotter():
    colors = ['#F95400', '#0C56A2', '#F9DC00', '#00A670', '#C60074']
    seqgan_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/seqgan_score_dict.p")
    emb_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/emb_score_dict.p")
    color_seqgan = colors.pop(0)
    color_emb = colors.pop(0)
    seqgan_intervals = []
    seqgan_intervals_uniq = []
    for (s, (b, n)) in seqgan_dict.iteritems():
        seqgan_intervals_uniq.append(b)
        for _ in range(n):
            seqgan_intervals.append(b)
    emb_intervals = []
    emb_intervals_uniq = []
    for (s, (b, n)) in emb_dict.iteritems():
        emb_intervals_uniq.append(b)
        for _ in range(n):
            emb_intervals.append(b)
    num_bins = 10
    fig, ax = plt.subplots()
    plt.rc('font', family='Arial')
    # Histogram of the score distributions for both models
    data = np.vstack([seqgan_intervals, emb_intervals]).T
    ax.hist(data, num_bins, color=[color_seqgan, color_emb],
            label=["Baseline", "Our model"])
    # ax.hist(data_uniq, num_bins)
    # ax.hist(emb_intervals, num_bins, normed=1)
    ax.set_xlabel(u'β')
    ax.set_ylabel('Count')
    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    plt.legend()
    plt.show()

def get_word_embeddings(conf):
    if conf.WORD_EMBEDDING_METHOD == 'glove':
        embeddings_index = {}
        f = open('data/embeddings/glove.6B.300d.txt')
        count = 0
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
            count += 1
            if count % 100 == 0:
                print_progress(count, 400000, prefix="Getting glove word embeddings")
        f.close()
        return embeddings_index
    elif conf.WORD_EMBEDDING_METHOD == 'word2vec':
        embedding_dict_name = "word2vec/saved_models/word2vec_%sd%svoc100001steps_dict_%s.pkl" % (
            conf.EMBEDDING_DIMENSION, conf.NB_WORDS,
            conf.DATASET if conf.DATASET is not None else "flickr")
        return load_pickle_file(embedding_dict_name)
    print("WORD_EMBEDDING_METHOD not found")
    return None

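
# Hypothetical usage sketch: get_word_embeddings only needs a `conf` object that
# exposes these four attributes, so a throwaway stand-in is enough here. The
# attribute values are assumptions that mirror the flowers word2vec model used
# elsewhere in this project.
def _demo_get_word_embeddings():
    class _Conf(object):
        WORD_EMBEDDING_METHOD = 'word2vec'
        EMBEDDING_DIMENSION = 50
        NB_WORDS = 1000
        DATASET = "flowers"

    word_embedding_dict = get_word_embeddings(_Conf())
    print "Loaded %d word vectors" % len(word_embedding_dict)
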
def preprocess_sentences(config, sentences):
    sos_token = "<sos>"
    eos_token = "<eos>"
    pad_token = "<pad>"
    if config[Conf.WORD_EMBEDDING] == WordEmbedding.GLOVE:
        print "Loading Glove dictionary..."
        word_embedding_dict = get_word_embeddings()
        sos_token = "<"
        eos_token = ">"
        pad_token = "="
    else:
        filename = get_dict_filename(config[Conf.EMBEDDING_SIZE], config[Conf.WORD2VEC_NUM_STEPS],
                                     config[Conf.VOCAB_SIZE], config[Conf.W2V_SET])
        print "Loading Word2Vec dictionary (%s)..." % filename
        # word_embedding_dict = load_pickle_file("word2vec/saved_models/word2vec_%sd%svoc%ssteps_dict.pkl" % (config[Conf.EMBEDDING_SIZE], config[Conf.VOCAB_SIZE], config[Conf.WORD2VEC_NUM_STEPS]))
        word_embedding_dict = load_pickle_file(filename)
    word_list_sentences = []
    for sentence in sentences:
        word_list = [sos_token]
        for word in sentence.split(" "):
            word_list.append(word.lower())
        word_list.append(eos_token)
        while len(word_list) < config[Conf.MAX_SEQ_LENGTH]:
            word_list.append(pad_token)
        word_list_sentences.append(word_list)
    # word_list_sentences = [[word.lower() for word in sentence.split(" ")] for sentence in sentences]
    return np.asarray(word_list_sentences), word_embedding_dict

def get_word_embedding_matrix(word_to_id, embedding_dim):
    embeddings_dict = load_pickle_file(
        'word2vec/saved_models/word2vec_50d1000voc100001steps_dict_flowers.pkl')
    embedding_matrix = numpy.zeros((len(word_to_id) + 1, embedding_dim))
    for word, i in word_to_id.items():
        if word in embeddings_dict:
            # Rows for words without a pretrained vector stay all-zero
            embedding_matrix[i] = embeddings_dict[word]
    return embedding_matrix

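
# Hypothetical usage sketch: the two-word vocabulary below is made up, and the
# embedding dimension matches the 50d dictionary hard-coded above.
def _demo_get_word_embedding_matrix():
    word_to_id = {"flower": 1, "petals": 2}
    embedding_matrix = get_word_embedding_matrix(word_to_id, 50)
    print embedding_matrix.shape  # (3, 50); row 0 is left as zeros
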
def generate_input_noise(config):
    if config[Conf.PREINIT] == PreInit.ENCODER_DECODER:
        if config[Conf.WORD_EMBEDDING] == WordEmbedding.ONE_HOT:
            noise_size = config[Conf.VOCAB_SIZE]
        else:
            noise_size = config[Conf.EMBEDDING_SIZE]
    else:
        noise_size = config[Conf.NOISE_SIZE]

    if config[Conf.NOISE_MODE] == NoiseMode.REPEAT:
        noise_matrix = np.zeros((config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size))
        for batch_index in range(config[Conf.BATCH_SIZE]):
            word_noise = np.random.normal(size=noise_size)
            for word_index in range(config[Conf.MAX_SEQ_LENGTH]):
                noise_matrix[batch_index][word_index] = word_noise
        return noise_matrix
    elif config[Conf.NOISE_MODE] == NoiseMode.REPEAT_SINGLE:
        noise_matrix = np.zeros((config[Conf.BATCH_SIZE], noise_size))
        for batch_index in range(config[Conf.BATCH_SIZE]):
            noise_matrix[batch_index] = np.random.normal(size=noise_size)
        return noise_matrix
    elif config[Conf.NOISE_MODE] == NoiseMode.NEW:
        return np.random.rand(config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size)
    elif config[Conf.NOISE_MODE] == NoiseMode.FIRST_ONLY:
        noise_matrix = np.zeros((config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size))
        for batch_index in range(config[Conf.BATCH_SIZE]):
            word_noise = np.random.normal(size=noise_size)
            for word_index in range(config[Conf.MAX_SEQ_LENGTH]):
                noise_matrix[batch_index][word_index] = word_noise
        for batch_index in range(config[Conf.BATCH_SIZE]):
            if random.random() < 0.5:
                word_noise = np.zeros(noise_size)
            else:
                word_noise = np.ones(noise_size)
            noise_matrix[batch_index][0] = word_noise
        return noise_matrix
    elif config[Conf.NOISE_MODE] == NoiseMode.ONES:
        return np.ones((config[Conf.BATCH_SIZE], config[Conf.MAX_SEQ_LENGTH], noise_size))
    elif config[Conf.NOISE_MODE] == NoiseMode.ENCODING:
        embedded_data = load_pickle_file(
            "sequence_to_sequence/logs/S2S_2EMB_2017-04-04_VS2+1000_BS128_HD30_DHL1_ED50_SEQ5_WEMword2vec/encoded_data.pkl")
        random_distribution_of_embedded_data = []
        for i in range(config[Conf.BATCH_SIZE]):
            # random_distribution_of_embedded_data.append(embedded_data[np.random.randint(0, len(embedded_data))])
            random_distribution_of_embedded_data.append(embedded_data[i])
        return np.asarray(random_distribution_of_embedded_data)

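
# Hypothetical usage sketch: the config values mirror the BS128 / SEQ5 / ED50
# settings that appear in log directory names elsewhere in this project, and the
# enum members are the ones generate_input_noise already checks for.
def _demo_generate_input_noise():
    config = {
        Conf.PREINIT: PreInit.ENCODER_DECODER,
        Conf.WORD_EMBEDDING: WordEmbedding.GLOVE,
        Conf.EMBEDDING_SIZE: 50,
        Conf.NOISE_MODE: NoiseMode.REPEAT,
        Conf.BATCH_SIZE: 128,
        Conf.MAX_SEQ_LENGTH: 5,
    }
    noise = generate_input_noise(config)
    print noise.shape  # expected: (128, 5, 50), one noise vector repeated per word
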
def compare_distributions():
    perplexity = 15
    data = "encoded"
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=5000)
    suffix = "_lambda"
    # suffix = ""
    if data == "encoded":
        embs = load_pickle_file(
            "sequence_to_sequence/logs/NORM_S2S_2EMB_2017-04-07_VS2+1000_BS128_HD40_DHL1_ED50_SEQ5_WEMword2vec/encoded_data_lambda.pkl")
        embs = embs[:1000, 0, :]
    elif data == "word2vec":
        embs = load_pickle_file("word2vec/saved_models/word2vec_50d1000voc100001steps_embs.pkl")
        embs = embs[:1000, :]
    # Reference samples drawn from a standard Gaussian with the same shape as the embeddings
    random_gaussian = numpy.random.normal(size=embs.shape)
    # embs = normalize(embs, norm="l2")
    combined = numpy.append(random_gaussian, embs, axis=0)
    embs_pca = tsne.fit_transform(combined)
    plot_collections([embs_pca[:1000], embs_pca[1000:]], ["gaussian-yellow", data + "-blue"],
                     perplexity, suffix)

def gen_class_embs():
    # create_common_words_pickle()
    print "Generating classes"
    common_words = load_pickle_file("common_words.p")
    print "Loading captions..."
    filename_caption_text_tuples = fetch_all_caption_text_tuples()[:5000]
    print "Loading word embeddings..."
    word_embedding_dict = dict(fetch_all_word_vectors())
    filename_text_vector_tuples = []
    tot = len(filename_caption_text_tuples)
    counter = 1
    print_progress(counter, tot, prefix="Converting classes to embs")
    for filename, caption in filename_caption_text_tuples:
        classes = get_classes(caption, common_words)
        filename_text_vector_tuples.extend(
            [(filename, c, word_embedding_dict[c]) for c in classes if c in word_embedding_dict])
        counter += 1
        print_progress(counter, tot, prefix="Converting classes to embs")
    save_class_vector_list(filename_text_vector_tuples)

def wmd_retrieval(pred_strings, dataset_string_list_sentences):
    filename = get_dict_filename(config[Conf.EMBEDDING_SIZE], config[Conf.WORD2VEC_NUM_STEPS],
                                 config[Conf.VOCAB_SIZE], config[Conf.W2V_SET])
    word_embedding_dict = load_pickle_file(filename)
    best_sentence_lists = []
    for pred_string in pred_strings:
        score_tuples = []
        for dataset_string_list_sentence in dataset_string_list_sentences:
            dataset_string = " ".join(dataset_string_list_sentence)
            score = get_wmd_distance(pred_string, dataset_string, word_embedding_dict)
            score_tuples.append((dataset_string, score))
        # Lower word mover's distance means a closer match, so sort ascending
        score_tuples = sorted(score_tuples, key=lambda x: x[1], reverse=False)
        result = [x[0] for x in score_tuples[:5]]
        best_sentence_lists.append(result)
    return best_sentence_lists

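
# Hypothetical usage sketch: assumes the module-level `config` dict that
# wmd_retrieval reads for the word2vec dictionary filename is already populated;
# the sentences below are made up.
def _demo_wmd_retrieval():
    pred_strings = ["this flower has yellow petals"]
    dataset_sentences = [["this", "flower", "has", "red", "petals"],
                         ["a", "small", "yellow", "bird"]]
    best_sentence_lists = wmd_retrieval(pred_strings, dataset_sentences)
    print best_sentence_lists[0]  # up to five closest dataset sentences by WMD
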
def background_wmd_retrieval(pred_strings, dataset_string_list_sentences):
    filename = get_dict_filename(config[Conf.EMBEDDING_SIZE], config[Conf.WORD2VEC_NUM_STEPS],
                                 config[Conf.VOCAB_SIZE], config[Conf.W2V_SET])
    word_embedding_dict = load_pickle_file(filename)
    counter = Value('i', 0)
    sentence_count = len(pred_strings)
    cpu_count = multiprocessing.cpu_count()
    print "CPUs:", cpu_count
    if 8 < cpu_count < 15:
        cpu_count = 10
    elif cpu_count > 40:
        cpu_count = 40
    print "Starting pool with %s processes" % cpu_count
    pool = Pool(cpu_count, initializer=init, initargs=(counter,))
    tuple_array = [(pred_string, dataset_string_list_sentences, word_embedding_dict, sentence_count)
                   for pred_string in pred_strings]
    best_sentence_lists = pool.map(background_wmd, tuple_array, chunksize=1)
    pool.close()
    pool.join()
    return best_sentence_lists

def plotter():
    colors = ['#F95400', '#0C56A2', '#F9DC00', '#00A670', '#C60074']
    seqgan_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/seqgan_score_dict.p")
    emb_dict = load_pickle_file(
        "/Users/markus/workspace/master/Master/emb_score_dict.p")
    color_seqgan = colors.pop(0)
    colors.pop(0)
    color_emb = colors.pop(0)
    plt.rc('font', family='Arial')
    buckets = 10
    plt.rcParams.update({'font.size': 20})
    seqgan_count = [0 for _ in range(buckets)]
    seqgan_count_uniq = [0 for _ in range(buckets)]
    emb_count = [0 for _ in range(buckets)]
    emb_count_uniq = [0 for _ in range(buckets)]
    for (n, (c, u)) in seqgan_dict.iteritems():
        seqgan_count[int(c * 10 - 1)] += u
        seqgan_count_uniq[int(c * 10 - 1)] += 1
    for (n, (c, u)) in emb_dict.iteritems():
        emb_count[int(c * 10 - 1)] += u
        emb_count_uniq[int(c * 10 - 1)] += 1
    ind = np.arange(buckets)  # the x locations for the groups
    # width = 0.35  # the width of the bars
    width = 0.49  # the width of the bars
    alpha_uniq = 0.4
    fig, ax = plt.subplots()
    # axes.set_xlim([0.5, 1.0])
    # axes.set_ylim([ymin, ymax])
    seqgan_bars = ax.bar(ind, seqgan_count, width, color=color_seqgan)
    seqgan_bars_uniq = ax.bar(ind, seqgan_count_uniq, width, color='black', alpha=alpha_uniq)
    emb_bars = ax.bar(ind + width, emb_count, width, color=color_emb)
    emb_bars_uniq = ax.bar(ind + width, emb_count_uniq, width, color='black', alpha=alpha_uniq)
    # Labels, title and axes ticks
    ax.set_ylabel('Count')
    ax.set_xlabel(u'β')
    ax.set_xticks(ind + width / 2)
    x_tick_labels = []
    for i in range(buckets):
        x_tick_labels.append("%.1f-%.1f" % (float(i) / buckets, float(i + 1) / buckets))
        # x_tick_labels.append("%.1f" % (float(i + 1) / buckets))
    ax.set_xticklabels(x_tick_labels)
    # ax.set_xlabel(x_tick_labels)
    plt.tick_params(
        axis='x',
        which='both',
        bottom='off',
    )
    ax.legend((seqgan_bars[0], emb_bars[0]), ('SeqGan', 'Word Embedding Model'), fontsize=20)
    autolabel(seqgan_bars_uniq, ax, seqgan_count)
    autolabel(emb_bars_uniq, ax, emb_count)
    plt.show()