def get_word_embeddings(conf):
    """Load word embeddings selected by conf.WORD_EMBEDDING_METHOD.

    Returns a dict mapping word -> vector for 'glove' (parsed from the
    glove.6B.300d text file) or 'word2vec' (loaded from a pickled dict),
    or None when the method is unknown.
    """
    if conf.WORD_EMBEDDING_METHOD == 'glove':
        embeddings_index = {}
        count = 0
        # 'with' guarantees the file is closed even if parsing raises
        with open('data/embeddings/glove.6B.300d.txt') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
                count += 1
                if count % 100 == 0:
                    # 400000 is the glove.6B vocabulary size
                    print_progress(count, 400000, prefix="Getting glove word embeddings")
        return embeddings_index
    elif conf.WORD_EMBEDDING_METHOD == 'word2vec':
        embedding_dict_name = "word2vec/saved_models/word2vec_%sd%svoc100001steps_dict_%s.pkl" % (
            conf.EMBEDDING_DIMENSION, conf.NB_WORDS,
            # identity check against None, not equality
            conf.DATASET if conf.DATASET is not None else "flickr")
        return load_pickle_file(embedding_dict_name)
    print("WORD_EMBEDDING_METHOD not found")
    return None
def find_n_most_similar_class(class_embedding, n=1):
    """Return the texts of the n classes whose vectors are closest to class_embedding.

    Distance is compare_vectors() (lower is better); the returned list is
    ordered best-first and has length n.
    """
    class_vector_pairs = fetch_all_word_vectors()
    # BUG FIX: these ranked lists were zero-initialized, so a candidate with a
    # positive MSE could never displace slots 1..n-1 (temp_mse < 0 is false).
    # Seeding with +inf lets the first n candidates always qualify, which also
    # makes the old explicit (and double-counted) first-pair seeding redundant.
    best_class_vector_mse_list = [float("inf") for _ in range(n)]
    best_class_text_list = ["" for _ in range(n)]
    best_class_vector_list = [[] for _ in range(n)]
    total_class = len(class_vector_pairs)
    counter = 1
    print_progress(counter, total_class, prefix="Searching for class")
    for temp_class_text, temp_class_vector in class_vector_pairs:
        temp_mse = compare_vectors(class_embedding, temp_class_vector)
        for index in range(len(best_class_vector_list)):
            if temp_mse < best_class_vector_mse_list[index]:
                # insert at the rank it beats; worst entry falls off the end
                best_class_vector_mse_list = insert_and_remove_last(index, best_class_vector_mse_list, temp_mse)
                best_class_text_list = insert_and_remove_last(index, best_class_text_list, temp_class_text)
                best_class_vector_list = insert_and_remove_last(index, best_class_vector_list, temp_class_vector)
                break
        counter += 1
        if counter % 100 == 0 or counter > total_class - 2:
            print_progress(counter, total_class, prefix="Searching for class")
    return best_class_text_list
def background_find_n_most_similar_vectors(tuple_array):
    """Worker: find the 5 sentences whose vectors are closest to pred_vector.

    tuple_array = (pred_vector, vector_list, sentence_list, sentence_count).
    Returns the 5 best sentences, best first, each joined into one string.
    """
    global counter
    pred_vector, vector_list, sentence_list, sentence_count = tuple_array
    n = 5
    # BUG FIX: the best-MSE slots were zero-initialized, so any candidate with
    # a positive distance could never fill slots 1..n-1.  Seeding with +inf
    # lets the first n candidates qualify and makes the old explicit first-item
    # seeding unnecessary.
    best_mse_list = [float("inf") for _ in range(n)]
    best_sentence_list = ["" for _ in range(n)]
    best_vector_list = [[] for _ in range(n)]
    for i in range(len(vector_list)):
        temp_mse = compare_vectors(pred_vector, vector_list[i])
        for index in range(n):
            if temp_mse < best_mse_list[index]:
                # insert at the rank it beats; worst entry falls off the end
                best_mse_list = insert_and_remove_last(index, best_mse_list, temp_mse)
                best_vector_list = insert_and_remove_last(index, best_vector_list, vector_list[i])
                best_sentence_list = insert_and_remove_last(index, best_sentence_list, sentence_list[i])
                break
    # counter is presumably a multiprocessing.Value shared across workers
    # (it exposes get_lock/.value) -- TODO confirm at the call site
    with counter.get_lock():
        counter.value += 1
        print_progress(counter.value, sentence_count, "Running Cosine distance")
    return [" ".join(x) for x in best_sentence_list]
def preprocess(img_path, num_images):
    """Load one image, resize to 299x299, and apply Inception preprocessing.

    Returns the image as a single-element batch array.  Also reports and
    advances the module-level progress counter (num_images is the total used
    for the progress display).
    """
    loaded = image.load_img(img_path, target_size=(299, 299))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    batch = preprocess_input(batch)
    global counter
    print_progress(counter, num_images, prefix="Processing images")
    counter += 1
    return batch
def convert_and_store():
    """Reduce all stored image vectors to 50 dimensions with PCA and persist them."""
    all_image_imgvec_pairs = fetch_all_image_vector_pairs()
    image_filenames = [pair[0] for pair in all_image_imgvec_pairs]
    image_vectors = np.asarray([pair[1] for pair in all_image_imgvec_pairs])
    image_count = len(image_filenames)
    # BUG FIX: the original `del x` relied on the list-comprehension variable
    # leaking into function scope (Python 2 only; NameError on Python 3).
    # Release the large pair list before fitting PCA to lower peak memory.
    del all_image_imgvec_pairs
    pca = PCA(n_components=50)
    pca_vectors = pca.fit_transform(image_vectors)
    for i in range(image_count):
        store_pca_vector_to_db(image_filenames[i], pca_vectors[i])
        print_progress(i, image_count, prefix="Storing PCA vectors")
def get_word_embeddings():
    """Load the 300-d GloVe embeddings from LSTM/glove.6B.300d.txt.

    Returns a dict mapping word -> float32 numpy vector.
    """
    embeddings_index = {}
    count = 0
    # 'with' guarantees the file is closed even if parsing raises
    with open('LSTM/glove.6B.300d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = numpy.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
            count += 1
            if count % 100 == 0:
                # 400000 is the glove.6B vocabulary size
                print_progress(count, 400000, prefix="Getting glove word embeddings")
    return embeddings_index
def run_inception():
    """Run every flower image through the Inception model and store its feature vector."""
    flowers_path = "/Users/markus/workspace/master/Master/data/datasets/flowers/"
    flowers_image_path = flowers_path + "jpg/"
    image_filenames = fetch_all_filenames(flowers_image_path)
    num_images = len(image_filenames)
    # FIX: Python-2-only `print "..."` statements replaced with single-argument
    # print(...) calls, which behave identically on Python 2 and 3.
    print("Fetch image paths")
    image_paths = [flowers_image_path + image_filename for image_filename in image_filenames]
    # NOTE(review): this materializes every preprocessed image in memory at
    # once; fine for small datasets, consider streaming for larger ones.
    processed_imgs = [preprocess(x, num_images) for x in image_paths]
    print("Loading model")
    inception = get_model()
    for i in range(num_images):
        img_vec = inception.predict(np.asarray(processed_imgs[i]))
        store_image_vector_to_db(image_filenames[i], img_vec[0])
        print_progress(i, num_images, prefix="Running images through model and storing...")
def background_wmd(tuple_args):
    """Worker: rank dataset sentences by Word Mover's Distance to pred_string.

    tuple_args = (pred_string, dataset_string_list_sentences,
                  word_embedding_dict, sentence_count).
    Returns the 5 closest dataset sentences, smallest WMD first.

    FIX: the parameter was named `tuple`, shadowing the builtin; renamed
    (workers are invoked positionally, e.g. via a pool map, so this is safe).
    Also removed dead commented-out code and the redundant reverse=False.
    """
    global counter
    pred_string, dataset_string_list_sentences, word_embedding_dict, sentence_count = tuple_args
    score_tuples = []
    for dataset_string_list_sentence in dataset_string_list_sentences:
        dataset_string = " ".join(dataset_string_list_sentence)
        score = get_wmd_distance(pred_string, dataset_string, word_embedding_dict)
        score_tuples.append((dataset_string, score))
    # ascending sort: smaller WMD means more similar
    score_tuples.sort(key=lambda pair: pair[1])
    result = [pair[0] for pair in score_tuples[:5]]
    # counter is presumably a multiprocessing.Value shared across workers
    # (it exposes get_lock/.value) -- TODO confirm at the call site
    with counter.get_lock():
        counter.value += 1
        print_progress(counter.value, sentence_count, "Running WMD")
    return result
def gen_class_embs():
    """Extract class words from captions and store (filename, class, vector) triples."""
    print("Generating classes")
    common_words = load_pickle_file("common_words.p")
    print("Loading captions...")
    # only the first 5000 caption tuples are processed
    filename_caption_text_tuples = fetch_all_caption_text_tuples()[:5000]
    print("Loading word embeddings...")
    word_embedding_dict = dict(fetch_all_word_vectors())
    filname_text_vector_tuples = []
    tot = len(filename_caption_text_tuples)
    counter = 1
    print_progress(counter, tot, prefix="Converting classes to embs")
    for filename, caption in filename_caption_text_tuples:
        classes = get_classes(caption, common_words)
        # FIX: `c in word_embedding_dict.keys()` scans a list on Python 2
        # (O(n) per lookup); membership on the dict itself is O(1).
        filname_text_vector_tuples.extend(
            [(filename, c, word_embedding_dict[c]) for c in classes if c in word_embedding_dict])
        counter += 1
        print_progress(counter, tot, prefix="Converting classes to embs")
    save_class_vector_list(filname_text_vector_tuples)
def read_flower_data():
    """Read data/datasets/all_flowers.txt and return all preprocessed tokens.

    Each line is stripped, run through preprocessing() with <sos>/<eos>
    markers added, and the resulting words are flattened into one list.

    FIX: the original docstring ("Extract the first file enclosed in a zip
    file...") was copy-pasted from unrelated code and described nothing this
    function does.
    """
    filename = 'data/datasets/all_flowers.txt'
    index = 0
    with open(filename) as f:
        data = []
        readlines = f.readlines()
        length = len(readlines)
        for line in readlines:
            index += 1
            sentence = line.strip()
            preprocessed_words = preprocessing(sentence, add_sos_eos=True)
            # flatten: collect every token of every sentence into one list
            data.extend(preprocessed_words)
            if index % 10000 == 0:
                print_progress(index, length, prefix='Read data:', suffix='Complete', barLength=50)
        return data
def cosine_distance_retrieval(pred_strings, dataset_string_list_sentences, word_embedding_dict):
    """For each predicted string, retrieve the dataset sentences whose single
    combined vectors are closest by cosine distance.

    Returns one list of joined sentence strings per prediction.
    """
    # collapse each dataset sentence to a single embedding vector
    dataset_emb_list_sentences = convert_to_emb_list(dataset_string_list_sentences, word_embedding_dict)
    dataset_vectors = [convert_vectors(emb_sentence) for emb_sentence in dataset_emb_list_sentences]
    # same for the predictions
    pred_emb_list_sentences = convert_to_word_embeddings(pred_strings, word_embedding_dict)
    pred_vectors = [convert_vectors(emb_sentence) for emb_sentence in pred_emb_list_sentences]
    best_sentence_lists = []
    tot_count = len(pred_vectors)
    for i, pred_vector in enumerate(pred_vectors):
        matches = find_n_most_similar_vectors(pred_vector, dataset_vectors, dataset_string_list_sentences)
        best_sentence_lists.append([" ".join(words) for words in matches])
        print_progress(i + 1, tot_count, "Calculating cosine distances")
    return best_sentence_lists