Example #1
def get_word_embeddings(conf):
    if conf.WORD_EMBEDDING_METHOD == 'glove':
        # Parse the pretrained GloVe file into a word -> vector dict.
        embeddings_index = {}
        count = 0
        with open('data/embeddings/glove.6B.300d.txt') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
                count += 1
                if count % 100 == 0:
                    print_progress(count,
                                   400000,
                                   prefix="Getting glove word embeddings")
        return embeddings_index

    elif conf.WORD_EMBEDDING_METHOD == 'word2vec':
        # Load a previously trained and pickled word2vec dictionary.
        embedding_dict_name = "word2vec/saved_models/word2vec_%sd%svoc100001steps_dict_%s.pkl" % (
            conf.EMBEDDING_DIMENSION, conf.NB_WORDS,
            conf.DATASET if conf.DATASET is not None else "flickr")
        return load_pickle_file(embedding_dict_name)

    print("WORD_EMBEDDING_METHOD not found")
    return None
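
Every example in this collection reports progress through a project-local print_progress helper whose definition is not included. Below is a minimal sketch of a console progress bar consistent with the call sites in these examples; the signature is inferred from usage and the exact formatting is an assumption.

import sys

def print_progress(iteration, total, prefix='', suffix='', barLength=30):
    # Hypothetical sketch: render a single-line console progress bar.
    # Signature inferred from the call sites in these examples.
    percent = 100.0 * iteration / float(total)
    filled = int(barLength * iteration // total)
    bar = '#' * filled + '-' * (barLength - filled)
    sys.stdout.write('\r%s |%s| %.1f%% %s' % (prefix, bar, percent, suffix))
    if iteration >= total:
        sys.stdout.write('\n')
    sys.stdout.flush()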
Example #2
def find_n_most_similar_class(class_embedding, n=1):
	class_vector_pairs = fetch_all_word_vectors()

	# Sentinels must be +inf so any real MSE can displace them; the original
	# zero-initialised sentinels could never be beaten by a non-negative MSE.
	# Pre-seeding the lists with the first pair is also dropped, since the
	# loop below visits every pair and would otherwise insert it twice.
	best_class_vector_mse_list = [float("inf") for _ in range(n)]
	best_class_text_list = ["" for _ in range(n)]
	best_class_vector_list = [[] for _ in range(n)]

	total_class = len(class_vector_pairs)
	counter = 1

	print_progress(counter, total_class, prefix="Searching for class")
	for temp_class_text, temp_class_vector in class_vector_pairs:
		temp_class_mse = compare_vectors(class_embedding, temp_class_vector)
		for index in range(len(best_class_vector_list)):
			if temp_class_mse < best_class_vector_mse_list[index]:
				best_class_vector_mse_list = insert_and_remove_last(index, best_class_vector_mse_list, temp_class_mse)
				best_class_text_list = insert_and_remove_last(index, best_class_text_list, temp_class_text)
				best_class_vector_list = insert_and_remove_last(index, best_class_vector_list, temp_class_vector)
				break
		counter += 1
		if counter % 100 == 0 or counter > total_class - 2:
			print_progress(counter, total_class, prefix="Searching for class")
	return best_class_text_list
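
This example and the next both rely on a project-local insert_and_remove_last helper that is not shown. Its usage implies it inserts a value at a given index and drops the last element, keeping the list at its fixed length n; a minimal sketch under that assumption:

def insert_and_remove_last(index, lst, value):
    # Assumed behaviour: insert `value` at `index`, then discard the last
    # element so the list keeps its fixed length of n best candidates.
    lst.insert(index, value)
    lst.pop()
    return lst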
Example #3
def background_find_n_most_similar_vectors(tuple_array):
    global counter

    pred_vector, vector_list, sentence_list, sentence_count = tuple_array

    n = 5
    # +inf sentinels so the first n real distances always enter the lists;
    # zero sentinels would never be beaten by a non-negative MSE. The loop
    # visits every vector, so no manual pre-seeding with the first pair is
    # needed (it would insert that pair twice).
    best_mse_list = [float("inf") for _ in range(n)]
    best_sentence_list = ["" for _ in range(n)]
    best_vector_list = [[] for _ in range(n)]

    for i in range(len(vector_list)):
        temp_vector = vector_list[i]
        temp_mse = compare_vectors(pred_vector, temp_vector)
        for index in range(len(best_vector_list)):
            if temp_mse < best_mse_list[index]:
                best_mse_list = insert_and_remove_last(index, best_mse_list,
                                                       temp_mse)
                best_vector_list = insert_and_remove_last(
                    index, best_vector_list, temp_vector)
                best_sentence_list = insert_and_remove_last(
                    index, best_sentence_list, sentence_list[i])
                break
    # `counter` is a multiprocessing.Value shared across worker processes.
    with counter.get_lock():
        counter.value += 1
    print_progress(counter.value, sentence_count, "Running Cosine distance")
    return [" ".join(x) for x in best_sentence_list]
Example #4
def preprocess(img_path, num_images):
	# Load and resize to the 299x299 input expected by InceptionV3, then
	# apply the model's own preprocessing (pixel scaling) on a batch of one.
	img = image.load_img(img_path, target_size=(299, 299))
	x = image.img_to_array(img)
	x = np.expand_dims(x, axis=0)
	x = preprocess_input(x)
	global counter
	print_progress(counter, num_images, prefix="Processing images")
	counter += 1
	return x
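
This snippet assumes the Keras image utilities, the InceptionV3 preprocessing function, and a module-level counter are in scope. These are the imports it most plausibly relies on; an assumption, since the original file header is not shown.

import numpy as np
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input

counter = 0  # module-level progress counter incremented by preprocess()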
Example #5
def convert_and_store():
    all_image_imgvec_pairs = fetch_all_image_vector_pairs()
    image_filenames = [x[0] for x in all_image_imgvec_pairs]
    image_vectors = np.asarray([x[1] for x in all_image_imgvec_pairs])
    image_count = len(image_filenames)
    # Free the raw pairs before fitting PCA on the stacked vectors.
    # (The original `del x` is dropped: comprehension variables do not leak
    # in Python 3, so it would raise a NameError.)
    del all_image_imgvec_pairs
    pca = PCA(n_components=50)
    pca_vectors = pca.fit_transform(image_vectors)
    for i in range(image_count):
        store_pca_vector_to_db(image_filenames[i], pca_vectors[i])
        print_progress(i + 1, image_count, prefix="Storing PCA vectors")
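
After fitting, scikit-learn's PCA exposes explained_variance_ratio_, which can be used to sanity-check how much information the 50 components retain. A short usage sketch, with random dummy data standing in for the image vectors:

from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=50)
pca.fit(np.random.rand(1000, 2048))  # dummy stand-in for the image vectors
# Fraction of the original variance captured by the 50 components.
print("retained variance: %.3f" % pca.explained_variance_ratio_.sum())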
Example #6
def get_word_embeddings():
	# Parse the pretrained GloVe file into a word -> vector dict.
	embeddings_index = {}
	count = 0
	with open('LSTM/glove.6B.300d.txt') as f:
		for line in f:
			values = line.split()
			word = values[0]
			coefs = numpy.asarray(values[1:], dtype='float32')
			embeddings_index[word] = coefs
			count += 1
			if count % 100 == 0:
				print_progress(count, 400000, prefix="Getting glove word embeddings")

	return embeddings_index
Example #7
def run_inception():
	flowers_path = "/Users/markus/workspace/master/Master/data/datasets/flowers/"
	flowers_image_path = flowers_path + "jpg/"

	image_filenames = fetch_all_filenames(flowers_image_path)
	num_images = len(image_filenames)
	print("Fetch image paths")
	image_paths = [flowers_image_path + image_filename for image_filename in image_filenames]
	processed_imgs = [preprocess(x, num_images) for x in image_paths]
	print("Loading model")
	inception = get_model()
	for i in range(num_images):
		# preprocess() already returns a batched ndarray, so it can be fed
		# to predict() directly; img_vec[0] is the single embedding.
		img_vec = inception.predict(processed_imgs[i])
		store_image_vector_to_db(image_filenames[i], img_vec[0])
		print_progress(i + 1, num_images, prefix="Running images through model and storing...")
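
get_model is not defined here. Since each image is stored as a fixed-length vector, it plausibly returns InceptionV3 truncated at its global-average-pooling layer; a sketch under that assumption (the 'avg_pool' layer name comes from the stock Keras InceptionV3):

from keras.applications.inception_v3 import InceptionV3
from keras.models import Model

def get_model():
    # Assumed: ImageNet-pretrained InceptionV3, cut at the pooling layer
    # so predict() yields a 2048-dimensional embedding per image.
    base = InceptionV3(weights='imagenet')
    return Model(inputs=base.input,
                 outputs=base.get_layer('avg_pool').output)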
Example #8
def background_wmd(task_tuple):
    # The argument is renamed from `tuple`, which shadows the built-in.
    global counter

    pred_string, dataset_string_list_sentences, word_embedding_dict, sentence_count = task_tuple
    score_tuples = []
    for dataset_string_list_sentence in dataset_string_list_sentences:
        dataset_string = " ".join(dataset_string_list_sentence)
        score = get_wmd_distance(pred_string, dataset_string,
                                 word_embedding_dict)
        score_tuples.append((dataset_string, score))
    # Sort ascending: a smaller Word Mover's Distance means more similar.
    score_tuples = sorted(score_tuples, key=lambda x: x[1])
    result = [x[0] for x in score_tuples[:5]]
    with counter.get_lock():
        counter.value += 1

    print_progress(counter.value, sentence_count, "Running WMD")
    return result
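
get_wmd_distance is project-local and not shown. One plausible implementation delegates to gensim, whose KeyedVectors.wmdistance computes Word Mover's Distance between two tokenised documents. The sketch below is an approximation: it loads a gensim model (the file path is hypothetical) rather than using the plain embedding dict the example passes around.

from gensim.models import KeyedVectors

# Hypothetical model file; the example's word_embedding_dict is unused here.
vectors = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)

def get_wmd_distance(pred_string, dataset_string, word_embedding_dict=None):
    # Word Mover's Distance between the two tokenised sentences;
    # lower values mean the sentences are semantically closer.
    return vectors.wmdistance(pred_string.split(), dataset_string.split())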
Example #9
def gen_class_embs():
	# create_common_words_pickle()
	print("Generating classes")
	common_words = load_pickle_file("common_words.p")
	print("Loading captions...")
	filename_caption_text_tuples = fetch_all_caption_text_tuples()[:5000]
	print("Loading word embeddings...")
	word_embedding_dict = dict(fetch_all_word_vectors())
	filename_text_vector_tuples = []
	tot = len(filename_caption_text_tuples)
	counter = 1
	print_progress(counter, tot, prefix="Converting classes to embs")
	for filename, caption in filename_caption_text_tuples:
		classes = get_classes(caption, common_words)
		# Membership test on the dict itself; `.keys()` would build a list
		# on Python 2 and make each lookup linear instead of constant time.
		filename_text_vector_tuples.extend([(filename, c, word_embedding_dict[c]) for c in classes if c in word_embedding_dict])
		counter += 1
		print_progress(counter, tot, prefix="Converting classes to embs")

	save_class_vector_list(filename_text_vector_tuples)
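
get_classes is not shown. From its arguments it appears to pick out the content words of a caption by filtering against the common-words list; a purely hypothetical sketch:

def get_classes(caption, common_words):
    # Hypothetical: treat every non-common word in the caption as a
    # candidate class label for the image.
    return [word for word in caption.lower().split()
            if word not in common_words]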
Example #10
def read_flower_data():
    # Read the flower captions file into a flat list of preprocessed words.
    # (The original docstring about "a zip file" was a leftover from the
    # TensorFlow word2vec tutorial and did not describe this function.)
    index = 0
    filename = 'data/datasets/all_flowers.txt'
    with open(filename) as f:
        data = []
        readlines = f.readlines()
        length = len(readlines)
        for line in readlines:
            index += 1
            sentence = line.strip()
            preprocessed_words = preprocessing(sentence, add_sos_eos=True)
            data.extend(preprocessed_words)
            if index % 10000 == 0:
                print_progress(index,
                               length,
                               prefix='Read data:',
                               suffix='Complete',
                               barLength=50)
    return data
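
preprocessing with add_sos_eos=True is not included. The flag suggests it tokenises the sentence and wraps it in start/end-of-sentence markers; a hypothetical sketch (the token spellings are assumptions):

import re

def preprocessing(sentence, add_sos_eos=False):
    # Hypothetical: lowercase, strip punctuation, split on whitespace,
    # optionally wrapping the tokens in sentence-boundary markers.
    tokens = re.sub(r"[^a-z0-9 ]", " ", sentence.lower()).split()
    if add_sos_eos:
        tokens = ["<sos>"] + tokens + ["<eos>"]
    return tokens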
Example #11
def cosine_distance_retrieval(pred_strings, dataset_string_list_sentences,
                              word_embedding_dict):
    # Embed every dataset sentence and collapse each one into a single
    # vector so it can be compared against the predictions.
    dataset_emb_list_sentences = convert_to_emb_list(
        dataset_string_list_sentences, word_embedding_dict)
    dataset_single_vector_sentences = [
        convert_vectors(sentence) for sentence in dataset_emb_list_sentences
    ]
    # Do the same for the predicted strings.
    pred_emb_list_sentences = convert_to_word_embeddings(
        pred_strings, word_embedding_dict)
    pred_single_vector_sentences = [
        convert_vectors(sentence) for sentence in pred_emb_list_sentences
    ]

    best_sentence_lists = []
    tot_count = len(pred_single_vector_sentences)
    for i in range(tot_count):
        pred_single_vector_sentence = pred_single_vector_sentences[i]
        best_sentence_list = find_n_most_similar_vectors(
            pred_single_vector_sentence, dataset_single_vector_sentences,
            dataset_string_list_sentences)
        best_sentence_lists.append([" ".join(x) for x in best_sentence_list])
        print_progress(i + 1, tot_count, "Calculating cosine distances")
    return best_sentence_lists
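
convert_vectors collapses a list of word embeddings into a single sentence vector. Averaging the word vectors is the most common choice for this, so here is a sketch under that assumption:

import numpy as np

def convert_vectors(embedding_list):
    # Assumed: represent a sentence as the mean of its word embeddings.
    return np.mean(np.asarray(embedding_list, dtype='float32'), axis=0)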