Example #1
def get_embedding_complex(text, word_weights):
    # Build a topic-space embedding for a multi-word query. Relies on
    # module-level globals: embedding, model, num_topics, tfidf_model and
    # tfidf_dictionary, plus the text2topics helper module.
    words = text.split(' ')
    topics = np.zeros(num_topics)

    if embedding == 'LDA':
        for w in words:
            w_topics = text2topics.LDA(w, model, num_topics)
            topics = topics + w_topics
        topics = topics / len(words)

    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model,
                                           num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics,
                                            tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)

    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)

    return topics
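The 'LDA' branch embeds a multi-word query by averaging the per-word topic distributions. Below is a minimal, self-contained sketch of just that averaging step; lda_word_topics is a hypothetical stub standing in for text2topics.LDA, which is not shown in these examples.

import numpy as np

# Hypothetical stub for text2topics.LDA: returns a per-word topic
# distribution (non-negative, sums to 1). Values here are synthetic.
def lda_word_topics(word, num_topics):
    rng = np.random.default_rng(abs(hash(word)) % (2 ** 32))
    v = rng.random(num_topics)
    return v / v.sum()

def average_topics(text, num_topics=10):
    words = text.split(' ')
    topics = np.zeros(num_topics)
    for w in words:
        topics += lda_word_topics(w, num_topics)
    return topics / len(words)  # mean of the per-word distributions

print(average_topics('snow ski'))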
Example #2
def get_results_complex(database, text, word_weights, num_results, results_path):
    # Embed a multi-word query, rank the database by dot product and copy
    # the top num_results images into results_path. Uses the same
    # module-level globals as get_embedding_complex.
    words = text.split(' ')
    topics = np.zeros(num_topics)

    if embedding == 'LDA':
        for w in words:
            w_topics = text2topics.LDA(w, model, num_topics)
            topics = topics + w_topics
        topics = topics / len(words)

    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model, num_topics)


    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics, tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)

    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)


    # Create empty dict for distances
    distances = {}
    print "Max text query: " + str(max(topics))
    print "Max total query: " + str(max(topics))

    # Compute distances
    for id in database:
        distances[id] = np.dot(database[id], topics)

    # Sort dictionary
    distances = sorted(distances.items(), key=operator.itemgetter(1), reverse=True)

    # Get the elements with the highest similarity scores
    for idx, id in enumerate(distances):
        # Copy image results
        if test_dataset == 'webvision':
            copyfile('../../../datasets/WebVision/test_images_256/' + id[0], results_path + id[0].replace('/', '_'))
        else:
            copyfile('../../../datasets/SocialMedia/img_resized_1M/cities_instagram/' + id[0] + '.jpg', results_path + id[0].replace('/', '_') + '.jpg')
        if idx == num_results - 1: break
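Despite the "distances" naming, the ranking step scores each database entry by dot product and sorts in descending order, so larger values mean better matches. A self-contained sketch of that ranking with synthetic data:

import numpy as np

# Toy id -> embedding database; real ids map to image paths.
rng = np.random.default_rng(0)
database = {'img_%d' % i: rng.random(5) for i in range(100)}
query = rng.random(5)

scores = {img_id: float(np.dot(vec, query)) for img_id, vec in database.items()}
top_k = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]
print(top_k)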
Example #3
                used_words += 1
        topics = topics / used_words

    elif embedding == 'word2vec_mean':
        word_weights = 0
        topics = text2topics.word2vec_mean(text_query, word_weights, model, num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text_query, model, num_topics, tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text_query, model, num_topics)


    elif embedding == 'glove':
        word_weights = 0
        topics = text2topics.glove(text_query, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text_query, model, num_topics)

    else:
        print("Select a correct embedding")
        raise SystemExit(0)

    distances = {}

    for id in database:
        distances[id] = np.linalg.norm(database[id]-topics)

    correct = 0
    precisions = []
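The snippet is cut off before the evaluation loop, but the correct/precisions variables suggest a precision-style metric over the ranked results. A hedged sketch of a precision@k computation consistent with the Euclidean-distance ranking above; the database, query, and relevance labels here are synthetic.

import numpy as np

def precision_at_k(ranked_ids, relevant_ids, k=10):
    hits = sum(1 for i in ranked_ids[:k] if i in relevant_ids)
    return hits / float(k)

rng = np.random.default_rng(1)
database = {i: rng.random(4) for i in range(50)}
topics = rng.random(4)
# Euclidean distance: smaller is better, so sort ascending.
ranked = sorted(database, key=lambda i: np.linalg.norm(database[i] - topics))
print(precision_at_k(ranked, relevant_ids={0, 3, 7, 12}, k=10))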
Example #4
def get_results_complex(database, text, word_weights, im_id, num_results):
    # COCO variant: returns True if im_id appears among the top
    # num_results retrieved images.

    words = text.split(' ')
    topics = np.zeros(num_topics)

    if embedding == 'LDA':
        for w in words:
            w_topics = text2topics.LDA(w, model, num_topics)
            topics = topics + w_topics
        topics = topics / len(words)

    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model,
                                           num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics,
                                            tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)

    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)

    if FC:
        topics = topics - min(topics)
        if max(topics) > 0:
            topics = topics / max(topics)
            topics = get_NN_txt_embedding.get_NN_txt_embedding(text_NN, topics)

    topics = topics - min(topics)
    topics = topics / max(topics)

    # Create empty dict for distances
    distances = {}

    # Compute distances
    for id in database:
        distances[id] = np.dot(database[id], topics)

    # Sort dictionary
    distances = sorted(distances.items(),
                       key=operator.itemgetter(1),
                       reverse=True)

    # Get the elements with the highest similarity scores
    results = []
    for idx, id in enumerate(distances):
        copyfile('../../../datasets/COCO/val2014/' + id[0],
                 '../../../datasets/COCO/rr/' + id[0].replace('/', '_'))
        results.append(int(id[0][13:-4]))
        if idx == num_results - 1: break

    return im_id in results
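Both the FC branch and the final two lines apply a min-max rescaling: shift the vector to be non-negative, then divide by the new maximum. The last occurrence divides without a guard, so a constant vector would trigger a division by zero; a small sketch with the guard included:

import numpy as np

def minmax(v):
    v = v - v.min()
    m = v.max()
    return v / m if m > 0 else v  # guard: constant input stays all-zero

print(minmax(np.array([-2.0, 0.0, 3.0])))  # [0.  0.4 1. ]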
Example #5
for cur_q in q:
    cur_w = '1 1'

    if len(cur_q.split(' ')) == 1:

        if embedding == 'LDA':
            topics = text2topics.LDA(cur_q, model, num_topics)
        elif embedding == 'word2vec_mean':
            topics = text2topics.word2vec_mean(cur_q, cur_w, model, num_topics)
        elif embedding == 'doc2vec':
            topics = text2topics.doc2vec(cur_q, model, num_topics)
        elif embedding == 'word2vec_tfidf':
            topics = text2topics.word2vec_tfidf(cur_q, model, num_topics,
                                                tfidf_model, tfidf_dictionary)
        elif embedding == 'glove':
            topics = text2topics.glove(cur_q, cur_w, model, num_topics)
        elif embedding == 'glove_tfidf':
            topics = text2topics.glove_tfidf(cur_q, model, num_topics)

    else:
        topics = get_embedding_complex(cur_q, cur_w)

    # Normalize by max to compare with net
    topics = topics - min(topics)
    topics = topics / max(topics)

    f.write('word_images/' + cur_q.replace(' ', '_'))
    for t in topics:
        f.write(',' + str(t))
    f.write('\n')
Example #6
q.append('snow ski')
q.append('rain umbrella')

q.append('icecream beach')
q.append('chocolate cake')
q.append('pizza wine')

q.append('woman bag')
q.append('man boat')
q.append('kid dog')

out_file = open(out_file_path, 'w')

for e,cur_q in enumerate(q):
    if len(cur_q.split(' ')) == 1:
        topics = text2topics.glove(cur_q, '1', model, num_topics)
    if len(cur_q.split(' ')) == 2:
        topics = text2topics.glove(cur_q, '0.5 0.5', model, num_topics)
    out_file.write(cur_q)
    for t in topics:
        out_file.write(',' + str(t))
    out_file.write('\n')  # terminate the row; without this all rows run together
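Both export loops (this one and Example #5's) assemble comma-separated rows by hand. A sketch of the same export with the csv module, which makes the delimiter and row terminator explicit; the file name and data below are illustrative.

import csv

# Illustrative query -> embedding pairs; real vectors come from the
# embedding branches above.
queries = {'snow ski': [0.1, 0.9], 'chocolate cake': [0.7, 0.3]}
with open('query_embeddings.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    for q_text, vec in queries.items():
        writer.writerow([q_text] + list(vec))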
Example #7
        word_weights = 0
        topics = text2topics.word2vec_mean(text_query, word_weights, model,
                                           num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text_query, model, num_topics,
                                            tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text_query, model, num_topics)

    elif embedding == 'glove':
        word_weights = 0
        topics = text2topics.glove(text_query, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text_query, model, num_topics)

    # Create empty dict for distances
    distances = {}

    topics = topics - min(topics)
    topics = topics / max(topics)

    # Compute distances
    for id in database:
        distances[id] = np.dot(database[id], topics)

    # Get the elements with the highest similarity scores
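After the min-max rescaling, the dot product used here acts as a similarity score rather than a distance, so the best matches are the largest values; on L2-normalized vectors it coincides with cosine similarity. A two-vector illustration:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 0.5, 1.0])
# Dot product of unit vectors equals cosine similarity.
print(np.dot(a / np.linalg.norm(a), b / np.linalg.norm(b)))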
Example #8
    print(cur_q)
    cur_w = w[e]
    if test_dataset == 'webvision':
        results_path = "../../../datasets/WebVision/rr/" + data + "/" + cur_q.replace(' ', '_') + '__' + cur_w.replace(' ', '_') + '/'
    else:
        results_path = "../../../datasets/SocialMedia/retrieval_results/" + data + "/" + cur_q.replace(' ', '_') + '__' + cur_w.replace(' ', '_') + '/'
    if not os.path.exists(results_path):
        print("Creating dir: " + results_path)
        os.makedirs(results_path)


    if len(cur_q.split(' ')) == 1:

        if embedding == 'LDA':
            topics = text2topics.LDA(cur_q, model, num_topics)
        elif embedding == 'word2vec_mean':
            topics = text2topics.word2vec_mean(cur_q, cur_w, model, num_topics)
        elif embedding == 'doc2vec':
            topics = text2topics.doc2vec(cur_q, model, num_topics)
        elif embedding == 'word2vec_tfidf':
            topics = text2topics.word2vec_tfidf(cur_q, model, num_topics, tfidf_model, tfidf_dictionary)
        elif embedding == 'glove':
            topics = text2topics.glove(cur_q, cur_w, model, num_topics)
        elif embedding == 'glove_tfidf':
            topics = text2topics.glove_tfidf(cur_q, model, num_topics)


        get_results(database, topics, num_results, results_path)

    else:
        get_results_complex(database, cur_q, cur_w, num_results, results_path)
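The exists-check before os.makedirs can race when several processes create result directories concurrently; in Python 3 a single call covers both cases. The path below is illustrative.

import os

results_path = '../../../datasets/SocialMedia/retrieval_results/demo/snow_ski__1_1/'
os.makedirs(results_path, exist_ok=True)  # no error if the directory already exists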