示例#1
0
def search_SkipGram(model, query, id2corpus=ID2CORPUS,
                    result_len=MAX_NUMBER_OF_RESULTS):
    """
    Rank the documents in `id2corpus` against `query` by cosine similarity.

    Query tokens produced by `read_ap.process_text` that occur in
    `model.tok2idx` are mapped to their indices and embedded with
    `model.doc2vec`; every entry of `id2corpus` is embedded the same way.

    Args:
        model: object exposing `tok2idx` (token -> index) and `doc2vec`.
        query: raw query text.
        id2corpus: mapping of document id to the input `model.doc2vec` takes.
        result_len: maximum number of results to return.

    Returns:
        A list of `(doc_id, score)` tuples, best first, truncated to
        `result_len` entries.
    """
    # Keep only the query tokens the model knows about.
    query_repr = [model.tok2idx[q_tok]
                  for q_tok in read_ap.process_text(query)
                  if q_tok in model.tok2idx]
    q_vec = model.doc2vec(query_repr).unsqueeze(dim=0)
    # v @ v.T is the *squared* L2 norm of v.
    q_sq_norm = torch.mm(q_vec, q_vec.T)

    print('Comparing all document vectors to the query vector...')
    results = []
    for doc_id, doc in id2corpus.items():
        vec = model.doc2vec(doc).unsqueeze(dim=0)
        sq_norm = torch.mm(vec, vec.T)
        # Cosine similarity divides the dot product by the product of the
        # norms, i.e. the square root of the product of the squared norms.
        score = torch.mm(vec, q_vec.T) / torch.sqrt(sq_norm * q_sq_norm)
        results.append((doc_id, float(score)))

    results.sort(key=lambda item: -item[1])
    return results[:result_len]
示例#2
0
def similar_words(vocab_embs, word, n_of_similar_words, word2id, id2word):
    '''
    Find the vocabulary words whose embeddings are closest to that of `word`.

    The first token returned by ra.process_text(word) is looked up in
    word2id, and its row of vocab_embs is scored against every row of
    vocab_embs with cosine similarity.

    Returns:
        list of the n_of_similar_words highest-scoring words, best first.
    '''
    token = ra.process_text(word)[0]
    query_emb = vocab_embs[word2id[token], :].reshape(1, -1)

    # One cosine score per vocabulary row.
    score_matrix = sklearn.metrics.pairwise.cosine_similarity(
        query_emb, vocab_embs, dense_output=True)
    row_scores = list(score_matrix[0])

    # Indices of the best-scoring rows, mapped back to their words.
    top_ids = heapq.nlargest(n_of_similar_words, range(len(row_scores)),
                             row_scores.__getitem__)
    return [id2word[idx] for idx in top_ids]
示例#3
0
def rank_documents(model, model_name, type, query):
    """
    Rank every document against a query with an LSI or an LDA model.

    Relies on the module-level names `corpus`, `corpus_tfidf`, `dictionary`
    and `new_docs`. Document ids are the keys of `new_docs`, matched to the
    corpus by position.

    Args:
        model: trained model, indexed with bag-of-words vectors.
        model_name: "LSI" selects similarity ranking through a
            MatrixSimilarity index; any other value selects the LDA ranking
            by negative Kullback-Leibler divergence.
        type: for LSI only, "bow" or "tfidf" - which corpus gets indexed.
        query: raw query text.

    Returns:
        List of (doc_id, np.float64 score) tuples, best first. None when
        model_name is "LSI" and type is neither "bow" nor "tfidf".
    """
    processed_query = read_ap.process_text(query)
    print(processed_query)

    vec_bow = dictionary.doc2bow(processed_query)
    doc_ids = list(new_docs.keys())

    if model_name == "LSI":
        if type == "bow":
            lsi_corpus = model[corpus]
        elif type == "tfidf":
            # NOTE(review): the query below is projected from its raw
            # bag-of-words vector, not from a tf-idf weighted one; the
            # tf-idf model is not visible here - confirm this is intended.
            lsi_corpus = model[corpus_tfidf]
        else:
            return None
        # Index the corpus in LSI space and score the query against it.
        index = gensim.similarities.MatrixSimilarity(lsi_corpus)
        sims = index[model[vec_bow]]
        ranked = sorted(enumerate(sims), key=lambda item: -item[1])
    else:
        # Assumes model[...] returns a tuple whose first element is the
        # topic distribution - TODO confirm against how the model was built.
        vec_lda_query = model[vec_bow][0]
        # KL divergence is a distance (lower = closer), so the retrieval
        # score is its negation; higher scores then mean better matches.
        scores = [-kullback_leibler(vec_lda_query, model[text][0])
                  for text in corpus]
        ranked = sorted(enumerate(scores), key=lambda item: -item[1])

    # Store the scores with the associated doc ids for retrieval evaluation.
    return [(doc_ids[i], np.float64(s)) for i, s in ranked]
示例#4
0
def rank(model, docs, query_raw):
    """
    Rank all stored document vectors of `model` against a raw query.

    The query is tokenised with process_text, turned into a vector with
    model.infer_vector, and compared to every entry of model.docvecs.
    `docs` is accepted but not used.

    Returns whatever model.docvecs.most_similar returns for the full
    collection (topn equals the number of document vectors).
    """
    tokens = process_text(query_raw)
    query_vector = model.infer_vector(tokens)
    doc_vectors = model.docvecs
    return doc_vectors.most_similar([query_vector], topn=len(doc_vectors))
示例#5
0
def doc2vec_search(model, query):
    """
    Rank all document vectors of `model` against `query`.

    The tags returned by model.docvecs.most_similar are translated through
    the module-level `keys_tags_dict`, and scores are cast to np.float64.

    Returns:
        list of (keys_tags_dict[tag], np.float64 score) tuples in the order
        most_similar returned them.
    """
    tokens = read_ap.process_text(query)
    query_vec = model.infer_vector(tokens)
    neighbours = model.docvecs.most_similar([query_vec],
                                            topn=len(model.docvecs))
    ranked = []
    for tag, similarity in neighbours:
        ranked.append((keys_tags_dict[tag], np.float64(similarity)))
    return ranked
def rank_query_given_document(query_text, doc2vec_model):
    """
    Rank the model's document vectors for a query.

    The query text is tokenised with read_ap.process_text, converted to a
    vector with infer_vector, and compared against every entry of
    doc2vec_model.docvecs.

    Returns the result of docvecs.most_similar for the whole collection.
    """
    tokens = read_ap.process_text(query_text)
    inferred = doc2vec_model.infer_vector(tokens)
    doc_vectors = doc2vec_model.docvecs
    return doc_vectors.most_similar([inferred], topn=len(doc_vectors))
示例#7
0
    def match_query_against_docs(self, query, doc_ids, doc_embeddings):
        """
        Score a query against precomputed document embeddings.

        The query tokens are embedded with self.model.inference_on_words,
        aggregated with the method named by self.ARGS.aggr, and compared to
        `doc_embeddings` through calc_cosine_similarity.

        Returns:
            list of (doc_id, similarity) tuples, following the index order
            returned by calc_cosine_similarity.
        """
        tokens = read_ap.process_text(query)
        token_embs = self.model.inference_on_words(tokens)
        query_emb = aggregate_embeddings(token_embs, method=self.ARGS.aggr)

        scores, order = calc_cosine_similarity(query_emb, doc_embeddings)

        ranked = []
        for position in order:
            idx = position.item()
            ranked.append((doc_ids[idx], scores[idx].item()))
        return ranked
示例#8
0
def rank_documents(model, model_name, type, query):
    """
    Rank every document against a query with an LSI or an LDA model.

    Relies on the module-level names `corpus`, `corpus_tfidf`, `dictionary`
    and `new_docs`. Document ids are the keys of `new_docs`, matched to the
    corpus by position.

    Args:
        model: trained model. For the LDA path it must expose `num_topics`
            and `get_document_topics` (as gensim's LdaModel does).
        model_name: "LSI" selects similarity ranking through a
            MatrixSimilarity index; any other value selects the LDA ranking
            by negative Kullback-Leibler divergence.
        type: for LSI only, "bow" or "tfidf" - which corpus gets indexed.
        query: raw query text.

    Returns:
        List of (doc_id, np.float64 score) tuples, best first. None when
        model_name is "LSI" and type is neither "bow" nor "tfidf".
    """
    processed_query = read_ap.process_text(query)
    print(processed_query)

    vec_bow = dictionary.doc2bow(processed_query)
    # Build the id list once; rebuilding it per result is quadratic.
    doc_ids = list(new_docs.keys())

    if model_name == "LSI":
        if type == "bow":
            lsi_corpus = model[corpus]
            print(query.lower().split())
        elif type == "tfidf":
            # NOTE(review): the query below is projected from its raw
            # bag-of-words vector, not from a tf-idf weighted one; the
            # tf-idf model is not visible here - confirm this is intended.
            lsi_corpus = model[corpus_tfidf]
        else:
            return None
        # Index the corpus in LSI space and score the query against it.
        index = gensim.similarities.MatrixSimilarity(lsi_corpus)
        sims = index[model[vec_bow]]
        ranked = sorted(enumerate(sims), key=lambda item: -item[1])
    else:
        # Use the model that was passed in rather than training a new LDA
        # model for every query.
        num_topics = model.num_topics
        # minimum_probability=0 asks for (practically) every topic, which
        # keeps the dense vectors free of the zeros that make KL infinite.
        query_topics = gensim.matutils.sparse2full(
            model.get_document_topics(vec_bow, minimum_probability=0),
            num_topics)
        scores = []
        for doc_bow in corpus:
            doc_topics = gensim.matutils.sparse2full(
                model.get_document_topics(doc_bow, minimum_probability=0),
                num_topics)
            # KL is a distance, so its negation is the retrieval score.
            scores.append(
                -gensim.matutils.kullback_leibler(query_topics, doc_topics))
        ranked = sorted(enumerate(scores), key=lambda item: -item[1])

    # Store the scores with the associated doc ids for retrieval evaluation.
    return [(doc_ids[i], np.float64(s)) for i, s in ranked]
示例#9
0
def compute_metrics(docs, vocab_embs, word2id, id2word):
    """
    For a trained model, compute the MAP and NDCG based on a set of queries and
    all documents in the corpus.

    Document embeddings are built with create_doc_emb and cached in
    ./pickles/word2vec_doc_embs.pkl; an existing cache file is loaded
    instead of being rebuilt. For every query id outside range(76, 100) the
    TREC-style result rows are appended to ./results/word2vec_trec.csv.
    The averages over the evaluated queries are printed.

    Returns:
        metrics: a nested dict of queries and their MAP and NDCG scores.
    """
    cache_path = "./pickles/word2vec_doc_embs.pkl"

    # Create document embeddings (or reuse the cached ones)
    if not os.path.exists(cache_path):
        print("constructing document embeddings")
        doc_embs = {}
        for d in tqdm(list(docs.keys())):
            doc_embs[d] = create_doc_emb(vocab_embs, docs[d], word2id,
                                         id2word)

        with open(cache_path, "wb") as writer:
            pkl.dump(doc_embs, writer)
    else:
        with open(cache_path, "rb") as reader:
            doc_embs = pkl.load(reader)

    # Create query embedding and compare to every document embedding
    qrels, queries = ra.read_qrels()
    overall_ser = {}  # ranking per query
    for qid in tqdm(qrels):
        query = ra.process_text(queries[qid])
        query_emb = create_doc_emb(vocab_embs, query, word2id, id2word)
        ranking, trec_results = get_ranking(qid, query_emb, doc_embs,
                                            vocab_embs)
        overall_ser[qid] = ranking

        if int(qid) not in range(76, 100):
            with open("./results/word2vec_trec.csv", "a+") as f:
                f.write("\n".join("{},{},{},{},{},{}".format(
                    x[0], x[1], x[2], x[3], x[4], x[5]) for x in trec_results))
                f.write("\n")

    # Compute the MAP and NDCG per query
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # Average over the queries that were actually evaluated. Dividing by
    # len(queries) would deflate the mean whenever some queries have no
    # entry in `metrics`.
    average = {'map': 0, 'ndcg': 0}
    for q in metrics.values():
        average['map'] += q['map']
        average['ndcg'] += q['ndcg']
    n_evaluated = len(metrics)
    if n_evaluated:
        average['map'] = average['map'] / n_evaluated
        average['ndcg'] = average['ndcg'] / n_evaluated
    print(
        'average model evaluation scores over all queries {}'.format(average))

    return (metrics)
示例#10
0
    def search(self, query, max_docs=1000):
        """
        Return the best `max_docs` documents for `query`.

        The query is tokenised, converted to a bag-of-words vector with
        self.dictionary, projected through self.model and scored against
        self.index.

        Returns:
            list of (self.doc_index_map[position], score) tuples, highest
            score first, at most `max_docs` long.
        """
        tokens = read_ap.process_text(query)
        query_vec = self.model[self.dictionary.doc2bow(tokens)]
        ranked = sorted(enumerate(self.index[query_vec]),
                        key=lambda pair: -pair[1])
        top = ranked[:max_docs]
        return [(self.doc_index_map[position], sim.item())
                for position, sim in top]
示例#11
0
    def match_query_against_words(self, query):
        """
        Return the vocabulary tokens most similar to a query.

        The query tokens are embedded, aggregated into a single embedding
        (so a multi-word query can be compared against single words) and
        scored against self.model.w_embeddings.weight.

        Returns:
            list of the first self.ARGS.top_n tokens in the order given by
            calc_cosine_similarity.
        """
        tokens = read_ap.process_text(query)
        token_embs = self.model.inference_on_words(tokens)
        query_emb = aggregate_embeddings(token_embs, method=self.ARGS.aggr)

        word_matrix = self.model.w_embeddings.weight
        _, order = calc_cosine_similarity(query_emb, word_matrix)

        matches = []
        for position in order[:self.ARGS.top_n]:
            matches.append(self.model.vocab["id2token"][position.item()])
        return matches
示例#12
0
 def search(self, query):
     """
     Rank all documents by cosine similarity to the query.

     The query vector from self.get_doc_vec is repeated once per entry of
     self.docs and compared with self.doc_vecs along dim 0.

     Returns:
         list of (doc_id, float score) tuples, highest score first.
     """
     tokens = read_ap.process_text(query)
     query_vec = self.get_doc_vec(tokens)
     # Tile the query so it lines up column-wise with the document matrix.
     tiled = query_vec.unsqueeze(1).repeat(1, len(self.docs))
     similarity = nn.CosineSimilarity(dim=0, eps=1e-6)
     prod = similarity(tiled, self.doc_vecs)
     print('sorting results')
     scores = prod.numpy()
     order = (-scores).argsort()
     return [(self.idx2docid[position], float(scores[position]))
             for position in order]
示例#13
0
    def search(self, query):
        """
        Rank all documents against `query` in LSI space.

        The query is tokenised and converted to a bag-of-words vector with
        self.corpus.dictionary. With self.embedding == "bow" that vector is
        projected directly; with "tfidf" it is first weighted by
        self.corpus.tfidf_model.

        Returns:
            list of (doc_id, similarity) tuples, highest similarity first.

        Raises:
            ValueError: if self.embedding is neither "bow" nor "tfidf".
        """
        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)

        if self.embedding == "bow":
            lsi_query = self.model[vec_query]
        elif self.embedding == "tfidf":
            lsi_query = self.model[self.corpus.tfidf_model[vec_query]]
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError(
                "unsupported embedding {!r}; expected 'bow' or 'tfidf'".format(
                    self.embedding))

        sims = self.index[lsi_query]
        return sorted(zip(self.corpus.doc_ids, sims),
                      key=lambda item: -item[1])
示例#14
0
    def search(self, query):
        """
        Rank documents by negative KL divergence to the query.

        The query's topic weights from self.model are written into a dense
        vector of length self.model.num_topics (topics not returned by the
        model stay at zero). Each entry of self.docvecs is scored with
        -kl_divergence(doc_vector, query_vector).

        Returns:
            list of (doc, score) tuples, highest score first.
        """
        bow = self.dictionary.doc2bow(read_ap.process_text(query))
        query_topics = np.zeros(self.model.num_topics)
        for topic_id, weight in self.model[bow]:
            query_topics[topic_id] = weight

        scores = {
            doc: -kl_divergence(self.docvecs[doc], query_topics)
            for doc in self.docvecs
        }
        return sorted(scores.items(), key=lambda pair: -pair[1])
示例#15
0
def rank_docs(model, query, qid, run_id):
    """
    Rank all document vectors of a trained model against a query.

    The query vector is inferred with 200 epochs and compared with every
    entry of model.docvecs.

    Returns:
        sims: list of tuples (doc_id, score) as returned by most_similar
        trec_results: one tuple (qid, "", doc_id, rank, score, run_id) per
            entry of sims, with rank counting from 0
    """
    tokens = ra.process_text(query)
    query_vec = model.infer_vector(tokens, epochs=200)
    sims = model.docvecs.most_similar([query_vec], topn=len(model.docvecs))

    trec_results = []
    for position, hit in enumerate(sims):
        trec_results.append((qid, "", hit[0], position, hit[1], run_id))
    return sims, trec_results
示例#16
0
    def search(self, query):
        """
        Score documents for a query from the inverted index.

        For every query term present in self.ii, each posting (doc_id, tf)
        adds log(1 + tf) / self.df[term] to that document's score. Terms
        missing from the index are ignored.

        Returns:
            list of (doc_id, score) tuples, highest score first.
        """
        scores = defaultdict(float)
        for term in read_ap.process_text(query):
            if term in self.ii:
                for doc_id, tf in self.ii[term]:
                    scores[doc_id] += np.log(1 + tf) / self.df[term]

        return sorted(scores.items(), key=lambda pair: -pair[1])
示例#17
0
def search(config):
    """
    Load the saved Doc2Vec model and show the documents closest to a query.

    Reads config.model_file (model path), config.search (query text) and
    config.top_n (number of results). The results are passed to
    display_result and also returned.

    Raises:
        ValueError: if config.model_file does not exist.
    """
    if not os.path.exists(config.model_file):
        raise ValueError("no model available for search, try setting '-t' to true to train model first")

    model = gensim.models.doc2vec.Doc2Vec.load(config.model_file)

    tokens = read_ap.process_text(config.search)
    query_vec = model.infer_vector(tokens)
    hits = model.docvecs.most_similar([query_vec], topn=config.top_n)

    display_result(hits)
    return hits
示例#18
0
    def search(self, query):
        """
        Score documents for a query from the inverted index.

        Each posting (doc_id, tf) of a known query term contributes
        log(1 + tf) / self.df[term] to the document's score; query terms
        absent from self.ii contribute nothing.

        Returns:
            list of (doc_id, score) tuples sorted by descending score.
        """
        known_terms = [term for term in read_ap.process_text(query)
                       if term in self.ii]

        totals = defaultdict(float)
        for term in known_terms:
            for doc_id, tf in self.ii[term]:
                # Dividing by df (no log) is used here as the inverse
                # document frequency weighting.
                totals[doc_id] += np.log(1 + tf) / self.df[term]

        ranked = list(totals.items())
        ranked.sort(key=lambda pair: -pair[1])
        return ranked
示例#19
0
    def query(self, q):
        """
        Rank the indexed documents against the query text `q`.

        The text is tokenised, converted to a bag-of-words vector with
        self.dictionary, projected through self.model and scored against
        self.index.

        Returns:
            list of (position, similarity) tuples, highest similarity
            first, where position is the index into self.index's result.
        """
        tokens = read_ap.process_text(q)
        bow = self.dictionary.doc2bow(tokens)

        # Project the bag-of-words query into the model's space.
        projected = self.model[bow]

        return sorted(enumerate(self.index[projected]),
                      key=lambda pair: -pair[1])
示例#20
0
    def embed_query(self, word_to_vec, query, aggregation='mean'):
        """
        Build an embedding for a query from its word vectors.

        Query terms missing from `word_to_vec` are skipped. With
        aggregation == 'mean' the element-wise mean of the remaining
        vectors is returned; for any other value the list of vectors is
        returned unaggregated.
        """
        vectors = [word_to_vec[term]
                   for term in process_text(query)
                   if term in word_to_vec]

        if aggregation != 'mean':
            return vectors
        return np.mean(vectors, axis=0)
示例#21
0
 def find_similar_words(self, query, n=11):
     """
     Print the `n` words whose embeddings are closest to the query word.

     Embeddings are loaded from "word2vec_embedding.pkl" in the current
     working directory. Only the first token of process_text(query) is
     used. Every word in the file is scored, so the query word itself is
     part of the ranking.

     Args:
         query: text whose first processed token is looked up.
         n: number of words to print.

     Raises:
         KeyError: if the processed query word has no embedding.
     """
     # NOTE: pickle must only be used on trusted files.
     # Close the file deterministically instead of leaking the handle.
     with open("word2vec_embedding.pkl", "rb") as reader:
         word_to_vec = pkl.load(reader)

     query = process_text(query)[0]
     word_vec = word_to_vec[query]
     # The query norm does not change per word; compute it once.
     word_norm = np.linalg.norm(word_vec)

     similarities = []
     for word_key, other_vec in word_to_vec.items():
         cos_sim = np.dot(word_vec, other_vec) / (
             word_norm * np.linalg.norm(other_vec))
         similarities.append((word_key, cos_sim))

     ranked = sorted(similarities, reverse=True, key=lambda tup: tup[1])
     for matching_word, _ in ranked[:n]:
         print(matching_word)
示例#22
0
    def search(self, query):
        """
        Rank documents by negative KL divergence to the query in LDA space.

        The query is tokenised, converted to a bag-of-words vector with
        self.corpus.dictionary, mapped through self.model and densified to
        self.num_topics entries. Each (doc_id, representation) pair from
        self.corpus.doc_ids and self.lda_corpus_pers is then scored.

        Returns:
            list of (doc_id, score) tuples, highest score first, where
            score = -kullback_leibler(query, document).
        """
        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)
        lda_query = sparse2full(self.model[vec_query], self.num_topics)

        # KL divergence is a distance (lower = more similar). Negating it
        # makes the descending sort put the closest documents first rather
        # than the most divergent ones.
        results = {
            doc_id: -kullback_leibler(lda_query, lda_doc_repr)
            for doc_id, lda_doc_repr in zip(self.corpus.doc_ids,
                                            self.lda_corpus_pers)
        }
        return sorted(results.items(), key=lambda item: item[1],
                      reverse=True)
示例#23
0
def query_similarity(query, dictionary, model, index, doc_ids):
    """
    Score every indexed document against a query.

    The query is tokenised, converted to a bag-of-words vector, projected
    through `model` and looked up in `index`; scores are paired with
    `doc_ids` by position.

    Returns:
        dict mapping doc_id to score, inserted in descending score order.
    """
    tokens = ra.process_text(query)
    projected = model[dictionary.doc2bow(tokens)]

    by_doc = {}
    for position, raw_score in enumerate(index[projected]):
        value = raw_score.item()
        by_doc[doc_ids[position]] = value

    ordered = sorted(by_doc.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ordered)
示例#24
0
    def rank(self, query, first_query=True):
        """
        Rank all documents against a query in LSI space.

        When `first_query` is true, a Similarity index is built over the
        tf-idf or bag-of-words corpus (depending on self.tfidf) and saved
        as 'lsi_index_rank.index' under self.model_path; otherwise that
        saved index is loaded.

        Returns:
            list of (doc_id, np.float64 score) tuples, highest score first.
        """
        tokens = read_ap.process_text(query)
        query_vec = self.index.doc2bow(tokens)
        if self.tfidf:
            query_vec = bow2tfidf(query_vec, self.index)
        query_lsi = self.model[query_vec]  # query in LSI space

        index_path = os.path.join(self.model_path, 'lsi_index_rank.index')
        if not first_query:
            sim_index = similarities.Similarity.load(index_path)
        else:
            source_corpus = self.corpus_tfidf if self.tfidf else self.corpus_bow
            shard_prefix = os.path.join(self.model_path, "shard")
            # Transform the corpus to LSI space and index it.
            sim_index = similarities.Similarity(shard_prefix,
                                                self.model[source_corpus],
                                                self.num_topics)
            sim_index.save(index_path)

        ordered = sorted(enumerate(sim_index[query_lsi]),
                         key=lambda pair: -pair[1])
        return [(self.index2docid[position], np.float64(score))
                for position, score in ordered]
示例#25
0
文件: lda.py 项目: Sasafrass/IR1
def ranking_LDA(query, model, model_docs, num_topics=10):
    """
    Rank documents by negative KL divergence to the query.

    Uses the module-level `dictionary`, `corpus` and `i2str`. The query is
    tokenised, converted to a bag-of-words vector, mapped through `model`
    and densified to `num_topics` entries. One score is produced per index
    in range(len(corpus)), using model_docs[i] as the document
    representation and i2str[i] as its id.

    Returns:
        list of (doc_id, float score) tuples, highest score first.
    """
    tokens = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(tokens)
    query_dense = gensim.matutils.sparse2full(model[query_bow], num_topics)

    ranked = []
    for position in range(len(corpus)):
        divergence = kullback_leibler(query_dense, model_docs[position])
        ranked.append((i2str[position], float(-1 * divergence)))

    # Order by the score (second tuple element), best first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
示例#26
0
def create_doc_emb(matrix, doc, word2id, id2word):
    '''
    Takes a list of words, converts these words to id's, looks up the word
    embedding for each word and averages these embeddings to get the
    document representation.

    Each word is passed through ra.process_text; it is used only when that
    yields exactly one token and the token is present in word2id.
    id2word is accepted but not used.

    Returns:
        doc_emb: array [1, emb_dim]; all zeros when none of the words has
        an embedding.
    '''
    embeddings = []
    for word in doc:
        tokens = ra.process_text(word)
        if len(tokens) != 1:
            continue
        word_id = word2id.get(tokens[0])
        if word_id is not None:
            embeddings.append(matrix[word_id, :].reshape(1, -1))

    if not embeddings:
        # Averaging an empty list yields NaN (with a warning); a zero
        # vector keeps downstream similarity computations well defined.
        return np.zeros((1, matrix.shape[1]))

    return np.asarray(embeddings).mean(axis=0)
示例#27
0
def search_doc2vec(model, query, docs_by_id=None,
                   result_len=MAX_NUMBER_OF_RESULTS):
    """
    Rank documents by cosine similarity between inferred doc2vec vectors.

    A vector is inferred for the query and for every document in
    `docs_by_id`, and each pair is compared with torch's CosineSimilarity.

    Args:
        model: trained doc2vec model.
        query: raw query text.
        docs_by_id: mapping of doc id to the token list passed to
            infer_vector; loaded with read_ap.get_processed_docs() when
            None.
        result_len: maximum number of results to return.

    Returns:
        List of (doc_id, similarity) tuples, most similar first, at most
        `result_len` long.
    """
    if docs_by_id is None:
        docs_by_id = read_ap.get_processed_docs()

    # Deleting training data is advised by the official gensim website.
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

    print("Comparing the query embedding with all document embeddings...")
    # Get cosine similarity for the query compared to the documents.
    q_vec = model.infer_vector(list(read_ap.process_text(query)))
    q_vec = torch.FloatTensor(q_vec).unsqueeze(dim=0)
    cos = torch.nn.CosineSimilarity()
    results = {}
    for doc_id, doc in docs_by_id.items():
        vec = torch.FloatTensor(model.infer_vector(doc)).unsqueeze(dim=0)
        results[doc_id] = float(cos(vec, q_vec))

    # Higher cosine similarity means a better match, so sort descending;
    # an ascending sort would return the least similar documents.
    ranked = sorted(results.items(), key=lambda item: item[1], reverse=True)
    return ranked[:result_len]
示例#28
0
def get_sims(model, query, corpus_full, dictionary, n_topics):
    '''
    Get the ranking for a single query by negative KL divergence.

    The query is tokenised, converted to a bag-of-words vector, mapped
    through `model` and densified to `n_topics` entries. A small epsilon is
    added to the query and to each document vector to avoid division by
    zero. `corpus_full` is left unmodified.

    Returns:
        list of (position, score) tuples, highest score first, where
        position is the index of the document in `corpus_full` and
        score = -kullback_leibler(query, document).
    '''
    # avoid division by 0
    eps = 1e-8

    # process query
    query_processed = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(query_processed)
    q_lda = sparse2full(model[query_bow], n_topics) + eps

    # `doc + eps` creates a new array; an in-place `doc += eps` would
    # modify the caller's corpus and pile up one more eps on every call.
    sims = [-1 * kullback_leibler(q_lda, doc + eps) for doc in corpus_full]

    return sorted(enumerate(sims), key=lambda item: -1 * item[1])
示例#29
0
def evaluate(config, qrels, queries):
    """
    Rank all documents for every query in `qrels` and append the results
    to a file.

    Loads the Doc2Vec model from config.model_file, infers a vector for
    each query text and ranks every document vector against it. One
    tab-separated line per (query, document) pair is appended to
    config.write_file.

    Args:
        config: object with `model_file` and `write_file` attributes.
        qrels: iterable of query ids to evaluate.
        queries: mapping of query id to query text.

    Raises:
        ValueError: if config.model_file does not exist.
    """
    if not os.path.exists(config.model_file):
        raise ValueError("no model available for search, try setting '-t' to true to train model first")

    model = gensim.models.doc2vec.Doc2Vec.load(config.model_file)
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    # Rank the whole collection, whatever its size, instead of relying on
    # a hard-coded document count.
    n_docs = len(model.docvecs)

    print("Running TFIDF Benchmark")
    for qid in tqdm(qrels):
        query_text = queries[qid]

        vector = model.infer_vector(read_ap.process_text(query_text))
        results = model.docvecs.most_similar([vector], topn=n_docs)
        to_write = [str(qid) + '\tQO\t' + doc_id + '\t0\t' + str(score) + '\tSTANDARD\n'
                    for doc_id, score in results]

        with smart_open.open(config.write_file, 'a') as f:
            f.writelines(to_write)
示例#30
0
def rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=None, index=None):
    """
    Rank documents against a query with an LSI or an LDA model.

    Args:
        query: raw query text.
        model: an LsiModel or LdaModel instance.
        doc_ids: document ids, positionally aligned with the index (LSI)
            or with `corpus_modelspace` (LDA).
        dictionary: used to convert the query to a bag-of-words vector.
        corpus_modelspace: document representations in model space; only
            read on the LDA path.
        tfidf_model: when given, the query is tf-idf weighted before being
            mapped through `model`.
        index: similarity index queried on the LSI path.

    Returns:
        LSI: list of (doc_id, score) tuples, highest score first.
        LDA: dict of doc_id -> negative KL divergence, inserted best first.
        NOTE(review): the two paths return different container types;
        kept as-is so existing callers keep working.

    Raises:
        TypeError: if `model` is neither an LsiModel nor an LdaModel.
    """
    query_prepro = read_ap.process_text(query)

    # transform query to bow vector space
    q_cspace = dictionary.doc2bow(query_prepro)

    if tfidf_model is not None:
        # transform query to tfidf vector space
        q_cspace = tfidf_model[q_cspace]

    q_modelspace = model[q_cspace]

    if isinstance(model, LsiModel):
        ## LSI
        scores = index[q_modelspace]

        results = list(dict(zip(doc_ids, scores)).items())
        results.sort(key=lambda _: -_[1])

    elif isinstance(model, LdaModel):
        ## LDA
        doc_ids = list(doc_ids)
        # have to use the for loop, otherwise kullback_leibler has problems
        scores = []
        for d in corpus_modelspace:
            scores.append(float(-kullback_leibler(q_modelspace, d)))

        # have to use torch here to do this more efficiently
        order = torch.Tensor(scores).argsort(descending=True).numpy()
        results = dict((doc_ids[i], scores[i]) for i in order)

    else:
        # Fail clearly instead of hitting an unbound `results` below.
        raise TypeError(
            "unsupported model type: {}".format(type(model).__name__))

    return results