Example #1
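generate_document_vectors builds sparse token and entity vectors (plain and tf-idf weighted) for the top web search results of a question, at the whole-document, snippet, and best-fragment level, and caches the result per question.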
def generate_document_vectors(question_text, question_tokens2pos, questions_search_results):
    if not _documents_vectors_cache:
        import os
        cache_file = globals.config.get('WebSearchFeatures', 'document-vectors')
        if os.path.isfile(cache_file):
            logger.info("Reading cached document vectors...")
            with open(cache_file, 'rb') as inp:
                # Unpickle until the end of file is reached
                index = 0
                while True:
                    try:
                        question, vectors = pickle.load(inp)
                        _documents_vectors_cache[question] = vectors
                    except (EOFError, pickle.UnpicklingError):
                        break
                    index += 1
                    if index % 1000 == 0:
                        logger.info("Read " + str(index) + " document vectors...")
            logger.info("Reading cached document vectors done!")

    if question_text in _documents_vectors_cache:
        return _documents_vectors_cache[question_text]

    documents_vectors = []
    snippets_vectors = []
    fragment_vectors = []
    combined_doc_token2pos = dict()
    combined_doc_entity2pos = dict()
    combined_doc_snippet_token2pos = dict()
    combined_doc_snippet_entity2pos = dict()
    for document in questions_search_results[question_text][:globals.SEARCH_RESULTS_TOPN]:
        # Whole document
        doc_entity2pos = document.get_mentioned_entities_to_pos()
        doc_token2pos, doc_lemma2pos = document.get_token_to_positions_map()
        documents_vectors.append({
            "entity": SparseVector.from_2pos(doc_entity2pos),
            "entity_tfidf": SparseVector.from_2pos(doc_entity2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_entity_elements),
            "token": SparseVector.from_2pos(doc_token2pos),
            "token_tfidf": SparseVector.from_2pos(doc_token2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements),
        })
        merge_2pos_dicts(combined_doc_token2pos, doc_token2pos)
        merge_2pos_dicts(combined_doc_entity2pos, doc_entity2pos)

        # Snippet
        doc_snippet_entity2pos = document.get_snippet_entities_to_pos()
        doc_snippet_token2pos, doc_snippet_lemma2pos = document.get_snippet_token2pos()
        snippets_vectors.append({
            "entity": SparseVector.from_2pos(doc_snippet_entity2pos),
            "entity_tfidf": SparseVector.from_2pos(doc_snippet_entity2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_entity_elements),
            "token": SparseVector.from_2pos(doc_snippet_token2pos),
            "token_tfidf": SparseVector.from_2pos(doc_snippet_token2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements),
        })
        merge_2pos_dicts(combined_doc_snippet_token2pos, doc_snippet_token2pos)
        merge_2pos_dicts(combined_doc_snippet_entity2pos, doc_snippet_entity2pos)

        # Best fragment
        fragment_token2pos, fragment_entity2pos = \
            WebSearchResult.get_best_fragment_positions(doc_token2pos, doc_entity2pos, question_tokens2pos)
        fragment_vectors.append({
            "entity": SparseVector.from_2pos(fragment_entity2pos),
            "entity_tfidf": SparseVector.from_2pos(fragment_entity2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_entity_elements),
            "token": SparseVector.from_2pos(fragment_token2pos),
            "token_tfidf": SparseVector.from_2pos(fragment_token2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements),
        })
    combined_documents_vector = {
        "entity": SparseVector.from_2pos(combined_doc_entity2pos),
        "entity_tfidf": SparseVector.from_2pos(combined_doc_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        "token": SparseVector.from_2pos(combined_doc_token2pos),
        "token_tfidf": SparseVector.from_2pos(combined_doc_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
    }
    combined_document_snippets_vector = {
        "entity": SparseVector.from_2pos(combined_doc_snippet_entity2pos),
        "entity_tfidf": SparseVector.from_2pos(combined_doc_snippet_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        "token": SparseVector.from_2pos(combined_doc_snippet_token2pos),
        "token_tfidf": SparseVector.from_2pos(combined_doc_snippet_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
    }

    # Cache the computed vectors.
    _documents_vectors_cache[question_text] = (documents_vectors, snippets_vectors, fragment_vectors,
                                               combined_documents_vector, combined_document_snippets_vector)

    return documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector, combined_document_snippets_vector
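The merge_2pos_dicts helper used above to accumulate the combined document and snippet maps is not defined in this example. A minimal sketch, assuming it simply folds one {key: [positions]} map into another by extending the position lists (the actual implementation may differ):

def merge_2pos_dicts(target_2pos, source_2pos):
    # Hypothetical sketch: append every position recorded in the source map to
    # the corresponding key of the target map, creating missing keys as needed.
    for key, positions in source_2pos.iteritems():
        target_2pos.setdefault(key, []).extend(positions)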
Example #2
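generate_text_based_features compares a candidate answer's entities, tokens, and entity descriptions against the question text and its web search results, and emits the resulting similarity scores as named features.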
def generate_text_based_features(candidate):
    # Get candidate answers
    answers = map(unicode.lower, candidate.get_results_text())
    # Skip candidates without any answers.
    if len(answers) == 0:
        return dict()
    # Get answers descriptions.
    answers_descriptions = ['\n'.join(KBEntity.get_entity_descriptions_by_name(answer, keep_most_triples_only=True))
                            for answer in answers]

    # Get question text.
    question_text = candidate.query.original_query
    question_tokens2pos = dict((token, [1, ]) for token in tokenize(question_text))
    question_token_tfidf = SparseVector.from_2pos(question_tokens2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get question entities
    question_entities2pos = dict((entity.entity.name.lower(), [1, ]) for entity in candidate.matched_entities)
    question_entitytoken2pos = dict((token, [1, ])
                                    for entity in candidate.matched_entities
                                    for token in tokenize(entity.entity.name))
    question_entity_tfidf = SparseVector.from_2pos(question_entitytoken2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get search results; whether they cover this question is checked below.
    questions_search_results = get_questions_serps()

    documents_vectors = []
    snippets_vectors = []
    fragment_vectors = []
    combined_documents_vector = dict()
    combined_document_snippets_vector = dict()

    representations = ["entity_tfidf",
                       "token_tfidf",
                       # "entity",
                       # "token",
                      ]
    for r in representations:
        # Fall back to empty sparse vectors when no search results are available.
        combined_documents_vector[r] = SparseVector(dict())
        combined_document_snippets_vector[r] = SparseVector(dict())

    if question_text not in questions_search_results:
        logger.warning("No search results found for the question %s" % question_text)
    else:
        documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector,\
            combined_document_snippets_vector = generate_document_vectors(question_text,
                                                                          question_tokens2pos,
                                                                          questions_search_results)

    answer_entity2pos = dict((answer_entity, [1, ]) for answer_entity in answers)
    answer_token2pos = dict((answer_token, [1, ]) for answer_entity in answers
                            for answer_token in tokenize(answer_entity))
    answers_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector.from_2pos(answer_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        # "entity": SparseVector.from_2pos(answer_entity2pos),
        # "token": SparseVector.from_2pos(answer_token2pos),
    }

    answer_descriptions_token2pos = dict((token, [1, ]) for description in answers_descriptions
                                         for token in tokenize(description))
    answer_description_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_descriptions_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector(dict()),
        # Keeping only tf-idf similarities. This seems to be enough.
        # "token": SparseVector.from_2pos(answer_descriptions_token2pos),
        # "entity": SparseVector(dict()),
    }

    similarity_functions = [
        ("cosine", Similarity.cosine_similarity),
        # ("itersection", Similarity.intersection_similarity),
        # ("normalized_intersection", Similarity.normalized_intersection_similarity),
        # ("bm25", Similarity.bm25_similarity),
    ]
    features = dict()

    for similarity_name, similarity in similarity_functions:
        # Computing document-answer similarities for each representation.
        document_answer_similarities = {}
        for representation in representations:
            if representation not in document_answer_similarities:
                document_answer_similarities[representation] = []
            for doc_vector in documents_vectors:
                document_answer_similarities[representation].append(similarity(representation,
                                                                               doc_vector[representation],
                                                                               answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_document_answer_%s_%s" % (representation, similarity_name):
                    avg(document_answer_similarities[representation]),
                "text_features:max_document_answer_%s_%s" % (representation, similarity_name):
                    max(document_answer_similarities[representation]) if document_answer_similarities[representation]
                    else 0.0,
            })

        # logger.info("Snippet-answer similarity...")
        # Computing snippet-answer similarities for each representation.
        snippet_answer_similarities = {}
        for representation in representations:
            if representation not in snippet_answer_similarities:
                snippet_answer_similarities[representation] = []

            for snippet_vector in snippets_vectors:
                snippet_answer_similarities[representation].append(similarity(representation,
                                                                              snippet_vector[representation],
                                                                              answers_vectors[representation]))

        for representation in representations:
            features.update({
                "text_features:avg_snippet_answer_%s_%s" % (representation, similarity_name):
                    avg(snippet_answer_similarities[representation]),
                "text_features:max_snippet_answer_%s_%s" % (representation, similarity_name):
                    max(snippet_answer_similarities[representation]) if snippet_answer_similarities[representation] else 0.0,
            })

        # logger.info("Fragment-answer similarity...")
        # Best BM25 fragment-answer similarities.
        # These features weren't very effective, so they were removed. There is a chance the feature computation contains a bug.

        # fragment_answer_similarities = {}
        # for fragment_vector in fragment_vectors:
        #     for representation in representations:
        #         if representation not in fragment_answer_similarities:
        #             fragment_answer_similarities[representation] = []
        #         fragment_answer_similarities[representation].append(similarity(representation,
        #                                                                        fragment_vector[representation],
        #                                                                        answers_vectors[representation]))
        #
        # for representation in representations:
        #     features.update({
        #         "text_features:avg_fragment_answer_%s_%s" % (representation, similarity_name):
        #             avg(fragment_answer_similarities[representation]),
        #         "text_features:max_fragment_answer_%s_%s" % (representation, similarity_name):
        #             max(fragment_answer_similarities[representation]) if fragment_answer_similarities[representation] else 0.0,
        #     })

        # logger.info("Combined document-answer similarity...")
        # Combined documents answer similarity
        for representation in representations:
            combineddoc_answer_similarity = similarity(representation,
                                                       combined_documents_vector[representation],
                                                       answers_vectors[representation])
            features.update({
                "text_features:combdocument_answer_%s_%s" % (representation, similarity_name):
                    combineddoc_answer_similarity,
            })

        # logger.info("Combined snippet-answer similarity...")
        for representation in representations:
            combineddocsnippet_answer_similarity = similarity(representation,
                                                              combined_document_snippets_vector[representation],
                                                              answers_vectors[representation])
            features.update({
                "text_features:combdocument_snippet_answer_%s_%s" % (representation, similarity_name):
                    combineddocsnippet_answer_similarity,
            })

        # logger.info("Description-question similarity...")
        # These features aren't very effective either and are the next candidates for removal.
        description_question_entity_similarity = similarity("token_tfidf", question_entity_tfidf,
                                                            answer_description_vectors["token_tfidf"])
        description_question_token_similarity = similarity("token_tfidf", question_token_tfidf,
                                                           answer_description_vectors["token_tfidf"])
        features.update({
            "text_features:description_question_entitytoken_%s" % similarity_name:
                description_question_entity_similarity,
            "text_features:description_question_token_%s" % similarity_name: description_question_token_similarity,
        })

    # Description - question embedding similarity.
    # NOTE: both features below currently call embedding_avg_idf_similarity, so the
    # "n_embeddings" feature duplicates the "avg_idf_embeddings" one; a distinct
    # n-similarity variant is presumably intended for the second call.
    description_question_token_embedding_avg_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    description_question_token_embedding_n_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    features.update({
        "text_features:description_question_token_avg_idf_embeddings":
            description_question_token_embedding_avg_similarity,
        "text_features:description_question_token_n_embeddings":
            description_question_token_embedding_n_similarity,
    })

    # Remove features with 0 score.
    features = dict((feature, value) for feature, value in features.iteritems() if value != 0.0)
    return features
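For reference, here is a minimal sketch of a cosine similarity over two sparse {id: weight} maps, in the spirit of the Similarity.cosine_similarity function referenced above; the real implementation (how it uses the representation argument, handles SparseVector objects, or normalizes weights) may differ, and the helper name below is hypothetical.

import math

def sparse_cosine_similarity(vector_a, vector_b):
    # Hypothetical sketch: dot product over shared keys divided by the product
    # of the two vectors' L2 norms; returns 0.0 for empty or zero vectors.
    if not vector_a or not vector_b:
        return 0.0
    dot = sum(weight * vector_b.get(key, 0.0) for key, weight in vector_a.iteritems())
    norm_a = math.sqrt(sum(w * w for w in vector_a.itervalues()))
    norm_b = math.sqrt(sum(w * w for w in vector_b.itervalues()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

Applied to tf-idf weighted representations with non-negative weights, such a score lies in [0, 1], which is consistent with the 0.0 default used above when a similarity list is empty.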