Example #1
def main_entity_link_text():
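    """Interactive demo: read text from stdin, link entities in it and,
    if search results are available for the text, aggregate the entities
    found in the top-10 result snippets."""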
    globals.read_configuration('config.cfg')
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    globals.logger.setLevel("DEBUG")
    import operator
    while True:
        print "Please enter some text: "
        text = sys.stdin.readline().strip().decode('utf-8')
        tokens = parser.parse(text).tokens
        print "Entities:", entity_linker.identify_entities_in_document(tokens, max_token_window=5)
        entities = {}
        tokens = {}

        if text in question_search_results:
            for doc in question_search_results[text][:10]:
                print doc
                title = doc.title
                snippet = doc.snippet
                snippet_tokens = parser.parse(title + "\n" + snippet).tokens
                for token in snippet_tokens:
                    if token.lemma not in tokens:
                        tokens[token.lemma] = 0
                    tokens[token.lemma] += 1
                for entity in entity_linker.identify_entities_in_document(snippet_tokens):
                    if entity['mid'] not in entities:
                        entities[entity['mid']] = entity
                    else:
                        entities[entity['mid']]['count'] += entity['count']
        print sorted(entities.values(), key=operator.itemgetter('count'), reverse=True)[:50]
Example #2
def create_document_vectors_cache(questions):
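    """Generate document vectors for every question and pickle the cached
    (question, vectors) tuples into the configured cache file."""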
    cache_file = globals.config.get('WebSearchFeatures', 'document-vectors')
    logger.info("Caching document vectors...")
    with open(cache_file, 'wx') as out:
        for index, question in enumerate(questions):
            question_token2pos = dict((token, [1, ]) for token in tokenize(question))
            generate_document_vectors(question, question_token2pos, get_questions_serps())
            pickle.dump((question, _documents_vectors_cache[question]), out)
            if index % 100 == 0:
                logger.info("Cached document vectors for %d questions" % index)
Example #3
def main_parse():
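    """Parse the content of the top-10 documents of every SERP with CoreNLP
    and pickle (url, parsed content) tuples into the configured file."""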
    document_content_file = globals.config.get('WebSearchFeatures', 'documents-content-file')
    parser = CoreNLPParser.init_from_config()
    question_serps = get_questions_serps()
    print datetime.now()
    with open(document_content_file, 'wx') as out_file:
        index = 0
        for serp in question_serps.itervalues():
            for doc in serp[:10]:
                content = doc.content()
                if len(content) > 0:
                    document = (doc.url, parser.parse(content))
                    pickle.dump(document, out_file)
            print "Query #", index, datetime.now()
            index += 1
Example #4
def main_doc_entities_from_content():
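    """Link entities in the parsed content of top-ranked documents and pickle
    the resulting url -> entities dictionary into the configured file."""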
    entity_linker = globals.get_entity_linker()
    document_entities_file = globals.config.get('WebSearchFeatures', 'documents-entities-file')
    doc_entities = dict()
    from text2kb.utils import get_documents_content_dict
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    documents_content = get_documents_content_dict(return_parsed_tokens=True)
    index = 0
    for serp in question_search_results.itervalues():
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            if doc.url in documents_content:
                doc_entities[doc.url] = entity_linker.identify_entities_in_document(documents_content[doc.url],
                                                                                    min_surface_score=0.5)
        index += 1
        if index % 100 == 0:
            logger.info("%d SERPs processed" % index)
    with open(document_entities_file, 'wx') as out:
        pickle.dump(doc_entities, out)
Example #5
def entity_link_snippets():
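    """Link entities in the snippets of top-ranked documents, attach the
    matched token spans to each entity and pickle the url -> entities
    dictionary into the configured file."""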
    entity_linker = globals.get_entity_linker()
    snippet_entities_file = globals.config.get('WebSearchFeatures', 'document-snippet-entities')
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    doc_snippet_entities = dict()
    for index, serp in enumerate(question_search_results.itervalues()):
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            snippet_tokens = doc.get_snippet_tokens()
            entities = entity_linker.identify_entities_in_document(snippet_tokens)
            for entity in entities:
                entity['matches'] = []
                for position in entity['positions']:
                    entity['matches'].append(snippet_tokens[position[0]:position[1]])
            doc_snippet_entities[doc.url] = entities
        if index % 100 == 0:
            logger.info("Processed %d serps" % index)
    logger.info("Pickling the dictionary...")
    with open(snippet_entities_file, 'wx') as out:
        pickle.dump(doc_snippet_entities, out)
    logger.info("Pickling the dictionary DONE!")
Example #6
def init_from_config():
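    """Build a WebSearchResultsExtenderEntityLinker from the global config,
    optionally loading cached search results and snippet entities."""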
    config_options = globals.config
    surface_index = EntitySurfaceIndexMemory.init_from_config()
    max_entities_p_token = int(config_options.get('EntityLinker',
                                                  'max-entites-per-tokens'))
    use_web_results = config_options.get('EntityLinker',
                                         'use-web-results') == "True"
    topn_entities = int(config_options.get('EntityLinker',
                                           'topn-external-entities'))
    question_search_results = dict()
    doc_snippets_entities = dict()
    if use_web_results:
        from text2kb.utils import get_documents_snippet_entities
        from text2kb.utils import get_questions_serps
        question_search_results = get_questions_serps()
        doc_snippets_entities = get_documents_snippet_entities()
    return WebSearchResultsExtenderEntityLinker(surface_index,
                                                max_entities_p_token,
                                                use_web_results,
                                                topn_entities=topn_entities,
                                                search_results=question_search_results,
                                                doc_snippets_entities=doc_snippets_entities)
Example #7
def main_entities():
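    """Interactive demo: read a question from stdin and print the 50 most
    frequent entities aggregated over the top-10 documents of its SERP."""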
    globals.read_configuration('config.cfg')
    from text2kb.utils import get_questions_serps
    from text2kb.utils import get_documents_entities
    serps = get_questions_serps()
    doc_entities = get_documents_entities()
    import operator
    while True:
        print "Please enter a question:"
        question = sys.stdin.readline().strip()
        if question in serps:
            docs = serps[question][:10]
            entities = {}
            for doc in docs:
                for entity in doc_entities[doc.url].itervalues():
                    e = (entity['mid'], entity['name'])
                    if e not in entities:
                        entities[e] = 0
                    entities[e] += entity['count']
            top_entities = entities.items()
            top_entities.sort(key=operator.itemgetter(1), reverse=True)
            print top_entities[:50]
Example #8
def generate_text_based_features(candidate):
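    """Compute text-similarity features between the candidate's answers and
    the question's web search results (documents, snippets and their combined
    vectors), as well as between the question and the answers' descriptions,
    using tf-idf token and entity representations."""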
    # Get candidate answers
    answers = map(unicode.lower, candidate.get_results_text())
    # Skip candidates with empty answer lists.
    if len(answers) == 0:
        return dict()
    # Get answers descriptions.
    answers_descriptions = ['\n'.join(KBEntity.get_entity_descriptions_by_name(answer, keep_most_triples_only=True))
                            for answer in answers]

    # Get question text.
    question_text = candidate.query.original_query
    question_tokens2pos = dict((token, [1, ]) for token in tokenize(question_text))
    question_token_tfidf = SparseVector.from_2pos(question_tokens2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get question entities
    question_entities2pos = dict((entity.entity.name.lower(), [1, ]) for entity in candidate.matched_entities)
    question_entitytoken2pos = dict((token, [1, ])
                                    for entity in candidate.matched_entities
                                    for token in tokenize(entity.entity.name))
    question_entity_tfidf = SparseVector.from_2pos(question_entitytoken2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get search results and check that they aren't empty
    questions_search_results = get_questions_serps()

    documents_vectors = []
    snippets_vectors = []
    fragment_vectors = []
    combined_documents_vector = dict()
    combined_document_snippets_vector = dict()

    representations = ["entity_tfidf",
                       "token_tfidf",
                       # "entity",
                       # "token",
                      ]
    for r in representations:
        combined_documents_vector[r] = dict()
        combined_document_snippets_vector[r] = dict()

    if question_text not in questions_search_results:
        logger.warning("No search results found for the question %s" % question_text)
    else:
        documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector,\
            combined_document_snippets_vector = generate_document_vectors(question_text,
                                                                          question_tokens2pos,
                                                                          questions_search_results)

    answer_entity2pos = dict((answer_entity, [1, ]) for answer_entity in answers)
    answer_token2pos = dict((answer_token, [1, ]) for answer_entity in answers
                            for answer_token in tokenize(answer_entity))
    answers_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector.from_2pos(answer_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        # "entity": SparseVector.from_2pos(answer_entity2pos),
        # "token": SparseVector.from_2pos(answer_token2pos),
    }

    answer_descriptions_token2pos = dict((token, [1, ]) for description in answers_descriptions
                                         for token in tokenize(description))
    answer_description_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_descriptions_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector(dict()),
        # Keeping only tf-idf similarities. This seems to be enough.
        # "token": SparseVector.from_2pos(answer_descriptions_token2pos),
        # "entity": SparseVector(dict()),
    }

    similarity_functions = [
        ("cosine", Similarity.cosine_similarity),
        # ("itersection", Similarity.intersection_similarity),
        # ("normalized_intersection", Similarity.normalized_intersection_similarity),
        # ("bm25", Similarity.bm25_similarity),
    ]
    features = dict()

    for similarity_name, similarity in similarity_functions:
        # Computing document-answer similarities for each representation.
        document_answer_similarities = {}
        for representation in representations:
            if representation not in document_answer_similarities:
                document_answer_similarities[representation] = []
            for doc_vector in documents_vectors:
                document_answer_similarities[representation].append(similarity(representation,
                                                                               doc_vector[representation],
                                                                               answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_document_answer_%s_%s" % (representation, similarity_name):
                    avg(document_answer_similarities[representation]),
                "text_features:max_document_answer_%s_%s" % (representation, similarity_name):
                    max(document_answer_similarities[representation]) if document_answer_similarities[representation]
                    else 0.0,
            })

        # logger.info("Snippet-answer similarity...")
        # Computing snippet-answer similarities for each representation.
        snippet_answer_similarities = {}
        for representation in representations:
            if representation not in snippet_answer_similarities:
                snippet_answer_similarities[representation] = []

            for snippet_vector in snippets_vectors:
                snippet_answer_similarities[representation].append(similarity(representation,
                                                                              snippet_vector[representation],
                                                                              answers_vectors[representation]))

        for representation in representations:
            features.update({
                "text_features:avg_snippet_answer_%s_%s" % (representation, similarity_name):
                    avg(snippet_answer_similarities[representation]),
                "text_features:max_snippet_answer_%s_%s" % (representation, similarity_name):
                    max(snippet_answer_similarities[representation]) if snippet_answer_similarities[representation] else 0.0,
            })

        # logger.info("Fragment-answer similarity...")
        # Best BM25 fragment-answer similarities.
        # These features weren't very effective, so they were removed. There is also a chance that they contain a bug.

        # fragment_answer_similarities = {}
        # for fragment_vector in fragment_vectors:
        #     for representation in representations:
        #         if representation not in fragment_answer_similarities:
        #             fragment_answer_similarities[representation] = []
        #         fragment_answer_similarities[representation].append(similarity(representation,
        #                                                                        fragment_vector[representation],
        #                                                                        answers_vectors[representation]))
        #
        # for representation in representations:
        #     features.update({
        #         "text_features:avg_fragment_answer_%s_%s" % (representation, similarity_name):
        #             avg(fragment_answer_similarities[representation]),
        #         "text_features:max_fragment_answer_%s_%s" % (representation, similarity_name):
        #             max(fragment_answer_similarities[representation]) if fragment_answer_similarities[representation] else 0.0,
        #     })

        # logger.info("Combined document-answer similarity...")
        # Combined documents answer similarity
        for representation in representations:
            combineddoc_answer_similarity = similarity(representation,
                                                       combined_documents_vector[representation],
                                                       answers_vectors[representation])
            features.update({
                "text_features:combdocument_answer_%s_%s" % (representation, similarity_name):
                    combineddoc_answer_similarity,
            })

        # logger.info("Combined snippet-answer similarity...")
        for representation in representations:
            combineddocsnippet_answer_similarity = similarity(representation,
                                                              combined_document_snippets_vector[representation],
                                                              answers_vectors[representation])
            features.update({
                "text_features:combdocument_snippet_answer_%s_%s" % (representation, similarity_name):
                    combineddocsnippet_answer_similarity,
            })

        # logger.info("Description-question similarity...")
        # These features aren't very effective either and are the next candidates for removal.
        description_question_entity_similarity = similarity("token_tfidf", question_entity_tfidf,
                                                            answer_description_vectors["token_tfidf"])
        description_question_token_similarity = similarity("token_tfidf", question_token_tfidf,
                                                           answer_description_vectors["token_tfidf"])
        features.update({
            "text_features:description_question_entitytoken_%s" % similarity_name:
                description_question_entity_similarity,
            "text_features:description_question_token_%s" % similarity_name: description_question_token_similarity,
        })

    # Description - question embedding similarity.
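    # Note: as written, both embedding features below are computed with
    # Similarity.embedding_avg_idf_similarity, so their values are identical.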
    description_question_token_embedding_avg_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    description_question_token_embedding_n_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    features.update({
        "text_features:description_question_token_avg_idf_embeddings":
            description_question_token_embedding_avg_similarity,
        "text_features:description_question_token_n_embeddings":
            description_question_token_embedding_n_similarity,
    })

    # Remove features with 0 score.
    features = dict((feature, value) for feature, value in features.iteritems() if value != 0.0)
    return features
Example #9
        "token_tfidf": SparseVector.from_2pos(combined_doc_snippet_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
    }

    # Cache the computed vectors.
    _documents_vectors_cache[question_text] = (documents_vectors, snippets_vectors, fragment_vectors,
                                               combined_documents_vector, combined_document_snippets_vector)

    return documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector, combined_document_snippets_vector


def create_document_vectors_cache(questions):
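    """Generate document vectors for every question and pickle the cached
    (question, vectors) tuples into the configured cache file."""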
    cache_file = globals.config.get('WebSearchFeatures', 'document-vectors')
    logger.info("Caching document vectors...")
    with open(cache_file, 'wx') as out:
        for index, question in enumerate(questions):
            question_token2pos = dict((token, [1, ]) for token in tokenize(question))
            generate_document_vectors(question, question_token2pos, get_questions_serps())
            pickle.dump((question, _documents_vectors_cache[question]), out)
            if index % 100 == 0:
                logger.info("Cached document vectors for %d questions" % index)


if __name__ == "__main__":
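    # Set up logging, load the configuration and cache document vectors
    # for every question that has search results.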
    logging.basicConfig(format='%(asctime)s : %(levelname)s '
                               ': %(module)s : %(message)s',
                        level=logging.INFO)
    globals.read_configuration('config_wikipedia.cfg')
    serps = get_questions_serps()
    create_document_vectors_cache(serps.keys())