def main_entity_link_text():
    """Interactive loop: link entities in free text and in the snippets of its cached search results."""
    globals.read_configuration('config.cfg')
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    globals.logger.setLevel("DEBUG")
    import operator
    while True:
        print "Please enter some text: "
        text = sys.stdin.readline().strip().decode('utf-8')
        tokens = parser.parse(text).tokens
        print "Entities:", entity_linker.identify_entities_in_document(tokens, max_token_window=5)

        # Aggregate entities and token lemmas over the top-10 search results for the text, if cached.
        entities = {}
        tokens = {}
        if text in question_search_results:
            for doc in question_search_results[text][:10]:
                print doc
                title = doc.title
                snippet = doc.snippet
                snippet_tokens = parser.parse(title + "\n" + snippet).tokens
                for token in snippet_tokens:
                    if token.lemma not in tokens:
                        tokens[token.lemma] = 0
                    tokens[token.lemma] += 1
                for entity in entity_linker.identify_entities_in_document(snippet_tokens):
                    if entity['mid'] not in entities:
                        entities[entity['mid']] = entity
                    else:
                        entities[entity['mid']]['count'] += entity['count']
            # Print the 50 most frequent entities across the snippets.
            print sorted(entities.values(), key=operator.itemgetter('count'), reverse=True)[:50]
def main_parse():
    """Parse the content of the top-10 documents of each SERP and pickle (url, parsed content) pairs."""
    document_content_file = globals.config.get('WebSearchFeatures', 'documents-content-file')
    parser = CoreNLPParser.init_from_config()
    question_serps = get_questions_serps()
    print datetime.now()
    with open(document_content_file, 'wx') as out_file:
        index = 0
        for serp in question_serps.itervalues():
            for doc in serp[:10]:
                content = doc.content()
                if len(content) > 0:
                    document = (doc.url, parser.parse(content))
                    pickle.dump(document, out_file)
            print "Query #", index, datetime.now()
            index += 1
def main_doc_entities_from_content():
    """Identify entities in the parsed content of the top search results and pickle them keyed by URL."""
    entity_linker = globals.get_entity_linker()
    document_entities_file = globals.config.get('WebSearchFeatures', 'documents-entities-file')
    doc_entities = dict()
    from text2kb.utils import get_documents_content_dict
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    documents_content = get_documents_content_dict(return_parsed_tokens=True)
    index = 0
    for serp in question_search_results.itervalues():
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            if doc.url in documents_content:
                doc_entities[doc.url] = entity_linker.identify_entities_in_document(documents_content[doc.url],
                                                                                    min_surface_score=0.5)
        index += 1
        if index % 100 == 0:
            logger.info("%d SERPs processed" % index)
    with open(document_entities_file, 'wx') as out:
        pickle.dump(doc_entities, out)
def entity_link_snippets():
    """Identify entities in search result snippets, keep the matched token spans and pickle the result."""
    entity_linker = globals.get_entity_linker()
    snippet_entities_file = globals.config.get('WebSearchFeatures', 'document-snippet-entities')
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    doc_snippet_entities = dict()
    for index, serp in enumerate(question_search_results.itervalues()):
        for doc in serp[:globals.SEARCH_RESULTS_TOPN]:
            snippet_tokens = doc.get_snippet_tokens()
            entities = entity_linker.identify_entities_in_document(snippet_tokens)
            # Remember the token spans on which each entity was matched.
            for entity in entities:
                entity['matches'] = []
                for position in entity['positions']:
                    entity['matches'].append(snippet_tokens[position[0]:position[1]])
            doc_snippet_entities[doc.url] = entities
        if index % 100 == 0:
            logger.info("Processed %d serps" % index)
    logger.info("Pickling the dictionary...")
    with open(snippet_entities_file, 'wx') as out:
        pickle.dump(doc_snippet_entities, out)
    logger.info("Pickling the dictionary DONE!")
def init_from_config():
    config_options = globals.config
    surface_index = EntitySurfaceIndexMemory.init_from_config()
    max_entities_p_token = int(config_options.get('EntityLinker', 'max-entites-per-tokens'))
    use_web_results = config_options.get('EntityLinker', 'use-web-results') == "True"
    topn_entities = int(config_options.get('EntityLinker', 'topn-external-entities'))
    # Search results and snippet entities are only loaded when web results are enabled.
    question_search_results = dict()
    doc_snippets_entities = dict()
    if use_web_results:
        from text2kb.utils import get_documents_snippet_entities
        from text2kb.utils import get_questions_serps
        question_search_results = get_questions_serps()
        doc_snippets_entities = get_documents_snippet_entities()
    return WebSearchResultsExtenderEntityLinker(surface_index, max_entities_p_token, use_web_results,
                                                topn_entities=topn_entities,
                                                search_results=question_search_results,
                                                doc_snippets_entities=doc_snippets_entities)
def main_entities():
    """Interactive loop: print the top entities found in the documents retrieved for a question."""
    globals.read_configuration('config.cfg')
    from text2kb.utils import get_questions_serps
    from text2kb.utils import get_documents_entities
    serps = get_questions_serps()
    doc_entities = get_documents_entities()
    import operator
    while True:
        print "Please enter a question:"
        question = sys.stdin.readline().strip()
        if question in serps:
            docs = serps[question][:10]
            # Aggregate entity counts over the top-10 documents.
            entities = {}
            for doc in docs:
                for entity in doc_entities[doc.url].itervalues():
                    e = (entity['mid'], entity['name'])
                    if e not in entities:
                        entities[e] = 0
                    entities[e] += entity['count']
            top_entities = entities.items()
            top_entities.sort(key=operator.itemgetter(1), reverse=True)
            print top_entities[:50]
def generate_text_based_features(candidate):
    # Get candidate answers
    answers = map(unicode.lower, candidate.get_results_text())
    # Skip empty and extra-long answers.
    if len(answers) == 0:
        return dict()
    # Get answers descriptions.
    answers_descriptions = ['\n'.join(KBEntity.get_entity_descriptions_by_name(answer, keep_most_triples_only=True))
                            for answer in answers]
    # Get question text.
    question_text = candidate.query.original_query
    question_tokens2pos = dict((token, [1, ]) for token in tokenize(question_text))
    question_token_tfidf = SparseVector.from_2pos(question_tokens2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements)
    # Get question entities
    question_entities2pos = dict((entity.entity.name.lower(), [1, ]) for entity in candidate.matched_entities)
    question_entitytoken2pos = dict((token, [1, ])
                                    for entity in candidate.matched_entities
                                    for token in tokenize(entity.entity.name))
    question_entity_tfidf = SparseVector.from_2pos(question_entitytoken2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_token_elements)
    # Get search results and check that they aren't empty
    questions_search_results = get_questions_serps()
    documents_vectors = []
    snippets_vectors = []
    fragment_vectors = []
    combined_documents_vector = dict()
    combined_document_snippets_vector = dict()
    representations = ["entity_tfidf",
                       "token_tfidf",
                       # "entity",
                       # "token",
                       ]
    for r in representations:
        combined_documents_vector[r] = dict()
        combined_document_snippets_vector[r] = dict()
    if question_text not in questions_search_results:
        logger.warning("No search results found for the question %s" % question_text)
    else:
        documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector,\
            combined_document_snippets_vector = generate_document_vectors(question_text,
                                                                          question_tokens2pos,
                                                                          questions_search_results)
    answer_entity2pos = dict((answer_entity, [1, ]) for answer_entity in answers)
    answer_token2pos = dict((answer_token, [1, ])
                            for answer_entity in answers
                            for answer_token in tokenize(answer_entity))
    answers_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector.from_2pos(answer_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        # "entity": SparseVector.from_2pos(answer_entity2pos),
        # "token": SparseVector.from_2pos(answer_token2pos),
    }
    answer_descriptions_token2pos = dict((token, [1, ])
                                         for description in answers_descriptions
                                         for token in tokenize(description))
    answer_description_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_descriptions_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector(dict()),
        # Keeping only tf-idf similarities. This seems to be enough.
        # "token": SparseVector.from_2pos(answer_descriptions_token2pos),
        # "entity": SparseVector(dict()),
    }
    similarity_functions = [
        ("cosine", Similarity.cosine_similarity),
        # ("itersection", Similarity.intersection_similarity),
        # ("normalized_intersection", Similarity.normalized_intersection_similarity),
        # ("bm25", Similarity.bm25_similarity),
    ]
    features = dict()
    for similarity_name, similarity in similarity_functions:
        # Computing document-answer similarities for each representation.
        document_answer_similarities = {}
        for representation in representations:
            if representation not in document_answer_similarities:
                document_answer_similarities[representation] = []
            for doc_vector in documents_vectors:
                document_answer_similarities[representation].append(similarity(representation,
                                                                               doc_vector[representation],
                                                                               answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_document_answer_%s_%s" % (representation, similarity_name):
                    avg(document_answer_similarities[representation]),
                "text_features:max_document_answer_%s_%s" % (representation, similarity_name):
                    max(document_answer_similarities[representation])
                    if document_answer_similarities[representation] else 0.0,
            })

        # logger.info("Snippet-answer similarity...")
        # Computing snippet-answer similarities for each representation.
        snippet_answer_similarities = {}
        for representation in representations:
            if representation not in snippet_answer_similarities:
                snippet_answer_similarities[representation] = []
            for snippet_vector in snippets_vectors:
                snippet_answer_similarities[representation].append(similarity(representation,
                                                                              snippet_vector[representation],
                                                                              answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_snippet_answer_%s_%s" % (representation, similarity_name):
                    avg(snippet_answer_similarities[representation]),
                "text_features:max_snippet_answer_%s_%s" % (representation, similarity_name):
                    max(snippet_answer_similarities[representation])
                    if snippet_answer_similarities[representation] else 0.0,
            })

        # logger.info("Fragment-answer similarity...")
        # Best BM25 fragment-answer similarities.
        # These weren't very efficient, so the features were removed. There is a chance that they contain a bug.
        # fragment_answer_similarities = {}
        # for fragment_vector in fragment_vectors:
        #     for representation in representations:
        #         if representation not in fragment_answer_similarities:
        #             fragment_answer_similarities[representation] = []
        #         fragment_answer_similarities[representation].append(similarity(representation,
        #                                                                         fragment_vector[representation],
        #                                                                         answers_vectors[representation]))
        #
        # for representation in representations:
        #     features.update({
        #         "text_features:avg_fragment_answer_%s_%s" % (representation, similarity_name):
        #             avg(fragment_answer_similarities[representation]),
        #         "text_features:max_fragment_answer_%s_%s" % (representation, similarity_name):
        #             max(fragment_answer_similarities[representation])
        #             if fragment_answer_similarities[representation] else 0.0,
        #     })

        # logger.info("Combined document-answer similarity...")
        # Combined documents-answer similarity.
        for representation in representations:
            combineddoc_answer_similarity = similarity(representation,
                                                       combined_documents_vector[representation],
                                                       answers_vectors[representation])
            features.update({
                "text_features:combdocument_answer_%s_%s" % (representation, similarity_name):
                    combineddoc_answer_similarity,
            })

        # logger.info("Combined snippet-answer similarity...")
        for representation in representations:
            combineddocsnippet_answer_similarity = similarity(representation,
                                                              combined_document_snippets_vector[representation],
                                                              answers_vectors[representation])
            features.update({
                "text_features:combdocument_snippet_answer_%s_%s" % (representation, similarity_name):
                    combineddocsnippet_answer_similarity,
            })

        # logger.info("Description-question similarity...")
        # These features aren't very efficient either. The next candidate for removal.
        description_question_entity_similarity = similarity("token_tfidf",
                                                             question_entity_tfidf,
                                                             answer_description_vectors["token_tfidf"])
        description_question_token_similarity = similarity("token_tfidf",
                                                           question_token_tfidf,
                                                           answer_description_vectors["token_tfidf"])
        features.update({
            "text_features:description_question_entitytoken_%s" % similarity_name:
                description_question_entity_similarity,
            "text_features:description_question_token_%s" % similarity_name:
                description_question_token_similarity,
        })

    # Description - question embedding similarity.
    description_question_token_embedding_avg_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    description_question_token_embedding_n_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    features.update({
        "text_features:description_question_token_avg_idf_embeddings":
            description_question_token_embedding_avg_similarity,
        "text_features:description_question_token_n_embeddings":
            description_question_token_embedding_n_similarity,
    })
    # Remove features with 0 score.
    features = dict((feature, value) for feature, value in features.iteritems() if value != 0.0)
    return features
"token_tfidf": SparseVector.from_2pos(combined_doc_snippet_token2pos, element_calc_func=SparseVector.compute_tfidf_token_elements), } # Cache the computed vectors. _documents_vectors_cache[question_text] = (documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector, combined_document_snippets_vector) return documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector, combined_document_snippets_vector def create_document_vectors_cache(questions): cache_file = globals.config.get('WebSearchFeatures', 'document-vectors') logger.info("Caching document vectors...") with open(cache_file, 'wx') as out: for index, question in enumerate(questions): question_token2pos = dict((token, [1, ]) for token in tokenize(question)) generate_document_vectors(question, question_token2pos, get_questions_serps()) pickle.dump((question, _documents_vectors_cache[question]), out) if index % 100 == 0: logger.info("Cached document vectors for %d questions" % index) if __name__ == "__main__": logging.basicConfig(format='%(asctime)s : %(levelname)s ' ': %(module)s : %(message)s', level=logging.INFO) globals.read_configuration('config_wikipedia.cfg') serps = get_questions_serps() create_document_vectors_cache(serps.keys())