def train_type_model():
    """Train a binary classifier that scores (question n-grams, notable type) pairs."""
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            # Notable types of the gold answer entities are positive examples.
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            # Notable types that appear only in other candidates' answers are negatives.
            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for type in correct_notable_types.union(incorrect_notable_types):
                if type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # Keep only the top 5% of features by chi^2 score, and restrict the
    # vectorizer to the same subset so it can be reused at prediction time.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    type_scorer = SGDClassifier(loss='log', class_weight='auto', n_iter=1000,
                                alpha=1.0, random_state=999, verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
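
# A minimal usage sketch for the model trained above (added for illustration;
# the helper name and the example feature key are hypothetical). It assumes the
# (vec, type_scorer) pair pickled by train_type_model() and a feature dict
# shaped like the output of FeatureExtractor.extract_ngram_features.
def _score_type_sketch(features_dict):
    with open("type-model.pickle", 'rb') as inp:
        vec, type_scorer = pickle.load(inp)
    # vec was restricted to the selected features, so transform() already
    # produces the matrix layout the classifier was trained on.
    X = vec.transform([features_dict])
    # loss='log' makes predict_proba available.
    return type_scorer.predict_proba(X)[0][1]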
def get_entity_idf(entity):
    """
    Get the entity IDF based on Google's annotation of the ClueWeb corpus.
    :param entity: The entity to look up.
    :return: IDF of the entity based on the ClueWeb collection.
    """
    global _entity_counts
    if _entity_counts is None:
        _entity_counts = dict()
        with gzip.open(globals.config.get('WebSearchFeatures', 'entity-webcounts-file'), 'r') as input_file:
            logger.info("Reading entity ClueWeb counts...")
            # Use loop variables that don't shadow the `entity` argument.
            for line in input_file:
                entity_mid, count = line.strip().split('\t')
                _entity_counts[entity_mid] = int(count)
            logger.info("Reading entity ClueWeb counts done!")
    if _entity_counts:
        mids = ["/" + mid.replace(".", "/")
                for mid in KBEntity.get_entityid_by_name(entity, keep_most_triples=True)]
        if mids:
            idf = min(log(max(1.0, CLUEWEB_DOCUMENTS_COUNT /
                              (_entity_counts[mid] if mid in _entity_counts and _entity_counts[mid] > 0
                               else 1.0)))
                      for mid in mids)
            # logger.info("IDF entity %s %.3f" % (entity, idf))
            return idf
        return 1.0
    else:
        return 0.0
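
# Toy illustration of the IDF computation above with made-up counts (the real
# counts come from the entity-webcounts-file configured in config.cfg):
# idf(entity) = min over matching MIDs of log(max(1.0, N / count(mid))).
def _entity_idf_sketch():
    from math import log
    N = 1000000.0  # stand-in for CLUEWEB_DOCUMENTS_COUNT
    counts = {'/m/0abc1': 50000.0, '/m/0abc2': 500.0}  # hypothetical MIDs
    mids = ['/m/0abc1', '/m/0abc2']
    # The more frequent MID wins: log(1000000 / 50000) = log(20) ~ 3.0.
    return min(log(max(1.0, N / counts.get(mid, 1.0))) for mid in mids)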
def rank_query_candidates(self, query_candidates, key=lambda x: x, utterance=""):
    """
    Returns candidates generated from search results. This method doesn't look
    into the existing candidates, but rather creates new ones based on search
    results.
    :param query_candidates: List of EvaluationCandidate objects. This answerer
                             doesn't actually use them.
    """
    if isinstance(utterance, unicode):
        utterance = utterance.encode('utf-8')
    if utterance in self._answers_cache:
        return self._answers_cache[utterance]

    # utterance is already UTF-8 encoded at this point.
    question_entities = set([e['name'] for e in find_entity_mentions(utterance, use_tagme=True)])
    res = self._searcher.search(utterance, topn=self._topn)
    res = json.loads(res)
    entities = dict()
    for r in res:
        for e in r['entities']:
            if e['mid'] not in entities:
                entities[e['mid']] = []
            entities[e['mid']].append((r['phrase'], r['score']))
    # Rank entities by the total score of the search phrases they matched in,
    # dropping entities already mentioned in the question.
    answers = sorted(entities.items(), key=lambda x: sum(score for _, score in x[1]), reverse=True)
    answers = [(KBEntity.get_entity_name(answer[0].replace("/", ".")), answer[1])
               for answer in answers if answer[0] not in question_entities]
    answers = [EvaluationCandidate(None, answer[1], [answer[0], ]) for answer in answers]
    self._answers_cache[utterance] = answers
    return answers
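
# A self-contained sketch of the aggregation step above with assumed data (in
# the real code the (phrase, score) pairs come from self._searcher): an entity
# is ranked by the sum of retrieval scores over all phrases it matched in.
def _rank_entities_sketch():
    entities = {
        'm.01': [('barack obama', 1.5), ('obama', 0.7)],  # hypothetical hits
        'm.02': [('white house', 1.1)],
    }
    ranked = sorted(entities.items(), key=lambda x: sum(score for _, score in x[1]), reverse=True)
    return ranked  # 'm.01' first with 2.2, then 'm.02' with 1.1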
def filter_entity_descriptions(mids):
    import gzip
    descriptions_file = globals.config.get("EntityLinker", "entity-descriptions-file")
    with gzip.open(descriptions_file, "r") as input_file, \
            gzip.open(descriptions_file + "_small", "w") as out:
        for index, line in enumerate(input_file):
            triple = KBEntity.parse_freebase_string_triple(line)
            if triple[0] in mids:
                print >> out, line.strip()
def get_answer_notable_types(self):
    """
    Returns a list of notable types of the result entities, skipping
    year-valued answers.
    :return: A list with the notable type of each non-year answer entity.
    """
    if self.answer_notable_types is None:
        self.answer_notable_types = []
        for mid, answer in zip(self.get_results_mids(), self.get_results_text()):
            if _year_pattern.match(answer) is not None:
                continue
            self.answer_notable_types.append(KBEntity.get_notable_type(mid))
    return self.answer_notable_types
def filter_entity_names(names):
    import gzip
    mids = set()
    entities_file = globals.config.get("EntityLinker", "entity-names-file")
    with gzip.open(entities_file, "r") as input_file, \
            gzip.open(entities_file + "_small", "w") as out:
        for index, line in enumerate(input_file):
            triple = KBEntity.parse_freebase_string_triple(line)
            name = triple[2].lower()
            if name in names:
                mids.add(triple[0])
                print >> out, line.strip()
    return mids
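
# How the two filters above compose (a sketch; the input name set is
# hypothetical and the file paths come from config.cfg): first shrink the
# names file while collecting the surviving MIDs, then shrink the
# descriptions file to those same entities.
def _shrink_entity_files_sketch():
    names = set(['barack obama', 'white house'])  # hypothetical, lowercased
    mids = filter_entity_names(names)
    filter_entity_descriptions(mids)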
def filter_answers_by_type(self, type_filter, score):
    assert self.type_filter is None
    text_results = self.get_results_text()
    mid_results = self.get_results_mids()
    assert len(text_results) == len(mid_results)
    new_results_text = []
    new_results_mids = []
    for mid, answer in zip(mid_results, text_results):
        if KBEntity.get_notable_type(mid) == type_filter:
            new_results_mids.append(mid)
            new_results_text.append(answer)
    self.query_results = new_results_text
    self.query_results_mids = new_results_mids
    self.type_filter = type_filter
    self.type_filter_max_npmi = score[0]
    self.type_filter_avg_npmi = score[0]
    self.cached_result_count = len(self.query_results)
    return self.get_results_text()
def _bytes_to_entity(line: bytes) -> 'KBEntity':
    """
    Instantiate entity from string representation.

    >>> e = EntityIndex._bytes_to_entity(b'm.0abc1\\tfoo name\\t7\\tfooly\\tfoo\\n')
    >>> e.name
    'foo name'
    >>> e.id
    'm.0abc1'
    >>> e.score
    7
    >>> e.aliases
    ['fooly', 'foo']
    """
    cols = line.strip().decode('utf-8').split('\t')
    mid = cols[0]
    name = cols[1]
    score = int(cols[2])
    aliases = cols[3:]
    return KBEntity(name, mid, score, aliases)
# else:
#     for relation in query.eval_candidates[query.oracle_position - 1].query_candidate.relations:
#         if relation.name not in correct_relations:
#             print query.utterance
#             print relation.name
#             print query.eval_candidates[query.oracle_position - 1].query_candidate
#             print "-----"

# This loop will print questions without a good candidate.
# if query.oracle_position == -1:
#     entities = set()
#     for candidate in query.eval_candidates:
#         for entity in candidate.query_candidate.matched_entities:
#             if isinstance(entity.entity.entity, KBEntity):
#                 entities.add((entity.entity.name, entity.entity.entity.id))
#     print ">>>", query.utterance
#     print entities

# Print all (question entity MID, answer entity MID) pairs for each candidate.
for candidate in query.eval_candidates:
    answer_entities = set(mid for entity_name in candidate.prediction
                          for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True))
    question_entities = set(mid for entity in candidate.query_candidate.matched_entities
                            for mid in KBEntity.get_entityid_by_name(entity.entity.name, keep_most_triples=True))
    for question_entity in question_entities:
        for answer_entity in answer_entities:
            print question_entity + "\t" + answer_entity
if index % 100 == 0:
    print >> stderr, "Processed %d queries" % index
def generate_text_based_features(candidate):
    # Get candidate answers.
    answers = map(unicode.lower, candidate.get_results_text())
    # Skip candidates with no answers.
    if len(answers) == 0:
        return dict()

    # Get answer descriptions.
    answers_descriptions = ['\n'.join(KBEntity.get_entity_descriptions_by_name(answer, keep_most_triples_only=True))
                            for answer in answers]

    # Get question text.
    question_text = candidate.query.original_query
    question_tokens2pos = dict((token, [1, ]) for token in tokenize(question_text))
    question_token_tfidf = SparseVector.from_2pos(question_tokens2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get question entities.
    question_entities2pos = dict((entity.entity.name.lower(), [1, ]) for entity in candidate.matched_entities)
    question_entitytoken2pos = dict((token, [1, ])
                                    for entity in candidate.matched_entities
                                    for token in tokenize(entity.entity.name))
    question_entity_tfidf = SparseVector.from_2pos(question_entitytoken2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get search results and check that they aren't empty.
    questions_search_results = get_questions_serps()
    documents_vectors = []
    snippets_vectors = []
    fragment_vectors = []
    combined_documents_vector = dict()
    combined_document_snippets_vector = dict()
    representations = ["entity_tfidf",
                       "token_tfidf",
                       # "entity",
                       # "token",
                       ]
    for r in representations:
        combined_documents_vector[r] = dict()
        combined_document_snippets_vector[r] = dict()
    if question_text not in questions_search_results:
        logger.warning("No search results found for the question %s" % question_text)
    else:
        documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector, \
            combined_document_snippets_vector = generate_document_vectors(question_text,
                                                                          question_tokens2pos,
                                                                          questions_search_results)

    answer_entity2pos = dict((answer_entity, [1, ]) for answer_entity in answers)
    answer_token2pos = dict((answer_token, [1, ])
                            for answer_entity in answers
                            for answer_token in tokenize(answer_entity))
    answers_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector.from_2pos(answer_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        # "entity": SparseVector.from_2pos(answer_entity2pos),
        # "token": SparseVector.from_2pos(answer_token2pos),
    }
    answer_descriptions_token2pos = dict((token, [1, ])
                                         for description in answers_descriptions
                                         for token in tokenize(description))
    answer_description_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_descriptions_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        # Keeping only tf-idf similarities. This seems to be enough.
        "entity_tfidf": SparseVector(dict()),
        # "token": SparseVector.from_2pos(answer_descriptions_token2pos),
        # "entity": SparseVector(dict()),
    }

    similarity_functions = [
        ("cosine", Similarity.cosine_similarity),
        # ("intersection", Similarity.intersection_similarity),
        # ("normalized_intersection", Similarity.normalized_intersection_similarity),
        # ("bm25", Similarity.bm25_similarity),
    ]

    features = dict()
    for similarity_name, similarity in similarity_functions:
        # Computing document-answer similarities for each representation.
        document_answer_similarities = {}
        for representation in representations:
            if representation not in document_answer_similarities:
                document_answer_similarities[representation] = []
            for doc_vector in documents_vectors:
                document_answer_similarities[representation].append(
                    similarity(representation, doc_vector[representation], answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_document_answer_%s_%s" % (representation, similarity_name):
                    avg(document_answer_similarities[representation]),
                "text_features:max_document_answer_%s_%s" % (representation, similarity_name):
                    max(document_answer_similarities[representation])
                    if document_answer_similarities[representation] else 0.0,
            })

        # logger.info("Snippet-answer similarity...")
        # Computing snippet-answer similarities for each representation.
        snippet_answer_similarities = {}
        for representation in representations:
            if representation not in snippet_answer_similarities:
                snippet_answer_similarities[representation] = []
            for snippet_vector in snippets_vectors:
                snippet_answer_similarities[representation].append(
                    similarity(representation, snippet_vector[representation], answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_snippet_answer_%s_%s" % (representation, similarity_name):
                    avg(snippet_answer_similarities[representation]),
                "text_features:max_snippet_answer_%s_%s" % (representation, similarity_name):
                    max(snippet_answer_similarities[representation])
                    if snippet_answer_similarities[representation] else 0.0,
            })

        # logger.info("Fragment-answer similarity...")
        # Best BM25 fragment-answer similarities.
        # These weren't very effective, so I removed the features. There is a chance
        # that there is a bug in them.
        # fragment_answer_similarities = {}
        # for fragment_vector in fragment_vectors:
        #     for representation in representations:
        #         if representation not in fragment_answer_similarities:
        #             fragment_answer_similarities[representation] = []
        #         fragment_answer_similarities[representation].append(
        #             similarity(representation, fragment_vector[representation], answers_vectors[representation]))
        #
        # for representation in representations:
        #     features.update({
        #         "text_features:avg_fragment_answer_%s_%s" % (representation, similarity_name):
        #             avg(fragment_answer_similarities[representation]),
        #         "text_features:max_fragment_answer_%s_%s" % (representation, similarity_name):
        #             max(fragment_answer_similarities[representation])
        #             if fragment_answer_similarities[representation] else 0.0,
        #     })

        # logger.info("Combined document-answer similarity...")
        # Combined documents-answer similarity.
        for representation in representations:
            combineddoc_answer_similarity = similarity(representation,
                                                       combined_documents_vector[representation],
                                                       answers_vectors[representation])
            features.update({
                "text_features:combdocument_answer_%s_%s" % (representation, similarity_name):
                    combineddoc_answer_similarity,
            })

        # logger.info("Combined snippet-answer similarity...")
        for representation in representations:
            combineddocsnippet_answer_similarity = similarity(representation,
                                                              combined_document_snippets_vector[representation],
                                                              answers_vectors[representation])
            features.update({
                "text_features:combdocument_snippet_answer_%s_%s" % (representation, similarity_name):
                    combineddocsnippet_answer_similarity,
            })

        # logger.info("Description-question similarity...")
        # These features aren't very effective either; the next candidate for removal.
        description_question_entity_similarity = similarity("token_tfidf",
                                                            question_entity_tfidf,
                                                            answer_description_vectors["token_tfidf"])
        description_question_token_similarity = similarity("token_tfidf",
                                                           question_token_tfidf,
                                                           answer_description_vectors["token_tfidf"])
        features.update({
            "text_features:description_question_entitytoken_%s" % similarity_name:
                description_question_entity_similarity,
            "text_features:description_question_token_%s" % similarity_name:
                description_question_token_similarity,
        })

        # Description-question embedding similarity.
        # NOTE: both calls below use the same avg-idf embedding similarity, so the
        # "n_embeddings" feature currently duplicates the "avg_idf_embeddings" value.
        description_question_token_embedding_avg_similarity = Similarity.embedding_avg_idf_similarity(
            "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
        description_question_token_embedding_n_similarity = Similarity.embedding_avg_idf_similarity(
            "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
        features.update({
            "text_features:description_question_token_avg_idf_embeddings":
                description_question_token_embedding_avg_similarity,
            "text_features:description_question_token_n_embeddings":
                description_question_token_embedding_n_similarity,
        })

    # Remove features with 0 score.
    features = dict((feature, value) for feature, value in features.iteritems() if value != 0.0)
    return features
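
# The Similarity helpers are defined elsewhere in the repo. As a point of
# reference, a minimal cosine over the dict-backed sparse vectors used above
# might look like the sketch below (an assumption about the interface, not
# the actual implementation).
def _cosine_sketch(v1, v2):
    # v1, v2: dicts mapping a term to its tf-idf weight.
    from math import sqrt
    dot = sum(weight * v2[key] for key, weight in v1.iteritems() if key in v2)
    norm1 = sqrt(sum(w * w for w in v1.itervalues()))
    norm2 = sqrt(sum(w * w for w in v2.itervalues()))
    return dot / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0.0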
def extract_npmi_ngram_type_pairs():
    globals.read_configuration('config.cfg')
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    n_gram_type_counts = dict()
    type_counts = dict()
    n_gram_counts = dict()
    total = 0
    year_pattern = re.compile("[0-9]+")
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            if query.oracle_position != -1 and query.oracle_position <= len(query.eval_candidates):
                correct_candidate = query.eval_candidates[query.oracle_position - 1]
                logger.info(query.utterance)
                logger.info(correct_candidate.query_candidate)
                n_grams = set(get_n_grams_features(correct_candidate.query_candidate))

                answer_entities = [mid for answer in query.target_result
                                   if year_pattern.match(answer) is None
                                   for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
                correct_notable_types = set(filter(lambda x: x,
                                                   [KBEntity.get_notable_type(entity_mid)
                                                    for entity_mid in answer_entities]))

                for notable_type in correct_notable_types:
                    if notable_type not in type_counts:
                        type_counts[notable_type] = 0
                    type_counts[notable_type] += 1

                for n_gram in n_grams:
                    if n_gram not in n_gram_counts:
                        n_gram_counts[n_gram] = 0
                    n_gram_counts[n_gram] += 1
                    for notable_type in correct_notable_types:
                        pair = (n_gram, notable_type)
                        if pair not in n_gram_type_counts:
                            n_gram_type_counts[pair] = 0
                        n_gram_type_counts[pair] += 1
                total += 1

    npmi = dict()
    from math import log
    for n_gram_type_pair, n_gram_type_count in n_gram_type_counts.iteritems():
        if n_gram_type_count > 4:
            n_gram, type = n_gram_type_pair
            npmi[n_gram_type_pair] = (log(n_gram_type_count) - log(n_gram_counts[n_gram]) - log(type_counts[type])
                                      + log(total)) / (-log(n_gram_type_count) + log(total))

    with open("type_model_npmi.pickle", 'wb') as out:
        pickle.dump(npmi, out)

    import operator
    npmi = sorted(npmi.items(), key=operator.itemgetter(1), reverse=True)
    print "\n".join(map(str, npmi[:50]))
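
# Sanity-check sketch of the NPMI formula above on made-up counts:
# npmi = (log c_xy - log c_x - log c_y + log N) / (log N - log c_xy),
# i.e. PMI normalized into [-1, 1].
def _npmi_sketch():
    from math import log
    n_gram_count, type_count, pair_count, total = 100, 200, 50, 10000
    pmi = log(pair_count) - log(n_gram_count) - log(type_count) + log(total)
    return pmi / (-log(pair_count) + log(total))  # ~0.61 for these counts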
with open("type-model.pickle", 'wb') as out: pickle.dump((vec, type_scorer), out) if __name__ == "__main__": extract_npmi_ngram_type_pairs() exit() globals.read_configuration('config.cfg') parser = globals.get_parser() scorer_globals.init() datasets = ["webquestions_split_train", ] # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",] # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",] data = [] for dataset in datasets: queries = load_eval_queries(dataset) for index, query in enumerate(queries): tokens = [token.token for token in parser.parse(query.utterance).tokens] answer_entities = [mid for answer in query.target_result for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)] notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities] data.append((tokens, notable_types)) logger.info(tokens) logger.info([KBEntity.get_entity_name(notable_type) for notable_type in notable_types]) with open("question_tokens_notable_types.pickle", 'wb') as out: pickle.dump(data, out)