def get_evaluated_queries(dataset, cached, parameters, n_top, prune_for_training):
    """Returns evaluated queries.

    :rtype list[EvaluationQuery]
    :param dataset:
    :param cached:
    :param parameters:
    :param n_top:
    :param prune_for_training:
    :return:
    """
    queries = []
    if cached:
        queries = get_cached_evaluated_queries(dataset, parameters)
    if not queries:
        # Note: we use the default scorer here, but with parameters
        # of the selected scorer.
        translator = QueryTranslator.init_from_config()
        candidate_ranker = ranker.LiteralRanker('DefaultScorer')
        candidate_ranker.parameters = parameters
        translator.set_ranker(candidate_ranker)
        queries = load_eval_queries(dataset)
        # We evaluate the queries here so that in subsequent runs we already
        # know which candidate is correct etc. and do not have to perform the
        # same calculations again.
        _, queries = evaluate_translator(translator, queries,
                                         n_top=n_top,
                                         ignore_invalid=False,
                                         output_result=False,
                                         prune_for_training=prune_for_training)
        if cached:
            cache_evaluated_queries(dataset, queries, parameters)
    return queries
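# Hedged usage sketch (illustration only, not part of the original pipeline):
# how get_evaluated_queries might be invoked. The dataset name is taken from
# the dataset lists used elsewhere in this file; reading .parameters off a
# fresh LiteralRanker and n_top=100 are assumptions made for the example.
def _example_get_evaluated_queries():
    import scorer_globals
    globals.read_configuration('config.cfg')
    scorer_globals.init()
    # Assumption: a freshly constructed default ranker carries usable
    # default parameters.
    parameters = ranker.LiteralRanker('DefaultScorer').parameters
    queries = get_evaluated_queries("webquestions_split_dev",
                                    cached=True,
                                    parameters=parameters,
                                    n_top=100,
                                    prune_for_training=False)
    print "%d evaluated queries loaded" % len(queries)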
def get_number_of_external_entities():
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    parser = CoreNLPParser.init_from_config()
    entity_linker = WebSearchResultsExtenderEntityLinker.init_from_config()
    entity_linker.topn_entities = 100000
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    datasets = ["webquestions_split_train", "webquestions_split_dev",]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    external_entities_count = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            entities = entity_linker.identify_entities_in_tokens(
                parser.parse(query.utterance).tokens,
                text=query.utterance,
                find_dates=False)
            print "-------------------------"
            print query.utterance
            print "\n".join(map(str, sorted(entities,
                                            key=lambda entity: entity.external_entity_count,
                                            reverse=True)))
            # Count how many of the linked entities came from external sources.
            external_entities_count.append(0)
            for entity in entities:
                if entity.external_entity:
                    external_entities_count[-1] += 1
            if index % 100 == 0:
                print >> sys.stderr, "%s queries processed" % index
    print "========================================="
    print external_entities_count
    print sum(external_entities_count)
    print len(external_entities_count)
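# Hedged follow-up (illustration only): one way to summarize the per-query
# counts collected by get_number_of_external_entities, e.g. the share of
# questions with at least one externally linked entity. The helper name is
# hypothetical and not part of the original code.
def _summarize_external_entity_counts(external_entities_count):
    with_external = sum(1 for count in external_entities_count if count > 0)
    print "%d of %d questions contain at least one external entity" % (
        with_external, len(external_entities_count))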
def get_question_terms():
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    scorer_globals.init()

    datasets = ["webquestionstrain", "webquestionstest",]
    question_tokens = set()
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for query in queries:
            question_tokens.update(tokenize(query.utterance))
    print question_tokens
with open("type-model.pickle", 'wb') as out: pickle.dump((vec, type_scorer), out) if __name__ == "__main__": extract_npmi_ngram_type_pairs() exit() globals.read_configuration('config.cfg') parser = globals.get_parser() scorer_globals.init() datasets = ["webquestions_split_train", ] # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",] # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",] data = [] for dataset in datasets: queries = load_eval_queries(dataset) for index, query in enumerate(queries): tokens = [token.token for token in parser.parse(query.utterance).tokens] answer_entities = [mid for answer in query.target_result for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)] notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities] data.append((tokens, notable_types)) logger.info(tokens) logger.info([KBEntity.get_entity_name(notable_type) for notable_type in notable_types]) with open("question_tokens_notable_types.pickle", 'wb') as out: pickle.dump(data, out)