Пример #1
0
def get_evaluated_queries(dataset, cached, parameters, n_top,
                          prune_for_training):
    """Return the evaluated queries of a dataset, via the cache if allowed.

    When `cached` is truthy, a cached evaluation for this dataset and
    parameter set is looked up first and returned if it is non-empty.
    Otherwise the dataset's queries are loaded and evaluated, and the
    result is written to the cache when `cached` is truthy.

    :param dataset: dataset identifier handed to the loader and the cache.
    :param cached: whether the evaluation cache is read and written.
    :param parameters: parameters assigned to the ranker and used as part
        of the cache lookup/store calls.
    :param n_top: forwarded to evaluate_translator as `n_top`.
    :param prune_for_training: forwarded to evaluate_translator.
    :return: the evaluated queries.
    :rtype: list[EvaluationQuery]
    """
    if cached:
        from_cache = get_cached_evaluated_queries(dataset, parameters)
        if from_cache:
            return from_cache

    # Note: the default scorer is used here, but it is configured with the
    # parameters of the selected scorer.
    query_translator = QueryTranslator.init_from_config()
    default_ranker = ranker.LiteralRanker('DefaultScorer')
    default_ranker.parameters = parameters
    query_translator.set_ranker(default_ranker)

    raw_queries = load_eval_queries(dataset)
    # Evaluating here means subsequent runs already know which candidate is
    # correct etc. and do not have to repeat the same calculations.
    evaluation_options = {
        'n_top': n_top,
        'ignore_invalid': False,
        'output_result': False,
        'prune_for_training': prune_for_training,
    }
    _, evaluated = evaluate_translator(query_translator, raw_queries,
                                       **evaluation_options)
    if cached:
        cache_evaluated_queries(dataset, evaluated, parameters)
    return evaluated
Пример #2
0
def get_number_of_external_entities():
    """Print the linked entities of every query and count external ones.

    Reads 'config_webentity.cfg', sets up the parser, the entity linker
    (with `topn_entities` raised to 100000) and the scorer globals. For each
    query of the two webquestions split datasets it prints the utterance
    and the identified entities, ordered by `external_entity_count`
    descending, and records how many entities have a truthy
    `external_entity`. A progress line goes to stderr every 100 queries.
    Finally the per-query counts, their sum and their number are printed.
    """
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    parser = CoreNLPParser.init_from_config()
    entity_linker = WebSearchResultsExtenderEntityLinker.init_from_config()
    entity_linker.topn_entities = 100000
    scorer_globals.init()

    # NOTE(review): `parameters` is configured here but never used below.
    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    datasets = ["webquestions_split_train", "webquestions_split_dev",]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    counts_per_query = []
    for dataset_name in datasets:
        for position, query in enumerate(load_eval_queries(dataset_name)):
            parsed_tokens = parser.parse(query.utterance).tokens
            linked = entity_linker.identify_entities_in_tokens(
                parsed_tokens, text=query.utterance, find_dates=False)

            print("-------------------------")
            print(query.utterance)
            by_external_count = sorted(
                linked,
                key=lambda entity: entity.external_entity_count,
                reverse=True)
            print("\n".join(map(str, by_external_count)))

            # One entry per query: the number of external entities linked.
            counts_per_query.append(
                sum(1 for entity in linked if entity.external_entity))

            if position % 100 == 0:
                sys.stderr.write("%s queries processed\n" % position)

    print("=========================================")
    print(counts_per_query)
    print(sum(counts_per_query))
    print(len(counts_per_query))
Пример #3
0
def get_question_terms():
    """Print the set of distinct tokens over all question utterances.

    Reads 'config_webentity.cfg', initialises the scorer globals, then
    tokenizes the utterance of every query in the "webquestionstrain" and
    "webquestionstest" datasets and prints the union of all tokens.
    """
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    scorer_globals.init()

    vocabulary = set()
    for dataset_name in ("webquestionstrain", "webquestionstest"):
        for query in load_eval_queries(dataset_name):
            vocabulary.update(tokenize(query.utterance))
    print(vocabulary)
Пример #4
0
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)


if __name__ == "__main__":
    # NOTE(review): the script runs only this step and then exits, so
    # everything after exit() below is unreachable. It looks like a
    # deliberate toggle between two entry points; confirm before removing.
    extract_npmi_ngram_type_pairs()
    exit()

    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    # One (question tokens, notable types of the answers) pair per query.
    data = []
    for dataset in datasets:
        for query in load_eval_queries(dataset):
            parsed_question = parser.parse(query.utterance)
            tokens = [token.token for token in parsed_question.tokens]

            # Collect the entity ids of every target answer of the query.
            answer_entities = []
            for answer in query.target_result:
                answer_entities.extend(
                    KBEntity.get_entityid_by_name(answer,
                                                  keep_most_triples=True))

            notable_types = [KBEntity.get_notable_type(entity_mid)
                             for entity_mid in answer_entities]
            data.append((tokens, notable_types))

            logger.info(tokens)
            type_names = [KBEntity.get_entity_name(notable_type)
                          for notable_type in notable_types]
            logger.info(type_names)

    with open("question_tokens_notable_types.pickle", 'wb') as out:
        pickle.dump(data, out)