Пример #1
0
def print_sparql_queries():
    """Print graph and SPARQL representations of evaluated query candidates.

    Reads the configuration file given by ``--config``, loads the evaluated
    queries of the ``webquestions_test_filter`` dataset and, for every query,
    prints the utterance, the entities identified for its first candidate and,
    for each candidate whose answers have notable types, those types together
    with the candidate's simple graph string and SPARQL query.

    The ``--output`` option is accepted on the command line but is not read
    anywhere in this function; everything is written to stdout.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Dump qa entity pairs.")
    parser.add_argument("--config",
                        default="config.cfg",
                        help="The configuration file to use.")
    parser.add_argument("--output",
                        help="The file to dump results to.")
    args = parser.parse_args()
    globals.read_configuration(args.config)
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    dataset = "webquestions_test_filter"

    sparql_backend = globals.get_sparql_backend(globals.config)
    for query in get_evaluated_queries(dataset, True, parameters):
        print("--------------------------------------------")
        print(query.utterance)

        candidates = query.eval_candidates
        if not candidates:
            # Without candidates there is neither an entity list nor a query
            # to print; indexing [0] here used to raise IndexError.
            continue

        # The identified entities are read from the first candidate only.
        identified = candidates[0].query_candidate.query.identified_entities
        print("\n".join(str((entity.__class__, entity.entity))
                        for entity in identified))

        for eval_candidate in candidates:
            query_candidate = eval_candidate.query_candidate
            # The backend is attached before asking for notable types.
            query_candidate.sparql_backend = sparql_backend
            notable_types = query_candidate.get_answers_notable_types()
            if notable_types:
                print(notable_types)
                print(query_candidate.graph_as_simple_string().encode("utf-8"))
                print(query_candidate.to_sparql_query().encode("utf-8"))
                print("\n\n")
Пример #2
0
def get_number_of_external_entities():
    """Count, per question, how many linked entities are external ones.

    Uses the 'config_webentity.cfg' configuration, links entities in every
    utterance of the train and dev splits, prints the linked entities ordered
    by descending ``external_entity_count`` and finally prints the list of
    per-question counts followed by its sum and its length.
    """
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    parser = CoreNLPParser.init_from_config()
    entity_linker = WebSearchResultsExtenderEntityLinker.init_from_config()
    entity_linker.topn_entities = 100000
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    # Alternative dataset pairs used previously:
    #   "webquestions_split_train_externalentities", "webquestions_split_dev_externalentities"
    #   "webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3"
    dataset_names = ("webquestions_split_train", "webquestions_split_dev")

    per_query_counts = []
    for dataset_name in dataset_names:
        # The progress counter restarts with every dataset.
        for position, query in enumerate(load_eval_queries(dataset_name)):
            utterance = query.utterance
            tokens = parser.parse(utterance).tokens
            linked = entity_linker.identify_entities_in_tokens(
                tokens, text=utterance, find_dates=False)

            print("-------------------------")
            print(utterance)
            ranked = sorted(linked,
                            key=lambda item: item.external_entity_count,
                            reverse=True)
            print("\n".join(str(item) for item in ranked))

            per_query_counts.append(
                sum(1 for item in linked if item.external_entity))

            if position % 100 == 0:
                sys.stderr.write("%s queries processed\n" % position)

    print("=========================================")
    print(per_query_counts)
    print(sum(per_query_counts))
    print(len(per_query_counts))
Пример #3
0
def train_type_model():
    """Train a classifier that scores notable types from question n-grams.

    For every evaluated query of the training split, one example is created
    per notable type: label 1 for types of the gold answer entities, label 0
    for types that only occur among the candidates' predicted entities.
    Unlike the positive set, the candidate types are not filtered for empty
    values. The raw (features, labels) pair is pickled to
    "type_model_data.pickle" and the fitted (vectorizer, classifier) pair to
    "type-model.pickle".
    """
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)

    features, labels = [], []
    for dataset in ["webquestions_split_train", ]:
        for query in get_evaluated_queries(dataset, True, parameters):
            lemmas = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(lemmas)

            # Notable types of the gold answers (empty values dropped).
            answer_mids = []
            for answer in query.target_result:
                answer_mids.extend(
                    KBEntity.get_entityid_by_name(answer, keep_most_triples=True))
            positive_types = set(
                notable for notable in
                [KBEntity.get_notable_type(mid) for mid in answer_mids]
                if notable)

            # Notable types of everything the candidates predicted.
            predicted_types = set()
            for candidate in query.eval_candidates:
                predicted_mids = []
                for entity_name in candidate.prediction:
                    predicted_mids.extend(
                        KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True))
                predicted_types.update(
                    set(KBEntity.get_notable_type(mid) for mid in predicted_mids))
            negative_types = predicted_types.difference(positive_types)

            for notable_type in positive_types.union(negative_types):
                labels.append(1 if notable_type in positive_types else 0)
                features.append(
                    feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type"))

    with open("type_model_data.pickle", 'wb') as data_file:
        pickle.dump((features, labels), data_file)

    labels = LabelEncoder().fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # Keep the top 5 percent of features by chi2 score and shrink the
    # vectorizer's vocabulary to match the reduced matrix.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)

    type_scorer = SGDClassifier(loss='log',
                                class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as model_file:
        pickle.dump((vec, type_scorer), model_file)
Пример #4
0
def get_question_terms():
    """Print the set of distinct tokens over all train and test questions.

    Reads 'config_webentity.cfg', tokenizes every utterance of the
    "webquestionstrain" and "webquestionstest" datasets with ``tokenize``
    and prints the collected set.
    """
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    scorer_globals.init()

    question_tokens = set()
    for dataset_name in ("webquestionstrain", "webquestionstest"):
        for query in load_eval_queries(dataset_name):
            for token in tokenize(query.utterance):
                question_tokens.add(token)
    print(question_tokens)
Пример #5
0
def main():
    """Run an interactive console that translates questions into SPARQL.

    Command line:
        ranker_name  optional positional, defaults to "WQ_Ranker"
        --config     configuration file, defaults to "config.cfg"

    Each line read from stdin is translated and executed. The top ten
    candidates are logged, followed by the SPARQL query and result rows of the
    best one. The loop ends when stdin reaches end-of-file; blank lines are
    skipped. A failure while handling one question is logged with its
    traceback and the console keeps running.

    Exits with status 1 if the requested ranker is unknown.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Console based translation.")
    # nargs="?" is what makes the default take effect; without it argparse
    # always requires the positional and ignores the default.
    parser.add_argument("ranker_name",
                        nargs="?",
                        default="WQ_Ranker",
                        help="The ranker to use.")
    parser.add_argument("--config",
                        default="config.cfg",
                        help="The configuration file to use.")
    args = parser.parse_args()
    globals.read_configuration(args.config)
    scorer_globals.init()
    if args.ranker_name not in scorer_globals.scorers_dict:
        logger.error("%s is not a valid ranker" % args.ranker_name)
        logger.error("Valid rankers are: %s " % (" ".join(scorer_globals.scorers_dict.keys())))
        # Stop here instead of failing with a KeyError on the lookup below.
        sys.exit(1)
    logger.info("Using ranker %s" % args.ranker_name)
    ranker = scorer_globals.scorers_dict[args.ranker_name]
    # Named so that it does not shadow the `translator` module.
    query_translator = SparqlQueryTranslator.init_from_config()
    query_translator.set_scorer(ranker)
    while True:
        sys.stdout.write("enter question> ")
        sys.stdout.flush()
        line = sys.stdin.readline()
        if not line:
            # readline() returns "" only at end-of-file; looping on would
            # translate the empty string forever.
            break
        query = line.strip()
        if not query:
            continue
        try:
            logger.info("Translating query: %s" % query)
            results = query_translator.translate_and_execute_query(query)
            logger.info("Done translating query: %s" % query)
            logger.info("#candidates: %s" % len(results))
            logger.info("------------------- Candidate features ------------------")
            for rank, candidate_result in enumerate(results[:10]):
                candidate = candidate_result.query_candidate
                logger.info("RANK " + str(rank))
                logger.info(candidate.relations)
                logger.info(candidate.get_results_text())
                if candidate.features:
                    logger.info("Features: " + str(candidate.features))
            logger.info("---------------------------------------------------------")
            if results:
                best_candidate = results[0].query_candidate
                sparql_query = best_candidate.to_sparql_query()
                answers = []
                # Usually we get a name + mid.
                for row in results[0].query_result_rows:
                    if len(row) > 1:
                        answers.append("%s (%s)" % (row[1], row[0]))
                    else:
                        answers.append("%s" % row[0])
                logger.info("SPARQL query: %s" % sparql_query)
                logger.info("Result: %s " % " ".join(answers))
        except Exception:
            # Top-level boundary of the console: keep it alive, but log the
            # full traceback (e.message is frequently empty).
            logger.exception("Failed to translate query: %s" % query)
Пример #6
0
if __name__ == "__main__":
    # Script entry point: read the configuration named on the command line,
    # initialise the scorer globals and load the evaluated queries of each
    # external-entity dataset.
    # NOTE(review): only the start of the per-dataset loop is visible here;
    # `count`, `correct_relations` and `positions` are initialised but not yet
    # used in the visible part, and `--output` is parsed but not read.

    # print_sparql_queries()
    # exit()

    import argparse

    parser = argparse.ArgumentParser(description="Dump qa entity pairs.")
    parser.add_argument("--config",
                        default="config.cfg",
                        help="The configuration file to use.")
    parser.add_argument("--output",
                        help="The file to dump results to.")
    args = parser.parse_args()
    globals.read_configuration(args.config)
    scorer_globals.init()

    # Both translator restrictions are switched off before the candidates
    # are loaded.
    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    # Alternative dataset selections, kept for reference:
    # datasets = ["webquestions_split_train", "webquestions_split_dev",]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]
    datasets = ["webquestions_train_externalentities_all", "webquestions_test_externalentities_all", ]

    count = 0
    correct_relations = set()
    positions = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
Пример #7
0
def _normalized_pmi(pair_count, n_gram_count, type_count, total):
    """Return the normalized PMI of an (n-gram, type) pair from raw counts.

    Computes (log p(x, y) - log p(x) - log p(y)) / -log p(x, y) with every
    probability estimated as count / total. When the pair occurs in every
    counted query the denominator is zero; that case is reported as 1.0
    (complete co-occurrence) instead of raising ZeroDivisionError.
    """
    from math import log
    if pair_count == total:
        return 1.0
    return ((log(pair_count) - log(n_gram_count) - log(type_count) + log(total)) /
            (-log(pair_count) + log(total)))


def extract_npmi_ngram_type_pairs():
    """Compute NPMI scores between question n-grams and answer notable types.

    For every evaluated training query that has an oracle candidate, the
    n-gram features of that candidate are paired with the notable types of the
    gold answer entities (answers that start with a digit are skipped). Pairs
    seen more than four times get an NPMI score. The resulting
    ``{(n_gram, notable_type): score}`` dict is pickled to
    "type_model_npmi.pickle" and the 50 highest-scoring pairs are printed.
    """
    import operator

    globals.read_configuration('config.cfg')
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    n_gram_type_counts = {}
    type_counts = {}
    n_gram_counts = {}
    total = 0
    year_pattern = re.compile("[0-9]+")
    for dataset in datasets:
        for query in get_evaluated_queries(dataset, True, parameters):
            # Oracle positions are used 1-based below (indexed with
            # position - 1); -1 marks "no oracle". Requiring position >= 1
            # also keeps a 0 from silently selecting the last candidate.
            position = query.oracle_position
            if not 1 <= position <= len(query.eval_candidates):
                continue

            correct_candidate = query.eval_candidates[position - 1]
            logger.info(query.utterance)
            logger.info(correct_candidate.query_candidate)

            n_grams = set(get_n_grams_features(correct_candidate.query_candidate))

            answer_entities = [mid for answer in query.target_result
                               if year_pattern.match(answer) is None
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(
                notable_type for notable_type in
                [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]
                if notable_type)

            for notable_type in correct_notable_types:
                type_counts[notable_type] = type_counts.get(notable_type, 0) + 1

            for n_gram in n_grams:
                n_gram_counts[n_gram] = n_gram_counts.get(n_gram, 0) + 1
                for notable_type in correct_notable_types:
                    pair = (n_gram, notable_type)
                    n_gram_type_counts[pair] = n_gram_type_counts.get(pair, 0) + 1

            total += 1

    npmi = {}
    for pair, pair_count in n_gram_type_counts.items():
        # Rare pairs give unreliable estimates; keep those seen at least 5 times.
        if pair_count > 4:
            n_gram, notable_type = pair
            npmi[pair] = _normalized_pmi(pair_count,
                                         n_gram_counts[n_gram],
                                         type_counts[notable_type],
                                         total)

    with open("type_model_npmi.pickle", 'wb') as out:
        pickle.dump(npmi, out)

    ranked = sorted(npmi.items(), key=operator.itemgetter(1), reverse=True)
    print("\n".join(map(str, ranked[:50])))
Пример #8
0
def main():
    """Command-line entry point for training and evaluating a scorer.

    Global options: ``--no-cached`` (ignore cached data) and ``--config``.
    Sub-commands: ``train``, ``test``, ``traintest``, ``cv`` and ``print``;
    each records its own name in ``args.which``, which selects the action
    after the configuration has been read and the random seed fixed to 999.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Learn or test a scorer model.')
    parser.add_argument('--no-cached', action='store_true', default=False,
                        help="Don't use cached data if available.")
    parser.add_argument('--config', default='config.cfg',
                        help='The configuration file to use.')
    commands = parser.add_subparsers(help='command help')

    # train <scorer_name>
    train_cmd = commands.add_parser('train', help='Train a scorer.')
    train_cmd.add_argument('scorer_name', help='The scorer to train.')
    train_cmd.set_defaults(which='train')

    # test <scorer_name> <test_dataset> [--avg_runs N]
    test_cmd = commands.add_parser('test', help='Test a scorer.')
    test_cmd.add_argument('scorer_name', help='The scorer to test.')
    test_cmd.add_argument('test_dataset',
                          help='The dataset on which to test the scorer.')
    test_cmd.add_argument('--avg_runs', type=int, default=1,
                          help='Over how many runs to average.')
    test_cmd.set_defaults(which='test')

    # traintest <scorer_name> <test_dataset> [--avg_runs N]
    traintest_cmd = commands.add_parser('traintest',
                                        help='Train and test a scorer.')
    traintest_cmd.add_argument('scorer_name', help='The scorer to test.')
    traintest_cmd.add_argument('test_dataset',
                               help='The dataset on which to test the scorer.')
    traintest_cmd.add_argument('--avg_runs', type=int, default=1,
                               help='Over how many runs to average.')
    traintest_cmd.set_defaults(which='traintest')

    # cv <scorer_name> <dataset> [--n_folds N] [--avg_runs N]
    cv_cmd = commands.add_parser('cv', help='Cross-validate a scorer.')
    cv_cmd.add_argument('scorer_name', help='The scorer to test.')
    cv_cmd.add_argument('dataset',
                        help='The dataset on which to compute cv scores.')
    cv_cmd.add_argument('--n_folds', type=int, default=6,
                        help='The number of folds.')
    cv_cmd.add_argument('--avg_runs', type=int, default=1,
                        help='Over how many runs to average.')
    cv_cmd.set_defaults(which='cv')

    # print <scorer_name> <dataset>
    print_cmd = commands.add_parser('print', help='Print ranked results.')
    print_cmd.add_argument('scorer_name', help='The scorer to test.')
    print_cmd.add_argument('dataset',
                           help='The dataset on which to compute cv scores.')
    print_cmd.set_defaults(which='print')

    args = parser.parse_args()
    # Read global config.
    globals.read_configuration(args.config)
    scorer_globals.init()
    # Fix randomness.
    random.seed(999)
    use_cache = not args.no_cached

    command = args.which
    # 'traintest' is simply 'train' followed by 'test'.
    if command in ('train', 'traintest'):
        train(args.scorer_name, use_cache)
    if command in ('test', 'traintest'):
        test(args.scorer_name, args.test_dataset, use_cache,
             avg_runs=args.avg_runs)
    elif command == 'cv':
        cv(args.scorer_name, args.dataset, use_cache,
           n_folds=args.n_folds, avg_runs=args.avg_runs)
    elif command == 'print':
        eval_print(args.scorer_name, args.dataset, use_cache)