Example #1
    def rank_query_candidates(self, query_candidates, key=lambda x: x, utterance=""):
        """
        Returns candidates generated from the search results. This method doesn't look at the
        existing candidates, but rather creates new ones based on the search results.
        :param query_candidates: list of EvaluationCandidate objects; this answerer doesn't actually use them.
        :param key: unused by this answerer.
        :param utterance: the question utterance to answer.
        """
        if isinstance(utterance, unicode):
            utterance = utterance.encode('utf-8')
        if utterance in self._answers_cache:
            return self._answers_cache[utterance]

        # utterance has already been encoded to UTF-8 above, so pass it as-is.
        question_entities = set(e['name'] for e in find_entity_mentions(utterance, use_tagme=True))
        res = self._searcher.search(utterance, topn=self._topn)
        res = json.loads(res)
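        # Group the (phrase, score) pairs of the retrieved results by entity mid.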
        entities = dict()
        for r in res:
            for e in r['entities']:
                if e['mid'] not in entities:
                    entities[e['mid']] = []
                entities[e['mid']].append((r['phrase'], r['score']))

        # Rank entities by the summed score of the phrases that mention them.
        answers = sorted(entities.items(), key=lambda x: sum(score for _, score in x[1]), reverse=True)
        answers = [(KBEntity.get_entity_name(answer[0].replace("/", ".")), answer[1]) for answer in answers
                   if answer[0] not in question_entities]
        answers = [EvaluationCandidate(None, answer[1], [answer[0], ]) for answer in answers]
        self._answers_cache[utterance] = answers
        return answers
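
The ranking step above boils down to summing, per entity, the scores of all retrieved phrases that mention it, skipping entities found in the question, and sorting by the total. A minimal standalone sketch of that idea (the function name and the toy data are illustrative, not taken from the source):

from collections import defaultdict


def rank_entities_by_summed_score(search_results, question_entity_mids):
    # search_results: iterable of dicts with 'phrase', 'score' and 'entities'
    # (each entity a dict with a 'mid' key), mirroring the JSON consumed above.
    totals = defaultdict(float)
    for result in search_results:
        for entity in result['entities']:
            totals[entity['mid']] += result['score']
    # Skip entities the question itself mentions, then rank by summed score.
    ranked = [(mid, total) for mid, total in totals.items()
              if mid not in question_entity_mids]
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)


# Toy usage:
results = [
    {'phrase': 'obama president', 'score': 2.5, 'entities': [{'mid': 'm.02mjmr'}]},
    {'phrase': 'obama elected', 'score': 1.0,
     'entities': [{'mid': 'm.02mjmr'}, {'mid': 'm.09c7w0'}]},
]
print(rank_entities_by_summed_score(results, question_entity_mids=set()))
# -> [('m.02mjmr', 3.5), ('m.09c7w0', 1.0)]
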
Example #2
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)


if __name__ == "__main__":
    extract_npmi_ngram_type_pairs()
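    # Note: exit() ends the script here, so the dataset-dumping code below is not executed.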
    exit()

    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    data = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
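            # Tokenize the question, map each gold answer name to entity mids,
            # and look up the notable type of each answer entity.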
            tokens = [token.token for token in parser.parse(query.utterance).tokens]
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]
            data.append((tokens, notable_types))
            logger.info(tokens)
            logger.info([KBEntity.get_entity_name(notable_type) for notable_type in notable_types])

    with open("question_tokens_notable_types.pickle", 'wb') as out:
        pickle.dump(data, out)
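
For reference, a short sketch of reading the dumped dataset back; the file name matches the script above, the rest is illustrative:

import pickle

# Load the (tokens, notable_types) pairs written by the script above.
with open("question_tokens_notable_types.pickle", 'rb') as inp:
    data = pickle.load(inp)

# Each record pairs a question's tokens with the notable-type mids of its answer entities.
for tokens, notable_types in data[:3]:
    print("%s -> %s" % (" ".join(tokens), notable_types))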