Example #1
def train_type_model():
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid) for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            # "notable_type" avoids shadowing the builtin type().
            for notable_type in correct_notable_types.union(incorrect_notable_types):
                labels.append(1 if notable_type in correct_notable_types else 0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    # NB: older scikit-learn spellings; current releases use loss='log_loss',
    # class_weight='balanced' and max_iter instead of n_iter.
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
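
As a reference for the training step above, here is a minimal, self-contained sketch of the same vectorize-select-train pipeline on toy data. The feature dicts and the 50% percentile are invented for illustration, and the classifier arguments use the newer scikit-learn spellings (log_loss, balanced, max_iter):

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.linear_model import SGDClassifier

# Toy n-gram/type features in the same dict shape the snippet above produces.
features = [{"ngram=who": 1.0, "type=person": 1.0},
            {"ngram=where": 1.0, "type=location": 1.0},
            {"ngram=who": 1.0, "type=location": 1.0},
            {"ngram=where": 1.0, "type=person": 1.0}]
labels = [1, 0, 0, 1]

vec = DictVectorizer(sparse=True)
X = vec.fit_transform(features)
selector = SelectPercentile(chi2, percentile=50).fit(X, labels)
vec.restrict(selector.get_support())  # keep the vectorizer in sync with the kept columns
X = selector.transform(X)
type_scorer = SGDClassifier(loss="log_loss", class_weight="balanced",
                            max_iter=1000, random_state=999)
type_scorer.fit(X, labels)
print(type_scorer.predict(vec.transform([{"ngram=who": 1.0, "type=person": 1.0}])))

The restrict() call is what makes the pickled vectorizer usable on its own at prediction time: after it, vec.transform already emits only the selected columns, so the feature selector doesn't need to be shipped with the model.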
Example #2
File: utils.py Project: DenXX/aqqu
def get_entity_idf(entity):
    """
    Get the entity IDF based on Google's annotation of the ClueWeb corpus.
    :param entity: The entity to look up.
    :return: IDF of the entity based on the ClueWeb collection.
    """
    global _entity_counts
    if _entity_counts is None:
        _entity_counts = dict()
        with gzip.open(globals.config.get('WebSearchFeatures', 'entity-webcounts-file'), 'r') as input_file:
            logger.info("Reading entity ClueWeb counts...")
            for line in input_file:
                # Use a distinct name here: reusing "entity" would clobber the
                # function argument before the lookup below.
                entity_id, count = line.strip().split('\t')
                _entity_counts[entity_id] = int(count)
            logger.info("Reading entity ClueWeb counts done!")

    if _entity_counts:
        mids = ["/" + mid.replace(".", "/") for mid in KBEntity.get_entityid_by_name(entity, keep_most_triples=True)]
        if mids:
            idf = min(log(max(1.0, CLUEWEB_DOCUMENTS_COUNT / (_entity_counts[mid]
                                                              if mid in _entity_counts and _entity_counts[mid] > 0
                                                              else 1.0)))
                      for mid in mids)
            # logger.info("IDF entity %s %.3f" % (entity, idf))
            return idf
        return 1.0
    else:
        return 0.0
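
The IDF itself is the standard log(N / df), floored at zero; a minimal sketch of just that step (the document count below is a placeholder, not the real ClueWeb constant):

from math import log

CLUEWEB_DOCUMENTS_COUNT = 500000000  # placeholder value for illustration

def entity_idf(entity_doc_count):
    # log(N / df), floored at 0.0 so very frequent entities never go negative.
    return log(max(1.0, CLUEWEB_DOCUMENTS_COUNT / max(float(entity_doc_count), 1.0)))

print(entity_idf(1000))                      # rare entity -> large IDF
print(entity_idf(CLUEWEB_DOCUMENTS_COUNT))   # ubiquitous entity -> 0.0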
Example #3
    def rank_query_candidates(self, query_candidates, key=lambda x: x, utterance=""):
        """
        Returns the candidate generated from search results. This methods doesn't look into the
         existing candidates, but rather creates a new one based on search results.
        :param query_candidates: List of EvaluationCandidate objects. This answerer don't actually use them.
        """
        if isinstance(utterance, unicode):
            utterance = utterance.encode('utf-8')
        if utterance in self._answers_cache:
            return self._answers_cache[utterance]

        # utterance was already UTF-8 encoded above; encoding it again would
        # raise on non-ASCII input under Python 2.
        question_entities = set(e['name'] for e in find_entity_mentions(utterance, use_tagme=True))
        res = self._searcher.search(utterance, topn=self._topn)
        res = json.loads(res)
        entities = dict()
        for r in res:
            for e in r['entities']:
                if e['mid'] not in entities:
                    entities[e['mid']] = []
                entities[e['mid']].append((r['phrase'], r['score']))

        answers = sorted(entities.items(), key=lambda x: sum(score for _, score in x[1]), reverse=True)
        answers = [(KBEntity.get_entity_name(answer[0].replace("/", ".")), answer[1]) for answer in answers
                   if answer[0] not in question_entities]
        answers = [EvaluationCandidate(None, answer[1], [answer[0], ]) for answer in answers]
        self._answers_cache[utterance] = answers
        return answers
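
The ranking itself is just "sum the retrieval scores of every snippet an entity appears in"; a minimal sketch of that aggregation with made-up search hits:

hits = [{"phrase": "barack obama", "score": 2.5,
         "entities": [{"mid": "/m/02mjmr"}]},
        {"phrase": "obama was born in honolulu", "score": 1.0,
         "entities": [{"mid": "/m/02mjmr"}, {"mid": "/m/03b0b"}]}]

entities = {}
for hit in hits:
    for e in hit["entities"]:
        # Collect (phrase, score) evidence per entity mid.
        entities.setdefault(e["mid"], []).append((hit["phrase"], hit["score"]))

# Rank mids by the total score of the phrases that mention them.
ranked = sorted(entities.items(), key=lambda kv: sum(s for _, s in kv[1]), reverse=True)
print(ranked[0][0])  # /m/02mjmr (2.5 + 1.0 = 3.5)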
Example #4
def filter_entity_descriptions(mids):
    import gzip

    descriptions_file = globals.config.get("EntityLinker", "entity-descriptions-file")
    with gzip.open(descriptions_file, "r") as input_file, gzip.open(descriptions_file + "_small", "w") as out:
        for index, line in enumerate(input_file):
            triple = KBEntity.parse_freebase_string_triple(line)
            if triple[0] in mids:
                print >> out, line.strip()
Example #5
    def get_answer_notable_types(self):
        """
        Returns a list of notable types for the result entities, skipping
        answers that look like years.
        :return:
        """
        if self.answer_notable_types is None:
            self.answer_notable_types = []
            for mid, answer in zip(self.get_results_mids(), self.get_results_text()):
                if _year_pattern.match(answer) is not None:
                    continue
                self.answer_notable_types.append(KBEntity.get_notable_type(mid))
        return self.answer_notable_types
Example #6
def filter_entity_names(names):
    import gzip

    mids = set()
    entities_file = globals.config.get("EntityLinker", "entity-names-file")
    with gzip.open(entities_file, "r") as input_file, gzip.open(entities_file + "_small", "w") as out:
        for index, line in enumerate(input_file):
            triple = KBEntity.parse_freebase_string_triple(line)
            name = triple[2].lower()
            if name in names:
                mids.add(triple[0])
                print >> out, line.strip()
    return mids
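
Examples 4 and 6 use the Python 2 print >> out idiom; for reference, a Python 3 sketch of the same gzip filter pattern (the tab-split stands in for KBEntity.parse_freebase_string_triple, whose exact output format isn't shown here):

import gzip

def filter_gzipped_triples(path, keep_subject):
    # Copy only the lines whose subject passes the predicate into a
    # smaller gzipped file next to the original.
    with gzip.open(path, "rt", encoding="utf-8") as src, \
         gzip.open(path + "_small", "wt", encoding="utf-8") as dst:
        for line in src:
            subject = line.split("\t", 1)[0]
            if keep_subject(subject):
                dst.write(line)

# Usage (assumes entity-names.gz exists):
filter_gzipped_triples("entity-names.gz", {"m.0abc1"}.__contains__)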
Example #7
    def filter_answers_by_type(self, type_filter, score):
        assert self.type_filter is None
        text_results = self.get_results_text()
        mid_results = self.get_results_mids()
        assert len(text_results) == len(mid_results)

        new_results_text = []
        new_results_mids = []
        for mid, answer in zip(mid_results, text_results):
            if KBEntity.get_notable_type(mid) == type_filter:
                new_results_mids.append(mid)
                new_results_text.append(answer)
        self.query_results = new_results_text
        self.query_results_mids = new_results_mids
        self.type_filter = type_filter
        # score is presumably a (max_npmi, avg_npmi) pair; the original assigned
        # score[0] to both fields, which looks like a copy-paste slip.
        self.type_filter_max_npmi = score[0]
        self.type_filter_avg_npmi = score[1]
        self.cached_result_count = len(self.query_results)
        return self.get_results_text()
Example #8
    def _bytes_to_entity(line: bytes) -> 'KBEntity':
        """
        Instantiate entity from string representation.

        >>> e = EntityIndex._bytes_to_entity(b'm.0abc1\\tfoo name\\t7\\tfooly\\tfoo\\n')
        >>> e.name
        'foo name'
        >>> e.id
        'm.0abc1'
        >>> e.score
        7
        >>> e.aliases
        ['fooly', 'foo']
        """
        cols = line.strip().decode('utf-8').split('\t')
        mid = cols[0]
        name = cols[1]
        score = int(cols[2])
        aliases = cols[3:]
        return KBEntity(name, mid, score, aliases)
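
Since the docstring doubles as a doctest, it can be verified directly; assuming the class lives in a module named entity_index (a hypothetical name for this snippet):

import doctest
import entity_index  # hypothetical module holding EntityIndex

doctest.testmod(entity_index, verbose=True)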
Example #9
            #     else:
            #         for relation in query.eval_candidates[query.oracle_position - 1].query_candidate.relations:
            #             if relation.name not in correct_relations:
            #                 print query.utterance
            #                 print relation.name
            #                 print query.eval_candidates[query.oracle_position - 1].query_candidate
            #                 print "-----"

            # This loop prints questions that have no good candidate
            # if query.oracle_position == -1:
            #     entities = set()
            #     for candidate in query.eval_candidates:
            #         for entity in candidate.query_candidate.matched_entities:
            #             if isinstance(entity.entity.entity, KBEntity):
            #                 entities.add((entity.entity.name, entity.entity.entity.id))
            #     print ">>>", query.utterance
            #     print entities

            for candidate in query.eval_candidates:
                answer_entities = set(mid for entity_name in candidate.prediction
                                      for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True))
                question_entities = set(mid for entity in candidate.query_candidate.matched_entities
                                        for mid in KBEntity.get_entityid_by_name(entity.entity.name,
                                                                                 keep_most_triples=True))
                for question_entity in question_entities:
                    for answer_entity in answer_entities:
                        print question_entity + "\t" + answer_entity

            if index % 100 == 0:
                print >> stderr, "Processed %d queries" % index
Example #10
def generate_text_based_features(candidate):
    # Get candidate answers
    answers = map(unicode.lower, candidate.get_results_text())
    # Skip candidates with no answers.
    if len(answers) == 0:
        return dict()
    # Get answers descriptions.
    answers_descriptions = ['\n'.join(KBEntity.get_entity_descriptions_by_name(answer, keep_most_triples_only=True))
                            for answer in answers]

    # Get question text.
    question_text = candidate.query.original_query
    question_tokens2pos = dict((token, [1, ]) for token in tokenize(question_text))
    question_token_tfidf = SparseVector.from_2pos(question_tokens2pos,
                                                  element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get question entities
    question_entities2pos = dict((entity.entity.name.lower(), [1, ]) for entity in candidate.matched_entities)
    question_entitytoken2pos = dict((token, [1, ])
                                    for entity in candidate.matched_entities
                                    for token in tokenize(entity.entity.name))
    question_entity_tfidf = SparseVector.from_2pos(question_entitytoken2pos,
                                                   element_calc_func=SparseVector.compute_tfidf_token_elements)

    # Get search results and check that they aren't empty
    questions_search_results = get_questions_serps()

    documents_vectors = []
    snippets_vectors = []
    fragment_vectors = []
    combined_documents_vector = dict()
    combined_document_snippets_vector = dict()

    representations = ["entity_tfidf",
                       "token_tfidf",
                       # "entity",
                       # "token",
                      ]
    for r in representations:
        combined_documents_vector[r] = dict()
        combined_document_snippets_vector[r] = dict()

    if question_text not in questions_search_results:
        logger.warning("No search results found for the question %s" % question_text)
    else:
        documents_vectors, snippets_vectors, fragment_vectors, combined_documents_vector,\
            combined_document_snippets_vector = generate_document_vectors(question_text,
                                                                          question_tokens2pos,
                                                                          questions_search_results)

    answer_entity2pos = dict((answer_entity, [1, ]) for answer_entity in answers)
    answer_token2pos = dict((answer_token, [1, ]) for answer_entity in answers
                            for answer_token in tokenize(answer_entity))
    answers_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector.from_2pos(answer_entity2pos,
                                               element_calc_func=SparseVector.compute_tfidf_entity_elements),
        # "entity": SparseVector.from_2pos(answer_entity2pos),
        # "token": SparseVector.from_2pos(answer_token2pos),
    }

    answer_descriptions_token2pos = dict((token, [1, ]) for description in answers_descriptions
                                         for token in tokenize(description))
    answer_description_vectors = {
        "token_tfidf": SparseVector.from_2pos(answer_descriptions_token2pos,
                                              element_calc_func=SparseVector.compute_tfidf_token_elements),
        "entity_tfidf": SparseVector(dict()),
        # Keeping only tf-idf similarities. This seems to be enough.
        # "token": SparseVector.from_2pos(answer_descriptions_token2pos),
        # "entity": SparseVector(dict()),
    }

    similarity_functions = [
        ("cosine", Similarity.cosine_similarity),
        # ("itersection", Similarity.intersection_similarity),
        # ("normalized_intersection", Similarity.normalized_intersection_similarity),
        # ("bm25", Similarity.bm25_similarity),
    ]
    features = dict()

    for similarity_name, similarity in similarity_functions:
        # Computing document-answer similarities for each representation.
        document_answer_similarities = {}
        for representation in representations:
            if representation not in document_answer_similarities:
                document_answer_similarities[representation] = []
            for doc_vector in documents_vectors:
                document_answer_similarities[representation].append(similarity(representation,
                                                                               doc_vector[representation],
                                                                               answers_vectors[representation]))
        for representation in representations:
            features.update({
                "text_features:avg_document_answer_%s_%s" % (representation, similarity_name):
                    avg(document_answer_similarities[representation]),
                "text_features:max_document_answer_%s_%s" % (representation, similarity_name):
                    max(document_answer_similarities[representation]) if document_answer_similarities[representation]
                    else 0.0,
            })

        # logger.info("Snippet-answer similarity...")
        # Computing snippet-answer similarities for each representation.
        snippet_answer_similarities = {}
        for representation in representations:
            if representation not in snippet_answer_similarities:
                snippet_answer_similarities[representation] = []

            for snippet_vector in snippets_vectors:
                snippet_answer_similarities[representation].append(similarity(representation,
                                                                              snippet_vector[representation],
                                                                              answers_vectors[representation]))

        for representation in representations:
            features.update({
                "text_features:avg_snippet_answer_%s_%s" % (representation, similarity_name):
                    avg(snippet_answer_similarities[representation]),
                "text_features:max_snippet_answer_%s_%s" % (representation, similarity_name):
                    max(snippet_answer_similarities[representation]) if snippet_answer_similarities[representation] else 0.0,
            })

        # logger.info("Fragment-answer similarity...")
        # Best BM25 fragment-answer similarities.
        # These weren't very effective, so the features are disabled below. There is a chance they contain a bug.

        # fragment_answer_similarities = {}
        # for fragment_vector in fragment_vectors:
        #     for representation in representations:
        #         if representation not in fragment_answer_similarities:
        #             fragment_answer_similarities[representation] = []
        #         fragment_answer_similarities[representation].append(similarity(representation,
        #                                                                        fragment_vector[representation],
        #                                                                        answers_vectors[representation]))
        #
        # for representation in representations:
        #     features.update({
        #         "text_features:avg_fragment_answer_%s_%s" % (representation, similarity_name):
        #             avg(fragment_answer_similarities[representation]),
        #         "text_features:max_fragment_answer_%s_%s" % (representation, similarity_name):
        #             max(fragment_answer_similarities[representation]) if fragment_answer_similarities[representation] else 0.0,
        #     })

        # logger.info("Combined document-answer similarity...")
        # Combined documents answer similarity
        for representation in representations:
            combineddoc_answer_similarity = similarity(representation,
                                                       combined_documents_vector[representation],
                                                       answers_vectors[representation])
            features.update({
                "text_features:combdocument_answer_%s_%s" % (representation, similarity_name):
                    combineddoc_answer_similarity,
            })

        # logger.info("Combined snippet-answer similarity...")
        for representation in representations:
            combineddocsnippet_answer_similarity = similarity(representation,
                                                              combined_document_snippets_vector[representation],
                                                              answers_vectors[representation])
            features.update({
                "text_features:combdocument_snippet_answer_%s_%s" % (representation, similarity_name):
                    combineddocsnippet_answer_similarity,
            })

        # logger.info("Description-question similarity...")
        # These features aren't very effective either; they are the next candidates for removal.
        description_question_entity_similarity = similarity("token_tfidf", question_entity_tfidf,
                                                            answer_description_vectors["token_tfidf"])
        description_question_token_similarity = similarity("token_tfidf", question_token_tfidf,
                                                           answer_description_vectors["token_tfidf"])
        features.update({
            "text_features:description_question_entitytoken_%s" % similarity_name:
                description_question_entity_similarity,
            "text_features:description_question_token_%s" % similarity_name: description_question_token_similarity,
        })

    # Description - question embedding similarity.
    description_question_token_embedding_avg_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    # NB: identical to the avg_idf call above, so the "n_embeddings" feature
    # below duplicates it; the name suggests an n_similarity-style measure was intended.
    description_question_token_embedding_n_similarity = Similarity.embedding_avg_idf_similarity(
        "token_tfidf", question_token_tfidf, answer_description_vectors["token_tfidf"])
    features.update({
        "text_features:description_question_token_avg_idf_embeddings":
            description_question_token_embedding_avg_similarity,
        "text_features:description_question_token_n_embeddings":
            description_question_token_embedding_n_similarity,
    })

    # Remove features with 0 score.
    features = dict((feature, value) for feature, value in features.iteritems() if value != 0.0)
    return features
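
The workhorse of these features is cosine similarity between sparse bag-of-features vectors. The SparseVector and Similarity internals aren't shown in the snippet, so here is a dict-based sketch of what the cosine step presumably computes:

from math import sqrt

def cosine_similarity(a, b):
    # a, b: sparse vectors as {feature: weight} dicts.
    dot = sum(w * b[f] for f, w in a.items() if f in b)
    norm_a = sqrt(sum(w * w for w in a.values()))
    norm_b = sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

doc = {"obama": 0.7, "born": 0.5, "hawaii": 0.9}
answer = {"hawaii": 1.0, "honolulu": 0.8}
print(cosine_similarity(doc, answer))  # ~0.56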
Example #11
def extract_npmi_ngram_type_pairs():
    globals.read_configuration('config.cfg')
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    n_gram_type_counts = dict()
    type_counts = dict()
    n_gram_counts = dict()
    total = 0
    year_pattern = re.compile("[0-9]+")
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            if query.oracle_position != -1 and query.oracle_position <= len(query.eval_candidates):
                correct_candidate = query.eval_candidates[query.oracle_position - 1]
                logger.info(query.utterance)
                logger.info(correct_candidate.query_candidate)

                n_grams = set(get_n_grams_features(correct_candidate.query_candidate))

                answer_entities = [mid for answer in query.target_result
                                   if year_pattern.match(answer) is None
                                   for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
                correct_notable_types = set(filter(lambda x: x,
                                                   [KBEntity.get_notable_type(entity_mid)
                                                    for entity_mid in answer_entities]))

                for notable_type in correct_notable_types:
                    if notable_type not in type_counts:
                        type_counts[notable_type] = 0
                    type_counts[notable_type] += 1

                for n_gram in n_grams:
                    if n_gram not in n_gram_counts:
                        n_gram_counts[n_gram] = 0
                    n_gram_counts[n_gram] += 1

                    for notable_type in correct_notable_types:
                        pair = (n_gram, notable_type)
                        if pair not in n_gram_type_counts:
                            n_gram_type_counts[pair] = 0
                        n_gram_type_counts[pair] += 1

                total += 1

    npmi = dict()
    from math import log
    for n_gram_type_pair, n_gram_type_count in n_gram_type_counts.iteritems():
        if n_gram_type_count > 4:
            n_gram, notable_type = n_gram_type_pair
            npmi[n_gram_type_pair] = ((log(n_gram_type_count) - log(n_gram_counts[n_gram]) -
                                       log(type_counts[notable_type]) + log(total)) /
                                      (-log(n_gram_type_count) + log(total)))

    with open("type_model_npmi.pickle", 'wb') as out:
        pickle.dump(npmi, out)

    import operator
    npmi = sorted(npmi.items(), key=operator.itemgetter(1), reverse=True)
    print "\n".join(map(str, npmi[:50]))
Example #12
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)


if __name__ == "__main__":
    extract_npmi_ngram_type_pairs()
    exit()

    # Note: everything below is dead code; the exit() above returns first.

    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    data = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            tokens = [token.token for token in parser.parse(query.utterance).tokens]
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]
            data.append((tokens, notable_types))
            logger.info(tokens)
            logger.info([KBEntity.get_entity_name(notable_type) for notable_type in notable_types])

    with open("question_tokens_notable_types.pickle", 'wb') as out:
        pickle.dump(data, out)