Example #1
def main_entity_link_text():
    """Interactively link entities in text read from stdin, then aggregate
    entity counts over the cached search-result snippets for that question."""
    globals.read_configuration('config.cfg')
    entity_linker = globals.get_entity_linker()
    parser = globals.get_parser()
    from text2kb.utils import get_questions_serps
    question_search_results = get_questions_serps()
    globals.logger.setLevel("DEBUG")
    import operator
    import sys
    while True:
        print "Please enter some text: "
        text = sys.stdin.readline().strip().decode('utf-8')
        tokens = parser.parse(text).tokens
        print "Entities:", entity_linker.identify_entities_in_document(tokens, max_token_window=5)
        entities = {}
        lemma_counts = {}

        if text in question_search_results:
            # Aggregate lemma and entity counts over the top-10 cached search
            # results for the entered question.
            for doc in question_search_results[text][:10]:
                print doc
                title = doc.title
                snippet = doc.snippet
                snippet_tokens = parser.parse(title + "\n" + snippet).tokens
                for token in snippet_tokens:
                    if token.lemma not in lemma_counts:
                        lemma_counts[token.lemma] = 0
                    lemma_counts[token.lemma] += 1
                for entity in entity_linker.identify_entities_in_document(snippet_tokens):
                    if entity['mid'] not in entities:
                        entities[entity['mid']] = entity
                    else:
                        entities[entity['mid']]['count'] += entity['count']
        # Print the 50 most frequent entities found across the snippets.
        print sorted(entities.values(), key=operator.itemgetter('count'), reverse=True)[:50]
Example #2
def find_entity_mentions(text, use_tagme=False):
    """Return a list of entity-mention dictionaries for the given text,
    either from an external TagMe-style service or the local linker."""
    if use_tagme:
        import urllib, httplib, json
        params = urllib.urlencode({
            # Request parameters
            'text': text,
        })

        data = None
        try:
            host, port = globals.config.get("EntityLinker", "tagme-service-url").split(":")
            conn = httplib.HTTPConnection(host, int(port))
            conn.request("GET", "/get_entities?%s" % params)
            response = conn.getresponse()
            data = response.read()
            conn.close()
        except Exception as ex:
            logger.error(str(ex))
            return []
        if not data:
            return []
        # Normalize the service response to the same dictionary format the
        # local entity linker produces.
        return [{'mid': e['entity'],
                 'name': e['entity'],
                 'surface_score': float(e['coherence']),
                 'score': float(e['rho']),
                 'positions': (e['start'], e['end']),
                 'count': 1} for e in json.loads(data)]
    else:
        entity_linker = globals.get_entity_linker()
        parser = globals.get_parser()
        tokens = parser.parse(text).tokens
        return entity_linker.identify_entities_in_document(tokens, max_token_window=5, get_main_name=True)
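A minimal usage sketch (hypothetical, not part of the project): it assumes the configuration has been loaded first and relies only on the dictionary shape constructed above.

globals.read_configuration('config.cfg')
mentions = find_entity_mentions(u"who founded microsoft", use_tagme=False)
for m in mentions:
    # Each mention dictionary carries a mid, a name, linker scores, token
    # positions and a count, matching the format built above.
    print m['mid'], m['name'], m['score']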
Example #3
def train_type_model():
    # Assumes module-level imports of pickle and of scikit-learn's
    # LabelEncoder, DictVectorizer, SelectPercentile, chi2 and SGDClassifier,
    # alongside the project's translator, scorer_globals and KBEntity helpers.
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            # Map gold answers to entity mids, then collect their notable
            # types, dropping entities without one.
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            # Notable types seen among all answer candidates; those that never
            # occur with a correct answer become negative examples. Missing
            # (None) types are dropped, mirroring the correct-type set above.
            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set(filter(lambda x: x,
                                                      [KBEntity.get_notable_type(entity_mid)
                                                       for entity_mid in entities])))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            # Label each candidate notable type: 1 if it belongs to a correct
            # answer entity, 0 otherwise.
            for notable_type in correct_notable_types.union(incorrect_notable_types):
                if notable_type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # Keep the top 5% of features by chi-squared score and restrict the
    # vectorizer to that subset so it can be reused at prediction time.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    # class_weight='auto' and n_iter are the older scikit-learn spellings of
    # 'balanced' and max_iter, matching the version this project targets.
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
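A hypothetical scoring sketch for the saved model: the (vec, type_scorer) pair is what train_type_model() pickles above, FeatureExtractor and get_grams_feats are the project helpers used during training, and score_notable_type is an invented name.

import pickle

with open("type-model.pickle", 'rb') as inp:
    vec, type_scorer = pickle.load(inp)
feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)

def score_notable_type(question_tokens, notable_type):
    # Rebuild the training-time n-gram/type features, vectorize them with
    # the restricted DictVectorizer, and return P(type is correct).
    n_grams = get_grams_feats(question_tokens)
    feats = feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type")
    X = vec.transform([feats])
    return type_scorer.predict_proba(X)[0][1]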
Example #4
File: translator.py  Project: DenXX/aqqu
    def get_from_config(cls, config_params):
        """Construct a SparqlQueryTranslator from a parsed configuration."""
        sparql_backend = globals.get_sparql_backend(config_params)
        query_extender = QueryCandidateExtender.init_from_config()
        entity_linker = globals.get_entity_linker()
        parser = globals.get_parser()
        scorer_obj = ranker.SimpleScoreRanker('DefaultScorer')
        ngram_notable_types_npmi_path = config_params.get('QueryCandidateExtender', 'ngram-notable-types-npmi', '')
        notable_types_npmi_threshold = float(config_params.get('QueryCandidateExtender', 'notable-types-npmi-threshold'))
        ngram_notable_types_npmi = None
        # Optionally load a precomputed n-gram/notable-type NPMI model.
        if ngram_notable_types_npmi_path and os.path.exists(ngram_notable_types_npmi_path):
            import cPickle as pickle
            try:
                with open(ngram_notable_types_npmi_path, 'rb') as inp:
                    logger.info("Loading types model from disk...")
                    ngram_notable_types_npmi = pickle.load(inp)
            except IOError as exc:
                logger.error("Error reading types model: %s" % str(exc))
                ngram_notable_types_npmi = None

        return SparqlQueryTranslator(sparql_backend, query_extender,
                                     entity_linker, parser, scorer_obj,
                                     ngram_notable_types_npmi,
                                     notable_types_npmi_threshold)
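A hypothetical wiring example: the section and option names come from the get() calls above, while the file paths and values in the comment are invented.

import ConfigParser

config_params = ConfigParser.SafeConfigParser()
config_params.read('config.cfg')
# Expected options include, among others:
#   [QueryCandidateExtender]
#   ngram-notable-types-npmi = data/ngram_type_npmi.pickle
#   notable-types-npmi-threshold = 0.5
query_translator = SparqlQueryTranslator.get_from_config(config_params)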
Example #5
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)


if __name__ == "__main__":
    extract_npmi_ngram_type_pairs()
    exit()
    # NOTE: the exit() above makes everything below unreachable; it is kept
    # as an alternative entry point that builds (tokens, notable_types)
    # training pairs.

    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    data = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            tokens = [token.token for token in parser.parse(query.utterance).tokens]
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]
            data.append((tokens, notable_types))
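A hypothetical continuation (the listing ends here): the collected pairs would plausibly be pickled the same way the other scripts above save their data; the filename is invented and pickle is assumed to be imported at module level.

    with open("notable_type_pairs.pickle", 'wb') as out:
        pickle.dump(data, out)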