Example #1
def gen_description_data(fn_wq_list, fn_out):
    globals.read_configuration("config.cfg")
    entity_linker = EntityLinker.init_from_config()
    parser = CoreNLPParser.init_from_config()

    with open(fn_out, 'w', encoding='utf8') as fout:
        for fn_wq in fn_wq_list:
            with open(fn_wq, encoding='utf8') as fin:
                wq = json.load(fin)
            for data in wq:
                tokens = parser.parse(data['utterance'])
                entities = entity_linker.identify_entities_in_tokens(tokens)
                # Collect linked Freebase MIDs as negative candidates.
                neg_entities = set()
                for e in entities:
                    mid = e.get_mid()
                    if mid == '':
                        continue
                    if mid.startswith('m.'):
                        neg_entities.add(mid)
                    else:
                        print(mid, e.name, data['utterance'])
                # The gold entity must never appear among the negatives.
                neg_entities.discard(data['mid1'])
                instance = {
                    'q': data['utterance'],
                    'pos': data['mid1'],
                    'neg': list(neg_entities)
                }
                print(json.dumps(instance, ensure_ascii=False), file=fout)
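Each line written to fn_out is one standalone JSON object (JSON Lines). An illustrative record, with a made-up question and made-up MIDs:

{"q": "who wrote the odyssey", "pos": "m.0aaaaa", "neg": ["m.0bbbbb", "m.0ccccc"]}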
Example #2
def get_parser():
    global _parser
    if _parser is not None:
        return _parser
    from corenlp_parser.parser import CoreNLPParser
    _parser = CoreNLPParser.init_from_config()
    return _parser
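This is a lazy module-level singleton: the parser is built on first use and reused afterwards, which avoids re-initializing CoreNLP on every call. A minimal usage sketch (noun_lemmas is a hypothetical caller; the module must define _parser = None at import time for the global lookup to work):

def noun_lemmas(text):
    # Reuses the shared parser instead of re-initializing it per call.
    return [t.lemma for t in get_parser().parse(text)
            if t.pos.startswith('NN')]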
Example #3
def link_entity_in_simple_question(fn_in, fn_out):
    globals.read_configuration("config.cfg")
    entity_linker = EntityLinker.init_from_config()
    parser = CoreNLPParser.init_from_config()
    with open(fn_out, 'w', encoding='utf8') as fout:
        with open(fn_in, encoding='utf8') as fin:
            for line in fin:
                # Expect five tab-separated fields; skip malformed lines.
                ll = line.strip().split('\t')
                if len(ll) != 5:
                    continue
                tokens = parser.parse(ll[4])
                entities = entity_linker.identify_entities_in_tokens(tokens)
                neg_entities = set()
                for e in entities:
                    mid = e.get_mid()
                    if mid == '':
                        continue
                    if mid.startswith('m.'):
                        neg_entities.add(mid)
                    else:
                        print(mid, e.name, ll[4])
                # Drop the gold entity (first column) from the negatives.
                neg_entities.discard(ll[0])
                line = json.dumps(
                    {
                        'q': ll[4],
                        'pos': ll[0],
                        'neg': list(neg_entities)
                    },
                    ensure_ascii=False)
                print(line, file=fout)
Example #4
def get_number_of_external_entities():
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    parser = CoreNLPParser.init_from_config()
    entity_linker = WebSearchResultsExtenderEntityLinker.init_from_config()
    entity_linker.topn_entities = 100000
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    datasets = ["webquestions_split_train", "webquestions_split_dev"]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities"]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3"]

    external_entities_count = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            entities = entity_linker.identify_entities_in_tokens(
                parser.parse(query.utterance).tokens,
                text=query.utterance, find_dates=False)
            print("-------------------------")
            print(query.utterance)
            print("\n".join(map(str, sorted(
                entities,
                key=lambda entity: entity.external_entity_count,
                reverse=True))))

            # Count how many linked entities came from web search results.
            external_entities_count.append(
                sum(1 for entity in entities if entity.external_entity))
            if index % 100 == 0:
                print("%s queries processed" % index, file=sys.stderr)
    print("=========================================")
    print(external_entities_count)
    print(sum(external_entities_count))
    print(len(external_entities_count))
Example #5
def init_from_config():
    config_params = globals.config
    sparql_backend = globals.get_sparql_backend(config_params)
    query_extender = QueryCandidateExtender.init_from_config()
    entity_linker = EntityLinker.init_from_config()
    parser = CoreNLPParser.init_from_config()
    scorer_obj = ranker.SimpleScoreRanker('DefaultScorer')
    return QueryTranslator(sparql_backend, query_extender, entity_linker,
                           parser, scorer_obj)
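This factory wires up the full translation pipeline (SPARQL backend, query candidate extender, entity linker, parser, ranker) from the loaded configuration in one place; presumably it is a static method on QueryTranslator itself. Under that assumption, a caller would do:

globals.read_configuration("config.cfg")
translator = QueryTranslator.init_from_config()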
Example #6
def get_lemma_fn():
    parser = CoreNLPParser.init_from_config()

    def fn(text):
        text = text.lower()
        tokens = parser.parse(text)

        # Lemmatize nouns only; keep all other tokens verbatim.
        lemma = [
            t.lemma if t.pos.startswith('NN') else t.token for t in tokens
        ]
        # Concatenate (no separator) and pair with a fixed 0.9 confidence.
        return ''.join(lemma), 0.9

    return fn
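The returned closure keeps one parser alive across calls and yields a normalized string together with a fixed confidence of 0.9. A hypothetical call site:

normalize = get_lemma_fn()
key, confidence = normalize("the presidents of the united states")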
Example #7
def main_parse():
    document_content_file = globals.config.get('WebSearchFeatures',
                                               'documents-content-file')
    parser = CoreNLPParser.init_from_config()
    question_serps = get_questions_serps()
    print(datetime.now())
    # 'xb': fail if the file already exists; pickle needs a binary stream.
    with open(document_content_file, 'xb') as out_file:
        for index, serp in enumerate(question_serps.values()):
            # Parse the content of the top-10 documents for each query.
            for doc in serp[:10]:
                content = doc.content()
                if len(content) > 0:
                    document = (doc.url, parser.parse(content))
                    pickle.dump(document, out_file)
            print("Query #", index, datetime.now())
Example #8
def link_entity_one(params):
    fn, start, end = params
    globals.read_configuration("config.cfg")
    parser = CoreNLPParser.init_from_config()
    ret = []

    with open(fn, encoding='utf8') as fin:
        # Skip lines before the assigned [start, end) slice.
        for _ in range(start):
            fin.readline()
        for _ in range(start, end):
            line = fin.readline()
            ll = line.strip().split('\t')
            if len(ll) != 5:
                continue
            tokens = parser.parse(ll[4])
            ret.append((ll[4], ll[0], tokens))

    return ret
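The (fn, start, end) parameter tuple suggests this function is meant to be mapped over line ranges by a worker pool, with each worker loading the configuration and building its own parser. A hypothetical driver (file name and chunk size made up):

from multiprocessing import Pool

def link_all(fn, total_lines, chunk=1000):
    ranges = [(fn, i, min(i + chunk, total_lines))
              for i in range(0, total_lines, chunk)]
    with Pool(4) as pool:
        # Each worker runs link_entity_one on its own slice of the file.
        return pool.map(link_entity_one, ranges)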
Example #9
def __init__(self):
    self.parser = CoreNLPParser.init_from_config()
    self.entity_linker = EntityLinker.init_from_config()
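The enclosing class is not shown. A minimal sketch of how such a constructor might be used, with a hypothetical class name Annotator and a hypothetical annotate method composing the two components:

class Annotator:
    def __init__(self):
        self.parser = CoreNLPParser.init_from_config()
        self.entity_linker = EntityLinker.init_from_config()

    def annotate(self, question):
        # Parse first, then link entities over the tokens, mirroring
        # the pattern in the examples above.
        tokens = self.parser.parse(question)
        return self.entity_linker.identify_entities_in_tokens(tokens)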