# Standard-library imports used by the helpers below. The project-internal
# modules referenced here (globals, EntityLinker, CoreNLPParser,
# WebSearchResultsExtenderEntityLinker, QueryCandidateExtender, QueryTranslator,
# ranker, translator, load_eval_queries, get_questions_serps) are assumed to be
# importable from the surrounding package.
import json
import pickle
import sys
from datetime import datetime


def gen_description_data(fn_wq_list, fn_out):
    """Build entity-linking training data from WebQuestions-style JSON files.

    For each question, the gold entity (data['mid1']) is the positive example
    and every other linked Freebase mid is collected as a negative.
    """
    globals.read_configuration("config.cfg")
    entity_linker = EntityLinker.init_from_config()
    parser = CoreNLPParser.init_from_config()
    fout = open(fn_out, 'w')
    for fn_wq in fn_wq_list:
        wq = json.load(open(fn_wq), encoding='utf8')
        for data in wq:
            tokens = parser.parse(data['utterance'])
            entities = entity_linker.identify_entities_in_tokens(tokens)
            neg_entities = set()
            for e in entities:
                mid = e.get_mid()
                if mid == '':
                    continue
                if mid.startswith('m.'):
                    neg_entities.add(mid)
                else:
                    # Log linked entities whose identifier is not a Freebase mid.
                    print mid, e.name, data['utterance']
            # The gold entity must not appear among the negatives.
            neg_entities -= set([data['mid1']])
            instance = {
                'q': data['utterance'],
                'pos': data['mid1'],
                'neg': list(neg_entities)
            }
            print >> fout, json.dumps(instance, ensure_ascii=False).encode('utf8')
    fout.close()
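

def _demo_gen_description_data():
    # Illustrative sketch only; the file names below are hypothetical.
    # Each output line is one JSON object of the form
    # {'q': question, 'pos': gold mid, 'neg': [competing mids]}.
    gen_description_data(['webquestions.train.json', 'webquestions.test.json'],
                         'entity_description_data.txt')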


# Module-level parser cache used by get_parser(); assumed to live in this module.
_parser = None


def get_parser():
    """Lazily create and cache a single CoreNLPParser instance."""
    global _parser
    if _parser is not None:
        return _parser
    from corenlp_parser.parser import CoreNLPParser
    _parser = CoreNLPParser.init_from_config()
    return _parser
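

def _demo_get_parser():
    # Illustrative sketch (assumes config.cfg is the project configuration
    # file, as in the other helpers): repeated calls reuse one parser.
    globals.read_configuration("config.cfg")
    parser = get_parser()
    tokens = parser.parse(u'who wrote the hobbit')
    assert get_parser() is parser  # second call hits the module-level cache
    return tokens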


def link_entity_in_simple_question(fn_in, fn_out):
    """Link entities in a 5-column, tab-separated question file.

    The question text (column 5) is parsed and linked; the gold mid
    (column 1) is the positive example and all other Freebase mids become
    negatives. One JSON object is written per input line.
    """
    globals.read_configuration("config.cfg")
    entity_linker = EntityLinker.init_from_config()
    parser = CoreNLPParser.init_from_config()
    with open(fn_out, 'w') as fout:
        with open(fn_in) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) != 5:
                    continue
                tokens = parser.parse(ll[4])
                entities = entity_linker.identify_entities_in_tokens(tokens)
                neg_entities = set()
                for e in entities:
                    mid = e.get_mid()
                    if mid == '':
                        continue
                    if mid.startswith('m.'):
                        neg_entities.add(mid)
                    else:
                        print mid, e.name, ll[4]
                neg_entities -= set([ll[0]])
                line = json.dumps(
                    {
                        'q': ll[4],
                        'pos': ll[0],
                        'neg': list(neg_entities)
                    },
                    ensure_ascii=False).encode('utf8')
                print >> fout, line


def get_number_of_external_entities():
    """Count, per query, how many linked entities came from web search results."""
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    parser = CoreNLPParser.init_from_config()
    entity_linker = WebSearchResultsExtenderEntityLinker.init_from_config()
    entity_linker.topn_entities = 100000
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    datasets = ["webquestions_split_train", "webquestions_split_dev"]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities"]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3"]

    external_entities_count = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            entities = entity_linker.identify_entities_in_tokens(
                parser.parse(query.utterance).tokens,
                text=query.utterance,
                find_dates=False)
            print "-------------------------"
            print query.utterance
            print "\n".join(map(str, sorted(entities,
                                            key=lambda entity: entity.external_entity_count,
                                            reverse=True)))
            external_entities_count.append(0)
            for entity in entities:
                if entity.external_entity:
                    external_entities_count[-1] += 1
            if index % 100 == 0:
                print >> sys.stderr, "%s queries processed" % index
    print "========================================="
    print external_entities_count
    print sum(external_entities_count)
    print len(external_entities_count)


def init_from_config():
    config_params = globals.config
    sparql_backend = globals.get_sparql_backend(config_params)
    query_extender = QueryCandidateExtender.init_from_config()
    entity_linker = EntityLinker.init_from_config()
    parser = CoreNLPParser.init_from_config()
    scorer_obj = ranker.SimpleScoreRanker('DefaultScorer')
    return QueryTranslator(sparql_backend, query_extender,
                           entity_linker, parser, scorer_obj)


def get_lemma_fn():
    """Return a normalization function over parsed text.

    The returned closure lowercases the text, keeps the lemma for tokens whose
    POS tag starts with 'NN' and the surface form otherwise, joins them without
    spaces, and returns the string together with a fixed score of 0.9.
    """
    parser = CoreNLPParser.init_from_config()

    def fn(text):
        text = text.lower()
        tokens = parser.parse(text)
        lemma = [
            t.lemma if t.pos.startswith('NN') else t.token
            for t in tokens
        ]
        return ''.join(lemma), 0.9

    return fn
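

def _demo_get_lemma_fn():
    # Illustrative sketch: the closure can serve as a normalization key,
    # e.g. to compare noun phrases irrespective of plural forms.
    lemma_fn = get_lemma_fn()
    key, score = lemma_fn(u'presidents of the United States')
    return key, score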


def main_parse():
    """Parse the content of the top-10 search result documents per query and
    pickle (url, parse) tuples to the configured output file."""
    document_content_file = globals.config.get('WebSearchFeatures',
                                               'documents-content-file')
    parser = CoreNLPParser.init_from_config()
    question_serps = get_questions_serps()
    print datetime.now()
    # 'wx' requests exclusive creation (a glibc extension in Python 2), so an
    # existing parse cache is not overwritten.
    with open(document_content_file, 'wx') as out_file:
        index = 0
        for serp in question_serps.itervalues():
            for doc in serp[:10]:
                content = doc.content()
                if len(content) > 0:
                    document = (doc.url, parser.parse(content))
                    pickle.dump(document, out_file)
            print "Query #", index, datetime.now()
            index += 1


def link_entity_one(params):
    """Parse the question column of lines [start, end) of a 5-column TSV file
    and return (question, mid, tokens) tuples."""
    fn, start, end = params
    lno = 0
    fin = open(fn)
    # Skip lines before the start of this chunk.
    while lno < start:
        fin.readline()
        lno += 1
    globals.read_configuration("config.cfg")
    parser = CoreNLPParser.init_from_config()
    ret = []
    for i in xrange(start, end):
        line = fin.readline()
        ll = line.decode('utf8').strip().split('\t')
        if len(ll) != 5:
            continue
        tokens = parser.parse(ll[4])
        ret.append((ll[4], ll[0], tokens))
    fin.close()
    return ret
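

def _demo_link_entity_one():
    # Illustrative sketch (hypothetical file name and bounds). The single
    # (fn, start, end) tuple argument makes it easy to map link_entity_one
    # over a list of chunks, e.g. with multiprocessing.Pool, provided the
    # parsed token objects can be pickled back to the parent process.
    first_chunk = link_entity_one(('simple_questions.txt', 0, 1000))
    return first_chunk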


def __init__(self):
    # Constructor of the surrounding class (not shown here): it only wires up
    # the parser and the entity linker from the global configuration.
    self.parser = CoreNLPParser.init_from_config()
    self.entity_linker = EntityLinker.init_from_config()