def __init__(self):
    self.entityNameIdMap = util.EntityNameIdMap()
    self.entityNameIdMap.init_gerbil_compatible_ent_id()
    self.unknown_ent_name = dict()
    self.no_english_uri = dict()
    self.all_gm_cnt = dict()
    self.englishuri_gm_cnt = dict()
    self.valid_gms = dict()
def process_aida(in_filepath, out_filepath):
    # _, wiki_id_name_map = util.load_wiki_name_id_map(lowercase=False)
    # _, wiki_id_name_map = util.entity_name_id_map_from_dump()
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counter of ground-truth entity ids that are not in wiki_name_id.txt
    ent_id_changes = 0
    with open(in_filepath) as fin, open(out_filepath, "w") as fout:
        in_mention = False  # am I inside a mention span or not
        first_document = True
        for line in fin:
            l = line.split('\t')
            if in_mention and not (len(l) == 7 and l[1] == 'I'):
                # the current line does not continue the previous mention,
                # so close it with MMEND before handling the current line
                fout.write("MMEND\n")
                in_mention = False
            if line.startswith("-DOCSTART-"):
                if not first_document:
                    fout.write("DOCEND\n")
                # e.g. line = "-DOCSTART- (967testa ATHLETICS)\n"
                doc_title = line[len("-DOCSTART- ("):-2]
                fout.write("DOCSTART_" + doc_title.replace(' ', '_') + "\n")
                first_document = False
            elif line == "\n":
                fout.write("*NL*\n")
            elif len(l) == 7 and l[1] == 'B':  # this is a new mention
                wiki_title = l[4]
                wiki_title = wiki_title[len("http://en.wikipedia.org/wiki/"):].replace('_', ' ')
                new_ent_id = entityNameIdMap.compatible_ent_id(wiki_title, l[5])
                if new_ent_id is not None:
                    if new_ent_id != l[5]:
                        ent_id_changes += 1
                        # print(line, "old ent_id: " + l[5], " new_ent_id: ", new_ent_id)
                    # TODO check here if the entity id is inside my wikidump;
                    # if not then omit this mention
                    fout.write("MMSTART_" + new_ent_id + "\n")
                    fout.write(l[0] + "\n")  # write the word
                    in_mention = True
                else:
                    unknown_gt_ids += 1
                    fout.write(l[0] + "\n")  # write the word
                    print(line)
            else:
                # words that continue a mention (len(l) == 7 and l[1] == 'I'),
                # normal words outside a mention, or mentions without disambiguation (len(l) == 4)
                fout.write(l[0].rstrip() + "\n")
        fout.write("DOCEND\n")  # for the last document
    print("process_aida unknown_gt_ids: ", unknown_gt_ids)
    print("process_aida ent_id_changes: ", ent_id_changes)
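# Illustrative usage sketch (not part of the original script; file names are hypothetical
# and the column layout is assumed to be the standard AIDA-CoNLL TSV: token, B/I, mention,
# YAGO2 entity, Wikipedia URL, Wikipedia id, Freebase mid). For a ground-truth line such as
#   Germany  B  Germany  Germany  http://en.wikipedia.org/wiki/Germany  11867  /m/0345h
# the function would roughly emit
#   MMSTART_11867
#   Germany
#   MMEND
# with the exact id depending on what entityNameIdMap.compatible_ent_id resolves it to.
#
# process_aida("aida_train.txt", "aida_train_newformat.txt")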
def process_hipe(in_filepath, out_filepath):
    # _, wiki_id_name_map = util.load_wiki_name_id_map(lowercase=False)
    # _, wiki_id_name_map = util.entity_name_id_map_from_dump()
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counter of ground-truth entity ids that are not in wiki_name_id.txt
    ent_id_changes = 0
    with open(in_filepath) as fin, open(out_filepath, "w") as fout:
        in_mention = False  # am I inside a mention span or not
        first_document = True
        for line in fin:
            l = line.split('\t')
            if len(l) == 10:
                if in_mention and 'I' not in l[1]:
                    # the current token does not continue the previous mention,
                    # so close it with MMEND before handling the token itself
                    fout.write("MMEND\n")
                    in_mention = False
                if "EndOfLine" in l[9]:  # new line
                    fout.write("*NL*\n")
                elif 'B' in l[1]:  # this is a new mention
                    wikidata_id = l[7]
                    # TODO check here if the entity id is inside my wikidump;
                    # if not then omit this mention
                    fout.write("MMSTART_" + wikidata_id + "\n")
                    fout.write(l[0] + "\n")  # write the word
                    in_mention = True
                elif l[1] == "NE-COARSE-LIT":  # column header line
                    continue
                else:
                    # words that continue a mention (l[1] contains 'I')
                    # or normal words outside a mention
                    fout.write(l[0].rstrip() + "\n")
            elif "# document_id" in line:
                if not first_document:
                    fout.write("DOCEND\n")
                # e.g. line = "# document_id = NZZ-1798-01-20-a-p0001\n"
                doc_title = line[len("# document_id = "):-1]
                fout.write("DOCSTART_" + doc_title.replace(' ', '_') + "\n")
                first_document = False
            else:
                continue
        fout.write("DOCEND\n")  # for the last document
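# Illustrative usage sketch (hypothetical file names; the 10-column layout is assumed from
# the HIPE shared-task TSV format, where l[1] is the NE-COARSE-LIT tag, l[7] the NEL-LIT
# Wikidata id and l[9] the MISC flags such as "EndOfLine"). A token line "Paris" whose
# coarse tag is "B-loc" and whose NEL-LIT column holds "Q90" would be emitted as
#   MMSTART_Q90
#   Paris
# and the mention is closed with MMEND once a non-"I" token follows.
#
# process_hipe("HIPE-data-v1.3-train-en.tsv", "hipe_train_newformat.txt")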
def process_wimcor(in_filepath, out_filepath):
    with open(in_filepath) as fin:
        content = fin.read()
    soup = BeautifulSoup(content, 'lxml')
    spacy_tokenizer = English(parser=False)
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counter of ground-truth entity ids that are not in wiki_name_id.txt
    with open(out_filepath, "w") as fout:
        for doc_idx, item in enumerate(soup.find_all('sample')):
            fout.write('DOCSTART_{}\n'.format(doc_idx))
            pmw_tag = item.find('pmw')
            lcontext = str(pmw_tag.previous_sibling) if pmw_tag.previous_sibling else ""
            rcontext = str(pmw_tag.next_sibling) if pmw_tag.next_sibling else ""
            pmw = pmw_tag.text
            loc_pmw = len(spacy_tokenizer(lcontext))  # token index where the PMW starts
            len_pmw = len(spacy_tokenizer(pmw))       # length of the PMW in tokens
            sample = '{} {} {}'.format(lcontext, pmw, rcontext)
            ctr = 0
            in_pmw = False
            for tok_idx, token in enumerate(spacy_tokenizer(sample)):
                if tok_idx == loc_pmw:
                    wiki_title = pmw_tag['fine']
                    ent_id = entityNameIdMap.compatible_ent_id(wiki_title)
                    if ent_id is not None:
                        fout.write('MMSTART_{}\n'.format(ent_id))
                        in_pmw = True
                        ctr = len_pmw
                    else:
                        unknown_gt_ids += 1
                    fout.write('{}\n'.format(token))
                elif in_pmw and ctr == 0:
                    in_pmw = False
                    fout.write('MMEND\n')
                    fout.write('{}\n'.format(token))
                else:
                    fout.write('{}\n'.format(token))
                ctr -= 1
            fout.write('DOCEND\n')
    print("process_wimcor unknown_gt_ids: ", unknown_gt_ids)
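# Illustrative usage sketch (hypothetical paths). The function assumes that BeautifulSoup
# (bs4) and spaCy's English tokenizer are imported at module level, and that every
# <sample> element wraps the metonymic span in a <pmw fine="..."> tag whose "fine"
# attribute carries the gold entity name looked up via compatible_ent_id:
#
# process_wimcor("wimcor_samples.xml", "wimcor_newformat.txt")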
def wikidump_to_new_format():
    doc_cnt = 0
    hyperlink2EntityId = util.EntityNameIdMap()
    hyperlink2EntityId.init_hyperlink2id()
    if args.debug:
        infilepath = config.base_folder + "data/mydata/tokenized_toy_wiki_dump2.txt"
        outfilepath = args.out_folder + "toy_wikidump.txt"
    else:
        infilepath = config.base_folder + "data/basic_data/tokenizedWiki.txt"
        outfilepath = args.out_folder + "wikidump.txt"
    with open(infilepath) as fin, \
            open(outfilepath, "w") as fout:
        in_mention = False
        for line in fin:
            line = line.rstrip()  # omit the '\n' character
            if line.startswith('<doc\xa0id="'):
                docid = line[9:line.find('"', 9)]
                doctitle = line[line.rfind('="') + 2:-2]
                fout.write("DOCSTART_" + docid + "_" + doctitle.replace(' ', '_') + "\n")
            elif line.startswith('<a\xa0href="'):
                ent_id = hyperlink2EntityId.hyperlink2id(line)
                if ent_id != config.unk_ent_id:
                    in_mention = True
                    fout.write("MMSTART_" + ent_id + "\n")
            elif line == '</doc>':
                fout.write("DOCEND\n")
                doc_cnt += 1
                if doc_cnt % 5000 == 0:
                    print("document counter: ", doc_cnt)
            elif line == '</a>':
                if in_mention:
                    fout.write("MMEND\n")
                    in_mention = False
            else:
                fout.write(line + "\n")
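# Sketch of the expected input (inferred from the parsing above; the concrete lines are
# illustrative, not taken from the dump): the tokenized dump keeps one token per line, and
# spaces inside tags are replaced by '\xa0' so that each tag stays on a single line, e.g.
#   <doc\xa0id="307"\xa0title="Atlantis">   ->  DOCSTART_307_Atlantis
#   <a\xa0href="Atlantis">                  ->  MMSTART_<id resolved by hyperlink2id>
#   </a>                                    ->  MMEND
#   </doc>                                  ->  DOCEND
# Plain token lines are copied through unchanged; hyperlinks that resolve to
# config.unk_ent_id produce no mention markers.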
def process_aida(in_filepath, out_filepath):
    # _, wiki_id_name_map = util.load_wiki_name_id_map(lowercase=False)
    # _, wiki_id_name_map = util.entity_name_id_map_from_dump()
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counter of ground-truth entity ids that are not in wiki_name_id.txt
    ent_id_changes = 0
    text_acc = []  # accumulate the text here and tokenize it afterwards
    with open(in_filepath) as fin, \
            open(args.output_folder + "tokenize_" + out_filepath, "w") as fout:
        in_mention = False  # am I inside a mention span or not
        first_document = True
        for line in fin:
            l = line.strip().split('\t')
            if in_mention and not (len(l) == 5 and l[1] == 'I'):
                # the current line does not continue the previous mention,
                # so close it with MMEND before handling the current line
                text_acc.append("MMEND")
                in_mention = False
            if line.startswith("-DOCSTART-"):
                if not first_document:
                    text_acc.append("DOCEND")
                # e.g. line = "-DOCSTART- (967testa ATHLETICS)\n"
                doc_title = line[len("-DOCSTART- ("):-2]
                text_acc.append("DOCSTART_" + doc_title.replace(' ', '_'))
                first_document = False
            elif line == "\n":
                text_acc.append("\n")
            elif len(l) == 5 and l[1] == 'B':  # this is a new mention
                wikidataid = l[4]
                wikidataid = wikidataid[len("https://www.wikidata.org/wiki/"):]
                if entityNameIdMap.is_valid_entity_id(wikidataid):
                    text_acc.append("MMSTART_" + wikidataid)
                    text_acc.append(l[0])  # write the word
                    in_mention = True
                else:
                    # unknown entity id: omit this mention but keep the word
                    unknown_gt_ids += 1
                    text_acc.append(l[0])  # write the word
                    print(line)
            else:
                # words that continue a mention (len(l) == 5 and l[1] == 'I'),
                # normal words outside a mention, or mentions without disambiguation
                text_acc.append(l[0].rstrip())
        text_acc.append("DOCEND")  # for the last document
        fout.write(' '.join(text_acc))
    print("process_aida unknown_gt_ids: ", unknown_gt_ids)
    print("now tokenize with the Stanford tokenizer")
    tokenize_command = 'cd {}; java -cp "*" ' \
        'edu.stanford.nlp.process.PTBTokenizer -options "tokenizeNLs=True" < {} > {}'.format(
            args.stanford_tokenizer_folder,
            args.output_folder + "tokenize_" + out_filepath,
            args.output_folder + out_filepath)
    print(tokenize_command)
    call(tokenize_command, shell=True)
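# Note / usage sketch (hypothetical argument values): this variant first writes a single
# space-separated "tokenize_*" file and then shells out to the Stanford CoreNLP
# PTBTokenizer, so args.stanford_tokenizer_folder should point at a CoreNLP distribution
# whose jars match the "*" classpath used in the command above, e.g.
#
# args.stanford_tokenizer_folder = "/path/to/stanford-corenlp/"
# args.output_folder = "/path/to/output/"
# process_aida("aida_train_wikidata.tsv", "aida_train_newformat.txt")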
def __init__(self):
    self.entityNameIdMap = util.EntityNameIdMap()
    self.entityNameIdMap.init_compatible_ent_id()