def get_data_items(self, dataset, predict=False):
    data = []
    cand_source = 'candidates'
    count = 0

    for doc_name, content in dataset.items():
        count += 1
        if count % 1000 == 0:
            print(count, end='\r')

        items = []
        conll_doc = content[0].get('conll_doc', None)

        for m in content:
            try:
                # keep only candidates whose entity is in the entity vocabulary
                named_cands = [c[0] for c in m[cand_source]
                               if (wiki_prefix + c[0]) in self.model.entity_voca.word2id]
                p_e_m = [min(1., max(1e-3, c[1])) for c in m[cand_source]]
            except:
                named_cands = [c[0] for c in m['candidates']
                               if (wiki_prefix + c[0]) in self.model.entity_voca.word2id]
                p_e_m = [min(1., max(1e-3, c[1])) for c in m['candidates']]

            try:
                true_pos = named_cands.index(m['gold'][0])
                p = p_e_m[true_pos]
            except:
                true_pos = -1

            # shortlist the candidates for mention m
            named_cands = named_cands[:min(self.args.n_cands_before_rank, len(named_cands))]
            p_e_m = p_e_m[:min(self.args.n_cands_before_rank, len(p_e_m))]

            # if the gold entity fell outside the shortlist, put it back for training
            if true_pos >= len(named_cands):
                if not predict:
                    true_pos = len(named_cands) - 1
                    p_e_m[-1] = p
                    named_cands[-1] = m['gold'][0]
                else:
                    true_pos = -1

            cands = [self.model.entity_voca.get_id(wiki_prefix + c) for c in named_cands]
            mask = [1.] * len(cands)

            # skip mentions without candidates during training
            if len(cands) == 0 and not predict:
                continue
            elif len(cands) < self.args.n_cands_before_rank:
                # pad to n_cands_before_rank and mask out the padding
                cands += [self.model.entity_voca.unk_id] * (self.args.n_cands_before_rank - len(cands))
                named_cands += [Vocabulary.unk_token] * (self.args.n_cands_before_rank - len(named_cands))
                p_e_m += [1e-8] * (self.args.n_cands_before_rank - len(p_e_m))
                mask += [0.] * (self.args.n_cands_before_rank - len(mask))

            lctx = m['context'][0].strip().split()
            lctx_ids = [self.prerank_model.word_voca.get_id(t) for t in lctx
                        if utils.is_important_word(t)]
            lctx_ids = [tid for tid in lctx_ids
                        if tid != self.prerank_model.word_voca.unk_id]
            lctx_ids = lctx_ids[max(0, len(lctx_ids) - self.args.ctx_window // 2):]

            rctx = m['context'][1].strip().split()
            rctx_ids = [self.prerank_model.word_voca.get_id(t) for t in rctx
                        if utils.is_important_word(t)]
            rctx_ids = [tid for tid in rctx_ids
                        if tid != self.prerank_model.word_voca.unk_id]
            rctx_ids = rctx_ids[:min(len(rctx_ids), self.args.ctx_window // 2)]

            ment = m['mention'].strip().split()
            ment_ids = [self.prerank_model.word_voca.get_id(t) for t in ment
                        if utils.is_important_word(t)]
            ment_ids = [tid for tid in ment_ids
                        if tid != self.prerank_model.word_voca.unk_id]

            m['sent'] = ' '.join(lctx + rctx)

            # secondary local context (for computing relation scores)
            if conll_doc is not None:
                conll_m = m['conll_m']
                sent = conll_doc['sentences'][conll_m['sent_id']]
                start = conll_m['start']
                end = conll_m['end']
                snd_lctx = [self.model.snd_word_voca.get_id(t)
                            for t in sent[max(0, start - self.args.snd_local_ctx_window // 2):start]]
                snd_rctx = [self.model.snd_word_voca.get_id(t)
                            for t in sent[end:min(len(sent), end + self.args.snd_local_ctx_window // 2)]]
                snd_ment = [self.model.snd_word_voca.get_id(t)
                            for t in sent[start:end]]

                if len(snd_lctx) == 0:
                    snd_lctx = [self.model.snd_word_voca.unk_id]
                if len(snd_rctx) == 0:
                    snd_rctx = [self.model.snd_word_voca.unk_id]
                if len(snd_ment) == 0:
                    snd_ment = [self.model.snd_word_voca.unk_id]
            else:
                snd_lctx = [self.model.snd_word_voca.unk_id]
                snd_rctx = [self.model.snd_word_voca.unk_id]
                snd_ment = [self.model.snd_word_voca.unk_id]

            items.append({'context': (lctx_ids, rctx_ids),
                          'snd_ctx': (snd_lctx, snd_rctx),
                          'ment_ids': ment_ids,
                          'snd_ment': snd_ment,
                          'cands': cands,
                          'named_cands': named_cands,
                          'p_e_m': p_e_m,
                          'mask': mask,
                          'true_pos': true_pos,
                          'doc_name': doc_name,
                          'raw': m})

        if len(items) > 0:
            # note: this shouldn't affect the order of prediction because we use
            # doc_name to add predicted entities, and we don't shuffle the data for prediction
            max_len = 50
            if len(items) > max_len:
                # print(len(items))
                for k in range(0, len(items), max_len):
                    data.append(items[k:min(len(items), k + max_len)])
            else:
                data.append(items)

    return self.prerank(data, predict)
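The padding and masking of the candidate shortlist is easy to check in isolation. The helper below is a minimal, hypothetical sketch of that step, not part of the original code; pad_candidates and the unk_id / unk_token defaults stand in for entity_voca.unk_id and Vocabulary.unk_token.

def pad_candidates(cands, named_cands, p_e_m, n_cands, unk_id=0, unk_token='#UNK#'):
    # real candidates keep mask 1.0; padded slots get the unk id, a tiny prior,
    # and mask 0.0 so downstream scoring can ignore them
    mask = [1.] * len(cands)
    pad = n_cands - len(cands)
    if pad > 0:
        cands = cands + [unk_id] * pad
        named_cands = named_cands + [unk_token] * pad
        p_e_m = p_e_m + [1e-8] * pad
        mask = mask + [0.] * pad
    return cands, named_cands, p_e_m, mask

# e.g. pad_candidates([12, 7], ['Germany', 'Bavaria'], [0.9, 0.05], n_cands=4)
# -> ([12, 7, 0, 0], ['Germany', 'Bavaria', '#UNK#', '#UNK#'],
#     [0.9, 0.05, 1e-08, 1e-08], [1.0, 1.0, 0.0, 0.0])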
def get_data_items(self, dataset, predict=False):
    data = []
    cand_source = 'candidates'

    # doc_name is the id of the doc, not 'aida-A', 'msnbc', etc.
    for doc_name, content in dataset.items():
        items = []
        # only the first mention carries the 'conll_doc'
        conll_doc = content[0].get('conll_doc', None)

        # content is the list of mentions
        for m in content:
            try:
                # c[0] is the candidate's string name, c[1] is its p(e|m) prior
                named_cands = [c[0] for c in m[cand_source]]
                p_e_m = [min(1., max(1e-3, c[1])) for c in m[cand_source]]
            except:
                named_cands = [c[0] for c in m['candidates']]
                p_e_m = [min(1., max(1e-3, c[1])) for c in m['candidates']]

            try:
                # index of the gold entity in named_cands,
                # e.g. m['gold'] = ('Germany', 1e-05, -1)
                true_pos = named_cands.index(m['gold'][0])
                p = p_e_m[true_pos]
            except:
                # gold is not among the candidates
                true_pos = -1

            # shortlist the candidates for mention m
            named_cands = named_cands[:min(self.args.n_cands_before_rank, len(named_cands))]
            p_e_m = p_e_m[:min(self.args.n_cands_before_rank, len(p_e_m))]

            # if the gold entity fell outside the shortlist, put it back for training
            if true_pos >= len(named_cands):
                if not predict:
                    true_pos = len(named_cands) - 1
                    p_e_m[-1] = p
                    named_cands[-1] = m['gold'][0]  # replace the last candidate with gold
                else:
                    true_pos = -1

            # map candidate names to entity ids
            cands = [self.model.entity_voca.get_id(wiki_prefix + c) for c in named_cands]
            mask = [1.] * len(cands)

            # skip mentions without candidates during training
            if len(cands) == 0 and not predict:
                continue
            elif len(cands) < self.args.n_cands_before_rank:
                # pad to n_cands_before_rank and mask out the padding
                cands += [self.model.entity_voca.unk_id] * (self.args.n_cands_before_rank - len(cands))
                named_cands += [Vocabulary.unk_token] * (self.args.n_cands_before_rank - len(named_cands))
                p_e_m += [1e-8] * (self.args.n_cands_before_rank - len(p_e_m))
                mask += [0.] * (self.args.n_cands_before_rank - len(mask))

            # left-context token ids of important words
            lctx = m['context'][0].strip().split()
            lctx_ids = [self.prerank_model.word_voca.get_id(t) for t in lctx
                        if utils.is_important_word(t)]
            lctx_ids = [tid for tid in lctx_ids
                        if tid != self.prerank_model.word_voca.unk_id]
            lctx_ids = lctx_ids[max(0, len(lctx_ids) - self.args.ctx_window // 2):]

            # right-context token ids of important words
            rctx = m['context'][1].strip().split()
            rctx_ids = [self.prerank_model.word_voca.get_id(t) for t in rctx
                        if utils.is_important_word(t)]
            rctx_ids = [tid for tid in rctx_ids
                        if tid != self.prerank_model.word_voca.unk_id]
            rctx_ids = rctx_ids[:min(len(rctx_ids), self.args.ctx_window // 2)]

            # the mention itself
            ment = m['mention'].strip().split()
            ment_ids = [self.prerank_model.word_voca.get_id(t) for t in ment
                        if utils.is_important_word(t)]
            ment_ids = [tid for tid in ment_ids
                        if tid != self.prerank_model.word_voca.unk_id]

            m['sent'] = ' '.join(lctx + rctx)

            # secondary local context (for computing relation scores)
            if conll_doc is not None:
                # the sentence where the mention appears, e.g. conll_m =
                # {'sent_id': 0, 'start': 2, 'end': 3, 'wikilink': 'http://en.wikipedia.org/wiki/Germany'}
                conll_m = m['conll_m']
                sent = conll_doc['sentences'][conll_m['sent_id']]
                aet_words = conll_doc['aet_words']
                start = conll_m['start']
                end = conll_m['end']

                snd_lctx = [self.model.snd_word_voca.get_id(t)
                            for t in sent[max(0, start - self.args.snd_local_ctx_window // 2):start]]
                snd_rctx = [self.model.snd_word_voca.get_id(t)
                            for t in sent[end:min(len(sent), end + self.args.snd_local_ctx_window // 2)]]
                snd_ment = [self.model.snd_word_voca.get_id(t)
                            for t in sent[start:end]]
                aet_ctx = [self.model.aet_word_voca.get_id(ae) for ae in aet_words]

                if len(snd_lctx) == 0:
                    snd_lctx = [self.model.snd_word_voca.unk_id]
                if len(snd_rctx) == 0:
                    snd_rctx = [self.model.snd_word_voca.unk_id]
                if len(snd_ment) == 0:
                    snd_ment = [self.model.snd_word_voca.unk_id]
            else:
                # no conll doc: fall back to unk ids (aet_ctx included, otherwise it would be undefined here)
                snd_lctx = [self.model.snd_word_voca.unk_id]
                snd_rctx = [self.model.snd_word_voca.unk_id]
                snd_ment = [self.model.snd_word_voca.unk_id]
                aet_ctx = [self.model.aet_word_voca.unk_id]

            items.append({'context': (lctx_ids, rctx_ids),
                          'snd_ctx': (snd_lctx, snd_rctx),
                          'aet_ctx': aet_ctx,
                          'ment_ids': ment_ids,
                          'snd_ment': snd_ment,
                          'cands': cands,
                          'named_cands': named_cands,
                          'p_e_m': p_e_m,
                          'mask': mask,
                          'true_pos': true_pos,
                          'doc_name': doc_name,
                          'raw': m})

        if len(items) > 0:
            # note: this shouldn't affect the order of prediction because we use doc_name
            # to add predicted entities, and we don't shuffle the data for prediction
            if len(items) > 100:
                print(len(items))
                for k in range(0, len(items), 100):
                    data.append(items[k:min(len(items), k + 100)])
            else:
                data.append(items)

    return self.prerank(data, predict)
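Both versions trim the left and right context to at most ctx_window // 2 important-word ids per side, keeping the tokens closest to the mention. The snippet below is a self-contained, hypothetical illustration of that trimming; trim_context is not a function in the original code.

def trim_context(lctx_ids, rctx_ids, ctx_window):
    # keep the last half-window on the left and the first half-window on the right
    half = ctx_window // 2
    lctx_ids = lctx_ids[max(0, len(lctx_ids) - half):]
    rctx_ids = rctx_ids[:min(len(rctx_ids), half)]
    return lctx_ids, rctx_ids

# e.g. trim_context(list(range(10)), list(range(10)), ctx_window=6)
# -> ([7, 8, 9], [0, 1, 2])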