from REL.db.generic import GenericLookup


def deduplication(triplets):
    """Removes duplicate (h, r, t) triplets, keeping the confidence score of
    the first occurrence of each triplet."""
    unique_pairs = []
    pair_confidence = []
    for t in triplets:
        key = "{}\t{}\t{}".format(t["h"], t["r"], t["t"])
        conf = t["c"]
        if key not in unique_pairs:
            unique_pairs.append(key)
            pair_confidence.append(conf)

    unique_triplets = []
    for idx, unique_pair in enumerate(unique_pairs):
        h, r, t = unique_pair.split("\t")
        unique_triplets.append({"h": h, "r": r, "t": t, "c": pair_confidence[idx]})

    return unique_triplets


if __name__ == "__main__":
    # sqlite_path should point at the directory holding the embeddings database.
    emb = GenericLookup(
        "entity_word_embedding", save_dir=sqlite_path, table_name="embeddings"
    )
    p_e_m = emb.wiki("Bob", "wiki")[:10]
    print(p_e_m)
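# A minimal sketch of how `deduplication` behaves; the triplets below are
# hypothetical sample data, not part of the original module.
if __name__ == "__main__":
    triplets = [
        {"h": "Bob", "r": "born_in", "t": "Duluth", "c": 0.9},
        {"h": "Bob", "r": "born_in", "t": "Duluth", "c": 0.4},  # duplicate, dropped
        {"h": "Bob", "r": "works_at", "t": "Acme", "c": 0.7},
    ]
    print(deduplication(triplets))
    # [{'h': 'Bob', 'r': 'born_in', 't': 'Duluth', 'c': 0.9},
    #  {'h': 'Bob', 'r': 'works_at', 't': 'Acme', 'c': 0.7}]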
import os
import re

from REL.db.generic import GenericLookup
from REL.utils import modify_uppercase_phrase, split_in_words


class MentionDetectionBase:
    def __init__(self, base_url, wiki_version):
        self.wiki_db = GenericLookup(
            "entity_word_embedding",
            os.path.join(base_url, wiki_version, "generated"),
        )

    def get_ctxt(self, start, end, idx_sent, sentence, sentences_doc):
        """
        Retrieves the context surrounding a given mention, up to 100 words on
        each side.

        :return: left and right context
        """
        # Iteratively add words until we have 100.
        left_ctxt = split_in_words(sentence[:start])
        if idx_sent > 0:
            i = idx_sent - 1
            while (i >= 0) and (len(left_ctxt) <= 100):
                left_ctxt = split_in_words(sentences_doc[i]) + left_ctxt
                i -= 1
        left_ctxt = left_ctxt[-100:]
        left_ctxt = " ".join(left_ctxt)

        right_ctxt = split_in_words(sentence[end:])
        if idx_sent < len(sentences_doc):
            i = idx_sent + 1
            while (i < len(sentences_doc)) and (len(right_ctxt) <= 100):
                right_ctxt = right_ctxt + split_in_words(sentences_doc[i])
                i += 1
        right_ctxt = right_ctxt[:100]
        right_ctxt = " ".join(right_ctxt)

        return left_ctxt, right_ctxt

    def get_candidates(self, mention):
        """
        Retrieves a maximum of 100 candidates from the sqlite3 database for a
        given mention.

        :return: list of candidates
        """
        # Performs an extra check for ED.
        cands = self.wiki_db.wiki(mention, "wiki")
        if cands:
            return cands[:100]
        return []

    def preprocess_mention(self, m):
        """
        Preprocesses a mention to maximise the chance of finding a matching
        set of candidates in our database.

        :return: mention
        """
        # TODO: This can be optimised (fewer db calls required).
        cur_m = modify_uppercase_phrase(m)
        freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

        if not freq_lookup_cur_m:
            cur_m = m

        freq_lookup_m = self.wiki_db.wiki(m, "wiki", "freq")
        freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

        if freq_lookup_m and (freq_lookup_m > freq_lookup_cur_m):
            # Cases like 'U.S.' are handled badly by modify_uppercase_phrase.
            cur_m = m
            freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

        # If we cannot find the exact mention in our index, we try our luck
        # in a case-insensitive index.
        if not freq_lookup_cur_m:
            # Neither cur_m nor m was found; check if a lower-cased version exists.
            find_lower = self.wiki_db.wiki(m.lower(), "wiki", "lower")

            if find_lower:
                cur_m = find_lower
                freq_lookup_cur_m = self.wiki_db.wiki(cur_m, "wiki", "freq")

        # Try to strip leading/trailing punctuation (e.g. 'Washington,' to
        # 'Washington'). To stay on the safe side, we only try this if no
        # match was found so far, as it might otherwise get in the way of
        # 'U.S.' converting to 'US'. This could be done recursively;
        # interesting to explore in future work.
        if not freq_lookup_cur_m:
            temp = re.sub(r"[\(.|,|!|')]", "", m).strip()
            simple_lookup = self.wiki_db.wiki(temp, "wiki", "freq")

            if simple_lookup:
                cur_m = temp

        return cur_m
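# A minimal usage sketch, assuming a REL data folder has been downloaded to
# the (hypothetical) path below; "wiki_2019" names the wiki version
# subfolder and is likewise illustrative.
if __name__ == "__main__":
    md_base = MentionDetectionBase("/data/rel", "wiki_2019")
    mention = md_base.preprocess_mention("Washington,")  # punctuation stripped if no match
    candidates = md_base.get_candidates(mention)  # at most 100 candidate entities
    print(mention, candidates[:3])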
from flair.data import Sentence
from segtok.segmenter import split_single

from REL.db.generic import GenericLookup
from REL.utils import preprocess_mention, split_in_words


class MentionDetection:
    def __init__(self, base_url, wiki_subfolder):
        self.cnt_exact = 0
        self.cnt_partial = 0
        self.cnt_total = 0

        self.wiki_db = GenericLookup(
            "entity_word_embedding",
            "{}/{}/generated/".format(base_url, wiki_subfolder),
        )

    def _get_ctxt(self, start, end, idx_sent, sentence):
        """
        Retrieves the context surrounding a given mention, up to 100 words on
        each side.

        :return: left and right context
        """
        # Iteratively add words until we have 100.
        left_ctxt = split_in_words(sentence[:start])
        if idx_sent > 0:
            i = idx_sent - 1
            while (i >= 0) and (len(left_ctxt) <= 100):
                left_ctxt = split_in_words(self.sentences_doc[i]) + left_ctxt
                i -= 1
        left_ctxt = left_ctxt[-100:]
        left_ctxt = " ".join(left_ctxt)

        right_ctxt = split_in_words(sentence[end:])
        if idx_sent < len(self.sentences_doc):
            i = idx_sent + 1
            while (i < len(self.sentences_doc)) and (len(right_ctxt) <= 100):
                right_ctxt = right_ctxt + split_in_words(self.sentences_doc[i])
                i += 1
        right_ctxt = right_ctxt[:100]
        right_ctxt = " ".join(right_ctxt)

        return left_ctxt, right_ctxt

    def _get_candidates(self, mention):
        """
        Retrieves a maximum of 100 candidates from the sqlite3 database for a
        given mention.

        :return: list of candidates
        """
        # Performs an extra check for ED.
        cands = self.wiki_db.wiki(mention, "wiki")
        if cands:
            return cands[:100]
        return []

    def format_spans(self, dataset):
        """
        Formats the given spans into a dataset for the ED step. More
        specifically, it returns the mention, its left/right context and a
        set of candidates.

        :return: Dictionary with mentions per document.
""" dataset, _, _ = self.split_text(dataset) results = {} total_ment = 0 for doc in dataset: contents = dataset[doc] self.sentences_doc = [v[0] for v in contents.values()] results_doc = [] for idx_sent, (sentence, spans) in contents.items(): for ngram, start_pos, end_pos in spans: total_ment += 1 # end_pos = start_pos + length # ngram = text[start_pos:end_pos] mention = preprocess_mention(ngram, self.wiki_db) left_ctxt, right_ctxt = self._get_ctxt( start_pos, end_pos, idx_sent, sentence) chosen_cands = self._get_candidates(mention) res = { "mention": mention, "context": (left_ctxt, right_ctxt), "candidates": chosen_cands, "gold": ["NONE"], "pos": start_pos, "sent_idx": idx_sent, "ngram": ngram, "end_pos": end_pos, "sentence": sentence, } results_doc.append(res) results[doc] = results_doc return results, total_ment # def find_mentions(self, dataset, tagger_ner=None): # """ # Responsible for finding mentions given a set of documents. More specifically, # it returns the mention, its left/right context and a set of candidates. # # :return: Dictionary with mentions per document. # """ # # if tagger_ner is None: # raise Exception( # "No NER tagger is set, but you are attempting to perform Mention Detection.." # ) # # dataset, _, _ = self.split_text(dataset) # results = {} # total_ment = 0 # # for doc in dataset: # contents = dataset[doc] # # self.sentences_doc = [v[0] for v in contents.values()] # result_doc = [] # # sentences = [ # Sentence(v[0], use_tokenizer=True) for k, v in contents.items() # ] # # tagger_ner.predict(sentences) # # for (idx_sent, (sentence, ground_truth_sentence)), snt in zip( # contents.items(), sentences # ): # illegal = [] # for entity in snt.get_spans("ner"): # text, start_pos, end_pos, conf = ( # entity.text, # entity.start_pos, # entity.end_pos, # entity.score, # ) # total_ment += 1 # # m = preprocess_mention(text, self.wiki_db) # cands = self._get_candidates(m) # # if len(cands) == 0: # continue # # ngram = sentence[start_pos:end_pos] # illegal.extend(range(start_pos, end_pos)) # # left_ctxt, right_ctxt = self._get_ctxt( # start_pos, end_pos, idx_sent, sentence # ) # # res = { # "mention": m, # "context": (left_ctxt, right_ctxt), # "candidates": cands, # "gold": ["NONE"], # "pos": start_pos, # "sent_idx": idx_sent, # "ngram": ngram, # "end_pos": end_pos, # "sentence": sentence, # "conf_md": conf, # "tag": entity.tag, # } # # result_doc.append(res) # # results[doc] = result_doc # # return results, total_ment def split_text(self, dataset): """ Splits text into sentences. This behavior is required for the default NER-tagger, which during experiments was experienced to perform more optimally in such a fashion. :return: dictionary with sentences and optional given spans per sentence. """ res = {} splits = [0] processed_sentences = [] for doc in dataset: text, spans = dataset[doc] sentences = split_single(text) res[doc] = {} i = 0 for sent in sentences: if len(sent.strip()) == 0: continue # Match gt to sentence. pos_start = text.find(sent) pos_end = pos_start + len(sent) # ngram, start_pos, end_pos spans_sent = [[text[x[0]:x[0] + x[1]], x[0], x[0] + x[1]] for x in spans if pos_start <= x[0] < pos_end] res[doc][i] = [sent, spans_sent] if len(spans) == 0: processed_sentences.append( Sentence(sent, use_tokenizer=True)) i += 1 splits.append(splits[-1] + i) return res, processed_sentences, splits def find_mentions(self, dataset, tagger_ner=None): """ Responsible for finding mentions given a set of documents in a batch-wise manner. 
        More specifically, it returns the mention, its left/right context and
        a set of candidates.

        :return: Dictionary with mentions per document.
        """
        if tagger_ner is None:
            raise Exception(
                "No NER tagger is set, but you are attempting to perform Mention Detection."
            )
        dataset, processed_sentences, splits = self.split_text(dataset)
        results = {}
        total_ment = 0

        # Tag all sentences of all documents in a single batched call.
        tagger_ner.predict(processed_sentences, mini_batch_size=32)

        for i, doc in enumerate(dataset):
            contents = dataset[doc]
            self.sentences_doc = [v[0] for v in contents.values()]
            sentences = processed_sentences[splits[i] : splits[i + 1]]
            result_doc = []

            for (idx_sent, (sentence, ground_truth_sentence)), snt in zip(
                contents.items(), sentences
            ):
                illegal = []
                for entity in snt.get_spans("ner"):
                    text, start_pos, end_pos, conf = (
                        entity.text,
                        entity.start_pos,
                        entity.end_pos,
                        entity.score,
                    )
                    total_ment += 1

                    m = preprocess_mention(text, self.wiki_db)
                    cands = self._get_candidates(m)

                    if len(cands) == 0:
                        continue

                    ngram = sentence[start_pos:end_pos]
                    illegal.extend(range(start_pos, end_pos))

                    left_ctxt, right_ctxt = self._get_ctxt(
                        start_pos, end_pos, idx_sent, sentence
                    )

                    res = {
                        "mention": m,
                        "context": (left_ctxt, right_ctxt),
                        "candidates": cands,
                        "gold": ["NONE"],
                        "pos": start_pos,
                        "sent_idx": idx_sent,
                        "ngram": ngram,
                        "end_pos": end_pos,
                        "sentence": sentence,
                        "conf_md": conf,
                        "tag": entity.tag,
                    }

                    result_doc.append(res)
            results[doc] = result_doc

        return results, total_ment
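# A minimal usage sketch for the batched pipeline, assuming flair is
# installed and a REL data folder exists at the (hypothetical) path below;
# "ner-fast" is one of flair's pretrained sequence taggers.
if __name__ == "__main__":
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load("ner-fast")
    md = MentionDetection("/data/rel", "wiki_2019")
    # Each document maps to (text, spans); spans stay empty when the NER
    # tagger is responsible for finding the mentions.
    docs = {"doc_1": ("Bob Dylan was born in Duluth.", [])}
    mentions, total_ment = md.find_mentions(docs, tagger_ner=tagger)
    print(total_ment, [m["mention"] for m in mentions["doc_1"]])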
from pathlib import Path

from flair.data import Sentence
from segtok.segmenter import split_single

from REL.db.generic import GenericLookup
from REL.utils import preprocess_mention, split_in_words


class MentionDetection:
    """
    Class responsible for mention detection.
    """

    def __init__(self, base_url, wiki_subfolder):
        if isinstance(base_url, str):
            base_url = Path(base_url)
        self.cnt_exact = 0
        self.cnt_partial = 0
        self.cnt_total = 0

        self.wiki_db = GenericLookup(
            "entity_word_embedding",
            base_url / wiki_subfolder / "generated",
        )

    # def __verify_pos(self, ngram, start, end, sentence):
    #     ngram = ngram.lower()
    #     find_ngram = sentence[start:end].lower()
    #     find_ngram_ws_invariant = " ".join(
    #         [x.text for x in Sentence(find_ngram, use_tokenizer=True)]
    #     ).lower()
    #     assert (find_ngram == ngram) or (
    #         find_ngram_ws_invariant == ngram
    #     ), "Mention not found on given position: {};{};{};{}".format(
    #         find_ngram, ngram, find_ngram_ws_invariant, sentence
    #     )

    def split_text(self, dataset):
        """
        Splits text into sentences. The default NER tagger was found during
        experiments to perform better when fed individual sentences.

        :return: dictionary with sentences and optional given spans per sentence.
        """
        res = {}
        for doc in dataset:
            text, spans = dataset[doc]
            sentences = split_single(text)
            res[doc] = {}

            i = 0
            for sent in sentences:
                if len(sent.strip()) == 0:
                    continue
                # Match ground truth to sentence.
                pos_start = text.find(sent)
                pos_end = pos_start + len(sent)

                # ngram, start_pos, end_pos
                spans_sent = [
                    [text[x[0] : x[0] + x[1]], x[0], x[0] + x[1]]
                    for x in spans
                    if pos_start <= x[0] < pos_end
                ]
                res[doc][i] = [sent, spans_sent]
                i += 1
        return res

    def _get_ctxt(self, start, end, idx_sent, sentence):
        """
        Retrieves the context surrounding a given mention, up to 100 words on
        each side.

        :return: left and right context
        """
        # Iteratively add words until we have 100.
        left_ctxt = split_in_words(sentence[:start])
        if idx_sent > 0:
            i = idx_sent - 1
            while (i >= 0) and (len(left_ctxt) <= 100):
                left_ctxt = split_in_words(self.sentences_doc[i]) + left_ctxt
                i -= 1
        left_ctxt = left_ctxt[-100:]
        left_ctxt = " ".join(left_ctxt)

        right_ctxt = split_in_words(sentence[end:])
        if idx_sent < len(self.sentences_doc):
            i = idx_sent + 1
            while (i < len(self.sentences_doc)) and (len(right_ctxt) <= 100):
                right_ctxt = right_ctxt + split_in_words(self.sentences_doc[i])
                i += 1
        right_ctxt = right_ctxt[:100]
        right_ctxt = " ".join(right_ctxt)

        return left_ctxt, right_ctxt

    def _get_candidates(self, mention, top_n=100):
        """
        Retrieves a maximum of `top_n` candidates from the sqlite3 database
        for a given mention.

        :param top_n: number of candidates to return
        :return: list of candidates
        """
        # Performs an extra check for ED.
        # TODO: Add `LIMIT n` to the SQL query for better performance.
        candidates = self.wiki_db.wiki(mention, "wiki")
        if candidates:
            return candidates[:top_n]
        return []

    def format_spans(self, dataset):
        """
        Formats the given spans into a dataset for the ED step. More
        specifically, it returns the mention, its left/right context and a
        set of candidates.

        :return: Dictionary with mentions per document.
""" dataset = self.split_text(dataset) results = {} total_ment = 0 for doc in dataset: contents = dataset[doc] self.sentences_doc = [v[0] for v in contents.values()] results_doc = [] for idx_sent, (sentence, spans) in contents.items(): for ngram, start_pos, end_pos in spans: total_ment += 1 # end_pos = start_pos + length # ngram = text[start_pos:end_pos] mention = preprocess_mention(ngram, self.wiki_db) left_ctxt, right_ctxt = self._get_ctxt( start_pos, end_pos, idx_sent, sentence ) chosen_cands = self._get_candidates(mention) res = { "mention": mention, "context": (left_ctxt, right_ctxt), "candidates": chosen_cands, "gold": ["NONE"], "pos": start_pos, "sent_idx": idx_sent, "ngram": ngram, "end_pos": end_pos, "sentence": sentence, } results_doc.append(res) results[doc] = results_doc return results, total_ment def find_mentions(self, dataset, tagger_ner=None): """ Responsible for finding mentions given a set of documents. More specifically, it returns the mention, its left/right context and a set of candidates. :return: Dictionary with mentions per document. """ if tagger_ner is None: raise Exception( "No NER tagger is set, but you are attempting to perform Mention Detection.." ) dataset = self.split_text(dataset) results = {} total_ment = 0 for doc in dataset: contents = dataset[doc] self.sentences_doc = [v[0] for v in contents.values()] result_doc = [] sentences = [ Sentence(v[0], use_tokenizer=True) for k, v in contents.items() ] tagger_ner.predict(sentences) for (idx_sent, (sentence, ground_truth_sentence)), snt in zip( contents.items(), sentences ): illegal = [] for entity in snt.get_spans("ner"): text, start_pos, end_pos, conf = ( entity.text, entity.start_pos, entity.end_pos, entity.score, ) total_ment += 1 m = preprocess_mention(text, self.wiki_db) cands = self._get_candidates(m) if len(cands) == 0: continue ngram = sentence[start_pos:end_pos] illegal.extend(range(start_pos, end_pos)) left_ctxt, right_ctxt = self._get_ctxt( start_pos, end_pos, idx_sent, sentence ) res = { "mention": m, "context": (left_ctxt, right_ctxt), "candidates": cands, "gold": ["NONE"], "pos": start_pos, "sent_idx": idx_sent, "ngram": ngram, "end_pos": end_pos, "sentence": sentence, "conf_md": conf, } result_doc.append(res) results[doc] = result_doc return results, total_ment