import json
import urllib.parse
import urllib.request
from collections import defaultdict

import tqdm

# Project-local dependencies assumed by the methods below (exact import paths
# depend on the repository layout): Sentence, BaseLinker, queries, utils,
# measures, resolve_entity_overlap_beam_search and _offets_to_token_ids.


def link_entities_in_raw_input(self, input_text: str, element_id: str = None, num_candidates=-1) -> Sentence:
    """
    Takes a raw input string, extracts mentions and links them to the most probable
    entities for the given input text.

    :param input_text: the input sentence as a string
    :param element_id: sentence id
    :param num_candidates: the number of candidate entity links to store for each entity.
                           If set to more than 0, it overrides the class setting.
    :return: a Sentence object whose entities store (entity id, entity label) linking tuples

    >>> l = HeuristicsLinker(num_candidates=1)
    >>> l.link_entities_in_raw_input("Who wrote the song hotel California?")
    [('Q7366', 'song', (14, 18), [3]), ('Q780394', 'Hotel California', (19, 35), [4, 5])]
    >>> l.link_entities_in_raw_input("Donovan McNabb'strade to the Vikings is in place.")  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    [('Q963185', 'Donovan McNabb', (0, 14), [0, 1]), ...]
    >>> l.link_entities_in_raw_input("what was the queen album?")
    [('Q15862', 'Queen', (13, 18), [3]), ('Q482994', 'album', (20, 24), [4])]
    """
    sentence = Sentence(input_text=input_text)
    sentence = self.link_entities_in_sentence_obj(sentence, element_id=element_id,
                                                  num_candidates=num_candidates)
    # Keep only successfully linked entities and project each to the fields consumers need.
    sentence.entities = [{k: e[k] for k in {'type', 'linkings', 'token_ids', 'poss', 'tokens'}}
                         for e in sentence.entities if len(e['linkings']) > 0]
    for e in sentence.entities:
        e['linkings'] = [(l.get('kbID'), l.get('label')) for l in e['linkings']]
    return sentence
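
# A sketch of consuming the simplified output above (configuration as in the doctests):
#
#     linker = HeuristicsLinker(num_candidates=3)
#     sentence = linker.link_entities_in_raw_input("what was the queen album?")
#     for entity in sentence.entities:
#         for kb_id, label in entity['linkings']:
#             print(kb_id, label)
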
def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1):
    # Work on a copy so the caller's Sentence is not mutated.
    sentence_obj = Sentence(input_text=sentence_obj.input_text,
                            tagged=sentence_obj.tagged,
                            mentions=sentence_obj.mentions,
                            entities=sentence_obj.entities)
    if not sentence_obj.tagged:
        sentence_obj.tagged = utils.get_tagged_from_server(sentence_obj.input_text,
                                                           caseless=sentence_obj.input_text.islower())
    sentence_obj.entities = []
    if element_id:
        # Each precomputed prediction row stores a Freebase id path at index 4 and a
        # confidence score at index 6; indices 2 and 3 hold the mention's start offset
        # and length. Keep only candidates above the confidence threshold.
        smart_predictions = [([p[4].replace("/", ".")[1:] for p in candidates if float(p[6]) > self._confidence],
                              int(candidates[0][2]),
                              int(candidates[0][2]) + int(candidates[0][3]))
                             for e, candidates in self.predictions[element_id].items()
                             if len(candidates) > 0]
        for c, s, e in smart_predictions:
            linkings = []
            for p in c:
                kbID = queries.map_f_id(p)
                linkings.append({'fbID': p,
                                 'kbID': kbID,
                                 'label': queries.get_main_entity_label(kbID) if kbID else None})
            sentence_obj.entities.append({"linkings": linkings,
                                          'offsets': (s, e),
                                          'type': 'NNP',
                                          'poss': [],
                                          'token_ids': _offets_to_token_ids(s, e, sentence_obj.tagged),
                                          'tokens': []})
    for e in sentence_obj.entities:
        # If there are many linking candidates we take the top N, since they are still ordered.
        if num_candidates > 0:
            e['linkings'] = e['linkings'][:num_candidates]
        else:
            e['linkings'] = e['linkings'][:self.num_candidates]
    return sentence_obj
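
# _offets_to_token_ids (name kept exactly as referenced above) is not defined in
# this section. A minimal sketch, assuming tagged tokens carry the
# 'characterOffsetBegin'/'characterOffsetEnd' fields used elsewhere in this module;
# the body is a hypothetical reconstruction, not the original implementation:
def _offets_to_token_ids(start, end, tagged):
    """Return the indices of tokens whose character span overlaps [start, end)."""
    return [i for i, t in enumerate(tagged)
            if t['characterOffsetBegin'] < end and t['characterOffsetEnd'] > start]
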
def precompute(self, linker: BaseLinker, verbose=True):
    """
    Extract entities from each dataset instance and compute linking candidates with a BaseLinker.

    :param linker: an instance of a BaseLinker
    :param verbose: if True, progress is indicated
    :return: a dictionary that maps instance ids to candidate annotations
    """
    gold_total = 0
    predicted_correct = 0
    data_iterator = tqdm.tqdm(self.get_samples() + self.get_samples(dev=True),
                              ncols=100, ascii=True, disable=not verbose)
    precomputed_candidates = {}
    for el_id, text, annotations, _ in data_iterator:
        sentence = Sentence(input_text=text)
        sentence = linker.link_entities_in_sentence_obj(sentence)
        entities = [(l.get('kbID'),) + tuple(e['offsets'])
                    for e in sentence.entities for l in e['linkings'] if len(e['linkings']) > 0]
        match = measures.entity_linking_tp_with_overlap(annotations, entities)
        predicted_correct += match
        gold_total += len(annotations)
        recall = predicted_correct / gold_total if gold_total > 0 else 0
        data_iterator.set_postfix(rec=recall)
        precomputed_candidates[el_id] = sentence
    return precomputed_candidates
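
# A usage sketch (hypothetical setup; `dataset` stands for an instance of the
# class that defines precompute):
#
#     linker = HeuristicsLinker(num_candidates=10)
#     candidates = dataset.precompute(linker, verbose=True)
#     # Linkers that support cached candidates can then reuse them via their
#     # _precomputed_candidates attribute, as checked in
#     # link_entities_in_sentence_obj further below.
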
def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1):
    sentence_obj = Sentence(input_text=sentence_obj.input_text,
                            tagged=sentence_obj.tagged,
                            mentions=sentence_obj.mentions,
                            entities=sentence_obj.entities)
    sentence_obj.entities = []
    # Query a DBpedia Spotlight endpoint and map the annotated resources to Wikidata ids.
    params = urllib.parse.urlencode({'text': sentence_obj.input_text,
                                     'confidence': str(self._confidence)})
    request = urllib.request.Request(self._spotlight_url + params)
    request.add_header("Accept", "application/json")
    try:
        content = json.loads(urllib.request.urlopen(request).read())
        sentence_obj.entities = [{"linkings": [{'kbID': queries.map_wikipedia_id(
                                      r.get("@URI")
                                      .replace("http://dbpedia.org/resource/", "")
                                      .replace("http://dbpedia.org/page/", ""))}],
                                  'offsets': (int(r.get('@offset', '0')),
                                              int(r.get('@offset', '0')) + len(r.get('@surfaceForm', "")))}
                                 for r in content.get('Resources', [])]
    except Exception:
        # Leave entities empty if the Spotlight service is unreachable or
        # returns a malformed response.
        pass
    return sentence_obj
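
# Shape of the Spotlight response consumed above (abridged sketch; field names
# follow the public DBpedia Spotlight /annotate API, values are illustrative):
#
#     {"Resources": [{"@URI": "http://dbpedia.org/resource/Norway",
#                     "@offset": "11",
#                     "@surfaceForm": "Norway"}]}
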
def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1) -> Sentence:
    """
    Takes a Sentence object that might already contain a tagged input or recognized mentions.
    This is useful if tagging and mention extraction are done in bulk before the entity
    linking step.

    Supported fields in the sentence_obj object:
        "input_text": raw input text as a string
        "tagged": a list of dict objects, one per token, with the output of the POS and NER
                  taggers, see utils for more info (optional)
        "mentions": a list of dict objects, one per mention, see mention_extraction for more
                    info (optional)
        "entities": extracted entity candidates (optional)
    See Sentence for more info.

    :param sentence_obj: input sentence as a Sentence object; all fields except input_text may be unset
    :param element_id: sentence id to retrieve precomputed candidates for certain linkers
    :param num_candidates: the number of candidate entity links to store for each entity.
                           If set to more than 0, it overrides the class setting.
    :return: the same sentence_obj object with a new field "entities"

    >>> l = HeuristicsLinker()
    >>> l.link_entities_in_sentence_obj(Sentence("Where does Norway get their oil?")).entities[0]['linkings']  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    [('Q20', 'Norway'), ...]
    """
    if self._precomputed_candidates is not None and element_id in self._precomputed_candidates:
        sentence_obj = self._precomputed_candidates[element_id]
    # Copy the (possibly precomputed) Sentence so the stored version is not mutated.
    sentence_obj = Sentence(input_text=sentence_obj.input_text,
                            tagged=sentence_obj.tagged,
                            mentions=sentence_obj.mentions,
                            entities=sentence_obj.entities)
    if sentence_obj.tagged is None and sentence_obj.input_text is not None:
        sentence_obj.tagged = utils.get_tagged_from_server(sentence_obj.input_text,
                                                           caseless=sentence_obj.input_text.islower())
        self.logger.debug([(t['word'], t['pos']) for t in sentence_obj.tagged])
    if sentence_obj.entities is None:
        if sentence_obj.mentions is not None:
            sentence_obj.entities = self._link_mentions_to_entities(sentence_obj.mentions)
        else:
            sentence_obj.entities = self._link_entities_in_tagged_input(sentence_obj.tagged)
        self.logger.debug([e['linkings'][0] for e in sentence_obj.entities])
    elif self.prefer_longer_matches:
        sentence_obj.entities = self._prefer_longer_matches(sentence_obj.entities)
    for e in sentence_obj.entities:
        e['text'] = sentence_obj.input_text
    sentence_obj.entities = [self.compute_candidate_scores(e, tagged_text=sentence_obj.tagged)
                             for e in sentence_obj.entities]
    if self.no_mentions_overlap:
        if not self.one_entity_mode:
            sentence_obj.entities = resolve_entity_overlap_beam_search(sentence_obj.entities)
        else:
            # Keep only the single entity with the lowest drop score.
            sentence_obj.entities = sorted(sentence_obj.entities, key=lambda x: x.get('drop_score', 0.0))
            sentence_obj.entities = sentence_obj.entities[:1]
    # One mention span -> one entity. Each entity can have multiple linking candidates.
    for e in sentence_obj.entities:
        # If there are many linking candidates we take the top N, since they are still ordered.
        if num_candidates > 0:
            e['linkings'] = e['linkings'][:num_candidates]
        else:
            e['linkings'] = e['linkings'][:self.num_candidates]
    return sentence_obj
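
# _prefer_longer_matches is not shown in this section. A minimal sketch of the
# intended behavior, assuming entities carry the 'token_ids' spans built above;
# the body is a hypothetical reconstruction, not the original implementation:
def _prefer_longer_matches(self, entities):
    """Drop entities whose token span is strictly contained in another entity's span."""
    return [e for e in entities
            if not any(set(e['token_ids']) < set(other['token_ids']) for other in entities)]
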
def eval(self, linker: BaseLinker, only_the_main_entity=False, fb=False, verbose=True):
    performance_per_entity_type = defaultdict(lambda: [0, 0, 0])
    predicted_correct = 0
    predicted_total = 0
    gold_total = 0
    data_iterator = tqdm.tqdm(self.get_samples(dev=True, fb=fb),
                              ncols=100, ascii=True, disable=not verbose)
    for el_id, text, annotations, main_entity, gold_entity_classes in data_iterator:
        sentence = Sentence(input_text=text)
        sentence = linker.link_entities_in_sentence_obj(sentence, element_id=el_id)
        entities = [(l.get('kbID'),) + tuple(e['offsets'])
                    for e in sentence.entities for l in e['linkings'] if len(e['linkings']) > 0]
        entity_classes = [queries.get_mapped_entity_type(e[0]) if e else "other" for e in entities]
        if fb:
            entities = [(l.get('fbID'),) + tuple(e['offsets'])
                        for e in sentence.entities for l in e['linkings'] if len(e['linkings']) > 0]
        if only_the_main_entity:
            annotations = [main_entity]
            match = measures.entity_linking_tp_with_overlap(annotations, entities)
        else:
            # Exact-match scoring on entity ids, with per-class counts of
            # (correct, predicted, gold).
            entities = [e[0] for e in entities]
            annotations = [e[0] for e in annotations]
            match = 0
            for ai, a in enumerate(annotations):
                gold_entity_class = gold_entity_classes[ai] \
                    if gold_entity_classes and gold_entity_classes[ai] else "other"
                if a in entities:
                    match += 1
                    performance_per_entity_type[gold_entity_class][0] += 1
                performance_per_entity_type[gold_entity_class][2] += 1
            for entity_class in entity_classes:
                performance_per_entity_type[entity_class][1] += 1
        predicted_correct += match
        predicted_total += len(entities)
        gold_total += len(annotations)
        precision = predicted_correct / predicted_total if predicted_total > 0 else 0
        recall = predicted_correct / gold_total if gold_total > 0 else 0
        f1 = (2.0 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        data_iterator.set_postfix(prec=precision, rec=recall, f1=f1)
    precision = predicted_correct / predicted_total if predicted_total > 0 else 0
    recall = predicted_correct / gold_total if gold_total > 0 else 0
    f1 = (2.0 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    # Convert per-class raw counts (correct, predicted, gold) into (precision, recall, f1).
    for cls, stats in performance_per_entity_type.items():
        cls_correct, cls_predicted, cls_gold = tuple(stats)
        cls_precision = cls_correct / cls_predicted if cls_predicted > 0 else 0
        cls_recall = cls_correct / cls_gold if cls_gold > 0 else 0
        cls_f1 = (2.0 * cls_precision * cls_recall) / (cls_precision + cls_recall) \
            if cls_precision + cls_recall > 0 else 0
        performance_per_entity_type[cls] = (cls_precision, cls_recall, cls_f1)
    return precision, recall, f1, dict(performance_per_entity_type)
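
# The precision/recall/F1 arithmetic in eval is repeated several times; a small
# helper (hypothetical, not part of the original module) would factor it out:
def _prf1(correct, predicted, gold):
    """Return (precision, recall, f1) from raw counts, guarding against zero divisions."""
    precision = correct / predicted if predicted > 0 else 0
    recall = correct / gold if gold > 0 else 0
    f1 = (2.0 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1
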
def eval(self, linker: BaseLinker, only_the_main_entity=False, fb=False, verbose=True, main_entity_given=False):
    performance_per_entity_type = defaultdict(lambda: [0, 0, 0])
    predicted_correct = 0
    predicted_total = 0
    gold_total = 0
    data_iterator = tqdm.tqdm(self.get_samples(dev=True, fb=fb),
                              ncols=100, ascii=True, disable=not verbose)
    for el_id, text, annotations, main_entity, gold_entity_classes in data_iterator:
        sentence = Sentence(input_text=text)
        # TODO SET FALSE
        use_main_entity = main_entity_given
        if use_main_entity:
            if sentence.tagged is None and sentence.input_text is not None:
                sentence.tagged = utils.get_tagged_from_server(sentence.input_text,
                                                               caseless=sentence.input_text.islower())
            # Find the tokens that make up the gold main entity mention.
            for i, t in enumerate(sentence.tagged):
                t['abs_id'] = i
            mention_chunks = []
            for chunk in sentence.tagged:
                if chunk['characterOffsetBegin'] >= main_entity[1] and chunk['characterOffsetEnd'] <= main_entity[2]:
                    mention_chunks.append(chunk)
                if chunk['characterOffsetEnd'] >= main_entity[2]:
                    break
            try:
                # Sanity-check that the collected tokens exactly cover the annotated span.
                assert len(mention_chunks) > 0
                for chunk in mention_chunks:
                    assert chunk['word'] in text[main_entity[1]:main_entity[2]]
                assert mention_chunks[0]['word'] == \
                    text[main_entity[1]:main_entity[1] + len(mention_chunks[0]['word'])]
                assert mention_chunks[-1]['word'] == \
                    text[main_entity[2] - len(mention_chunks[-1]['word']):main_entity[2]]
            except AssertionError:
                print("bad mention: {} {}".format(el_id, main_entity))
                mention_chunks = []
                if el_id == 'WebQTest-1575':
                    # Hard-coded fix for a known mis-annotated instance.
                    mention_chunks.append(sentence.tagged[8])
            fragment_type = ("NNP" if any(el['pos'] in {'NNP', 'NNPS'} for el in mention_chunks)
                             else "NN" if any(el['pos'] != "CD" and el['ner'] != "DATE" for el in mention_chunks)
                             else "DATE" if all(el['ner'] == "DATE" for el in mention_chunks)
                             else utils.unknown_el)
            mentions = [{'type': fragment_type,
                         'tokens': [el['word'] for el in mention_chunks],
                         'token_ids': [el['abs_id'] for el in mention_chunks],
                         'poss': [el['pos'] for el in mention_chunks],
                         'offsets': (main_entity[1], main_entity[2])}]
            sentence.mentions = mentions
        sentence = linker.link_entities_in_sentence_obj(sentence, element_id=el_id)
        entities = [(l.get('kbID'),) + tuple(e['offsets'])
                    for e in sentence.entities for l in e['linkings'] if len(e['linkings']) > 0]
        entity_classes = [queries.get_mapped_entity_type(e[0]) if e else "other" for e in entities]
        if fb:
            entities = [(l.get('fbID'),) + tuple(e['offsets'])
                        for e in sentence.entities for l in e['linkings'] if len(e['linkings']) > 0]
        if only_the_main_entity:
            annotations = [main_entity]
            match = measures.entity_linking_tp_with_overlap(annotations, entities)
        else:
            # Overlap-based scoring, with per-class counts of (correct, predicted, gold).
            match = measures.entity_linking_tp_with_overlap(annotations, entities)
            for ai, a in enumerate(annotations):
                gold_entity_class = gold_entity_classes[ai] \
                    if gold_entity_classes and gold_entity_classes[ai] else "other"
                a_match = measures.entity_linking_tp_with_overlap([a], entities)
                performance_per_entity_type[gold_entity_class][0] += a_match
                performance_per_entity_type[gold_entity_class][2] += 1
            for entity_class in entity_classes:
                performance_per_entity_type[entity_class][1] += 1
            # Previous exact-match scoring, kept for reference:
            # entities = [e[0] for e in entities]
            # annotations = [e[0] for e in annotations]
            # match = 0
            # for ai, a in enumerate(annotations):
            #     gold_entity_class = gold_entity_classes[ai] if gold_entity_classes and gold_entity_classes[ai] else "other"
            #     if a in entities:
            #         match += 1
            #         performance_per_entity_type[gold_entity_class][0] += 1
            #     performance_per_entity_type[gold_entity_class][2] += 1
            # for entity_class in entity_classes:
            #     performance_per_entity_type[entity_class][1] += 1
        predicted_correct += match
        predicted_total += len(entities)
        gold_total += len(annotations)
        precision = predicted_correct / predicted_total if predicted_total > 0 else 0
        recall = predicted_correct / gold_total if gold_total > 0 else 0
        f1 = (2.0 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        data_iterator.set_postfix(prec=precision, rec=recall, f1=f1)
    precision = predicted_correct / predicted_total if predicted_total > 0 else 0
    recall = predicted_correct / gold_total if gold_total > 0 else 0
    f1 = (2.0 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    # Convert per-class raw counts (correct, predicted, gold) into (precision, recall, f1).
    for cls, stats in performance_per_entity_type.items():
        cls_correct, cls_predicted, cls_gold = tuple(stats)
        cls_precision = cls_correct / cls_predicted if cls_predicted > 0 else 0
        cls_recall = cls_correct / cls_gold if cls_gold > 0 else 0
        cls_f1 = (2.0 * cls_precision * cls_recall) / (cls_precision + cls_recall) \
            if cls_precision + cls_recall > 0 else 0
        performance_per_entity_type[cls] = (cls_precision, cls_recall, cls_f1)
    return precision, recall, f1, dict(performance_per_entity_type)
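
# A usage sketch (hypothetical setup; `dataset` stands for an instance of the
# class that defines eval):
#
#     linker = HeuristicsLinker(num_candidates=5)
#     precision, recall, f1, per_type = dataset.eval(linker, verbose=True)
#     print("P={:.3f} R={:.3f} F1={:.3f}".format(precision, recall, f1))
#     for entity_class, (p, r, f) in sorted(per_type.items()):
#         print(entity_class, p, r, f)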