def link_entities_in_raw_input(self, input_text: str, element_id: str = None, num_candidates=-1) -> Sentence:
    """
    Takes a raw input string, extracts mentions and links them to the most probable entities
    for the given input text.

    :param input_text: the input sentence as a string
    :param element_id: sentence id
    :param num_candidates: the number of candidate entity links to store for each entity.
                           If set to more than 0, it overrides the class setting.
    :return: the Sentence object; each entity's 'linkings' is reduced to a list of tuples where
             the first element is the entity id and the second is the entity label

    >>> l = HeuristicsLinker(num_candidates=1)
    >>> l.link_entities_in_raw_input("Who wrote the song hotel California?")
    [('Q7366', 'song', (14, 18), [3]), ('Q780394', 'Hotel California', (19, 35), [4, 5])]
    >>> l.link_entities_in_raw_input("Donovan McNabb's trade to the Vikings is in place.")  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    [('Q963185', 'Donovan McNabb', (0, 14), [0, 1]), ...]
    >>> l.link_entities_in_raw_input("what was the queen album?")
    [('Q15862', 'Queen', (13, 18), [3]), ('Q482994', 'album', (20, 24), [4])]
    """
    sentence = Sentence(input_text=input_text)
    sentence = self.link_entities_in_sentence_obj(sentence, element_id=element_id,
                                                  num_candidates=num_candidates)
    # Keep only entities that were successfully linked and only the fields needed downstream.
    sentence.entities = [{k: e[k] for k in {'type', 'linkings', 'token_ids', 'poss', 'tokens'}}
                         for e in sentence.entities if len(e['linkings']) > 0]
    # Reduce each linking to an (entity id, entity label) tuple.
    for e in sentence.entities:
        e['linkings'] = [(l.get('kbID'), l.get('label')) for l in e['linkings']]
    return sentence
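
# Usage sketch (hypothetical driver code, assuming the HeuristicsLinker from the doctests
# above and a running tagging server; the output shape follows the method's own code):
#
#     linker = HeuristicsLinker(num_candidates=1)
#     sentence = linker.link_entities_in_raw_input("Who wrote the song hotel California?")
#     for entity in sentence.entities:
#         for kb_id, label in entity['linkings']:
#             print(kb_id, label)   # e.g. Q780394 Hotel California
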
def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1):
    # Work on a copy so that the caller's Sentence object is not modified in place.
    sentence_obj = Sentence(input_text=sentence_obj.input_text,
                            tagged=sentence_obj.tagged,
                            mentions=sentence_obj.mentions,
                            entities=sentence_obj.entities)
    if not sentence_obj.tagged:
        sentence_obj.tagged = utils.get_tagged_from_server(sentence_obj.input_text,
                                                           caseless=sentence_obj.input_text.islower())
    sentence_obj.entities = []
    if element_id:
        # For each mention, keep the Freebase ids of the candidates above the confidence
        # threshold, together with the mention's character offsets (start, start + length).
        smart_predictions = [([p[4].replace("/", ".")[1:] for p in candidates if float(p[6]) > self._confidence],
                              int(candidates[0][2]),
                              int(candidates[0][2]) + int(candidates[0][3]))
                             for e, candidates in self.predictions[element_id].items() if len(candidates) > 0]
        for c, s, e in smart_predictions:
            linkings = []
            for p in c:
                # Map the Freebase id to a Wikidata id and resolve the entity's main label.
                kbID = queries.map_f_id(p)
                linkings.append({'fbID': p,
                                 'kbID': kbID,
                                 'label': queries.get_main_entity_label(kbID) if kbID else None})
            sentence_obj.entities.append({"linkings": linkings,
                                          'offsets': (s, e),
                                          'type': 'NNP',
                                          'poss': [],
                                          'token_ids': _offets_to_token_ids(s, e, sentence_obj.tagged),
                                          'tokens': []})
    for e in sentence_obj.entities:
        # If there are many linking candidates we take the top N, since they are still ordered.
        if num_candidates > 0:
            e['linkings'] = e['linkings'][:num_candidates]
        else:
            e['linkings'] = e['linkings'][:self.num_candidates]
    return sentence_obj
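
# Shape of the precomputed predictions consumed above (an illustration inferred from the
# indexing in the method, not a verified format): each candidate is a tuple where index 2
# is the mention's character offset, index 3 its length, index 4 a Freebase id such as
# "/m/0xxxx", and index 6 the linker's confidence score.
#
#     self.predictions = {
#         "q-001": {                                       # element_id -> mentions
#             "mention-0": [                               # mention -> ordered candidates
#                 (_, _, "11", "6", "/m/0xxxx", _, "0.93"),
#             ],
#         },
#     }
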
def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1):
    # Work on a copy so that the caller's Sentence object is not modified in place.
    sentence_obj = Sentence(input_text=sentence_obj.input_text,
                            tagged=sentence_obj.tagged,
                            mentions=sentence_obj.mentions,
                            entities=sentence_obj.entities)
    sentence_obj.entities = []
    params = urllib.parse.urlencode({'text': sentence_obj.input_text,
                                     'confidence': str(self._confidence)})
    request = urllib.request.Request(self._spotlight_url + params)
    request.add_header("Accept", "application/json")
    try:
        content = json.loads(urllib.request.urlopen(request).read())
        # Each Spotlight resource becomes an entity with a single linking: the DBpedia URI
        # is stripped down to the page title and mapped to a Wikidata id.
        sentence_obj.entities = [{"linkings": [{'kbID': queries.map_wikipedia_id(
                                                    r.get("@URI")
                                                     .replace("http://dbpedia.org/resource/", "")
                                                     .replace("http://dbpedia.org/page/", ""))}],
                                  'offsets': (int(r.get('@offset', '0')),
                                              int(r.get('@offset', '0')) + len(r.get('@surfaceForm', "")))}
                                 for r in content.get('Resources', [])]
    except Exception:
        # If the Spotlight service is unreachable or returns malformed JSON,
        # fall through and return an empty entity list.
        pass
    return sentence_obj
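
# The parsing above expects DBpedia Spotlight's JSON annotation format, roughly as sketched
# below (only the fields read by the method are shown):
#
#     {"Resources": [{"@URI": "http://dbpedia.org/resource/Norway",
#                     "@offset": "11",
#                     "@surfaceForm": "Norway"}]}
#
# Note that this linker stores a single linking per entity with only a 'kbID' and 'offsets',
# unlike the other linkers in this module, which also fill 'label', 'token_ids', etc.
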
def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1) -> Sentence:
    """
    The method takes a Sentence object that might already contain a tagged input or recognized
    mentions. This is useful if tagging and mention extraction are done in bulk before the
    entity linking step.

    Supported fields in the sentence_obj object:
        "input_text": raw input text as a string
        "tagged": a list of dicts, one per token, with the output of the POS and NER taggers,
                  see utils for more info (optional)
        "mentions": a list of dicts, one per mention, see mention_extraction for more info (optional)
        "entities": extracted entity candidates (optional)
    See Sentence for more info.

    :param sentence_obj: the input sentence as a Sentence object, might be empty
    :param element_id: sentence id, used to retrieve precomputed candidates for certain linkers
    :param num_candidates: the number of candidate entity links to store for each entity.
                           If set to more than 0, it overrides the class setting.
    :return: the same sentence_obj object with a new field "entities"

    >>> l = HeuristicsLinker()
    >>> l.link_entities_in_sentence_obj(Sentence("Where does Norway get their oil?")).entities[0]['linkings']  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    [('Q20', 'Norway'), ...]
    """
    if self._precomputed_candidates is not None and element_id in self._precomputed_candidates:
        sentence_obj = self._precomputed_candidates[element_id]
    # Work on a copy so that the caller's Sentence object is not modified in place.
    sentence_obj = Sentence(input_text=sentence_obj.input_text,
                            tagged=sentence_obj.tagged,
                            mentions=sentence_obj.mentions,
                            entities=sentence_obj.entities)
    if sentence_obj.tagged is None and sentence_obj.input_text is not None:
        sentence_obj.tagged = utils.get_tagged_from_server(sentence_obj.input_text,
                                                           caseless=sentence_obj.input_text.islower())
        self.logger.debug([(t['word'], t['pos']) for t in sentence_obj.tagged])
    if sentence_obj.entities is None:
        # Link either the pre-extracted mentions or, failing that, the tagged tokens directly.
        if sentence_obj.mentions is not None:
            sentence_obj.entities = self._link_mentions_to_entities(sentence_obj.mentions)
        else:
            sentence_obj.entities = self._link_entities_in_tagged_input(sentence_obj.tagged)
        self.logger.debug([e['linkings'][0] for e in sentence_obj.entities])
    elif self.prefer_longer_matches:
        sentence_obj.entities = self._prefer_longer_matches(sentence_obj.entities)
    for e in sentence_obj.entities:
        e['text'] = sentence_obj.input_text
    sentence_obj.entities = [self.compute_candidate_scores(e, tagged_text=sentence_obj.tagged)
                             for e in sentence_obj.entities]
    if self.no_mentions_overlap:
        if not self.one_entity_mode:
            sentence_obj.entities = resolve_entity_overlap_beam_search(sentence_obj.entities)
        else:
            # In one-entity mode, keep only the single entity with the lowest drop score.
            sentence_obj.entities = sorted(sentence_obj.entities, key=lambda x: x.get('drop_score', 0.0))
            sentence_obj.entities = sentence_obj.entities[:1]
    # One mention span -> one entity. Each entity can have multiple linking candidates.
    for e in sentence_obj.entities:
        # If there are many linking candidates we take the top N, since they are still ordered.
        if num_candidates > 0:
            e['linkings'] = e['linkings'][:num_candidates]
        else:
            e['linkings'] = e['linkings'][:self.num_candidates]
    return sentence_obj
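
# Bulk pre-tagging sketch (hypothetical driver code for the use case the docstring above
# describes): tag all sentences first, then run the linking step over the tagged objects.
#
#     linker = HeuristicsLinker()
#     sentences = [Sentence(input_text=t) for t in texts]
#     for s in sentences:
#         s.tagged = utils.get_tagged_from_server(s.input_text,
#                                                 caseless=s.input_text.islower())
#     linked = [linker.link_entities_in_sentence_obj(s) for s in sentences]
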