def link_entities_in_raw_input(self, input_text: str, element_id: str=None, num_candidates=-1) -> Sentence:
        """
        Link entities in a raw input string and return the result as a Sentence.

        The text is wrapped into a Sentence and handed to link_entities_in_sentence_obj. Afterwards entities
        without any linking are dropped, every remaining entity is reduced to the keys 'type', 'linkings',
        'token_ids', 'poss' and 'tokens', and each linking is replaced by a (kbID, label) tuple.

        :param input_text: the input sentence as a string
        :param element_id: sentence id, forwarded to link_entities_in_sentence_obj
        :param num_candidates: the number of candidate entity links to store for each entity,
                                forwarded to link_entities_in_sentence_obj
        :return: a Sentence whose entities carry (entity id, entity label) tuples as linkings

        NOTE(review): the examples below print a flat list of tuples, while the code returns a Sentence
        object -- confirm that they are still up to date.

        >>> l = HeuristicsLinker(num_candidates=1)
        >>> l.link_entities_in_raw_input("Who wrote the song hotel California?")
        [('Q7366', 'song', (14, 18), [3]), ('Q780394', 'Hotel California', (19, 35), [4, 5])]
        >>> l.link_entities_in_raw_input("Donovan McNabb'strade to the Vikings is in place.")  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        [('Q963185', 'Donovan McNabb', (0, 14), [0, 1]), ...]
        >>> l.link_entities_in_raw_input("what was the queen album?")
        [('Q15862', 'Queen', (13, 18), [3]), ('Q482994', 'album', (20, 24), [4])]
        """
        kept_keys = {'type', 'linkings', 'token_ids', 'poss', 'tokens'}
        linked = self.link_entities_in_sentence_obj(Sentence(input_text=input_text),
                                                    element_id=element_id,
                                                    num_candidates=num_candidates)

        # First pass: keep only entities that have at least one linking, projected onto the kept keys.
        reduced_entities = []
        for entity in linked.entities:
            if len(entity['linkings']) == 0:
                continue
            reduced_entities.append({key: entity[key] for key in kept_keys})
        linked.entities = reduced_entities

        # Second pass: collapse every linking dict into an (id, label) pair.
        for entity in linked.entities:
            entity['linkings'] = [(linking.get('kbID'), linking.get('label'))
                                  for linking in entity['linkings']]
        return linked
Exemplo n.º 2
0
    def link_entities_in_sentence_obj(self,
                                      sentence_obj: Sentence,
                                      element_id=None,
                                      num_candidates=-1):
        """
        Build the entities of a sentence from the predictions stored in self.predictions.

        A new Sentence is created from the fields of sentence_obj; it is tagged with
        utils.get_tagged_from_server if it carries no tagging yet, and its entities are reset.
        If element_id is set, every non-empty candidate list in self.predictions[element_id]
        becomes one entity of type 'NNP' whose linkings are the candidates with a score
        (field 6) above self._confidence.

        :param sentence_obj: input sentence
        :param element_id: key into self.predictions; if falsy, no entities are produced
        :param num_candidates: the number of linkings to keep per entity.
                                If set to more than 0 it overrides self.num_candidates.
        :return: a new Sentence with the field "entities" filled in
        """
        linked = Sentence(input_text=sentence_obj.input_text,
                          tagged=sentence_obj.tagged,
                          mentions=sentence_obj.mentions,
                          entities=sentence_obj.entities)
        if not linked.tagged:
            linked.tagged = utils.get_tagged_from_server(linked.input_text,
                                                         caseless=linked.input_text.islower())
        linked.entities = []

        if element_id:
            # Phase 1: parse all stored predictions into (ids, start, end) triples.
            spans = []
            for _mention, candidates in self.predictions[element_id].items():
                if len(candidates) == 0:
                    continue
                # Field 4 with "/" turned into "." and the leading character dropped
                # (presumably "/m/xyz" -> "m.xyz" -- TODO confirm).
                fb_ids = [row[4].replace("/", ".")[1:]
                          for row in candidates
                          if float(row[6]) > self._confidence]
                # The span is taken from the first candidate: field 2 is the start, field 3 is added to it.
                start = int(candidates[0][2])
                end = start + int(candidates[0][3])
                spans.append((fb_ids, start, end))

            # Phase 2: map the ids to the knowledge base and assemble the entity records.
            for fb_ids, start, end in spans:
                linkings = []
                for fb_id in fb_ids:
                    kb_id = queries.map_f_id(fb_id)
                    label = queries.get_main_entity_label(kb_id) if kb_id else None
                    linkings.append({'fbID': fb_id, 'kbID': kb_id, 'label': label})
                linked.entities.append({
                    "linkings": linkings,
                    'offsets': (start, end),
                    'type': 'NNP',
                    'poss': [],
                    'token_ids': _offets_to_token_ids(start, end, linked.tagged),
                    'tokens': []
                })

        # The linkings keep their stored order, so truncating retains the leading candidates.
        for entity in linked.entities:
            limit = num_candidates if num_candidates > 0 else self.num_candidates
            entity['linkings'] = entity['linkings'][:limit]

        return linked
    def precompute(self, linker: BaseLinker, verbose=True):
        """
        Run a linker over every dataset instance and collect the linked sentences.

        The samples of get_samples() and get_samples(dev=True) are processed in that order. The running
        recall, based on measures.entity_linking_tp_with_overlap, is shown in the progress bar.

        :param linker: an instance of a BaseLinker
        :param verbose: if True, progress is indicated
        :return: a dictionary that maps instance ids to the Sentence objects returned by the linker
        """
        samples = self.get_samples() + self.get_samples(dev=True)
        progress = tqdm.tqdm(samples, ncols=100, ascii=True, disable=not verbose)

        sentences_by_id = {}
        correct_so_far = 0
        gold_so_far = 0

        for el_id, text, annotations, _ in progress:
            linked = linker.link_entities_in_sentence_obj(Sentence(input_text=text))

            # One (kbID, *offsets) tuple per linking candidate of every entity.
            predicted = []
            for entity in linked.entities:
                if len(entity['linkings']) == 0:
                    continue
                for linking in entity['linkings']:
                    predicted.append((linking.get('kbID'), ) + tuple(entity['offsets']))

            correct_so_far += measures.entity_linking_tp_with_overlap(annotations, predicted)
            gold_so_far += len(annotations)
            progress.set_postfix(rec=correct_so_far / gold_so_far if gold_so_far > 0 else 0)

            sentences_by_id[el_id] = linked

        return sentences_by_id
    def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1):
        """
        Link entities in a sentence by querying the HTTP service at self._spotlight_url.

        The input text and self._confidence are sent as URL parameters and a JSON answer is requested.
        Every item of the "Resources" list in the answer becomes one entity with a single linking whose
        'kbID' is obtained with queries.map_wikipedia_id from the item's "@URI" (with the DBpedia
        resource/page prefix removed). The offsets are computed from "@offset" and the length of
        "@surfaceForm".

        The call is best effort: if the request, the parsing or the id mapping fails, the failure is
        logged and the sentence is returned without entities.

        :param sentence_obj: input sentence
        :param element_id: unused by this linker
        :param num_candidates: unused by this linker, every entity gets exactly one linking
        :return: a new Sentence with the field "entities" filled in
        """
        import logging  # local import: the file-level import block is not visible from this method

        sentence_obj = Sentence(input_text=sentence_obj.input_text, tagged=sentence_obj.tagged,
                                mentions=sentence_obj.mentions, entities=sentence_obj.entities)
        sentence_obj.entities = []

        params = urllib.parse.urlencode({'text': sentence_obj.input_text, 'confidence': str(self._confidence)})
        request = urllib.request.Request(self._spotlight_url + params)
        request.add_header("Accept", "application/json")
        try:
            # The context manager closes the HTTP response even if reading or parsing fails.
            with urllib.request.urlopen(request) as response:
                content = json.loads(response.read())

            entities = []
            for resource in content.get('Resources', []):
                page_name = (resource.get("@URI")
                             .replace("http://dbpedia.org/resource/", "")
                             .replace("http://dbpedia.org/page/", ""))
                start = int(resource.get('@offset', '0'))
                entities.append({
                    "linkings": [{'kbID': queries.map_wikipedia_id(page_name)}],
                    'offsets': (start, start + len(resource.get('@surfaceForm', "")))
                })
            # Assign only after every resource was processed, so a failure never leaves partial results.
            sentence_obj.entities = entities
        except Exception:
            # Deliberately broad (network, JSON and mapping errors are all tolerated), but unlike a bare
            # "except:" it lets KeyboardInterrupt/SystemExit through and leaves a trace of the failure.
            logging.getLogger(__name__).warning("Entity linking request failed for %r",
                                                sentence_obj.input_text, exc_info=True)
        return sentence_obj
    def link_entities_in_sentence_obj(self, sentence_obj: Sentence, element_id=None, num_candidates=-1) -> Sentence:
        """
        Link entities in a sentence object that might already contain a tagged input or recognized mentions.

        This is useful if tagging and mention extraction are done in bulk before the entity linking step.
        Fields of sentence_obj that are read:
            "input_text": raw input text as a string
            "tagged": the tagger output for the input; requested from utils.get_tagged_from_server
                      if missing (optional)
            "mentions": already extracted mentions; linked with _link_mentions_to_entities if no entities
                        are present (optional)
            "entities": already extracted entity candidates (optional)
        See Sentence for more info.

        If precomputed candidates exist for element_id, the precomputed sentence replaces sentence_obj.

        :param sentence_obj: input sentence
        :param element_id: sentence id to retrieve precomputed candidates
        :param num_candidates: the number of candidate entity links to store for each entity.
                                If set to more than 0 it will override the class setting.
        :return: a new Sentence built from the fields of the input, with the field "entities" filled in

        NOTE(review): the example below prints linkings as tuples -- confirm that it is still up to date.

        >>> l = HeuristicsLinker()
        >>> l.link_entities_in_sentence_obj(Sentence("Where does Norway get their oil?")).entities[0]['linkings']  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
        [('Q20', 'Norway'), ...]
        """
        precomputed = self._precomputed_candidates
        if precomputed is not None and element_id in precomputed:
            sentence_obj = precomputed[element_id]

        linked = Sentence(input_text=sentence_obj.input_text, tagged=sentence_obj.tagged,
                          mentions=sentence_obj.mentions, entities=sentence_obj.entities)

        needs_tagging = linked.tagged is None and linked.input_text is not None
        if needs_tagging:
            linked.tagged = utils.get_tagged_from_server(linked.input_text,
                                                         caseless=linked.input_text.islower())
            self.logger.debug([(t['word'], t['pos']) for t in linked.tagged])

        # Entity candidates: reuse the given ones, else derive them from mentions, else from the tagging.
        if linked.entities is not None:
            if self.prefer_longer_matches:
                linked.entities = self._prefer_longer_matches(linked.entities)
        elif linked.mentions is not None:
            linked.entities = self._link_mentions_to_entities(linked.mentions)
        else:
            linked.entities = self._link_entities_in_tagged_input(linked.tagged)
            self.logger.debug([e['linkings'][0] for e in linked.entities])

        for entity in linked.entities:
            entity['text'] = linked.input_text
        linked.entities = [self.compute_candidate_scores(entity, tagged_text=linked.tagged)
                           for entity in linked.entities]

        if self.no_mentions_overlap:
            if self.one_entity_mode:
                # Keep only the entity with the smallest 'drop_score' (missing scores count as 0.0).
                ranked = sorted(linked.entities, key=lambda x: x.get('drop_score', 0.0))
                linked.entities = ranked[:1]
            else:
                linked.entities = resolve_entity_overlap_beam_search(linked.entities)

        # One mention span -> one entity. Each entity can have multiple linking candidates.
        # If there are many linking candidates we take the top N, since they are still ordered.
        for entity in linked.entities:
            limit = num_candidates if num_candidates > 0 else self.num_candidates
            entity['linkings'] = entity['linkings'][:limit]

        return linked
    def eval(self,
             linker: BaseLinker,
             only_the_main_entity=False,
             fb=False,
             verbose=True):
        """
        Evaluate a linker on the samples returned by get_samples(dev=True, fb=fb).

        Micro-averaged precision, recall and F1 are accumulated over all samples and shown in the
        progress bar. Unless only_the_main_entity is set, the same statistics are also collected per
        entity class: gold annotations are counted under their gold class, predictions under the class
        returned by queries.get_mapped_entity_type.

        :param linker: an instance of a BaseLinker
        :param only_the_main_entity: if True, the gold annotations are replaced by the main entity and
                                     matching is done with measures.entity_linking_tp_with_overlap;
                                     otherwise only the entity ids are compared
        :param fb: passed on to get_samples; if True, the 'fbID' of a linking is evaluated instead of 'kbID'
        :param verbose: if True, progress is indicated
        :return: a tuple (precision, recall, f1, per-class dict mapping a class to (precision, recall, f1))
        """
        def _prf(correct, predicted, gold):
            # Precision, recall and F1 from raw counts; a zero denominator yields 0.
            p = correct / predicted if predicted > 0 else 0
            r = correct / gold if gold > 0 else 0
            f = (2.0 * p * r) / (p + r) if p + r > 0 else 0
            return p, r, f

        def _predictions(sentence, id_key):
            # One (id, *offsets) tuple per linking candidate of every entity.
            collected = []
            for entity in sentence.entities:
                if len(entity['linkings']) == 0:
                    continue
                for linking in entity['linkings']:
                    collected.append((linking.get(id_key), ) + tuple(entity['offsets']))
            return collected

        # class -> [correct, predicted, gold]
        per_class = defaultdict(lambda: [0, 0, 0])
        correct_total = 0
        predicted_count = 0
        gold_count = 0

        progress = tqdm.tqdm(self.get_samples(dev=True, fb=fb),
                             ncols=100,
                             ascii=True,
                             disable=not verbose)

        for el_id, text, annotations, main_entity, gold_entity_classes in progress:
            linked = linker.link_entities_in_sentence_obj(Sentence(input_text=text),
                                                          element_id=el_id)
            predicted = _predictions(linked, 'kbID')
            # NOTE(review): every item is a non-empty tuple, so the "other" fallback never triggers here.
            predicted_classes = [
                queries.get_mapped_entity_type(item[0]) if item else "other"
                for item in predicted
            ]
            if fb:
                predicted = _predictions(linked, 'fbID')

            if only_the_main_entity:
                annotations = [main_entity]
                match = measures.entity_linking_tp_with_overlap(annotations, predicted)
            else:
                # Offsets are ignored in this mode: only the ids are compared.
                predicted = [item[0] for item in predicted]
                annotations = [item[0] for item in annotations]
                match = 0
                for position, gold_id in enumerate(annotations):
                    if gold_entity_classes and gold_entity_classes[position]:
                        gold_class = gold_entity_classes[position]
                    else:
                        gold_class = "other"
                    class_stats = per_class[gold_class]
                    if gold_id in predicted:
                        match += 1
                        class_stats[0] += 1
                    class_stats[2] += 1
                for predicted_class in predicted_classes:
                    per_class[predicted_class][1] += 1

            correct_total += match
            predicted_count += len(predicted)
            gold_count += len(annotations)
            precision, recall, f1 = _prf(correct_total, predicted_count, gold_count)
            progress.set_postfix(prec=precision, rec=recall, f1=f1)

        precision, recall, f1 = _prf(correct_total, predicted_count, gold_count)
        class_scores = {cls: _prf(*stats) for cls, stats in per_class.items()}

        return precision, recall, f1, class_scores
Exemplo n.º 7
0
    def eval(self,
             linker: BaseLinker,
             only_the_main_entity=False,
             fb=False,
             verbose=True,
             main_entity_given=False):
        """
        Evaluate a linker on the samples returned by get_samples(dev=True, fb=fb).

        Matching is done with measures.entity_linking_tp_with_overlap. Micro-averaged precision, recall
        and F1 are accumulated over all samples and shown in the progress bar. Unless only_the_main_entity
        is set, the same statistics are also collected per entity class: gold annotations are counted under
        their gold class, predictions under the class returned by queries.get_mapped_entity_type.

        :param linker: an instance of a BaseLinker
        :param only_the_main_entity: if True, the gold annotations are replaced by the main entity
        :param fb: passed on to get_samples; if True, the 'fbID' of a linking is evaluated instead of 'kbID'
        :param verbose: if True, progress is indicated
        :param main_entity_given: if True, the span of the main entity (main_entity[1] to main_entity[2])
                                  is turned into the only mention of the sentence before linking
        :return: a tuple (precision, recall, f1, per-class dict mapping a class to (precision, recall, f1))
        """
        def _prf(correct, predicted, gold):
            # Precision, recall and F1 from raw counts; a zero denominator yields 0.
            p = correct / predicted if predicted > 0 else 0
            r = correct / gold if gold > 0 else 0
            f = (2.0 * p * r) / (p + r) if p + r > 0 else 0
            return p, r, f

        # class -> [correct, predicted, gold]
        performance_per_entity_type = defaultdict(lambda: [0, 0, 0])
        predicted_correct = 0
        predicted_total = 0
        gold_total = 0

        data_iterator = tqdm.tqdm(self.get_samples(dev=True, fb=fb),
                                  ncols=100,
                                  ascii=True,
                                  disable=not verbose)

        for el_id, text, annotations, main_entity, gold_entity_classes in data_iterator:
            sentence = Sentence(input_text=text)
            if main_entity_given:
                if sentence.tagged is None and sentence.input_text is not None:
                    sentence.tagged = utils.get_tagged_from_server(
                        sentence.input_text,
                        caseless=sentence.input_text.islower())
                for i, t in enumerate(sentence.tagged):
                    t['abs_id'] = i

                # Collect the tokens that lie completely inside the main entity span.
                mention_start, mention_end = main_entity[1], main_entity[2]
                mention_chunks = []
                for chunk in sentence.tagged:
                    if (chunk['characterOffsetBegin'] >= mention_start
                            and chunk['characterOffsetEnd'] <= mention_end):
                        mention_chunks.append(chunk)
                    if chunk['characterOffsetEnd'] >= mention_end:
                        break

                # Explicit validation instead of assert statements: asserts are removed under
                # "python -O", which would silently accept misaligned mentions.
                mention_text = text[mention_start:mention_end]
                mention_is_consistent = (
                    len(mention_chunks) > 0
                    and all(chunk['word'] in mention_text for chunk in mention_chunks)
                    and mention_chunks[0]['word'] ==
                    text[mention_start:mention_start + len(mention_chunks[0]['word'])]
                    and mention_chunks[-1]['word'] ==
                    text[mention_end - len(mention_chunks[-1]['word']):mention_end]
                )
                if not mention_is_consistent:
                    print("bad mention: {} {}".format(el_id, main_entity))
                    mention_chunks = []
                    # Hard-coded exception for a single sample: its token 8 is used as the mention.
                    if el_id == 'WebQTest-1575':
                        mention_chunks.append(sentence.tagged[8])

                # NOTE(review): with an empty mention_chunks list both any() checks are False and all()
                # is True, so the type becomes "DATE" -- confirm that this is intended.
                if any(el['pos'] in {'NNP', 'NNPS'} for el in mention_chunks):
                    fragment_type = "NNP"
                elif any(el['pos'] != "CD" and el['ner'] != "DATE" for el in mention_chunks):
                    fragment_type = "NN"
                elif all(el['ner'] == "DATE" for el in mention_chunks):
                    fragment_type = "DATE"
                else:
                    fragment_type = utils.unknown_el

                sentence.mentions = [{
                    'type': fragment_type,
                    'tokens': [el['word'] for el in mention_chunks],
                    'token_ids': [el['abs_id'] for el in mention_chunks],
                    'poss': [el['pos'] for el in mention_chunks],
                    'offsets': (mention_start, mention_end)
                }]

            sentence = linker.link_entities_in_sentence_obj(sentence,
                                                            element_id=el_id)
            # One (id, *offsets) tuple per linking candidate of every entity.
            entities = [(l.get('kbID'), ) + tuple(e['offsets'])
                        for e in sentence.entities for l in e['linkings']]
            # NOTE(review): every item is a non-empty tuple, so the "other" fallback never triggers here.
            entity_classes = [
                queries.get_mapped_entity_type(e[0]) if e else "other"
                for e in entities
            ]
            if fb:
                entities = [(l.get('fbID'), ) + tuple(e['offsets'])
                            for e in sentence.entities for l in e['linkings']]

            if only_the_main_entity:
                annotations = [main_entity]
                match = measures.entity_linking_tp_with_overlap(
                    annotations, entities)
            else:
                match = measures.entity_linking_tp_with_overlap(
                    annotations, entities)
                for ai, a in enumerate(annotations):
                    if gold_entity_classes and gold_entity_classes[ai]:
                        gold_entity_class = gold_entity_classes[ai]
                    else:
                        gold_entity_class = "other"
                    # Each gold annotation is matched on its own to attribute hits to its class.
                    a_match = measures.entity_linking_tp_with_overlap([a], entities)
                    performance_per_entity_type[gold_entity_class][0] += a_match
                    performance_per_entity_type[gold_entity_class][2] += 1
                for entity_class in entity_classes:
                    performance_per_entity_type[entity_class][1] += 1

            predicted_correct += match
            predicted_total += len(entities)
            gold_total += len(annotations)
            precision, recall, f1 = _prf(predicted_correct, predicted_total, gold_total)
            data_iterator.set_postfix(prec=precision, rec=recall, f1=f1)

        precision, recall, f1 = _prf(predicted_correct, predicted_total, gold_total)
        class_scores = {cls: _prf(*stats)
                        for cls, stats in performance_per_entity_type.items()}

        return precision, recall, f1, class_scores