Example #1
    def _get_tars_formatted_sentence(self, label, sentence):
        """Pair the candidate label with the sentence text and transfer matching gold spans."""

        original_text = sentence.to_tokenized_string()

        # prepend or append the candidate label to the original text,
        # depending on whether the tagger places labels as a prefix
        label_text_pair = (f"{label} {self.separator} {original_text}"
                           if self.prefix else
                           f"{original_text} {self.separator} {label}")

        # number of tokens the label and separator add in front of the
        # original text; used below to shift token indices
        label_length = 0 if not self.prefix else len(label.split(" ")) + len(
            self.separator.split(" "))

        # make a tars sentence where all labels are O by default
        tars_sentence = Sentence(label_text_pair, use_tokenizer=False)

        # transfer spans of the current label to the TARS sentence, shifting
        # token indices past the prepended label and separator
        for entity_label in sentence.get_labels(self.label_type):
            if entity_label.value == label:
                new_span = [
                    tars_sentence.get_token(token.idx + label_length)
                    for token in entity_label.span
                ]
                tars_sentence.add_complex_label(
                    self.static_label_type,
                    SpanLabel(Span(new_span), value="entity"))
        return tars_sentence
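
A minimal sketch (no Flair dependency; the helper name and the `separator`/`prefix` arguments are illustrative stand-ins for the tagger's attributes) of how the label/text pair above is assembled and why token indices must be shifted when the label is used as a prefix:

def format_tars_pair(label: str, original_text: str,
                     separator: str = "[SEP]", prefix: bool = True) -> str:
    # With prefix=True the label and separator come first, so the original
    # tokens start len(label.split()) + len(separator.split()) positions later.
    if prefix:
        return f"{label} {separator} {original_text}"
    return f"{original_text} {separator} {label}"

print(format_tars_pair("person", "George Washington went to Washington"))
# -> person [SEP] George Washington went to Washington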
Example #2
    def _label(self, sentence: Sentence):
        """
        This will add a complex_label to the given sentence for every match.span() for every registered_mapping.
        If a match span overlaps with a token span an exception is raised.
        """
        collection = RegexpTagger.TokenCollection(sentence)

        for label, pattern in self._regexp_mapping.items():
            # run the pattern over the raw text and map every character-level
            # match span onto the tokens it covers
            for match in pattern.finditer(sentence.to_original_text()):
                span: Tuple[int, int] = match.span()
                try:
                    token_span = collection.get_token_span(span)
                except ValueError:
                    raise Exception(
                        f"The match span {span} for label '{label}' is overlapping with a token!"
                    )
                sentence.add_complex_label(label, SpanLabel(token_span, label))
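
A small, self-contained sketch of the character-span matching idea above, using only the standard library (Flair's TokenCollection performs the actual character-to-token alignment; the text and pattern here are made up for illustration):

import re

text = "Berlin and Munich are cities ."
pattern = re.compile(r"Berlin|Munich")

for match in pattern.finditer(text):
    start, end = match.span()
    # Each match yields a character-level span that must then be mapped onto
    # whole tokens; in the tagger above, a span that cuts through a token
    # raises an exception instead of being silently clipped.
    print(f"'{match.group()}' covers characters {start}-{end}")
# -> 'Berlin' covers characters 0-6
# -> 'Munich' covers characters 11-17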