def _handle_bilou_label(self, word_idx: int,
                            entities: List[Any]) -> Tuple[Any, Any, Any]:
        """Resolve the most likely BILOU tag at ``word_idx`` into an entity.

        Args:
            word_idx: Index of the token whose predicted tag is inspected.
            entities: Per-token predictions, passed through unchanged to
                ``most_likely_entity`` and ``_find_bilou_end``.

        Returns:
            A ``(index, confidence, entity_name)`` triple. For a "U" tag the
            index is ``word_idx`` itself; for a "B" tag the index and
            confidence are whatever ``_find_bilou_end`` reports. Any other
            tag yields ``(None, None, None)``.
        """
        tag, tag_confidence = self.most_likely_entity(word_idx, entities)
        entity_name = bilou_utils.entity_name_from_tag(tag)
        prefix = bilou_utils.bilou_prefix_from_tag(tag)

        # Single-token entity: the token itself is the whole extent.
        if prefix == "U":
            return word_idx, tag_confidence, entity_name

        # Start of a multi-word entity: the whole extent has to be represented,
        # so the end lookup is delegated to the helper.
        if prefix == "B":
            end_idx, span_confidence = self._find_bilou_end(word_idx, entities)
            return end_idx, span_confidence, entity_name

        # Every other tag does not open an entity at this position.
        return None, None, None
    def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
    ) -> List[CRFToken]:
        """Convert json examples to format of underlying crfsuite.

        Tokens of the message are tagged from ``entity_offsets``. Every run of
        consecutive tokens tagged ``"-"`` is reported as a misaligned
        annotation, including a run that extends to the last token of the
        message. When the ``BILOU_flag`` option is disabled, the B/I/L/U
        prefixes are stripped from the tags before conversion.

        Args:
            message: The training example to convert.
            entity_offsets: ``(start, end, entity)`` triples of the annotations.

        Returns:
            The result of ``_from_text_to_crf`` for the message and its tags.
        """

        def _warn_misaligned(bad_tokens: List) -> None:
            # Emit one warning covering a whole run of misaligned tokens.
            collected_text = " ".join([tok.text for tok in bad_tokens])
            common_utils.raise_warning(
                f"Misaligned entity annotation for '{collected_text}' "
                f"in sentence '{message.text}' with intent "
                f"'{message.get('intent')}'. "
                f"Make sure the start and end values of the "
                f"annotated training examples end at token "
                f"boundaries (e.g. don't include trailing "
                f"whitespaces or punctuation).",
                docs=DOCS_URL_TRAINING_DATA_NLU,
            )

        tokens = self._tokens_without_cls(message)
        ents = bilou_utils.bilou_tags_from_offsets(tokens, entity_offsets)

        # collect badly annotated examples
        misaligned = []
        for token, tag in zip(tokens, ents):
            if tag == "-":
                misaligned.append(token)
            elif misaligned:
                _warn_misaligned(misaligned)
                misaligned = []

        # A misaligned run that reaches the end of the message has no
        # following token to trigger the warning inside the loop, so it is
        # flushed here instead of being silently dropped.
        if misaligned:
            _warn_misaligned(misaligned)

        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if bilou_utils.bilou_prefix_from_tag(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = bilou_utils.entity_name_from_tag(label)

        return self._from_text_to_crf(message, ents)
示例#3
0
    def convert_predictions_into_entities(
        text: Text,
        tokens: List[Token],
        tags: Dict[Text, List[Text]],
        split_entities_config: Dict[Text, bool] = None,
        confidences: Optional[Dict[Text, List[float]]] = None,
    ) -> List[Dict[Text, Any]]:
        """Build entity dictionaries from per-token tag predictions.

        Tokens are visited in order. A token either opens a new entity or
        extends the most recently created one; once all tokens are processed,
        each entity's value is filled in from the matching slice of ``text``.

        Args:
            text: The text message.
            tokens: Message tokens without CLS token.
            tags: Predicted tags.
            split_entities_config: config for handling splitting a list of entities
            confidences: Confidences of predicted tags.

        Returns:
            Entities.
        """
        import rasa.nlu.utils.bilou_utils as bilou_utils

        found = []

        prev_entity_tag = NO_ENTITY_TAG
        prev_role_tag = NO_ENTITY_TAG
        prev_group_tag = NO_ENTITY_TAG
        prev_token_end = -1

        for position, token in enumerate(tokens):
            raw_entity_tag = EntityExtractor.get_tag_for(
                tags, ENTITY_ATTRIBUTE_TYPE, position)

            # Tokens without an entity only reset the running entity tag and
            # move the end marker; group/role history is left untouched.
            if raw_entity_tag == NO_ENTITY_TAG:
                prev_entity_tag = NO_ENTITY_TAG
                prev_token_end = token.end
                continue

            group_tag = bilou_utils.tag_without_prefix(
                EntityExtractor.get_tag_for(
                    tags, ENTITY_ATTRIBUTE_GROUP, position))
            role_tag = bilou_utils.tag_without_prefix(
                EntityExtractor.get_tag_for(
                    tags, ENTITY_ATTRIBUTE_ROLE, position))

            tags_changed = (prev_group_tag != group_tag
                            or prev_role_tag != role_tag)

            bilou_prefix = bilou_utils.bilou_prefix_from_tag(raw_entity_tag)
            if bilou_prefix:
                # I- and L- tags continue a span, so on their own they never
                # count as the beginning of a new BILOU tag.
                continues_span = bilou_prefix in (bilou_utils.LAST,
                                                  bilou_utils.INSIDE)
                begins_span = (raw_entity_tag != prev_entity_tag
                               and not continues_span)

                # Covers I-/L- tags that appear without a preceding B- tag as
                # well as several U- tags in a row.
                stands_alone = (prev_entity_tag == NO_ENTITY_TAG
                                or bilou_prefix == bilou_utils.UNIT)

                is_new_entity = begins_span or stands_alone or tags_changed
                entity_tag = bilou_utils.tag_without_prefix(raw_entity_tag)
            else:
                is_new_entity = (raw_entity_tag != prev_entity_tag
                                 or tags_changed)
                entity_tag = raw_entity_tag

            # The comparison on the next token uses the tag with its prefix.
            prev_entity_tag = raw_entity_tag

            # The single-entity check is only consulted when the tags alone
            # did not already call for a new entity.
            if not is_new_entity and EntityExtractor._check_is_single_entity(
                    text, token, prev_token_end, split_entities_config,
                    entity_tag):
                # Same entity as the previous token and the helper accepted
                # the gap between them: stretch the last entity over this
                # token.
                found[-1][ENTITY_ATTRIBUTE_END] = token.end
                if confidences is not None:
                    EntityExtractor._update_confidence_values(
                        found, confidences, position)
            else:
                # Either the tags changed, or the helper decided the two
                # tokens should not be represented as a single entity.
                found.append(
                    EntityExtractor._create_new_entity(
                        list(tags.keys()),
                        entity_tag,
                        group_tag,
                        role_tag,
                        token,
                        position,
                        confidences,
                    ))

            prev_group_tag = group_tag
            prev_role_tag = role_tag
            prev_token_end = token.end

        # Entity boundaries are final now, so the surface values can be cut
        # out of the original text.
        for entity in found:
            span_start = entity[ENTITY_ATTRIBUTE_START]
            span_end = entity[ENTITY_ATTRIBUTE_END]
            entity[ENTITY_ATTRIBUTE_VALUE] = text[span_start:span_end]

        return found
示例#4
0
    def convert_predictions_into_entities(
        self,
        text: Text,
        tokens: List[Token],
        tags: Dict[Text, List[Text]],
        confidences: Optional[Dict[Text, List[float]]] = None,
    ) -> List[Dict[Text, Any]]:
        """
        Turn per-token tag predictions into a list of entities.

        Tokens are walked in order; each tagged token either starts a new
        entity or lengthens the one created last. After the walk, every
        entity receives its value as the matching slice of ``text``.

        Args:
            text: The text message.
            tokens: Message tokens without CLS token.
            tags: Predicted tags.
            confidences: Confidences of predicted tags.

        Returns:
            Entities.
        """
        result = []

        previous_entity = NO_ENTITY_TAG
        previous_role = NO_ENTITY_TAG
        previous_group = NO_ENTITY_TAG
        previous_end = -1

        for token_idx, token in enumerate(tokens):
            predicted = self.get_tag_for(tags, ENTITY_ATTRIBUTE_TYPE, token_idx)

            # Untagged token: forget the running entity tag, remember where
            # the token ended, and move on.
            if predicted == NO_ENTITY_TAG:
                previous_entity = NO_ENTITY_TAG
                previous_end = token.end
                continue

            group = bilou_utils.tag_without_prefix(
                self.get_tag_for(tags, ENTITY_ATTRIBUTE_GROUP, token_idx))
            role = bilou_utils.tag_without_prefix(
                self.get_tag_for(tags, ENTITY_ATTRIBUTE_ROLE, token_idx))

            differs_in_group_or_role = (previous_group != group
                                        or previous_role != role)
            differs_in_tag = previous_entity != predicted

            prefix = bilou_utils.bilou_prefix_from_tag(predicted)
            if not prefix:
                opens_entity = differs_in_tag or differs_in_group_or_role
                entity_name = predicted
            else:
                # A changed tag only begins a new BILOU span when it is not
                # an I- or L- tag.
                is_inner_or_last = (prefix == bilou_utils.LAST
                                    or prefix == bilou_utils.INSIDE)
                new_span = differs_in_tag and not is_inner_or_last

                # I-/L- tags with no B- tag before them, and consecutive
                # U- tags, each open an entity of their own.
                lone_tag = (previous_entity == NO_ENTITY_TAG
                            or prefix == bilou_utils.UNIT)

                opens_entity = new_span or lone_tag or differs_in_group_or_role
                entity_name = bilou_utils.tag_without_prefix(predicted)

            # Keep the prefixed tag for the comparison on the next token.
            previous_entity = predicted

            # Same tag as the token before, but two or more symbols apart
            # (e.g. multiple spaces, a comma and a space): treated as a
            # separate entity as well.
            if opens_entity or token.start - previous_end > 1:
                result.append(
                    self._create_new_entity(
                        list(tags.keys()),
                        entity_name,
                        group,
                        role,
                        token,
                        token_idx,
                        confidences,
                    ))
            else:
                # Same tag as the token before and at most one symbol
                # (e.g. space, dash) between them: extend the last entity.
                result[-1][ENTITY_ATTRIBUTE_END] = token.end
                if confidences is not None:
                    self._update_confidence_values(result, confidences,
                                                   token_idx)

            previous_group = group
            previous_role = role
            previous_end = token.end

        # With the final boundaries known, copy each entity's surface text.
        for item in result:
            begin = item[ENTITY_ATTRIBUTE_START]
            finish = item[ENTITY_ATTRIBUTE_END]
            item[ENTITY_ATTRIBUTE_VALUE] = text[begin:finish]

        return result
示例#5
0
def test_bilou_from_tag(tag, expected):
    """Check that the BILOU prefix extracted from ``tag`` equals ``expected``."""
    assert bilou_utils.bilou_prefix_from_tag(tag) == expected