Пример #1
0
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Build the character-level feature of a single instance.

        Each entry of type ``self.config.entry_type`` retrieved from
        ``pack`` within ``instance`` contributes one row holding the
        characters of its text. When a vocabulary is set, every character
        is passed through ``self.element2repr``; otherwise the raw
        characters are kept.

        If the config defines a non-``None`` ``max_char_length`` that is
        smaller than the longest row, every row is cut to that length.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        char_rows = []
        for word in pack.get(self.config.entry_type, instance):
            char_rows.append(
                [self.element2repr(char) for char in word.text]
                if self.vocab else list(word.text)
            )

        # -1 mirrors the "no words seen" starting value of a running maximum.
        longest = max((len(row) for row in char_rows), default=-1)

        limit = (self.config.max_char_length
                 if hasattr(self.config, "max_char_length") else None)
        if limit is not None and limit < longest:
            char_rows = [row[:limit] for row in char_rows]

        return Feature(
            data=char_rows,
            metadata={
                "need_pad": self.config.need_pad,
                "pad_value": self.get_pad_value(),
                "dim": 2,
                "dtype": int if self.vocab else str
            },
            vocab=self.vocab,
        )
Пример #2
0
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Build the sequence tagging feature of a single instance.

        The tag sequence is produced by ``bio_tagging``. When this
        extractor has a vocabulary, each element of that sequence is
        converted with ``self.element2repr``; without a vocabulary the
        sequence returned by ``bio_tagging`` is used as is.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns (Feature):
            a feature that contains the extracted data.
        """
        tagged_pairs: List[Tuple[Optional[str], str]] = bio_tagging(
            pack,
            instance,
            self.config.tagging_unit,
            self.config.entry_type,
            self.config.attribute,
        )

        raw_data: List
        if self.vocab:
            # Map every tagged element to its vocabulary representation.
            raw_data = [self.element2repr(pair) for pair in tagged_pairs]
        else:
            # No vocabulary: hand the tagging result through untouched.
            raw_data = tagged_pairs

        meta_data = {
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else tuple
        }
        return Feature(data=raw_data, metadata=meta_data, vocab=self.vocab)
Пример #3
0
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Collect one attribute from every entry inside an instance.

        For example, the text of tokens in one sentence. The attribute
        named by ``self.config.attribute`` is read from each entry of type
        ``self.config.entry_type``; when a vocabulary is set the value is
        converted with ``self.element2repr``.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        def _represent(entry):
            # Read the configured attribute, then map it only when a
            # vocabulary is available.
            value = self.get_attribute(entry, self.config.attribute)
            return self.element2repr(value) if self.vocab else value

        data = [
            _represent(entry)
            for entry in pack.get(self.config.entry_type, instance)
        ]

        return Feature(
            data=data,
            metadata={
                "need_pad": self.config.need_pad,
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int if self.vocab else Any
            },
            vocab=self.vocab,
        )
Пример #4
0
    def extract(self,
                pack: DataPack,
                context: Optional[Annotation] = None) -> Feature:
        """Extract an attribute from each entry of the configured type.

        The entry type comes from the extractor config `entry_type` and
        the attribute name from the config `attribute`.

        Args:
            pack (DataPack): The datapack that contains the current instance.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.

        Returns: Features (attributes) for the instances within the provided
            context. Each value is converted with `element2repr` when a
            vocabulary is set and is left unchanged otherwise.
        """
        def _to_repr(entry: Annotation):
            # One attribute value per entry, mapped only if a vocab exists.
            raw = self._get_attribute(entry, self.config.attribute)
            if self.vocab:
                return self.element2repr(raw)
            return raw

        data = list(map(_to_repr, pack.get(self.config.entry_type, context)))

        return Feature(
            data=data,
            metadata={
                "need_pad": self.config.need_pad,
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int if self.vocab else Any,
            },
            vocab=self.vocab,
        )
Пример #5
0
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Build the subword feature of a single instance.

        Every subword of type ``self._entry_type`` inside ``instance`` is
        converted with ``self.element2repr``. Subwords that are not the
        first segment get a ``"##"`` prefix before conversion. The
        resulting sequence is wrapped with the representations of
        ``"[CLS]"`` and ``"[SEP]"``.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        pieces = []
        for subword in pack.get(self._entry_type, instance):
            token = subword.text
            # Continuation segments carry the "##" marker.
            piece = token if subword.is_first_segment else "##" + token
            pieces.append(self.element2repr(piece))

        data = [
            self.element2repr("[CLS]"),
            *pieces,
            self.element2repr("[SEP]"),
        ]

        return Feature(
            data=data,
            metadata={
                "need_pad": self.vocab.use_pad,  # type: ignore
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int,
            },
            vocab=self.vocab,
        )
Пример #6
0
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Build the sequence tagging feature of a single instance.

        The tag sequence comes from ``bio_tagging``. Each of its elements
        is converted with ``self.element2repr`` when a vocabulary is set
        and kept unchanged otherwise.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        tagged_pairs: List[Tuple[Optional[str], str]] = bio_tagging(
            pack,
            instance,
            self.config.tagging_unit,
            self.config.entry_type,
            self.config.attribute,
        )

        # The vocabulary check is made per element, so the result is always
        # a new list even when no mapping takes place.
        data = [
            self.element2repr(pair) if self.vocab else pair
            for pair in tagged_pairs
        ]

        return Feature(
            data=data,
            metadata={
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int if self.vocab else tuple
            },
            vocab=self.vocab,
        )
Пример #7
0
    def extract(self,
                pack: DataPack,
                context: Optional[Annotation] = None) -> Feature:
        r"""Build the sequence tagging feature within a context.

        The tag sequence is produced by ``bio_tagging``. If this extractor
        has a vocabulary, every element of the sequence is converted with
        ``self.element2repr`` and, when ``self.config.is_bert`` is set, one
        pad value is added at each end of the sequence. Without a
        vocabulary the tagging result is used unchanged.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.

        Returns (Feature): a feature that contains the extracted BIO
            sequence and its metadata.
        """
        tagged_pairs: List[Tuple[Optional[str], str]] = bio_tagging(
            pack,
            self.config.tagging_unit,
            self.config.entry_type,
            self.config.attribute,
            context,
        )
        pad_value = self.get_pad_value()

        raw_data: List
        if not self.vocab:
            # No vocabulary: keep the tagging output, and take the padding
            # decision from the config.
            raw_data = tagged_pairs
            need_pad = self.config.need_pad
        else:
            # Map through the vocabulary; the padding decision then comes
            # from the vocabulary itself.
            raw_data = [self.element2repr(pair) for pair in tagged_pairs]
            if self.config.is_bert:
                raw_data = [pad_value, *raw_data, pad_value]
            need_pad = self.vocab.use_pad

        return Feature(
            data=raw_data,
            metadata={
                "need_pad": need_pad,
                "pad_value": pad_value,
                "dim": 1,
                "dtype": int if self.vocab else tuple,
            },
            vocab=self.vocab,
        )
Пример #8
0
    def extract(self,
                pack: DataPack,
                context: Optional[Annotation] = None) -> Feature:
        """Extract link data as features from the context.

        For every link of type `self.config.entry_type`, the attribute
        named by `self.config.attribute` is collected (converted with
        `element2repr` when a vocabulary is set). The parent and child of
        each link are then located with `get_index` against the
        annotations of type `self.config.index_annotation`.

        Args:
            pack (DataPack): The input data pack that contains the features.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.

        Returns:
            Feature: a feature whose data is the list of link attribute
            values, and whose metadata carries `parent_unit_span` and
            `child_unit_span`, the `get_index` results for the parent and
            child of each link, in link order.
        """
        index_annotations: List[Annotation] = list(
            pack.get(self.config.index_annotation, context))

        # First pass: gather the end points and the attribute of each link.
        node_pairs = []
        relation_atts = []
        link: Link
        for link in pack.get(self.config.entry_type, context):
            parent = link.get_parent()  # type: ignore
            child = link.get_child()  # type: ignore
            node_pairs.append((parent, child))

            raw_att = getattr(link, self.config.attribute)
            if self.vocab:
                relation_atts.append(self.element2repr(raw_att))
            else:
                relation_atts.append(raw_att)

        # Second pass: resolve parent then child of each link, in order.
        located = [
            (get_index(pack, index_annotations, parent),
             get_index(pack, index_annotations, child))
            for parent, child in node_pairs
        ]
        parent_unit_span = [parent_span for parent_span, _ in located]
        child_unit_span = [child_span for _, child_span in located]

        return Feature(
            data=relation_atts,
            metadata={
                "parent_unit_span": parent_unit_span,
                "child_unit_span": child_unit_span,
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int if self.vocab else str,
            },
            vocab=self.vocab,
        )
Пример #9
0
    def extract(
        self, pack: DataPack, context: Optional[Annotation] = None
    ) -> Feature:
        r"""Build the subword feature within a context.

        Every subword of type ``self.config.subword_class`` is converted
        with ``self.element2repr``; subwords that are not the first
        segment are prefixed with ``"##"`` beforehand. The sequence is
        wrapped with the representations of ``"[CLS]"`` and ``"[SEP]"``.

        Args:
            pack (DataPack): The datapack that contains the current
                instance.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        body = []

        subword: Annotation
        for subword in pack.get(self.config.subword_class, context):
            token = subword.text  # type: ignore
            if subword.is_first_segment:  # type: ignore
                piece = token
            else:
                # Continuation segments carry the "##" marker.
                piece = "##" + token
            body.append(self.element2repr(piece))

        head = [self.element2repr("[CLS]")]
        tail = [self.element2repr("[SEP]")]
        data = head + body + tail

        return Feature(
            data=data,
            metadata={
                "need_pad": self.vocab.use_pad,  # type: ignore
                "pad_value": self.get_pad_value(),
                "dim": 1,
                "dtype": int,
            },
            vocab=self.vocab,
        )