예제 #1
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Preprocess a data pack into a fixed length representation.

        All steps run on a copy of ``data_pack``: the configured unit chain,
        the ``filter_unit`` transform on the right text, the ``vocab_unit``
        transform on both sides, text-length bookkeeping, and the left/right
        fixed-length units. The recorded lengths are finally capped at the
        configured fixed lengths.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        packed = data_pack.copy()
        in_place = {'inplace': True, 'verbose': verbose}

        packed.apply_on_text(chain_transform(self._units), **in_place)

        packed.apply_on_text(
            self._context['filter_unit'].transform, mode='right', **in_place)
        packed.apply_on_text(
            self._context['vocab_unit'].transform, mode='both', **in_place)

        # Lengths are recorded before the fixed-length units run, hence the
        # cap applied at the end of this method.
        packed.append_text_length(**in_place)
        packed.apply_on_text(
            self._left_fixedlength_unit.transform, mode='left', **in_place)
        packed.apply_on_text(
            self._right_fixedlength_unit.transform, mode='right', **in_place)

        left_cap = self._fixed_length_left
        right_cap = self._fixed_length_right

        capped_left = packed.left['length_left'].apply(
            lambda length: min(length, left_cap))
        packed.left['length_left'] = capped_left

        capped_right = packed.right['length_right'].apply(
            lambda length: min(length, right_cap))
        packed.right['length_right'] = capped_right

        return packed
예제 #2
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Preprocess a data pack into a truncated length representation.

        Operates on a copy of ``data_pack``: runs the configured unit chain,
        the ``vocab_unit`` transform on both sides, the optional left/right
        truncation units, then records text lengths and drops empty entries.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        packed = data_pack.copy()
        in_place = {'inplace': True, 'verbose': verbose}

        packed.apply_on_text(chain_transform(self._units), **in_place)

        # NOTE: the 'filter_unit' pass on the right text is disabled in this
        # variant.
        packed.apply_on_text(
            self._context['vocab_unit'].transform, mode='both', **in_place)

        # Each side is truncated only when a truncation length is set.
        if self._truncated_length_left:
            packed.apply_on_text(
                self._left_truncatedlength_unit.transform,
                mode='left', **in_place)
        if self._truncated_length_right:
            packed.apply_on_text(
                self._right_truncatedlength_unit.transform,
                mode='right', **in_place)

        packed.append_text_length(**in_place)
        packed.drop_empty(inplace=True)
        return packed
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Preprocess a data pack into a `letter-ngram` representation.

        Operates on a copy of ``data_pack``: runs the default unit chain,
        the left/right truncation units, records text lengths, and then
        applies ``NgramLetter`` (followed by ``WordHashing`` when word
        hashing is enabled).

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        packed = data_pack.copy()
        in_place = {'inplace': True, 'verbose': verbose}

        packed.apply_on_text(
            chain_transform(self._default_units()), **in_place)

        packed.apply_on_text(
            self._left_truncatedlength_unit.transform,
            mode='left', **in_place)
        packed.apply_on_text(
            self._right_truncatedlength_unit.transform,
            mode='right', **in_place)
        packed.append_text_length(**in_place)

        ngram_units = [units.NgramLetter(reduce_dim=False)]
        if self._with_word_hashing:
            # Word hashing is built from the fitted vocabulary's term index.
            vocab_state = self._context['vocab_unit'].state
            ngram_units.append(units.WordHashing(vocab_state['term_index']))
        packed.apply_on_text(chain_transform(ngram_units), **in_place)
        return packed
예제 #4
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Encode both text sides with ``bert_encode`` and post-process them.

        Operates on a copy of ``data_pack``: encodes both sides, applies the
        optional left/right truncation chains, records text lengths and
        drops empty entries.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        packed = data_pack.copy()
        in_place = {'inplace': True, 'verbose': verbose}

        packed.apply_on_text(
            self.bert_encode,
            mode='both',
            multiprocessing=self.multiprocessing,
            **in_place)

        # Each side is truncated only when a truncation length is set.
        if self._truncated_length_left:
            left_chain = ChainTransform(self._left_truncated_length_unit)
            packed.apply_on_text(left_chain, mode='left', **in_place)
        if self._truncated_length_right:
            right_chain = ChainTransform(self._right_truncated_length_unit)
            packed.apply_on_text(right_chain, mode='right', **in_place)

        packed.append_text_length(
            multiprocessing=self.multiprocessing, **in_place)
        packed.drop_empty(inplace=True)
        return packed
예제 #5
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create fixed length representation.

        Works on a copy of ``data_pack``. The left/right texts go through the
        configured unit chain, the ``vocab_unit`` transform and the per-side
        fixed-length units. The ``KeyWordSettings.TextRightInput`` and
        ``KeyWordSettings.TextRightOutput`` columns of the right frame get
        the same unit chain, vocabulary transform and right fixed-length
        transform. Finally the recorded text lengths are capped at the
        configured fixed lengths.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        data_pack = data_pack.copy()

        # Build the unit chain once and reuse it everywhere below, instead of
        # rebuilding it for every row of the extra right-frame columns.
        text_pipeline = chain_transform(self._units)
        data_pack.apply_on_text(text_pipeline,
                                inplace=True,
                                verbose=verbose)

        # NOTE: the 'filter_unit' pass on the right text and the char-level
        # passes ('char_left' / 'char_right') are disabled in this variant.

        vocab_transform = self._context['vocab_unit'].transform
        data_pack.apply_on_text(vocab_transform,
                                mode='both',
                                inplace=True,
                                verbose=verbose)
        # Lengths are recorded before the fixed-length units run, hence the
        # cap applied at the end of this method.
        data_pack.append_text_length(inplace=True, verbose=verbose)
        data_pack.apply_on_text(self._left_fixedlength_unit.transform,
                                mode='left',
                                inplace=True,
                                verbose=verbose)

        right_fixedlength_transform = self._right_fixedlength_unit.transform
        data_pack.apply_on_text(right_fixedlength_transform,
                                mode='right',
                                inplace=True,
                                verbose=verbose)

        def process_decoder_input_output(text: str):
            """Run one raw text through chain, vocabulary and right length."""
            tokens = text_pipeline(text)
            tokens = vocab_transform(tokens)
            return right_fixedlength_transform(tokens)

        for column in (KeyWordSettings.TextRightInput,
                       KeyWordSettings.TextRightOutput):
            data_pack.right[column] = data_pack.right[column].apply(
                process_decoder_input_output)

        max_len_left = self._fixed_length_left
        max_len_right = self._fixed_length_right

        data_pack.left['length_left'] = \
            data_pack.left['length_left'].apply(
                lambda val: min(val, max_len_left))

        data_pack.right['length_right'] = \
            data_pack.right['length_right'].apply(
                lambda val: min(val, max_len_right))
        return data_pack
예제 #6
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Build character, word and exact-match representations.

        Operates on a copy of ``data_pack``. After the unit chain and the
        left/right truncation units, text lengths are recorded; the texts are
        then expanded into ``char_left`` / ``char_right`` columns and indexed
        with the ``char_unit`` term index, the word texts are mapped with
        ``vocab_unit``, and ``match_left`` / ``match_right`` columns are
        written to the relation frame.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        packed = data_pack.copy()
        in_place = {'inplace': True, 'verbose': verbose}

        packed.apply_on_text(
            chain_transform(self._units), mode='both', **in_place)

        packed.apply_on_text(
            self._left_truncatedlength_unit.transform,
            mode='left', **in_place)
        packed.apply_on_text(
            self._right_truncatedlength_unit.transform,
            mode='right', **in_place)
        packed.append_text_length(**in_place)

        # Character representation: the 1-gram output goes into separate
        # columns, which are then mapped through the character index.
        packed.apply_on_text(
            units.NgramLetter(ngram=1, reduce_dim=False).transform,
            rename=('char_left', 'char_right'),
            mode='both', **in_place)
        char_term_index = self._context['char_unit'].state['term_index']
        char_indexer = units.CharacterIndex(char_term_index)
        indexed_left = packed.left['char_left'].apply(char_indexer.transform)
        packed.left['char_left'] = indexed_left
        indexed_right = packed.right['char_right'].apply(
            char_indexer.transform)
        packed.right['char_right'] = indexed_right

        # Word representation.
        packed.apply_on_text(
            self._context['vocab_unit'].transform, mode='both', **in_place)

        # Exact match representation: placeholder columns are created before
        # the join, so the joined frame contains them as well.
        packed.relation["match_left"] = ""
        packed.relation["match_right"] = ""
        with_left = packed.relation.join(
            packed.left, on='id_left', how='left')
        joined = with_left.join(packed.right, on='id_right', how='left')
        match_on_left = units.WordExactMatch(
            match='text_left', to_match='text_right')
        match_on_right = units.WordExactMatch(
            match='text_right', to_match='text_left')
        packed.relation['match_left'] = joined.apply(
            match_on_left.transform, axis=1)
        packed.relation['match_right'] = joined.apply(
            match_on_right.transform, axis=1)

        return packed
예제 #7
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data.

        Encodes both text sides with ``self._tokenizer.encode`` and records
        the text lengths. The work is done on a copy, so the ``data_pack``
        passed in by the caller is left unmodified.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        # Copy first: the steps below run with ``inplace=True`` and would
        # otherwise overwrite the caller's data.
        data_pack = data_pack.copy()
        data_pack.apply_on_text(self._tokenizer.encode,
                                mode='both',
                                inplace=True,
                                verbose=verbose)
        data_pack.append_text_length(inplace=True, verbose=verbose)

        return data_pack
예제 #8
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create truncated length representation.

        Chains the default units, the ``vocab_unit`` and a
        ``TruncatedLength`` unit (``text_length=30``, ``truncate_mode='post'``),
        applies the chain to the text and records the text lengths. The work
        is done on a copy, so the ``data_pack`` passed in by the caller is
        left unmodified.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        # Copy first: the steps below run with ``inplace=True`` and would
        # otherwise overwrite the caller's data.
        data_pack = data_pack.copy()
        units_ = self._default_units()
        units_.append(self._context['vocab_unit'])
        units_.append(
            units.TruncatedLength(text_length=30, truncate_mode='post'))
        func = chain_transform(units_)
        data_pack.apply_on_text(func, inplace=True, verbose=verbose)
        data_pack.append_text_length(inplace=True, verbose=verbose)
        return data_pack
 def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
     """
     Apply transformation on data, create truncated length representation.

     Operates on a copy of ``data_pack``: runs the configured unit chain,
     the ``filter_unit`` transform on the right text, the ``vocab_unit``
     transform on both sides, the optional left/right truncation units,
     then records text lengths and drops empty entries.

     :param data_pack: Inputs to be preprocessed.
     :param verbose: Verbosity.

     :return: Transformed data as :class:`DataPack` object.
     """
     data_pack = data_pack.copy()
     # ``inplace=True`` is required here: the return value is not used, so
     # without it the result of the unit chain would be thrown away.
     data_pack.apply_on_text(chain_transform(self._units),
                             inplace=True,
                             verbose=verbose)
     data_pack.apply_on_text(self._context['filter_unit'].transform,
                             mode='right',
                             inplace=True,
                             verbose=verbose)
     data_pack.apply_on_text(self._context['vocab_unit'].transform,
                             mode='both',
                             inplace=True,
                             verbose=verbose)
     # Each side is truncated only when a truncation length is set.
     if self._truncated_length_left:
         data_pack.apply_on_text(self._left_truncatedlength_unit.transform,
                                 mode='left',
                                 inplace=True,
                                 verbose=verbose)
     if self._truncated_length_right:
         data_pack.apply_on_text(self._right_truncatedlength_unit.transform,
                                 mode='right',
                                 inplace=True,
                                 verbose=verbose)
     data_pack.append_text_length(inplace=True, verbose=verbose)
     data_pack.drop_empty(inplace=True)
     return data_pack