예제 #1
0
    def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
        """
        Convert the text of a data pack into its `letter-ngram` form.

        Works on the result of ``data_pack.copy()``; every processing step
        is applied to it with ``inplace=True`` and it is then returned.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity, forwarded to each ``apply_on_text`` call.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()
        # Keyword arguments shared by every `apply_on_text` call below.
        in_place = {'inplace': True, 'verbose': verbose}

        # Stage 1: run the default processor units as a single chain.
        base_chain = chain_transform(self._default_processor_units())
        result.apply_on_text(base_chain, **in_place)

        # Stage 2: apply the fixed-length units, one per text side.
        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left',
                             **in_place)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right',
                             **in_place)

        # Stage 3: letter n-grams, optionally followed by word hashing
        # driven by the term index stored in the fitted vocabulary unit.
        ngram_unit = processor_units.NgramLetterUnit(reduce_dim=False)
        if self._with_word_hashing:
            vocab_index = self._context['vocab_unit'].state['term_index']
            tail_units = [ngram_unit,
                          processor_units.WordHashingUnit(vocab_index)]
        else:
            tail_units = [ngram_unit]
        result.apply_on_text(chain_transform(tail_units), **in_place)
        return result
예제 #2
0
 def _default_processor_units(cls) -> list:
     """Return a new list holding one instance of each default unit."""
     # Unit classes in the order their instances appear in the result.
     unit_types = (
         processor_units.TokenizeUnit,
         processor_units.LowercaseUnit,
         processor_units.PuncRemovalUnit,
         processor_units.StopRemovalUnit,
         processor_units.NgramLetterUnit,
     )
     return [make_unit() for make_unit in unit_types]
예제 #3
0
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Build the pre-processing context needed for transformation.

        Stores the vocabulary unit under ``'vocab_unit'`` and the left/right
        input shapes under ``'input_shapes'`` in ``self._context``.

        :param data_pack: Data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This instance (``self``), a :class:`CDSSMPreprocessor`.
        """
        # Default units followed by a letter n-gram unit.
        pipeline = self._default_processor_units()
        pipeline.append(processor_units.NgramLetterUnit())

        processed = data_pack.apply_on_text(chain_transform(pipeline),
                                            verbose=verbose)
        fitted_vocab = build_vocab_unit(processed, verbose=verbose)
        self._context['vocab_unit'] = fitted_vocab

        # One more than the number of indexed terms (presumably reserving
        # a slot for padding/unknown terms -- confirm).
        vocab_size = len(fitted_vocab.state['term_index']) + 1
        self._context['input_shapes'] = [
            (text_length, vocab_size)
            for text_length in (self._fixed_length_left,
                                self._fixed_length_right)
        ]
        return self