Пример #1
0
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Fit the pre-processing context that the transformation relies on.

        The text is passed through the default processor units, the filter
        unit is fitted on (and then applied to) the right-hand text, and a
        vocabulary unit is built from the result.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        preprocess = chain_transform(self._default_processor_units())
        processed = data_pack.apply_on_text(preprocess, verbose=verbose)

        # Fit the filter unit on the right text (kept as a list of lists),
        # then apply the fitted unit to that same side.
        filter_unit = build_unit_from_data_pack(
            self._filter_unit,
            processed,
            flatten=False,
            mode='right',
            verbose=verbose,
        )
        processed = processed.apply_on_text(
            filter_unit.transform, mode='right', verbose=verbose)
        self._context['filter_unit'] = filter_unit

        vocabulary = build_vocab_unit(processed, verbose=verbose)
        self._context['vocab_unit'] = vocabulary
        # Stored size is one larger than the number of indexed terms.
        term_count = len(vocabulary.state['term_index'])
        self._context['vocab_size'] = term_count + 1

        left_shape = (self._fixed_length_left, )
        right_shape = (self._fixed_length_right, )
        self._context['input_shapes'] = [left_shape, right_shape]

        return self
Пример #2
0
def build_unit_from_data_pack(unit: StatefulUnit,
                              data_pack: mz.DataPack,
                              mode: str = 'both',
                              flatten: bool = True,
                              verbose: int = 1) -> StatefulUnit:
    """
    Fit a :class:`StatefulUnit` on the text held by a :class:`DataPack`.

    The text is gathered by passing a collector to
    ``data_pack.apply_on_text``; the gathered corpus is then handed to
    ``unit.fit``.

    :param unit: :class:`StatefulUnit` object to be built.
    :param data_pack: The input :class:`DataPack` object.
    :param mode: One of 'left', 'right', and 'both', to determine the source
        data for building the unit.
    :param flatten: `True` to gather every text into one flat list
        (each text extends the corpus), `False` to keep one entry per text
        (each text is appended, giving a list of lists).
    :param verbose: Verbosity. When truthy, the corpus is wrapped in a
        ``tqdm`` progress bar while fitting.
    :return: The same ``unit`` object, after fitting.

    """
    collected = []
    collector = collected.extend if flatten else collected.append
    data_pack.apply_on_text(collector, mode=mode, verbose=verbose)

    if not verbose:
        unit.fit(collected)
        return unit

    progress_label = f'Building {unit.__class__.__name__} from a datapack.'
    unit.fit(tqdm(collected, desc=progress_label))
    return unit
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        # Run the configured units over the text (per the original note:
        # tokenization, punctuation removal and stop-word removal).
        unit_chain = chain_transform(self._units)
        processed = data_pack.apply_on_text(unit_chain, verbose=verbose)

        # Filter out high- and low-frequency words.
        # Step 1: gather the statistics by building the filter unit on the
        # right-hand text.
        filter_unit = build_unit_from_data_pack(
            self._filter_unit,
            processed,
            flatten=False,
            mode='right',
            verbose=verbose,
        )
        # Step 2: transform the right-hand text using those statistics and
        # keep the fitted unit in the context.
        processed = processed.apply_on_text(
            filter_unit.transform, mode='right', verbose=verbose)
        self._context['filter_unit'] = filter_unit

        # Build the vocabulary.
        vocabulary = build_vocab_unit(processed, verbose=verbose)
        self._context['vocab_unit'] = vocabulary

        term_count = len(vocabulary.state['term_index'])
        self._context['vocab_size'] = term_count
        self._context['embedding_input_dim'] = term_count

        return self
Пример #4
0
    def transform(self, pack: matchzoo.DataPack):
        """
        Replace raw image paths with fixed-length lists of mapped indices.

        Every entry of ``pack.left["images_left"]`` and
        ``pack.right["images_right"]`` is cut to the configured maximum
        number of images, mapped through the matching path-to-index table
        and right-padded with ``0``. Both columns are overwritten on the
        given ``pack``.

        :param pack: DataPack whose image columns are converted.
        :return: The same ``pack`` object.
        """
        def to_padded_indices(paths: List[str], path2index, limit: int):
            # Truncate first, then look up, then pad up to ``limit``.
            indices = [path2index[path] for path in paths[:limit]]
            return indices + [0] * (limit - len(indices))

        def left_to_indices(images: List[str]):
            return to_padded_indices(images,
                                     self.left_img_path2index,
                                     self.max_num_left_images)

        def right_to_indices(images: List[str]):
            return to_padded_indices(images,
                                     self.right_img_path2index,
                                     self.max_num_right_images)

        left_column = pack.left["images_left"]
        pack.left["images_left"] = left_column.apply(left_to_indices)

        right_column = pack.right["images_right"]
        pack.right["images_right"] = right_column.apply(right_to_indices)
        return pack
Пример #5
0
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        Two vocabularies are built: a word-level one from the text after
        the configured units, and a character-level one after an extra
        ``NgramLetter(ngram=1, reduce_dim=True)`` pass.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        word_pack = data_pack.apply_on_text(
            chain_transform(self._units), mode='both', verbose=verbose)

        # Word-level vocabulary.
        word_vocab = build_vocab_unit(word_pack, verbose=verbose)
        word_count = len(word_vocab.state['term_index'])
        self._context['vocab_unit'] = word_vocab
        self._context['vocab_size'] = word_count
        self._context['embedding_input_dim'] = word_count

        # Character-level vocabulary, built from the already processed text.
        to_letters = units.NgramLetter(ngram=1, reduce_dim=True).transform
        char_pack = word_pack.apply_on_text(
            to_letters, mode='both', verbose=verbose)
        self._context['char_unit'] = build_vocab_unit(char_pack,
                                                      verbose=verbose)

        left_len = (self._fixed_length_left,)
        right_len = (self._fixed_length_right,)
        self._context['input_shapes'] = [
            left_len,
            right_len,
            (self._fixed_length_left, self._fixed_length_word,),
            (self._fixed_length_right, self._fixed_length_word,),
            left_len,
            right_len,
        ]
        return self
Пример #6
0
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        unit_chain = chain_transform(self._units)
        processed = data_pack.apply_on_text(unit_chain, verbose=verbose)

        # Fit the filter unit on the right text, then apply it to that side.
        filter_unit = build_unit_from_data_pack(
            self._filter_unit,
            processed,
            flatten=False,
            mode='right',
            verbose=verbose,
        )
        processed = processed.apply_on_text(
            filter_unit.transform, mode='right', verbose=verbose)
        self._context['filter_unit'] = filter_unit

        # The vocabulary relies on the right side only.
        vocabulary = build_vocab_unit(processed,
                                      verbose=verbose,
                                      mode="right")
        self._context['vocab_unit'] = vocabulary

        # The term count is used as-is; no extra padding slot is added.
        term_count = len(vocabulary.state['term_index'])
        self._context['vocab_size'] = term_count
        self._context['embedding_input_dim'] = term_count

        left_shape = (self._fixed_length_left, )
        right_shape = (self._fixed_length_right, )
        self._context['input_shapes'] = [left_shape, right_shape]

        return self
Пример #7
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create `letter-ngram` representation.

        Works on a copy of the input: the default units run first, then the
        left and right fixed-length units, and finally ``NgramLetter``
        (followed by ``WordHashing`` when word hashing is enabled).

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()

        result.apply_on_text(chain_transform(self._default_units()),
                             inplace=True,
                             verbose=verbose)

        # Fixed-length handling is configured separately for each side.
        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left', inplace=True, verbose=verbose)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right', inplace=True, verbose=verbose)

        letter_units = [units.NgramLetter(reduce_dim=False)]
        if self._with_word_hashing:
            vocabulary = self._context['vocab_unit']
            letter_units.append(
                units.WordHashing(vocabulary.state['term_index']))

        result.apply_on_text(chain_transform(letter_units),
                             inplace=True,
                             verbose=verbose)
        return result
Пример #8
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create bag-of-words representation.

        Works on a copy of the input: the configured units and the fitted
        filter unit (right side only) run first, then every text is turned
        into a 0/1 vector over the vocabulary, and finally the
        ``images_right`` column is passed through the images unit.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()
        result.apply_on_text(chain_transform(self._units),
                             inplace=True, verbose=verbose)
        result.apply_on_text(self._context['filter_unit'].transform,
                             mode='right', inplace=True, verbose=verbose)

        def convert_to_bow(input_: List[str]):
            """Convert a list of tokens to a 0/1 vector of vocabulary size."""
            term_index = self._context['vocab_unit']._state['term_index']
            bow = [0.0] * self._context['vocab_size']
            for token in input_:
                # Mark the slot of every token that occurs at least once.
                bow[term_index[token]] = 1.0
            return bow

        result.apply_on_text(convert_to_bow,
                             mode='both', inplace=True, verbose=verbose)

        image_column = result.right["images_right"]
        result.right['images_right'] = image_column.progress_apply(
            self._images_unit.transform)
        return result
Пример #9
0
    def __init__(self,
                 data_pack: mz.DataPack,
                 mode='point',
                 num_dup: int = 1,
                 num_neg: int = 1,
                 resample: bool = True,
                 batch_size: int = 128,
                 shuffle: bool = True,
                 callbacks: typing.List[Callback] = None):
        """
        Initialise the generator.

        :param data_pack: Source :class:`DataPack`. NOTE: in ``'pair'`` mode
            its ``relation`` attribute is replaced on the passed object
            itself with the pair-wise reorganized relation.
        :param mode: One of ``'point'``, ``'pair'`` or ``'list'``.
        :param num_dup: Forwarded to ``_reorganize_pair_wise`` in
            ``'pair'`` mode.
        :param num_neg: Forwarded to ``_reorganize_pair_wise`` in
            ``'pair'`` mode.
        :param resample: Stored as ``self._resample``.
        :param batch_size: Stored as ``self._batch_size``.
        :param shuffle: Stored as ``self._shuffle``.
        :param callbacks: Optional list of callbacks; ``None`` means an
            empty list.
        :raises ValueError: If ``mode`` is not one of the accepted values.
        """
        if callbacks is None:
            callbacks = []

        if mode not in ('point', 'pair', 'list'):
            # The trailing space keeps the two sentences from running
            # together in the final message.
            raise ValueError(f"{mode} is not a valid mode type. "
                             "Must be one of `point`, `pair` or `list`.")

        self._mode = mode
        self._num_dup = num_dup
        self._num_neg = num_neg
        self._batch_size = batch_size
        self._shuffle = shuffle
        self._resample = resample
        # Keep the untouched relation before it is possibly reorganized.
        self._orig_relation = data_pack.relation
        self._callbacks = callbacks

        if mode == 'pair':
            data_pack.relation = self._reorganize_pair_wise(data_pack.relation,
                                                            num_dup=num_dup,
                                                            num_neg=num_neg)

        self._data_pack = data_pack
        self._batch_indices = None

        self.reset_index()
Пример #10
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Encode both text sides with the tokenizer and append text lengths.

        Both steps are applied in place on the given ``data_pack``; the
        same object is returned.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        encode = self._tokenizer.encode
        data_pack.apply_on_text(encode,
                                mode='both', inplace=True, verbose=verbose)
        data_pack.append_text_length(inplace=True, verbose=verbose)
        return data_pack
Пример #11
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create truncated length representation.

        The default units are followed by the fitted vocabulary unit and a
        ``TruncatedLength`` unit (length 30, ``'post'`` mode). The chain is
        applied in place on the given ``data_pack``, after which the text
        lengths are appended.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        pipeline = self._default_units()
        pipeline.extend([
            self._context['vocab_unit'],
            units.TruncatedLength(text_length=30, truncate_mode='post'),
        ])

        data_pack.apply_on_text(chain_transform(pipeline),
                                inplace=True,
                                verbose=verbose)
        data_pack.append_text_length(inplace=True, verbose=verbose)
        return data_pack
Пример #12
0
    def __init__(
        self,
        data_pack: mz.DataPack,
        mode='point',
        num_dup: int = 1,
        num_neg: int = 1,
        batch_size: int = 32,
        resample: bool = False,
        shuffle: bool = True,
        sort: bool = False,
        callbacks: typing.List[BaseCallback] = None
    ):
        """
        Initialise the dataset.

        :param data_pack: Source :class:`DataPack`; it is copied, so the
            caller's object is not modified here.
        :param mode: One of ``'point'``, ``'pair'`` or ``'list'``.
        :param num_dup: Forwarded to ``_reorganize_pair_wise`` in
            ``'pair'`` mode.
        :param num_neg: Forwarded to ``_reorganize_pair_wise`` in
            ``'pair'`` mode.
        :param batch_size: Stored as ``self._batch_size``.
        :param resample: Stored as ``self._resample``; forced to ``False``
            when ``mode`` is ``'point'``.
        :param shuffle: Stored as ``self._shuffle``; cannot be combined
            with ``sort``.
        :param sort: Stored as ``self._sort``; cannot be combined with
            ``shuffle``.
        :param callbacks: Optional list of callbacks; ``None`` means an
            empty list.
        :raises ValueError: If ``mode`` is not an accepted value, or if
            both ``shuffle`` and ``sort`` are truthy.
        """
        if callbacks is None:
            callbacks = []

        if mode not in ('point', 'pair', 'list'):
            # The trailing space keeps the two sentences from running
            # together in the final message.
            raise ValueError(f"{mode} is not a valid mode type. "
                             "Must be one of `point`, `pair` or `list`.")

        if shuffle and sort:
            raise ValueError("parameters `shuffle` and `sort` conflict, "
                             "should not both be `True`.")

        data_pack = data_pack.copy()
        self._mode = mode
        self._num_dup = num_dup
        self._num_neg = num_neg
        self._batch_size = batch_size
        self._resample = (resample if mode != 'point' else False)
        self._shuffle = shuffle
        self._sort = sort
        # Keep the untouched relation before it is possibly reorganized.
        self._orig_relation = data_pack.relation
        self._callbacks = callbacks

        if mode == 'pair':
            data_pack.relation = self._reorganize_pair_wise(
                relation=self._orig_relation,
                num_dup=num_dup,
                num_neg=num_neg
            )

        self._data_pack = data_pack
        self._batch_indices = None

        self.reset_index()
Пример #13
0
    def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
        """
        Apply transformation on data, create `tri-letter` representation.

        Works on a copy of the input. The default processor units are
        applied, followed by a ``WordHashingUnit`` built from the fitted
        term index when word hashing is enabled.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()

        pipeline = self._default_processor_units()
        if self._with_word_hashing:
            vocabulary = self._context['vocab_unit']
            hashing_unit = processor_units.WordHashingUnit(
                vocabulary.state['term_index'])
            pipeline.append(hashing_unit)

        result.apply_on_text(chain_transform(pipeline),
                             inplace=True, verbose=verbose)
        return result
Пример #14
0
def data_pack():
    """
    Build a small two-row :class:`DataPack`.

    The relation links ``qid0``/``did0`` (label 1) and ``qid1``/``did1``
    (label 0); the left and right frames are indexed by their id column
    and hold short integer lists as text.
    """
    relation = pd.DataFrame(
        [['qid0', 'did0', 1], ['qid1', 'did1', 0]],
        columns=['id_left', 'id_right', 'label'],
    )
    left = pd.DataFrame(
        [['qid0', [1, 2]], ['qid1', [2, 3]]],
        columns=['id_left', 'text_left'],
    ).set_index('id_left')
    right = pd.DataFrame(
        [['did0', [2, 3, 4]], ['did1', [3, 4, 5]]],
        columns=['id_right', 'text_right'],
    ).set_index('id_right')
    return DataPack(relation=relation, left=left, right=right)
Пример #15
0
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        Stores the fitted filter unit, the vocabulary unit (extended with
        ``self.extra_terms`` when given) and the vocabulary size; when an
        n-gram size is configured, an n-gram vocabulary is stored as well.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        processed = data_pack.apply_on_text(
            ChainTransform(self._units),
            multiprocessing=self.multiprocessing,
            verbose=verbose,
        )

        # Fit the filter unit on the right text, then apply it to that side.
        filter_unit = build_unit_from_data_pack(
            self._filter_unit,
            processed,
            flatten=False,
            mode='right',
            verbose=verbose,
        )
        processed = processed.apply_on_text(
            ChainTransform(filter_unit),
            mode='right',
            multiprocessing=False,
            verbose=verbose,
        )
        self._context['filter_unit'] = filter_unit

        vocabulary = build_vocab_unit(processed, verbose=verbose)
        if self.extra_terms:
            vocabulary.fit_incrementally(self.extra_terms)
        self._context['vocab_unit'] = vocabulary

        term_count = len(vocabulary.state['term_index'])
        self._context['vocab_size'] = term_count
        self._context['embedding_input_dim'] = term_count

        if not self._ngram_size:
            return self

        # Optional n-gram vocabulary, built on top of the processed text.
        ngram_pack = processed.apply_on_text(
            ChainTransform(self._context['ngram_process_unit']),
            mode='both',
            multiprocessing=self.multiprocessing,
            verbose=verbose,
        )
        ngram_vocabulary = build_vocab_unit(ngram_pack, verbose=verbose)
        self._context['ngram_vocab_unit'] = ngram_vocabulary
        self._context['ngram_vocab_size'] = len(
            ngram_vocabulary.state['term_index'])
        return self
Пример #16
0
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        Applies the default units to the text and stores a vocabulary unit
        built from the result under ``'vocab_unit'``.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        default_chain = chain_transform(self._default_units())
        processed = data_pack.apply_on_text(default_chain, verbose=verbose)
        self._context['vocab_unit'] = build_vocab_unit(processed,
                                                       verbose=verbose)
        return self
Пример #17
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create fixed length representation.

        The default processor units are followed by the fitted vocabulary
        unit and a ``FixedLengthUnit`` (length 30, ``'post'`` padding). The
        chain is applied without ``inplace``, and whatever
        ``apply_on_text`` returns is passed back.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        pipeline = self._default_processor_units()
        pipeline.extend([
            self._context['vocab_unit'],
            processor_units.FixedLengthUnit(text_length=30, pad_mode='post'),
        ])
        transformed = data_pack.apply_on_text(chain_transform(pipeline),
                                              verbose=verbose)
        return transformed
Пример #18
0
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        Stores the vocabulary unit and its size; when an n-gram size is
        configured, an n-gram vocabulary is stored as well.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: This preprocessor instance (``self``).
        """
        unit_chain = chain_transform(self._units)
        processed = data_pack.apply_on_text(unit_chain, verbose=verbose)

        # NOTE: the filter-unit step (build_unit_from_data_pack on the
        # right text, followed by applying the fitted unit) is disabled in
        # this version, so no 'filter_unit' entry is written to the context.

        vocabulary = build_vocab_unit(processed, verbose=verbose)
        self._context['vocab_unit'] = vocabulary

        term_count = len(vocabulary.state['term_index'])
        self._context['vocab_size'] = term_count
        self._context['embedding_input_dim'] = term_count

        if not self._ngram_size:
            return self

        # Optional n-gram vocabulary, built on top of the processed text.
        ngram_pack = processed.apply_on_text(
            self._context['ngram_process_unit'].transform,
            mode='both',
            verbose=verbose,
        )
        ngram_vocabulary = build_vocab_unit(ngram_pack, verbose=verbose)
        self._context['ngram_vocab_unit'] = ngram_vocabulary
        self._context['ngram_vocab_size'] = len(
            ngram_vocabulary.state['term_index'])
        return self
Пример #19
0
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Fit the pre-processing context used later by the transformation.

        Stores the vocabulary unit and two identical one-dimensional input
        shapes whose size is the number of indexed terms plus one.

        :param verbose: Verbosity.
        :param data_pack: data_pack to be preprocessed.
        :return: This preprocessor instance (``self``).
        """
        default_chain = chain_transform(self._default_processor_units())
        processed = data_pack.apply_on_text(default_chain, verbose=verbose)
        vocabulary = build_vocab_unit(processed, verbose=verbose)

        self._context['vocab_unit'] = vocabulary
        dim = len(vocabulary.state['term_index']) + 1
        self._context['input_shapes'] = [(dim, ), (dim, )]
        return self
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit the pre-processing context used later by the transformation.

        The vocabulary is built from the text after the default units plus
        an ``NgramLetter`` unit.

        :param verbose: Verbosity.
        :param data_pack: Data_pack to be preprocessed.
        :return: This preprocessor instance (``self``).
        """
        base_units = self._default_units()
        letter_chain = chain_transform(base_units + [units.NgramLetter()])
        processed = data_pack.apply_on_text(letter_chain, verbose=verbose)
        vocabulary = build_vocab_unit(processed, verbose=verbose)

        self._context['vocab_unit'] = vocabulary
        term_count = len(vocabulary.state['term_index'])
        self._context['vocab_size'] = term_count
        self._context['embedding_input_dim'] = term_count
        return self
Пример #21
0
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Fit the pre-processing context used later by the transformation.

        The vocabulary is built from the text after the default processor
        units plus an ``NgramLetterUnit``; the stored input shapes pair each
        fixed length with the number of indexed terms plus one.

        :param verbose: Verbosity.
        :param data_pack: Data_pack to be preprocessed.
        :return: This preprocessor instance (``self``).
        """
        pipeline = self._default_processor_units()
        pipeline.append(processor_units.NgramLetterUnit())
        processed = data_pack.apply_on_text(chain_transform(pipeline),
                                            verbose=verbose)
        vocabulary = build_vocab_unit(processed, verbose=verbose)

        self._context['vocab_unit'] = vocabulary
        width = len(vocabulary.state['term_index']) + 1
        self._context['input_shapes'] = [
            (self._fixed_length_left, width),
            (self._fixed_length_right, width),
        ]
        return self
Пример #22
0
    def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
        """
        Apply transformation on data, create fixed length representation.

        Works on a copy of the input. Text lengths are appended before the
        fixed-length units run, and the stored lengths are then capped at
        the configured fixed lengths.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()

        default_chain = chain_transform(self._default_processor_units())
        result.apply_on_text(default_chain, inplace=True, verbose=verbose)

        result.apply_on_text(self._context['filter_unit'].transform,
                             mode='right', inplace=True, verbose=verbose)
        result.apply_on_text(self._context['vocab_unit'].transform,
                             mode='both', inplace=True, verbose=verbose)

        # Lengths are recorded before the fixed-length units run.
        result.append_text_length(inplace=True, verbose=verbose)

        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left', inplace=True, verbose=verbose)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right', inplace=True, verbose=verbose)

        left_cap = self._fixed_length_left
        right_cap = self._fixed_length_right

        def cap_left(val):
            return val if val <= left_cap else left_cap

        def cap_right(val):
            return val if val <= right_cap else right_cap

        left_lengths = result.left['length_left']
        result.left['length_left'] = left_lengths.apply(cap_left)
        right_lengths = result.right['length_right']
        result.right['length_right'] = right_lengths.apply(cap_right)
        return result
Пример #23
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create truncated length representation.

        Works on a copy of the input. After the configured units, the
        filter unit (right side) and the vocabulary unit (both sides) are
        applied; each side is truncated only when its truncated length is
        set. Text lengths are appended and empty entries dropped last.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()

        result.apply_on_text(ChainTransform(self._units),
                             inplace=True,
                             multiprocessing=self.multiprocessing,
                             verbose=verbose)
        result.apply_on_text(ChainTransform(self._context['filter_unit']),
                             mode='right', inplace=True, verbose=verbose)
        result.apply_on_text(ChainTransform(self._context['vocab_unit']),
                             mode='both', inplace=True, verbose=verbose)

        # Truncation is optional and configured per side.
        if self._truncated_length_left:
            truncate_left = ChainTransform(self._left_truncatedlength_unit)
            result.apply_on_text(truncate_left,
                                 mode='left', inplace=True, verbose=verbose)
        if self._truncated_length_right:
            truncate_right = ChainTransform(self._right_truncatedlength_unit)
            result.apply_on_text(truncate_right,
                                 mode='right', inplace=True, verbose=verbose)

        result.append_text_length(inplace=True, verbose=verbose)
        result.drop_empty(inplace=True)
        return result
 def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
     """
     Apply transformation on data, create truncated length representation.

     Works on a copy of the input. After the configured units, the filter
     unit (right side) and the vocabulary unit (both sides) are applied;
     each side is truncated only when its truncated length is set. Text
     lengths are appended and empty entries dropped last.

     :param data_pack: Inputs to be preprocessed.
     :param verbose: Verbosity.

     :return: Transformed data as :class:`DataPack` object.
     """
     data_pack = data_pack.copy()
     # `inplace=True` is required here: without it the call returns a new
     # pack that would be discarded, and the configured units would never
     # be applied to the copy processed below.
     data_pack.apply_on_text(chain_transform(self._units),
                             inplace=True,
                             verbose=verbose)
     data_pack.apply_on_text(self._context['filter_unit'].transform,
                             mode='right',
                             inplace=True,
                             verbose=verbose)
     data_pack.apply_on_text(self._context['vocab_unit'].transform,
                             mode='both',
                             inplace=True,
                             verbose=verbose)
     # Truncation is optional and configured per side.
     if self._truncated_length_left:
         data_pack.apply_on_text(self._left_truncatedlength_unit.transform,
                                 mode='left',
                                 inplace=True,
                                 verbose=verbose)
     if self._truncated_length_right:
         data_pack.apply_on_text(self._right_truncatedlength_unit.transform,
                                 mode='right',
                                 inplace=True,
                                 verbose=verbose)
     data_pack.append_text_length(inplace=True, verbose=verbose)
     data_pack.drop_empty(inplace=True)
     return data_pack
Пример #25
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create fixed length representation.

        Works on a copy of the input. Besides the two text sides, the
        right-hand columns named by ``KeyWordSettings.TextRightInput`` and
        ``KeyWordSettings.TextRightOutput`` are run through the same units,
        vocabulary unit and right fixed-length unit. The stored text
        lengths are capped at the configured fixed lengths.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()
        result.apply_on_text(chain_transform(self._units),
                             inplace=True, verbose=verbose)

        # NOTE: the filter-unit step and the char_left / char_right steps
        # are disabled in this version.

        result.apply_on_text(self._context['vocab_unit'].transform,
                             mode='both', inplace=True, verbose=verbose)
        # Lengths are recorded before the fixed-length units run.
        result.append_text_length(inplace=True, verbose=verbose)
        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left', inplace=True, verbose=verbose)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right', inplace=True, verbose=verbose)

        def process_decoder_input_output(text: str):
            # Same pipeline as the right-hand text: units, vocabulary
            # lookup, then the right fixed-length unit.
            tokenized = chain_transform(self._units)(text)
            indexed = self._context['vocab_unit'].transform(tokenized)
            return self._right_fixedlength_unit.transform(indexed)

        decoder_input = result.right[KeyWordSettings.TextRightInput]
        result.right[KeyWordSettings.TextRightInput] = decoder_input.apply(
            process_decoder_input_output)
        decoder_output = result.right[KeyWordSettings.TextRightOutput]
        result.right[KeyWordSettings.TextRightOutput] = decoder_output.apply(
            process_decoder_input_output)

        left_cap = self._fixed_length_left
        right_cap = self._fixed_length_right

        def cap_left(val):
            return min(val, left_cap)

        def cap_right(val):
            return min(val, right_cap)

        left_lengths = result.left['length_left']
        result.left['length_left'] = left_lengths.apply(cap_left)
        right_lengths = result.right['length_right']
        result.right['length_right'] = right_lengths.apply(cap_right)
        return result
Пример #26
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data.

        Works on a copy of the input and produces, in this order: the
        character representation (``char_left`` / ``char_right`` columns),
        the word representation (vocabulary lookup on both text sides),
        the exact-match columns on the relation frame (``match_left`` /
        ``match_right``) and finally the fixed-length text.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:'DataPack' object.
        """
        result = data_pack.copy()
        result.apply_on_text(chain_transform(self._units),
                             mode='both', inplace=True, verbose=verbose)

        # --- Character representation ---
        to_letters = units.NgramLetter(ngram=1, reduce_dim=False).transform
        result.apply_on_text(to_letters,
                             rename=('char_left', 'char_right'),
                             mode='both', inplace=True, verbose=verbose)
        char_index = self._context['char_unit'].state['term_index']
        left_char_unit = units.CharacterIndex(
            char_index, self._fixed_length_left, self._fixed_length_word)
        right_char_unit = units.CharacterIndex(
            char_index, self._fixed_length_right, self._fixed_length_word)
        left_chars = result.left['char_left']
        result.left['char_left'] = left_chars.apply(left_char_unit.transform)
        right_chars = result.right['char_right']
        result.right['char_right'] = right_chars.apply(
            right_char_unit.transform)

        # --- Word representation ---
        result.apply_on_text(self._context['vocab_unit'].transform,
                             mode='both', inplace=True, verbose=verbose)

        # --- Exact-match representation ---
        # Join both text sides onto the relation rows so that every row
        # carries 'text_left' and 'text_right' together.
        with_left = result.relation.join(
            result.left, on='id_left', how='left')
        joined = with_left.join(result.right, on='id_right', how='left')
        left_match_unit = units.WordExactMatch(
            self._fixed_length_left, match='text_left', to_match='text_right')
        right_match_unit = units.WordExactMatch(
            self._fixed_length_right, match='text_right', to_match='text_left')
        result.relation['match_left'] = joined.apply(
            left_match_unit.transform, axis=1)
        result.relation['match_right'] = joined.apply(
            right_match_unit.transform, axis=1)

        # --- Fixed-length text ---
        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left', inplace=True, verbose=verbose)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right', inplace=True, verbose=verbose)

        return result