def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Preprocess a copy of ``data_pack`` into a fixed length representation.

    Steps, in order: the chained ``self._units`` on every text, the
    ``filter_unit`` from the context on the right text only, the
    ``vocab_unit`` from the context on both sides, text length annotation,
    and finally the left/right fixed length units. The recorded lengths
    are then capped at ``self._fixed_length_left`` and
    ``self._fixed_length_right``.

    :param data_pack: Inputs to be preprocessed. It is copied first and
        every in-place step runs on the copy.
    :param verbose: Verbosity, forwarded to each :class:`DataPack` call.

    :return: Transformed data as :class:`DataPack` object.
    """
    shared = {'inplace': True, 'verbose': verbose}
    pack = data_pack.copy()

    pack.apply_on_text(chain_transform(self._units), **shared)
    pack.apply_on_text(self._context['filter_unit'].transform,
                       mode='right', **shared)
    pack.apply_on_text(self._context['vocab_unit'].transform,
                       mode='both', **shared)

    # Lengths are recorded before the fixed length units run ...
    pack.append_text_length(**shared)
    pack.apply_on_text(self._left_fixedlength_unit.transform,
                       mode='left', **shared)
    pack.apply_on_text(self._right_fixedlength_unit.transform,
                       mode='right', **shared)

    # ... so they are capped afterwards at the configured fixed lengths.
    length_caps = (
        (pack.left, 'length_left', self._fixed_length_left),
        (pack.right, 'length_right', self._fixed_length_right),
    )
    for frame, column, cap in length_caps:
        frame[column] = frame[column].apply(
            lambda val, cap=cap: min(val, cap))
    return pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Preprocess a copy of ``data_pack`` into a truncated length representation.

    The chained ``self._units`` and the ``vocab_unit`` from the context are
    applied to both texts. Each side is then truncated only when its
    ``_truncated_length_*`` setting is truthy. Text lengths are appended
    after truncation and ``drop_empty`` is called last.

    :param data_pack: Inputs to be preprocessed. It is copied first and
        every in-place step runs on the copy.
    :param verbose: Verbosity, forwarded to each text-processing call.

    :return: Transformed data as :class:`DataPack` object.
    """
    options = dict(inplace=True, verbose=verbose)
    result = data_pack.copy()

    result.apply_on_text(chain_transform(self._units), **options)
    # NOTE: the right-side ``filter_unit`` pass is disabled in this
    # preprocessor (it was commented out in the original code).
    vocab_unit = self._context['vocab_unit']
    result.apply_on_text(vocab_unit.transform, mode='both', **options)

    # The truncation units are only read inside the guards: they are only
    # needed (and possibly only defined) when a length limit is configured.
    if self._truncated_length_left:
        left_unit = self._left_truncatedlength_unit
        result.apply_on_text(left_unit.transform, mode='left', **options)
    if self._truncated_length_right:
        right_unit = self._right_truncatedlength_unit
        result.apply_on_text(right_unit.transform, mode='right', **options)

    result.append_text_length(**options)
    result.drop_empty(inplace=True)
    return result
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Preprocess a copy of ``data_pack`` into a `letter-ngram` representation.

    The default units are chained and applied first, then each side is
    passed through its truncated length unit and the text lengths are
    appended. Finally an ``NgramLetter`` unit is applied, followed by a
    ``WordHashing`` unit built from the vocabulary term index when
    ``self._with_word_hashing`` is set.

    :param data_pack: Inputs to be preprocessed. It is copied first and
        every in-place step runs on the copy.
    :param verbose: Verbosity, forwarded to each :class:`DataPack` call.

    :return: Transformed data as :class:`DataPack` object.
    """
    common = {'inplace': True, 'verbose': verbose}
    pack = data_pack.copy()

    pack.apply_on_text(chain_transform(self._default_units()), **common)
    pack.apply_on_text(self._left_truncatedlength_unit.transform,
                       mode='left', **common)
    pack.apply_on_text(self._right_truncatedlength_unit.transform,
                       mode='right', **common)
    # Lengths are taken on the truncated tokens, before the n-gram step.
    pack.append_text_length(**common)

    ngram_pipeline = [units.NgramLetter(reduce_dim=False)]
    if self._with_word_hashing:
        vocab_state = self._context['vocab_unit'].state
        ngram_pipeline.append(units.WordHashing(vocab_state['term_index']))
    pack.apply_on_text(chain_transform(ngram_pipeline), **common)
    return pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Encode both texts of a copy of ``data_pack`` with ``self.bert_encode``.

    After encoding, each side is optionally truncated (only when its
    ``_truncated_length_*`` setting is truthy), text lengths are appended
    and ``drop_empty`` is called. The ``self.multiprocessing`` setting is
    forwarded to the encoding and length steps.

    :param data_pack: Inputs to be preprocessed. It is copied first and
        every in-place step runs on the copy.
    :param verbose: Verbosity, forwarded to each text-processing call.

    :return: Transformed data as :class:`DataPack` object.
    """
    pack = data_pack.copy()
    pack.apply_on_text(
        self.bert_encode,
        mode='both',
        inplace=True,
        multiprocessing=self.multiprocessing,
        verbose=verbose,
    )

    # The truncation units are wrapped in ChainTransform and only read
    # when the matching length limit is configured.
    if self._truncated_length_left:
        truncate_left = ChainTransform(self._left_truncated_length_unit)
        pack.apply_on_text(
            truncate_left, mode='left', inplace=True, verbose=verbose)
    if self._truncated_length_right:
        truncate_right = ChainTransform(self._right_truncated_length_unit)
        pack.apply_on_text(
            truncate_right, mode='right', inplace=True, verbose=verbose)

    pack.append_text_length(
        inplace=True,
        verbose=verbose,
        multiprocessing=self.multiprocessing,
    )
    pack.drop_empty(inplace=True)
    return pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create fixed length representation.

    Both texts go through the chained ``self._units`` and the ``vocab_unit``
    from the context. Their lengths are recorded, then each side is passed
    through its fixed length unit. The decoder columns of the right frame
    (``KeyWordSettings.TextRightInput`` and
    ``KeyWordSettings.TextRightOutput``) are run through the same unit
    chain, vocabulary transform and right fixed length unit. Finally the
    recorded lengths are capped at ``self._fixed_length_left`` and
    ``self._fixed_length_right``.

    :param data_pack: Inputs to be preprocessed. It is copied first and
        every in-place step runs on the copy.
    :param verbose: Verbosity.

    :return: Transformed data as :class:`DataPack` object.
    """
    data_pack = data_pack.copy()

    # Build the unit chain once; it is reused for the text columns and for
    # every cell of the decoder input/output columns instead of being
    # rebuilt per row.
    tokenize = chain_transform(self._units)
    vocab_transform = self._context['vocab_unit'].transform
    right_fixed_transform = self._right_fixedlength_unit.transform

    data_pack.apply_on_text(tokenize, inplace=True, verbose=verbose)
    # NOTE: the right-side ``filter_unit`` pass and the ``char_left`` /
    # ``char_right`` passes are disabled in this preprocessor.
    data_pack.apply_on_text(vocab_transform,
                            mode='both', inplace=True, verbose=verbose)
    # Lengths are recorded before the fixed length units run and are
    # capped at the end of this method.
    data_pack.append_text_length(inplace=True, verbose=verbose)
    data_pack.apply_on_text(self._left_fixedlength_unit.transform,
                            mode='left', inplace=True, verbose=verbose)
    data_pack.apply_on_text(right_fixed_transform,
                            mode='right', inplace=True, verbose=verbose)

    def process_decoder_input_output(text: str):
        """Run one decoder cell through chain, vocab and fixed length."""
        return right_fixed_transform(vocab_transform(tokenize(text)))

    for column in (KeyWordSettings.TextRightInput,
                   KeyWordSettings.TextRightOutput):
        data_pack.right[column] = data_pack.right[column].apply(
            process_decoder_input_output)

    max_len_left = self._fixed_length_left
    max_len_right = self._fixed_length_right
    data_pack.left['length_left'] = \
        data_pack.left['length_left'].apply(
            lambda val: min(val, max_len_left))
    data_pack.right['length_right'] = \
        data_pack.right['length_right'].apply(
            lambda val: min(val, max_len_right))
    return data_pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data.

    Works on a copy of ``data_pack`` and produces, in order: truncated
    tokens with their lengths, character indices in the ``char_left`` /
    ``char_right`` columns, vocabulary-transformed text, and the
    ``match_left`` / ``match_right`` columns of the relation frame.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.

    :return: Transformed data as :class:`DataPack` object.
    """
    opts = {'inplace': True, 'verbose': verbose}
    pack = data_pack.copy()

    # Unit chain on both texts, then per-side truncation and lengths.
    pack.apply_on_text(chain_transform(self._units), mode='both', **opts)
    pack.apply_on_text(self._left_truncatedlength_unit.transform,
                       mode='left', **opts)
    pack.apply_on_text(self._right_truncatedlength_unit.transform,
                       mode='right', **opts)
    pack.append_text_length(**opts)

    # Process character representation: the letter unit writes into the
    # renamed char columns, which are then mapped through the char index.
    letter_unit = units.NgramLetter(ngram=1, reduce_dim=False)
    pack.apply_on_text(letter_unit.transform,
                       rename=('char_left', 'char_right'),
                       mode='both', **opts)
    char_terms = self._context['char_unit'].state['term_index']
    char_indexer = units.CharacterIndex(char_terms)
    for frame, column in ((pack.left, 'char_left'),
                          (pack.right, 'char_right')):
        frame[column] = frame[column].apply(char_indexer.transform)

    # Process word representation
    pack.apply_on_text(self._context['vocab_unit'].transform,
                       mode='both', **opts)

    # Process exact match representation: join relation with both text
    # frames so each row carries text_left and text_right side by side.
    pack.relation["match_left"] = ""
    pack.relation["match_right"] = ""
    joined = pack.relation.join(
        pack.left, on='id_left', how='left'
    ).join(pack.right, on='id_right', how='left')
    match_specs = (
        ('match_left', 'text_left', 'text_right'),
        ('match_right', 'text_right', 'text_left'),
    )
    for column, match, to_match in match_specs:
        matcher = units.WordExactMatch(match=match, to_match=to_match)
        pack.relation[column] = joined.apply(matcher.transform, axis=1)
    return pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data.

    Encodes both texts with ``self._tokenizer.encode`` and appends the text
    lengths. The work is done on a copy, so the ``data_pack`` passed in by
    the caller is left untouched.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.

    :return: Transformed data as :class:`DataPack` object.
    """
    # Copy first: the steps below run with ``inplace=True`` and would
    # otherwise modify the caller's object.
    data_pack = data_pack.copy()

    data_pack.apply_on_text(self._tokenizer.encode,
                            mode='both', inplace=True, verbose=verbose)
    data_pack.append_text_length(inplace=True, verbose=verbose)
    return data_pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create truncated length representation.

    A single chained function is applied to the text: the default units,
    then the ``vocab_unit`` from the context, then a ``TruncatedLength``
    unit with ``text_length=30`` and ``truncate_mode='post'``. Text lengths
    are appended afterwards. The work is done on a copy, so the
    ``data_pack`` passed in by the caller is left untouched.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.

    :return: Transformed data as :class:`DataPack` object.
    """
    # Copy first: the steps below run with ``inplace=True`` and would
    # otherwise modify the caller's object.
    data_pack = data_pack.copy()

    # Build a fresh list rather than appending to whatever
    # ``_default_units()`` returned.
    units_ = [
        *self._default_units(),
        self._context['vocab_unit'],
        units.TruncatedLength(text_length=30, truncate_mode='post'),
    ]
    func = chain_transform(units_)
    data_pack.apply_on_text(func, inplace=True, verbose=verbose)
    data_pack.append_text_length(inplace=True, verbose=verbose)
    return data_pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create truncated length representation.

    Steps, in order, on a copy of ``data_pack``: the chained ``self._units``
    on every text, the ``filter_unit`` from the context on the right text
    only, the ``vocab_unit`` from the context on both sides, optional
    per-side truncation (only when the matching ``_truncated_length_*``
    setting is truthy), text length annotation, and ``drop_empty``.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.

    :return: Transformed data as :class:`DataPack` object.
    """
    data_pack = data_pack.copy()
    # ``inplace=True`` is required here: without it the result of the unit
    # chain is returned and discarded, and the later steps would run on
    # the unprocessed text.
    data_pack.apply_on_text(chain_transform(self._units),
                            inplace=True, verbose=verbose)
    data_pack.apply_on_text(self._context['filter_unit'].transform,
                            mode='right', inplace=True, verbose=verbose)
    data_pack.apply_on_text(self._context['vocab_unit'].transform,
                            mode='both', inplace=True, verbose=verbose)
    if self._truncated_length_left:
        data_pack.apply_on_text(self._left_truncatedlength_unit.transform,
                                mode='left', inplace=True, verbose=verbose)
    if self._truncated_length_right:
        data_pack.apply_on_text(self._right_truncatedlength_unit.transform,
                                mode='right', inplace=True, verbose=verbose)
    data_pack.append_text_length(inplace=True, verbose=verbose)
    data_pack.drop_empty(inplace=True)
    return data_pack