def configure(self):
    """Build the SentencePiece BPE tokenizer from config and cache token ids.

    Reads ``vocabulary_file`` and ``merges_file`` paths from the config,
    instantiates the tokenizer, stores the ``add_extra_symbols`` flag, and
    resolves the numeric token ids for the start/end-of-sequence symbols
    (``sos_symbol`` / ``eos_symbol``) into ``self.idx``.

    Raises:
        Whatever ``UnsupportedPackage.raise_error`` raises when the
        ``tokenizers`` package is not installed.
    """
    # When the optional dependency failed to import, the name is bound to an
    # UnsupportedPackage placeholder — report the missing package and bail.
    if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
        SentencePieceBPETokenizer.raise_error(self.__provider__)

    vocab_path = str(self.get_value_from_config('vocabulary_file'))
    merges_path = str(self.get_value_from_config('merges_file'))
    self.tokenizer = SentencePieceBPETokenizer(vocab_path, merges_path)

    self.add_extra_symbols = self.get_value_from_config('add_extra_symbols')

    # Map each special-symbol name to its integer token id in the vocabulary.
    self.idx = {
        symbol: self.tokenizer.token_to_id(
            str(self.get_value_from_config(symbol + '_symbol')))
        for symbol in ('sos', 'eos')
    }
def configure(self):
    """Build the SentencePiece BPE tokenizer and record special-symbol strings.

    Reads ``vocabulary_file`` and ``merges_file`` paths from the config,
    instantiates the tokenizer, stores the ``remove_extra_symbols`` flag,
    keeps the textual sos/eos/pad symbols in ``self.idx`` (as strings, not
    token ids), and remembers which model output to read
    (``output_name``). ``output_checked`` starts False — presumably flipped
    once the output name has been validated elsewhere (not visible here).

    Raises:
        Whatever ``UnsupportedPackage.raise_error`` raises when the
        ``tokenizers`` package is not installed.
    """
    # Import-failure placeholder check: abort with a helpful error if the
    # tokenizers package is unavailable.
    if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
        SentencePieceBPETokenizer.raise_error(self.__provider__)

    self.tokenizer = SentencePieceBPETokenizer(
        str(self.get_value_from_config('vocabulary_file')),
        str(self.get_value_from_config('merges_file')),
    )

    self.remove_extra_symbols = self.get_value_from_config('remove_extra_symbols')

    # Unlike the id-based variant, these are the raw symbol strings so they
    # can be stripped from decoded text later.
    self.idx = {
        symbol: str(self.get_value_from_config(symbol + '_symbol'))
        for symbol in ('sos', 'eos', 'pad')
    }

    self.output_name = self.get_value_from_config('output_name')
    self.output_checked = False