import pickle
from pathlib import Path


class CustomSeniorProjectTokenizer(object):
    def __init__(self,
                 TOK_PATH=Path('./senior_proj_itos'),
                 BOS='xxbos', EOS='xxeos', FLD='xxfld', UNK='xxunk', PAD='xxpad',
                 TK_REP='xxrep', TK_WREP='xxwrep', TK_NUM='xxnum', TK_LAUGH='xxlaugh'):
        from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
        from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
        from fastai.text.data import TokenizeProcessor, NumericalizeProcessor

        # Load the saved index-to-string list and wrap it in a fastai Vocab.
        with open(TOK_PATH / "bert_itos_80k_cleaned.pkl", 'rb') as f:
            itos = pickle.load(f)
        self.vocab = Vocab(itos)
        self.tokenizer = Tokenizer(tok_func=ThaiTokenizer, lang='th',
                                   pre_rules=pre_rules_th, post_rules=post_rules_th,
                                   n_cpus=1)
        self.cls_token_id = self.vocab.stoi[BOS]
        self.sep_token_id = self.vocab.stoi[EOS]
        # tokenizer_processor = TokenizeProcessor(tokenizer=tt, chunksize=300000, mark_fields=False)
        # numericalize_processor = NumericalizeProcessor(vocab=vocab)

    def num_special_tokens_to_add(self, pair=False):
        # build_inputs_with_special_tokens adds xxbos and xxeos.
        return 2

    def tokenize(self, text):
        # _process_all_1 tokenizes on a single process; process_all would fan
        # out across n_cpus workers.
        return self.tokenizer._process_all_1([text])[0]
        # return self.tokenizer.process_all([text])[0]

    def convert_tokens_to_ids(self, token_list):
        return self.vocab.numericalize(token_list)

    def build_inputs_with_special_tokens(self, token_list):
        # From https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py#L235
        return [self.cls_token_id] + token_list + [self.sep_token_id]
class CustomSeniorProjectTokenizer(object):
    def __init__(self,
                 TOK_PATH=Path('./senior_proj_itos'),
                 BOS='xxbos', EOS='xxeos', FLD='xxfld', UNK='xxunk', PAD='xxpad',
                 TK_REP='xxrep', TK_WREP='xxwrep', TK_NUM='xxnum', TK_LAUGH='xxlaugh',
                 n_cpus=1):
        from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
        from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
        from fastai.text.data import TokenizeProcessor, NumericalizeProcessor

        with open(TOK_PATH / "bert_itos_80k_cleaned.pkl", 'rb') as f:
            itos = pickle.load(f)
        self.vocab = Vocab(itos)
        self.tokenizer = Tokenizer(tok_func=ThaiTokenizer, lang='th',
                                   pre_rules=pre_rules_th, post_rules=post_rules_th,
                                   n_cpus=n_cpus)
        self.cls_token_id = self.vocab.stoi[BOS]
        self.sep_token_id = self.vocab.stoi[EOS]
        self.pad_token_id = self.vocab.stoi[PAD]
        # FLD is not used elsewhere in this pipeline but is part of the
        # special-token set, so it stands in as the mask token.
        self.mask_token = FLD
        self._pad_token = PAD
        # tokenizer_processor = TokenizeProcessor(tokenizer=tt, chunksize=300000, mark_fields=False)
        # numericalize_processor = NumericalizeProcessor(vocab=vocab)

    def num_special_tokens_to_add(self, pair=False):
        return 2

    def tokenize(self, text):
        return self.tokenizer._process_all_1([text])[0]
        # return self.tokenizer.process_all([text])[0]

    def convert_tokens_to_ids(self, token_list):
        # From https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html#PreTrainedTokenizerFast.convert_tokens_to_ids
        if token_list is None:
            return None
        if isinstance(token_list, str):
            return self.vocab.numericalize([token_list])[0]
        return self.vocab.numericalize(token_list)

    def build_inputs_with_special_tokens(self, token_list):
        # From https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py#L235
        return [self.cls_token_id] + token_list + [self.sep_token_id]

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None,
                                already_has_special_tokens=False):
        # From https://huggingface.co/transformers/_modules/transformers/tokenization_utils.html#PreTrainedTokenizer.get_special_tokens_mask
        """
        Retrieve sequence ids from a token list that has no special tokens
        added. This method is called when adding special tokens using the
        tokenizer's ``prepare_for_model`` method.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: optional list of ids (must not contain special
                tokens), needed when fetching sequence ids for sequence pairs
            already_has_special_tokens: (default False) set to True if the
                token list is already formatted with special tokens for the
                model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token,
            0 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def __len__(self):
        # https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html#PreTrainedTokenizerFast.__len__
        return len(self.vocab.itos)
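# A minimal usage sketch of the class above, assuming the pickled itos file
# and senior_project_util are importable; the Thai sample string is a
# hypothetical placeholder. It walks through the tokenize ->
# convert_tokens_to_ids -> build_inputs_with_special_tokens flow that a
# transformers-style pipeline would call.
custom_tok = CustomSeniorProjectTokenizer()
sample_tokens = custom_tok.tokenize('สวัสดีครับ')        # list of sub-word strings
sample_ids = custom_tok.convert_tokens_to_ids(sample_tokens)
sample_inputs = custom_tok.build_inputs_with_special_tokens(sample_ids)
# xxbos and xxeos account for the two extra ids.
assert len(sample_inputs) == len(sample_ids) + custom_tok.num_special_tokens_to_add()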
vocab = Vocab(itos)


# In[17]:


pyThai_tt = ThaiTokenizer()


# In[18]:


tt = Tokenizer(tok_func=ThaiTokenizer, lang='th',
               pre_rules=pre_rules_th, post_rules=post_rules_th, n_cpus=1)
test_sample = tt._process_all_1([text[:100]])
print(test_sample)
test_sample = [vocab.numericalize(seq) for seq in test_sample]
print(test_sample)
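# A hedged batching sketch using the CustomSeniorProjectTokenizer defined
# above (same assumptions: the pickled itos file and senior_project_util are
# importable; the two Thai sample strings are hypothetical placeholders).
# Numericalized sequences get xxbos/xxeos added, then are right-padded with
# pad_token_id so they can be stacked into a fixed-shape batch.
batch_tok = CustomSeniorProjectTokenizer()
texts = ['ตัวอย่างสั้น', 'ตัวอย่างที่ยาวกว่าเล็กน้อย']
batch = [batch_tok.build_inputs_with_special_tokens(
             batch_tok.convert_tokens_to_ids(batch_tok.tokenize(t)))
         for t in texts]
max_len = max(len(seq) for seq in batch)
padded = [seq + [batch_tok.pad_token_id] * (max_len - len(seq)) for seq in batch]
print(padded)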