import pickle
from pathlib import Path


class CustomSeniorProjectTokenizer(object):
    def __init__(self,
                 TOK_PATH=Path('./senior_proj_itos'),
                 BOS='xxbos', EOS='xxeos', FLD='xxfld', UNK='xxunk', PAD='xxpad',
                 TK_REP='xxrep', TK_WREP='xxwrep', TK_NUM='xxnum', TK_LAUGH='xxlaugh'):
        from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
        from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
        from fastai.text.data import TokenizeProcessor, NumericalizeProcessor

        # Load the saved index-to-string list and wrap it in a fastai Vocab.
        with open(TOK_PATH / "bert_itos_80k_cleaned.pkl", 'rb') as f:
            itos = pickle.load(f)
        self.vocab = Vocab(itos)
        self.tokenizer = Tokenizer(tok_func=ThaiTokenizer, lang='th',
                                   pre_rules=pre_rules_th, post_rules=post_rules_th,
                                   n_cpus=1)
        self.cls_token_id = self.vocab.stoi[BOS]
        self.sep_token_id = self.vocab.stoi[EOS]
        # tokenizer_processor = TokenizeProcessor(tokenizer=tt, chunksize=300000, mark_fields=False)
        # numericalize_processor = NumericalizeProcessor(vocab=vocab)

    def num_special_tokens_to_add(self, pair=False):
        # build_inputs_with_special_tokens adds xxbos and xxeos.
        return 2

    def tokenize(self, text):
        # _process_all_1 tokenizes on a single process; process_all would fan
        # out across n_cpus workers.
        return self.tokenizer._process_all_1([text])[0]
        # return self.tokenizer.process_all([text])[0]

    def convert_tokens_to_ids(self, token_list):
        return self.vocab.numericalize(token_list)

    def build_inputs_with_special_tokens(self, token_list):
        # From https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py#L235
        return [self.cls_token_id] + token_list + [self.sep_token_id]
class CustomSeniorProjectTokenizer(object):
    def __init__(self,
                 TOK_PATH=Path('./senior_proj_itos'),
                 BOS='xxbos', EOS='xxeos', FLD='xxfld', UNK='xxunk', PAD='xxpad',
                 TK_REP='xxrep', TK_WREP='xxwrep', TK_NUM='xxnum', TK_LAUGH='xxlaugh',
                 n_cpus=1):
        from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
        from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
        from fastai.text.data import TokenizeProcessor, NumericalizeProcessor

        with open(TOK_PATH / "bert_itos_80k_cleaned.pkl", 'rb') as f:
            itos = pickle.load(f)
        self.vocab = Vocab(itos)
        self.tokenizer = Tokenizer(tok_func=ThaiTokenizer, lang='th',
                                   pre_rules=pre_rules_th, post_rules=post_rules_th,
                                   n_cpus=n_cpus)
        self.cls_token_id = self.vocab.stoi[BOS]
        self.sep_token_id = self.vocab.stoi[EOS]
        self.pad_token_id = self.vocab.stoi[PAD]
        # FLD is not used elsewhere in this pipeline but is part of the
        # special-token set, so it stands in as the mask token.
        self.mask_token = FLD
        self._pad_token = PAD
        # tokenizer_processor = TokenizeProcessor(tokenizer=tt, chunksize=300000, mark_fields=False)
        # numericalize_processor = NumericalizeProcessor(vocab=vocab)

    def num_special_tokens_to_add(self, pair=False):
        return 2

    def tokenize(self, text):
        return self.tokenizer._process_all_1([text])[0]
        # return self.tokenizer.process_all([text])[0]

    def convert_tokens_to_ids(self, token_list):
        # From https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html#PreTrainedTokenizerFast.convert_tokens_to_ids
        if token_list is None:
            return None
        if isinstance(token_list, str):
            return self.vocab.numericalize([token_list])[0]
        return self.vocab.numericalize(token_list)

    def build_inputs_with_special_tokens(self, token_list):
        # From https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py#L235
        return [self.cls_token_id] + token_list + [self.sep_token_id]

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None,
                                already_has_special_tokens=False):
        # From https://huggingface.co/transformers/_modules/transformers/tokenization_utils.html#PreTrainedTokenizer.get_special_tokens_mask
        """
        Retrieve sequence ids from a token list that has no special tokens
        added. This method is called when adding special tokens using the
        tokenizer's ``prepare_for_model`` method.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: optional list of ids (must not contain special
                tokens), needed when fetching sequence ids for sequence pairs
            already_has_special_tokens: (default False) set to True if the
                token list is already formatted with special tokens for the
                model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token,
            0 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def __len__(self):
        # https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html#PreTrainedTokenizerFast.__len__
        return len(self.vocab.itos)
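# A minimal usage sketch of the class above, assuming the pickled itos file
# and senior_project_util are importable; the Thai sample string is a
# hypothetical placeholder. It walks through the tokenize ->
# convert_tokens_to_ids -> build_inputs_with_special_tokens flow that a
# transformers-style pipeline would call.
custom_tok = CustomSeniorProjectTokenizer()
sample_tokens = custom_tok.tokenize('สวัสดีครับ')        # list of sub-word strings
sample_ids = custom_tok.convert_tokens_to_ids(sample_tokens)
sample_inputs = custom_tok.build_inputs_with_special_tokens(sample_ids)
# xxbos and xxeos account for the two extra ids.
assert len(sample_inputs) == len(sample_ids) + custom_tok.num_special_tokens_to_add()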
vocab = Vocab(itos)


# In[17]:


pyThai_tt = ThaiTokenizer()


# In[18]:


tt = Tokenizer(tok_func=ThaiTokenizer, lang='th',
               pre_rules=pre_rules_th, post_rules=post_rules_th, n_cpus=1)
test_sample = tt._process_all_1([text[:100]])
print(test_sample)
test_sample = [vocab.numericalize(seq) for seq in test_sample]
print(test_sample)
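# A hedged batching sketch using the CustomSeniorProjectTokenizer defined
# above (same assumptions: the pickled itos file and senior_project_util are
# importable; the two Thai sample strings are hypothetical placeholders).
# Numericalized sequences get xxbos/xxeos added, then are right-padded with
# pad_token_id so they can be stacked into a fixed-shape batch.
batch_tok = CustomSeniorProjectTokenizer()
texts = ['ตัวอย่างสั้น', 'ตัวอย่างที่ยาวกว่าเล็กน้อย']
batch = [batch_tok.build_inputs_with_special_tokens(
             batch_tok.convert_tokens_to_ids(batch_tok.tokenize(t)))
         for t in texts]
max_len = max(len(seq) for seq in batch)
padded = [seq + [batch_tok.pad_token_id] * (max_len - len(seq)) for seq in batch]
print(padded)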