Example #1
from tokenizers import ByteLevelBPETokenizer


def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a simple byte-level BPE tokenizer and save its vocab/merges files."""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    GPTToken.train([input_file_path], vocab_size=vocab_size, min_frequency=2,
                   special_tokens=["PAD"])
    # Pad with the "PAD" token that was registered during training.
    GPTToken.enable_padding(pad_token="PAD", pad_id=GPTToken.token_to_id("PAD"))
    GPTToken.save_model(output_path)
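# Hedged usage sketch (not part of the original example): corpus.txt and
# bpe_out/ are placeholder names; save_model with no prefix writes vocab.json
# and merges.txt into output_path, which is what gets reloaded below.
import os

from tokenizers import ByteLevelBPETokenizer

os.makedirs("bpe_out", exist_ok=True)  # save_model expects an existing directory
Tok_Train("corpus.txt", vocab_size=8000, output_path="bpe_out")

tok = ByteLevelBPETokenizer("bpe_out/vocab.json", "bpe_out/merges.txt",
                            lowercase=True)
print(tok.encode("hello world").tokens)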
Example #2
    def __init__(self, max_tokens=512):
        # RoBERTa uses a byte-level BPE tokenizer similar to GPT-2
        t = ByteLevelBPETokenizer("tokenizer/vocab.json",
                                  "tokenizer/merges.txt")
        # Wrap every sequence with <s> ... </s> (RoBERTa-style BOS/EOS)
        t._tokenizer.post_processor = BertProcessing(
            ("</s>", t.token_to_id("</s>")),
            ("<s>", t.token_to_id("<s>")),
        )
        t.enable_truncation(max_tokens)
        t.enable_padding(length=max_tokens, pad_token="<pad>",
                         pad_id=t.token_to_id("<pad>"))
        self.tokenizer = t
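# Hedged standalone sketch of what the configuration above produces, assuming
# tokenizer/vocab.json and tokenizer/merges.txt exist: every encoding is wrapped
# in <s> ... </s>, truncated, and padded with <pad> to a fixed length.
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

t = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")
t._tokenizer.post_processor = BertProcessing(
    ("</s>", t.token_to_id("</s>")),
    ("<s>", t.token_to_id("<s>")),
)
t.enable_truncation(16)
t.enable_padding(length=16, pad_token="<pad>", pad_id=t.token_to_id("<pad>"))

enc = t.encode("a short movie review")
print(enc.tokens)          # ['<s>', subword pieces..., '</s>', '<pad>', ...]
print(enc.ids)             # matching ids, padded out to length 16
print(enc.attention_mask)  # 1 for real tokens, 0 for <pad> positions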
def inference():
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing

    # Initialize the tokenizer from the saved model files.
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )

    # Optional preprocessing: add <s> and </s> as BOS and EOS tokens,
    # then pad and truncate every string to a fixed maximum length.
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)

    # Tokenize/encode a single string.
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)

    # Tokenize/encode a batch of strings.
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
from tokenizers import ByteLevelBPETokenizer


class ByteBPETokenizer:
    def __init__(self, vocab_json, merge_txt, max_length=750):
        self.tokenizer = ByteLevelBPETokenizer(vocab_json, merge_txt)
        # Register the special tokens before they are referenced for padding.
        self.tokenizer.add_special_tokens(["[PAD]", "[CLS]"])
        self.tokenizer.enable_truncation(max_length=max_length)
        self.tokenizer.enable_padding(length=max_length,
                                      pad_token="[PAD]",
                                      pad_id=self.tokenizer.token_to_id("[PAD]"))
        # self.tokenizer.post_processor = RobertaProcessing(("</s>", 2), ("<s>", 1))
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def encode(self, review):
        # clean_sentence is an external helper defined elsewhere in the project.
        review = clean_sentence(review)
        encoded = self.tokenizer.encode(review.lower())
        # pp_encoded = self.tokenizer.post_process(encoded)
        return encoded

    def tokenize2Index(self, review, should_stem=False):
        encoded = self.encode(review)
        return encoded.ids

    def trainBPE(self, paths, vocab_size=30000, min_frequency=10, special_tokens=["[PAD]", "[CLS]"]):
        # Trains a fresh tokenizer and writes yelp-bpe-vocab.json / yelp-bpe-merges.txt
        # into yelp_bpe/; it does not replace self.tokenizer.
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens)
        tokenizer.save_model("yelp_bpe/", "yelp-bpe")
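# Hedged usage sketch: assumes the vocab/merges files were produced earlier
# (e.g. by trainBPE, which writes yelp_bpe/yelp-bpe-vocab.json and
# yelp_bpe/yelp-bpe-merges.txt) and that clean_sentence() is defined elsewhere.
bpe = ByteBPETokenizer("yelp_bpe/yelp-bpe-vocab.json",
                       "yelp_bpe/yelp-bpe-merges.txt",
                       max_length=750)
ids = bpe.tokenize2Index("The food was great and the staff were friendly!")
print(len(ids), ids[:10])  # ids padded/truncated to max_length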