def inference():
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing

    '''
    initialize tokenizer with saved model files
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )

    '''
    optional step: post-process the strings, e.g.
        - add <s> and </s> as BOS and EOS tokens to the string
        - pad the string to some max length
        - truncate the string to some max length
    '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)

    '''
    tokenize/encode a single string
    '''
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)

    '''
    tokenize/encode a batch of strings
    '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
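The two checkpoint files loaded above have to be produced by a training step first. Below is a minimal sketch of that step; the corpus path train.txt, the vocab size, and the ./tok_checkpoints directory are assumptions for illustration. Note that the special tokens list must include <s>, </s>, and <pad>, otherwise the token_to_id and get_vocab lookups in inference() will fail.

import os
from tokenizers import ByteLevelBPETokenizer

def train_tokenizer():
    # Train a byte-level BPE tokenizer on a plain-text corpus.
    # "train.txt" and "./tok_checkpoints" are hypothetical paths.
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        ["train.txt"],
        vocab_size=5000,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
        show_progress=False,
    )
    os.makedirs("./tok_checkpoints", exist_ok=True)
    # Writes tokenizer_model-vocab.json and tokenizer_model-merges.txt,
    # the two files that inference() loads.
    tokenizer.save_model("./tok_checkpoints", "tokenizer_model")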
def __init__(self, path, vocab_size=-1, use_bpe=False, tokenizer_data=""):
    self.dictionary = Dictionary()
    if use_bpe:
        assert os.path.exists(path), "Path does not exist: " + path
        print("-------------------------------------------------------------")
        # Train a byte-level BPE tokenizer, either on a separate
        # tokenizer_data corpus or on the dataset's own training split.
        tokenizer = ByteLevelBPETokenizer()
        if len(tokenizer_data) != 0:
            print("Training tokenizer on: " +
                  os.path.join(tokenizer_data, 'train.txt'))
            tokenizer.train([os.path.join(tokenizer_data, 'train.txt')],
                            vocab_size=vocab_size,
                            show_progress=False)
        else:
            print("Training tokenizer on: " + os.path.join(path, 'train.txt'))
            tokenizer.train(
                [
                    os.path.join(path, 'train.txt')
                    # os.path.join(path, 'valid.txt'),
                    # os.path.join(path, 'test.txt')
                ],
                vocab_size=vocab_size,
                show_progress=False)
        print("-------------------------------------------------------------")
        print("Encoding dataset at: " + path)

        # Encode each split into a flat 1-D LongTensor of token ids and
        # record the average number of characters per BPE token.
        with open(os.path.join(path, 'train.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            enc = tokenizer.encode(text)
            ids = torch.LongTensor(len(enc.ids))
            for index, token_id in enumerate(enc.ids):
                ids[index] = token_id
            self.train = ids
            self.dictionary.avg_characters_per_token['train'] = (
                len(text) / len(enc.ids))

        with open(os.path.join(path, 'valid.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            enc = tokenizer.encode(text)
            ids = torch.LongTensor(len(enc.ids))
            for index, token_id in enumerate(enc.ids):
                ids[index] = token_id
            self.valid = ids
            self.dictionary.avg_characters_per_token['valid'] = (
                len(text) / len(enc.ids))

        with open(os.path.join(path, 'test.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            enc = tokenizer.encode(text)
            ids = torch.LongTensor(len(enc.ids))
            for index, token_id in enumerate(enc.ids):
                ids[index] = token_id
            self.test = ids
            self.dictionary.avg_characters_per_token['test'] = (
                len(text) / len(enc.ids))

        print("-------------------------------------------------------------")

        # Expose the BPE vocabulary through the Dictionary interface.
        self.dictionary.word2idx = tokenizer.get_vocab()
        self.dictionary.idx2word = [
            tokenizer.id_to_token(x)
            for x in range(tokenizer.get_vocab_size())
        ]
        self.dictionary.total = tokenizer.get_vocab_size()
    else:
        # Fall back to the word-level tokenizer when BPE is disabled.
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
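For context, a hypothetical call site for this constructor, assuming it belongs to a Corpus class in the style of the usual PyTorch word-language-model corpus readers; the class name and the data directory are made up for illustration, while the attributes accessed (train, dictionary.total, dictionary.avg_characters_per_token) come from the code above.

# Hypothetical usage: 'Corpus' and the path are assumptions.
corpus = Corpus('data/wikitext-2', vocab_size=5000, use_bpe=True)

# Each split is a flat LongTensor of BPE ids, ready for batching.
print(corpus.train.size(), corpus.valid.size(), corpus.test.size())
print("vocab size:", corpus.dictionary.total)
print("chars/token (train):",
      corpus.dictionary.avg_characters_per_token['train'])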