import numpy as np
import pytest

from tokenizers import BertWordPieceTokenizer


def test_encode_formats(self, bert_files):
    tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

    # Well formed
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

    output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]
    output = tokenizer.encode_batch([("my name is john", "pair"), ("my name is john", "pair")])
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]

    # Mal formed: each bad input gets its own `raises` block so that every call
    # actually executes (any statement after the first raise would be skipped)
    with pytest.raises(ValueError, match="InputSequence must be str"):
        tokenizer.encode([["my", "name"]])
    with pytest.raises(ValueError, match="InputSequence must be str"):
        tokenizer.encode("My name is john", [["pair"]])
    with pytest.raises(ValueError, match="InputSequence must be str"):
        tokenizer.encode("my name is john", ["pair"])

    with pytest.raises(ValueError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", is_pretokenized=True)
    with pytest.raises(ValueError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
    with pytest.raises(ValueError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
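# Note on the "Mal formed" checks above: `pytest.raises` exits its block at the
# first exception, so bundling several calls under one `with` would silently skip
# all but the first. A minimal illustration:
with pytest.raises(ValueError):
    int("not a number")  # raises ValueError; the block exits here
    int("never reached")  # would not execute if placed in the same block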
def test_encode_formats(self, bert_files):
    with pytest.deprecated_call():
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

    # Encode
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

    # Encode batch
    result_single = [
        ["[CLS]", "my", "name", "is", "john", "[SEP]"],
        ["[CLS]", "my", "name", "is", "georges", "[SEP]"],
    ]
    result_pair = [
        ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"],
        ["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
    ]

    def format(encodings):
        return [e.tokens for e in encodings]

    def test_single(input, is_pretokenized=False):
        output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
        assert format(output) == result_single

    def test_pair(input, is_pretokenized=False):
        output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
        assert format(output) == result_pair

    # Classic inputs

    # Lists
    test_single(["My name is John", "My name is Georges"])
    test_pair([("my name is john", "pair"), ("my name is georges", "pair")])
    test_pair([["my name is john", "pair"], ["my name is georges", "pair"]])

    # Tuples
    test_single(("My name is John", "My name is Georges"))
    test_pair((("My name is John", "pair"), ("My name is Georges", "pair")))

    # Numpy
    test_single(np.array(["My name is John", "My name is Georges"]))
    test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
    test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))

    # PreTokenized inputs

    # Lists
    test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
    test_pair(
        [
            (["My", "name", "is", "John"], ["pair"]),
            (["My", "name", "is", "Georges"], ["pair"]),
        ],
        True,
    )
    test_pair(
        [
            [["My", "name", "is", "John"], ["pair"]],
            [["My", "name", "is", "Georges"], ["pair"]],
        ],
        True,
    )

    # Tuples
    test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
    test_pair(
        (
            (("My", "name", "is", "John"), ("pair",)),
            (("My", "name", "is", "Georges"), ("pair",)),
        ),
        True,
    )
    test_pair(
        (
            (["My", "name", "is", "John"], ["pair"]),
            (["My", "name", "is", "Georges"], ["pair"]),
        ),
        True,
    )

    # Numpy
    test_single(
        np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
        True,
    )
    test_single(
        np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
        True,
    )
    test_pair(
        np.array(
            [
                [["My", "name", "is", "John"], ["pair"]],
                [["My", "name", "is", "Georges"], ["pair"]],
            ],
            dtype=object,
        ),
        True,
    )
    test_pair(
        np.array(
            (
                (("My", "name", "is", "John"), ("pair",)),
                (("My", "name", "is", "Georges"), ("pair",)),
            ),
            dtype=object,
        ),
        True,
    )

    # Mal formed
    with pytest.raises(TypeError, match="TextInputSequence must be str"):
        tokenizer.encode([["my", "name"]])
    with pytest.raises(TypeError, match="TextInputSequence must be str"):
        tokenizer.encode("My name is john", [["pair"]])
    with pytest.raises(TypeError, match="TextInputSequence must be str"):
        tokenizer.encode("my name is john", ["pair"])

    with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", is_pretokenized=True)
    with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
    with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
from tokenizers import BertWordPieceTokenizer
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

tokenizer = BertWordPieceTokenizer(vocab_file="./FaBerto/vocab.txt")
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
tokenizer.enable_truncation(max_length=512)

# Sanity-check the tokenizer on Persian input
print(tokenizer.encode("مرکزگرایی یکی از عوامل اصلی خداناباوری است.").tokens)  # "Centralism is one of the main factors of atheism."
print(tokenizer.encode("این یک تست است.").ids)  # "This is a test."

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./FaBerto", max_len=512)
model = RobertaForMaskedLM(config=config)
print(model.num_parameters())
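# The imports of LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer,
# and TrainingArguments above are otherwise unused, which suggests the intended
# next step: masked-LM pre-training. A minimal sketch under that assumption; the
# corpus path "./fa_corpus.txt" and every hyperparameter below are placeholders,
# not values from the original script.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,  # the RobertaTokenizerFast loaded above
    file_path="./fa_corpus.txt",  # hypothetical one-sentence-per-line corpus
    block_size=128,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir="./FaBerto",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
trainer.save_model("./FaBerto")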