Example #1
def main():
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=get_file(),
                    vocab_size=config.VOCAB_SIZE,
                    min_frequency=config.MIN_FREQUENCY,
                    special_tokens=config.SPECIAL_TOKENS)

    tokenizer.save_model(config.TOKENIZER_PATH)
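
The snippet above relies on a project-specific get_file() helper and config module that are not shown. A minimal sketch of what they might look like (all values below are assumptions, not the original project's settings):

from pathlib import Path

class config:
    # Hypothetical settings; the original project's values are not shown.
    VOCAB_SIZE = 52_000
    MIN_FREQUENCY = 2
    SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    TOKENIZER_PATH = "tokenizer"

def get_file():
    # Collect every plain-text corpus file from an assumed data directory.
    return [str(p) for p in Path("data/").glob("*.txt")]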
Example #2
    def pretrain_tokenization(self):
        paths = [str(x) for x in Path("handler/datadir/").glob("*-train.txt")]
        print(paths)
        tokenizer = ByteLevelBPETokenizer()

        tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

        tokenizer.save_model(".", "danbert-small")
Example #3
 def _fit_tokenizer(
     path_to_text_file: Union[str, List[str]],
     tokenizer: ByteLevelBPETokenizer,
     vocabulary_size: int,
 ) -> None:
     tokenizer.train(
         path_to_text_file,
         vocabulary_size,
         special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
     )
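
The two positional arguments above map onto the files and vocab_size parameters of ByteLevelBPETokenizer.train. A minimal call sketch, with illustrative values standing in for the project's token constants and corpus path:

# Illustrative stand-ins for the constants imported elsewhere in the project.
SOS_TOKEN, EOD_TOKEN, PAD_TOKEN, UNK_TOKEN = "<s>", "</s>", "<pad>", "<unk>"

tokenizer = ByteLevelBPETokenizer()
_fit_tokenizer("corpus.txt", tokenizer, 30_000)  # path and size are assumptions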
Example #4
def train_tok(txt_dir, tokenizer_dir):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=txt_dir,
                    vocab_size=52_000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.save_model(tokenizer_dir)
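
A minimal usage sketch for train_tok; the files argument of train expects a file path or list of paths rather than a directory, so txt_dir is assumed here to already be such a list (paths are illustrative):

import os
from pathlib import Path

txt_files = [str(p) for p in Path("data/").glob("*.txt")]  # assumed corpus location
os.makedirs("tokenizer_out", exist_ok=True)                # save_model needs an existing directory
train_tok(txt_files, "tokenizer_out")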
Example #5
def test_language_model_dataset_fit_tokenizer_should_call_the_train_method_of_bpe_tokenizer(
):
    # Given
    language_modeling_dataset = LanguageModelingDataset(1, 1)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train = MagicMock()
    language_modeling_dataset.set_tokenizer(tokenizer)

    # When
    language_modeling_dataset._fit_tokenizer(FAKE_PATH_FOR_TEST, tokenizer,
                                             300)

    # Then
    tokenizer.train.assert_called_with(
        FAKE_PATH_FOR_TEST,
        300,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
Example #6
                                  num_attention_heads=6,
                                  num_hidden_layers=3,
                                  epochs=5,
                                  batch_size=30,
                                  val_batch_size=60,
                                  eval_steps=50,
                                  **kwargs):
    # instantiate tokenizer
    bpe_tokenizer = ByteLevelBPETokenizer()
    # train tokenizer
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train([input_path, input_path_val],
                        vocab_size=vocab_size,
                        min_frequency=min_freq,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])
    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    os.makedirs(tok_path, exist_ok=True)
    bpe_tokenizer.save_model(tok_path)

    # load tokenizer with Roberta configuration
    bpe_tokenizer = RobertaTokenizerFast.from_pretrained(tok_path,
                                                         max_len=max_len)

    # create data objects
    dataset_gen = LineByLineTextDataset(tokenizer=bpe_tokenizer,
Example #7
corpus_length = 6_993_330  # run wc -l to check the number of lines
vocab_size = 150_000

# Dataset files
# --------------------------------------------------
paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte Level Tokenization
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
# Save files to disk
tokenizer.save_model("BR_BERTo")
# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
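
As a quick sanity check (a hedged sketch; the sample sentence is arbitrary), the reloaded tokenizer should now wrap every encoding in the <s> ... </s> pair added by BertProcessing:

encoding = tokenizer.encode("Bom dia, tudo bem?")  # arbitrary Portuguese sample
print(encoding.tokens)  # expected to start with "<s>" and end with "</s>"
print(encoding.ids)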
Example #8
                    default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--data-dir',
                    type=str,
                    default=os.environ['SM_CHANNEL_TRAINING'])

args = parser.parse_args()

paths = [str(x) for x in Path(args.data_dir).glob("**/*.txt")]
print("data files")
print(paths)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Need to save it to model dir for inference
tokenizer.save_model(args.model_dir)

tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                  os.path.join(args.model_dir, "merges.txt"))

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")))
tokenizer.enable_truncation(max_length=args.token_max_len)

print(tokenizer.encode("Nay, but speak not."))
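
If the rest of the training script goes through the transformers library, the saved vocab.json / merges.txt pair can be reloaded as a fast tokenizer, mirroring the pattern in Example #6 (a sketch under that assumption):

from transformers import RobertaTokenizerFast

# Reload the trained vocabulary and merges as a transformers tokenizer.
hf_tokenizer = RobertaTokenizerFast.from_pretrained(args.model_dir,
                                                    max_len=args.token_max_len)
print(hf_tokenizer.tokenize("Nay, but speak not."))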
Example #9
import os
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="kant.txt",
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.save_model(SAVE_MODEL)
tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)

tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,