import numpy as np
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast


def preprocess(texts, tokenizer_path, max_len=32):
    """Tokenize raw texts into padded input-id and attention-mask arrays."""
    input_ids, input_masks = [], []
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    # The tokenizer file does not carry special tokens, so register them explicitly.
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = '[PAD]'
    tokenizer.sep_token = '[SEP]'
    tokenizer.cls_token = '[CLS]'
    tokenizer.unk_token = '[UNK]'
    for text in tqdm(texts):
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        encoded = tokenizer.encode_plus(
            text, max_length=max_len, padding='max_length', truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])
    return [np.array(input_ids), np.array(input_masks)]
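# A minimal usage sketch (the tokenizer path and example texts below are
# hypothetical, not taken from this repo): preprocess returns id and mask
# arrays of shape (num_texts, max_len), ready to feed into the model.
#
#   ids, masks = preprocess(['red cotton t-shirt', 'wireless mouse'],
#                           tokenizer_path='tokenizer/tokenizer.json', max_len=32)
#   print(ids.shape, masks.shape)  # (2, 32) (2, 32)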
import argparse
import os

from omegaconf import OmegaConf
from transformers import (DataCollatorForLanguageModeling, DistilBertConfig,
                          DistilBertForMaskedLM, LineByLineTextDataset,
                          PreTrainedTokenizerFast)

DATA_PATH = 'data/item_name.txt'

parser = argparse.ArgumentParser(description='Training language model')
parser.add_argument('--config_path', type=str, default='src/configs/train_lm1.yaml',
                    help='path to config file')
args = parser.parse_args()

config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))

# Disable Weights & Biases logging for this run.
os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = '[PAD]'
tokenizer.sep_token = '[SEP]'
tokenizer.cls_token = '[CLS]'
tokenizer.unk_token = '[UNK]'

# A small DistilBERT configured from scratch for the item-name corpus.
distilbert_config = DistilBertConfig(
    vocab_size=config.vocab_size, n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer, file_path=DATA_PATH, block_size=64)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability)
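# A minimal sketch of how these objects are typically wired into the
# Hugging Face Trainer for masked-LM training; the output directory and
# hyperparameter values below are hypothetical placeholders, not values
# taken from the config file.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='lm_checkpoints',        # hypothetical checkpoint directory
    num_train_epochs=3,                 # hypothetical hyperparameters
    per_device_train_batch_size=64,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
trainer.save_model('lm_checkpoints')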