import logging
import pickle

import torch
import torch.optim as optim

# Project-local helpers (Config, Tokenizer, build_model, make_itf, seed_everything,
# LabelSmoothing, get_optimizer, make_train_data_from_txt, DialogDataset,
# BalancedDataLoader, one_cycle, evaluate) are assumed to come from this repo's
# own modules; their import paths are omitted here.


def test_itf():
    # Inspect token frequencies (and the derived inverse-token-frequency
    # weights) over the training corpus.
    tokenizer = Tokenizer.from_pretrained(Config.model_name)
    if Config.use_pickle:
        with open(Config.pickle_path, 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    counter, itf = make_itf(train_data, Config.vocab_size, tokenizer)
    # itf = (itf - itf.min()) / (itf.max() - itf.min())
    # for i in range(itf.size(0)):
    #     print(i, itf[i])
    # itf[itf == 0] += 1e-6
    for k, v in counter.most_common(len(counter)):
        print(tokenizer.decode([k]), v)
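# For reference, a minimal sketch of what make_itf presumably computes (assumed
# behavior -- the repo's actual make_itf may differ): count token-id occurrences
# across the corpus and weight each id by its inverse frequency. The name
# make_itf_sketch is hypothetical, and train_data is assumed to be an iterable
# of token-id sequences.
from collections import Counter


def make_itf_sketch(train_data, vocab_size):
    counter = Counter(tok for seq in train_data for tok in seq)
    itf = torch.zeros(vocab_size)
    for tok_id, freq in counter.items():
        itf[tok_id] = 1.0 / freq  # rare tokens get larger weights
    return counter, itf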
seed_everything(Config.seed)
device = torch.device(Config.device)
start_epoch = 0

logging.info('Define Models')
model = build_model(Config).to(device)
tokenizer = Tokenizer.from_pretrained(Config.model_name)

logging.info('Define Loss and Optimizer')
criterion = LabelSmoothing(tokenizer.vocab_size, pad_id=tokenizer.pad_token_id,
                           smoothing=Config.smoothing)
# lr=0 here: the schedule wrapper below overwrites the learning rate per step.
_opt = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
optimizer = get_optimizer(_opt, factor=Config.factor, warmup=Config.warmup)

logging.info('Preparing training data')
if Config.use_pickle:
    with open(Config.pickle_path, 'rb') as f:
        train_data = pickle.load(f)
else:
    train_data = make_train_data_from_txt(Config, tokenizer)
dataset = DialogDataset(train_data, tokenizer)

logging.info('Start Training')
for epoch in range(start_epoch, Config.n_epoch):
    one_cycle(epoch, Config, model, optimizer, criterion,
              BalancedDataLoader(dataset, tokenizer.pad_token_id),
              tokenizer, device)
    # Spot-check generation each epoch with a fixed Japanese prompt
    # ("I'm tired already").
    evaluate(Config, 'もう疲れたー', tokenizer, model, device)
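# The lr=0 Adam with betas=(0.9, 0.98), eps=1e-9 wrapped by
# get_optimizer(factor=..., warmup=...) matches the "Noam" schedule from
# "Attention Is All You Need": linear warmup followed by inverse-square-root
# decay. A minimal sketch of such a wrapper (assumed behavior -- the repo's
# get_optimizer may differ; NoamOpt is a hypothetical name):
class NoamOpt:
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step=None):
        # lr = factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5)
        step = self._step if step is None else step
        return self.factor * (self.model_size ** -0.5 *
                              min(step ** -0.5, step * self.warmup ** -1.5))

    def step(self):
        self._step += 1
        for group in self.optimizer.param_groups:
            group['lr'] = self.rate()
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()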