import torch
import torch.nn as nn
from torch.optim import AdamW  # assumption: AdamW could equally come from `transformers`
from tqdm import tqdm

# `model`, `tokenizer`, `inputs`, and `labels` are assumed to be defined in earlier cells.

# Inspect the logits from a previous forward pass and decode the argmax tokens.
pred.shape
tokenizer.decode(torch.argmax(pred, dim=-1).squeeze(0))

loss_fn = nn.CrossEntropyLoss()
masked_lm_loss = loss_fn(pred.view(-1, tokenizer.vocab_size), labels.view(-1))
masked_lm_loss

# Repeatedly train on the same fixed batch as a sanity check: the loss should drop steadily.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
total_loss = 0.0
model.train()
model.to(device)
inputs = inputs.to(device)
labels = labels.to(device)
loss = []
optimizer = AdamW(params=model.parameters())

for _ in tqdm(range(100000)):
    pred = model(inputs)
    mlm_loss = loss_fn(pred.view(-1, tokenizer.vocab_size), labels.view(-1))
    total_loss += mlm_loss.item()
    loss.append(mlm_loss.item())
    mlm_loss.backward()
    optimizer.step()
    model.zero_grad()
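# For reference, a minimal sketch of why both tensors are flattened before the
# loss call. The shapes here are made up; only vocab_size=13137 is taken from
# the model below. nn.CrossEntropyLoss expects (N, C) logits and (N,) integer
# targets, so the batch and sequence dimensions are merged into one.
batch, seq_len, vocab = 2, 8, 13137
toy_logits = torch.randn(batch, seq_len, vocab)            # model output: (B, T, V)
toy_targets = torch.randint(0, vocab, (batch, seq_len))    # token ids:    (B, T)
toy_loss = nn.CrossEntropyLoss()(toy_logits.view(-1, vocab), toy_targets.view(-1))
print(toy_loss.item())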
import os
import random

from reformer_pytorch import ReformerLM, TrainingWrapper


def train(device='cpu', output_dir='model', epochs=5, save_step=5, batch_size=4):
    # `tokenizer` and `get_data` are assumed to be defined in earlier cells.
    model = ReformerLM(
        num_tokens=13137,
        dim=128,
        depth=12,
        max_seq_len=4096,
        lsh_dropout=0.1,
        causal=True,
        full_attn_thres=128,
    )
    model = TrainingWrapper(model, ignore_index=0, pad_value=0).to(device)

    # Resume from existing checkpoints when they are available.
    os.makedirs(output_dir, exist_ok=True)
    model_cpu_path = os.path.join(output_dir, 'model_cpu.pt')
    optimizer_path = os.path.join(output_dir, 'optimizer.pt')
    if os.path.exists(model_cpu_path):
        model.load_state_dict(torch.load(model_cpu_path))
    model.train()

    optimizer = AdamW(params=model.parameters())
    if os.path.exists(optimizer_path):
        optimizer.load_state_dict(torch.load(optimizer_path))

    total_loss = 0.0
    loss = []
    data = list(get_data("data/train.json", tokenizer))

    # Ignore padded positions (token id 0), matching the TrainingWrapper settings above.
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)

    for _ in tqdm(range(epochs)):
        random.shuffle(data)
        inputs, labels = [], []
        for i, it in enumerate(data):
            inputs.append(it['keywords'])
            labels.append(it['text'])
            # Accumulate until a full batch is ready (fixes the off-by-one in the
            # original `i % batch_size` check, which made the first batch oversized).
            if len(inputs) < batch_size:
                continue
            inputs_batch = torch.tensor(inputs).long().to(device)
            labels_batch = torch.tensor(labels).long().to(device)
            inputs, labels = [], []

            pred = model(inputs_batch)
            mlm_loss = loss_fn(pred.view(-1, tokenizer.vocab_size), labels_batch.view(-1))
            total_loss += mlm_loss.item()
            loss.append(mlm_loss.item())
            print('loss', mlm_loss.item())

            mlm_loss.backward()
            optimizer.step()
            model.zero_grad()

            # Periodic checkpoint.
            if i % save_step == 0 and i != 0:
                torch.save(model.state_dict(), model_cpu_path)
                torch.save(optimizer.state_dict(), optimizer_path)

    # Final checkpoint after the last epoch.
    torch.save(model.state_dict(), model_cpu_path)
    torch.save(optimizer.state_dict(), optimizer_path)
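# Assuming `tokenizer` and data/train.json from the earlier cells, a run could be
# started like this (the call below is illustrative, not from the original script;
# checkpoints land in `output_dir`):
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
train(device=device, output_dir='model', epochs=5, save_step=100, batch_size=4)

# reformer-pytorch's TrainingWrapper also exposes a `generate` helper (per the
# library's README). A sketch only: `train` would need to return the wrapped
# model for this to run, and the prompt text is made up.
# prompt = torch.tensor([tokenizer.encode("some keywords")]).long().to(device)
# sample = model.generate(prompt, 100, temperature=1.0, filter_thres=0.9)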