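# Imports and module-level setup assumed by go() below. This is a sketch: the field
# definitions and NUM_CLS are not part of the original snippet, but they match what the
# function expects (batch.text is a (token ids, lengths) pair, labels come out as 1/2,
# and the model is the CTransformer from the `former` package).
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from torchtext import data, datasets  # legacy torchtext Field/BucketIterator API

import tqdm

import former
from former import util

NUM_CLS = 2  # IMDB sentiment: positive / negative

# include_lengths=True makes batch.text a (token ids, lengths) tuple, hence batch.text[0] below
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)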
def go(arg):
    """
    Creates and trains a basic transformer for the IMDB sentiment classification task.
    """
    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the IMDB data
    if arg.final:
        train, test = datasets.IMDB.splits(TEXT, LABEL)

        TEXT.build_vocab(train, max_size=arg.vocab_size - 2)
        LABEL.build_vocab(train)

        train_iter, test_iter = data.BucketIterator.splits(
            (train, test), batch_size=arg.batch_size, device=util.d())
    else:
        tdata, _ = datasets.IMDB.splits(TEXT, LABEL)
        train, test = tdata.split(split_ratio=0.8)

        TEXT.build_vocab(train, max_size=arg.vocab_size - 2)  # - 2 to make space for <unk> and <pad>
        LABEL.build_vocab(train)

        train_iter, test_iter = data.BucketIterator.splits(
            (train, test), batch_size=arg.batch_size, device=util.d())

    print(f"- nr. of training examples {len(train_iter)}")
    print(f'- nr. of {"test" if arg.final else "validation"} examples {len(test_iter)}')

    if arg.max_length < 0:
        # infer the maximum sequence length from the training data, with some headroom
        mx = max([input.text[0].size(1) for input in train_iter])
        mx = mx * 2
        print(f"- maximum sequence length: {mx}")
    else:
        mx = arg.max_length

    # create the model
    model = former.CTransformer(
        emb=arg.embedding_size,
        heads=arg.num_heads,
        depth=arg.depth,
        seq_length=mx,
        num_tokens=arg.vocab_size,
        num_classes=NUM_CLS,
        max_pool=arg.max_pool,
    )
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # training loop
    seen = 0
    for e in range(arg.num_epochs):

        print(f"\n epoch {e}")
        model.train(True)

        for batch in tqdm.tqdm(train_iter):

            # learning rate warmup
            # - we linearly increase the learning rate from 1e-10 to arg.lr over the first
            #   arg.lr_warmup training instances
            if arg.lr_warmup > 0 and seen < arg.lr_warmup:
                lr = max((arg.lr / arg.lr_warmup) * seen, 1e-10)
                for group in opt.param_groups:  # assigning opt.lr directly has no effect on Adam
                    group["lr"] = lr

            opt.zero_grad()

            input = batch.text[0]
            label = batch.label - 1  # labels come out as 1/2, shift to 0/1

            if input.size(1) > mx:
                input = input[:, :mx]  # clip overly long sequences

            out = model(input)
            loss = F.nll_loss(out, label)

            loss.backward()

            # clip gradients
            # - if the total gradient vector has a length > 1, we clip it back down to 1
            if arg.gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

            opt.step()

            seen += input.size(0)
            tbw.add_scalar("classification/train-loss", float(loss.item()), seen)

        # evaluate on the validation/test set after every epoch
        with torch.no_grad():

            model.train(False)
            tot, cor = 0.0, 0.0

            for batch in test_iter:

                input = batch.text[0]
                label = batch.label - 1

                if input.size(1) > mx:
                    input = input[:, :mx]

                out = model(input).argmax(dim=1)

                tot += float(input.size(0))
                cor += float((label == out).sum().item())

            acc = cor / tot
            print(f'-- {"test" if arg.final else "validation"} accuracy {acc:.3}')
            # note: this logs the loss of the last training batch under the test tag
            tbw.add_scalar("classification/test-loss", float(loss.item()), e)
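# Minimal usage sketch (an assumption, not the repository's actual CLI): go() only reads
# the arg.* attributes used above, so an argparse-style Namespace is enough to drive it.
# The values below are illustrative defaults.
if __name__ == "__main__":
    from argparse import Namespace

    options = Namespace(
        tb_dir="./runs/imdb",   # Tensorboard log directory
        final=False,            # False: hold out a validation split; True: use the real test set
        batch_size=4,
        vocab_size=50_000,
        max_length=512,         # set to -1 to infer from the training data
        embedding_size=128,
        num_heads=8,
        depth=6,
        max_pool=True,          # pool over the time dimension rather than using the first token
        lr=1e-4,
        lr_warmup=10_000,
        gradient_clipping=1.0,
        num_epochs=80,
    )
    go(options)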