# Training loop: build a fresh model, optimizer and loss function, then train
# on randomly sampled training countries.
while True:
    min_test_loss = 1.e6
    loss = 0.0
    train_loss_seq = []
    test_loss_seq = []

    # Instantiate the requested model type
    if model_type == 'Transformer':
        model = TransformerModel(config)
    elif model_type == 'LSTM':
        model = LSTMModel(config)
    if cuda:
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config['train']['learning_rate'],
                                 weight_decay=config['train']['weight_decay'])
    criterion = torch.nn.MSELoss()
    optimizer.zero_grad()

    for it in range(n_iter):
        model.train()
        country = random.choice(train_countries)
        inp, target = get_data_tensor(data, country, measure_mode,
                                      output_mode=output_mode, cuda=cuda)
        out_nn, _ = get_net_output(inp, model_type, model, cuda)
        temp_loss = criterion(out_nn, target)
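        # --- Hypothetical continuation (the original snippet is truncated here).
        # Given the setup above, a typical next step is to accumulate the loss,
        # backpropagate, and update the weights:
        loss = loss + temp_loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss_seq.append(loss.item())
        loss = 0.0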
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target

ntokens = len(TEXT.vocab.stoi)  # the size of vocabulary
emsize = 200  # embedding dimension
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train():
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
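        # --- Hypothetical continuation (the snippet is truncated here). The usual
        # next steps in this kind of loop are the optimizer update and periodic
        # logging of the running loss; `log_interval` is an assumed value.
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| {:5d} batches | ms/batch {:5.2f} | loss {:5.2f}'.format(
                batch, elapsed * 1000 / log_interval, cur_loss))
            total_loss = 0.
            start_time = time.time()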
TRG.build_vocab(train_data, min_freq=2)

# Create model
model = TransformerModel(len(SRC.vocab), len(TRG.vocab), args.d_model,
                         args.n_head, args.num_enc_layers, args.num_dec_layers,
                         args.dim_feedforword, args.dropout,
                         args.activation).to(device)

# Default start epoch / best score (overridden when resuming)
start_epoch, best_bleu = 0, 0.0
if args.resume_model is not None:
    start_epoch, best_bleu = resume_model(model, args.resume_model)

# Run the model in parallel across GPUs if more than one is available
if torch.cuda.device_count() > 1:
    logger.info("Using {} GPUs".format(torch.cuda.device_count()))
    model = nn.DataParallel(model)

# Create loss criterion & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

# Start training
logger.info("Training Started".center(60, '#'))
for epoch in range(start_epoch, args.epochs):
    # Train the model
    train(model, criterion, optimizer, train_iter, device, epoch, logger,
          args.log_interval, writer, TRG)
    # Test the model
    bleu = test(model, criterion, val_iter, device, epoch, logger,
                args.log_interval, writer, TRG)
    # Save model: remember the best BLEU (higher is better) and save checkpoint
    is_best = bleu > best_bleu
    best_bleu = max(bleu, best_bleu)
    save_checkpoint(
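        # --- Hypothetical completion of the truncated save_checkpoint(...) call
        # above; the payload keys and the `args.model_path` argument are
        # assumptions, not the original code:
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_bleu': best_bleu,
        },
        is_best,
        args.model_path)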
], axis=0)

if loss_func == "cross_entropy":
    loss_vals = criterion(pred.transpose(1, 2), labels)
    # Avg of sum of token loss (after ignoring padding tokens)
    # loss = loss_vals
    loss = loss_vals.sum(axis=0).mean()
elif loss_func == "label_smoothing":
    loss = criterion(pred, labels)
loss.backward()

# Clipping
if clipping:
    torch.nn.utils.clip_grad_norm_(model.parameters(), clipping)
optimizer.step()

loss_val = loss.data.item()  # * batch.in_text.size(0)

if verbose >= 2:
    if batch_idx % 500 == 0:
        print("Train: {} loss={}".format(batch_idx, loss_val))
        print("Input: {}".format(
            denumericalize(batch.in_text, OUT_TEXT.vocab)[0]))
        print("True: {}".format(
            denumericalize(batch.out_text, OUT_TEXT.vocab)[0]))
        print("Pred: {}".format(
            denumericalize(pred.argmax(dim=2), OUT_TEXT.vocab)[0]))

training_loss += loss_val
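# --- Hypothetical continuation (not from the original snippet): once the batch
# loop finishes, the accumulated loss is usually averaged over the number of
# batches seen; `num_batches` is an assumed name.
training_loss /= num_batches
print("Epoch training loss: {}".format(training_loss))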
def main():
    args = parse_args()

    if args.deterministic:
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.gpu = 0

    TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                                init_token='<sos>',
                                eos_token='<eos>',
                                lower=False)
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(
        TEXT, root=args.data_dir)
    TEXT.build_vocab(train_txt)

    model = TransformerModel(len(TEXT.vocab.stoi), args.em_size, args.num_heads,
                             args.hid_size, args.num_layers).to(device)
    # model = torch.nn.DataParallel(model, dim=1)

    # optimiser = optim.Adam(model.parameters())
    # optimiser = Ranger(model.parameters())
    optimiser = RAdam(model.parameters())

    if args.eval:
        dataloaders = {
            "test": DataLoader(TextEvalDataset(test_txt, args.ngram, TEXT),
                               batch_size=args.eval_batch_size,
                               shuffle=False)
        }
        if args.resume:
            resume(model, args)
        test_loss, test_acc = eval_pll(device, model, dataloaders["test"], args)
        logger.info(f"Eval: Test Loss = {test_loss}, Test Acc = {test_acc}")
    else:
        dataloaders = {
            "train": DataLoader(TextTrainDataset(train_txt, args.ngram, TEXT,
                                                 args.poisson_rate),
                                batch_size=args.train_batch_size,
                                shuffle=True),
            "val": DataLoader(TextEvalDataset(val_txt, args.ngram, TEXT),
                              batch_size=args.eval_batch_size,
                              shuffle=False),
            "test": DataLoader(TextEvalDataset(test_txt, args.ngram, TEXT),
                               batch_size=args.eval_batch_size,
                               shuffle=False)
        }
        args.start_epoch = 0
        args.best_acc = 1 / args.ngram
        if args.resume:
            resume(model, args, optimiser)

        # Create folder for the current model and save args
        model_dir = time.ctime().replace(" ", "_").replace(":", "_")
        args.model_dir = os.path.join("models", model_dir)
        os.makedirs(args.model_dir, exist_ok=True)
        with open(os.path.join(args.model_dir, "args.json"), "w") as f:
            json.dump(args.__dict__, f, indent=2)

        args.logger = logger
        train_pll(device, model, optimiser, dataloaders, args)
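# --- Not part of the original snippet: the standard entry-point guard one would
# expect at the bottom of such a script.
if __name__ == "__main__":
    main()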