def main(): """ Main function Here, you should instantiate 1) DataLoaders for training and validation. Try SubsetRandomSampler to create these DataLoaders. 3) model 4) optimizer 5) cost function: use torch.nn.CrossEntropyLoss """ parser = argparse.ArgumentParser() parser.add_argument('--val_ratio', type=float, default=.5, help='The ratio for valid set') parser.add_argument('--n_layers', type=int, default=4, help='Number of stacked RNN layers') parser.add_argument('--n_hidden', type=int, default=512, help='Number of hidden neurons of RNN cells') parser.add_argument('--drop_prob', type=float, default=0.1, help='Dropout probability') parser.add_argument('--num_epochs', type=int, default=100, help='The number of epochs') parser.add_argument('--lr', type=float, default=0.001, help='Learning rate') parser.add_argument('--device', type=str, default='gpu', help='For cpu: \'cpu\', for gpu: \'gpu\'') parser.add_argument('--batch_size', type=int, default=256, help='Size of batches for training') parser.add_argument('--model_save_dir', type=str, default='../model', help='Directory for saving model.') parser.add_argument('--results_save_dir', type=str, default='../results', help='Directory for saving results.') parser.add_argument('--rnn', type=bool, default=True, help='Train vanilla rnn model') parser.add_argument('--lstm', type=bool, default=True, help='Train lstm model') parser.add_argument('--chunk_size', type=int, default=30, help='Chunk size(sequence length)') parser.add_argument('--s_step', type=int, default=3, help='Sequence step') args = parser.parse_args() n_cpu = multiprocessing.cpu_count() if args.device == 'gpu': args.device = 'cuda' device = torch.device(args.device) chunk_size = args.chunk_size s_step = args.s_step num_epochs = args.num_epochs batch_size = args.batch_size val_ratio = args.val_ratio shuffle_dataset = True random_seed = 42 datasets = dataset.Shakespeare('shakespeare_train.txt', chunk_size, s_step) dataset_size = len(datasets) indices = 
list(range(dataset_size)) split = int(np.floor(val_ratio * dataset_size)) if shuffle_dataset: np.random.seed(random_seed) np.random.shuffle(indices) train_indices, val_indices = indices[split:], indices[:split] train_sampler = SubsetRandomSampler(train_indices) valid_sampler = SubsetRandomSampler(val_indices) trn_loader = DataLoader(datasets, batch_size=batch_size, sampler=train_sampler, num_workers=n_cpu) val_loader = DataLoader(datasets, batch_size=batch_size, sampler=valid_sampler, num_workers=n_cpu) chars = datasets.chars print('-----Train Vanilla RNN Model-----') if args.rnn: model = CharRNN(chars, args).to(device) optimizer = Adam(model.parameters(), lr=args.lr) criterion = nn.CrossEntropyLoss() rnn_trn_loss, rnn_val_loss = [], [] best_val_loss = np.inf for epoch in range(args.num_epochs): epoch_time = time.time() trn_loss = train(model, trn_loader, device, criterion, optimizer) val_loss = validate(model, val_loader, device, criterion) rnn_trn_loss.append(trn_loss) rnn_val_loss.append(val_loss) print('Epoch: %3s/%s...' % (epoch + 1, num_epochs), 'Train Loss: %.4f...' % trn_loss, 'Val Loss: %.4f...' 
% val_loss, 'Time: %.4f' % (time.time() - epoch_time)) if val_loss < best_val_loss: best_val_loss = val_loss torch.save(model.state_dict(), '%s/rnn.pt' % args.model_save_dir) value, idx = np.array(rnn_val_loss).min(), np.array( rnn_val_loss).argmin() plt.figure(figsize=(8, 6)) plt.title('Vanilla RNN Model training and validation loss') plt.plot(np.arange(1, args.num_epochs + 1), rnn_trn_loss, 'g', label='Train Loss') plt.plot(np.arange(1, args.num_epochs + 1), rnn_val_loss, 'r', label='Val Loss') plt.grid(True) plt.legend(loc='upper right') plt.annotate('min epoch: %s \n\ min valid loss: %.5f' % (idx, value), (idx, value), xytext=(-60, 20), textcoords='offset points', arrowprops={'arrowstyle': '->'}) plt.savefig('%s/rnn_loss.png' % args.results_save_dir, dpi=300) print('-----Train LSTM Model-----') if args.lstm: model = CharLSTM(chars, args).to(device) optimizer = Adam(model.parameters(), lr=args.lr) criterion = nn.CrossEntropyLoss() lstm_trn_loss, lstm_val_loss = [], [] best_val_loss = np.inf for epoch in range(args.num_epochs): epoch_time = time.time() trn_loss = train(model, trn_loader, device, criterion, optimizer) val_loss = validate(model, val_loader, device, criterion) lstm_trn_loss.append(trn_loss) lstm_val_loss.append(val_loss) print('Epoch: %3s/%s...' % (epoch + 1, num_epochs), 'Train Loss: %.4f...' % trn_loss, 'Val Loss: %.4f...' 
% val_loss, 'Time: %.4f' % (time.time() - epoch_time)) if val_loss < best_val_loss: best_val_loss = val_loss torch.save(model.state_dict(), '%s/lstm.pt' % args.model_save_dir) value, idx = np.array(lstm_val_loss).min(), np.array( lstm_val_loss).argmin() plt.figure(figsize=(8, 6)) plt.title('LSTM Model training and validation loss') plt.plot(np.arange(1, args.num_epochs + 1), lstm_trn_loss, 'g', label='Train Loss') plt.plot(np.arange(1, args.num_epochs + 1), lstm_val_loss, 'r', label='Val Loss') plt.grid(True) plt.legend(loc='upper right') plt.annotate('min epoch: %s \n\ min valid loss: %.5f' % (idx, value), (idx, value), xytext=(-60, 20), textcoords='offset points', arrowprops={'arrowstyle': '->'}) plt.savefig('%s/lstm_loss.png' % args.results_save_dir, dpi=300)
def train(opt, x_train, x_val, dictionary_len):
    """Train a character-level RNN language model.

    Arguments
    ---------
    opt : argparse.Namespace
        Run configuration; fields read here: batch_size, seq_length,
        nologs, onlytrain, dropout, hidden_size, layers, lr, scheduler.
    x_train : training text data.
    x_val : validation text data (unused when ``opt.onlytrain`` is set).
    dictionary_len : int
        Vocabulary size; also the width of the model's output layer.

    Side effects: writes TensorBoard logs under ``logs/`` and a model
    checkpoint under ``weights/<timestamp>/`` every epoch.
    """
    # Fix all RNGs so runs are reproducible.
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    # Declaring the hyperparameters
    batch_size = opt.batch_size
    seq_length = int(opt.seq_length)
    epochs = 50  # NOTE(review): hard-coded; opt exposes no epochs field here

    if torch.cuda.is_available():
        device = "cuda"
        torch.cuda.manual_seed_all(0)
    else:
        device = "cpu"
    print(device)

    date = datetime.now().strftime('%y%m%d%H%M%S')
    if opt.nologs:
        writer = SummaryWriter(log_dir=f'logs/nologs/')
    else:
        writer = SummaryWriter(log_dir=f'logs/logs_{date}/')

    y_train = get_labels_text_prediction(x_train)
    train_dataset = TextDataset(x_train, y_train, max_len=seq_length)
    if not opt.onlytrain:
        y_val = get_labels_text_prediction(x_val)
        val_dataset = TextDataset(x_val, y_val, max_len=seq_length)
        val_loader = DataLoader(dataset=val_dataset,
                                pin_memory=device == 'cuda',
                                batch_size=batch_size,
                                shuffle=False)
    train_loader = DataLoader(dataset=train_dataset,
                              pin_memory=device == 'cuda',
                              batch_size=batch_size,
                              shuffle=True)

    model_params = {
        'dictionary_len': dictionary_len,
        'dropout': opt.dropout,
        'hidden_size': opt.hidden_size,
        'layers': opt.layers,
        'embedding_len': 32,
        'device': device,
        'lr': opt.lr
    }
    model = CharRNN(**model_params).to(device)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.CrossEntropyLoss()
    if opt.scheduler:
        scheduler = ReduceLROnPlateau(optimizer, 'min', cooldown=3,
                                      factor=0.5, patience=10)

    global_step = 0
    for j in trange(epochs, desc='Training LSTM...'):
        # -------- training phase --------
        model.train()  # set once per phase rather than once per batch
        for i, (x, y) in enumerate(train_loader):
            if i == len(train_loader) - 1:
                # Last batch may be short; padding for it is not
                # implemented yet, so it is skipped.
                print("FER PADDING - DE MOMENT NO VA")
                continue
            x = x.to(device)
            y = y.to(device)

            # Delete past gradients before the new backward pass.
            optimizer.zero_grad()

            # Forward pass: pred is per-position logits over the vocabulary.
            pred, (state_h, state_c) = model(x)

            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) so
            # CrossEntropyLoss sees one prediction per character.
            pred2 = pred.view(-1, dictionary_len)
            y2 = y.view(-1)
            loss = criterion(pred2, y2)
            loss_value = loss.item()

            loss.backward()
            optimizer.step()

            global_step += 1
            if i % 100 == 0:
                writer.add_scalar('train/loss', loss_value, global_step)
                print('[Training epoch {}: {}/{}] Loss: {}'.format(
                    j, i, len(train_loader), loss_value))

        # -------- validation phase --------
        if not opt.onlytrain:
            val_loss = []
            model.eval()
            # No backpropagation during validation: disabling autograd
            # avoids building graphs and saves memory.
            with torch.no_grad():
                for i, (x, y) in enumerate(val_loader):
                    if i == len(val_loader) - 1:
                        # Same short-batch limitation as in training.
                        continue
                    x = x.to(device)
                    y = y.to(device)

                    pred, (state_h, state_c) = model(x)

                    # pred = [batch x seq_length x dictionary_len]
                    # y    = [batch x seq_length]
                    pred2 = pred.view(-1, dictionary_len)
                    y2 = y.view(-1)
                    loss = criterion(pred2, y2)
                    val_loss.append(loss.item())
                    if i % 50 == 0:
                        print('[Validation epoch {}: {}/{}] Loss: {}'.format(
                            j, i, len(val_loader), loss.item()))

            writer.add_scalar('val/loss', np.mean(val_loss), j)
            if opt.scheduler:
                # ReduceLROnPlateau reacts to the mean validation loss.
                scheduler.step(np.mean(val_loss))
            writer.add_scalar("lr", optimizer.param_groups[0]["lr"], j)

            # Sample generated text each epoch for qualitative inspection.
            predicted_words = inference_prediction(model, device, 500)
            print(predicted_words)
            writer.add_text('val/Generated_Samples', predicted_words, j)

        # Save model + optimizer state every epoch.
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        os.makedirs("weights/{}".format(date), exist_ok=True)
        torch.save(checkpoint, "weights/{}/checkpoint_{}.pt".format(date, j))
# Build the character-level RNN from the CLI hyper-parameters.
n_hidden = args.n_hidden
n_layers = args.n_layers
net = CharRNN(chars, n_hidden, n_layers)

# declaring the hyperparameters
batch_size = args.batch_size
seq_length = args.seq_length
n_epochs = args.n_epochs

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size,
      seq_length=seq_length, lr=0.001, print_every=50)

# Saving the model
model_name = f'rnn_{n_epochs}_epoch.net'

# The checkpoint bundles the architecture hyper-parameters and vocabulary
# with the weights, so the network can be re-instantiated at load time
# without the original CLI arguments.
checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
def train(filename, rnn_type, num_layers, dropout, emb_size, hidden_size,
          num_epochs, batch_size, learning_rate, num_samples, seed_phrase,
          sample_every, checkpoint_path):
    """Trains a character-level Recurrent Neural Network in PyTorch.

    Args:
        optional arguments [python train.py --help]

    Side effects: logs progress, prints/logs generated samples every
    ``sample_every`` epochs, and saves a checkpoint after every epoch.
    """
    logging.info('reading `{}` for character sequences'.format(filename))
    inputs, token_to_idx, idx_to_token = load_dataset(file_name=filename)

    # Pin the start ('~') and end ('#') markers to the first and last
    # vocabulary slots so their ids are stable.
    idx_to_token.remove('~')
    idx_to_token.remove('#')
    idx_to_token = ['~'] + idx_to_token + ['#']
    # enumerate yields each token's position directly; the previous
    # per-token list.index() lookup was O(n^2) over the vocabulary.
    token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
    logging.info(idx_to_token)
    logging.info(token_to_idx)
    n_tokens = len(idx_to_token)
    max_length = inputs.size(1)

    logging.debug('creating char-level RNN model')
    model = CharRNN(num_layers=num_layers, rnn_type=rnn_type,
                    dropout=dropout, n_tokens=n_tokens, emb_size=emb_size,
                    hidden_size=hidden_size,
                    pad_id=token_to_idx[PAD_TOKEN])
    if torch.cuda.is_available():
        model = model.cuda()

    logging.debug('defining model training operations')
    # define training procedures and operations for training the model
    criterion = nn.NLLLoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # lr_scheduler decreases lr when validation loss plateaus.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     min_lr=1e-6,
                                                     factor=0.1,
                                                     patience=7,
                                                     verbose=True)

    # train-val-test split: 90% train, remaining 10% split 50/50
    # between validation and test.
    split_index = int(0.9 * inputs.size(0))
    train_tensors, inputs = inputs[:split_index], inputs[split_index:]
    split_index = int(0.5 * inputs.size(0))
    val_tensors, test_tensors = inputs[:split_index], inputs[split_index:]
    del inputs  # release the leftover slice before training

    logging.info('train tensors: {}'.format(train_tensors.size()))
    logging.info('val tensors: {}'.format(val_tensors.size()))
    logging.info('test tensors: {}'.format(test_tensors.size()))

    logging.debug('training char-level RNN model')
    # loop over epochs
    for epoch in range(1, num_epochs + 1):
        epoch_loss, n_iter = 0.0, 0
        # loop over batches
        for tensors in tqdm(iterate_minibatches(train_tensors,
                                                batchsize=batch_size),
                            desc='Epoch[{}/{}]'.format(epoch, num_epochs),
                            leave=False,
                            total=train_tensors.size(0) // batch_size):
            # optimize model parameters
            epoch_loss += optimize(model, tensors, max_length, n_tokens,
                                   criterion, optimizer)
            n_iter += 1

        # evaluate model after every epoch
        val_loss = evaluate(model, val_tensors, max_length, n_tokens,
                            criterion)
        scheduler.step(val_loss)

        # log epoch status info
        logging.info(
            'Epoch[{}/{}]: train_loss - {:.4f} val_loss - {:.4f}'.format(
                epoch, num_epochs, epoch_loss / n_iter, val_loss))

        # sample from the model every few epochs
        if epoch % sample_every == 0:
            print(
                'Epoch[{}/{}]: train_loss - {:.4f} val_loss - {:.4f}'.format(
                    epoch, num_epochs, epoch_loss / n_iter, val_loss))
            for _ in range(num_samples):
                sample = generate_sample(model, token_to_idx, idx_to_token,
                                         max_length, n_tokens,
                                         seed_phrase=seed_phrase)
                logging.debug(sample)

        # save checkpoint (every epoch; `False` = not flagged as best)
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': val_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        best_model_path = checkpoint_path
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)