def __init__(self, num_steps, embed_size=16, hidden_dim=96, edge_drop=0.1):
    """Build the Sudoku recurrent relational network.

    :param num_steps: number of message-passing steps run by the RRN.
    :param embed_size: width of the digit / row / column embeddings.
    :param hidden_dim: hidden width shared by the MLPs and the LSTM cell.
    :param edge_drop: edge-dropout probability forwarded to the RRN.
    """
    super(SudokuNN, self).__init__()
    self.num_steps = num_steps

    # One embedding table per cell value (10 entries — presumably 0 encodes
    # an empty cell; confirm against the data encoding) and per row/column
    # position; their concatenation feeds the input MLP below.
    self.digit_embed = nn.Embedding(10, embed_size)
    self.row_embed = nn.Embedding(9, embed_size)
    self.col_embed = nn.Embedding(9, embed_size)

    def _mlp(in_dim):
        # Four Linear layers with ReLU between them and no final activation
        # (the shape both the input layer and the message layer share).
        return nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )

    # NOTE: modules are created in the same order as the original code so
    # parameter initialisation consumes the RNG stream identically.
    self.input_layer = _mlp(3 * embed_size)
    self.lstm = nn.LSTMCell(hidden_dim * 2, hidden_dim, bias=False)
    self.rrn = RRN(_mlp(2 * hidden_dim), self.node_update_func, num_steps,
                   edge_drop)
    self.output_layer = nn.Linear(hidden_dim, 10)
    self.loss_func = nn.CrossEntropyLoss()
def train_rrn(hyperparameters: dict,
              train_inputs: list,
              train_outputs: list,
              other_inputs: dict = None,
              other_outputs: dict = None):
    """Train an RRN on sudoku grids, periodically evaluating on named datasets.

    :param hyperparameters: Check below for what fields must exist in hyperparameters
    :param train_inputs: list of GridStrings
    :param train_outputs: list of GridStrings, corresponding in index to train_inputs
    :param other_inputs: dictionary of GridStrings where the key is name of the dataset
    :param other_outputs: dictionary of GridStrings where the key is name of the
        dataset, corresponding in index to inputs of same name
    :return: the trained model (state dict also saved to ./model.mdl)
    """
    if other_inputs is None:
        other_inputs = {}
    if other_outputs is None:
        other_outputs = {}
    # Every named evaluation set needs both its inputs and its outputs.
    assert set(other_inputs.keys()) == set(other_outputs.keys())

    # Checkpoints and training logs are written relative to the CWD.
    if not os.path.exists('./checkpoints'):
        os.makedirs('./checkpoints')
    if not os.path.exists('./logs'):
        os.makedirs('./logs')

    dim_x = hyperparameters['dim_x']
    dim_y = hyperparameters['dim_y']
    num_iters = hyperparameters['num_iters']
    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['epochs']
    valid_epochs = hyperparameters['valid_epochs']
    save_epochs = hyperparameters['save_epochs']
    embed_size = hyperparameters['embed_size']
    hidden_layer_size = hyperparameters['hidden_layer_size']
    learning_rate = hyperparameters['learning_rate']
    weight_decay = hyperparameters['weight_decay']

    # Device selection: a 'devices' list with more than one id enables
    # DataParallel; the first id is always the primary device. Otherwise a
    # single 'device' entry is required.
    parallel = False
    if 'devices' in hyperparameters:
        if len(hyperparameters['devices']) > 1:
            devices = hyperparameters['devices']
            parallel = True
        device = hyperparameters['devices'][0]
    else:
        device = hyperparameters['device']

    train_x = torch.stack([encode_input(p) for p in train_inputs]).cuda(device)
    train_y = torch.stack([encode_output(p) for p in train_outputs]).cuda(device)
    other_x = {}
    other_y = {}
    for k in other_inputs:
        other_x[k] = torch.stack([encode_input(p) for p in other_inputs[k]]).cuda(device)
        other_y[k] = torch.stack([encode_output(p) for p in other_outputs[k]]).cuda(device)

    model = RRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size,
                hidden_layer_size=hidden_layer_size).cuda(device)
    if parallel:
        # Replicate across the requested GPUs; the model already sits on devices[0].
        model = nn.DataParallel(model, device_ids=devices)
    # else:
    #     model = model.cuda(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    train_losses = []  # (epoch, )
    train_accuracies = []  # (epoch, grid, timestep)
    other_losses = {name: [] for name in other_x}  # (epoch, )
    other_accuracies = {name: [] for name in other_x}  # (epoch, grid, timestep)
    times = []

    def closure():
        # One full pass over the shuffled training set. Gradients from every
        # batch accumulate (zero_grad is only called here) before the single
        # optimizer.step(closure) in the epoch loop applies them.
        optimizer.zero_grad()
        total_loss = 0
        epoch_accuracies = []
        shuffle_indices = np.arange(len(train_x))
        np.random.shuffle(shuffle_indices)
        for i in tqdm(range(0, len(train_x), batch_size), leave=False):
            x_batch = train_x[shuffle_indices[i:i + batch_size]]
            y_batch = train_y[shuffle_indices[i:i + batch_size]]
            loss, accuracies = get_performance(model=model,
                                               x=x_batch,
                                               y=y_batch,
                                               no_grad=False,
                                               num_iters=num_iters)
            loss.backward()
            total_loss += loss
            # NOTE(review): appended once per batch, so this records a running
            # cumulative loss rather than one value per epoch as the
            # "(epoch, )" comment above suggests — confirm intent.
            train_losses.append(float(total_loss))
            epoch_accuracies.append(accuracies)
        train_accuracies.append(np.concatenate(epoch_accuracies))
        return total_loss

    for i in tqdm(range(epochs)):
        start_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        start_time = time.time()
        train_loss = optimizer.step(closure)

        # Evaluate the extra datasets on the first epoch and then every
        # valid_epochs epochs.
        run_validate = i == 0 or (i + 1) % valid_epochs == 0
        if run_validate:
            for name in other_x:
                loss, accuracy = get_performance(model=model,
                                                 x=other_x[name],
                                                 y=other_y[name],
                                                 num_iters=num_iters,
                                                 no_grad=True)
                other_losses[name].append(float(loss))
                other_accuracies[name].append(accuracy)

        if (i + 1) % save_epochs == 0:
            model_filename = "./checkpoints/epoch_{}.mdl".format(i + 1)
            train_data_filename = "./logs/training.pkl"
            print("Saving model to {}".format(model_filename))
            torch.save(model.state_dict(), model_filename)
            # The log pickle is rewritten in full at every checkpoint.
            with open(train_data_filename, 'wb') as f:
                pickle.dump(
                    {
                        'hyperparameters': hyperparameters,
                        'train_losses': train_losses,
                        'train_accuracies': train_accuracies,
                        'other_losses': other_losses,
                        'other_accuracies': other_accuracies,
                        'times': times
                    }, f)

        end_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        end_time = time.time()
        runtime = end_time - start_time
        times.append({
            'start_time': start_time_str,
            'end_time': end_time_str,
            'runtime': runtime
        })
        # [:, -1] selects the accuracy after the final message-passing step.
        print("duration: {}s\t iter: {}\t| loss: {}\t| accuracy: {}".format(
            round(runtime, 1), i, round(float(train_loss), 3),
            round(np.mean(train_accuracies[-1][:, -1]), 3)))
        if run_validate:
            for name in sorted(other_x):
                print("data: {}\t| loss: {}\t| accuracy: {}".format(
                    name, round(other_losses[name][-1], 3),
                    round(np.mean(other_accuracies[name][-1][:, -1]), 3)))

    model_filename = "./model.mdl"
    print("Saving model to {}".format(model_filename))
    torch.save(model.state_dict(), model_filename)
    return model
def train_rrn(hyperparameters: dict, data: dict):
    """Train an RRN on pre-split sudoku data and report train/valid/test metrics.

    NOTE(review): if this shares a module with the earlier train_rrn
    definition, this later definition silently replaces it — confirm.

    :param hyperparameters: dict of settings (model name/dimensions, split
        sizes, epochs, optimizer settings); see the unpacking below for the
        required keys.
    :param data: dict with 'train_inputs'/'train_outputs',
        'valid_inputs'/'valid_outputs' and 'test_inputs'/'test_outputs'.
    :return: the trained model
    """
    model_name = hyperparameters['model_name']
    device = hyperparameters['device']
    dim_x = hyperparameters['dim_x']
    dim_y = hyperparameters['dim_y']
    num_iters = hyperparameters['num_iters']
    train_size = hyperparameters['train_size']
    valid_size = hyperparameters['valid_size']
    test_size = hyperparameters['test_size']
    batch_size = hyperparameters['batch_size']
    epochs = hyperparameters['epochs']
    save_epochs = hyperparameters['save_epochs']
    embed_size = hyperparameters['embed_size']
    hidden_layer_size = hyperparameters['hidden_layer_size']
    learning_rate = hyperparameters['learning_rate']
    weight_decay = hyperparameters['weight_decay']

    train_inputs = data['train_inputs']
    train_outputs = data['train_outputs']
    valid_inputs = data['valid_inputs']
    valid_outputs = data['valid_outputs']
    test_inputs = data['test_inputs']
    test_outputs = data['test_outputs']

    # Encode every split on the CPU; only the truncated subsets below are
    # moved to the GPU.
    all_train_x = torch.stack([encode_input(p) for p in train_inputs])
    all_train_y = torch.stack([encode_output(p) for p in train_outputs])
    all_valid_x = torch.stack([encode_input(p) for p in valid_inputs])
    all_valid_y = torch.stack([encode_output(p) for p in valid_outputs])
    all_test_x = torch.stack([encode_input(p) for p in test_inputs])
    all_test_y = torch.stack([encode_output(p) for p in test_outputs])

    model = RRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size,
                hidden_layer_size=hidden_layer_size).cuda(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    train_losses = []  # epoch x batch
    train_accuracies = []  # epoch x batch x grid x timestep
    valid_losses = []  # epoch x batch
    valid_accuracies = []  # epoch x batch x grid x timestep
    times = []

    # Truncate each split to its requested size before moving it to the GPU.
    train_x = all_train_x[:train_size].cuda(device)
    train_y = all_train_y[:train_size].cuda(device)
    valid_x = all_valid_x[:valid_size].cuda(device)
    valid_y = all_valid_y[:valid_size].cuda(device)
    test_x = all_test_x[:test_size].cuda(device)
    test_y = all_test_y[:test_size].cuda(device)

    def closure():
        # One shuffled pass over the training set; gradients from all batches
        # accumulate until the enclosing optimizer.step() applies them at once.
        optimizer.zero_grad()
        total_loss = 0
        shuffle_indices = np.arange(len(train_x))
        np.random.shuffle(shuffle_indices)
        for i in tqdm(range(0, len(train_x), batch_size), leave=False):
            x_batch = train_x[shuffle_indices[i:i + batch_size]]
            y_batch = train_y[shuffle_indices[i:i + batch_size]]
            loss, accuracies = get_performance(model, x_batch, y_batch, num_iters)
            loss.backward()
            total_loss += loss
            # Per-batch records go into the sub-list the epoch loop appended.
            train_losses[-1].append(float(loss))
            train_accuracies[-1].append(accuracies)
        return total_loss

    for i in tqdm(range(epochs)):
        start_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        start_time = time.time()
        # Fresh per-epoch sub-lists that closure() fills batch by batch.
        train_losses.append([])
        train_accuracies.append([])
        train_loss = optimizer.step(closure)
        train_accuracies[-1] = np.array(train_accuracies[-1])
        # NOTE(review): unlike the earlier train_rrn, no no_grad flag is
        # passed for validation — confirm get_performance's default.
        valid_loss, valid_accuracy = get_performance(model, valid_x, valid_y, num_iters)
        valid_losses.append(float(valid_loss))
        valid_accuracies.append(valid_accuracy)
        # NOTE(review): duplicate of the conversion above — redundant but
        # harmless (np.array of an ndarray).
        train_accuracies[-1] = np.array(train_accuracies[-1])

        train_loss = round(float(train_loss), 3)
        # The trailing -1 index picks the accuracy after the final
        # message-passing step.
        train_accuracy = round(np.mean(train_accuracies[-1][:, :, -1]), 3)
        valid_loss = round(valid_losses[-1], 3)
        valid_accuracy = round(np.mean(valid_accuracies[-1][:, -1]), 3)

        end_time_str = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        end_time = time.time()
        runtime = end_time - start_time
        times.append({
            'start_time': start_time_str,
            'end_time': end_time_str,
            'runtime': runtime
        })
        print("({}s): Iter {}\t| TrLoss {}\t| VLoss {}\t| TrAcc {}\t| VAcc {}".
              format(round(runtime, 1), i, train_loss, valid_loss,
                     train_accuracy, valid_accuracy))

        if (i + 1) % save_epochs == 0:
            model_filename = SUDOKU_PATH + "/models/{}_{}.mdl".format(
                model_name, i + 1)
            train_data_filename = SUDOKU_PATH + "/pickles/{}.pkl".format(
                model_name)
            print("Saving model to {}".format(model_filename))
            torch.save(model.state_dict(), model_filename)
            # The log pickle is rewritten in full at each save point.
            with open(train_data_filename, 'wb') as f:
                pickle.dump(
                    {
                        'hyperparameters': hyperparameters,
                        'train_losses': train_losses,
                        'train_accuracies': train_accuracies,
                        'valid_losses': valid_losses,
                        'valid_accuracies': valid_accuracies,
                        'times': times
                    }, f)

    # Final held-out evaluation after training completes.
    test_loss, test_accuracy = get_performance(model, test_x, test_y, num_iters)
    test_loss = round(float(test_loss), 3)
    test_accuracy = round(np.mean(test_accuracy[:, -1]), 3)
    print("TeLoss {}\t| TeAcc {}".format(test_loss, test_accuracy))
    return model
# NOTE(review): fragment — the enclosing function's signature is outside this
# chunk, and closure() is cut off mid-body. Names such as train_inputs,
# other_inputs, device, dim_x/dim_y and the hyperparameters come from the
# missing surrounding scope.

# Encode the puzzles and move them straight onto the training device.
train_x = torch.stack([rrn_utils.encode_input(p)
                       for p in train_inputs]).cuda(device)
train_y = torch.stack([rrn_utils.encode_output(p)
                       for p in train_outputs]).cuda(device)
# One encoded tensor pair per named evaluation dataset.
other_x = {}
other_y = {}
for k in other_inputs:
    other_x[k] = torch.stack(
        [rrn_utils.encode_input(p) for p in other_inputs[k]]).cuda(device)
    other_y[k] = torch.stack(
        [rrn_utils.encode_output(p) for p in other_outputs[k]]).cuda(device)

# model = EmbedRRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size, hidden_layer_size=hidden_layer_size).cuda(device)
model = RRN(dim_x=dim_x, dim_y=dim_y, embed_size=embed_size,
            hidden_layer_size=hidden_layer_size).cuda(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# ones = torch.ones(10, 16).cuda(device)


def closure():
    # One accumulated-gradient pass over the shuffled training set.
    # NOTE(review): the remainder of this body lies outside the chunk.
    optimizer.zero_grad()
    total_loss = 0
    epoch_accuracies = []
    shuffle_indices = np.arange(len(train_x))
    np.random.shuffle(shuffle_indices)
# NOTE(review): fragment — the enclosing scope is outside this chunk; dataset,
# train_size_per_num_hints, valid_size_per_num_hints, hp and rrn_utils come
# from the missing surroundings, and the trailing for-loop body is cut off.

# Split the dataset at two cut points, yielding train / validation portions.
split_inputs, split_outputs = dataset.split_data([
    train_size_per_num_hints,
    train_size_per_num_hints + valid_size_per_num_hints
])
train_inputs = split_inputs[0]
train_outputs = split_outputs[0]
other_inputs = {'validation': split_inputs[1]}
other_outputs = {'validation': split_outputs[1]}

# model = RelNet(dim_x=hp['dim_x'],
#                dim_y=hp['dim_y'],
#                embed_size=hp['embed_size'],
#                hidden_layer_size=hp['hidden_layer_size']).cuda(hp['device'])
model = RRN(dim_x=hp['dim_x'],
            dim_y=hp['dim_y'],
            embed_size=hp['embed_size'],
            hidden_layer_size=hp['hidden_layer_size']).cuda(hp['device'])
optimizer = optim.Adam(model.parameters(),
                       lr=hp['learning_rate'],
                       weight_decay=hp['weight_decay'])

# Encode the training split and move it to the configured device.
train_x = torch.stack([rrn_utils.encode_input(p)
                       for p in train_inputs]).cuda(hp['device'])
# train_x = utils.one_hot_encode(train_x)
train_y = torch.stack([rrn_utils.encode_output(p)
                       for p in train_outputs]).cuda(hp['device'])
other_x = {}
other_y = {}
for k in other_inputs:
def train(config):
    """Train an RNN/LSTM/RRN on the palindrome task described by config.

    :param config: namespace-like object with model_type, device,
        input_length, input_dim, num_hidden, num_classes, batch_size,
        learning_rate, train_steps, max_norm, print_every and experiment.
    :return: a stats dict when config.experiment is truthy, otherwise None.
    """
    # print parameters
    print_config(config)
    config.model_type = config.model_type.lower()
    assert config.model_type in ('rnn', 'lstm', 'rrn')

    # Initialize the device which to run the model on
    wanted_device = config.device.lower()
    if wanted_device == 'cuda':
        # check if cuda is available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        # cpu is the standard option
        device = torch.device('cpu')

    # Initialize the model that we are going to use; all three variants share
    # the same constructor signature.
    if config.model_type == 'rnn':
        model = VanillaRNN(seq_length=config.input_length,
                           input_dim=config.input_dim,
                           num_hidden=config.num_hidden,
                           num_classes=config.num_classes,
                           batch_size=config.batch_size,
                           device=device)
    elif config.model_type == 'lstm':
        model = LSTM(seq_length=config.input_length,
                     input_dim=config.input_dim,
                     num_hidden=config.num_hidden,
                     num_classes=config.num_classes,
                     batch_size=config.batch_size,
                     device=device)
    elif config.model_type == 'rrn':
        model = RRN(seq_length=config.input_length,
                    input_dim=config.input_dim,
                    num_hidden=config.num_hidden,
                    num_classes=config.num_classes,
                    batch_size=config.batch_size,
                    device=device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=0)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # keep stats; one accuracy slot per training step.
    train_acc = np.zeros(config.train_steps + 1)
    first_best_acc = 0
    acc_MA = 0

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # batches to torch tensors
        x = torch.tensor(batch_inputs, dtype=torch.float, device=device)
        y_true = torch.tensor(batch_targets, dtype=torch.long, device=device)

        # Forward pass
        y_pred = model.forward(x)
        loss = criterion(y_pred, y_true)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        ############################################################################
        # QUESTION: what happens here and why?
        # clip_grad_norm() is a method to avoid exploding gradients. It clips
        # gradients above max_norm to max_norm.
        # Deprecated, use clip_grad_norm_() instead
        ############################################################################
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        ############################################################################

        optimizer.step()
        train_acc[step] = accuracy(y_pred, y_true, config)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/(float(t2-t1) + 1e-6)

        if step % config.print_every == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                      datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                      config.train_steps, config.batch_size, examples_per_second,
                      train_acc[step], loss
                  ))
            print(f"x: {x[0,:]}, y_pred: {y_pred[0,:].argmax()}, y_true: {y_true[0]}")
            # NOTE(review): 5-step moving average of accuracy, refreshed only
            # every print_every steps, so the break test below can act on a
            # stale value — confirm this is intended. Also note that for
            # step < 4 the negative slice start yields an empty/short window.
            acc_MA = train_acc[step-4:step+1].sum()/5

        # Stop at the step budget or once the moving-average accuracy is perfect.
        if step == config.train_steps or acc_MA == 1.0:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    # Save the final model
    torch.save(model, config.model_type + "_model.pt")
    np.save("train_acc_" + config.model_type + str(config.input_length), train_acc)

    if config.experiment:
        # NOTE(review): on an early break the tail of train_acc stays zero,
        # which skews "last acc" and "num steps" — confirm downstream use.
        stats = {}
        stats["last acc"] = train_acc[-1]
        first_best_acc = np.argmax(train_acc)
        stats["best acc"] = train_acc[first_best_acc]
        stats["step best acc"] = first_best_acc
        stats["num steps"] = len(train_acc)
        stats["accs"] = train_acc
        return stats