def main(dataset_name, save_dir, cfg):
    ## Data
    data_dir = os.path.join('/scail/data/group/atlas/kalpit/data', dataset_name)
    dataset = Dataset(data_dir)
    dataset.data_reshape((cfg.input_height, cfg.input_width, cfg.input_nchannels))

    ## Model
    print 'Creating Model...'
    model = get_model(dataset_name + '_conv', cfg)
    #model.summary()

    ## Train
    print 'Training Model...'
    starttime = time.time()
    train_loss_batch, train_acc_batch, train_loss, val_loss, val_acc = train(model, dataset, cfg)
    endtime = time.time()
    #plot_loss(train_loss, save_dir, 'training_cost', 'training_cost')
    #plot_loss(val_loss, save_dir, 'validation_cost', 'validation_cost')

    ## Validate
    print ''
    print 'Final Validation...'
    validate(model, dataset)

    ## Training Time
    print 'Training Time: {:.2f}'.format(endtime - starttime)

    return min(train_loss)
def main(dataset_name, network, save_dir, cfg):
    ## Data
    data_dir = os.path.join('/scail/data/group/atlas/kalpit/data', dataset_name)
    dataset = Dataset(data_dir)

    ## Model
    print 'Creating Model...'
    model = get_model(dataset_name + '_' + network, cfg)
    #model.summary()

    ## Train
    print 'Training Model...'
    starttime = time.time()
    if network == 'ff':
        if cfg.optimizer == 'kalpit':
            train_loss, val_loss, val_acc = train_ff_kalpit(model, dataset, cfg, save_dir)
        else:
            train_loss, val_loss, val_acc = train_ff_vanilla(model, dataset, cfg, save_dir)
    elif network == 'conv':
        # reshape flat vectors to height x width x channels, for both mnist and cifar10
        dataset.data_reshape((cfg.input_height, cfg.input_width, cfg.input_nchannels))
        if cfg.optimizer == 'kalpit':
            train_loss, val_loss, val_acc = train_conv_kalpit(model, dataset, cfg, save_dir)
        else:
            train_loss, val_loss, val_acc = train_conv_vanilla(model, dataset, cfg, save_dir)
    elif network == 'autoencoder':
        if cfg.optimizer == 'kalpit':
            train_loss, val_loss = train_autoencoder_kalpit(model, dataset, cfg, save_dir)
        else:
            train_loss, val_loss = train_autoencoder_vanilla(model, dataset, cfg, save_dir)
    else:
        raise NotImplementedError
    endtime = time.time()
    #plot_loss(train_loss, save_dir, 'training_cost', 'training_cost')
    #plot_loss(val_loss, save_dir, 'validation_cost', 'validation_cost')

    ## Validate
    print ''
    print 'Final Validation...'
    if network == 'ff':
        validate_ff(model, dataset)
    elif network == 'conv':
        validate_conv(model, dataset)
    elif network == 'autoencoder':
        validate_autoencoder(model, dataset)

    ## Training Time
    print 'Training Time: {:.2f}'.format(endtime - starttime)

    return min(train_loss)
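# Hypothetical entry point (a minimal sketch, not part of the original script):
# the dataset_name/network/save_dir values below are illustrative only, and
# Config(save_dir) follows the construction used elsewhere in this repo.
if __name__ == '__main__':
    dataset_name = 'mnist'       # or 'cifar10'
    network = 'conv'             # 'ff', 'conv' or 'autoencoder'
    save_dir = os.path.join('output', dataset_name + '_' + network)
    cfg = Config(save_dir)
    best_train_loss = main(dataset_name, network, save_dir, cfg)
    print 'Best training loss: {:.4f}'.format(best_train_loss)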
def samplePredictions(self):
    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True

    validation_set = Dataset(self.test_set_IDs)
    validation_generator = torch.utils.data.DataLoader(validation_set, **self.params)

    with torch.set_grad_enabled(False):
        counter = 0
        for local_batch, local_labels in validation_generator:
            if counter % 10 == 0:
                print(f'Batch prediction number {counter+1}...')

            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            # draw n_samples stochastic forward passes for this batch
            outputs = []
            for sample in range(self.n_samples):
                # Model computations
                output = self.model(local_batch.float())
                outputs.append(output)

            # do inverse transform before taking mean and std
            preds = [self.inverseScale(output) for output in outputs]
            if self.difference_length > 0:
                IDs = self.test_set_IDs[counter * self.params['batch_size']:
                                        (counter + 1) * self.params['batch_size']]
                preds = [self.inverseDifference(pred, IDs) for pred in preds]
            preds = torch.stack(preds)

            # find mean and std for each observation, store these in a list
            if counter == 0:
                means = preds.mean(axis=0)
                stds = preds.std(axis=0)
            else:
                means = torch.cat((means, preds.mean(axis=0)), 0)
                stds = torch.cat((stds, preds.std(axis=0)), 0)
            counter += 1

    self.means = means
    self.stds = stds
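# Hypothetical follow-up (a minimal sketch): once samplePredictions() has filled
# self.means and self.stds, the per-observation Monte Carlo statistics can be
# turned into approximate 95% prediction intervals. `predictor` is an assumed
# instance of the surrounding class, not a name from the original code.
lower_bound = predictor.means - 1.96 * predictor.stds
upper_bound = predictor.means + 1.96 * predictor.stds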
def train_model(model, max_epochs, params, partition, elbo_sample_nbr):
    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True
    model.to(device)

    # Generators
    training_set = Dataset(partition['train'])
    training_generator = torch.utils.data.DataLoader(training_set, **params)
    validation_set = Dataset(partition['test'])
    validation_generator = torch.utils.data.DataLoader(validation_set, **params)

    # define criterion and optimiser
    criterion = nn.MSELoss()
    optimiser = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1.0)
    #optimiser = optim.SGD(model.parameters(), lr=1e-3)

    # store losses per epoch
    train_losses = np.zeros(max_epochs)
    test_losses = np.zeros(max_epochs)

    scheduler = StepLR(optimiser, step_size=15)

    for epoch in range(max_epochs):
        # Training
        counter = 0
        current_loss_sum = 0
        for local_batch, local_labels in training_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            # Model computations
            optimiser.zero_grad()
            # forward and backward propagation on the sampled ELBO
            batch_loss_elbo = model.sample_elbo(inputs=local_batch.float(),
                                                labels=local_labels.float(),
                                                criterion=criterion,
                                                sample_nbr=elbo_sample_nbr)
            batch_loss_elbo.backward()
            optimiser.step()

            # track the plain MSE of the current batch for logging
            output = model(local_batch.float())
            current_train_loss = criterion(output, local_labels.float())
            current_loss_sum += current_train_loss.item()
            counter += 1
        train_losses[epoch] = current_loss_sum / counter

        # Validation
        counter = 0
        current_loss_sum = 0
        with torch.set_grad_enabled(False):
            for local_batch, local_labels in validation_generator:
                # Transfer to GPU
                local_batch, local_labels = local_batch.to(device), local_labels.to(device)

                # Model computations
                output = model(local_batch.float())
                current_test_loss = criterion(output, local_labels.float())
                current_loss_sum += current_test_loss.item()
                counter += 1
        test_losses[epoch] = current_loss_sum / counter

        scheduler.step()

        # Output losses after each epoch
        print(f'Epoch {epoch+1} train loss: {train_losses[epoch]}, test loss: {test_losses[epoch]}')

    return model, train_losses, test_losses
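# Hypothetical usage (a minimal sketch): `partition` maps 'train'/'test' to the
# sample IDs understood by Dataset, `params` are DataLoader keyword arguments,
# and `model` is assumed to be a Bayesian network exposing sample_elbo() as the
# function above expects. train_IDs/test_IDs are assumed precomputed ID lists;
# none of these values come from the original code.
params = {'batch_size': 64, 'shuffle': True, 'num_workers': 2}
partition = {'train': train_IDs, 'test': test_IDs}
model, train_losses, test_losses = train_model(model,
                                               max_epochs=50,
                                               params=params,
                                               partition=partition,
                                               elbo_sample_nbr=3)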
    if not os.path.exists(save_dir):
        break
#run_id = run_id
#save_dir = '/atlas/u/kalpit/Second-Order/code/mnist/output'
os.system('rm -rf ' + save_dir)
os.makedirs(save_dir)

## redirect stdout
if final_run:
    sys.stdout = open(os.path.join(save_dir, 'stdout'), 'w')
print run_id
print 'testing'

## Data
data_dir = os.path.join('/scail/data/group/atlas/kalpit/data', dataset_name)
dataset = Dataset(data_dir)

## Config
cfg = Config(save_dir)

## Model
print 'Creating Model...'
print 'DROPOUT NOT IMPLEMENTED CORRECTLY FOR VALIDATION!!!'
model = get_model(dataset_name, cfg)
#model.summary()

## Train
print 'Training Model...'
starttime = time.time()
train_loss_batch, train_acc_batch, train_loss, val_loss, val_acc = train(model, dataset, cfg)
endtime = time.time()