import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler


def generate_train_val_dataloader(dataset, batch_size, num_workers,
                                  shuffle=True, split=0.9,
                                  use_fraction_of_data=1.):
    """
    Return two DataLoaders split into training and validation.

    `split` sets the train/val split fraction (0.9 is 90% training data).
    """
    # Testing feature to make epochs go faster: use only some of the data
    if use_fraction_of_data < 1.:
        n_samples = int(use_fraction_of_data * len(dataset))
    else:
        n_samples = len(dataset)
    inds = np.arange(n_samples)
    train_inds, val_inds = train_test_split(inds, test_size=1 - split,
                                            train_size=split)
    # `shuffle` is accepted for API compatibility but not forwarded:
    # DataLoader forbids combining `shuffle` with a custom sampler, and
    # SubsetRandomSampler already randomizes the order.
    train_loader = DataLoader(dataset,
                              sampler=SubsetRandomSampler(train_inds),
                              batch_size=batch_size,
                              num_workers=num_workers)
    val_loader = DataLoader(dataset,
                            sampler=SubsetRandomSampler(val_inds),
                            batch_size=batch_size,
                            num_workers=num_workers)
    return train_loader, val_loader
def generate_train_val_dataloader(dataset, batch_size, num_workers,
                                  shuffle=False, split=0.8,
                                  fraction_of_data=1.,
                                  train_inds=None, val_inds=None):
    """
    Return two DataLoaders split into training and validation, plus the
    index arrays used, so the same split can be reused for another dataset.

    `split` sets the train/val split fraction (0.8 is 80% training data).
    """
    if train_inds is None:
        inds = np.arange(len(dataset))
        inds = inds[:int(np.ceil(len(inds) * fraction_of_data))]
        if fraction_of_data < 1:
            print('using ' + str(len(inds)) + ' data points total')
        train_inds, val_inds = train_test_split(inds, test_size=1 - split,
                                                train_size=split)
    # As above, `shuffle` is not forwarded because a sampler is supplied
    train_loader = DataLoader(dataset,
                              sampler=SubsetRandomSampler(train_inds),
                              batch_size=batch_size,
                              num_workers=num_workers)
    val_loader = DataLoader(dataset,
                            sampler=SubsetRandomSampler(val_inds),
                            batch_size=batch_size,
                            num_workers=num_workers)
    return train_loader, val_loader, train_inds, val_inds
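# --- Usage sketch (illustrative, not part of the repo) -----------------------
# A minimal sketch of the second variant above, assuming a standard PyTorch
# TensorDataset. The returned train_inds/val_inds can be fed back in so a
# second loader pair (e.g. over differently augmented views) reuses the same
# split; that reuse is the reason this variant returns the indices.

def _demo_generate_train_val_dataloader():
    import torch
    from torch.utils.data import TensorDataset

    # 100 fake samples of 8 features each, with binary labels
    dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))

    train_loader, val_loader, train_inds, val_inds = \
        generate_train_val_dataloader(dataset, batch_size=16, num_workers=0)

    # Reuse the exact same split for a second loader pair
    train_loader2, val_loader2, _, _ = generate_train_val_dataloader(
        dataset, batch_size=16, num_workers=0,
        train_inds=train_inds, val_inds=val_inds)
    return train_loader, val_loader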
import matplotlib.pyplot as plt


def plot_accuracy(train_acc, val_acc, num_epochs, figsize=(8, 6)):
    """Plot training and validation F2 scores against epoch number."""
    fig = plt.figure(figsize=figsize)
    # train_acc may hold many points per epoch; spread them evenly over the run
    t = np.linspace(0, num_epochs, len(train_acc))
    plt.plot(t, train_acc, label="Training")
    # val_acc holds one point per epoch
    t = np.arange(num_epochs) + 1
    plt.plot(t, val_acc, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("F2 Score")
    plt.legend()
    return fig
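# --- Usage sketch (illustrative) ----------------------------------------------
# plot_accuracy expects training scores recorded more often than validation
# scores: train_acc is stretched over [0, num_epochs] with linspace, while
# val_acc gets one point per epoch. The monotonically improving scores below
# are fake, for illustration only.

def _demo_plot_accuracy():
    num_epochs, batches_per_epoch = 5, 20
    train_acc = np.linspace(0.40, 0.85, num_epochs * batches_per_epoch)
    val_acc = np.linspace(0.35, 0.80, num_epochs)
    fig = plot_accuracy(train_acc, val_acc, num_epochs)
    fig.savefig('f2_curves.png')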
def triple_train_val_balance_dataloaders(datasets, batch_size, num_workers,
                                         shuffle=True, split=0.9,
                                         use_fraction_of_data=1.):
    """
    Generate three training and three validation dataloaders to train the
    triple resnet, using BalanceSampler to control class balance.
    """
    n_samples = len(datasets[0])

    # Set up the train/val split
    inds = np.arange(n_samples)
    train_inds, val_inds = train_test_split(inds, test_size=1 - split,
                                            train_size=split)

    # Logical (0/1) indexing to use with BalanceSampler
    log_train_inds = np.zeros(n_samples)
    log_train_inds[train_inds] = 1
    log_val_inds = np.zeros(n_samples)
    log_val_inds[val_inds] = 1

    # Reduce the size of the dataset (for testing only). Note this truncates
    # by dataset position, so the retained fraction is only approximate.
    if use_fraction_of_data < 1:
        train_idx = int(np.ceil(use_fraction_of_data * split * n_samples))
        val_idx = int(np.ceil(use_fraction_of_data * (1 - split) * n_samples))
        log_train_inds[train_idx:] = 0
        log_val_inds[val_idx:] = 0

    train_loaders = []
    val_loaders = []
    for dset in datasets:
        # `shuffle` is not forwarded: BalanceSampler controls the sampling
        # order, and torch's DataLoader (which BalanceDataLoader presumably
        # extends) forbids combining `shuffle` with a sampler.
        train_loaders.append(
            BalanceDataLoader(dset,
                              sampler=BalanceSampler(dset, log_train_inds),
                              batch_size=batch_size,
                              num_workers=num_workers))
        val_loaders.append(
            BalanceDataLoader(dset,
                              sampler=BalanceSampler(dset, log_val_inds),
                              batch_size=batch_size,
                              num_workers=num_workers))
    return train_loaders, val_loaders
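# --- Note on the logical-index masks (illustrative) ---------------------------
# BalanceSampler is a custom class defined elsewhere in this repo; from its
# use here it appears to consume a 0/1 mask over the whole dataset rather than
# an index list. A small numpy-only illustration of how those masks are built:

def _demo_logical_masks():
    n_samples = 10
    train_inds, val_inds = train_test_split(np.arange(n_samples),
                                            test_size=0.2, train_size=0.8)
    mask = np.zeros(n_samples)
    mask[train_inds] = 1
    # mask is now e.g. array([1., 1., 0., 1., 1., 1., 1., 0., 1., 1.]):
    # positions set to 1 belong to the training split, and the sampler is
    # assumed to draw only from those positions.
    return mask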
def triple_train_val_dataloaders(datasets, batch_size, num_workers,
                                 shuffle=True, split=0.9,
                                 use_fraction_of_data=1.):
    """
    Generate three training and three validation dataloaders to train the
    triple resnet. All three loaders share the same train/val indices.
    """
    # Testing feature to make epochs go faster: use only some of the data
    if use_fraction_of_data < 1.:
        n_samples = int(use_fraction_of_data * len(datasets[0]))
    else:
        n_samples = len(datasets[0])
    inds = np.arange(n_samples)
    train_inds, val_inds = train_test_split(inds, test_size=1 - split,
                                            train_size=split)

    train_loaders = []
    val_loaders = []
    for dset in datasets:
        # `shuffle` is not forwarded: DataLoader forbids combining it with a
        # custom sampler, and SubsetRandomSampler already randomizes order.
        train_loaders.append(
            DataLoader(dset,
                       sampler=SubsetRandomSampler(train_inds),
                       batch_size=batch_size,
                       num_workers=num_workers))
        val_loaders.append(
            DataLoader(dset,
                       sampler=SubsetRandomSampler(val_inds),
                       batch_size=batch_size,
                       num_workers=num_workers))
    return train_loaders, val_loaders
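# --- Usage sketch and caveat (illustrative) ------------------------------------
# The three loaders share train_inds/val_inds, so each sees the same split.
# Caveat: each SubsetRandomSampler permutes its indices independently, so
# zip(*train_loaders) yields batches drawn from the same split but not
# necessarily containing the same samples in the same order.

def _demo_triple_loaders():
    import torch
    from torch.utils.data import TensorDataset

    # Three toy datasets standing in for the three inputs of the triple resnet
    datasets = [TensorDataset(torch.randn(100, 3, 8, 8), torch.zeros(100))
                for _ in range(3)]
    train_loaders, val_loaders = triple_train_val_dataloaders(
        datasets, batch_size=16, num_workers=0)
    return train_loaders, val_loaders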
import numpy as np
import torch
from torch.autograd import Variable
from visdom import Visdom

# VAE, CRAE, SUnet, AVB, SDAE, load_data, and load_toy_data are assumed to
# come from this repo's own modules (import paths not shown in the source).


def run(args):
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Load data
    train_data, test_data, train_mask, test_mask, user_list = load_data(random_split=True)
    # train_data, test_data, train_mask, test_mask, user_list = load_toy_data()

    # Params
    # n_bins = 288
    n_samples, n_bins, n_mods = train_data.shape
    n_features = n_bins * n_mods
    # n_mods = n_features // n_bins
    modalities = ['cpm', 'steps', 'screen', 'location_lat', 'location_lon'][:n_mods]
    num_train = train_data.shape[0] // args.batch_size
    num_test = test_data.shape[0] // args.batch_size

    # Convert to torch tensors
    train_data = torch.from_numpy(train_data)
    test_data = torch.from_numpy(test_data)
    train_mask = torch.from_numpy(train_mask).float()
    test_mask = torch.from_numpy(test_mask).float()

    def get_batch(source, mask, i, evaluation=False):
        # `volatile` is the legacy (pre-0.4) PyTorch way to disable autograd
        data = Variable(source[i * args.batch_size:(i + 1) * args.batch_size],
                        volatile=evaluation)
        _mask = Variable(mask[i * args.batch_size:(i + 1) * args.batch_size],
                         volatile=evaluation)
        return data, _mask

    # Select the model architecture
    if args.model.lower() == 'vae':
        model = VAE(args.layers, input_dim=n_features, args=args)
    elif args.model.lower() == 'rae':
        model = CRAE(args.layers, input_dim=n_features, args=args)
    elif args.model.lower() == 'unet':
        model = SUnet(args.layers, input_dim=n_features, args=args)
    elif args.model.lower() == 'avb':
        model = AVB(args.layers, input_dim=n_features, args=args)
    else:
        model = SDAE(args.layers, input_dim=n_features, args=args)
    print(model)

    def train(epoch):
        model.train()
        train_loss = 0
        for batch_idx in range(num_train):
            data, mask = get_batch(train_data, train_mask, batch_idx,
                                   evaluation=False)
            if args.cuda:
                # Keep both inputs on the same device
                data, mask = data.cuda(), mask.cuda()
            # Run model updates and collect loss
            loss = model.forward(data, mask)
            train_loss += loss
            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_data),
                    100. * batch_idx / num_train, loss / len(data)))
        print('====> Epoch: {} Average loss: {:.6f}'.format(
            epoch, train_loss / len(train_data)))
        return train_loss / len(train_data)

    def test(epoch):
        model.eval()
        test_loss = 0
        for batch_idx in range(num_test):
            data, mask = get_batch(test_data, test_mask, batch_idx,
                                   evaluation=True)
            if args.cuda:
                data, mask = data.cuda(), mask.cuda()
            # Evaluate batch on model
            test_loss += model.eval_loss(data, mask)
        test_loss /= len(test_data)
        print('====> Test set loss: {:.6f}'.format(test_loss))
        return test_loss

    train_loss = list()
    test_loss = list()
    for epoch in range(1, args.epochs + 1):
        train_loss.append(train(epoch))
        test_loss.append(test(epoch))

    # Plot result
    test_batch, test_mask_batch = get_batch(test_data, test_mask, 0,
                                            evaluation=True)
    if 'vae' in args.model:
        recon_batch, mu, logvar, noise = model(test_batch, test_mask_batch)
    else:
        recon_batch, noise = model(test_batch, test_mask_batch)

    # Mask out known values
    test_batch = test_batch * test_mask_batch
    recon_batch = recon_batch * test_mask_batch  # * (1 - noise)

    test_batch = test_batch.data.numpy().reshape(-1, n_bins, n_mods)
    recon_batch = recon_batch.data.numpy().reshape(-1, n_bins, n_mods)

    # fig, ax = plt.subplots(nrows=2, ncols=n_mods, figsize=(10 * n_mods, 20))
    # for i, mod in enumerate(modalities):
    #     vmax = np.max((test_batch[:, :, i].max(), recon_batch[:, :, i].max()))
    #     sns.heatmap(test_batch[:, :, i], ax=ax[0, i], vmin=0, vmax=vmax)
    #     sns.heatmap(recon_batch[:, :, i], ax=ax[1, i], vmin=0, vmax=vmax)
    # plt.savefig('{}_recon_heatmap'.format(args.model))
    #
    # # Plot error curves
    # fig, ax = plt.subplots(figsize=(20, 10))
    # ax.plot(range(args.epochs - 1), train_loss[1:], label='train')
    # ax.plot(range(args.epochs - 1), test_loss[1:], label='test')
    # plt.savefig('{}_error'.format(args.model))

    # Create a visdom object
    vis = Visdom(env=args.model)

    # Heatmaps of true vs. reconstructed signals, one pair per modality
    for i, mod in enumerate(modalities):
        vmax = np.max((test_batch[:, :, i].max(), recon_batch[:, :, i].max()))
        vis.heatmap(test_batch[:, :, i],
                    opts=dict(colormap='Electric', title='true_' + mod,
                              xmin=0, xmax=float(vmax)))
        vis.heatmap(recon_batch[:, :, i],
                    opts=dict(colormap='Electric', title='recon_' + mod,
                              xmin=0, xmax=float(vmax)))
    vis.heatmap(((1 - noise) * test_mask_batch)[:, :, 0].data.numpy(),
                opts=dict(title='mask'))

    # Error curves
    vis.line(np.stack((train_loss[1:], test_loss[1:]), axis=1),
             np.tile(np.arange(args.epochs - 1), (2, 1)).transpose(),
             opts=dict(legend=['train', 'test']))

    return train_loss[-1], test_loss[-1]
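# --- Hypothetical driver (illustrative) ----------------------------------------
# run() itself reads seed, cuda, batch_size, model, layers, epochs, and
# log_interval from `args`; the model classes also receive `args` and may read
# more attributes. A minimal argparse driver under that assumption; the
# defaults below are guesses, not the repo's.

def _build_args(argv=None):
    import argparse
    parser = argparse.ArgumentParser(description='Autoencoder imputation demo')
    parser.add_argument('--model', default='sdae',
                        help='vae | rae | unet | avb | sdae')
    parser.add_argument('--layers', type=int, nargs='+', default=[512, 128])
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--log-interval', dest='log_interval', type=int,
                        default=10)
    return parser.parse_args(argv)

# final_train_loss, final_test_loss = run(_build_args())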
if __name__ == '__main__':
    from balance_batch_dataloader import *

    csv_path = 'data/train_v2.csv'
    img_path = 'data/train-jpg'
    img_ext = '.jpg'
    dtype = torch.FloatTensor

    training_dataset = ResnetOptimizeDataset(csv_path, img_path, dtype)

    # Smoke test: restrict the sampler to the first 10000 samples
    inds = np.arange(10000)
    logical_inds = np.zeros(len(training_dataset))
    logical_inds[inds] = 1
    bbs = BalanceSampler(training_dataset, logical_inds)
    print(len(bbs))

    train_loader = BalanceDataLoader(training_dataset, sampler=bbs,
                                     batch_size=32, num_workers=1)
    # Check that the first batch contains at least one positive per class
    for t, (x, y) in enumerate(train_loader):
        col_sum = y.sum(dim=0).numpy().flatten()
        print(col_sum > 0)
        break