def main(args):
    # fix random seeds
    if args.seed:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        np.random.seed(args.seed)

    # CNN
    if args.verbose:
        print('Architecture: {}'.format(args.arch))
    model = models.__dict__[args.arch](sobel=args.sobel, dropout=args.dropout)
    fd = int(model.top_layer.weight.size()[1])
    model.top_layer = None
    model.features = torch.nn.DataParallel(model.features)
    model.cuda()
    cudnn.benchmark = True

    # create optimizer
    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=10**args.weight_decay,
    )

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    restore(model, args.resume)

    # create checkpoint repo
    exp_check = os.path.join(args.experiment, 'checkpoints')
    if not os.path.isdir(exp_check):
        os.makedirs(exp_check)

    # create cluster assignments log
    cluster_log = Logger(os.path.join(args.experiment, 'clusters'))

    # preprocessing of data
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    tra = [transforms.Resize(256),
           transforms.CenterCrop(224),
           transforms.ToTensor(),
           normalize]

    # load the data
    end = time.time()
    dataset = datasets.ImageFolder(args.data, transform=transforms.Compose(tra))
    if args.verbose:
        print('Load dataset: {0:.2f} s'.format(time.time() - end))
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # pick the clustering algorithm
    algs = {
        'KMeans': clustering.KMeans,
        'PIC': clustering.PIC,
    }
    cluster_alg = algs[args.cluster_alg](args.nmb_cluster)

    # train convnet with cluster_alg
    for epoch in range(args.start_epoch, args.epochs):
        end = time.time()

        # remove head
        model.top_layer = None
        model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])

        # get the features for the whole dataset
        features = compute_features(dataloader, model, len(dataset), args.batch)

        # cluster the features
        if args.verbose:
            print('Cluster the features')
        clustering_loss = cluster_alg.cluster(features, verbose=args.verbose)

        # assign labels
        if args.verbose:
            print('Assign real labels')
        # train_dataset = cluster_assign(cluster_alg.images_lists,
        #                                dataset.imgs)
        train_dataset = cluster_assign_with_original_labels(dataset.imgs)

        # uniformly sample per target
        sampler = UnifLabelSampler(int(args.reassign * len(train_dataset)),
                                   cluster_alg.images_lists)

        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch,
            num_workers=args.workers,
            sampler=sampler,
            pin_memory=True,
        )

        # set last fully connected layer
        mlp = list(model.classifier.children())
        mlp.append(nn.ReLU(inplace=True).cuda())
        model.classifier = nn.Sequential(*mlp)
        model.top_layer = nn.Linear(fd, len(cluster_alg.images_lists))
        model.top_layer.weight.data.normal_(0, 0.01)
        model.top_layer.bias.data.zero_()
        model.top_layer.cuda()

        # train network with clusters as pseudo-labels
        end = time.time()
        for x in range(1000):  # repeated training passes over the same assignments
            loss = train(train_dataloader, model, criterion, optimizer, epoch)

        # print log
        if args.verbose:
            print('###### Epoch [{0}] ###### \n'
                  'Time: {1:.3f} s\n'
                  'Clustering loss: {2:.3f} \n'
                  'ConvNet loss: {3:.3f}'
                  .format(epoch, time.time() - end, clustering_loss, loss))
            try:
                nmi = normalized_mutual_info_score(
                    arrange_clustering(cluster_alg.images_lists),
                    arrange_clustering(cluster_log.data[-1]))
                print('NMI against previous assignment: {0:.3f}'.format(nmi))
            except IndexError:
                pass
            print('####################### \n')

        # save running checkpoint
        torch.save({'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   os.path.join(args.experiment, 'checkpoint.pth.tar'))

        # save cluster assignments
        cluster_log.log(cluster_alg.images_lists)
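
# `compute_features` (called in the epoch loop above) is not shown in this
# excerpt. A minimal sketch of what it is assumed to do: run the model, whose
# classification head was removed above, over the whole dataset and stack the
# outputs into a single (N, feature_dim) float32 array. The repo's exact
# implementation may differ.
def compute_features(dataloader, model, N, batch_size):
    model.eval()
    features = None
    with torch.no_grad():
        for i, (input_tensor, _) in enumerate(dataloader):
            aux = model(input_tensor.cuda()).cpu().numpy().astype('float32')
            if features is None:
                # allocate the output array once the feature dim is known
                features = np.zeros((N, aux.shape[1]), dtype='float32')
            features[i * batch_size: i * batch_size + aux.shape[0]] = aux
    return features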
def train_model(args, metadata, device='cuda'):
    print('training on {}'.format(torch.cuda.get_device_name(device) if args.cuda else 'cpu'))

    # load data
    if not args.preload:
        # dataset is kept on the CPU; batches are moved to the GPU in the loop
        dset = SyntheticDataset(args.file, 'cpu')
        train_loader = DataLoader(dset, shuffle=True, batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = dset.get_dims()
        args.N = len(dset)
        metadata.update(dset.get_metadata())
    else:
        train_loader = DataLoaderGPU(args.file, shuffle=True, batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = train_loader.get_dims()
        args.N = train_loader.dataset_len
        metadata.update(train_loader.get_metadata())

    if args.max_iter is None:
        args.max_iter = len(train_loader) * args.epochs

    if args.latent_dim is not None:
        latent_dim = args.latent_dim
        metadata.update({"train_latent_dim": latent_dim})

    # define model and optimizer
    model = None
    if args.i_what == 'iVAE':
        model = iVAE(latent_dim,
                     data_dim,
                     aux_dim,
                     n_layers=args.depth,
                     activation='lrelu',
                     device=device,
                     hidden_dim=args.hidden_dim,
                     anneal=args.anneal,  # False by default
                     file=metadata['file'],  # dataset location, for easier checkpoint loading
                     seed=1,
                     epochs=args.epochs)
    elif args.i_what == 'iFlow':
        metadata.update({"device": device})
        model = iFlow(args=metadata).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=args.lr_drop_factor,
                                                     patience=args.lr_patience,
                                                     verbose=True)  # e.g. factor=0.1, patience=4

    ste = time.time()
    print('setup time: {}s'.format(ste - st))  # `st` is a module-level start time

    # set up loggers
    logger = Logger(logdir=LOG_FOLDER)  # e.g. 'log/'
    exp_id = logger.get_id()
    tensorboard_run_name = TENSORBOARD_RUN_FOLDER + 'exp' + str(exp_id) + '_'.join(
        map(str, ['', args.batch_size, args.max_iter, args.lr,
                  args.hidden_dim, args.depth, args.anneal]))
    # e.g. 'runs/exp1_64_12500_0.001_50_3_False'
    writer = SummaryWriter(logdir=tensorboard_run_name)

    if args.i_what == 'iFlow':
        logger.add('log_normalizer')
        logger.add('neg_log_det')
        logger.add('neg_trace')
    logger.add('loss')
    logger.add('perf')
    print('Beginning training for exp: {}'.format(exp_id))

    # training loop
    epoch = 0
    model.train()
    while epoch < args.epochs:  # previously bounded by args.max_iter (e.g. 12500)
        est = time.time()
        for itr, (x, u, z) in enumerate(train_loader):
            acc_itr = itr + epoch * len(train_loader)

            # x: observations, e.g. shape [64, 4]
            # u: one-hot auxiliary labels, e.g. shape [64, 40] (40 segments)
            # z: ground-truth sources, e.g. shape [64, 2]

            # it += 1
            # model.anneal(args.N, args.max_iter, it)
            optimizer.zero_grad()
            if args.cuda and not args.preload:
                x = x.cuda(device=device, non_blocking=True)
                u = u.cuda(device=device, non_blocking=True)

            if args.i_what == 'iVAE':
                # elbo is a scalar loss; z_est has shape [64, 2]
                elbo, z_est = model.elbo(x, u)
                loss = elbo.mul(-1)
            elif args.i_what == 'iFlow':
                (log_normalizer, neg_trace, neg_log_det), z_est = \
                    model.neg_log_likelihood(x, u)
                loss = log_normalizer + neg_trace + neg_log_det

            loss.backward()
            optimizer.step()

            logger.update('loss', loss.item())
            if args.i_what == 'iFlow':
                logger.update('log_normalizer', log_normalizer.item())
                logger.update('neg_trace', neg_trace.item())
                logger.update('neg_log_det', neg_log_det.item())

            perf = mcc(z.cpu().numpy(), z_est.cpu().detach().numpy())
            logger.update('perf', perf)

            if acc_itr % args.log_freq == 0:  # e.g. every 25 iterations
                logger.log()
                writer.add_scalar('data/performance', logger.get_last('perf'), acc_itr)
                writer.add_scalar('data/loss', logger.get_last('loss'), acc_itr)
                if args.i_what == 'iFlow':
                    writer.add_scalar('data/log_normalizer',
                                      logger.get_last('log_normalizer'), acc_itr)
                    writer.add_scalar('data/neg_trace',
                                      logger.get_last('neg_trace'), acc_itr)
                    writer.add_scalar('data/neg_log_det',
                                      logger.get_last('neg_log_det'), acc_itr)
                scheduler.step(logger.get_last('loss'))

            if acc_itr % int(args.max_iter / 5) == 0 and not args.no_log:
                checkpoint(TORCH_CHECKPOINT_FOLDER,
                           exp_id,
                           acc_itr,
                           model,
                           optimizer,
                           logger.get_last('loss'),
                           logger.get_last('perf'))

        epoch += 1
        eet = time.time()
        if args.i_what == 'iVAE':
            print('epoch {}: {:.4f}s;\tloss: {:.4f};\tperf: {:.4f}'.format(
                epoch, eet - est, logger.get_last('loss'), logger.get_last('perf')))
        elif args.i_what == 'iFlow':
            print('epoch {}: {:.4f}s;\tloss: {:.4f} (l1: {:.4f}, l2: {:.4f}, l3: {:.4f});\tperf: {:.4f}'.format(
                epoch, eet - est,
                logger.get_last('loss'),
                logger.get_last('log_normalizer'),
                logger.get_last('neg_trace'),
                logger.get_last('neg_log_det'),
                logger.get_last('perf')))

    et = time.time()
    print('training time: {}s'.format(et - ste))

    # save final model
    checkpoint(PT_MODELS_FOLDER, "", 'final', model, optimizer,
               logger.get_last('loss'), logger.get_last('perf'))

    writer.close()
    if not args.no_log:
        logger.add_metadata(**metadata)
        logger.save_to_json()
        logger.save_to_npz()

    print('total time: {}s'.format(et - st))
    return model
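
# `mcc` (used above as the performance metric) is assumed to be the mean
# correlation coefficient commonly reported for iVAE/iFlow: the average
# absolute correlation between true and estimated latents under the best
# one-to-one matching. A minimal sketch under that assumption:
import numpy as np
from scipy.optimize import linear_sum_assignment

def mcc(z, z_est):
    d = z.shape[1]
    # absolute cross-correlation between each true and each estimated latent
    corr = np.abs(np.corrcoef(z.T, z_est.T)[:d, d:])
    # pick the matching of estimated to true latents maximizing total correlation
    rows, cols = linear_sum_assignment(-corr)
    return corr[rows, cols].mean()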
real_data = samples_to_vectors(real_batch)
# generate fake data and detach (so no gradients flow back to the generator)
fake_data = generator(to_device(noise(N), device)).detach()

# train discriminator
d_error, d_pred_real, d_pred_fake = train_discriminator(
    d_optimizer, real_data, fake_data)

'''2. Train Generator'''
# generate fake data (no detach this time because gradients are needed)
fake_data = generator(to_device(noise(N), device))
# train generator
g_error = train_generator(g_optimizer, fake_data)

'''Log batch error'''
logger.log(d_error, g_error, d_pred_real, d_pred_fake,
           epoch, batch_num, num_batches)

'''Display progress every few batches & save model checkpoint'''
if batch_num % 100 == 0:
    generator.eval()
    test_samples = vectors_to_samples(generator(test_noise))
    test_samples = test_samples.data
    generator.train()

    logger.log_images(test_samples.cpu(), num_test_samples,
                      epoch, batch_num, num_batches)
    # display status logs
    logger.display_status(epoch, num_epochs, batch_num, num_batches,
                          d_error, g_error, d_pred_real, d_pred_fake)

# Save model dictionaries (uses HEAPS OF MEMORY, ~300-600 MB per save)
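
# `train_discriminator` and `train_generator` are not part of this excerpt.
# The sketch below shows the standard per-batch GAN updates they are assumed
# to perform; `discriminator`, `loss` (e.g. nn.BCELoss()), `ones_target` and
# `zeros_target` are hypothetical names for this sketch, not confirmed API.
def train_discriminator(optimizer, real_data, fake_data):
    N = real_data.size(0)
    optimizer.zero_grad()
    # real samples should be classified as 1
    prediction_real = discriminator(real_data)
    error_real = loss(prediction_real, ones_target(N))
    error_real.backward()
    # generated samples should be classified as 0
    prediction_fake = discriminator(fake_data)
    error_fake = loss(prediction_fake, zeros_target(N))
    error_fake.backward()
    optimizer.step()
    return error_real + error_fake, prediction_real, prediction_fake

def train_generator(optimizer, fake_data):
    N = fake_data.size(0)
    optimizer.zero_grad()
    # the generator improves when the discriminator labels its output as real
    prediction = discriminator(fake_data)
    error = loss(prediction, ones_target(N))
    error.backward()
    optimizer.step()
    return error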