def main():
    logger.info('Start to declare training variable')
    cfg.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info('Session will be run in device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    logger.info('Start to prepare data')
    # get transformers
    # train_transform is for data perturbation
    train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
                                              cfg.tfm_means, cfg.tfm_stds)

    # get datasets
    # each head should have its own trainset
    train_splits = dict(cifar100=[['train', 'test']],
                        stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'], stl10=['train', 'test'])

    # instantiate a dataset for each head
    # otrainset: original trainset
    otrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=reduced_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # ptrainset: perturbed trainset
    ptrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=train_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # testset
    testset = ConcatDataset([
        datasets.get(split=split, transform=test_transform)
        for split in test_splits[cfg.dataset]
    ])
    # declare a data loader for the testset only
    test_loader = DataLoader(testset, batch_size=cfg.batch_size,
                             shuffle=False, num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = PUILoss(cfg.pica_lamda)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        net.load_state_dict(ckpt['net'])
        best_acc = ckpt['acc']
        start_epoch = ckpt['epoch']

    # move modules to target device
    net, criterion = net.to(cfg.device), criterion.to(cfg.device)

    # tensorboard writer
    writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)

    # start training
    lr = cfg.base_lr
    epoch = start_epoch
    while lr > 0 and epoch < cfg.max_epochs:

        lr = lr_handler.update(epoch, optimizer)
        writer.add_scalar('Train/Learning_Rate', lr, epoch)

        logger.info('Start to train at %d epoch with learning rate %.5f' %
                    (epoch, lr))
        train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer)

        logger.info('Start to evaluate after %d epoch of training' % epoch)
        acc, nmi, ari = evaluate(net, test_loader)
        logger.info('Evaluation results at epoch %d are: '
                    'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)

        epoch += 1

        if cfg.debug:
            continue

        # save checkpoint
        is_best = acc > best_acc
        best_acc = max(best_acc, acc)
        save_checkpoint({
            'net': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'acc': acc,
            'epoch': epoch
        }, is_best=is_best)

    logger.info('Done')
def train_head(epoch, net, hidx, head, otrainset, ptrainset, optimizer,
               criterion, writer):
    """Trains one head for an epoch."""
    # declare dataloaders
    random_sampler = RandomSampler(otrainset)
    batch_sampler = RepeatSampler(random_sampler, cfg.batch_size,
                                  nrepeat=cfg.data_nrepeat)
    ploader = DataLoader(ptrainset, batch_sampler=batch_sampler,
                         num_workers=cfg.num_workers, pin_memory=True)
    oloader = DataLoader(otrainset, sampler=random_sampler,
                         batch_size=cfg.batch_size,
                         num_workers=cfg.num_workers, pin_memory=True)

    # set network mode
    net.train()

    # tracking variables
    end = time.time()
    train_loss = AverageMeter('Loss', ':.4f')
    data_time = AverageMeter('Data', ':.3f')
    batch_time = AverageMeter('Time', ':.3f')
    progress = TimeProgressMeter(batch_time, data_time, train_loss,
                                 Batch=len(oloader), Head=len(cfg.net_heads),
                                 Epoch=cfg.max_epochs)

    for batch_idx, (obatch, pbatch) in enumerate(zip(oloader, ploader)):
        # record data loading time
        data_time.update(time.time() - end)

        # move data to target device
        (oinputs, _), (pinputs, _) = (obatch, pbatch)
        oinputs, pinputs = (oinputs.to(cfg.device, non_blocking=True),
                            pinputs.to(cfg.device, non_blocking=True))

        # forward
        ologits, plogits = net(oinputs)[hidx], net(pinputs)[hidx]
        loss = criterion(ologits.repeat(cfg.data_nrepeat, 1), plogits)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.update(loss.item(), oinputs.size(0))
        batch_time.update(time.time() - end)
        end = time.time()
        writer.add_scalar('Train/Loss/Head-%d' % head, train_loss.val,
                          epoch * len(oloader) + batch_idx)

        if batch_idx % cfg.display_freq != 0:
            continue

        logger.info(progress.show(Batch=batch_idx, Epoch=epoch, Head=hidx))
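# The per-epoch `train` dispatcher called from main() is not shown in this
# section. Below is a minimal sketch consistent with the call sites above;
# it assumes cfg.net_heads lists the head output sizes and that heads beyond
# the number of available trainsets reuse the last one. This is an
# illustrative assumption, not necessarily the original implementation.
def train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer):
    """Trains every head for one epoch by dispatching to train_head."""
    for hidx, head in enumerate(cfg.net_heads):
        # clamp the head index so extra heads fall back to the last trainset
        tidx = min(hidx, len(otrainset) - 1)
        train_head(epoch, net, hidx, head, otrainset[tidx], ptrainset[tidx],
                   optimizer, criterion, writer)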
def evaluate(net, loader, writer, epoch):
    """Evaluates the network on the provided data."""
    net.eval()
    predicts = np.zeros(len(loader.dataset), dtype=np.int32)
    labels = np.zeros(len(loader.dataset), dtype=np.int32)
    intermediates = np.zeros((len(loader.dataset), 2048), dtype=np.float32)
    images = np.zeros((len(loader.dataset), 3, 64, 64), dtype=np.float32)
    print(f"Evaluating on {len(loader.dataset)} samples")
    with torch.no_grad():
        for batch_idx, (batch, targets) in enumerate(loader):
            # logger.progress('processing %d/%d batch' % (batch_idx, len(loader)))
            batch = batch.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one; the output dimension of
            # the last head should be consistent with the ground truth
            logits = net(batch, -1)
            start = batch_idx * loader.batch_size
            end = start + loader.batch_size
            end = min(end, len(loader.dataset))
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()
            if epoch % cfg.embedding_freq == 0:
                intermediates[start:end] = net(batch, -1, True).cpu().numpy()
                if not cfg.tfm_adaptive_thresholding:
                    # undo normalisation so the logged images are viewable
                    for i in range(3):
                        batch[:, i] = (batch[:, i] * cfg.tfm_stds[i]) + cfg.tfm_means[i]
                images[start:end] = torch.nn.functional.interpolate(
                    batch, size=(64, 64), mode='bicubic',
                    align_corners=False).cpu().numpy()

    # TODO: Gather labels and predicts

    # compute accuracy via the optimal cluster-to-class assignment
    num_classes = labels.max().item() + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    for i in range(predicts.shape[0]):
        count_matrix[predicts[i], labels[i]] += 1
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    nmi = NMI(labels, predicts)
    ari = ARI(labels, predicts)

    # compute per-class precision, recall and f1 scores
    predicts_reassigned = reassignment[predicts, 1]
    precision = precision_score(labels, predicts_reassigned, average=None,
                                zero_division=0)
    recall = recall_score(labels, predicts_reassigned, average=None,
                          zero_division=0)
    f1 = f1_score(labels, predicts_reassigned, average=None, zero_division=0)

    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))

    if cfg.local_rank == 0:
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)
        for i in range(len(f1)):
            writer.add_scalar(f'Evaluate/f1_{i}', f1[i], epoch)
            writer.add_scalar(f'Evaluate/precision_{i}', precision[i], epoch)
            writer.add_scalar(f'Evaluate/recall_{i}', recall[i], epoch)
        if epoch % cfg.embedding_freq == 0 and cfg.embedding_freq != -1:
            writer.add_embedding(intermediates, labels, images, epoch,
                                 cfg.session)

    return acc
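# The cluster-to-class matching in evaluate() relies on scipy's Hungarian
# solver. The self-contained toy example below (illustration only, not part
# of the training code) shows why `count_matrix.max() - count_matrix` is
# passed to linear_sum_assignment: the solver minimises total cost, so the
# co-occurrence counts are complemented to find the assignment that
# maximises agreement between cluster ids and class ids.
def _toy_hungarian_accuracy():
    import numpy as np
    from scipy.optimize import linear_sum_assignment

    predicts = np.array([0, 0, 1, 1, 2, 2, 2])  # cluster ids from the model
    labels = np.array([1, 1, 0, 0, 2, 2, 0])    # ground-truth class ids

    num_classes = labels.max() + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    for p, l in zip(predicts, labels):
        count_matrix[p, l] += 1

    # maximise matched counts by minimising the complemented cost matrix
    row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix)
    acc = count_matrix[row_ind, col_ind].sum() / len(predicts)
    return acc  # 6/7 ~ 0.857: cluster 0 -> class 1, 1 -> 0, 2 -> 2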
def main():
    torch.autograd.set_detect_anomaly(True)
    logger.info('Start to declare training variable')
    if torch.cuda.is_available():
        cfg.device = torch.device("cuda")
        torch.cuda.set_device(cfg.local_rank)
    else:
        cfg.device = torch.device("cpu")
    logger.info('Session will be run in device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    logger.info('Start to prepare data')
    # get transformers
    # train_transform is for data perturbation
    train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
                                              cfg.tfm_blur, cfg.tfm_means,
                                              cfg.tfm_stds,
                                              cfg.tfm_adaptive_thresholding)

    # get datasets
    # each head should have its own trainset
    train_splits = dict(cifar100=[['train', 'test']],
                        image_folder_wrapper=[['train']],
                        stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'],
                       image_folder_wrapper=['test'],
                       stl10=['train', 'test'])

    # instantiate a dataset for each head
    # otrainset: original trainset
    otrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=reduced_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # ptrainset: perturbed trainset
    ptrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=train_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # testset
    testset = ConcatDataset([
        datasets.get(split=split, transform=test_transform)
        for split in test_splits[cfg.dataset]
    ])
    # declare a data loader for the testset only
    test_loader = DataLoader(testset, batch_size=cfg.batch_size,
                             shuffle=False, num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = PUILoss(cfg.pica_lamda, cfg.pica_target, cfg.pica_iic)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        net.load_state_dict(ckpt['net'])
        best_acc = ckpt['acc']
        start_epoch = ckpt['epoch']

    # initialise distributed training when more than one process is launched
    if int(os.environ["WORLD_SIZE"]) > 1:
        dist.init_process_group(backend="nccl", init_method="env://")
        print("world size: {}".format(os.environ["WORLD_SIZE"]))
        print("rank: {}".format(cfg.local_rank))
        synchronize()

    # move modules to target device
    criterion = criterion.to(cfg.device)
    net = net.to(cfg.device)
    if int(os.environ["WORLD_SIZE"]) > 1:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
        net = torch.nn.parallel.DistributedDataParallel(
            net, device_ids=[cfg.local_rank],
            find_unused_parameters=True,
            output_device=cfg.local_rank).cuda()

    # only rank 0 needs a SummaryWriter
    if cfg.local_rank == 0:
        # tensorboard writer
        writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)
    else:
        writer = None

    # start training
    lr = cfg.base_lr
    epoch = start_epoch

    logger.info('Start to evaluate after %d epoch of training' % epoch)
    acc = evaluate(net, test_loader, writer, epoch)
    if not cfg.debug and cfg.local_rank == 0:
        # save checkpoint
        is_best = acc > best_acc
        best_acc = max(best_acc, acc)
        save_checkpoint({
            'net': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'acc': acc,
            'epoch': epoch
        }, is_best=is_best)

    while lr > 0 and epoch < cfg.max_epochs:

        lr = lr_handler.update(epoch, optimizer)

        logger.info('Start to train at %d epoch with learning rate %.5f' %
                    (epoch, lr))
        train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer)

        epoch += 1

        logger.info('Start to evaluate after %d epoch of training' % epoch)
        acc = evaluate(net, test_loader, writer, epoch)

        if not cfg.debug and cfg.local_rank == 0:
            writer.add_scalar('Train/Learning_Rate', lr, epoch)

            # save checkpoint
            is_best = acc > best_acc
            best_acc = max(best_acc, acc)
            save_checkpoint({
                'net': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'acc': acc,
                'epoch': epoch
            }, is_best=is_best)

    logger.info('Done')
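# `save_checkpoint` is used by the main() variants above but is not defined
# in this section. Below is a minimal sketch of such a helper; the
# `cfg.ckpt_dir` attribute and the file names are assumptions made for
# illustration, not necessarily the project's actual checkpoint layout.
def save_checkpoint(state, is_best=False):
    """Saves the latest training state and keeps a copy of the best one."""
    import shutil  # local import to keep the sketch self-contained
    ckpt_dir = getattr(cfg, 'ckpt_dir', './checkpoints')  # assumed attribute
    os.makedirs(ckpt_dir, exist_ok=True)
    latest_path = os.path.join(ckpt_dir, 'latest.pth')
    torch.save(state, latest_path)
    if is_best:
        shutil.copyfile(latest_path, os.path.join(ckpt_dir, 'best.pth'))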
def main():
    logger.info('Start to declare training variable')
    cfg.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info('Session will be run in device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.
    if cfg.pica:
        logger.info('Work at PICA !!!!')

    logger.info('Start to prepare data')
    # get transformers
    # train_transform is for data perturbation
    # train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    # reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
    #                                           cfg.tfm_means, cfg.tfm_stds)

    # get datasets
    # each head should have its own trainset
    # train_splits = dict(cifar100=[['train', 'test']], cifar10=[['train', 'test']],
    #                     stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'], cifar10=['train', 'test'],
                       stl10=['train', 'test'])

    # instantiate a dataset for each head
    if cfg.dataset.startswith('stl') or cfg.dataset.startswith('cifar'):
        # otrainset: original trainset
        # otrainset = [ConcatDataset([datasets.get(split=split, transform=reduced_transform)
        #                             for split in train_splits[cfg.dataset][hidx]])
        #              for hidx in range(len(train_splits[cfg.dataset]))]
        # # ptrainset: perturbed trainset
        # ptrainset = [ConcatDataset([datasets.get(split=split, transform=train_transform)
        #                             for split in train_splits[cfg.dataset][hidx]])
        #              for hidx in range(len(train_splits[cfg.dataset]))]
        # testset
        testset = ConcatDataset([
            datasets.get(split=split, transform=test_transform)
            for split in test_splits[cfg.dataset]
        ])
    else:
        # otrainset = [ImageFolder(root=cfg.data_root, transform=reduced_transform) for hidx in range(2)]
        # ptrainset = [ImageFolder(root=cfg.data_root, transform=train_transform) for hidx in range(2)]
        testset = ImageFolder(root=cfg.data_root, transform=test_transform)
        logger.debug(
            'Dataset [%s] from directory [%s] is declared and %d samples '
            'are loaded' % (cfg.dataset, cfg.data_root, len(testset)))

    # declare a data loader for the testset only
    test_loader = DataLoader(testset, batch_size=cfg.batch_size,
                             shuffle=False, num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = DCLoss(cfg.dc_lamda)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        if not cfg.pica:
            net.load_state_dict(ckpt['net'])
            best_acc = ckpt['acc']
            start_epoch = ckpt['epoch']
        else:
            net.load_state_dict(ckpt)
            best_acc = 0
            start_epoch = 0

    # data parallel
    if cfg.device == 'cuda' and len(cfg.gpus.split(',')) > 1:
        logger.info('Data parallel will be used for acceleration purpose')
        device_ids = range(len(cfg.gpus.split(',')))
        if not (hasattr(net, 'data_parallel') and net.data_parallel(device_ids)):
            net = nn.DataParallel(net, device_ids=device_ids)
        cudnn.benchmark = True
    else:
        logger.info('Data parallel will not be used for acceleration')

    # move modules to target device
    net, criterion = net.to(cfg.device), criterion.to(cfg.device)

    # tensorboard writer
    writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)

    # evaluation only: no training loop in this variant
    lr = cfg.base_lr
    epoch = start_epoch

    logger.info('Start to evaluate after %d epoch of training' % epoch)
    acc, nmi, ari = evaluate(net, test_loader)
    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
    writer.add_scalar('Evaluate/ACC', acc, epoch)
    writer.add_scalar('Evaluate/NMI', nmi, epoch)
    writer.add_scalar('Evaluate/ARI', ari, epoch)

    logger.info('Done')