def main():
    np.random.seed(0)
    torch.manual_seed(0)

    logger.info('Loading data...')
    train_loader, val_loader, classes = custom_dataset.load_data(args)
    # override autodetect if n_classes is given
    if args.n_classes > 0:
        classes = np.arange(args.n_classes)

    model = load_model(classes)
    logger.info('Loaded model; params={}'.format(util.count_parameters(model)))

    if not args.cpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"
    model.to(device)
    cudnn.benchmark = True
    logger.info('Running on ' + str(device))

    summary_writer = Logger(args.logdir)

    # Loss and Optimizer
    n_epochs = args.epochs
    if args.label_smoothing > 0:
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = nn.CrossEntropyLoss()

    train_state = init_train_state()

    # freeze layers
    for l in args.freeze_layers:
        for p in getattr(model, l).parameters():
            p.requires_grad = False

    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=train_state['lr'],
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'nesterov':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=train_state['lr'],
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)

    # optionally warm-start from a pre-trained checkpoint
    if args.warm_start_from:
        logger.info('Warm-starting from {}'.format(args.warm_start_from))
        assert os.path.isfile(args.warm_start_from)
        train_state = load_checkpoint(args.warm_start_from, model, optimizer)
        logger.info('Params loaded.')
        # do not carry over train_state when warm-starting; only the weights
        train_state = init_train_state()

    ckptfile = str(Path(args.logdir) / args.latest_fname)
    if os.path.isfile(ckptfile):
        logger.info('Loading checkpoint: {}'.format(ckptfile))
        train_state = load_checkpoint(ckptfile, model, optimizer)
        logger.info('Params loaded.')
    else:
        logger.info('Checkpoint {} not found; ignoring.'.format(ckptfile))

    # Training / Eval loop
    epoch_time = []  # store time per epoch
    # we save epoch+1 to checkpoints; but for eval we should repeat prev. epoch
    if args.skip_train:
        train_state['start_epoch'] -= 1

    for epoch in range(train_state['start_epoch'], n_epochs):
        logger.info('Epoch: [%d/%d]' % (epoch + 1, n_epochs))
        start = time.time()

        if not args.skip_train:
            model.train()
            train(train_loader, device, model, criterion, optimizer,
                  summary_writer, train_state, n_classes=len(classes))
            logger.info('Time taken: %.2f sec...' % (time.time() - start))
            if epoch == 0:
                train_state['steps_epoch'] = train_state['step']

        # always eval on last epoch
        if not args.skip_eval or epoch == n_epochs - 1:
            logger.info('\n Starting evaluation...')
            model.eval()
            eval_shrec = (epoch == n_epochs - 1 and bool(args.retrieval_dir))
            metrics, inputs = eval(val_loader, device, model, criterion, eval_shrec)
            logger.info('\tcombined: %.2f, Acc: %.2f, mAP: %.2f, Loss: %.4f'
                        % (metrics['combined'], metrics['acc_inst'],
                           metrics.get('mAP_inst', 0.), metrics['loss']))

            # Log epoch to tensorboard
            # See log using: tensorboard --logdir='logs' --port=6006
            ims = get_summary_ims(inputs)
            if not args.nolog:
                util.logEpoch(summary_writer, model, epoch + 1, metrics, ims)
        else:
            metrics = None

        # Decaying Learning Rate
        if args.lr_decay_mode == 'step':
            if (epoch + 1) % args.lr_decay_freq == 0:
                train_state['lr'] *= args.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = train_state['lr']

        # Save model
        if not args.skip_train:
            logger.info('\tSaving latest model')
            util.save_checkpoint({
                'epoch': epoch + 1,
                'step': train_state['step'],
                'steps_epoch': train_state['steps_epoch'],
                'state_dict': model.state_dict(),
                'metrics': metrics,
                'optimizer': optimizer.state_dict(),
                'lr': train_state['lr'],
            }, str(Path(args.logdir) / args.latest_fname))

        total_epoch_time = time.time() - start
        epoch_time.append(total_epoch_time)
        logger.info('Total time for this epoch: {} s'.format(total_epoch_time))

        # if last epoch, show eval results
        if epoch == n_epochs - 1:
            logger.info('|model|combined|acc inst|acc cls|mAP inst|mAP cls|loss|')
            logger.info('|{}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.4f}|'
                        .format(os.path.basename(args.logdir),
                                metrics['combined'],
                                metrics['acc_inst'],
                                metrics['acc_cls'],
                                metrics.get('mAP_inst', 0.),
                                metrics.get('mAP_cls', 0.),
                                metrics['loss']))

        if args.skip_train:
            # if only evaluating, run a single pass
            break

        if time.perf_counter() + np.max(epoch_time) > start_time + args.exit_after:
            logger.info('Next epoch will likely exceed allotted time; exiting...')
            break
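# NOTE: `init_train_state` and `load_checkpoint` are helpers that main()
# relies on but that are not defined in this section. The sketch below is a
# minimal illustration of the fields main() expects ('lr', 'step',
# 'steps_epoch', 'start_epoch'), assuming the module-level `args` namespace
# and the torch import used above; it is not the project's actual
# implementation and should be adapted to the real checkpoint layout.
import torch


def init_train_state():
    """Hypothetical: fresh training state derived from the CLI args."""
    return {'lr': args.lr, 'step': 0, 'steps_epoch': 0, 'start_epoch': 0}


def load_checkpoint(path, model, optimizer):
    """Hypothetical: restore model/optimizer weights and rebuild train_state."""
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    if 'optimizer' in ckpt:
        optimizer.load_state_dict(ckpt['optimizer'])
    return {'lr': ckpt.get('lr', args.lr),
            'step': ckpt.get('step', 0),
            'steps_epoch': ckpt.get('steps_epoch', 0),
            'start_epoch': ckpt.get('epoch', 0)}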
# print('\nEvaluation:')
# print('\tTrain Acc: %.2f - Loss: %.4f' % (avg_train_acc.item(), avg_loss_train.item()))

avg_val_acc, avg_loss_val = eval(eval_val_loader)

print('\nEvaluation:')
print('\tVal Acc: %.2f - Loss: %.4f' % (avg_val_acc, avg_loss_val))
# print('\tVal Acc: %.2f - Loss: %.4f' % (avg_val_acc.item(), avg_loss_val.item()))
print('\tCurrent best val acc: %.2f' % best_acc)

# Log epoch to tensorboard
# See log using: tensorboard --logdir='logs' --port=6006
# util.logEpoch(logger, model, epoch + 1, avg_loss_val, avg_val_acc, avg_loss_train, avg_train_acc)
util.logEpoch(logger, model, epoch + 1, avg_loss_val, avg_val_acc)

# Save model
if avg_val_acc > best_acc:
    print('\tSaving checkpoint - Acc: %.2f' % avg_val_acc)
    best_acc = avg_val_acc
    best_loss = avg_loss_val
    util.save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'acc': avg_val_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, args.model, START, str(args.depth))
resnet.train()
train()
print('Time taken: %.2f sec.' % (time.time() - start))

resnet.eval()
avg_test_acc, avg_loss = eval(val_loader)

print('\nEvaluation:')
print('\tVal Acc: %.2f - Loss: %.4f' % (avg_test_acc.item(), avg_loss.item()))
print('\tCurrent best val acc: %.2f' % best_acc)

# Log epoch to tensorboard
# See log using: tensorboard --logdir='logs' --port=6006
util.logEpoch(logger, resnet, epoch + 1, avg_loss, avg_test_acc)

# Save model
if avg_test_acc > best_acc:
    print('\tSaving checkpoint - Acc: %.2f' % avg_test_acc)
    best_acc = avg_test_acc
    best_loss = avg_loss
    util.save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': resnet.state_dict(),
        'acc': avg_test_acc,
        'best_acc': best_acc,
        'optimizer': optimizer.state_dict(),
    })

# Decaying Learning Rate
model.train()
train()
print('Time taken: %.2f sec.' % (time.time() - start))

model.eval()
avg_test_acc, avg_loss = eval(val_loader)

print('\nEvaluation:')
print('\tVal Acc: %.2f - Loss: %.4f' % (avg_test_acc.item(), avg_loss.item()))
print('\tCurrent best val acc: %.2f' % best_acc)

# Log epoch to tensorboard
# See log using: tensorboard --logdir='logs' --port=6006
util.logEpoch(logger, model, epoch + 1, avg_loss, avg_test_acc)

# Save model
if avg_test_acc > best_acc:
    print('\tSaving checkpoint - Acc: %.2f' % avg_test_acc)
    best_acc = avg_test_acc
    best_loss = avg_loss
    util.save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'acc': avg_test_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, args.model, args.depth)
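# NOTE: `util.save_checkpoint` is not defined in this section. A minimal
# sketch under the assumption that it simply serializes the checkpoint dict
# with torch.save; the older fragments above pass extra tags (model name,
# START timestamp, depth) that a real implementation may fold into the file
# name, so treat this as an illustration only.
import torch


def save_checkpoint(state, filename='checkpoint.pth.tar'):
    """Hypothetical helper: write the checkpoint dict to `filename`."""
    torch.save(state, filename)
    # e.g. save_checkpoint(state, str(Path(args.logdir) / args.latest_fname))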
def main():
    train_chairs = [
        'chair_0001', 'chair_0005', 'chair_0101', 'chair_0084',
        'chair_0497', 'chair_0724', 'chair_0878'
    ]
    test_chairs = ['chair_0957']
    features = []

    np.random.seed(0)
    torch.manual_seed(0)

    logger.info('Loading data...')
    train_loader, val_loader, classes = custom_dataset.load_data(args)
    # override autodetect if n_classes is given
    if args.n_classes > 0:
        classes = np.arange(args.n_classes)

    model = load_model(classes)
    logger.info('Loaded model; params={}'.format(util.count_parameters(model)))

    if not args.cpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"
    model.to(device)
    cudnn.benchmark = True
    logger.info('Running on ' + str(device))

    summary_writer = Logger(args.logdir)

    # Loss and Optimizer
    n_epochs = args.epochs
    if args.label_smoothing > 0:
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = nn.CrossEntropyLoss()

    train_state = init_train_state()

    # freeze layers
    for l in args.freeze_layers:
        for p in getattr(model, l).parameters():
            p.requires_grad = False

    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=train_state['lr'],
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'nesterov':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=train_state['lr'],
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)

    # optionally warm-start from a pre-trained checkpoint
    if args.warm_start_from:
        logger.info('Warm-starting from {}'.format(args.warm_start_from))
        assert os.path.isfile(args.warm_start_from)
        train_state = load_checkpoint(args.warm_start_from, model, optimizer)
        logger.info('Params loaded.')
        # do not carry over train_state when warm-starting; only the weights
        train_state = init_train_state()

    ckptfile = str(Path(args.logdir) / args.latest_fname)
    if os.path.isfile(ckptfile):
        logger.info('Loading checkpoint: {}'.format(ckptfile))
        train_state = load_checkpoint(ckptfile, model, optimizer)
        logger.info('Params loaded.')
    else:
        logger.info('Checkpoint {} not found; ignoring.'.format(ckptfile))

    # Training / Eval loop
    epoch_time = []  # store time per epoch
    # we save epoch+1 to checkpoints; but for eval we should repeat prev. epoch
    if args.skip_train:
        train_state['start_epoch'] -= 1

    for epoch in range(0, n_epochs):
        logger.info('Epoch: [%d/%d]' % (epoch + 1, n_epochs))
        start = time.time()

        if not args.skip_train:
            model.train()
            if epoch == n_epochs - 1:
                # extract view descriptors on the last training epoch
                features = train(train_loader, device, model, criterion,
                                 optimizer, summary_writer, train_state, 1,
                                 train_chairs, n_classes=len(classes))
                PIK = "descriptors.dat"
                with open(PIK, "wb") as f:
                    pickle.dump(features, f)
            else:
                train(train_loader, device, model, criterion, optimizer,
                      summary_writer, train_state, 0, train_chairs,
                      n_classes=len(classes))
            logger.info('Time taken: %.2f sec...' % (time.time() - start))
            if epoch == 0:
                train_state['steps_epoch'] = train_state['step']

        # always eval on last epoch
        if not args.skip_eval or epoch == n_epochs - 1:
            # print("-------------SAVING MODEL----------------")
            # torch.save(model, "saved.pth")
            logger.info('\n Starting evaluation...')
            model.eval()
            eval_shrec = (epoch == n_epochs - 1 and bool(args.retrieval_dir))
            metrics, inputs = eval(val_loader, device, model, criterion,
                                   eval_shrec, 0, test_chairs, features)
            logger.info('\tcombined: %.2f, Acc: %.2f, mAP: %.2f, Loss: %.4f'
                        % (metrics['combined'], metrics['acc_inst'],
                           metrics.get('mAP_inst', 0.), metrics['loss']))

            # Log epoch to tensorboard
            # See log using: tensorboard --logdir='logs' --port=6006
            ims = get_summary_ims(inputs)
            if not args.nolog:
                util.logEpoch(summary_writer, model, epoch + 1, metrics, ims)
        else:
            metrics = None

        # Decaying Learning Rate
        if args.lr_decay_mode == 'step':
            if (epoch + 1) % args.lr_decay_freq == 0:
                train_state['lr'] *= args.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = train_state['lr']

        # Save model
        if not args.skip_train:
            logger.info('\tSaving latest model')
            util.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'step': train_state['step'],
                    'steps_epoch': train_state['steps_epoch'],
                    'state_dict': model.state_dict(),
                    'metrics': metrics,
                    'optimizer': optimizer.state_dict(),
                    'lr': train_state['lr'],
                }, str(Path(args.logdir) / args.latest_fname))

        total_epoch_time = time.time() - start
        epoch_time.append(total_epoch_time)
        logger.info('Total time for this epoch: {} s'.format(total_epoch_time))

        if args.skip_train:
            # if only evaluating, run a single pass
            break

        if time.perf_counter() + np.max(epoch_time) > start_time + args.exit_after:
            logger.info('Next epoch will likely exceed allotted time; exiting...')
            break

    print("Encoder training done")
    print("Now training the Decoder")

    # ############################## Decoder ##############################
    decoder = models.Decoder()
    print(decoder)
    decoder.to(device)

    train_state = init_train_state()
    crit = nn.MSELoss()
    optim = torch.optim.SGD(decoder.parameters(),
                            lr=train_state['lr'],
                            momentum=args.momentum,
                            weight_decay=args.weight_decay,
                            nesterov=True)

    path = "/home/smjadhav/Research/emvn/decoder_model/latest.pth.tar"
    if os.path.isfile(path):
        logger.info('Loading decoder checkpoint: {}'.format(path))
        train_state = load_checkpoint(path, decoder, optim)
        logger.info('Params loaded.')
    else:
        print("Decoder model not found")

    train_size = len(train_loader)
    metrics = {}

    for epoch in range(0, 50):
        print("Epoch ", epoch + 1)
        decoder.train()

        # iterate over the pickled descriptor/image pairs until EOF
        PIK = "D1.dat"
        with open(PIK, "rb") as f:
            try:
                i = 0
                while True:
                    data = pickle.load(f)
                    inputs = torch.from_numpy(data[1]).to(device)
                    target_img = torch.from_numpy(data[0]).to(device)

                    outputs = decoder(inputs)
                    optim.zero_grad()
                    loss = crit(outputs, target_img)
                    loss.backward()
                    optim.step()

                    if args.lr_decay_mode == 'cos':
                        # estimate steps_epoch from first epoch (we may have dropped entries)
                        steps_epoch = (train_state['steps_epoch']
                                       if train_state['steps_epoch'] > 0
                                       else len(train_loader))
                        # TODO: there will be a jump here if many entries are dropped
                        # and we only figure out # of steps after first epoch
                        if train_state['step'] < steps_epoch:
                            train_state['lr'] = args.lr * train_state['step'] / steps_epoch
                        else:
                            nsteps = steps_epoch * args.epochs
                            train_state['lr'] = (0.5 * args.lr *
                                                 (1 + np.cos(train_state['step'] * np.pi / nsteps)))
                        for param_group in optim.param_groups:
                            param_group['lr'] = train_state['lr']

                    if (i + 1) % args.print_freq == 0:
                        print("\tIter [%d/%d] Loss: %.4f"
                              % (i + 1, train_size, loss.item()))
                    if args.max_steps > 0 and i > args.max_steps:
                        break
                    i = i + 1
            except EOFError:
                # reached the end of the pickled descriptor file
                pass

        if (epoch + 1) % 5 == 0:
            print("Saving Decoder model")
            util.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'step': train_state['step'],
                    'steps_epoch': train_state['steps_epoch'],
                    'state_dict': decoder.state_dict(),
                    'metrics': metrics,
                    'optimizer': optim.state_dict(),
                    'lr': train_state['lr'],
                }, path)

    # dump the last decoder outputs for inspection
    PIK = "images.dat"
    with open(PIK, "wb") as f:
        pickle.dump(outputs, f)
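# The 'cos' learning-rate branch in the decoder loop above implements a
# linear warm-up over the first epoch's steps followed by cosine decay over
# the remaining steps. The same schedule is pulled out below as a standalone
# sketch for clarity; `base_lr`, `steps_epoch`, and `total_epochs` stand in
# for args.lr, train_state['steps_epoch'], and args.epochs.
import numpy as np


def cosine_lr(step, base_lr, steps_epoch, total_epochs):
    """Linear warm-up for the first epoch, then cosine decay toward zero."""
    if step < steps_epoch:
        return base_lr * step / steps_epoch
    nsteps = steps_epoch * total_epochs
    return 0.5 * base_lr * (1 + np.cos(step * np.pi / nsteps))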