import os
import sys
import time
import random
import logging

import numpy as np
import torch

from ops.pgcn_ops import CompletenessLoss, ClassWiseRegressionLoss
# NOTE: the remaining project-specific symbols used below (parser, get_and_save_args,
# PGCNDataSet, collate_fn, PGCN, get_logger, SummaryWriter, TwoStageDetector,
# VideoDataSet, save_checkpoint, adjust_learning_rate, train, validate) are assumed
# to come from the repo's own modules.

# Fix all random seeds for reproducibility.
SEED = 777
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

best_loss = 100
pin_memory = True

# Select the GPU to use.
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "2"

# Parse config files and command-line arguments.
configs = get_and_save_args(parser)
parser.set_defaults(**configs)
dataset_configs = configs["dataset_configs"]
model_configs = configs["model_configs"]
graph_configs = configs["graph_configs"]
args = parser.parse_args()

# Module-level training dataset/loader (also rebuilt inside main()).
train_dataset = PGCNDataSet(dataset_configs, graph_configs,
                            prop_file=dataset_configs['train_prop_file'],
                            prop_dict_path=dataset_configs['train_dict_path'],
                            ft_path=dataset_configs['train_ft_path'],
                            epoch_multiplier=dataset_configs['training_epoch_multiplier'],
                            mode='Train')
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True,
                                           collate_fn=collate_fn, num_workers=4,
                                           pin_memory=True, drop_last=True)
def main():
    global args, best_loss, writer, adj_num, logger

    configs = get_and_save_args(parser)
    parser.set_defaults(**configs)
    dataset_configs = configs["dataset_configs"]
    model_configs = configs["model_configs"]
    graph_configs = configs["graph_configs"]
    args = parser.parse_args()

    """copy codes and create dir for saving models and logs"""
    if not os.path.isdir(args.snapshot_pref):
        os.makedirs(args.snapshot_pref)

    logger = get_logger(args)
    logger.info('\ncreating folder: ' + args.snapshot_pref)

    if not args.evaluate:
        writer = SummaryWriter(args.snapshot_pref)
        # recorder = Recorder(args.snapshot_pref, ["models", "__pycache__"])
        # recorder.writeopt(args)

    logger.info('\nruntime args\n\n{}\n\nconfig\n\n{}'.format(args, dataset_configs))

    """construct model"""
    model = PGCN(model_configs, graph_configs)
    # model_att = AttEmbedding()
    policies = model.get_optim_policies()
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            logger.info(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            logger.info(("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'])))
        else:
            logger.info(("=> no checkpoint found at '{}'".format(args.resume)))

    """construct dataset"""
    train_loader = torch.utils.data.DataLoader(
        PGCNDataSet(dataset_configs, graph_configs,
                    prop_file=dataset_configs['train_prop_file'],
                    prop_dict_path=dataset_configs['train_dict_path'],
                    ft_path=dataset_configs['train_ft_path'],
                    epoch_multiplier=dataset_configs['training_epoch_multiplier'],
                    test_mode=False),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True,
        drop_last=True)  # in training we drop the last incomplete minibatch

    val_loader = torch.utils.data.DataLoader(
        PGCNDataSet(dataset_configs, graph_configs,
                    prop_file=dataset_configs['test_prop_file'],
                    prop_dict_path=dataset_configs['val_dict_path'],
                    ft_path=dataset_configs['test_ft_path'],
                    epoch_multiplier=dataset_configs['testing_epoch_multiplier'],
                    reg_stats=train_loader.dataset.stats,
                    test_mode=False, val_mode=True),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    """loss and optimizer"""
    activity_criterion = torch.nn.CrossEntropyLoss().cuda()
    completeness_criterion = CompletenessLoss().cuda()
    regression_criterion = ClassWiseRegressionLoss().cuda()

    for group in policies:
        logger.info(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
            group['name'], len(group['params']), group['lr_mult'], group['decay_mult'])))

    optimizer = torch.optim.SGD(policies, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.evaluate:
        validate(val_loader, model, activity_criterion, completeness_criterion,
                 regression_criterion, 0)
        return

    # args.lr = 1e-3
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr_steps)
        train(train_loader, model, activity_criterion, completeness_criterion,
              regression_criterion, optimizer, epoch)

        # evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            loss = validate(val_loader, model, activity_criterion,
                            completeness_criterion, regression_criterion,
                            (epoch + 1) * len(train_loader))

            # remember best validation loss and save checkpoint
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,  # store the running best, not the current loss
                    'reg_stats': torch.from_numpy(train_loader.dataset.stats)
                },
                is_best, epoch, filename='checkpoint.pth.tar')

    writer.close()
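# adjust_learning_rate() is called in both training loops but defined elsewhere in the
# repo. The sketch below is a minimal, assumed implementation of the usual step-decay
# schedule used with get_optim_policies()-style parameter groups (the per-group
# 'lr_mult'/'decay_mult' fields logged above); the 10x decay per milestone is an
# assumption for illustration, not necessarily the repo's exact setting.
def adjust_learning_rate(optimizer, epoch, lr_steps):
    """Decay the base learning rate by 10x at every milestone in lr_steps."""
    decay = 0.1 ** sum(epoch >= int(step) for step in lr_steps)
    lr = args.lr * decay
    wd = args.weight_decay
    for param_group in optimizer.param_groups:
        # each policy group scales the base lr / weight decay by its own multipliers
        param_group['lr'] = lr * param_group.get('lr_mult', 1.0)
        param_group['weight_decay'] = wd * param_group.get('decay_mult', 1.0)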
def main():
    global args, best_loss, writer

    configs = get_and_save_args(parser)
    parser.set_defaults(**configs)
    dataset_configs = configs["dataset_configs"]
    model_configs = configs["model_configs"]
    args = parser.parse_args()

    # Config values override the command-line defaults when present.
    if 'batch_size' in model_configs:
        args.batch_size = model_configs['batch_size']
    if 'iter_size' in model_configs:
        args.iter_size = model_configs['iter_size']

    model = TwoStageDetector(model_configs, roi_size=dataset_configs['roi_pool_size'])

    # Count model parameters.
    cnt = 0
    for p in model.parameters():
        cnt += p.data.numel()
    print(cnt)

    """copy codes and create dir for saving models and logs"""
    if not os.path.isdir(args.snapshot_pref):
        os.makedirs(args.snapshot_pref)

    date = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
    logfile = os.path.join(args.snapshot_pref, date + '_train.log')
    get_logger(args, logfile)
    logging.info(' '.join(sys.argv))
    logging.info('\ncreating folder: ' + args.snapshot_pref)

    if not args.evaluate:
        pass
        # writer = SummaryWriter(args.snapshot_pref)
        # make a copy of the entire project folder, which can cost huge space
        # recorder = Recorder(args.snapshot_pref, ["models", "__pycache__"])
        # recorder.writeopt(args)

    logging.info('\nruntime args\n\n{}\n\nconfig\n\n{}'.format(args, dataset_configs))
    logging.info(str(model))
    logging.info(str(cnt))

    if 'lr' in model_configs:
        args.lr = model_configs['lr']
    logging.info('Using learning rate {}'.format(args.lr))

    """construct model"""
    policies = model.get_optim_policies()
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            logging.info(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info(("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'])))
        else:
            logging.info(("=> no checkpoint found at '{}'".format(args.resume)))

    """construct dataset"""
    train_dataset = VideoDataSet(
        dataset_configs,
        prop_file=dataset_configs['train_prop_file'],
        ft_path=dataset_configs['train_ft_path'],
        epoch_multiplier=dataset_configs['training_epoch_multiplier'],
        test_mode=False)

    kwargs = {}
    kwargs['shuffle'] = True
    loss_kwargs = {}

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        num_workers=args.workers, pin_memory=True,
        drop_last=True, **kwargs)  # in training we drop the last incomplete minibatch

    # val_loader = None
    val_loader = torch.utils.data.DataLoader(
        VideoDataSet(dataset_configs,
                     prop_file=dataset_configs['test_prop_file'],
                     ft_path=dataset_configs['test_ft_path'],
                     epoch_multiplier=dataset_configs['testing_epoch_multiplier'],
                     reg_stats=train_loader.dataset.stats,
                     test_mode=False),
        batch_size=args.batch_size, shuffle=False, drop_last=True,
        num_workers=args.workers, pin_memory=True)
    logging.info('Dataloaders constructed')

    """loss and optimizer"""
    activity_criterion = torch.nn.CrossEntropyLoss(**loss_kwargs).cuda()
    completeness_criterion = CompletenessLoss().cuda()
    regression_criterion = ClassWiseRegressionLoss().cuda()

    # for group in policies:
    #     logging.info(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
    #         group['name'], len(group['params']), group['lr_mult'], group['decay_mult'])))

    optimizer = torch.optim.SGD(policies, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.evaluate:
        validate(val_loader, model, activity_criterion, completeness_criterion,
                 regression_criterion, 0, -1)
        return

    print('Start training loop')
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr_steps)
        train(train_loader, model, activity_criterion, completeness_criterion,
              regression_criterion, optimizer, epoch)

        # always save the latest checkpoint, with a placeholder best_loss
        latest_ckpt_path = args.snapshot_pref + \
            '_'.join((args.dataset, 'latest', 'checkpoint.pth.tar'))
        ckpt = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_loss': 1000,
            'reg_stats': torch.from_numpy(train_loader.dataset.stats)
        }
        torch.save(ckpt, latest_ckpt_path)

        # evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            loss = validate(val_loader, model, activity_criterion,
                            completeness_criterion, regression_criterion,
                            (epoch + 1) * len(train_loader), epoch)

            # remember best validation loss and save checkpoint
            # loss = np.exp(-epoch/100)
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            ckpt['best_loss'] = best_loss
            save_checkpoint(ckpt, is_best, epoch, filename='checkpoint.pth.tar')
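# save_checkpoint() is likewise defined elsewhere in the repo; below is a minimal
# sketch consistent with how it is called above (state dict, is_best flag, epoch,
# filename). The file-naming scheme (prefixing with args.snapshot_pref and
# args.dataset, and copying to a separate 'best' file) is an assumption for
# illustration, not the repo's confirmed layout.
import shutil

def save_checkpoint(state, is_best, epoch, filename='checkpoint.pth.tar'):
    """Write the latest checkpoint and keep a copy of the best-so-far model."""
    path = args.snapshot_pref + '_'.join((args.dataset, filename))
    torch.save(state, path)
    if is_best:
        best_path = args.snapshot_pref + '_'.join((args.dataset, 'best', filename))
        shutil.copyfile(path, best_path)


# Standard entry-point guard; such a script is usually launched as
#   python pgcn_train.py <args>
if __name__ == '__main__':
    main()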