def model_fn(features, labels, mode, params):
    """Model function used in the estimator.

    Args:
        features (Tensor): Input features to the model.
        labels (Tensor): Labels tensor for training and evaluation.
        mode (ModeKeys): Specifies if training, evaluation or prediction.
        params (dict): Dictionary of hyper-parameters.

    Returns:
        (EstimatorSpec): Model to be run by Estimator.
    """
    # Define the model's architecture
    if params['model'] == 'DNN':
        y_prob, logit, l1_reg, l2_reg = DNN.model(features, params, mode)
    else:
        tf.compat.v1.logging.fatal(
            'Params model setting wrong with {0}'.format(params['model']))
        sys.exit(1)

    export_outputs = None
    y_prediction = tf.reshape(y_prob, [-1])
    predictions_dict = {'predicted': y_prediction}

    if mode in (TRAIN, EVAL):
        labels_flatten = tf.reshape(tf.cast(labels, dtype=tf.int32), [-1])
        auc = tf.compat.v1.metrics.auc(labels_flatten, y_prediction)
        apk = tf.compat.v1.metrics.average_precision_at_k(
            tf.cast(labels_flatten, dtype=tf.int64), y_prediction,
            k=params['batch_size'])
        precision = tf.compat.v1.metrics.precision(labels_flatten, y_prediction)
        recall = tf.compat.v1.metrics.recall(labels_flatten, y_prediction)
        metrics = {
            'eval_auc': auc,
            'eval_avgpk': apk,
            'eval_precision': precision,
            'eval_recall': recall
        }
        tf.compat.v1.summary.scalar('train_auc', auc[1])
        tf.compat.v1.summary.scalar('train_avgpk', apk[1])
        tf.compat.v1.summary.scalar('train_precision', precision[1])
        tf.compat.v1.summary.scalar('train_recall', recall[1])

        loss = Loss.loss_func(labels, y_prob, logit, l1_reg, l2_reg, params)

        if mode == TRAIN:
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                global_step = tf.compat.v1.train.get_global_step()
                learning_rate = tf.compat.v1.train.exponential_decay(
                    learning_rate=params['learning_rate'],
                    global_step=global_step,
                    decay_steps=params['decay_step'],
                    decay_rate=params['decay_rate'])
                optimizer = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=learning_rate, epsilon=0.1)
                trainable_variables = tf.compat.v1.trainable_variables()
                gradients = tf.gradients(loss, trainable_variables)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                train_op = optimizer.apply_gradients(
                    zip(clipped_gradients, trainable_variables),
                    global_step=global_step)
                # Note: apply_gradients(..., global_step=global_step) already
                # increments the global step, so grouping it with the assign
                # below advances the step twice per training iteration.
                update_global_step = tf.compat.v1.assign(
                    global_step, global_step + 1, name='update_global_step')
                final_train_op = tf.group(train_op, update_global_step)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions_dict,
                                              loss=loss,
                                              train_op=final_train_op,
                                              export_outputs=export_outputs)
        else:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions_dict,
                                              loss=loss,
                                              eval_metric_ops=metrics,
                                              export_outputs=export_outputs)
    elif mode == PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions_dict)
    else:
        tf.compat.v1.logging.fatal(
            'Training MODE setting is wrong: {0}'.format(mode))
        sys.exit(1)
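# A minimal usage sketch (not from the original source) of how a model_fn like the
# one above is typically handed to tf.estimator.Estimator. The params values, the
# model_dir path, and the train_input_fn/eval_input_fn arguments are assumptions
# for illustration only; the real pipeline must supply input functions that yield
# (features, labels) batches compatible with DNN.model.
def run_estimator_sketch(train_input_fn, eval_input_fn):
    params = {
        'model': 'DNN',          # selects the DNN branch in model_fn
        'batch_size': 256,       # assumed value
        'learning_rate': 1e-3,   # assumed value
        'decay_step': 10000,     # assumed value
        'decay_rate': 0.96,      # assumed value
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir='/tmp/dnn_model',  # assumed path
                                       params=params)
    estimator.train(input_fn=train_input_fn, max_steps=100000)
    return estimator.evaluate(input_fn=eval_input_fn)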
def train(train_loop_func, logger, args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=args.backbone)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    if args.fp16 and not args.amp:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            ssd300.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader,
                                    val_dataloader, encoder, iteration, logger, args,
                                    mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
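# A minimal sketch (not from the original source) of what a tencent_trick helper,
# as passed to the SGD optimizer above, typically does: it splits parameters into
# two groups so that biases and 1-D (BatchNorm) parameters get no weight decay.
# The repo's actual helper may differ in details; this is illustrative only.
def tencent_trick_sketch(model):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # skip frozen parameters
        if len(param.shape) == 1 or name.endswith('.bias'):
            no_decay.append(param)  # BatchNorm parameters and all biases
        else:
            decay.append(param)     # conv / linear weights keep weight decay
    # Parameter groups: weight_decay=0.0 overrides the optimizer-level default for
    # the no_decay group; the decay group inherits the default set on the optimizer.
    return [{'params': no_decay, 'weight_decay': 0.0},
            {'params': decay}]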
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='smddp', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader,
                                    val_dataloader, encoder, iteration, logger, args,
                                    mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        # Note: torch.distributed.get_rank() requires init_process_group to have run;
        # a single-GPU run without it would fail on the rank checks below.
        if torch.distributed.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()

    if torch.distributed.get_rank() == 0:
        DLLogger.log((), {'Total training time': '%.2f' % total_time + ' secs'})
        logger.log_summary()
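# A hedged launcher sketch (not from the original source): the flags and defaults
# below are assumptions chosen only to show which fields the train() functions
# above read from `args`; the real scripts define their own arguments and defaults.
import argparse

def make_parser_sketch():
    parser = argparse.ArgumentParser(description='SSD300 training (illustrative flags)')
    parser.add_argument('--data', type=str, default='/datasets/coco2017')  # assumed path
    parser.add_argument('--epochs', type=int, default=65)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--learning-rate', type=float, default=2.6e-3)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight-decay', type=float, default=5e-4)
    parser.add_argument('--multistep', nargs='*', type=int, default=[43, 54])
    parser.add_argument('--evaluation', nargs='*', type=int, default=[21, 31, 37, 42, 48, 53, 59, 64])
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--checkpoint', type=str, default=None)
    parser.add_argument('--save', type=str, default=None)
    parser.add_argument('--mode', type=str, default='training', choices=['training', 'evaluation'])
    parser.add_argument('--backbone', type=str, default='resnet50')
    parser.add_argument('--backbone-path', type=str, default=None)
    parser.add_argument('--amp', action='store_true')
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('--local_rank', type=int, default=0)
    return parser

# A typical multi-GPU launch sets WORLD_SIZE and --local_rank via the launcher, e.g.:
#   python -m torch.distributed.launch --nproc_per_node=8 train.py --amp --save ./models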
def train(args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    ssd300 = model(args)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)
    loss_func.cuda()
    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)

    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    val_dataloader, inv_map = get_val_dataloader(args)
    train_loader = get_train_loader(args, dboxes)

    acc = 0
    logger = Logger(args.batch_size, args.local_rank)

    for epoch in range(0, args.epochs):
        logger.start_epoch()
        scheduler.step()
        iteration = train_loop(ssd300, loss_func, epoch, optimizer, train_loader,
                               iteration, logger, args)
        logger.end_epoch()

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                print('Epoch {:2d}, Accuracy: {:4f} mAP'.format(epoch, acc))

        if args.data_pipeline == 'dali':
            train_loader.reset()

    return acc, logger.average_speed()
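# A minimal sketch (not from the original source) of the equivalent training step
# using native torch.cuda.amp instead of the deprecated apex amp_handle /
# FP16_Optimizer path used in train() above. The model and loss call conventions
# here are generic assumptions, not the repo's exact SSD signatures.
import torch

scaler = torch.cuda.amp.GradScaler()

def native_amp_step_sketch(model, loss_func, optimizer, imgs, targets):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():           # run the forward pass in mixed precision
        outputs = model(imgs)
        loss = loss_func(outputs, targets)    # assumed call convention
    scaler.scale(loss).backward()             # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)                    # unscales gradients, then optimizer.step()
    scaler.update()                           # adapt the loss scale for the next step
    return loss.detach()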
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)
    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    train_loader = get_train_loader(args, args.seed - 2**31)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    # args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    print(f"Actual starting LR: {args.learning_rate}")
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    # optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
    #                             momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True)
    optimizer = torch.optim.AdamW(tencent_trick(ssd300), lr=args.learning_rate,
                                  betas=(0.8, 0.999), eps=1e-08,
                                  weight_decay=0.01, amsgrad=True)
    # scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    # scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=20, T_mult=1, eta_min=1e-6)
    scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=args.epochs, eta_min=1e-6)
    # scheduler = OneCycleLR(optimizer, max_lr=0.003, epochs=41, steps_per_epoch=173)
    # scheduler = CyclicLR(optimizer, base_lr=args.learning_rate, max_lr=2*args.learning_rate,
    #                      step_size_up=173*3, step_size_down=173*10)

    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))
        return

    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        # scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, scheduler,
                                    train_loader, val_dataloader, encoder, iteration,
                                    logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        # https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
        scheduler.step()

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
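# Note (not from the original source): the commented-out OneCycleLR / CyclicLR
# options above are per-iteration schedulers, so if one of them were enabled,
# scheduler.step() would move from the epoch loop into the batch loop, e.g.:
#
#   for imgs, targets in train_loader:
#       loss = training_step(...)   # hypothetical helper
#       optimizer.step()
#       scheduler.step()            # once per batch for OneCycleLR / CyclicLR
#
# CosineAnnealingLR with T_max=args.epochs, as used above, is stepped once per epoch.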
def train(args):
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)
    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)
    ssd300 = SSD300(len(cocoGt.cats) + 1)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)

    ssd300.cuda()
    loss_func.cuda()

    if args.fp16:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    avg_loss = 0.0
    acc = 0
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end

    args.train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
    args.train_coco_root = os.path.join(args.data, "train2017")

    local_seed = set_seeds(args)

    if args.data_pipeline == 'no_dali':
        train_trans = SSDTransformer(dboxes, args, (300, 300), val=False)
        train_dataset = get_train_dataset(args, train_trans)
        train_loader = get_train_loader(train_dataset, args, args.num_workers)
    elif args.data_pipeline == 'dali':
        train_loader = get_train_dali_loader(args, dboxes, local_seed)

    for epoch in range(args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        epoch_loop(train_loader, args, ssd300, time.time(), loss_func, optimizer,
                   iteration, avg_loss, batch_perf, epoch)
        torch.cuda.synchronize()

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)

        try:
            train_loader.reset()
        except AttributeError:
            pass

    if args.local_rank == 0:
        print("Training end: Average speed: {:3f} img/sec, Total time: {:3f} sec, Final accuracy: {:3f} mAP"
              .format(args.N_gpu * args.batch_size / batch_perf.avg,
                      time.time() - train_start, acc))
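# A minimal sketch (not from the original source) of the AverageMeter used for
# batch_perf above; the repo's own class may differ, but this is the common form:
# it tracks the latest value plus a running sum, count, and average.
class AverageMeter(object):
    """Tracks the most recent value and the running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count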
all_time = 0
for epoch in range(opt.start_epochs, opt.epochs):
    start = time.time()
    model.train()
    for batch_i, datas in enumerate(train_loader):
        # for batch_i, (imgNames, imgs, targets) in enumerate(train_loader):
        for data in datas:
            imgs = data[0][0].cuda()
            targets = data[1][0].cuda()
            label_id = data[2][0].cuda()
            targets = torch.cat([label_id, targets], dim=1)

            _, outputs = model(imgs)
            loss = Loss(outputs, targets)

            optimizer.zero_grad()
            if opt.fp16:
                if opt.amp:
                    with amp.scale_loss(loss, optimizer) as scale_loss:
                        scale_loss.backward()
                else:
                    # optimizer.backward(loss)
                    loss.backward()
            else:
                loss.backward()
            optimizer.step()

            progress_bar(