def main(cfg):
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]

    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    model.CLASSES = datasets.CLASSES
    # Pass an example batch through so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=cfg.optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())

    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=None))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          interval=10,
                                          image_interval=100,
                                          s3_dir=None))

    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
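# A minimal launch sketch for the Horovod-based entry point above. This is an
# assumption about how `main(cfg)` is driven, not code from this repo: `hvd`
# is taken to be `horovod.tensorflow`, and the config path is hypothetical.
if __name__ == '__main__':
    import horovod.tensorflow as hvd
    hvd.init()
    # pin each worker process to a single visible GPU
    physical_gpus = tf.config.list_physical_devices('GPU')
    if physical_gpus:
        tf.config.set_visible_devices(physical_gpus[hvd.local_rank()], 'GPU')
    cfg = Config.fromfile('configs/faster_rcnn_r50.py')  # hypothetical path
    main(cfg)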
def main():
    args = parse_args()
    num_gpus = len(gpus)
    cfg = Config.fromfile(args.config)

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        total_bs = len(gpus) * cfg.data.imgs_per_gpu
        cfg.optimizer['learning_rate'] = \
            cfg.optimizer['learning_rate'] * total_bs / 8

    # init distributed env first, since the logger depends on the dist info
    init_dist()
    if not gpus:
        distributed = False  # single node, single gpu
    else:
        distributed = True

    # create work_dir
    mkdir_or_exist(osp.abspath(cfg.work_dir))

    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('TF MMDetection Version: {}'.format(__version__))
    logger.info('Config:\n{}'.format(cfg.text))
    logger.info('Tensorflow version: {}'.format(tf.version.VERSION))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed + get_dist_info()[0],
                        deterministic=args.deterministic)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)

    # dummy forward pass so the network's variables are created
    padded_img_side = max(cfg.data.train['scale'])
    img = tf.random.uniform(shape=[padded_img_side, padded_img_side, 3],
                            dtype=tf.float32)
    img_meta = tf.constant([
        465., 640., 3., 800., 1101., 3.,
        float(padded_img_side), float(padded_img_side), 3., 1.7204301, 0.
    ], dtype=tf.float32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)

    weights_path = cfg.model['backbone']['weights_path']
    logger.info('Loading weights from: {}'.format(weights_path))
    model.layers[0].layers[0].load_weights(weights_path,
                                           by_name=True,
                                           skip_mismatch=True)

    print_model_info(model, logger)

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) > 1:
        # a combined train/val workflow is not supported yet
        raise NotImplementedError

    train_detector(model,
                   datasets,
                   cfg,
                   num_gpus=num_gpus,
                   distributed=distributed,
                   mixed_precision=args.amp,
                   validate=args.validate,
                   timestamp=timestamp)
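# The 11-element `img_meta` tensor used for the dummy forward passes in these
# scripts appears to encode, as far as can be inferred from the values (a
# 465 x 640 original resized to 800 x 1101, scale factor 800 / 465 ~= 1.7204301):
#   [ori_h, ori_w, ori_c, resize_h, resize_w, resize_c,
#    pad_h, pad_w, pad_c, scale_factor, flip]
# A hypothetical helper that builds it from named fields:
def make_img_meta(ori_shape, resize_shape, pad_shape, scale_factor, flip=0.0):
    return tf.constant(
        list(ori_shape) + list(resize_shape) + list(pad_shape) +
        [scale_factor, flip],
        dtype=tf.float32)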
def main():
    args = parse_args()
    print(args)

    assert args.out or args.json_out, \
        ('Please specify at least one operation to save the results '
         'with the argument "--out" or "--json_out"')
    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')
    if args.json_out is not None and args.json_out.endswith('.json'):
        args.json_out = args.json_out[:-5]

    cfg = Config.fromfile(args.config)
    cfg.model.pretrained = None
    distributed = False

    # build the dataset
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)

    # build the model and load checkpoint
    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)

    # dummy forward pass so the network's variables are created
    img = tf.random.uniform(shape=[1333, 1333, 3], dtype=tf.float32)
    img_meta = tf.constant(
        [465., 640., 3., 800., 1101., 3., 1333., 1333., 3., 1.7204301, 0.],
        dtype=tf.float32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)
    load_checkpoint(model, args.checkpoint)
    model.CLASSES = dataset.CLASSES

    if not distributed:
        outputs = single_gpu_test(model, dataset)
    else:
        raise NotImplementedError

    rank, _, _, _ = get_dist_info()
    if args.out and rank == 0:
        print('\nwriting results to {}'.format(args.out))
        fileio.dump(outputs, args.out)
        eval_types = args.eval
        if eval_types:
            print('Starting to evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = args.out
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_files = results2json(dataset, outputs, args.out)
                    coco_eval(result_files, eval_types, dataset.coco)
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = args.out + '.{}'.format(name)
                        result_files = results2json(dataset, outputs_,
                                                    result_file)
                        coco_eval(result_files, eval_types, dataset.coco)

    # Save predictions in the COCO json format
    if args.json_out and rank == 0:
        if not isinstance(outputs[0], dict):
            results2json(dataset, outputs, args.json_out)
        else:
            for name in outputs[0]:
                outputs_ = [out[name] for out in outputs]
                result_file = args.json_out + '.{}'.format(name)
                results2json(dataset, outputs_, result_file)
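# `single_gpu_test` is defined elsewhere in this repo. A minimal sketch of
# what it is assumed to do, given its call site above (hypothetical helper,
# suffixed `_sketch` to avoid clashing with the real implementation): run the
# model image-by-image in inference mode and collect per-image detections.
def single_gpu_test_sketch(model, dataset):
    results = []
    for img, img_meta in dataset:  # assumed to yield (image, meta) pairs
        result = model(
            (tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
            training=False)
        results.append(result)
    return results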
def main(cfg):
    decompress_data(cfg)

    ######################################################################################
    # Create Training Data
    ######################################################################################
    cfg.global_batch_size = cfg.batch_size_per_device * hvd.size()
    cfg.steps_per_epoch = cfg.coco_images // cfg.global_batch_size
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]

    ######################################################################################
    # Build Model
    ######################################################################################
    # update any hyperparameters that were passed in via arguments
    if cfg.ls > 0.0:
        cfg.model['bbox_head']['label_smoothing'] = cfg.ls
    if cfg.use_rcnn_bn:
        cfg.model['bbox_head']['use_bn'] = cfg.use_rcnn_bn
    if cfg.use_conv:
        cfg.model['bbox_head']['use_conv'] = cfg.use_conv
    # cfg.schedule ('1x' or 'cosine') is expected to be set by the caller

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass an example batch through so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])))
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)

    ######################################################################################
    # Build optimizer and associated scheduler
    ######################################################################################
    # the base learning rate is set for a global batch size of 8,
    # with linear scaling for larger batches
    base_learning_rate = cfg.base_learning_rate
    scaled_learning_rate = base_learning_rate * cfg.global_batch_size / 8
    steps_per_epoch = cfg.steps_per_epoch
    if cfg.schedule == '1x':
        # step decay: drop the LR 10x after epochs 8 and 10
        scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            [steps_per_epoch * 8, steps_per_epoch * 10], [
                scaled_learning_rate, scaled_learning_rate * 0.1,
                scaled_learning_rate * 0.01
            ])
    elif cfg.schedule == 'cosine':
        scheduler = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=scaled_learning_rate,
            first_decay_steps=12 * steps_per_epoch,
            t_mul=1,
            m_mul=1)
    else:
        raise NotImplementedError
    warmup_init_lr = 1.0 / cfg.warmup_init_lr_scale * scaled_learning_rate
    scheduler = WarmupScheduler(scheduler, warmup_init_lr, cfg.warmup_steps)
    # FIXME: currently hardcoded to SGD
    optimizer = tf.keras.optimizers.SGD(scheduler,
                                        momentum=0.9,
                                        nesterov=False)
    if cfg.fp16:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    runner.timestamp = int(time())

    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=cfg.s3_checkpoints,
                                  h5=True))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          image_interval=100,
                                          s3_dir=cfg.s3_tensorboard))

    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
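# `WarmupScheduler` is imported from elsewhere in this repo. A minimal sketch
# of what it is assumed to do, based on its call site above: ramp the learning
# rate linearly from `warmup_init_lr` up to the wrapped schedule's value over
# `warmup_steps`, then defer to the wrapped schedule. Hypothetical
# implementation, not the repo's own.
class WarmupSchedulerSketch(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, schedule, warmup_init_lr, warmup_steps):
        super().__init__()
        self.schedule = schedule
        self.warmup_init_lr = warmup_init_lr
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        target = self.schedule(step)
        # frac clamps at 1.0, so after warmup this returns the wrapped value
        frac = tf.minimum(step / warmup_steps, 1.0)
        return self.warmup_init_lr + frac * (target - self.warmup_init_lr)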
def main_sagemaker(args, cfg):
    """Main training entry point for jobs launched via SageMaker."""
    instance_name = cfg.sagemaker_job['job_name']
    s3_path = cfg.sagemaker_job['s3_path']
    decompress_data()  # set up data dirs based on the SageMaker channels
    num_gpus = len(gpus)

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        total_bs = get_dist_info()[2] * cfg.data.imgs_per_gpu
        cfg.optimizer['learning_rate'] = \
            cfg.optimizer['learning_rate'] * total_bs / 8

    # init distributed env first, since the logger depends on the dist info
    init_dist()
    if not gpus:
        distributed = False  # single node, single gpu
    else:
        distributed = True

    # create work_dir
    mkdir_or_exist(osp.abspath(cfg.work_dir))

    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('TF MMDetection Version: {}'.format(__version__))
    logger.info('Config:\n{}'.format(cfg.text))
    logger.info('Tensorflow version: {}'.format(tf.version.VERSION))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed + get_dist_info()[0],
                        deterministic=args.deterministic)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)

    # dummy forward pass so the network's variables are created
    padded_img_side = max(cfg.data.train['scale'])
    img = tf.random.uniform(shape=[padded_img_side, padded_img_side, 3],
                            dtype=tf.float32)
    img_meta = tf.constant([
        465., 640., 3., 800., 1101., 3.,
        float(padded_img_side), float(padded_img_side), 3., 1.7204301, 0.
    ], dtype=tf.float32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)

    # SageMaker-specific path resolution
    import os
    import pathlib
    data_root = pathlib.Path(
        os.getenv('SM_CHANNEL_COCO')).joinpath('coco').as_posix()
    cfg.data.train['dataset_dir'] = data_root
    cfg.data.val['dataset_dir'] = data_root
    weights_file = cfg.model['backbone']['weights_path']
    weights_path = pathlib.Path(
        os.getenv('SM_CHANNEL_WEIGHTS')).joinpath(weights_file).as_posix()
    logger.info('Loading weights from: {}'.format(weights_path))
    if osp.splitext(weights_file)[1] == '.h5':
        # older Keras format from the Keras model zoo
        model.layers[0].layers[0].load_weights(weights_path,
                                               by_name=True,
                                               skip_mismatch=True)
    else:
        # SavedModel format assumed - copy the weights layer by layer
        backbone_model = tf.keras.models.load_model(weights_path)
        for layer_idx, layer in enumerate(backbone_model.layers):
            if layer_idx < len(model.layers[0].layers[0].layers):
                model.layers[0].layers[0].layers[layer_idx].set_weights(
                    layer.get_weights())
                print('Loaded weights for:', layer.name)
        del backbone_model

    print_model_info(model, logger)

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) > 1:
        # a combined train/val workflow is not supported yet
        raise NotImplementedError

    train_detector(model,
                   datasets,
                   cfg,
                   num_gpus=num_gpus,
                   distributed=distributed,
                   mixed_precision=args.amp,
                   validate=args.validate,
                   timestamp=timestamp)
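# The index-based copy in the SavedModel branch above assumes the pretrained
# backbone and the detector's backbone enumerate layers in the same order. A
# name-based variant (hypothetical helper, stricter about layout differences)
# could look like this:
def copy_backbone_weights_by_name(src_model, dst_model):
    dst_layers = {layer.name: layer for layer in dst_model.layers}
    for src_layer in src_model.layers:
        dst_layer = dst_layers.get(src_layer.name)
        if dst_layer is not None and src_layer.get_weights():
            dst_layer.set_weights(src_layer.get_weights())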
def main_ec2(args, cfg):
    """Main training entry point for jobs launched directly on EC2 instances."""
    # start the logger
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, '{}.log'.format(timestamp))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    num_gpus = len(gpus)

    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.resume_dir is not None:
        if os.path.exists(args.resume_dir):
            logger.info("RESUMING TRAINING")
            # get the latest checkpoint
            all_chkpt = [
                os.path.join(args.resume_dir, d)
                for d in os.listdir(args.resume_dir)
                if os.path.isdir(os.path.join(args.resume_dir, d))
            ]
            if not all_chkpt:
                cfg.resume_from = None
            else:
                latest_chkpt = max(all_chkpt, key=os.path.getmtime)
                # set the latest checkpoint as the one to resume from
                cfg.resume_from = latest_chkpt
        else:
            logger.info("CHECKPOINT NOT FOUND, RESTARTING TRAINING")
            cfg.resume_from = None
    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        total_bs = get_dist_info()[2] * cfg.data.imgs_per_gpu
        cfg.optimizer['learning_rate'] = \
            cfg.optimizer['learning_rate'] * total_bs / 8

    # init distributed env first, since the logger depends on the dist info
    # init_dist()
    if not gpus:
        distributed = False  # single node, single gpu
    else:
        distributed = True

    # create work_dir
    mkdir_or_exist(osp.abspath(cfg.work_dir))

    # log some basic info
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('TF MMDetection Version: {}'.format(__version__))
    logger.info('Config:\n{}'.format(cfg.text))
    logger.info('Tensorflow version: {}'.format(tf.version.VERSION))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}, deterministic: {}'.format(
            args.seed, args.deterministic))
        set_random_seed(args.seed + get_dist_info()[0],
                        deterministic=args.deterministic)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)

    # dummy forward pass so the network's variables are created
    padded_img_side = max(cfg.data.train['scale'])
    img = tf.random.uniform(shape=[padded_img_side, padded_img_side, 3],
                            dtype=tf.float32)
    img_meta = tf.constant([
        465., 640., 3., 800., 1101., 3.,
        float(padded_img_side), float(padded_img_side), 3., 1.7204301, 0.
    ], dtype=tf.float32)
    _ = model((tf.expand_dims(img, axis=0), tf.expand_dims(img_meta, axis=0)),
              training=False)

    weights_path = cfg.model['backbone']['weights_path']
    logger.info('Loading weights from: {}'.format(weights_path))
    if osp.splitext(weights_path)[1] == '.h5':
        # older Keras format from the Keras model zoo
        model.layers[0].layers[0].load_weights(weights_path,
                                               by_name=True,
                                               skip_mismatch=True)
    else:
        # SavedModel format assumed - copy the weights layer by layer
        backbone_model = tf.keras.models.load_model(weights_path)
        for layer_idx, layer in enumerate(backbone_model.layers):
            if layer_idx < len(model.layers[0].layers[0].layers):
                model.layers[0].layers[0].layers[layer_idx].set_weights(
                    layer.get_weights())
                print('Loaded weights for:', layer.name)
        del backbone_model

    print_model_info(model, logger)

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) > 1:
        # a combined train/val workflow is not supported yet
        raise NotImplementedError

    train_detector(model,
                   datasets,
                   cfg,
                   num_gpus=num_gpus,
                   distributed=distributed,
                   mixed_precision=args.amp,
                   validate=args.validate,
                   timestamp=timestamp)
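# How the two entry points above are assumed to be selected (hypothetical
# launcher, not part of this file): SageMaker training containers expose
# their input channels as SM_CHANNEL_* environment variables, so their
# presence is a reasonable dispatch signal.
if __name__ == '__main__':
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if os.getenv('SM_CHANNEL_COCO'):
        main_sagemaker(args, cfg)
    else:
        main_ec2(args, cfg)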