def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
    """Runs evaluation steps and aggregates metrics."""
    timer = Timer()
    timer.tic()

    logger.info('Testing...')
    for batch_idx, x in enumerate(test_dist_dataset):
        labels, outputs = test_step(x)
        metric.update_state(labels, outputs)

        if batch_idx % print_freq == 0:
            time = timer.toc(average=False)
            logger.info('Predict for batch: {}/{} Time: {:.3f} sec'.format(
                batch_idx, num_batches, time))
            timer.tic()

    logger.info('Total time: {:.3f} sec'.format(timer.total_time))

    timer.reset()

    logger.info('Evaluating predictions...')
    timer.tic()
    result = metric.result()
    timer.toc(average=False)
    logger.info('Total time: {:.3f} sec'.format(timer.total_time))

    return result

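# A minimal sketch of the contract `evaluate` expects from its arguments;
# `my_test_step` and the MeanAbsoluteError metric are illustrative
# assumptions, not part of this module. `test_step` maps one batch to a
# (labels, outputs) pair, and `metric` is any object with Keras-style
# update_state()/result() methods.
import tensorflow as tf

def my_test_step(x):
    images, labels = x      # one batch from the dataset
    del images              # a real step would run model(images)
    outputs = labels + 0.1  # stand-in for model predictions
    return labels, outputs

mae = tf.keras.metrics.MeanAbsoluteError()
labels, outputs = my_test_step((tf.zeros([2, 4]), tf.ones([2, 4])))
mae.update_state(labels, outputs)
print(float(mae.result()))  # ~0.1
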
def export(config):
    model_builder = get_model_builder(config)

    strategy = tf.distribute.get_strategy()
    compression_ctrl, _, _ = restore_compressed_model(config, strategy,
                                                      model_builder,
                                                      config.ckpt_path)

    save_path, save_format = get_saving_parameters(config)
    compression_ctrl.export_model(save_path, save_format)
    logger.info("Saved to {}".format(save_path))

def __init__(self, params):
    logger.info('FastrcnnBoxLoss huber_loss_delta {}'.format(
        params.huber_loss_delta))
    # The delta is typically around the mean value of the regression target.
    # For instance, the regression targets of a 512x512 input with 6 anchors
    # on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
    self._huber_loss = tf.keras.losses.Huber(
        delta=params.huber_loss_delta,
        reduction=tf.keras.losses.Reduction.SUM)

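# A small self-contained sketch of how the Huber loss above behaves: with
# delta around the regression-target scale (here the assumed value 0.1),
# errors below delta are penalized quadratically and larger ones linearly,
# which damps the influence of outlier box targets. SUM reduction matches
# the construction above.
import tensorflow as tf

huber = tf.keras.losses.Huber(delta=0.1,
                              reduction=tf.keras.losses.Reduction.SUM)
y_true = tf.constant([[0.0, 0.0]])
y_pred = tf.constant([[0.05, 1.0]])
# Per element: 0.5*e^2 if |e| <= delta else delta*(|e| - 0.5*delta);
# Keras averages over the last axis before the SUM reduction.
print(float(huber(y_true, y_pred)))
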
def build_model(self, weights=None, is_training=None):
    outputs = self.model_outputs(self._input_layer, is_training)
    keras_model = tf.keras.models.Model(inputs=self._input_layer,
                                        outputs=outputs,
                                        name='yolo_v4')

    if weights:
        logger.info('Loaded pretrained weights from {}'.format(weights))
        keras_model.load_weights(weights, by_name=True)

    return keras_model

def resume_from_checkpoint(checkpoint_manager, ckpt_path, steps_per_epoch):
    if load_checkpoint(checkpoint_manager.checkpoint, ckpt_path) == 0:
        return 0, 0
    optimizer = checkpoint_manager.checkpoint.optimizer
    initial_step = optimizer.iterations.numpy()
    initial_epoch = initial_step // steps_per_epoch
    logger.info('Resuming from epoch %d (global step %d)',
                initial_epoch, initial_step)
    return initial_epoch, initial_step

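# A toy check of the resume arithmetic above (numbers are made up): the
# restored optimizer's iteration counter is the single source of truth,
# and the epoch is recovered by integer division with steps_per_epoch.
steps_per_epoch = 100
initial_step = 250            # e.g. restored optimizer.iterations
initial_epoch = initial_step // steps_per_epoch
assert (initial_epoch, initial_step) == (2, 250)
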
def load_and_save_checkpoint(checkpoint, config):
    """Load checkpoint and re-save it."""
    load_checkpoint(checkpoint, config.ckpt_path)
    if config.checkpoint_save_dir is None:
        config.checkpoint_save_dir = config.log_dir
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, config.checkpoint_save_dir, max_to_keep=None)
    save_path = checkpoint_manager.save()
    logger.info('Saved checkpoint: {}'.format(save_path))

def evaluate(self):
    """Evaluates with detections from all images with COCO API.

    Returns:
        coco_metric: float numpy array with shape [24] representing the
            coco-style evaluation metrics (box and mask).
    """
    if not self._annotation_file:
        logger.info('There is no annotation_file in COCOEvaluator.')
        gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
            self._groundtruths)
        coco_gt = coco_utils.COCOWrapper(
            eval_type=('mask' if self._include_mask else 'box'),
            gt_dataset=gt_dataset)
    else:
        logger.info('Using annotation file: %s', self._annotation_file)
        coco_gt = self._coco_gt

    coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
        self._predictions)
    coco_dt = coco_gt.load_res(predictions=coco_predictions)
    image_ids = [ann['image_id'] for ann in coco_predictions]

    coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    coco_metrics = coco_eval.stats

    if self._include_mask:
        mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
        mcoco_eval.params.imgIds = image_ids
        mcoco_eval.evaluate()
        mcoco_eval.accumulate()
        mcoco_eval.summarize()
        mask_coco_metrics = mcoco_eval.stats
        metrics = np.hstack((coco_metrics, mask_coco_metrics))
    else:
        metrics = coco_metrics

    # Cleans up the internal variables in order for a fresh eval next time.
    self.reset()

    metrics_dict = {}
    for i, name in enumerate(self._metric_names):
        metrics_dict[name] = metrics[i].astype(np.float32)
    return metrics_dict

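# For reference, a sketch of the conventional COCO summarize() ordering that
# the 24-element result above concatenates (12 box stats followed by 12 mask
# stats when masks are enabled). The exact names come from
# self._metric_names, so treat the names below as an assumption.
BOX_METRIC_NAMES = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl',
                    'ARmax1', 'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
MASK_METRIC_NAMES = ['mask_' + name for name in BOX_METRIC_NAMES]
assert len(BOX_METRIC_NAMES + MASK_METRIC_NAMES) == 24
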
def _load_tfrecords(self):
    logger.info('Using TFRecords to load {} data.'.format(self._split))

    dataset_key = self._dataset_name.replace('/', '')
    if dataset_key in self._tfrecord_datasets:
        self._dataset_loader = self._tfrecord_datasets[dataset_key](
            config=self._config, is_train=self._is_train)
    else:
        raise ValueError('Unknown dataset name: {}'.format(
            self._dataset_name))

    dataset = self._dataset_loader.as_dataset()
    return dataset

def build_model(self, weights=None, is_training=None):
    with keras_utils.maybe_enter_backend_graph():
        outputs = self.model_outputs(self._input_layer, is_training)
        keras_model = tf.keras.models.Model(inputs=self._input_layer,
                                            outputs=outputs,
                                            name='retinanet')

    if self._checkpoint_path:
        logger.info('Init backbone')
        init_checkpoint_fn = self.make_restore_checkpoint_fn()
        init_checkpoint_fn(keras_model)

    if weights:
        logger.info('Loaded pretrained weights from {}'.format(weights))
        keras_model.load_weights(weights)

    return keras_model

def build_model(self, weights=None, is_training=None):
    input_layers = self.build_input_layers(self._params, is_training)
    with keras_utils.maybe_enter_backend_graph():
        outputs = self.model_outputs(input_layers, is_training)
        keras_model = tf.keras.models.Model(inputs=input_layers,
                                            outputs=outputs,
                                            name='maskrcnn')

    if self._checkpoint_path:
        logger.info('Init backbone')
        init_checkpoint_fn = self.make_restore_checkpoint_fn()
        init_checkpoint_fn(keras_model)

    if weights:
        logger.info('Loaded pretrained weights from {}'.format(weights))
        _restore_baseline_weights(keras_model, weights)

    return keras_model

def train(train_step, test_step, eval_metric, train_dist_dataset,
          test_dist_dataset, initial_epoch, initial_step, epochs,
          steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir,
          optimizer, num_test_batches, print_freq):
    train_summary_writer = SummaryWriter(log_dir, 'train')
    validation_summary_writer = SummaryWriter(log_dir, 'validation')
    compression_summary_writer = SummaryWriter(log_dir, 'compression')

    timer = Timer()
    timer.tic()

    statistics = compression_ctrl.statistics()
    logger.info(statistics.to_str())

    logger.info('Training...')
    for epoch in range(initial_epoch, epochs):
        logger.info('Epoch: {}/{}'.format(epoch, epochs))

        train_epoch(train_step, compression_ctrl, epoch, initial_epoch,
                    steps_per_epoch, optimizer, checkpoint_manager,
                    train_dist_dataset, train_summary_writer, initial_step,
                    print_freq, timer)

        test_metric_result = evaluate(test_step, eval_metric,
                                      test_dist_dataset, num_test_batches,
                                      print_freq)
        validation_summary_writer(metrics=test_metric_result,
                                  step=optimizer.iterations.numpy())
        eval_metric.reset_states()
        logger.info('Validation metric = {}'.format(test_metric_result))

        statistics = compression_ctrl.statistics()
        logger.info(statistics.to_str())
        statistics = {
            f'compression/statistics/{name}': value
            for name, value in prepare_for_tensorboard(statistics).items()
        }
        compression_summary_writer(metrics=statistics,
                                   step=optimizer.iterations.numpy())

    train_summary_writer.close()
    validation_summary_writer.close()
    compression_summary_writer.close()

def export(config):
    model_builder = get_model_builder(config)
    model = model_builder.build_model(weights=config.get('weights', None))

    compression_state = None
    if config.ckpt_path:
        compression_state = load_compression_state(config.ckpt_path)

    compression_ctrl, compress_model = create_compressed_model(
        model, config.nncf_config, compression_state)

    if config.ckpt_path:
        checkpoint = tf.train.Checkpoint(
            model=compress_model,
            compression_state=TFCompressionState(compression_ctrl))
        load_checkpoint(checkpoint, config.ckpt_path)

    save_path, save_format = get_saving_parameters(config)
    compression_ctrl.export_model(save_path, save_format)
    logger.info("Saved to {}".format(save_path))

def train_epoch(train_step, compression_ctrl, epoch, initial_epoch,
                steps_per_epoch, optimizer, checkpoint_manager,
                train_dist_dataset, train_summary_writer, initial_step,
                print_freq, timer):
    compression_ctrl.scheduler.epoch_step(epoch)

    for step, x in enumerate(train_dist_dataset):
        if epoch == initial_epoch and step < initial_step % steps_per_epoch:
            continue

        if step == steps_per_epoch:
            save_path = checkpoint_manager.save()
            logger.info('Saved checkpoint for epoch={}: {}'.format(
                epoch, save_path))
            break

        compression_ctrl.scheduler.step()
        train_loss = train_step(x)
        train_metric_result = tf.nest.map_structure(
            lambda s: s.numpy().astype(float), train_loss)

        if np.isnan(train_metric_result['total_loss']):
            raise ValueError('total loss is NaN')

        train_metric_result.update(
            {'learning_rate': optimizer.lr(optimizer.iterations).numpy()})

        train_summary_writer(metrics=train_metric_result,
                             step=optimizer.iterations.numpy())

        if step % print_freq == 0:
            time = timer.toc(average=False)
            logger.info('Step: {}/{} Time: {:.3f} sec'.format(
                step, steps_per_epoch, time))
            logger.info('Training metric = {}'.format(train_metric_result))
            timer.tic()

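# A toy illustration (made-up numbers) of the resume-skip logic above: when
# training restarts mid-epoch, batches with index below
# initial_step % steps_per_epoch were already consumed and are skipped, so
# the first processed step lines up with the restored optimizer iteration
# count.
steps_per_epoch = 100
initial_step = 250                      # restored global step
skip_until = initial_step % steps_per_epoch
processed = [s for s in range(steps_per_epoch) if s >= skip_until]
assert processed[0] == 50 and len(processed) == 50
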
def _restore_checkpoint_fn(keras_model):
    """Loads pretrained model through scaffold function."""
    if not checkpoint_path:
        logger.info('checkpoint_path is empty')
        return

    var_prefix = prefix
    if prefix and not prefix.endswith('/'):
        var_prefix += '/'

    var_to_shape_map = _get_checkpoint_map(checkpoint_path)
    assert var_to_shape_map, 'var_to_shape_map should not be empty'

    vars_to_load = _build_assignment_map(
        keras_model,
        prefix=var_prefix,
        skip_variables_regex=skip_regex,
        var_to_shape_map=var_to_shape_map)
    if not vars_to_load:
        raise ValueError('The list of variables to load is empty.')
    tf.compat.v1.train.init_from_checkpoint(checkpoint_path, vars_to_load)

def _load_tfds(self):
    logger.info('Using TFDS to load {} data.'.format(self._split))

    set_hard_limit_num_open_files()

    self._dataset_loader = tfds.builder(self._dataset_name,
                                        data_dir=self._dataset_dir)
    self._dataset_loader.download_and_prepare()

    decoders = {'image': tfds.decode.SkipDecoding()} \
        if self._skip_decoding else None

    read_config = tfds.ReadConfig(interleave_cycle_length=64,
                                  interleave_block_length=1)

    dataset = self._dataset_loader.as_dataset(
        split=self._split,
        as_supervised=self._as_supervised,
        shuffle_files=self._is_train,
        decoders=decoders,
        read_config=read_config)
    return dataset

def build_scheduler(config, steps_per_epoch):
    optimizer_config = config.get('optimizer', {})
    schedule_type = optimizer_config.get('schedule_type', 'step').lower()
    schedule_params = optimizer_config.get('schedule_params', {})

    gamma = schedule_params.get('gamma', optimizer_config.get('gamma', 0.1))
    base_lr = schedule_params.get('base_lr',
                                  optimizer_config.get('base_lr', None))
    schedule_base_lr_check(schedule_type, base_lr)

    if schedule_type == 'exponential':
        step = schedule_params.get('step', optimizer_config.get('step', 1))
        decay_steps = step * steps_per_epoch
        logger.info('Using exponential learning rate with: '
                    'initial lr: %f, decay steps: %d, decay rate: %f',
                    base_lr, decay_steps, gamma)
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=base_lr,
            decay_steps=decay_steps,
            decay_rate=gamma)
    elif schedule_type == 'piecewise_constant':
        boundaries = schedule_params.get(
            'boundaries', optimizer_config.get('boundaries', None))
        if boundaries is None:
            raise ValueError('`boundaries` parameter must be specified '
                             'for the `piecewise_constant` scheduler')
        values = schedule_params.get('values',
                                     optimizer_config.get('values', None))
        if values is None:
            raise ValueError('`values` parameter must be specified '
                             'for the `piecewise_constant` scheduler')
        logger.info('Using piecewise constant decay. '
                    'Parameters: boundaries: %s, values: %s',
                    boundaries, values)
        boundaries = [steps_per_epoch * x for x in boundaries]
        lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries, values)
    elif schedule_type == 'multistep':
        logger.info('Using MultiStep learning rate.')
        steps = schedule_params.get('steps',
                                    optimizer_config.get('steps', None))
        if steps is None:
            raise ValueError('`steps` parameter must be specified '
                             'for the `multistep` scheduler')
        steps = [steps_per_epoch * x for x in steps]
        lr = MultiStepLearningRate(base_lr, steps, gamma=gamma)
    elif schedule_type == 'step':
        step = schedule_params.get('step', optimizer_config.get('step', 1))
        decay_steps = step * steps_per_epoch
        logger.info('Using step learning rate with: '
                    'base_lr: %f, decay steps: %d, gamma: %f',
                    base_lr, decay_steps, gamma)
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=base_lr,
            decay_steps=decay_steps,
            decay_rate=gamma,
            staircase=True)
    elif schedule_type == 'step_warmup':
        lr = StepLearningRateWithLinearWarmup(schedule_params)
    elif schedule_type == 'cosine':
        decay_steps = steps_per_epoch * config.epochs
        logger.info('Using cosine learning rate with: '
                    'base_lr: %f, decay steps: %d',
                    base_lr, decay_steps)
        lr = tf.keras.experimental.CosineDecay(
            initial_learning_rate=base_lr, decay_steps=decay_steps)
    else:
        raise KeyError(
            f'Unknown learning rate scheduler type: {schedule_type}')

    return lr

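# A quick sanity sketch of the 'step' branch above: ExponentialDecay with
# staircase=True multiplies the learning rate by gamma once per
# `step * steps_per_epoch` optimizer iterations. The numbers here are
# assumed for illustration only.
import tensorflow as tf

base_lr, gamma, decay_steps = 0.1, 0.1, 200  # e.g. step=2, steps_per_epoch=100
lr = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=base_lr,
    decay_steps=decay_steps,
    decay_rate=gamma,
    staircase=True)
print(float(lr(0)), float(lr(199)), float(lr(200)))  # 0.1 0.1 0.01 (approx)
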
def run(config):
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    # Create dataset
    train_builder, test_builder = get_dataset_builders(
        config, strategy.num_replicas_in_sync)
    train_dataset = train_builder.build()
    test_dataset = test_builder.build()

    train_dist_dataset = strategy.experimental_distribute_dataset(
        train_dataset)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch
    num_test_batches = test_builder.steps_per_epoch

    # Create model builder
    model_builder = get_model_builder(config)

    def model_eval_fn(model):
        test_step = create_test_step_fn(strategy, model,
                                        model_builder.post_processing)
        metric_result = evaluate(test_step, model_builder.eval_metrics(),
                                 test_dist_dataset, num_test_batches,
                                 config.print_freq)
        return metric_result['AP']

    # Register additional parameters in the NNCFConfig for initialization
    # of the compressed model during building
    nncf_config = config.nncf_config
    nncf_config = register_default_init_args(
        nncf_config=nncf_config,
        data_loader=train_dataset,
        batch_size=train_builder.global_batch_size)

    resume_training = config.ckpt_path is not None

    compression_state = None
    if resume_training:
        compression_state = load_compression_state(config.ckpt_path)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None)) as model:
        with strategy.scope():
            config.nncf_config.register_extra_structs(
                [ModelEvaluationArgs(eval_fn=model_eval_fn)])
            compression_ctrl, compress_model = create_compressed_model(
                model, nncf_config, compression_state)
            scheduler = build_scheduler(config=config,
                                        steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            eval_metric = model_builder.eval_metrics()
            loss_fn = model_builder.build_loss_fn(compress_model,
                                                  compression_ctrl.loss)
            predict_post_process_fn = model_builder.post_processing

            checkpoint = tf.train.Checkpoint(
                model=compress_model,
                optimizer=optimizer,
                compression_state=TFCompressionState(compression_ctrl))
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if resume_training:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, config.ckpt_path, steps_per_epoch)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn,
                                      optimizer)
    test_step = create_test_step_fn(strategy, compress_model,
                                    predict_post_process_fn)

    if 'train' in config.mode:
        if is_accuracy_aware_training(config):
            train_summary_writer = SummaryWriter(config.log_dir, 'train')
            timer = Timer()
            timer.tic()

            def train_epoch_fn(compression_ctrl, model, epoch, **kwargs):
                train_step = create_train_step_fn(strategy, model, loss_fn,
                                                  optimizer)
                train_epoch(train_step, compression_ctrl, epoch,
                            initial_epoch, steps_per_epoch, optimizer,
                            checkpoint_manager, train_dist_dataset,
                            train_summary_writer, initial_step,
                            config.print_freq, timer)

            def validate_fn(model, **kwargs):
                test_step = create_test_step_fn(strategy, model,
                                                predict_post_process_fn)
                metric_result = evaluate(test_step, eval_metric,
                                         test_dist_dataset, num_test_batches,
                                         config.print_freq)
                return metric_result['AP']

            acc_aware_training_loop = create_accuracy_aware_training_loop(
                nncf_config, compression_ctrl)
            compress_model = acc_aware_training_loop.run(
                compress_model,
                train_epoch_fn=train_epoch_fn,
                validate_fn=validate_fn,
                tensorboard_writer=SummaryWriter(config.log_dir,
                                                 'accuracy_aware_training'),
                log_dir=config.log_dir)
        else:
            train(train_step, test_step, eval_metric, train_dist_dataset,
                  test_dist_dataset, initial_epoch, initial_step, epochs,
                  steps_per_epoch, checkpoint_manager, compression_ctrl,
                  config.log_dir, optimizer, num_test_batches,
                  config.print_freq)

        statistics = compression_ctrl.statistics()
        logger.info(statistics.to_str())

    metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                             num_test_batches, config.print_freq)
    logger.info('Validation metric = {}'.format(metric_result))

    if config.metrics_dump is not None:
        write_metrics(metric_result['AP'], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info("Saved to {}".format(save_path))

def load_checkpoint(checkpoint, ckpt_path):
    logger.info('Load from checkpoint is enabled')
    if tf.io.gfile.isdir(ckpt_path):
        path_to_checkpoint = tf.train.latest_checkpoint(ckpt_path)
        logger.info('Latest checkpoint: {}'.format(path_to_checkpoint))
    else:
        path_to_checkpoint = ckpt_path if tf.io.gfile.exists(
            ckpt_path + '.index') else None
        logger.info('Provided checkpoint: {}'.format(path_to_checkpoint))

    if not path_to_checkpoint:
        logger.info('No checkpoint detected')
        return 0

    logger.info('Checkpoint file {} found and restoring from checkpoint'.format(
        path_to_checkpoint))
    status = checkpoint.restore(path_to_checkpoint)
    status.expect_partial()
    logger.info('Completed loading from checkpoint')
    return None

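# A minimal usage sketch, assuming this module's `load_checkpoint` is
# importable and using a placeholder path: the function returns 0 when
# nothing could be restored and None on success, which is why callers such
# as `resume_from_checkpoint` compare the result with 0.
import tensorflow as tf

checkpoint = tf.train.Checkpoint(step=tf.Variable(0))
if load_checkpoint(checkpoint, '/path/to/ckpt_dir') == 0:
    print('Starting from scratch')
else:
    print('Restored, step = {}'.format(int(checkpoint.step)))
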
def build_optimizer(config, scheduler):
    optimizer_config = config.get('optimizer', {})
    optimizer_type = optimizer_config.get('type', 'adam').lower()
    optimizer_params = optimizer_config.get('optimizer_params', {})
    logger.info('Building %s optimizer with params %s', optimizer_type,
                optimizer_params)

    if optimizer_type in ['sgd', 'momentum']:
        printable_names = {'sgd': 'SGD', 'momentum': 'momentum'}
        logger.info('Using %s optimizer', printable_names[optimizer_type])
        default_momentum_value = 0.9 if optimizer_type == 'momentum' else 0.0
        momentum = optimizer_params.get('momentum', default_momentum_value)
        nesterov = optimizer_params.get('nesterov', False)
        weight_decay = optimizer_config.get('weight_decay', None)
        common_params = {
            'learning_rate': scheduler,
            'nesterov': nesterov,
            'momentum': momentum
        }
        if weight_decay:
            optimizer = tfa.optimizers.SGDW(**common_params,
                                            weight_decay=weight_decay)
        else:
            optimizer = tf.keras.optimizers.SGD(**common_params)
    elif optimizer_type == 'rmsprop':
        logger.info('Using RMSProp optimizer')
        rho = optimizer_params.get('rho', 0.9)
        momentum = optimizer_params.get('momentum', 0.9)
        epsilon = optimizer_params.get('epsilon', 1e-07)
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=scheduler,
                                                rho=rho,
                                                momentum=momentum,
                                                epsilon=epsilon)
    elif optimizer_type in ['adam', 'adamw']:
        printable_names = {'adam': 'Adam', 'adamw': 'AdamW'}
        logger.info('Using %s optimizer', printable_names[optimizer_type])
        beta_1, beta_2 = optimizer_params.get('betas', [0.9, 0.999])
        epsilon = optimizer_params.get('eps', 1e-07)
        amsgrad = optimizer_params.get('amsgrad', False)
        w_decay_default_value = 0.01 if optimizer_type == 'adamw' else None
        weight_decay = optimizer_config.get('weight_decay',
                                            w_decay_default_value)
        common_params = {
            'learning_rate': scheduler,
            'beta_1': beta_1,
            'beta_2': beta_2,
            'epsilon': epsilon,
            'amsgrad': amsgrad
        }
        if weight_decay:
            optimizer = tfa.optimizers.AdamW(**common_params,
                                             weight_decay=weight_decay)
        else:
            optimizer = tf.keras.optimizers.Adam(**common_params)
    else:
        raise ValueError('Unknown optimizer %s' % optimizer_type)

    moving_average_decay = optimizer_params.get('moving_average_decay', 0.)
    if moving_average_decay > 0.:
        logger.info('Including moving average decay.')
        optimizer = tfa.optimizers.MovingAverage(
            optimizer, average_decay=moving_average_decay, num_updates=None)
    if optimizer_params.get('lookahead', None):
        logger.info('Using lookahead optimizer.')
        optimizer = tfa.optimizers.Lookahead(optimizer)

    return optimizer

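# A minimal usage sketch, assuming `build_optimizer` is importable; the
# config dict below is made up for illustration (any object with .get()
# works here). It selects the SGD-with-momentum branch driven by a
# schedule object.
import tensorflow as tf

config = {'optimizer': {'type': 'momentum',
                        'optimizer_params': {'momentum': 0.9,
                                             'nesterov': True}}}
schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1, decay_steps=1000, decay_rate=0.1)
optimizer = build_optimizer(config, schedule)
print(type(optimizer).__name__)  # SGD
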
def _build_assignment_map(keras_model,
                          prefix='',
                          skip_variables_regex=None,
                          var_to_shape_map=None):
    """Computes an assignment mapping for loading older checkpoints into a
    Keras model. Variable names are remapped from the original TPUEstimator
    model to the new Keras name.

    Args:
        keras_model: tf.keras.Model object to provide variables to assign.
        prefix: prefix in the variable name to be removed for alignment with
            names in the checkpoint.
        skip_variables_regex: regular expression to match the names of
            variables that do not need to be assigned.
        var_to_shape_map: variable name to shape mapping from the checkpoint.

    Returns:
        The variable assignment map.
    """
    assignment_map = {}

    checkpoint_names = None
    if var_to_shape_map:
        predicate = lambda x: not x.endswith('Momentum') and not x.endswith(
            'global_step')
        checkpoint_names = list(filter(predicate, var_to_shape_map.keys()))

    for var in keras_model.variables:
        var_name = var.name

        if skip_variables_regex and re.match(skip_variables_regex, var_name):
            continue
        # Trim the index of the variable.
        if ':' in var_name:
            var_name = var_name[:var_name.rindex(':')]
        if var_name.startswith(prefix):
            var_name = var_name[len(prefix):]

        if not var_to_shape_map:
            assignment_map[var_name] = var
            continue

        # Match the name with variables in the checkpoint.
        match_names = []
        for x in checkpoint_names:
            if x.endswith(var_name):
                match_names.append(x)

        try:
            if match_names:
                assert len(match_names) == 1, \
                    'more than one match for {}: {}'.format(var_name,
                                                            match_names)
                checkpoint_names.remove(match_names[0])
                assignment_map[match_names[0]] = var
            else:
                logger.info('Variable not found in checkpoint: %s', var_name)
        except Exception as ex:
            logger.info('Error removing the match_name: %s', match_names)
            logger.info('Exception: %s', ex)
            raise

    logger.info('Found %d variables in the checkpoint', len(assignment_map))
    return assignment_map

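# A tiny standalone demonstration (with toy names) of the renaming rules
# used above: drop the ':0' index suffix, strip the provided prefix, then
# match by suffix against checkpoint variable names.
var_name = 'retinanet/resnet50/conv2d/kernel:0'
prefix = 'retinanet/'
if ':' in var_name:
    var_name = var_name[:var_name.rindex(':')]
if var_name.startswith(prefix):
    var_name = var_name[len(prefix):]
assert var_name == 'resnet50/conv2d/kernel'
checkpoint_names = ['base/resnet50/conv2d/kernel']
assert [x for x in checkpoint_names if x.endswith(var_name)]
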
def run_evaluation(config, eval_timeout=None):
    """Runs evaluation on the checkpoint save directory."""
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    validation_builder, calibration_builder = get_dataset_builders(
        config, strategy.num_replicas_in_sync)
    calibration_dataset = calibration_builder.build()
    val_dataset = validation_builder.build()
    num_batches = validation_builder.steps_per_epoch
    test_dist_dataset = strategy.experimental_distribute_dataset(val_dataset)

    config.nncf_config = register_default_init_args(
        nncf_config=config.nncf_config,
        data_loader=calibration_dataset,
        batch_size=validation_builder.global_batch_size)

    # We use `model_batch_size` to create the input layer for the model
    config.model_batch_size = validation_builder.batch_size

    model_builder = get_model_builder(config)
    eval_metric = model_builder.eval_metrics()
    predict_post_process_fn = model_builder.post_processing

    if 'test' in config.mode:
        compression_ctrl, compress_model, _ = restore_compressed_model(
            config, strategy, model_builder, config.ckpt_path)
        test_step = create_test_step_fn(strategy, compress_model,
                                        predict_post_process_fn)

        statistics = compression_ctrl.statistics()
        logger.info(statistics.to_str())

        metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                                 num_batches, config.print_freq)
        eval_metric.reset_states()
        logger.info('Test metric = {}'.format(metric_result))

        if 'export' in config.mode:
            save_path, save_format = get_saving_parameters(config)
            compression_ctrl.export_model(save_path, save_format)
            logger.info("Saved to {}".format(save_path))
    elif 'train' in config.mode:
        validation_summary_writer = SummaryWriter(config.log_dir,
                                                  'validation')
        is_first_checkpoint = True
        for checkpoint_path in tf.train.checkpoints_iterator(
                config.checkpoint_save_dir, config.eval_timeout):
            if is_first_checkpoint:
                is_first_checkpoint = False
                _, compress_model, checkpoint = restore_compressed_model(
                    config, strategy, model_builder, checkpoint_path)
                test_step = create_test_step_fn(strategy, compress_model,
                                                predict_post_process_fn)
            else:
                checkpoint.restore(checkpoint_path).expect_partial()

            logger.info('Checkpoint file {} found and restoring from '
                        'checkpoint'.format(checkpoint_path))
            logger.info('Checkpoint step: {}'.format(checkpoint.step.numpy()))

            metric_result = evaluate(test_step, eval_metric,
                                     test_dist_dataset, num_batches,
                                     config.print_freq)
            current_step = checkpoint.step.numpy()
            validation_summary_writer(metrics=metric_result,
                                      step=current_step)
            eval_metric.reset_states()
            logger.info('Validation metric = {}'.format(metric_result))

        validation_summary_writer.close()

    if config.metrics_dump is not None:
        write_metrics(metric_result['AP'], config.metrics_dump)

def train(train_step, train_dist_dataset, initial_epoch, initial_step,
          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl,
          log_dir, optimizer, print_freq):
    train_summary_writer = SummaryWriter(log_dir, 'train')
    compression_summary_writer = SummaryWriter(log_dir, 'compression')

    timer = Timer()
    timer.tic()

    logger.info('Training...')
    for epoch in range(initial_epoch, epochs):
        logger.info('Epoch: {}/{}'.format(epoch, epochs))
        compression_ctrl.scheduler.epoch_step(epoch)

        for step, x in enumerate(train_dist_dataset):
            if epoch == initial_epoch and step < initial_step % steps_per_epoch:
                continue

            checkpoint_manager.checkpoint.step.assign_add(1)

            if step == steps_per_epoch:
                save_path = checkpoint_manager.save()
                logger.info('Saved checkpoint for epoch={}: {}'.format(
                    epoch, save_path))
                break

            compression_ctrl.scheduler.step()
            train_loss = train_step(x)
            train_metric_result = tf.nest.map_structure(
                lambda s: s.numpy().astype(float), train_loss)

            if np.isnan(train_metric_result['total_loss']):
                raise ValueError('total loss is NaN')

            train_metric_result.update(
                {'learning_rate': optimizer.lr(optimizer.iterations).numpy()})

            train_summary_writer(metrics=train_metric_result,
                                 step=optimizer.iterations.numpy())

            if step % print_freq == 0:
                time = timer.toc(average=False)
                logger.info('Step: {}/{} Time: {:.3f} sec'.format(
                    step, steps_per_epoch, time))
                logger.info('Training metric = {}'.format(
                    train_metric_result))
                timer.tic()

        statistics = compression_ctrl.statistics()
        logger.info(statistics.to_str())
        statistics = {
            f'compression/statistics/{name}': value
            for name, value in prepare_for_tensorboard(statistics).items()
        }
        compression_summary_writer(metrics=statistics,
                                   step=optimizer.iterations.numpy())

    train_summary_writer.close()
    compression_summary_writer.close()

def run_train(config):
    strategy = get_distribution_strategy(config)

    # Create dataset
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]
    train_builder, _ = builders
    train_dataset, calibration_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(
        train_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch

    # We use `model_batch_size` to create the input layer for the model
    config.model_batch_size = train_builder.batch_size

    # Create model builder
    model_builder = get_model_builder(config)

    # Register additional parameters in the NNCFConfig for initialization
    # of the compressed model during building
    nncf_config = config.nncf_config
    nncf_config = register_default_init_args(
        nncf_config=nncf_config,
        data_loader=calibration_dataset,
        batch_size=train_builder.global_batch_size)

    resume_training = config.ckpt_path is not None

    compression_state = None
    if resume_training:
        compression_state = load_compression_state(config.ckpt_path)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=True) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, nncf_config, compression_state)

            scheduler = build_scheduler(config=config,
                                        steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            loss_fn = model_builder.build_loss_fn(compress_model,
                                                  compression_ctrl.loss)

            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(
                variables=variables,
                optimizer=optimizer,
                compression_state=TFCompressionState(compression_ctrl),
                step=tf.Variable(0))
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if resume_training:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, config.ckpt_path, steps_per_epoch)

    statistics = compression_ctrl.statistics()
    logger.info(statistics.to_str())

    train_step = create_train_step_fn(strategy, compress_model, loss_fn,
                                      optimizer)

    train(train_step, train_dist_dataset, initial_epoch, initial_step,
          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl,
          config.log_dir, optimizer, config.print_freq)

    logger.info('Compression statistics')
    statistics = compression_ctrl.statistics()
    logger.info(statistics.to_str())