def train_loop(
    pipeline_config_path,
    model_dir,
    val_checkpoint_dir,
    config_override=None,
    train_steps=None,
    use_tpu=False,
    save_final_config=False,
    checkpoint_every_n=1000,
    checkpoint_max_to_keep=7,
    record_summaries=True,
    performance_summary_exporter=None,
    **kwargs):
  """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `checkpoint_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    val_checkpoint_dir: The directory to save the validation checkpoint.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training
      steps is set from the `TrainConfig` proto.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save the final config (obtained after
      applying the overrides) to `model_dir`.
    checkpoint_every_n: Checkpoint every n training steps.
    checkpoint_max_to_keep: int, the number of most recent checkpoints to keep
      in the model directory.
    record_summaries: Boolean, whether or not to record summaries.
    performance_summary_exporter: function for exporting performance metrics.
    **kwargs: Additional keyword arguments for configuration override.
  """
  print('START train loop function ========================')

  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']
  steps_per_sec_list = []

  configs = get_configs_from_pipeline_file(
      pipeline_config_path, config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # Update train_steps from the config, but only when a non-zero value is
  # provided.
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(
        'mixed_bfloat16')

  if train_config.load_all_detection_checkpoint_vars:
    raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
                     'unsupported in TF2')

  config_util.update_fine_tune_checkpoint_type(train_config)
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
  fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

  # Write the as-run pipeline config to disk.
  if save_final_config:
    tf.logging.info('Saving pipeline config file to directory {}'.format(
        model_dir))
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # Build the model, optimizer, and training input.
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'](
        model_config=model_config, is_training=True)

    def train_dataset_fn(input_context):
      """Callable to create train input."""
      # Create the inputs.
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=detection_model,
          input_context=input_context)
      train_input = train_input.repeat()
      return train_input

    train_input = strategy.experimental_distribute_datasets_from_function(
        train_dataset_fn)

    global_step = tf.Variable(
        0, trainable=False, dtype=tf.compat.v2.dtypes.int64,
        name='global_step',
        aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    # We run the detection_model on dummy inputs in order to ensure that the
    # model and all its variables have been properly constructed. Specifically,
    # this is currently necessary prior to (potentially) creating shadow copies
    # of the model variables for the EMA optimizer.
    if train_config.optimizer.use_moving_average:
      _ensure_model_is_built(detection_model, train_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  # Get the appropriate filepath (temporary or not) based on whether the
  # worker is the chief.
  summary_writer_filepath = get_filepath(strategy,
                                         os.path.join(model_dir, 'train'))
  if record_summaries:
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)
  else:
    summary_writer = tf2.summary.create_noop_writer()

  if use_tpu:
    num_steps_per_iteration = 100
  else:
    # TODO(b/135933080) Explore setting to 100 when GPU performance issues
    # are fixed.
    num_steps_per_iteration = 1

  with summary_writer.as_default():
    with strategy.scope():
      with tf.compat.v2.summary.record_if(
          lambda: global_step % num_steps_per_iteration == 0):
        # Load a fine-tuning checkpoint.
        if train_config.fine_tune_checkpoint:
          load_fine_tune_checkpoint(
              detection_model, train_config.fine_tune_checkpoint,
              fine_tune_checkpoint_type, fine_tune_checkpoint_version,
              train_config.run_fine_tune_checkpoint_dummy_computation,
              train_input, unpad_groundtruth_tensors)

        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)
        val_ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        manager_dir = get_filepath(strategy, model_dir)
        val_manager_dir = get_filepath(strategy, val_checkpoint_dir)

        # if not strategy.extended.should_checkpoint:
        #   checkpoint_max_to_keep = 1
        checkpoint_max_to_keep = 1
        manager = tf.compat.v2.train.CheckpointManager(
            ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)
        val_manager = tf.compat.v2.train.CheckpointManager(
            val_ckpt, val_manager_dir, max_to_keep=checkpoint_max_to_keep)

        model_checkpoint_callback = tfc.ModelCheckpoint(val_manager)
        early_stopping_callback = tfc.EarlyStopping(
            min_delta=0.0001, patience=5, mode='min')
        train_logger_callback = tfc.TrainLogger(model_dir, 'logs.txt')
        cancellation_point = tfc.CancellationPoint()

        # We use the following instead of manager.latest_checkpoint because
        # manager_dir does not point to the model directory when we are
        # running in a worker.
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        ckpt.restore(latest_checkpoint)
        val_ckpt.restore(latest_checkpoint)

        def train_step_fn(features, labels):
          """Single train step."""
          loss = eager_train_step(
              detection_model,
              features,
              labels,
              unpad_groundtruth_tensors,
              optimizer,
              learning_rate=learning_rate_fn(),
              add_regularization_loss=add_regularization_loss,
              clip_gradients_value=clip_gradients_value,
              global_step=global_step,
              num_replicas=strategy.num_replicas_in_sync)
          global_step.assign_add(1)
          return loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
          features, labels = data_iterator.next()
          if hasattr(tf.distribute.Strategy, 'run'):
            per_replica_losses = strategy.run(
                train_step_fn, args=(features, labels))
          else:
            per_replica_losses = strategy.experimental_run_v2(
                train_step_fn, args=(features, labels))
          # TODO(anjalisridhar): explore if it is safe to remove the
          # num_replicas scaling of the loss and switch this to a
          # ReduceOp.Mean.
          return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses, axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
          """A distributed train step."""
          if num_steps_per_iteration > 1:
            for _ in tf.range(num_steps_per_iteration - 1):
              # Following suggestion on yaqs/5402607292645376
              with tf.name_scope(''):
                _sample_and_train(strategy, train_step_fn, data_iterator)
          return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        if int(global_step.value()) == 0:
          manager.save()

        checkpointed_step = int(global_step.value())
        logged_step = global_step.value()

        # num_epochs = (train_steps - global_step.value()) // num_steps_per_iteration
        last_step_time = time.time()
        for epoch, _ in enumerate(
            range(global_step.value(), train_steps, num_steps_per_iteration)):
          loss = _dist_train_step(train_input_iter)

          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

          tf.compat.v2.summary.scalar(
              'steps_per_sec', steps_per_sec, step=global_step)
          steps_per_sec_list.append(steps_per_sec)

          if global_step.value() - logged_step >= 100:
            tf.logging.info(
                'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                    global_step.value(), time_taken / num_steps_per_iteration,
                    loss))
            manager.save()
            checkpointed_step = int(global_step.value())

            log_metrics = eval_continuously(
                pipeline_config_path, model_dir=model_dir,
                checkpoint_dir=model_dir, timeout=20)
            log_metrics['train_total_loss'] = loss

            model_checkpoint_callback.step(epoch,
                                           log_metrics['Loss/total_loss'])
            stop_training = early_stopping_callback.step(
                epoch, log_metrics['Loss/total_loss'])
            train_logger_callback.log(log_metrics)

            if stop_training or cancellation_point.check():
              break

            print(log_metrics)
            logged_step = global_step.value()

  # Remove the checkpoint directories of the non-chief workers that
  # MultiWorkerMirroredStrategy forces us to save during sync distributed
  # training.
  clean_temporary_directories(strategy, manager_dir)
  clean_temporary_directories(strategy, summary_writer_filepath)

  # TODO(pkanwar): add accuracy metrics.
  if performance_summary_exporter is not None:
    metrics = {
        'steps_per_sec': np.mean(steps_per_sec_list),
        'steps_per_sec_p50': np.median(steps_per_sec_list),
        'steps_per_sec_max': max(steps_per_sec_list),
        'last_batch_loss': float(loss)
    }
    mixed_precision = 'bf16' if kwargs['use_bfloat16'] else 'fp32'
    performance_summary_exporter(metrics, mixed_precision)
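

# Usage sketch (illustration only, not part of the original module): a minimal
# call to train_loop as defined above. The config path, output directories, and
# step counts below are hypothetical placeholders.
def _example_train_loop_usage():
  train_loop(
      pipeline_config_path='configs/pipeline.config',
      model_dir='training/model_dir',
      val_checkpoint_dir='training/val_checkpoints',
      train_steps=25000,
      checkpoint_every_n=1000,
      record_summaries=True)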
def train(zones, epochs=1, batch_size=32, learning_rate=0.001, version=None,
          gpus=None, mtype='vgg16', starting_model_file=None, img_dim=224,
          channels=1, train_layer_start=None):
    if not isinstance(zones, list):
        zones = [zones]
    key_zone = zones[0]

    # This gets the total number of zones in the batches, which informs the
    # step counts (found the 'sum' flattening trick on Stack Overflow).
    zone_count = len(sum(zone_aps_generator2.ZONE_COMBOS_DICT[key_zone], []))
    print(f"zone_count: {zone_count}")

    # data_shape = sd.zones_max_dict(round_up=True)[zones[0]]
    data_shape = (len(zone_aps_generator2.ZONE_SLICE_DICT[key_zone]),) + (img_dim, img_dim)
    img_scale = True if mtype == 'vgg16' else False

    train_batches = get_batches_aps_train('train', zones, data_shape, channels=channels,
                                          batch_size=batch_size, shuffle=True, img_scale=img_scale)
    steps_per_epoch = math.ceil(train_batches.samples / train_batches.batch_size) * 2 * zone_count
    print(f"training sample size: {train_batches.samples}")
    print(f"training batch size: {train_batches.batch_size}, steps: {steps_per_epoch}")

    val_batches = get_batches_aps_train('valid', zones, data_shape, channels=channels,
                                        batch_size=batch_size, shuffle=True, img_scale=img_scale)
    validation_steps = math.ceil(val_batches.samples / val_batches.batch_size) * 2 * zone_count
    print(f"validation sample size: {val_batches.samples}")
    print(f"validation batch size: {val_batches.batch_size}, steps: {validation_steps}")

    #----------------------------------
    train_model = None
    if starting_model_file is not None:
        # https://github.com/fchollet/keras/issues/6865 (why the saved model must be compiled)
        smf_path = os.path.join(config.PSCREENING_HOME, config.MODEL_DIR, starting_model_file)
        train_model = tf.keras.models.load_model(smf_path)
        if train_layer_start is not None:
            _set_trainable(train_model, train_layer_start)
        _, _, mtype, _, _ = _model_params(starting_model_file)
    else:
        ps_model = _get_model(mtype, output=len(zones), multi_gpu=(gpus is not None),
                              train_layer_start=train_layer_start)
        # TODO: create the model with None as the time dimension? When looking at the code,
        # it looked like TimeDistributed acts differently when None is passed as opposed to
        # a fixed dimension.
        ps_model.create(input_shape=train_batches.data_shape)
        train_model = ps_model.model

    if gpus is not None:
        train_model = tf_util.multi_gpu_model(train_model, gpus)

    train_model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

    model_version = f"zone{zones[0]}-{mtype}-d{img_dim}-c{channels}-e{epochs}-bs{batch_size}-lr{str(learning_rate).split('.')[1]}"
    model_version += f"-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"
    if version is not None:
        model_version += f"-{version}"
    print(f"model_version: {model_version}")

    model_version_el = model_version + "-{epoch:02d}-{val_loss:.3f}"
    model_file = model_version_el + '.h5'
    model_file = os.path.join(config.PSCREENING_HOME, config.MODEL_DIR, model_file)

    cb_model_save = callbacks.ModelCheckpoint(model_file, multi_gpu=(gpus is not None))

    weight1 = round(0.9 ** min(zone_count, 6), 2)
    train_model.fit_generator(train_batches,
                              steps_per_epoch=steps_per_epoch,
                              epochs=epochs,
                              validation_data=val_batches,
                              validation_steps=validation_steps,
                              callbacks=[cb_model_save],
                              class_weight={0: 1 - weight1, 1: weight1})

    return model_version
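

# Usage sketch (the zone id and hyperparameters below are hypothetical
# placeholders, not values taken from the original project).
def _example_train_usage():
    model_version = train(zones=1, epochs=5, batch_size=16,
                          learning_rate=0.001, mtype='vgg16',
                          img_dim=224, channels=1)
    print(f"trained model version: {model_version}")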
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--setting', '-x', help='Setting to use', required=True)
    args = parser.parse_args()
    setting = settings.load_setting(args.setting)

    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    root_folder = os.path.join(
        setting.save_folder,
        '%s_%d_%s' % (args.setting, os.getpid(), time_string))
    if not os.path.exists(root_folder):
        os.makedirs(root_folder)

    setting.model.compile(
        optimizer=setting.optimizer,
        loss=losses.SparseCategoricalCrossentropy(
            class_weight=setting.metrics_weights),
        metrics=[
            metrics.LearningRateMetric(setting.optimizer, name='lr'),
            metrics.SparseCategoricalAccuracy(
                class_weight=setting.metrics_weights, name='pixel_acc'),
            metrics.SparseCategoricalMeanAccuracy(
                setting.dataset.num_class,
                class_weight=setting.metrics_weights,
                name='mean_acc'),
            metrics.SparseCategoricalMeanIoU(
                setting.dataset.num_class,
                class_weight=setting.metrics_weights,
                name='mean_iou')
        ])

    # Backup code
    file_util.backup_code(root_folder)

    logging.info("CUDA_VISIBLE_DEVICES: %s" % setting.gpu_id)
    logging.info('PID: %d', os.getpid())
    logging.info(str(args))

    model_callbacks = [
        # Interrupt training if `val_loss` stops improving for over 2 epochs
        # tf.keras.callbacks.EarlyStopping(patience=2, monitor='loss'),
        # Write TensorBoard logs to the `root_folder` directory
        tf.keras.callbacks.TensorBoard(log_dir=root_folder),
        # Save the model with the best IoU
        callbacks.ModelCheckpoint(
            filepath=os.path.join(
                root_folder,
                setting.model.name + '-{epoch:03d}-{val_mean_iou:.4f}.hdf5'),
            monitor='val_mean_iou',
            mode='max',
            save_best_only=True,
            save_weights_only=True)
    ]

    dataset_train = setting.dataset.create_train_dataset()
    dataset_test = setting.dataset.create_test_dataset()

    # custom_entry_flow_conv1_1 = setting.model.get_layer(name='custom_entry_flow_conv1_1')
    # print('custom_entry_flow_conv1_1', custom_entry_flow_conv1_1.get_weights())
    # for layer in setting.model.layers:
    #     layer.trainable = False
    # setting.model.build(input_shape=tuple([2] + setting.dataset.get_input_shape()))
    # setting.model.summary()

    setting.model.fit(
        dataset_train,
        # epochs=1,
        epochs=setting.num_epochs,
        # steps_per_epoch=2,
        steps_per_epoch=setting.dataset.get_trainval_size() // setting.batch_size,
        validation_data=dataset_test,
        validation_freq=setting.validation_freq,
        callbacks=model_callbacks)
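

# Standalone sketch (illustration only, not part of the script above): the same
# callback wiring with stock tf.keras components on a toy model, to show how
# TensorBoard logging and best-checkpoint saving combine in model.fit(). The
# model, data, and log directory are hypothetical placeholders.
def _toy_callback_demo(log_dir='/tmp/toy_logs'):
    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=(4,)),
        tf.keras.layers.Dense(3, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    demo_callbacks = [
        # Write TensorBoard logs for loss and accuracy curves.
        tf.keras.callbacks.TensorBoard(log_dir=log_dir),
        # Keep only the weights of the epoch with the best validation accuracy.
        tf.keras.callbacks.ModelCheckpoint(
            filepath=log_dir + '/best-{epoch:03d}-{val_accuracy:.4f}.hdf5',
            monitor='val_accuracy', mode='max',
            save_best_only=True, save_weights_only=True),
    ]

    x = np.random.rand(64, 4).astype('float32')
    y = np.random.randint(0, 3, size=(64,))
    model.fit(x, y, validation_split=0.25, epochs=3, callbacks=demo_callbacks)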
def train_net(train, val, model, name):
    transformations_train = transforms.apply_chain([
        transforms.random_fliplr(),
        transforms.random_flipud(),
        transforms.augment(),
        torchvision.transforms.ToTensor()
    ])

    transformations_val = transforms.apply_chain([
        torchvision.transforms.ToTensor(),
    ])

    dset_train = KaggleAmazonJPGDataset(train, paths.train_jpg,
                                        transformations_train, divide=False)
    train_loader = DataLoader(dset_train,
                              batch_size=64,
                              shuffle=True,
                              num_workers=10,
                              pin_memory=True)

    dset_val = KaggleAmazonJPGDataset(val, paths.train_jpg,
                                      transformations_val, divide=False)
    val_loader = DataLoader(dset_val,
                            batch_size=64,
                            num_workers=10,
                            pin_memory=True)

    ignored_params = list(map(id, chain(
        model.classifier.parameters(),
        model.layer1.parameters(),
        model.layer2.parameters(),
        model.layer3.parameters(),
        model.layer4.parameters()
    )))
    base_params = filter(lambda p: id(p) not in ignored_params,
                         model.parameters())

    optimizer = optim.Adam([
        {'params': base_params},
        {'params': model.layer1.parameters()},
        {'params': model.layer2.parameters()},
        {'params': model.layer3.parameters()},
        {'params': model.layer4.parameters()},
        {'params': model.classifier.parameters()}
    ], lr=0, weight_decay=0.0005)

    trainer = ModuleTrainer(model)

    def schedule(current_epoch, current_lrs, **logs):
        lrs = [1e-3, 1e-4, 1e-5]
        epochs = [0, 2, 10]

        for lr, epoch in zip(lrs, epochs):
            if current_epoch >= epoch:
                current_lrs[5] = lr
                if current_epoch >= 1:
                    current_lrs[4] = lr * 0.4
                    current_lrs[3] = lr * 0.2
                    current_lrs[2] = lr * 0.1
                    current_lrs[1] = lr * 0.05
                    current_lrs[0] = lr * 0.01

        return current_lrs

    trainer.set_callbacks([
        callbacks.ModelCheckpoint(
            paths.models,
            name,
            save_best_only=False,
            saving_strategy=lambda epoch: True
        ),
        CSVLogger('./logs/' + name),
        LearningRateScheduler(schedule)
    ])

    trainer.compile(loss=nn.BCELoss(), optimizer=optimizer)
    trainer.fit_loader(train_loader,
                       val_loader,
                       nb_epoch=35,
                       verbose=1,
                       cuda_device=0)
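

# Plain-PyTorch sketch (assumption: this mirrors what the torchsample-style
# LearningRateScheduler above does with `current_lrs`, i.e. one learning rate
# per optimizer param group, with the backbone groups scaled down relative to
# the classifier head). It expects an optimizer built with six param groups in
# the same order as the Adam optimizer above.
def apply_schedule(optimizer, current_epoch):
    lrs = [1e-3, 1e-4, 1e-5]
    epochs = [0, 2, 10]

    # Start from the current per-group learning rates.
    group_lrs = [group['lr'] for group in optimizer.param_groups]
    for lr, epoch in zip(lrs, epochs):
        if current_epoch >= epoch:
            group_lrs[5] = lr                  # classifier head at full rate
            if current_epoch >= 1:             # backbone groups at reduced rates
                group_lrs[4] = lr * 0.4
                group_lrs[3] = lr * 0.2
                group_lrs[2] = lr * 0.1
                group_lrs[1] = lr * 0.05
                group_lrs[0] = lr * 0.01

    # Write the rates back into the optimizer.
    for group, lr in zip(optimizer.param_groups, group_lrs):
        group['lr'] = lr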
# learning_rate_updator = LearningRateUpdator(init_lr=0.001)

# NOTE: this assignment rebinds the name `callbacks`, shadowing the callbacks
# module used to construct the list.
callbacks = [
    # Interrupts training when improvement stops
    callbacks.EarlyStopping(
        # Monitors the model's validation loss
        monitor='val_loss',
        # Interrupts training when val_loss has stopped improving
        # for 5 epochs
        patience=5,
    ),
    # Saves the model weights during training
    callbacks.ModelCheckpoint(
        # Path to the destination model file
        filepath='saved_models/east_1031_7900_1976_{epoch:03d}_{loss:.4f}_{val_loss:.4f}.hdf5',
        # These two arguments mean you won't overwrite the model file unless
        # val_loss has improved, which allows you to keep the best model seen
        # during training.
        monitor='val_loss',
        save_best_only=True,
        verbose=1),
    training_monitor,
    # accuracy_evaluator
    # learning_rate_updator
]

train_gen = HDF5DatasetGenerator('data/train_7900.hdf5', batch_size=1).generator
val_gen = HDF5DatasetGenerator('data/val_1976.hdf5', batch_size=1).generator

H = model.fit_generator(train_gen(),
                        steps_per_epoch=7900,
                        callbacks=callbacks,
                        epochs=100,