Example #1
0
def train_loop(
    pipeline_config_path,
    model_dir,
    val_checkpoint_dir,
    config_override=None,
    train_steps=None,
    use_tpu=False,
    save_final_config=False,
    checkpoint_every_n=1000,
    checkpoint_max_to_keep=7,
    record_summaries=True,
    performance_summary_exporter=None,
    **kwargs):
  """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `checkpoint_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    val_checkpoint_dir: The directory to save validation checkpoints to.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training steps
      is set from the `TrainConfig` proto.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    checkpoint_every_n: Checkpoint every n training steps.
    checkpoint_max_to_keep: int, the number of most recent checkpoints to keep
      in the model directory.
    record_summaries: Boolean, whether or not to record summaries.
    performance_summary_exporter: function for exporting performance metrics.
    **kwargs: Additional keyword arguments for configuration override.
  """

  print('START train loop function ========================')

  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']
  steps_per_sec_list = []

  configs = get_configs_from_pipeline_file(
      pipeline_config_path, config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # update train_steps from config but only when non-zero value is provided
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

  if train_config.load_all_detection_checkpoint_vars:
    raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
                     'unsupported in TF2')

  config_util.update_fine_tune_checkpoint_type(train_config)
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
  fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

  # Write the as-run pipeline config to disk.
  if save_final_config:
    tf.logging.info('Saving pipeline config file to directory {}'.format(
        model_dir))
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # Build the model, optimizer, and training input
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'](
        model_config=model_config, is_training=True)

    def train_dataset_fn(input_context):
      """Callable to create train input."""
      # Create the inputs.
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=detection_model,
          input_context=input_context)
      train_input = train_input.repeat()
      return train_input

    train_input = strategy.experimental_distribute_datasets_from_function(
        train_dataset_fn)


    global_step = tf.Variable(
        0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step',
        aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    # We run the detection_model on dummy inputs in order to ensure that the
    # model and all its variables have been properly constructed. Specifically,
    # this is currently necessary prior to (potentially) creating shadow copies
    # of the model variables for the EMA optimizer.
    if train_config.optimizer.use_moving_average:
      _ensure_model_is_built(detection_model, train_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  # Get the appropriate filepath (temporary or not) based on whether the worker
  # is the chief.
  summary_writer_filepath = get_filepath(strategy,
                                         os.path.join(model_dir, 'train'))
  if record_summaries:
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)
  else:
    summary_writer = tf2.summary.create_noop_writer()

  if use_tpu:
    num_steps_per_iteration = 100
  else:
    # TODO(b/135933080) Explore setting to 100 when GPU performance issues
    # are fixed.
    num_steps_per_iteration = 1

  with summary_writer.as_default():
    with strategy.scope():
      with tf.compat.v2.summary.record_if(
          lambda: global_step % num_steps_per_iteration == 0):
        # Load a fine-tuning checkpoint.
        if train_config.fine_tune_checkpoint:
          load_fine_tune_checkpoint(
              detection_model, train_config.fine_tune_checkpoint,
              fine_tune_checkpoint_type, fine_tune_checkpoint_version,
              train_config.run_fine_tune_checkpoint_dummy_computation,
              train_input, unpad_groundtruth_tensors)

        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)
        val_ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        manager_dir = get_filepath(strategy, model_dir)
        val_manager_dir = get_filepath(strategy, val_checkpoint_dir)

        # Non-chief workers only need their most recent (temporary) checkpoint,
        # so cap max_to_keep for them instead of overriding the caller's
        # `checkpoint_max_to_keep` setting.
        if not strategy.extended.should_checkpoint:
          checkpoint_max_to_keep = 1
        manager = tf.compat.v2.train.CheckpointManager(
            ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)
        val_manager = tf.compat.v2.train.CheckpointManager(
            val_ckpt, val_manager_dir, max_to_keep=checkpoint_max_to_keep)

        model_checkpoint_callback = tfc.ModelCheckpoint(val_manager)
        early_stopping_callback = tfc.EarlyStopping(
            min_delta=0.0001, patience=5, mode='min')
        train_logger_callback = tfc.TrainLogger(model_dir, 'logs.txt')
        cancellation_point = tfc.CancellationPoint()

        # We use the following instead of manager.latest_checkpoint because
        # manager_dir does not point to the model directory when we are running
        # in a worker.
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        ckpt.restore(latest_checkpoint)
        val_ckpt.restore(latest_checkpoint)

        def train_step_fn(features, labels):
          """Single train step."""
          loss = eager_train_step(
              detection_model,
              features,
              labels,
              unpad_groundtruth_tensors,
              optimizer,
              learning_rate=learning_rate_fn(),
              add_regularization_loss=add_regularization_loss,
              clip_gradients_value=clip_gradients_value,
              global_step=global_step,
              num_replicas=strategy.num_replicas_in_sync)
          global_step.assign_add(1)
          return loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
          features, labels = data_iterator.next()
          if hasattr(tf.distribute.Strategy, 'run'):
            per_replica_losses = strategy.run(
                train_step_fn, args=(features, labels))
          else:
            per_replica_losses = strategy.experimental_run_v2(
                train_step_fn, args=(features, labels))
          # TODO(anjalisridhar): explore if it is safe to remove the
          ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
          return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses, axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
          """A distributed train step."""

          if num_steps_per_iteration > 1:
            for _ in tf.range(num_steps_per_iteration - 1):
              # Following suggestion on yaqs/5402607292645376
              with tf.name_scope(''):
                _sample_and_train(strategy, train_step_fn, data_iterator)

          return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        if int(global_step.value()) == 0:
          manager.save()

        checkpointed_step = int(global_step.value())
        logged_step = global_step.value()

        # num_epochs = (train_steps - global_step.value()) // num_steps_per_iteration

        last_step_time = time.time()
        for epoch, _ in enumerate(
            range(global_step.value(), train_steps, num_steps_per_iteration)):

          loss = _dist_train_step(train_input_iter)

          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

          tf.compat.v2.summary.scalar(
              'steps_per_sec', steps_per_sec, step=global_step)

          steps_per_sec_list.append(steps_per_sec)

          if global_step.value() - logged_step >= 100:
            tf.logging.info(
                'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                    global_step.value(), time_taken / num_steps_per_iteration,
                    loss))

            manager.save()
            checkpointed_step = int(global_step.value())

            log_metrics = eval_continuously(
                pipeline_config_path, model_dir=model_dir,
                checkpoint_dir=model_dir, timeout=20)
            log_metrics['train_total_loss'] = loss

            model_checkpoint_callback.step(epoch, log_metrics['Loss/total_loss'])
            stop_training = early_stopping_callback.step(
                epoch, log_metrics['Loss/total_loss'])
            train_logger_callback.log(log_metrics)

            if stop_training or cancellation_point.check():
              break

            print(log_metrics)
            logged_step = global_step.value()

    

  # Remove the checkpoint directories of the non-chief workers that
  # MultiWorkerMirroredStrategy forces us to save during sync distributed
  # training.
  clean_temporary_directories(strategy, manager_dir)
  clean_temporary_directories(strategy, summary_writer_filepath)
  # TODO(pkanwar): add accuracy metrics.
  if performance_summary_exporter is not None:
    metrics = {
        'steps_per_sec': np.mean(steps_per_sec_list),
        'steps_per_sec_p50': np.median(steps_per_sec_list),
        'steps_per_sec_max': max(steps_per_sec_list),
        'last_batch_loss': float(loss)
    }
    mixed_precision = 'bf16' if kwargs['use_bfloat16'] else 'fp32'
    performance_summary_exporter(metrics, mixed_precision)
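
A minimal usage sketch for the function above, assuming the surrounding Object Detection API imports are available; the config path and directories are hypothetical placeholders, not files from this project.

# Hypothetical invocation of train_loop; the paths below are illustrative only.
if __name__ == '__main__':
  train_loop(
      pipeline_config_path='configs/ssd_pipeline.config',  # hypothetical
      model_dir='training/model_dir',                      # hypothetical
      val_checkpoint_dir='training/val_checkpoints',       # hypothetical
      train_steps=20000,
      use_tpu=False,
      checkpoint_every_n=1000,
      record_summaries=True)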
Example #2
0
def train(zones, epochs=1, batch_size=32, learning_rate=0.001,
          version=None, gpus=None, mtype='vgg16', starting_model_file=None,
          img_dim=224, channels=1, train_layer_start=None):
    if not isinstance(zones, list): zones = [zones]
    
    key_zone = zones[0]
    # This will get the total number of zones in the batches which will inform the steps
    # (found the 'sum' trick on stack overflow)
    zone_count = len(sum(zone_aps_generator2.ZONE_COMBOS_DICT[key_zone],[]))
    print(f"zone_count: {zone_count}")
    
    #data_shape = sd.zones_max_dict(round_up=True)[zones[0]]
    data_shape = (len(zone_aps_generator2.ZONE_SLICE_DICT[key_zone]),) + (img_dim, img_dim)

    img_scale = (mtype == 'vgg16')
    train_batches = get_batches_aps_train('train', zones, data_shape,
                                          channels=channels, batch_size=batch_size,
                                          shuffle=True, img_scale=img_scale)
    steps_per_epoch = math.ceil(train_batches.samples / train_batches.batch_size) * 2 * zone_count
    print(f"training sample size: {train_batches.samples}")
    print(f"training batch size: {train_batches.batch_size}, steps: {steps_per_epoch}")

    val_batches = get_batches_aps_train('valid', zones, data_shape,
                                        channels=channels, batch_size=batch_size,
                                        shuffle=True, img_scale=img_scale)
    validation_steps = math.ceil(val_batches.samples / val_batches.batch_size) * 2 * zone_count
    print(f"validation sample size: {val_batches.samples}")
    print(f"validation batch size: {val_batches.batch_size}, steps: {validation_steps}")
    
    #----------------------------------
    train_model = None
    if starting_model_file is not None:
        # https://github.com/fchollet/keras/issues/6865 (why the saved model
        # must be recompiled after loading)
        smf_path = os.path.join(config.PSCREENING_HOME, config.MODEL_DIR, starting_model_file)
        train_model = tf.keras.models.load_model(smf_path)
        if train_layer_start is not None:
            _set_trainable(train_model, train_layer_start)
        _, _, mtype, _, _ = _model_params(starting_model_file)
    else:
        ps_model = _get_model(mtype, output=len(zones), multi_gpu=(gpus is not None),
                              train_layer_start=train_layer_start)
        # TODO: create the model with None as the time dimension? From the code,
        # TimeDistributed appears to behave differently when None is passed
        # as opposed to a fixed dimension.
        ps_model.create(input_shape=train_batches.data_shape)
        train_model = ps_model.model
    
    if gpus is not None:
        train_model = tf_util.multi_gpu_model(train_model, gpus)
    
    train_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                        loss='binary_crossentropy', metrics=['accuracy'])

    model_version = f"zone{zones[0]}-{mtype}-d{img_dim}-c{channels}-e{epochs}-bs{batch_size}-lr{str(learning_rate).split('.')[1]}"
    model_version += f"-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" 
    if version is not None:
        model_version += f"-{version}"
    print(f"model_version: {model_version}")
    model_version_el = model_version + "-{epoch:02d}-{val_loss:.3f}"
    
    model_file = model_version_el + '.h5'
    model_file = os.path.join(config.PSCREENING_HOME, config.MODEL_DIR, model_file)
    
    cb_model_save = callbacks.ModelCheckpoint(model_file, multi_gpu=(gpus is not None))
    
    weight1 = round(0.9 ** min(zone_count, 6), 2)
    train_model.fit_generator(train_batches,
                              steps_per_epoch=steps_per_epoch,
                              epochs=epochs,
                              validation_data=val_batches, 
                              validation_steps=validation_steps,
                              callbacks=[cb_model_save],
                              class_weight={0:1-weight1, 1:weight1})
         
    return model_version
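
A small worked sketch of the steps-per-epoch arithmetic used above, with made-up numbers; the sample count, batch size and zone combos are assumptions, not values from the real generator.

import math

# Hypothetical values: 1000 training samples, batch size 32, and a key zone
# whose combo lists flatten to 4 zones (sum(list_of_lists, []) concatenates
# the sub-lists before len() is taken).
samples, batch_size = 1000, 32
zone_combos = [[1, 2], [3, 4]]          # stand-in for ZONE_COMBOS_DICT[key_zone]
zone_count = len(sum(zone_combos, []))  # 4

steps_per_epoch = math.ceil(samples / batch_size) * 2 * zone_count
print(steps_per_epoch)                  # 32 * 2 * 4 = 256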
Example #3
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--setting',
                        '-x',
                        help='Setting to use',
                        required=True)
    args = parser.parse_args()
    setting = settings.load_setting(args.setting)

    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    root_folder = os.path.join(
        setting.save_folder,
        '%s_%d_%s' % (args.setting, os.getpid(), time_string))
    if not os.path.exists(root_folder):
        os.makedirs(root_folder)

    setting.model.compile(optimizer=setting.optimizer,
                          loss=losses.SparseCategoricalCrossentropy(
                              class_weight=setting.metrics_weights),
                          metrics=[
                              metrics.LearningRateMetric(setting.optimizer,
                                                         name='lr'),
                              metrics.SparseCategoricalAccuracy(
                                  class_weight=setting.metrics_weights,
                                  name='pixel_acc'),
                              metrics.SparseCategoricalMeanAccuracy(
                                  setting.dataset.num_class,
                                  class_weight=setting.metrics_weights,
                                  name='mean_acc'),
                              metrics.SparseCategoricalMeanIoU(
                                  setting.dataset.num_class,
                                  class_weight=setting.metrics_weights,
                                  name='mean_iou')
                          ])
    # Backup code
    file_util.backup_code(root_folder)
    logging.info("CUDA_VISIBLE_DEVICES: %s" % setting.gpu_id)
    logging.info('PID: %d', os.getpid())
    logging.info(str(args))

    model_callbacks = [
        # Interrupt training if `val_loss` stops improving for over 2 epochs
        # tf.keras.callbacks.EarlyStopping(patience=2, monitor='loss'),
        # Write TensorBoard logs to `root_folder` directory
        tf.keras.callbacks.TensorBoard(log_dir=root_folder),
        # Save the model with best iou
        callbacks.ModelCheckpoint(filepath=os.path.join(
            root_folder, setting.model.name + '-{epoch:03d}-'
            '{val_mean_iou:.4f}'
            '.hdf5'),
                                  monitor='val_mean_iou',
                                  mode='max',
                                  save_best_only=True,
                                  save_weights_only=True)
    ]
    dataset_train = setting.dataset.create_train_dataset()
    dataset_test = setting.dataset.create_test_dataset()
    # custom_entry_flow_conv1_1 = setting.model.get_layer(name='custom_entry_flow_conv1_1')
    # print('custom_entry_flow_conv1_1', custom_entry_flow_conv1_1.get_weights())
    # for layer in setting.model.layers:
    #     layer.trainable = False
    # setting.model.build(input_shape=tuple([2] + setting.dataset.get_input_shape()))
    # setting.model.summary()
    setting.model.fit(
        dataset_train,
        # epochs=1,
        epochs=setting.num_epochs,
        # steps_per_epoch=2,
        steps_per_epoch=(setting.dataset.get_trainval_size() //
                         setting.batch_size),
        validation_data=dataset_test,
        validation_freq=setting.validation_freq,
        callbacks=model_callbacks)
Example #4
0
def train_net(train, val, model, name):
    transformations_train = transforms.apply_chain([
        transforms.random_fliplr(),
        transforms.random_flipud(),
        transforms.augment(),
        torchvision.transforms.ToTensor()
    ])

    transformations_val = transforms.apply_chain([
        torchvision.transforms.ToTensor(),
    ])

    dset_train = KaggleAmazonJPGDataset(train, paths.train_jpg, transformations_train, divide=False)
    train_loader = DataLoader(dset_train,
                              batch_size=64,
                              shuffle=True,
                              num_workers=10,
                              pin_memory=True)

    dset_val = KaggleAmazonJPGDataset(val, paths.train_jpg, transformations_val, divide=False)
    val_loader = DataLoader(dset_val,
                            batch_size=64,
                            num_workers=10,
                            pin_memory=True)

    ignored_params = list(map(id, chain(
        model.classifier.parameters(),
        model.layer1.parameters(),
        model.layer2.parameters(),
        model.layer3.parameters(),
        model.layer4.parameters()
    )))
    base_params = filter(lambda p: id(p) not in ignored_params,
                         model.parameters())

    optimizer = optim.Adam([
        {'params': base_params},
        {'params': model.layer1.parameters()},
        {'params': model.layer2.parameters()},
        {'params': model.layer3.parameters()},
        {'params': model.layer4.parameters()},
        {'params': model.classifier.parameters()}
    ], lr=0, weight_decay=0.0005)

    trainer = ModuleTrainer(model)

    def schedule(current_epoch, current_lrs, **logs):
        lrs = [1e-3, 1e-4, 1e-5]
        epochs = [0, 2, 10]

        for lr, epoch in zip(lrs, epochs):
            if current_epoch >= epoch:
                current_lrs[5] = lr
                if current_epoch >= 1:
                    current_lrs[4] = lr * 0.4
                    current_lrs[3] = lr * 0.2
                    current_lrs[2] = lr * 0.1
                    current_lrs[1] = lr * 0.05
                    current_lrs[0] = lr * 0.01

        return current_lrs

    trainer.set_callbacks([
        callbacks.ModelCheckpoint(
            paths.models,
            name,
            save_best_only=False,
            saving_strategy=lambda epoch: True
        ),
        CSVLogger('./logs/' + name),
        LearningRateScheduler(schedule)
    ])

    trainer.compile(loss=nn.BCELoss(),
                    optimizer=optimizer)

    trainer.fit_loader(train_loader,
                       val_loader,
                       nb_epoch=35,
                       verbose=1,
                       cuda_device=0)
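
The schedule above assigns a different learning rate to each of the six optimizer parameter groups (base, layer1..layer4, classifier). A minimal sketch of the same idea in plain PyTorch, assuming an optimizer built with that group ordering; it is an illustration, not the trainer's actual callback.

def apply_schedule(optimizer, current_epoch):
    # Same breakpoints as `schedule` above: 1e-3 from epoch 0, 1e-4 from
    # epoch 2, 1e-5 from epoch 10.
    lrs = [1e-3, 1e-4, 1e-5]
    epochs = [0, 2, 10]
    lr = lrs[0]
    for candidate_lr, start_epoch in zip(lrs, epochs):
        if current_epoch >= start_epoch:
            lr = candidate_lr

    # Per-group scale factors, from the earliest (base) layers to the classifier.
    scales = [0.01, 0.05, 0.1, 0.2, 0.4, 1.0]
    for group, scale in zip(optimizer.param_groups, scales):
        # Before epoch 1, only the classifier group is updated; the other
        # groups keep the initial lr of 0 set when the optimizer was built.
        if scale == 1.0 or current_epoch >= 1:
            group['lr'] = lr * scale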
Example #5
0
# learning_rate_updator = LearningRateUpdator(init_lr=0.001)
callback_list = [
    # Interrupts training when improvement stops
    callbacks.EarlyStopping(
        # Monitors the model's validation loss
        monitor='val_loss',
        # Interrupts training when val_loss has stopped
        # improving for more than five epochs
        patience=5,
    ),
    # Saves the current weights after every epoch
    callbacks.ModelCheckpoint(
        # Path to the destination model file
        filepath='saved_models/east_1031_7900_1976_'
                 '{epoch:03d}_{loss:.4f}_{val_loss:.4f}.hdf5',
        # These two arguments mean you won't overwrite the
        # model file unless val_loss has improved, which allows
        # you to keep the best model seen during training.
        monitor='val_loss',
        save_best_only=True,
        verbose=1),
    training_monitor,
    # accuracy_evaluator
    # learning_rate_updator
]
train_gen = HDF5DatasetGenerator('data/train_7900.hdf5',
                                 batch_size=1).generator
val_gen = HDF5DatasetGenerator('data/val_1976.hdf5', batch_size=1).generator
H = model.fit_generator(train_gen(),
                        steps_per_epoch=7900,
                        callbacks=callback_list,
                        epochs=100,
                        # validation_data / validation_steps assumed from the
                        # val_1976 generator above (batch_size=1)
                        validation_data=val_gen(),
                        validation_steps=1976)
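
Assuming the call above completes, the returned History can be used to check what the callbacks did; the key names ('loss', 'val_loss') follow standard Keras history conventions.

# Report the epoch with the lowest validation loss, i.e. the checkpoint that
# ModelCheckpoint(save_best_only=True) would have kept.
val_losses = H.history['val_loss']
best_epoch = min(range(len(val_losses)), key=lambda i: val_losses[i])
print('best val_loss {:.4f} at epoch {}'.format(val_losses[best_epoch],
                                                best_epoch + 1))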