Example #1
def wrapped_model_fn(model_fn, run_config):
    """Returns a new model_fn, which wraps the TPU support."""

    # Verifies the model_fn signature according to Estimator framework.
    estimator_lib._verify_model_fn_args(model_fn, params=None)  # pylint: disable=protected-access

    def _model_fn(features, labels, mode):
        """model_fn."""
        # TODO(jhseu): Move EVAL and PREDICT to TPU.
        if mode != model_fn_lib.ModeKeys.TRAIN:
            return model_fn(features, labels, mode)

        dequeue_fn, enqueue_fn = (_create_infeed_enqueue_ops_and_dequeue_fn(
            run_config, features, labels))

        loss = _train_on_tpu_shards(run_config,
                                    train_step=_convert_model_fn_to_train_step(
                                        model_fn, dequeue_fn, mode,
                                        run_config))

        # Gets the variables back from TPU nodes. This means the variables updated
        # by TPU will now be *synced* to host memory.
        update_ops = [
            array_ops.check_numerics(v.read_value(),
                                     'Gradient for %s is NaN' % v.name).op
            for v in variables.trainable_variables()
        ]

        hooks = [
            TpuInfeedSessionHook(run_config, enqueue_fn),
            training.LoggingTensorHook(
                {
                    'loss': array_ops.identity(loss),
                    'step': training.get_global_step()
                },
                every_n_secs=30)
        ]

        return model_fn_lib.EstimatorSpec(
            mode,
            loss=array_ops.identity(loss),
            training_hooks=hooks,
            train_op=control_flow_ops.group(*update_ops))

    return _model_fn
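
Note on the pattern above: only TRAIN mode is rerouted through the TPU path, and each trainable variable is read back through check_numerics, so a NaN fails loudly instead of silently corrupting the checkpoint. Below is a minimal sketch of that guard using the public TF API; the variable is a hypothetical stand-in for the internal array_ops call above.

import tensorflow as tf

w = tf.Variable([1.0, 2.0], name='w')  # hypothetical trainable variable
# Reading the value through check_numerics raises InvalidArgumentError
# if the tensor contains NaN or Inf, mirroring the guard above.
guard = tf.debugging.check_numerics(w.read_value(),
                                    'Value of %s is NaN' % w.name)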
Example #2
    def __init__(self,
                 model_fn=None,
                 model_dir=None,
                 config=None,
                 params=None,
                 use_tpu=True):
        if config is None or not isinstance(config, tpu_config.RunConfig):
            raise ValueError(
                '`config` must be provided with type `tpu_config.RunConfig`')

        if use_tpu and params is not None and _BATCH_SIZE_KEY in params:
            if not isinstance(params[_BATCH_SIZE_KEY], int):
                raise ValueError(
                    '`{}` in params must be an int'.format(_BATCH_SIZE_KEY))
            params = copy.deepcopy(params)
            # The specified batch size is the batch size for the entire computation.
            # The input_fn is called per-shard, so we want to calculate the per-shard
            # batch size and pass that.
            if params[_BATCH_SIZE_KEY] % config.tpu_config.num_shards != 0:
                raise ValueError(
                    'batch size {} must be divisible by number of shards {}'
                    .format(params[_BATCH_SIZE_KEY],
                            config.tpu_config.num_shards))
            params[_BATCH_SIZE_KEY] //= config.tpu_config.num_shards

        if use_tpu:
            if not isinstance(config, tpu_config.RunConfig):
                raise ValueError('`config` must be `tpu_config.RunConfig`')
            # Verifies the model_fn signature according to Estimator framework.
            estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
            # We cannot store config and params in this constructor as parent
            # constructor might change them, such as assigning a temp dir for
            # config.model_dir.
            model_function = wrapped_model_fn(model_fn)
        else:
            model_function = model_fn
        super(TpuEstimator, self).__init__(model_fn=model_function,
                                           model_dir=model_dir,
                                           config=config,
                                           params=params)
        self.use_tpu = use_tpu
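
Worked example for the per-shard adjustment above: a global batch size of 1024 split over 8 shards leaves each shard's input_fn with params['batch_size'] == 128. A plain-Python sketch, with names standing in for the snippet above:

import copy

params = {'batch_size': 1024}  # hypothetical global batch size
num_shards = 8                 # hypothetical config.tpu_config.num_shards

if params['batch_size'] % num_shards != 0:  # mirrors the ValueError above
    raise ValueError('batch size must be divisible by number of shards')
params = copy.deepcopy(params)       # avoid mutating the caller's dict
params['batch_size'] //= num_shards  # 1024 // 8 == 128 per shard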
Example #3
    def __init__(self, model_fn, model_dir=None, config=None, params=None):
        # Create a run configuration.
        if config is None:
            self._config = RunConfig()
            logging.info("Using default config.")
        else:
            if not isinstance(config, RunConfig):
                raise ValueError("config must be an instance of RunConfig, "
                                 "received {}.".format(config))
            self._config = config

        if (model_dir is not None) and (self._config.model_dir is not None):
            if model_dir != self._config.model_dir:
                raise ValueError(
                    "model_dir is set in both the constructor and RunConfig, "
                    "but with different values. In constructor: '{}', in "
                    "RunConfig: '{}'.".format(model_dir, self._config.model_dir))

        self._model_dir = (model_dir or self._config.model_dir
                           or generate_model_dir())
        if self._config.model_dir is None:
            self._config = self._config.replace(model_dir=self._model_dir)
        logging.info("Using config: {}".format(vars(self._config)))

        if self._config.session_config is None:
            self._session_config = config_pb2.ConfigProto(
                allow_soft_placement=True)
        else:
            self._session_config = self._config.session_config

        # Set the device function depending on whether there are replicas.
        # pylint: disable=protected-access
        self._device_fn = tf_estimator._get_replica_device_setter(self._config)

        tf_estimator._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access

        self._model_fn = model_fn
        self._params = params or {}
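
The model_dir resolution above applies a strict precedence: the constructor argument wins, then the RunConfig value, then a freshly generated directory, and a conflict between the first two raises. A standalone sketch of that rule follows; resolve_model_dir is hypothetical, with tempfile.mkdtemp standing in for generate_model_dir.

import tempfile

def resolve_model_dir(model_dir, config_model_dir):
    # Both sources may be set, but then they must agree.
    if (model_dir is not None and config_model_dir is not None
            and model_dir != config_model_dir):
        raise ValueError('model_dir set in both constructor and RunConfig, '
                         'but with different values')
    # Constructor argument first, then the config, then a fresh temp dir.
    return model_dir or config_model_dir or tempfile.mkdtemp()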
Example #4
  def __init__(self,
               model_fn=None,
               model_dir=None,
               config=None,
               params=None,
               use_tpu=True,
               train_batch_size=None):
    """Constructs an `TPUEstimator` instance.

    Args:
      model_fn: Model function as required by `Estimator`. For training, the
        returned `EstimatorSpec` cannot have hooks, as they are not supported
        in `TPUEstimator`.
      model_dir: Directory in which to save model parameters, graph, etc. This
        can also be used to load checkpoints from the directory into an
        estimator to continue training a previously saved model. If `None`,
        the model_dir in `config` will be used if set. If both are set, they
        must be the same. If both are `None`, a temporary directory will be
        used.
      config: A `tpu_config.RunConfig` configuration object. Cannot be `None`.
      params: An optional `dict` of hyperparameters that will be passed into
        `input_fn` and `model_fn`. Keys are names of parameters, values are
        basic python types. There are reserved keys for `TPUEstimator`,
        including 'batch_size'.
      use_tpu: A bool indicating whether TPU support is enabled. Currently,
        this only applies to training; evaluation and prediction still happen
        on the CPU.
      train_batch_size: An int representing the global training batch size.
        TPUEstimator transforms this global batch size into a per-shard batch
        size, passed as params['batch_size'] when calling `input_fn` and
        `model_fn`. Cannot be `None` if `use_tpu` is `True`. Must be divisible
        by `config.tpu_config.num_shards`.

    Raises:
      ValueError: if `params` contains any of the reserved keys.
    """
    if config is None or not isinstance(config, tpu_config.RunConfig):
      raise ValueError(
          '`config` must be provided with type `tpu_config.RunConfig`')

    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
      raise ValueError(
          '{} are reserved keys but exist in params {}.'.format(
              _RESERVED_PARAMS_KEYS, params))

    if use_tpu:
      if train_batch_size is None:
        raise ValueError('`train_batch_size` cannot be `None`')
      if not isinstance(train_batch_size, int):
        raise ValueError('`train_batch_size` must be an int')
      if train_batch_size < 1:
        raise ValueError('`train_batch_size` must be positive')

      # The specified batch size is the batch size for the entire computation.
      # The input_fn and model_fn are called per-shard, so we want to calculate
      # the per-shard batch size and pass that.
      if train_batch_size % config.tpu_config.num_shards != 0:
        raise ValueError(
            'batch size {} must be divisible by number of shards {}'
            .format(train_batch_size, config.tpu_config.num_shards))

    if use_tpu:
      # Verifies the model_fn signature according to Estimator framework.
      estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
      # We cannot store config and params in this constructor as parent
      # constructor might change them, such as assigning a temp dir for
      # config.model_dir.
      model_function = augment_model_fn_with_tpu_support(
          model_fn, train_batch_size)
    else:
      model_function = model_fn

    super(TPUEstimator, self).__init__(
        model_fn=model_function,
        model_dir=model_dir,
        config=config,
        params=params)
    self._use_tpu = use_tpu
    self._train_batch_size = train_batch_size
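
Under the contract documented above, construction might look like the following; my_model_fn and the exact RunConfig/TPUConfig constructor arguments are hypothetical, shown only to illustrate the train_batch_size / num_shards relationship.

# Hypothetical usage: a global batch of 1024 over 8 shards means every
# input_fn/model_fn call sees params['batch_size'] == 128.
run_config = tpu_config.RunConfig(
    tpu_config=tpu_config.TPUConfig(num_shards=8))

estimator = TPUEstimator(
    model_fn=my_model_fn,    # hypothetical user model_fn
    config=run_config,
    use_tpu=True,
    train_batch_size=1024)   # must be divisible by num_shards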
Example #5
    def __init__(self,
                 model_fn=None,
                 model_dir=None,
                 config=None,
                 params=None,
                 job_start_file='',
                 warm_start_from=None):
        """Constructs an `NPUEstimator` instance.

        Args:
            model_fn: Model function as required by `Estimator`, which returns
                an `EstimatorSpec`. `training_hooks`, `evaluation_hooks`,
                and `prediction_hooks` must not capture any NPU Tensor inside
                the model_fn.
            config: An `NPURunConfig` configuration object. Cannot be `None`.
            params: An optional `dict` of hyperparameters that will be passed
                into `input_fn` and `model_fn`. Keys are names of parameters,
                values are basic python types.
            job_start_file: The path of the job start file. Cannot be `None`.
            warm_start_from: Optional string filepath to a checkpoint or
                SavedModel to warm-start from, or a
                `tf.estimator.WarmStartSettings` object to fully configure
                warm-starting. If the string filepath is provided instead of a
                `tf.estimator.WarmStartSettings`, then all variables are
                warm-started, and it is assumed that vocabularies and
                `tf.Tensor` names are unchanged.
        """
        logging.info("NPUEstimator init...")

        if config is None or not isinstance(config, NPURunConfig):
            raise ValueError(
                '`config` must be provided with type `NPURunConfig`')

        # Verifies the model_fn signature according to Estimator framework.
        estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access

        # Load the graph optimizers.
        config = self.__load_graph_optimizers(config)

        # Init npu system: get task and device info from configuration file.
        if not self.__load_job_info(job_start_file):
            raise ValueError(
                'Load job info failed, '
                'please check whether `JOB_ID` is set in environment variable')

        # Check model dir in NPUEstimator and NPURunConfig.
        model_dir = self.__check_model_dir(model_dir, config)

        # Wrap model_fn to add NPU session hooks.
        model_function = self.__augment_model_fn(model_fn, model_dir, config)

        # Get the checkpoint file.
        if not warm_start_from:
            restore_from = self.__job_info._local_checkpoint_dir
            # TF consumes the restore_from variable directly; no safety check needed.
            if restore_from is None or restore_from == "":
                restore_from = os.getenv('RESTORE_FROM')
        else:
            restore_from = warm_start_from

        # Pass non-None params, as the wrapped model_fn uses them.
        params = params or {}
        with no_check_override():
            super(NPUEstimator, self).__init__(model_fn=model_function,
                                               model_dir=model_dir,
                                               config=config,
                                               params=params,
                                               warm_start_from=restore_from)
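
The checkpoint fallback above resolves in three steps: an explicit warm_start_from wins, then the job's local checkpoint directory, then the RESTORE_FROM environment variable. A minimal sketch of that chain (the function name is hypothetical):

import os

def resolve_restore_from(warm_start_from, local_checkpoint_dir):
    # An explicit warm_start_from takes priority over everything else.
    if warm_start_from:
        return warm_start_from
    # Fall back to the job's checkpoint dir, then the environment variable.
    return local_checkpoint_dir or os.getenv('RESTORE_FROM')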