Example #1
  def __init__(self, params, model, name="loss"):
    """Loss constructor.
    Note that loss constructors should not modify the TensorFlow graph; all
    graph construction should happen in the
    :meth:`self._compute_loss() <_compute_loss>` method.

    Args:
      params (dict): parameters describing the loss.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this loss.
          Could be None if no model access is required for the use case.
      name (str): name for loss variable scope.

    Config parameters:

    * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``.
    """
    check_params(params, self.get_required_params(), self.get_optional_params())
    self._params = copy.deepcopy(params)
    self._model = model

    if 'dtype' not in self._params:
      if self._model:
        self._params['dtype'] = self._model.get_tf_dtype()
      else:
        self._params['dtype'] = tf.float32

    self._name = name
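Every constructor on this page funnels its config through check_params(config, required_dict, optional_dict). As a rough illustration of the contract it enforces (a simplified sketch, not the library's actual implementation): required keys must be present, unknown keys are rejected, and each value must match either the declared type or the list of allowed values.

def check_params_sketch(config, required_dict, optional_dict):
  # Simplified stand-in for the library's check_params; details may differ.
  allowed = dict(required_dict)
  allowed.update(optional_dict)
  for key in required_dict:
    if key not in config:
      raise ValueError("Required parameter '{}' is missing".format(key))
  for key, value in config.items():
    if key not in allowed:
      raise ValueError("Unknown parameter '{}'".format(key))
    spec = allowed[key]
    if isinstance(spec, list):  # a list means "one of these values"
      if value not in spec:
        raise ValueError("'{}' has to be one of {}".format(key, spec))
    elif isinstance(spec, type) and not isinstance(value, spec):
      raise ValueError("'{}' has to be of type {}".format(key, spec))

# The kind of call the loss constructor above effectively makes:
check_params_sketch(
    config={'dtype': 'float16'},
    required_dict={},
    optional_dict={'dtype': ['float16', 'float32']},
)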
Example #2
    def __init__(self, params):
        if params is None:
            params = {}
        check_params(
            config=params,
            required_dict={},
            optional_dict={
                'scale_min': float,
                'scale_max': float,
                'step_factor': float,
                'step_window': int
            },
        )
        self.scale_min = params.get('scale_min', 1.0)
        self.scale_max = params.get('scale_max', 2.**24)
        self.step_factor = params.get('step_factor', 2.0)
        self.step_window = params.get('step_window', 2000)

        self.iteration = tf.Variable(initial_value=0,
                                     trainable=False,
                                     dtype=tf.int64)
        self.last_overflow_iteration = tf.Variable(initial_value=-1,
                                                   trainable=False,
                                                   dtype=tf.int64)
        self.scale = tf.Variable(initial_value=self.scale_max, trainable=False)
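An illustrative params dict for the constructor above. The key names come from the optional_dict and the values simply restate the documented defaults; reading it as a back-off style automatic loss scaler (the 'Backoff' option mentioned in Example #7) is an assumption based on the attribute names.

# Illustrative only: any key outside the optional_dict above would make
# check_params raise, and omitted keys fall back to the params.get(...) defaults.
scaler_params = {
    'scale_min': 1.0,
    'scale_max': 2.**24,
    'step_factor': 2.0,
    'step_window': 2000,
}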
Example #3
    def __init__(self, params, model, name="decoder", mode='train'):
        """Decoder constructor.
    Note that decoder constructors should not modify the TensorFlow graph; all
    graph construction should happen in the :meth:`self._decode() <_decode>`
    method.

    Args:
      params (dict): parameters describing the decoder.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this decoder.
          Could be None if no model access is required for the use case.
      name (str): name for decoder variable scope.
      mode (str): mode decoder is going to be run in.
          Could be "train", "eval" or "infer".

    Config parameters:

    * **initializer** --- any valid TensorFlow initializer. If no initializer
      is provided, model initializer will be used.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **regularizer** --- any valid TensorFlow regularizer. If no regularizer
      is provided, model regularizer will be used.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``, ``tf.float32``
      or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs. If no
      dtype is provided, model dtype will be used.
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())
        self._params = copy.deepcopy(params)
        self._model = model

        if 'dtype' not in self._params:
            if self._model:
                self._params['dtype'] = self._model.params['dtype']
            else:
                self._params['dtype'] = tf.float32

        self._name = name
        self._mode = mode
        self._compiled = False
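The initializer / initializer_params pair described in the docstring means the config stores a callable plus the keyword arguments used to construct it. A hedged sketch of such a decoder config; the concrete initializer and values are illustrative, only the key names come from the docstring above.

import tensorflow as tf

decoder_params = {
    'initializer': tf.random_uniform_initializer,           # a callable, not an instance
    'initializer_params': {'minval': -0.1, 'maxval': 0.1},  # forwarded to its __init__
    'dtype': tf.float16,                                     # or tf.float32 / "mixed"
}
# Per the docstring, the framework later does roughly:
#   init = decoder_params['initializer'](**decoder_params['initializer_params'])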
Example #4
    def __init__(self, params, model, num_workers, worker_id):
        """Data layer constructor.
    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.build_graph() <build_graph>` method.

    Args:
      params (dict): parameters describing the data layer.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      model (instance of a class derived from :class:`Model<models.model.Model>`):
          parent model that created this data layer.
          Could be None if no model access is required for the use case.
      num_workers (int): number of Horovod processes or number of GPUs
          if Horovod is not used.
      worker_id (int): Horovod process id or GPU id if Horovod is not used.

    Config parameters:

    * **shuffle** (bool) --- whether to shuffle the dataset after each epoch.
      Typically will be True for train and False for inference and evaluation.
    * **dtype** --- data dtype. Could be either ``tf.float16`` or ``tf.float32``.
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())
        self._params = copy.deepcopy(params)
        self._model = model

        if 'dtype' not in self._params:
            if self._model:
                self._params['dtype'] = self._model.get_tf_dtype()
            else:
                self._params['dtype'] = tf.float32

        if 'shuffle' not in params:
            self._params['shuffle'] = (self._params['mode'] == 'train')

        if self._params['mode'] != 'train' and self._params['shuffle']:
            raise ValueError(
                "Shuffle should not be performed in eval or infer modes")

        # should be used for correct evaluation on multiple GPUs
        self._num_workers = num_workers
        self._worker_id = worker_id
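The shuffle handling above means a data layer config rarely needs to set 'shuffle' explicitly. A minimal sketch isolating just that defaulting logic, mirroring the constructor above:

def effective_shuffle(params):
    # Mirrors the 'shuffle' handling in the data layer constructor above.
    if 'shuffle' not in params:
        return params['mode'] == 'train'
    if params['mode'] != 'train' and params['shuffle']:
        raise ValueError("Shuffle should not be performed in eval or infer modes")
    return params['shuffle']

print(effective_shuffle({'mode': 'train'}))  # True
print(effective_shuffle({'mode': 'eval'}))   # False
# effective_shuffle({'mode': 'eval', 'shuffle': True}) would raise ValueError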
Example #5
    def __init__(self, params):
        if params is None:
            params = {}
        check_params(
            config=params,
            required_dict={},
            optional_dict={
                'scale_min': float,
                'scale_max': float,
                'log_max': float,
                'beta1': float,
                'beta2': float,
                'overflow_std_dev': float
            },
        )
        self.scale_min = params.get('scale_min', 1.0)
        self.scale_max = params.get('scale_max', 2.**24)
        self.log_max = params.get('log_max', 16.)
        self.beta1 = params.get('beta1', 0.99)
        self.beta2 = params.get('beta2', 0.999)
        self.overflow_std_dev = params.get('overflow_std_dev', 3.09)

        self.iteration = tf.Variable(initial_value=0,
                                     trainable=False,
                                     dtype=tf.int64)
        self.scale = tf.Variable(initial_value=1.0, trainable=False)
        self.x_hat = tf.Variable(initial_value=0,
                                 trainable=False,
                                 dtype=tf.float32)
        self.slow_x_hat = tf.Variable(initial_value=0,
                                      trainable=False,
                                      dtype=tf.float32)
        self.xsquared_hat = tf.Variable(initial_value=0,
                                        trainable=False,
                                        dtype=tf.float32)
        self.b1_correction = tf.Variable(initial_value=1.,
                                         trainable=False,
                                         dtype=tf.float32)
        self.b2_correction = tf.Variable(initial_value=1.,
                                         trainable=False,
                                         dtype=tf.float32)
Example #6
def post_process_gradients(grads_and_vars, summaries, lr,
                           clip_gradients, larc_params):
  """Applies post processing to gradients, i.e. clipping, LARC, summaries."""
  if "global_gradient_norm" in summaries:
    tf.summary.scalar(
        "global_gradient_norm",
        _global_norm_with_cast(grads_and_vars),
    )

  # Optionally clip gradients by global norm.
  if clip_gradients is not None:
    grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in grads_and_vars:
    if isinstance(gradient, tf.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient

    if isinstance(variable, tf.IndexedSlices):
      var_values = variable.values
    else:
      var_values = variable

    if grad_values is not None:
      var_name = variable.name.replace(":", "_")
      if "gradients" in summaries:
        # need to mask nans for automatic loss scaling
        tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values))
      if "gradient_norm" in summaries:
        tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values))
      if "variables" in summaries:
        tf.summary.histogram("variables/%s" % var_name, var_values)
      if "variable_norm" in summaries:
        tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values))

  if clip_gradients is not None and "global_gradient_norm" in summaries:
    tf.summary.scalar(
        "global_clipped_gradient_norm",
        _global_norm_with_cast(grads_and_vars),
    )

  # LARC gradient re-scaling
  if larc_params is not None:
    check_params(
        config=larc_params,
        required_dict={'larc_eta': float},
        optional_dict={
            'larc_mode': ['clip', 'scale'],
            'min_update': float,
            'epsilon': float
        },
    )
    larc_eta = larc_params['larc_eta']
    larc_mode = larc_params.get('larc_mode', 'clip')
    min_update = larc_params.get('min_update', 1e-7)
    eps = larc_params.get('epsilon', 1e-7)

    grads_and_vars_larc = [None] * len(grads_and_vars)
    for idx, (g, v) in enumerate(grads_and_vars):
      var_dtype = v.dtype
      v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
      g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

      if larc_mode == 'clip':
        larc_grad_update = tf.maximum(
            larc_eta * v_norm / (lr * (g_norm + eps)),
            min_update,
        )
        if "larc_summaries" in summaries:
          tf.summary.scalar('larc_clip_on/{}'.format(v.name),
                            tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
        larc_grad_update = tf.minimum(larc_grad_update, 1.0)
      else:
        larc_grad_update = tf.maximum(
            larc_eta * v_norm / (g_norm + eps),
            min_update,
        )
      larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
      grads_and_vars_larc[idx] = (larc_grad_update * g, v)

      # adding additional summary
      if "larc_summaries" in summaries:
        tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                          larc_grad_update)
        tf.summary.scalar("larc_final_lr/{}".format(v.name),
                          tf.cast(lr, var_dtype) * larc_grad_update)
    grads_and_vars = grads_and_vars_larc
  return grads_and_vars
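A hedged call sketch for the function above, assuming it and its helper functions (check_params, mask_nans, _global_norm_with_cast, _clip_gradients_by_norm) are importable from the library. The gradient/variable pair, learning rate and LARC values are placeholders; the argument names and the allowed larc_params keys come from the code itself.

import tensorflow as tf

# Placeholder gradient/variable pair just to illustrate the call signature.
weight = tf.Variable(tf.zeros([10]), name="w")
grad = tf.ones([10])

processed = post_process_gradients(
    [(grad, weight)],
    summaries=["global_gradient_norm", "larc_summaries"],
    lr=0.01,
    clip_gradients=5.0,        # or None to skip clipping by global norm
    larc_params={
        'larc_eta': 0.002,     # required
        'larc_mode': 'clip',   # optional: 'clip' (LARC) or 'scale' (LARS)
        'min_update': 1e-7,    # optional
        'epsilon': 1e-7,       # optional
    },
)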
Example #7
    def __init__(self, params, mode="train", hvd=None):
        """Model constructor.
    The TensorFlow graph should not be created here, but rather in the
    :meth:`self.compile() <compile>` method.

    Args:
      params (dict): parameters describing the model.
          All supported parameters are listed in :meth:`get_required_params`,
          :meth:`get_optional_params` functions.
      mode (string, optional): "train", "eval" or "infer".
          If mode is "train" all parts of the graph will be built
          (model, loss, optimizer).
          If mode is "eval", only model and loss will be built.
          If mode is "infer", only model will be built.
      hvd (optional): if Horovod is used, this should be
          ``horovod.tensorflow`` module.
          If Horovod is not used, it should be None.

    Config parameters:

    * **random_seed** (int) --- random seed to use.
    * **use_horovod** (bool) --- whether to use Horovod for distributed
      execution.
    * **num_gpus** (int) --- number of GPUs to use. This parameter cannot be
      used if ``gpu_ids`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **gpu_ids** (list of ints) --- GPU ids to use. This parameter cannot be
      used if ``num_gpus`` is specified. When ``use_horovod`` is True
      this parameter is ignored.
    * **batch_size_per_gpu** (int) --- batch size to use for each GPU.
    * **eval_batch_size_per_gpu** (int) --- batch size to use for each GPU during
      inference. This is for when training and inference have different computation
      and memory requirements, such as when training uses sampled softmax and
      inference uses full softmax. If not specified, it's set
      to ``batch_size_per_gpu``.
    * **restore_best_checkpoint** (bool) --- if set to True, when doing evaluation
      and inference, the model will load the best checkpoint instead of the latest
      checkpoint. The best checkpoint is selected based on evaluation results, so
      it is only available when the model is trained in ``train_eval`` mode.
      Defaults to False.
    * **load_model** (str) --- points to the location of the pretrained model for
      transfer learning. If specified, during training, the system will look
      into the checkpoint in this folder and restore all variables whose names and 
      shapes match a variable in the new model.
    * **num_epochs** (int) --- number of epochs to run training for.
      This parameter cannot be used if ``max_steps`` is specified.
    * **max_steps** (int) --- number of steps to run training for.
      This parameter cannot be used if ``num_epochs`` is specified.
    * **save_summaries_steps** (int or None) --- how often to save summaries.
      Setting it to None disables summaries saving.
    * **print_loss_steps** (int or None) --- how often to print loss during
      training. Setting it to None disables loss printing.
    * **print_samples_steps** (int or None) --- how often to print training
      samples (input sequences, correct answers and model predictions).
      Setting it to None disables samples printing.
    * **print_bench_info_steps** (int or None) --- how often to print training
      benchmarking information (average number of objects processed per step).
      Setting it to None disables intermediate benchmarking printing, but
      the average information across the whole training will always be printed
      after the last iteration.
    * **save_checkpoint_steps** (int or None) --- how often to save model
      checkpoints. Setting it to None disables checkpoint saving.
    * **eval_steps** (int) --- how often to run evaluation during training.
      This parameter is only checked if ``--mode`` argument of ``run.py`` is
      "train\_eval". If no evaluation is needed you should use "train" mode.
    * **logdir** (string) --- path to the log directory where all checkpoints
      and summaries will be saved.
    * **data_layer** (any class derived from
      :class:`DataLayer <data.data_layer.DataLayer>`) --- data layer class
      to use.
    * **data_layer_params** (dict) --- dictionary with data layer
      configuration.
      For complete list of possible parameters see the corresponding
      class docs.
    * **optimizer** (string or TensorFlow optimizer class) --- optimizer to
      use for training. Could be either "Adam", "Adagrad", "Ftrl", "Momentum",
      "RMSProp", "SGD" or any valid TensorFlow optimizer class.
    * **optimizer_params** (dict) --- dictionary that will be passed to
      optimizer ``__init__`` method.
    * **initializer** --- any valid TensorFlow initializer.
    * **initializer_params** (dict) --- dictionary that will be passed to
      initializer ``__init__`` method.
    * **freeze_variables_regex** (str or None) --- if zero or more characters
      at the beginning of the name of a trainable variable match this
      pattern, then this variable will be frozen during training.
      Setting it to None disables freezing of variables.
    * **regularizer** --- any valid TensorFlow regularizer.
    * **regularizer_params** (dict) --- dictionary that will be passed to
      regularizer ``__init__`` method.
    * **dtype** --- model dtype. Could be either ``tf.float16``,
      ``tf.float32`` or "mixed". For details see
      :ref:`mixed precision training <mixed_precision>` section in docs.
    * **lr_policy** --- any valid learning rate policy function. For examples,
      see :any:`optimizers.lr_policies` module.
    * **lr_policy_params** (dict) --- dictionary containing lr_policy
      parameters.
    * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
      will be performed if some gradients exceed this value (this is checked
      for each variable independently).
    * **loss_scaling** --- could be float or string. If float, static loss
      scaling is applied. If string, the corresponding automatic
      loss scaling algorithm is used. Must be one of 'Backoff'
      or 'LogMax' (case insensitive). Only used when dtype="mixed". For details
      see :ref:`mixed precision training <mixed_precision>` section in docs.
    * **loss_scaling_params** (dict) --- dictionary containing loss scaling
      parameters.
    * **summaries** (list) --- which summaries to log. Could contain
      "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
      "variables", "variable_norm", "loss_scale".
    * **iter_size** (int) --- use this parameter to emulate large batches.
      The gradients will be accumulated for ``iter_size`` number of steps before
      applying update.
    * **larc_params** --- dictionary with parameters for LARC (or LARS)
      optimization algorithms. Can contain the following parameters:

      * **larc_mode** --- Could be either "scale" (LARS) or "clip" (LARC).
        Note that it works in addition to any other optimization algorithm,
        since we treat it as adaptive gradient clipping and learning rate
        adjustment.
      * **larc_eta** (float) --- LARC or LARS scaling parameter.
      * **min_update** (float) --- minimal value of the LARC (LARS) update.
      * **epsilon** (float) --- small number added to gradient norm in
        denominator for numerical stability.
    """
        check_params(params, self.get_required_params(),
                     self.get_optional_params())

        self._params = copy.deepcopy(params)

        if self._params.get('iter_size', 1) > 1 and hvd is None:
            raise ValueError("iter_size is only supported in Horovod mode")

        # parameter checks
        self._mode = mode
        self._interactive = False
        if self._mode == "interactive_infer":
            self._mode = "infer"
            self._interactive = True

        if self._mode not in ["train", "infer", "eval"]:
            raise ValueError(
                "Mode has to be one of ['train', 'infer', 'eval']")

        if "max_steps" in params and "num_epochs" in params:
            raise ValueError(
                "You can't provide both max_steps and num_epochs. "
                "Please, remove one of them from the config.")
        if mode == "train":
            if "max_steps" not in params and "num_epochs" not in params:
                raise ValueError("For training mode either max_steps or "
                                 "num_epochs has to be provided")

        if 'print_samples_steps' not in self._params:
            self._params['print_samples_steps'] = None
        if 'print_loss_steps' not in self._params:
            self._params['print_loss_steps'] = None
        if 'save_checkpoint_steps' not in self._params:
            self._params['save_checkpoint_steps'] = None
        if 'save_summaries_steps' not in self._params:
            self._params['save_summaries_steps'] = None
        if 'print_bench_info_steps' not in self._params:
            self._params['print_bench_info_steps'] = None

        self._params['finetune'] = self._params.get('finetune', False)
        # self._params['base_logdir'] = self._params.get('base_logdir', None)
        self._params['load_model'] = self._params.get('load_model', None)
        self._params['load_fc'] = self._params.get('load_fc', False)
        self._params['eval_batch_size_per_gpu'] = self._params.get(
            'eval_batch_size_per_gpu', self._params['batch_size_per_gpu'])

        # checking that frequencies of samples and loss are aligned
        s_fr = self._params['print_samples_steps']
        l_fr = self._params['print_loss_steps']
        if s_fr is not None and l_fr is not None and s_fr % l_fr != 0:
            raise ValueError("print_samples_steps has to be a multiple of "
                             "print_loss_steps.")

        self._hvd = hvd
        if self._hvd:
            self._gpu_ids = range(1)
        else:
            if 'gpu_ids' in self._params:
                self._gpu_ids = self._params['gpu_ids']
            elif 'num_gpus' in self._params:
                self._gpu_ids = range(self._params['num_gpus'])
            else:
                raise ValueError('Either "gpu_ids" or "num_gpus" has to '
                                 'be specified in the config')

        if self._interactive and len(self._gpu_ids) > 1:
            raise ValueError(
                "Interactive infer is meant to be used with 1 gpu")

        # setting random seed
        rs = self._params.get('random_seed', int(time.time()))
        if self.on_horovod:
            rs += hvd.rank()
        tf.set_random_seed(rs)
        np.random.seed(rs)

        if 'dtype' not in self._params:
            self._params['dtype'] = tf.float32

        dl_params = self._params.get('data_layer_params', {})
        if mode == 'train':
            dl_params['batch_size'] = self._params['batch_size_per_gpu']
        else:
            dl_params['batch_size'] = self._params['eval_batch_size_per_gpu']
        if 'lm_vocab_file' in self._params:
            dl_params['lm_vocab_file'] = self._params['lm_vocab_file']
        if 'processed_data_folder' in self._params:
            dl_params['processed_data_folder'] = self._params[
                'processed_data_folder']
        dl_params['mode'] = self._mode
        dl_params['interactive'] = self._interactive

        if self.on_horovod:
            self._data_layer = self._params['data_layer'](
                params=dl_params,
                model=self,
                num_workers=self._hvd.size(),
                worker_id=self._hvd.rank(),
            )
        else:
            self._data_layers = []
            for worker_id in range(self.num_gpus):
                self._data_layers.append(self._params['data_layer'](
                    params=dl_params,
                    model=self,
                    num_workers=self.num_gpus,
                    worker_id=worker_id,
                ))

        if self._mode == "train":
            if "max_steps" in self._params:
                self._last_step = self._params["max_steps"]
                self._steps_in_epoch = None
            else:
                # may run slightly fewer steps if data size is not divisible by the batch size
                self._steps_in_epoch = self.get_data_layer().get_size_in_samples() // \
                                       self.get_data_layer().params['batch_size']
                if self._steps_in_epoch is None:
                    raise ValueError(
                        'The data_layer is not compatible with '
                        'epoch execution, since it does not provide '
                        'get_size_in_samples() method. Either update the '
                        'data layer or switch to using "max_steps" '
                        'parameter.')
                if self.on_horovod:
                    self._steps_in_epoch //= self._hvd.size()
                else:
                    self._steps_in_epoch //= self.num_gpus
                self._steps_in_epoch //= self._params.get('iter_size', 1)
                if self._steps_in_epoch == 0:
                    raise ValueError(
                        "Overall batch size is too big for this dataset.")
                self._last_step = self._params[
                    'num_epochs'] * self._steps_in_epoch

        if self.on_horovod:
            self._output = None
        else:
            self._outputs = [None] * self.num_gpus

        self.loss = None
        self.train_op = None
        self.eval_losses = None
        self._num_objects_per_step = None
        self.skip_update_ph = None
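Putting the docstring together, a hedged sketch of what a minimal single-GPU training config for this constructor might look like. The key names and the mutual-exclusion constraints come from the docstring and the checks above; the data layer stub, logdir and all numeric values are purely illustrative.

class MyDataLayer(object):
    # Stand-in only; a real config points 'data_layer' at an actual
    # DataLayer subclass from the library.
    pass

base_params = {
    'use_horovod': False,
    'num_gpus': 1,                     # mutually exclusive with 'gpu_ids'
    'batch_size_per_gpu': 32,
    'max_steps': 10000,                # mutually exclusive with 'num_epochs'
    'save_summaries_steps': 100,
    'print_loss_steps': 10,
    'print_samples_steps': 100,        # must be a multiple of print_loss_steps
    'save_checkpoint_steps': 1000,
    'logdir': 'experiments/example',   # illustrative path
    'optimizer': 'Adam',
    'optimizer_params': {},
    'dtype': "mixed",
    'loss_scaling': 'Backoff',         # only used when dtype == "mixed"
    'summaries': ['learning_rate', 'global_gradient_norm', 'loss_scale'],
    'data_layer': MyDataLayer,         # hypothetical stand-in
    'data_layer_params': {},
}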