Пример #1
0
  def finalize_evaluation(self, results_per_batch, training_step=None):
    """Aggregate per-batch Levenshtein distances into a single validation WER.

    Args:
      results_per_batch: iterable of (word_lev, word_count) pairs as returned
          by the per-batch evaluate step.
      training_step: unused; kept for interface compatibility.

    Returns:
      dict with the single key "Eval WER".
    """
    lev_total = sum(lev for lev, _ in results_per_batch)
    count_total = sum(count for _, count in results_per_batch)

    total_wer = 1.0 * lev_total / count_total
    deco_print("Validation WER:  {:.4f}".format(total_wer), offset=4)
    return {
        "Eval WER": total_wer,
    }
Пример #2
0
    def finalize_evaluation(self, results_per_batch, training_step=None):
        """Concatenate per-batch predictions/targets and report corpus BLEU.

        Args:
            results_per_batch: iterable of (preds, targets) list pairs as
                produced by ``evaluate``.
            training_step: unused; kept for interface compatibility.

        Returns:
            dict with 'Eval_BLEU_Score' when 'eval_using_bleu' is enabled
            (default True), otherwise an empty dict.
        """
        # Guard clause: the original checked this flag twice (inside the loop
        # and again after it); once up front is equivalent and cheaper.
        if not self.params.get('eval_using_bleu', True):
            return {}

        preds, targets = [], []
        for preds_cur, targets_cur in results_per_batch:
            preds.extend(preds_cur)
            targets.extend(targets_cur)

        eval_bleu = calculate_bleu(preds, targets)
        # BUG FIX: log message said "Eval BLUE score" (typo for BLEU).
        deco_print("Eval BLEU score: {}".format(eval_bleu), offset=4)
        return {'Eval_BLEU_Score': eval_bleu}
Пример #3
0
    def __init__(self, params, model, name="ctc_loss"):
        """CTC loss constructor.

        See parent class for arguments description.

        Config parameters:

        * **mask_nan** (bool) --- whether to mask nans in the loss output.
          Defaults to True.
        """
        super(CTCLoss, self).__init__(params, model, name)
        self._mask_nan = self.params.get("mask_nan", True)
        # this loss can only operate in full precision
        # (warn only when the configured dtype differs, but force float32
        # unconditionally — the assignment below is intentionally outside
        # the if, so it also runs when dtype was already float32)
        if self.params['dtype'] != tf.float32:
            deco_print("Warning: defaulting CTC loss to work in float32")
        self.params['dtype'] = tf.float32
Пример #4
0
    def finalize_evaluation(self, results_per_batch, training_step=None):
        """Average top-1/top-5 hit counts over the whole validation set.

        Args:
            results_per_batch: iterable of (total, top1, top5) count triples.
            training_step: unused; kept for interface compatibility.

        Returns:
            dict with "Eval top-1" and "Eval top-5" accuracies.
        """
        hits1, hits5, n_examples = 0.0, 0.0, 0.0
        for cur_total, cur_top1, cur_top5 in results_per_batch:
            hits1 += cur_top1
            hits5 += cur_top5
            n_examples += cur_total

        top1 = 1.0 * hits1 / n_examples
        top5 = 1.0 * hits5 / n_examples
        deco_print("Validation top-1: {:.4f}".format(top1), offset=4)
        deco_print("Validation top-5: {:.4f}".format(top5), offset=4)
        return {
            "Eval top-1": top1,
            "Eval top-5": top5,
        }
Пример #5
0
    def maybe_print_logs(self, input_values, output_values, training_step):
        """Compute and print top-1/top-5 accuracy for the current train batch.

        Assumes the first target tensor holds one-hot labels and the first
        output tensor holds per-class logits — TODO confirm against the
        data layer.
        """
        one_hot = input_values['target_tensors'][0]
        logits = output_values[0]

        # recover integer class ids from the one-hot rows
        class_ids = np.where(one_hot == 1)[1]

        batch_size = logits.shape[0]
        top1_hits = np.sum(np.argmax(logits, axis=1) == class_ids)
        # a label counts as a top-5 hit when it appears among the 5 largest logits
        top5_cols = np.argpartition(logits, -5)[:, -5:]
        top5_hits = np.sum(class_ids[:, np.newaxis] == top5_cols)

        top1_acc = 1.0 * top1_hits / batch_size
        top5_acc = 1.0 * top5_hits / batch_size
        deco_print("Train batch top-1: {:.4f}".format(top1_acc), offset=4)
        deco_print("Train batch top-5: {:.4f}".format(top5_acc), offset=4)
        return {
            "Train batch top-1": top1_acc,
            "Train batch top-5": top5_acc,
        }
Пример #6
0
    def maybe_print_logs(self, input_values, output_values, training_step):
        """Print the first source/target/prediction triple of the train batch."""
        src, src_len = input_values['source_tensors']
        tgt, tgt_len = input_values['target_tensors']
        predictions = output_values[0]

        dl_params = self.get_data_layer().params
        # (label, token sequence, vocab key) for each line to print
        rows = (
            ("Train Source[0]:     ", src[0][:src_len[0]], 'source_idx2seq'),
            ("Train Target[0]:     ", tgt[0][:tgt_len[0]], 'target_idx2seq'),
            ("Train Prediction[0]: ", predictions[0, :], 'target_idx2seq'),
        )
        for label, token_ids, vocab_key in rows:
            deco_print(
                label + array_to_string(
                    token_ids,
                    vocab=dl_params[vocab_key],
                    delim=dl_params["delimiter"],
                ),
                offset=4,
            )
        return {}
Пример #7
0
  def after_run(self, run_context, run_values):
    """Print a train-loss log line after a monitored-session step.

    `run_values.results` is expected to be a (results, global_step) pair —
    presumably requested by this hook's before_run (not visible here);
    `results` is empty on steps where logging was not triggered.
    """
    results, step = run_values.results
    self._iter_count = step

    # nothing was fetched this step -> not a logging step
    if not results:
      return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    if self._model.steps_in_epoch is None:
      deco_print("Global step {}:".format(step), end=" ")
    else:
      deco_print(
          "Epoch {}, global step {}:".format(
              step // self._model.steps_in_epoch, step),
          end=" ",
      )

    loss = results[0]
    if not self._model.on_horovod or self._model.hvd.rank() == 0:
      if self._print_ppl:
        # ppl = exp(loss), bpc = loss / ln(2) — valid when loss is a
        # cross-entropy measured in nats
        deco_print("Train loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}"
                   .format(loss, math.exp(loss),
                           loss/math.log(2)),
                   start="", end=", ")
      else:
        deco_print(
          "Train loss: {:.4f} ".format(loss),
          offset=4)

    # average wall-clock seconds per step since the hook last fired
    tm = (time.time() - self._last_time) / self._every_steps
    m, s = divmod(tm, 60)
    h, m = divmod(m, 60)

    deco_print(
        "time per step = {}:{:02}:{:.3f}".format(int(h), int(m), s),
        start="",
    )
    self._last_time = time.time()
Пример #8
0
    def evaluate(self, input_values, output_values):
        """Print the first eval example and collect BLEU-ready text pairs.

        Returns:
            (preds, targets): prediction strings and singleton lists of
            reference strings, both empty when 'eval_using_bleu' is False.
        """
        src, src_len = input_values['source_tensors']
        tgt, tgt_len = input_values['target_tensors']
        samples = output_values[0]

        dl_params = self.get_data_layer().params

        deco_print(
            "*****EVAL Source[0]:     " + array_to_string(
                src[0][:src_len[0]],
                vocab=dl_params['source_idx2seq'],
                delim=dl_params["delimiter"],
            ),
            offset=4,
        )
        deco_print(
            "*****EVAL Target[0]:     " + array_to_string(
                tgt[0][:tgt_len[0]],
                vocab=dl_params['target_idx2seq'],
                delim=dl_params["delimiter"],
            ),
            offset=4,
        )
        deco_print(
            "*****EVAL Prediction[0]: " + array_to_string(
                samples[0, :],
                vocab=dl_params['target_idx2seq'],
                delim=dl_params["delimiter"],
            ),
            offset=4,
        )

        preds, targets = [], []
        if self.params.get('eval_using_bleu', True):
            bpe_used = self.params.get('bpe_used', False)
            for sample in samples:
                preds.append(transform_for_bleu(
                    sample,
                    vocab=dl_params['target_idx2seq'],
                    ignore_special=True,
                    delim=dl_params["delimiter"],
                    bpe_used=bpe_used,
                ))
            # each reference is wrapped in a singleton list, as the BLEU
            # scorer expects a list of references per example
            for reference in tgt:
                targets.append([transform_for_bleu(
                    reference,
                    vocab=dl_params['target_idx2seq'],
                    ignore_special=True,
                    delim=dl_params["delimiter"],
                    bpe_used=bpe_used,
                )])

        return preds, targets
Пример #9
0
    def infer(self, input_values, output_values):
        """Run inference post-processing.

        In LM phase, prints each seed token with its generated continuation
        and returns an empty list. Otherwise returns a list of
        (source_string, predicted_class, target_class_or_None) triples.
        """
        if self._lm_phase:
            vocab = self.get_data_layer().corp.dictionary.idx2word
            seed_tokens = self.params['encoder_params']['seed_tokens']
            for i in range(len(seed_tokens)):
                print('Seed:', vocab[seed_tokens[i]] + '\n')
                deco_print(
                    "Output: " + array_to_string(
                        output_values[0][i],
                        vocab=self.get_data_layer().corp.dictionary.idx2word,
                        delim=self.delimiter,
                    ),
                    offset=4,
                )
            return []
        else:
            ex, elen_x = input_values['source_tensors']
            ey, elen_y = None, None
            if 'target_tensors' in input_values:
                ey, elen_y = input_values['target_tensors']

            n_samples = len(ex)
            results = []
            for i in range(n_samples):
                # BUG FIX: original had a trailing comma here, which made
                # current_x an accidental 1-tuple (worked around with
                # current_x[0] below).
                current_x = array_to_string(
                    ex[i][:elen_x[i]],
                    vocab=self.get_data_layer().corp.dictionary.idx2word,
                    delim=self.delimiter,
                )
                current_pred = np.argmax(output_values[0][i])
                # BUG FIX: original wrote `curret_y = None` (typo), so
                # current_y was unbound -> NameError whenever ey is None
                # (i.e. no target_tensors were provided).
                current_y = None
                if ey is not None:
                    current_y = np.argmax(ey[i])

                results.append((current_x, current_pred, current_y))
            return results
Пример #10
0
  def after_run(self, run_context, run_values):
    """Run evaluation on the validation set, log metrics, save best model.

    Fires when the eval timer triggered or on the last training step. Under
    Horovod, only rank 0 prints, logs summaries and saves checkpoints.
    """
    results, step = run_values.results
    self._iter_count = step

    if not self._triggered and step != self._last_step - 1:
      return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    if not self._model.on_horovod or self._model.hvd.rank() == 0:
      deco_print("Running evaluation on a validation set:")

    results_per_batch, total_loss = get_results_for_epoch(
        self._model, run_context.session, mode="eval", compute_loss=True,
    )

    if not self._model.on_horovod or self._model.hvd.rank() == 0:
      if self._print_ppl:
        deco_print("Validation loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}"
                   .format(total_loss, math.exp(total_loss),
                           total_loss/math.log(2)), offset=4)
      else:
        deco_print(
          "Validation loss: {:.4f} ".format(total_loss),
          offset=4)

      dict_to_log = self._model.finalize_evaluation(results_per_batch, step)
      dict_to_log['eval_loss'] = total_loss

      if self._print_ppl:
        # Add bpc and ppl metrics to tensorboard
        dict_to_log['ppl'] = math.exp(total_loss)
        # BUG FIX: bpc was logged as exp(loss/log(2)), inconsistent with the
        # printed value above and with the train-log hook, which both use
        # loss/log(2) (bits per character for a nats-valued loss).
        dict_to_log['bpc'] = total_loss/math.log(2)

      # saving the best validation model
      if self._model.params['save_checkpoint_steps'] and \
         total_loss < self._best_eval_loss:
        self._best_eval_loss = total_loss
        self._eval_saver.save(
            run_context.session,
            os.path.join(self._model.params['logdir'], 'best_models',
                         'val_loss={:.4f}-step'.format(total_loss)),
            global_step=step + 1,
        )

      # optionally logging to tensorboard any values
      # returned from maybe_print_logs
      if self._model.params['save_summaries_steps']:
        log_summaries_from_dict(
            dict_to_log,
            self._model.params['logdir'],
            step,
        )
Пример #11
0
 def finalize_inference(self, results_per_batch, output_file):
     """Write every output sequence to `output_file` (UTF-8, one per line),
     echoing every 200th input/output pair to the console."""
     with codecs.open(output_file, 'w', 'utf-8') as fout:
         step = 0
         for input_strings, output_strings in results_per_batch:
             for pair in zip(input_strings, output_strings):
                 in_seq, out_seq = pair
                 fout.write(out_seq + "\n")
                 if step % 200 == 0:
                     deco_print("Input sequence:  {}".format(in_seq))
                     deco_print("Output sequence: {}".format(out_seq))
                     deco_print("")
                 step += 1
Пример #12
0
  def maybe_print_logs(self, input_values, output_values, training_step):
    """Print target text, predicted text and sample WER for one example.

    Decodes the first sample either through SentencePiece (BPE path) or the
    idx2char map, then computes a word-level Levenshtein error rate against
    the reference transcript.
    """
    y, len_y = input_values['target_tensors']
    decoded_sequence = output_values
    y_one_sample = y[0]
    len_y_one_sample = len_y[0]
    decoded_sequence_one_batch = decoded_sequence[0]

    if self.is_bpe:
      # BPE path: ids -> text via the data layer's SentencePiece processor
      dec_list = sparse_tensor_to_chars_bpe(decoded_sequence_one_batch)[0]
      true_text = self.get_data_layer().sp.DecodeIds(y_one_sample[:len_y_one_sample].tolist())
      pred_text = self.get_data_layer().sp.DecodeIds(dec_list)

    else:
      # we also clip the sample by the correct length
      true_text = "".join(map(
          self.get_data_layer().params['idx2char'].get,
          y_one_sample[:len_y_one_sample],
      ))
      pred_text = "".join(self.tensor_to_chars(
          decoded_sequence_one_batch,
          self.get_data_layer().params['idx2char'],
          **self.tensor_to_char_params
      )[0])
    # word-level error rate, normalized by the reference length
    sample_wer = levenshtein(true_text.split(), pred_text.split()) / \
        len(true_text.split())

    self.autoregressive = self.get_data_layer().params.get('autoregressive', False)
    # attention plotting is currently hard-disabled (original check kept for
    # reference in the trailing comment)
    self.plot_attention = False  # (output_values[1] != None).all()
    if self.plot_attention:
      attention_summary = plot_attention(
          output_values[1][0], pred_text, output_values[2][0], training_step)

    deco_print("Sample WER: {:.4f}".format(sample_wer), offset=4)
    deco_print("Sample target:     " + true_text, offset=4)
    deco_print("Sample prediction: " + pred_text, offset=4)

    if self.plot_attention:
      return {
          'Sample WER': sample_wer,
          'Attention Summary': attention_summary,
      }
    else:
      return {
          'Sample WER': sample_wer,
      }
Пример #13
0
    def evaluate(self, input_values, output_values):
        """Log one eval example; in classification phase also compute metrics.

        In LM phase only a random ~10% of batches are printed and an empty
        dict is returned. Otherwise returns accuracy (plus F1 ingredients
        when self._print_f1 is set).
        """
        ex, elen_x = input_values['source_tensors']
        ey, elen_y = input_values['target_tensors']

        x_sample = ex[0]
        len_x_sample = elen_x[0]
        y_sample = ey[0]
        len_y_sample = elen_y[0]

        return_values = {}

        if self._lm_phase:
            # print only ~10% of eval batches to keep the log readable
            flip = random.random()
            if flip <= 0.9:
                return return_values

            deco_print(
                "*****EVAL Source[0]:     " + array_to_string(
                    x_sample[:len_x_sample],
                    vocab=self.get_data_layer().corp.dictionary.idx2word,
                    delim=self.delimiter,
                ),
                offset=4,
            )
            # greedy decode of the first sample's per-step logits
            samples = np.argmax(output_values[0][0], axis=-1)
            deco_print(
                "*****EVAL Target[0]:     " + array_to_string(
                    y_sample[:len_y_sample],
                    vocab=self.get_data_layer().corp.dictionary.idx2word,
                    delim=self.delimiter,
                ),
                offset=4,
            )

            deco_print(
                "*****EVAL Prediction[0]: " + array_to_string(
                    samples,
                    vocab=self.get_data_layer().corp.dictionary.idx2word,
                    delim=self.delimiter,
                ),
                offset=4,
            )
        else:
            deco_print(
                "*****EVAL Source[0]:     " + array_to_string(
                    x_sample[:len_x_sample],
                    vocab=self.get_data_layer().corp.dictionary.idx2word,
                    delim=self.delimiter,
                ),
                offset=4,
            )
            samples = output_values[0][0]
            deco_print(
                "EVAL Target[0]:     " + str(np.argmax(y_sample)),
                offset=4,
            )
            deco_print(
                "EVAL Prediction[0]:     " + str(samples),
                offset=4,
            )

            # assumes rows of ey are one-hot class labels — TODO confirm
            labels = np.argmax(ey, 1)
            preds = np.argmax(output_values[0], axis=-1)
            print('Labels', labels)
            print('Preds', preds)

            return_values['accuracy'] = metrics.accuracy(labels, preds)

            if self._print_f1:
                return_values['true_pos'] = metrics.true_positives(
                    labels, preds)
                return_values['pred_pos'] = np.sum(preds)
                return_values['actual_pos'] = np.sum(labels)

        return return_values
Пример #14
0
    def maybe_print_logs(self, input_values, output_values, training_step):
        """Log the first training example: target sequence in LM phase,
        otherwise label/prediction plus batch accuracy (and P/R/F1 when
        self._print_f1 is set). Always returns an empty dict."""
        src, src_len = input_values['source_tensors']
        tgt, tgt_len = input_values['target_tensors']

        vocab = self.get_data_layer().corp.dictionary.idx2word

        deco_print(
            "Train Source[0]:     " + array_to_string(
                src[0][:src_len[0]],
                vocab=vocab,
                delim=self.delimiter,
            ),
            offset=4,
        )

        if self._lm_phase:
            deco_print(
                "Train Target[0]:     " + array_to_string(
                    tgt[0][:tgt_len[0]],
                    vocab=vocab,
                    delim=self.delimiter,
                ),
                offset=4,
            )
            return {}

        # classification phase: targets are class labels, not sequences
        deco_print(
            "TRAIN Target[0]:     " + str(np.argmax(tgt[0])),
            offset=4,
        )
        deco_print(
            "TRAIN Prediction[0]:     " + str(output_values[0][0]),
            offset=4,
        )
        labels = np.argmax(tgt, 1)
        preds = np.argmax(output_values[0], axis=-1)
        print('Labels', labels)
        print('Preds', preds)

        deco_print(
            "Accuracy: {:.4f}".format(metrics.accuracy(labels, preds)),
            offset=4,
        )

        if self._print_f1:
            deco_print(
                "Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}".format(
                    metrics.precision(labels, preds),
                    metrics.recall(labels, preds),
                    metrics.f1(labels, preds)),
                offset=4,
            )

        return {}
Пример #15
0
def build_layer(inputs, layer, layer_params, data_format,
                regularizer, training, verbose=True):
  """Call ``layer(inputs, ...)``, auto-filling common keyword arguments.

  The layer's call signature is inspected and, for each of the following
  parameters that the layer accepts but ``layer_params`` does not already
  set, a value is injected:

  * "regularizer" / "kernel_regularizer" / "gamma_regularizer" -> the
    ``regularizer`` argument (every matching name is filled);
  * "data_format" -> the ``data_format`` argument;
  * "axis" -> ``1 if data_format == 'channels_first' else 3`` (needed to
    build batch normalization automatically);
  * "training" -> the ``training`` argument.

  Args:
    inputs: input Tensor, passed as the layer's first positional argument.
    layer: layer function or class with ``__call__`` defined.
    layer_params (dict): explicit parameters for ``layer``; never mutated
        (a deep copy is taken).
    data_format (string): "channels_first" or "channels_last".
    regularizer: regularizer instance to inject when supported.
    training (bool): training-mode flag to inject when supported.
    verbose (bool): whether to print a description of the built layer.

  Returns:
    whatever ``layer(inputs, **params)`` returns.
  """
  params = copy.deepcopy(layer_params)
  accepted = signature(layer).parameters

  for reg_name in ('regularizer', 'kernel_regularizer', 'gamma_regularizer'):
    if reg_name in accepted and reg_name not in params:
      params[reg_name] = regularizer

  if 'data_format' in accepted and 'data_format' not in params:
    params['data_format'] = data_format

  # necessary to check axis for correct batch normalization processing
  if 'axis' in accepted and 'axis' not in params:
    params['axis'] = 1 if data_format == 'channels_first' else 3

  if 'training' in accepted and 'training' not in params:
    params['training'] = training

  outputs = layer(inputs, **params)

  if verbose:
    if hasattr(layer, '_tf_api_names'):
      layer_name = layer._tf_api_names[0]
    else:
      layer_name = layer
    arg_desc = ", ".join("{}={}".format(key, value)
                         for key, value in params.items())
    deco_print("Building layer: {}(inputs, {})".format(layer_name, arg_desc))
  return outputs
Пример #16
0
    def compile(self,
                force_var_reuse=False,
                checkpoint=None,
                use_trt=False,
                precision='FP32'):
        """TensorFlow graph is built here.

        Builds the data layer(s) and one forward pass per GPU (or a single
        pass under Horovod), aggregates losses, and in train mode creates
        the optimizer, ``self.train_op`` and summaries.

        Args:
            force_var_reuse (bool): force variable reuse even for the first
                GPU's variable scope.
            checkpoint: optional checkpoint forwarded to the forward-pass
                builder (multi-GPU path only).
            use_trt (bool): whether to build through TensorRT (multi-GPU
                path only).
            precision (str): TRT precision mode, e.g. 'FP32'.
        """
        if 'initializer' not in self.params:
            initializer = None
        else:
            init_dict = self.params.get('initializer_params', {})
            initializer = self.params['initializer'](**init_dict)

        if not self.on_horovod:  # not using Horovod
            # below we follow data parallelism for multi-GPU training
            losses = []
            for gpu_cnt, gpu_id in enumerate(self._gpu_ids):
                with tf.device("/gpu:{}".format(gpu_id)), tf.variable_scope(
                        name_or_scope=tf.get_variable_scope(),
                        # re-using variables across GPUs.
                        reuse=force_var_reuse or (gpu_cnt > 0),
                        initializer=initializer,
                        dtype=self.get_tf_dtype(),
                ):
                    deco_print("Building graph on GPU:{}".format(gpu_id))

                    if self._interactive:
                        self.get_data_layer(
                            gpu_cnt).create_interactive_placeholders()
                    else:
                        self.get_data_layer(gpu_cnt).build_graph()
                    input_tensors = self.get_data_layer(gpu_cnt).input_tensors
                    # NOTE(review): this path calls the public
                    # build_forward_pass_graph (TRT-aware) while the Horovod
                    # branch below calls _build_forward_pass_graph directly —
                    # presumably intentional; confirm if TRT is needed under
                    # Horovod.
                    loss, self._outputs[
                        gpu_cnt] = self.build_forward_pass_graph(
                            input_tensors,
                            gpu_id=gpu_cnt,
                            checkpoint=checkpoint,
                            use_trt=use_trt,
                            precision=precision)
                    if self._outputs[gpu_cnt] is not None and \
                       not isinstance(self._outputs[gpu_cnt], list):
                        raise ValueError(
                            'Decoder outputs have to be either None or list')
                    if self._mode == "train" or self._mode == "eval":
                        losses.append(loss)

            # end of for gpu_ind loop
            if self._mode == "train":
                self.loss = tf.reduce_mean(losses)
            if self._mode == "eval":
                self.eval_losses = losses
        else:  # is using Horovod
            # gpu_id should always be zero, since Horovod takes care of isolating
            # different processes to 1 GPU only
            with tf.device("/gpu:0"), tf.variable_scope(
                    name_or_scope=tf.get_variable_scope(),
                    reuse=force_var_reuse,
                    initializer=initializer,
                    dtype=self.get_tf_dtype(),
            ):
                deco_print("Building graph in Horovod rank: {}".format(
                    self._hvd.rank()))
                self.get_data_layer().build_graph()
                input_tensors = self.get_data_layer().input_tensors

                all_loss, self._output = self._build_forward_pass_graph(
                    input_tensors, gpu_id=0)
                # the builder may return either a dict of losses or a scalar
                if isinstance(all_loss, (dict, )):
                    loss = all_loss['loss']
                else:
                    loss = all_loss

                if self._output is not None and not isinstance(
                        self._output, list):
                    raise ValueError(
                        'Decoder outputs have to be either None or list')

                if self._mode == "train":
                    self.loss = loss
                if self._mode == "eval":
                    self.eval_losses = [loss]

        try:
            self._num_objects_per_step = [
                self._get_num_objects_per_step(worker_id)
                for worker_id in range(self.num_gpus)
            ]
        except NotImplementedError:
            # models that don't count processed objects simply skip this
            pass

        if self._mode == "train":
            if 'lr_policy' not in self.params:
                lr_policy = None
            else:
                lr_params = self.params.get('lr_policy_params', {})
                # adding default decay_steps = max_steps if lr_policy supports it and
                # different value is not provided
                func_params = signature(self.params['lr_policy']).parameters
                if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
                    lr_params['decay_steps'] = self._last_step
                if 'steps_per_epoch' in func_params and \
                   'steps_per_epoch' not in lr_params and 'num_epochs' in self.params:
                    lr_params['steps_per_epoch'] = self.steps_in_epoch
                lr_policy = lambda gs: self.params['lr_policy'](global_step=gs,
                                                                **lr_params)

            if self.params.get('iter_size', 1) > 1:
                self.skip_update_ph = tf.placeholder(tf.bool)

            var_list = tf.trainable_variables()
            freeze_variables_regex = self.params.get('freeze_variables_regex',
                                                     None)
            if freeze_variables_regex is not None:
                # frozen (regex-matched) variables are excluded from training
                pattern = re.compile(freeze_variables_regex)
                var_list = [
                    var for var in tf.trainable_variables()
                    if not pattern.match(var.name)
                ]

            # NOTE(review): skip_update_ph is passed unconditionally but only
            # created above when iter_size > 1 — presumably a class-level
            # default exists elsewhere; verify.
            self.train_op = optimize_loss(
                loss=tf.cast(self.loss, tf.float32) +
                get_regularization_loss(),
                dtype=self.params['dtype'],
                optimizer=self.params['optimizer'],
                optimizer_params=self.params.get('optimizer_params', {}),
                var_list=var_list,
                clip_gradients=self.params.get('max_grad_norm', None),
                learning_rate_decay_fn=lr_policy,
                summaries=self.params.get('summaries', None),
                larc_params=self.params.get('larc_params', None),
                loss_scaling=self.params.get('loss_scaling', 1.0),
                loss_scaling_params=self.params.get('loss_scaling_params',
                                                    None),
                on_horovod=self.on_horovod,
                iter_size=self.params.get('iter_size', 1),
                skip_update_ph=self.skip_update_ph,
                model=self)
            tf.summary.scalar(name="train_loss", tensor=self.loss)
            if self.steps_in_epoch:
                tf.summary.scalar(
                    name="epoch",
                    tensor=tf.floor(
                        tf.train.get_global_step() /
                        tf.constant(self.steps_in_epoch, dtype=tf.int64)),
                )

            if not self.on_horovod or self._hvd.rank() == 0:
                if freeze_variables_regex is not None:
                    deco_print('Complete list of variables:')
                    for var in tf.trainable_variables():
                        deco_print('{}'.format(var.name), offset=2)
                deco_print("Trainable variables:")
                total_params = 0
                unknown_shape = False
                for var in var_list:
                    var_params = 1
                    deco_print('{}'.format(var.name), offset=2)
                    deco_print('shape: {}, {}'.format(var.get_shape(),
                                                      var.dtype),
                               offset=4)
                    if var.get_shape():
                        for dim in var.get_shape():
                            var_params *= dim.value
                        total_params += var_params
                    else:
                        unknown_shape = True
                if unknown_shape:
                    deco_print(
                        "Encountered unknown variable shape, can't compute total "
                        "number of parameters.")
                else:
                    deco_print(
                        'Total trainable parameters: {}'.format(total_params))
Пример #17
0
    def _build_forward_pass_graph(self, input_tensors, gpu_id=0):
        """TensorFlow graph for encoder-decoder-loss model is created here.
    This function connects encoder, decoder and loss together. As an input for
    encoder it will specify source tensors (as returned from
    the data layer). As an input for decoder it will specify target tensors
    as well as all output returned from encoder. For loss it
    will also specify target tensors and all output returned from
    decoder. Note that loss will only be built for mode == "train" or "eval".

    Args:
      input_tensors (dict): ``input_tensors`` dictionary that has to contain
          ``source_tensors`` key with the list of all source tensors, and
          ``target_tensors`` with the list of all target tensors. Note that
          ``target_tensors`` only need to be provided if mode is
          "train" or "eval".
      gpu_id (int, optional): id of the GPU where the current copy of the model
          is constructed. For Horovod this is always zero.

    Returns:
      tuple: tuple containing loss tensor as returned from
      ``loss.compute_loss()`` and list of outputs tensors, which is taken from
      ``decoder.decode()['outputs']``. When ``mode == 'infer'``, loss will
      be None.
    """
        # --- input validation: fail fast on malformed input_tensors ---
        if not isinstance(input_tensors, dict) or \
           'source_tensors' not in input_tensors:
            raise ValueError('Input tensors should be a dict containing '
                             '"source_tensors" key')

        if not isinstance(input_tensors['source_tensors'], list):
            raise ValueError('source_tensors should be a list')

        source_tensors = input_tensors['source_tensors']
        # targets are only required (and only validated) outside infer mode
        if self.mode == "train" or self.mode == "eval":
            if 'target_tensors' not in input_tensors:
                raise ValueError(
                    'Input tensors should contain "target_tensors" key'
                    'when mode != "infer"')
            if not isinstance(input_tensors['target_tensors'], list):
                raise ValueError('target_tensors should be a list')
            target_tensors = input_tensors['target_tensors']

        with tf.variable_scope("ForwardPass"):
            # encoder -> decoder -> (train/eval only) loss
            encoder_input = {"source_tensors": source_tensors}
            encoder_output = self.encoder.encode(input_dict=encoder_input)

            decoder_input = {"encoder_output": encoder_output}
            if self.mode == "train" or self.mode == "eval":
                decoder_input['target_tensors'] = target_tensors
            decoder_output = self.decoder.decode(input_dict=decoder_input)
            model_outputs = decoder_output.get("outputs", None)

            if self.mode == "train" or self.mode == "eval":
                with tf.variable_scope("Loss"):
                    loss_input_dict = {
                        "decoder_output": decoder_output,
                        "target_tensors": target_tensors,
                    }
                    loss = self.loss_computator.compute_loss(loss_input_dict)
            else:
                deco_print("Inference Mode. Loss part of graph isn't built.")
                loss = None
            return loss, model_outputs
Пример #18
0
def train(train_model, eval_model=None, debug_port=None):
    """Run the full training loop for ``train_model``.

    Sets up the TF session config and hooks (stop-at-step, Horovod broadcast,
    periodic evaluation, checkpointing, loss/sample printing, optional
    TensorBoard debugger), builds a monitored session (a transfer-learning
    variant when initializing from ``load_model`` without an existing
    checkpoint), then steps until ``train_model.last_step`` is reached or the
    data runs out.  Finally prints benchmark statistics (average step time and
    objects/second) computed over steps ``>= bench_start``.

    Args:
      train_model: model to train; expected to expose ``params``, ``hvd``,
        ``train_op``, ``last_step``, ``num_gpus``, ``on_horovod``,
        ``get_data_layer()`` and optionally ``get_num_objects_per_step()``
        and ``skip_update_ph``.
      eval_model: optional model evaluated every
        ``eval_model.params['eval_steps']`` training steps.
      debug_port: if truthy, attach a TensorBoard debug hook on
        ``localhost:<debug_port>``.

    Raises:
      ValueError: if ``eval_model`` is given but does not define
        ``eval_steps`` in its params.
    """
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")
    hvd = train_model.hvd
    # Only rank 0 (or the single process when Horovod is off) writes
    # checkpoints/summaries and prints benchmark numbers.
    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True

    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    # pylint: disable=no-member
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        # Each Horovod process is pinned to its own local GPU.
        # pylint: disable=no-member
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if hvd is not None:
        # Broadcast rank-0 variables so all workers start from identical
        # weights.
        hooks.append(BroadcastGlobalVariablesHook(0))

    # Non-master workers pass checkpoint_dir=None so they never write
    # checkpoints or summaries.
    if master_worker:
        checkpoint_dir = train_model.params['logdir']
        load_model_dir = train_model.params['load_model']
    else:
        checkpoint_dir = None
        load_model_dir = None

    if eval_model is not None:
        # noinspection PyTypeChecker
        hooks.append(
            RunEvaluationHook(
                every_steps=eval_model.params['eval_steps'],
                model=eval_model,
                last_step=train_model.last_step,
                # WKT (language-model) data layers report perplexity.
                print_ppl=isinstance(eval_model.get_data_layer(),
                                     WKTDataLayer),
            ), )

    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            saver = tf.train.Saver(
                save_relative_paths=True,
                max_to_keep=train_model.params['num_checkpoints'])
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir,
                    saver=saver,
                    save_steps=train_model.params['save_checkpoint_steps'],
                ))
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintLossAndTimeHook(
                    every_steps=train_model.params['print_loss_steps'],
                    model=train_model,
                    print_ppl=isinstance(train_model.get_data_layer(),
                                         WKTDataLayer),
                ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintSamplesHook(
                    every_steps=train_model.params['print_samples_steps'],
                    model=train_model,
                ))

    total_time = 0.0
    # Steps before bench_start are treated as warm-up and excluded from the
    # timing/throughput statistics below.
    bench_start = train_model.params.get('bench_start', 10)

    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port)))

    # Data-layer iterator initializers: one per GPU, or a single one under
    # Horovod (each process owns exactly one data layer).
    if train_model.on_horovod:
        init_data_layer = train_model.get_data_layer().iterator.initializer
    else:
        init_data_layer = tf.group([
            train_model.get_data_layer(i).iterator.initializer
            for i in range(train_model.num_gpus)
        ])

    # Use the transfer-learning scaffold only when starting fresh from
    # load_model_dir (i.e. no checkpoint in logdir to resume from yet).
    if (not load_model_dir) or tf.train.latest_checkpoint(checkpoint_dir):
        scaffold = tf.train.Scaffold(local_init_op=tf.group(
            tf.local_variables_initializer(), init_data_layer))
    else:
        scaffold = TransferScaffold(local_init_op=tf.group(
            tf.local_variables_initializer(), init_data_layer))
    fetches = [train_model.train_op]
    try:
        total_objects = 0.0
        # on horovod num_gpus is 1
        for worker_id in range(train_model.num_gpus):
            fetches.append(train_model.get_num_objects_per_step(worker_id))
    except NotImplementedError:
        # Objects/second stats are optional; training proceeds without them.
        deco_print(
            "WARNING: Can't compute number of objects per step, since "
            "train model does not define get_num_objects_per_step method.")

    # starting training
    # Same condition as the scaffold choice above: restore weights from
    # load_model_dir only if logdir has no checkpoint to resume from.
    if load_model_dir and not tf.train.latest_checkpoint(checkpoint_dir):
        sess = TransferMonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks,
            load_model_dir=load_model_dir,
            load_fc=train_model.params['load_fc'])
    else:
        sess = tf.train.MonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            # Time-based checkpointing disabled; CheckpointSaverHook above
            # saves by step count instead.
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks)
    step = 0
    num_bench_updates = 0
    while True:
        if sess.should_stop():
            break
        tm = time.time()
        try:
            feed_dict = {}
            # iter_size > 1 means gradient accumulation: variables are only
            # updated every iter_size-th step — presumably skip_update_ph
            # gates the update op in the model (defined elsewhere).
            iter_size = train_model.params.get('iter_size', 1)
            if iter_size > 1:
                feed_dict[train_model.skip_update_ph] = step % iter_size != 0
            if step % iter_size == 0:
                if step >= bench_start:
                    num_bench_updates += 1
                fetches_vals = sess.run(fetches, feed_dict)
            else:
                # necessary to skip "no-update" steps when iter_size > 1
                def run_with_no_hooks(step_context):
                    return step_context.session.run(fetches, feed_dict)

                # run_step_fn bypasses the session hooks on accumulation-only
                # steps (no checkpoint/print/eval triggers).
                fetches_vals = sess.run_step_fn(run_with_no_hooks)
        except tf.errors.OutOfRangeError:
            # Dataset exhausted — end of training.
            break
        if step >= bench_start:
            total_time += time.time() - tm
            # len(fetches) > 1 means per-step object counts are available.
            if len(fetches) > 1:
                for i in range(train_model.num_gpus):
                    total_objects += np.sum(fetches_vals[i + 1])
                if train_model.params['print_bench_info_steps'] is not None:
                    if step % train_model.params['print_bench_info_steps'] == 0:
                        # Sum object counts across Horovod workers before
                        # printing.
                        total_objects_cur = collect_if_horovod(total_objects,
                                                               hvd,
                                                               mode="sum")
                        if master_worker:
                            avg_objects = 1.0 * total_objects_cur / total_time
                            deco_print("Avg objects per second: {:.3f}".format(
                                avg_objects))

        step += 1
    sess.close()

    # Final cross-worker aggregation of object counts for the summary below.
    if len(fetches) > 1:
        total_objects = collect_if_horovod(total_objects, hvd, mode="sum")

    if master_worker:
        deco_print("Finished training")
        if step > bench_start:
            avg_time = 1.0 * total_time / num_bench_updates
            deco_print("Avg time per step: {:.3f}s".format(avg_time))
            if len(fetches) > 1:
                avg_objects = 1.0 * total_objects / total_time
                deco_print(
                    "Avg objects per second: {:.3f}".format(avg_objects))
        else:
            deco_print("Not enough steps for benchmarking")