Example #1
  def train(self):
    """Trains the model."""
    params, flags_obj, is_train = self.params, self.flags_obj, True
    _ensure_dir(flags_obj.model_dir)
    if self.distribution_strategy:
      with self.distribution_strategy.scope():
        model = transformer.create_model(params, is_train)
        opt = self._create_optimizer()
        model.compile(opt)
    else:
      model = transformer.create_model(params, is_train)
      opt = self._create_optimizer()
      model.compile(opt)

    model.summary()

    # TODO(guptapriya): Figure out a way to structure input that works in both 
    # distributed and non distributed cases.
    train_ds = data_pipeline.train_input_fn(params)
    if not self.distribution_strategy:
      map_data_fn = data_pipeline.map_data_for_transformer_fn
      train_ds = train_ds.map(
          map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
      flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    for i in range(1, iterations + 1):
      print("Start train iteration:{}/{}".format(i, iterations))
      history = model.fit(
          train_ds,
          initial_epoch=i-1,
          epochs=i,
          steps_per_epoch=flags_obj.steps_between_evals,
          callbacks=callbacks,
          # If TimeHistory is enabled, progress bar would be messy. Increase the
          # verbose level to get rid of it.
          verbose=(2 if flags_obj.enable_time_history else 1))
      print("End train iteration:{}/{} global step:{}".format(
          i,
          iterations,
          i*flags_obj.steps_between_evals))
      tf.compat.v1.logging.info("Train history: {}".format(history.history))
      stats = misc.build_stats(history, callbacks)

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()

      print("BLEU: uncased={}, cased={}".format(uncased_score, cased_score))

    stats = misc.build_stats(history, callbacks)
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
    return stats
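
A minimal, self-contained sketch of the interleaving pattern used above: each call to model.fit advances initial_epoch and epochs together, so every "epoch" is exactly steps_between_evals steps and an evaluation (BLEU in the example) can run between chunks. The toy model, dataset, and step counts below are illustrative stand-ins, not the Transformer pipeline.

import tensorflow as tf

toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
toy_model.compile(optimizer="adam", loss="mse")

# An endlessly repeating toy dataset, so steps_per_epoch controls duration.
toy_ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal([64, 4]), tf.random.normal([64, 1]))).batch(8).repeat()

train_steps, steps_between_evals = 100, 20
iterations = train_steps // steps_between_evals

for i in range(1, iterations + 1):
  # initial_epoch/epochs move forward by one each pass, so Keras trains
  # exactly one more chunk of steps_between_evals steps per loop iteration.
  toy_model.fit(toy_ds,
                initial_epoch=i - 1,
                epochs=i,
                steps_per_epoch=steps_between_evals,
                verbose=2)
  # An evaluation hook (self.eval() above) would run here between chunks.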
Example #2
 def eval(self):
     """Evaluates the model."""
     params, flags_obj, is_train = self.params, self.flags_obj, False
     with tf.name_scope("model"):
         model = transformer.create_model(params, is_train)
         self._load_weights_if_possible(model, flags_obj.init_weight_path)
         model.summary()
     evaluate_and_log_bleu(model, flags_obj.bleu_source, flags_obj.bleu_ref,
                           flags_obj.vocab_file)
Example #3
 def test_create_model_not_train(self):
   model = transformer.create_model(self.params, False)
   inputs, outputs = model.inputs, model.outputs
   self.assertEqual(len(inputs), 1)
   self.assertEqual(len(outputs), 2)
   self.assertEqual(inputs[0].shape.as_list(), [None, None])
   self.assertEqual(inputs[0].dtype, tf.int64)
   self.assertEqual(outputs[0].shape.as_list(), [None, None])
   self.assertEqual(outputs[0].dtype, tf.int32)
   self.assertEqual(outputs[1].shape.as_list(), [None])
   self.assertEqual(outputs[1].dtype, tf.float32)
Example #4
 def eval(self):
   """Evaluates the model."""
   if not self.predict_model:
     self.predict_model = transformer.create_model(self.params, False)
   self._load_weights_if_possible(
       self.predict_model,
       tf.train.latest_checkpoint(self.flags_obj.model_dir))
   self.predict_model.summary()
   return evaluate_and_log_bleu(self.predict_model,
                                self.flags_obj.bleu_source,
                                self.flags_obj.bleu_ref,
                                self.flags_obj.vocab_file)
Example #5
 def eval(self):
     """Evaluates the model."""
     with distribution_utils.get_strategy_scope(self.distribution_strategy):
         if not self.predict_model:
             self.predict_model = transformer.create_model(self.params, False)
         self._load_weights_if_possible(
             self.predict_model,
             tf.train.latest_checkpoint(self.flags_obj.model_dir))
         self.predict_model.summary()
     return evaluate_and_log_bleu(
         self.predict_model, self.params, self.flags_obj.bleu_source,
         self.flags_obj.bleu_ref, self.flags_obj.vocab_file,
         self.distribution_strategy if self.use_tpu else None)
Example #6
  def predict(self):
    """Predicts result from the model."""
    params, flags_obj, is_train = self.params, self.flags_obj, False

    with tf.name_scope("model"):
      model = transformer.create_model(params, is_train)
      self._load_weights_if_possible(model, flags_obj.init_weight_path)
      model.summary()
    subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

    ds = data_pipeline.eval_input_fn(params)
    ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)
    ret = model.predict(ds)
    val_outputs, _ = ret
    length = len(val_outputs)
    for i in range(length):
      translate.translate_from_input(val_outputs[i], subtokenizer)
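
The predict path above drops the labels from the eval dataset and keeps only a single element before calling model.predict. A toy sketch of that dataset handling (the model and data are illustrative; _SINGLE_SAMPLE is whatever small count the module defines):

import tensorflow as tf

toy_model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
eval_ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal([16, 4]), tf.random.normal([16, 1]))).batch(4)

# Keep only the features and a single batch, mirroring
# ds.map(lambda x, y: x).take(_SINGLE_SAMPLE) above.
features_only = eval_ds.map(lambda x, y: x).take(1)
predictions = toy_model.predict(features_only)
print(predictions.shape)  # (4, 2)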
Example #7
  def eval(self):
    """Evaluates the model."""
    distribution_strategy = self.distribution_strategy if self.use_tpu else None

    # We only want to create the model under DS scope for TPU case.
    # When 'distribution_strategy' is None, a no-op DummyContextManager will
    # be used.
    with distribution_utils.get_strategy_scope(distribution_strategy):
      if not self.predict_model:
        self.predict_model = transformer.create_model(self.params, False)
      self._load_weights_if_possible(
          self.predict_model,
          tf.train.latest_checkpoint(self.flags_obj.model_dir))
      self.predict_model.summary()
    return evaluate_and_log_bleu(
        self.predict_model, self.params, self.flags_obj.bleu_source,
        self.flags_obj.bleu_ref, self.flags_obj.vocab_file,
        distribution_strategy)
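
The comment above describes a helper that yields the strategy's scope when a strategy is given and a do-nothing context manager otherwise. A minimal sketch of that idea, assuming nothing about the real distribution_utils.get_strategy_scope implementation (contextlib.nullcontext stands in for the DummyContextManager the comment mentions):

import contextlib

def get_strategy_scope(strategy):
  """Returns strategy.scope() if a strategy is given, else a no-op context."""
  if strategy:
    return strategy.scope()
  return contextlib.nullcontext()

# Usage mirrors eval() above:
#   with get_strategy_scope(distribution_strategy):
#     model = ...  # created under the TPU strategy only when one exists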
Example #8
    def train(self):
        """Trains the model."""
        params, flags_obj, is_train = self.params, self.flags_obj, True
        _ensure_dir(flags_obj.model_dir)
        model = transformer.create_model(params, is_train)
        opt = self._create_optimizer()

        model.compile(opt, target_tensors=[])
        model.summary()

        map_data_fn = data_pipeline.map_data_for_transformer_fn
        train_ds = data_pipeline.train_input_fn(params)
        train_ds = train_ds.map(
            map_data_fn, num_parallel_calls=params["num_parallel_calls"])

        callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

        if flags_obj.train_steps < flags_obj.steps_between_evals:
            flags_obj.steps_between_evals = flags_obj.train_steps
        iterations = flags_obj.train_steps // flags_obj.steps_between_evals

        cased_score, uncased_score = None, None
        for i in range(1, iterations + 1):
            print("Start train iteration:{}/{}".format(i, iterations))
            history = model.fit(train_ds,
                                initial_epoch=i - 1,
                                epochs=i,
                                steps_per_epoch=flags_obj.steps_between_evals,
                                callbacks=callbacks,
                                verbose=2)
            print("End train iteration:{}/{} global step:{}".format(
                i, iterations, i * flags_obj.steps_between_evals))
            tf.compat.v1.logging.info("Train history: {}".format(
                history.history))
            stats = misc.build_stats(history, callbacks)

            if (flags_obj.bleu_source and flags_obj.bleu_ref):
                uncased_score, cased_score = self.eval()

        stats = misc.build_stats(history, callbacks)
        if uncased_score and cased_score:
            stats["bleu_uncased"] = uncased_score
            stats["bleu_cased"] = cased_score
        return stats
Example #9
    def train(self):
        """Trains the model."""
        params, flags_obj, is_train = self.params, self.flags_obj, True
        model = transformer.create_model(params, is_train)
        opt = self._create_optimizer()

        model.compile(opt, target_tensors=[])
        model.summary()
        self._load_weights_if_possible(model, flags_obj.init_weight_path)

        cur_log_dir = _get_log_dir_or_default(flags_obj)
        _ensure_dir(cur_log_dir)

        map_data_fn = data_pipeline.map_data_for_transformer_fn
        train_ds = data_pipeline.train_input_fn(params)
        train_ds = train_ds.map(
            map_data_fn, num_parallel_calls=params["num_parallel_calls"])
        valid_ds = data_pipeline.eval_input_fn(params)
        valid_ds = valid_ds.map(
            map_data_fn, num_parallel_calls=params["num_parallel_calls"])

        init_epoch = flags_obj.init_epoch or 0
        init_steps = init_epoch * flags_obj.steps_per_epoch
        callbacks = self._create_callbacks(cur_log_dir, init_steps, params)

        history = model.fit(train_ds,
                            initial_epoch=init_epoch,
                            epochs=flags_obj.train_epochs,
                            steps_per_epoch=flags_obj.steps_per_epoch,
                            validation_data=valid_ds,
                            validation_steps=flags_obj.validation_steps,
                            callbacks=callbacks)
        tf.compat.v1.logging.info("\nTrain history: {}".format(
            history.history))

        save_weight_path = os.path.join(cur_log_dir,
                                        "saves-model-weights.hdf5")
        save_model_path = os.path.join(cur_log_dir, "saves-model.hdf5")
        model.save_weights(save_weight_path)
        model.save(save_model_path)
Example #10
    def train(self):
        """Trains the model."""
        params = self.params
        flags_obj = self.flags_obj
        # Sets config options (optionally enables XLA compilation).
        keras_utils.set_session_config(
            enable_xla=flags_obj.enable_xla)

        _ensure_dir(flags_obj.model_dir)
        with distribution_utils.get_strategy_scope(self.distribution_strategy):
            model = transformer.create_model(params, is_train=True)
            opt = self._create_optimizer()

            # Restore the latest checkpoint, if any.
            current_step = 0
            checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
            latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
            if latest_checkpoint:
                checkpoint.restore(latest_checkpoint)
                logging.info("Loaded checkpoint %s", latest_checkpoint)
                # The optimizer's iteration count doubles as the resumed global step.
                current_step = opt.iterations.numpy()

            # Mean loss metric for the distributed custom training loop.
            if params["use_ctl"]:
                train_loss_metric = tf.keras.metrics.Mean(
                    "training_loss", dtype=tf.float32)
            else:
                # Training configuration: optimizer, loss, etc.
                model.compile(opt)

        # Print the model architecture.
        model.summary()

        if self.use_tpu:
            # Different from experimental_distribute_dataset,
            # experimental_distribute_datasets_from_function requires
            # per-replica/local batch size.
            params["batch_size"] /= self.distribution_strategy.num_replicas_in_sync
            train_ds = (
                self.distribution_strategy
                    .experimental_distribute_datasets_from_function(
                    lambda ctx: data_pipeline.train_input_fn(params, ctx)))
        else:
            # Parallel sentence pairs.
            train_ds = data_pipeline.train_input_fn(params)
            map_data_fn = data_pipeline.map_data_for_transformer_fn
            train_ds = train_ds.map(
                map_data_fn, num_parallel_calls=params["num_parallel_calls"])
        if params["use_ctl"]:
            train_ds_iterator = iter(train_ds)

        callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

        # TODO(b/139418525): Refactor the custom training loop logic.
        @tf.function
        def train_steps(iterator, steps):
            """Training steps function for TPU runs.

            Args:
              iterator: The input iterator of the training dataset.
              steps: An integer, the number of training steps.

            Returns:
              A float, the loss value.
            """

            def _step_fn(inputs):
                """Per-replica step function."""
                inputs, targets = inputs
                with tf.GradientTape() as tape:
                    logits = model([inputs, targets], training=True)
                    loss = metrics.transformer_loss(logits, targets,
                                                    params["label_smoothing"],
                                                    params["vocab_size"])
                    # Scales the loss, which results in using the average loss across all
                    # of the replicas for backprop.
                    scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync

                # De-dupes variables due to keras tracking issues.
                tvars = list({id(v): v for v in model.trainable_variables}.values())
                grads = tape.gradient(scaled_loss, tvars)
                opt.apply_gradients(zip(grads, tvars))
                # For reporting, the metric takes the mean of losses.
                train_loss_metric.update_state(loss)

            for _ in tf.range(steps):
                train_loss_metric.reset_states()
                self.distribution_strategy.experimental_run_v2(
                    _step_fn, args=(next(iterator),))

        cased_score, uncased_score = None, None
        cased_score_history, uncased_score_history = [], []
        while current_step < flags_obj.train_steps:
            remaining_steps = flags_obj.train_steps - current_step
            train_steps_per_eval = (
                remaining_steps if remaining_steps < flags_obj.steps_between_evals
                else flags_obj.steps_between_evals)
            current_iteration = current_step // flags_obj.steps_between_evals

            logging.info(
                "Start train iteration at global step:{}".format(current_step))
            history = None
            # TPU runs use the train_steps function defined above;
            # GPU runs call model.fit() directly.
            if params["use_ctl"]:
                if not self.use_tpu:
                    raise NotImplementedError(
                        "Custom training loop on GPUs is not implemented.")
                # Runs training steps.
                train_steps(train_ds_iterator,
                            tf.convert_to_tensor(train_steps_per_eval, dtype=tf.int32))
                current_step += train_steps_per_eval
                train_loss = train_loss_metric.result().numpy().astype(float)
                logging.info("Train Step: %d/%d / loss = %s",
                             current_step, flags_obj.train_steps, train_loss)

                checkpoint_name = checkpoint.save(
                    os.path.join(
                        flags_obj.model_dir,
                        "ctl_step_{}.ckpt".format(current_step)))
                logging.info("Saved checkpoint to %s", checkpoint_name)
            else:
                if self.use_tpu:
                    raise NotImplementedError(
                        "Keras model.fit on TPUs is not implemented.")
                history = model.fit(
                    train_ds,
                    initial_epoch=current_iteration,
                    epochs=current_iteration + 1,
                    steps_per_epoch=train_steps_per_eval,
                    callbacks=callbacks,
                    # If TimeHistory is enabled, progress bar would be messy. Increase
                    # the verbose level to get rid of it.
                    verbose=(2 if flags_obj.enable_time_history else 1))
                current_step += train_steps_per_eval
                logging.info("Train history: {}".format(history.history))

            logging.info("End train iteration at global step:{}".format(current_step))

            if (flags_obj.bleu_source and flags_obj.bleu_ref):
                # Cased and uncased BLEU scores.
                uncased_score, cased_score = self.eval()
                cased_score_history.append([current_iteration + 1, cased_score])
                uncased_score_history.append([current_iteration + 1, uncased_score])

        stats = ({"loss": train_loss} if history is None
                 else misc.build_stats(history, callbacks))
        if uncased_score and cased_score:
            stats["bleu_uncased"] = uncased_score
            stats["bleu_cased"] = cased_score
            stats["bleu_uncased_history"] = uncased_score_history
            stats["bleu_cased_history"] = cased_score_history
        return stats
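
A minimal, self-contained sketch of the checkpointing pattern in the train() above: tf.train.Checkpoint tracks the model and optimizer, the latest checkpoint in the model directory is restored if present, and the global step is recovered from the optimizer's iteration counter. The toy model and directory path are illustrative.

import os
import tensorflow as tf

model_dir = "/tmp/toy_ckpt_dir"  # illustrative path
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
opt = tf.keras.optimizers.Adam()

checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
latest = tf.train.latest_checkpoint(model_dir)
current_step = 0
if latest:
  checkpoint.restore(latest)
  # optimizer.iterations counts apply_gradients calls, so it doubles as the
  # resumed global step.
  current_step = opt.iterations.numpy()

# ... training steps that call opt.apply_gradients(...) would go here ...

# Save with the step encoded in the checkpoint name, as in the example above.
checkpoint.save(os.path.join(model_dir, "ctl_step_{}.ckpt".format(current_step)))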
Example #11
  def train(self):
    """Trains the model."""
    params, flags_obj, is_train = self.params, self.flags_obj, True
    # Sets config options.
    keras_utils.set_session_config(
        enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    if self.distribution_strategy:
      with self.distribution_strategy.scope():
        model = transformer.create_model(params, is_train)
        opt = self._create_optimizer()
        model.compile(opt)
    else:
      model = transformer.create_model(params, is_train)
      opt = self._create_optimizer()
      model.compile(opt)

    model.summary()

    train_ds = data_pipeline.train_input_fn(params)
    map_data_fn = data_pipeline.map_data_for_transformer_fn
    train_ds = train_ds.map(map_data_fn,
                            num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
      flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    for i in range(1, iterations + 1):
      print("Start train iteration:{}/{}".format(i, iterations))
      history = model.fit(
          train_ds,
          initial_epoch=i-1,
          epochs=i,
          steps_per_epoch=flags_obj.steps_between_evals,
          callbacks=callbacks,
          # If TimeHistory is enabled, progress bar would be messy. Increase the
          # verbose level to get rid of it.
          verbose=(2 if flags_obj.enable_time_history else 1))
      print("End train iteration:{}/{} global step:{}".format(
          i,
          iterations,
          i*flags_obj.steps_between_evals))
      tf.compat.v1.logging.info("Train history: {}".format(history.history))
      stats = misc.build_stats(history, callbacks)

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()
        cased_score_history.append([i, cased_score])
        uncased_score_history.append([i, uncased_score])

    stats = misc.build_stats(history, callbacks)
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
      stats["bleu_uncased_history"] = uncased_score_history
      stats["bleu_cased_history"] = cased_score_history
    return stats
Example #12
  def train(self):
    """Trains the model."""
    params = self.params
    flags_obj = self.flags_obj
    # Sets config options.
    keras_utils.set_session_config(
        enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    with distribution_utils.get_strategy_scope(self.distribution_strategy):
      model = transformer.create_model(params, is_train=True)
      opt = self._create_optimizer()
      if params["use_ctl"]:
        train_loss_metric = tf.keras.metrics.Mean(
            "training_loss", dtype=tf.float32)
      else:
        model.compile(opt)

    model.summary()

    train_ds = data_pipeline.train_input_fn(params)
    if self.use_tpu:
      if params["is_tpu_pod"]:
        train_ds = (
            self.distribution_strategy
            .experimental_distribute_datasets_from_function(
                lambda ctx: data_pipeline.train_input_fn(params, ctx)))
      else:
        train_ds = (
            self.distribution_strategy.experimental_distribute_dataset(train_ds)
        )
    else:
      map_data_fn = data_pipeline.map_data_for_transformer_fn
      train_ds = train_ds.map(
          map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    # TODO(b/139418525): Refactor the custom training loop logic.
    @tf.function
    def train_steps(iterator, steps):
      """Training steps function for TPU runs.

      Args:
        iterator: The input iterator of the training dataset.
        steps: An integer, the number of training steps.

      Returns:
        A float, the loss value.
      """

      def _step_fn(inputs):
        """Per-replica step function."""
        inputs, targets = inputs
        with tf.GradientTape() as tape:
          logits = model([inputs, targets], training=True)
          loss = metrics.transformer_loss(logits, targets,
                                          params["label_smoothing"],
                                          params["vocab_size"])
          # Scales the loss, which results in using the average loss across all
          # of the replicas for backprop.
          scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync

        # De-dupes variables due to keras tracking issues.
        tvars = list(
            object_identity.ObjectIdentitySet(model.trainable_variables))
        grads = tape.gradient(scaled_loss, tvars)
        opt.apply_gradients(zip(grads, tvars))
        # For reporting, the metric takes the mean of losses.
        train_loss_metric.update_state(loss)

      for _ in tf.range(steps):
        train_loss_metric.reset_states()
        self.distribution_strategy.experimental_run_v2(
            _step_fn, args=(next(iterator),))

    if self.use_tpu:
      checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
      latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
      if latest_checkpoint:
        checkpoint.restore(latest_checkpoint)
        logging.info("Loaded checkpoint %s", latest_checkpoint)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
      flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    for i in range(1, iterations + 1):
      print("Start train iteration:{}/{}".format(i, iterations))
      history = None
      if params["use_ctl"]:
        if not self.use_tpu:
          raise NotImplementedError(
              "Custom training loop on GPUs is not implemented.")
        train_steps_per_eval = tf.convert_to_tensor(
            flags_obj.steps_between_evals, dtype=tf.int32)

        # Runs training steps.
        train_steps(iter(train_ds), train_steps_per_eval)
        train_loss = train_loss_metric.result().numpy().astype(float)
        logging.info("Train Step: %d/%d / loss = %s",
                     i * flags_obj.steps_between_evals, flags_obj.train_steps,
                     train_loss)

        checkpoint_name = checkpoint.save(
            os.path.join(
                flags_obj.model_dir,
                "ctl_step_{}.ckpt".format(i * flags_obj.steps_between_evals)))
        logging.info("Saved checkpoint to %s", checkpoint_name)
      else:
        if self.use_tpu:
          raise NotImplementedError(
              "Keras model.fit on TPUs is not implemented.")
        history = model.fit(
            train_ds,
            initial_epoch=i - 1,
            epochs=i,
            steps_per_epoch=flags_obj.steps_between_evals,
            callbacks=callbacks,
            # If TimeHistory is enabled, progress bar would be messy. Increase
            # the verbose level to get rid of it.
            verbose=(2 if flags_obj.enable_time_history else 1))
        logging.info("Train history: {}".format(history.history))

      print("End train iteration:{}/{} global step:{}".format(
          i,
          iterations,
          i*flags_obj.steps_between_evals))

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()
        cased_score_history.append([i, cased_score])
        uncased_score_history.append([i, uncased_score])

    stats = ({
        "loss": train_loss
    } if history is None else misc.build_stats(history, callbacks))
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
      stats["bleu_uncased_history"] = uncased_score_history
      stats["bleu_cased_history"] = cased_score_history
    return stats
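
A minimal, self-contained sketch of the custom-training-loop pattern shared by the examples above, written against the current tf.distribute API (strategy.run rather than the older experimental_run_v2): the per-replica loss is scaled by num_replicas_in_sync before backprop, while the unscaled loss feeds a Mean metric for reporting. The toy model, data, and step counts are illustrative.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # falls back to one device on CPU
GLOBAL_BATCH = 8

with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  model.build((None, 4))
  opt = tf.keras.optimizers.Adam()
  train_loss_metric = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)

ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal([64, 4]), tf.random.normal([64, 1])))
dist_ds = strategy.experimental_distribute_dataset(ds.batch(GLOBAL_BATCH).repeat())
iterator = iter(dist_ds)

@tf.function
def train_steps(iterator, steps):
  """Runs `steps` training steps, mirroring the train_steps above."""

  def step_fn(inputs):
    features, targets = inputs
    with tf.GradientTape() as tape:
      preds = model(features, training=True)
      loss = tf.reduce_mean(tf.square(preds - targets))
      # Scale the loss so gradients summed across replicas correspond to the
      # average loss over the global batch.
      scaled_loss = loss / strategy.num_replicas_in_sync
    grads = tape.gradient(scaled_loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    # Report the unscaled per-replica loss.
    train_loss_metric.update_state(loss)

  for _ in tf.range(steps):
    strategy.run(step_fn, args=(next(iterator),))

train_steps(iterator, tf.constant(10))
print("loss:", train_loss_metric.result().numpy())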