Example #1
 def eval(self):
     """Evaluates the model."""
     if not self.predict_model:
         self.predict_model = transformer.create_model(self.params, False)
     self._load_weights_if_possible(
         self.predict_model,
         tf.train.latest_checkpoint(self.flags_obj.model_dir))
     self.predict_model.summary()
Example #2
 def test_create_model_not_train(self):
     model = transformer.create_model(self.params, False)
     inputs, outputs = model.inputs, model.outputs
     self.assertEqual(len(inputs), 1)
     self.assertEqual(len(outputs), 2)
     self.assertEqual(inputs[0].shape.as_list(), [None, None])
     self.assertEqual(inputs[0].dtype, tf.int64)
     self.assertEqual(outputs[0].shape.as_list(), [None, None])
     self.assertEqual(outputs[0].dtype, tf.int32)
     self.assertEqual(outputs[1].shape.as_list(), [None])
     self.assertEqual(outputs[1].dtype, tf.float32)
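Read together with Example #7 below, this test pins down the inference contract of `create_model(params, False)`: one `int64` input of source-token IDs shaped `[batch, length]`, and two outputs, the decoded `int32` IDs of shape `[batch, decoded_length]` plus one `float32` score per sequence. The following is a minimal sketch of exercising that contract; the import path and the use of `model_params.TINY_PARAMS` are assumptions about the TensorFlow Models layout these snippets appear to come from, not something shown in the examples themselves.

import numpy as np
# Assumed module paths (TensorFlow Models garden); other checkouts may expose the
# same code as official.transformer.v2 or official.legacy.transformer instead.
from official.nlp.transformer import model_params, transformer

params = model_params.TINY_PARAMS                 # small hyper-parameter dict (assumed)
model = transformer.create_model(params, False)   # is_train=False -> inference model

# One dummy batch of source-token IDs: int64, shape [batch, source_length].
src = np.ones([2, 7], dtype=np.int64)

decoded_ids, scores = model.predict(src)          # matches Example #7's `val_outputs, _ = ret`
print(decoded_ids.shape, decoded_ids.dtype)       # (2, <decoded_length>) int32
print(scores.shape, scores.dtype)                 # (2,) float32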
Example #3
File: dtitle.py Project: gang4gh/dl
 def create_model(self, mode):
     logging.info('use_reformer = {}'.format(self.flags_obj.use_reformer))
     if self.flags_obj.use_reformer:
         logging.info(
             f'num_hashes, test_num_hashes = {self.params["num_hashes"]}, {self.params["test_num_hashes"]}'
         )
         logging.info(
             f'allow_duplicated_attention = {self.flags_obj.allow_duplicated_attention}'
         )
         return reformer.create_model(self.params, mode=mode)
     else:
         return transformer.create_model(self.params, mode=mode)
Example #4
    def train(self):
        """Trains the model."""
        params = self.params
        flags_obj = self.flags_obj

        _ensure_dir(flags_obj.model_dir)

        model = transformer.create_model(params, is_train=True)
        opt = self._create_optimizer()

        current_step = 0
        checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            current_step = opt.iterations.numpy()
        model.compile(opt)
        model.summary()

        train_ds = data_pipeline.train_input_fn(params)
        map_data_fn = data_pipeline.map_data_for_transformer_fn
        train_ds = train_ds.map(
            map_data_fn, num_parallel_calls=params['num_parallel_calls'])

        callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

        while current_step < flags_obj.train_steps:
            remaining_steps = flags_obj.train_steps - current_step
            train_steps_per_eval = (remaining_steps if remaining_steps <
                                    flags_obj.steps_between_evals else
                                    flags_obj.steps_between_evals)
            current_iteration = current_step // flags_obj.steps_between_evals

            logging.info(
                "Start train iteration at global step:{}".format(current_step))
            history = model.fit(
                train_ds,
                initial_epoch=current_iteration,
                epochs=current_iteration + 1,
                steps_per_epoch=train_steps_per_eval,
                callbacks=callbacks,
                # If TimeHistory is enabled, the progress bar gets messy; increase
                # the verbose level to suppress it.
                verbose=1)
            current_step += train_steps_per_eval
            logging.info("Train history: {}".format(history.history))

            logging.info(
                "End train iteration at global step:{}".format(current_step))

        stats = _build_stats(history, callbacks)
        return stats
Example #5
    def eval(self):
        """Evaluates the model."""
        self.params['train'] = False

        with distribution_utils.get_strategy_scope(self.distribution_strategy):
            if not self.predict_model:
                self.predict_model = transformer.create_model(self.params, False)
            self._load_weights_if_possible(
                self.predict_model,
                tf.train.latest_checkpoint(self.flags_obj.model_dir))
            self.predict_model.summary()
        return evaluate_and_log_bleu(
            self.predict_model, self.params, self.flags_obj.bleu_source,
            self.flags_obj.bleu_ref, self.flags_obj.vocab_file,
            self.distribution_strategy if self.use_tpu else None)
Example #6
    def eval(self):
        """Evaluates the model."""

        # We only want to create the model under DS scope for TPU case.
        # When 'distribution_strategy' is None, a no-op DummyContextManager will
        # be used.
        if not self.predict_model:
            self.predict_model = transformer.create_model(self.params, False)
        self._load_weights_if_possible(
            self.predict_model,
            tf.train.latest_checkpoint(self.flags_obj.model_dir))
        self.predict_model.summary()
        return evaluate_and_log_bleu(self.predict_model, self.params,
                                     self.flags_obj.bleu_source,
                                     self.flags_obj.bleu_ref,
                                     self.flags_obj.vocab_file)
Example #7
    def predict(self):
        """Predicts result from the model."""
        params = self.params
        flags_obj = self.flags_obj

        with tf.name_scope("model"):
            model = transformer.create_model(params, is_train=False)
            self._load_weights_if_possible(
                model, tf.train.latest_checkpoint(self.flags_obj.model_dir))
            model.summary()
        subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

        ds = data_pipeline.eval_input_fn(params)
        ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)
        ret = model.predict(ds)
        val_outputs, _ = ret
        length = len(val_outputs)
        for i in range(length):
            translate.translate_from_input(val_outputs[i], subtokenizer)
Example #8
    def train(self):
        """Trains the model."""
        params = self.params
        flags_obj = self.flags_obj
        # Sets config options.
        keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

        _ensure_dir(flags_obj.model_dir)
        with distribution_utils.get_strategy_scope(self.distribution_strategy):
            model = transformer.create_model(params, is_train=True)
            opt = self._create_optimizer()

            current_step = 0
            checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
            latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
            if latest_checkpoint:
                # checkpoint.restore(latest_checkpoint)
                model.load_weights(latest_checkpoint)
                logging.info("Loaded checkpoint %s", latest_checkpoint)
                current_step = opt.iterations.numpy()

            if params["use_ctl"]:
                train_loss_metric = tf.keras.metrics.Mean("training_loss",
                                                          dtype=tf.float32)
                if params["enable_tensorboard"]:
                    summary_writer = tf.compat.v2.summary.create_file_writer(
                        flags_obj.model_dir)
                else:
                    summary_writer = tf.compat.v2.summary.create_noop_writer()
                train_metrics = [train_loss_metric]
                if params["enable_metrics_in_training"]:
                    train_metrics = train_metrics + model.metrics
            else:
                model.compile(opt)

        model.summary()

        if self.use_tpu:
            # Different from experimental_distribute_dataset,
            # experimental_distribute_datasets_from_function requires
            # per-replica/local batch size.
            params[
                "batch_size"] /= self.distribution_strategy.num_replicas_in_sync
            train_ds = (
                self.distribution_strategy.
                experimental_distribute_datasets_from_function(
                    lambda ctx: data_pipeline.train_input_fn(params, ctx)))
        else:
            train_ds = data_pipeline.train_input_fn(params)
            map_data_fn = data_pipeline.map_data_for_transformer_fn
            train_ds = train_ds.map(
                map_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        if params["use_ctl"]:
            train_ds_iterator = iter(train_ds)

        callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

        # Only TimeHistory callback is supported for CTL
        if params["use_ctl"]:
            callbacks = [
                cb for cb in callbacks
                if isinstance(cb, keras_utils.TimeHistory)
            ]

        # TODO(b/139418525): Refactor the custom training loop logic.
        @tf.function
        def train_steps(iterator, steps):
            """Training steps function for TPU runs.

            Args:
              iterator: The input iterator of the training dataset.
              steps: An integer, the number of training steps.

            Returns:
              A float, the loss value.
            """
            def _step_fn(inputs):
                """Per-replica step function."""
                inputs, targets = inputs
                with tf.GradientTape() as tape:
                    logits = model([inputs, targets], training=True)
                    loss = metrics.transformer_loss(logits, targets,
                                                    params["label_smoothing"],
                                                    params["vocab_size"])
                    # Scales the loss, which results in using the average loss across all
                    # of the replicas for backprop.
                    scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync

                # De-dupes variables due to keras tracking issues.
                tvars = list({id(v): v
                              for v in model.trainable_variables}.values())
                grads = tape.gradient(scaled_loss, tvars)
                opt.apply_gradients(zip(grads, tvars))
                # For reporting, the metric takes the mean of losses.
                train_loss_metric.update_state(loss)

            for _ in tf.range(steps):
                train_loss_metric.reset_states()
                self.distribution_strategy.run(_step_fn,
                                               args=(next(iterator), ))

        cased_score, uncased_score = None, None
        cased_score_history, uncased_score_history = [], []
        while current_step < flags_obj.train_steps:
            remaining_steps = flags_obj.train_steps - current_step
            train_steps_per_eval = (remaining_steps if remaining_steps <
                                    flags_obj.steps_between_evals else
                                    flags_obj.steps_between_evals)
            current_iteration = current_step // flags_obj.steps_between_evals

            logging.info(
                "Start train iteration at global step:{}".format(current_step))
            history = None
            if params["use_ctl"]:
                if not self.use_tpu:
                    raise NotImplementedError(
                        "Custom training loop on GPUs is not implemented.")

                # Runs training steps.
                with summary_writer.as_default():
                    for cb in callbacks:
                        cb.on_epoch_begin(current_iteration)
                        cb.on_batch_begin(0)

                    train_steps(
                        train_ds_iterator,
                        tf.convert_to_tensor(train_steps_per_eval,
                                             dtype=tf.int32))
                    current_step += train_steps_per_eval
                    train_loss = train_loss_metric.result().numpy().astype(
                        float)
                    logging.info("Train Step: %d/%d / loss = %s", current_step,
                                 flags_obj.train_steps, train_loss)

                    for cb in callbacks:
                        cb.on_batch_end(train_steps_per_eval - 1)
                        cb.on_epoch_end(current_iteration)

                    if params["enable_tensorboard"]:
                        for metric_obj in train_metrics:
                            tf.compat.v2.summary.scalar(
                                metric_obj.name, metric_obj.result(),
                                current_step)
                            summary_writer.flush()

                for cb in callbacks:
                    cb.on_train_end()

                if flags_obj.enable_checkpointing:
                    # avoid check-pointing when running for benchmarking.
                    checkpoint_name = checkpoint.save(
                        os.path.join(flags_obj.model_dir,
                                     "ctl_step_{}.ckpt".format(current_step)))
                    logging.info("Saved checkpoint to %s", checkpoint_name)
            else:
                if self.use_tpu:
                    raise NotImplementedError(
                        "Keras model.fit on TPUs is not implemented.")
                history = model.fit(
                    train_ds,
                    initial_epoch=current_iteration,
                    epochs=current_iteration + 1,
                    steps_per_epoch=train_steps_per_eval,
                    callbacks=callbacks,
                    # If TimeHistory is enabled, the progress bar gets messy; increase
                    # the verbose level to suppress it.
                    verbose=(2 if flags_obj.enable_time_history else 1))
                current_step += train_steps_per_eval
                logging.info("Train history: {}".format(history.history))

            logging.info(
                "End train iteration at global step:{}".format(current_step))

            if (flags_obj.bleu_source and flags_obj.bleu_ref):
                uncased_score, cased_score = self.eval()
                cased_score_history.append(
                    [current_iteration + 1, cased_score])
                uncased_score_history.append(
                    [current_iteration + 1, uncased_score])

        stats = ({"loss": train_loss} if history is None else {})
        misc.update_stats(history, stats, callbacks)
        if uncased_score and cased_score:
            stats["bleu_uncased"] = uncased_score
            stats["bleu_cased"] = cased_score
            stats["bleu_uncased_history"] = uncased_score_history
            stats["bleu_cased_history"] = cased_score_history
        return stats
Example #9
    def train(self):
        """Trains the model."""
        params = self.params
        flags_obj = self.flags_obj
        # Sets config options.
        keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

        _ensure_dir(flags_obj.model_dir)
        model = transformer.create_model(params, is_train=True)
        opt = self._create_optimizer()

        current_step = 0
        checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info("Loaded checkpoint %s", latest_checkpoint)
            current_step = opt.iterations.numpy()

        if params["use_ctl"]:
            train_loss_metric = tf.keras.metrics.Mean("training_loss",
                                                      dtype=tf.float32)
            if params["enable_tensorboard"]:
                summary_writer = tf.compat.v2.summary.create_file_writer(
                    flags_obj.model_dir)
            else:
                summary_writer = tf.compat.v2.summary.create_noop_writer()
            train_metrics = [train_loss_metric]
            if params["enable_metrics_in_training"]:
                train_metrics = train_metrics + model.metrics
        else:
            model.compile(opt)

        model.summary()

        train_ds = data_pipeline.train_input_fn(params)
        train_ds = train_ds.map(
            lambda x, y: ((x, y), ),
            num_parallel_calls=params["num_parallel_calls"])
        if params["use_ctl"]:
            train_ds_iterator = iter(train_ds)

        callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

        # TODO(b/139418525): Refactor the custom training loop logic.
        @tf.function
        def train_steps(iterator, steps):
            """Training steps function for TPU runs.

            Args:
              iterator: The input iterator of the training dataset.
              steps: An integer, the number of training steps.

            Returns:
              A float, the loss value.
            """
            def _step_fn(inputs):
                """Per-replica step function."""
                inputs, targets = inputs
                with tf.GradientTape() as tape:
                    logits = model([inputs, targets], training=True)
                    loss = metrics.transformer_loss(logits, targets,
                                                    params["label_smoothing"],
                                                    params["vocab_size"])

                # De-dupes variables due to keras tracking issues.
                tvars = list({id(v): v
                              for v in model.trainable_variables}.values())
                grads = tape.gradient(loss, tvars)
                opt.apply_gradients(zip(grads, tvars))
                # For reporting, the metric takes the mean of losses.
                train_loss_metric.update_state(loss)

            for _ in tf.range(steps):
                train_loss_metric.reset_states()
                _step_fn(next(iterator))

        cased_score, uncased_score = None, None
        cased_score_history, uncased_score_history = [], []
        while current_step < flags_obj.train_steps:
            remaining_steps = flags_obj.train_steps - current_step
            train_steps_per_eval = (remaining_steps if remaining_steps <
                                    flags_obj.steps_between_evals else
                                    flags_obj.steps_between_evals)
            current_iteration = current_step // flags_obj.steps_between_evals

            logging.info(
                "Start train iteration at global step:{}".format(current_step))
            history = None
            if params["use_ctl"]:
                # Runs training steps.
                with summary_writer.as_default():
                    train_steps(
                        train_ds_iterator,
                        tf.convert_to_tensor(train_steps_per_eval,
                                             dtype=tf.int32))
                    current_step += train_steps_per_eval
                    train_loss = train_loss_metric.result().numpy().astype(
                        float)
                    logging.info("Train Step: %d/%d / loss = %s", current_step,
                                 flags_obj.train_steps, train_loss)

                    if params["enable_tensorboard"]:
                        for metric_obj in train_metrics:
                            tf.summary.scalar(metric_obj.name,
                                              metric_obj.result(),
                                              current_step)

                checkpoint_name = checkpoint.save(
                    os.path.join(flags_obj.model_dir,
                                 "ctl_step_{}.ckpt".format(current_step)))
                logging.info("Saved checkpoint to %s", checkpoint_name)
            else:
                history = model.fit(
                    train_ds,
                    initial_epoch=current_iteration,
                    epochs=current_iteration + 1,
                    steps_per_epoch=train_steps_per_eval,
                    callbacks=callbacks,
                    # If TimeHistory is enabled, the progress bar gets messy; increase
                    # the verbose level to suppress it.
                    verbose=(2 if flags_obj.enable_time_history else 1))
                current_step += train_steps_per_eval
                logging.info("Train history: {}".format(history.history))

            logging.info(
                "End train iteration at global step:{}".format(current_step))

            if (flags_obj.bleu_source and flags_obj.bleu_ref):
                uncased_score, cased_score = self.eval()
                cased_score_history.append(
                    [current_iteration + 1, cased_score])
                uncased_score_history.append(
                    [current_iteration + 1, uncased_score])

        stats = ({
            "loss": train_loss
        } if history is None else misc.build_stats(history, callbacks))
        if uncased_score and cased_score:
            stats["bleu_uncased"] = uncased_score
            stats["bleu_cased"] = cased_score
            stats["bleu_uncased_history"] = uncased_score_history
            stats["bleu_cased_history"] = cased_score_history
        return stats
Example #10
    def train(self):
        """Trains the model."""
        self.params['train'] = True
        params = self.params
        flags_obj = self.flags_obj
        # Sets config options.
        keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

        _ensure_dir(flags_obj.model_dir)
        with distribution_utils.get_strategy_scope(self.distribution_strategy):
            model = transformer.create_model(params, is_train=True)
            opt = self._create_optimizer()

            current_step = 0
            checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
            latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
            if latest_checkpoint:
                checkpoint.restore(latest_checkpoint)
                logging.info("###Loaded checkpoint %s", latest_checkpoint)
                current_step = opt.iterations.numpy()
                logging.info(current_step)

            model.compile(opt)

        model.summary()

        if self.use_tpu:
            # Different from experimental_distribute_dataset,
            # experimental_distribute_datasets_from_function requires
            # per-replica/local batch size.
            params[
                "batch_size"] /= self.distribution_strategy.num_replicas_in_sync
            train_ds = (
                self.distribution_strategy.
                experimental_distribute_datasets_from_function(
                    lambda ctx: data_pipeline.train_input_fn(params, ctx)))
        else:
            train_ds = data_pipeline.train_input_fn(params)
            map_data_fn = data_pipeline.map_data_for_transformer_fn
            train_ds = train_ds.map(
                map_data_fn, num_parallel_calls=params["num_parallel_calls"])
        # if params["use_ctl"]:
        #     train_ds_iterator = iter(train_ds)

        callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

        # TODO(b/139418525): Refactor the custom training loop logic.
        train_loss = 0

        # cased_score, uncased_score = None, None
        # cased_score_history, uncased_score_history = [], []
        while current_step < flags_obj.train_steps:
            remaining_steps = flags_obj.train_steps - current_step
            train_steps_per_eval = (remaining_steps if remaining_steps <
                                    flags_obj.steps_between_evals else
                                    flags_obj.steps_between_evals)
            current_iteration = current_step // flags_obj.steps_between_evals

            logging.info(
                "Start train iteration at global step:{}".format(current_step))
            history = None
            if params["use_ctl"]:
                if not self.use_tpu:
                    raise NotImplementedError(
                        "Custom training loop on GPUs is not implemented.")
                # Runs training steps.
                # train_steps(train_ds_iterator,
                #             tf.convert_to_tensor(train_steps_per_eval, dtype=tf.int32))
                # current_step += train_steps_per_eval
                # train_loss = train_loss_metric.result().numpy().astype(float)
                # logging.info("Train Step: %d/%d / loss = %s",
                #              current_step, flags_obj.train_steps, train_loss)
                #
                # checkpoint_name = checkpoint.save(
                #     os.path.join(
                #         flags_obj.model_dir,
                #         "ctl_step_{}.ckpt".format(current_step)))
                # logging.info("Saved checkpoint to %s", checkpoint_name)
            else:
                if self.use_tpu:
                    raise NotImplementedError(
                        "Keras model.fit on TPUs is not implemented.")
                history = model.fit(
                    train_ds,
                    initial_epoch=current_iteration,
                    epochs=current_iteration + 1,
                    steps_per_epoch=train_steps_per_eval,
                    callbacks=callbacks,
                    # If TimeHistory is enabled, the progress bar gets messy; increase
                    # the verbose level to suppress it.
                    verbose=(2 if flags_obj.enable_time_history else 1))
                current_step += train_steps_per_eval
                logging.info("Train history: {}".format(history.history))

            logging.info(
                "End train iteration at global step:{}".format(current_step))

            # if (flags_obj.bleu_source and flags_obj.bleu_ref):
            #     uncased_score, cased_score = self.eval()
            #     cased_score_history.append([current_iteration + 1, cased_score])
            #     uncased_score_history.append([current_iteration + 1, uncased_score])

        stats = ({
            "loss": train_loss
        } if history is None else misc.build_stats(history, callbacks))
        # if uncased_score and cased_score:
        #     stats["bleu_uncased"] = uncased_score
        #     stats["bleu_cased"] = cased_score
        #     stats["bleu_uncased_history"] = uncased_score_history
        #     stats["bleu_cased_history"] = cased_score_history
        return stats