Example #1
# NOTE: the original example is truncated here; the imports and the body of
# `loss` below are an assumed reconstruction.  The snippet also relies on an
# already-built `model`, parsed `args`, and `train_input_fn`/`eval_input_fn`
# batch generators, plus `EagerOptimizer` and `SET_TRAIN_FLAG` imported from
# the surrounding script.
import time

import numpy as np
import tensorflow as tf


def loss(model, h, x, y):
    # Stateful LM loss: thread the hidden state `h` through the model call
    x['h'] = h  # assumed convention for passing the recurrent state
    logits, h = model(x)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
    return loss, h


optimizer = EagerOptimizer(loss, optim="adam", lr=args.lr)
for epoch in range(args.epochs):

    loss_accum = 0.
    step = 0
    start = time.time()
    h = None

    SET_TRAIN_FLAG(True)

    for x, y in train_input_fn():
        # Optimize the model
        loss_value, h = optimizer.update_with_hidden(model, h, x, y)
        loss_accum += loss_value
        step += 1
    print('training time {}'.format(time.time() - start))

    mean_loss = loss_accum / step
    print('Training Loss {}, Perplexity {}'.format(mean_loss,
                                                   np.exp(mean_loss)))

    step = 0
    loss_accum = 0
    SET_TRAIN_FLAG(False)

    for x, y in eval_input_fn():
        # Track progress on the held-out set.  The original example is
        # truncated here; the loop body below is an assumed completion that
        # mirrors the training loop, minus the optimizer update.
        loss_value, h = loss(model, h, x, y)
        loss_accum += loss_value
        step += 1

    mean_loss = loss_accum / step
    print('Valid Loss {}, Perplexity {}'.format(mean_loss, np.exp(mean_loss)))
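
The `EagerOptimizer` used above wraps the loss function and performs the gradient step on each call. Conceptually, a stateful update such as `update_with_hidden` boils down to a `tf.GradientTape` step; the sketch below is an illustrative approximation under that assumption, not the library's actual implementation, and `loss_fn`/`optimizer` are placeholder names.

import tensorflow as tf

def update_with_hidden(model, loss_fn, optimizer, h, x, y):
    # Illustrative sketch only: run the stateful loss under a tape,
    # then apply the gradients with a standard Keras optimizer.
    with tf.GradientTape() as tape:
        loss_value, h = loss_fn(model, h, x, y)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss_value, h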
Example #2
class LanguageModelTrainerDistributedTf(Trainer):
    """A Trainer for LM distributed eager training
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if type(model_params) is dict:
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)

        strategy_type = kwargs.get('strategy_type', 'mirror')
        gpus = int(kwargs.get('gpus', 1))
        endpoint = kwargs.get('endpoint')
        self.strategy = create_distribute_strategy(strategy_type, gpus,
                                                   endpoint)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def train(self, ts, reporting_fns, steps=0, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param steps: (`int`) The number of steps to run for this epoch
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_train_step_no_state(inputs):

            features, y = inputs
            per_replica_loss = self.optimizer.update(self.model, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        def _replicated_train_step_with_state(inputs, hidden):
            features, y = inputs
            per_replica_loss, new_hidden = self.optimizer.update_with_hidden(
                self.model, hidden, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return new_hidden, per_replica_report_loss, per_replica_toks

        with strategy.scope():
            train_iter = iter(ts)
            SET_TRAIN_FLAG(True)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            nstep_loss = tf.Variable(0.0)
            nstep_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_train_no_state(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_train_step_no_state, args=(inputs, ))
                return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_loss,
                                       axis=None), strategy.reduce(
                                           tf.distribute.ReduceOp.SUM,
                                           per_replica_toks,
                                           axis=None)

            @tf.function
            def _distributed_train_with_state(inputs, hidden):

                h, per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_train_step_with_state, args=(
                        inputs,
                        hidden,
                    ))
                step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_loss,
                                            axis=None)
                step_toks = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_toks,
                                            axis=None)
                return h, step_loss, step_toks

            h = None
            for i in range(steps):

                inputs = next(train_iter)
                if self.model.requires_state:
                    h, step_loss, step_toks = _distributed_train_with_state(
                        inputs, h)
                else:
                    step_loss, step_toks = _distributed_train_no_state(inputs)
                epoch_loss.assign_add(step_loss)
                nstep_loss.assign_add(step_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)
                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(),
                                                nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train',
                                'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

            epoch_loss = epoch_loss.numpy()
            epoch_div = epoch_div.numpy()
            metrics = self.calc_metrics(epoch_loss, epoch_div)
            self.train_epochs += 1
            self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                        reporting_fns)
            return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def test(self, vs, reporting_fns, phase, steps=0):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param steps: (`int`) The number of steps to run for this evaluation
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_test_step_no_state(inputs):
            features, y = inputs
            per_replica_loss = loss_without_state(self.model, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        def _replicated_test_step_with_state(inputs, hidden):
            features, y = inputs
            per_replica_loss, new_hidden = loss_with_state(
                self.model, hidden, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return new_hidden, per_replica_report_loss, per_replica_toks

        with strategy.scope():
            SET_TRAIN_FLAG(False)
            test_iter = iter(vs)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_test_no_state(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_test_step_no_state, args=(inputs, ))
                return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_loss,
                                       axis=None), strategy.reduce(
                                           tf.distribute.ReduceOp.SUM,
                                           per_replica_toks,
                                           axis=None)

            @tf.function
            def _distributed_test_with_state(inputs, hidden):

                h, per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_test_step_with_state, args=(
                        inputs,
                        hidden,
                    ))
                step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_loss,
                                            axis=None)
                step_toks = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_toks,
                                            axis=None)
                return h, step_loss, step_toks

            epochs = 0
            if phase == 'Valid':
                self.valid_epochs += 1
                epochs = self.valid_epochs

            h = None
            for i in range(steps):
                inputs = next(test_iter)
                if self.model.requires_state:
                    h, per_replica_loss, per_replica_toks = _distributed_test_with_state(
                        inputs, h)
                else:
                    per_replica_loss, per_replica_toks = _distributed_test_no_state(
                        inputs)
                epoch_loss.assign_add(per_replica_loss)
                epoch_div.assign_add(per_replica_toks)
            metrics = self.calc_metrics(epoch_loss.numpy(), epoch_div.numpy())
            self.report(epochs, metrics, start, phase, 'EPOCH', reporting_fns)
            return metrics

    def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset)
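
A rough usage sketch for the distributed trainer above, assuming batched `tf.data.Dataset`s are already built; `model_params`, `train_dataset`, `valid_dataset`, `reporting_fns`, and the epoch/step counts are placeholders rather than part of the example.

# Hypothetical driver; dataset construction and reporting hooks are placeholders.
trainer = LanguageModelTrainerDistributedTf(model_params,
                                            optim='adam', lr=0.001,
                                            strategy_type='mirror', gpus=2)

train_set = trainer.distribute(train_dataset)   # wrap the datasets with the strategy
valid_set = trainer.distribute(valid_dataset)

for epoch in range(num_epochs):
    trainer.train(train_set, reporting_fns, steps=steps_per_epoch)
    trainer.checkpoint()
    trainer.test(valid_set, reporting_fns, phase='Valid', steps=valid_steps)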
Example #3
class LanguageModelTrainerEagerTf(Trainer):
    """A Trainer to use for eager mode

    """
    def __init__(self, model_params, **kwargs):
        super().__init__()

        if type(model_params) is dict:
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def train(self, ts, reporting_fns):
        """Train by looping over the steps

        In eager mode we iterate the training set `ts` directly, one batch at a
        time, updating the model on each step.

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :return: Metrics
        """

        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()
        start = time.perf_counter()

        def _train_step_no_state(inputs):
            """Replicated training step."""

            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        def _train_step_with_state(inputs, hidden):
            """Replicated training step."""

            features, y = inputs
            loss, hidden = self.optimizer.update_with_hidden(
                self.model, hidden, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return hidden, report_loss, toks

        if get_version(tf) >= 2:
            _train_step_with_state = tf.function(_train_step_with_state)
            _train_step_no_state = tf.function(_train_step_no_state)

        h = None
        for inputs in ts:
            if self.model.requires_state:
                h, step_report_loss, step_toks = _train_step_with_state(
                    inputs, h)
            else:
                step_report_loss, step_toks = _train_step_no_state(inputs)

            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_toks)
            nstep_div.assign_add(step_toks)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(),
                                            nstep_div.numpy())
                self.report(step, metrics, self.nstep_start, 'Train', 'STEP',
                            reporting_fns, self.nsteps)
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                    reporting_fns)
        return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def test(self, vs, reporting_fns, phase):
        """Run an epoch of testing over the dataset

        In eager mode we iterate the validation/test set directly, accumulating
        a token-weighted loss and reporting perplexity at the end of the epoch.

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs
        SET_TRAIN_FLAG(False)

        start = time.perf_counter()
        h = None
        for features, y in vs:
            if self.model.requires_state:
                loss_value, h = loss_with_state(self.model, h, features, y)
            else:
                loss_value = loss_without_state(self.model, features, y)
            loss_value = loss_value.numpy()
            toks = self._num_toks(y)
            total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
            total_toks += toks.numpy()

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(epochs, metrics, start, phase, 'EPOCH', reporting_fns)
        return metrics
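
The single-process eager trainer above is driven the same way, except that `train` iterates the dataset to exhaustion instead of running a fixed number of steps. A minimal, hypothetical driver (dataset and reporting-hook names are again placeholders):

trainer = LanguageModelTrainerEagerTf(model_params, optim='adam', lr=0.001)

for epoch in range(num_epochs):
    trainer.train(train_dataset, reporting_fns)
    trainer.checkpoint()
    metrics = trainer.test(valid_dataset, reporting_fns, phase='Valid')
    print('validation perplexity:', metrics['perplexity'])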