class LanguageModelTrainerDistributedTf(Trainer):
    """A Trainer for LM distributed eager training
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if isinstance(model_params, dict):
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())
        self.checkpoint_manager = tf.train.CheckpointManager(self._checkpoint,
                                                             directory=checkpoint_dir,
                                                             max_to_keep=5)
        strategy_type = kwargs.get('strategy_type', 'mirror')
        gpus = int(kwargs.get('gpus', 1))
        endpoint = kwargs.get('endpoint')
        self.strategy = create_distribute_strategy(strategy_type, gpus, endpoint)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(self._checkpoint.restore(self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def train(self, ts, reporting_fns, steps=0, dataset=True):
        """Train by looping over the steps

        The `tf.dataset` is distributed by the strategy; we pull `steps` batches
        from its iterator and sum the token-weighted losses across replicas.

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param steps: The number of steps to run in this epoch
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_train_step_no_state(inputs):
            features, y = inputs
            per_replica_loss = self.optimizer.update(self.model, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        def _replicated_train_step_with_state(inputs, hidden):
            features, y = inputs
            per_replica_loss, new_hidden = self.optimizer.update_with_hidden(self.model, hidden, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return new_hidden, per_replica_report_loss, per_replica_toks

        with strategy.scope():
            train_iter = iter(ts)
            SET_TRAIN_FLAG(True)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            nstep_loss = tf.Variable(0.0)
            nstep_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_train_no_state(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(_replicated_train_step_no_state,
                                                                                  args=(inputs,))
                return (strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None),
                        strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))

            @tf.function
            def _distributed_train_with_state(inputs, hidden):
                h, per_replica_loss, per_replica_toks = strategy.experimental_run_v2(_replicated_train_step_with_state,
                                                                                     args=(inputs, hidden,))
                step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)
                step_toks = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None)
                return h, step_loss, step_toks

            h = None
            for i in range(steps):
                inputs = next(train_iter)
                if self.model.requires_state:
                    h, step_loss, step_toks = _distributed_train_with_state(inputs, h)
                else:
                    step_loss, step_toks = _distributed_train_no_state(inputs)
                epoch_loss.assign_add(step_loss)
                nstep_loss.assign_add(step_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train', 'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

            epoch_loss = epoch_loss.numpy()
            epoch_div = epoch_div.numpy()
            metrics = self.calc_metrics(epoch_loss, epoch_div)
            self.train_epochs += 1
            self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH', reporting_fns)
            return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def test(self, vs, reporting_fns, phase, steps=0):
        """Run an epoch of testing over the dataset

        The `tf.dataset` is distributed by the strategy; we cycle `steps`
        batches and accumulate a token-weighted loss across replicas.

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param steps: The number of steps to run
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_test_step_no_state(inputs):
            features, y = inputs
            per_replica_loss = loss_without_state(self.model, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        def _replicated_test_step_with_state(inputs, hidden):
            features, y = inputs
            per_replica_loss, new_hidden = loss_with_state(self.model, hidden, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return new_hidden, per_replica_report_loss, per_replica_toks

        with strategy.scope():
            SET_TRAIN_FLAG(False)
            test_iter = iter(vs)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_test_no_state(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(_replicated_test_step_no_state,
                                                                                  args=(inputs,))
                return (strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None),
                        strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))

            @tf.function
            def _distributed_test_with_state(inputs, hidden):
                h, per_replica_loss, per_replica_toks = strategy.experimental_run_v2(_replicated_test_step_with_state,
                                                                                     args=(inputs, hidden,))
                step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)
                step_toks = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None)
                return h, step_loss, step_toks

            epochs = 0
            if phase == 'Valid':
                self.valid_epochs += 1
                epochs = self.valid_epochs

            h = None
            for i in range(steps):
                inputs = next(test_iter)
                if self.model.requires_state:
                    h, step_loss, step_toks = _distributed_test_with_state(inputs, h)
                else:
                    step_loss, step_toks = _distributed_test_no_state(inputs)
                epoch_loss.assign_add(step_loss)
                epoch_div.assign_add(step_toks)

            metrics = self.calc_metrics(epoch_loss.numpy(), epoch_div.numpy())
            self.report(epochs, metrics, start, phase, 'EPOCH', reporting_fns)
            return metrics

    def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset)
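# A minimal usage sketch for the distributed LM trainer (illustrative only:
# `model_params`, `reporting_fns`, `train_ds` and `steps_per_epoch` are
# placeholders the caller must supply; they are not defined in this module):
#
#     trainer = LanguageModelTrainerDistributedTf(model_params, gpus=2, nsteps=100)
#     train_set = trainer.distribute(train_ds)
#     metrics = trainer.train(train_set, reporting_fns, steps=steps_per_epoch)
#     trainer.checkpoint()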
class Seq2SeqTrainerEagerTf(Trainer):
    """Eager mode trainer for seq2seq

    The trainer can run in 2 modes: `dataset` and `feed_dict`.  When the former,
    the graph is assumed to be connected by features attached to the input so the
    `feed_dict` will only be used to pass dropout information.  When the latter,
    we will use the baseline DataFeed to read the object into the `feed_dict`
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if isinstance(model_params, dict):
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params
        self.tgt_rlut = kwargs['tgt_rlut']
        self.loss = Seq2SeqLoss(**kwargs)
        self.optimizer = EagerOptimizer(self.loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        checkpoint_dir = kwargs.get('checkpoint')
        if checkpoint_dir is None:
            checkpoint_dir = f'./tf-seq2seq-{os.getpid()}'
        self._checkpoint, self.checkpoint_manager = setup_tf2_checkpoints(self.optimizer,
                                                                          self.model,
                                                                          checkpoint_dir)
        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(self._checkpoint.restore(self.checkpoint_manager.latest_checkpoint))

    def _num_toks(self, lens):
        return tf.reduce_sum(lens)

    def train(self, ts, reporting_fns):
        """Train by looping over the steps

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()
        start = time.perf_counter()

        @tf.function
        def _train_step(features, y):
            """Replicated training step."""
            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(features['tgt_len'])
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        with autograph_options({"function_optimization": False, "layout_optimizer": False}):
            for features, y in ts:
                # The decoder input is the gold target shifted right by one
                features['dst'] = y[:, :-1]
                step_report_loss, step_toks = _train_step(features, y)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train', 'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH', reporting_fns)
        return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def _evaluate(self, es, reporting_fns, **kwargs):
        """Run the model with beam search and report Bleu.

        :param es: `tf.dataset` of input
        :param reporting_fns: Input hooks
        """
        preds = []
        golds = []
        start = time.perf_counter()
        for features, tgt in es:
            features['dst'] = tgt[:, :-1]
            tgt_lens = features.pop('tgt_len')
            top_preds = self.model.predict(features, make_input=False, **kwargs)[0]
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
        metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
        self.report(0, metrics, start, 'Test', 'EPOCH', reporting_fns)
        return metrics

    def test(self, vs, reporting_fns, phase='Valid', **kwargs):
        """Run an epoch of testing over the dataset

        On the `Test` phase, run beam-search decoding and report BLEU; on
        `Valid`, compute a token-weighted loss plus greedy (beam=1) BLEU.

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """
        SET_TRAIN_FLAG(False)
        if phase == 'Test':
            return self._evaluate(vs, reporting_fns, **kwargs)

        self.valid_epochs += 1
        total_loss = 0
        total_toks = 0
        preds = []
        golds = []
        start = time.perf_counter()
        for features, tgt in vs:
            features['dst'] = tgt[:, :-1]
            top_preds = self.model.predict(features, beam=1, make_input=False)[0]
            loss_value = self.loss(self.model, features, tgt).numpy()
            toks = tf.cast(self._num_toks(features['tgt_len']), tf.float32).numpy()
            total_loss += loss_value * toks
            total_toks += toks
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, features['tgt_len'], self.tgt_rlut))

        metrics = self.calc_metrics(total_loss, total_toks)
        metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
        self.report(self.valid_epochs, metrics, start, phase, 'EPOCH', reporting_fns)
        return metrics
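# A minimal usage sketch (illustrative only: the datasets, reporting hooks and
# the `tgt_rlut` reverse lookup table are placeholders supplied by the caller):
#
#     trainer = Seq2SeqTrainerEagerTf(model_params, tgt_rlut=tgt_rlut, nsteps=100)
#     trainer.train(train_ds, reporting_fns)
#     trainer.test(valid_ds, reporting_fns, phase='Valid')
#     trainer.test(test_ds, reporting_fns, phase='Test')   # beam search + BLEU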
class LanguageModelTrainerEagerTf(Trainer):
    """A Trainer to use for eager mode
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if isinstance(model_params, dict):
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())
        self.checkpoint_manager = tf.train.CheckpointManager(self._checkpoint,
                                                             directory=checkpoint_dir,
                                                             max_to_keep=5)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(self._checkpoint.restore(self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def train(self, ts, reporting_fns):
        """Train by looping over the steps

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()
        start = time.perf_counter()

        def _train_step_no_state(inputs):
            """Training step for a stateless model."""
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        def _train_step_with_state(inputs, hidden):
            """Training step that threads the recurrent hidden state through."""
            features, y = inputs
            loss, hidden = self.optimizer.update_with_hidden(self.model, hidden, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return hidden, report_loss, toks

        if get_version(tf) >= 2:
            _train_step_with_state = tf.function(_train_step_with_state)
            _train_step_no_state = tf.function(_train_step_no_state)

        h = None
        for inputs in ts:
            if self.model.requires_state:
                h, step_report_loss, step_toks = _train_step_with_state(inputs, h)
            else:
                step_report_loss, step_toks = _train_step_no_state(inputs)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_toks)
            nstep_div.assign_add(step_toks)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                self.report(step, metrics, self.nstep_start, 'Train', 'STEP', reporting_fns, self.nsteps)
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH', reporting_fns)
        return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def test(self, vs, reporting_fns, phase):
        """Run an epoch of testing over the dataset

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs

        SET_TRAIN_FLAG(False)
        start = time.perf_counter()
        h = None
        for features, y in vs:
            if self.model.requires_state:
                loss_value, h = loss_with_state(self.model, h, features, y)
            else:
                loss_value = loss_without_state(self.model, features, y)
            loss_value = loss_value.numpy()
            toks = self._num_toks(y)
            total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
            total_toks += toks.numpy()

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(epochs, metrics, start, phase, 'EPOCH', reporting_fns)
        return metrics
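# A minimal usage sketch (illustrative only: `ts` and `vs` are `tf.data`
# datasets yielding `(features, y)` pairs):
#
#     trainer = LanguageModelTrainerEagerTf(model_params, nsteps=100)
#     trainer.train(ts, reporting_fns)    # hidden state, if required, is threaded automatically
#     trainer.test(vs, reporting_fns, 'Valid')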
class TaggerTrainerEagerTf(EpochReportingTrainer):
    """A Trainer to use for eager mode training
    """
    def __init__(self, model_params, **kwargs):
        """Create a Trainer, and give it the parameters needed to instantiate the model

        :param model_params: The model parameters
        :param kwargs: See below

        :Keyword Arguments:

          * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
          * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
          * *clip* (`int`) -- If we are doing gradient clipping, what value to use
          * *optim* (`str`) -- The name of the optimizer we are using
          * *lr* (`float`) -- The learning rate we are using
          * *mom* (`float`) -- If we are using SGD, what value to use for momentum
          * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
          * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
          * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

        """
        super().__init__()
        if isinstance(model_params, dict):
            self.model = create_model_for('tagger', **model_params)
        else:
            self.model = model_params
        span_type = kwargs.get('span_type', 'iob')
        verbose = kwargs.get('verbose', False)
        self.evaluator = TaggerEvaluatorEagerTf(self.model, span_type, verbose)
        self.optimizer = EagerOptimizer(loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
        checkpoint_dir = kwargs.get('checkpoint')
        if checkpoint_dir is None:
            checkpoint_dir = f'./tf-tagger-{os.getpid()}'
        self._checkpoint, self.checkpoint_manager = setup_tf2_checkpoints(self.optimizer,
                                                                          self.model,
                                                                          checkpoint_dir)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(self._checkpoint.restore(self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _get_batchsz(batch_dict):
        return batch_dict['y'].shape[0]

    def _train(self, loader, steps=0, **kwargs):
        """Train an epoch of data, reporting every `nsteps` updates

        :param loader: A data feed
        :param steps: The number of steps in an epoch
        :param kwargs: See below

        :Keyword Arguments:

          * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(steps)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()

        @tf.function
        def _train_step(inputs):
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            batchsz = get_shape_as_list(y)[0]
            report_loss = loss * batchsz
            return report_loss, batchsz

        with autograph_options({"function_optimization": False, "layout_optimizer": False}):
            for inputs in pg(loader):
                step_report_loss, step_batchsz = _train_step(inputs)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_batchsz)
                nstep_div.assign_add(step_batchsz)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train', 'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics

    def _test(self, ts, steps=0, **kwargs):
        """Test an epoch of data using the tagger evaluator

        :param ts: A data feed
        :param steps: The number of steps in the feed
        :param kwargs: See below

        :Keyword Arguments:

          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        return self.evaluator.test(ts, steps, **kwargs)
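# A minimal usage sketch (illustrative only: `train_ds`, `valid_ds`, the step
# counts and reporting hooks are placeholders).  `EpochReportingTrainer` wraps
# `_train`/`_test` in its public `train`/`test` methods:
#
#     trainer = TaggerTrainerEagerTf(model_params, span_type='iobes')
#     trainer.train(train_ds, reporting_fns=reporting_fns, steps=train_steps)
#     trainer.test(valid_ds, reporting_fns=reporting_fns, steps=valid_steps)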
class Seq2SeqTrainerDistributedTf(Trainer):
    """A Trainer to use for eager distributed mode
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if isinstance(model_params, dict):
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params
        self.tgt_rlut = kwargs['tgt_rlut']
        self.optimizer = EagerOptimizer(loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-seq2seq", os.getpid())
        self.checkpoint_manager = tf.train.CheckpointManager(self._checkpoint,
                                                             directory=checkpoint_dir,
                                                             max_to_keep=5)
        strategy_type = kwargs.get('strategy_type', 'mirror')
        gpus = int(kwargs.get('gpus', 1))
        endpoint = kwargs.get('endpoint')
        self.strategy = create_distribute_strategy(strategy_type, gpus, endpoint)
        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(self._checkpoint.restore(self.checkpoint_manager.latest_checkpoint))

    def _num_toks(self, lens):
        return tf.reduce_sum(lens)

    def train(self, ts, reporting_fns, steps=0):
        """Train by looping over the steps

        The strategy runs the replicated step on each device; per-replica token
        counts and token-weighted losses are summed across replicas.

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param steps: The number of steps to run in this epoch
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_train_step(inputs):
            features, y = inputs
            per_replica_loss = self.optimizer.update(self.model, features, y)
            per_replica_toks = self._num_toks(features['tgt_len'])
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        with strategy.scope():
            SET_TRAIN_FLAG(True)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            nstep_loss = tf.Variable(0.0)
            nstep_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_train_step(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(_replicated_train_step,
                                                                                  args=(inputs,))
                total_step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)
                total_toks = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None)
                return total_step_loss, total_toks

            with autograph_options({"function_optimization": False, "layout_optimizer": False}):
                train_iter = iter(ts)
                for i in range(steps):
                    features, y = next(train_iter)
                    step_report_loss, step_toks = _distributed_train_step((features, y))
                    epoch_loss.assign_add(step_report_loss)
                    nstep_loss.assign_add(step_report_loss)
                    epoch_div.assign_add(step_toks)
                    nstep_div.assign_add(step_toks)

                    step = self.optimizer.global_step.numpy().item() + 1
                    if step % self.nsteps == 0:
                        metrics = self.calc_metrics(nstep_loss.numpy().item(), nstep_div.numpy().item())
                        self.report(step, metrics, self.nstep_start, 'Train', 'STEP', reporting_fns, self.nsteps)
                        nstep_loss.assign(0.0)
                        nstep_div.assign(0)
                        self.nstep_start = time.time()

                epoch_loss = epoch_loss.numpy()
                epoch_div = epoch_div.numpy()
                metrics = self.calc_metrics(epoch_loss, epoch_div)
                self.train_epochs += 1
                self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH', reporting_fns)
                return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss']).item()
        return metrics

    def _evaluate(self, es, reporting_fns, **kwargs):
        """Run the model with beam search and report Bleu.

        :param es: `tf.dataset` of input
        :param reporting_fns: Input hooks
        """
        preds = []
        golds = []
        start = time.time()
        kwargs['make_input'] = False
        for features, tgt in es:
            tgt_lens = features.pop('tgt_len')
            top_preds = self.model.predict(features, **kwargs)
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
        metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
        self.report(0, metrics, start, 'Test', 'EPOCH', reporting_fns)
        return metrics

    def test(self, vs, reporting_fns, steps=0, phase='Valid', **kwargs):
        """Run an epoch of testing over the dataset

        On the `Test` phase, decode with beam search and report BLEU; on
        `Valid`, accumulate a token-weighted loss across replicas.

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param steps: The number of steps to run
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """
        def _replicated_valid_step(inputs):
            features, tgt = inputs
            top_preds = self.model.predict(features, beam=1, make_input=False)
            per_replica_loss = loss(self.model, features, tgt)
            per_replica_toks = self._num_toks(features['tgt_len'])
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks, top_preds

        if phase == 'Test':
            SET_TRAIN_FLAG(False)
            return self._evaluate(vs, reporting_fns, **kwargs)

        strategy = self.strategy
        with strategy.scope():
            SET_TRAIN_FLAG(False)
            self.valid_epochs += 1
            total_loss = tf.Variable(0.0)
            total_toks = tf.Variable(0, dtype=tf.int32)
            preds = []
            golds = []
            start = time.time()
            test_iter = iter(vs)
            for i in range(steps):
                features, tgt = next(test_iter)
                inputs = (features, tgt)
                per_replica_loss, per_replica_toks, _ = strategy.experimental_run_v2(_replicated_valid_step,
                                                                                     args=(inputs,))
                total_loss.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None))
                total_toks.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))
                # Not sure a good way to get top preds merged yet

            metrics = self.calc_metrics(total_loss.numpy(), total_toks.numpy())
            self.report(self.valid_epochs, metrics, start, phase, 'EPOCH', reporting_fns)
            return metrics

    def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset)
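# A minimal usage sketch (illustrative only; placeholders as above).  Datasets
# must be wrapped with `distribute()` so the strategy can split batches across
# replicas:
#
#     trainer = Seq2SeqTrainerDistributedTf(model_params, tgt_rlut=tgt_rlut, gpus=2)
#     train_set = trainer.distribute(train_ds)
#     trainer.train(train_set, reporting_fns, steps=steps_per_epoch)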
    return model.decoder_model.neg_log_loss(unary, y, x['lengths'])


optim = EagerOptimizer(loss, optim="adam", lr=args.lr)

import time

num_epochs = args.epochs
for epoch in range(num_epochs):
    # Training loop - using batches of 32
    loss_acc = 0.
    step = 0
    start = time.time()
    for x, y in train_input_fn():
        # Optimize the model
        loss_value = optim.update(model, x, y)
        loss_acc += loss_value
        step += 1
    print('training time {}'.format(time.time() - start))
    mean_loss = loss_acc / step
    print('Training Loss {}'.format(mean_loss))

    acc = 0
    total = 0
    for x, y in eval_input_fn():
        # Track progress: compare predicted label to actual label
        pass
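# A minimal sketch of the elided accuracy computation (an assumption, not the
# original author's code: it presumes the model exposes its unary scores and a
# decode through `model.decoder_model`, mirroring how the loss above is built):
#
#     unary = model(x)
#     best_path = model.decoder_model((unary, x['lengths']))[0]
#     mask = tf.sequence_mask(x['lengths'], tf.shape(y)[1])
#     correct = tf.boolean_mask(tf.equal(best_path, y), mask)
#     acc += int(tf.reduce_sum(tf.cast(correct, tf.int32)))
#     total += int(tf.reduce_sum(tf.cast(mask, tf.int32)))
# print('Eval Accuracy {}'.format(acc / float(total)))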