def test_group_by_batches_truncated(self):
  self.assertEqual([], list(util.group_by_batches([], 2, truncate=True)))
  self.assertEqual([[1], [2], [3]], list(
      util.group_by_batches([1, 2, 3], 1, truncate=True)))
  self.assertEqual([[1, 2]], list(
      util.group_by_batches([1, 2, 3], 2, truncate=True)))
def _run(self, supervisor, session):
  batches = (  # generates (size, feed_dict) pairs
      (len(batch), self.compiler.build_feed_dict(batch))
      for batch in util.group_by_batches(self.examples, self.batch_size))
  if self.eval_interval_secs:
    gen_batches = util.epochs(batches, shuffle=False)  # memoize batches
    max_reported_step = 0
    # Should eval for the final measurement even if _should_stop is true.
    while not (self._should_stop(supervisor) and max_reported_step > 0):
      start_time = time.time()
      if self._restore(supervisor, session):
        step = tf.train.global_step(session, self.global_step)
        if step > max_reported_step:
          max_reported_step = step
          results = self._eval_batches(
              supervisor, session, next(gen_batches), step)
          self._report_loss_and_save_best(supervisor, session, step, *results)
          if self._should_stop(supervisor):
            break
        else:
          self.log_and_print('not running eval because step=%s' % step)
      sleep_time = self.eval_interval_secs - (time.time() - start_time)
      if sleep_time > 0:
        time.sleep(sleep_time)
  elif self._restore(supervisor, session):
    step = tf.train.global_step(session, self.global_step)
    results = self._eval_batches(supervisor, session, batches, step)
    if results[0] is not None:
      self._report_loss_and_save_best(supervisor, session, step, *results)
  self.report_done()
def prepare_batches(shuffled):
  for batch in util.group_by_batches(shuffled, self.batch_size,
                                     truncate=self.exact_batch_sizes):
    feed_dict[self.compiler.loom_input_tensor] = batch
    if self.compute_summaries:
      feed_dict[self.batch_size_placeholder] = len(batch)
    yield
def prepare_batches(shuffled):
  for batch in util.group_by_batches(shuffled, self.batch_size,
                                     truncate=False):
    feed_dict[self.compiler.loom_input_tensor] = batch
    feed_dict[self._batch_size_ph] = len(batch)
    yield
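# Hypothetical driver for the prepare_batches helpers above. Both variants
# mutate a shared, enclosing feed_dict in place and yield once per batch, so
# the caller presumably runs the train op after each yield. The names
# feed_dict, shuffled, sess, and train_op are illustrative only.
for _ in prepare_batches(shuffled):
  sess.run(train_op, feed_dict)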
def _run(self, supervisor, session):
  batches = (  # generates (size, feed_dict) pairs
      (len(batch), self.compiler.build_feed_dict(batch))
      for batch in util.group_by_batches(self.examples, self.batch_size))
  if self.eval_interval_secs:
    gen_batches = util.epochs(batches, shuffle=False)  # memoize batches
    max_reported_step = 0
    while not (self._should_stop(supervisor) and max_reported_step > 0):
      start_time = time.time()
      if self._restore(supervisor, session):
        step = tf.train.global_step(session, self.global_step)
        if step > max_reported_step:
          max_reported_step = step
          results = self._eval_batches(
              supervisor, session, next(gen_batches), step)
          if results[0] is None:
            break  # should_stop returned true
          self._report_loss_and_save_best(
              supervisor, session, step, *results)
        else:
          self.log_and_print('not running eval because step=%s' % step)
      sleep_time = self.eval_interval_secs - (time.time() - start_time)
      if sleep_time > 0:
        time.sleep(sleep_time)
  elif self._restore(supervisor, session):
    step = tf.train.global_step(session, self.global_step)
    results = self._eval_batches(supervisor, session, batches, step)
    if results[0] is not None:
      self._report_loss_and_save_best(supervisor, session, step, *results)
  self.report_done()
def _run(self, supervisor, session):
  batches = (  # generates (size, feed_dict) pairs
      (len(batch), self.compiler.build_feed_dict(batch))
      for batch in util.group_by_batches(self.examples, self.batch_size))
  if self._restore(supervisor, session):
    step = tf.train.global_step(session, self._global_step)
    results = self._eval_batches(supervisor, session, batches, step)
    if results[0] is not None:
      self._report_loss_and_save_best(supervisor, session, step, *results)
def build_loom_input_batched(self, examples, batch_size=None,
                             metric_labels=False, ordered=False):
  """Turns examples into a feed value for `self.loom_input_tensor`.

  The result is an iterator; work doesn't happen until you call
  e.g. `next()` or `list()` on it.

  Args:
    examples: A non-empty iterable of examples to be built into tensors.
    batch_size: The maximum number of examples to compile into each loom
      input. Defaults to 100. If multiprocessing then this will also be
      the chunk size for each unit of work.
    metric_labels: Whether or not to return metric labels.
    ordered: Whether or not to preserve ordering when multiprocessing,
      otherwise has no effect (and order is always preserved).

  Returns:
    Feed value(s) corresponding to `examples` grouped into batches. The
    result itself can be fed directly to `self.loom_input_tensor`, or be
    iterated over to feed values batch-by-batch. If `metric_labels` is
    True, an iterable of `(batch_feed_value, metric_labels)` tuples.

  Raises:
    TypeError: If `examples` is not an iterable.
    RuntimeError: If [`init_loom()`](#td.Compiler.init_loom) has not been
      called.
  """
  self._check_build('build_loom_input_batched', examples)
  if batch_size is None:
    batch_size = 100
  batches = util.group_by_batches(examples, batch_size)
  results = _map_maybe_parallel(
      self.pool, _subprocess_build_batch, self._build_batch, batches,
      ordered, chunk_size=1, metric_labels=metric_labels)
  # If metric_labels is false, use an edible iterator so that
  # `results` can be fed.
  if not metric_labels:
    results = util.EdibleIterator(results)
  return results
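# Hypothetical usage sketch for build_loom_input_batched, following the
# docstring above. The names compiler, train_set, sess, and train_op are
# placeholders, not taken from the snippet; error handling is omitted.
batched_inputs = compiler.build_loom_input_batched(train_set, batch_size=32)

# Per the docstring, the result can be fed as a whole (it is an
# EdibleIterator when metric_labels is False) ...
sess.run(train_op, {compiler.loom_input_tensor: batched_inputs})

# ... or iterated to feed one batch of loom inputs per step.
for batch_feed in compiler.build_loom_input_batched(train_set, batch_size=32):
  sess.run(train_op, {compiler.loom_input_tensor: batch_feed})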
def _run(self, supervisor, session):
  train_feed_dict = self.train_feeds.copy()
  train_fetches = {'train_op': self.train_op,
                   'loss': self.loss_total,
                   'step': self.global_step}
  if self.compute_summaries:
    train_fetches['summaries'] = self.summaries

  # The training loop is essentially the same regardless of whether
  # we are passing batches by feed dict or by loom input
  # tensor. There are a few minor differences:
  #
  # 1. By feed dict, we compute the size of the training set lazily,
  #    as we iterate over it in the first epoch. By input tensor, we
  #    calculate train_size as batch_size * batches_per_epoch.
  #
  # 2. By feed dict, we get the size of each batch by calling len()
  #    on it (since the last batch in the epoch may have less than
  #    batch_size elements). By input tensor, we require that every
  #    batch have exactly batch_size elements.
  #
  # 3. By feed dict we need to create batches of inputs, and feed
  #    them every time we run the train op (obviously).
  if self.examples:
    epochs, train_size = self._by_feed_dict(train_feed_dict)
  else:
    epochs, train_size = self._by_input_tensor(train_feed_dict)

  if self.dev_examples:
    # Memoize a generator of batches of (size, feed_dict) pairs.
    gen_dev_batches = util.epochs(
        ((len(batch), self.compiler.build_feed_dict(batch))
         for batch in util.group_by_batches(
             self.dev_examples, self.batch_size)),
        shuffle=False)
    # If there is an existing checkpoint in logdir, and we are
    # saving the best model, calculate best_loss before doing any
    # training, so we don't potentially replace a better-performing
    # checkpoint with a worse one.
    ckpt = tf.train.get_checkpoint_state(self.logdir)
    if ckpt and ckpt.model_checkpoint_path:
      _, self._best_loss, _ = self._eval_batches(
          supervisor, session, next(gen_dev_batches), None, is_dev=True)
      if self._best_loss is None:
        return  # should_stop returned true

  for epoch, batches in enumerate(epochs, 1):
    train_loss = 0.0
    for _ in batches:
      if self._should_stop(supervisor):
        return
      results = session.run(train_fetches, train_feed_dict)
      train_loss += results['loss']
      if self.compute_summaries:
        supervisor.summary_computed(
            session, results['summaries'], results['step'])
    if train_size == 0:
      raise ValueError('examples must be non-empty')
    if self.exact_batch_sizes and epoch == 1:
      if train_size < self.batch_size:
        raise ValueError('when exact_batch_sizes is true, examples must have '
                         'at least batch_size items; %s vs. %s' % (
                             train_size, self.batch_size))
      train_size -= train_size % self.batch_size
    train_loss /= train_size
    self.report_loss(results['step'], train_loss)
    log_str = 'epoch:%5d train[loss: %.3e]' % (epoch, train_loss)
    if self.dev_examples:
      dev_size, dev_loss, dev_metrics = self._eval_batches(
          supervisor, session, next(gen_dev_batches), results['step'],
          is_dev=True)
      if dev_size is None:
        return  # should_stop returned true
      if epoch == 1:
        self.log_and_print('train_size: %d dev_size: %d' %
                           (train_size, dev_size))
      log_str += ' dev[%s]' % _eval_str(dev_size, dev_loss, dev_metrics)
      self.log_and_print(log_str)
      self._save_best(session, supervisor.saver, dev_loss, results['step'])
    else:
      if epoch == 1:
        self.log_and_print('train_size: %d' % train_size)
      self.log_and_print(log_str)
  if not self.dev_examples and self.is_chief_trainer:
    save_path = os.path.join(self.logdir, 'model.ckpt')
    save_fname = supervisor.saver.save(
        session, save_path, global_step=results['step'])
    self.log_and_print('final model saved in file: %s' % save_fname)
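# Hedged sketch of the memoization that next(gen_dev_batches) relies on above:
# the first yielded epoch drains the underlying generator while caching its
# items, and every later epoch replays the cache. This is an assumption about
# util.epochs inferred from its usage here (shuffle=False), not its actual
# implementation.
import random

def epochs_sketch(items, shuffle=True):
  cache = []
  def first_pass():
    for item in items:
      cache.append(item)
      yield item
  yield first_pass()  # first epoch: consume the generator and memoize it
  while True:
    if shuffle:
      random.shuffle(cache)
    yield iter(cache)  # later epochs: replay the cached items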
def _run(self, supervisor, session):
  return self.results_fn(itertools.chain.from_iterable(
      self._run_batch(examples, session)
      for examples in util.group_by_batches(self.examples, self.batch_size)))
def _run(self, supervisor, session):
  train_feed_dict = self.train_feeds.copy()
  train_fetches = {
      'train_op': self.train_op,
      'loss': self._loss_total,
      'step': self._global_step
  }
  train_fetches['summaries'] = self._summaries
  epochs, train_size = self._by_feed_dict(train_feed_dict)

  if self.dev_examples:
    # Memoize a generator of batches of (size, feed_dict) pairs.
    gen_dev_batches = util.epochs(
        ((len(batch), self.compiler.build_feed_dict(batch))
         for batch in util.group_by_batches(self.dev_examples,
                                            self.batch_size)),
        shuffle=False)
    # If there is an existing checkpoint in logdir, and we are
    # saving the best model, calculate best_loss before doing any
    # training, so we don't potentially replace a better-performing
    # checkpoint with a worse one.
    ckpt = tf.train.get_checkpoint_state(self.logdir)
    if ckpt and ckpt.model_checkpoint_path:
      _, self._best_loss, _ = self._eval_batches(
          supervisor, session, next(gen_dev_batches), None, is_dev=True)
      if self._best_loss is None:
        return  # should_stop returned true

  for epoch, batches in enumerate(epochs, 1):
    self.log_and_print('Starting epoch %d.' % epoch)
    train_loss = 0.0
    for (k, _) in enumerate(batches):
      results = session.run(train_fetches, train_feed_dict)
      train_loss += results['loss']
      self.log_and_print('Batch %d: loss %f' % (k, results['loss']))
      supervisor.summary_computed(session, results['summaries'],
                                  results['step'])
    if train_size == 0:
      raise ValueError('examples must be non-empty')
    train_loss /= train_size
    log_str = 'epoch:%5d train[loss: %.3e]' % (epoch, train_loss)
    if self.dev_examples:
      dev_size, dev_loss, dev_metrics = self._eval_batches(
          supervisor, session, next(gen_dev_batches), results['step'],
          is_dev=True)
      if epoch == 1:
        self.log_and_print('train_size: %d dev_size: %d' %
                           (train_size, dev_size))
      log_str += ' dev[%s]' % _eval_str(dev_size, dev_loss, dev_metrics)
      self._save_best(session, supervisor.saver, dev_loss, results['step'])
    else:
      if epoch == 1:
        self.log_and_print('train_size: %d' % train_size)
    self.log_and_print(log_str)
  if not self.dev_examples:
    save_path = os.path.join(self.logdir, 'model.ckpt')
    save_fname = supervisor.saver.save(session, save_path,
                                       global_step=results['step'])
    self.log_and_print('final model saved in file: %s' % save_fname)
def test_group_by_batches(self):
  self.assertEqual([], list(util.group_by_batches([], 2)))
  self.assertEqual([[1], [2], [3]], list(util.group_by_batches([1, 2, 3], 1)))
  self.assertEqual([[1, 2], [3]], list(util.group_by_batches([1, 2, 3], 2)))
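# Minimal sketch of the contract the group_by_batches tests above exercise:
# batches of at most batch_size items, with a short final batch dropped when
# truncate=True. The real util.group_by_batches may differ in details (e.g.
# laziness of individual batches); this illustrates the behavior, not the
# actual source.
import itertools

def group_by_batches_sketch(iterable, batch_size, truncate=False):
  iterator = iter(iterable)
  while True:
    batch = list(itertools.islice(iterator, batch_size))
    if not batch or (truncate and len(batch) < batch_size):
      return  # exhausted, or a ragged final batch with truncate=True
    yield batch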
def _run(self, supervisor, session):
  train_feed_dict = self.train_feeds.copy()
  train_fetches = {
      'train_op': self.train_op,
      'loss': self.loss_total,
      'step': self.global_step
  }
  if self.compute_summaries:
    train_fetches['summaries'] = self.summaries

  # The training loop is essentially the same regardless of whether
  # we are passing batches by feed dict or by loom input
  # tensor. There are a few minor differences:
  #
  # 1. By feed dict, we compute the size of the training set lazily,
  #    as we iterate over it in the first epoch. By input tensor, we
  #    calculate train_size as batch_size * batches_per_epoch.
  #
  # 2. By feed dict, we get the size of each batch by calling len()
  #    on it (since the last batch in the epoch may have less than
  #    batch_size elements). By input tensor, we require that every
  #    batch have exactly batch_size elements.
  #
  # 3. By feed dict we need to create batches of inputs, and feed
  #    them every time we run the train op (obviously).
  if self.examples:
    epochs, train_size = self._by_feed_dict(train_feed_dict)
  else:
    epochs, train_size = self._by_input_tensor(train_feed_dict)

  if self.dev_examples:
    # Memoize a generator of batches of (size, feed_dict) pairs.
    gen_dev_batches = util.epochs(
        ((len(batch), self.compiler.build_feed_dict(batch))
         for batch in util.group_by_batches(self.dev_examples,
                                            self.batch_size)),
        shuffle=False)

  for epoch, batches in enumerate(epochs, 1):
    train_loss = 0.0
    for _ in batches:
      if self._should_stop(supervisor):
        return
      results = session.run(train_fetches, train_feed_dict)
      train_loss += results['loss']
      if self.compute_summaries:
        supervisor.summary_computed(session, results['summaries'],
                                    results['step'])
    if train_size == 0:
      raise ValueError('examples must be non-empty')
    if self.exact_batch_sizes and epoch == 1:
      if train_size < self.batch_size:
        raise ValueError(
            'when exact_batch_sizes is true, examples must have '
            'at least batch_size items; %s vs. %s' % (train_size,
                                                      self.batch_size))
      train_size -= train_size % self.batch_size
    train_loss /= train_size
    self.report_loss(results['step'], train_loss)
    log_str = 'epoch:%5d train[loss: %.3e]' % (epoch, train_loss)
    if self.dev_examples:
      dev_size, dev_loss, dev_metrics = self._eval_batches(
          supervisor, session, next(gen_dev_batches), results['step'],
          is_dev=True)
      if dev_size is None:
        return  # should_stop returned true
      if epoch == 1:
        self.log_and_print('train_size: %d dev_size: %d' %
                           (train_size, dev_size))
      log_str += ' dev[%s]' % _eval_str(dev_size, dev_loss, dev_metrics)
      self.log_and_print(log_str)
      self._save_best(session, supervisor.saver, dev_loss, results['step'])
    else:
      if epoch == 1:
        self.log_and_print('train_size: %d' % train_size)
      self.log_and_print(log_str)
  if not self.dev_examples and self.is_chief_trainer:
    save_path = os.path.join(self.logdir, 'model.ckpt')
    save_fname = supervisor.saver.save(session, save_path,
                                       global_step=results['step'])
    self.log_and_print('final model saved in file: %s' % save_fname)