def _run_training_loop(self, m, curr_epoch):
    """Run one training epoch for the cifar model `m` and save the result.

    The epoch is executed inside the context manager returned by
    `self._new_session(m)`. If the attempt raises `tf.errors.AbortedError`
    or `tf.errors.UnavailableError`, the error is logged and the whole
    attempt (new session, training, save) is started over. There is no
    upper bound on the number of attempts.

    Args:
      m: The model handed to `self._new_session` and to
        `helper_utils.run_epoch_training`.
      curr_epoch: The epoch being trained; also used as the `step` passed
        to `self.save_model`.

    Returns:
      The value returned by `helper_utils.run_epoch_training` for the
      attempt that completed.
    """
    epoch_began = time.time()
    retryable_errors = (tf.errors.AbortedError, tf.errors.UnavailableError)

    epoch_done = False
    while not epoch_done:
        try:
            with self._new_session(m):
                train_accuracy = helper_utils.run_epoch_training(
                    self.session, m, self.data_loader, curr_epoch)
                tf.logging.info('Saving model after epoch')
                self.save_model(step=curr_epoch)
                # Flag is set inside the `with` body on purpose: only an
                # attempt that got through training and saving counts as
                # done, and a failure while leaving the session context
                # still reaches the handler below.
                epoch_done = True
        except retryable_errors as err:
            tf.logging.info('Retryable error caught: %s. Retrying.', err)

    tf.logging.info('Finished epoch: {}'.format(curr_epoch))
    elapsed_minutes = (time.time() - epoch_began) / 60.0
    tf.logging.info('Epoch time(min): {}'.format(elapsed_minutes))
    return train_accuracy
def _run_training_loop(self, m, curr_epoch):
    """Train the cifar model `m` for a single epoch, retrying on failure.

    Each attempt opens `self._new_session(m)` as a context manager, calls
    `helper_utils.run_epoch_training`, and then saves the model with
    `step=curr_epoch`. An attempt that raises `tf.errors.AbortedError` or
    `tf.errors.UnavailableError` is logged and repeated from scratch; the
    loop only ends once an attempt has trained and saved.

    Args:
      m: Model object forwarded to the session factory and the trainer.
      curr_epoch: Current epoch, forwarded to the trainer and used as the
        checkpoint step.

    Returns:
      The training accuracy reported by `helper_utils.run_epoch_training`
      on the successful attempt.
    """
    t0 = time.time()

    while True:
        try:
            with self._new_session(m):
                accuracy = helper_utils.run_epoch_training(
                    self.session,
                    m,
                    self.data_loader,
                    curr_epoch,
                )
                tf.logging.info('Saving model after epoch')
                self.save_model(step=curr_epoch)
                # Leave the retry loop only after the save has gone through.
                break
        except (tf.errors.AbortedError, tf.errors.UnavailableError) as exc:
            # Errors of these two types restart the whole attempt.
            tf.logging.info('Retryable error caught: %s. Retrying.', exc)

    finished_msg = 'Finished epoch: {}'.format(curr_epoch)
    tf.logging.info(finished_msg)

    minutes_spent = (time.time() - t0) / 60.0
    tf.logging.info('Epoch time(min): {}'.format(minutes_spent))

    return accuracy