Example No. 1
    def CreateRunners(self, jobs, logdir, trial=base_trial.NoOpTrial()):
        """Creates a list of runners based on `FLAGS.mode`.

    Args:
      jobs: a list of runner jobs.
      logdir: the directory used for logging, usually on CNS.
      trial: optional `Trial` object, used for reporting measures and early
        stopping.

    Returns:
      A list of `.BaseRunner`, one per job in `jobs`.
    """
        all_runners = []
        is_training = 'trainer' in jobs or 'trainer_client' in jobs
        for j in jobs:
            tf_master = FLAGS.tf_master
            # Ensure that decoder or evaler threads do not clobber variables being
            # updated by trainer by forcing them to use independent sessions.
            if is_training and (j.startswith('decoder')
                                or j.startswith('evaler')):
                tf_master = ''

            runner = self._CreateRunner(j, FLAGS.model_task_name, logdir,
                                        tf_master, trial)
            all_runners.append(runner)
        return all_runners
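
A minimal usage sketch for the method above: the `controller` object and the job names are illustrative assumptions, not part of the original source; only `CreateRunners`, its `jobs` argument, and `logdir` come from the example itself.

jobs = ['trainer', 'evaler_dev', 'decoder_dev']   # assumed job names
logdir = '/tmp/my_experiment'                     # assumed log directory

# `controller` is assumed to be the object that defines CreateRunners above.
runners = controller.CreateRunners(jobs, logdir)
for runner in runners:
    print(type(runner).__name__)                  # one runner per job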
Example No. 2
    def __init__(self,
                 params,
                 model_task_name,
                 logdir,
                 tf_master,
                 trial=base_trial.NoOpTrial()):
        """Construct a new BaseRunner.

    Args:
      params:  Params object containing model configuration.
      model_task_name:  String name of the task this runner should execute for
        multitask models only.  See flag for details.
      logdir:  String path to the log directory to output to.
      tf_master:  String path to the master job, e.g. 'local'.
      trial:   An optional hyperparameter trial. Used by Vizier studies.
    """
        p = params.Copy()
        # Set in subclasses.
        self._job_name = ''

        self._params = trial.OverrideModelParams(p)
        tf.logging.info('=' * 60)
        for line in self.params.ToText().split('\n'):
            tf.logging.info('%s', line)
        tf.logging.info('=' * 60)

        self._logdir = logdir
        self._tf_master = tf_master
        self._model_task_name = model_task_name
        self._trial = trial
        # If the runner is conducting a Vizier trial, scope all the variables
        # (e.g., global_step) by the trial id so that we do not share states across
        # trials.
        self._container_id = self._trial.Name()
        self._should_report_metrics = False

        # To early terminate a runner, we set max_steps here and that will trigger
        # appropriate ShouldStop behavior in the threads. This is used by Vizier
        # to early stop a trial.
        self._max_steps = None

        self.params.cluster.logdir = logdir
        self._cluster = cluster_factory.Cluster(self.params.cluster)
        self._train_dir = os.path.join(self._logdir, 'train')
        tf.io.gfile.makedirs(self._train_dir)
        self._graph = tf.Graph()
        self._summary_writer = None
        self._initialize_tables = None
        self._dequeue_thread_complete = False

        early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
        self._early_stop = None
        if p.train.early_stop and p.train.early_stop.window:
            self._early_stop = early_stop.EarlyStop(p.train.early_stop)
            with self._graph.as_default():
                self._early_stop.FProp(None)

        self._init_input_ops = []

        self._SetStatusMessage('Starting ...')
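
The `base_trial.NoOpTrial()` default used above is a null object: it overrides nothing and never stops a run. The class below is a rough, illustrative sketch of that contract, not the actual lingvo implementation.

class SketchNoOpTrial:
    """Illustrative stand-in for a no-op trial (not lingvo's own class)."""

    def Name(self):
        return ''                  # empty name -> default variable container

    def OverrideModelParams(self, params):
        return params              # no hyperparameter overrides applied

    def ShouldStop(self):
        return False               # never requests early termination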
Example No. 3
  def CreateRunners(cls, jobs, model_name, logdir,
                    trial=base_trial.NoOpTrial()):
    """Creates a list of runners based on `FLAGS.mode`.

    Args:
      jobs: a list of runner jobs.
      model_name: name of a registered `ModelParams` class.
      logdir: the directory used for logging, usually on CNS.
      trial: optional `Trial` object, used for reporting measures and early
        stopping.

    Returns:
      A list of `.BaseRunner`, one per job in `jobs`.
    """

    runners = []
    for j in jobs:
      tf_master = FLAGS.tf_master
      # Ensure that decoder or evaler threads do not clobber variables being
      # updated by trainer by forcing them to use independent sessions.
      if ('trainer' in jobs and
          (j.startswith('decoder') or j.startswith('evaler'))):
        tf_master = ''

      runner = cls._CreateRunner(j, model_name, FLAGS.model_task_name, logdir,
                                 tf_master, trial)
      runners.append(runner)
    return runners
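
A hypothetical call of this older classmethod variant, which also takes the registered model name; `RunnerManager`, the model name, and the job list are placeholders for illustration, not taken from the original source.

jobs = ['trainer', 'decoder_test']               # assumed job names
runners = RunnerManager.CreateRunners(
    jobs, 'image.mnist.LeNet5', '/tmp/logdir')   # assumed model name/logdir
# Because 'trainer' is in `jobs`, the decoder runner gets an empty tf_master
# and therefore runs in its own independent session.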
Example No. 4
    def __init__(self,
                 params,
                 model_task_name,
                 logdir,
                 tf_master,
                 trial=base_trial.NoOpTrial()):
        """Construct a new BaseRunner.

    Args:
      params:  Params object containing model configuration.
      model_task_name:  String name of the task this runner should execute
        for multitask models only.  See flag for details.
      logdir:  String path to the log directory to output to.
      tf_master:  String path to the master job, e.g. 'local'.
      trial:   An optional hyperparameter trial. Used by Vizier studies.
    """
        p = params.Copy()
        p.add_summary = FLAGS.add_summary

        self._params = trial.OverrideModelParams(p)
        tf.logging.info('=' * 60)
        for line in self.params.ToText().split('\n'):
            tf.logging.info('%s', line)
        tf.logging.info('=' * 60)

        self._logdir = logdir
        self._tf_master = tf_master
        self._model_task_name = model_task_name
        self._trial = trial
        # If the runner is conducting a Vizier trial, scope all the variables
        # (e.g., global_step) by the trial id so that we do not share states across
        # trials.
        self._container_id = self._trial.Name()

        self._cluster = cluster_factory.Cluster(self.params.cluster)
        self._train_dir = os.path.join(self._logdir, 'train')
        self._graph = tf.Graph()
        self._summary_writer = None
        self.initialize_tables = None

        early_stop.MetricHistory.SetLogdirInMetricHistories(p, logdir)
        self._early_stop = None
        if p.train.early_stop and p.train.early_stop.window:
            self._early_stop = early_stop.EarlyStop(p.train.early_stop)
            with self._graph.as_default():
                self._early_stop.FProp(None)

        self._SetStatusMessage('Starting ...')
Example No. 5
 def __init__(self, *args, **kwargs):
     super(BaseTrainerTest, self).__init__(*args, **kwargs)
     self._trial = base_trial.NoOpTrial()
Example No. 6
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self._trial = base_trial.NoOpTrial()
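
A self-contained sketch of the test-fixture pattern in Examples 5 and 6; the test class and the assertion are illustrative additions, and only the `NoOpTrial` construction mirrors the originals.

import unittest

from lingvo.core import base_trial


class RunnerHelperTest(unittest.TestCase):
    """Illustrative fixture that holds a no-op trial, as in the examples."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._trial = base_trial.NoOpTrial()

    def testTrialNeverStops(self):
        # A no-op trial is expected never to request early stopping.
        self.assertFalse(self._trial.ShouldStop())


if __name__ == '__main__':
    unittest.main()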
Example No. 7
    def __init__(self,
                 params,
                 model_task_name='',
                 logdir='',
                 tf_master='',
                 trial=base_trial.NoOpTrial()):
        """A job runner.

    Args:
      params:  Params object containing model configuration.
      model_task_name:  String name of the task this runner should execute for
        multitask models only.  See flag for details.
      logdir:  String path to the log directory to output to.
      tf_master:  String path to the master job, e.g. 'local'.
      trial:   An optional hyperparameter trial. Used by Vizier studies.
    """
        self._params = trial.OverrideModelParams(params.Copy())
        p = self.params

        self._model_task_name = model_task_name
        self._logdir = logdir
        self._train_dir = os.path.join(self._logdir, 'train')
        tf.io.gfile.makedirs(self._train_dir)

        self._tf_master = tf_master
        self._trial = trial

        # If the runner is conducting a Vizier trial, scope all the variables
        # (e.g., global_step) by the trial id so that we do not share states across
        # trials.
        self._container_id = self._trial.Name()

        # Set in subclasses.
        self._job_name = ''
        self._daemon = False
        self._verbose_enqueue_logging = False

        self._checkpointer = None
        self._should_report_metrics = False

        if py_utils.IsEagerMode():
            self._graph = None
        else:
            self._graph = tf.Graph()
        self._summary_writer = None
        self._initialize_tables = None
        self._dequeue_thread_complete = False

        self._early_stop = None
        # The actual EarlyStop object.
        if p.train.early_stop and p.train.early_stop.window:
            early_stop.MetricHistory.SetLogdirInMetricHistories(
                p, self._logdir)
            self._early_stop = p.train.early_stop.Instantiate()
            self._verbose_enqueue_logging = True

        # Merged TF scalar summaries for training related input data stats.
        self._merged_input_data_summary_op = None

        # To early terminate a runner, we set max_steps here and that will trigger
        # appropriate ShouldStop behavior in the threads. This is used by Vizier
        # to early stop a trial and also EarlyStop to stop training based on
        # metrics.
        self._max_steps_for_early_stop = None

        self.enqueue_ops = None

        tf.logging.info('=' * 60)
        for line in self.params.ToText().split('\n'):
            tf.logging.info('%s', line)
        tf.logging.info('=' * 60)

        self._SetStatusMessage('Starting ...')
        self.params.cluster.logdir = logdir
        self._cluster = cluster_factory.Cluster(self.params.cluster)
        self._worker_cluster_def = self._cluster.worker_cluster_def

        if py_utils.IsEagerMode():
            self._cluster.InitDevicesEager()
        else:
            self._cluster.InitDevices(self._GetSession())

        # Ensure global step tensor is created.
        with contextlib.ExitStack() as stack:
            if not py_utils.IsEagerMode():
                stack.enter_context(self._graph.as_default())
                stack.enter_context(tf.device(self._cluster.GetPlacer()))
            # It is important that we enter the tf.container scope *after*
            # the graph scope. If we reverse the ordering, the tf.container
            # basically has no-effect which is a tricky silent error.
            stack.enter_context(tf.container(self._container_id))
            self._global_step_var = py_utils.GetOrCreateGlobalStepVar()
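
The ordering note near the end of this constructor (enter the graph scope before `tf.container`) can be shown with a minimal TF1-style sketch; the container name and the global-step call below are placeholders, not lingvo code.

import contextlib

import tensorflow.compat.v1 as tf

graph = tf.Graph()
with contextlib.ExitStack() as stack:
    # 1) Enter the graph scope first ...
    stack.enter_context(graph.as_default())
    # 2) ... then the container scope, so it applies to *this* graph.
    #    Reversing the order would silently scope the default graph instead.
    stack.enter_context(tf.container('trial_0'))
    global_step = tf.train.get_or_create_global_step()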