Example #1
 def get_current_step_learning_rate(self):
     """
     :rtype: tf.Tensor
     """
     lr = self.learning_rate_var
     if self.config.typed_dict.get("dynamic_learning_rate"):
         # To implement any kind of cyclic learning rate during the epoch. E.g.: https://arxiv.org/abs/1608.03983
         with tf.name_scope("dynamic_learning_rate"):
             from Util import CollectionReadCheckCovered
             opts = CollectionReadCheckCovered(
                 self.config.typed_dict["dynamic_learning_rate"])
             # Currently all intervals of same step size.
             interval_steps = tf.constant(
                 opts["interval"],
                 name="interval",
                 dtype=self.network.global_train_step.dtype)
             step_in_interval = tf.mod(self.network.global_train_step,
                                       interval_steps,
                                       name="step_in_interval")
             factor = tf.pow(tf.constant(opts["decay"],
                                         name="decay",
                                         dtype=tf.float32),
                             tf.to_float(step_in_interval,
                                         name="step_in_interval_float"),
                             name="factor")
             lr *= factor
             opts.assert_all_read()
     if self.config.is_true("use_horovod") and self.config.is_true(
             "horovod_scale_lr"):
         import horovod.tensorflow as hvd
         lr *= hvd.size()
     return lr
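
The two options read above, "interval" and "decay", come from the "dynamic_learning_rate" entry of the config. A minimal sketch of such an entry in a RETURNN-style Python config (the concrete numbers are illustrative, not taken from the source):

# Restart the schedule every 1000 train steps; within an interval the base
# learning rate is multiplied by decay ** step_in_interval.
dynamic_learning_rate = {"interval": 1000, "decay": 0.999}
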
Example #2
 def __init__(self, config, train_data):
   """
   :param Config.Config config:
   :param Dataset train_data:
   """
   self.config = config
   self.opts = CollectionReadCheckCovered(config.get_of_type("hyper_param_tuning", dict, {}))
   self.log = log.v1
   train_data.init_seq_order(epoch=1)
   self.train_data = StaticDataset.copy_from_dataset(
     train_data, max_seqs=self.opts.get("num_train_steps", 100))
   self.hyper_params = []  # type: list[HyperParam]
   self._find_hyper_params()
   if not self.hyper_params:
     raise Exception("No hyper params found.")
   self.hyper_params.sort(key=lambda p: p.unique_idx)
   print("We have found these hyper params:")
   for p in self.hyper_params:
     print(" %s" % p.description())
   self.dry_run_first_individual = self.opts.get("dry_run_first_individual", True)
   self.num_iterations = self.opts["num_tune_iterations"]
   self.num_individuals = self.opts["num_individuals"]
   self.num_kill_individuals = self.opts.get(
     "num_kill_individuals", self.num_individuals // 2)
   self.num_best = self.opts.get("num_best", 10)
   self.num_threads = self.opts.get("num_threads", guess_requested_max_num_threads())
   self.opts.assert_all_read()
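
Everything this constructor reads sits under the "hyper_param_tuning" config entry; because the dict is wrapped in CollectionReadCheckCovered and assert_all_read() is called at the end, keys not read here will trigger the assert. A sketch of the entry with the keys the code uses (values are illustrative; defaults are noted where the code supplies one):

hyper_param_tuning = {
  "num_train_steps": 100,            # max_seqs for the StaticDataset copy (default 100)
  "num_tune_iterations": 10,         # required
  "num_individuals": 20,             # required
  "num_kill_individuals": 10,        # default: num_individuals // 2
  "num_best": 10,                    # default 10
  "num_threads": 4,                  # default: guess_requested_max_num_threads()
  "dry_run_first_individual": True,  # default True
}
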
Example #3
 def __init__(self, config):
     """
      :param Config config:
      """
     print("Initialize distributed TensorFlow", file=log.v2)
     self.config = config
     opts = config.get_of_type("distributed_tf", dict, {})
     opts = CollectionReadCheckCovered(opts)
     self.opts = opts
     if opts.get("local_only", False):  # might be useful for testing
         cluster_resolver = LocalOnlyClusterResolver()
         print("Use local-only cluster resolver,", file=log.v4, end=" ")
     elif os.environ.get("TF_CONFIG", ""):
          cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
         print("Use TF_CONFIG %s," % os.environ["TF_CONFIG"],
               file=log.v4,
               end=" ")
     else:
         cluster_resolver = MPIClusterResolver()
         print("Use MPI cluster resolver,", file=log.v4, end=" ")
     print("cluster spec %s, master %s" %
           (cluster_resolver.cluster_spec(), cluster_resolver.master()),
           file=log.v4)
     self.cluster_resolver = cluster_resolver
     cluster_spec = cluster_resolver.cluster_spec()
     self.cluster_spec = cluster_spec
     tf_session_opts = config.typed_dict.get("tf_session_opts", {})
     server_config = tf.compat.v1.ConfigProto(**tf_session_opts)
     # Note that there is no clean way currently in TF to uninit the TF server.
     # If we would use this multiple times (e.g. in tests),
     # it might actually be better to cache the server as a singleton...
     server = tf.distribute.Server(cluster_spec,
                                   job_name=cluster_resolver.task_type,
                                   task_index=cluster_resolver.task_id,
                                   config=server_config)
     self.server = server
      self.strategy = ReturnnDefaultStrategy()  # not really used currently...
     self.opts.assert_all_read()
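
When the "local_only" option is not set, the cluster layout is taken from the standard TF_CONFIG environment variable if it is present (otherwise the MPI resolver is used). A sketch of how one of two worker processes might set TF_CONFIG before this constructor runs; host names, ports and the task index are placeholders:

import json
import os

# Worker 0 of a two-worker cluster; worker 1 would use "index": 1.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host1:2222", "host2:2222"]},
    "task": {"type": "worker", "index": 0},
})
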
Example #4
 def get_current_step_learning_rate(self):
   """
   :rtype: tf.Tensor
   """
   lr = self.learning_rate_var
   if self.config.typed_dict.get("dynamic_learning_rate"):
     # To implement any kind of cyclic learning rate during the epoch. E.g.: https://arxiv.org/abs/1608.03983
     with tf.name_scope("dynamic_learning_rate"):
       from Util import CollectionReadCheckCovered
       opts = CollectionReadCheckCovered(self.config.typed_dict["dynamic_learning_rate"])
       # Currently all intervals of same step size.
       interval_steps = tf.constant(opts["interval"], name="interval", dtype=self.network.global_train_step.dtype)
       step_in_interval = tf.mod(self.network.global_train_step, interval_steps, name="step_in_interval")
       factor = tf.pow(
         tf.constant(opts["decay"], name="decay", dtype=tf.float32),
         tf.to_float(step_in_interval, name="step_in_interval_float"), name="factor")
       lr *= factor
       opts.assert_all_read()
   if self.config.is_true("use_horovod") and self.config.is_true("horovod_scale_lr"):
     # noinspection PyPackageRequirements,PyUnresolvedReferences
     import horovod.tensorflow as hvd
     lr *= hvd.size()
   return lr
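
Outside of the TF graph, the schedule built above reduces to lr * decay ** (global_step % interval), optionally multiplied by the Horovod world size. A small pure-Python sketch of that arithmetic, handy for sanity-checking an "interval"/"decay" pair; the helper name and default values are mine, not part of the source:

def effective_learning_rate(base_lr, global_step, interval=1000, decay=0.999, horovod_size=1):
  """Plain-Python mirror of the schedule in get_current_step_learning_rate()."""
  step_in_interval = global_step % interval
  return base_lr * (decay ** step_in_interval) * horovod_size

# Learning rate just before the interval restarts:
print(effective_learning_rate(1e-3, global_step=999))  # ~3.7e-4
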
Example #5
class Optimization:
  def __init__(self, config, train_data):
    """
    :param Config.Config config:
    :param Dataset train_data:
    """
    self.config = config
    self.opts = CollectionReadCheckCovered(config.get_of_type("hyper_param_tuning", dict, {}))
    self.log = log.v1
    train_data.init_seq_order(epoch=1)
    self.train_data = StaticDataset.copy_from_dataset(
      train_data, max_seqs=self.opts.get("num_train_steps", 100))
    self.hyper_params = []  # type: list[HyperParam]
    self._find_hyper_params()
    if not self.hyper_params:
      raise Exception("No hyper params found.")
    self.hyper_params.sort(key=lambda p: p.unique_idx)
    print("We have found these hyper params:")
    for p in self.hyper_params:
      print(" %s" % p.description())
    self.dry_run_first_individual = self.opts.get("dry_run_first_individual", True)
    self.num_iterations = self.opts["num_tune_iterations"]
    self.num_individuals = self.opts["num_individuals"]
    self.num_kill_individuals = self.opts.get(
      "num_kill_individuals", self.num_individuals // 2)
    self.num_best = self.opts.get("num_best", 10)
    self.num_threads = self.opts.get("num_threads", guess_requested_max_num_threads())
    self.opts.assert_all_read()

  def _find_hyper_params(self, base=None, visited=None):
    """
    :param _AttrChain base:
    :param set[int] visited: set of ids
    """
    from inspect import ismodule
    if base is None:
      base = _AttrChain(base=self.config)
    if isinstance(base.value, HyperParam):
      base.value.usages.append(base)
      if base.value not in self.hyper_params:
        self.hyper_params.append(base.value)
      return
    if visited is None:
      visited = set()
    if id(base.value) in visited:
      return
    visited.add(id(base.value))
    if ismodule(base.value):
      return
    if isinstance(base.value, dict):
      col_type = _AttribOrKey.ColTypeDict
      keys = base.value.keys()
    elif isinstance(base.value, Config):
      col_type = _AttribOrKey.ColTypeConfig
      keys = base.value.typed_dict.keys()
    else:
      # Add other specific object types, but not in generic all.
      return
    for key in sorted(keys):
      child = base.get_extended_chain(_AttribOrKey(key=key, col_type=col_type))
      self._find_hyper_params(base=child, visited=visited)

  def get_population(self, iteration_idx, num_individuals):
    """
    :param int iteration_idx:
    :param int num_individuals:
    :rtype: list[Individual]
    """
    assert num_individuals > 0
    return [
      self.get_individual(iteration_idx=iteration_idx, individual_idx=i)
      for i in range(num_individuals)]

  def get_individual(self, iteration_idx, individual_idx):
    """
    :param int iteration_idx:
    :param int individual_idx:
    :rtype: Individual
    """
    return Individual(
      {p: p.get_random_value_by_idx(iteration_idx=iteration_idx, individual_idx=individual_idx)
       for p in self.hyper_params},
      name="%i-%i" % (iteration_idx, individual_idx))

  def cross_over(self, population, iteration_idx):
    """
    :param list[Individual] population: modified in-place
    :param int iteration_idx:
    """
    for i in range(len(population) - 1):
      population[i] = population[i].cross_over(
        hyper_params=self.hyper_params,
        population=population[:i] + population[i + 1:],
        random_seed=iteration_idx * 1013 + i * 17)

  def create_config_instance(self, hyper_param_mapping, gpu_ids):
    """
    :param dict[HyperParam] hyper_param_mapping: maps each hyper param to some value
    :param set[int] gpu_ids:
    :rtype: Config
    """
    assert set(self.hyper_params) == set(hyper_param_mapping.keys())
    from Util import deepcopy
    config = deepcopy(self.config)
    assert isinstance(config, Config)
    for p, value in hyper_param_mapping.items():
      assert isinstance(p, HyperParam)
      for attr_chain in p.usages:
        attr_chain.write_attrib(base=config, new_value=value)
    tf_session_opts = config.typed_dict.setdefault("tf_session_opts", {})
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto
    gpu_opts = tf_session_opts.setdefault("gpu_options", tf.GPUOptions())
    if isinstance(gpu_opts, dict):
      gpu_opts = tf.GPUOptions(**gpu_opts)
    gpu_opts.visible_device_list = ",".join(map(str, sorted(gpu_ids)))
    return config

  def work(self):
    print("Starting hyper param search. Using %i threads." % self.num_threads, file=log.v1)
    from TFUtil import get_available_gpu_devices
    from Log import wrap_log_streams, StreamDummy
    from threading import Thread, Condition
    from Util import progress_bar, hms, is_tty

    class Outstanding:
      cond = Condition()
      threads = []  # type: list[WorkerThread]
      population = []
      exit = False
      exception = None

    class WorkerThread(Thread):
      def __init__(self, gpu_ids):
        """
        :param set[int] gpu_ids:
        """
        super(WorkerThread, self).__init__(name="Hyper param tune train thread")
        self.gpu_ids = gpu_ids
        self.trainer = None  # type: _IndividualTrainer
        self.finished = False
        self.start()

      def cancel(self, join=False):
        with Outstanding.cond:
          if self.trainer:
            self.trainer.cancel_flag = True
            if self.trainer.runner:
              self.trainer.runner.cancel_flag = True
        if join:
          self.join()

      def get_complete_frac(self):
        with Outstanding.cond:
          if self.trainer and self.trainer.runner:
            return self.trainer.runner.data_provider.get_complete_frac()
        return 0.0

      def run(self_thread):
        try:
          while True:
            with Outstanding.cond:
              if Outstanding.exit or Outstanding.exception:
                return
              if not Outstanding.population:
                self_thread.finished = True
                Outstanding.cond.notify_all()
                return
              individual = Outstanding.population.pop(0)
              self_thread.trainer = _IndividualTrainer(optim=self, individual=individual, gpu_ids=self_thread.gpu_ids)
            self_thread.name = "Hyper param tune train thread on %r" % individual.name
            self_thread.trainer.run()
        except Exception as exc:
          with Outstanding.cond:
            if not Outstanding.exception:
              Outstanding.exception = exc or True
            Outstanding.cond.notify_all()
          for thread in Outstanding.threads:
            if thread is not self_thread:
              thread.cancel()
          if not isinstance(exc, CancelTrainingException):
            with Outstanding.cond:  # So that we don't mix up multiple on sys.stderr.
              # This would normally dump it on sys.stderr so it's fine.
              sys.excepthook(*sys.exc_info())

    best_individuals = []
    population = []
    canceled = False
    num_gpus = len(get_available_gpu_devices())
    print("Num available GPUs:", num_gpus)
    num_gpus = num_gpus or 1  # Would be ignored anyway.
    interactive = is_tty()
    try:
      print("Population of %i individuals (hyper param setting instances), running for %i evaluation iterations." % (
        self.num_individuals, self.num_iterations), file=log.v2)
      for cur_iteration_idx in range(1, self.num_iterations + 1):
        print("Starting iteration %i." % cur_iteration_idx, file=log.v2)
        if cur_iteration_idx == 1:
          population.append(Individual(
            {p: p.get_default_value() for p in self.hyper_params}, name="default"))
          population.append(Individual(
            {p: p.get_initial_value() for p in self.hyper_params}, name="canonical"))
        population.extend(self.get_population(
          iteration_idx=cur_iteration_idx, num_individuals=self.num_individuals - len(population)))
        if cur_iteration_idx > 1:
          self.cross_over(population=population, iteration_idx=cur_iteration_idx)
        if cur_iteration_idx == 1 and self.dry_run_first_individual:
          # Train first directly for testing and to see log output.
          # Later we will strip away all log output.
          print("Very first try with log output:", file=log.v2)
          _IndividualTrainer(optim=self, individual=population[0], gpu_ids={0}).run()
        print("Starting training with thread pool of %i threads." % self.num_threads)
        iteration_start_time = time.time()
        with wrap_log_streams(StreamDummy(), also_sys_stdout=True, tf_log_verbosity="WARN"):
          Outstanding.exit = False
          Outstanding.population = list(population)
          Outstanding.threads = [WorkerThread(gpu_ids={i % num_gpus}) for i in range(self.num_threads)]
          try:
            while True:
              with Outstanding.cond:
                if all([thread.finished for thread in Outstanding.threads]) or Outstanding.exception:
                  break
                complete_frac = max(len(population) - len(Outstanding.population) - len(Outstanding.threads), 0)
                complete_frac += sum([thread.get_complete_frac() for thread in Outstanding.threads])
                complete_frac /= float(len(population))
                remaining_str = ""
                if complete_frac > 0:
                  start_elapsed = time.time() - iteration_start_time
                  total_time_estimated = start_elapsed / complete_frac
                  remaining_estimated = total_time_estimated - start_elapsed
                  remaining_str = hms(remaining_estimated)
                if interactive:
                  progress_bar(complete_frac, prefix=remaining_str, file=sys.__stdout__)
                else:
                  print(
                    "Progress: %.02f%%" % (complete_frac * 100),
                    "remaining:", remaining_str or "unknown", file=sys.__stdout__)
                  sys.__stdout__.flush()
                Outstanding.cond.wait(1 if interactive else 10)
            for thread in Outstanding.threads:
              thread.join()
          finally:
            Outstanding.exit = True
            for thread in Outstanding.threads:
              thread.cancel(join=True)
        Outstanding.threads = []
        print("Training iteration elapsed time:", hms(time.time() - iteration_start_time))
        if Outstanding.exception:
          raise Outstanding.exception
        assert not Outstanding.population
        print("Training iteration finished.")
        population.sort(key=lambda p: p.cost)
        del population[-self.num_kill_individuals:]
        best_individuals.extend(population)
        best_individuals.sort(key=lambda p: p.cost)
        del best_individuals[self.num_best:]
        population = best_individuals[:self.num_kill_individuals // 4] + population
        print("Current best setting, individual %s" % best_individuals[0].name, "cost:", best_individuals[0].cost)
        for p in self.hyper_params:
          print(" %s -> %s" % (p.description(), best_individuals[0].hyper_param_mapping[p]))
    except KeyboardInterrupt:
      print("KeyboardInterrupt, canceled search.")
      canceled = True

    print("Best %i settings:" % len(best_individuals))
    for individual in best_individuals:
      print("Individual %s" % individual.name, "cost:", individual.cost)
      for p in self.hyper_params:
        print(" %s -> %s" % (p.description(), individual.hyper_param_mapping[p]))