def print_process(self):
    """
    Report the current progress of the parent task.

    Writes a comma-separated status line to the ``log.v5`` stream when
    ``log.verbose[5]`` is set, and updates the progress bar when the parent
    task is interactive. Does nothing when neither output is enabled.
    """
    task = self.parent
    if not (task.interactive or log.v[5]):
        return

    elapsed = time.time() - task.start_time
    frac_done = task.batches.completed_frac()
    assert frac_done > 0
    # Extrapolate the total runtime linearly from the fraction done so far.
    estimated_total = elapsed / frac_done
    remaining = estimated_total - elapsed

    if log.verbose[5]:
        mem_usage = self.device_mem_usage_str(self.alloc_devices)
        fields = [task.report_prefix, "batch %i" % self.run_start_batch_idx]
        if self.eval_info:  # Such as score.
            for entry in sorted(self.eval_info.items()):
                fields.append("%s %s" % entry)
        fields.append("elapsed %s" % hms(elapsed))
        fields.append("exp. remaining %s" % hms(remaining))
        fields.append("complete %.02f%%" % (frac_done * 100))
        if mem_usage:
            fields.append("memory %s" % mem_usage)
        # Empty entries (e.g. an empty report prefix) are dropped.
        status_line = ", ".join([part for part in fields if part])
        print(status_line, file=log.v5)

    if task.interactive:
        progress_bar(frac_done, hms(remaining))
def _print_process(self, report_prefix, step, step_duration, eval_info):
    """
    Report the progress of the current run after a step.

    When ``log.verbose[5]`` is set, a status line is written to ``log.v5``;
    otherwise, if the interactive process bar is enabled, the progress bar is
    updated instead. Does nothing when neither output is enabled.

    :param report_prefix: leading entry of the status line
    :param step: step index, formatted with ``%i``
    :param step_duration: duration of the step in seconds, formatted with ``%.3f``
    :param eval_info: optional mapping of extra values (such as score) to report
    """
    if not (self._show_interactive_process_bar or log.v[5]):
        return

    elapsed = time.time() - self.start_time
    frac_done = self.data_provider.batches.completed_frac()
    assert frac_done > 0
    # Extrapolate the total runtime linearly from the fraction done so far.
    estimated_total = elapsed / frac_done
    remaining = estimated_total - elapsed

    if log.verbose[5]:
        fields = [report_prefix, "step %i" % step]
        if eval_info:  # Such as score.
            for entry in sorted(eval_info.items()):
                fields.append("%s %s" % entry)
        fields.append("%.3f sec/step" % step_duration)
        fields.append("elapsed %s" % hms(elapsed))
        fields.append("exp. remaining %s" % hms(remaining))
        fields.append("complete %.02f%%" % (frac_done * 100))
        # Empty entries (e.g. an empty report prefix) are dropped.
        print(", ".join([part for part in fields if part]), file=log.v5)
        return

    # Only reached without verbose logging: fall back to the progress bar.
    if self._show_interactive_process_bar:
        from Util import progress_bar
        progress_bar(frac_done, hms(remaining))
def print_process(self):
    """
    Report the current progress of the parent task.

    Writes a comma-separated status line to the ``log.v5`` stream when
    ``log.verbose[5]`` is set, and updates the progress bar when the parent
    task is interactive. Does nothing when neither output is enabled.

    The status line is emitted with the ``print(..., file=...)`` function
    form rather than the Python 2 ``print >> stream`` statement, so it works
    under print-function semantics.
    """
    if not self.parent.interactive and not log.v[5]:
        return
    start_elapsed = time.time() - self.parent.start_time
    complete = self.parent.batches.completed_frac()
    assert complete > 0
    # Extrapolate the total runtime linearly from the fraction done so far.
    total_time_estimated = start_elapsed / complete
    remaining_estimated = total_time_estimated - start_elapsed
    if log.verbose[5]:
        mem_usage = self.device_mem_usage_str(self.alloc_devices)
        info = [
            self.parent.report_prefix,
            "batch %i" % self.run_start_batch_idx,
        ]
        if self.eval_info:  # Such as score.
            info += ["%s %s" % item for item in sorted(self.eval_info.items())]
        info += [
            "elapsed %s" % hms(start_elapsed),
            "exp. remaining %s" % hms(remaining_estimated),
            "complete %.02f%%" % (complete * 100),
        ]
        if mem_usage:
            info += ["memory %s" % mem_usage]
        # Empty entries (e.g. an empty report prefix) are dropped.
        print(", ".join(filter(None, info)), file=log.v5)
    if self.parent.interactive:
        progress_bar(complete, hms(remaining_estimated))
def run_inner(self):
    """
    Main loop of the task: prepare the devices, hand batches to the device
    runs, collect their results and reduce them, until no batches are left.

    Results are accumulated until the summed 'cost:' column of the collected
    results reaches ``self.eval_batch_size`` or there are no more batches;
    then ``self.reduce`` is called once all device runs are idle.

    Returns early (without ``finalize``) when a device run crashed or died,
    or when ``sys.exited`` is set. On normal completion ``self.elapsed`` holds
    the wall-clock duration in seconds.
    """
    self.start_time = time.time()
    for device in self.devices:
        device.prepare(epoch=self.epoch, **self.get_device_prepare_args())
    self.initialize()
    terminal_width, _ = terminal_size()
    self.interactive = (log.v[3] and terminal_width >= 0)
    print("starting task", self.task, file=log.v5)
    # Reset the per-epoch counters on every device.
    for device in self.devices:
        device.eval_batch_idx = -1
        device.start_epoch_stats()
        device.num_frames = 0
        device.num_updates = 0
        device.tot = 0
    # With shared batches a single run drives all devices; otherwise there is
    # one run per device.
    num_device_runs = 1 if self.share_batches else len(self.devices)
    deviceRuns = [
        self.DeviceBatchRun(
            self,
            [self.devices[i]] if not self.share_batches else self.devices)
        for i in range(num_device_runs)
    ]
    results = {'batchess': [], 'results': [], 'num_frames': NumbersDict(0)}
    run_frames = NumbersDict(0)
    # Index of the 'cost:' entry within the result format; -1 until found.
    cost_result_format = -1
    crashed = False
    assert num_device_runs > 0
    while True:
        if getattr(sys, "exited", False):
            # This happens when we exit Python.
            # Without this check, this thread would keep running until all exit handlers of Python are done.
            print("%s stopped" % self, file=log.v5)
            crashed = True
            break
        # Collect the results of all device runs which have finished.
        for i in range(num_device_runs):
            if deviceRuns[i].crashed or not deviceRuns[i].is_alive():
                crashed = True
                break
            if deviceRuns[i].finished:
                results['batchess'] += deviceRuns[i].result['batchess'][:]
                results['results'] += deviceRuns[i].result['results'][:]
                results['result_format'] = deviceRuns[i].result[
                    'result_format']
                deviceRuns[i].finished = False
        if crashed:
            break
        # NOTE: `i` is the loop variable left over from the for-loop above,
        # i.e. the last device run index (the assert above guarantees the loop
        # ran at least once).
        if cost_result_format < 0 and deviceRuns[i].result['result_format']:
            for idx, fmt in enumerate(
                    deviceRuns[i].result['result_format']):
                if fmt and fmt.startswith('cost:'):
                    cost_result_format = idx
        total_cost = 0
        if results['results'] and cost_result_format >= 0:
            # Sum the cost column over all results collected so far.
            total_cost = numpy.asarray(
                results['results'])[:, cost_result_format].sum()
        if total_cost >= self.eval_batch_size or not self.batches.has_more(
        ):
            # Only reduce once every device run is idle.
            if all(not (dev.finished or dev.allocated or dev.processing)
                   for dev in deviceRuns):
                results['num_frames'] = run_frames
                self.num_frames += run_frames
                if self.share_batches:
                    run_frames *= len(self.devices)
                self.reduce(run_frames)
                self.eval_batch_idx += 1
                # Start accumulating afresh for the next evaluation batch.
                run_frames = NumbersDict(0)
                results['batchess'] = []
                results['results'] = []
                for device in self.devices:
                    device.num_frames = 0
                    device.num_updates = 0
                if not self.batches.has_more():
                    break
            else:
                time.sleep(0.01)
        # Hand out work: allocate batches on the first free device run, for as
        # long as a free one is found (`match`).
        match = True
        while self.batches.has_more(
        ) and total_cost < self.eval_batch_size and match:
            self.batch_idx = self.batches.get_current_batch_idx()
            if self.batch_idx < self.start_batch:
                # Skip batches before the requested start batch.
                self.batches.advance(1)
                break
            match = False
            for i in range(num_device_runs):
                if not deviceRuns[i].allocated:
                    deviceRuns[i].allocate()
                    run_frames += deviceRuns[i].run_frames
                    match = True
                    break
        if not match:
            # No free device run was found; wait a bit before polling again.
            time.sleep(0.01)
    for run in deviceRuns:
        run.stop()
    if crashed:
        return
    for device in self.devices:
        device.finish_epoch_stats()
    self.finalize()
    if self.interactive:
        # Called without arguments once the task is done.
        progress_bar()
    self.elapsed = (time.time() - self.start_time)
def _print_finish_process(self):
    """
    Call ``progress_bar()`` without arguments if the interactive process bar
    is enabled; otherwise do nothing.
    """
    if not self._show_interactive_process_bar:
        return
    # Imported lazily, only when the bar is actually in use.
    from Util import progress_bar
    progress_bar()
def run_inner(self):
    """
    Main loop of the task: prepare the devices, hand batches to the device
    runs, collect their results and reduce them, until no batches are left.

    Results are accumulated until ``run_frames.max_value()`` reaches
    ``self.eval_batch_size`` or there are no more batches; then
    ``self.reduce`` is called once all device runs are idle.

    Returns early (without ``finalize``) when a device run crashed or when
    ``sys.exited`` is set. On normal completion ``self.elapsed`` holds the
    wall-clock duration in seconds.

    Log output uses the ``print(..., file=...)`` function form rather than the
    Python 2 ``print >> stream`` statement, so it works under print-function
    semantics.
    """
    self.start_time = time.time()
    for device in self.devices:
        device.prepare(epoch=self.epoch, **self.get_device_prepare_args())
    self.initialize()
    terminal_width, _ = terminal_size()
    self.interactive = (log.v[3] and terminal_width >= 0)
    print("starting task", self.task, file=log.v5)
    # Reset the per-epoch counters on every device.
    for device in self.devices:
        device.eval_batch_idx = -1
        device.start_epoch_stats()
        device.num_frames = 0
        device.num_updates = 0
        device.tot = 0
    # With shared batches a single run drives all devices; otherwise there is
    # one run per device.
    num_device_runs = 1 if self.share_batches else len(self.devices)
    deviceRuns = [
        self.DeviceBatchRun(
            self,
            [self.devices[i]] if not self.share_batches else self.devices)
        for i in range(num_device_runs)
    ]
    results = {'batchess': [], 'results': [], 'num_frames': NumbersDict(0)}
    run_frames = NumbersDict(0)
    crashed = False
    while True:
        if getattr(sys, "exited", False):
            # This happens when we exit Python.
            # Without this check, this thread would keep running until all exit handlers of Python are done.
            print("%s stopped" % self, file=log.v5)
            crashed = True
            break
        # Collect the results of all device runs which have finished.
        for i in range(num_device_runs):
            if deviceRuns[i].crashed:
                crashed = True
                break
            if deviceRuns[i].finished:
                results['batchess'] += deviceRuns[i].result['batchess'][:]
                results['results'] += deviceRuns[i].result['results'][:]
                results['result_format'] = deviceRuns[i].result['result_format']
                deviceRuns[i].finished = False
        if crashed:
            break
        if run_frames.max_value() >= self.eval_batch_size or not self.batches.has_more():
            # Only reduce once every device run is idle.
            if all(not (dev.finished or dev.allocated or dev.processing)
                   for dev in deviceRuns):
                results['num_frames'] = run_frames
                self.num_frames += run_frames
                if self.share_batches:
                    run_frames *= len(self.devices)
                self.reduce(run_frames)
                self.eval_batch_idx += 1
                # Start accumulating afresh for the next evaluation batch.
                run_frames = NumbersDict(0)
                results['batchess'] = []
                results['results'] = []
                for device in self.devices:
                    device.num_frames = 0
                    device.num_updates = 0
                if not self.batches.has_more():
                    break
            else:
                time.sleep(0.01)
        # Hand out work: allocate batches on the first free device run, for as
        # long as a free one is found (`match`).
        match = True
        while self.batches.has_more() and run_frames.max_value() < self.eval_batch_size and match:
            self.batch_idx = self.batches.get_current_batch_idx()
            if self.batch_idx < self.start_batch:
                # Skip batches before the requested start batch.
                self.batches.advance(1)
                break
            match = False
            for i in range(num_device_runs):
                if not deviceRuns[i].allocated:
                    deviceRuns[i].allocate()
                    run_frames += deviceRuns[i].run_frames
                    match = True
                    break
        if not match:
            # No free device run was found; wait a bit before polling again.
            time.sleep(0.01)
    for run in deviceRuns:
        run.stop()
    if crashed:
        return
    for device in self.devices:
        device.finish_epoch_stats()
    self.finalize()
    if self.interactive:
        # Called without arguments once the task is done.
        progress_bar()
    self.elapsed = (time.time() - self.start_time)
def work(self):
    """
    Run the hyper parameter search.

    For ``self.num_iterations`` iterations, a population of individuals
    (hyper param settings) is trained by a pool of ``self.num_threads`` worker
    threads, then sorted by cost and pruned. The best settings found are
    printed at the end. A ``KeyboardInterrupt`` cancels the search, but the
    best settings found so far are still printed.

    An exception raised in a worker thread cancels the other workers and is
    re-raised here after the threads have been joined.
    """
    print("Starting hyper param search. Using %i threads." % self.num_threads, file=log.v1)
    from TFUtil import get_available_gpu_devices
    from Log import wrap_log_streams, StreamDummy
    from threading import Thread, Condition
    from Util import progress_bar, hms, is_tty

    class Outstanding:
        """
        Shared state between the main thread and the worker threads.
        Accesses from the threads are guarded by ``cond``.
        """
        cond = Condition()
        threads = []  # type: list[WorkerThread]
        population = []  # individuals still waiting to be trained
        exit = False  # set by the main thread to make the workers stop
        exception = None  # first exception raised in any worker

    class WorkerThread(Thread):
        """
        Thread which takes individuals from ``Outstanding.population`` and
        trains them one after another. It starts itself on construction.
        """

        def __init__(self, gpu_ids):
            """
            :param set[int] gpu_ids: passed on to each ``_IndividualTrainer``
            """
            super(WorkerThread, self).__init__(name="Hyper param tune train thread")
            self.gpu_ids = gpu_ids
            self.trainer = None  # type: _IndividualTrainer
            self.finished = False
            self.start()

        def cancel(self, join=False):
            """
            Set the cancel flag on the current trainer and its runner, if any.

            :param bool join: whether to wait for the thread to finish
            """
            with Outstanding.cond:
                if self.trainer:
                    self.trainer.cancel_flag = True
                    if self.trainer.runner:
                        self.trainer.runner.cancel_flag = True
            # Join outside of the lock; the thread needs the lock to exit.
            if join:
                self.join()

        def get_complete_frac(self):
            """
            :return: completed fraction as reported by the current runner's
              data provider, or 0.0 if there is no trainer/runner yet
            """
            with Outstanding.cond:
                if self.trainer and self.trainer.runner:
                    return self.trainer.runner.data_provider.get_complete_frac()
            return 0.0

        def run(self_thread):
            # Named `self_thread` because `self` refers to the enclosing
            # object of `work` here (it is passed as `optim` below).
            try:
                while True:
                    with Outstanding.cond:
                        if Outstanding.exit or Outstanding.exception:
                            return
                        if not Outstanding.population:
                            # Nothing left to do; wake up the main thread.
                            self_thread.finished = True
                            Outstanding.cond.notify_all()
                            return
                        individual = Outstanding.population.pop(0)
                        self_thread.trainer = _IndividualTrainer(
                            optim=self, individual=individual, gpu_ids=self_thread.gpu_ids)
                        self_thread.name = "Hyper param tune train thread on %r" % individual.name
                    # The training itself runs without holding the lock.
                    self_thread.trainer.run()
            except Exception as exc:
                with Outstanding.cond:
                    # Only the first exception is kept.
                    if not Outstanding.exception:
                        Outstanding.exception = exc or True
                    Outstanding.cond.notify_all()
                for thread in Outstanding.threads:
                    if thread is not self_thread:
                        thread.cancel()
                if not isinstance(exc, CancelTrainingException):
                    with Outstanding.cond:  # So that we don't mix up multiple on sys.stderr.
                        # This would normally dump it on sys.stderr so it's fine.
                        sys.excepthook(*sys.exc_info())

    best_individuals = []
    population = []
    # NOTE(review): `canceled` is assigned but never read in the visible code.
    canceled = False
    num_gpus = len(get_available_gpu_devices())
    print("Num available GPUs:", num_gpus)
    num_gpus = num_gpus or 1  # Would be ignored anyway.
    interactive = is_tty()
    try:
        print("Population of %i individuals (hyper param setting instances), running for %i evaluation iterations." % (
            self.num_individuals, self.num_iterations), file=log.v2)
        for cur_iteration_idx in range(1, self.num_iterations + 1):
            print("Starting iteration %i." % cur_iteration_idx, file=log.v2)
            if cur_iteration_idx == 1:
                # Seed the first population with two fixed individuals.
                population.append(Individual(
                    {p: p.get_default_value() for p in self.hyper_params}, name="default"))
                population.append(Individual(
                    {p: p.get_initial_value() for p in self.hyper_params}, name="canonical"))
            # Fill the population up to `num_individuals`.
            population.extend(self.get_population(
                iteration_idx=cur_iteration_idx, num_individuals=self.num_individuals - len(population)))
            if cur_iteration_idx > 1:
                self.cross_over(population=population, iteration_idx=cur_iteration_idx)
            if cur_iteration_idx == 1 and self.dry_run_first_individual:
                # Train first directly for testing and to see log output.
                # Later we will strip away all log output.
                print("Very first try with log output:", file=log.v2)
                _IndividualTrainer(optim=self, individual=population[0], gpu_ids={0}).run()
            print("Starting training with thread pool of %i threads." % self.num_threads)
            iteration_start_time = time.time()
            # Log streams (and sys.stdout) are replaced by a dummy stream
            # during training; progress goes to sys.__stdout__ instead.
            with wrap_log_streams(StreamDummy(), also_sys_stdout=True, tf_log_verbosity="WARN"):
                Outstanding.exit = False
                Outstanding.population = list(population)
                Outstanding.threads = [WorkerThread(gpu_ids={i % num_gpus}) for i in range(self.num_threads)]
                try:
                    while True:
                        with Outstanding.cond:
                            if all([thread.finished for thread in Outstanding.threads]) or Outstanding.exception:
                                break
                            # Individuals neither waiting nor assigned to one
                            # of the threads, plus the partial progress
                            # reported by each thread.
                            complete_frac = max(
                                len(population) - len(Outstanding.population) - len(Outstanding.threads), 0)
                            complete_frac += sum([thread.get_complete_frac() for thread in Outstanding.threads])
                            complete_frac /= float(len(population))
                            remaining_str = ""
                            if complete_frac > 0:
                                # Linear extrapolation of the remaining time.
                                start_elapsed = time.time() - iteration_start_time
                                total_time_estimated = start_elapsed / complete_frac
                                remaining_estimated = total_time_estimated - start_elapsed
                                remaining_str = hms(remaining_estimated)
                            if interactive:
                                progress_bar(complete_frac, prefix=remaining_str, file=sys.__stdout__)
                            else:
                                print(
                                    "Progress: %.02f%%" % (complete_frac * 100),
                                    "remaining:", remaining_str or "unknown",
                                    file=sys.__stdout__)
                                sys.__stdout__.flush()
                            # Woken up early by notify_all() from a worker;
                            # otherwise poll again after the timeout.
                            Outstanding.cond.wait(1 if interactive else 10)
                    for thread in Outstanding.threads:
                        thread.join()
                finally:
                    # Make sure all threads are stopped, also on exceptions
                    # such as KeyboardInterrupt.
                    Outstanding.exit = True
                    for thread in Outstanding.threads:
                        thread.cancel(join=True)
            Outstanding.threads = []
            print("Training iteration elapsed time:", hms(time.time() - iteration_start_time))
            if Outstanding.exception:
                raise Outstanding.exception
            assert not Outstanding.population
            print("Training iteration finished.")
            # Sort ascending by cost and drop the tail of the list.
            # NOTE(review): if num_kill_individuals were 0, the slice [-0:]
            # would cover the whole list and delete everything - confirm it
            # is always > 0.
            population.sort(key=lambda p: p.cost)
            del population[-self.num_kill_individuals:]
            # Keep track of the `num_best` lowest-cost individuals seen so far.
            best_individuals.extend(population)
            best_individuals.sort(key=lambda p: p.cost)
            del best_individuals[self.num_best:]
            # Put some of the overall best individuals in front of the population.
            population = best_individuals[:self.num_kill_individuals // 4] + population
            print("Current best setting, individual %s" % best_individuals[0].name, "cost:", best_individuals[0].cost)
            for p in self.hyper_params:
                print(" %s -> %s" % (p.description(), best_individuals[0].hyper_param_mapping[p]))
    except KeyboardInterrupt:
        print("KeyboardInterrupt, canceled search.")
        canceled = True
    # Printed in any case, also after the search was canceled.
    print("Best %i settings:" % len(best_individuals))
    for individual in best_individuals:
        print("Individual %s" % individual.name, "cost:", individual.cost)
        for p in self.hyper_params:
            print(" %s -> %s" % (p.description(), individual.hyper_param_mapping[p]))