def do(self): """Start to run benchmark evaluator.""" logger.info("BenchmarkPipeStep started...") records = self._get_current_step_records() if not records: logger.error("There is no model to evaluate.") return self.master = create_master() for record in records: _record = ReportRecord(worker_id=record.worker_id, desc=record.desc, step_name=record.step_name) Report().broadcast(_record) self._evaluate_single_model(record) self.master.join() for record in records: Report().update_report({ "step_name": record.step_name, "worker_id": record.worker_id }) Report().output_step_all_records(step_name=General.step_name, weights_file=False, performance=True) self.master.close_client() Report().backup_output_path()
def __init__(self): """Initialize.""" super().__init__() self.generator = Generator.restore() if not self.generator: self.generator = Generator() Report.restore() self.master = create_master(update_func=self.generator.update)
def _broadcast(self, pfms):
    """Broadcast performance metrics to the current record."""
    record = Report().receive(self.step_name, self.worker_id)
    if record.performance:
        record.performance.update(pfms)
    else:
        record.performance = pfms
    Report().broadcast(record)
    logging.info("valid record: {}".format(record))
def do(self): """Do the main task in this pipe step.""" logging.debug("NasPipeStep started...") while not self.generator.is_completed: res = self.generator.sample() if res: self._dispatch_trainer(res) else: time.sleep(0.2) self.master.join() logging.debug("Pareto_front values: %s", Report().pareto_front(General.step_name)) Report().output_pareto_front(General.step_name) self.master.close_client()
def search(self): """Search one NetworkDesc from search space. :return: search id, network desc :rtype: int, NetworkDesc """ if self.random_count < self.random_models: desc = self._random_sample() return self.random_count, desc records = Report().get_pareto_front_records(self.step_name, self.num_individual) codes = [ record.desc.get('nbit_w_list') + record.desc.get('nbit_a_list') for record in records ] logging.info("codes=%s", codes) if len(codes) < 2: encoding1, encoding2 = codes[0], codes[0] else: encoding1, encoding2 = random.sample(codes, 2) choice = random.randint(0, 1) # mutate if choice == 0: encoding_new = self.mutatation(encoding1) # crossover else: encoding_new, _ = self.crossover(encoding1, encoding2) self.ea_count += 1 if self.ea_count % self.num_individual == 0: self.ea_epoch += 1 desc = self.codec.decode(encoding_new) return self.random_count + self.ea_count, desc
def do(self): """Start to run fully train with horovod or local trainer.""" logger.info("FullyTrainPipeStep started...") cls_trainer = ClassFactory.get_cls('trainer', "Trainer") if cls_trainer.config.distributed: self._do_distributed_fully_train() else: records = self._get_current_step_records() logger.debug("load pipestep records: {}".format(records)) self.master = create_master(update_func=Report().update_report) self._train_multi_models(records) Report().output_step_all_records(step_name=self.task.step_name, weights_file=True, performance=True) self.master.close_client() Report().backup_output_path()
def search(self): """Search one mutated model. :return: current number of samples, and the model """ desc = deepcopy(self.search_space) search_desc = desc.custom records = Report().get_pareto_front_records(['random', 'mutate']) codes = [] for record in records: custom = record.desc['custom'] codes.append(custom['code']) num_ops = len(search_desc.op_names) upper_bounds = [ num_ops, 2, 2, num_ops, num_ops, 5, 5, num_ops, num_ops, 8, 8, num_ops, num_ops, 4, 4, 5, 5, 6, 6 ] code_to_mutate = random.choice(codes) index = random.randrange(len(upper_bounds)) choices = list(range(upper_bounds[index])) choices.pop(int(code_to_mutate[index + 1], 36)) choice = random.choice(choices) code_mutated = code_to_mutate[:index + 1] + str(choice) + code_to_mutate[index + 2:] search_desc['code'] = code_mutated search_desc['method'] = "mutate" logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated)) search_desc = self.codec.decode(search_desc) self.sample_count += 1 desc['custom'] = search_desc return self.sample_count, desc
def _evaluate_single_model(self, record):
    try:
        worker_info = {"step_name": record.step_name, "worker_id": record.worker_id}
        _record = dict(worker_id=record.worker_id, desc=record.desc,
                       step_name=record.step_name)
        _init_record = ReportRecord().load_dict(_record)
        Report().broadcast(_init_record)
        if EvaluatorConfig.gpu_evaluator_enable:
            cls_evaluator = ClassFactory.get_cls(ClassType.GPU_EVALUATOR, "GpuEvaluator")
            evaluator = cls_evaluator(worker_info=worker_info,
                                      model_desc=record.desc,
                                      weights_file=record.weights_file)
            self.master.run(evaluator)
        if EvaluatorConfig.davinci_mobile_evaluator_enable:
            cls_evaluator = ClassFactory.get_cls(
                ClassType.DAVINCI_MOBILE_EVALUATOR, "DavinciMobileEvaluator")
            evaluator = cls_evaluator(worker_info=worker_info,
                                      model_desc=record.desc,
                                      weights_file=record.weights_file)
            self.master.run(evaluator)
    except Exception:
        logger.error("Failed to evaluate model, worker info={}".format(worker_info))
        logger.error(traceback.format_exc())
def sample(self): """Sample a work id and model from search algorithm.""" res = self.search_alg.search() if not res: return None if not isinstance(res, list): res = [res] if len(res) == 0: return None out = [] for sample in res: desc = sample.get("desc") if isinstance(sample, dict) else sample[1] desc = self._decode_hps(desc) model_desc = deepcopy(desc) if "modules" in desc: PipeStepConfig.model.model_desc = deepcopy(desc) elif "network" in desc: origin_desc = PipeStepConfig.model.model_desc desc = update_dict(desc["network"], origin_desc) PipeStepConfig.model.model_desc = deepcopy(desc) if self.quota.is_filtered(desc): continue record = self.record.from_sample(sample, desc) Report().broadcast(record) out.append((record.worker_id, model_desc)) return out
def search(self): """Search one mutated model. :return: current number of samples, and the model """ desc = deepcopy(self.search_space) search_desc = desc.custom # TODO: merge sr ea in one pipe step. records = Report().get_pareto_front_records(['random', 'mutate']) codes = [] for record in records: codes.append(record.desc['custom']['code']) code_to_mutate = random.choice(codes) current_mutate, code_mutated = 0, code_to_mutate num_candidates = len(search_desc["candidates"]) while current_mutate < self.num_mutate: code_new = self.mutate_once(code_mutated, num_candidates) if code_new != code_mutated: current_mutate += 1 code_mutated = code_new logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated)) search_desc['code'] = code_mutated search_desc['method'] = "mutate" search_desc = self.codec.decode(search_desc) desc['custom'] = search_desc self.sample_count += 1 return dict(worker_id=self.sample_count, desc=desc)
def _get_current_step_records(self):
    step_name = General.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # records = Report().get_step_records(PipelineConfig.steps[cur_index - 1])
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)
    else:
        records = self._load_single_model_records()
    final_records = []
    for record in records:
        if not record.weights_file:
            logger.error("Model file does not exist, id={}".format(record.worker_id))
        else:
            record.step_name = General.step_name
            final_records.append(record)
    logging.debug("Records: {}".format(final_records))
    return final_records
def get_pareto_front(self):
    """Get the pareto front of trained candidates."""
    records = Report().get_pareto_front_records()
    codes = []
    for record in records:
        codes.append(record.desc['code'])
    code_to_mutate = random.choice(codes)
    return code_to_mutate
def _init_report(self):
    info = dict(worker_id=self.trainer.worker_id,
                desc=self.cfg.model_desc,
                step_name=self.trainer.step_name,
                weights_file=self.best_model_file)
    record = ReportRecord().load_dict(info)
    logging.debug("Broadcast Record=%s", str(record))
    Report().broadcast(record)
def __init__(self):
    """Initialize."""
    self.step_name = General.step_name
    self.search_space = SearchSpace()
    self.search_alg = SearchAlgorithm(self.search_space)
    self.report = Report()
    self.record = ReportRecord()
    self.record.step_name = self.step_name
    if hasattr(self.search_alg.config, 'objective_keys'):
        self.record.objective_keys = self.search_alg.config.objective_keys
    self.quota = QuotaCompare('restrict')
def _simulate_tiny_pipeline(self, cfg_tiny):
    """Simulate a tiny pipeline by using one sample and one epoch."""
    report = Report()
    for i, step_name in enumerate(PipelineConfig.steps):
        step_cfg = cfg_tiny.get(step_name)
        step_cfg.trainer.distributed = False
        step_cfg.trainer.epochs = 1
        self.restrict_config.trials[step_name] = 1
        General.step_name = step_name
        PipeStepConfig.from_json(step_cfg)
        pipestep = PipeStep()
        if i == 0:
            pipestep.do()
            record = report.get_step_records(step_name)[-1]
            self.epoch_time = record.runtime
            _worker_path = TaskOps().local_base_path
            if os.path.exists(_worker_path):
                os.system('rm -rf {}'.format(_worker_path))
        if step_cfg.pipe_step.type == 'NasPipeStep':
            self.params_dict[step_name]['max_samples'] = \
                pipestep.generator.search_alg.max_samples
        _file = os.path.join(TaskOps().step_path, ".generator")
        if os.path.exists(_file):
            os.system('rm {}'.format(_file))
def _init_evaluator(self):
    """Initialize evaluators for the trained model, if evaluation is enabled."""
    use_evaluator, cls_evaluator_set = self._use_evaluator()
    if not use_evaluator:
        return
    record = Report().receive(self.step_name, self.worker_id)
    model_desc = record.desc
    for cls in cls_evaluator_set:
        evaluator = cls(worker_info=self.worker_info, model_desc=model_desc)
        self.add_evaluator(evaluator)
def __init__(self, search_space=None, **kwargs):
    """Init SearchAlgorithm."""
    super(SearchAlgorithm, self).__init__()
    # modify config by kwargs, using local scope
    if self.config and kwargs:
        self.config.from_json(kwargs)
    self.search_space = search_space
    if hasattr(self.config, 'codec'):
        self.codec = Codec(search_space, type=self.config.codec)
    else:
        self.codec = None
    logging.debug("Config=%s", self.config)
    self.report = Report()
    self.record = ReportRecord()
    self.record.step_name = self.step_name
    self._get_search_space_list()
def _get_current_step_records(self):
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    records = []
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # records = Report().get_pareto_front_records(PipelineConfig.steps[cur_index - 1])
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)
    else:
        records = [ReportRecord(step_name, 0)]
    logging.debug("Records: {}".format(records))
    for record in records:
        record.step_name = step_name
    return records
def update(self, step_name, worker_id):
    """Update the search algorithm according to the worker path.

    :param step_name: step name
    :param worker_id: current worker id
    """
    report = Report()
    record = report.receive(step_name, worker_id)
    logging.debug("Get Record=%s", str(record))
    self.search_alg.update(record.serialize())
    report.dump_report(record.step_name, record)
    self.dump()
    logging.info("Update Success. step_name=%s, worker_id=%s", step_name, worker_id)
    logging.info("Best values: %s", Report().print_best(step_name=General.step_name))
def _train_single_model(self, model_desc=None, model_id=None):
    cls_trainer = ClassFactory.get_cls('trainer', "Trainer")
    step_name = self.task.step_name
    if model_desc is not None:
        sample = dict(worker_id=model_id, desc=model_desc, step_name=step_name)
        record = ReportRecord().load_dict(sample)
        logging.debug("Broadcast Record=%s", str(record))
        Report().broadcast(record)
        trainer = cls_trainer(model_desc=model_desc, id=model_id)
    else:
        trainer = cls_trainer(None, 0)
    # resume training
    if vega.is_torch_backend() and General._resume:
        trainer.load_checkpoint = True
        trainer._resume_training = True
    if cls_trainer.config.distributed:
        self._do_distributed_fully_train()
    else:
        self._do_single_fully_train(trainer)
def _broadcast(self, epoch, performance):
    record = Report().receive(self.trainer.step_name, self.trainer.worker_id)
    record.performance = performance
    Report().broadcast(record)
    logging.debug("report_callback record: {}".format(record))
def _save_best(self, desc):
    record = Report().receive(self.step_name, self.sample_count + 1)
    record.performance = {"accuracy": 100}
    record.desc = desc
    Report().broadcast(record)
def after_train(self, logs=None):
    """Close the report connection after training."""
    self._broadcast(self.epoch)
    Report().close(self.trainer.step_name, self.trainer.worker_id)
def _broadcast(self, epoch=None):
    record = Report().receive(self.trainer.step_name, self.trainer.worker_id)
    if self.trainer.config.report_on_epoch:
        record.epoch = self.trainer.epochs
    # todo: remove in FinedGrainedSpace
    if self.trainer.config.codec:
        record.desc = self.trainer.config.codec
    if not record.desc:
        record.desc = self.trainer.model_desc
    record.performance = self.trainer.performance
    record.objectives = self.trainer.valid_metrics.objectives
    if record.performance is not None:
        for key in record.performance:
            if key not in record.objectives:
                # Default direction: minimize model cost, maximize other metrics.
                if key in ('flops', 'params'):
                    record.objectives.update({key: 'MIN'})
                else:
                    record.objectives.update({key: 'MAX'})
    record.model_path = self.trainer.model_path
    record.checkpoint_path = self.trainer.checkpoint_file
    record.weights_file = self.trainer.weights_file
    if self.trainer.runtime is not None:
        record.runtime = self.trainer.runtime
    Report().broadcast(record)
    logging.debug("report_callback record: {}".format(record))