示例#1
0
 def _train_single_model(self,
                         model_desc=None,
                         model_id=None,
                         weights_file=None):
     """Build a trainer for one model, broadcast its report record and train it."""
     trainer_cls = ClassFactory.get_cls('trainer')
     step_name = self.task.step_name
     if model_desc is None:
         # No description given: fall back to a default trainer and derive
         # the record from the trainer's own attributes.
         trainer = trainer_cls(None, 0)
         record = ReportRecord(trainer.step_name,
                               trainer.worker_id,
                               desc=trainer.model_desc)
     else:
         info = {"worker_id": model_id,
                 "desc": model_desc,
                 "step_name": step_name}
         record = ReportRecord().load_dict(info)
         logging.debug("Broadcast Record=%s", str(record))
         trainer = trainer_cls(model_desc=model_desc,
                               id=model_id,
                               pretrained_model_file=weights_file)
     ReportClient.broadcast(record)
     ReportServer.add_watched_var(trainer.step_name, trainer.worker_id)
     # Resume an interrupted training run when requested (torch backend only).
     if vega.is_torch_backend() and General._resume:
         trainer.load_checkpoint = True
         trainer._resume_training = True
     if self._distributed_training:
         self._do_distributed_fully_train(trainer)
     else:
         self._do_single_fully_train(trainer)
示例#2
0
 def __init__(self):
     """Set up the search space/algorithm pair, report and record template."""
     self.step_name = General.step_name
     self.search_space = SearchSpace()
     self.search_alg = SearchAlgorithm(self.search_space)
     self.report = Report()
     self.record = ReportRecord()
     self.record.step_name = self.step_name
     # Copy objective keys onto the record only when the algorithm config
     # actually declares them (a sentinel distinguishes "absent" from None).
     _missing = object()
     objective_keys = getattr(self.search_alg.config, 'objective_keys', _missing)
     if objective_keys is not _missing:
         self.record.objective_keys = objective_keys
     self.quota = QuotaCompare('restrict')
示例#3
0
 def __init__(self):
     """Set up search space, search algorithm, record template, quota and affinity."""
     self.step_name = General.step_name
     self.search_space = SearchSpace()
     self.search_alg = SearchAlgorithm(self.search_space)
     self.record = ReportRecord()
     self.record.step_name = self.step_name
     # Copy objective keys onto the record only when the algorithm config
     # actually declares them (a sentinel distinguishes "absent" from None).
     _missing = object()
     objective_keys = getattr(self.search_alg.config, 'objective_keys', _missing)
     if objective_keys is not _missing:
         self.record.objective_keys = objective_keys
     self.quota = QuotaCompare('restrict')
     # Affinity is optional: only build it when a type is configured.
     affinity_cfg = General.quota.affinity
     if affinity_cfg.type is None:
         self.affinity = None
     else:
         self.affinity = QuotaAffinity(affinity_cfg)
示例#4
0
 def do(self):
     """Start to run benchmark evaluator over all records of the current step."""
     logger.info("BenchmarkPipeStep started...")
     records = self._get_current_step_records()
     if not records:
         logger.error("There is no model to evaluate.")
         return
     self.master = create_master()
     for rec in records:
         init_record = ReportRecord(worker_id=rec.worker_id,
                                    desc=rec.desc,
                                    step_name=rec.step_name)
         Report().broadcast(init_record)
         self._evaluate_single_model(rec)
     self.master.join()
     # Once every worker has finished, refresh each record in the report.
     for rec in records:
         Report().update_report({"step_name": rec.step_name,
                                 "worker_id": rec.worker_id})
     Report().output_step_all_records(step_name=General.step_name,
                                      weights_file=False,
                                      performance=True)
     self.master.close_client()
     Report().backup_output_path()
示例#5
0
 def _evaluate_single_model(self, record):
     """Submit the configured evaluators (GPU and/or Davinci-mobile) for one record.

     Errors are logged and swallowed so that one failing model does not
     abort the whole benchmark step.

     :param record: report record carrying step_name, worker_id, desc and
         weights_file of the model to evaluate.
     """
     # Build worker_info before the try block so the except handler can
     # always reference it; previously a failure raised before its
     # assignment would cause a NameError inside the handler.
     worker_info = {
         "step_name": record.step_name,
         "worker_id": record.worker_id
     }
     try:
         _record = dict(worker_id=record.worker_id,
                        desc=record.desc,
                        step_name=record.step_name)
         _init_record = ReportRecord().load_dict(_record)
         Report().broadcast(_init_record)
         if EvaluatorConfig.gpu_evaluator_enable:
             cls_evaluator = ClassFactory.get_cls(ClassType.GPU_EVALUATOR,
                                                  "GpuEvaluator")
             evaluator = cls_evaluator(worker_info=worker_info,
                                       model_desc=record.desc,
                                       weights_file=record.weights_file)
             self.master.run(evaluator)
         if EvaluatorConfig.davinci_mobile_evaluator_enable:
             cls_evaluator = ClassFactory.get_cls(
                 ClassType.DAVINCI_MOBILE_EVALUATOR,
                 "DavinciMobileEvaluator")
             evaluator = cls_evaluator(worker_info=worker_info,
                                       model_desc=record.desc,
                                       weights_file=record.weights_file)
             self.master.run(evaluator)
     except Exception:
         # Best-effort: log full traceback and continue with other models.
         logger.error(
             "Failed to evaluate model, worker info={}".format(worker_info))
         logger.error(traceback.format_exc())
         return
示例#6
0
 def _init_report(self):
     """Create the initial report record for this trainer and broadcast it."""
     payload = {
         "worker_id": self.trainer.worker_id,
         "desc": self.cfg.model_desc,
         "step_name": self.trainer.step_name,
         "weights_file": self.best_model_file,
     }
     record = ReportRecord().load_dict(payload)
     logging.debug("Broadcast Record=%s", str(record))
     ReportClient.broadcast(record)
示例#7
0
 def __init__(self, search_space=None, **kwargs):
     """Init SearchAlgorithm with an optional search space and config overrides."""
     super(SearchAlgorithm, self).__init__()
     # Keyword overrides are applied to the local-scope config only.
     if self.config and kwargs:
         self.config.from_dict(kwargs)
     self.search_space = search_space
     # A codec is only built when the config declares one.
     self.codec = (Codec(search_space, type=self.config.codec)
                   if hasattr(self.config, 'codec') else None)
     logging.debug("Config=%s", self.config)
     self.record = ReportRecord()
     self.record.step_name = self.step_name
     self._get_search_space_list()
示例#8
0
 def do(self):
     """Start to run benchmark evaluator for every record of the current step."""
     logger.info("BenchmarkPipeStep started...")
     records = self._get_current_step_records()
     if not records:
         logger.error("There is no model to evaluate.")
         return
     self.master = create_master()
     for rec in records:
         init_record = ReportRecord(worker_id=rec.worker_id,
                                    desc=rec.desc,
                                    step_name=rec.step_name)
         ReportClient().broadcast(init_record)
         ReportServer().add_watched_var(rec.step_name, rec.worker_id)
         self._evaluate_single_model(rec)
     self.master.join()
     # Persist all step records once every worker has finished.
     ReportServer().output_step_all_records(step_name=General.step_name)
     self.master.close_client()
     ReportServer().backup_output_path()
示例#9
0
 def _get_current_step_records(self):
     """Collect report records for this step, loading prior-step models when present."""
     step_name = self.task.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index < 1 and not models_folder:
         # First pipeline step with no folder configured: start from a blank record.
         records = [ReportRecord(step_name, 0)]
     else:
         if not models_folder:
             # Default to the previous step's output directory.
             models_folder = FileOps.join_path(
                 TaskOps().local_output_path,
                 PipelineConfig.steps[cur_index - 1])
         models_folder = models_folder.replace("{local_base_path}",
                                               TaskOps().local_base_path)
         records = ReportServer().load_records_from_model_folder(
             models_folder)
     logging.debug("Records: {}".format(records))
     # Stamp every record with the current step name.
     for record in records:
         record.step_name = step_name
     return records
示例#10
0
 def _train_single_model(self, model_desc=None, model_id=None):
     """Build a trainer for the given model description and run full training."""
     trainer_cls = ClassFactory.get_cls('trainer', "Trainer")
     step_name = self.task.step_name
     if model_desc is None:
         # No description given: fall back to a default trainer.
         trainer = trainer_cls(None, 0)
     else:
         info = {"worker_id": model_id,
                 "desc": model_desc,
                 "step_name": step_name}
         record = ReportRecord().load_dict(info)
         logging.debug("Broadcast Record=%s", str(record))
         Report().broadcast(record)
         trainer = trainer_cls(model_desc=model_desc, id=model_id)
     # Resume an interrupted training run when requested (torch backend only).
     if vega.is_torch_backend() and General._resume:
         trainer.load_checkpoint = True
         trainer._resume_training = True
     if trainer_cls.config.distributed:
         self._do_distributed_fully_train()
     else:
         self._do_single_fully_train(trainer)
示例#11
0
 def _load_single_model_records(self):
     """Load the single configured model as report records.

     :return: a one-element list containing a ReportRecord built from the
         configured model description and pretrained weights file, or an
         empty list when either is missing or invalid.
     """
     model_desc = PipeStepConfig.model.model_desc
     model_desc_file = PipeStepConfig.model.model_desc_file
     if model_desc_file:
         # The desc file path may carry a placeholder for the task base path.
         model_desc_file = model_desc_file.replace(
             "{local_base_path}",
             TaskOps().local_base_path)
         model_desc = Config(model_desc_file)
     if not model_desc:
         logger.error("Model desc or Model desc file is None.")
         return []
     model_file = PipeStepConfig.model.pretrained_model_file
     if not model_file:
         logger.error("Model file is None.")
         return []
     if not os.path.exists(model_file):
         # Fixed grammar of the original message ("Model file is not existed.").
         logger.error("Model file does not exist.")
         return []
     return [
         ReportRecord().load_dict(
             dict(worker_id="1", desc=model_desc, weights_file=model_file))
     ]
示例#12
0
class Generator(object):
    """Convert search space and search algorithm, sample a new model."""

    def __init__(self):
        """Build the search space/algorithm pair and an empty record template."""
        self.step_name = General.step_name
        self.search_space = SearchSpace()
        self.search_alg = SearchAlgorithm(self.search_space)
        self.report = Report()
        self.record = ReportRecord()
        self.record.step_name = self.step_name
        # Copy objective keys onto the record only when the algorithm config declares them.
        if hasattr(self.search_alg.config, 'objective_keys'):
            self.record.objective_keys = self.search_alg.config.objective_keys
        self.quota = QuotaCompare('restrict')

    @property
    def is_completed(self):
        """Define a property to determine search algorithm is completed."""
        return self.search_alg.is_completed or self.quota.is_halted()

    def sample(self):
        """Sample a work id and model from search algorithm.

        :return: list of (worker_id, model_desc) tuples, or None when the
            search algorithm produced no samples.
        """
        res = self.search_alg.search()
        if not res:
            return None
        # Normalize a single sample into a one-element list.
        if not isinstance(res, list):
            res = [res]
        if len(res) == 0:
            return None
        out = []
        for sample in res:
            # A sample is either a dict carrying a "desc" key or an indexable
            # pair whose second item is the description.
            desc = sample.get("desc") if isinstance(sample, dict) else sample[1]
            desc = self._decode_hps(desc)
            model_desc = deepcopy(desc)
            if "modules" in desc:
                PipeStepConfig.model.model_desc = deepcopy(desc)
            elif "network" in desc:
                # Merge the sampled network section into the configured model desc.
                origin_desc = PipeStepConfig.model.model_desc
                desc = update_dict(desc["network"], origin_desc)
                PipeStepConfig.model.model_desc = deepcopy(desc)
            # Drop samples rejected by the quota restrictions.
            if self.quota.is_filtered(desc):
                continue
            record = self.record.from_sample(sample, desc)
            Report().broadcast(record)
            out.append((record.worker_id, model_desc))
        return out

    def update(self, step_name, worker_id):
        """Update search algorithm accord to the worker path.

        :param step_name: step name
        :param worker_id: current worker id
        :return:
        """
        report = Report()
        record = report.receive(step_name, worker_id)
        logging.debug("Get Record=%s", str(record))
        # Feed the finished worker's result back into the search algorithm.
        self.search_alg.update(record.serialize())
        report.dump_report(record.step_name, record)
        # Persist the generator itself so the search can be restored later.
        self.dump()
        logging.info("Update Success. step_name=%s, worker_id=%s", step_name, worker_id)
        logging.info("Best values: %s", Report().print_best(step_name=General.step_name))

    @staticmethod
    def _decode_hps(hps):
        """Decode hps: `trainer.optim.lr : 0.1` to dict format.

        And convert to `zeus.common.config import Config` object
        This Config will be override in Trainer or Datasets class
        The override priority is: input hps > user configuration >  default configuration
        :param hps: hyper params
        :return: dict
        """
        hps_dict = {}
        if hps is None:
            return None
        # Tuples are passed through untouched.
        if isinstance(hps, tuple):
            return hps
        for hp_name, value in hps.items():
            hp_dict = {}
            # Build the nested dict from the innermost key outwards, e.g.
            # "trainer.optim.lr" -> {"trainer": {"optim": {"lr": value}}}.
            for key in list(reversed(hp_name.split('.'))):
                if hp_dict:
                    hp_dict = {key: hp_dict}
                else:
                    hp_dict = {key: value}
            # update cfg with hps
            hps_dict = update_dict(hps_dict, hp_dict, [])
        return Config(hps_dict)

    def dump(self):
        """Dump generator to file."""
        step_path = TaskOps().step_path
        _file = os.path.join(step_path, ".generator")
        with open(_file, "wb") as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def restore(cls):
        """Restore generator from file."""
        step_path = TaskOps().step_path
        _file = os.path.join(step_path, ".generator")
        # NOTE(review): pickle.load is only safe because this file is written
        # by dump() above — never point it at an untrusted file.
        if os.path.exists(_file):
            with open(_file, "rb") as f:
                return pickle.load(f)
        else:
            return None