Пример #1
0
 def search(self):
     """Search a sample.

     Entries waiting in ``need_reignite_queue`` are served first. Otherwise a
     description is sampled from the search space; its backbone code is
     mutated when a pareto record exists or when
     ``config.retain_original_code`` is falsy.

     :return: ``worker_id`` / ``encoded_desc`` mapping, or the result of
         ``self._reignite`` when reigniting is configured.
     """
     if self.need_reignite_queue:
         # Copy code and weights file from the stored record of this worker.
         worker_id, desc = self.need_reignite_queue.popitem()
         source_record = ReportServer().get_record(self.step_name, worker_id)
         desc['network.backbone.code'] = source_record.desc['code']
         desc['network.backbone.weight_file'] = source_record.weights_file
         self.finished_reignite_queue[worker_id] = desc
         message = "Finished reignited models, work_id:{} desc:{}".format(
             worker_id, desc)
         logging.info(message)
         return {'worker_id': worker_id, 'encoded_desc': desc}

     front = ReportServer().get_pareto_front_records(choice='normal')
     best = front[0] if front else None
     desc = self.search_space.sample()
     sampled_code = desc.get('network.backbone.code')
     if best:
         # Start the new sample from the weights of the first pareto record.
         desc['network.backbone.weight_file'] = best.weights_file
     should_mutate = best or not self.config.retain_original_code
     if should_mutate:
         desc['network.backbone.code'] = self._mutate_serialnet(sampled_code)
     self.sample_count += 1
     if self.config.reignite and self.config.reignite_desc:
         return self._reignite(desc)
     logging.info("desc:{}".format(desc))
     return {'worker_id': self.sample_count, 'encoded_desc': desc}
Пример #2
0
def shutdown_cluster():
    """Shutdown all distributed cluster.

    Returns immediately unless ``General._parallel`` is set. Otherwise the
    ReportServer is stopped first, then the cluster at the address returned by
    ``load_master_ip`` is shut down. Any exception is logged together with
    its traceback and is not re-raised.
    """
    if not General._parallel:
        # No master is running.
        return
    try:
        logging.info("Try to shutdown cluster.")

        # The report server goes down before the master.
        from zeus.report import ReportServer
        ReportServer.stop()

        from zeus.trainer.utils import load_master_ip
        from distributed import Client
        master_ip, master_port = load_master_ip()
        if master_ip is None or master_port is None:
            logging.info("Stand-alone mode, no need to shut down the cluster.")
            return
        master_address = "{}:{}".format(master_ip, master_port)
        client = Client(master_address)
        logging.info("Cluster will be shut down.")
        client.shutdown()
        client.close()
        del client
        # Fixed pause after the shutdown request before reporting success.
        time.sleep(15)
        logging.info("Cluster is shut down.")
    except Exception as err:
        logging.error("Pipeline's cluster shutdown error: {}".format(str(err)))
        logging.error(traceback.format_exc())
Пример #3
0
 def _train_single_model(self,
                         model_desc=None,
                         model_id=None,
                         weights_file=None):
     """Create a trainer for one model, publish its record and train it.

     :param model_desc: model description; when None the trainer is built
         with its own defaults and the record is derived from the trainer.
     :param model_id: worker id used for the trainer and the record.
     :param weights_file: passed to the trainer as ``pretrained_model_file``.
     """
     trainer_cls = ClassFactory.get_cls('trainer')
     step_name = self.task.step_name
     if model_desc is None:
         trainer = trainer_cls(None, 0)
         record = ReportRecord(trainer.step_name,
                               trainer.worker_id,
                               desc=trainer.model_desc)
     else:
         record_fields = {
             'worker_id': model_id,
             'desc': model_desc,
             'step_name': step_name,
         }
         record = ReportRecord().load_dict(record_fields)
         logging.debug("Broadcast Record=%s", str(record))
         trainer = trainer_cls(model_desc=model_desc,
                               id=model_id,
                               pretrained_model_file=weights_file)
     ReportClient.broadcast(record)
     ReportServer.add_watched_var(trainer.step_name, trainer.worker_id)
     # Resume flags are only set for the torch backend.
     if vega.is_torch_backend() and General._resume:
         trainer.load_checkpoint = True
         trainer._resume_training = True
     run_training = (self._do_distributed_fully_train
                     if self._distributed_training
                     else self._do_single_fully_train)
     run_training(trainer)
Пример #4
0
 def _simulate_tiny_pipeline(self, cfg_tiny):
     """Simulate tiny pipeline by using one sample one epoch.

     Every ``SearchPipeStep`` in ``PipelineConfig.steps`` is configured for a
     single non-distributed epoch and a single trial. Only the step at index 0
     is actually executed; the runtime of its last record is stored in
     ``self.epoch_time`` and the local base path is removed afterwards. For
     each search step, ``max_samples`` of its search algorithm is copied into
     ``self.params_dict`` and the step's ``.generator`` file is deleted.

     :param cfg_tiny: mapping from step name to step configuration.
     """
     # Local imports keep this block self-contained.
     import logging
     import shutil

     report = ReportServer()
     for i, step_name in enumerate(PipelineConfig.steps):
         step_cfg = cfg_tiny.get(step_name)
         if step_cfg.pipe_step.type != 'SearchPipeStep':
             continue
         step_cfg.trainer.distributed = False
         step_cfg.trainer.epochs = 1
         self.restrict_config.trials[step_name] = 1
         General.step_name = step_name
         PipeStepConfig.from_dict(step_cfg)
         pipestep = PipeStep()
         if i == 0:
             pipestep.do()
             record = report.get_step_records(step_name)[-1]
             self.epoch_time = record.runtime
             _worker_path = TaskOps().local_base_path
             # Remove the tree through the standard library instead of
             # `rm -rf` in a shell: paths with spaces or shell
             # metacharacters are handled safely. Removal stays best-effort.
             if os.path.isdir(_worker_path):
                 shutil.rmtree(_worker_path, ignore_errors=True)
             elif os.path.exists(_worker_path):
                 try:
                     os.remove(_worker_path)
                 except OSError as err:
                     logging.warning("Failed to remove %s: %s",
                                     _worker_path, err)
         # Only SearchPipeStep entries reach this point (others `continue`
         # above), so no second type check is needed.
         self.params_dict[step_name][
             'max_samples'] = pipestep.generator.search_alg.max_samples
         _file = os.path.join(TaskOps().step_path, ".generator")
         try:
             os.remove(_file)
         except FileNotFoundError:
             # Nothing to clean up.
             pass
         except OSError as err:
             logging.warning("Failed to remove %s: %s", _file, err)
Пример #5
0
 def _dispatch_trainer(self, samples):
     """Create a trainer and evaluator per sample and submit them to the master.

     :param samples: iterable of ``(id, desc, hps)`` triples.
     """
     for worker_id, model_desc, hyper_params in samples:
         trainer_cls = ClassFactory.get_cls(ClassType.TRAINER)
         # Re-apply the saved user trainer config before each trainer is built.
         TrainerConfig.from_dict(self.user_trainer_config)
         trainer = trainer_cls(id=worker_id,
                               model_desc=model_desc,
                               hps=hyper_params)
         evaluator = self._get_evaluator(trainer)
         logging.info("submit trainer, id={}".format(worker_id))
         ReportServer.add_watched_var(General.step_name, trainer.worker_id)
         self.master.run(trainer, evaluator)
Пример #6
0
 def __init__(self):
     """Initialize generator, report server state, master and trainer config."""
     super().__init__()
     generator_missing = not hasattr(self, "generator")
     if generator_missing:
         # Try a restored generator first.
         self.generator = Generator.restore()
     if not self.generator:
         # Nothing usable was restored: start with a fresh generator.
         self.generator = Generator()
     ReportServer.restore()
     on_update = self.generator.update
     self.master = create_master(update_func=on_update)
     self.user_trainer_config = TrainerConfig().to_dict()
Пример #7
0
 def do(self):
     """Start to run fully train with horovod or local trainer.

     Loads the records of the current step, trains them through a newly
     created master, then writes out all step records and backs up the
     output path.
     """
     logger.info("TrainPipeStep started...")
     step_records = self._get_current_step_records()
     logger.debug("load pipestep records: {}".format(step_records))
     self.master = create_master()
     self._train_multi_models(step_records)
     # Wait for the master before the records are written out.
     self.master.join()
     report_server = ReportServer()
     report_server.output_step_all_records(step_name=self.task.step_name)
     self.master.close_client()
     ReportServer().backup_output_path()
Пример #8
0
 def close_client(self):
     """Close cluster client.

     Stops the ReportServer, clears the monitor-thread flag and closes and
     drops ``self.client`` when one is set.
     """
     ReportServer.stop()
     self._thread_runing = False
     # Give the monitor thread time to exit.
     time.sleep(1)
     if getattr(self, "client", None) is not None:
         self.client.close()
         del self.client
     # Give the cluster time to close.
     time.sleep(1)
Пример #9
0
    def search(self):
        """Search code of one model.

        A random encoding is taken from the 'nas' pareto front. The spatial
        path gets a channel mutation, and ``self.num_transform`` transform
        mutations are split randomly between the last element of the context
        path and the last element of the spatial path.

        :return: searched code of the model.
        """
        front_records = ReportServer().get_pareto_front_records(['nas'])
        pareto_front = [rec.desc['custom']['encoding'] for rec in front_records]
        model_str = random.choice(pareto_front)
        print('model_str', model_str)
        # Number of transforms given to the context path; the rest goes to
        # the spatial path.
        ratio = np.random.randint(low=0, high=self.num_transform + 1)
        print('ratio', ratio)
        context_path = model_str[0].split('_')
        spatial_path = model_str[1].split('_')
        print('context_path', context_path)
        print('spatial_path', spatial_path)
        spatial_path = self.mutate_channel(spatial_path)
        context_path[-1] = self.do_transform(
            context_path[-1], num_mutate=ratio)
        remaining = self.num_transform - ratio
        spatial_path[-1] = self.do_transform(
            spatial_path[-1], num_mutate=remaining)
        encoding = ['_'.join(path) for path in (context_path, spatial_path)]
        logging.info("Mutate from {} to {}".format(model_str, encoding))
        return encoding
Пример #10
0
    def search(self):
        """Search one mutated model.

        A code is picked at random from the pareto front of the 'random' and
        'mutate' records, and one randomly chosen gene (the character at
        ``index + 1``) is replaced by a different value below that gene's
        upper bound.

        :return: current number of samples, and the model
        """
        desc = deepcopy(self.search_space)
        search_desc = desc.custom
        records = ReportServer().get_pareto_front_records(['random', 'mutate'])
        codes = [record.desc['custom']['code'] for record in records]
        num_ops = len(search_desc.op_names)
        upper_bounds = [num_ops, 2, 2, num_ops, num_ops, 5, 5, num_ops, num_ops,
                        8, 8, num_ops, num_ops, 4, 4, 5, 5, 6, 6]
        code_to_mutate = random.choice(codes)
        index = random.randrange(len(upper_bounds))
        choices = list(range(upper_bounds[index]))
        # Genes are read as single base-36 digits; drop the current value so
        # the mutation always changes the gene.
        choices.pop(int(code_to_mutate[index + 1], 36))
        choice = random.choice(choices)
        # Write the gene back as ONE base-36 digit. `str(choice)` would emit
        # two characters for values >= 10 and shift every following gene.
        # NOTE(review): lowercase digits are used; `int(x, 36)` accepts both
        # cases - confirm the codec has no case requirement.
        base36_digits = '0123456789abcdefghijklmnopqrstuvwxyz'
        code_mutated = (code_to_mutate[:index + 1] + base36_digits[choice]
                        + code_to_mutate[index + 2:])
        search_desc['code'] = code_mutated
        search_desc['method'] = "mutate"
        logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
        search_desc = self.codec.decode(search_desc)
        self.sample_count += 1
        desc['custom'] = search_desc
        return self.sample_count, desc
Пример #11
0
    def search(self):
        """Search one NetworkDesc from search space.

        The first ``random_models`` calls return random samples. Later calls
        take encodings from the pareto front and apply either mutation or
        crossover, chosen at random.

        :return: search id, network desc
        :rtype: int, NetworkDesc
        """
        still_random = self.random_count < self.random_models
        if still_random:
            self.random_count += 1
            sampled = self._random_sample()
            return self.random_count, sampled

        front = ReportServer().get_pareto_front_records(
            self.step_name, self.num_individual)
        codes = []
        for rec in front:
            codes.append(rec.desc.get('backbone').get('encoding'))
        logging.info("codes=%s", codes)
        if len(codes) >= 2:
            parent_a, parent_b = random.sample(codes, 2)
        else:
            # A single candidate acts as both parents.
            parent_a = parent_b = codes[0]
        use_mutation = random.randint(0, 1) == 0
        if use_mutation:
            child = self.mutatation(parent_a)
        else:
            child, _ = self.crossover(parent_a, parent_b)
        self.ea_count += 1
        # ea_epoch advances once every num_individual EA samples.
        if self.ea_count % self.num_individual == 0:
            self.ea_epoch += 1
        decoded = self.codec.decode(child)
        return self.random_count + self.ea_count, decoded
Пример #12
0
 def _get_current_step_records(self):
     """Collect the records for the current step that have a weights file.

     Records come from the configured ``models_folder``, else from the
     output folder of the previous pipeline step, else (first step, no
     folder configured) from ``_load_single_model_records``.

     :return: list of records with ``step_name`` set to the current step.
     """
     step_name = General.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index < 1 and not models_folder:
         records = self._load_single_model_records()
     else:
         folder = models_folder or FileOps.join_path(
             TaskOps().local_output_path,
             PipelineConfig.steps[cur_index - 1])
         folder = folder.replace("{local_base_path}",
                                 TaskOps().local_base_path)
         records = ReportServer().load_records_from_model_folder(folder)

     final_records = []
     for rec in records:
         if rec.weights_file:
             rec.step_name = General.step_name
             final_records.append(rec)
         else:
             # Records without a weights file are reported and skipped.
             logger.error("Model file is not existed, id={}".format(
                 rec.worker_id))
     logging.debug("Records: {}".format(final_records))
     return final_records
Пример #13
0
    def search(self):
        """Search one mutated model.

        A code from the pareto front of the 'random' and 'mutate' records is
        mutated until ``self.num_mutate`` effective changes have been made.

        :return: current number of samples, and the model
        """
        desc = deepcopy(self.search_space)
        search_desc = desc.custom
        # TODO: merge sr ea in one pipe step.
        front = ReportServer().get_pareto_front_records(['random', 'mutate'])
        codes = [rec.desc['custom']['code'] for rec in front]
        code_to_mutate = random.choice(codes)
        code_mutated = code_to_mutate
        num_candidates = len(search_desc["candidates"])
        applied = 0
        while applied < self.num_mutate:
            candidate = self.mutate_once(code_mutated, num_candidates)
            # Only mutations that actually change the code are counted.
            if candidate != code_mutated:
                applied += 1
                code_mutated = candidate
        logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
        search_desc['code'] = code_mutated
        search_desc['method'] = "mutate"
        desc['custom'] = self.codec.decode(search_desc)
        self.sample_count += 1
        return {'worker_id': self.sample_count, 'encoded_desc': desc}
Пример #14
0
 def do(self):
     """Do the main task in this pipe step.

     Samples from the generator until it reports completion, dispatching a
     trainer for every non-empty sample, then outputs the pareto front.
     """
     logging.debug("SearchPipeStep started...")
     while True:
         if self.generator.is_completed:
             break
         samples = self.generator.sample()
         if not samples:
             # Nothing to dispatch yet; poll again shortly.
             time.sleep(0.2)
             continue
         self._dispatch_trainer(samples)
     self.master.join()
     front_values = ReportServer().pareto_front(General.step_name)
     logging.debug("Pareto_front values: %s", front_values)
     ReportServer().output_pareto_front(General.step_name)
     self.master.close_client()
     if General.clean_worker_dir:
         self._clean_checkpoint()
Пример #15
0
 def get_pareto_front(self):
     """Pick one code at random from the pareto front records.

     :return: the ``code`` entry of a randomly chosen pareto front record.
     """
     front = ReportServer().get_pareto_front_records()
     candidate_codes = [rec.desc['code'] for rec in front]
     return random.choice(candidate_codes)
Пример #16
0
    def update(self, step_name, worker_id):
        """Update search algorithm accord to the worker path.

        The worker's record is fed to the search algorithm and the generator
        is dumped. The watched variable is removed unless the search
        algorithm defines a falsy ``_remove_watched_var``.

        :param step_name: step name
        :param worker_id: current worker id
        :return:
        """
        record = ReportClient.get_record(step_name, worker_id)
        logging.debug("Get Record=%s", str(record))
        self.search_alg.update(record.serialize())
        self.dump()
        # A missing attribute counts as "remove".
        if getattr(self.search_alg, '_remove_watched_var', True):
            ReportServer.remove_watched_var(step_name, worker_id)
        logging.info("Update Success. step_name=%s, worker_id=%s, desc=%s",
                     step_name, worker_id, record.desc)
        best_values = ReportServer().print_best(step_name=General.step_name)
        logging.info("Best values: %s", best_values)
Пример #17
0
 def do(self):
     """Start to run benchmark evaluator.

     Each record of the current step is broadcast, registered as a watched
     variable and evaluated. Afterwards all step records are written out and
     the output path is backed up. Returns early when there are no records.
     """
     logger.info("BenchmarkPipeStep started...")
     records = self._get_current_step_records()
     if not records:
         logger.error("There is no model to evaluate.")
         return
     self.master = create_master()
     for rec in records:
         # Broadcast a fresh record carrying only id, desc and step name.
         announcement = ReportRecord(worker_id=rec.worker_id,
                                     desc=rec.desc,
                                     step_name=rec.step_name)
         ReportClient().broadcast(announcement)
         ReportServer().add_watched_var(rec.step_name, rec.worker_id)
         self._evaluate_single_model(rec)
     self.master.join()
     report_server = ReportServer()
     report_server.output_step_all_records(step_name=General.step_name)
     self.master.close_client()
     ReportServer().backup_output_path()
Пример #18
0
 def search(self):
     """Search a sample.

     A description is sampled from the search space; when a pareto record
     exists, the neck code is replaced by a mutation of the first pareto
     record's neck code.

     :return: ``worker_id`` / ``encoded_desc`` mapping.
     """
     front = ReportServer().get_pareto_front_records(choice='normal')
     best = front[0] if front else None
     desc = self.search_space.sample()
     if best:
         parent_code = best.desc.get("neck").get('code')
         desc['network.neck.code'] = self._mutate_parallelnet(parent_code)
     self.sample_count += 1
     logging.info("desc:{}".format(desc))
     return {'worker_id': self.sample_count, 'encoded_desc': desc}
Пример #19
0
    def run(self):
        """Execute the whole pipeline.

        Installs SIGINT/SIGTERM handlers that shut the cluster down and exit,
        then runs every step listed in ``PipelineConfig.steps`` in order,
        recording start time, end time and duration per step. Exceptions
        raised while running steps are logged and not re-raised. The cluster
        is shut down afterwards in every case, and the pipeline summary is
        shown only when no error occurred and at least one step was started.
        """

        def _shutdown_cluster(signum, frame):
            logging.info("Shutdown urgently.")
            shutdown_cluster()
            os._exit(0)

        steps_time = []
        error_occurred = False
        # Pre-bind the loop variable: with an empty step list it would
        # otherwise be undefined at the summary call below (NameError).
        step_name = None

        try:
            signal.signal(signal.SIGINT, _shutdown_cluster)
            signal.signal(signal.SIGTERM, _shutdown_cluster)
            _ = ReportServer()
            for step_name in PipelineConfig.steps:
                step_cfg = UserConfig().data.get(step_name)
                General.step_name = step_name
                PipeStepConfig.renew()
                PipeStepConfig.from_dict(step_cfg, skip_check=False)
                self._set_evaluator_config(step_cfg)
                logging.info("-" * 48)
                logging.info("  Step: {}".format(step_name))
                logging.info("-" * 48)
                logger.debug("Pipe step config: {}".format(PipeStepConfig()))
                # The parallel flag follows the type of the current step.
                if PipeStepConfig.type == "SearchPipeStep":
                    General._parallel = General.parallel_search
                if PipeStepConfig.type == "TrainPipeStep":
                    General._parallel = General.parallel_fully_train

                start_time = datetime.datetime.now()
                PipeStep().do()
                end_time = datetime.datetime.now()
                steps_time.append([step_name, start_time, end_time, self._interval_time(start_time, end_time)])
        except Exception:
            logger.error("Failed to run pipeline.")
            logger.error(traceback.format_exc())
            error_occurred = True
        try:
            shutdown_cluster()
        except Exception:
            logger.error("Failed to shutdown dask cluster.")
            logger.error(traceback.format_exc())

        if error_occurred:
            return
        if step_name is None:
            # No step was started, so there is nothing to summarize.
            logger.warning("No pipeline step was executed.")
            return
        self._show_pipeline_info(steps_time, step_name)
Пример #20
0
    def search(self):
        """Search one NetworkDesc from search space.

        The first ``random_models`` calls return random samples. Later calls
        flatten the ``props`` of the pareto-front records into codes, apply
        mutation or crossover (chosen at random) and split the new code back
        into per-property entries.

        :return: ``worker_id`` / ``encoded_desc`` mapping, or None when no
            pareto-front records are available.
        """
        if self.sample_count < self.random_models:
            self.sample_count += 1
            return {'worker_id': self.sample_count,
                    'encoded_desc': self.search_space.sample()}

        front = ReportServer().get_pareto_front_records(
            self.step_name, self.num_individual)
        if not front:
            return None

        # Flatten every record's props into one code; remember segment sizes
        # so the new code can be split again below.
        segment_sizes = {}
        codes = []
        for rec in front:
            flat_code = []
            for prop_name, genes in rec.desc.get('props').items():
                segment_sizes[prop_name] = len(genes)
                flat_code.extend(genes)
            codes.append(flat_code)
        self.length = len(codes[0])
        logging.info("codes sum={}, code length={}".format(
            sum(codes[0]), self.length))

        if len(codes) >= 2:
            parent_a, parent_b = random.sample(codes, 2)
        else:
            # A single candidate acts as both parents.
            parent_a = parent_b = codes[0]
        use_mutation = random.randint(0, 1) == 0
        if use_mutation:
            child = self.mutatation(parent_a)
        else:
            child, _ = self.crossover(parent_a, parent_b)

        # Cut the flat code back into its per-property segments.
        desc = {}
        offset = 0
        for prop_name, size in segment_sizes.items():
            desc["network.props.{}".format(prop_name)] = child[offset:offset + size]
            offset += size
        self.sample_count += 1
        return {'worker_id': self.sample_count, 'encoded_desc': desc}
Пример #21
0
 def search(self):
     """Search code of a model.

     The first ``max_sample_random`` calls use random search. Later calls
     use mutation when 'nas' pareto-front records exist and fall back to
     random search otherwise. The raw encoding is stored under ``encoding``
     and the encoding with its first element decoded under ``config``.

     :return: current number of samples, and a copy of the search space
         whose ``custom`` entry holds the new encoding and config.
     """
     desc = deepcopy(self.search_space)
     # Work on the copy: writing into self.search_space.custom would leak
     # this sample's encoding/config into the shared search space.
     search_desc = desc.custom
     if self.sample_count < self.max_sample_random:
         encoding = self.random.search()
     else:
         records = ReportServer().get_pareto_front_records(['nas'])
         if not records:
             encoding = self.random.search()
             print('pareto_front_records is None, do random search')
         else:
             encoding = self.mutate.search()
     # Keep the undecoded encoding before the first element is replaced.
     search_desc['encoding'] = deepcopy(encoding)
     encoding[0] = self.codec.decode(encoding[0])
     search_desc['config'] = encoding
     self.sample_count += 1
     desc['custom'] = search_desc
     return self.sample_count, desc
Пример #22
0
 def _get_current_step_records(self):
     """Collect the records to train in the current step.

     Records come from the configured ``models_folder``, else from the
     output folder of the previous pipeline step, else (first step, no
     folder configured) a single empty record with worker id 0 is used.

     :return: list of records with ``step_name`` set to the current step.
     """
     step_name = self.task.step_name
     models_folder = PipeStepConfig.pipe_step.get("models_folder")
     cur_index = PipelineConfig.steps.index(step_name)
     if cur_index < 1 and not models_folder:
         records = [ReportRecord(step_name, 0)]
     else:
         folder = models_folder or FileOps.join_path(
             TaskOps().local_output_path,
             PipelineConfig.steps[cur_index - 1])
         folder = folder.replace("{local_base_path}",
                                 TaskOps().local_base_path)
         records = ReportServer().load_records_from_model_folder(folder)
     logging.debug("Records: {}".format(records))
     # Loaded records are re-labelled with the current step.
     for rec in records:
         rec.step_name = step_name
     return records
Пример #23
0
 def __init__(self, update_func=None):
     """Init master attrs, setup and start dask distributed cluster and local multiprocess pool.

     The process exits with status 0 when the dask environment fails to
     start or when this node is not the master.

     :param update_func: callable stored as ``self.update_func``.
     """
     self.cfg = General()
     self.task_count = 0
     self.eval_count = General.worker.eval_count
     self.dask_env = DaskEnv(
         General.env,
         self.__master_path__,
         General.devices_per_trainer,
         TaskOps().temp_path,
     )
     started = self.dask_env.start()
     if not (started and self.dask_env.is_master):
         sys.exit(0)
     self._start_cluster()
     self.t_queue = Queue()
     self.update_func = update_func
     self.evaluator_list = {}
     # The flag and the lock are set before the monitor thread is started.
     self._thread_runing = True
     self._lock = Lock()
     self._thread = self._run_monitor_thread()
     ReportServer().renew()
Пример #24
0
    def __init__(self, search_space=None, **kwargs):
        """Init DnetNas.

        Reads the mutation policy and sample range from ``self.config``,
        creates the pareto front, and derives ``self.base_block`` from the
        first record found in the configured models folder.

        :param search_space: search space passed on to the parent class.
        :param kwargs: extra keyword arguments passed on to the parent class.
        """
        super(DnetNas, self).__init__(search_space, **kwargs)
        # ea or random
        self.num_mutate = self.config.policy.num_mutate
        self.random_ratio = self.config.policy.random_ratio
        self.max_sample = self.config.range.max_sample
        self.min_sample = self.config.range.min_sample
        self.sample_count = 0
        logging.info("inited DnetNas")
        self.pareto_front = ParetoFront(self.config.pareto.object_count,
                                        self.config.pareto.max_object_ids)
        self._best_desc_file = 'nas_model_desc.json'

        # Fill the {local_base_path} placeholder of the configured folder.
        block_nas_folder = ModelConfig.models_folder.format(
            local_base_path=self.local_base_path)
        logging.info(f'folder: {block_nas_folder}')
        base_reports = ReportServer().load_records_from_model_folder(
            block_nas_folder)
        logging.info(f'base_reports: {base_reports}')
        # The base block is the part of the first record's backbone encoding
        # before the first underscore.
        self.base_block = base_reports[0].desc['backbone']['encoding'].split(
            '_')[0]