def copy_pareto_output(self, step_name=None, worker_ids=None):
    """Copy files related to pareto from worker to output.

    For every worker id a sub-directory named after the id is created under
    the step's local output directory, and ``copy_search_file`` is called to
    copy from the worker's directory into it.

    :param step_name: name of the pipeline step; when empty or ``None``
        nothing is done.
    :param worker_ids: iterable of worker ids to process; ``None`` is treated
        as an empty collection.
    """
    # Guard first: joining a path with ``None`` raises TypeError, so the
    # step name has to be validated before any path is built.
    if not step_name:
        return
    taskops = TaskOps()
    local_output_path = os.path.join(taskops.local_output_path, step_name)
    if not os.path.exists(local_output_path):
        return
    for worker_id in worker_ids or ():
        des_dir = os.path.join(local_output_path, str(worker_id))
        FileOps.make_dir(des_dir)
        local_worker_path = taskops.get_worker_subpath(
            step_name, str(worker_id))
        src_dir = FileOps.join_path(taskops.local_base_path,
                                    local_worker_path)
        copy_search_file(src_dir, des_dir)
def csv_to_records(self, csv_file_path, step_name=None, record_name='best'):
    """Load report records from a csv file.

    The file read is ``<directory>/<record_name>.csv``. The directory is the
    local output directory of ``step_name`` when a step name is given,
    otherwise ``csv_file_path``.

    :param csv_file_path: directory that contains the csv file; only used
        when ``step_name`` is empty.
    :param step_name: pipeline step whose output directory holds the csv.
    :param record_name: base name of the csv file (without extension).
    :return: list of objects returned by ``ReportRecord().load_dict`` (one per
        csv row); an empty list when neither location is given or the csv
        file does not exist.
    :rtype: list
    """
    if not csv_file_path and not step_name:
        return []
    local_output_path = csv_file_path if csv_file_path else ''
    # NOTE(review): the previous condition
    # ``(not os.path.exists(p) or p) and step_name`` was equivalent to
    # ``bool(step_name)`` for every path, so a given step name always wins
    # over csv_file_path. Behaviour is kept; confirm this precedence is
    # intended.
    if step_name:
        local_output_path = os.path.join(TaskOps().local_output_path,
                                         step_name)
    csv_file_path = os.path.join(local_output_path,
                                 "{}.csv".format(record_name))
    logging.info("csv_file_path: {}".format(csv_file_path))
    if not os.path.isfile(csv_file_path):
        return []
    # Parse the file once and reuse the frame for both header and rows.
    data = pd.read_csv(csv_file_path)
    csv_headr = data.columns.values
    records = [
        ReportRecord().load_dict(dict(zip(csv_headr, item)))
        for item in data.values
    ]
    logging.info("csv_to_records: {}".format(records))
    return records
def _init_model(self):
    """Build the model description used to fully train a model.

    A ``Config`` is created from ``self.cfg.config_template`` and its
    ``total_epochs`` is set from ``self.cfg.epoch``. The description comes
    either from the first entry of the csv named by ``cfg.model_desc_file``
    after sorting by ``'mAP'``, or directly from ``cfg.model_desc``.

    :return: result of ``update_config(config, model_desc)``
    :raises ValueError: when neither ``model_desc_file`` nor ``model_desc``
        is present in ``self.cfg``
    """
    config = Config(self.cfg.config_template)
    config['total_epochs'] = self.cfg.epoch
    if 'model_desc_file' in self.cfg:
        desc_file = self.cfg.model_desc_file.replace(
            "{local_base_path}", TaskOps().local_base_path)
        candidates = ListDict.load_csv(desc_file)
        pre_arch = candidates.sort('mAP')[0]['arch']
        # The second '_'-separated token of the arch name is used as the
        # pretrained backbone name.
        pretrained = pre_arch.split('_')[1]
        pre_worker_id = candidates.sort('mAP')[0]['pre_worker_id']
        model_desc = {
            'arch': pre_arch,
            'pre_arch': pretrained,
            'pre_worker_id': -1,
        }
        logging.info(
            "Initialize fully train model from: {}".format(model_desc))
        if self.cfg.regnition:
            # Re-write config from previous result.
            config['model']['backbone']['reignition'] = True
            weights = os.path.join(self.output_path,
                                   pretrained + '_imagenet.pth')
        else:
            weights = extract_backbone_from_pth(
                self.output_path, pre_worker_id, pretrained)
        config['model']['pretrained'] = weights
    elif 'model_desc' in self.cfg:
        model_desc = self.cfg.model_desc
    else:
        raise ValueError('Missing model description!')
    return update_config(config, model_desc)
def _append_record_to_csv(self, record_name=None, step_name=None,
                          record=None, mode='a'):
    """Write one record as a row of ``<step output dir>/<record_name>.csv``.

    Values that are ``dict`` or ``list`` are stored as their ``str()``
    representation. Failures while building or writing the row are logged and
    not re-raised.

    :param record_name: base name of the csv file; nothing is written when
        it is empty.
    :param step_name: pipeline step whose output directory receives the file.
    :param record: mapping of column name to value.
    :param mode: file mode passed to ``DataFrame.to_csv``; with ``'a'`` and a
        non-empty existing file the row is appended without a header.
    """
    local_output_path = os.path.join(TaskOps().local_output_path, step_name)
    logging.debug(
        "recode to csv, local_output_path={}".format(local_output_path))
    # Without a record name there is no meaningful target file. The previous
    # guard only returned when the directory already existed and otherwise
    # went on to create "None.csv".
    if not record_name:
        return
    file_path = os.path.join(local_output_path, "{}.csv".format(record_name))
    FileOps.make_base_dir(file_path)
    try:
        # Build the row on a copy so the caller's record is left untouched.
        row = {
            key: (str(record[key])
                  if isinstance(record[key], (dict, list)) else record[key])
            for key in record
        }
        data = pd.DataFrame([row])
        if not os.path.exists(file_path):
            data.to_csv(file_path, index=False)
        elif mode == 'a' and os.path.getsize(file_path):
            # File already has content: append the row only, no header line.
            data.to_csv(file_path, index=False, mode=mode, header=False)
        else:
            data.to_csv(file_path, index=False, mode=mode)
    except Exception as ex:
        logging.error(
            'Can not transfer record to csv file Error: {}'.format(ex))
def _save_worker_record(cls, record):
    """Dump the ``desc`` and ``performance`` entries of a record as json.

    Files are written into the local worker path obtained from ``TaskOps``
    for the record's ``step_name`` and ``worker_id``. A ``desc`` that is a
    list is written as one ``desc_<idx>.json`` per element; otherwise the
    file is ``desc_<worker_id>.json`` / ``performance_<worker_id>.json``.
    Empty or missing entries are skipped. Errors are logged, not raised.

    :param record: mapping providing ``step_name``, ``worker_id``, ``desc``
        and ``performance`` through ``get``.
    """
    step_name = record.get('step_name')
    worker_id = record.get('worker_id')
    _path = TaskOps().get_local_worker_path(step_name, worker_id)
    for record_name in ["desc", "performance"]:
        record_value = record.get(record_name)
        if not record_value:
            continue
        # Kept outside the try so the error message can report the file
        # that was being written (or None if none was reached).
        _file = None
        try:
            if isinstance(record_value, list) and record_name == "desc":
                # for cars/darts save multi-desc: one file per description.
                for idx, value in enumerate(record_value):
                    _file = FileOps.join_path(
                        _path, "desc_{}.json".format(idx))
                    with open(_file, "w") as f:
                        # Write the single element, not the whole list.
                        json.dump(value, f)
            else:
                _file_name = "{}_{}.json".format(record_name, worker_id)
                _file = FileOps.join_path(_path, _file_name)
                with open(_file, "w") as f:
                    json.dump(record_value, f)
        except Exception as ex:
            logging.error(
                "Failed to save {}, file={}, desc={}, msg={}".format(
                    record_name, _file, record_value, str(ex)))
def __init__(self, search_space=None, **kwargs):
    """Initialise the SpNas search state from ``self.config``.

    When ``config.last_search_result`` is set, the referenced csv is loaded,
    a previous architecture is picked through ``select_from_remote`` and
    stored in ``self.init_code``.

    :param search_space: search space, forwarded to the parent constructor
        and stored on the instance.
    :param kwargs: forwarded unchanged to the parent constructor.
    """
    super(SpNas, self).__init__(search_space, **kwargs)
    self.search_space = search_space
    self.sample_count = 0
    self.init_code = None
    self._total_list = ListDict()
    self.sample_level = self.config.sample_level
    self.max_sample = self.config.max_sample
    self.max_optimal = self.config.max_optimal
    self.serial_settings = self.config.serial_settings
    self._total_list_file = self.config.total_list.replace(
        "{local_base_path}", TaskOps().local_base_path)
    self.output_path = TaskOps().local_output_path

    if self.config.last_search_result:
        last_search_file = self.config.last_search_result.replace(
            "{local_base_path}", TaskOps().local_base_path)
        assert FileOps.exists(last_search_file), \
            "Not found serial results!"
        previous_results = ListDict.load_csv(last_search_file)
        pre_worker_id, pre_arch = self.select_from_remote(
            self.max_optimal, previous_results)
        if self.config.regnition:
            # Re-write config template to start from the pretrained file.
            template = self.codec.config_template
            template['model']['backbone']['reignition'] = True
            pretrained_pth = os.path.join(self.output_path,
                                          pre_arch + '_imagenet.pth')
            assert FileOps.exists(pretrained_pth), \
                "Not found {} pretrained .pth file!".format(pre_arch)
            self.codec.config_template['model'][
                'pretrained'] = pretrained_pth
            pre_worker_id = -1
        # Update config template.
        self.init_code = {
            'arch': pre_arch,
            'pre_arch': pre_arch.split('_')[1],
            'pre_worker_id': pre_worker_id,
        }

    logging.info(
        "inited SpNas {}-level search...".format(self.sample_level))
def _get_current_step_records(self):
    """Collect the records the current pipeline step starts from.

    If the step is not the first one, or a ``models_folder`` is configured,
    records are loaded from a model folder (the configured one, else the
    previous step's output directory). Otherwise a single
    ``ReportRecord(step_name, 0)`` is created. Every returned record has its
    ``step_name`` set to the current step.

    :return: list of records
    """
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)

    if cur_index < 1 and not models_folder:
        # First step and nothing to load from: start with a fresh record.
        records = [ReportRecord(step_name, 0)]
    else:
        if not models_folder:
            previous_step = PipelineConfig.steps[cur_index - 1]
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, previous_step)
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)

    logging.debug("Records: {}".format(records))
    for item in records:
        item.step_name = step_name
    return records
def _save_model_desc_file(self, id, desc):
    """Write a filtered model description to ``nas/model_desc_<id>.json``.

    Only the ``type``, ``modules`` and ``custom`` keys of ``desc`` are kept.
    The file lives under the local output path of the general user config.

    :param id: value inserted into the file name.
    :param desc: mapping holding the model description.
    """
    kept_keys = ("type", "modules", "custom")
    base_dir = TaskOps(UserConfig().data.general).local_output_path
    file_name = "model_desc_{}.json".format(id)
    desc_file = os.path.join(base_dir, "nas", file_name)
    FileOps.make_base_dir(desc_file)
    output = {key: desc[key] for key in desc if key in kept_keys}
    with open(desc_file, "w") as handle:
        json.dump(output, handle)
def _output_records(self, step_name, records, desc=True, weights_file=False,
                    performance=False):
    """Dump records to ``output.csv`` and copy selected worker files.

    The ``worker_id``, ``performance`` and ``desc`` fields of each serialized
    record are written to ``<step output dir>/output.csv``. Afterwards, for
    every record, the files matching the enabled patterns in that worker's
    local path are copied into the step output directory.

    :param step_name: pipeline step name.
    :param records: iterable of objects providing ``serialize()``.
    :param desc: copy ``desc_*.json`` files when true.
    :param weights_file: copy ``model_*.pth`` files when true.
    :param performance: copy ``performance_*.json`` files when true.
    """
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for item in records:
        serialized = item.serialize()
        outputs.append(deepcopy({key: serialized[key] for key in columns}))

    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    csv_file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(csv_file, index=False)
    except Exception:
        logging.error("Failed to save output file, file={}".format(csv_file))

    # File patterns to collect, in the same order as the flags are listed.
    patterns = []
    if desc:
        patterns.append("desc_*.json")
    if weights_file:
        patterns.append("model_*.pth")
    if performance:
        patterns.append("performance_*.json")

    for row in outputs:
        worker_path = TaskOps().get_local_worker_path(
            step_name, row["worker_id"])
        matched = [
            path
            for pattern in patterns
            for path in glob.glob(FileOps.join_path(worker_path, pattern))
        ]
        for path in matched:
            FileOps.copy_file(path, step_path)
def _load_pretrained_model(cls, network, model, model_checkpoint):
    """Load pretrained weights into ``model`` and return it.

    When no checkpoint is given and the network is a torchvision model, the
    weights file is looked up under
    ``<model_zoo_path>/torchvision_models/checkpoints``; otherwise
    ``model_checkpoint`` is used as the file path.

    :param network: object exposing ``_model_type`` and ``_model_name``; only
        read when ``model_checkpoint`` is empty.
    :param model: object exposing ``load_state_dict``.
    :param model_checkpoint: path of the weights file, or an empty value.
    :return: ``model`` after ``load_state_dict`` has been called.
    :raises FileNotFoundError: when no weights file can be resolved or the
        resolved path is not an existing file.
    """
    if not model_checkpoint and network._model_type == NetTypes.TORCH_VISION_MODEL:
        model_file_name = get_torchvision_model_file(network._model_name)
        full_path = "{}/torchvision_models/checkpoints/{}".format(
            TaskOps().model_zoo_path, model_file_name)
    else:
        full_path = model_checkpoint
    logging.info("load model weights from file.")
    logging.debug("Weights file: {}".format(full_path))
    # ``full_path`` may be empty/None here (no checkpoint, non-torchvision
    # network); test it before isfile so the error is the same clear one.
    if not full_path or not os.path.isfile(full_path):
        # Raising a plain string is itself a TypeError; raise a real
        # exception carrying the intended message.
        raise FileNotFoundError(
            "Pretrained model is not existed, model={}".format(full_path))
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(full_path)
    model.load_state_dict(checkpoint)
    return model
def _start_cluster(self):
    """Set and start dask distributed cluster.

    Creates the distributor and its client from the master address of
    ``self.dask_env`` and registers a ``WorkerEnv`` plugin on the client.
    The local host is taken from the ``BATCH_CURRENT_HOST`` environment
    variable, falling back to ``BATCH_CUSTOM0_HOSTS``, else ``None``.
    """
    self.md = ClusterDaskDistributor(self.dask_env.master_address)
    self.client = self.md.get_client()

    local_host = None
    for env_key in ("BATCH_CURRENT_HOST", "BATCH_CUSTOM0_HOSTS"):
        if env_key in os.environ:
            local_host = os.environ[env_key]
            break

    env = self.dask_env
    plugin = WorkerEnv(
        env.slave_proc_num,
        env.slave_gpus_per_proc,
        local_host,
        os.getpid(),
        TaskOps(self.cfg).temp_path,
    )
    self.client.register_worker_plugin(plugin)
def __init__(self):
    """Init master attrs, then start the dask cluster and evaluator pool.

    The general user config is deep-copied into ``self.cfg``. The process
    exits with status 0 when the dask environment fails to start or this
    process is not the master.
    """
    general_cfg = copy.deepcopy(UserConfig().data.general)
    self.cfg = general_cfg
    self.task_count = 0
    self.eval_count = self.cfg.worker.eval_count
    self.dask_env = DaskEnv(
        UserConfig().data.env,
        self.__master_path__,
        self.cfg.worker.gpus_per_job,
        TaskOps(self.cfg).temp_path,
    )
    started = self.dask_env.start()
    if not (started and self.dask_env.is_master):
        sys.exit(0)
    self._start_cluster()
    self._start_evaluator_multiprocess()
    self.t_queue = Queue()
    # Per the original note: e_queue saves GPU and Dloop Evaluator results.
    self.e_queue = utils.PairDictQueue()
def update(self, record):
    """Update sampler.

    Loads the pickled performance info of the worker named by the record,
    appends it to the total list when present, saves the output and, if a
    backup base path is set, copies the output folder there.

    :param record: mapping providing ``step_name`` and ``worker_id``
        through ``get``.
    """
    worker_result_path = TaskOps().get_local_worker_path(
        record.get("step_name"), record.get("worker_id"))
    performance_file = self.performance_path(worker_result_path)
    logging.info(
        "SpNas.update(), performance file={}".format(performance_file))

    info = FileOps.load_pickle(performance_file)
    if info is None:
        logging.info("SpNas.update(), file is not exited, "
                     "performance file={}".format(performance_file))
    else:
        self._total_list.append(info)

    self.save_output(self.output_path)
    if self.backup_base_path is None:
        return
    FileOps.copy_folder(self.output_path, self.backup_base_path)
def set_torch_home():
    """Point the ``TORCH_HOME`` environment variable at the local model zoo.

    The value is the absolute form of ``<model_zoo_path>/torchvision_models``
    where ``model_zoo_path`` comes from ``TaskOps`` built on the default
    general config.
    """
    model_zoo_path = TaskOps(DefaultConfig().data.general).model_zoo_path
    torch_home = "{}/torchvision_models".format(model_zoo_path)
    os.environ['TORCH_HOME'] = os.path.abspath(torch_home)
def __init__(self):
    """Create the instance with a default-constructed ``TaskOps``."""
    task_ops = TaskOps()
    self.task = task_ops
def backup_output_path(self):
    """Back up output to local path.

    Copies the local output folder to ``TaskOps().backup_base_path``; does
    nothing when no backup path is configured (``None``).
    """
    destination = TaskOps().backup_base_path
    if destination is not None:
        source = TaskOps().local_output_path
        FileOps.copy_folder(source, destination)
def __init__(self):
    """Create the instance with a ``TaskOps`` built on the general config."""
    general_cfg = UserConfig().data.general
    self.task = TaskOps(general_cfg)