def _do_horovod_fully_train(self):
    """Launch fully-train through the horovod shell launcher and wait for it.

    The class-factory configs and registry, the user config data, the
    network registry and the general config are pickled to ``cf.pickle``
    next to this module, copied to the task's local base path, and the path
    of that copy is handed to the launcher script as an argument.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    local_pickle = os.path.join(module_dir, 'cf.pickle')
    snapshot = {
        'configs': ClassFactory.__configs__,
        'registry': ClassFactory.__registry__,
        'data': UserConfig().__data__,
        'network_registry': NetworkFactory.__network_registry__,
        'general': obj2config(General),
    }
    with open(local_pickle, 'wb') as handle:
        pickle.dump(snapshot, handle)
    shared_pickle = os.path.join(self.task.local_base_path, 'cf.pickle')
    FileOps.copy_file(local_pickle, shared_pickle)

    if os.environ.get('DLS_TASK_NUMBER') is not None:
        # Roma
        cmd = [
            'bash',
            '{}/horovod/run_horovod_train.sh'.format(module_dir),
            str(self.world_device_size),
            shared_pickle,
        ]
    else:
        # local cluster
        worker_ips = '127.0.0.1'
        master_ip = General.cluster.master_ip
        if master_ip is not None and master_ip != '127.0.0.1':
            # Non-local master: master first, then every slave, comma separated.
            worker_ips = master_ip
            for slave_ip in General.cluster.slaves:
                worker_ips += ',' + slave_ip
        cmd = [
            'bash',
            '{}/horovod/run_cluster_horovod_train.sh'.format(module_dir),
            str(self.world_device_size),
            shared_pickle,
            worker_ips,
        ]

    # Blocks until the launcher exits; its exit status is not inspected.
    launcher = subprocess.Popen(cmd, env=os.environ)
    launcher.wait()
def _init_model(self, model=None):
    """Load model desc from save path and parse to model.

    :param model: an already built model; when given it is returned as is
    :return: the given model, a model built from the configured
        description, or None when no description is configured
    """
    if model is not None:
        return model
    model_cfg = ClassFactory.__configs__.get('model')
    if model_cfg is None:
        # No 'model' section registered: same outcome as a section that
        # carries neither 'model_desc_file' nor 'model_desc'.
        return None
    if 'model_desc_file' in model_cfg and model_cfg.model_desc_file is not None:
        desc_file = model_cfg.model_desc_file.replace(
            "{model_zoo}", self.model_zoo_path)
        desc_file = desc_file.replace("{local_base_path}", self.local_base_path)
        if ":" not in desc_file:
            desc_file = os.path.abspath(desc_file)
        # Deliberately a second, independent check: it runs on the possibly
        # rewritten path, which may itself contain ':'.
        if ":" in desc_file:
            local_desc_file = FileOps.join_path(
                self.local_output_path, os.path.basename(desc_file))
            FileOps.copy_file(desc_file, local_desc_file)
            desc_file = local_desc_file
            if self.horovod:
                hvd.join()
        model_desc = Config(desc_file)
        logging.info("net_desc:{}".format(model_desc))
    elif 'model_desc' in model_cfg and model_cfg.model_desc is not None:
        model_desc = model_cfg.model_desc
    else:
        return None
    if model_desc is None:
        return None
    # Remember the description that the returned model was built from.
    self.model_desc = model_desc
    net_desc = NetworkDesc(model_desc)
    return net_desc.to_model()
def _copy_needed_file(self):
    """Copy the optional pareto-front and random files named in ``self.cfg``.

    For each of ``pareto_front_file`` and ``random_file`` that is present in
    the config and not None, ``{local_base_path}`` in the configured path is
    replaced by ``self.local_base_path``, the destination path is stored on
    the instance, and the file is copied there.
    """
    cfg = self.cfg
    base_token = "{local_base_path}"

    if "pareto_front_file" in cfg and cfg.pareto_front_file is not None:
        pareto_source = cfg.pareto_front_file.replace(base_token, self.local_base_path)
        self.pareto_front_file = FileOps.join_path(self.result_path, "pareto_front.csv")
        FileOps.copy_file(pareto_source, self.pareto_front_file)

    if "random_file" in cfg and cfg.random_file is not None:
        random_source = cfg.random_file.replace(base_token, self.local_base_path)
        self.random_file = FileOps.join_path(
            self.local_output_path, cfg.step_name, "random.csv")
        FileOps.copy_file(random_source, self.random_file)
def _save_descript(self):
    """Save result descript.

    Builds a model description from the genotypes of the current
    architecture weights and the configured darts template, then stores it
    in ``self.trainer.config.codec``.
    """
    template_name = self.config.darts_template_file
    genotypes = self.search_alg.codec.calc_genotype(self._get_arch_weights())

    cifar10_key = "{default_darts_cifar10_template}"
    imagenet_key = "{default_darts_imagenet_template}"
    if template_name not in (cifar10_key, imagenet_key):
        # Custom template: copy it into the local worker path and load the copy.
        local_copy = FileOps.join_path(
            self.trainer.get_local_worker_path(), os.path.basename(template_name))
        FileOps.copy_file(template_name, local_copy)
        template = Config(local_copy)
    elif template_name == cifar10_key:
        template = DartsNetworkTemplateConfig.cifar10
    else:
        template = DartsNetworkTemplateConfig.imagenet

    self.trainer.config.codec = self._gen_model_desc(genotypes, template)
def _save_descript(self, descript):
    """Save result descript.

    Builds a model description from the genotypes of ``self.model.arch_weights``
    and the configured darts template, then passes it to the trainer's
    ``output_model_desc`` together with the trainer's worker id.

    :param descript: darts search result descript (not read by this method)
    :type descript: dict or Config
    """
    template_name = self.cfg.darts_template_file
    genotypes = self.search_alg.codec.calc_genotype(self.model.arch_weights)

    cifar10_key = "{default_darts_cifar10_template}"
    imagenet_key = "{default_darts_imagenet_template}"
    if template_name not in (cifar10_key, imagenet_key):
        # Custom template: copy it into the local worker path and load the copy.
        local_copy = FileOps.join_path(
            self.trainer.get_local_worker_path(), os.path.basename(template_name))
        FileOps.copy_file(template_name, local_copy)
        template = Config(local_copy)
    elif template_name == cifar10_key:
        template = DefaultConfig().data.default_darts_cifar10_template
    else:
        template = DefaultConfig().data.default_darts_imagenet_template

    model_desc = self._gen_model_desc(genotypes, template)
    self.trainer.output_model_desc(self.trainer.worker_id, model_desc)
def _init_model(self, model=None):
    """Load model desc from save path and parse to model.

    The description is taken from the ``model`` section of the class-factory
    configs, trying ``model_desc_file``, then ``model_desc``, then
    ``models_folder`` (first ``desc_*.json`` returned by glob).

    :param model: an already built model; when given it is returned, after
        ``.cuda()`` if the torch backend is active and ``self.use_cuda`` is set
    :return: the model, or None when no description is configured or no
        description file is found in ``models_folder``
    """
    if model is not None:
        if vega.is_torch_backend() and self.use_cuda:
            model = model.cuda()
        return model
    model_cfg = Config(ClassFactory.__configs__.get('model'))
    if "model_desc_file" in model_cfg and model_cfg.model_desc_file is not None:
        desc_file = model_cfg.model_desc_file
        desc_file = desc_file.replace("{local_base_path}", self.local_base_path)
        if ":" not in desc_file:
            desc_file = os.path.abspath(desc_file)
        # Deliberately a second, independent check: it runs on the possibly
        # rewritten path, which may itself contain ':'.
        if ":" in desc_file:
            local_desc_file = FileOps.join_path(
                self.local_output_path, os.path.basename(desc_file))
            FileOps.copy_file(desc_file, local_desc_file)
            desc_file = local_desc_file
        model_desc = Config(desc_file)
        logging.info("net_desc:{}".format(model_desc))
    elif "model_desc" in model_cfg and model_cfg.model_desc is not None:
        model_desc = model_cfg.model_desc
    elif "models_folder" in model_cfg and model_cfg.models_folder is not None:
        folder = model_cfg.models_folder.replace("{local_base_path}", self.local_base_path)
        pattern = FileOps.join_path(folder, "desc_*.json")
        candidates = glob.glob(pattern)
        if not candidates:
            # Previously an opaque IndexError; report what was searched for
            # and fall back to the "no description" result.
            logging.error("No model desc file found, pattern={}".format(pattern))
            return None
        model_desc = Config(candidates[0])
    else:
        return None
    if model_desc is None:
        return None
    # Remember the description that the returned model was built from.
    self.model_desc = model_desc
    net_desc = NetworkDesc(model_desc)
    model = net_desc.to_model()
    if vega.is_torch_backend() and self.use_cuda:
        model = model.cuda()
    return model
def _output_records(self, step_name, records, desc=True, weights_file=False, performance=False):
    """Dump records.

    Writes the ``worker_id``, ``performance`` and ``desc`` fields of every
    record to ``output.csv`` in the step folder under the task's local output
    path, then copies the selected per-worker files into that step folder.

    :param step_name: name of the step; used as the output sub-folder name
    :param records: iterable of objects whose ``serialize()`` result can be
        indexed by ``worker_id``, ``performance`` and ``desc``
    :param desc: copy each worker's ``desc_*.json`` files
    :param weights_file: copy each worker's ``model_*.pth`` files
    :param performance: copy each worker's ``performance_*.json`` files
    """
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for record in records:
        serialized = record.serialize()
        outputs.append(deepcopy({key: serialized[key] for key in columns}))
    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    csv_file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(csv_file, index=False)
    except Exception:
        # Best effort: the per-worker files below are still copied, but the
        # cause of the failure is kept in the log.
        logging.exception("Failed to save output file, file={}".format(csv_file))

    # File name patterns to collect from every worker folder.
    patterns = []
    if desc:
        patterns.append("desc_*.json")
    if weights_file:
        patterns.append("model_*.pth")
    if performance:
        patterns.append("performance_*.json")

    for record in outputs:
        worker_path = TaskOps().get_local_worker_path(step_name, record["worker_id"])
        matched_files = []
        for pattern in patterns:
            matched_files += glob.glob(FileOps.join_path(worker_path, pattern))
        for matched in matched_files:
            FileOps.copy_file(matched, step_path)
def save_genotypes_to_json(self, genotypes, acc, obj, save_folder, ga_epoch):
    """Save genotypes.

    Every genotype is written into a copy of the darts template config and
    handed to the trainer's ``output_model_desc`` with its index.

    :param genotypes: Genotype for models
    :type genotypes: namedtuple Genotype
    :param acc: accuracy (not read by this method)
    :type acc: ndarray
    :param obj: objectives, etc. FLOPs or number of parameters (not read by
        this method)
    :type obj: ndarray
    :param save_folder: folder name prefix; with a custom template file the
        directory ``<worker_path>/<save_folder>_<ga_epoch>`` is created
    :type save_folder: string
    :param ga_epoch: suffix appended to ``save_folder`` for that directory
    """
    template_file = self.trainer.cfg.darts_template_file
    if template_file == "{default_darts_cifar10_template}":
        template = DefaultConfig().data.default_darts_cifar10_template
    elif template_file == "{default_darts_imagenet_template}":
        template = DefaultConfig().data.default_darts_imagenet_template
    else:
        worker_path = self.trainer.get_local_worker_path()
        _path = os.path.join(worker_path, save_folder + '_{}'.format(ga_epoch))
        # exist_ok avoids the check-then-create race when several workers
        # create the same directory.
        os.makedirs(_path, exist_ok=True)
        base_file = os.path.basename(template_file)
        local_template = FileOps.join_path(self.trainer.local_output_path, base_file)
        FileOps.copy_file(template_file, local_template)
        with open(local_template, 'r') as f:
            template = json.load(f)
    for idx, genotype in enumerate(genotypes):
        template_cfg = Config(template)
        template_cfg.super_network.normal.genotype = genotype.normal
        template_cfg.super_network.reduce.genotype = genotype.reduce
        self.trainer.output_model_desc(idx, template_cfg)