def __init__(self): """Init Pipeline and set task_id and configs.""" self.cfg = UserConfig().data # TODO: validate cfg by validator if self.cfg is None: raise ValueError("please load user config before pipeline init.") self._steps = self.cfg.pipeline logger.info("pipeline steps:%s", str(self._steps))
def run(self): """Execute the whole pipeline.""" def _shutdown_cluster(signum, frame): logging.info("Shutdown urgently.") Master.shutdown() os._exit(0) try: signal.signal(signal.SIGINT, _shutdown_cluster) signal.signal(signal.SIGTERM, _shutdown_cluster) for step_name in PipelineConfig.steps: step_cfg = UserConfig().data.get(step_name) General.step_name = step_name ClassFactory().set_current_step(step_cfg) # load Config obj form desc load_conf_from_desc(PipeStepConfig, step_cfg) logger.info("Start pipeline step: [{}]".format(step_name)) PipeStep().do() except Exception: logger.error("Failed to run pipeline.") logger.error(traceback.format_exc()) try: Master.shutdown() except Exception: logger.error("Failed to shutdown dask cluster.") logger.error(traceback.format_exc())
def get_write_ip_master_local():
    """Get the ip and port written in a system path.

    This will not download anything from S3.
    """
    local_base_path = UserConfig().data.general.task.local_base_path
    local_task_id = UserConfig().data.general.task.task_id
    local_path = os.path.join(local_base_path, local_task_id, 'ip_address/')
    file_path = os.path.join(local_path, 'ip_address.txt')
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            ip = f.readline().strip()
            port = f.readline().strip()
        logging.info("get write ip, ip={}, port={}".format(ip, port))
        return ip, port
    else:
        return None, None
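# A hedged usage check of the (None, None) fallback above; assumes
# UserConfig().data has already been loaded so the task paths resolve.
ip, port = get_write_ip_master_local()
if ip is None:
    logging.info("no master ip recorded yet")
else:
    logging.info("master found at %s:%s", ip, port)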
def get_write_ip(args):
    """Get the ip and port written in a system path.

    :param argparse.ArgumentParser args: an argparse namespace that should
        contain `init_method`, `rank` and `world_size`.
    :return: the ip and port.
    :rtype: str, str
    """
    local_base_path = UserConfig().data.general.task.local_base_path
    local_task_id = UserConfig().data.general.task.task_id
    local_path = os.path.join(local_base_path, local_task_id, 'ip_address/')
    if not os.path.exists(local_path):
        FileOps.make_dir(local_path)
    file_path = os.path.join(local_path, 'ip_address.txt')
    with open(file_path, 'r') as f:
        ip = f.readline().strip()
        port = f.readline().strip()
    logging.info("get write ip, ip={}, port={}".format(ip, port))
    return ip, port
def write_ip(ip_address, port, args):
    """Write the ip and port to a system path.

    :param str ip_address: the ip address to write.
    :param str port: the port to write.
    :param argparse.ArgumentParser args: an argparse namespace that should
        contain `init_method`, `rank` and `world_size`.
    """
    local_base_path = UserConfig().data.general.task.local_base_path
    local_task_id = UserConfig().data.general.task.task_id
    local_path = os.path.join(local_base_path, local_task_id, 'ip_address/')
    if not os.path.exists(local_path):
        FileOps.make_dir(local_path)
    file_path = os.path.join(local_path, 'ip_address.txt')
    logging.info("write ip, file path={}".format(file_path))
    with open(file_path, 'w') as f:
        f.write(ip_address + "\n")
        f.write(port + "\n")
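# A standalone sketch of the ip_address.txt round trip that write_ip and
# get_write_ip implement: line 1 holds the ip, line 2 the port. The /tmp
# path below is a hypothetical stand-in for the task's local_path.
import os

demo_dir = "/tmp/vega_demo/ip_address"
os.makedirs(demo_dir, exist_ok=True)
demo_file = os.path.join(demo_dir, "ip_address.txt")

with open(demo_file, "w") as f:      # mirrors write_ip
    f.write("127.0.0.1" + "\n")
    f.write("16666" + "\n")

with open(demo_file, "r") as f:      # mirrors get_write_ip
    ip = f.readline().strip()
    port = f.readline().strip()
assert (ip, port) == ("127.0.0.1", "16666")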
def __init__(self, args=None):
    """Init DistributedWorker."""
    self.cfg = copy.deepcopy(args)
    super(DistributedWorker, self).__init__(self.cfg)
    # privates
    DistributedWorker.__worker_id__ = DistributedWorker.__worker_id__ + 1
    self._worker_id = DistributedWorker.__worker_id__
    # publics
    self.rank = 0
    self.world_size = 1
    self.worker_addr = ""
    self.worker_nccl_port = 16666
    self.timeout = int(float(self.cfg.worker.timeout) * 60 * 60)
    self.__env_config__ = (copy.deepcopy(UserConfig().data),
                           copy.deepcopy(ClassFactory.__configs__),
                           copy.deepcopy(ClassFactory.__registry__))
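# A self-contained sketch of the class-level counter pattern that
# DistributedWorker uses for worker ids: every instantiation increments the
# shared __worker_id__, so ids are unique within the process.
class CounterDemo(object):
    __worker_id__ = 0

    def __init__(self):
        CounterDemo.__worker_id__ = CounterDemo.__worker_id__ + 1
        self._worker_id = CounterDemo.__worker_id__

assert CounterDemo()._worker_id == 1
assert CounterDemo()._worker_id == 2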
class Pipeline(object):
    """Load configs and provide `run` method to start pipe steps.

    In this class, Pipeline will parse all pipe steps from the config data,
    execute the steps one by one, and set global configs with the current
    step's config.
    """

    def __init__(self):
        """Init Pipeline and set task_id and configs."""
        self.cfg = UserConfig().data
        # TODO: validate cfg by validator
        if self.cfg is None:
            raise ValueError("please load user config before pipeline init.")
        self._steps = self.cfg.pipeline
        logger.info("pipeline steps:%s", str(self._steps))

    def run(self):
        """Execute the whole pipeline."""
        def _shutdown_cluster(signum, frame):
            logging.info("Shutdown urgently.")
            Master.shutdown()
            os._exit(0)

        try:
            signal.signal(signal.SIGINT, _shutdown_cluster)
            signal.signal(signal.SIGTERM, _shutdown_cluster)
            for step_name in self._steps:
                step_cfg = self.cfg.get(step_name)
                self.cfg.general["step_name"] = step_name
                ClassFactory().set_current_step(step_cfg)
                logger.info("Start pipeline step: [{}]".format(step_name))
                PipeStep().do()
        except Exception:
            logger.error("Failed to run pipeline.")
            logger.error(traceback.format_exc())
        try:
            Master.shutdown()
        except Exception:
            logger.error("Failed to shutdown dask cluster.")
            logger.error(traceback.format_exc())
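# A hedged usage sketch: UserConfig must be populated before Pipeline() is
# constructed, otherwise __init__ raises ValueError. The loader method name
# and the config path below are assumptions, not confirmed API.
from vega.core.common.user_config import UserConfig

UserConfig().load("./pipeline.yml")  # assumed loader name; hypothetical path
Pipeline().run()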
def __init__(self): """Init master.""" self.cfg = copy.deepcopy(UserConfig().data.general) self.step_name = None self.worker_id = None
import argparse
import logging
import os
import pickle

import horovod.torch as hvd

from vega.core.common.class_factory import ClassFactory
from vega.core.common.user_config import UserConfig
from vega.core.common.file_ops import FileOps

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()

if 'VEGA_INIT_ENV' in os.environ:
    exec(os.environ.copy()['VEGA_INIT_ENV'])

logging.info('start horovod setting')
hvd.init()
try:
    import moxing as mox
    mox.file.set_auth(obs_client_log=False)
except Exception:
    # moxing is optional; only available on ModelArts/OBS environments
    pass
FileOps.copy_file(args.cf_file, './cf_file.pickle')
hvd.join()
with open('./cf_file.pickle', 'rb') as f:
    cf_content = pickle.load(f)
ClassFactory.__configs__ = cf_content.get('configs')
ClassFactory.__registry__ = cf_content.get('registry')
UserConfig().__data__ = cf_content.get('data')
cls_trainer = ClassFactory.get_cls('trainer')
trainer = cls_trainer(None, 0)
trainer.train_process()
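# A sketch of the cf_file.pickle contract the script above relies on: a dict
# with "configs", "registry" and "data" keys, produced on the launcher side.
# The empty payload values here are placeholders, not real state.
import pickle

payload = {"configs": {}, "registry": {}, "data": {}}
with open("./cf_file.pickle", "wb") as f:
    pickle.dump(payload, f)

with open("./cf_file.pickle", "rb") as f:
    cf_content = pickle.load(f)
assert set(cf_content) == {"configs", "registry", "data"}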