Пример #1
0
 def __init__(self):
     """Init Pipeline: load the user config and extract the pipe steps.

     :raises ValueError: when no user config has been loaded yet.
     """
     data = UserConfig().data
     self.cfg = data
     # TODO: validate cfg by validator
     if data is None:
         raise ValueError("please load user config before pipeline init.")
     self._steps = data.pipeline
     logger.info("pipeline steps:%s", str(self._steps))
Пример #2
0
    def run(self):
        """Run every configured pipe step in order, then stop the dask cluster."""
        def _shutdown_cluster(signum, frame):
            # Emergency stop: tear the cluster down and terminate immediately.
            logging.info("Shutdown urgently.")
            Master.shutdown()
            os._exit(0)

        try:
            # Make Ctrl-C / kill trigger the urgent-shutdown path above.
            for sig in (signal.SIGINT, signal.SIGTERM):
                signal.signal(sig, _shutdown_cluster)
            for step_name in PipelineConfig.steps:
                cfg = UserConfig().data.get(step_name)
                General.step_name = step_name
                ClassFactory().set_current_step(cfg)
                # Load the Config object from the step description.
                load_conf_from_desc(PipeStepConfig, cfg)
                logger.info("Start pipeline step: [{}]".format(step_name))
                PipeStep().do()
        except Exception:
            logger.error("Failed to run pipeline.")
            logger.error(traceback.format_exc())
        # Always attempt a clean cluster shutdown, even after a failed run.
        try:
            Master.shutdown()
        except Exception:
            logger.error("Failed to shutdown dask cluster.")
            logger.error(traceback.format_exc())
Пример #3
0
def get_write_ip_master_local():
    """Read the master ip and port previously written under the local task path.

    here will not download anything from S3.
    :return: (ip, port) as strings, or (None, None) if nothing was written yet.
    """
    task_cfg = UserConfig().data.general.task
    ip_dir = os.path.join(task_cfg.local_base_path, task_cfg.task_id,
                          'ip_address/')
    ip_file = os.path.join(ip_dir, 'ip_address.txt')
    if not os.path.isfile(ip_file):
        return None, None
    # First line is the ip, second line is the port.
    with open(ip_file, 'r') as f:
        ip = f.readline().strip()
        port = f.readline().strip()
    logging.info("get write ip, ip={}, port={}".format(ip, port))
    return ip, port
Пример #4
0
def get_write_ip(args):
    """Get the ip and port that write in a system path.

    :param argparse.ArgumentParser args: `args` is a argparse that should
         contain `init_method`, `rank` and `world_size`. (Currently unused;
         kept for interface compatibility with callers.)
    :return: the ip and port, or (None, None) when they were never written.
    :rtype: str, str.

    """
    local_base_path = UserConfig().data.general.task.local_base_path
    local_task_id = UserConfig().data.general.task.task_id
    local_path = os.path.join(local_base_path, local_task_id, 'ip_address/')
    if not os.path.exists(local_path):
        # Keep the original side effect of ensuring the directory exists.
        FileOps.make_dir(local_path)
    file_path = os.path.join(local_path, 'ip_address.txt')
    # Bug fix: if the directory was just created (or the writer has not run
    # yet), the file does not exist and open() raised FileNotFoundError.
    # Mirror get_write_ip_master_local and report "not written" instead.
    if not os.path.isfile(file_path):
        return None, None
    with open(file_path, 'r') as f:
        ip = f.readline().strip()
        port = f.readline().strip()
        logging.info("get write ip, ip={}, port={}".format(ip, port))
        return ip, port
Пример #5
0
def write_ip(ip_address, port, args):
    """Write the ip and port in a system path.

    :param str ip_address: The `ip_address` need to write.
    :param str port: The `port` need to write.
    :param argparse.ArgumentParser args: `args` is a argparse that should
         contain `init_method`, `rank` and `world_size`.

    """
    task_cfg = UserConfig().data.general.task
    ip_dir = os.path.join(task_cfg.local_base_path, task_cfg.task_id,
                          'ip_address/')
    if not os.path.exists(ip_dir):
        FileOps.make_dir(ip_dir)

    ip_file = os.path.join(ip_dir, 'ip_address.txt')
    logging.info("write ip, file path={}".format(ip_file))
    # One value per line: ip first, then port.
    with open(ip_file, 'w') as f:
        f.writelines([ip_address + "\n", port + "\n"])
Пример #6
0
 def __init__(self, args=None):
     """Init DistributedWorker: snapshot configs and assign a unique worker id."""
     self.cfg = copy.deepcopy(args)
     super(DistributedWorker, self).__init__(self.cfg)
     # privates: class-level counter hands out one id per constructed worker
     DistributedWorker.__worker_id__ += 1
     self._worker_id = DistributedWorker.__worker_id__
     # publics: distributed-training defaults, presumably overwritten at
     # launch time — confirm against the launcher code
     self.rank = 0
     self.world_size = 1
     self.worker_addr = ""
     self.worker_nccl_port = 16666
     # cfg.worker.timeout looks like hours; converted to whole seconds here
     self.timeout = int(float(self.cfg.worker.timeout) * 60 * 60)
     # deep-copied snapshot of global state so a subprocess can restore it
     self.__env_config__ = tuple(
         copy.deepcopy(item) for item in (UserConfig().data,
                                          ClassFactory.__configs__,
                                          ClassFactory.__registry__))
Пример #7
0
class Pipeline(object):
    """Load configs and provide `run` method to start pipe steps.

    In this class, Pipeline will parse all pipe steps from the config data.
    Execute steps one by one and set glob configs with current step config.
    """

    def __init__(self):
        """Init Pipeline and set task_id and configs.

        :raises ValueError: when no user config has been loaded yet.
        """
        data = UserConfig().data
        self.cfg = data
        # TODO: validate cfg by validator
        if data is None:
            raise ValueError("please load user config before pipeline init.")
        self._steps = data.pipeline
        logger.info("pipeline steps:%s", str(self._steps))

    def run(self):
        """Execute the whole pipeline."""
        def _shutdown_cluster(signum, frame):
            # Emergency stop: tear the cluster down and terminate immediately.
            logging.info("Shutdown urgently.")
            Master.shutdown()
            os._exit(0)

        try:
            # Make Ctrl-C / kill trigger the urgent-shutdown path above.
            for sig in (signal.SIGINT, signal.SIGTERM):
                signal.signal(sig, _shutdown_cluster)
            for step_name in self._steps:
                cfg = self.cfg.get(step_name)
                self.cfg.general["step_name"] = step_name
                ClassFactory().set_current_step(cfg)
                logger.info("Start pipeline step: [{}]".format(step_name))
                PipeStep().do()
        except Exception:
            logger.error("Failed to run pipeline.")
            logger.error(traceback.format_exc())
        # Always attempt a clean cluster shutdown, even after a failed run.
        try:
            Master.shutdown()
        except Exception:
            logger.error("Failed to shutdown dask cluster.")
            logger.error(traceback.format_exc())
Пример #8
0
 def __init__(self):
     """Init master: take a private copy of the general config."""
     # Deep copy so later mutation of self.cfg never leaks into UserConfig.
     self.cfg = copy.deepcopy(UserConfig().data.general)
     # Bound to a concrete pipeline step / worker later on.
     self.step_name = None
     self.worker_id = None
Пример #9
0
import argparse
import logging
import os
import pickle

import horovod.torch as hvd

from vega.core.common.class_factory import ClassFactory
from vega.core.common.file_ops import FileOps
from vega.core.common.user_config import UserConfig

# Entry script for a horovod fully-train worker: restore the ClassFactory /
# UserConfig state pickled by the launcher, then run the trainer.
parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()

# SECURITY NOTE(review): exec of an environment variable runs arbitrary code;
# acceptable only because VEGA_INIT_ENV is set by the trusted launcher.
if 'VEGA_INIT_ENV' in os.environ:
    exec(os.environ.copy()['VEGA_INIT_ENV'])
logging.info('start horovod setting')
hvd.init()
try:
    # moxing (ModelArts SDK) is optional; best-effort quieting of OBS logs.
    import moxing as mox

    mox.file.set_auth(obs_client_log=False)
except Exception:
    # Bug fix: was a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt; keep the best-effort behavior but let those through.
    pass
FileOps.copy_file(args.cf_file, './cf_file.pickle')
hvd.join()
# SECURITY NOTE(review): pickle.load executes arbitrary code on malicious
# input — only ever point --cf_file at files produced by the launcher.
with open('./cf_file.pickle', 'rb') as f:
    cf_content = pickle.load(f)
ClassFactory.__configs__ = cf_content.get('configs')
ClassFactory.__registry__ = cf_content.get('registry')
UserConfig().__data__ = cf_content.get('data')
cls_trainer = ClassFactory.get_cls('trainer')
trainer = cls_trainer(None, 0)
trainer.train_process()