Exemplo n.º 1
0
def get_all_inters_from_yaml(file, filters):
    _envs = envs.load_yaml(file)
    all_flattens = {}

    def fatten_env_namespace(namespace_nests, local_envs):
        for k, v in local_envs.items():
            if isinstance(v, dict):
                nests = copy.deepcopy(namespace_nests)
                nests.append(k)
                fatten_env_namespace(nests, v)
            elif (k == "dataset" or k == "phase"
                  or k == "runner") and isinstance(v, list):
                for i in v:
                    if i.get("name") is None:
                        raise ValueError("name must be in dataset list. ", v)
                    nests = copy.deepcopy(namespace_nests)
                    nests.append(k)
                    nests.append(i["name"])
                    fatten_env_namespace(nests, i)
            else:
                global_k = ".".join(namespace_nests + [k])
                all_flattens[global_k] = v

    fatten_env_namespace([], _envs)
    ret = {}
    for k, v in all_flattens.items():
        for f in filters:
            if k.startswith(f):
                ret[k] = v
    return ret
Exemplo n.º 2
0
    def master():
        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        # Get fleet_mode & device
        run_extras = get_all_inters_from_yaml(args.model, ["runner."])
        mode = envs.get_runtime_environ("mode")
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
        fleet_mode = run_extras.get(fleet_class, "ps")
        device = run_extras.get(device_class, "cpu")
        device = device.upper()
        fleet_mode = fleet_mode.upper()

        if fleet_mode == "COLLECTIVE" and device != "GPU":
            raise ValueError("COLLECTIVE can not be used without GPU")

        # Get Thread nums
        model_envs = envs.load_yaml(args.model)
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = model_envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        thread_num = []
        for phase in phases:
            thread_num.append(int(phase["thread_num"]))
        max_thread_num = max(thread_num)

        backend_envs = envs.load_yaml(args.backend)
        flattens = envs.flatten_environs(backend_envs, "_")
        flattens["engine_role"] = "MASTER"
        flattens["engine_mode"] = envs.get_runtime_environ("mode")
        flattens["engine_run_config"] = args.model
        flattens["max_thread_num"] = max_thread_num
        flattens["fleet_mode"] = fleet_mode
        flattens["device"] = device
        flattens["backend_yaml"] = args.backend
        envs.set_runtime_environs(flattens)

        launch = ClusterEngine(None, args.model)
        return launch
Exemplo n.º 3
0
 def __init__(self, config=None):
     self._status_processor = {}
     self._place = fluid.CPUPlace()
     self._exe = fluid.Executor(self._place)
     self._exector_context = {}
     self._context = {'status': 'uninit', 'is_exit': False}
     self._config_yaml = config
     self._config = envs.load_yaml(config)
Exemplo n.º 4
0
def get_inters_from_yaml(file, filters):
    _envs = envs.load_yaml(file)
    flattens = envs.flatten_environs(_envs)
    inters = {}
    for k, v in flattens.items():
        for f in filters:
            if k.startswith(f):
                inters[k] = v
    return inters
Exemplo n.º 5
0
def user_define_engine(engine_yaml):
    _config = envs.load_yaml(engine_yaml)
    envs.set_runtime_environs(_config)
    train_location = envs.get_global_env("engine.file")
    train_dirname = os.path.dirname(train_location)
    base_name = os.path.splitext(os.path.basename(train_location))[0]
    sys.path.append(train_dirname)
    trainer_class = envs.lazy_instance_by_fliename(base_name,
                                                   "UserDefineTraining")
    return trainer_class
Exemplo n.º 6
0
    def master():
        role = "MASTER"
        from paddlerec.core.engine.cluster.cluster import ClusterEngine
        _envs = envs.load_yaml(args.backend)
        flattens = envs.flatten_environs(_envs, "_")
        flattens["engine_role"] = role
        flattens["engine_run_config"] = args.model
        flattens["engine_temp_path"] = tempfile.mkdtemp()
        envs.set_runtime_environs(flattens)
        print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value")))

        launch = ClusterEngine(None, args.model)
        return launch
Exemplo n.º 7
0
def local_cluster_engine(args):
    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get("runner." + _envs["mode"] + ".runner_class",
                                   None)

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")

    worker_num = run_extras.get("runner." + _envs["mode"] + ".worker_num", 1)
    server_num = run_extras.get("runner." + _envs["mode"] + ".server_num", 1)
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")

    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode", "")
    if fleet_mode == "":
        device = run_extras.get("runner." + _envs["mode"] + ".device", "cpu")
        if len(selected_gpus.split(",")) > 1 and device.upper() == "GPU":
            fleet_mode = "COLLECTIVE"
        else:
            fleet_mode = "PS"

    cluster_envs = {}
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    cluster_envs["CPU_NUM"] = "2"
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch
Exemplo n.º 8
0
    def get_worker_num(run_extras, workers):
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = _envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        dataset_names = []
        for phase in phases:
            dataset_names.append(phase["dataset_name"])

        datapaths = []
        for dataset in _envs.get("dataset"):
            if dataset["name"] in dataset_names:
                datapaths.append(dataset["data_path"])

        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]

        all_workers = [len(os.listdir(path)) for path in datapaths]
        all_workers.append(workers)
        max_worker_num = min(all_workers)

        if max_worker_num >= workers:
            return workers

        print(
            "phases do not have enough datas for training, set worker/gpu cards num from {} to {}"
            .format(workers, max_worker_num))

        return max_worker_num
Exemplo n.º 9
0
def local_mpi_engine(args):
    print("launch cluster engine with cluster to run model: {}".format(
        args.model))
    from paddlerec.core.engine.local_mpi import LocalMPIEngine

    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
        args.model))

    mpi = util.run_which("mpirun")
    if not mpi:
        raise RuntimeError("can not find mpirun, please check environment")

    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."])
    trainer_class = run_extras.get("runner." + _envs["mode"] + ".runner_class",
                                   None)
    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                "ps")

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    cluster_envs = {}
    cluster_envs["mpirun"] = mpi
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalMPIEngine(cluster_envs, args.model)
    return launch
Exemplo n.º 10
0
    def worker():
        role = "WORKER"

        _envs = envs.load_yaml(args.model)
        run_extras = get_all_inters_from_yaml(args.model,
                                              ["train.", "runner."])
        trainer_class = run_extras.get(
            "runner." + _envs["mode"] + ".trainer_class", None)

        if trainer_class:
            trainer = trainer_class
        else:
            trainer = "GeneralTrainer"

        executor_mode = "train"

        distributed_strategy = run_extras.get(
            "runner." + _envs["mode"] + ".distribute_strategy", "async")
        selected_gpus = run_extras.get(
            "runner." + _envs["mode"] + ".selected_gpus", "0")
        fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode",
                                    "ps")

        cluster_envs = {}
        cluster_envs["selected_gpus"] = selected_gpus
        cluster_envs["fleet_mode"] = fleet_mode
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.executor_mode"] = executor_mode
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.strategy"] = distributed_strategy
        cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
            "CPU_NUM")
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
        set_runtime_envs(cluster_envs, args.model)

        trainer = TrainerFactory.create(args.model)
        return trainer
Exemplo n.º 11
0
def single_infer_engine(args):
    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}

    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "Single Mode Only Support One GPU, Set Local Cluster Mode to use Multi-GPUS"
            )

        single_envs["selsected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus

    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
Exemplo n.º 12
0
    def __init__(self, config=None):
        self._status_processor = {}
        self.model = None
        self.inference_models = []
        self.increment_models = []
        self._exector_context = {}
        self._context = {'status': 'uninit', 'is_exit': False}
        self._context["config_yaml"] = config

        self._model = {}
        self._dataset = {}

        self._runner_name = envs.get_runtime_environ("mode")
        self._context["runner_name"] = self._runner_name

        phase_names = envs.get_global_env(
            "runner." + self._runner_name + ".phases", None)

        _config = envs.load_yaml(config)

        self._context["env"] = _config
        self._context["dataset"] = _config.get("dataset")

        phases = []
        if phase_names is None:
            phases = _config.get("phase")
        else:
            for phase in _config.get("phase"):
                if phase["name"] in phase_names:
                    phases.append(phase)

        self._context["phases"] = phases
        print("PaddleRec: Runner {} Begin".format(self._runner_name))
        self.which_engine()
        self.which_device()
        self.which_fleet_mode()
        self.which_executor_mode()
        self.legality_check()
Exemplo n.º 13
0
def yaml_validation(config):
    all_checkers, require_checkers = register()

    _config = envs.load_yaml(config)
    flattens = envs.flatten_environs(_config)

    for required in require_checkers:
        if required not in flattens.keys():
            print("\ncan not find {} in yaml, which is required\n".format(
                required))
            return False

    for name, flatten in flattens.items():
        checker = all_checkers.get(name, None)

        if not checker:
            continue

        ret = checker.is_valid(name, flattens)
        if not ret:
            return False

    return True
Exemplo n.º 14
0
def yaml_validation(config):
    all_checkers = register()

    require_checkers = []
    for name, checker in all_checkers.items():
        if checker.required:
            require_checkers.append(name)

    _config = envs.load_yaml(config)

    for required in require_checkers:
        if required not in _config.keys():
            print("\ncan not find {} in yaml, which is required\n".format(
                required))
            return False

    for name, value in _config.items():
        checker = all_checkers.get(name, None)
        if checker:
            ret = checker.is_valid(name, value)
            if not ret:
                return False

    return True
Exemplo n.º 15
0
 def create(config):
     _config = envs.load_yaml(config)
     envs.set_global_envs(_config)
     envs.update_workspace()
     trainer = TrainerFactory._build_trainer(config)
     return trainer
Exemplo n.º 16
0
 def __init__(self):
     # get backend env
     backend_yaml = envs.get_runtime_environ("backend_yaml")
     _env = envs.load_yaml(backend_yaml)
     self.backend_env = envs.flatten_environs(_env, ".")
     self.cluster_env = {}
Exemplo n.º 17
0
 def __init__(self, config):
     dg.MultiSlotDataGenerator.__init__(self)
     _config = envs.load_yaml(config)
Exemplo n.º 18
0
 def __init__(self, config):
     dg.MultiSlotDataGenerator.__init__(self)
     _config = envs.load_yaml(config)
     envs.set_global_envs(_config)
     envs.update_workspace()