Example #1
    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        # if MPISymetricRoleMaker is defined,
        # we assume the user wants to submit the job on an MPI cluster
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            from paddle.fluid.transpiler.details.checkport import wait_server_ready
            wait_server_ready(fleet.server_endpoints(to_string=False))

        if not self._transpile_config.sync_mode:
            if self._transpile_config.geo_sgd_mode:
                self._communicator = Communicator(
                    self.main_program, self.vars_info, fleet.worker_num(),
                    self._transpile_config.geo_sgd_need_push_nums)
            else:
                self._communicator = Communicator(self.main_program)

            if not self._communicator.is_running():
                self._communicator.start()
            else:
                warnings.warn("communicator has been initialized, skip")
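
A hedged sketch of the user-side flow that reaches `init_worker` above, pieced together from Examples 3 and 5 below (`avg_cost` stands for a loss variable built elsewhere; the role settings follow Example 3):

import paddle.fluid as fluid
from paddle.fluid.transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet

role = role_maker.UserDefinedRoleMaker(
    current_id=0,
    role=role_maker.Role.WORKER,
    worker_num=2,
    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
fleet.init(role)

# ... build the network; avg_cost is the resulting loss variable ...

config = DistributeTranspilerConfig()
config.sync_mode = False                 # async mode: init_worker starts the Communicator
optimizer = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), config)
optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    fleet.init_worker()                  # the method shown above
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fleet.startup_program)
    # ... training loop ...
    fleet.stop_worker()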
Example #2
    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        if not self._transpile_config.sync_mode:
            self._communicator = Communicator(self.main_program)
            self._communicator.start()
Example #3
    def test_communicator_init_and_start(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        comm = Communicator(fleet.main_program)
        comm.start()
        time.sleep(10)
        comm.stop()
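
The test above calls `self.net()`, which is not shown in this snippet. A minimal sketch of such a helper, assuming a one-layer linear-regression model built with the fluid 1.x layers API (hypothetical, for illustration):

    def net(self):
        # hypothetical model: one fully-connected layer trained with squared error
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        y_predict = fluid.layers.fc(input=x, size=1)
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        return fluid.layers.mean(cost)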
Example #4
    def test_communicator_init_and_start(self):
        prog = fluid.Program()
        comm = Communicator(prog)
        comm.start()
        comm.stop()
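
Examples 3 and 4 start the Communicator unconditionally; the production paths in Examples 1 and 5 guard the call. A self-contained sketch of that guard, assuming the fluid 1.x `Communicator(program)` constructor seen above:

import warnings
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator

comm = Communicator(fluid.Program())

# guard against double-starting, mirroring init_worker in Examples 1 and 5
if not comm.is_running():
    comm.start()
else:
    warnings.warn("communicator has been initialized, skip")
comm.stop()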
Example #5
class DistributedTranspiler(Fleet):
    """
    A subclass for compatibility with fluid.transpiler.DistributeTranspiler.
    """
    def __init__(self):
        super(DistributedTranspiler, self).__init__(Mode.TRANSPILER)
        self._transpile_config = None
        self._transpiler = None
        self._origin_program = None
        self.startup_program = None
        self.main_program = None
        self._communicator = None

    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        # if MPISymetricRoleMaker is defined,
        # we assume the user wants to submit the job on an MPI cluster
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            from paddle.fluid.transpiler.details.checkport import wait_server_ready
            wait_server_ready(fleet.server_endpoints(to_string=False))

        if not self._transpile_config.sync_mode:
            self._communicator = Communicator(self.main_program)

            if not self._communicator.is_running():
                self._communicator.start()
            else:
                warnings.warn("communicator has been initialized, skip")

    def init_server(self, model_dir=None):
        """
        `init_server` performs several steps before starting the pserver:
        first, run the executor to initialize the startup program;
        second, if `model_dir` is not empty, load parameters from it for incremental training.

        Args:
            model_dir(str): The directory to load parameters from for incremental training.

        Returns:
            None
        """
        if not self.startup_program:
            raise ValueError(
                "startup_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.startup_program)

        if model_dir:
            if not os.path.isdir(model_dir):
                raise ValueError("There is no directory named '%s'", model_dir)

            io.load_persistables(self._executor, model_dir, self.main_program)

    def run_server(self):
        """
        `run_server` execute executor to start pserver main program.

        Returns:
            None
        """
        if not self.main_program:
            raise ValueError(
                "main_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.main_program)

    def stop_worker(self):
        """
        Close this executor.

        For distributed training, this method frees the resources on PServers related to
        the current Trainer.

        Returns:
            None
        """
        if not self._transpile_config.sync_mode and self._communicator.is_running(
        ):
            self._communicator.stop()
        self._executor.close()

        if isinstance(self._role_maker, MPISymetricRoleMaker):
            self._role_maker._finalize()

    def distributed_optimizer(self, optimizer, strategy=None):
        """
        Optimizer for distributed training.

        For distributed training, this method builds a new instance of DistributedOptimizer,
        which has the basic Optimizer functionality plus special features for distributed training.

        Args:
            optimizer(Optimizer): The optimizer to wrap for distributed training.
            strategy(DistributeTranspilerConfig): Extra properties for the distributed optimizer.

        Returns:
            TranspilerOptimizer: subclass of DistributedOptimizer.
        """

        if not isinstance(optimizer, Optimizer):
            raise ValueError("optimizer must be an instance of Optimizer")
        self._optimizer = TranspilerOptimizer(optimizer, strategy)
        return self._optimizer

    def save_inference_model(self,
                             executor,
                             dirname,
                             feeded_var_names,
                             target_vars,
                             main_program=None,
                             export_for_deployment=True):
        """
        Prune the given `main_program` to build a new program especially for inference,
        and then save it and all related parameters to given `dirname` by the `executor`.
        """
        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type"
            )

        if main_program is not None:
            if isinstance(main_program, CompiledProgram):
                raise TypeError(
                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
                )
            io.save_inference_model(dirname, feeded_var_names, target_vars,
                                    executor, main_program, None, None,
                                    export_for_deployment)
        else:
            io.save_inference_model(dirname, feeded_var_names, target_vars,
                                    executor, self._origin_program, None, None,
                                    export_for_deployment, True)

            model_basename = "__model__"
            model_filename = os.path.join(dirname, model_basename)

            with open(model_filename, "rb") as f:
                program_desc_str = f.read()

            program = Program.parse_from_string(program_desc_str)
            program._copy_dist_param_info_from(self.main_program)
            self.save_persistables(executor, dirname, program)

    def save_persistables(self, executor, dirname, main_program=None):
        """
        This function filters out all variables with `persistable==True` from the
        given `main_program` and then saves these variables to the folder `dirname`
        or file `filename`.

        The `dirname` is used to specify the folder where persistable variables
        are going to be saved. If you would like to save variables in separate
        files, set `filename` to None; if you would like to save all variables in a
        single file, use `filename` to specify the file name.
        """
        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type"
            )

        if main_program is None:
            main_program = self.main_program

        if isinstance(main_program, CompiledProgram):
            raise TypeError(
                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
            )

        if not main_program._is_distributed:
            raise ValueError(
                "main_program is for local, may not use fleet.save_persistables"
            )

        io.save_persistables(executor, dirname, main_program, None)

    def _transpile(self, config):
        if not isinstance(config, DistributeTranspilerConfig):
            raise TypeError(
                "config must be an instance of DistributeTranspilerConfig")

        if not config.sync_mode:
            config.runtime_split_send_recv = True

        # _origin_program is a deep copy of default_main_program, kept for inference
        self._origin_program = default_main_program().clone(for_test=False)

        self._transpile_config = config
        self._transpiler = OriginTranspiler(config)

        if self.is_worker():
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
                pservers=fleet.server_endpoints(to_string=True),
                trainers=fleet.worker_num(),
                sync_mode=config.sync_mode)

            if isinstance(self._role_maker, MPISymetricRoleMaker):
                config.wait_port = False

            self.main_program = self._transpiler.get_trainer_program(
                wait_port=config.wait_port)
            self.startup_program = default_startup_program()
        else:
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
                pservers=fleet.server_endpoints(to_string=True),
                trainers=fleet.worker_num(),
                sync_mode=config.sync_mode,
                current_endpoint=self.server_endpoints()[self.server_index()])
            self.main_program, self.startup_program = \
                self._transpiler.get_pserver_programs(self.server_endpoints()[self.server_index()])
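
`_transpile` above branches on `config.sync_mode`, and Example 1's `init_worker` additionally branches on `config.geo_sgd_mode`. A hedged sketch of selecting GEO-SGD through the same config (attribute names as used in Example 1; availability depends on the Paddle 1.x version; `fleet`, `optimizer`, and `avg_cost` continue the driver sketch after Example 1):

config = DistributeTranspilerConfig()
config.sync_mode = False               # async: _transpile sets runtime_split_send_recv = True
config.geo_sgd_mode = True             # GEO-SGD: Communicator gets vars_info and a push threshold
config.geo_sgd_need_push_nums = 100    # illustrative value: push after this many local updates

optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)           # triggers _transpile for the worker or pserver role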
Example #6
    def _init_worker(self):
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
            SyncStrategy, GeoStrategy

        is_sync = self.compiled_strategy.is_sync_mode()
        worker = self._get_fleet_proto(is_server=False, is_sync=is_sync)
        server = self._get_fleet_proto(is_server=True, is_sync=is_sync)

        def sync_strategy_envs():
            kwargs = {}
            kwargs[
                "pserver_endpoints"] = self.role_maker._get_pserver_endpoints(
                )
            kwargs["trainer_id"] = self.role_maker._worker_index()
            return kwargs

        proto_txt = str(worker) + "\n" + str(server)

        endpoints = self.compiled_strategy.get_ps_endpoints()

        string_hosts = []
        for idx, ep in enumerate(endpoints):
            host, port = ep.split(":")
            pshost = fluid.core.PSHost(host, int(port), idx)
            string_hosts.append(pshost.serialize_to_string())

        dense_map = self.compiled_strategy.get_the_one_recv_context(
            split_dense_table=self.role_maker._is_heter_parameter_server_mode)
        send_ctx = self.compiled_strategy.get_the_one_send_context(
            split_dense_table=self.role_maker._is_heter_parameter_server_mode,
            ep_list=endpoints)
        trainer_config = self.async_strategy.get_trainer_runtime_config()

        debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))

        if debug:
            print("worker: \n{}".format(proto_txt))
            print("communicator send_ctx:")
            for key in send_ctx:
                print("{}: {}".format(key, send_ctx[key]))
            for key in dense_map:
                print("{}: {}".format(key, dense_map[key]))

        kwargs = {}
        kwargs['need_global_step'] = "0"
        kwargs["trainer_id"] = self.role_maker._role_id()
        kwargs["trainers"] = self.role_maker._worker_num()
        if self.role_maker._is_heter_worker():
            kwargs["trainer_id"] += kwargs["trainers"]

        for table in server.servers[0].tables:
            if table.table_class == "BarrierTable":
                kwargs["barrier_table_id"] = table.id
                break

        if isinstance(self.async_strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        from paddle.fluid.communicator import Communicator, HeterClient
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())
        self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
                                         string_hosts, fluid.global_scope())

        dist_strategy = self.context["valid_strategy"]

        is_test = bool(int(os.getenv("TEST_MODE", "0")))

        if self.role_maker._is_first_worker(
        ) and self.role_maker._is_heter_parameter_server_mode:
            # in ps-heter mode, load all parameters on the first worker
            init_params = self.compiled_strategy.get_the_one_recv_context(
                split_dense_table=True, use_origin_program=True)
        else:
            init_params = dense_map

        if not is_test:
            self._communicator.init_params(init_params)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")

        launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
        launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
        if launch_barrier and launch_barrier_flag:
            # trainers wait for servers to be ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # in ps-heter mode, wait for heter workers to be ready
            if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
            ):
                wait_server_ready(
                    self.role_maker._get_heter_worker_endpoints())

                self._heter_client = HeterClient(
                    self.role_maker._get_heter_worker_endpoints(),
                    self.role_maker._role_id())
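
`_init_worker` above is steered by three environment variables. A short sketch of the toggles it reads (names and defaults taken from the code):

import os

os.environ["PSERVER_DEBUG"] = "1"          # print the fleet proto and send/recv contexts
os.environ["TEST_MODE"] = "0"              # "1" skips communicator.init_params(...)
os.environ["FLAGS_LAUNCH_BARRIER"] = "1"   # gate the wait_server_ready barrier after start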
Example #7
class TheOnePSRuntime(RuntimeBase):
    def __init__(self):
        super(TheOnePSRuntime, self).__init__()
        self._communicator = None
        self._server = None
        self._worker = fluid.core.DistFleetWrapper()
        self._server_sub_program = []
        self._heter_client = None

    def _set_basic_info(self, context):
        self.context = context
        self.role_maker = context["role_maker"]
        self.origin_main_program = context["origin_main_program"]
        self.origin_startup_program = context["origin_startup_program"]
        self.async_strategy = self._get_distributed_strategy()
        self.compiled_strategy = self.build_compiled_startegy()

    def _get_distributed_strategy(self):
        strategy = None

        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
            StrategyFactory

        dist_strategy = self.context["valid_strategy"]
        k_steps = dist_strategy.a_sync_configs["k_steps"]

        if not dist_strategy.a_sync and k_steps == 0:
            strategy = StrategyFactory.create_sync_strategy()

        if dist_strategy.a_sync and k_steps == 0:
            strategy = StrategyFactory.create_async_strategy()

        if dist_strategy.a_sync and k_steps > 0:
            strategy = StrategyFactory.create_geo_strategy(k_steps)

        if not strategy:
            raise ValueError("k_steps must be invalid value, please check")

        return strategy

    def build_compiled_startegy(self):
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy

        compiled_config = CompileTimeStrategy(self.origin_main_program,
                                              self.origin_main_program,
                                              self.async_strategy,
                                              self.role_maker)
        return compiled_config

    def _init_worker(self):
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
            SyncStrategy, GeoStrategy

        is_sync = self.compiled_strategy.is_sync_mode()
        worker = self._get_fleet_proto(is_server=False, is_sync=is_sync)
        server = self._get_fleet_proto(is_server=True, is_sync=is_sync)

        def sync_strategy_envs():
            kwargs = {}
            kwargs[
                "pserver_endpoints"] = self.role_maker._get_pserver_endpoints(
                )
            kwargs["trainer_id"] = self.role_maker._worker_index()
            return kwargs

        proto_txt = str(worker) + "\n" + str(server)

        endpoints = self.compiled_strategy.get_ps_endpoints()

        string_hosts = []
        for idx, ep in enumerate(endpoints):
            host, port = ep.split(":")
            pshost = fluid.core.PSHost(host, int(port), idx)
            string_hosts.append(pshost.serialize_to_string())

        dense_map = self.compiled_strategy.get_the_one_recv_context(
            split_dense_table=self.role_maker._is_heter_parameter_server_mode)
        send_ctx = self.compiled_strategy.get_the_one_send_context(
            split_dense_table=self.role_maker._is_heter_parameter_server_mode,
            ep_list=endpoints)
        trainer_config = self.async_strategy.get_trainer_runtime_config()

        debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))

        if debug:
            print("worker: \n{}".format(proto_txt))
            print("communicator send_ctx:")
            for key in send_ctx:
                print("{}: {}".format(key, send_ctx[key]))
            for key in dense_map:
                print("{}: {}".format(key, dense_map[key]))

        kwargs = {}
        kwargs['need_global_step'] = "0"
        kwargs["trainer_id"] = self.role_maker._role_id()
        kwargs["trainers"] = self.role_maker._worker_num()
        if self.role_maker._is_heter_worker():
            kwargs["trainer_id"] += kwargs["trainers"]

        for table in server.servers[0].tables:
            if table.table_class == "BarrierTable":
                kwargs["barrier_table_id"] = table.id
                break

        if isinstance(self.async_strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        from paddle.fluid.communicator import Communicator, HeterClient
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())
        self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
                                         string_hosts, fluid.global_scope())

        dist_strategy = self.context["valid_strategy"]

        is_test = bool(int(os.getenv("TEST_MODE", "0")))

        if self.role_maker._is_first_worker(
        ) and self.role_maker._is_heter_parameter_server_mode:
            # in ps-heter mode, load all parameters on the first worker
            init_params = self.compiled_strategy.get_the_one_recv_context(
                split_dense_table=True, use_origin_program=True)
        else:
            init_params = dense_map

        if not is_test:
            self._communicator.init_params(init_params)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")

        launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
        launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
        if launch_barrier and launch_barrier_flag:
            # trainers wait for servers to be ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # in ps-heter mode, wait for heter workers to be ready
            if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
            ):
                wait_server_ready(
                    self.role_maker._get_heter_worker_endpoints())

                self._heter_client = HeterClient(
                    self.role_maker._get_heter_worker_endpoints(),
                    self.role_maker._role_id())

    def _push_sparse_param(self,
                           var_name,
                           table_id=-1,
                           scope=fluid.global_scope()):
        self._communicator.push_sparse_param(var_name, table_id, scope)

    def _get_executor(self):
        executor = fluid.Executor(fluid.CPUPlace())
        if self.role_maker._is_heter_parameter_server_mode:
            heter_worker_device_guard = self.context[
                "valid_strategy"].a_sync_configs[
                    "heter_worker_device_guard"].upper()
            if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]:
                raise ValueError("Heter Worker Not Support Device {}".format(
                    heter_worker_device_guard))
            if self.role_maker._is_heter_worker():
                if heter_worker_device_guard == "GPU":
                    executor = Executor(
                        fluid.CUDAPlace(
                            int(os.getenv("FLAGS_selected_gpus", "0"))))
                elif heter_worker_device_guard == "XPU":
                    executor = Executor(
                        fluid.XPUPlace(
                            int(os.getenv("FLAGS_selected_xpus", "0"))))
        return executor

    def _get_fleet_proto(self, is_server, is_sync):
        def _build_merge_accessor(ctx):
            accessor = Accessor()
            accessor.accessor_class = "CommMergeAccessor"
            accessor.optimizer = None

            if ctx.is_sparse():
                accessor.feature_dim = ctx.sections()[0]
                accessor.embedding_dim = ctx.sections()[1]
            else:
                accessor.feature_dim = ctx.sections()[0]
                accessor.embedding_dim = 1

            return accessor

        def _build_barrier_table(idx):
            table = Table()
            table.id = idx
            table.type = "PS_OTHER_TABLE"
            table.table_class = "BarrierTable"
            table.shard_num = 256

            accessor = Accessor()
            accessor.accessor_class = "CommMergeAccessor"
            accessor.optimizer = None
            accessor.feature_dim = 0
            accessor.embedding_dim = 0
            table.accessor = accessor

            common = CommonAccessor()
            common.table_name = "barrier_table"
            trainer_num = self.compiled_strategy.get_trainers()
            if self.role_maker._is_heter_parameter_server_mode:
                trainer_num += len(
                    self.role_maker._get_heter_worker_endpoints())
            common.trainer_num = trainer_num
            common.attrs = ""
            common.dims = []
            common.params = []
            table.common = common
            return table

        def _build_tensor_table(idx, tensor_dict):
            table = Table()
            table.id = idx
            table.type = "PS_OTHER_TABLE"
            table.table_class = tensor_dict["tensor_table_class"]
            table.shard_num = 256

            accessor = Accessor()
            accessor.accessor_class = "CommMergeAccessor"
            accessor.optimizer = None
            accessor.feature_dim = 0
            accessor.embedding_dim = 0
            table.accessor = accessor

            common = CommonAccessor()
            common.table_name = tensor_dict["feed_var_name"]
            common.trainer_num = self.compiled_strategy.get_trainers()
            common.attrs = ""
            common.dims = []
            common.params = []
            table.common = common

            tensor = Tensor()
            tensor.main_program_id = tensor_dict["main_program_id"]
            tensor.startup_program_id = tensor_dict["startup_program_id"]
            tensor.feed_var_name = tensor_dict["feed_var_name"]
            tensor.fetch_var_name = tensor_dict["fetch_var_name"]
            tensor.tensor_table_class = tensor_dict["tensor_table_class"]
            table.tensor = tensor

            return table

        def _add_tensor_table(tables):
            tensor_table_dict = self.compiled_strategy.get_tensor_table_dict()
            program_idx = 0
            for table_name in tensor_table_dict:
                if tensor_table_dict[table_name]["startup_program"] is not None:
                    tensor_table_dict[table_name][
                        "startup_program_id"] = program_idx
                    self._server_sub_program.append(
                        tensor_table_dict[table_name]["startup_program"].desc)
                    program_idx += 1
                if tensor_table_dict[table_name]["main_program"] is not None:
                    tensor_table_dict[table_name][
                        "main_program_id"] = program_idx
                    self._server_sub_program.append(
                        tensor_table_dict[table_name]["main_program"].desc)
                    program_idx += 1
                # TODO: the table id for the lr_decay tensor table is hard-coded here
                new_table = _build_tensor_table(len(tables),
                                                tensor_table_dict[table_name])
                tables.append(new_table)
            return tables

        def _get_tables():
            send_ctx = self.compiled_strategy.get_the_one_send_context(
                use_origin_program=True,
                split_dense_table=self.role_maker.
                _is_heter_parameter_server_mode)

            tables = []
            for idx, (name, ctx) in enumerate(send_ctx.items()):
                if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1:
                    continue

                table = Table()
                table.id = ctx.table_id()
                common = CommonAccessor()

                if ctx.is_sparse():
                    table.type = "PS_SPARSE_TABLE"
                    table.shard_num = 256

                    if self.compiled_strategy.is_geo_mode():
                        table.table_class = "SparseGeoTable"
                    else:
                        table.table_class = "CommonSparseTable"

                    common.table_name = self.compiled_strategy.grad_name_to_param_name[
                        ctx.origin_varnames()[0]]
                else:
                    table.type = "PS_DENSE_TABLE"
                    table.table_class = "CommonDenseTable"
                    table.shard_num = 256
                    common.table_name = "MergedDense"

                common.parse_by_optimizer(
                    ctx.origin_varnames()[0], ctx.is_sparse(),
                    ctx.sections()[1] if ctx.is_sparse() else
                    ctx.sections()[0], self.compiled_strategy)

                if ctx.is_sparse():
                    common.parse_entry(common.table_name,
                                       self.origin_main_program)

                if is_sync:
                    common.sync = "true"
                else:
                    common.sync = "false"

                table.common = common

                accessor = _build_merge_accessor(ctx)
                table.accessor = accessor
                tables.append(table)

            tensor_table_dict = self.compiled_strategy.get_tensor_table_dict()
            if len(tensor_table_dict) > 0:
                tables = _add_tensor_table(tables)
            else:
                empty_program = Program()
                self._server_sub_program.append(empty_program.desc)

            barrier_table = _build_barrier_table(len(tables))
            tables.append(barrier_table)
            return tables

        if is_server:
            server = Server()
            downpour_server = DownpourServer()

            service = Service()
            downpour_server.set_service_param(service)

            tables = _get_tables()
            downpour_server.tables = tables
            server.add_server(downpour_server)
            return server
        else:
            worker = Worker()
            downpour_worker = DownpourWorker()

            tables = _get_tables()
            downpour_worker.tables = tables
            worker.add_worker(downpour_worker)
            return worker

    def _init_server(self, dirname=None, var_names=None, **kwargs):
        if self.role_maker._is_heter_worker():
            self._init_heter_worker()
            return
        role_id = self.compiled_strategy.get_role_id()
        endpoints = self.compiled_strategy.get_ps_endpoints()
        is_sync = self.compiled_strategy.is_sync_mode()
        trainers = self.compiled_strategy.get_trainers()

        server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
        proto_txt = str(server)

        debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
        if debug:
            print("server: \n{}".format(proto_txt))

        string_hosts = []
        for idx, ep in enumerate(endpoints):
            host, port = ep.split(":")
            pshost = fluid.core.PSHost(host, int(port), idx)
            string_hosts.append(pshost.serialize_to_string())

        self._server = fluid.core.DistFleetWrapper()
        self._server.init_server(proto_txt, string_hosts, role_id, trainers,
                                 self._server_sub_program)

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

        dist_varnames = get_sparse_tablenames(self.origin_main_program, True)
        sparse_varnames = get_sparse_tablenames(self.origin_main_program,
                                                False)

        distributed_varnames = dist_varnames + sparse_varnames

        if var_names is None:
            load_varnames = distributed_varnames
        else:
            for var_name in var_names:
                if var_name not in distributed_varnames:
                    raise ValueError(
                        "fleet.init server can only load sparse variables in {}"
                        .format(distributed_varnames))
            load_varnames = var_names

        if dirname is None or not load_varnames:
            return

        sparse_table_maps = {}
        for table in server.servers[0].tables:
            if table.type == "PS_SPARSE_TABLE" and table.common is not None:
                sparse_table_maps[table.common.table_name] = table.id

        dirname = os.path.normpath(dirname)
        pserver_id = self.role_maker._role_id()

        import time
        begin = time.time()
        for var_name in load_varnames:
            table_id = sparse_table_maps[var_name]
            path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                                "{}.block{}.txt".format(var_name, pserver_id))
            meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                                "{}.block{}.meta".format(var_name, pserver_id))
            self._server.load_sparse(path, meta, table_id)
        end = time.time()
        print("init sparse variables: {} cost time: {}".format(
            load_varnames, end - begin))

    def _run_server(self):
        if self.role_maker._is_heter_worker():
            self._run_heter_worker()
            return

        ep = self.compiled_strategy.get_ps_endpoint()
        host, port = ep.split(":")
        self._server.run_server(host, int(port))

    def _init_heter_worker(self):
        executor = self._get_executor()
        executor.run(fluid.default_startup_program())
        self._init_worker()

    def _run_heter_worker(self):
        executor = self._get_executor()
        executor.run(fluid.default_main_program())

    def _stop_worker(self):
        self._communicator.stop()
        if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
        ):
            self._heter_client.stop()
        executor = self._get_executor()
        executor.close()

    @staticmethod
    def __exclude_vars(exclude_var_names=[]):
        def is_valid(var):
            if var.name in exclude_var_names:
                return False

            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts

            origin_varname, _, _ = _get_varname_parts(var.name)
            if origin_varname.endswith("@GRAD"):
                return False

            if origin_varname == "learning_rate_0":
                return False

            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
                    var.desc.type() == core.VarDesc.VarType.READER:
                return False
            return var.persistable

        return is_valid

    def _save_sparse_params(self, executor, dirname, context, main_program,
                            mode):
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
        distributed_varnames = get_sparse_tablenames(
            self.compiled_strategy.origin_main_program, True)
        values = []
        for id, names in context.items():
            if names not in distributed_varnames:
                # only save sparse param to local
                self._worker.recv_and_save_model(id, dirname)
            # save sparse & distributed param on server
            self._worker.save_one_model(id, dirname, mode)
            values.extend(names)
        return values

    def _save_distributed_persistables(self,
                                       executor,
                                       dirname,
                                       main_program,
                                       mode=0):

        denses = self.compiled_strategy.get_the_one_recv_context(
            is_dense=True,
            split_dense_table=self.role_maker._is_heter_parameter_server_mode,
            use_origin_program=True)
        sparses = self.compiled_strategy.get_the_one_recv_context(
            is_dense=False,
            split_dense_table=self.role_maker._is_heter_parameter_server_mode,
            use_origin_program=True)

        sparse_varnames = self._save_sparse_params(executor, dirname, sparses,
                                                   main_program, mode)

        recv_dense_varnames = []
        for id, names in denses.items():
            recv_dense_varnames.extend(names)

        saved_varnames = sparse_varnames

        remaining_vars = list(
            filter(TheOnePSRuntime.__exclude_vars(saved_varnames),
                   main_program.list_vars()))

        fluid.io.save_vars(executor,
                           main_program=main_program,
                           dirname=dirname,
                           vars=remaining_vars)

    def _ps_inference_save_persistables(self,
                                        executor,
                                        dirname,
                                        main_program=None,
                                        mode=0,
                                        **kwargs):
        """
        This function filters out all variables with `persistable==True` from the
        given `main_program` and then saves these variables to the folder `dirname`
        or file `filename`.

        The `dirname` is used to specify the folder where persistable variables
        are going to be saved. If you would like to save variables in separate
        files, set `filename` to None; if you would like to save all variables in a
        single file, use `filename` to specify the file name.
        """

        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type"
            )

        if main_program is None:
            main_program = self.compiled_strategy.get_origin_ps_main_program()

        if isinstance(main_program, CompiledProgram):
            raise TypeError(
                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
            )

        # Todo(MrChengmo): Save optimizer status
        self._save_distributed_persistables(executor, dirname, main_program,
                                            mode)

    def _ps_inference_save_inference_model(self,
                                           executor,
                                           dirname,
                                           feeded_var_names,
                                           target_vars,
                                           main_program=None,
                                           export_for_deployment=True,
                                           mode=0):
        """
        Prune the given `main_program` to build a new program especially for inference,
        and then save it and all related parameters to given `dirname` by the `executor`.
        """

        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type"
            )

        if main_program is not None:
            if isinstance(main_program, CompiledProgram):
                raise TypeError(
                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
                )
            fluid.io.save_inference_model(dirname, feeded_var_names,
                                          target_vars, executor, main_program,
                                          None, None, export_for_deployment)
        else:
            fluid.io.save_inference_model(dirname, feeded_var_names,
                                          target_vars, executor,
                                          self.origin_main_program, None, None,
                                          export_for_deployment, True)
            model_basename = "__model__"
            model_filename = os.path.join(dirname, model_basename)

            with open(model_filename, "rb") as f:
                program_desc_str = f.read()

            program = Program.parse_from_string(program_desc_str)
            program._copy_dist_param_info_from(fluid.default_main_program())
            self._ps_inference_save_persistables(executor, dirname, program,
                                                 mode)

    def _save_inference_model(self, *args, **kwargs):
        self._ps_inference_save_inference_model(*args, **kwargs)

    def _save_persistables(self, *args, **kwargs):
        self._ps_inference_save_persistables(*args, **kwargs)

    def _shrink(self, threshold):
        import paddle.distributed.fleet as fleet
        fleet.util.barrier()
        if self.role_maker._is_first_worker():
            sparses = self.compiled_strategy.get_the_one_recv_context(
                is_dense=False,
                split_dense_table=self.role_maker.
                _is_heter_parameter_server_mode,
                use_origin_program=True)

            for id, names in sparses.items():
                self._worker.shrink_sparse_table(id, threshold)
        fleet.util.barrier()
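
`TheOnePSRuntime` is not called directly; it backs the `paddle.distributed.fleet` API. A hedged sketch of the user-side flow that reaches `_init_worker` and `_init_server` above (`avg_cost` is a placeholder loss; the strategy mapping follows `_get_distributed_strategy`: `a_sync=False` selects sync, `a_sync=True` async, and `a_sync=True` with `k_steps > 0` geo):

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init()

# ... build the network; avg_cost is the resulting loss variable ...

strategy = fleet.DistributedStrategy()
strategy.a_sync = True                      # async -> StrategyFactory.create_async_strategy()
optimizer = fleet.distributed_optimizer(paddle.optimizer.SGD(0.01), strategy)
optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()     # dispatches to the runtime's _init_server()
    fleet.run_server()      # dispatches to _run_server()
else:
    fleet.init_worker()     # dispatches to _init_worker() shown above
    # ... training loop ...
    fleet.stop_worker()     # dispatches to _stop_worker()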
Example #8
class ParameterServerRuntime(RuntimeBase):
    def __init__(self):
        super(ParameterServerRuntime, self).__init__()
        self._communicator = None

    def _set_basic_info(self, context):
        self.context = context
        self.role_maker = context["role_maker"]
        self.origin_main_program = context["origin_main_program"]
        self.origin_startup_program = context["origin_startup_program"]
        self.async_strategy = self._get_distributed_strategy()
        self.compiled_strategy = self.build_compiled_startegy()

    def _get_distributed_strategy(self):
        strategy = None

        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

        dist_strategy = self.context["valid_strategy"]
        k_steps = dist_strategy.a_sync_configs["k_steps"]

        if not dist_strategy.a_sync and k_steps == 0:
            strategy = StrategyFactory.create_sync_strategy()

        if dist_strategy.a_sync and k_steps == 0:
            strategy = StrategyFactory.create_async_strategy()

        if dist_strategy.a_sync and k_steps > 0:
            strategy = StrategyFactory.create_geo_strategy(k_steps)

        if not strategy:
            raise ValueError("k_steps must be invalid value, please check")

        return strategy

    def build_compiled_startegy(self):
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy

        compiled_config = CompileTimeStrategy(self.origin_main_program,
                                              self.origin_main_program,
                                              self.async_strategy,
                                              self.role_maker)
        return compiled_config

    def _load_sparse_params(self,
                            executor,
                            dirname,
                            varnames,
                            main_program=None):
        assert varnames is not None
        check_vars = []
        load_prog = Program()
        load_block = load_prog.global_block()

        def _in_varnames(var):
            return var.name in varnames

        load_vars = list(
            filter(_in_varnames,
                   fluid.default_main_program().list_vars()))
        if main_program is None:
            main_program = self.origin_main_program

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts
        for each_var in load_vars:
            assert isinstance(each_var, Variable)

            origin_varname, _, _ = _get_varname_parts(each_var.name)

            new_var = fluid.io._clone_var_in_block_(load_block, each_var)
            var_path = os.path.join(dirname, origin_varname)
            if not os.path.exists(var_path):
                raise ValueError(
                    "SelectedRows var {} can not find at {}".format(
                        new_var.name, var_path))

            if os.path.isfile(var_path):
                load_block.append_op(type='sparse_tensor_load',
                                     inputs={},
                                     outputs={'Out': [new_var]},
                                     attrs={
                                         'file_path':
                                         os.path.join(dirname, origin_varname),
                                         'node_index':
                                         self.role_maker._server_index(),
                                         'node_num':
                                         self.role_maker._server_num(),
                                         'shape':
                                         each_var.shape
                                     })
            check_vars.append(each_var)

        executor.run(load_prog)

    def _load_distributed_params(self, dirname, varnames):
        from paddle.fluid.communicator import LargeScaleKV
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts

        scale_kv = LargeScaleKV()
        for varname in varnames:
            origin_varname, _, _ = _get_varname_parts(varname)
            sparse_dir = os.path.join(dirname, origin_varname, varname)
            scale_kv.load(varname, sparse_dir)

    @staticmethod
    def __exclude_vars(exclude_var_names=[]):
        def is_valid(var):
            if var.name in exclude_var_names:
                return False

            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts

            origin_varname, _, _ = _get_varname_parts(var.name)
            if origin_varname.endswith("@GRAD"):
                return False

            if origin_varname == "learning_rate_0":
                return False

            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
                    var.desc.type() == core.VarDesc.VarType.READER:
                return False
            return var.persistable

        return is_valid

    def _init_worker(self):
        def sync_strategy_envs():
            kwargs = {}
            kwargs[
                "pserver_endpoints"] = self.role_maker._get_pserver_endpoints(
                )
            kwargs["trainer_id"] = self.role_maker._worker_index()
            return kwargs

        def geo_strategy_envs():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(self.origin_main_program,
                                                      True)
                sparse_varnames = get_sparse_tablenames(
                    self.origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self.origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self.origin_startup_program.global_block().ops:
                        if op.type in opt_init_map.keys(
                        ) and value_name == op.output("Out")[0]:
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)

            kwargs = {}
            kwargs["trainers"] = self.role_maker._worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step

        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
            SyncStrategy, GeoStrategy

        trainer_config = self.async_strategy.get_trainer_runtime_config()
        print(trainer_config)

        dist_strategy = self.context["valid_strategy"]
        launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
        if launch_barrier:
            # trainers wait for servers to be ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # in ps-heter mode, wait for heter workers to be ready
            if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
            ):
                wait_server_ready(
                    self.role_maker._get_heter_worker_endpoints())

        lrs = _has_global_step(_get_lr_ops(self.origin_main_program))

        if lrs:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self.async_strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self.async_strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = self.compiled_strategy.get_communicator_send_context()

        if self.compiled_strategy.is_geo_mode():
            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
                recv_type=1)

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())
        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")

    def _get_executor(self):
        executor = fluid.Executor(fluid.CPUPlace())
        if self.role_maker._is_heter_parameter_server_mode:
            heter_worker_device_guard = self.context[
                "valid_strategy"].a_sync_configs[
                    "heter_worker_device_guard"].upper()
            if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]:
                raise ValueError("Heter Worker Not Support Device {}".format(
                    heter_worker_device_guard))
            if self.role_maker._is_heter_worker():
                if heter_worker_device_guard == "GPU":
                    executor = Executor(
                        fluid.CUDAPlace(
                            int(os.getenv("FLAGS_selected_gpus", "0"))))
                elif heter_worker_device_guard == "XPU":
                    executor = Executor(
                        fluid.XPUPlace(
                            int(os.getenv("FLAGS_selected_xpus", "0"))))
        return executor

    def _init_server(self, *args, **kwargs):
        if len(args) > 1:
            raise ValueError("init server can only accept 1 args: `dirname`")
        elif len(args) == 1:
            model_dirname = args[0]
        else:
            model_dirname = None

        executor = self._get_executor()
        if self.role_maker._is_heter_worker(
        ) and self.context["valid_strategy"].a_sync_configs["launch_barrier"]:
            # heter trainers wait for servers to be ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())
        executor.run(fluid.default_startup_program())

        if self.role_maker._is_heter_worker():
            self._init_worker()
            return

        sparse_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
            False)
        sparse_related_optimize_varnames = []
        for var_name in sparse_varnames:
            sparse_related_optimize_varnames += self.compiled_strategy.get_optimize_varname_on_ps(
                var_name)
        sparse_related_optimize_varnames = list(
            set(sparse_related_optimize_varnames))
        distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps(
            True)
        distributed_related_optimize_varnames = []
        for var_name in distribtued_varnames:
            distributed_related_optimize_varnames += self.compiled_strategy.get_optimize_varname_on_ps(
                var_name)
        distributed_related_optimize_varnames = list(
            set(distributed_related_optimize_varnames))

        remaining_vars = list(
            filter(
                ParameterServerRuntime.__exclude_vars(
                    sparse_varnames + distribtued_varnames +
                    sparse_related_optimize_varnames +
                    distributed_related_optimize_varnames),
                fluid.default_main_program().list_vars()))

        if not model_dirname:
            return

        if not os.path.isdir(model_dirname):
            raise ValueError("There is no directory named '%s'", model_dirname)

        # load dense
        fluid.io.load_vars(executor,
                           main_program=fluid.default_main_program(),
                           dirname=model_dirname,
                           vars=remaining_vars)

        # load sparse
        self._load_sparse_params(executor=executor,
                                 dirname=model_dirname,
                                 varnames=sparse_varnames +
                                 sparse_related_optimize_varnames)

        # load large scale
        self._load_distributed_params(dirname=model_dirname,
                                      varnames=distribtued_varnames +
                                      distributed_related_optimize_varnames)

    def _run_server(self):
        executor = self._get_executor()
        executor.run(fluid.default_main_program())

    def _stop_worker(self):
        self._communicator.stop()
        executor = self._get_executor()
        executor.close()

    def _get_optimizer_status(self, op, param_name):
        supported_opts = [
            "sgd", "adam", "adagrad", "adamax", "momentum", "lars_momentum",
            "rmsprop", "decayed_adagrad", "ftrl"
        ]

        reshaped_val_map = {}
        reshaped_val_map["sgd"] = []
        reshaped_val_map["adam"] = ["moment1_0", "moment2_0"]
        reshaped_val_map["adagrad"] = ["moment_0"]
        reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"]
        reshaped_val_map["momentum"] = ["velocity_0"]
        reshaped_val_map["lars_momentum"] = ["velocity_0"]
        reshaped_val_map["rmsprop"] = [
            "momentum_0", "mean_square_0", "mean_grad_0"
        ]
        reshaped_val_map["decayed_adagrad"] = ["moment_0"]
        reshaped_val_map["ftrl"] = ["squared_0", "linear_0"]

        orishaped_val_map = {}
        orishaped_val_map["adam"] = ["beta1_pow_acc_0", "beta2_pow_acc_0"]
        orishaped_val_map["adamax"] = ["beta1_pow_acc_0"]

        if op not in supported_opts:
            raise ValueError(
                "fleet can not support optimizer: {}, only this can be supported: {}"
                .format(op, supported_opts))

        reshaped_names = [
            param_name + "_" + val for val in reshaped_val_map[op]
        ]

        if op not in orishaped_val_map:
            origin_names = []
        else:
            origin_names = [
                param_name + "_" + val for val in orishaped_val_map[op]
            ]
        return reshaped_names, origin_names
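
    # Worked example (hypothetical parameter "fc_w" trained with adam):
    #   _get_optimizer_status("adam", "fc_w") returns
    #     reshaped_names = ["fc_w_moment1_0", "fc_w_moment2_0"]
    #     origin_names   = ["fc_w_beta1_pow_acc_0", "fc_w_beta2_pow_acc_0"]
    # i.e. the optimizer accumulators that must be saved and loaded together
    # with the parameter itself.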

    def _get_optimizer_op(self, param_name):
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops

        opts = _get_optimize_ops(self.origin_main_program)
        for op in opts:
            if "Param" in op.input_names and \
                    "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
                return op

    def _save_dense_params(self, executor, dirname, context, main_program):
        self._communicator.recv()

        prog = Program()
        block = prog.global_block()
        local_vars = []

        for name, var_ctx in context.items():
            if len(var_ctx.origin_varnames()) != 1:
                raise ValueError("Dense can not support split now.")

            varname = var_ctx.origin_varnames()[0]
            local_vars.append(varname)

            optimizer = self._get_optimizer_op(varname)
            reshaped_varnames, origin_varnames = self._get_optimizer_status(
                optimizer.type, varname)

            for var_name in [varname] + reshaped_varnames + origin_varnames:
                var = self.origin_main_program.global_block().vars[var_name]
                block.append_op(
                    type='recv_save',
                    attrs={
                        "trainer_id": self.role_maker._worker_index(),
                        "shape": var.shape,
                        "slice_shapes":
                        [",".join([str(i) for i in var.shape])],
                        "slice_varnames": [var.name],
                        "remote_varnames": [var.name],
                        "is_sparse": False,
                        "endpoints": var_ctx.split_endpoints(),
                        "file_path": os.path.join(dirname, var.name)
                    })

        executor.run(prog)
        return local_vars
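
    # Each recv_save op above pulls one unsplit dense variable (plus its
    # optimizer accumulators) back from the pserver endpoints and writes it
    # directly to dirname/<var name>; slice_shapes is just the full shape
    # because dense variables are not split here.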

    def _save_sparse_params(self, executor, dirname, context, main_program):
        prog = Program()
        block = prog.global_block()
        local_vars = []

        for name, var_ctx in context.items():
            if len(var_ctx.origin_varnames()) != 1:
                raise ValueError("Dense can not support split now.")

            varname = var_ctx.origin_varnames()[0]
            local_vars.append(varname)

            optimizer = self._get_optimizer_op(varname)
            reshaped_varnames, origin_varnames = self._get_optimizer_status(
                optimizer.type, varname)

            var = self.origin_main_program.global_block().vars[varname]
            slice_shapes = []
            dims1 = ",".join([str(i) for i in var.shape[1:]])

            for section in var_ctx.sections():
                slice_shapes.append(str(section) + "," + dims1)

            block.append_op(
                type='recv_save',
                attrs={
                    "trainer_id": self.role_maker._worker_index(),
                    "shape": var.shape,
                    "slice_shapes": slice_shapes,
                    "slice_varnames": var_ctx.split_varnames(),
                    "remote_varnames": var_ctx.split_varnames(),
                    "is_sparse": True,
                    "endpoints": var_ctx.split_endpoints(),
                    "pserver_num":
                    len(self.role_maker._get_pserver_endpoints()),
                    "file_path": os.path.join(dirname, var.name)
                })

            for reshaped_varname in reshaped_varnames:
                var = self.origin_main_program.global_block(
                ).vars[reshaped_varname]

                slice_varnames = []
                remote_varnames = []
                for i in range(len(var_ctx.split_varnames())):
                    slice_varnames.append("{}.block{}".format(
                        reshaped_varname, i))
                    remote_varnames.append(reshaped_varname)

                block.append_op(
                    type='recv_save',
                    attrs={
                        "trainer_id": self.role_maker._worker_index(),
                        "shape": var.shape,
                        "slice_shapes": slice_shapes,
                        "slice_varnames": slice_varnames,
                        "remote_varnames": remote_varnames,
                        "is_sparse": True,
                        "endpoints": var_ctx.split_endpoints(),
                        "pserver_num":
                        len(self.role_maker._get_pserver_endpoints()),
                        "file_path": os.path.join(dirname, var.name)
                    })

            for origin_varname in origin_varnames:
                var = self.origin_main_program.global_block(
                ).vars[origin_varname]

                block.append_op(
                    type='recv_save',
                    attrs={
                        "trainer_id": self.role_maker._worker_index(),
                        "shape": var.shape,
                        "slice_shapes":
                        [",".join([str(i) for i in var.shape])],
                        "slice_varnames": [origin_varname],
                        "remote_varnames": [origin_varname],
                        "is_sparse": False,
                        "endpoints": var_ctx.split_endpoints()[:1],
                        "file_path": os.path.join(dirname, var.name)
                    })
        executor.run(prog)
        return context.keys()
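
    # slice_shapes format above: a sparse table of shape (N, d1, ...) split
    # into sections [s0, s1, ...] across the pservers yields
    # ["s0,d1,...", "s1,d1,..."]; e.g. a (10000, 8) table split 5000/5000
    # gives ["5000,8", "5000,8"].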

    def _save_distributed_params(self, executor, dirname, context, mode):
        prog = Program()
        block = prog.global_block()

        for name, var_ctx in context.items():
            block.append_op(type='checkpoint_notify',
                            attrs={
                                "varname": name,
                                "mode": mode,
                                "slice_varnames": var_ctx.split_varnames(),
                                "remote_varnames": var_ctx.split_varnames(),
                                "endpoints": var_ctx.split_endpoints(),
                                "dirname": dirname
                            })

        executor.run(prog)
        return context.keys()
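
    # checkpoint_notify asks each pserver that holds a slice of the variable
    # to persist its own shard under dirname, so no tensor data travels back
    # to the trainer; `mode` is forwarded from save_persistables.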

    def _save_distributed_persistables(self, executor, dirname, main_program,
                                       mode):
        dense_ctx = self.compiled_strategy.get_communicator_recv_context(
            recv_type=1, use_origin_program=True)

        sparse_ctx = self.compiled_strategy.get_communicator_recv_context(
            recv_type=2, use_origin_program=True)

        distributed_ctx = self.compiled_strategy.get_communicator_recv_context(
            recv_type=3, use_origin_program=True)

        recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                      dense_ctx, main_program)

        recv_sparse_varnames = self._save_sparse_params(
            executor, dirname, sparse_ctx, main_program)

        recv_distributed_varnames = self._save_distributed_params(
            executor, dirname, distributed_ctx, mode)

        saved_varnames = recv_dense_varnames + list(
            recv_sparse_varnames) + list(recv_distributed_varnames)

        remaining_vars = list(
            filter(ParameterServerRuntime.__exclude_vars(saved_varnames),
                   main_program.list_vars()))

        fluid.io.save_vars(executor,
                           main_program=main_program,
                           dirname=dirname,
                           vars=remaining_vars)

    def _ps_inference_save_persistables(self,
                                        executor,
                                        dirname,
                                        main_program=None,
                                        mode=0,
                                        **kwargs):
        """
        This function filters out all variables with `persistable==True` from the
        given `main_program` and then saves these variables to the folder `dirname`
        or file `filename`.

        The `dirname` is used to specify the folder where persistable variables
        are going to be saved. If you would like to save variables in separate
        files, set `filename` None; if you would like to save all variables in a
        single file, use `filename` to specify the file name.
        """

        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type"
            )

        if main_program is None:
            main_program = self.compiled_strategy.get_origin_ps_main_program()

        if isinstance(main_program, CompiledProgram):
            raise TypeError(
                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
            )

        self._save_distributed_persistables(executor, dirname, main_program,
                                            mode)

    def _ps_inference_save_inference_model(self,
                                           executor,
                                           dirname,
                                           feeded_var_names,
                                           target_vars,
                                           main_program=None,
                                           export_for_deployment=True):
        """
        Prune the given `main_program` to build a new program especially for inference,
        and then save it and all related parameters to given `dirname` by the `executor`.
        """

        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type"
            )

        if main_program is not None:
            if isinstance(main_program, CompiledProgram):
                raise TypeError(
                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
                )
            fluid.io.save_inference_model(dirname, feeded_var_names,
                                          target_vars, executor, main_program,
                                          None, None, export_for_deployment)
        else:
            fluid.io.save_inference_model(dirname, feeded_var_names,
                                          target_vars, executor,
                                          self.origin_main_program, None, None,
                                          export_for_deployment, True)

            model_basename = "__model__"
            model_filename = os.path.join(dirname, model_basename)

            with open(model_filename, "rb") as f:
                program_desc_str = f.read()

            program = Program.parse_from_string(program_desc_str)
            program._copy_dist_param_info_from(fluid.default_main_program())
            self._ps_inference_save_persistables(executor,
                                                 dirname,
                                                 program,
                                                 mode=0)
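
        # When main_program is None, the pruned inference program is saved
        # first and then re-parsed from the written __model__ file, so that
        # save_persistables operates on the same pruned variable set with the
        # distributed parameter info copied in.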

    def _save_inference_model(self, *args, **kwargs):
        self._ps_inference_save_inference_model(*args, **kwargs)

    def _save_persistables(self, *args, **kwargs):
        self._ps_inference_save_persistables(*args, **kwargs)
Example #9
    def _init_worker(self):
        def sync_strategy_envs():
            kwargs = {}
            kwargs["pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
            kwargs["trainer_id"] = self.role_maker._worker_index()
            return kwargs

        def geo_strategy_envs():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(self.origin_main_program,
                                                      True)
                sparse_varnames = get_sparse_tablenames(
                    self.origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self.origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self.origin_startup_program.global_block().ops:
                        if op.type in opt_init_map.keys(
                        ) and value_name == op.output("Out")[0]:
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)

            kwargs = {}
            kwargs["trainers"] = self.role_maker._worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs
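
        # The "sparse_attrs" string built above serializes one entry per
        # sparse table, entries joined by "#", each of the form
        #   "<name>:<dim0,dim1,...>:<init_op>&<attr>&..."
        # e.g., assuming a (10000, 8) table named "emb" initialized by
        # uniform_random(seed=0, min=-0.5, max=0.5):
        #   "emb:10000,8:uniform_random&0&-0.5&0.5"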

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step

        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
            SyncStrategy, GeoStrategy

        trainer_config = self.async_strategy.get_trainer_runtime_config()
        print(trainer_config)

        dist_strategy = self.context["valid_strategy"]
        launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
        if launch_barrier:
            # trainers wait until all parameter servers are ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # in ps-heter mode, also wait until heter workers are ready
            if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
            ):
                wait_server_ready(
                    self.role_maker._get_heter_worker_endpoints())

        lrs = _has_global_step(_get_lr_ops(self.origin_main_program))

        if lrs:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self.async_strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self.async_strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = self.compiled_strategy.get_communicator_send_context()

        if self.compiled_strategy.is_geo_mode():
            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
                recv_type=1)
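
        # recv_type selects which parameter class the Communicator pulls back
        # from the pservers; judging from the save helpers earlier in this
        # listing: 1 = dense, 2 = sparse, 3 = distributed (large-scale), and
        # 4 is the geo-mode recv context.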

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())
        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")
Пример #10
0
    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        # if MPISymetricRoleMaker is defined
        # we assume the user wants to submit the job on an MPI cluster
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            from paddle.fluid.transpiler.details.checkport import wait_server_ready
            wait_server_ready(fleet.server_endpoints(to_string=False))

        program_config = self._transpile_config.get_program_config()
        trainer_communicator_config = self._transpile_config.get_trainer_runtime_config()

        print(trainer_communicator_config)

        if isinstance(self._transpile_config, GeoStrategy):
            kwargs = {}
            kwargs["push_vars"] = self.vars_info
            kwargs["trainers"] = fleet.worker_num()
            kwargs["push_nums"] = self._transpile_config.get_program_config(
            ).geo_sgd_need_push_nums

            self._communicator = Communicator(
                self.main_program, DistributedMode.GEO, kwargs,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, AsyncStrategy):
            self._communicator = Communicator(
                self.main_program, DistributedMode.ASYNC, None,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, HalfAsyncStrategy):
            self._communicator = Communicator(
                self.main_program, DistributedMode.HALF_ASYNC, None,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, SyncStrategy):
            kwargs = {}
            kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()

            self._communicator = Communicator(
                self.main_program, DistributedMode.SYNC, kwargs,
                trainer_communicator_config.get_communicator_flags())

        else:
            raise TypeError("Training MODE do not supported")

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")
Example #11
    def _init_transpiler_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        def sync_strategy_envs():
            kwargs = {}
            kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()
            return kwargs

        def geo_strategy_envs():
            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(
                    self._origin_main_program, True)
                sparse_varnames = get_sparse_tablenames(
                    self._origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self._origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self._origin_startup_program.global_block().ops:
                        if op.type in opt_init_map.keys(
                        ) and value_name == op.output("Out")[0]:
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)

            kwargs = {}
            kwargs["trainers"] = self.worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs

        # if MPISymetricRoleMaker is defined
        # we assume the user wants to submit the job on an MPI cluster

        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            wait_server_ready(self.server_endpoints(to_string=False))

        trainer_config = self._strategy.get_trainer_runtime_config()

        print(trainer_config)

        lrs = _has_global_step(_get_lr_ops(self._origin_main_program))

        if lrs > 0:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self._strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self._strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = fleet.compiled_config.get_communicator_send_context()

        if self.compiled_config.is_geo_mode():
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=1)

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())

        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            raise ValueError(
                "Communicator can only be initialized once.")
Example #12
class FleetTranspiler(Fleet):
    """
    A subclass for compatibility with fluid.transpiler.DistributeTranspiler.
    """
    def __init__(self):
        super(FleetTranspiler, self).__init__(Mode.TRANSPILER)

        self._inner_mode = None

        if version.is_transpiler():
            self._inner_mode = PSMode.TRANSPILER
        else:
            self._inner_mode = PSMode.PSLIB

        self._strategy = None
        self._transpiler = None
        self._origin_main_program = None
        self._origin_startup_program = None
        self._communicator = None
        self.startup_program = None
        self.main_program = None

        self._opt_info = None
        self._local_ip = 0
        self._fleet_ptr = None
        self._main_programs = []
        self._scopes = []
        self._client2client_request_timeout_ms = 500000
        self._client2client_connect_timeout_ms = 10000
        self._client2client_max_retry = 3

    def init(self, role_maker=None):
        if role_maker is None:
            role_maker = MPISymetricRoleMaker()
        super(FleetTranspiler, self).init(role_maker)
        self._fleet_ptr = core.Fleet()

    def _init_transpiler_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        def sync_strategy_envs():
            kwargs = {}
            kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()
            return kwargs

        def geo_strategy_envs():
            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(
                    self._origin_main_program, True)
                sparse_varnames = get_sparse_tablenames(
                    self._origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self._origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self._origin_startup_program.global_block().ops:
                        if op.type in opt_init_map.keys(
                        ) and value_name == op.output("Out")[0]:
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)

            kwargs = {}
            kwargs["trainers"] = self.worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs

        # if MPISymetricRoleMaker is defined
        # we assume the user wants to submit the job on an MPI cluster

        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            wait_server_ready(self.server_endpoints(to_string=False))

        trainer_config = self._strategy.get_trainer_runtime_config()

        print(trainer_config)

        lrs = _has_global_step(_get_lr_ops(self._origin_main_program))

        if lrs > 0:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self._strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self._strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = fleet.compiled_config.get_communicator_send_context()

        if self.compiled_config.is_geo_mode():
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=1)

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())

        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            raise ValueError(
                "Communicator can only be initialized once.")

    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        if self._inner_mode == PSMode.TRANSPILER:
            self._init_transpiler_worker()
        else:
            raise NotImplementedError("add implement later")

    def _init_transpiler_server(self, model_dir=None):
        if not self.startup_program:
            raise ValueError(
                "startup_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.startup_program)

        if model_dir:
            if not os.path.isdir(model_dir):
                raise ValueError("There is no directory named '%s'", model_dir)

            sparse_varnames = self.compiled_config.get_sparse_varname_on_ps(
                False)
            distributed_varnames = self.compiled_config.get_sparse_varname_on_ps(
                True)

            remaining_vars = list(
                filter(
                    FleetTranspiler.__exclude_vars(sparse_varnames +
                                                   distributed_varnames),
                    self.main_program.list_vars()))

            fluid.io.load_vars(self._executor,
                               main_program=self.main_program,
                               dirname=model_dir,
                               vars=remaining_vars)

            self._load_sparse_params(dirname=model_dir,
                                     varnames=sparse_varnames)

            # todo(tangwei12) load distributed vars
            # self._load_sparse_params(dirname=model_dir, varnames=distributed_varnames)

    def init_server(self, model_dir=None, **kwargs):
        """
        `init_server` performs several steps before starting the pserver:
        first, run the executor to initialize the startup program;
        second, if `model_dir` is not empty, load parameters from it for incremental training.

        Args:
            model_dir(str): The directory path.

        Returns:
            None
        """

        if self._inner_mode == PSMode.TRANSPILER:
            self._init_transpiler_server(model_dir)
        else:
            raise NotImplementedError("add implement later")

    def run_server(self):
        """
        `run_server` runs the executor to start the pserver main program.

        Returns:
            None
        """

        if self._inner_mode == PSMode.TRANSPILER:
            if not self.main_program:
                raise ValueError(
                    "main_program is None, need invoke DistributedOptimizer.minimize first"
                )

            self._executor.run(self.main_program)
        else:
            raise NotImplementedError("add implement later")

    def stop_worker(self):
        """
        Close this executor.

        For the distributed training, this method would free the resource on PServers related to
        the current Trainer.

        Returns:
            None
        """

        if self._inner_mode == PSMode.TRANSPILER:
            self._communicator.stop()
            if isinstance(self._role_maker, MPISymetricRoleMaker):
                self._role_maker._finalize()
            self._executor.close()
        else:
            raise NotImplementedError("add implement later")

    def distributed_optimizer(self, optimizer, strategy=None):
        """
        Optimizer for distributed training.

        For the distributed training, this method would rebuild a new instance of DistributedOptimizer.
        Which has basic Optimizer function and special features for distributed training.

        Args:
            optimizer(Optimizer): The optimizer to be used for distributed training.
            strategy(DistributeTranspilerConfig): Extra properties for distributed optimizer.

        Returns:
            ParameterServerOptimizer: subclass of DistributedOptimizer.
        """

        if not isinstance(optimizer, Optimizer):
            raise ValueError("optimizer must be an instance of Optimizer")
        if not self._is_initialized:
            raise ValueError(
                "fleet.init(role) to initialize before optimizer.minimize(loss)"
            )

        if not strategy:
            _strategy = StrategyFactory.create_async_strategy()
        elif isinstance(strategy, DistributedStrategy):
            _strategy = strategy
        elif isinstance(strategy, DistributeTranspilerConfig):
            if strategy.sync_mode:
                _strategy = SyncStrategy()
            else:
                if strategy.runtime_split_send_recv:
                    if strategy.geo_sgd_mode:
                        _strategy = GeoStrategy(
                            strategy.geo_sgd_need_push_nums)
                    elif strategy.half_async:
                        _strategy = HalfAsyncStrategy()
                    else:
                        _strategy = AsyncStrategy()
                else:
                    _strategy = HalfAsyncStrategy()
                    # for half_async compatibility
                    strategy.half_async = True
                    strategy.runtime_split_send_recv = True
            _strategy.set_program_config(strategy)
        elif isinstance(strategy, dict):
            if self._inner_mode != PSMode.PSLIB:
                raise TypeError("Dict strategy can only be used at PSLIB Mode")

            _strategy = StrategyFactory.create_async_strategy()
            _strategy.set_pslib_runtime_config(strategy)
        else:
            raise TypeError(
                "strategy must be an instance of DistributeTranspilerConfig, DistributedStrategy"
            )

        self._strategy = _strategy
        self._optimizer = ParameterServerOptimizer(optimizer, _strategy)
        return self._optimizer
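
    # Strategy mapping implemented above for a legacy DistributeTranspilerConfig:
    #   sync_mode=True                          -> SyncStrategy
    #   runtime_split_send_recv + geo_sgd_mode  -> GeoStrategy(geo_sgd_need_push_nums)
    #   runtime_split_send_recv + half_async    -> HalfAsyncStrategy
    #   runtime_split_send_recv (otherwise)     -> AsyncStrategy
    #   neither flag set                        -> HalfAsyncStrategy (config flags are
    #                                              rewritten for compatibility)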

    def save_inference_model(self,
                             executor,
                             dirname,
                             feeded_var_names,
                             target_vars,
                             main_program=None,
                             export_for_deployment=True):
        """
        Prune the given `main_program` to build a new program especially for inference,
        and then save it and all related parameters to given `dirname` by the `executor`.
        """

        if self._inner_mode == PSMode.PSLIB:
            raise NotImplementedError("add implement later")

        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type"
            )

        # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save
        if not isinstance(executor.place, fluid.CPUPlace):
            save_executor = Executor(fluid.CPUPlace())
        else:
            save_executor = executor

        if main_program is not None:
            if isinstance(main_program, CompiledProgram):
                raise TypeError(
                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
                )
            fluid.io.save_inference_model(dirname, feeded_var_names,
                                          target_vars, save_executor,
                                          main_program, None, None,
                                          export_for_deployment)
        else:
            fluid.io.save_inference_model(dirname, feeded_var_names,
                                          target_vars, save_executor,
                                          self._origin_main_program, None,
                                          None, export_for_deployment, True)

            model_basename = "__model__"
            model_filename = os.path.join(dirname, model_basename)

            with open(model_filename, "rb") as f:
                program_desc_str = f.read()

            program = Program.parse_from_string(program_desc_str)
            program._copy_dist_param_info_from(self.main_program)
            self.save_persistables(executor, dirname, program)

    def _load_sparse_params(self, dirname, varnames):
        from paddle.fluid.communicator import LargeScaleKV
        scale_kv = LargeScaleKV()
        for varname in varnames:
            origin_varname, _, _ = public._get_varname_parts(varname)
            sparse_dir = os.path.join(dirname, origin_varname, varname)
            scale_kv.load(varname, sparse_dir)

    def _get_optimizer_status(self, op, param_name):
        supported_opts = [
            "sgd", "adam", "adagrad", "adamax", "momentum", "lars_momentum",
            "rmsprop", "decayed_adagrad", "ftrl"
        ]

        reshaped_val_map = {}
        reshaped_val_map["sgd"] = []
        reshaped_val_map["adam"] = ["moment1_0", "moment2_0"]
        reshaped_val_map["adagrad"] = ["moment_0"]
        reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"]
        reshaped_val_map["momentum"] = ["velocity_0"]
        reshaped_val_map["lars_momentum"] = ["velocity_0"]
        reshaped_val_map["rmsprop"] = [
            "momentum_0", "mean_square_0", "mean_grad_0"
        ]
        reshaped_val_map["decayed_adagrad"] = ["moment_0"]
        reshaped_val_map["ftrl"] = ["squared_0", "linear_0"]

        orishaped_val_map = {}
        orishaped_val_map["adam"] = ["beta1_pow_acc_0", "beta2_pow_acc_0"]
        orishaped_val_map["adamax"] = ["beta1_pow_acc_0"]

        if op not in supported_opts:
            raise ValueError(
                "fleet does not support optimizer: {}; supported optimizers are: {}".format(
                    op, supported_opts))

        reshaped_names = [
            param_name + "_" + val for val in reshaped_val_map[op]
        ]

        if op not in orishaped_val_map:
            origin_names = []
        else:
            origin_names = [
                param_name + "_" + val for val in orishaped_val_map[op]
            ]
        return reshaped_names, origin_names

    def _get_optimizer_op(self, param_name):
        opts = public._get_optimize_ops(self._origin_main_program)
        for op in opts:
            if "Param" in op.input_names and \
                    "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
                return op

    def _save_dense_params(self, executor, dirname, context, main_program):
        self._communicator.recv()

        prog = Program()
        block = prog.global_block()
        local_vars = []

        for name, var_ctx in context.items():
            if len(var_ctx.origin_varnames()) != 1:
                raise ValueError("Dense can not support split now.")

            varname = var_ctx.origin_varnames()[0]
            local_vars.append(varname)

            optimizer = self._get_optimizer_op(varname)
            reshaped_varnames, origin_varnames = self._get_optimizer_status(
                optimizer.type, varname)

            for var_name in [varname] + reshaped_varnames + origin_varnames:
                var = self._origin_main_program.global_block().vars[var_name]
                block.append_op(
                    type='recv_save',
                    attrs={
                        "trainer_id": self._role_maker.worker_index(),
                        "shape": var.shape,
                        "slice_shapes":
                        [",".join([str(i) for i in var.shape])],
                        "slice_varnames": [var.name],
                        "remote_varnames": [var.name],
                        "is_sparse": False,
                        "endpoints": var_ctx.split_endpoints(),
                        "file_path": os.path.join(dirname, var.name)
                    })

        executor.run(prog)
        return local_vars

    def _save_sparse_params(self, executor, dirname, context, main_program):
        prog = Program()
        block = prog.global_block()
        local_vars = []

        for name, var_ctx in context.items():
            if len(var_ctx.origin_varnames()) != 1:
                raise ValueError("Dense can not support split now.")

            varname = var_ctx.origin_varnames()[0]
            local_vars.append(varname)

            optimizer = self._get_optimizer_op(varname)
            reshaped_varnames, origin_varnames = self._get_optimizer_status(
                optimizer.type, varname)

            var = self._origin_main_program.global_block().vars[varname]
            slice_shapes = []
            dims1 = ",".join([str(i) for i in var.shape[1:]])

            for section in var_ctx.sections():
                slice_shapes.append(str(section) + "," + dims1)

            block.append_op(
                type='recv_save',
                attrs={
                    "trainer_id": self._role_maker.worker_index(),
                    "shape": var.shape,
                    "slice_shapes": slice_shapes,
                    "slice_varnames": var_ctx.split_varnames(),
                    "remote_varnames": var_ctx.split_varnames(),
                    "is_sparse": True,
                    "endpoints": var_ctx.split_endpoints(),
                    "pserver_num":
                    len(self._role_maker.get_pserver_endpoints()),
                    "file_path": os.path.join(dirname, var.name)
                })

            for reshaped_varname in reshaped_varnames:
                var = self._origin_main_program.global_block(
                ).vars[reshaped_varname]

                slice_varnames = []
                remote_varnames = []
                for i in range(len(var_ctx.split_varnames())):
                    slice_varnames.append("{}.block{}".format(
                        reshaped_varname, i))
                    remote_varnames.append(reshaped_varname)

                block.append_op(
                    type='recv_save',
                    attrs={
                        "trainer_id": self._role_maker.worker_index(),
                        "shape": var.shape,
                        "slice_shapes": slice_shapes,
                        "slice_varnames": slice_varnames,
                        "remote_varnames": remote_varnames,
                        "is_sparse": True,
                        "endpoints": var_ctx.split_endpoints(),
                        "pserver_num":
                        len(self._role_maker.get_pserver_endpoints()),
                        "file_path": os.path.join(dirname, var.name)
                    })

            for origin_varname in origin_varnames:
                var = self._origin_main_program.global_block(
                ).vars[origin_varname]

                block.append_op(
                    type='recv_save',
                    attrs={
                        "trainer_id": self._role_maker.worker_index(),
                        "shape": var.shape,
                        "slice_shapes":
                        [",".join([str(i) for i in var.shape])],
                        "slice_varnames": [origin_varname],
                        "remote_varnames": [origin_varname],
                        "is_sparse": False,
                        "endpoints": var_ctx.split_endpoints()[:1],
                        "file_path": os.path.join(dirname, var.name)
                    })
        executor.run(prog)
        return context.keys()

    def _save_distributed_params(self, executor, dirname, context,
                                 main_program):
        prog = Program()
        block = prog.global_block()

        for name, var_ctx in context.items():
            block.append_op(type='checkpoint_notify',
                            attrs={
                                "varname": name,
                                "is_slice": True,
                                "slice_varnames": var_ctx.split_varnames(),
                                "remote_varnames": var_ctx.split_varnames(),
                                "endpoints": var_ctx.split_endpoints(),
                                "dirname": dirname
                            })

        executor.run(prog)
        return context.keys()

    def _save_distributed_persistables(self, executor, dirname, main_program):
        dense_ctx = fleet.compiled_config.get_communicator_recv_context(
            recv_type=1)

        sparse_ctx = fleet.compiled_config.get_communicator_recv_context(
            recv_type=2)

        distributed_ctx = fleet.compiled_config.get_communicator_recv_context(
            recv_type=3)

        recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                      dense_ctx, main_program)

        recv_sparse_varnames = self._save_sparse_params(
            executor, dirname, sparse_ctx, main_program)

        recv_distributed_varnames = self._save_distributed_params(
            executor, dirname, distributed_ctx, main_program)

        saved_varnames = recv_dense_varnames + list(
            recv_sparse_varnames) + list(recv_distributed_varnames)

        remaining_vars = list(
            filter(FleetTranspiler.__exclude_vars(saved_varnames),
                   main_program.list_vars()))

        fluid.io.save_vars(executor,
                           main_program=main_program,
                           dirname=dirname,
                           vars=remaining_vars)

    def save_persistables(self,
                          executor,
                          dirname,
                          main_program=None,
                          **kwargs):
        """
        This function filters out all variables with `persistable==True` from the
        given `main_program` and then saves these variables to the folder `dirname`
        or file `filename`.

        The `dirname` is used to specify the folder where persistable variables
        are going to be saved. If you would like to save variables in separate
        files, set `filename` None; if you would like to save all variables in a
        single file, use `filename` to specify the file name.
        """

        if self._inner_mode == PSMode.PSLIB:
            raise NotImplementedError("add implement later")

        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type"
            )
        # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save
        if not isinstance(executor.place, fluid.CPUPlace):
            save_executor = Executor(fluid.CPUPlace())
        else:
            save_executor = executor

        if main_program is None:
            main_program = self.main_program

        if isinstance(main_program, CompiledProgram):
            raise TypeError(
                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
            )

        self._save_distributed_persistables(save_executor, dirname,
                                            main_program)

    @staticmethod
    def __exclude_vars(exclude_var_names=[]):
        def is_valid(var):
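            # Keep only plain persistables: skip vars already saved through the
            # dense/sparse/distributed paths (exclude_var_names), gradient
            # buffers (@GRAD), the learning-rate counter, and feed/fetch/reader
            # variables.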
            if var.name in exclude_var_names:
                return False

            origin_varname, _, _ = public._get_varname_parts(var.name)
            if origin_varname.endswith("@GRAD"):
                return False

            if origin_varname == "learning_rate_0":
                return False

            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
                    var.desc.type() == core.VarDesc.VarType.READER:
                return False
            return var.persistable

        return is_valid
Example #13
class DistributedTranspiler(Fleet):
    """
    A subclass for compatibility with fluid.transpiler.DistributeTranspiler.
    """
    def __init__(self):
        super(DistributedTranspiler, self).__init__(Mode.TRANSPILER)
        self._transpile_config = None
        self._transpiler = None
        self.startup_program = None
        self.main_program = None
        self._communicator = None

    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        if not self._transpile_config.sync_mode:
            self._communicator = Communicator(self.main_program)
            self._communicator.start()

    def init_server(self, model_dir=None):
        """
        `init_server` performs several steps before starting the pserver:
        first, run the executor to initialize the startup program;
        second, if `model_dir` is not empty, load parameters from it for incremental training.

        Args:
            model_dir(str): The directory path.

        Returns:
            None
        """
        if not self.startup_program:
            raise ValueError(
                "startup_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.startup_program)

        if model_dir:
            if not os.path.isdir(model_dir):
                raise ValueError("There is no directory named '%s'", model_dir)

            io.load_persistables(self._executor, model_dir,
                                 self.startup_program)

    def run_server(self):
        """
        `run_server` runs the executor to start the pserver main program.

        Returns:
            None
        """
        if not self.main_program:
            raise ValueError(
                "main_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.main_program)

    def stop_worker(self):
        """
        Close this executor.

        For the distributed training, this method would free the resource on PServers related to
        the current Trainer.

        Returns:
            None
        """
        if not self._transpile_config.sync_mode:
            self._communicator.stop()
        self._executor.close()

    def distributed_optimizer(self, optimizer, strategy=None):
        """
        Optimizer for distributed training.

        For the distributed training, this method would rebuild a new instance of DistributedOptimizer.
        Which has basic Optimizer function and special features for distributed training.

        Args:
            optimizer(Optimizer): The optimizer to be used for distributed training.
            strategy(dict): Extra properties for distributed optimizer.

        Returns:
            TranspilerOptimizer: subclass of DistributedOptimizer.
        """

        if not isinstance(optimizer, Optimizer):
            raise ValueError("optimizer must be an instance of Optimizer")
        self._optimizer = TranspilerOptimizer(optimizer, strategy)
        return self._optimizer

    def save_inference_model(self,
                             executor,
                             dirname,
                             feeded_var_names,
                             target_vars,
                             main_program=None,
                             export_for_deployment=True):
        """
        Prune the given `main_program` to build a new program especially for inference,
        and then save it and all related parameters to given `dirname` by the `executor`.
        """
        io.save_inference_model(dirname, feeded_var_names, target_vars,
                                executor, main_program, None, None,
                                export_for_deployment)

    def save_persistables(self, executor, dirname, main_program=None):
        """
        This function filters out all variables with `persistable==True` from the
        given `main_program` and then saves these variables to the folder `dirname`
        or file `filename`.

        The `dirname` is used to specify the folder where persistable variables
        are going to be saved. If you would like to save variables in separate
        files, set `filename` None; if you would like to save all variables in a
        single file, use `filename` to specify the file name.
        """
        io.save_persistables(executor, dirname, main_program, None)

    def _transpile(self, config):
        if not isinstance(config, DistributeTranspilerConfig):
            raise ValueError(
                "config must be an instance of DistributeTranspilerConfig")

        if not config.sync_mode:
            config.runtime_split_send_recv = True

        self._transpile_config = config
        self._transpiler = OriginTranspiler(config)
        self._transpiler.transpile(
            trainer_id=fleet.worker_index(),
            pservers=fleet.server_endpoints(to_string=True),
            trainers=fleet.worker_num(),
            sync_mode=config.sync_mode)

        if self.is_worker():
            self.main_program = self._transpiler.get_trainer_program()
            self.startup_program = default_startup_program()
        else:
            self.main_program, self.startup_program = \
                self._transpiler.get_pserver_programs(self.server_endpoints()[self.server_index()])
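
A condensed, hedged sketch of how this legacy DistributedTranspiler is typically driven end to end (the role maker setup, network, and avg_cost are assumed to be defined as in the earlier examples; config values are illustrative):

    config = DistributeTranspilerConfig()
    config.sync_mode = False  # async path: _transpile() turns on runtime_split_send_recv

    optimizer = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), config)
    optimizer.minimize(avg_cost)  # transpiles and fills main/startup programs

    if fleet.is_server():
        fleet.init_server()  # runs the startup program, optionally loading a model dir
        fleet.run_server()
    else:
        fleet.init_worker()  # starts the Communicator when sync_mode is off
        # ... run fleet.main_program with an executor, feeding training data ...
        fleet.stop_worker()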