Python HalfAsyncStrategy 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy

클래스/타입: HalfAsyncStrategy

hotexamples.com에서의 예제들: 3

Python HalfAsyncStrategy - 3개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy.HalfAsyncStrategy에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

HalfAsyncStrategy(2)

get_program_config(1)

get_server_runtime_config(1)

set_program_config(1)

예제 #1

파일 보기

파일: __init__.py 프로젝트: wuhuachaocoding/Paddle

    def distributed_optimizer(self, optimizer, strategy=None):
        """
        Optimizer for distributed training.

        For the distributed training, this method would rebuild a new instance of DistributedOptimizer.
        Which has basic Optimizer function and special features for distributed training.

        Args:
            optimizer(Optimizer): The executor to run for init server.
            strategy(DistributeTranspilerConfig): Extra properties for distributed optimizer.

        Returns:
            TranspilerOptimizer: subclass of DistributedOptimizer.
        """

        if not isinstance(optimizer, Optimizer):
            raise ValueError("optimizer must be an instance of Optimizer")
        if not self._is_initialized:
            raise ValueError(
                "fleet.init(role) to initialize before optimizer.minimize(loss)"
            )

        if not strategy:
            _strategy = StrategyFactory.create_async_strategy()

        if isinstance(strategy, DistributedStrategy):
            _strategy = strategy
        elif isinstance(strategy, DistributeTranspilerConfig):
            if strategy.sync_mode:
                _strategy = SyncStrategy()
            else:
                if strategy.runtime_split_send_recv:
                    if strategy.geo_sgd_mode:
                        _strategy = GeoStrategy(
                            strategy.geo_sgd_need_push_nums)
                    elif strategy.half_async:
                        _strategy = HalfAsyncStrategy()
                    else:
                        _strategy = AsyncStrategy()
                else:
                    _strategy = HalfAsyncStrategy()
                    # for half_async compatibility
                    strategy.half_async = True
                    strategy.runtime_split_send_recv = True
            _strategy.set_program_config(strategy)
        elif isinstance(strategy, dict):
            if self._inner_mode != PSMode.PSLIB:
                raise TypeError("Dict strategy can only be used at PSLIB Mode")

            _strategy = StrategyFactory.create_async_strategy()
            _strategy.set_pslib_runtime_config(strategy)
        else:
            raise TypeError(
                "strategy must be an instance of DistributeTranspilerConfig, DistributedStrategy"
            )

        self._strategy = _strategy
        self._optimizer = ParameterServerOptimizer(optimizer, _strategy)
        return self._optimizer

예제 #2

파일 보기

class DistributedTranspiler(Fleet):
    """
    A subclass for compatibility with fluid.transpiler.DistributeTranspiler.
    """

    def __init__(self):
        super(DistributedTranspiler, self).__init__(Mode.TRANSPILER)
        self._transpile_config = None
        self._transpiler = None
        self._origin_program = None
        self.startup_program = None
        self.main_program = None
        self._communicator = None

    def init_worker(self):
        """
        `init_worker` has many many functions to do before training,
        first, wait for all parameter servers launch completely.
        second, run executor to initialize startup program
        third, wait for all worker initialize completely.

        Returns:
            None
        """
        # if MPISymetricRoleMaker is defined
        # we suppose a user wants to submit job on mpi cluster
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            from paddle.fluid.transpiler.details.checkport import wait_server_ready
            wait_server_ready(fleet.server_endpoints(to_string=False))

        program_config = self._transpile_config.get_program_config()
        trainer_communicator_config = self._transpile_config.get_trainer_runtime_config(
        )

        print(trainer_communicator_config)

        if isinstance(self._transpile_config, GeoStrategy):
            kwargs = {}
            kwargs["push_vars"] = self.vars_info
            kwargs["trainers"] = fleet.worker_num()
            kwargs["push_nums"] = self._transpile_config.get_program_config(
            ).geo_sgd_need_push_nums

            self._communicator = Communicator(
                self.main_program, DistributedMode.GEO, kwargs,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, AsyncStrategy):
            self._communicator = Communicator(
                self.main_program, DistributedMode.ASYNC, None,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, HalfAsyncStrategy):
            self._communicator = Communicator(
                self.main_program, DistributedMode.HALF_ASYNC, None,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, SyncStrategy):
            kwargs = {}
            kwargs[
                "pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()

            self._communicator = Communicator(
                self.main_program, DistributedMode.SYNC, kwargs,
                trainer_communicator_config.get_communicator_flags())

        else:
            raise TypeError("Training MODE do not supported")

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")

    def init_server(self, model_dir=None):
        """
        `init_server` has many many functions to do before start pserver,
        first, run executor to initialize startup program,
        second, if the `model_dir` is not empty, it will load parameters from it for increment training.

        Args:
            model_dir(str): The directory path.

        Returns:
            None
        """
        if not self.startup_program:
            raise ValueError(
                "startup_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.startup_program)

        if model_dir:
            if not os.path.isdir(model_dir):
                raise ValueError("There is no directory named '%s'", model_dir)

            io.load_persistables(self._executor, model_dir, self.main_program)

    def run_server(self):
        """
        `run_server` execute executor to start pserver main program.

        Returns:
            None
        """
        if not self.main_program:
            raise ValueError(
                "main_program is None, need invoke DistributedOptimizer.minimize first"
            )

        self._executor.run(self.main_program)

    def stop_worker(self):
        """
        Close this executor.

        For the distributed training, this method would free the resource on PServers related to
        the current Trainer.

        Returns:
            None
        """

        self._communicator.stop()
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            self._role_maker._finalize()
        self._executor.close()

    def distributed_optimizer(self, optimizer, strategy=None):
        """
        Optimizer for distributed training.

        For the distributed training, this method would rebuild a new instance of DistributedOptimizer.
        Which has basic Optimizer function and special features for distributed training.

        Args:
            optimizer(Optimizer): The executor to run for init server.
            strategy(DistributeTranspilerConfig): Extra properties for distributed optimizer.

        Returns:
            TranspilerOptimizer: subclass of DistributedOptimizer.
        """

        if not isinstance(optimizer, Optimizer):
            raise ValueError("optimizer must be an instance of Optimizer")
        if not fleet._is_initialized:
            raise ValueError(
                "use fleet.init(role) to initialize the role of current node before optimizer.minimize(loss)"
            )
        self._optimizer = TranspilerOptimizer(optimizer, strategy)
        return self._optimizer

    def save_inference_model(self,
                             executor,
                             dirname,
                             feeded_var_names,
                             target_vars,
                             main_program=None,
                             export_for_deployment=True):
        """
        Prune the given `main_program` to build a new program especially for inference,
        and then save it and all related parameters to given `dirname` by the `executor`.
        """
        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_inference_model() function, executor must be as Executor type"
            )

        if main_program is not None:
            if isinstance(main_program, CompiledProgram):
                raise TypeError(
                    "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed"
                )
            io.save_inference_model(dirname, feeded_var_names, target_vars,
                                    executor, main_program, None, None,
                                    export_for_deployment)
        else:
            io.save_inference_model(dirname, feeded_var_names, target_vars,
                                    executor, self._origin_program, None, None,
                                    export_for_deployment, True)

            model_basename = "__model__"
            model_filename = os.path.join(dirname, model_basename)

            with open(model_filename, "rb") as f:
                program_desc_str = f.read()

            program = Program.parse_from_string(program_desc_str)
            program._copy_dist_param_info_from(self.main_program)
            self.save_persistables(executor, dirname, program)

    def save_persistables(self, executor, dirname, main_program=None):
        """
        This function filters out all variables with `persistable==True` from the
        give `main_program` and then saves these variables to the folder `dirname`
        or file `filename`.

        The `dirname` is used to specify the folder where persistable variables
        are going to be saved. If you would like to save variables in separate
        files, set `filename` None; if you would like to save all variables in a
        single file, use `filename` to specify the file name.
        """
        if isinstance(executor, ParallelExecutor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed"
            )

        if not isinstance(executor, Executor):
            raise TypeError(
                "in fleet.save_persistables() function, executor must be as Executor type"
            )

        if main_program is None:
            main_program = self.main_program

        if isinstance(main_program, CompiledProgram):
            raise TypeError(
                "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed"
            )

        if not main_program._is_distributed:
            raise ValueError(
                "main_program is for local, may not use fleet.save_persistables")

        io.save_persistables(executor, dirname, main_program, None)

    def _transpile(self, config):
        if isinstance(config, DistributedStrategy):
            self._transpile_config = config
        elif isinstance(config, DistributeTranspilerConfig):
            if config.sync_mode:
                self._transpile_config = SyncStrategy()
            else:
                if config.runtime_split_send_recv:
                    if config.geo_sgd_mode:
                        self._transpile_config = GeoStrategy(
                            config.geo_sgd_need_push_nums)
                    elif config.half_async:
                        self._transpile_config = HalfAsyncStrategy()
                    else:
                        self._transpile_config = AsyncStrategy()

                else:
                    self._transpile_config = HalfAsyncStrategy()
                    # for half_async compatibility
                    config.half_async = True
                    config.runtime_split_send_recv = True
            self._transpile_config.set_program_config(config)
        else:
            raise TypeError(
                "config must be an instance of DistributeTranspilerConfig, SyncStrategy, HalfAsyncStrategy, AsyncStrategy or GeoStratey."
            )

        program_config = self._transpile_config.get_program_config()

        # _origin_program is a deep copy for default_main_program, for inference
        self._origin_program = default_main_program().clone(for_test=False)

        if program_config.geo_sgd_mode:
            from paddle.fluid.transpiler.geo_sgd_transpiler import GeoSgdTranspiler
            self._transpiler = GeoSgdTranspiler(program_config)
        else:
            self._transpiler = OriginTranspiler(program_config)
        self._transpiler._set_server_config(
            self._transpile_config.get_server_runtime_config())

        if self.is_worker():
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
                pservers=fleet.server_endpoints(to_string=True),
                trainers=fleet.worker_num(),
                sync_mode=program_config.sync_mode)

            if isinstance(self._role_maker, MPISymetricRoleMaker):
                program_config.wait_port = False
                self._transpile_config.set_program_config(program_config)

            self.main_program = self._transpiler.get_trainer_program(
                wait_port=program_config.wait_port)
            self.startup_program = default_startup_program()
            if program_config.geo_sgd_mode:
                self.vars_info = self._transpiler._get_vars_info()
                self.startup_program = self._transpiler.trainer_startup_program
        else:
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
                pservers=fleet.server_endpoints(to_string=True),
                trainers=fleet.worker_num(),
                sync_mode=program_config.sync_mode,
                current_endpoint=self.server_endpoints()[self.server_index()])
            self.main_program, self.startup_program = \
                self._transpiler.get_pserver_programs(
                    self.server_endpoints()[self.server_index()])

    def _set_opt_info(self, opt_info):
        """
        this function saves the result from DistributedOptimizer.minimize()
        """
        self._opt_info = opt_info

예제 #3

파일 보기

    def _transpile(self, config):
        if isinstance(config, DistributedStrategy):
            self._transpile_config = config
        elif isinstance(config, DistributeTranspilerConfig):
            if config.sync_mode:
                self._transpile_config = SyncStrategy()
            else:
                if config.runtime_split_send_recv:
                    if config.geo_sgd_mode:
                        self._transpile_config = GeoStrategy(
                            config.geo_sgd_need_push_nums)
                    elif config.half_async:
                        self._transpile_config = HalfAsyncStrategy()
                    else:
                        self._transpile_config = AsyncStrategy()

                else:
                    self._transpile_config = HalfAsyncStrategy()
                    # for half_async compatibility
                    config.half_async = True
                    config.runtime_split_send_recv = True
            self._transpile_config.set_program_config(config)
        else:
            raise TypeError(
                "config must be an instance of DistributeTranspilerConfig, SyncStrategy, HalfAsyncStrategy, AsyncStrategy or GeoStratey."
            )

        program_config = self._transpile_config.get_program_config()

        # _origin_program is a deep copy for default_main_program, for inference
        self._origin_program = default_main_program().clone(for_test=False)

        if program_config.geo_sgd_mode:
            from paddle.fluid.transpiler.geo_sgd_transpiler import GeoSgdTranspiler
            self._transpiler = GeoSgdTranspiler(program_config)
        else:
            self._transpiler = OriginTranspiler(program_config)
        self._transpiler._set_server_config(
            self._transpile_config.get_server_runtime_config())

        if self.is_worker():
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
                pservers=fleet.server_endpoints(to_string=True),
                trainers=fleet.worker_num(),
                sync_mode=program_config.sync_mode)

            if isinstance(self._role_maker, MPISymetricRoleMaker):
                program_config.wait_port = False
                self._transpile_config.set_program_config(program_config)

            self.main_program = self._transpiler.get_trainer_program(
                wait_port=program_config.wait_port)
            self.startup_program = default_startup_program()
            if program_config.geo_sgd_mode:
                self.vars_info = self._transpiler._get_vars_info()
                self.startup_program = self._transpiler.trainer_startup_program
        else:
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
                pservers=fleet.server_endpoints(to_string=True),
                trainers=fleet.worker_num(),
                sync_mode=program_config.sync_mode,
                current_endpoint=self.server_endpoints()[self.server_index()])
            self.main_program, self.startup_program = \
                self._transpiler.get_pserver_programs(
                    self.server_endpoints()[self.server_index()])