Example #1
    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        # if MPISymetricRoleMaker is defined,
        # we assume the user wants to submit the job on an MPI cluster
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # wait until all parameter servers are ready
            from paddle.fluid.transpiler.details.checkport import wait_server_ready
            wait_server_ready(fleet.server_endpoints(to_string=False))

        if not self._transpile_config.sync_mode:
            if self._transpile_config.geo_sgd_mode:
                self._communicator = Communicator(
                    self.main_program, self.vars_info, fleet.worker_num(),
                    self._transpile_config.geo_sgd_need_push_nums)
            else:
                self._communicator = Communicator(self.main_program)

            if not self._communicator.is_running():
                self._communicator.start()
            else:
                warnings.warn("communicator has been initialized, skip")
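
For context, here is a minimal driver sketch of how this transpiler-based fleet API is typically invoked around `init_worker`. The role maker choice, the commented-out optimizer setup, and `avg_cost` are illustrative assumptions, not part of the example above.

# Hedged usage sketch: where init_worker() fits in a fleet training script.
# Only the fleet entry points are assumed from the same package; the
# network/optimizer details are placeholders.
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet

role = role_maker.PaddleCloudRoleMaker()  # or MPISymetricRoleMaker on an MPI cluster
fleet.init(role)

# ... build the network to obtain a loss `avg_cost`, then wrap the optimizer:
# optimizer = fleet.distributed_optimizer(fluid.optimizer.SGD(0.01), strategy)
# optimizer.minimize(avg_cost)

if fleet.is_server():
    fleet.init_server()   # initialize parameters on this parameter server
    fleet.run_server()    # block and serve parameter requests
elif fleet.is_worker():
    fleet.init_worker()   # the method above: wait for pservers, start the communicator
    # ... run the training loop with an executor ...
    fleet.stop_worker()
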
Example #2
    def init_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        # if MPISymetricRoleMaker is defined,
        # we assume the user wants to submit the job on an MPI cluster
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # wait until all parameter servers are ready
            from paddle.fluid.transpiler.details.checkport import wait_server_ready
            wait_server_ready(fleet.server_endpoints(to_string=False))

        program_config = self._transpile_config.get_program_config()
        trainer_communicator_config = self._transpile_config.get_trainer_runtime_config()

        print(trainer_communicator_config)

        if isinstance(self._transpile_config, GeoStrategy):
            kwargs = {}
            kwargs["push_vars"] = self.vars_info
            kwargs["trainers"] = fleet.worker_num()
            kwargs["push_nums"] = program_config.geo_sgd_need_push_nums

            self._communicator = Communicator(
                self.main_program, DistributedMode.GEO, kwargs,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, AsyncStrategy):
            self._communicator = Communicator(
                self.main_program, DistributedMode.ASYNC, None,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, HalfAsyncStrategy):
            self._communicator = Communicator(
                self.main_program, DistributedMode.HALF_ASYNC, None,
                trainer_communicator_config.get_communicator_flags())

        elif isinstance(self._transpile_config, SyncStrategy):
            kwargs = {}
            kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()

            self._communicator = Communicator(
                self.main_program, DistributedMode.SYNC, kwargs,
                trainer_communicator_config.get_communicator_flags())

        else:
            raise TypeError("Training mode is not supported")

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")
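
The strategy object dispatched on above is normally built with the `StrategyFactory` helpers from the same fleet package and handed to `fleet.distributed_optimizer` before `init_worker` runs. A hedged sketch follows; the import path and factory method names are assumptions to verify against the installed Paddle version.

# Assumed imports -- check them against your Paddle release.
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import (
    StrategyFactory)

strategy = StrategyFactory.create_geo_strategy(400)       # GeoStrategy -> DistributedMode.GEO
# strategy = StrategyFactory.create_async_strategy()      # AsyncStrategy -> DistributedMode.ASYNC
# strategy = StrategyFactory.create_half_async_strategy() # HalfAsyncStrategy -> DistributedMode.HALF_ASYNC
# strategy = StrategyFactory.create_sync_strategy()       # SyncStrategy -> DistributedMode.SYNC

optimizer = fleet.distributed_optimizer(fluid.optimizer.SGD(learning_rate=0.01), strategy)
# optimizer.minimize(avg_cost)  # `avg_cost` is a placeholder loss variable
# fleet.init_worker()           # then selects the Communicator mode as shown above
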
Example #3
    def _init_transpiler_worker(self):
        """
        `init_worker` performs several steps before training:
        first, wait for all parameter servers to launch completely;
        second, run the executor to initialize the startup program;
        third, wait for all workers to finish initialization.

        Returns:
            None
        """
        def sync_strategy_envs():
            kwargs = {}
            kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()
            return kwargs

        def geo_strategy_envs():
            def get_sparse_attrs():
                opt_init_map = {
                    "gaussian_random": ["seed", "mean", "std"],
                    "fill_constant": ["value"],
                    "uniform_random": ["seed", "min", "max"],
                    "truncated_gaussian_random": ["seed", "mean", "std"],
                }

                dist_varnames = get_sparse_tablenames(
                    self._origin_main_program, True)
                sparse_varnames = get_sparse_tablenames(
                    self._origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy does not support large scale embedding yet, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self._origin_main_program.global_block().vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self._origin_startup_program.global_block().ops:
                        if op.type in opt_init_map and value_name == op.output("Out")[0]:
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)
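
            # For reference, get_sparse_attrs() packs each sparse table into a
            # "name:shape:initializer" record; for a hypothetical table
            # "emb.w_0" of shape (10000, 64) initialized by
            # uniform_random(seed=0, min=-1.0, max=1.0) the record would be
            #     emb.w_0:10000,64:uniform_random&0&-1.0&1.0
            # and records for multiple tables are joined with "#".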

            kwargs = {}
            kwargs["trainers"] = self.worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs

        # if MPISymetricRoleMaker is defined,
        # we assume the user wants to submit the job on an MPI cluster

        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # wait until all parameter servers are ready
            wait_server_ready(self.server_endpoints(to_string=False))

        trainer_config = self._strategy.get_trainer_runtime_config()

        print(trainer_config)

        lrs = _has_global_step(_get_lr_ops(self._origin_main_program))

        if lrs > 0:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self._strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self._strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = fleet.compiled_config.get_communicator_send_context()

        if self.compiled_config.is_geo_mode():
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=1)

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())

        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            raise ValueError(
                "Communicator can only be initialized once, please check")