Example #1
    def _build_trainer_programs(self, compiled_config):
        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
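        # NOTE: `os` and `wait_server_ready` (used near the end of this
        # method) are assumed to be imported at module scope in the
        # surrounding optimizer file.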

        _main = compiled_config.origin_main_program.clone()
        _startup = compiled_config.origin_startup_program.clone()

        if not compiled_config.is_geo_mode():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
            _add_lr_decay_table_pass(
                _main, compiled_config,
                self.user_defined_strategy.a_sync_configs["lr_decay_steps"])

            # for main program
            _main = worker.delete_optimizer_pass(_main, compiled_config)
            _main = worker.distributed_ops_pass(_main, compiled_config)
            _main = worker.append_send_ops_pass(_main, compiled_config)

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
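            # NOTE: delet_extra_optimizes_pass is spelled this way in this
            # Paddle release; the newer variant in Example #3 uses
            # delete_extra_optimizes_pass.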
            _startup = worker.delet_extra_optimizes_pass(
                _startup, compiled_config)

            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)
            # for heter program
            if self.role_maker._is_heter_parameter_server_mode:
                from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
                if self.role_maker._is_heter_worker():
                    # for heter worker
                    _main = heter_worker.split_heter_worker_ops_pass(
                        _main, compiled_config)
                else:
                    # for default worker
                    _main = heter_worker.split_trainer_ops_pass(
                        _main, compiled_config)
                # for startup change
                _startup = heter_worker.delete_startup_useless_ops_var_pass(
                    _startup, _main, compiled_config)
        else:
            _main = worker.append_send_ops_pass(_main, compiled_config)
            # geo mode leaves the startup program unchanged
            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)

        launch_barrier = self.user_defined_strategy.a_sync_configs[
            "launch_barrier"]
        launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
        if launch_barrier and launch_barrier_flag:
            # for trainer wait server ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # for ps-heter mode, wait heter worker ready
            # if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
            # ):
            #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

        return _main, _startup
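
Usage note: a minimal driver sketch for the method above, assuming the objects from Example #3 (loss, startup_program, user_defined_strategy, role_maker) already exist; opt stands in for the meta-optimizer instance that owns _build_trainer_programs and is hypothetical here.

# hedged sketch; mirrors the setup shown in Example #3
from paddle.fluid.incubate.fleet.parameter_server.ir import public

dist_strategy = get_distributed_strategy(user_defined_strategy)  # helper, as in Example #3
compiled_config = public.CompileTimeStrategy(loss.block.program, startup_program,
                                             dist_strategy, role_maker)
# opt is assumed to be the ParameterServerOptimizer-style meta-optimizer
# whose _build_trainer_programs is listed above
_main, _startup = opt._build_trainer_programs(compiled_config)
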
Example #2
    def _build_trainer_programs(self, compiled_config):
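        # NOTE: assumes the module-level fleet instance and the
        # `trainer_pass as worker` import shown in Example #1.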
        _main = fleet._origin_main_program.clone()
        _startup = fleet._origin_startup_program.clone()

        if not compiled_config.is_geo_mode():
            # for main program
            _main = worker.delete_optimizer_pass(_main, compiled_config)
            _main = worker.distributed_ops_pass(_main, compiled_config)
            _main = worker.append_send_ops_pass(_main, compiled_config)

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
            _startup = worker.init_from_server_pass(_startup, compiled_config)
            _startup = worker.delet_extra_optimizes_pass(
                _startup, compiled_config)
        else:
            _main = worker.append_send_ops_pass(_main, compiled_config)
            # geo mode leaves the startup program unchanged

        return _main, _startup
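
Note: in geo mode only append_send_ops_pass runs and the startup program is left as-is; in non-geo (sync/async) mode the optimizer ops are deleted from the trainer (they run on the parameter server), distributed lookups are rewritten, and send ops are appended. All passes share one shape, which the sketch below (an illustration, not Paddle API) makes explicit:

# a pass takes a Program plus the compile-time config and returns the
# transformed Program; pipelines are just left-to-right composition
def apply_passes(program, compiled_config, passes):
    for p in passes:
        program = p(program, compiled_config)
    return program

# hypothetical usage mirroring the non-geo branch above:
# _main = apply_passes(_main, compiled_config,
#                      [worker.delete_optimizer_pass,
#                       worker.distributed_ops_pass,
#                       worker.append_send_ops_pass])
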
Example #3
    def run_single_pass(self):
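        # NOTE: get_model, get_user_defined_strategy, get_distributed_strategy,
        # new_pass, debug_program, ps_log_root_dir and the module-level
        # `config` are assumed to come from the surrounding test utilities;
        # the bare `config` vs self.config mix follows the original source.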
        self.init_fleet_with_gloo()
        self.model = get_model(config)
        input_data = self.model.create_feeds()
        metrics = self.model.net(input_data)
        loss = self.model._cost
        user_defined_strategy = get_user_defined_strategy(config)
        learning_rate = config.get("hyper_parameters.optimizer.learning_rate")
        sync_mode = self.config.get("runner.sync_mode")
        inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
        startup_program = paddle.static.default_startup_program()
        inner_optimizer.minimize(loss, startup_program)
        if self.config['debug_new_pass'] == 1:
            print("entering run {} - new".format(
                str(config["applied_pass_name"])))
            from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
            ps_optimizer = ParameterServerOptimizer(inner_optimizer)
            ps_optimizer._set_basic_info(loss, self.role_maker,
                                         inner_optimizer,
                                         user_defined_strategy)
            ps_optimizer._set_origin_programs([loss])
            ps_optimizer._init_ps_pass_context(loss, startup_program)
            _main = ps_optimizer.pass_ctx._attrs['cloned_main']

            append_send_ops_pass = new_pass(config["applied_pass_name"],
                                            ps_optimizer.pass_ctx._attrs)
            append_send_ops_pass.apply([_main], [None], ps_optimizer.pass_ctx)
        else:
            print("entering run {} - old".format(
                str(config["applied_pass_name"])))
            from paddle.fluid.incubate.fleet.parameter_server.ir import public
            dist_strategy = get_distributed_strategy(user_defined_strategy)
            compiled_config = public.CompileTimeStrategy(
                loss.block.program, startup_program, dist_strategy,
                self.role_maker)

            _main = compiled_config.origin_main_program.clone()
            _startup = compiled_config.origin_startup_program.clone()
            from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
            _main = worker.append_send_ops_pass(_main, compiled_config)

        if fleet.is_server():
            _main_file = ps_log_root_dir + sync_mode + "_" + str(
                config["applied_pass_name"]) + '_debug:_' + str(
                    self.config['debug_new_pass']) + '_server_main.prototxt'
            debug_program(_main_file, _main)
        elif fleet.is_worker():
            _main_file = ps_log_root_dir + sync_mode + "_" + str(
                config["applied_pass_name"]) + '_debug:_' + str(
                    self.config['debug_new_pass']) + '_worker_main.prototxt'
            debug_program(_main_file, _main)
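
    # The method below is the "old"-path pass pipeline that run_single_pass
    # compares against: the CompileTimeStrategy-based rewrite of the trainer
    # programs, extended here with a use_ps_gpu branch.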
    def _build_trainer_programs(self, compiled_config):
        from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker

        _main = compiled_config.origin_main_program.clone()
        _startup = compiled_config.origin_startup_program.clone()

        use_ps_gpu = self.user_defined_strategy.a_sync_configs["use_ps_gpu"]

        if not compiled_config.is_geo_mode():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
            _add_lr_decay_table_pass(
                _main, compiled_config,
                self.user_defined_strategy.a_sync_configs["lr_decay_steps"])

            # for main program
            _main = worker.distributed_ops_pass(_main, compiled_config,
                                                use_ps_gpu)
            if not use_ps_gpu:
                _main = worker.delete_optimizer_pass(_main, compiled_config)
                _main = worker.append_send_ops_pass(_main, compiled_config)
                _startup = worker.delete_extra_optimizes_pass(
                    _startup, compiled_config)

            # for startup program
            _startup = worker.fake_init_ops_pass(_startup, compiled_config)
            if use_ps_gpu:
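                # presumably rewrites the main program for the GPU parameter
                # server, then wires up single-process multi-thread collective
                # communication across trainers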
                _main = worker.ps_gpu_pass(_main)
                from paddle.fluid.transpiler.collective import SingleProcessMultiThread
                t = SingleProcessMultiThread()
                env = self.get_dist_env()
                t.transpile(startup_program=_startup,
                            main_program=_main,
                            rank=env["trainer_id"],
                            endpoints=env["trainer_endpoints"],
                            current_endpoint=env['current_endpoint'],
                            wait_port=False)

            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)
            # for heter program
            if self.role_maker._is_heter_parameter_server_mode:
                from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
                if self.role_maker._is_heter_worker():
                    # for heter worker
                    stage_id = self.role_maker._get_stage_id()
                    device = self.role_maker._heter_device_type().lower()
                    _main = heter_worker.split_heter_worker_ops_pass(
                        _main, compiled_config, stage_id, device)
                else:
                    # for default worker
                    _main = heter_worker.split_trainer_ops_pass(
                        _main, compiled_config)
        else:
            _main = worker.append_send_ops_pass(_main, compiled_config)
            # geo mode leaves the startup program unchanged
            compiled_config.set_origin_ps_main_program(_main)
            compiled_config.set_origin_ps_startup_program(_startup)

        launch_barrier = self.user_defined_strategy.a_sync_configs[
            "launch_barrier"]
        launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
        if launch_barrier and launch_barrier_flag:
            # for trainer wait server ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # for ps-heter mode, wait heter worker ready
            # if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
            # ):
            #     wait_server_ready(self.role_maker._get_heter_worker_endpoints())

        return _main, _startup
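
Usage note: both variants gate the trainer-side wait on the launch_barrier strategy key and the FLAGS_LAUNCH_BARRIER environment variable. A hedged illustration of turning the barrier off from the strategy side:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
# "launch_barrier" is the key read by the examples above; setting it to
# False (or exporting FLAGS_LAUNCH_BARRIER=0) skips wait_server_ready
strategy.a_sync_configs = {"launch_barrier": False}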