Exemplo n.º 1
0
    def build_strategy(self, args):
        """Create the distributed strategy selected by ``args.mode``.

        Recognised modes are ``"async"``, ``"sync"``, ``"half_async"`` and
        ``"geo"`` (the latter also reads ``args.geo_sgd_need_push_nums``).
        Any other mode leaves ``self.strategy`` as ``None``.

        The ``dump_param``, ``dump_fields`` and ``dump_fields_path``
        environment variables are always read and stored on ``self``. When
        the ``Debug`` environment variable parses to a non-zero integer and a
        strategy was created, they are forwarded to the strategy through
        ``set_debug_opt``.

        Args:
            args: Object exposing ``mode`` and, for geo mode,
                ``geo_sgd_need_push_nums``.

        Returns:
            The created strategy, or ``None`` when ``args.mode`` is not one
            of the recognised modes.

        Raises:
            ValueError: If the ``Debug`` environment variable is set to
                something ``int()`` cannot parse.
        """
        self.strategy = None
        if args.mode == "async":
            self.strategy = StrategyFactory.create_async_strategy()
        elif args.mode == "sync":
            self.strategy = StrategyFactory.create_sync_strategy()
        elif args.mode == "half_async":
            self.strategy = StrategyFactory.create_half_async_strategy()
        elif args.mode == "geo":
            self.strategy = StrategyFactory.create_geo_strategy(
                args.geo_sgd_need_push_nums)

        # NOTE: an unset variable yields [""] rather than [], because
        # "".split(",") returns a list holding one empty string.
        self.dump_param = os.getenv("dump_param", "").split(",")
        self.dump_fields = os.getenv("dump_fields", "").split(",")
        self.dump_fields_path = os.getenv("dump_fields_path", "")

        debug = int(os.getenv("Debug", "0"))
        # Only forward debug options when a strategy exists. An unrecognised
        # mode leaves self.strategy as None, and calling set_debug_opt on it
        # would raise AttributeError instead of returning None as the
        # non-debug path does.
        if debug and self.strategy is not None:
            self.strategy.set_debug_opt({
                "dump_param": self.dump_param,
                "dump_fields": self.dump_fields,
                "dump_fields_path": self.dump_fields_path,
            })

        return self.strategy
Exemplo n.º 2
0
    def test_geo_strategy(self):
        """Check a geo strategy's initial config and its build-strategy setter."""
        geo = StrategyFactory.create_geo_strategy(5)

        # Program-config values expected right after creation.
        program_config = geo._program_config
        expected_program_config = (
            ("sync_mode", False),
            ("runtime_split_send_recv", True),
            ("geo_sgd_mode", True),
            ("geo_sgd_need_push_nums", 5),
        )
        for field, expected in expected_program_config:
            self.assertEqual(getattr(program_config, field), expected)
        self.assertEqual(geo._build_strategy.async_mode, True)

        # set_build_strategy accepts a fluid.BuildStrategy instance.
        from_class = fluid.BuildStrategy()
        from_class.memory_optimize = False
        geo.set_build_strategy(from_class)
        self.assertEqual(geo.get_build_strategy().memory_optimize, False)

        # set_build_strategy accepts a plain dict of options.
        options = {'memory_optimize': True}
        geo.set_build_strategy(options)
        self.assertEqual(geo.get_build_strategy().memory_optimize, True)

        # An unknown option key is rejected.
        options['unknown'] = None
        with self.assertRaises(Exception):
            geo.set_build_strategy(options)

        # So is a value that is neither a BuildStrategy nor a dict.
        with self.assertRaises(Exception):
            geo.set_build_strategy(None)
Exemplo n.º 3
0
    def test(self):
        """Minimize the network loss as server 0 under a geo strategy."""
        server_endpoints = [
            "127.0.0.1:%d" % port for port in range(36004, 36008)
        ]
        server_role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=server_endpoints)
        fleet.init(server_role)

        # Only the loss is needed; the other two outputs of net() are unused.
        loss, _, _ = self.net()

        sgd = fluid.optimizer.SGD(base_lr)
        geo_strategy = StrategyFactory.create_geo_strategy(20)
        fleet.distributed_optimizer(sgd, geo_strategy).minimize(loss)
    def test_dist_geo_server_transpiler(self):
        """Minimize a pyramid-hash network as server 0 under a geo strategy."""
        vocab_size = 128
        embedding_dim = 64
        ids_shape = [16, 10]
        ids_lod = [[3, 5, 2, 6]]  # kept from the original; not used below
        ids = fluid.data(
            name='x', shape=ids_shape, dtype='int32', lod_level=1)

        # Both parameter attributes use a learning rate of 0.
        embedding_attr = fluid.ParamAttr(
            name="PyramidHash_emb_0", learning_rate=0)
        white_list_attr = fluid.ParamAttr(name="Filter", learning_rate=0)

        hashed = fluid.contrib.layers.search_pyramid_hash(
            input=ids,
            num_emb=embedding_dim,
            space_len=vocab_size * embedding_dim,
            pyramid_layer=4,
            rand_len=16,
            drop_out_percent=0.5,
            is_training=True,
            use_filter=False,
            white_list_len=6400,
            black_list_len=2800,
            seed=3,
            lr=0.002,
            param_attr=embedding_attr,
            param_attr_wl=white_list_attr,
            param_attr_bl=None,
            distribute_update_vars=["PyramidHash_emb_0"],
            name=None)
        total_cost = fluid.layers.reduce_sum(hashed)

        server_role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
        fleet.init(server_role)

        geo_strategy = StrategyFactory.create_geo_strategy(5)
        sgd = fluid.optimizer.SGD(0.1)
        fleet.distributed_optimizer(sgd, geo_strategy).minimize(total_cost)

        # Read both programs from fleet; nothing is asserted on them here.
        pserver_startup = fleet.startup_program
        pserver_main = fleet.main_program
Exemplo n.º 5
0
    def _get_distributed_strategy(self):
        """Translate the user-defined a_sync settings into a fleet strategy.

        Mapping, where ``k_steps`` is
        ``self.user_defined_strategy.a_sync_configs["k_steps"]``:

        * ``a_sync`` falsy,  ``k_steps == 0`` -> sync strategy
        * ``a_sync`` truthy, ``k_steps == 0`` -> async strategy
        * ``a_sync`` truthy, ``k_steps > 0``  -> geo strategy built with
          ``k_steps``

        Returns:
            The strategy object created by ``StrategyFactory``.

        Raises:
            ValueError: For every other combination, i.e. a negative
                ``k_steps`` or a positive ``k_steps`` while ``a_sync`` is
                disabled.
        """
        a_sync = self.user_defined_strategy.a_sync
        k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]

        # Decide validity from the inputs themselves rather than from the
        # truthiness of the resulting strategy object, and say what is wrong.
        if not (k_steps == 0 or (a_sync and k_steps > 0)):
            raise ValueError(
                "invalid a_sync/k_steps combination (a_sync={}, k_steps={}): "
                "k_steps must be 0, or a positive value with a_sync enabled"
                .format(a_sync, k_steps))

        # Imported after validation so an invalid configuration fails before
        # the strategy module is loaded.
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

        if k_steps == 0:
            if a_sync:
                return StrategyFactory.create_async_strategy()
            return StrategyFactory.create_sync_strategy()
        return StrategyFactory.create_geo_strategy(k_steps)
Exemplo n.º 6
0
    def build_strategy(self):
        """Build the strategy named by the runtime setting and store it.

        The mode is read from the ``train.trainer.strategy`` runtime
        environment and must be one of ``async``, ``geo``, ``sync`` or
        ``half_async``. Geo mode additionally reads
        ``train.strategy.mode.push_num`` from the global environment
        (default 100).

        Returns:
            The created strategy, which is also stored on ``self.strategy``.
        """
        supported_modes = ["async", "geo", "sync", "half_async"]
        run_mode = envs.get_runtime_environ("train.trainer.strategy")
        assert run_mode in supported_modes

        chosen = None
        if run_mode == "geo":
            # push_num is looked up only when geo mode is selected.
            chosen = StrategyFactory.create_geo_strategy(
                envs.get_global_env("train.strategy.mode.push_num", 100))
        elif run_mode == "half_async":
            chosen = StrategyFactory.create_half_async_strategy()
        elif run_mode == "sync":
            chosen = StrategyFactory.create_sync_strategy()
        elif run_mode == "async":
            chosen = StrategyFactory.create_async_strategy()

        assert chosen is not None

        self.strategy = chosen
        return chosen
Exemplo n.º 7
0
    def _build_strategy(self, context):
        """Build the strategy named by the runtime setting and publish it.

        The mode is read from the ``train.trainer.strategy`` runtime
        environment and must be one of ``async``, ``geo``, ``sync`` or
        ``half_async``. Geo mode additionally reads
        ``train.strategy.mode.push_num`` from the global environment
        (default 100).

        Args:
            context: Mapping that receives the strategy under ``"strategy"``.

        Returns:
            The created strategy.
        """
        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

        supported_modes = ["async", "geo", "sync", "half_async"]
        run_mode = envs.get_runtime_environ("train.trainer.strategy")
        assert run_mode in supported_modes

        built = None
        if run_mode == "sync":
            built = StrategyFactory.create_sync_strategy()
        elif run_mode == "half_async":
            built = StrategyFactory.create_half_async_strategy()
        elif run_mode == "async":
            built = StrategyFactory.create_async_strategy()
        elif run_mode == "geo":
            # push_num is looked up only when geo mode is selected.
            geo_push_num = envs.get_global_env(
                "train.strategy.mode.push_num", 100)
            built = StrategyFactory.create_geo_strategy(geo_push_num)

        assert built is not None

        context["strategy"] = built
        return built
Exemplo n.º 8
0
    def test_geo_strategy(self):
        """Check a geo strategy's config, build-strategy setter and flags."""
        geo = StrategyFactory.create_geo_strategy(5)

        # Program-config values expected right after creation.
        program_config = geo._program_config
        expected_program_config = (
            ("sync_mode", False),
            ("runtime_split_send_recv", True),
            ("geo_sgd_mode", True),
            ("geo_sgd_need_push_nums", 5),
        )
        for field, expected in expected_program_config:
            self.assertEqual(getattr(program_config, field), expected)
        self.assertEqual(geo._build_strategy.async_mode, True)

        # set_build_strategy accepts a fluid.BuildStrategy instance.
        from_class = fluid.BuildStrategy()
        from_class.memory_optimize = False
        geo.set_build_strategy(from_class)
        self.assertEqual(geo.get_build_strategy().memory_optimize, False)

        # set_build_strategy accepts a plain dict of options.
        options = {'memory_optimize': True}
        geo.set_build_strategy(options)
        self.assertEqual(geo.get_build_strategy().memory_optimize, True)

        # An unknown option key is rejected.
        options['unknown'] = None
        with self.assertRaises(Exception):
            geo.set_build_strategy(options)

        # So is a value that is neither a BuildStrategy nor a dict.
        with self.assertRaises(Exception):
            geo.set_build_strategy(None)

        # Communicator flags exposed by the trainer runtime config.
        os.environ["CPU_NUM"] = '100'
        communicator_flags = (
            geo.get_trainer_runtime_config().get_communicator_flags())
        for present_flag in ('communicator_thread_pool_size',
                             'communicator_send_wait_times'):
            self.assertIn(present_flag, communicator_flags)
        self.assertNotIn('communicator_independent_recv_thread',
                         communicator_flags)
Exemplo n.º 9
0
    def test_pserver(self):
        """Minimize the training network as server 0 under a geo strategy."""
        server_role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
        fleet.init(server_role)

        geo_strategy = StrategyFactory.create_geo_strategy(5)

        # train_network(batch_size, is_distribute, is_sparse); only the first
        # of its four outputs is used here.
        network_outputs = train_network(128, False, True)
        avg_cost, _, _, _ = network_outputs

        sgd = fluid.optimizer.SGD(0.1)
        fleet.distributed_optimizer(sgd, geo_strategy).minimize(avg_cost)

        # Read both programs from fleet; nothing is asserted on them here.
        pserver_startup = fleet.startup_program
        pserver_main = fleet.main_program
Exemplo n.º 10
0
 def _set_strategy(self, args):
     """Configure the distributed strategy used for the run.

     The build strategy itself is configured in ``do_training``.

     When the ``PADDLE_COMPATIBILITY_CHECK`` environment variable parses to
     a non-zero integer, ``self.strategy`` is a ``DistributeTranspilerConfig``
     filled in field by field, and ``self.async_mode`` is set for the
     recognised modes. Otherwise ``self.strategy`` comes from
     ``StrategyFactory`` and the options are written to its program config;
     ``self.async_mode`` is not touched on that path.

     Keys read from ``args.run_params``: ``sync_mode`` (``"sync"``,
     ``"half_async"``, ``"async"`` or ``"geo_async"``), ``slice_var_up``,
     ``enable_dc_asgd``, ``wait_port``, ``runtime_split_send_recv``,
     ``use_hierarchical_allreduce`` and ``push_nums``.

     NOTE(review): in both branches the per-mode values for
     ``runtime_split_send_recv`` and ``geo_sgd_need_push_nums`` are later
     overwritten unconditionally from ``args.run_params`` -- confirm this
     is intended.
     """
     if int(os.getenv("PADDLE_COMPATIBILITY_CHECK", '0')):
         # Compatibility path: configure a raw transpiler config.
         self.strategy = DistributeTranspilerConfig()
         # An unrecognised sync_mode matches no branch below, leaving
         # sync_mode / self.async_mode unset by this method.
         if args.run_params["sync_mode"] == "sync":
             self.strategy.sync_mode = True
             # Overwritten further down by run_params['runtime_split_send_recv'].
             self.strategy.runtime_split_send_recv = False
             self.async_mode = False
         elif args.run_params["sync_mode"] == "half_async":
             self.strategy.sync_mode = False
             self.async_mode = False
         elif args.run_params["sync_mode"] == "async":
             self.strategy.sync_mode = False
             self.async_mode = True
         elif args.run_params["sync_mode"] == "geo_async":
             self.strategy.sync_mode = False
             self.async_mode = True
             self.strategy.geo_sgd_mode = True
             # Overwritten further down by run_params['push_nums'].
             self.strategy.geo_sgd_need_push_nums = 400
         self.strategy.mode = "pserver"
         self.strategy.slice_var_up = args.run_params['slice_var_up']
         self.strategy.enable_dc_asgd = args.run_params['enable_dc_asgd']
         # TODO: split_method=HashName causes a bug; re-enable this option
         # once that is fixed.
         # if args.run_params['split_method']:
         #    self.strategy.split_method = HashName
         # else:
         #    self.strategy.split_method = RoundRobin
         self.strategy.wait_port = args.run_params['wait_port']
         self.strategy.runtime_split_send_recv = args.run_params[
             'runtime_split_send_recv']
         self.strategy.use_hierarchical_allreduce = args.run_params[
             'use_hierarchical_allreduce']
         self.strategy.geo_sgd_need_push_nums = args.run_params['push_nums']
     else:
         # Default to a sync strategy; it stays in place when sync_mode is
         # not one of the values handled below.
         self.strategy = StrategyFactory.create_sync_strategy()
         # trainer_runtime_config = TrainerRuntimeConfig()
         # trainer_runtime_config.send_queue_size = "16"
         # trainer_runtime_config.thread_pool_size="32"
         # trainer_runtime_config.max_merge_var_num="16"
         # trainer_runtime_config.is_sgd_communicator="0"
         if args.run_params["sync_mode"] == "sync":
             self.strategy = StrategyFactory.create_sync_strategy()
         elif args.run_params["sync_mode"] == "half_async":
             self.strategy = StrategyFactory.create_half_async_strategy()
         elif args.run_params["sync_mode"] == "async":
             self.strategy = StrategyFactory.create_async_strategy()
             # Async mode additionally turns memory_optimize off on the
             # build strategy.
             build_strategy = self.strategy.get_build_strategy()
             build_strategy.memory_optimize = False
             self.strategy.set_build_strategy(build_strategy)
         elif args.run_params["sync_mode"] == "geo_async":
             # The 400 passed here is followed by an unconditional write of
             # run_params['push_nums'] to the program config below.
             self.strategy = StrategyFactory.create_geo_strategy(400)
         # Apply the remaining options to the strategy's program config.
         program_config = self.strategy.get_program_config()
         program_config.slice_var_up = args.run_params['slice_var_up']
         program_config.enable_dc_asgd = args.run_params['enable_dc_asgd']
         # TODO: split_method=HashName causes a bug; re-enable this option
         # once that is fixed.
         # if args.run_params['split_method']:
         #    program_config.split_method = HashName
         # else:
         #    program_config.split_method = RoundRobin
         program_config.wait_port = args.run_params['wait_port']
         program_config.runtime_split_send_recv = args.run_params[
             'runtime_split_send_recv']
         program_config.use_hierarchical_allreduce = args.run_params[
             'use_hierarchical_allreduce']
         program_config.geo_sgd_need_push_nums = args.run_params[
             'push_nums']