Example #1
    def test_pipeline_optimizer(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.pipeline = True
        strategy.pipeline_configs = {
            'micro_batch_size': 1,
            'accumulate_steps': 2
        }
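        # Each pipeline stage accumulates gradients over accumulate_steps
        # micro-batches of size micro_batch_size, i.e. an effective batch
        # of 1 * 2 = 2 per update.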

        train_prog, startup_prog = static.Program(), static.Program()
        with static.program_guard(train_prog, startup_prog):
            with fluid.unique_name.guard():
                avg_cost = self.net()

                optimizer = paddle.fluid.optimizer.Adam(0.01)
                optimizer = fleet.distributed_optimizer(optimizer,
                                                        strategy=strategy)
                optimizer.minimize(avg_cost)
Example #2
    def test_dtype_error(self):
        # in static mode
        with self.assertRaises(TypeError):
            with static.program_guard(static.Program()):
                x = static.data(name="x", shape=self._shape, dtype="float32")
                out = paddle_apis[self.api](x, name="real_res")

        # in dynamic mode
        with self.assertRaises(RuntimeError):
            with fluid.dygraph.guard():
                input = np.random.random(self._shape).astype("float32")
                input_t = paddle.to_tensor(input)
                res = paddle_apis[self.api](input_t)
Example #3
    def test_mlp_serial(self):

        global _global_process_mesh
        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])

        dist_strategy = fleet.DistributedStrategy()
        dist_strategy.amp = False
        dist_strategy.pipeline = False
        dist_strategy.recompute = False

        # enable semi-automatic parallelism
        dist_strategy.semi_auto = True

        fleet.init(is_collective=True, strategy=dist_strategy)

        train_program = static.Program()
        start_program = static.Program()
        loss, train_program, start_program = mlp_pretrain_forward(
            train_program, start_program)

        optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
                                                         beta1=0.9,
                                                         beta2=0.999,
                                                         epsilon=1e-08,
                                                         grad_clip=None)

        optimizer = fleet.distributed_optimizer(optimizer)
        _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
            loss, start_program)
        suffix = core.kAutoParallelSuffix()
        for block in distributed_main_program.blocks:
            for op in block.ops:
                for attr_name in op.attr_names:
                    self.assertTrue(suffix not in attr_name)
        # print_program_with_dist_attr(distributed_main_program)
        self.assertIsNotNone(distributed_startup_program)
        self.assertIsNotNone(distributed_main_program)
Example #4
    def test_in_static_mode(self):
        def init_input_output(dtype):
            input = np.random.random(self._shape).astype(
                dtype) + 1j * np.random.random(self._shape).astype(dtype)
            return {'x': input}, numpy_apis[self.api](input)

        for dtype in self.dtypes:
            input_dict, np_res = init_input_output(dtype)
            for place in self.places:
                with static.program_guard(static.Program()):
                    x = static.data(name="x", shape=self._shape, dtype=dtype)
                    out = paddle_apis[self.api](x)

                    exe = static.Executor(place)
                    out_value = exe.run(feed=input_dict, fetch_list=[out.name])
                    self.assertTrue(np.array_equal(np_res, out_value[0]))
Example #5
    def test_in_static_mode(self):
        def init_input_output(dtype):
            input = np.random.random(self._shape).astype(dtype)
            return {'x': input}, psi(input)

        for dtype in self.dtypes:
            input_dict, sc_res = init_input_output(dtype)
            for place in self.places:
                with static.program_guard(static.Program()):
                    x = static.data(name="x", shape=self._shape, dtype=dtype)
                    out = paddle.digamma(x)

                    exe = static.Executor(place)
                    out_value = exe.run(feed=input_dict, fetch_list=[out.name])
                    self.assertEqual(
                        np.allclose(out_value[0], sc_res, rtol=1e-5), True)
Example #6
def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            # simple module
            data = static.data(name='data',
                               shape=[None, 1, 28, 28],
                               dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')

            hidden = static.nn.fc(data, size=128)
            hidden = func(hidden)
            hidden = static.nn.fc(hidden, size=128)
            predict = static.nn.fc(hidden, size=10, activation='softmax')
            loss = paddle.nn.functional.cross_entropy(input=predict,
                                                      label=label)
            avg_loss = paddle.mean(loss)

            opt = paddle.optimizer.SGD(learning_rate=0.1)
            opt.minimize(avg_loss)

            # run the startup program
            exe = static.Executor()
            exe.run(static.default_startup_program())

            # train
            for i in range(4):
                avg_loss_v = exe.run(static.default_main_program(),
                                     feed={
                                         'data': np_data,
                                         'label': np_label
                                     },
                                     fetch_list=[avg_loss])

            # save inference model
            static.save_inference_model(path_prefix, [data], [predict], exe)

            # get train predict value
            predict_v = exe.run(static.default_main_program(),
                                feed={
                                    'data': np_data,
                                    'label': np_label
                                },
                                fetch_list=[predict])

    return predict_v
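
# A minimal sketch (not part of the original example): reload the model saved
# above and run one inference pass. It assumes static mode is enabled and the
# custom relu operator is already loaded, as in the function above;
# path_prefix and np_data match the arguments of custom_relu_static_inference.
def load_and_infer(path_prefix, np_data):
    with static.scope_guard(static.Scope()):
        exe = static.Executor()
        [program, feed_names, fetch_targets] = static.load_inference_model(
            path_prefix, exe)
        return exe.run(program,
                       feed={feed_names[0]: np_data},
                       fetch_list=fetch_targets)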
Example #7
def concat_static(func, dtype, np_inputs, axis_v, with_attr=False):
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x1 = static.data(name="x1", shape=[2, 3], dtype=dtype)
            x2 = static.data(name="x2", shape=[2, 3], dtype=dtype)
            if with_attr:
                axis = axis_v
            else:
                axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v)
            x1.stop_gradient = False
            x2.stop_gradient = False

            total_time = 0
            for i in range(TEST_TIME):
                start = time.time()
                out = func([x1, x2], axis)
                total_time += time.time() - start
            print("- static mode concat time cost: {} s".format(total_time /
                                                                TEST_TIME))

            # mean only supports float, so use sum here
            sum_out = paddle.sum(out)
            static.append_backward(sum_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            if with_attr:
                feed_dict = {
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype)
                }
            else:
                feed_dict = {
                    "x1": np_inputs[0].astype(dtype),
                    "x2": np_inputs[1].astype(dtype),
                    "axis": axis
                }
            out_v, x1_grad_v, x2_grad_v = exe.run(
                static.default_main_program(),
                feed=feed_dict,
                fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"])
    paddle.disable_static()
    return out_v, x1_grad_v, x2_grad_v
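
# A usage sketch (assumed setup): exercise concat_static with paddle.concat
# itself, passing the axis as a static attribute; TEST_TIME must already be
# defined at module level.
if __name__ == '__main__':
    np_inputs = [np.random.random((2, 3)), np.random.random((2, 3))]
    out, x1_grad, x2_grad = concat_static(
        paddle.concat, 'float32', np_inputs, axis_v=0, with_attr=True)
    print(out.shape, x1_grad.shape, x2_grad.shape)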
Example #8
    def test_conj_static_mode(self):
        def init_input_output(dtype):
            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
                [2, 20, 2, 3]).astype(dtype)
            return {'x': input}, np.conj(input)

        for dtype in self._dtypes:
            input_dict, np_res = init_input_output(dtype)
            for place in self._places:
                with static.program_guard(static.Program()):
                    x_dtype = np.complex64 if dtype == "float32" else np.complex128
                    x = static.data(
                        name="x", shape=[2, 20, 2, 3], dtype=x_dtype)
                    out = paddle.conj(x)

                    exe = static.Executor(place)
                    out_value = exe.run(feed=input_dict, fetch_list=[out.name])
                    self.assertTrue(np.array_equal(np_res, out_value[0]))
Example #9
def conj_static(func, shape, dtype, np_input):
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=shape, dtype=dtype)
            x.stop_gradient = False
            out = func(x)
            sum_out = paddle.sum(out)
            static.append_backward(sum_out)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            out_v, x_grad_v = exe.run(static.default_main_program(),
                                      feed={"x": np_input},
                                      fetch_list=[out.name, x.name + "@GRAD"])
    paddle.disable_static()
    return out_v, x_grad_v
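
# A usage sketch (assumed setup): compare conj_static against numpy's
# conjugate on a random complex64 input.
if __name__ == '__main__':
    np_x = (np.random.random((2, 3)) +
            1j * np.random.random((2, 3))).astype('complex64')
    out, x_grad = conj_static(paddle.conj, [2, 3], 'complex64', np_x)
    print(np.allclose(out, np.conj(np_x)))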
Example #10
def test_relu2_static(device, dtype):
    paddle.enable_static()
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = librelu2_op.relu2(x)
            static.append_backward(out)
            print(static.default_main_program())

            exe = static.Executor()
            exe.run(static.default_startup_program())

            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
            out, = exe.run(static.default_main_program(),
                           feed={'X': x},
                           fetch_list=[out.name])
            print(out)
Example #11
def test_relu2_static(device, dtype, use_custom=True):
    paddle.enable_static()
    paddle.set_device(device)

    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
            out = custom_relu_op_rf.relu2(
                x) if use_custom else paddle.nn.functional.relu(x)
            static.append_backward(out)
            print(static.default_main_program())

            places = static.cuda_places()
            print(places)
            exe = static.Executor()
            compiled_prog = static.CompiledProgram(
                static.default_main_program()).with_data_parallel(
                    loss_name=out.name, places=static.cuda_places())

            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
            out, = exe.run(compiled_prog, feed={'X': x}, fetch_list=[out.name])
            print(out)
Example #12
def linear_static(func, dtype, np_x, np_weight, np_bias):
    paddle.enable_static()
    paddle.set_device("cpu")
    with static.scope_guard(static.Scope()):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=np_x.shape, dtype=dtype)
            weight = static.data(
                name="weight", shape=np_weight.shape, dtype=dtype)
            bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype)
            out = func(x, weight, bias)

            exe = static.Executor()
            exe.run(static.default_startup_program())

            out_v, = exe.run(static.default_main_program(),
                             feed={
                                 "x": np_x.astype(dtype),
                                 "weight": np_weight.astype(dtype),
                                 "bias": np_bias.astype(dtype)
                             },
                             fetch_list=[out.name])
    paddle.disable_static()
    return out_v
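
# A usage sketch (assumed setup): run linear_static with
# paddle.nn.functional.linear on random float32 data.
if __name__ == '__main__':
    np_x = np.random.random((4, 8))
    np_weight = np.random.random((8, 16))
    np_bias = np.random.random((16, ))
    out = linear_static(paddle.nn.functional.linear, 'float32', np_x,
                        np_weight, np_bias)
    print(out.shape)  # (4, 16)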
Example #13
import paddle
import paddle.static as static

paddle.enable_static()

startup_prog = static.Program()
main_prog = static.Program()
with static.program_guard(main_prog, startup_prog):
    x = static.data(name='X', shape=[1000, 784], dtype='float32')

    y = static.data(name='Y', shape=[784, 100], dtype='float32')

    z = paddle.matmul(x=x, y=y)

    binary_str = static.default_main_program().desc.serialize_to_string()
    prog_restored = static.default_main_program().parse_from_string(binary_str)

    print(static.default_main_program())
    print(prog_restored)
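
# A round-trip sanity check (a sketch, not part of the original snippet):
# the restored program should describe the same ops as the serialized one.
assert [op.type for op in prog_restored.block(0).ops] == \
       [op.type for op in main_prog.block(0).ops]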
Example #14
    def test_opt_sharding_with_pp(self):
        train_prog, startup_prog = static.Program(), static.Program()
        avg_cost, strategy = self.pp_net(train_prog, startup_prog)

        self.set_strategy(strategy, 'pipeline')
        strategy.sharding = True
        strategy.sharding_configs = {
            "sharding_degree": 1,
            "pp_degree": 2,
            "dp_degree": 2,
            "_dp_as_optimizer_sharding": True,
        }
        strategy.fuse_all_reduce_ops = False

        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
        train_prog = train_prog._pipeline_opt['section_program']
        startup_prog = startup_prog._pipeline_opt['startup_program']

        self.debug_program(train_prog, startup_prog)

        startup_prog_ops = startup_prog.global_block().ops
        main_prog_ops = train_prog.global_block().ops

        # check program
        startup_prog_op_types = [op.type for op in startup_prog_ops]
        main_prog_op_types = [op.type for op in main_prog_ops]

        # global, sharding, pp_send, pp_recv
        self.assertEqual(startup_prog_op_types, [
            'uniform_random', 'fill_constant', 'uniform_random',
            'fill_constant', 'uniform_random', 'fill_constant',
            'uniform_random', 'fill_constant', 'fill_constant',
            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
            'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
            'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
            'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast',
            'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast',
            'c_broadcast'
        ])

        self.assertEqual(main_prog_op_types, [
            'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul',
            'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
            'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
            'fill_constant', 'mean_grad', 'cross_entropy_grad2',
            'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
            'elementwise_add_grad', 'mul_grad', 'tanh_grad',
            'elementwise_add_grad', 'mul_grad', 'tanh_grad',
            'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
            'send_v2', 'fill_constant', 'sum', 'fill_constant', 'sum',
            'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
            'sum', 'fill_constant', 'sum', 'fill_constant', 'sum',
            'fill_constant', 'sum', 'c_reduce_sum', 'c_reduce_sum',
            'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
            'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum',
            'momentum', 'momentum', 'momentum', 'momentum', 'c_broadcast',
            'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast',
            'c_broadcast', 'c_broadcast', 'c_broadcast'
        ])

        # should have a ring id for pp
        created_ring_ids = [
            op.desc.attr("ring_id") for op in startup_prog_ops
            if op.type == "c_comm_init"
        ]
        self.assertIn(self.dp_ring_id, created_ring_ids)
        self.assertIn(self.pp_pair_ring_id, created_ring_ids)

        # check correctness of pp group
        pp_group_waiting_ports = None
        for op in startup_prog_ops:
            if op.type == "c_gen_nccl_id" and \
                    op.desc.output_arg_names()[0] == "comm_id_0":
                pp_group_waiting_ports = op.desc.attr("other_endpoints")
        self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])

        # check correctness of sharding group
        dp_group_waiting_ports = None
        for op in startup_prog_ops:
            if op.type == "c_gen_nccl_id" \
                    and op.desc.output_arg_names()[0] == "comm_id_3":
                dp_group_waiting_ports = op.desc.attr("other_endpoints")
        self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
Example #15
    def test_opt_sharding_with_pp_amp_gclip_boundary(self):
        """
        test optimizer sharding without parameter
        test loss grad scale value
        """
        train_prog, startup_prog = static.Program(), static.Program()
        avg_cost, strategy = self.boundary_net(train_prog, startup_prog)

        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'pipeline')
        strategy.sharding = True
        strategy.sharding_configs = {
            "sharding_degree": 1,
            "pp_degree": 2,
            "dp_degree": 2,
            "_dp_as_optimizer_sharding": True,
        }
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)

        self.optimizer(avg_cost,
                       strategy,
                       train_prog,
                       startup_prog,
                       grad_clip=clip)
        train_prog = train_prog._pipeline_opt['section_program']
        startup_prog = startup_prog._pipeline_opt['startup_program']
        self.debug_program(train_prog, startup_prog)

        startup_prog_ops = startup_prog.global_block().ops
        main_prog_ops = train_prog.global_block().ops

        # check program
        startup_prog_op_types = [op.type for op in startup_prog_ops]
        main_prog_op_types = [op.type for op in main_prog_ops]

        # check loss scale for hybrid
        for op in main_prog_ops:
            if is_loss_grad_op(op):
                self.assertEqual(op.type, 'fill_constant')
                self.assertTrue(op.has_attr('value'))
                scale = strategy.pipeline_configs[
                    'accumulate_steps'] * strategy.sharding_configs['dp_degree']
                loss_scale = 1.0 / scale
                self.assertAlmostEqual(float(op.attr('value')), loss_scale)
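                # e.g. accumulate_steps = 2 and dp_degree = 2 give
                # loss_scale = 1 / (2 * 2) = 0.25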

        # global, sharding, pp_send, pp_recv
        self.assertEqual(startup_prog_op_types, [
            'uniform_random', 'fill_constant', 'fill_constant',
            'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init',
            'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
            'c_gen_nccl_id', 'c_comm_init', 'c_broadcast'
        ])

        self.assertEqual(main_prog_op_types, [
            'recv_v2', 'cast', 'matmul', 'cast', 'reduce_mean',
            'elementwise_mul', 'fill_constant', 'elementwise_mul_grad',
            'reduce_mean_grad', 'cast', 'matmul_grad', 'c_sync_calc_stream',
            'send_v2', 'fill_constant', 'cast', 'sum', 'c_reduce_sum',
            'c_sync_comm_stream', 'check_finite_and_unscale', 'cast',
            'c_allreduce_max', 'c_allreduce_max', 'cast',
            'update_loss_scaling', 'fill_constant', 'c_allreduce_sum',
            'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max',
            'elementwise_div', 'c_broadcast'
        ])
Example #16
    def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self):
        train_prog, startup_prog = static.Program(), static.Program()
        avg_cost, strategy = self.pp_net(train_prog, startup_prog)

        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'pipeline')

        strategy.sharding = True
        strategy.sharding_configs = {
            "sharding_degree": 1,
            "pp_degree": 2,
            "dp_degree": 2,
            "_dp_as_optimizer_sharding": True,
        }
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
        strategy.fuse_grad_merge = True
        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)

        self.optimizer(avg_cost,
                       strategy,
                       train_prog,
                       startup_prog,
                       grad_clip=clip)
        train_prog = train_prog._pipeline_opt['section_program']
        startup_prog = startup_prog._pipeline_opt['startup_program']
        self.debug_program(train_prog, startup_prog)

        startup_prog_ops = startup_prog.global_block().ops
        main_prog_ops = train_prog.global_block().ops

        # check program
        startup_prog_op_types = [op.type for op in startup_prog_ops]
        main_prog_op_types = [op.type for op in main_prog_ops]

        # global, sharding, pp_send, pp_recv
        self.assertEqual(startup_prog_op_types, [
            'uniform_random', 'fill_constant', 'uniform_random',
            'fill_constant', 'uniform_random', 'fill_constant',
            'uniform_random', 'fill_constant', 'fill_constant',
            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
            'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
            'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
            'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast',
            'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast'
        ])

        self.assertEqual(main_prog_op_types, [
            'recv_v2', 'cast', 'cast', 'mul', 'cast', 'elementwise_add',
            'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add',
            'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add',
            'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add',
            'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul',
            'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor',
            'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad',
            'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad',
            'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
            'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
            'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
            'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream',
            'send_v2', 'cast', 'sum', 'cast', 'sum', 'c_reduce_sum',
            'c_reduce_sum', 'c_sync_comm_stream', 'check_finite_and_unscale',
            'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast',
            'update_loss_scaling', 'squared_l2_norm', 'squared_l2_norm',
            'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum',
            'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', 'fill_constant',
            'elementwise_max', 'elementwise_div', 'elementwise_mul',
            'elementwise_mul', 'elementwise_mul', 'elementwise_mul',
            'elementwise_mul', 'momentum', 'momentum', 'momentum', 'momentum',
            'momentum', 'coalesce_tensor', 'c_broadcast', 'coalesce_tensor',
            'c_broadcast'
        ])
Example #17
    def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self):
        train_prog, startup_prog = static.Program(), static.Program()
        avg_cost, strategy = self.pp_net(train_prog, startup_prog)

        self.set_strategy(strategy, 'pipeline')
        self.set_strategy(strategy, 'amp')
        strategy.amp_configs = {
            'custom_black_varnames': ['fc_6.b_0'],
        }
        strategy.recompute = True
        strategy.recompute_configs = {
            "checkpoints":
            ["fc_0.tmp_2", "fc_1.tmp_2", "fc_2.tmp_2", "fc_3.tmp_2"]
        }

        strategy.sharding = True
        strategy.sharding_configs = {
            "sharding_degree": 1,
            "pp_degree": 2,
            "dp_degree": 2,
            "_dp_as_optimizer_sharding": True,
            'optimize_cast': True,
        }
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
        strategy.fuse_grad_merge = True

        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
        train_prog = train_prog._pipeline_opt['section_program']
        startup_prog = startup_prog._pipeline_opt['startup_program']

        # self._debug = True
        self.debug_program(train_prog, startup_prog)

        startup_prog_ops = startup_prog.global_block().ops
        main_prog_ops = train_prog.global_block().ops

        # check program
        startup_prog_op_types = [op.type for op in startup_prog_ops]
        main_prog_op_types = [op.type for op in main_prog_ops]

        # global, sharding, pp_send, pp_recv
        self.assertEqual(startup_prog_op_types, [
            'uniform_random', 'fill_constant', 'uniform_random',
            'fill_constant', 'uniform_random', 'fill_constant',
            'uniform_random', 'fill_constant', 'fill_constant',
            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
            'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
            'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
            'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast',
            'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast',
            'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast'
        ])

        self.assertEqual(main_prog_op_types, [
            'recv_v2', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh',
            'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul',
            'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast',
            'elementwise_add', 'cast', 'softmax', 'cast', 'cross_entropy2',
            'mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor',
            'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor',
            'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad',
            'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cast',
            'elementwise_add_grad', 'cast', 'mul_grad', 'cast', 'tanh_grad',
            'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad',
            'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul',
            'elementwise_add', 'cast', 'tanh_grad', 'cast',
            'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream',
            'send_v2', 'cast', 'sum', 'sum', 'cast', 'sum', 'c_reduce_sum',
            'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream',
            'check_finite_and_unscale', 'cast', 'c_allreduce_max',
            'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum',
            'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum',
            'momentum', 'cast', 'coalesce_tensor', 'c_broadcast',
            'c_broadcast', 'coalesce_tensor', 'c_broadcast'
        ])
Example #18
def search(config, args, image_size, is_server=True):
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        sa_nas = SANAS(config,
                       server_addr=(args.server_address, args.port),
                       search_steps=args.search_steps,
                       is_server=True)
    else:
        ### start a client
        sa_nas = SANAS(config,
                       server_addr=(args.server_address, args.port),
                       init_temperature=init_temperature,
                       is_server=False)

    image_shape = [3, image_size, image_size]
    for step in range(args.search_steps):
        archs = sa_nas.next_archs()[0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_fetch_list, _, train_loader = build_program(train_program,
                                                          startup_program,
                                                          image_shape,
                                                          archs,
                                                          args,
                                                          is_train=True)

        current_params = count_parameters_in_MB(
            train_program.global_block().all_parameters(), 'cifar10')
        _logger.info('step: {}, current_params: {}M'.format(
            step, current_params))
        if current_params > 3.77:
            continue

        test_fetch_list, _, test_loader = build_program(test_program,
                                                        startup_program,
                                                        image_shape,
                                                        archs,
                                                        args,
                                                        is_train=False)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        train_reader = reader.train_valid(batch_size=args.batch_size,
                                          is_train=True,
                                          is_shuffle=True)
        test_reader = reader.train_valid(batch_size=args.batch_size,
                                         is_train=False,
                                         is_shuffle=False)

        train_loader.set_batch_generator(train_reader, places=place)
        test_loader.set_batch_generator(test_reader, places=place)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=train_fetch_list[0].name,
                build_strategy=build_strategy)

        valid_top1_list = []
        for epoch_id in range(args.retain_epoch):
            train_top1 = train(train_compiled_program, exe, epoch_id,
                               train_loader, train_fetch_list, args)
            _logger.info("TRAIN: step: {}, Epoch {}, train_acc {:.6f}".format(
                step, epoch_id, train_top1))
            valid_top1 = valid(test_program, exe, epoch_id, test_loader,
                               test_fetch_list, args)
            _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(
                epoch_id, valid_top1))
            valid_top1_list.append(valid_top1)
        sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
Example #19
def search_mobilenetv2(config, args, image_size, is_server=True):
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        rl_nas = RLNAS(key='lstm',
                       configs=config,
                       is_sync=False,
                       server_addr=(args.server_address, args.port),
                       controller_batch_size=1,
                       controller_decay_steps=1000,
                       controller_decay_rate=0.8,
                       lstm_num_layers=1,
                       hidden_size=10,
                       temperature=1.0)
    else:
        ### start a client
        rl_nas = RLNAS(key='lstm',
                       configs=config,
                       is_sync=False,
                       server_addr=(args.server_address, args.port),
                       lstm_num_layers=1,
                       hidden_size=10,
                       temperature=1.0,
                       controller_batch_size=1,
                       controller_decay_steps=1000,
                       controller_decay_rate=0.8,
                       is_server=False)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')

    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    for step in range(args.search_steps):
        archs = rl_nas.next_archs(1)[0][0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, train_dataset, archs,
            args, places)

        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            val_dataset,
            archs,
            args,
            place,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}s'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        rl_nas.reward(np.float32(finally_reward[1]))
Example #20
    def test_name_argument(self):
        with static.program_guard(static.Program()):
            x = static.data(name="x", shape=self._shape, dtype=self.dtypes[0])
            out = paddle_apis[self.api](x, name="real_res")
            self.assertTrue("real_res" in out.name)
Example #21
def get_sparse_model(model_file, param_file, ratio, save_path):
    """
    Use the unstructured sparsity algorithm to compress the network.
    This interface is only used to evaluate the latency of the compressed
    network; it does not consider the loss of accuracy.
    Args:
        model_file(str), param_file(str): The inference model to be pruned.
        ratio(float): The ratio at which to prune the model.
        save_path(str): The save path of the pruned model.
    """
    assert os.path.exists(model_file), f'{model_file} does not exist.'
    assert param_file is None or os.path.exists(
        param_file), f'{param_file} does not exist.'
    paddle.enable_static()

    SKIP = ['image', 'feed', 'pool2d_0.tmp_0']

    folder = os.path.dirname(model_file)
    model_name = model_file.split('/')[-1]
    if param_file is None:
        param_name = None
    else:
        param_name = param_file.split('/')[-1]

    main_prog = static.Program()
    startup_prog = static.Program()
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)

    [inference_program, feed_target_names, fetch_targets] = (
        fluid.io.load_inference_model(
            folder, exe, model_filename=model_name, params_filename=param_name))
    thresholds = {}

    graph = GraphWrapper(inference_program)
    for op in graph.ops():
        for inp in op.all_inputs():
            name = inp.name()
            if inp.name() in SKIP: continue
            if 'tmp' in inp.name(): continue
            # 1x1_conv
            cond_conv = (len(inp._var.shape) == 4 and inp._var.shape[2] == 1
                         and inp._var.shape[3] == 1)
            cond_fc = False

            if cond_fc or cond_conv:
                array = np.array(paddle.static.global_scope().find_var(name)
                                 .get_tensor())
                flatten = np.abs(array.flatten())
                index = min(len(flatten) - 1, int(ratio * len(flatten)))
                ind = np.unravel_index(
                    np.argsort(
                        flatten, axis=None), flatten.shape)
                thresholds[name] = ind[0][:index]

    for op in graph.ops():
        for inp in op.all_inputs():
            name = inp.name()
            if name in SKIP: continue
            if 'tmp' in inp.name(): continue

            cond_conv = (len(inp._var.shape) == 4 and inp._var.shape[2] == 1 and
                         inp._var.shape[3] == 1)
            cond_fc = False

            # only 1x1 conv is supported for now
            if not (cond_conv or cond_fc): continue
            array = np.array(paddle.static.global_scope().find_var(name)
                             .get_tensor())
            if thresholds.get(name) is not None:
                np.put(array, thresholds.get(name), 0)
            assert (abs(1 - np.count_nonzero(array) / array.size - ratio) < 1e-2
                    ), 'The model sparsity is abnormal.'
            paddle.static.global_scope().find_var(name).get_tensor().set(
                array, paddle.CPUPlace())

    fluid.io.save_inference_model(
        save_path,
        feeded_var_names=feed_target_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=inference_program,
        model_filename=model_name,
        params_filename=param_name)
    print("The pruned model is saved in: ", save_path)
Example #22
def get_program():
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    with fluid.program_guard(train_program, start_program):

        # loop counter
        i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
        auto.shard_tensor(i,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })
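        # In dims_mapping, -1 keeps the corresponding tensor axis replicated;
        # a non-negative value shards that axis along that dimension of the
        # process mesh.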

        # number of loop iterations
        loop_len = fluid.layers.fill_constant(shape=[1],
                                              dtype='int64',
                                              value=epoch_num)
        auto.shard_tensor(loop_len,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

        # input
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')

        data_holder = [input, label]
        # dataloader
        dataloader = paddle.io.DataLoader.from_generator(feed_list=data_holder,
                                                         capacity=4 *
                                                         batch_size,
                                                         iterable=False)
        dataloader.set_batch_generator(batch_generator_creator(),
                                       places=paddle.static.cuda_places())
        # data dist_attr
        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })
        auto.shard_tensor(label,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        # fill constant bsz like
        tmp = paddle.fluid.layers.fill_constant_batch_size_like(
            input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0)
        auto.shard_tensor(tmp,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, 0, -1, -1]
                          })

        # model
        mlp_start = MLPLayer(hidden_size=hidden_size,
                             intermediate_size=4 * hidden_size,
                             dropout_ratio=0.1,
                             initializer_range=0.02)
        pred = mlp_start(input)

        input_array = fluid.layers.array_write(pred, i)
        auto.shard_tensor(input_array,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        cond = fluid.layers.less_than(x=i, y=loop_len)
        auto.shard_tensor(cond,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

        while_op = fluid.layers.While(cond=cond)
        with while_op.block():

            pre_input = fluid.layers.array_read(array=input_array, i=i)
            auto.shard_tensor(pre_input,
                              dist_attr={
                                  "process_mesh": _g_process_mesh,
                                  "dims_mapping": [-1, -1, -1]
                              })

            mlp_while = MLPLayer(hidden_size=hidden_size,
                                 intermediate_size=4 * hidden_size,
                                 dropout_ratio=0.1,
                                 initializer_range=0.02)
            cur_pred = mlp_while(pre_input)

            # update the loop condition
            i = fluid.layers.increment(x=i, value=1, in_place=True)
            fluid.layers.array_write(cur_pred, array=input_array, i=i)
            fluid.layers.less_than(x=i, y=loop_len, cond=cond)

        end_pred = fluid.layers.array_read(array=input_array, i=i)
        auto.shard_tensor(end_pred,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        mlp_end = MLPLayer(hidden_size=hidden_size,
                           intermediate_size=4 * hidden_size,
                           dropout_ratio=0.1,
                           initializer_range=0.02)
        pred = mlp_end(end_pred)

        error_cost = paddle.nn.functional.square_error_cost(pred, label)
        auto.shard_tensor(error_cost,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1, -1, -1]
                          })

        loss = paddle.mean(error_cost)
        auto.shard_tensor(loss,
                          dist_attr={
                              "process_mesh": _g_process_mesh,
                              "dims_mapping": [-1]
                          })

    return train_program, start_program, dataloader, i, loss
Example #23
def search_mobilenetv2(config, args, image_size, is_server=True):
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if is_server:
        ### start a server and a client
        rl_nas = RLNAS(
            key='ddpg',
            configs=config,
            is_sync=False,
            obs_dim=26,  ### step + length_of_token
            server_addr=(args.server_address, args.port))
    else:
        ### start a client
        rl_nas = RLNAS(key='ddpg',
                       configs=config,
                       is_sync=False,
                       obs_dim=26,
                       server_addr=(args.server_address, args.port),
                       is_server=False)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')

    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    for step in range(args.search_steps):
        if step == 0:
            action_prev = [1. for _ in rl_nas.range_tables]
        else:
            action_prev = rl_nas.tokens[0]
        obs = [step]
        obs.extend(action_prev)
        archs = rl_nas.next_archs(obs=obs)[0][0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, train_dataset, archs,
            args, places)

        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            val_dataset,
            archs,
            args,
            place,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}s'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        obs = np.expand_dims(obs, axis=0).astype('float32')
        actions = rl_nas.tokens
        obs_next = [step + 1]
        obs_next.extend(actions[0])
        obs_next = np.expand_dims(obs_next, axis=0).astype('float32')

        if step == args.search_steps - 1:
            terminal = np.expand_dims([True], axis=0).astype(np.bool_)
        else:
            terminal = np.expand_dims([False], axis=0).astype(np.bool_)
        rl_nas.reward(np.expand_dims(np.float32(finally_reward[1]), axis=0),
                      obs=obs,
                      actions=actions.astype('float32'),
                      obs_next=obs_next,
                      terminal=terminal)

        if step == 2:
            sys.exit(0)
Example #24
def test_search_result(tokens, image_size, args, config):
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]

    sa_nas = SANAS(config,
                   server_addr=(args.server_address, args.port),
                   search_steps=args.search_steps,
                   is_server=True)

    image_shape = [3, image_size, image_size]
    if args.data == 'cifar10':
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')

    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    archs = sa_nas.tokens2arch(tokens)[0]

    train_program = static.Program()
    test_program = static.Program()
    startup_program = static.Program()
    train_loader, avg_cost, acc_top1, acc_top5 = build_program(
        train_program, startup_program, image_shape, train_dataset, archs,
        args, places)

    current_flops = flops(train_program)
    print('current_flops: {}'.format(current_flops))
    test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
        test_program,
        startup_program,
        image_shape,
        val_dataset,
        archs,
        args,
        place,
        is_test=True)

    test_program = test_program.clone(for_test=True)

    exe = static.Executor(place)
    exe.run(startup_program)

    build_strategy = static.BuildStrategy()
    train_compiled_program = static.CompiledProgram(
        train_program).with_data_parallel(loss_name=avg_cost.name,
                                          build_strategy=build_strategy)
    for epoch_id in range(args.retain_epoch):
        for batch_id, data in enumerate(train_loader()):
            fetches = [avg_cost.name]
            s_time = time.time()
            outs = exe.run(train_compiled_program,
                           feed=data,
                           fetch_list=fetches)[0]
            batch_time = time.time() - s_time
            if batch_id % 10 == 0:
                _logger.info(
                    'TRAIN: epoch: {}, batch: {}, cost: {}, batch_time: {}s'.
                    format(epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'.
                format(batch_id, batch_reward[0], batch_reward[1],
                       batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))
Example #25
def get_program():
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.semi_auto = True
    # fleet.init(is_collective=True, strategy=dist_strategy)

    train_program = static.Program()
    start_program = static.Program()
    with static.program_guard(train_program, start_program):
        # input
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
        data_holder = [input, label]
        # dataloader
        dataloader = paddle.io.DataLoader.from_generator(
            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
        dataloader.set_batch_generator(
            batch_generator_creator(), places=paddle.static.cuda_places())
        # data dist_attr
        auto.shard_tensor(
            input,
            dist_attr={
                "process_mesh": _g_process_mesh[0],
                "dims_mapping": [0, -1, -1]
            })
        auto.shard_tensor(
            label,
            dist_attr={
                "process_mesh": _g_process_mesh[0],
                "dims_mapping": [0, -1, -1]
            })

        mlp_start = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_start(input)

        mlp_mid = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_mid(pred)

        mlp_end = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)
        pred = mlp_end(pred)

        error_cost = paddle.nn.functional.square_error_cost(pred, label)
        loss = paddle.mean(error_cost)

        optimizer = paddle.optimizer.Adam(
            learning_rate=0.00001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            grad_clip=None)

        feed_vars = {"inputs": [input], "labels": [label]}
        fetch_vars = {"loss": [loss]}

    return train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars
Example #26
        print("op outputs are {}".format(op.output_arg_names))
        for key, value in sorted(six.iteritems(op.all_attrs())):
            if key not in ['op_callstack', 'op_role_var']:
                print(" [ attrs: {}:   {} ]".format(key, value))


def network():
    img = static.data(name='image', shape=[None, 784])
    hidden = static.nn.fc(img, size=200, activation='relu')
    hidden = F.dropout(hidden, p=0.5)
    loss = F.cross_entropy(
        input=static.nn.fc(hidden, size=10, activation='softmax'),
        label=static.data(name='label', shape=[None, 1], dtype='int64'))
    avg_loss = paddle.mean(loss)
    return avg_loss


train_program_2 = static.Program()
startup_program_2 = static.Program()
test_program_2 = static.Program()
with static.program_guard(train_program_2, startup_program_2):
    with utils.unique_name.guard():
        avg_loss = network()
        sgd = paddle.optimizer.SGD(learning_rate=1e-3)
        sgd.minimize(avg_loss)
# the test startup program is not used.
with static.program_guard(test_program_2, startup_program_2):
    with utils.unique_name.guard():
        avg_loss = network()
print_prog(test_program_2)
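
Rebuilding the network under the shared startup program, as above, is the pattern this snippet demonstrates. A common alternative, sketched below following the Program.clone documentation, is to clone the train program for testing before the optimizer appends backward ops; clone(for_test=True) switches ops such as dropout to inference behavior but does not prune operators, so cloning must happen before minimize().

train_program_3 = static.Program()
startup_program_3 = static.Program()
with static.program_guard(train_program_3, startup_program_3):
    with utils.unique_name.guard():
        avg_loss = network()
        # Clone *before* minimize(): clone(for_test=True) does not prune
        # the backward/optimizer ops that minimize() would append.
        test_program_3 = train_program_3.clone(for_test=True)
        sgd = paddle.optimizer.SGD(learning_rate=1e-3)
        sgd.minimize(avg_loss)
print_prog(test_program_3)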
Example #27
import paddle
import paddle.static as static

paddle.enable_static()

main_program = static.Program()
startup_program = static.Program()
with static.program_guard(main_program=main_program,
                          startup_program=startup_program):
    x = static.data(name="x", shape=[-1, 784], dtype='float32')
    y = static.data(name="y", shape=[-1, 1], dtype='int32')
    z = static.nn.fc(name="fc", input=x, size=10, act="relu")

print("main program is: {}".format(main_program))
print("start up program is: {}".format(startup_program))
Example #28
def get_prune_model(model_file, param_file, ratio, save_path):
    """
    Using the structured pruning algorithm to compress the network. 
    This interface is only used to evaluate the latency of the compressed network, and does not consider the loss of accuracy.
    Args:
        model_file(str), param_file(str): The inference model to be pruned.
        ratio(float): The ratio to prune the model.
        save_path(str): The save path of pruned model.
    """

    assert os.path.exists(model_file), f'{model_file} does not exist.'
    assert param_file is None or os.path.exists(
        param_file), f'{param_file} does not exist.'
    paddle.enable_static()

    SKIP = ['image', 'feed', 'pool2d_0.tmp_0']

    folder = os.path.dirname(model_file)
    model_name = os.path.basename(model_file)
    if param_file is None:
        param_name = None
    else:
        param_name = os.path.basename(param_file)

    startup_prog = static.Program()
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    scope = static.global_scope()
    exe.run(startup_prog)

    [inference_program, feed_target_names, fetch_targets] = (
        fluid.io.load_inference_model(
            folder, exe, model_filename=model_name, params_filename=param_name))

    prune_params = []
    graph = GraphWrapper(inference_program)
    for op in graph.ops():
        for inp in op.all_inputs():
            name = inp.name()
            if name in SKIP: continue
            if 'tmp' in name: continue
            cond_conv = len(inp._var.shape) == 4 and 'conv' in name
            # only prune conv
            if cond_conv:
                prune_params.append(name)

    # drop last conv
    prune_params.pop()
    ratios = [ratio] * len(prune_params)

    pruner = Pruner()
    main_program, _, _ = pruner.prune(
        inference_program,
        scope,
        params=prune_params,
        ratios=ratios,
        place=place,
        lazy=False,
        only_graph=False,
        param_backup=None,
        param_shape_backup=None)

    fluid.io.save_inference_model(
        save_path,
        feeded_var_names=feed_target_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=main_program,
        model_filename=model_name,
        params_filename=param_name)
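
A hypothetical invocation of get_prune_model; the file names below are placeholders for a real inference model saved with fluid.io.save_inference_model.

get_prune_model(
    model_file='./mobilenetv1_infer/__model__',
    param_file='./mobilenetv1_infer/__params__',
    ratio=0.25,
    save_path='./mobilenetv1_pruned')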
Example #29
def test_mapper_misc(self):
    self.assertEqual(get_dtype_bytes(paddle.float64), 8)
    self.assertEqual(get_dtype_bytes(paddle.float32), 4)
    self.assertEqual(get_dtype_bytes(paddle.float16), 2)
    self.assertEqual(get_dtype_bytes(paddle.bfloat16), 2)
    self.assertEqual(get_dtype_bytes(paddle.int64), 8)
    self.assertEqual(get_dtype_bytes(paddle.int32), 4)
    self.assertEqual(get_dtype_bytes(paddle.int16), 2)
    self.assertEqual(get_dtype_bytes(paddle.int8), 1)
    self.assertEqual(get_dtype_bytes(paddle.uint8), 1)
    self.assertRaises(ValueError, get_dtype_bytes, "unknown type")
    train_program = static.Program()
    startup_program = static.Program()
    ring_id = 0
    root_id = 0
    nranks = 2
    with fluid.program_guard(train_program, startup_program):
        input = layers.data(name="input", shape=[10, 10], dtype='float32')
        output = train_program.current_block().create_var(
            name="outofbroadcast",
            dtype='float32',
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False)
        broadcast_op = train_program.global_block().append_op(
            type="c_broadcast",
            inputs={'X': input},
            attrs={
                'ring_id': ring_id,
                'root': root_id
            },
            outputs={'Out': output})
        self.assertEqual(get_comm_volume(broadcast_op, 0, 1), 400)
        self.assertEqual(get_comm_volume(broadcast_op, 1, 0), None)
        allgather_op = train_program.global_block().append_op(
            type="c_allgather",
            inputs={'X': input},
            attrs={
                'ring_id': ring_id,
                'nranks': nranks
            },
            outputs={'Out': output})
        self.assertEqual(get_comm_volume(allgather_op, 0, 1), 400)
        self.assertEqual(get_comm_volume(allgather_op, 0, 0), None)
        reduce_op = train_program.global_block().append_op(
            type="c_reduce_sum",
            inputs={'X': input},
            attrs={
                'ring_id': ring_id,
                'root_id': root_id
            },
            outputs={'Out': output})
        self.assertEqual(get_comm_volume(reduce_op, 0, 1), None)
        self.assertEqual(get_comm_volume(reduce_op, 1, 0), 400)
        cast_op = train_program.global_block().append_op(
            type="cast",
            inputs={"X": input},
            outputs={"Out": output},
            attrs={
                "in_dtype": fluid.core.VarDesc.VarType.FP32,
                "out_dtype": fluid.core.VarDesc.VarType.FP32
            })
        self.assertRaises(ValueError, get_comm_volume, cast_op, 0, 1)
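
A quick sanity check on the 400-byte expectations in the test above: the input is a [10, 10] float32 tensor, and get_dtype_bytes(paddle.float32) is 4.

import numpy as np

# 10 * 10 elements * 4 bytes per float32 = 400 bytes per full transfer.
assert int(np.prod([10, 10])) * 4 == 400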
Example #30
def final_test(config, args, image_size, token=None):
    assert token is not None, "If you want to start a final experiment, you must input a token."
    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    sa_nas = SANAS(config,
                   server_addr=(args.server_address, args.port),
                   is_server=True)

    image_shape = [3, image_size, image_size]
    archs = sa_nas.tokens2arch(token)[0]

    train_program = static.Program()
    test_program = static.Program()
    startup_program = static.Program()
    train_fetch_list, (data,
                       label), train_loader = build_program(train_program,
                                                            startup_program,
                                                            image_shape,
                                                            archs,
                                                            args,
                                                            is_train=True)

    current_params = count_parameters_in_MB(
        train_program.global_block().all_parameters(), 'cifar10')
    _logger.info('current_params: {}M'.format(current_params))
    test_fetch_list, _, test_loader = build_program(test_program,
                                                    startup_program,
                                                    image_shape,
                                                    archs,
                                                    args,
                                                    is_train=False)
    test_program = test_program.clone(for_test=True)

    exe = static.Executor(place)
    exe.run(startup_program)

    train_reader = reader.train_valid(batch_size=args.batch_size,
                                      is_train=True,
                                      is_shuffle=True)
    test_reader = reader.train_valid(batch_size=args.batch_size,
                                     is_train=False,
                                     is_shuffle=False)

    train_loader.set_batch_generator(train_reader, places=place)
    test_loader.set_batch_generator(test_reader, places=place)

    build_strategy = static.BuildStrategy()
    train_compiled_program = static.CompiledProgram(
        train_program).with_data_parallel(loss_name=train_fetch_list[0].name,
                                          build_strategy=build_strategy)

    valid_top1_list = []
    for epoch_id in range(args.retain_epoch):
        train_top1 = train(train_compiled_program, exe, epoch_id, train_loader,
                           train_fetch_list, args)
        _logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format(
            epoch_id, train_top1))
        valid_top1 = valid(test_program, exe, epoch_id, test_loader,
                           test_fetch_list, args)
        _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(
            epoch_id, valid_top1))
        valid_top1_list.append(valid_top1)

        output_dir = os.path.join('darts_output', str(epoch_id))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        static.save_inference_model(output_dir, [data], test_fetch_list, exe)
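
A hypothetical entry point for final_test; config, args, and the token are placeholders that, in the real SANAS script, come from argparse and the preceding search phase.

if __name__ == '__main__':
    # Placeholder values; substitute the search config, parsed CLI args,
    # and the best token produced by an earlier search run.
    final_test(config, args, image_size=32, token=best_tokens)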