Example #1
    def check_network_convergence(self, use_cuda, build_strategy=None):
        os.environ['CPU_NUM'] = str(4)
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = simple_fc_net()
            test_program = main.clone(for_test=True)

            opt = fluid.optimizer.SGD(learning_rate=0.001)
            opt.minimize(loss)

            batch_size = 32
            image = np.random.normal(size=(batch_size, 784)).astype('float32')
            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(startup)
            feed_dict = {'image': image, 'label': label}

            train_cp = compiler.CompiledProgram(main).with_data_parallel(
                loss_name=loss.name, build_strategy=build_strategy)
            test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                share_vars_from=train_cp)

            for i in range(5):
                _ = exe.run(train_cp, fetch_list=[loss.name], feed=feed_dict)
                test_loss, = exe.run(test_cp,
                                     fetch_list=[loss.name],
                                     feed=feed_dict)
                train_loss, = exe.run(train_cp,
                                      fetch_list=[loss.name],
                                      feed=feed_dict)

                avg_test_loss_val = np.array(test_loss).mean()
                if math.isnan(float(avg_test_loss_val)):
                    sys.exit("got NaN loss, testing failed.")

                avg_train_loss_val = np.array(train_loss).mean()
                if math.isnan(float(avg_train_loss_val)):
                    sys.exit("got NaN loss, training failed.")

                self.assertTrue(
                    np.allclose(
                        train_loss, test_loss, atol=1e-8),
                    "Train loss: " + str(train_loss) + "\n Test loss:" +
                    str(test_loss))
    def main(self,
             network_func,
             iter=10,
             iter_per_pe=10,
             use_gpu=True,
             use_experimental_executor=False):
        if use_gpu and not fluid.core.is_compiled_with_cuda():
            logging.warning(
                "Paddle is not compiled with CUDA, skip GPU unittests")
            return

        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        scope = fluid.Scope()
        with fluid.program_guard(main_prog, startup_prog):
            with fluid.scope_guard(scope):
                loss = network_func()
                exe = fluid.Executor(
                    fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
                exe.run(startup_prog)

                exe_strategy = fluid.ExecutionStrategy()
                exe_strategy._dry_run = True
                exe_strategy.use_experimental_executor = use_experimental_executor
                train_cp = compiler.CompiledProgram(
                    main_prog).with_data_parallel(loss_name=loss.name,
                                                  exec_strategy=exe_strategy)
                for _ in six.moves.xrange(iter):
                    for _ in six.moves.xrange(iter_per_pe):
                        exe.run(train_cp)
    def _feed_data_in_executor(self, in_size, label_size, feed_in_data,
                               feed_label, use_cuda, use_parallel_executor):

        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            in_data, label, loss = self._simple_fc_net(in_size, label_size,
                                                       self.class_num,
                                                       self.hidden_sizes)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(startup_program)

        train_program = main_program
        if use_parallel_executor:
            train_program = compiler.CompiledProgram(
                main_program).with_data_parallel(loss_name=loss.name)

        for i in range(self.iterations):
            fetches = exe.run(train_program,
                              feed={
                                  in_data.name: feed_in_data,
                                  label.name: feed_label
                              },
                              fetch_list=[loss.name])
    def parallel_exe(self, use_cuda, run_parallel_exe, seed=1):
        main_program = fluid.Program()
        startup = fluid.Program()
        startup.random_seed = seed
        with fluid.program_guard(main_program, startup):
            data = fluid.layers.data(name='image',
                                     shape=[3, 224, 224],
                                     dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = Lenet(data, class_dim=102)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            loss = fluid.layers.mean(loss)
            opt = fluid.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
            opt.minimize(loss)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)

        #FIXME force disable enable_inplace and memory_optimize to pass the unittest
        build_strategy = fluid.BuildStrategy()
        build_strategy.enable_inplace = False
        build_strategy.memory_optimize = False
        train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

        run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
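A hedged sketch of what the `run_parallel_exe` callback used above might look like (the function name and the feeding logic are assumptions, not part of the original example): it feeds a random batch into the compiled program and fetches the loss.

def run_parallel_exe_example(train_cp, exe, use_cuda, data, label, loss):
    # Hypothetical callback: feed random images/labels and fetch the mean loss.
    image = np.random.normal(size=(8, 3, 224, 224)).astype('float32')
    label_val = np.random.randint(0, 102, (8, 1), dtype='int64')
    for _ in range(2):
        loss_val, = exe.run(train_cp,
                            feed={data.name: image, label.name: label_val},
                            fetch_list=[loss.name])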
Example #5
    def compare(self, place, layout, only_forward, activation, alpha, use_cuda):
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2

        fetch_outs = []
        fetch_names = []
        for inplace in [False, True]:
            main, startup, outs = self.build_program(
                place,
                layout,
                seed,
                only_forward,
                activation,
                alpha,
                inplace=inplace)
            exe = fluid.Executor(place)
            exe.run(startup)

            fetch_name = [v.name for v in outs] + [
                'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
            ]
            if not only_forward:
                others = [
                    'inplace_abn_0.tmp_0' if inplace else 'batch_norm_0.tmp_0',
                    'inplace_abn_0.tmp_1' if inplace else 'batch_norm_0.tmp_1',
                    'bn_scale@GRAD',
                    'bn_bias@GRAD',
                    'input@GRAD',
                ]
                fetch_name += others
            for nm in fetch_name:
                fv = fluid.framework._get_var(str(nm), program=main)
                fv.persistable = True

            build_strategy = fluid.BuildStrategy()
            build_strategy.sync_batch_norm = use_cuda and \
                        fluid.core.get_cuda_device_count() > 1
            build_strategy.enable_inplace = inplace
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = 1 if os.name == 'nt' else 0
            comp_prog1 = compiler.CompiledProgram(main).with_data_parallel(
                outs[0].name if not only_forward else None,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
            bn_fetches = exe.run(program=comp_prog1,
                                 feed={'input': data},
                                 fetch_list=fetch_name)
            fetch_outs.append(bn_fetches)
            fetch_names.append(fetch_name)

        for bn_val, inplace_abn_val, name1, name2 in zip(*(
                fetch_outs + fetch_names)):
            self.assertTrue(
                np.allclose(
                    bn_val, inplace_abn_val, atol=1e-2),
                "Output (" + name1 + ":" + name2 +
                ") has diff on {} with {} layout and {} activation. \n".format(
                    place, layout, activation) + "\nBN     " + str(bn_val) +
                "\n" + "Inplace ABN " + str(inplace_abn_val))
Example #6
    def init_infer_program(self):
        # define inferer
        self.infer_program = fluid.Program()
        startup_prog = fluid.Program()

        # prepare the network
        with fluid.program_guard(self.infer_program, startup_prog):
            with fluid.unique_name.guard():
                self.infer_feeder, self.infer_log_probs, _ = self.create_network(is_infer=True)

        self.infer_program = self.infer_program.clone(for_test=True)
        self.infer_exe = fluid.Executor(self._place)
        self.infer_exe.run(startup_prog)

        # init param from pretrained_model
        if not self._init_from_pretrained_model:
            exit("The pretrained model file does not exist!")
        self.init_from_pretrained_model(self.infer_exe, self.infer_program)

        # support multi-card inference
        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()
        self.infer_compiled_prog = compiler.CompiledProgram(self.infer_program).with_data_parallel(
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
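A minimal, hypothetical usage sketch for the compiled inference program built above (the `feed_dict` and its keys are assumptions; they have to match the inputs defined by create_network):

        # Hypothetical: run multi-card inference on one prepared batch.
        probs = self.infer_exe.run(program=self.infer_compiled_prog,
                                   feed=feed_dict,
                                   fetch_list=[self.infer_log_probs.name],
                                   return_numpy=False)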
Example #7
def train(exe, train_program, train_out, test_program, test_out, args):
    loss, acc, global_lr, train_reader = train_out
    fetch_list_train = [loss.name, acc.name, global_lr.name]
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True
    compiled_prog = compiler.CompiledProgram(
        train_program, build_strategy=build_strategy).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
    best_ave = 0
    for epoch_id in range(args.start_epoch, args.total_epoch):
        for batch_id, data in enumerate(train_reader()):
            loss, acc, global_lr = exe.run(compiled_prog,
                                           feed=data,
                                           fetch_list=fetch_list_train)
            avg_loss = np.mean(np.array(loss))
            avg_acc = np.mean(np.array(acc))
            print(
                '{}  Epoch: {:^4d} step: {:^4d} loss: {:.6f}, acc: {:.6f}, lr: {}'.
                format(now(), epoch_id, batch_id, avg_loss, avg_acc,
                       float(np.mean(np.array(global_lr)))))
        if batch_id % args.save_frequency == 0:
            model_path = os.path.join(args.save_ckpt, str(epoch_id))
            fluid.io.save_persistables(
                executor=exe, dirname=model_path, main_program=train_program)
            temp_ave = test(exe, test_program, test_out, args)
            if temp_ave > best_ave:
                best_ave = temp_ave
                print('Best AVE: {}'.format(best_ave))
                out_feature, test_reader, flods, flags = test_out
                fluid.io.save_inference_model(
                    executor=exe,
                    dirname='./out_inference',
                    feeded_var_names=['image_test'],
                    target_vars=[out_feature],
                    main_program=test_program)
Example #8
    def run_parallel_exe(self,
                         place,
                         feed_list,
                         loss,
                         use_reduce=False,
                         use_fast_executor=False,
                         use_ir_memory_optimize=False):
        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
        exe.run(fluid.default_startup_program())

        exec_strategy = fluid.ExecutionStrategy()
        if use_fast_executor:
            exec_strategy.use_experimental_executor = True

        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
        build_strategy.memory_optimize = use_ir_memory_optimize

        train_cp = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=loss.name,
                exec_strategy=exec_strategy,
                build_strategy=build_strategy)

        loss_set = []
        for data in self.train_data:
            out = exe.run(train_cp,
                          feed=feeder.feed(data),
                          fetch_list=[loss.name])
            loss_set.append(np.average(out))

        return loss_set
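For context, a sketch of how such a helper is typically driven in a unit test (the tolerance and the comparison itself are assumptions, not part of the original):

        # Hypothetical driver: Reduce and AllReduce strategies are expected to
        # produce nearly identical losses on the same data and random seed.
        all_reduce_losses = self.run_parallel_exe(place, feed_list, loss, use_reduce=False)
        reduce_losses = self.run_parallel_exe(place, feed_list, loss, use_reduce=True)
        for l1, l2 in zip(all_reduce_losses, reduce_losses):
            self.assertAlmostEqual(l1, l2, delta=1e-4)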
Example #9
def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return

    if use_parallel_executor and os.name == 'nt':
        print(
            'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
        )
        return

    word_dict_size = 5147
    reader = fake_imdb_reader(word_dict_size, batch_size * 40)
    train_reader = paddle.batch(reader, batch_size=batch_size)

    data = fluid.layers.data(name="words",
                             shape=[1],
                             dtype="int64",
                             lod_level=1)

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cost = network(data, label, word_dict_size)
    cost.persistable = True
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
    optimizer.minimize(cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    reader = feeder.decorate_reader(train_reader,
                                    multi_devices=use_parallel_executor)

    exe = fluid.Executor(place)
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    exe.run(fluid.default_startup_program())

    train_cp = fluid.default_main_program()
    if use_parallel_executor:
        train_cp = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=cost.name)
        fetch_list = [cost.name]
    else:
        fetch_list = [cost]

    for pass_id in six.moves.xrange(pass_num):
        batch_id = 0
        for data in reader():
            exe.run(train_cp,
                    feed=data,
                    fetch_list=fetch_list if batch_id % 4 == 0 else [])
            batch_id += 1
            if batch_id > 16:
                break
Example #10
    def _try_to_compile(self, startup_program, main_program):
        node_num = self._node_num()
        assert node_num >= 1, "nccl2 node_num must be >= 1, now: {}".format(node_num)

        self._strategy.fuse_all_reduce_ops = True
        exec_strategy = self._strategy.exec_strategy

        if node_num <= 1:
            if self._strategy.nccl_comm_num > 1:
                logging.warning("set nccl_comm_num=1 since you only have 1 node.")
            self._strategy.nccl_comm_num = 1

            if self._strategy.use_hierarchical_allreduce:
                logging.warning(
                    "set use_hierarchical_allreduce=False since you only have 1 node."
                )
            self._strategy.use_hierarchical_allreduce = False

        sync_allreduce = os.getenv("FLAGS_sync_nccl_allreduce")
        if sync_allreduce is None or sync_allreduce == "1":
            exec_strategy.num_threads = self._strategy.nccl_comm_num + 1
            if self._strategy.use_hierarchical_allreduce:
                exec_strategy.num_threads = 2 * self._strategy.nccl_comm_num + 1
            if exec_strategy.num_threads > 4:
                logging.warning(
                    "if you use use_hierarchical_allreduce or "
                    "with multi nccl comm, please export FLAGS_sync_nccl_allreduce = 0"
                )

        if self.print_config:
            print("node_num:", node_num, "num_threads:",
                  exec_strategy.num_threads, "use_hierarchical_allreduce:",
                  self._strategy.use_hierarchical_allreduce, "nccl_comm_num:",
                  self._strategy.nccl_comm_num, "FLAGS_sync_nccl_allreduce:",
                  sync_allreduce)

        self._transpile(startup_program, main_program)

        if self._strategy.mode == "collective":
            return main_program

        self._strategy.num_trainers = fleet.worker_num()
        self._strategy.trainer_id = fleet.worker_index()
        self._strategy.trainers_endpoints = fleet.worker_endpoints()
        self._strategy.enable_backward_optimizer_op_deps = True

        self._compiled_program = compiler.CompiledProgram(main_program)

        self._compiled_program.with_data_parallel(
            loss_name=self._loss.name,
            build_strategy=self._strategy,
            exec_strategy=self._strategy.exec_strategy,
            share_vars_from=None)

        return self._compiled_program
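As the warning above suggests, the synchronous allreduce flag can also be turned off from the environment before compiling; a hedged example:

        # Assumption: disabling synchronous NCCL allreduce keeps num_threads small
        # when nccl_comm_num > 1 or hierarchical allreduce is enabled.
        os.environ["FLAGS_sync_nccl_allreduce"] = "0"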
Example #11
    def _try_to_compile(self, main_program, loss):
        dist_strategy = self._get_distributed_strategy()

        build_strategy = dist_strategy.get_build_strategy()
        exec_strategy = dist_strategy.get_execute_strategy()

        self._compiled_program = compiler.CompiledProgram(main_program)

        self._compiled_program.with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy,
            share_vars_from=None)

        return self._compiled_program
Example #12
    def main(self, with_double_buffer):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            image = fluid.layers.data(name='image',
                                      shape=self.ins_shape,
                                      dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            data_reader_handle = fluid.io.PyReader(
                feed_list=[image, label],
                capacity=16,
                iterable=False,
                use_double_buffer=with_double_buffer)
            fetch_list = [image.name, label.name]

        place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)

        data_reader_handle.decorate_sample_list_generator(
            paddle.batch(self.prepare_data(), batch_size=self.batch_size))

        train_cp = compiler.CompiledProgram(main_prog).with_data_parallel(
            places=[place])

        batch_id = 0
        pass_count = 0
        while pass_count < self.test_pass_num:
            data_reader_handle.start()
            try:
                while True:
                    data_val, label_val = exe.run(train_cp,
                                                  fetch_list=fetch_list,
                                                  return_numpy=True)
                    ins_num = data_val.shape[0]
                    broadcasted_label = np.ones(
                        (ins_num, ) +
                        tuple(self.ins_shape)) * label_val.reshape(
                            (ins_num, 1))
                    self.assertEqual(data_val.all(), broadcasted_label.all())
                    batch_id += 1
            except fluid.core.EOFException:
                data_reader_handle.reset()
                pass_count += 1
                self.assertEqual(pass_count * self.batch_num, batch_id)

        self.assertEqual(pass_count, self.test_pass_num)
Example #13
def test_main(use_cuda, use_py_func_op, use_parallel_executor):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return None

    with fluid.program_guard(fluid.Program(), fluid.Program()):
        with fluid.scope_guard(fluid.core.Scope()):
            fluid.default_main_program().random_seed = 1
            fluid.default_startup_program().random_seed = 1
            np.random.seed(1)

            img = fluid.layers.data(name='image', shape=[784], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            loss = simple_fc_net(img, label, use_py_func_op)
            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
            optimizer.minimize(loss)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
            r = paddle.batch(reader, batch_size=10)

            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            #FIXME force use old memory optimize strategy here to pass the unittest
            #since open the new strategy will crash the unittest
            fluid.memory_optimize(fluid.default_main_program())

            train_cp = compiler.CompiledProgram(fluid.default_main_program())
            if use_parallel_executor:
                train_cp = train_cp.with_data_parallel(loss_name=loss.name)
                fetch_list = [loss.name]
            else:
                fetch_list = [loss]

            ret = []
            for epoch_id in six.moves.range(2):
                for d in r():
                    L, = exe.run(train_cp,
                                 feed=feeder.feed(d),
                                 fetch_list=fetch_list)
                    ret.append(L)
            return np.array(ret)
Example #14
    def main(self, with_double_buffer):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            data_reader_handle = fluid.layers.io.open_files(
                filenames=[self.data_file_name],
                shapes=[[-1] + self.ins_shape, [-1, 1]],
                lod_levels=[0, 0],
                dtypes=['float32', 'int64'],
                thread_num=1,
                pass_num=1)
            data_reader = fluid.layers.io.batch(data_reader_handle,
                                                self.batch_size)
            if with_double_buffer:
                data_reader = fluid.layers.double_buffer(data_reader)
            image, label = fluid.layers.read_file(data_reader)
            fetch_list = [image.name, label.name]

        place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)

        train_cp = compiler.CompiledProgram(main_prog).with_data_parallel()
        pass_count = 0
        while True:
            try:
                data_val, label_val = exe.run(train_cp,
                                              fetch_list=fetch_list,
                                              return_numpy=True)
                ins_num = data_val.shape[0]
                broadcasted_label = np.ones(
                    (ins_num, ) + tuple(self.ins_shape)) * label_val.reshape(
                        (ins_num, 1))
                self.assertEqual(data_val.all(), broadcasted_label.all())

            except fluid.core.EOFException:
                pass_count += 1
                if pass_count < self.test_pass_num:
                    data_reader_handle.reset()
                else:
                    break
Example #15
    def check_pass_conflict(cls,
                            method,
                            use_device=DeviceType.CUDA,
                            feed_dict=None,
                            get_data_from_feeder=None,
                            use_reduce=False,
                            use_ir_memory_optimize=True,
                            enable_inplace=True,
                            fuse_elewise_add_act_ops=False,
                            fuse_all_optimizer_ops=False,
                            fuse_all_reduce_ops=False,
                            fuse_relu_depthwise_conv=False,
                            optimizer=fluid.optimizer.Adam,
                            use_fast_executor=True,
                            enable_sequential_execution=False):

        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                              main, method, optimizer)

        place = fluid.CUDAPlace(
            0) if use_device == DeviceType.CUDA else fluid.XPUPlace(
                0) if use_device == DeviceType.XPU else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)

        build_strategy, exec_strategy = cls.set_strategy(
            enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
            fuse_all_reduce_ops, fuse_elewise_add_act_ops,
            fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
            use_reduce, use_device)

        binary = compiler.CompiledProgram(main).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

        exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
Example #16
    def parallel_exe(self,
                     use_cuda,
                     run_parallel_exe,
                     use_faster_executor=False,
                     num_threads=4,
                     seed=1):
        main_program = fluid.Program()
        startup = fluid.Program()
        startup.random_seed = seed
        with fluid.program_guard(main_program, startup):
            data = fluid.layers.data(name='image',
                                     shape=[3, 224, 224],
                                     dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = Lenet(data, class_dim=102)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            loss = fluid.layers.mean(loss)
            opt = fluid.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
            opt.minimize(loss)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)

        build_strategy = fluid.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = use_faster_executor
        exec_strategy.num_threads = num_threads
        train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

        run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
Example #17
    def train(self,
              train_batch_reader,
              dev_batch_reader,
              learning_rate,
              gradient_clipping,
              num_epoch,
              batch_size,
              num_samples,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param learning_rate: Learning rate for the ADAM optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_epoch: Number of training epochs.
        :type num_epoch: int
        :param batch_size: Batch size.
        :type batch_size: int
        :param num_samples: Number of training samples.
        :type num_samples: int
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(self._output_model_dir):
            mkpath(self._output_model_dir)

        if isinstance(self._place, fluid.CUDAPlace):
            dev_count = fluid.core.get_cuda_device_count()
            learning_rate = learning_rate * dev_count
        else:
            dev_count = int(os.environ.get('CPU_NUM', 1))

        # prepare the network
        train_program = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, _, ctc_loss = self.create_network()
                # learning rate schedule
                learning_rate = fluid.layers.exponential_decay(
                        learning_rate=learning_rate,
                        decay_steps=num_samples / batch_size / dev_count,
                        decay_rate=0.83,
                        staircase=True)
                # prepare the optimizer
                optimizer = fluid.optimizer.AdamOptimizer(
                    learning_rate=learning_rate,
                    regularization=fluid.regularizer.L2Decay(0.0001),
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=gradient_clipping))
                optimizer.minimize(loss=ctc_loss)

        exe = fluid.Executor(self._place)
        exe.run(startup_prog)

        # init from some pretrain models, to better solve the current task
        pre_epoch = 0
        if self._init_from_pretrained_model:
            pre_epoch = self.init_from_pretrained_model(exe, train_program)

        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()

        # pass the build_strategy to with_data_parallel API
        train_compiled_prog = compiler.CompiledProgram(train_program).with_data_parallel(loss_name=ctc_loss.name,
                                                                                         build_strategy=build_strategy,
                                                                                         exec_strategy=exec_strategy)

        train_reader.set_batch_generator(train_batch_reader)

        train_step = 0
        test_step = 0
        num_batch = -1
        # run train
        for epoch_id in range(num_epoch):
            train_reader.start()
            epoch_loss = []
            time_begin = time.time()
            batch_id = 0
            while True:
                try:
                    fetch_list = [ctc_loss.name, learning_rate.name]
                    if batch_id % 100 == 0:
                        fetch = exe.run(program=train_compiled_prog,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                        each_loss = fetch[0]
                        each_learning_rate = np.array(fetch[1])[0]
                        epoch_loss.extend(np.array(each_loss[0]) / batch_size)

                        print("Train [%s] epoch: [%d/%d], batch: [%d/%d], learning rate: %f, train loss: %f\n" %
                              (datetime.now(), epoch_id, num_epoch, batch_id, num_batch, each_learning_rate,
                               np.mean(each_loss[0]) / batch_size))
                        # record the training loss
                        self.writer.add_scalar('Train loss', np.mean(each_loss[0]) / batch_size, train_step)
                        self.writer.add_scalar('Learning rate', each_learning_rate, train_step)
                        train_step += 1
                    else:
                        _ = exe.run(program=train_compiled_prog,
                                    fetch_list=[],
                                    return_numpy=False)
                    # save the model every 2000 batches
                    if batch_id % 2000 == 0 and batch_id != 0:
                        self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))
                    batch_id = batch_id + 1
                except fluid.core.EOFException:
                    train_reader.reset()
                    break
            num_batch = batch_id
            # save the model once per epoch
            self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))
            used_time = time.time() - time_begin
            if test_off:
                print('======================last Train=====================')
                print("Train time: %f sec, epoch: %d, train loss: %f\n" %
                      (used_time, epoch_id, np.mean(np.array(epoch_loss))))
                print('======================last Train=====================')
            else:
                print('\n======================Begin test=====================')
                # set the path of the temporary model
                self._init_from_pretrained_model = self.save_model_path
                # run evaluation
                test_result = self.test(test_reader=dev_batch_reader)
                print("Train time: %f sec, epoch: %d, train loss: %f, test %s: %f"
                      % (used_time, epoch_id + pre_epoch, np.mean(np.array(epoch_loss)), self.error_rate_type, test_result))
                print('======================Stop Train=====================\n')
                # record the test result
                self.writer.add_scalar('Test %s' % self.error_rate_type, test_result, test_step)
                test_step += 1

        self.save_param(exe, train_program, "step_final")

        print("\n------------Training finished!!!-------------")
Example #18
    def run_trainer(self, args):
        self.lr = args.lr
        if args.nccl2_reduce_layer_local_run:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, single_device=True)
        elif args.use_dgc:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
        else:
            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
                self.get_model(batch_size=args.batch_size)

        if args.update_method == "pserver":
            print_to_err(
                type(self).__name__,
                "begin to run transpile on trainer with pserver mode")
            t = self.get_transpiler(trainer_id=args.trainer_id,
                                    main_program=fluid.default_main_program(),
                                    pserver_endpoints=args.endpoints,
                                    trainers=args.trainers,
                                    sync_mode=args.sync_mode,
                                    dc_asgd=args.dc_asgd,
                                    hogwild_mode=args.hogwild)

            trainer_prog = t.get_trainer_program()
            print_to_err(
                type(self).__name__,
                "get trainer program done with pserver mode.")
        elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            # transpile for nccl2
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            config.nccl_comm_num = args.nccl_comm_num
            if args.use_hallreduce:
                config.use_hierarchical_allreduce = True
                config.hierarchical_allreduce_inter_nranks = args.hallreduce_inter_nranks
            print_to_err(
                type(self).__name__,
                "begin to run transpile on trainer with nccl2 mode")
            nccl2_t = fluid.DistributeTranspiler(config=config)
            nccl2_t.transpile(args.trainer_id,
                              program=fluid.default_main_program(),
                              startup_program=fluid.default_startup_program(),
                              trainers=args.endpoints,
                              current_endpoint=args.current_endpoint)
            print_to_err(
                type(self).__name__,
                "get trainer program done. with nccl2 mode")
            trainer_prog = fluid.default_main_program()
        else:
            print_to_err(
                type(self).__name__,
                "do nothing about main program, just use it")
            trainer_prog = fluid.default_main_program()
            print_to_err(type(self).__name__, "use main program done.")

        # FIXME(gongwb):wait pserver initialization.
        time.sleep(1)

        if args.use_cuda:
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        else:
            place = fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        print_to_err(type(self).__name__, "run worker startup program done.")

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        build_stra = fluid.BuildStrategy()
        # FIXME force disable enable_inplace and memory_optimize
        build_stra.enable_inplace = False
        build_stra.memory_optimize = False

        if args.hogwild:
            build_stra.async_mode = True

        if args.enable_backward_deps:
            build_stra.enable_backward_optimizer_op_deps = True

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        pass_builder = None
        if args.batch_merge_repeat > 1:
            pass_builder = build_stra._finalize_strategy_and_create_passes()
            mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
            mypass.set("num_repeats", args.batch_merge_repeat)

        if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = len(args.endpoints.split(","))
            build_stra.trainer_id = args.trainer_id
        else:
            # case args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = 1
            build_stra.trainer_id = 0

        if args.use_dgc:
            # fuse_all_reduce_ops require that gradients should not be sparse types
            build_stra.fuse_all_reduce_ops = False

        print_to_err(
            type(self).__name__, "begin to compile with data parallel")
        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
            loss_name=avg_cost.name,
            build_strategy=build_stra,
            exec_strategy=exec_strategy)
        print_to_err(
            type(self).__name__, "program compiled with data parallel")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(binary,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        print_to_out(out_losses)
Example #19
data = fluid.data(name="char", shape=[None, 50], dtype="int64", lod_level=0)
#data = fluid.data(name="char", shape=[None, 50], dtype="float32", lod_level=0)
label = fluid.data(name="label", shape=[None, 1], dtype="int64", lod_level=0)
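
# The snippet assumes `place` and `train_reader` are defined elsewhere; a
# plausible setup (an assumption, not part of the original) would be:
#   place = fluid.cuda_places()   # a list of places, which also explains `place[0]` below
#   train_reader = paddle.batch(sample_generator, batch_size=4)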

reader = fluid.io.PyReader(feed_list=[data, label],
                           capacity=40,
                           iterable=True,
                           return_list=False)
reader.decorate_sample_list_generator(train_reader, place)

emb = fluid.embedding(data, size=[10, 64])
prob = fluid.layers.fc(emb, size=2, act='softmax')
#prob = fluid.layers.fc(data, size=2, act='softmax')
ce = fluid.layers.cross_entropy(prob, label)
loss = fluid.layers.mean(ce)

exe = fluid.Executor(place[0])
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe.run(fluid.default_startup_program())
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
compiled_train_prog = compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

for data in reader():
    loss_data = exe.run(compiled_train_prog, feed=data, fetch_list=[loss.name])
    break

print(loss_data)
Example #20
    def train(self,
              train_batch_reader,
              dev_batch_reader,
              feeding_dict,
              learning_rate,
              gradient_clipping,
              num_epoch,
              batch_size,
              num_samples,
              save_epoch=100,
              num_iterations_print=100,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :param learning_rate: Learning rate for ADAM optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_epoch: Number of training epochs.
        :type num_epoch: int
        :param batch_size: Batch size.
        :type batch_size: int
        :param num_samples: Number of training samples.
        :type num_samples: int
        :param save_epoch: Number of epochs between checkpoint saves.
        :type save_epoch: int
        :param num_iterations_print: Number of training iterations between
                                     printings of the training loss.
        :type num_iterations_print: int
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(self._output_model_dir):
            mkpath(self._output_model_dir)

        # adapt the feeding dict according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)

        if isinstance(self._place, fluid.CUDAPlace):
            dev_count = fluid.core.get_cuda_device_count()
        else:
            dev_count = int(os.environ.get('CPU_NUM', 1))

        # prepare the network
        train_program = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, log_probs, ctc_loss = self.create_network()
                # prepare optimizer
                optimizer = fluid.optimizer.AdamOptimizer(
                    learning_rate=fluid.layers.exponential_decay(
                        learning_rate=learning_rate,
                        decay_steps=num_samples / batch_size / dev_count,
                        decay_rate=0.83,
                        staircase=True))
                fluid.clip.set_gradient_clip(
                    clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=gradient_clipping))
                optimizer.minimize(loss=ctc_loss)

        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_reader, _, ctc_loss = self.create_network()

        test_prog = test_prog.clone(for_test=True)

        exe = fluid.Executor(self._place)
        exe.run(startup_prog)

        # init from some pretrain models, to better solve the current task
        pre_epoch = 0
        if self._init_from_pretrained_model:
            pre_epoch = self.init_from_pretrained_model(exe, train_program)

        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()

        # pass the build_strategy to with_data_parallel API
        compiled_prog = compiler.CompiledProgram(
            train_program).with_data_parallel(loss_name=ctc_loss.name,
                                              build_strategy=build_strategy,
                                              exec_strategy=exec_strategy)

        train_reader.set_batch_generator(train_batch_reader)
        test_reader.set_batch_generator(dev_batch_reader)

        # run train
        for epoch_id in range(num_epoch):
            train_reader.start()
            epoch_loss = []
            time_begin = time.time()
            batch_id = 0
            step = 0
            while True:
                try:
                    fetch_list = [ctc_loss.name]

                    if batch_id % num_iterations_print == 0:
                        fetch = exe.run(program=compiled_prog,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                        each_loss = fetch[0]
                        epoch_loss.extend(np.array(each_loss[0]) / batch_size)

                        print("epoch: %d, batch: %d, train loss: %f\n" %
                              (epoch_id, batch_id,
                               np.mean(each_loss[0]) / batch_size))

                    else:
                        each_loss = exe.run(program=compiled_prog,
                                            fetch_list=[],
                                            return_numpy=False)

                    batch_id = batch_id + 1
                except fluid.core.EOFException:
                    train_reader.reset()
                    break
            time_end = time.time()
            used_time = time_end - time_begin
            if test_off:
                print("\n--------Time: %f sec, epoch: %d, train loss: %f\n" %
                      (used_time, epoch_id, np.mean(np.array(epoch_loss))))
            else:
                print('\n----------Begin test...')
                test_loss = self.test(exe,
                                      dev_batch_reader=dev_batch_reader,
                                      test_program=test_prog,
                                      test_reader=test_reader,
                                      fetch_list=[ctc_loss])
                print(
                    "--------Time: %f sec, epoch: %d, train loss: %f, test loss: %f"
                    % (used_time, epoch_id + pre_epoch,
                       np.mean(np.array(epoch_loss)), test_loss / batch_size))
            if (epoch_id + 1) % save_epoch == 0:
                self.save_param(exe, train_program,
                                "epoch_" + str(epoch_id + pre_epoch))

        self.save_param(exe, train_program, "step_final")

        print("\n------------Training finished!!!-------------")
Example #21
    def run_main(self, place, with_data_parallel):
        self.place = place
        self.with_data_parallel = with_data_parallel

        if not core.is_compiled_with_cuda() and isinstance(
                self.place, core.CUDAPlace):
            return

        if isinstance(self.place, core.CUDAPlace):
            device_cnt = core.get_cuda_device_count(
            ) if self.with_data_parallel else 1
        else:
            device_cnt = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count())
            ) if self.with_data_parallel else 1

        d0 = layers.data("d0",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d1 = layers.data("d1",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d2 = layers.data("d2",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)

        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
        j.stop_gradient = True

        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len2.stop_gradient = True
        cond2 = layers.less_than(x=j, y=array_len2)

        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            d = layers.reshape(d, shape=[10])
            prev = layers.reshape(prev, shape=[10])
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)
            with while_op2.block():
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                d2 = layers.reshape(d2, shape=[10])
                prev2 = layers.reshape(prev2, shape=[10])
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                layers.less_than(x=j, y=array_len2, cond=cond2)

        sum_result = layers.array_read(array=mem_array, i=j)
        sum_result.persistable = True
        tmp = layers.unsqueeze(sum_result, axes=[0])
        tmp = layers.expand(tmp, expand_times=[10, 1])
        fc = layers.fc(tmp, size=256)
        loss = layers.mean(sum_result)

        optim = fluid.optimizer.Adam(learning_rate=1e-3)
        optim.minimize(loss)

        exe = Executor(self.place)
        exe.run(fluid.default_startup_program())

        prog = fluid.default_main_program()
        if self.with_data_parallel:
            prog = compiler.CompiledProgram(
                fluid.default_main_program()).with_data_parallel(
                    loss_name=loss.name)

        for _ in range(5):
            d = []
            for i in range(3):
                tmp = numpy.random.random(size=[10]).astype('float32')
                if not self.with_data_parallel:
                    d.append(tmp)
                else:
                    d.append(numpy.array([tmp] * device_cnt))

            outs = exe.run(program=prog,
                           feed={
                               'd0': d[0],
                               'd1': d[1],
                               'd2': d[2]
                           },
                           fetch_list=[sum_result])
            self.assertAlmostEqual(numpy.sum(d),
                                   numpy.sum(outs[0]),
                                   delta=0.01)
Example #22
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
import numpy
import os

place = fluid.CUDAPlace(0)  # fluid.CPUPlace()
exe = fluid.Executor(place)

data = fluid.layers.data(name='X', shape=[1], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
loss = fluid.layers.mean(hidden)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

fluid.default_startup_program().random_seed = 1
exe.run(fluid.default_startup_program())
compiled_prog = compiler.CompiledProgram(fluid.default_main_program())

x = numpy.random.random(size=(10, 1)).astype('float32')
loss_data, = exe.run(compiled_prog, feed={"X": x}, fetch_list=[loss.name])
print("loss: {}".format(loss_data[0]))
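
The same snippet can also be compiled for multi-card execution with with_data_parallel; a hedged sketch reusing the variables defined above:

# Hypothetical multi-card variant of the snippet above; the fed batch is
# split across the available devices.
build_strategy = fluid.BuildStrategy()
parallel_prog = compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
loss_data, = exe.run(parallel_prog, feed={"X": x}, fetch_list=[loss.name])
print("parallel loss: {}".format(loss_data[0]))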
Example #23
    def check_network_convergence(self,
                                  use_cuda=True,
                                  use_mem_opt=False,
                                  iter_num=5):
        prog = Program()
        startup_prog = Program()
        prog.random_seed = 100
        startup_prog.random_seed = 100
        with program_guard(prog, startup_prog):
            image = layers.data(name='x', shape=[784], dtype='float32')

            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
            cond = layers.less_than(x=label, y=limit)
            ie = layers.IfElse(cond)

            with ie.true_block():
                true_image = ie.input(image)
                hidden = layers.fc(input=true_image, size=100, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            with ie.false_block():
                false_image = ie.input(image)
                hidden = layers.fc(input=false_image, size=200, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            prob = ie()
            loss = layers.cross_entropy(input=prob[0], label=label)
            avg_loss = layers.mean(loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(avg_loss, startup_prog)
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=200)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_cuda = use_cuda

            build_strategy = fluid.BuildStrategy()
            build_strategy.memory_optimize = use_mem_opt

            train_cp = compiler.CompiledProgram(fluid.default_main_program())
            train_cp = train_cp.with_data_parallel(
                loss_name=avg_loss.name,
                exec_strategy=exec_strategy,
                build_strategy=build_strategy)
            fetch_list = [avg_loss.name]

            exe.run(startup_prog)
            PASS_NUM = 100
            loop = 0
            ret = []
            for pass_id in range(PASS_NUM):
                for data in train_reader():
                    x_data = np.array([x[0] for x in data]).astype("float32")
                    y_data = np.array([x[1] for x in data]).astype("int64")
                    y_data = y_data.reshape((y_data.shape[0], 1))

                    outs = exe.run(train_cp,
                                   feed={'x': x_data,
                                         'y': y_data},
                                   fetch_list=[avg_loss])

                    loop += 1
                    ret.append(outs[0])
                    if iter_num == loop:
                        return ret
            return ret
Example #24
    def check_network_convergence(self,
                                  network,
                                  use_cuda=True,
                                  memory_opt=True,
                                  use_ir_memory_optimize=True,
                                  enable_inplace=True,
                                  iter=5):
        if use_cuda and not core.is_compiled_with_cuda():
            print(
                'Skip use_cuda=True because Paddle is not compiled with cuda')
            return

        if os.name == 'nt':
            print(
                'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
            )
            return
        fluid.default_startup_program().random_seed = 100
        fluid.default_main_program().random_seed = 100

        data = fluid.layers.data(name="words",
                                 shape=[1],
                                 dtype="int64",
                                 lod_level=1)

        label = fluid.layers.data(name="label", shape=[1], dtype="int64")

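        # network() builds the model on the LoD word-id input and returns the cost to minimize.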
        cost = network(data, label, len(self.word_dict))
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
        optimizer.minimize(cost)
        build_strategy = fluid.BuildStrategy()
        build_strategy.enable_inplace = False
        build_strategy.memory_optimize = False
        if memory_opt:
            # legacy transpiler-based memory optimization on the default main program
            fluid.memory_optimize(fluid.default_main_program())
        else:
            # IR-pass-based path: map each argument onto its matching BuildStrategy flag
            build_strategy.enable_inplace = enable_inplace
            build_strategy.memory_optimize = use_ir_memory_optimize

        # execution
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
        reader = feeder.decorate_reader(self.train_reader, multi_devices=True)
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        train_cp = compiler.CompiledProgram(fluid.default_main_program())
        train_cp = train_cp.with_data_parallel(loss_name=cost.name,
                                               build_strategy=build_strategy)
        fetch_list = [cost.name]

        begin = time.time()
        first_loss, last_loss = None, None
        step_id = 0
        custom_iter = getattr(self, "iter", None)
        if custom_iter is not None:
            iter = custom_iter
        for data in reader():
            ret = exe.run(train_cp, feed=data, fetch_list=fetch_list)
            print(ret)
            step_id += 1
            if step_id == 1:
                first_loss = ret[0]
            if step_id == iter:
                last_loss = ret[0]
                break
        end = time.time()

        print("%.4f Instance per second" % ((self.batch_size * iter) /
                                            (end - begin)))

        print(first_loss, last_loss)
        avg_last_loss_val = np.array(last_loss).mean()
        avg_first_loss_val = np.array(first_loss).mean()
        if math.isnan(float(avg_last_loss_val)) or math.isnan(
                float(avg_first_loss_val)):
            sys.exit("got NaN loss, training failed.")

        return first_loss, last_loss
Example #25
    def check_network_convergence(self,
                                  is_sparse,
                                  build_strategy=None,
                                  use_cuda=True):
        os.environ['CPU_NUM'] = str(4)
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            word = fluid.layers.data(
                name='word_data', shape=[1], dtype='int64', lod_level=1)
            predicate = fluid.layers.data(
                name='verb_data', shape=[1], dtype='int64', lod_level=1)
            ctx_n2 = fluid.layers.data(
                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
            ctx_n1 = fluid.layers.data(
                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
            ctx_0 = fluid.layers.data(
                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
            ctx_p1 = fluid.layers.data(
                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
            ctx_p2 = fluid.layers.data(
                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
            mark = fluid.layers.data(
                name='mark_data', shape=[1], dtype='int64', lod_level=1)

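            # db_lstm(**locals()) picks up the eight input layers defined above;
            # the linear-chain CRF below turns its emission scores into a sequence cost.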
            feature_out = db_lstm(**locals())
            target = fluid.layers.data(
                name='target', shape=[1], dtype='int64', lod_level=1)
            crf_cost = fluid.layers.linear_chain_crf(
                input=feature_out,
                label=target,
                param_attr=fluid.ParamAttr(
                    name='crfw', learning_rate=1e-1))
            avg_cost = fluid.layers.mean(crf_cost)

            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=0.01,
                    decay_steps=100000,
                    decay_rate=0.5,
                    staircase=True))
            sgd_optimizer.minimize(avg_cost)

            train_data = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.conll05.test(), buf_size=8192),
                batch_size=16)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(startup)

            train_cp = compiler.CompiledProgram(main).with_data_parallel(
                loss_name=avg_cost.name, build_strategy=build_strategy)

            feeder = fluid.DataFeeder(
                feed_list=[
                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
                    mark, target
                ],
                place=fluid.CPUPlace())

            data = train_data()
            for i in range(10):
                cur_batch = next(data)
                print(exe.run(train_cp,
                              feed=feeder.feed(cur_batch),
                              fetch_list=[avg_cost.name])[0])
Example #26
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Add distributed operations to minimize ``loss`` by updating ``parameter_list``.

        Args:
            loss (Tensor): A ``Tensor`` containing the value to minimize.
            startup_program (Program, optional): :ref:`api_fluid_Program` for
                initializing parameters in ``parameter_list``. The default value
                is None, at this time :ref:`api_fluid_default_startup_program` will be used.
            parameter_list (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                to be updated. The default value is None.

        Returns:
            tuple: A tuple ``(optimize_ops, params_grads)``: a list of operators
            appended by minimize and a list of ``(param, grad)`` tensor pairs, where
            ``param`` is a ``Parameter`` and ``grad`` is the gradient corresponding
            to that parameter. The returned tuple can be passed to ``fetch_list`` in
            ``Executor.run()`` to indicate program pruning; if so, the program is
            pruned by ``feed`` and ``fetch_list`` before running. See ``Executor`` for details.

        Examples:

            .. code-block:: python

                import paddle
                paddle.enable_static()
                import paddle.distributed.fleet as fleet
                import paddle.nn.functional as F

                hid_dim = 10
                label_dim = 2
                input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32')
                input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64')
                fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh')
                fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh')
                prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax')
                cost = F.cross_entropy(input=prediction, label=input_y)
                avg_cost = paddle.mean(x=cost)

                fleet.init(is_collective=True)
                strategy = fleet.DistributedStrategy()
                optimizer = paddle.optimizer.SGD(learning_rate=0.001)
                optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
                optimizer.minimize(avg_cost)

                # for more examples, please reference https://github.com/PaddlePaddle/FleetX

        """
        context = {}
        context["user_defined_strategy"] = copy.deepcopy(
            self._user_defined_strategy)
        if paddle.fluid.framework.in_dygraph_mode():
            # imitate target optimizer retrieval
            target_opt = self.user_defined_optimizer
            self._context = context
            return target_opt.minimize(loss)

        # cache original feed forward program
        self.origin_main_program = loss.block.program
        context["origin_main_program"] = self.origin_main_program
        context["loss"] = loss
        if startup_program is None:
            self.origin_startup_program = \
                paddle.static.default_startup_program().clone(for_test=False)
            startup_program = paddle.static.default_startup_program()
        else:
            self.origin_startup_program = \
                startup_program.clone(for_test=False)

        context["origin_startup_program"] = startup_program
        context["role_maker"] = self._role_maker

        # compile time
        distributed_optimizer_list = \
            MetaOptimizerFactory()._get_valid_meta_optimizers(
                self.user_defined_optimizer)

        context["user_defined_strategy"] = copy.deepcopy(
            self._user_defined_strategy)
        copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)

        # auto-parallel is triggered only under a very strict condition, e.g.:
        # strategy = DistributedStrategy()
        # strategy.auto = True
        # optimizer = paddle.optimizer.SGD(learning_rate=0.1)
        # optimizer = fleet.distributed_optimizer(optimizer, strategy)
        if copy_user_defined_strategy._is_strict_auto():
            # turn on all the strategy for each optimizer
            for opt in distributed_optimizer_list:
                opt._enable_strategy(copy_user_defined_strategy, context)

        valid_optimizer_list = []
        valid_graph_optimizer_list = []
        can_not_apply_optimizer_list = []
        # recall meta optimizers for ranking
        for opt in distributed_optimizer_list:
            opt._set_basic_info(loss, self._role_maker,
                                self.user_defined_optimizer,
                                copy_user_defined_strategy)
            if opt._can_apply() and not opt._is_graph_out():
                valid_optimizer_list.append(opt)
            elif opt._can_apply() and opt._is_graph_out():
                valid_graph_optimizer_list.append(opt)
            else:
                can_not_apply_optimizer_list.append(opt)
        # combine recalled meta optimizers to be a valid meta optimizer
        meta_optimizer, graph_optimizer = \
            self.strategy_compiler.generate_optimizer(
                loss, self._role_maker, self.user_defined_optimizer,
                copy_user_defined_strategy, valid_optimizer_list,
                valid_graph_optimizer_list)

        valid_strategy = self.strategy_compiler._get_valid_strategy(
            copy_user_defined_strategy, can_not_apply_optimizer_list)

        context["valid_strategy"] = copy.deepcopy(valid_strategy)

        applied_meta_list = self.strategy_compiler._get_applied_meta_list()
        applied_graph_list = self.strategy_compiler._get_applied_graph_list()

        context['applied_meta_list'] = applied_meta_list
        context['applied_graph_list'] = applied_graph_list

        self._context = context

        self.valid_strategy = valid_strategy
        self.valid_strategy._enable_env()

        optimize_ops = []
        params_grads = []

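        # Non-distributed, non-collective case: compile the original program for
        # data parallelism and fall back to the user-defined optimizer directly.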
        if self._role_maker._is_non_distributed() and not self._is_collective:
            if self._runtime_handle is None:
                self._runtime_handle = RuntimeFactory()._create_runtime(context)

            compiled_program = compiler.CompiledProgram(
                self.origin_main_program).with_data_parallel(
                    loss_name=loss.name, share_vars_from=None)
            loss.block.program._graph = compiled_program
            return self.user_defined_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)

        if meta_optimizer:
            optimize_ops, params_grads = meta_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)

            default_program = paddle.static.default_main_program()

            if id(default_program) != id(loss.block.program):
                paddle.fluid.framework.switch_main_program(loss.block.program)

        else:
            optimize_ops, params_grads = self.user_defined_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)

        context["program_optimize_ops"] = optimize_ops
        context["program_params_grads"] = params_grads

        if graph_optimizer:
            optimize_ops, params_grads = graph_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
            # Graph optimizers work on the compiled graph rather than the program,
            # so when one takes effect, optimize_ops and params_grads are usually
            # None and users can no longer modify the computation graph.
            context["graph_optimize_ops"] = optimize_ops
            context["graph_optimize_grads"] = params_grads

        if self._runtime_handle is None:
            self._runtime_handle = RuntimeFactory()._create_runtime(context)

        import paddle.distributed.fleet as fleet
        fleet.util._set_strategy(context["valid_strategy"])

        return optimize_ops, params_grads
Example #27
    def _compare(self, place, layout, only_forward):
        """Compare results."""
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        scope = core.Scope()
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
        data = create_or_get_tensor(scope, "input",
                                    OpTest.np_dtype_to_fluid_dtype(data),
                                    place)

        # Single-GPU, N = 32 per GPU
        main, startup, outs = self._build_program(place, layout, seed, False,
                                                  only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD',
                'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        bn_fetches = exe.run(program=main,
                             feed={'input': data},
                             fetch_list=fetch_names)

        #####################################################################
        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
        assert core.get_cuda_device_count() > 1
        main, startup, outs = self._build_program(place, layout, seed, True,
                                                  only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD',
                'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
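        # Mark all fetched variables persistable so they are not freed before
        # they can be fetched from the multi-device (compiled) program.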
        for nm in fetch_names:
            fv = fluid.framework._get_var(str(nm), program=main)
            fv.persistable = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = True
        build_strategy.enable_inplace = False
        build_strategy.memory_optimize = False
        comp_prog = compiler.CompiledProgram(main).with_data_parallel(
            outs[0].name if not only_forward else None,
            build_strategy=build_strategy)
        sync_bn_fetches = exe.run(program=comp_prog,
                                  feed={'input': data},
                                  fetch_list=fetch_names)

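        # Compare each fetched tensor from the plain-BN run against the sync-BN
        # run; sync-BN outputs are truncated where the multi-device shape differs.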
        for i in six.moves.xrange(1, len(sync_bn_fetches)):
            bn_val = bn_fetches[i]
            sync_bn_val = sync_bn_fetches[i]
            if sync_bn_val.shape != bn_val.shape:
                sync_bn_val = sync_bn_val[:bn_val.shape[0]]
            self.assertTrue(
                np.allclose(bn_val, sync_bn_val, atol=self.atol),
                "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN     " +
                str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
Example #28
    def main(self,
             use_cuda=True,
             use_parallel_executor=False,
             use_double_buffer=False,
             use_feed_list=False,
             use_decorate_paddle_reader=False):
        assert not use_cuda or use_cuda and core.is_compiled_with_cuda()

        self.use_cuda = use_cuda
        self.use_parallel_executor = use_parallel_executor
        self.use_double_buffer = use_double_buffer
        self.use_feed_list = use_feed_list
        self.use_decorate_paddle_reader = use_decorate_paddle_reader

        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            in_data, label, loss, optimizer, feed_queue, py_reader = simple_fc_net(
                in_size=self.in_size,
                class_num=self.class_num,
                hidden_sizes=self.hidden_sizes,
                batch_size=self.batch_size,
                queue_capacity=self.queue_capacity,
                use_double_buffer=self.use_double_buffer,
                use_feed_list=self.use_feed_list)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            exe = fluid.Executor(place)
            exe.run(startup_program)

            train_cp = main_program
            if use_parallel_executor:
                train_cp = compiler.CompiledProgram(
                    main_program).with_data_parallel(loss_name=loss.name)
                if use_cuda:
                    self.batch_size_times = core.get_cuda_device_count()
                else:
                    self.batch_size_times = int(
                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            else:
                self.batch_size_times = 1

            reader = self.tensor_reader(use_decorate_paddle_reader)
            batch_reader = paddle.batch(reader, batch_size=self.batch_size)

            self.inputs = []
            self.outputs = []

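            # Two feeding modes: either decorate the py_reader with the batch
            # reader, or push batches into the queue from a background thread.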
            if use_decorate_paddle_reader:
                if use_feed_list:
                    py_reader.decorate_paddle_reader(batch_reader)
                else:
                    py_reader.decorate_sample_list_generator(batch_reader)
                py_reader.start()
            else:
                thread = threading.Thread(target=feed_data,
                                          args=(feed_queue, batch_reader))
                thread.daemon = True
                thread.start()

            try:
                while True:
                    fetches = exe.run(train_cp,
                                      fetch_list=[in_data.name, label.name])
                    fetches = [as_numpy(fetch) for fetch in fetches]
                    self.outputs.append(fetches)
            except fluid.core.EOFException:
                pass

            feed_queue.close()
            self.validate()
            if use_decorate_paddle_reader:
                py_reader.exited = True
                py_reader.thread.join()
            else:
                thread.join()
    def check_network_convergence(cls,
                                  method,
                                  use_cuda=True,
                                  iter=5,
                                  batch_size=None,
                                  feed_dict=None,
                                  feed_data_reader=None,
                                  get_data_from_feeder=None,
                                  use_parallel_executor=True,
                                  use_reduce=False,
                                  use_ir_memory_optimize=True,
                                  enable_inplace=True,
                                  fuse_elewise_add_act_ops=False,
                                  fuse_all_optimizer_ops=False,
                                  fuse_all_reduce_ops=False,
                                  fuse_relu_depthwise_conv=False,
                                  optimizer=fluid.optimizer.Adam,
                                  use_fast_executor=False,
                                  enable_sequential_execution=False):
        def run_executor(exe, binary, feed, fetch_list):
            if feed_data_reader is None:
                res = exe.run(binary, feed=feed, fetch_list=fetch_list)
            else:
                res = exe.run(binary,
                              feed=feed_data_reader.get_next(exe, binary),
                              fetch_list=fetch_list)
            return res

        if feed_data_reader is not None:
            assert isinstance(
                feed_data_reader, FeedDataReader
            ), "feed_data_reader must be type of FeedDataReader"

        paddle.manual_seed(1)
        paddle.framework.random._manual_program_seed(1)
        main = fluid.Program()
        startup = fluid.Program()

        with fluid.program_guard(main, startup):
            feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                              main, method, optimizer)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)

        build_strategy, exec_strategy = cls.set_strategy(
            enable_inplace, enable_sequential_execution,
            fuse_all_optimizer_ops, fuse_all_reduce_ops,
            fuse_elewise_add_act_ops, fuse_relu_depthwise_conv,
            use_fast_executor, use_ir_memory_optimize, use_reduce, use_cuda)

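        # Either compile the program for multi-device execution or run the plain
        # program on a single device, depending on use_parallel_executor.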
        if use_parallel_executor:
            binary = compiler.CompiledProgram(main).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
        else:
            binary = main

        if batch_size is not None:
            batch_size *= fluid.core.get_cuda_device_count() if use_cuda else \
                int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

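        # Fetch the loss once before and once after `iter` timed runs to measure
        # both convergence and throughput.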
        begin = time.time()
        first_loss, = run_executor(exe=exe,
                                   binary=binary,
                                   feed=feed_dict,
                                   fetch_list=[loss.name])
        for _ in range(iter):
            run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
        last_loss, = run_executor(exe=exe,
                                  binary=binary,
                                  feed=feed_dict,
                                  fetch_list=[loss.name])
        end = time.time()

        if batch_size is not None:
            print("%.4f Instance per second" % ((batch_size * iter + 2) /
                                                (end - begin)))

        avg_last_loss_val = np.array(last_loss).mean()
        avg_first_loss_val = np.array(first_loss).mean()
        if math.isnan(float(avg_last_loss_val)) or math.isnan(
                float(avg_first_loss_val)):
            sys.exit("got NaN loss, training failed.")

        print(first_loss, last_loss)
        # self.assertGreater(first_loss[0], last_loss[0])
        return first_loss, last_loss
Example #30
    def _try_to_compile(self, startup_program, main_program, loss):
        dist_strategy = self.user_defined_strategy
        local_build_strategy = dist_strategy.build_strategy

        local_build_strategy.use_hierarchical_allreduce = \
            dist_strategy.use_hierarchical_allreduce
        local_build_strategy.hierarchical_allreduce_inter_nranks = \
            dist_strategy.hierarchical_allreduce_inter_nranks
        local_build_strategy.sync_batch_norm = \
            dist_strategy.sync_batch_norm
        local_build_strategy.fuse_all_reduce_ops = \
            dist_strategy.fuse_all_reduce_ops
        local_build_strategy.nccl_comm_num = \
            dist_strategy.nccl_comm_num

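        # Map the user-facing gradient scale option ('avg'/'sum'/'customized')
        # onto the corresponding BuildStrategy.GradientScaleStrategy value.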
        gradient_scale_configs = self.user_defined_strategy.gradient_scale_configs
        scale_strategies = {
            'avg': BuildStrategy.GradientScaleStrategy.CoeffNumDevice,
            'sum': BuildStrategy.GradientScaleStrategy.One,
            'customized': BuildStrategy.GradientScaleStrategy.Customized,
        }
        assert gradient_scale_configs['scale_strategy'] in scale_strategies, \
            "gradient_scale_configs.scale_strategy must be 'avg', 'sum' or 'customized'"
        local_build_strategy.gradient_scale_strategy = \
            scale_strategies[gradient_scale_configs['scale_strategy']]

        if self.user_defined_strategy.recompute:
            logging.warn(
                "set enable_sequential_execution=True since you have enabled the recompute strategy"
            )
            local_build_strategy.enable_sequential_execution = True

        exe_strategy = self.user_defined_strategy.execution_strategy
        worker_num = self.role_maker._worker_num()
        node_num = self.role_maker._node_num()

        if self.role_maker._is_collective:
            assert worker_num >= 1, \
                "nccl2 worker_num must >= 1, now:{}".format(worker_num)

        if worker_num <= 1:
            # local mode
            if local_build_strategy.nccl_comm_num > 1:
                logging.warn("set nccl_comm_num=1 since you only have 1 node.")
            local_build_strategy.nccl_comm_num = 1

        if node_num <= 1:
            if local_build_strategy.use_hierarchical_allreduce:
                logging.warn(
                    "set hierachical_allreduce=False since you only have 1 node."
                )
            local_build_strategy.use_hierarchical_allreduce = False

        sync_allreduce = dist_strategy.sync_nccl_allreduce
        if sync_allreduce:
            exe_strategy.num_threads = max(
                local_build_strategy.nccl_comm_num + 1,
                exe_strategy.num_threads)
            if local_build_strategy.nccl_comm_num > 1:
                logging.warn(
                    "nccl_comm_num > 1, you may need to set sync_nccl_allreduce=False to ensure that different nccl comms can overlap"
                )

        sync_batch_norm = local_build_strategy.sync_batch_norm
        if sync_batch_norm:
            local_build_strategy.nccl_comm_num = 1
            local_build_strategy.use_hierarchical_allreduce = False
            exe_strategy.num_threads = 1
            logging.warn(
                "use sync_batch_norm will hang when set num_threads > 1, so "
                "set num_threads=1, nccl_comm_num=1, hierachical_allreduce=False."
            )

        # NOTE. compatible with compiler, otherwise these values will be overwritten by compiler
        main_program._nccl_comm_num = local_build_strategy.nccl_comm_num
        main_program._use_hierarchical_allreduce = local_build_strategy.use_hierarchical_allreduce
        main_program._hierarchical_allreduce_inter_nranks = local_build_strategy.hierarchical_allreduce_inter_nranks

        # TODO(guru4elephant): should be an independent optimizer
        if worker_num > 1:
            self._setup_nccl_op(startup_program, main_program,
                                local_build_strategy)

        local_build_strategy.num_trainers = self.role_maker._worker_num()
        local_build_strategy.trainer_id = self.role_maker._worker_index()
        local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints(
        )
        local_build_strategy.enable_backward_optimizer_op_deps = True

        self._compiled_program = compiler.CompiledProgram(main_program)

        self._compiled_program.with_data_parallel(
            loss_name=loss.name,
            build_strategy=local_build_strategy,
            exec_strategy=exe_strategy,
            share_vars_from=None)

        return self._compiled_program