    def run_trainer_with_spawn_func(self, args):
        # 1. enable dygraph
        paddle.disable_static()

        # 2. init seed
        seed = 90
        paddle.static.default_startup_program().random_seed = seed
        paddle.static.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        # get trainer id
        args.trainer_id = paddle.distributed.get_rank()

        # 3. init parallel env
        if args.update_method in ["nccl2", "gloo"]:
            paddle.distributed.init_parallel_env()

        # 4. train model
        model, train_reader, opt = self.get_model()
        if args.update_method in ["nccl2", "gloo"]:
            model = paddle.DataParallel(
                model, find_unused_parameters=args.find_unused_parameters)

        out_losses = self.model_train(args, model, opt, train_reader)
        print_to_out(out_losses)
        return out_losses
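
    # Hedged usage sketch (not part of this snippet): a spawn-style runner like
    # the one above is typically handed to `paddle.distributed.spawn`, which
    # starts one worker process per device and calls the entry function with
    # the parsed args in each process, roughly:
    #
    #     trainer = TrainerCls()  # illustrative concrete subclass providing
    #                             # get_model/model_train; not defined here
    #     paddle.distributed.spawn(trainer.run_trainer_with_spawn_func,
    #                              args=(train_args,), nprocs=2)
    #
    # `TrainerCls`, `train_args`, and `nprocs=2` are placeholders, not names
    # taken from this file.
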
    def run_trainer_func(self, args):
        if fluid.core.is_compiled_with_cuda():
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        else:
            assert ("Only support CUDAPlace for now.")

        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            np.random.seed(seed)
            random.seed(seed)
            model, train_reader, opt = self.get_model()

            if args.update_method == "nccl2":
                dist.init_parallel_env()
                print_to_err(
                    type(self).__name__,
                    "begin to prepare context in dygraph with nccl2")
                model = paddle.DataParallel(
                    model, find_unused_parameters=args.find_unused_parameters)
            print_to_err(type(self).__name__, "model built in dygraph")
            out_losses = self.model_train(args, model, opt, train_reader)
            print_to_out(out_losses)
            return out_losses
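
    # Hedged sketch of the `model_train` helper called by the two runners
    # above. Its body is assumed to mirror the inline loop of `run_trainer`
    # below: gradients are accumulated locally (no allreduce) on two out of
    # every three steps via `no_sync`, and only every third step applies the
    # optimizer update and records the loss. Treat this as an illustration,
    # not the exact implementation from the test harness.
    def model_train(self, args, model, opt, train_reader):
        out_losses = []
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            if step_id % 3 != 0:
                if args.update_method in ["nccl2", "gloo"]:
                    # skip gradient synchronization; accumulate locally
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                # synchronizing step: allreduce, optimizer update, record loss
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                out_losses.append(loss.numpy())
                if not args.accumulate_gradient:
                    model.clear_gradients()
        return out_losses
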
    def run_trainer(self, args):
        if fluid.core.is_compiled_with_cuda():
            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
            place = fluid.CUDAPlace(device_id)
        else:
            assert ("Only support CUDAPlace for now.")

        with fluid.dygraph.guard(place):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            np.random.seed(seed)
            random.seed(seed)
            model, train_reader, opt = self.get_model()

            if args.update_method == "nccl2":
                dist.init_parallel_env()
                print_to_err(
                    type(self).__name__,
                    "begin to prepare context in dygraph with nccl2")
                model = paddle.DataParallel(
                    model, find_unused_parameters=args.find_unused_parameters)
                print_to_err(type(self).__name__, "model built in dygraph")
            out_losses = []
            print_to_err(type(self).__name__, "begin to run dygraph training")
            for step_id, data in enumerate(train_reader()):
                data = self._get_data(data, args)
                if step_id == RUN_STEP:
                    break
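                # For two out of every three steps, gradient synchronization
                # is skipped: under DataParallel, `no_sync()` suppresses the
                # allreduce so gradients accumulate locally. Only every third
                # step runs the optimizer and records the loss.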
                if step_id % 3 != 0:
                    if args.update_method == "nccl2":
                        with model.no_sync():
                            loss = self.run_one_loop(model, opt, data)
                            loss.backward()
                    else:
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
                    opt.minimize(loss)
                    print_to_err(
                        type(self).__name__,
                        "loss at step %d: %f" % (step_id, loss.numpy()))
                    out_losses.append(loss.numpy())

                    if not args.accumulate_gradient:
                        model.clear_gradients()
        print_to_out(out_losses)
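
    # Hedged sketch, not from this file: `run_one_loop` and `_get_data` are
    # normally supplied by each concrete test subclass. For a toy classifier,
    # `run_one_loop` might look roughly like this (tensor shapes and the loss
    # choice are illustrative assumptions):
    def run_one_loop(self, model, opt, data):
        img, label = data
        img = paddle.to_tensor(img)
        label = paddle.to_tensor(label)
        out = model(img)
        # mean cross-entropy over the batch as the scalar training loss
        loss = paddle.nn.functional.cross_entropy(out, label)
        return loss
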
Example #4
    def run_trainer_with_spawn(self, args):
        paddle.disable_static()
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        args.trainer_id = dist.get_rank()

        if args.update_method == "nccl2":
            dist.init_parallel_env()
        model, train_reader, opt = self.get_model()
        if args.update_method == "nccl2":
            model = paddle.DataParallel(
                model, find_unused_parameters=args.find_unused_parameters)

        out_losses = []
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            if step_id % 3 != 0:
                if args.update_method == "nccl2":
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                print_to_err(
                    type(self).__name__,
                    "loss at step %d: %f" % (step_id, loss.numpy()))
                out_losses.append(loss.numpy())
                model.clear_gradients()
        print_to_out(out_losses)
        return out_losses
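
    # Hedged sketch, not from this file: `get_model` is likewise defined by the
    # concrete test subclasses and is expected to return
    # (model, train_reader, optimizer). A minimal stand-in might be:
    def get_model(self):
        model = paddle.nn.Linear(10, 2)  # illustrative toy model
        opt = paddle.optimizer.SGD(learning_rate=0.001,
                                   parameters=model.parameters())

        def train_reader():
            # yields (features, labels) numpy batches; shapes are assumptions
            for _ in range(RUN_STEP):
                yield (np.random.random([4, 10]).astype("float32"),
                       np.random.randint(0, 2, size=[4, 1]).astype("int64"))

        return model, train_reader, opt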