def run_trainer_with_spawn_func(self, args):
    # 1. enable dygraph
    paddle.disable_static()

    # 2. init seed
    seed = 90
    paddle.static.default_startup_program().random_seed = seed
    paddle.static.default_main_program().random_seed = seed
    np.random.seed(seed)
    random.seed(seed)
    # get trainer id
    args.trainer_id = paddle.distributed.get_rank()

    # 3. init parallel env
    if args.update_method in ["nccl2", "gloo"]:
        paddle.distributed.init_parallel_env()

    # 4. train model
    model, train_reader, opt = self.get_model()
    if args.update_method in ["nccl2", "gloo"]:
        model = paddle.DataParallel(
            model, find_unused_parameters=args.find_unused_parameters)

    out_losses = self.model_train(args, model, opt, train_reader)
    print_to_out(out_losses)
    return out_losses
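# ---------------------------------------------------------------------------
# Hedged sketch (not part of the harness): what the get_model()/run_one_loop()
# contract assumed by the runners above and below might look like. The class,
# model, and reader here are hypothetical illustrations; only the contract
# itself (return a dygraph model, a callable batch reader, and an optimizer;
# run one forward pass and return a loss) is taken from the surrounding code.
class _ExampleRunner:
    def get_model(self):
        model = paddle.nn.Linear(10, 1)
        opt = paddle.optimizer.SGD(
            learning_rate=0.001, parameters=model.parameters())

        def train_reader():
            # tiny synthetic regression batches
            for _ in range(RUN_STEP):
                x = np.random.random((4, 10)).astype("float32")
                y = np.random.random((4, 1)).astype("float32")
                yield x, y

        return model, train_reader, opt

    def run_one_loop(self, model, opt, data):
        x, y = data
        pred = model(paddle.to_tensor(x))
        return paddle.nn.functional.mse_loss(pred, paddle.to_tensor(y))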
def run_trainer_func(self, args):
    seed = 90
    if fluid.core.is_compiled_with_cuda():
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)
    else:
        # a bare `assert ("...")` always passes; raise so `place` is never left unset
        raise AssertionError("Only support CUDAPlace for now.")

    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)

        model, train_reader, opt = self.get_model()

        if args.update_method == "nccl2":
            dist.init_parallel_env()
            print_to_err(
                type(self).__name__,
                "begin to prepare context in dygraph with nccl2")
            model = paddle.DataParallel(
                model, find_unused_parameters=args.find_unused_parameters)
            print_to_err(type(self).__name__, "model built in dygraph")

        out_losses = self.model_train(args, model, opt, train_reader)
        print_to_out(out_losses)
        return out_losses
def run_trainer(self, args):
    seed = 90
    if fluid.core.is_compiled_with_cuda():
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)
    else:
        # a bare `assert ("...")` always passes; raise so `place` is never left unset
        raise AssertionError("Only support CUDAPlace for now.")

    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)

        model, train_reader, opt = self.get_model()

        if args.update_method == "nccl2":
            dist.init_parallel_env()
            print_to_err(
                type(self).__name__,
                "begin to prepare context in dygraph with nccl2")
            model = paddle.DataParallel(
                model, find_unused_parameters=args.find_unused_parameters)
            print_to_err(type(self).__name__, "model built in dygraph")

        out_losses = []
        print_to_err(type(self).__name__, "begin to run dygraph training")
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            if step_id % 3 != 0:
                # accumulation step: skip gradient allreduce under no_sync
                if args.update_method == "nccl2":
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                # synchronization step: allreduce, update and (optionally) clear grads
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                print_to_err(
                    type(self).__name__,
                    "loss at step %d: %f" % (step_id, loss.numpy()))
                out_losses.append(loss.numpy())
                if not args.accumulate_gradient:
                    model.clear_gradients()
        print_to_out(out_losses)
def run_trainer_with_spawn(self, args):
    # 1. enable dygraph
    paddle.disable_static()

    # 2. init seed
    seed = 90
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed
    np.random.seed(seed)
    random.seed(seed)
    # get trainer id
    args.trainer_id = dist.get_rank()

    # 3. init parallel env
    if args.update_method == "nccl2":
        dist.init_parallel_env()

    # 4. train model
    model, train_reader, opt = self.get_model()
    if args.update_method == "nccl2":
        model = paddle.DataParallel(
            model, find_unused_parameters=args.find_unused_parameters)

    out_losses = []
    for step_id, data in enumerate(train_reader()):
        data = self._get_data(data, args)
        if step_id == RUN_STEP:
            break
        if step_id % 3 != 0:
            # accumulation step: skip gradient allreduce under no_sync
            if args.update_method == "nccl2":
                with model.no_sync():
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
        else:
            # synchronization step: allreduce, update and clear grads
            loss = self.run_one_loop(model, opt, data)
            loss.backward()
            opt.minimize(loss)
            print_to_err(
                type(self).__name__,
                "loss at step %d: %f" % (step_id, loss.numpy()))
            out_losses.append(loss.numpy())
            model.clear_gradients()
    print_to_out(out_losses)
    return out_losses
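# ---------------------------------------------------------------------------
# Hedged usage sketch: paddle.distributed.spawn is the usual way to launch a
# *_with_spawn entry point, starting one worker process per device. The demo
# trainer below only mirrors the structure of run_trainer_with_spawn on a
# hypothetical model; it is not the harness's trainer.
def _spawn_demo_trainer():
    paddle.distributed.init_parallel_env()
    model = paddle.DataParallel(paddle.nn.Linear(10, 1))
    opt = paddle.optimizer.SGD(
        learning_rate=0.001, parameters=model.parameters())
    x = paddle.randn([4, 10])
    loss = paddle.mean(model(x))
    loss.backward()
    opt.step()
    opt.clear_grad()
    return float(loss)


if __name__ == "__main__":
    # two worker processes; each runs _spawn_demo_trainer once
    paddle.distributed.spawn(_spawn_demo_trainer, nprocs=2)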