def _transpile(self, startup_program, main_program):
    """
    Transpile the programs into distributed programs and add the
    variables needed for distributed training.
    """
    worker_endpoints = fleet.worker_endpoints()
    trainer_id = fleet.worker_index()
    current_endpoint = worker_endpoints[trainer_id]
    worker_endpoints_env = ','.join(worker_endpoints)
    trainers_num = fleet.worker_num()
    if self.print_config:
        print("worker_endpoints:{} trainers_num:{} "
              "current_endpoint:{} trainer_id:{}".format(
                  worker_endpoints, trainers_num, current_endpoint,
                  trainer_id))

    # Configure and call the transpiler.
    config = dist_transpiler.DistributeTranspilerConfig()
    config.mode = self._strategy.mode
    config.collective_mode = self._strategy.collective_mode
    config.nccl_comm_num = self._strategy.nccl_comm_num
    config.use_hierarchical_allreduce = \
        self._strategy.use_hierarchical_allreduce
    config.hierarchical_allreduce_inter_nranks = \
        self._strategy.hierarchical_allreduce_inter_nranks

    t = dist_transpiler.DistributeTranspiler(config=config)
    t.transpile(trainer_id=trainer_id,
                trainers=worker_endpoints_env,
                startup_program=startup_program,
                program=main_program,
                current_endpoint=current_endpoint)
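# A minimal, hypothetical driver for the collective path above. The
# endpoint list and trainer id are hard-coded for illustration; with
# fleet they come from the role maker (PADDLE_TRAINER_ENDPOINTS and
# PADDLE_TRAINER_ID), and the strategy fields mirror the config set here.
import paddle.fluid as fluid
from paddle.fluid.transpiler import (DistributeTranspiler,
                                     DistributeTranspilerConfig)

worker_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]  # assumed 2 trainers
trainer_id = 0  # assumed rank of this trainer

config = DistributeTranspilerConfig()
config.mode = "collective"
config.collective_mode = "grad_allreduce"

t = DistributeTranspiler(config=config)
t.transpile(trainer_id=trainer_id,
            trainers=','.join(worker_endpoints),
            startup_program=fluid.default_startup_program(),
            program=fluid.default_main_program(),
            current_endpoint=worker_endpoints[trainer_id])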
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None

    # The port of all pservers, needed by both trainer and pserver.
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # Comma-separated IPs of all pservers, needed by trainer and pserver.
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    # Total number of workers/trainers in the job, needed by trainer
    # and pserver.
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # The IP of the local machine, needed by pserver only.
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    # The role, should be either PSERVER or TRAINER.
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

    config = distribute_transpiler.DistributeTranspilerConfig()
    config.slice_var_up = not args.no_split_var
    t = distribute_transpiler.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        # NOTE: *MUST* use train_prog, for we are using with guard to
        # generate different programs for train and test.
        program=train_prog,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=not args.async_mode,
        startup_program=startup_prog)

    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either '
            'TRAINER or PSERVER')
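# Hypothetical invocation of dist_transpile(); the environment values
# below are examples only and would normally be exported by the cluster
# launcher. `args`, `train_prog`, and `startup_prog` are assumed to come
# from the surrounding training script.
import os

os.environ["PADDLE_PSERVER_PORT"] = "6174"
os.environ["PADDLE_PSERVER_IPS"] = "192.168.0.2,192.168.0.3"
os.environ["PADDLE_TRAINERS"] = "2"
os.environ["PADDLE_CURRENT_IP"] = "192.168.0.2"
os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"

# On a trainer this returns (trainer_program, startup_prog); on a
# pserver, (pserver_program, pserver_startup_program).
main_prog, start_prog = dist_transpile(
    trainer_id=0, args=args, train_prog=train_prog, startup_prog=startup_prog)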
def minimize(self,
             loss,
             startup_program=None,
             parameter_list=None,
             no_grad_set=None):
    """
    Minimize a program through loss.

    Args:
        loss (Variable|Variable List): loss variable or loss variable
            list to run optimization.
        startup_program (Program): startup_program for initializing
            parameters in `parameter_list`.
        parameter_list (list): list of Variables to update.
        no_grad_set (set|None): set of Variables that should be ignored.

    Returns:
        tuple: (optimize_ops, params_grads), the list of operators
        appended and the list of (param, grad) Variable pairs for
        optimization.

    Note that in parameter server mode, a worker will not get anything
    about optimize_ops because the optimizer algorithms run on the
    pserver side. We will make this usable in the pserver process, but
    currently the optimization part is written into Fleet(). A user
    does not need to care about how to start up a pserver node.
    """
    optimize_ops, param_grads = self._optimizer.minimize(
        loss, startup_program, parameter_list, no_grad_set)

    worker_endpoints = fleet.worker_endpoints()
    trainer_id = fleet.worker_index()
    current_endpoint = worker_endpoints[trainer_id]
    startup_program = startup_program if startup_program else \
        fluid.framework.default_startup_program()

    # Configure and call the transpiler.
    config = dist_transpiler.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = dist_transpiler.DistributeTranspiler(config=config)
    t.transpile(
        trainer_id,
        trainers=','.join(worker_endpoints),
        startup_program=startup_program,
        current_endpoint=current_endpoint)

    return optimize_ops, param_grads
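# Sketch of the intended call pattern: fleet wraps a regular optimizer
# and exposes this minimize() on top of it. Assumes a collective role
# maker and that `loss` was built by the user's model beforehand.
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.incubate.fleet.collective import fleet

fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer)  # wraps self._optimizer
optimize_ops, params_grads = optimizer.minimize(loss)  # loss from user model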
def test(self):
    self._check()
    trainer_id = self.trainer_id
    num_trainers = self.num_trainers

    # If the test program is not built, which means this is the first
    # time the test method is called, first build the test program and
    # add ops to broadcast bn-related parameters from trainer 0 to
    # other trainers for distributed tests.
    if not self.test_initialized:
        emb, loss, _, _, _ = self.build_program(False,
                                                self.num_trainers > 1)
        emb_name = emb.name
        assert self._get_info('emb_name') is None
        self._set_info('emb_name', emb.name)

        if num_trainers > 1 and self.has_run_train:
            self._append_broadcast_ops(self.test_program)

        if num_trainers > 1 and not self.has_run_train:
            worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
            current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")

            config = dist_transpiler.DistributeTranspilerConfig()
            config.mode = "collective"
            config.collective_mode = "grad_allreduce"
            t = dist_transpiler.DistributeTranspiler(config=config)
            t.transpile(trainer_id=trainer_id,
                        trainers=worker_endpoints,
                        startup_program=self.startup_program,
                        program=self.test_program,
                        current_endpoint=current_endpoint)
    else:
        emb_name = self._get_info('emb_name')

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = fluid.CUDAPlace(gpu_id)
    exe = fluid.Executor(place)
    if not self.has_run_train:
        exe.run(self.startup_program)

    if not self.test_reader:
        test_reader = reader.test
    else:
        test_reader = self.test_reader
    if not self.test_initialized:
        test_list, test_name_list = test_reader(self.dataset_dir,
                                                self.val_targets)
        assert self._get_info('test_list') is None
        assert self._get_info('test_name_list') is None
        self._set_info('test_list', test_list)
        self._set_info('test_name_list', test_name_list)
    else:
        test_list = self._get_info('test_list')
        test_name_list = self._get_info('test_name_list')

    test_program = self.test_program
    if not self.has_run_train:
        assert self.checkpoint_dir, "No checkpoint found for test."
        self.load_checkpoint(executor=exe,
                             main_program=test_program,
                             load_for_train=False)

    feeder = fluid.DataFeeder(place=place,
                              feed_list=['image', 'label'],
                              program=test_program)
    fetch_list = [emb_name]

    self.test_initialized = True

    test_start = time.time()
    self._run_test(exe, test_list, test_name_list, feeder, fetch_list)
    test_end = time.time()
    logger.info("test time: {:.4f}".format(test_end - test_start))
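# Illustrative single-trainer invocation of the method above. The
# `Entry` wrapper class and the attributes set on it are assumed from
# context, not confirmed by this snippet; directory paths and the
# validation target name are placeholders.
import os

os.environ["FLAGS_selected_gpus"] = "0"  # GPU to run the test on

trainer = Entry()                       # hypothetical owner of test()
trainer.checkpoint_dir = "./checkpoint" # required when train has not run
trainer.dataset_dir = "./val_data"      # consumed by the test reader
trainer.val_targets = ["lfw"]           # assumed validation target list
trainer.test()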