def check_all_trainers_ready(ready_path, epoch):
    """Block until every fleet trainer has published a ready marker for *epoch*.

    Each trainer writes an empty local file ``ready.<epoch>.<trainer_id>.done``,
    uploads it to *ready_path* on HDFS, then polls that directory until the
    marker count is a multiple of the trainer count. Markers from earlier
    epochs accumulate in the same directory — presumably why a modulo test is
    used instead of an equality test (TODO confirm with the job layout).

    :param ready_path: HDFS directory used as the synchronization barrier.
    :param epoch: epoch index embedded in this trainer's marker file name.
    """
    trainer_num = fleet.worker_num()
    trainer_id = fleet.worker_index()
    hadoop_home = os.getenv("HADOOP_HOME")
    configs = {
        "fs.default.name": os.getenv("FS_NAME"),
        "hadoop.job.ugi": os.getenv("FS_UGI"),
    }

    # Publish this trainer's (empty) ready marker.
    node_ready = "ready.{}.{}.done".format(epoch, trainer_id)
    with open(node_ready, "w") as marker:
        marker.write("")

    client = HDFSClient(hadoop_home, configs)
    if not client.is_dir(ready_path):
        client.makedirs(ready_path)
    client.upload(
        hdfs_path=ready_path,
        local_path=node_ready,
        overwrite=True,
        retry_times=0)
    print("PUT {} ON HDFS {} OK".format(node_ready, ready_path))

    # Poll every 10s until all trainers of this round have checked in.
    while True:
        ready_num = len(client.ls(ready_path))
        print("have {} trainers need to be ready".format(
            trainer_num - ready_num % trainer_num))
        if ready_num % trainer_num == 0:
            break
        time.sleep(10)
    ready_num = len(client.ls(ready_path))
    print("All trainers are ready, continue training")
from paddle.fluid.incubate.fleet.base import role_maker

# Build the graph: feed two inputs into the MLP and train it with Adagrad.
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
cost = mlp(input_x, input_y)
optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)

# Wrap the optimizer with fleet so minimization runs in distributed
# (parameter-server) mode; the role is read from the cloud environment.
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
optimizer = fleet.distributed_optimizer(optimizer)
optimizer.minimize(cost)

if fleet.is_server():
    # Parameter server process: serve until the job shuts it down.
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    # Trainer process: run the training loop on CPU.
    fleet.init_worker()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    step = 1001
    for i in range(step):
        cost_val = exe.run(program=fleet.main_program,
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print("worker_index: %d, step%d cost = %f" % (
            fleet.worker_index(), i, cost_val[0]))
    fleet.stop_worker()
def train_and_eval(self):
    """Run the training loop, with periodic logging, evaluation and checkpointing.

    In fleet (distributed) mode, server processes return immediately; only
    workers train. Checkpoints/inference models go under ``output_path`` when
    set in ``self.params``, otherwise under ``./output/``.

    :return: None
    """
    # Fleet servers have nothing to train — bail out early.
    if self.is_fleet and fleet.is_server():
        logging.debug("is fleet.server, over")
        return
    if self.is_fleet:
        logging.debug("worker_index%d start train...." % fleet.worker_index())

    # Total example count is only used for the progress log line below;
    # fall back to the reader's own count when not configured.
    num_train_examples = self.params.get("num_train_examples", 0)
    if num_train_examples == 0:
        num_train_examples = self.data_set_reader.train_reader.get_num_examples()

    # Start the asynchronous data feeder.
    self.data_set_reader.train_reader.run()
    steps = 1
    time_begin = time.time()

    # Resolve output locations for checkpoints and inference models.
    if 'output_path' in self.params.keys() and self.params["output_path"]:
        save_checkpoints_path = os.path.join(self.params["output_path"], "save_checkpoints")
        save_inference_model_path = os.path.join(self.params["output_path"], "save_inference_model")
    else:
        save_checkpoints_path = "./output/save_checkpoints/"
        save_inference_model_path = "./output/save_inference_model/"

    try:
        while True:
            try:
                if steps % self.params["train_log_step"] != 0:
                    # Off-log steps: train without fetching metrics (cheaper).
                    self.run(InstanceName.TRAINING, need_fetch=False)
                else:
                    # Log step: fetch metric tensors and report progress.
                    metrics_tensor_value = self.run(InstanceName.TRAINING, need_fetch=True)
                    current_example, current_epoch = self.data_set_reader.train_reader.get_train_progress()
                    logging.info("epoch {0} progress {1}/{2} pyreader queue size {3}".
                                 format(current_epoch, current_example, num_train_examples,
                                        self.data_set_reader.train_reader.paddle_py_reader.queue.size()))

                    # Pair fetched tensor values with their configured keys
                    # (assumes fetch_list_train_key matches the fetch order —
                    # TODO confirm against where fetch_list_train_key is built).
                    fetch_output_dict = collections.OrderedDict()
                    for key, value in zip(self.fetch_list_train_key, metrics_tensor_value):
                        fetch_output_dict[key] = value

                    # Wall-clock time since the previous log step.
                    time_end = time.time()
                    used_time = time_end - time_begin
                    meta_info = collections.OrderedDict()
                    meta_info[InstanceName.STEP] = steps
                    meta_info[InstanceName.GPU_ID] = self.gpu_id
                    meta_info[InstanceName.TIME_COST] = used_time
                    metrics_output = self.model_class.get_metrics(fetch_output_dict, meta_info, InstanceName.TRAINING)

                    # Optional VisualDL scalar logging of loss/metrics.
                    if self.params.get("visualdl_log", False):
                        assert isinstance(metrics_output, OrderedDict), "metrics_output is must be OrderedDict"
                        self.visualdl_log(metrics_output, np.mean(fetch_output_dict[InstanceName.LOSS]),
                                          steps, phase=InstanceName.TRAINING)
                    time_begin = time.time()

                # Periodic evaluation on dev/test sets.
                if steps % self.params["eval_step"] == 0:
                    if self.params["is_eval_dev"]:
                        self.evaluate(self.data_set_reader.dev_reader, InstanceName.EVALUATE, steps)
                    if self.params["is_eval_test"]:
                        self.evaluate(self.data_set_reader.test_reader, InstanceName.TEST, steps)

                # Only trainer 0 persists models to avoid concurrent writes.
                if self.trainer_id == 0:
                    if steps % self.params["save_model_step"] == 0:
                        self.save_models(save_checkpoints_path, save_inference_model_path, steps)

                steps += 1

                # Debug hook: stop early after a fixed number of steps.
                if "steps_for_test" in self.params and steps >= self.params["steps_for_test"]:
                    self.data_set_reader.train_reader.stop()
                    logging.debug("steps_for_test stop!")
                    break
            except fluid.core.EOFException:
                # Data exhausted — stop the feeder and leave the loop.
                self.data_set_reader.train_reader.stop()
                break

        # Final evaluation after training completes normally.
        if self.params["is_eval_dev"]:
            logging.info("Final evaluate result: ")
            self.evaluate(self.data_set_reader.dev_reader, InstanceName.EVALUATE, steps)
        if self.params["is_eval_test"]:
            logging.info("Final test result: ")
            self.evaluate(self.data_set_reader.test_reader, InstanceName.TEST, steps)
    except Exception as e:
        # Best-effort checkpoint on any failure, then re-raise for the caller.
        logging.error('traceback.format_exc():%s' % traceback.format_exc())
        self.save_models(save_checkpoints_path, save_inference_model_path, steps)
        raise e

    # Final save after a clean run.
    self.save_models(save_checkpoints_path, save_inference_model_path, steps)
# Alternative role makers tried during development:
#role = role_maker.PaddleCloudRoleMaker(http_ip_port="127.0.0.1:26001")
#role = role_maker.GeneralRoleMaker(path="./tmp4")
logger.info("Begin")
res = [0, 0]
logger.info(res)

# Initialise fleet with a file-based PaddleCloud role maker rooted at ./tmp4.
role = role_maker.PaddleCloudRoleMaker(path="./tmp4")
fleet.init(role)
print("init wancheng")

# Each worker contributes its own single-element buffer to an all-reduce
# over the node-type communicator (worker 0 sends a, worker 1 sends b).
a = [5]
b = [2]
res = [0]
if fleet.worker_index() == 0:
    role._all_reduce(role._node_type_comm, a)
elif fleet.worker_index() == 1:
    role._all_reduce(role._node_type_comm, b)