def test_normal(self):
    """Run a plain (non-auto-checkpoint) training pass against a clean HDFS dir.

    Clears the auto-checkpoint env vars first so the run behaves as a normal
    training job, then restores them for subsequent tests.
    """
    logger.info("begin test_normal")

    chk = acp._get_checker()
    hdfs = HDFSClient(chk.hdfs_home, None)
    # Start from a clean checkpoint directory so stale state can't leak in.
    hdfs.delete(chk.hdfs_checkpoint_path)

    self._clear_envs()
    self._reset_generator()
    self._run_normal()
    self._readd_envs()

    logger.info("end test_normal")
def _test_corner_epoch_no(self, break_epoch_no):
    """Exercise save/load when training is interrupted at a specific epoch.

    Saves a run that breaks at ``break_epoch_no``, then reloads and verifies
    the run resumes correctly from that corner-case epoch boundary.

    Args:
        break_epoch_no (int): epoch number at which the simulated run breaks.
    """
    # FIX: log messages previously misspelled "corener"; corrected so the
    # begin/end markers are greppable under the method's actual name.
    logger.info("begin test_corner_epoch_no")
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    # Clean checkpoint dir before and after so runs stay independent.
    fs.delete(checker.hdfs_checkpoint_path)

    self._reset_generator()
    self._run_save_0(break_epoch_no=break_epoch_no)
    self._reset_generator()
    self._run_load_0(break_epoch_no=break_epoch_no)

    fs.delete(checker.hdfs_checkpoint_path)
    logger.info("end test_corner_epoch_no")
def test_basic(self):
    """Sanity-check the auto-checkpoint configuration, then save and reload once."""
    logger.info("begin test_basic")

    chk = acp._get_checker()
    # Configuration the auto-checkpoint module reads from the environment.
    self.assertEqual(chk.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
    self.assertEqual(chk.platform, "PADDLE_CLOUD")
    self.assertEqual(chk.save_checkpoint_inter, 0)
    print(chk)

    client = HDFSClient(chk.hdfs_home, None)
    # Remove any checkpoint left over from earlier tests.
    client.delete(chk.hdfs_checkpoint_path)

    self._reset_generator()
    self._run_save_0()

    self._reset_generator()
    self._run_load_0()

    logger.info("end test_basic")
def test_timeout(self):
    """Verify that moving onto an already-existing destination times out.

    ``dst/src`` is created up front so ``fs.mv(src, dst)`` cannot succeed;
    the client must raise ``FSTimeOut``. A follow-up raw shell ``mv`` must
    also fail (non-zero exit) for the same reason.
    """
    fs = HDFSClient(
        "/usr/local/hadoop-2.7.7/", None, time_out=6 * 1000, sleep_inter=100)
    src = "hdfs_test_timeout"
    dst = "new_hdfs_test_timeout"
    fs.delete(dst)
    fs.mkdirs(src)
    fs.mkdirs(dst)
    # Pre-create dst/src so the mv below has nowhere to land.
    fs.mkdirs(dst + "/" + src)
    output = ""
    cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
    try:
        fs.mv(src, dst, test_exists=False)
        # FIX: was `self.assertFalse(1, msg)` — `self.fail(msg)` is the
        # idiomatic unittest way to force a failure when mv unexpectedly
        # succeeds. Also dropped the unused `as e` binding below.
        self.fail("can't execute cmd:{} output:{}".format(cmd, output))
    except FSTimeOut:
        print("execute mv {} to {} timeout".format(src, dst))

    # Re-run the underlying shell command directly; it must also fail.
    ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
    self.assertNotEqual(ret, 0)
    print("second mv ret:{} output:{}".format(ret, output))
def test(self):
    """CheckpointSaver must count only ``__paddle_checkpoint__.<int>`` dirs.

    Non-matching directory names (and non-numeric suffixes) are ignored, and
    cleaning redundant checkpoints is safe to call repeatedly.
    """
    fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
    dir_path = "./checkpointsaver_test"
    fs.delete(dir_path)

    saver = CheckpointSaver(fs)

    # None of these match the checkpoint naming scheme.
    fs.mkdirs("{}/exe.exe".format(dir_path))
    fs.mkdirs("{}/exe.1".format(dir_path))
    fs.mkdirs("{}/exe".format(dir_path))
    self.assertEqual(len(saver.get_checkpoint_no(dir_path)), 0)

    # Only the numeric-suffixed checkpoint directory is recognized.
    fs.mkdirs("{}/__paddle_checkpoint__.0".format(dir_path))
    fs.mkdirs("{}/__paddle_checkpoint__.exe".format(dir_path))
    self.assertEqual(len(saver.get_checkpoint_no(dir_path)), 1)

    # Cleaning must be idempotent — calling twice must not raise.
    saver.clean_redundant_checkpoints(dir_path)
    saver.clean_redundant_checkpoints(dir_path)

    fs.delete(dir_path)
def test_distributed_basic(self):
    """Run fleet (collective) training inside ``acp.train_epoch_range``.

    Sets up a single-trainer collective fleet role, wraps the optimizer with
    ``fleet.distributed_optimizer``, and checks that the auto-checkpoint
    epoch-range tracker is active during training and cleared afterwards.
    """
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_distributed_basic")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    #basic
    exe, main_prog, startup_prog = self._generate()

    compiled, data_loader, optimizer, loss, image, label = \
        self._init_env(exe, main_prog, startup_prog, minimize=False)

    #fleet
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    with fluid.program_guard(main_prog, startup_prog):
        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(loss)

    exe.run(startup_prog)

    o = None
    i = 0
    name = None
    for i in acp.train_epoch_range(3, 0):
        o = acp._get_train_epoch_range()
        name = o.name
        logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

        for data in data_loader():
            fetch = exe.run(fleet.main_program, feed=data, fetch_list=[loss])

        self.assertEqual(len(o._exe_status), 1)

    o = acp._get_train_epoch_range()
    # FIX: was `assert o == None` — use an identity check (`is None`) and a
    # unittest assertion so it isn't stripped under `python -O` and matches
    # the sibling tests' style.
    self.assertTrue(o is None, "now train epoch must not exits now")
    self.assertEqual(i, 2)

    fs.delete(save_dir)
    logger.info("end test_distributed_basic")
def test_multiple(self):
    """Train two programs inside one ``acp.train_epoch_range`` loop.

    Both executors run every epoch; the epoch-range tracker must hold two
    executor statuses while active, be cleared after the loop, and yield
    exactly epochs 0..2.
    """
    checker = acp._get_checker()
    fs = HDFSClient(checker.hdfs_home, None)
    fs.delete(checker.hdfs_checkpoint_path)
    self._reset_generator()

    logger.info("begin test_multiple")
    fs = LocalFS()
    save_dir = "./run_save_0"
    fs.delete(save_dir)

    exe, main_prog1, startup_prog1 = self._generate()
    _, main_prog2, startup_prog2 = self._generate()

    compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
        self._init_env(exe, main_prog1, startup_prog1)
    compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
        self._init_env(exe, main_prog2, startup_prog2)

    o = None
    epochs = []
    for i in acp.train_epoch_range(3, 0):
        for data in data_loader1():
            fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])

        for data in data_loader2():
            fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])

        o = acp._get_train_epoch_range()
        # Both executors registered with the active epoch range.
        self.assertEqual(len(o._exe_status), 2)
        print(o._exe_status)
        epochs.append(i)

    o = acp._get_train_epoch_range()
    # FIX: was `o == None` — None comparisons use identity (`is None`).
    self.assertTrue(o is None, "now train epoch must not exits now")
    self.assertEqual(i, 2)
    self.assertEqual(epochs, [0, 1, 2])

    fs.delete(save_dir)
    logger.info("end test_multiple")