def test_mode_str_ps_local_cluster_1p_2t_1f_async_c2(self):
    """Run mode "runner0" as local_cluster_train with 2 workers and 1 server.

    After the run the test checks that the process exit code is 0, that
    neither ``logs/server.0`` nor ``logs/worker.0`` contains "Traceback",
    and that ``logs/worker.1`` does not exist.
    """
    # The yaml file is named after this test method, so the frame lookup
    # has to happen directly in this function body.
    case_name = sys._getframe().f_code.co_name
    self.yaml_config_name = case_name + ".yaml"

    self.yaml_content["mode"] = "runner0"
    runner = self.yaml_content["runner"][0]
    runner["class"] = "local_cluster_train"
    runner["worker_num"] = 2
    runner["server_num"] = 1

    self.run_yaml()

    built_in.equals(self.pro.returncode, 0, self.err_msg)
    for log_path in ("logs/server.0", "logs/worker.0"):
        built_in.not_contains(log_path, "Traceback", self.err_msg)
    built_in.path_not_exist("logs/worker.1", self.err_msg)
def test_mode_list_collective_selected_gpus_2f_2cards_c2(self):
    """Run on selected GPUs "0,1" with fleet_mode explicitly set to collective.

    The runner is configured as local_cluster_train on the gpu device and
    the dataset path is "criteo_data". The test then checks that the exit
    code is 0, that the captured stderr has no "Traceback", that
    ``logs/worker.0`` matches ``self.epoch_re`` 2 times, and that
    ``logs/server.0`` does not exist.
    """
    # The yaml file is named after this test method, so the frame lookup
    # has to happen directly in this function body.
    case_name = sys._getframe().f_code.co_name
    self.yaml_config_name = case_name + ".yaml"

    runner = self.yaml_content["runner"][0]
    runner["device"] = "gpu"
    runner["selected_gpus"] = "0,1"
    runner["class"] = "local_cluster_train"
    runner["fleet_mode"] = "collective"
    dataset = self.yaml_content["dataset"][0]
    dataset["data_path"] = "criteo_data"

    self.run_yaml()

    built_in.equals(self.pro.returncode, 0, self.err_msg)
    built_in.not_contains(self.err, "Traceback", self.err_msg)
    built_in.regex_match_len("logs/worker.0", self.epoch_re, 2, self.err_msg)
    built_in.path_not_exist("logs/server.0", self.err_msg)
def test_mode_list_collective_selected_gpus_2f_4cards_c2(self):
    """Collective local_cluster_train run; checks how many workers started.

    Per the original note on this case: the GPU cards the program runs on,
    and hence how many trainers are started, are decided from the number
    of files, gpu_nums and worker_num.

    The test checks that the exit code is 0, that the captured stderr has
    no "Traceback", that ``logs/worker.1`` matches ``self.epoch_re``
    2 times, and that ``logs/worker.2`` does not exist.

    NOTE(review): the method name says "4cards" but selected_gpus is set
    to "0,1" -- confirm which one is intended.
    """
    # The yaml file is named after this test method, so the frame lookup
    # has to happen directly in this function body.
    case_name = sys._getframe().f_code.co_name
    self.yaml_config_name = case_name + ".yaml"

    runner = self.yaml_content["runner"][0]
    runner["device"] = "gpu"
    runner["selected_gpus"] = "0,1"
    runner["class"] = "local_cluster_train"
    runner["fleet_mode"] = "collective"
    dataset = self.yaml_content["dataset"][0]
    dataset["data_path"] = "criteo_data"

    self.run_yaml()

    built_in.equals(self.pro.returncode, 0, self.err_msg)
    built_in.not_contains(self.err, "Traceback", self.err_msg)
    built_in.regex_match_len("logs/worker.1", self.epoch_re, 2, self.err_msg)
    built_in.path_not_exist("logs/worker.2", self.err_msg)
def test_mode_list_ps_selected_gpus_2f_2card_c2(self):
    """Run on selected GPUs "0,1" without setting fleet_mode or runner class.

    Per the original description of this case, the run is expected to
    switch from ps to collective and use local-cluster training. The test
    checks that the exit code is 0, that the captured stderr has no
    "Traceback", that ``logs/server.0`` does not exist, that the
    ``train.trainer.engine`` line in stdout reports "local_cluster", and
    that ``logs/worker.1`` matches ``self.auc_re`` 6 times.
    """
    # The yaml file is named after this test method, so the frame lookup
    # has to happen directly in this function body.
    case_name = sys._getframe().f_code.co_name
    self.yaml_config_name = case_name + ".yaml"

    runner = self.yaml_content["runner"][0]
    runner["device"] = "gpu"
    runner["selected_gpus"] = "0,1"
    self.yaml_content["dataset"][0]["data_path"] = "criteo_data"

    self.run_yaml()

    # The old literal relied on the invalid string escapes "\s" and "\S"
    # (a SyntaxWarning on recent Pythons). The pattern is now built from
    # real newlines plus a raw-string middle, so its runtime value is
    # byte-identical to the old one.
    engine_re = "\n" + r"train.trainer.engine\s+(\S+)\s+" + "\n"

    built_in.equals(self.pro.returncode, 0, self.err_msg)
    built_in.not_contains(self.err, "Traceback", self.err_msg)
    built_in.path_not_exist("logs/server.0", self.err_msg)
    built_in.regex_match_equal(self.out, engine_re, "local_cluster",
                               self.err_msg)
    built_in.regex_match_len("logs/worker.1", self.auc_re, 6, self.err_msg)