class ExperimentManager(abc.ABC):
    """Builds a job graph for a Dockex experiment and drives it to completion.

    The manager stages job configs into redis, lets workers pull and run
    them, and collects CSV summaries plus saved outputs into the
    experiment archive.
    """

    def __init__(
        self,
        project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
        tmp_dockex_path="/tmp/dockex",
        initial_job_num=None,
        experiment_name_prefix=None,
        sleep_seconds=0.5,
        save_project=False,
    ):
        """Connect to the local Dockex redis backend and set up experiment state.

        Args:
            project_path: Root of the project whose modules will be run.
            tmp_dockex_path: Local Dockex scratch directory (config + data).
            initial_job_num: Starting job number; falls back to the
                backend's persisted "manager_job_num" when None.
            experiment_name_prefix: Optional prefix for the generated
                timestamp-based experiment name.
            sleep_seconds: Polling interval for the wait loops.
            save_project: When True, the zipped project archive is also
                sent to the output saver.

        Raises:
            ValueError: If project_path is None.
        """
        super().__init__()

        if project_path is None:
            raise ValueError("A project_path must be provided.")
        else:
            self.project_path = os.path.expanduser(project_path)

        self.tmp_dockex_path = tmp_dockex_path
        self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")

        self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])
        self.docker_client = docker.from_env()

        manager_ip_address = self.redis_client.get("ip_address")
        manager_port = self.redis_client.get("redis_port")

        # Separate redis logical DBs: 1 = input-pathname -> dependent job names,
        # 2 = job name -> outstanding dependency count, 3 = job name -> job config.
        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=1
        )
        self.dependency_counts_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=2
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=3
        )

        self.initial_job_num = initial_job_num
        if self.initial_job_num is not None:
            self.job_num = self.initial_job_num
        else:
            # Continue numbering from the previous experiment on this backend.
            self.job_num = self.redis_client.get("manager_job_num")

        self.sleep_seconds = sleep_seconds
        self.job_list = []
        self.dockex_path_list = self.redis_client.get("dockex_path_list")

        self.experiment_name_prefix = experiment_name_prefix
        # e.g. "dockex_2020_01_31_12_30_59" — timestamp with filesystem-safe separators.
        self.experiment_name = f"dockex_{str(datetime.datetime.now()).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
        if self.experiment_name_prefix is not None:
            self.experiment_name = (
                f"{self.experiment_name_prefix}_{self.experiment_name}"
            )

        self.csv_filename = f"jobs_{self.experiment_name}.csv"
        self.csv_pathname = (
            f"/tmp/dockex/data/{self.csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
        self.trials_csv_pathname = (
            f"/tmp/dockex/data/{self.trials_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.extra_output_pathnames = []
        self.save_project = save_project
        self.project_archive_pathname = None
        self.project_archive_filename = None
        self.trial_dict = dict()
        self.trials_list = []

    def send_to_output_saver(self, extra_output_pathname):
        """Queue an extra output pathname to be added to the experiment zip."""
        self.extra_output_pathnames.append(extra_output_pathname)

    def generate_job_name(self, module_name):
        """Return a unique (job_name, job_num) pair and advance the counter."""
        job_num = self.job_num
        job_name = f"{module_name}_{str(self.job_num)}"
        self.job_num += 1
        return job_name, job_num

    def add_job(
        self,
        module_path,
        params=None,
        input_pathnames=None,
        skip_job=False,
        skip_input_pathnames=False,
        skip_output_pathnames=False,
        cpu_credits=1,
        gpu_credits=0,
        save_outputs=False,
        params_nested_update=False,
        trial_tag=None,
        save_trial=False,
    ):
        """Register one job of the experiment and return its output pathnames.

        Loads the module's JSON config, overlays the supplied params and
        input pathnames, namespaces the output pathnames under the job name,
        and appends the job to the staging list (unless skip_job is set).

        Raises:
            ValueError: If both credit counts are zero, or if a
                "DOCKEX_REQUIRED" param/input pathname was left unset.
        """
        if cpu_credits == 0 and gpu_credits == 0:
            raise ValueError("Either cpu_credits or gpu_credits must be > 0")

        if params is None:
            params = dict()
        if input_pathnames is None:
            input_pathnames = dict()

        module_name = pathlib.PurePath(module_path).name
        config_pathname = f"{self.project_path}/{module_path}/{module_name}.json"
        with open(config_pathname, "r") as fp:
            config = json.load(fp)

        job_name, job_num = self.generate_job_name(module_name)
        config["name"] = job_name
        config["job_num"] = job_num
        config["path"] = module_path
        config["module_name"] = module_name
        config["params_nested_update"] = params_nested_update

        if "params" in config.keys():
            if params_nested_update:
                # Deep-merge user params into the module defaults without
                # mutating the loaded config.
                config["params"] = update(copy.deepcopy(config["params"]), params)
            else:
                config["params"].update(params)
        else:
            config["params"] = params

        if "input_pathnames" in config.keys():
            config["input_pathnames"].update(input_pathnames)
        else:
            config["input_pathnames"] = input_pathnames

        config["skip_job"] = skip_job
        config["skip_input_pathnames"] = skip_input_pathnames
        config["skip_output_pathnames"] = skip_output_pathnames
        config["cpu_credits"] = cpu_credits
        config["gpu_credits"] = gpu_credits
        config["save_outputs"] = save_outputs
        config[
            "skip_docker_wrapper_build"
        ] = True  # ExperimentWorker takes care of building containers before wrapper launched
        config["experiment_job"] = True

        for params_key in config["params"].keys():
            if config["params"][params_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required parameter "{params_key}" for job name "{job_name}"'
                )

        for input_pathname_key in config["input_pathnames"].keys():
            if config["input_pathnames"][input_pathname_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required input pathname "{input_pathname_key}" for job name "{job_name}"'
                )

        # Prefix outputs with "<module>/<job_name>" so jobs never collide.
        for output_pathname_key in config["output_pathnames"].keys():
            config["output_pathnames"][
                output_pathname_key
            ] = f"{module_name}/{job_name}{config['output_pathnames'][output_pathname_key]}"

        if skip_job is False:
            self.job_list.append(copy.deepcopy(config))

        if trial_tag is not None:
            self.trial_dict[trial_tag] = copy.deepcopy(config)

        if save_trial is True:
            self.trials_list.append(copy.deepcopy(self.trial_dict))

        return config["output_pathnames"]

    def archive_project(self):
        """Zip the staged project copy and publish the archive filename."""
        self.redis_client.set("status", "ARCHIVING PROJECT")

        self.project_archive_filename = (
            f"project_{self.experiment_name}.zip"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        self.project_archive_pathname = (
            f"/tmp/dockex/data/{self.project_archive_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        shutil.make_archive(
            self.project_archive_pathname.replace(".zip", ""),
            "zip",
            "/tmp/dockex/project",
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.redis_client.set("project_archive_filename", self.project_archive_filename)

    def wait_for_jobs_to_end(self):
        """Poll the backend until every staged job has completed."""
        keep_waiting = True
        while keep_waiting:
            time.sleep(self.sleep_seconds)

            num_complete_jobs = self.redis_client.get("num_complete_jobs")
            num_total_jobs = self.redis_client.get("num_total_jobs")

            print_progress(num_complete_jobs, num_total_jobs)

            if num_complete_jobs == num_total_jobs:
                keep_waiting = False

    def wait_for_save_outputs(self):
        """Tell the OutputSaver to close the experiment zip and wait for it."""
        # make sure output_saver flag is True
        self.redis_client.set("output_saver_working_flag", True)

        # send an experiment done message to output_saver
        # it should set flag to False once it processes this message
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        # wait for OutputSaver to finish its business
        # (sleep between polls so we don't busy-spin a CPU core)
        while self.redis_client.get("output_saver_working_flag") is True:
            time.sleep(self.sleep_seconds)

    def wait_for_experiment_to_finish(self):
        """Queue all result artifacts, wait for jobs, and write the post CSV."""
        print("WAITING FOR EXPERIMENT TO FINISH")
        self.redis_client.set("status", "WAITING FOR EXPERIMENT TO FINISH")

        # store the job csv in the experiment zip file
        self.redis_client.rpush("output_saver", self.csv_filename)

        # if a trials csv exists, store it in the experiment zip file
        if os.path.isfile(self.trials_csv_pathname):
            self.redis_client.rpush("output_saver", self.trials_csv_filename)

        # send extra outputs to output_saver
        for extra_output_pathname in self.extra_output_pathnames:
            self.redis_client.rpush("output_saver", extra_output_pathname)

        if self.save_project:
            self.redis_client.rpush("output_saver", self.project_archive_filename)

        self.wait_for_jobs_to_end()

        # generate a csv of all the finished jobs and add it to the zip
        post_job_list = [
            json.loads(b) for b in self.job_lookup_db.mget(self.job_lookup_db.keys("*"))
        ]
        post_csv_filename = f"post_{self.csv_filename}"
        post_csv_pathname = (
            f"/tmp/dockex/data/{post_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        pd.DataFrame(post_job_list).sort_values(by="job_num", ascending=True).set_index(
            "name"
        ).to_csv(post_csv_pathname)
        self.redis_client.rpush("output_saver", post_csv_filename)

        self.wait_for_save_outputs()

        # clean up local scratch copies once the zip is closed
        # NOTE(review): the trials csv is intentionally(?) not removed here —
        # confirm whether it should be cleaned up like the jobs csv.
        os.remove(post_csv_pathname)
        os.remove(self.csv_pathname)
        os.remove(self.project_archive_pathname)

    def initialize_experiment_variables(self):
        """Reset all backend state left over from any previous experiment."""
        # set the global job num for future experiments
        self.redis_client.set("manager_job_num", self.job_num)

        # flush experiment dbs
        self.dependency_lookup_db.flushdb()
        self.dependency_counts_db.flushdb()
        self.job_lookup_db.flushdb()

        # initialize the overall experiment job counts
        self.redis_client.set("num_total_jobs", 0)
        self.redis_client.set("num_pending_jobs", 0)
        self.redis_client.set("num_ready_jobs", 0)
        self.redis_client.set("num_running_jobs", 0)
        self.redis_client.set("num_complete_jobs", 0)
        self.redis_client.set("num_error_jobs", 0)

        self.redis_client.strict_redis.delete("unique_module_paths")

        # drop per-module stats keys from the previous experiment
        unique_module_names = self.redis_client.get_list("unique_module_names")
        for unique_module_name in unique_module_names:
            stats_keys = get_module_stats_keys(unique_module_name)
            for key in stats_keys.values():
                self.redis_client.strict_redis.delete(key)
        self.redis_client.strict_redis.delete("unique_module_names")

        # drop the per-credit-combination ready-jobs lists
        ready_jobs_list_key_dicts = self.redis_client.smembers(
            "ready_jobs_list_key_dicts"
        )
        for ready_jobs_list_key_dict in ready_jobs_list_key_dicts:
            self.redis_client.strict_redis.delete(
                ready_jobs_list_key_dict["ready_jobs_list_key"]
            )
        self.redis_client.strict_redis.delete("ready_jobs_list_key_dicts")

        self.redis_client.set("experiment_name", self.experiment_name)

        # reset output_saver just in case a zip was left open
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        self.redis_client.strict_redis.delete("error_jobs")

    def stage_jobs(self):
        """Publish every job in job_list to the backend as pending or ready."""
        print("STAGING JOBS")
        self.redis_client.set("status", "STAGING JOBS")

        unique_module_names = []
        unique_module_paths = []

        for job in self.job_list:
            input_pathnames = job["input_pathnames"]
            module_name = job["module_name"]
            module_path = job["path"]
            name = job["name"]
            skip_input_pathnames = job["skip_input_pathnames"]

            if module_path not in unique_module_paths:
                unique_module_paths.append(module_path)
                self.redis_client.rpush("unique_module_paths", module_path)

            ready_jobs_list_dict = OrderedDict(
                [
                    ("cpu_credits", job["cpu_credits"]),
                    ("gpu_credits", job["gpu_credits"]),
                ]
            )
            # register the ready_jobs list that corresponds to this job's credits
            ready_jobs_list_key = ready_jobs_dict_to_key(ready_jobs_list_dict)
            ready_jobs_list_dict["ready_jobs_list_key"] = ready_jobs_list_key

            # this is an ordered dict to guarantee the resulting json string is always in the same order
            # we're using a redis set here, and don't want duplicate entries if dict keys are in different order
            self.redis_client.sadd("ready_jobs_list_key_dicts", ready_jobs_list_dict)

            stats_keys = get_module_stats_keys(module_name)

            if module_name not in unique_module_names:
                unique_module_names.append(module_name)
                self.redis_client.rpush("unique_module_names", module_name)

                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.set(stats_keys["num_total_jobs"], 1)
                self.redis_client.set(stats_keys["num_pending_jobs"], 0)
                self.redis_client.set(stats_keys["num_ready_jobs"], 0)
                self.redis_client.set(stats_keys["num_running_jobs"], 0)
                self.redis_client.set(stats_keys["num_complete_jobs"], 0)
                self.redis_client.set(stats_keys["num_error_jobs"], 0)
            else:
                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.strict_redis.incr(stats_keys["num_total_jobs"])

            # count the dependencies this job must wait on; skip_input_pathnames
            # may be False/None (wait on all), True (wait on none), or a list of
            # input keys to exclude.
            num_input_pathnames = 0
            if len(input_pathnames.keys()) > 0:
                for input_pathname_key in input_pathnames.keys():
                    input_pathname = input_pathnames[input_pathname_key]
                    if input_pathname is not None:
                        if (
                            skip_input_pathnames is False
                            or skip_input_pathnames is None
                        ):
                            self.dependency_lookup_db.sadd(input_pathname, name)
                            num_input_pathnames += 1
                        elif skip_input_pathnames is True:
                            pass
                        elif type(skip_input_pathnames) is list:
                            if input_pathname_key in skip_input_pathnames:
                                pass
                            else:
                                self.dependency_lookup_db.sadd(input_pathname, name)
                                num_input_pathnames += 1

            if num_input_pathnames > 0:
                # job has unmet dependencies: park it as pending
                self.dependency_counts_db.set(name, num_input_pathnames)
                self.redis_client.strict_redis.incr(stats_keys["num_pending_jobs"])
                self.redis_client.strict_redis.incr("num_pending_jobs")
            else:
                # no dependencies: immediately available to workers
                self.redis_client.rpush(ready_jobs_list_key, job)
                self.redis_client.strict_redis.incr(stats_keys["num_ready_jobs"])
                self.redis_client.strict_redis.incr("num_ready_jobs")

            self.redis_client.strict_redis.incr("num_total_jobs")

            # register the job on the backend
            self.job_lookup_db.set(name, json.dumps(job))

    def set_manager_flag(self):
        """Advertise this machine as the active experiment manager."""
        print("SETTING MANAGER FLAG")
        self.redis_client.set("status", "SETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", True)

    def unset_manager_flag(self):
        """Withdraw this machine's manager advertisement."""
        print("UNSETTING MANAGER FLAG")
        self.redis_client.set("status", "UNSETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", False)

    def generate_job_csv(self):
        """Write the pre-run job list to the experiment jobs CSV."""
        print("GENERATING JOB CSV")
        pd.DataFrame(self.job_list).to_csv(self.csv_pathname)

    def generate_trial_csv(self):
        """Write the trials CSV, if any trials were recorded."""
        print('GENERATING TRIALS CSV')
        if len(self.trials_list) > 0:
            pd.DataFrame(self.trials_list).to_csv(self.trials_csv_pathname)

    def copy_project(self):
        """Copy the project into the Dockex scratch area for archiving/builds."""
        print("COPYING PROJECT")
        self.redis_client.set("status", "COPYING PROJECT DIRECTORY")
        tmp_project_path = f"{self.tmp_dockex_path}/project"
        empty_make_directory(tmp_project_path)
        copy_tree(self.project_path, tmp_project_path)
        os.system(f"chown -R nonroot:nonroot {tmp_project_path}")

    def acquire_prevent_experiment_overlap_flag(self):
        """Block until no other local experiment is running, then take the flag."""
        print("ACQUIRING PREVENT EXPERIMENT OVERLAP FLAG")

        if self.redis_client.get("prevent_experiment_overlap_flag") is True:
            print("WAITING FOR PREVIOUS LOCAL EXPERIMENT TO FINISH")
            # sleep between polls so we don't busy-spin a CPU core
            while self.redis_client.get("prevent_experiment_overlap_flag") is True:
                time.sleep(self.sleep_seconds)

        self.redis_client.set("prevent_experiment_overlap_flag", True)
        # TODO: also check and wait for remote machines to prevent overlapping experiments

    def release_prevent_experiment_overlap_flag(self):
        """Release the local experiment-overlap flag."""
        print("RELEASING PREVENT EXPERIMENT OVERLAP FLAG")
        self.redis_client.set("prevent_experiment_overlap_flag", False)

    def run(self, print_build_logs=False):
        """Execute the experiment end to end.

        Stages jobs, builds module images, runs everything to completion,
        and always releases the overlap flag — on failure the status is set
        to EXPERIMENT FAILED and the original exception is re-raised.
        """
        print("RUNNING EXPERIMENT")
        self.redis_client.set("status", "RUNNING EXPERIMENT")

        self.generate_job_csv()
        self.generate_trial_csv()
        self.acquire_prevent_experiment_overlap_flag()

        start = time.time()
        try:
            self.initialize_experiment_variables()
            self.copy_project()
            self.stage_jobs()
            build_project_modules(
                self.docker_client,
                self.redis_client.get_list("unique_module_paths"),
                print_build_logs=print_build_logs,
                redis_client=self.redis_client,
            )
            self.archive_project()
            self.set_manager_flag()
            self.redis_client.set("status", "RUNNING EXPERIMENT")
            self.wait_for_experiment_to_finish()
            self.unset_manager_flag()
        except BaseException:
            # Explicit (formerly bare) except: clean up backend state on ANY
            # failure — including KeyboardInterrupt — then re-raise.
            self.wait_for_save_outputs()
            self.release_prevent_experiment_overlap_flag()
            self.unset_manager_flag()
            self.redis_client.set("status", "EXPERIMENT FAILED")
            raise
        end = time.time()

        self.release_prevent_experiment_overlap_flag()
        self.redis_client.set("status", "EXPERIMENT COMPLETE")
        print(f"EXPERIMENT EXECUTION TIME: {round((end - start), 2)} seconds")
class ExperimentWorker(PythonJobWithBackend):
    """Worker loop that attaches to an experiment manager and launches jobs.

    The worker polls the machine list for an advertised manager, syncs and
    builds the manager's project modules when the manager is remote, then
    repeatedly pops ready jobs it has enough cpu/gpu credits to run.
    """

    def __init__(self, input_args, checking_manager_sleep_seconds=0.5, working_sleep_seconds=0.25):
        # checking_manager_sleep_seconds: poll interval while searching for a manager
        # working_sleep_seconds: poll interval between job-pull passes once attached
        super().__init__(input_args)

        self.checking_manager_sleep_seconds = checking_manager_sleep_seconds
        self.working_sleep_seconds = working_sleep_seconds

        self.docker_client = docker.from_env()
        # experiment_manager: DockexRedisClient for the current manager (or None)
        # experiment_manager_dict: the manager's machine record at attach time
        self.experiment_manager = None
        self.experiment_manager_dict = None

    def run_job(self):
        """Run forever: attach to a manager, then pull and launch ready jobs.

        Each loop iteration either (a) searches for a manager when detached,
        or (b) validates the attachment and pops as many ready jobs as the
        local credit budget allows, launching each one.
        """
        while True:
            # check if we're connected to a manager
            # if we're NOT connected to a manager
            if self.experiment_manager is None:
                # check if there are any managers available
                dockex_machines_df = pd.DataFrame(
                    self.redis_client.get_list("dockex_machines"))

                if len(dockex_machines_df) > 0:
                    manager_machines_df = dockex_machines_df.loc[
                        dockex_machines_df.manager_flag == True]

                    if len(manager_machines_df) > 0:
                        # if so, connect to the manager
                        # (first advertised manager wins)
                        self.experiment_manager_dict = manager_machines_df.iloc[
                            0].to_dict()

                        self.experiment_manager = DockexRedisClient(
                            self.experiment_manager_dict["redis_address"])

                        self.redis_client.set(
                            "manager_redis_address",
                            self.experiment_manager_dict["redis_address"],
                        )

                        # if the manager is not the local manager
                        if (self.experiment_manager_dict["redis_address"] !=
                                self.redis_address):
                            # empty the project directory
                            empty_directory("/tmp/dockex/project")
                            empty_directory("/tmp/dockex/data")

                            # need to copy project archive, unarchive it, and build module images
                            project_archive_filename = self.experiment_manager.get(
                                "project_archive_filename")

                            local_project_archive_filename = (
                                f"/tmp/dockex/data/{project_archive_filename}")

                            # fetch the archive over FTP from whichever machine has it
                            found_project_archive = ftp_find_file(
                                self.experiment_manager.get_list(
                                    "dockex_machines"),
                                self.redis_client.get("ip_address"),
                                f"data/{project_archive_filename}",
                                local_project_archive_filename,
                            )

                            if found_project_archive:
                                with zipfile.ZipFile(
                                        local_project_archive_filename,
                                        "r") as zip_file:
                                    zip_file.extractall("/tmp/dockex/project")

                                # build the module images
                                experiment_module_paths = self.experiment_manager.get_list(
                                    "unique_module_paths")

                                # TODO: need a way to signal to the experiment that a build failed
                                # TODO: maybe a flag on manager that the experiment continually checks
                                # TODO: or maybe manager needs to test build before setting manager flag?
                                # TODO: even then though, if a build fails on remote host, that host should NOT work on that experiment name
                                # TODO: maybe a worker should track bad experiment names
                                self.redis_client.set(
                                    "status", "BUILDING PROJECT MODULES")
                                build_project_modules(self.docker_client,
                                                      experiment_module_paths)

                            else:
                                # archive not found: detach and retry later
                                self.experiment_manager_dict = None
                                self.experiment_manager = None
                                self.redis_client.strict_redis.delete(
                                    "manager_redis_address")

                    else:
                        time.sleep(self.checking_manager_sleep_seconds)
                else:
                    time.sleep(self.checking_manager_sleep_seconds)

            # if we are connected to a manager
            else:
                # check if the manager is still a manager
                # if it is NOT still a manager
                if self.experiment_manager.get("manager_flag") is not True:
                    # disconnect from the manager
                    self.experiment_manager_dict = None
                    self.experiment_manager = None
                    self.redis_client.strict_redis.delete(
                        "manager_redis_address")

                # if it is still a manager
                else:
                    # check that the experiment name is the same
                    # if it is NOT the same, a new experiment has started
                    if (self.experiment_manager.get("experiment_name") !=
                            self.experiment_manager_dict["experiment_name"]):
                        # disconnect from the manager
                        self.experiment_manager_dict = None
                        self.experiment_manager = None
                        self.redis_client.strict_redis.delete(
                            "manager_redis_address")

                    # if the experiment name is the same
                    else:
                        # see if we can pull any work to do
                        # get the list of ready_jobs lists
                        ready_jobs_df = pd.DataFrame(
                            self.experiment_manager.smembers(
                                "ready_jobs_list_key_dicts"))

                        if len(ready_jobs_df) > 0:
                            # start with the jobs requiring the most credits
                            ready_jobs_df = ready_jobs_df.sort_values(
                                by=["gpu_credits", "cpu_credits"],
                                ascending=False)

                            num_open_cpu_credits = self.redis_client.get(
                                "cpu_credits_total") - self.redis_client.get(
                                    "cpu_credits_used")
                            num_open_gpu_credits = self.redis_client.get(
                                "gpu_credits_total") - self.redis_client.get(
                                    "gpu_credits_used")

                            if num_open_cpu_credits > 0 or num_open_gpu_credits > 0:
                                for ready_jobs_df_ind in ready_jobs_df.index:
                                    # re-read credits each pass: launches below consume them
                                    num_open_cpu_credits = self.redis_client.get(
                                        "cpu_credits_total"
                                    ) - self.redis_client.get(
                                        "cpu_credits_used")
                                    num_open_gpu_credits = self.redis_client.get(
                                        "gpu_credits_total"
                                    ) - self.redis_client.get(
                                        "gpu_credits_used")

                                    required_cpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "cpu_credits"])
                                    required_gpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "gpu_credits"])

                                    ready_jobs_key = ready_jobs_df.loc[
                                        ready_jobs_df_ind,
                                        "ready_jobs_list_key"]

                                    # how many jobs of this credit class fit in
                                    # the remaining budget (min over cpu/gpu)
                                    slots_min_list = []
                                    if required_cpu_credits > 0:
                                        num_open_cpu_slots = int(
                                            np.floor(num_open_cpu_credits /
                                                     required_cpu_credits))
                                        slots_min_list.append(
                                            num_open_cpu_slots)

                                    if required_gpu_credits > 0:
                                        num_open_gpu_slots = int(
                                            np.floor(num_open_gpu_credits /
                                                     required_gpu_credits))
                                        slots_min_list.append(
                                            num_open_gpu_slots)

                                    num_open_slots = int(
                                        np.min(slots_min_list))

                                    if num_open_slots > 0:
                                        # atomically pop up to num_open_slots jobs
                                        # from the manager's ready list
                                        p = (self.experiment_manager.
                                             strict_redis.pipeline())
                                        p.lrange(
                                            ready_jobs_key, 0,
                                            (num_open_slots -
                                             1))  # lrange is inclusive, so - 1
                                        p.ltrim(ready_jobs_key,
                                                num_open_slots, -1)
                                        pop_job_dicts, _ = p.execute()

                                        if len(pop_job_dicts) > 0:
                                            for pop_job_dict in pop_job_dicts:
                                                pop_job_dict = json.loads(
                                                    pop_job_dict)

                                                print(pop_job_dict)

                                                # checkout the credits
                                                self.redis_client.strict_redis.incrby(
                                                    "cpu_credits_used",
                                                    required_cpu_credits,
                                                )
                                                self.redis_client.strict_redis.incrby(
                                                    "gpu_credits_used",
                                                    required_gpu_credits,
                                                )

                                                self.redis_client.redis_launch_job(
                                                    f"/tmp/dockex/json/{pop_job_dict['name']}.json",
                                                    pop_job_dict,
                                                )

            time.sleep(self.working_sleep_seconds)