# NOTE: this is an excerpt; the standard-library / third-party imports below are
# assumed, along with project-internal helpers imported elsewhere in the project
# (DockexRedisClient, read_job_config, write_job_config, print_progress,
#  CLOSE_ZIP_COMMAND, get_module_stats_keys, ready_jobs_dict_to_key,
#  empty_make_directory, empty_directory, copy_tree, build_project_modules,
#  build_image_run_container, module_path_to_image_tag, check_make_directory,
#  ftp_find_file, update, PythonJobWithBackend).
import abc
import copy
import datetime
import json
import os
import pathlib
import shutil
import time
import zipfile
from collections import OrderedDict

import docker
import numpy as np
import pandas as pd
import redis


class ExperimentManager(abc.ABC):
    def __init__(
        self,
        project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
        tmp_dockex_path="/tmp/dockex",
        initial_job_num=None,
        experiment_name_prefix=None,
        sleep_seconds=0.5,
        save_project=False,
    ):
        super().__init__()

        if project_path is None:
            raise ValueError("A project_path must be provided.")
        else:
            self.project_path = os.path.expanduser(project_path)

        self.tmp_dockex_path = tmp_dockex_path
        self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")

        self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])
        self.docker_client = docker.from_env()

        manager_ip_address = self.redis_client.get("ip_address")
        manager_port = self.redis_client.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=1
        )
        self.dependency_counts_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=2
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=3
        )

        self.initial_job_num = initial_job_num
        if self.initial_job_num is not None:
            self.job_num = self.initial_job_num
        else:
            self.job_num = self.redis_client.get("manager_job_num")

        self.sleep_seconds = sleep_seconds
        self.job_list = []
        self.dockex_path_list = self.redis_client.get("dockex_path_list")

        self.experiment_name_prefix = experiment_name_prefix
        self.experiment_name = f"dockex_{str(datetime.datetime.now()).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
        if self.experiment_name_prefix is not None:
            self.experiment_name = (
                f"{self.experiment_name_prefix}_{self.experiment_name}"
            )

        self.csv_filename = f"jobs_{self.experiment_name}.csv"
        self.csv_pathname = (
            f"/tmp/dockex/data/{self.csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
        self.trials_csv_pathname = (
            f"/tmp/dockex/data/{self.trials_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.extra_output_pathnames = []

        self.save_project = save_project
        self.project_archive_pathname = None
        self.project_archive_filename = None

        self.trial_dict = dict()
        self.trials_list = []

    def send_to_output_saver(self, extra_output_pathname):
        self.extra_output_pathnames.append(extra_output_pathname)

    def generate_job_name(self, module_name):
        job_num = self.job_num
        job_name = f"{module_name}_{str(self.job_num)}"
        self.job_num += 1
        return job_name, job_num

    def add_job(
        self,
        module_path,
        params=None,
        input_pathnames=None,
        skip_job=False,
        skip_input_pathnames=False,
        skip_output_pathnames=False,
        cpu_credits=1,
        gpu_credits=0,
        save_outputs=False,
        params_nested_update=False,
        trial_tag=None,
        save_trial=False,
    ):
        if cpu_credits == 0 and gpu_credits == 0:
            raise ValueError("Either cpu_credits or gpu_credits must be > 0")

        if params is None:
            params = dict()

        if input_pathnames is None:
            input_pathnames = dict()

        module_name = pathlib.PurePath(module_path).name
        config_pathname = f"{self.project_path}/{module_path}/{module_name}.json"

        with open(config_pathname, "r") as fp:
            config = json.load(fp)

        job_name, job_num = self.generate_job_name(module_name)

        config["name"] = job_name
        config["job_num"] = job_num
        config["path"] = module_path
        config["module_name"] = module_name
        config["params_nested_update"] = params_nested_update

        if "params" in config.keys():
            if params_nested_update:
                config["params"] = update(copy.deepcopy(config["params"]), params)
            else:
                config["params"].update(params)
        else:
            config["params"] = params

        if "input_pathnames" in config.keys():
            config["input_pathnames"].update(input_pathnames)
        else:
            config["input_pathnames"] = input_pathnames

        config["skip_job"] = skip_job
        config["skip_input_pathnames"] = skip_input_pathnames
        config["skip_output_pathnames"] = skip_output_pathnames
        config["cpu_credits"] = cpu_credits
        config["gpu_credits"] = gpu_credits
        config["save_outputs"] = save_outputs
        config[
            "skip_docker_wrapper_build"
        ] = True  # ExperimentWorker takes care of building containers before wrapper launched
        config["experiment_job"] = True

        for params_key in config["params"].keys():
            if config["params"][params_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required parameter "{params_key}" for job name "{job_name}"'
                )

        for input_pathname_key in config["input_pathnames"].keys():
            if config["input_pathnames"][input_pathname_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required input pathname "{input_pathname_key}" for job name "{job_name}"'
                )

        for output_pathname_key in config["output_pathnames"].keys():
            config["output_pathnames"][
                output_pathname_key
            ] = f"{module_name}/{job_name}{config['output_pathnames'][output_pathname_key]}"

        if skip_job is False:
            self.job_list.append(copy.deepcopy(config))

        if trial_tag is not None:
            self.trial_dict[trial_tag] = copy.deepcopy(config)

        if save_trial is True:
            self.trials_list.append(copy.deepcopy(self.trial_dict))

        return config["output_pathnames"]

    def archive_project(self):
        self.redis_client.set("status", "ARCHIVING PROJECT")

        self.project_archive_filename = (
            f"project_{self.experiment_name}.zip"
        )
        self.project_archive_pathname = (
            f"/tmp/dockex/data/{self.project_archive_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        shutil.make_archive(
            self.project_archive_pathname.replace(".zip", ""),
            "zip",
            "/tmp/dockex/project",
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.redis_client.set("project_archive_filename", self.project_archive_filename)

    def wait_for_jobs_to_end(self):
        keep_waiting = True
        while keep_waiting:
            time.sleep(self.sleep_seconds)

            num_complete_jobs = self.redis_client.get("num_complete_jobs")
            num_total_jobs = self.redis_client.get("num_total_jobs")

            print_progress(num_complete_jobs, num_total_jobs)

            if num_complete_jobs == num_total_jobs:
                keep_waiting = False

    def wait_for_save_outputs(self):
        # make sure output_saver flag is True
        self.redis_client.set("output_saver_working_flag", True)

        # send an experiment done message to output_saver
        # it should set flag to False once it processes this message
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        # wait for OutputSaver to finish its business
        while self.redis_client.get("output_saver_working_flag") is True:
            pass

    def wait_for_experiment_to_finish(self):
        print("WAITING FOR EXPERIMENT TO FINISH")
        self.redis_client.set("status", "WAITING FOR EXPERIMENT TO FINISH")

        # store the job csv in the experiment zip file
        self.redis_client.rpush("output_saver", self.csv_filename)

        # if a trials csv exists, store it in the experiment zip file
        if os.path.isfile(self.trials_csv_pathname):
            self.redis_client.rpush("output_saver", self.trials_csv_filename)

        # send extra outputs to output_saver
        for extra_output_pathname in self.extra_output_pathnames:
            self.redis_client.rpush("output_saver", extra_output_pathname)

        if self.save_project:
            self.redis_client.rpush("output_saver", self.project_archive_filename)

        self.wait_for_jobs_to_end()

        # generate a csv of all the finished jobs and add it to the zip
        post_job_list = [
            json.loads(b) for b in self.job_lookup_db.mget(self.job_lookup_db.keys("*"))
        ]
        post_csv_filename = f"post_{self.csv_filename}"
        post_csv_pathname = (
            f"/tmp/dockex/data/{post_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        pd.DataFrame(post_job_list).sort_values(by="job_num", ascending=True).set_index(
            "name"
        ).to_csv(post_csv_pathname)
        self.redis_client.rpush("output_saver", post_csv_filename)

        self.wait_for_save_outputs()

        os.remove(post_csv_pathname)
        os.remove(self.csv_pathname)
        os.remove(self.project_archive_pathname)

    def initialize_experiment_variables(self):
        # set the global job num for future experiments
        self.redis_client.set("manager_job_num", self.job_num)

        # flush experiment dbs
        self.dependency_lookup_db.flushdb()
        self.dependency_counts_db.flushdb()
        self.job_lookup_db.flushdb()

        # initialize the overall experiment job counts
        self.redis_client.set("num_total_jobs", 0)
        self.redis_client.set("num_pending_jobs", 0)
        self.redis_client.set("num_ready_jobs", 0)
        self.redis_client.set("num_running_jobs", 0)
        self.redis_client.set("num_complete_jobs", 0)
        self.redis_client.set("num_error_jobs", 0)

        self.redis_client.strict_redis.delete("unique_module_paths")

        unique_module_names = self.redis_client.get_list("unique_module_names")
        for unique_module_name in unique_module_names:
            stats_keys = get_module_stats_keys(unique_module_name)
            for key in stats_keys.values():
                self.redis_client.strict_redis.delete(key)

        self.redis_client.strict_redis.delete("unique_module_names")

        ready_jobs_list_key_dicts = self.redis_client.smembers(
            "ready_jobs_list_key_dicts"
        )
        for ready_jobs_list_key_dict in ready_jobs_list_key_dicts:
            self.redis_client.strict_redis.delete(
                ready_jobs_list_key_dict["ready_jobs_list_key"]
            )
        self.redis_client.strict_redis.delete("ready_jobs_list_key_dicts")

        self.redis_client.set("experiment_name", self.experiment_name)

        # reset output_saver just in case a zip was left open
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        self.redis_client.strict_redis.delete("error_jobs")

    def stage_jobs(self):
        print("STAGING JOBS")
        self.redis_client.set("status", "STAGING JOBS")

        unique_module_names = []
        unique_module_paths = []

        for job in self.job_list:
            input_pathnames = job["input_pathnames"]
            module_name = job["module_name"]
            module_path = job["path"]
            name = job["name"]
            skip_input_pathnames = job["skip_input_pathnames"]

            if module_path not in unique_module_paths:
                unique_module_paths.append(module_path)
                self.redis_client.rpush("unique_module_paths", module_path)

            ready_jobs_list_dict = OrderedDict(
                [
                    ("cpu_credits", job["cpu_credits"]),
                    ("gpu_credits", job["gpu_credits"]),
                ]
            )

            # register the ready_jobs list that corresponds to this job's credits
            ready_jobs_list_key = ready_jobs_dict_to_key(ready_jobs_list_dict)
            ready_jobs_list_dict["ready_jobs_list_key"] = ready_jobs_list_key

            # this is an ordered dict to guarantee the resulting json string is always in the same order
            # we're using a redis set here, and don't want duplicate entries if dict keys are in different order
            self.redis_client.sadd("ready_jobs_list_key_dicts", ready_jobs_list_dict)

            stats_keys = get_module_stats_keys(module_name)

            if module_name not in unique_module_names:
                unique_module_names.append(module_name)
                self.redis_client.rpush("unique_module_names", module_name)

                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.set(stats_keys["num_total_jobs"], 1)
                self.redis_client.set(stats_keys["num_pending_jobs"], 0)
                self.redis_client.set(stats_keys["num_ready_jobs"], 0)
                self.redis_client.set(stats_keys["num_running_jobs"], 0)
                self.redis_client.set(stats_keys["num_complete_jobs"], 0)
                self.redis_client.set(stats_keys["num_error_jobs"], 0)
            else:
                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.strict_redis.incr(stats_keys["num_total_jobs"])

            num_input_pathnames = 0
            if len(input_pathnames.keys()) > 0:
                for input_pathname_key in input_pathnames.keys():
                    input_pathname = input_pathnames[input_pathname_key]

                    if input_pathname is not None:
                        if (
                            skip_input_pathnames is False
                            or skip_input_pathnames is None
                        ):
                            self.dependency_lookup_db.sadd(input_pathname, name)
                            num_input_pathnames += 1

                        elif skip_input_pathnames is True:
                            pass

                        elif type(skip_input_pathnames) is list:
                            if input_pathname_key in skip_input_pathnames:
                                pass
                            else:
                                self.dependency_lookup_db.sadd(input_pathname, name)
                                num_input_pathnames += 1

            if num_input_pathnames > 0:
                self.dependency_counts_db.set(name, num_input_pathnames)
                self.redis_client.strict_redis.incr(stats_keys["num_pending_jobs"])
                self.redis_client.strict_redis.incr("num_pending_jobs")
            else:
                self.redis_client.rpush(ready_jobs_list_key, job)
                self.redis_client.strict_redis.incr(stats_keys["num_ready_jobs"])
                self.redis_client.strict_redis.incr("num_ready_jobs")

            self.redis_client.strict_redis.incr("num_total_jobs")

            # register the job on the backend
            self.job_lookup_db.set(name, json.dumps(job))

    def set_manager_flag(self):
        print("SETTING MANAGER FLAG")
        self.redis_client.set("status", "SETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", True)

    def unset_manager_flag(self):
        print("UNSETTING MANAGER FLAG")
        self.redis_client.set("status", "UNSETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", False)

    def generate_job_csv(self):
        print("GENERATING JOB CSV")
        pd.DataFrame(self.job_list).to_csv(self.csv_pathname)

    def generate_trial_csv(self):
        print("GENERATING TRIALS CSV")
        if len(self.trials_list) > 0:
            pd.DataFrame(self.trials_list).to_csv(self.trials_csv_pathname)

    def copy_project(self):
        print("COPYING PROJECT")
        self.redis_client.set("status", "COPYING PROJECT DIRECTORY")

        tmp_project_path = f"{self.tmp_dockex_path}/project"
        empty_make_directory(tmp_project_path)
        copy_tree(self.project_path, tmp_project_path)
        os.system(f"chown -R nonroot:nonroot {tmp_project_path}")

    def acquire_prevent_experiment_overlap_flag(self):
        print("ACQUIRING PREVENT EXPERIMENT OVERLAP FLAG")

        if self.redis_client.get("prevent_experiment_overlap_flag") is True:
            print("WAITING FOR PREVIOUS LOCAL EXPERIMENT TO FINISH")
            while self.redis_client.get("prevent_experiment_overlap_flag") is True:
                pass

        self.redis_client.set("prevent_experiment_overlap_flag", True)

        # TODO: also check and wait for remote machines to prevent overlapping experiments

    def release_prevent_experiment_overlap_flag(self):
        print("RELEASING PREVENT EXPERIMENT OVERLAP FLAG")
        self.redis_client.set("prevent_experiment_overlap_flag", False)

    def run(self, print_build_logs=False):
        print("RUNNING EXPERIMENT")
        self.redis_client.set("status", "RUNNING EXPERIMENT")

        self.generate_job_csv()
        self.generate_trial_csv()
        self.acquire_prevent_experiment_overlap_flag()

        start = time.time()
        try:
            self.initialize_experiment_variables()
            self.copy_project()
            self.stage_jobs()

            build_project_modules(
                self.docker_client,
                self.redis_client.get_list("unique_module_paths"),
                print_build_logs=print_build_logs,
                redis_client=self.redis_client,
            )

            self.archive_project()
            self.set_manager_flag()
            self.redis_client.set("status", "RUNNING EXPERIMENT")
            self.wait_for_experiment_to_finish()
            self.unset_manager_flag()

        except:
            self.wait_for_save_outputs()
            self.release_prevent_experiment_overlap_flag()
            self.unset_manager_flag()
            self.redis_client.set("status", "EXPERIMENT FAILED")
            raise

        end = time.time()

        self.release_prevent_experiment_overlap_flag()
        self.redis_client.set("status", "EXPERIMENT COMPLETE")
        print(f"EXPERIMENT EXECUTION TIME: {round((end - start), 2)} seconds")
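# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of how a concrete experiment script might drive
# ExperimentManager. The subclass name, module paths, and parameter/pathname
# keys below are hypothetical placeholders, not part of the code above; only
# the add_job()/run() calls mirror the API defined above.
class ExampleExperiment(ExperimentManager):
    pass


if __name__ == "__main__":
    experiment = ExampleExperiment(
        project_path="/home/experiment/project",  # default from __init__ above
        experiment_name_prefix="example",
    )

    # queue a (hypothetical) module; the returned output pathnames can be wired
    # into downstream jobs as input_pathnames to express dependencies
    producer_outputs = experiment.add_job(
        "modules/example_producer",      # hypothetical module path
        params={"example_param": 1},     # hypothetical parameter
        cpu_credits=1,
        save_outputs=True,
    )

    experiment.add_job(
        "modules/example_consumer",      # hypothetical downstream module
        input_pathnames={"input_file": producer_outputs["output_file"]},  # hypothetical keys
        cpu_credits=1,
    )

    experiment.run(print_build_logs=True)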
class DockerWrapper:
    def __init__(self, input_args):
        super().__init__()

        self.json_pathname = input_args[1]
        self.redis_address = input_args[2]

        self.redis_client = DockexRedisClient(self.redis_address)
        self.tmp_dockex_path = self.redis_client.get("tmp_dockex_path")

        self.docker_client = docker.from_env()

        self.job_config = read_job_config(self.json_pathname)
        self.dockerfile_path = f"{self.job_config['path']}/Dockerfile"

        if "image_tag" in self.job_config.keys():
            self.image_tag = self.job_config["image_tag"]
        else:
            self.image_tag = module_path_to_image_tag(self.job_config["path"])

        self.command_args = self.generate_command_args()
        self.volumes = self.generate_volumes()
        self.network_mode = "host"

        self.environment = None
        if "include_json_pathname_env_variable" in self.job_config.keys():
            if self.job_config["include_json_pathname_env_variable"]:
                self.environment = {"JSON_PATHNAME": self.json_pathname}

        self.skip_build = False
        if "skip_docker_wrapper_build" in self.job_config.keys():
            if self.job_config["skip_docker_wrapper_build"] is True:
                self.skip_build = True

        # build path depends on if path is in core or relative to /tmp/dockex/project
        if self.job_config["path"].startswith("core/"):
            self.build_path = "."
        else:
            self.build_path = "/tmp/dockex/project"

        if "experiment_job" in self.job_config.keys():
            self.experiment_job = self.job_config["experiment_job"]
        else:
            self.experiment_job = False

        if self.experiment_job is True:
            self.detach = False
        else:
            self.detach = True

        self.build_kwargs_dict = dict(
            path=self.build_path, dockerfile=self.dockerfile_path, tag=self.image_tag
        )

        self.run_kwargs_dict = dict(
            image=self.image_tag,
            name=self.job_config["name"],
            command=self.command_args,
            detach=self.detach,
            network_mode=self.network_mode,
            volumes=self.volumes,
            environment=self.environment,
        )

        # check global gpus enable
        if self.redis_client.get("enable_gpus") is True:
            self.run_kwargs_dict["enable_gpus"] = True
        else:
            self.run_kwargs_dict["enable_gpus"] = False

        # allow module to override global gpus enable
        if "enable_gpus" in self.job_config.keys():
            if self.job_config["enable_gpus"] is True:
                self.run_kwargs_dict["enable_gpus"] = True
            else:
                self.run_kwargs_dict["enable_gpus"] = False

        self.good_to_launch = None
        self.experiment_manager_address = None
        self.experiment_manager = None
        self.dependency_lookup_db = None
        self.job_lookup_db = None
        self.stats_keys = None

        self.container_data_prefix = "/tmp/dockex/data/"
        self.sleep_seconds = 0.25

    def generate_command_args(self):
        command_args = f"{self.json_pathname}"

        if "omit_json_pathname_arg" in self.job_config.keys():
            if self.job_config["omit_json_pathname_arg"]:
                command_args = ""

        if "pass_redis_address_arg" in self.job_config.keys():
            if self.job_config["pass_redis_address_arg"]:
                if command_args == "":
                    command_args = f"{self.redis_address}"
                else:
                    command_args = f"{command_args} {self.redis_address}"

        if "command_args" in self.job_config.keys():
            if command_args == "":
                command_args = f"{self.job_config['command_args']}"
            else:
                command_args = f"{command_args} {self.job_config['command_args']}"

        return command_args

    def generate_volumes(self):
        volumes = {self.tmp_dockex_path: {"bind": "/tmp/dockex", "mode": "rw"}}

        if "bind_mount_docker_socket" in self.job_config.keys():
            if self.job_config["bind_mount_docker_socket"]:
                volumes["/var/run/docker.sock"] = {
                    "bind": "/var/run/docker.sock",
                    "mode": "rw",
                }

        if "volumes" in self.job_config.keys():
            for volume_key in self.job_config["volumes"].keys():
                volumes[volume_key] = {
                    "bind": self.job_config["volumes"][volume_key],
                    "mode": "rw",
                }

        return volumes

    def connect_to_experiment_manager(self):
        print("GETTING MANAGER REDIS ADDRESS")
        keep_trying = True
        while keep_trying:
            self.experiment_manager_address = self.redis_client.get(
                "manager_redis_address"
            )

            if self.experiment_manager_address is not None:
                keep_trying = False
                print("FOUND MANAGER REDIS ADDRESS")
            else:
                print("NO MANAGER FOUND, TRYING AGAIN")
                time.sleep(self.sleep_seconds)

        print("CONNECTING TO EXPERIMENT MANAGER")
        self.experiment_manager = DockexRedisClient(self.experiment_manager_address)

        experiment_manager_ip_address = self.experiment_manager.get("ip_address")
        experiment_manager_port = self.experiment_manager.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=1
        )
        self.job_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=3
        )

    def prepare_input_pathnames(self):
        input_pathnames = self.job_config["input_pathnames"]

        if len(input_pathnames.keys()) > 0:
            # loop through ftp clients, connect, keep trying until it connects
            # (in case workers take a while to spin up)
            for input_pathname_key in input_pathnames.keys():
                input_pathname = input_pathnames[input_pathname_key]

                if input_pathname is not None:
                    local_input_pathname = (
                        f"{self.container_data_prefix}{input_pathname}"
                    )

                    # if the file doesn't exist, go find it
                    print("CHECKING FOR FILE: " + local_input_pathname)
                    if not os.path.isfile(local_input_pathname):
                        print("GOING TO LOOK FOR FILE")
                        ftp_find_file(
                            self.experiment_manager.get_list("dockex_machines"),
                            self.redis_client.get("ip_address"),
                            f"data/{input_pathname}",
                            local_input_pathname,
                        )

                    # update input_pathnames with local path
                    input_pathnames[input_pathname_key] = local_input_pathname

        # assign local input pathnames to job config for job
        self.job_config["input_pathnames"] = input_pathnames

        # check that all input pathnames exist
        if len(self.job_config["input_pathnames"].values()) > 0:
            check_pathnames = [
                os.path.isfile(check_pathname)
                for check_pathname in self.job_config["input_pathnames"].values()
                if check_pathname is not None
            ]
            self.good_to_launch = all(check is True for check in check_pathnames)
        else:
            self.good_to_launch = True

    def prepare_output_pathnames(self):
        output_pathnames = self.job_config["output_pathnames"]

        if len(output_pathnames.keys()) > 0:
            for output_pathname_key in output_pathnames.keys():
                output_pathname = output_pathnames[output_pathname_key]

                if output_pathname is not None:
                    local_output_pathname = (
                        f"{self.container_data_prefix}{output_pathname}"
                    )

                    # if the file is inside a directory, make sure that directory exists
                    local_output_path = os.path.split(local_output_pathname)[0]
                    if local_output_path != "":
                        check_make_directory(local_output_path)
                        os.system(f"chown -R nonroot:nonroot {local_output_path}")

                    output_pathnames[output_pathname_key] = local_output_pathname

        self.job_config["output_pathnames"] = output_pathnames

    def launch_experiment_job(self):
        print("GOOD TO LAUNCH")

        # overwrite json file with local input/output pathnames
        write_job_config(self.json_pathname, self.job_config)

        # update pending/ready/running numbers for experiment and job_command
        # use a backend pipeline so it's all atomic
        # this is a job going from ready to running
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_ready_jobs")
        update_pipeline.decr(self.stats_keys["num_ready_jobs"])
        update_pipeline.incr("num_running_jobs")
        update_pipeline.incr(self.stats_keys["num_running_jobs"])
        update_pipeline.execute()

        start_time = datetime.datetime.now()

        # launch the job
        try:
            build_image_run_container(
                self.docker_client,
                self.build_kwargs_dict,
                self.run_kwargs_dict,
                print_build_logs=True,
                skip_build=self.skip_build,
                native_run=True,
            )
        except Exception as e:
            print("EXCEPTION WHILE RUNNING CONTAINER")
            print(e)

        end_time = datetime.datetime.now()

        self.job_config["start_time"] = str(start_time)
        self.job_config["end_time"] = str(end_time)
        self.job_config["execution_time"] = str(end_time - start_time)

        print("GOOD LAUNCH")

    def cleanup_job(self):
        # release the credits
        self.redis_client.strict_redis.decrby(
            "cpu_credits_used", int(self.job_config["cpu_credits"])
        )
        self.redis_client.strict_redis.decrby(
            "gpu_credits_used", int(self.job_config["gpu_credits"])
        )

        skip_output_pathnames = self.job_config["skip_output_pathnames"]
        if type(skip_output_pathnames) is not list:
            if skip_output_pathnames is True:
                # skip checking all declared output pathnames
                skip_output_pathnames = list(self.job_config["output_pathnames"].keys())
            else:
                skip_output_pathnames = []

        # check if its output_pathnames exist
        successful_job = True
        for local_output_pathname_key in self.job_config["output_pathnames"].keys():
            # local output_pathname contains the container_data_prefix
            local_output_pathname = self.job_config["output_pathnames"][
                local_output_pathname_key
            ]

            # remove the local data_path prepend
            output_pathname = local_output_pathname.replace(
                self.container_data_prefix, ""
            )

            # if the output_pathname doesn't exist and we're not skipping that output_pathname, an error occurred
            if not os.path.isfile(local_output_pathname):
                if local_output_pathname_key not in skip_output_pathnames:
                    # set the flag
                    successful_job = False

            # if the file does exist, save the output if requested
            # NOTE: it's important to push to output_saver before updating num_complete_jobs
            # NOTE: because ExperimentManager assumes this to determine when experiment has ended
            else:
                if self.job_config["save_outputs"]:
                    self.experiment_manager.rpush("output_saver", output_pathname)

                self.check_dependencies(output_pathname)

        # update the progress counts on ExperimentStager
        # this is a running to complete
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_running_jobs")
        update_pipeline.decr(self.stats_keys["num_running_jobs"])
        update_pipeline.incr("num_complete_jobs")
        update_pipeline.incr(self.stats_keys["num_complete_jobs"])
        update_pipeline.execute()

        if successful_job:
            self.job_config["status"] = "SUCCESS"
        else:
            self.job_config["status"] = "ERROR"

            # push to error_jobs list
            self.experiment_manager.rpush("error_jobs", self.job_config)

            # update progress counts
            update_pipeline = self.experiment_manager.strict_redis.pipeline()
            update_pipeline.incr("num_error_jobs")
            update_pipeline.incr(self.stats_keys["num_error_jobs"])
            update_pipeline.execute()

        job_config_json = json.dumps(self.job_config)

        # write job dict with status to backend
        self.job_lookup_db.set(self.job_config["name"], job_config_json)

    def check_dependencies(self, output_pathname):
        # get the job keys that depend on this output_pathname
        print("OUTPUT_PATHNAME: " + output_pathname)
        dependent_job_names = [
            b.decode("utf-8")
            for b in self.dependency_lookup_db.smembers(output_pathname)
        ]
        print("DEPENDENCY NAMES: " + str(dependent_job_names))

        for dependent_job_name in dependent_job_names:
            print("PROCESSING DEPENDENCY: " + dependent_job_name)
            self.experiment_manager.rpush("decrement_dependency", dependent_job_name)

    def failure_to_launch(self):
        # report error
        print("BAD LAUNCH")
        self.job_config["status"] = "ERROR"
        print(self.job_config)

        # ExperimentWorker checked out credits before launching DockerWrapper
        # since job errored, check credits back in
        self.redis_client.strict_redis.decrby(
            "cpu_credits_used", int(self.job_config["cpu_credits"])
        )
        self.redis_client.strict_redis.decrby(
            "gpu_credits_used", int(self.job_config["gpu_credits"])
        )

        # push to error_jobs list
        self.experiment_manager.rpush("error_jobs", self.job_config)
        self.job_lookup_db.set(self.job_config["name"], json.dumps(self.job_config))

        # propagate error for dependent jobs
        for local_output_pathname in self.job_config["output_pathnames"].values():
            # remove the local data_path prepend
            output_pathname = local_output_pathname.replace(
                self.container_data_prefix, ""
            )
            self.check_dependencies(output_pathname)

        # update progress counts
        # ready to complete/error
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_ready_jobs")
        update_pipeline.decr(self.stats_keys["num_ready_jobs"])
        update_pipeline.incr("num_error_jobs")
        update_pipeline.incr(self.stats_keys["num_error_jobs"])
        update_pipeline.incr("num_complete_jobs")
        update_pipeline.incr(self.stats_keys["num_complete_jobs"])
        update_pipeline.execute()

    def run(self):
        print(self.job_config)
        print("build kwargs:")
        print(self.build_kwargs_dict)
        print("run kwargs:")
        print(self.run_kwargs_dict)

        if self.experiment_job is not True:
            build_image_run_container(
                self.docker_client,
                self.build_kwargs_dict,
                self.run_kwargs_dict,
                print_build_logs=True,
                skip_build=self.skip_build,
                native_run=True,
            )
        else:
            print("RUNNING EXPERIMENT JOB")
            self.connect_to_experiment_manager()
            self.prepare_input_pathnames()
            self.prepare_output_pathnames()
            self.stats_keys = get_module_stats_keys(self.job_config["module_name"])

            if self.good_to_launch:
                self.launch_experiment_job()
                self.cleanup_job()
            else:
                self.failure_to_launch()

            # make sure there aren't any lingering root permission files
            os.system(f"chown -R nonroot:nonroot {self.container_data_prefix}")

        print("SUCCESS")
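# --- Entry-point sketch (illustrative only) ----------------------------------
# DockerWrapper.__init__ reads the job config pathname from input_args[1] and
# the backend redis address from input_args[2], which lines up with a plain
# sys.argv invocation. The actual Dockex entry point may differ; this is only
# a minimal sketch of how the class above could be driven.
if __name__ == "__main__":
    import sys

    # e.g. python docker_wrapper.py /tmp/dockex/json/<job_name>.json <redis_address>
    DockerWrapper(sys.argv).run()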
class ExperimentWorker(PythonJobWithBackend):
    def __init__(
        self,
        input_args,
        checking_manager_sleep_seconds=0.5,
        working_sleep_seconds=0.25,
    ):
        super().__init__(input_args)

        self.checking_manager_sleep_seconds = checking_manager_sleep_seconds
        self.working_sleep_seconds = working_sleep_seconds

        self.docker_client = docker.from_env()

        self.experiment_manager = None
        self.experiment_manager_dict = None

    def run_job(self):
        while True:
            # check if we're connected to a manager
            # if we're NOT connected to a manager
            if self.experiment_manager is None:
                # check if there are any managers available
                dockex_machines_df = pd.DataFrame(
                    self.redis_client.get_list("dockex_machines")
                )

                if len(dockex_machines_df) > 0:
                    manager_machines_df = dockex_machines_df.loc[
                        dockex_machines_df.manager_flag == True
                    ]

                    if len(manager_machines_df) > 0:
                        # if so, connect to the manager
                        self.experiment_manager_dict = manager_machines_df.iloc[0].to_dict()
                        self.experiment_manager = DockexRedisClient(
                            self.experiment_manager_dict["redis_address"]
                        )

                        self.redis_client.set(
                            "manager_redis_address",
                            self.experiment_manager_dict["redis_address"],
                        )

                        # if the manager is not the local manager
                        if (
                            self.experiment_manager_dict["redis_address"]
                            != self.redis_address
                        ):
                            # empty the project directory
                            empty_directory("/tmp/dockex/project")
                            empty_directory("/tmp/dockex/data")

                            # need to copy project archive, unarchive it, and build module images
                            project_archive_filename = self.experiment_manager.get(
                                "project_archive_filename"
                            )
                            local_project_archive_filename = (
                                f"/tmp/dockex/data/{project_archive_filename}"
                            )

                            found_project_archive = ftp_find_file(
                                self.experiment_manager.get_list("dockex_machines"),
                                self.redis_client.get("ip_address"),
                                f"data/{project_archive_filename}",
                                local_project_archive_filename,
                            )

                            if found_project_archive:
                                with zipfile.ZipFile(
                                    local_project_archive_filename, "r"
                                ) as zip_file:
                                    zip_file.extractall("/tmp/dockex/project")

                                # build the module images
                                experiment_module_paths = self.experiment_manager.get_list(
                                    "unique_module_paths"
                                )

                                # TODO: need a way to signal to the experiment that a build failed
                                # TODO: maybe a flag on manager that the experiment continually checks
                                # TODO: or maybe manager needs to test build before setting manager flag?
                                # TODO: even then though, if a build fails on remote host, that host should NOT work on that experiment name
                                # TODO: maybe a worker should track bad experiment names
                                self.redis_client.set("status", "BUILDING PROJECT MODULES")
                                build_project_modules(
                                    self.docker_client, experiment_module_paths
                                )
                            else:
                                self.experiment_manager_dict = None
                                self.experiment_manager = None
                                self.redis_client.strict_redis.delete(
                                    "manager_redis_address"
                                )
                    else:
                        time.sleep(self.checking_manager_sleep_seconds)
                else:
                    time.sleep(self.checking_manager_sleep_seconds)

            # if we are connected to a manager
            else:
                # check if the manager is still a manager
                # if it is NOT still a manager
                if self.experiment_manager.get("manager_flag") is not True:
                    # disconnect from the manager
                    self.experiment_manager_dict = None
                    self.experiment_manager = None
                    self.redis_client.strict_redis.delete("manager_redis_address")

                # if it is still a manager
                else:
                    # check that the experiment name is the same
                    # if it is NOT the same, a new experiment has started
                    if (
                        self.experiment_manager.get("experiment_name")
                        != self.experiment_manager_dict["experiment_name"]
                    ):
                        # disconnect from the manager
                        self.experiment_manager_dict = None
                        self.experiment_manager = None
                        self.redis_client.strict_redis.delete("manager_redis_address")

                    # if the experiment name is the same
                    else:
                        # see if we can pull any work to do
                        # get the list of ready_jobs lists
                        ready_jobs_df = pd.DataFrame(
                            self.experiment_manager.smembers("ready_jobs_list_key_dicts")
                        )

                        if len(ready_jobs_df) > 0:
                            # start with the jobs requiring the most credits
                            ready_jobs_df = ready_jobs_df.sort_values(
                                by=["gpu_credits", "cpu_credits"], ascending=False
                            )

                            num_open_cpu_credits = self.redis_client.get(
                                "cpu_credits_total"
                            ) - self.redis_client.get("cpu_credits_used")
                            num_open_gpu_credits = self.redis_client.get(
                                "gpu_credits_total"
                            ) - self.redis_client.get("gpu_credits_used")

                            if num_open_cpu_credits > 0 or num_open_gpu_credits > 0:
                                for ready_jobs_df_ind in ready_jobs_df.index:
                                    num_open_cpu_credits = self.redis_client.get(
                                        "cpu_credits_total"
                                    ) - self.redis_client.get("cpu_credits_used")
                                    num_open_gpu_credits = self.redis_client.get(
                                        "gpu_credits_total"
                                    ) - self.redis_client.get("gpu_credits_used")

                                    required_cpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind, "cpu_credits"]
                                    )
                                    required_gpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind, "gpu_credits"]
                                    )
                                    ready_jobs_key = ready_jobs_df.loc[
                                        ready_jobs_df_ind, "ready_jobs_list_key"
                                    ]

                                    slots_min_list = []
                                    if required_cpu_credits > 0:
                                        num_open_cpu_slots = int(
                                            np.floor(
                                                num_open_cpu_credits / required_cpu_credits
                                            )
                                        )
                                        slots_min_list.append(num_open_cpu_slots)

                                    if required_gpu_credits > 0:
                                        num_open_gpu_slots = int(
                                            np.floor(
                                                num_open_gpu_credits / required_gpu_credits
                                            )
                                        )
                                        slots_min_list.append(num_open_gpu_slots)

                                    num_open_slots = int(np.min(slots_min_list))

                                    if num_open_slots > 0:
                                        p = self.experiment_manager.strict_redis.pipeline()
                                        p.lrange(
                                            ready_jobs_key, 0, (num_open_slots - 1)
                                        )  # lrange is inclusive, so - 1
                                        p.ltrim(ready_jobs_key, num_open_slots, -1)
                                        pop_job_dicts, _ = p.execute()

                                        if len(pop_job_dicts) > 0:
                                            for pop_job_dict in pop_job_dicts:
                                                pop_job_dict = json.loads(pop_job_dict)
                                                print(pop_job_dict)

                                                # checkout the credits
                                                self.redis_client.strict_redis.incrby(
                                                    "cpu_credits_used",
                                                    required_cpu_credits,
                                                )
                                                self.redis_client.strict_redis.incrby(
                                                    "gpu_credits_used",
                                                    required_gpu_credits,
                                                )

                                                self.redis_client.redis_launch_job(
                                                    f"/tmp/dockex/json/{pop_job_dict['name']}.json",
                                                    pop_job_dict,
                                                )

            time.sleep(self.working_sleep_seconds)
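# --- Credit/slot arithmetic sketch (illustrative only) ------------------------
# The loop in run_job() above converts open credits into the number of job
# "slots" it can claim from a ready_jobs list: floor(open / required) per
# resource, then the minimum across resources. A standalone restatement of that
# arithmetic with made-up numbers; the helper name is hypothetical.
def open_job_slots(open_cpu, open_gpu, required_cpu, required_gpu):
    """Return how many jobs with the given credit requirements fit in the open credits."""
    slots = []
    if required_cpu > 0:
        slots.append(int(np.floor(open_cpu / required_cpu)))
    if required_gpu > 0:
        slots.append(int(np.floor(open_gpu / required_gpu)))
    return int(np.min(slots)) if slots else 0


# e.g. 6 open cpu credits and 1 open gpu credit, with a job needing
# 2 cpu + 1 gpu credits: min(floor(6/2), floor(1/1)) = 1 slot
assert open_job_slots(6, 1, 2, 1) == 1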