def pop_callback(self, credits_update):
    if credits_update["mode"] == "incr":
        DockexRedisClient(credits_update["redis_address"]).strict_redis.incr(
            f"{credits_update['type']}_credits_total"
        )
    elif credits_update["mode"] == "decr":
        DockexRedisClient(credits_update["redis_address"]).strict_redis.decr(
            f"{credits_update['type']}_credits_total"
        )
    elif credits_update["mode"] == "set":
        DockexRedisClient(credits_update["redis_address"]).strict_redis.set(
            f"{credits_update['type']}_credits_total", credits_update["value"]
        )
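
# A minimal sketch of the credits_update payload pop_callback expects, based only on
# the fields referenced above; the address and values below are hypothetical.
example_credits_update = {
    "mode": "incr",  # one of "incr", "decr", or "set"
    "type": "cpu",  # resolves to the "cpu_credits_total" key
    "redis_address": "http://127.0.0.1:6379",  # hypothetical Dockex redis address
    "value": 4,  # only read when mode == "set"
}
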
def connect_to_experiment_manager(self):
    print("GETTING MANAGER REDIS ADDRESS")

    keep_trying = True
    while keep_trying:
        self.experiment_manager_address = self.redis_client.get(
            "manager_redis_address"
        )

        if self.experiment_manager_address is not None:
            keep_trying = False
            print("FOUND MANAGER REDIS ADDRESS")
        else:
            print("NO MANAGER FOUND, TRYING AGAIN")
            time.sleep(self.sleep_seconds)

    print("CONNECTING TO EXPERIMENT MANAGER")
    self.experiment_manager = DockexRedisClient(self.experiment_manager_address)

    experiment_manager_ip_address = self.experiment_manager.get("ip_address")
    experiment_manager_port = self.experiment_manager.get("redis_port")

    self.dependency_lookup_db = redis.StrictRedis(
        host=experiment_manager_ip_address, port=experiment_manager_port, db=1
    )
    self.job_lookup_db = redis.StrictRedis(
        host=experiment_manager_ip_address, port=experiment_manager_port, db=3
    )
def run_job(self):
    while True:
        try:
            discovered_machine_ips = self.redis_client.get_list(
                "machines_on_network"
            )

            if len(discovered_machine_ips) > 0:
                for machine_ip_address in discovered_machine_ips:
                    # this assumes that all cluster machines use same port for redis
                    check_redis_address = (
                        f"http://{machine_ip_address}:{self.redis_port}"
                    )

                    try:
                        if (
                            DockexRedisClient(check_redis_address).get("dockex_backend")
                            is True
                        ):
                            self.redis_client.sadd(
                                "dockex_redis_addresses", check_redis_address
                            )
                    except (redis.exceptions.ConnectionError, TypeError):
                        self.redis_client.srem(
                            "dockex_redis_addresses", check_redis_address
                        )

            # machine discovery won't pick up local machine if using 127.0.0.1
            # always make sure local machine gets registered
            else:
                self.redis_client.sadd("dockex_redis_addresses", self.redis_address)

        except Exception as e:
            print(e)

        time.sleep(self.sleep_seconds)
def launch_redis(self):
    print("BUILDING AND RUNNING REDIS")
    build_image_run_container(
        self.docker_client,
        dict(
            path=".",
            dockerfile="core/services/backend/dockex_redis/Dockerfile",
            tag="dockex_redis_image",
        ),
        dict(
            image="dockex_redis_image",
            name="dockex_redis",
            detach=True,
            network_mode="host",
            volumes={
                self.config["tmp_dockex_path"]: {
                    "bind": "/tmp/dockex",
                    "mode": "rw",
                }
            },
        ),
        print_build_logs=True,
    )

    # connect to redis and flush
    self.redis_client = DockexRedisClient(self.config["redis_address"])

    trying_to_connect = True
    while trying_to_connect:
        try:
            self.redis_client.flushdb()
            trying_to_connect = False
        except redis.exceptions.ConnectionError:
            pass

    # fill redis with dockex config values
    for key in self.config.keys():
        self.redis_client.set(key, self.config[key])

    # mark the redis instance as a dockex backend
    self.redis_client.set("dockex_backend", True)

    self.redis_client.set("status", "LAUNCHED REDIS")
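
# A minimal sketch of the self.config fields launch_redis relies on; the values are
# hypothetical and all other Dockex config keys are omitted.
example_launch_redis_config = {
    "tmp_dockex_path": "/tmp/dockex",  # host path mounted into the container at /tmp/dockex
    "redis_address": "http://127.0.0.1:6379",  # hypothetical address passed to DockexRedisClient
}
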
def run_job(self):
    while True:
        try:
            dockex_redis_addresses = self.redis_client.smembers(
                "dockex_redis_addresses"
            )

            p = self.redis_client.strict_redis.pipeline()
            p.delete("dockex_machines")

            for dockex_redis_address in dockex_redis_addresses:
                try:
                    temp_client = DockexRedisClient(dockex_redis_address)

                    dockex_status_dict = dict(
                        machine_name=temp_client.get("machine_name"),
                        redis_address=dockex_redis_address,
                        manager_flag=temp_client.get("manager_flag"),
                        experiment_name=temp_client.get("experiment_name"),
                        ip_address=temp_client.get("ip_address"),
                        tmp_dockex_ftpd_port=temp_client.get("tmp_dockex_ftpd_port"),
                        tmp_dockex_ftpd_password=temp_client.get(
                            "tmp_dockex_ftpd_password"
                        ),
                    )

                    p.rpush("dockex_machines", json.dumps(dockex_status_dict))

                except redis.exceptions.ConnectionError:
                    pass

            p.execute()

        except Exception as e:
            print(e)

        time.sleep(self.sleep_seconds)
def __init__(self, input_args):
    super().__init__()

    self.json_pathname = input_args[1]
    self.redis_address = input_args[2]

    self.redis_client = DockexRedisClient(self.redis_address)
    self.tmp_dockex_path = self.redis_client.get("tmp_dockex_path")

    self.docker_client = docker.from_env()

    self.job_config = read_job_config(self.json_pathname)
    self.dockerfile_path = f"{self.job_config['path']}/Dockerfile"

    if "image_tag" in self.job_config.keys():
        self.image_tag = self.job_config["image_tag"]
    else:
        self.image_tag = module_path_to_image_tag(self.job_config["path"])

    self.command_args = self.generate_command_args()
    self.volumes = self.generate_volumes()
    self.network_mode = "host"

    self.environment = None
    if "include_json_pathname_env_variable" in self.job_config.keys():
        if self.job_config["include_json_pathname_env_variable"]:
            self.environment = {"JSON_PATHNAME": self.json_pathname}

    self.skip_build = False
    if "skip_docker_wrapper_build" in self.job_config.keys():
        if self.job_config["skip_docker_wrapper_build"] is True:
            self.skip_build = True

    # build path depends on whether path is in core or relative to /tmp/dockex/project
    if self.job_config["path"].startswith("core/"):
        self.build_path = "."
    else:
        self.build_path = "/tmp/dockex/project"

    if "experiment_job" in self.job_config.keys():
        self.experiment_job = self.job_config["experiment_job"]
    else:
        self.experiment_job = False

    if self.experiment_job is True:
        self.detach = False
    else:
        self.detach = True

    self.build_kwargs_dict = dict(
        path=self.build_path, dockerfile=self.dockerfile_path, tag=self.image_tag
    )

    self.run_kwargs_dict = dict(
        image=self.image_tag,
        name=self.job_config["name"],
        command=self.command_args,
        detach=self.detach,
        network_mode=self.network_mode,
        volumes=self.volumes,
        environment=self.environment,
    )

    # check global gpus enable
    if self.redis_client.get("enable_gpus") is True:
        self.run_kwargs_dict["enable_gpus"] = True
    else:
        self.run_kwargs_dict["enable_gpus"] = False

    # allow module to override global gpus enable
    if "enable_gpus" in self.job_config.keys():
        if self.job_config["enable_gpus"] is True:
            self.run_kwargs_dict["enable_gpus"] = True
        else:
            self.run_kwargs_dict["enable_gpus"] = False

    self.good_to_launch = None
    self.experiment_manager_address = None
    self.experiment_manager = None
    self.dependency_lookup_db = None
    self.job_lookup_db = None
    self.stats_keys = None

    self.container_data_prefix = "/tmp/dockex/data/"

    self.sleep_seconds = 0.25
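
# A minimal sketch of a job config exercising the optional keys handled in __init__
# above; the module path, name, and values are hypothetical placeholders.
example_job_config = {
    "path": "modules/example_module",  # hypothetical module path (not under core/, so built from /tmp/dockex/project)
    "name": "example_module_1",
    "image_tag": "example_module_image",  # optional; otherwise derived via module_path_to_image_tag
    "include_json_pathname_env_variable": True,  # optional; passes JSON_PATHNAME into the container
    "skip_docker_wrapper_build": False,  # optional; True skips the image build
    "experiment_job": False,  # optional; experiment jobs run attached (detach=False)
    "enable_gpus": False,  # optional; overrides the global enable_gpus setting
}
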
def __init__(
    self,
    project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
    tmp_dockex_path="/tmp/dockex",
    initial_job_num=None,
    experiment_name_prefix=None,
    sleep_seconds=0.5,
    save_project=False,
):
    super().__init__()

    if project_path is None:
        raise ValueError("A project_path must be provided.")
    else:
        self.project_path = os.path.expanduser(project_path)

    self.tmp_dockex_path = tmp_dockex_path
    self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")

    self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])
    self.docker_client = docker.from_env()

    manager_ip_address = self.redis_client.get("ip_address")
    manager_port = self.redis_client.get("redis_port")

    self.dependency_lookup_db = redis.StrictRedis(
        host=manager_ip_address, port=manager_port, db=1
    )
    self.dependency_counts_db = redis.StrictRedis(
        host=manager_ip_address, port=manager_port, db=2
    )
    self.job_lookup_db = redis.StrictRedis(
        host=manager_ip_address, port=manager_port, db=3
    )

    self.initial_job_num = initial_job_num
    if self.initial_job_num is not None:
        self.job_num = self.initial_job_num
    else:
        self.job_num = self.redis_client.get("manager_job_num")

    self.sleep_seconds = sleep_seconds

    self.job_list = []

    self.dockex_path_list = self.redis_client.get("dockex_path_list")

    self.experiment_name_prefix = experiment_name_prefix
    self.experiment_name = f"dockex_{str(datetime.datetime.now()).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
    if self.experiment_name_prefix is not None:
        self.experiment_name = (
            f"{self.experiment_name_prefix}_{self.experiment_name}"
        )

    self.csv_filename = f"jobs_{self.experiment_name}.csv"
    # this assumes we're running in a container or using /tmp/dockex locally
    self.csv_pathname = f"/tmp/dockex/data/{self.csv_filename}"

    self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
    # this assumes we're running in a container or using /tmp/dockex locally
    self.trials_csv_pathname = f"/tmp/dockex/data/{self.trials_csv_filename}"

    self.extra_output_pathnames = []

    self.save_project = save_project
    self.project_archive_pathname = None
    self.project_archive_filename = None

    self.trial_dict = dict()
    self.trials_list = []
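
# A minimal usage sketch of the experiment class defined above, using only the keyword
# arguments from its signature. The class name ExperimentManager and the prefix value
# are assumptions for illustration, not confirmed by the surrounding code.
experiment = ExperimentManager(
    project_path="/home/experiment/project",
    tmp_dockex_path="/tmp/dockex",
    experiment_name_prefix="demo",  # prepended to the timestamped experiment name
    sleep_seconds=0.5,
    save_project=False,
)
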
def __init__(self, input_args):
    super().__init__(input_args)

    self.redis_address = input_args[2]
    self.redis_client = DockexRedisClient(self.redis_address)
def run_job(self):
    while True:
        try:
            dockex_machines = self.redis_client.get_list("dockex_machines")

            cluster_cpu_list = []
            cluster_ram_total_list = []
            cluster_ram_used_list = []
            cluster_gpu_list = []
            cluster_gpu_memory_total_list = []
            cluster_gpu_memory_used_list = []

            cluster_cpu_credits_total = 0
            cluster_cpu_credits_used = 0
            cluster_gpu_credits_total = 0
            cluster_gpu_credits_used = 0

            p = self.redis_client.strict_redis.pipeline()
            p.delete("cluster_monitor")
            p.delete("cluster_stats")

            for dockex_machine in dockex_machines:
                try:
                    temp_redis_client = DockexRedisClient(
                        dockex_machine["redis_address"]
                    )

                    dockex_machine["hardware_monitor"] = temp_redis_client.get(
                        "hardware_monitor"
                    )
                    dockex_machine["credits_monitor"] = temp_redis_client.get(
                        "credits_monitor"
                    )
                    dockex_machine["status"] = temp_redis_client.get("status")
                    dockex_machine["data_path"] = temp_redis_client.get("data_path")
                    dockex_machine["json_path"] = temp_redis_client.get("json_path")
                    dockex_machine["redis_address"] = temp_redis_client.get(
                        "redis_address"
                    )
                    dockex_machine["webdis_address"] = temp_redis_client.get(
                        "webdis_address"
                    )

                    p.rpush("cluster_monitor", json.dumps(dockex_machine))

                    cluster_cpu_list += dockex_machine["hardware_monitor"][
                        "cpu_percent_per_cpu"
                    ]
                    cluster_ram_total_list.append(
                        dockex_machine["hardware_monitor"]["virtual_memory_total"]
                    )
                    cluster_ram_used_list.append(
                        dockex_machine["hardware_monitor"]["virtual_memory_used"]
                    )

                    cluster_gpu_list += dockex_machine["hardware_monitor"][
                        "gpu_percent_per_gpu"
                    ]
                    cluster_gpu_memory_total_list.append(
                        dockex_machine["hardware_monitor"]["gpu_memory_total"]
                    )
                    cluster_gpu_memory_used_list.append(
                        dockex_machine["hardware_monitor"]["gpu_memory_used"]
                    )

                    cluster_cpu_credits_total += dockex_machine["credits_monitor"][
                        "cpu_credits_total"
                    ]
                    cluster_cpu_credits_used += dockex_machine["credits_monitor"][
                        "cpu_credits_used"
                    ]
                    cluster_gpu_credits_total += dockex_machine["credits_monitor"][
                        "gpu_credits_total"
                    ]
                    cluster_gpu_credits_used += dockex_machine["credits_monitor"][
                        "gpu_credits_used"
                    ]

                except Exception as e:
                    print(e)

            cluster_num_cpus = len(cluster_cpu_list)
            if cluster_num_cpus > 0:
                cluster_cpu_utilization = round(
                    sum(cluster_cpu_list) / float(cluster_num_cpus), 1
                )
            else:
                cluster_cpu_utilization = 0.0

            cluster_num_gpus = len(cluster_gpu_list)
            if cluster_num_gpus > 0:
                cluster_gpu_utilization = round(
                    sum(cluster_gpu_list) / float(cluster_num_gpus), 1
                )
            else:
                cluster_gpu_utilization = 0.0

            virtual_memory_total = sum(cluster_ram_total_list)
            virtual_memory_used = sum(cluster_ram_used_list)
            if virtual_memory_total > 0.0:
                virtual_memory_percent = round(
                    (virtual_memory_used * 100.0 / virtual_memory_total), 1
                )
            else:
                virtual_memory_percent = 0.0

            gpu_memory_total = sum(cluster_gpu_memory_total_list)
            gpu_memory_used = sum(cluster_gpu_memory_used_list)
            if gpu_memory_total > 0.0:
                gpu_memory_percent = round(
                    (gpu_memory_used * 100.0 / gpu_memory_total), 1
                )
            else:
                gpu_memory_percent = 0.0

            cluster_stats = {
                "machine_count": len(dockex_machines),
                "cpu_count": cluster_num_cpus,
                "cpu_percent": cluster_cpu_utilization,
                "cpu_percent_per_cpu": cluster_cpu_list,
                "virtual_memory_total": virtual_memory_total,
                "virtual_memory_used": virtual_memory_used,
                "virtual_memory_percent": virtual_memory_percent,
                "gpu_count": cluster_num_gpus,
                "gpu_percent": cluster_gpu_utilization,
                "gpu_percent_per_gpu": cluster_gpu_list,
                "gpu_memory_total": gpu_memory_total,
                "gpu_memory_used": gpu_memory_used,
                "gpu_memory_percent": gpu_memory_percent,
                "cpu_credits_total": cluster_cpu_credits_total,
                "cpu_credits_used": cluster_cpu_credits_used,
                "gpu_credits_total": cluster_gpu_credits_total,
                "gpu_credits_used": cluster_gpu_credits_used,
            }

            p.set("cluster_stats", json.dumps(cluster_stats))
            p.execute()

        except Exception as e:
            print(e)

        time.sleep(self.sleep_seconds)
def run_job(self):
    while True:
        # check if we're connected to a manager
        # if we're NOT connected to a manager
        if self.experiment_manager is None:
            # check if there are any managers available
            dockex_machines_df = pd.DataFrame(
                self.redis_client.get_list("dockex_machines")
            )

            if len(dockex_machines_df) > 0:
                manager_machines_df = dockex_machines_df.loc[
                    dockex_machines_df.manager_flag == True
                ]

                if len(manager_machines_df) > 0:
                    # if so, connect to the manager
                    self.experiment_manager_dict = manager_machines_df.iloc[
                        0
                    ].to_dict()
                    self.experiment_manager = DockexRedisClient(
                        self.experiment_manager_dict["redis_address"]
                    )
                    self.redis_client.set(
                        "manager_redis_address",
                        self.experiment_manager_dict["redis_address"],
                    )

                    # if the manager is not the local manager
                    if (
                        self.experiment_manager_dict["redis_address"]
                        != self.redis_address
                    ):
                        # empty the project directory
                        empty_directory("/tmp/dockex/project")
                        empty_directory("/tmp/dockex/data")

                        # need to copy project archive, unarchive it, and build module images
                        project_archive_filename = self.experiment_manager.get(
                            "project_archive_filename"
                        )
                        local_project_archive_filename = (
                            f"/tmp/dockex/data/{project_archive_filename}"
                        )

                        found_project_archive = ftp_find_file(
                            self.experiment_manager.get_list("dockex_machines"),
                            self.redis_client.get("ip_address"),
                            f"data/{project_archive_filename}",
                            local_project_archive_filename,
                        )

                        if found_project_archive:
                            with zipfile.ZipFile(
                                local_project_archive_filename, "r"
                            ) as zip_file:
                                zip_file.extractall("/tmp/dockex/project")

                            # build the module images
                            experiment_module_paths = self.experiment_manager.get_list(
                                "unique_module_paths"
                            )

                            # TODO: need a way to signal to the experiment that a build failed
                            # TODO: maybe a flag on manager that the experiment continually checks
                            # TODO: or maybe manager needs to test build before setting manager flag?
                            # TODO: even then though, if a build fails on remote host, that host should NOT work on that experiment name
                            # TODO: maybe a worker should track bad experiment names
                            self.redis_client.set("status", "BUILDING PROJECT MODULES")
                            build_project_modules(
                                self.docker_client, experiment_module_paths
                            )

                        else:
                            self.experiment_manager_dict = None
                            self.experiment_manager = None
                            self.redis_client.strict_redis.delete(
                                "manager_redis_address"
                            )

                else:
                    time.sleep(self.checking_manager_sleep_seconds)
            else:
                time.sleep(self.checking_manager_sleep_seconds)

        # if we are connected to a manager
        else:
            # check if the manager is still a manager
            # if it is NOT still a manager
            if self.experiment_manager.get("manager_flag") is not True:
                # disconnect from the manager
                self.experiment_manager_dict = None
                self.experiment_manager = None
                self.redis_client.strict_redis.delete("manager_redis_address")

            # if it is still a manager
            else:
                # check that the experiment name is the same
                # if it is NOT the same, a new experiment has started
                if (
                    self.experiment_manager.get("experiment_name")
                    != self.experiment_manager_dict["experiment_name"]
                ):
                    # disconnect from the manager
                    self.experiment_manager_dict = None
                    self.experiment_manager = None
                    self.redis_client.strict_redis.delete("manager_redis_address")

                # if the experiment name is the same
                else:
                    # see if we can pull any work to do
                    # get the list of ready_jobs lists
                    ready_jobs_df = pd.DataFrame(
                        self.experiment_manager.smembers("ready_jobs_list_key_dicts")
                    )

                    if len(ready_jobs_df) > 0:
                        # start with the jobs requiring the most credits
                        ready_jobs_df = ready_jobs_df.sort_values(
                            by=["gpu_credits", "cpu_credits"], ascending=False
                        )

                        num_open_cpu_credits = self.redis_client.get(
                            "cpu_credits_total"
                        ) - self.redis_client.get("cpu_credits_used")
                        num_open_gpu_credits = self.redis_client.get(
                            "gpu_credits_total"
                        ) - self.redis_client.get("gpu_credits_used")

                        if num_open_cpu_credits > 0 or num_open_gpu_credits > 0:
                            for ready_jobs_df_ind in ready_jobs_df.index:
                                num_open_cpu_credits = self.redis_client.get(
                                    "cpu_credits_total"
                                ) - self.redis_client.get("cpu_credits_used")
                                num_open_gpu_credits = self.redis_client.get(
                                    "gpu_credits_total"
                                ) - self.redis_client.get("gpu_credits_used")

                                required_cpu_credits = int(
                                    ready_jobs_df.loc[ready_jobs_df_ind, "cpu_credits"]
                                )
                                required_gpu_credits = int(
                                    ready_jobs_df.loc[ready_jobs_df_ind, "gpu_credits"]
                                )
                                ready_jobs_key = ready_jobs_df.loc[
                                    ready_jobs_df_ind, "ready_jobs_list_key"
                                ]

                                slots_min_list = []
                                if required_cpu_credits > 0:
                                    num_open_cpu_slots = int(
                                        np.floor(
                                            num_open_cpu_credits / required_cpu_credits
                                        )
                                    )
                                    slots_min_list.append(num_open_cpu_slots)

                                if required_gpu_credits > 0:
                                    num_open_gpu_slots = int(
                                        np.floor(
                                            num_open_gpu_credits / required_gpu_credits
                                        )
                                    )
                                    slots_min_list.append(num_open_gpu_slots)

                                num_open_slots = int(np.min(slots_min_list))

                                if num_open_slots > 0:
                                    p = self.experiment_manager.strict_redis.pipeline()
                                    # lrange is inclusive, so - 1
                                    p.lrange(ready_jobs_key, 0, (num_open_slots - 1))
                                    p.ltrim(ready_jobs_key, num_open_slots, -1)
                                    pop_job_dicts, _ = p.execute()

                                    if len(pop_job_dicts) > 0:
                                        for pop_job_dict in pop_job_dicts:
                                            pop_job_dict = json.loads(pop_job_dict)
                                            print(pop_job_dict)

                                            # checkout the credits
                                            self.redis_client.strict_redis.incrby(
                                                "cpu_credits_used",
                                                required_cpu_credits,
                                            )
                                            self.redis_client.strict_redis.incrby(
                                                "gpu_credits_used",
                                                required_gpu_credits,
                                            )

                                            self.redis_client.redis_launch_job(
                                                f"/tmp/dockex/json/{pop_job_dict['name']}.json",
                                                pop_job_dict,
                                            )

        time.sleep(self.working_sleep_seconds)
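
# A minimal sketch of one entry from "ready_jobs_list_key_dicts" as consumed by the
# loop above; only the three fields the loop reads are shown, and the values and key
# name are hypothetical.
example_ready_jobs_key_dict = {
    "cpu_credits": 1,  # credits one job from this list requires
    "gpu_credits": 0,
    "ready_jobs_list_key": "ready_jobs_cpu_1_gpu_0",  # hypothetical redis list key
}
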