# Example 1
    def pop_callback(self, credits_update):
        """Apply a credits-update message to the target machine's redis.

        credits_update keys: "mode" ("incr" | "decr" | "set"), "type"
        (credit kind, used to build the key), "redis_address", and — for
        "set" only — "value". Messages with any other mode are ignored.
        """
        mode = credits_update["mode"]
        if mode not in ("incr", "decr", "set"):
            return

        backend = DockexRedisClient(credits_update["redis_address"]).strict_redis
        key = f"{credits_update['type']}_credits_total"

        if mode == "set":
            backend.set(key, credits_update["value"])
        else:
            # "incr" / "decr" map directly onto the redis client methods
            getattr(backend, mode)(key)
# Example 2
    def connect_to_experiment_manager(self):
        """Poll local redis for the manager address, then connect to it.

        Blocks until "manager_redis_address" appears locally, then builds a
        DockexRedisClient for the manager and StrictRedis handles for its
        dependency-lookup (db=1) and job-lookup (db=3) databases.
        """
        print("GETTING MANAGER REDIS ADDRESS")
        while True:
            self.experiment_manager_address = self.redis_client.get(
                "manager_redis_address"
            )
            if self.experiment_manager_address is not None:
                print("FOUND MANAGER REDIS ADDRESS")
                break
            print("NO MANAGER FOUND, TRYING AGAIN")
            time.sleep(self.sleep_seconds)

        print("CONNECTING TO EXPERIMENT MANAGER")
        self.experiment_manager = DockexRedisClient(self.experiment_manager_address)

        manager_host = self.experiment_manager.get("ip_address")
        manager_port = self.experiment_manager.get("redis_port")

        # manager redis databases: 1 = dependency lookup, 3 = job lookup
        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_host, port=manager_port, db=1
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_host, port=manager_port, db=3
        )
    def run_job(self):
        """Continuously maintain the "dockex_redis_addresses" set.

        Probes every machine reported by network discovery; machines whose
        redis answers dockex_backend=True are added, unreachable ones are
        removed. When discovery reports nothing, the local machine registers
        itself (discovery won't pick up the local machine on 127.0.0.1).
        """
        while True:
            try:
                machine_ips = self.redis_client.get_list("machines_on_network")

                if len(machine_ips) == 0:
                    # always make sure the local machine gets registered
                    self.redis_client.sadd(
                        "dockex_redis_addresses", self.redis_address
                    )
                else:
                    for ip in machine_ips:
                        # assumes all cluster machines use the same redis port
                        candidate = f"http://{ip}:{self.redis_port}"
                        try:
                            backend_flag = DockexRedisClient(candidate).get(
                                "dockex_backend"
                            )
                            if backend_flag is True:
                                self.redis_client.sadd(
                                    "dockex_redis_addresses", candidate
                                )
                        except (redis.exceptions.ConnectionError, TypeError):
                            # unreachable or malformed: drop it from the set
                            self.redis_client.srem(
                                "dockex_redis_addresses", candidate
                            )

            except Exception as e:
                print(e)

            time.sleep(self.sleep_seconds)
# Example 4
    def launch_redis(self):
        """Build and start the dockex redis container, then initialize it.

        Blocks (busy-retrying) until the new redis accepts connections,
        flushes it, seeds it with every dockex config value, and marks it
        as a dockex backend.
        """
        print("BUILDING AND RUNNING REDIS")

        build_kwargs = dict(
            path=".",
            dockerfile="core/services/backend/dockex_redis/Dockerfile",
            tag="dockex_redis_image",
        )
        run_kwargs = dict(
            image="dockex_redis_image",
            name="dockex_redis",
            detach=True,
            network_mode="host",
            volumes={
                self.config["tmp_dockex_path"]: {"bind": "/tmp/dockex", "mode": "rw"}
            },
        )
        build_image_run_container(
            self.docker_client, build_kwargs, run_kwargs, print_build_logs=True
        )

        # connect to redis and flush, retrying until the container is up
        self.redis_client = DockexRedisClient(self.config["redis_address"])
        while True:
            try:
                self.redis_client.flushdb()
                break
            except redis.exceptions.ConnectionError:
                pass

        # fill redis with dockex config values
        for key, value in self.config.items():
            self.redis_client.set(key, value)

        # mark the redis instance as a dockex backend
        self.redis_client.set("dockex_backend", True)

        self.redis_client.set("status", "LAUNCHED REDIS")
    def run_job(self):
        """Continuously publish the "dockex_machines" list.

        For each registered dockex redis address, collects identifying and
        FTP info from that machine and rewrites "dockex_machines" atomically
        through a pipeline. Unreachable machines are silently skipped.
        """
        while True:
            try:
                addresses = self.redis_client.smembers("dockex_redis_addresses")

                # pipeline so the list is replaced atomically
                pipeline = self.redis_client.strict_redis.pipeline()
                pipeline.delete("dockex_machines")

                for address in addresses:
                    try:
                        machine = DockexRedisClient(address)
                        status = {
                            "machine_name": machine.get("machine_name"),
                            "redis_address": address,
                            "manager_flag": machine.get("manager_flag"),
                            "experiment_name": machine.get("experiment_name"),
                            "ip_address": machine.get("ip_address"),
                            "tmp_dockex_ftpd_port": machine.get(
                                "tmp_dockex_ftpd_port"
                            ),
                            "tmp_dockex_ftpd_password": machine.get(
                                "tmp_dockex_ftpd_password"
                            ),
                        }
                        pipeline.rpush("dockex_machines", json.dumps(status))
                    except redis.exceptions.ConnectionError:
                        # machine went away; leave it out of this cycle
                        pass

                pipeline.execute()

            except Exception as e:
                print(e)

            time.sleep(self.sleep_seconds)
# Example 6
    def __init__(self, input_args):
        """Configure a docker-wrapped job from a JSON job config.

        input_args: argv-style list where [1] is the job-config JSON
        pathname and [2] is the local dockex redis address.
        """
        super().__init__()

        self.json_pathname = input_args[1]
        self.redis_address = input_args[2]
        self.redis_client = DockexRedisClient(self.redis_address)

        self.tmp_dockex_path = self.redis_client.get("tmp_dockex_path")
        self.docker_client = docker.from_env()
        self.job_config = read_job_config(self.json_pathname)

        self.dockerfile_path = f"{self.job_config['path']}/Dockerfile"

        # explicit image_tag wins; otherwise derive one from the module path
        if "image_tag" in self.job_config:
            self.image_tag = self.job_config["image_tag"]
        else:
            self.image_tag = module_path_to_image_tag(self.job_config["path"])

        self.command_args = self.generate_command_args()
        self.volumes = self.generate_volumes()
        self.network_mode = "host"

        # optionally expose the job-config pathname inside the container
        self.environment = None
        if self.job_config.get("include_json_pathname_env_variable"):
            self.environment = {"JSON_PATHNAME": self.json_pathname}

        self.skip_build = self.job_config.get("skip_docker_wrapper_build") is True

        # build path depends on if path is in core or relative to
        # /tmp/dockex/project
        if self.job_config["path"].startswith("core/"):
            self.build_path = "."
        else:
            self.build_path = "/tmp/dockex/project"

        self.experiment_job = self.job_config.get("experiment_job", False)

        # experiment jobs run in the foreground; everything else detaches
        self.detach = self.experiment_job is not True

        self.build_kwargs_dict = dict(
            path=self.build_path, dockerfile=self.dockerfile_path, tag=self.image_tag
        )

        self.run_kwargs_dict = dict(
            image=self.image_tag,
            name=self.job_config["name"],
            command=self.command_args,
            detach=self.detach,
            network_mode=self.network_mode,
            volumes=self.volumes,
            environment=self.environment,
        )

        # global gpus enable, which the module config may override
        self.run_kwargs_dict["enable_gpus"] = (
            self.redis_client.get("enable_gpus") is True
        )
        if "enable_gpus" in self.job_config:
            self.run_kwargs_dict["enable_gpus"] = (
                self.job_config["enable_gpus"] is True
            )

        self.good_to_launch = None
        self.experiment_manager_address = None
        self.experiment_manager = None
        self.dependency_lookup_db = None
        self.job_lookup_db = None
        self.stats_keys = None
        self.container_data_prefix = "/tmp/dockex/data/"

        self.sleep_seconds = 0.25
    def __init__(
        self,
        project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
        tmp_dockex_path="/tmp/dockex",
        initial_job_num=None,
        experiment_name_prefix=None,
        sleep_seconds=0.5,
        save_project=False,
    ):
        """Set up an experiment session against the local dockex manager.

        Reads the dockex config from tmp_dockex_path, connects to the
        manager's redis databases, and derives a timestamped experiment
        name plus the CSV pathnames used to record jobs and trials.

        Raises:
            ValueError: if project_path is None.
        """
        super().__init__()

        if project_path is None:
            raise ValueError("A project_path must be provided.")
        self.project_path = os.path.expanduser(project_path)

        self.tmp_dockex_path = tmp_dockex_path

        self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")
        self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])

        self.docker_client = docker.from_env()

        manager_host = self.redis_client.get("ip_address")
        manager_port = self.redis_client.get("redis_port")

        # manager redis databases: 1 = dependency lookup,
        # 2 = dependency counts, 3 = job lookup
        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_host, port=manager_port, db=1
        )
        self.dependency_counts_db = redis.StrictRedis(
            host=manager_host, port=manager_port, db=2
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_host, port=manager_port, db=3
        )

        self.initial_job_num = initial_job_num
        self.job_num = (
            initial_job_num
            if initial_job_num is not None
            else self.redis_client.get("manager_job_num")
        )

        self.sleep_seconds = sleep_seconds

        self.job_list = []

        self.dockex_path_list = self.redis_client.get("dockex_path_list")

        # timestamped name, e.g. dockex_2024_01_02_03_04_05, optionally prefixed
        self.experiment_name_prefix = experiment_name_prefix
        timestamp = (
            str(datetime.datetime.now())
            .replace("-", "_")
            .replace(" ", "_")
            .replace(":", "_")
            .split(".")[0]
        )
        self.experiment_name = f"dockex_{timestamp}"
        if self.experiment_name_prefix is not None:
            self.experiment_name = (
                f"{self.experiment_name_prefix}_{self.experiment_name}"
            )

        # these pathnames assume we're running in a container or using
        # /tmp/dockex locally
        self.csv_filename = f"jobs_{self.experiment_name}.csv"
        self.csv_pathname = f"/tmp/dockex/data/{self.csv_filename}"

        self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
        self.trials_csv_pathname = f"/tmp/dockex/data/{self.trials_csv_filename}"

        self.extra_output_pathnames = []
        self.save_project = save_project
        self.project_archive_pathname = None
        self.project_archive_filename = None

        self.trial_dict = dict()
        self.trials_list = []
    def __init__(self, input_args):
        """Initialize from argv-style input_args; [2] is the redis address."""
        super().__init__(input_args)

        address = input_args[2]
        self.redis_address = address
        self.redis_client = DockexRedisClient(address)
# Example 9
    def run_job(self):
        """Aggregate per-machine monitor data into cluster-wide redis keys.

        Each cycle reads the "dockex_machines" list, pulls live monitor,
        status, and address values from every machine's redis, then
        atomically rewrites "cluster_monitor" (one JSON blob per machine)
        and "cluster_stats" (cluster-wide totals/averages) via a pipeline.
        Machines that error out are skipped for that cycle.
        """
        while True:
            try:
                dockex_machines = self.redis_client.get_list("dockex_machines")

                # accumulators for cluster-wide hardware totals
                cluster_cpu_list = []
                cluster_ram_total_list = []
                cluster_ram_used_list = []

                cluster_gpu_list = []
                cluster_gpu_memory_total_list = []
                cluster_gpu_memory_used_list = []

                cluster_cpu_credits_total = 0
                cluster_cpu_credits_used = 0
                cluster_gpu_credits_total = 0
                cluster_gpu_credits_used = 0

                # pipeline so both monitor keys are replaced atomically
                p = self.redis_client.strict_redis.pipeline()
                p.delete("cluster_monitor")
                p.delete("cluster_stats")

                for dockex_machine in dockex_machines:
                    try:
                        temp_redis_client = DockexRedisClient(
                            dockex_machine["redis_address"])
                        # enrich the machine record with its current values
                        dockex_machine[
                            "hardware_monitor"] = temp_redis_client.get(
                                "hardware_monitor")
                        dockex_machine[
                            "credits_monitor"] = temp_redis_client.get(
                                "credits_monitor")
                        dockex_machine["status"] = temp_redis_client.get(
                            "status")
                        dockex_machine["data_path"] = temp_redis_client.get(
                            "data_path")
                        dockex_machine["json_path"] = temp_redis_client.get(
                            "json_path")
                        dockex_machine[
                            "redis_address"] = temp_redis_client.get(
                                "redis_address")
                        dockex_machine[
                            "webdis_address"] = temp_redis_client.get(
                                "webdis_address")

                        p.rpush("cluster_monitor", json.dumps(dockex_machine))

                        # fold this machine's hardware numbers into the
                        # cluster accumulators
                        cluster_cpu_list += dockex_machine["hardware_monitor"][
                            "cpu_percent_per_cpu"]
                        cluster_ram_total_list.append(
                            dockex_machine["hardware_monitor"]
                            ["virtual_memory_total"])
                        cluster_ram_used_list.append(
                            dockex_machine["hardware_monitor"]
                            ["virtual_memory_used"])

                        cluster_gpu_list += dockex_machine["hardware_monitor"][
                            "gpu_percent_per_gpu"]
                        cluster_gpu_memory_total_list.append(
                            dockex_machine["hardware_monitor"]
                            ["gpu_memory_total"])
                        cluster_gpu_memory_used_list.append(
                            dockex_machine["hardware_monitor"]
                            ["gpu_memory_used"])

                        cluster_cpu_credits_total += dockex_machine[
                            "credits_monitor"]["cpu_credits_total"]
                        cluster_cpu_credits_used += dockex_machine[
                            "credits_monitor"]["cpu_credits_used"]

                        cluster_gpu_credits_total += dockex_machine[
                            "credits_monitor"]["gpu_credits_total"]
                        cluster_gpu_credits_used += dockex_machine[
                            "credits_monitor"]["gpu_credits_used"]

                    except Exception as e:
                        # skip unreachable/malformed machines this cycle
                        print(e)

                # average cpu utilization across all cpus, guarding empty
                cluster_num_cpus = len(cluster_cpu_list)
                if cluster_num_cpus > 0:
                    cluster_cpu_utilization = round(
                        sum(cluster_cpu_list) / float(cluster_num_cpus), 1)
                else:
                    cluster_cpu_utilization = 0.0

                cluster_num_gpus = len(cluster_gpu_list)
                if cluster_num_gpus > 0:
                    cluster_gpu_utilization = round(
                        sum(cluster_gpu_list) / float(cluster_num_gpus), 1)
                else:
                    cluster_gpu_utilization = 0.0

                virtual_memory_total = sum(cluster_ram_total_list)
                virtual_memory_used = sum(cluster_ram_used_list)
                if virtual_memory_total > 0.0:
                    virtual_memory_percent = round(
                        (virtual_memory_used * 100.0 / virtual_memory_total),
                        1)
                else:
                    virtual_memory_percent = 0.0

                gpu_memory_total = sum(cluster_gpu_memory_total_list)
                gpu_memory_used = sum(cluster_gpu_memory_used_list)
                if gpu_memory_total > 0.0:
                    gpu_memory_percent = round(
                        (gpu_memory_used * 100.0 / gpu_memory_total), 1)
                else:
                    gpu_memory_percent = 0.0

                cluster_stats = {
                    "machine_count": len(dockex_machines),
                    "cpu_count": cluster_num_cpus,
                    "cpu_percent": cluster_cpu_utilization,
                    "cpu_percent_per_cpu": cluster_cpu_list,
                    "virtual_memory_total": virtual_memory_total,
                    "virtual_memory_used": virtual_memory_used,
                    "virtual_memory_percent": virtual_memory_percent,
                    "gpu_count": cluster_num_gpus,
                    "gpu_percent": cluster_gpu_utilization,
                    "gpu_percent_per_gpu": cluster_gpu_list,
                    "gpu_memory_total": gpu_memory_total,
                    "gpu_memory_used": gpu_memory_used,
                    "gpu_memory_percent": gpu_memory_percent,
                    "cpu_credits_total": cluster_cpu_credits_total,
                    "cpu_credits_used": cluster_cpu_credits_used,
                    "gpu_credits_total": cluster_gpu_credits_total,
                    "gpu_credits_used": cluster_gpu_credits_used,
                }

                p.set("cluster_stats", json.dumps(cluster_stats))

                p.execute()

            except Exception as e:
                print(e)

            time.sleep(self.sleep_seconds)
# Example 10
    def run_job(self):
        """Worker loop: attach to an experiment manager and pull ready jobs.

        Unattached: find a machine advertising manager_flag, record its
        redis address, and — if the manager is remote — sync its project
        archive over FTP and build the project's module images.
        Attached: verify the manager and its experiment name are still
        current, then pop as many ready jobs as local cpu/gpu credits allow
        and launch them, checking out credits per job.
        """
        while True:
            # check if we're connected to a manager
            # if we're NOT connected to a manager
            if self.experiment_manager is None:
                # check if there are any managers available
                dockex_machines_df = pd.DataFrame(
                    self.redis_client.get_list("dockex_machines"))

                if len(dockex_machines_df) > 0:
                    manager_machines_df = dockex_machines_df.loc[
                        dockex_machines_df.manager_flag == True]

                    if len(manager_machines_df) > 0:
                        # if so, connect to the manager (first one found)
                        self.experiment_manager_dict = manager_machines_df.iloc[
                            0].to_dict()
                        self.experiment_manager = DockexRedisClient(
                            self.experiment_manager_dict["redis_address"])
                        self.redis_client.set(
                            "manager_redis_address",
                            self.experiment_manager_dict["redis_address"],
                        )

                        # if the manager is not the local manager
                        if (self.experiment_manager_dict["redis_address"] !=
                                self.redis_address):
                            # empty the project directory
                            empty_directory("/tmp/dockex/project")
                            empty_directory("/tmp/dockex/data")

                            # need to copy project archive, unarchive it, and build module images
                            project_archive_filename = self.experiment_manager.get(
                                "project_archive_filename")
                            local_project_archive_filename = (
                                f"/tmp/dockex/data/{project_archive_filename}")

                            # search the cluster's FTP servers for the archive
                            found_project_archive = ftp_find_file(
                                self.experiment_manager.get_list(
                                    "dockex_machines"),
                                self.redis_client.get("ip_address"),
                                f"data/{project_archive_filename}",
                                local_project_archive_filename,
                            )

                            if found_project_archive:
                                with zipfile.ZipFile(
                                        local_project_archive_filename,
                                        "r") as zip_file:
                                    zip_file.extractall("/tmp/dockex/project")

                                # build the module images
                                experiment_module_paths = self.experiment_manager.get_list(
                                    "unique_module_paths")
                                # TODO: need a way to signal to the experiment that a build failed
                                # TODO: maybe a flag on manager that the experiment continually checks
                                # TODO: or maybe manager needs to test build before setting manager flag?
                                # TODO: even then though, if a build fails on remote host, that host should NOT work on that experiment name
                                # TODO: maybe a worker should track bad experiment names
                                self.redis_client.set(
                                    "status", "BUILDING PROJECT MODULES")
                                build_project_modules(self.docker_client,
                                                      experiment_module_paths)

                            else:
                                # archive not found anywhere: detach again
                                self.experiment_manager_dict = None
                                self.experiment_manager = None
                                self.redis_client.strict_redis.delete(
                                    "manager_redis_address")

                    else:
                        time.sleep(self.checking_manager_sleep_seconds)
                else:
                    time.sleep(self.checking_manager_sleep_seconds)

            # if we are connected to a manager
            else:
                # check if the manager is still a manager
                # if it is NOT still a manager
                if self.experiment_manager.get("manager_flag") is not True:
                    # disconnect from the manager
                    self.experiment_manager_dict = None
                    self.experiment_manager = None
                    self.redis_client.strict_redis.delete(
                        "manager_redis_address")

                # if it is still a manager
                else:
                    # check that the experiment name is the same
                    # if it is NOT the same, a new experiment has started
                    if (self.experiment_manager.get("experiment_name") !=
                            self.experiment_manager_dict["experiment_name"]):
                        # disconnect from the manager
                        self.experiment_manager_dict = None
                        self.experiment_manager = None
                        self.redis_client.strict_redis.delete(
                            "manager_redis_address")

                    # if the experiment name is the same
                    else:
                        # see if we can pull any work to do
                        # get the list of ready_jobs lists
                        ready_jobs_df = pd.DataFrame(
                            self.experiment_manager.smembers(
                                "ready_jobs_list_key_dicts"))

                        if len(ready_jobs_df) > 0:
                            # start with the jobs requiring the most credits
                            ready_jobs_df = ready_jobs_df.sort_values(
                                by=["gpu_credits", "cpu_credits"],
                                ascending=False)

                            num_open_cpu_credits = self.redis_client.get(
                                "cpu_credits_total") - self.redis_client.get(
                                    "cpu_credits_used")
                            num_open_gpu_credits = self.redis_client.get(
                                "gpu_credits_total") - self.redis_client.get(
                                    "gpu_credits_used")

                            if num_open_cpu_credits > 0 or num_open_gpu_credits > 0:
                                for ready_jobs_df_ind in ready_jobs_df.index:
                                    # re-read credits each iteration since
                                    # launches below consume them
                                    num_open_cpu_credits = self.redis_client.get(
                                        "cpu_credits_total"
                                    ) - self.redis_client.get(
                                        "cpu_credits_used")
                                    num_open_gpu_credits = self.redis_client.get(
                                        "gpu_credits_total"
                                    ) - self.redis_client.get(
                                        "gpu_credits_used")

                                    required_cpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "cpu_credits"])
                                    required_gpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "gpu_credits"])
                                    ready_jobs_key = ready_jobs_df.loc[
                                        ready_jobs_df_ind,
                                        "ready_jobs_list_key"]

                                    # open "slots" = how many copies of this
                                    # job the free credits could support
                                    slots_min_list = []
                                    if required_cpu_credits > 0:
                                        num_open_cpu_slots = int(
                                            np.floor(num_open_cpu_credits /
                                                     required_cpu_credits))
                                        slots_min_list.append(
                                            num_open_cpu_slots)

                                    if required_gpu_credits > 0:
                                        num_open_gpu_slots = int(
                                            np.floor(num_open_gpu_credits /
                                                     required_gpu_credits))
                                        slots_min_list.append(
                                            num_open_gpu_slots)

                                    num_open_slots = int(
                                        np.min(slots_min_list))

                                    if num_open_slots > 0:
                                        # atomically pop up to num_open_slots
                                        # jobs off the ready list
                                        p = (self.experiment_manager.
                                             strict_redis.pipeline())
                                        p.lrange(
                                            ready_jobs_key, 0,
                                            (num_open_slots -
                                             1))  # lrange is inclusive, so - 1
                                        p.ltrim(ready_jobs_key, num_open_slots,
                                                -1)
                                        pop_job_dicts, _ = p.execute()

                                        if len(pop_job_dicts) > 0:
                                            for pop_job_dict in pop_job_dicts:
                                                pop_job_dict = json.loads(
                                                    pop_job_dict)
                                                print(pop_job_dict)

                                                # checkout the credits
                                                self.redis_client.strict_redis.incrby(
                                                    "cpu_credits_used",
                                                    required_cpu_credits,
                                                )
                                                self.redis_client.strict_redis.incrby(
                                                    "gpu_credits_used",
                                                    required_gpu_credits,
                                                )

                                                self.redis_client.redis_launch_job(
                                                    f"/tmp/dockex/json/{pop_job_dict['name']}.json",
                                                    pop_job_dict,
                                                )

                        time.sleep(self.working_sleep_seconds)