def get_status(self, fn_entry):
        '''
        Return the PSM status of a queue entry: "queued" (still in the
        PSM queue), "running" (current entry with a live runner script or
        controller process), else "completed".
        '''
        entry_path = file_utils.path_join(self.psm_queue_path, fn_entry, for_windows=False)
        listing = self.run_cmd("ls -lt " + entry_path)

        # still sitting in the queue directory?
        if listing and fn_entry in listing:
            return "queued"

        current = self.get_running_entry_name()
        if current == fn_entry:
            # entry might be running; is the runner script OR controller active?
            if self._get_runner_script_process_id() or self._get_controller_process_id():
                return "running"
            console.print("--> runner script and controller processes not running")
            return "completed"

        console.print("PSM current job:", current)
        return "completed"
    def poll_for_tensorboard_files(self, last_changed, blob_path, start_index,
                                   tb_path, run_name):
        '''
        Poll the run's output dir in the store for new/changed tensorboard
        event files and download each one locally under "logs/".

        :param last_changed: dict mapping run-relative blob path -> last
            modified time; updated in place as files are downloaded.
        :param blob_path: store path of the run's output dir.
        :param start_index: index into each blob name where the run-relative
            part (after "run_name/") starts.
        :param tb_path: local path template; may contain "{logdir}", which is
            replaced with the blob's parent dir name.
        :param run_name: name of the run whose files are being polled.
        :returns: the number of files downloaded on this poll.
        '''
        # get all blobs in the run's output dir
        blobs = self.store.list_blobs(self.ws_name,
                                      blob_path,
                                      return_names=False)
        download_count = 0

        for blob in blobs:
            # is this a tensorboard file?
            basename = os.path.basename(blob.name)
            if not basename.startswith("events.out.tfevents"):
                continue

            # get interesting part of blob's path (after run_name/)
            bn = blob.name[start_index:]
            modified = blob.properties.last_modified

            # only download blobs that are new or changed since the last poll
            if bn not in last_changed or last_changed[bn] != modified:
                last_changed[bn] = modified

                if "{logdir}" in tb_path:

                    # extract parent dir of blob
                    test_train_node = os.path.basename(
                        os.path.dirname(blob.name))
                    console.print("tb_path=", tb_path, ", test_train_node=",
                                  test_train_node, ", basename=", basename)

                    # apply to remaining template
                    tb_path_full = tb_path.format(
                        **{"logdir": test_train_node})
                    local_fn = file_utils.path_join(tb_path_full, basename)
                else:
                    local_fn = tb_path

                local_fn = os.path.join("logs", local_fn)
                console.print("our local_fn=", local_fn)

                # download the new/changed blob
                try:
                    console.print("downloading bn={}, local_fn={}".format(
                        bn, local_fn))
                    file_utils.ensure_dir_exists(file=local_fn)
                    self.store.download_file_from_run(self.ws_name, run_name,
                                                      bn, local_fn)
                    download_count += 1

                    if self.print_progress:
                        console.print("d", end="", flush=True)
                except Exception as ex:
                    # FIX: was "except BaseException", which also swallowed
                    # KeyboardInterrupt/SystemExit; let those propagate
                    logger.exception(
                        "Error in download_file_from_run, from tensorboard_reader, ex={}"
                        .format(ex))

        return download_count
    def get_running_entry_name(self):
        '''
        Return the name of the entry the PSM is currently running, read from
        the CURRENT_RUNNING_ENTRY file in the controller's cwd (stripped of
        newline and surrounding spaces).
        '''
        fn_current = file_utils.path_join(self.controller_cwd,
                                          constants.CURRENT_RUNNING_ENTRY, for_windows=False)

        raw_bytes = self.read_file(fn_current, 0, None)
        # decode, then strip trailing newline and any surrounding spaces
        return raw_bytes.decode().strip()
    def enqueue(self, team, job, run, node, fn_zip):
        '''
        Copy fn_zip to the box's PSM queue under a unique entry name.

        The file is first uploaded with a ".tmp" extension so the PSM never
        sees a partially-written file, then renamed to ".zip" to make it
        visible to the PSM.

        :param team: team name used in the entry name.
        :param job: job id used in the entry name.
        :param run: run name used in the entry name.
        :param node: node id used in the entry name.
        :param fn_zip: local path of the zip file to upload.
        :returns: the final (".zip") entry name in the queue.
        '''
        # FIX: removed unused "guid = str(uuid.uuid4())" (dead local);
        # uniqueness comes from the time-based suffix below
        ticks = time.time()

        # copy with ".tmp" extension so PSM doesn't see partial file
        # (int(10*ticks) gives a deci-second timestamp for uniqueness)
        fn_tmp_entry = "{}.{}.{}.{}.{}.tmp".format(team, job, run, node, int(10*ticks))
        fn_tmp_dest = file_utils.path_join(self.psm_queue_path, fn_tmp_entry, for_windows=False)

        fn_entry = os.path.splitext(fn_tmp_entry)[0] + ".zip"
        fn_dest = file_utils.path_join(self.psm_queue_path, fn_entry, for_windows=False)

        # don't do a confirm since PSM can remove it from queue instantly
        self.ftp_client.put(fn_zip, fn_tmp_dest)

        # now, rename the file so that PSM can see it
        self.ftp_client.rename(fn_tmp_dest, fn_dest)

        return fn_entry
# Example 5
    def run_job_on_box(self,
                       job_id,
                       run_data_list,
                       box_index,
                       box_info,
                       app_info,
                       pool_info,
                       resume_name=None,
                       repeat=None,
                       using_hp=None,
                       exper_name=None,
                       snapshot_dir=None,
                       args=None):
        '''
        Submit the job's code zip to the PSM queue on the specified box and
        return the info needed to track the submitted node.
        '''
        addr = box_info.address
        target_os = box_info.box_os

        first_run = run_data_list[0]
        run_name = first_run["run_name"]

        # choose a local vs. remote PSM client based on the box address
        if pc_utils.is_localhost(box_addr=addr):
            psm_client = LocalPsmClient()
        else:
            psm_client = RemotePsmClient(addr, target_os == "windows")

        psm_client.restart_psm_if_needed()

        team = self.config.get("general", "xt-team-name")
        node_id = utils.node_id(box_index)

        # the code zip was previously staged in the local CWD dir
        zip_dir = os.path.expanduser(constants.CWD_DIR)
        fn_src_zip = file_utils.path_join(zip_dir, constants.CODE_ZIP_FN)

        fn_entry = psm_client.enqueue(team, job_id, run_name, node_id,
                                      fn_src_zip)

        service_node_info = {
            "fn_entry": fn_entry,
            "box_addr": addr,
            "box_os": target_os,
            "box_name": box_info.box_name,
            "job_id": job_id,
            "run_name": run_name
        }

        fb.feedback("submitted", is_final=True)

        return service_node_info
# Example 6
def get_controller_cwd(is_windows, is_local=False):
    '''
    Return the controller's working-directory path for the target box OS.

    For linux boxes this is "~/.xt/cwd", expanded only when the box is the
    local machine (expansion is only safe locally).  For windows (supported
    only as a local machine) a path under <SystemDrive>/xt is used instead.
    '''
    if not is_windows:
        cwd = "~/.xt/cwd"
        # only safe to expand the home dir when running on the local machine
        return os.path.expanduser(cwd) if is_local else cwd

    # we only support windows as a local machine.
    # docker has problems mapping paths to user home directories (~/) and the
    # controller app has problems copying/deleting files in the 'programdata'
    # folder, so for windows we use <SystemDrive>/xt instead
    drive = os.getenv("SystemDrive")
    return file_utils.path_join(drive + "/xt", "cwd")
# Example 7
    def upload_sweep_data(self, sweeps_text, exper_name, job_id, args):
        '''
        we have extracted/parsed HP sweeps data; write it to the experiment/job store
        where we can find it during dynamic HP searches (running in controller).

        :param sweeps_text: the HP sweeps file contents to upload.
        :param exper_name: dest experiment name (used when aggregate_dest == "experiment").
        :param job_id: dest job id (used otherwise).
        :param args: parsed cmd options; reads "hp_config", "aggregate_dest",
            "workspace"; may set "hp_config" to the default file name.
        '''
        # upload SWEEP file to job or experiment directory
        fn_sweeps = args["hp_config"]
        agg_dest = args["aggregate_dest"]

        if not fn_sweeps:
            # must have extracted sweeps data from cmd line options
            fn_sweeps = constants.HP_CONFIG_FN
            args["hp_config"] = fn_sweeps

        # upload to a known folder name (since value of fn_sweeps can vary) and we need to find it later (HX usage)
        target_name = file_utils.path_join(constants.HP_CONFIG_DIR, os.path.basename(fn_sweeps))

        if agg_dest == "experiment":
            # FIX: "workspace" was an undefined name here (NameError on this
            # branch); take it from args, as other methods in this file do
            workspace = args["workspace"]
            self.store.create_experiment_file(workspace, exper_name, target_name, sweeps_text)
        else:
            self.store.create_job_file(job_id, target_name, sweeps_text)
    def restart_psm_if_needed(self):
        '''
        Ensure the PSM script on the box is up to date and running.

        processing:
            - if PSM is running but its psm.py differs from our local copy,
              kill the process and restart it.
            - if PSM is not running, start it.

        Raises a general error if the PSM cannot be started within ~10s.
        '''
        kill_needed = False
        start_needed = False

        # local copy of psm.py (next to this file) vs. its location on the box
        fn_src = os.path.join(file_utils.get_my_file_dir(__file__), constants.PSM)
        fn_dest = file_utils.path_join(self.xt_path, constants.PSM, for_windows=self.box_is_windows)

        running = bool(self._get_psm_process_id())
        #print("PSM running=", running)

        if running:
            # do file contents match?
            text_src = file_utils.read_text_file(fn_src)
            text_dest = ""

            if self.remote_file_exists(fn_dest):
                # read text of fn_dest on remote box
                with self.ftp_client.open(fn_dest, "rb") as infile:
                    bytes_dest = infile.read()
                    text_dest = bytes_dest.decode()

                # normalize NEWLINE chars before comparison
                # (ftp_client seems to add CR when called from windows)
                text_src = text_src.replace("\r\n", "\n")
                text_dest = text_dest.replace("\r\n", "\n")

            # restart when remote copy is stale or missing (text_dest == "")
            if text_src != text_dest:
                kill_needed = True
        else:
            start_needed = True

        if kill_needed:
            # kill the running (stale) PSM, then fall through to restart it
            p = self._get_psm_process_id()
            ssh_cmd = "kill -kill {}".format(p)
            self.run_cmd(ssh_cmd)
            start_needed = True

        if start_needed:
            # create required dirs
            self._make_dir(self.psm_queue_path)
            self._make_dir(self.cwd_path)

            # copy psm.py
            # caution: note that slashes in fn_dest must all match box's OS style
            fn_dest = file_utils.fix_slashes(fn_dest, is_linux=True)
            status = self.ftp_client.put(fn_src, fn_dest)

            # run psm, redirecting its output to a log file
            fn_log = os.path.join(self.xt_path, constants.PSMLOG)

            if self.box_is_windows:
                cmd_parts = ["cmd", "/c", "python -u {} > {}".format(fn_dest, fn_log)]
                cmd = " ".join(cmd_parts)
            else:
                # nohup + detached stdio so psm survives the ssh session ending
                fn_log = file_utils.fix_slashes(fn_log, is_linux=True)
                cmd = 'nohup bash --login -c "python -u {}" </dev/null > {} 2>&1 &'.format(fn_dest, fn_log) 
                #print("cmd=", cmd)

            #process_utils.sync_run_ssh(self, self.box_addr, cmd)
            self.run_cmd(cmd)

            # poll up to 20 times (.5s apart) for the PSM process to appear
            for i in range(20):
                # don't return until PSM is running
                running = bool(self._get_psm_process_id())
                if running:
                    break

                time.sleep(.5)

            if not running:
                errors.general_error("Could not start remote PSM on box={}".format(self.box_addr))
# Example 9
    def get_client_context(self,
                           exper_name,
                           run_name,
                           app_info,
                           box_info,
                           job_id,
                           node_index,
                           run_specs,
                           resume_name=None,
                           using_hp=False,
                           repeat=None,
                           args=None):
        '''
        this function gathers up all of the job-level context needed to run the job on the specified node (node_index).

        :param exper_name: experiment name for this run.
        :param run_name: name of the run being launched.
        :param app_info: app descriptor; only app_name is read (may be None).
        :param box_info: target box descriptor (box_name, box_os, shell_launch_prefix).
        :param job_id: id of the job this run belongs to.
        :param node_index: index of the target node within the job.
        :param run_specs: dict with "cmd_parts", "run_script", "parent_script".
        :param resume_name: name of run being resumed, if any.
        :param using_hp: True when a hyperparameter search is in effect.
        :param repeat: repeat count for parent/child runs (None = no repeat).
        :param args: parsed cmd options dict (many keys read below).
        :returns: a Bag holding the assembled context.
        '''
        config = self.config
        cmd_parts = run_specs["cmd_parts"]
        workspace = args['workspace']
        working_dir = args['working_dir']

        context = Bag()

        # --- identity / origin info ---
        context.ws = workspace
        context.working_dir = working_dir
        context.exper_name = exper_name
        context.run_name = run_name
        context.job_id = job_id
        context.sku = args["sku"]
        context.app_name = app_info.app_name if app_info else None
        context.box = args["box"]
        context.from_ip = pc_utils.get_ip_address()
        context.from_host = pc_utils.get_hostname()
        context.box_name = box_info.box_name
        context.target_file, _, _ = self.get_target(cmd_parts)
        context.resume_name = resume_name
        context.generated_sweep_text = None  # will be conditionally set in controller

        # --- compute target info ---
        context.pool = args["pool"]
        context.node_index = node_index
        context.compute = args["target"]
        context.service_type = args["service_type"]

        # provide all provider info to controller
        context.providers = config.get("providers")

        #context.run_specs = run_specs
        context.cmd_parts = cmd_parts
        context.xt_cmd = args[
            "xt_cmd"]  # log our full cmd to support correct rerun's
        context.run_script = run_specs["run_script"]
        context.parent_script = run_specs["parent_script"]

        # for helping docker login to user's Azure Container Registry
        is_docker = (args["docker"] != None)
        # if cmd_parts:
        #     is_docker = (cmd_parts[0] == "docker") or (cmd_parts[0] == "sudo" and cmd_parts[1] == "docker")

        # find the registry (if any) from the compute target's docker def
        #registry = config.get("environment", "registry", suppress_warning=True)
        registry = None
        compute_def = args["compute_def"]
        if compute_def and "docker" in compute_def:
            docker_name = compute_def["docker"]
            docker_def = self.config.get_docker_def(docker_name)
            if docker_def and "registry" in docker_def:
                registry = docker_def["registry"]

        if registry:
            # pull login credentials for the registry from config
            registry_creds = config.get("external-services", registry)
            needs_login = is_docker and utils.safe_value(
                registry_creds, "login")
            login_server = utils.safe_value(registry_creds, "login-server")
            username = utils.safe_value(registry_creds, "username")
            password = utils.safe_value(registry_creds, "password")
        else:
            needs_login = False
            login_server = None
            username = None
            password = None

        context.docker_login = needs_login
        context.docker_server = login_server
        context.docker_username = username
        context.docker_password = password

        context.username = self.config.get("general", "username")

        # environment activation cmd for the compute target (may be None)
        setup = self.config.get_setup_from_target_def(compute_def)
        activate_cmd = utils.safe_value(setup, "activate")
        context.activate_cmd = activate_cmd

        # config info
        #box_os = self.get_box_os(box_name)
        box_os = box_info.box_os

        # --- post-run upload (after) file lists ---
        after_files_list = args["after_dirs"]
        after_files_list = utils.parse_list_option_value(after_files_list)
        context.after_files_list = after_files_list

        after_omit_list = args["after_omit"]
        after_omit_list = utils.parse_list_option_value(after_omit_list)
        context.after_omit_list = after_omit_list

        # --- metric reporting ---
        context.primary_metric = args["primary_metric"]
        context.maximize_metric = args["maximize_metric"]
        context.report_rollup = args["report_rollup"]

        context.after_upload = args["after_upload"]
        #context.scrape = config.get("general", "scrape")
        context.log = args["log"]

        # PARENT/CHILD info
        context.repeat = repeat
        context.repeats_remaining = None  # will be set in controller
        context.total_run_count = args["total_run_count"]
        context.search_style = args["search_style"]
        context.is_parent = context.search_style != "single"

        # HPARAM search
        # hp_config is rewritten to the well-known HP_CONFIG_DIR location
        # (matching where upload_sweep_data stores it)
        hp_config = args["hp_config"]
        if hp_config:
            hp_config = file_utils.path_join(constants.HP_CONFIG_DIR,
                                             os.path.basename(hp_config))

        context.hp_config = hp_config
        context.fn_generated_config = args["fn_generated_config"]
        context.using_hp = using_hp
        context.search_type = args["search_type"]
        context.option_prefix = args["option_prefix"]

        context.restart = False
        context.concurrent = args["concurrent"]
        context.xtlib_capture = args["xtlib_upload"]

        # for mirroring files to grok server or storage
        context.mirror_dest = args["mirror_dest"]
        context.mirror_files = args["mirror_files"]
        context.grok_server = None  # args["grok_server"]

        # aggregated results go to the experiment or the job dir
        context.aggregate_dest = args["aggregate_dest"]
        context.dest_name = exper_name if context.aggregate_dest == "experiment" else job_id

        # --- storage / mongo credentials for the controller ---
        store_creds = self.config.get_storage_creds()
        context.store_creds = store_creds
        context.store_code_path = config.get_storage_provider_code_path(
            store_creds)

        mongo_creds, mongo_name = self.config.get_mongo_creds()
        context.mongo_conn_str = mongo_creds["mongo-connection-string"]

        context.shell_launch_prefix = box_info.shell_launch_prefix

        #console.print("context=", context)
        return context