def ensure_workspace_exists(self, ws_name, flag_as_error=True):
    self._check_ws_name(ws_name)
    exists = self.does_workspace_exist(ws_name)

    if not exists:
        if flag_as_error:
            errors.store_error("Workspace not found: {}".format(ws_name))
        self.create_workspace(ws_name)
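# Hedged usage sketch (not part of the library): with flag_as_error=False,
# ensure_workspace_exists() quietly creates a missing workspace; with the default, the
# missing workspace is first reported through errors.store_error(). `store` is assumed
# to be an instance of the store class these methods belong to.
def get_or_create_workspace(store, ws_name):
    store.ensure_workspace_exists(ws_name, flag_as_error=False)
    return ws_name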
def delete_workspace(self, ws_name):
    result = self.provider.delete_container(ws_name)
    if not result:
        errors.store_error("could not delete workspace: " + ws_name)

    return result
def get_run_log(self, ws_name, run_name):
    blob_path = self._run_path(run_name) + "/" + constants.RUN_LOG

    if not self.provider.does_blob_exist(ws_name, blob_path):
        # limited support for old-style run logging
        blob_path = run_name + "/" + constants.RUN_LOG

        if not self.provider.does_blob_exist(ws_name, blob_path):
            errors.store_error("unknown run: ws={}, run_name={}".format(ws_name, run_name))

    # watch out for 0-length blobs (azure will throw retryable exception if you use "get_blob_to_text")
    blob = self.provider.get_blob_properties(ws_name, blob_path)

    lines = []
    if blob.properties.content_length:
        text = self.provider.get_blob_text(ws_name, blob_path)
        lines = text.split("\n")
        lines = [json.loads(line) for line in lines if line.strip()]

    return lines
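# Hedged usage sketch (not part of the library): get_run_log() returns the run log as a
# list of dicts parsed from JSON lines. The "event" key used below is an assumption about
# the record shape; `store` is assumed to be an instance of the store class above.
def count_log_events(store, ws_name, run_name):
    counts = {}
    for record in store.get_run_log(ws_name, run_name):
        event = record.get("event", "?")
        counts[event] = 1 + counts.get(event, 0)
    return counts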
def get_run_record(store, workspace, run_name, fields_dict=None):
    run_records = get_run_records(store, workspace, [run_name], fields_dict)
    if not run_records:
        errors.store_error("Run {} does not exist in workspace {}".format(run_name, workspace))

    rr = run_records[0]
    return rr
def root_files(self, root_name, use_blobs=False):
    if use_blobs:
        return RootBlobs(self, root_name)
    else:
        #return store_azure_file.RootFiles(self, ws_name)
        errors.store_error("Root files are not currently supported (use RootBlobs)")
def cancel_run(self, ws_name, run_name):
    console.diag("start of azure_ml.cancel_run()")

    target_run = self.get_run(ws_name, run_name)
    if not target_run:
        errors.store_error("run not found: {}".format(run_name))

    console.diag("after get_run() call")

    before_status = target_run.status.lower()
    if before_status in ["preparing", "queued", "starting", "running"]:
        target_run.cancel()
        killed = True
        status = "cancelled"
    else:
        killed = False
        status = target_run.status

    console.diag("after run.cancel() call")

    return {"workspace": ws_name, "run_name": run_name, "cancelled": killed, "status": status}
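# Hedged usage sketch (not part of the library): cancel_run() returns a plain dict, so the
# outcome can be reported directly. `aml` is assumed to be an instance of the azure_ml
# wrapper class that defines cancel_run() above.
def report_cancel(aml, ws_name, run_name):
    result = aml.cancel_run(ws_name, run_name)
    if result["cancelled"]:
        print("cancelled {}/{}".format(result["workspace"], result["run_name"]))
    else:
        print("{}/{} not cancelled (status={})".format(
            result["workspace"], result["run_name"], result["status"]))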
def get_client_cs(core, job_id, node_index):
    ''' instantiate the backend service that owns the specified job node
    and request its client connection string '''
    cs = None
    box_secret = None

    filter = {"_id": job_id}
    jobs = core.store.mongo.get_info_for_jobs(filter, None)
    if not jobs:
        errors.store_error("unknown job_id: {}".format(job_id))

    job = jobs[0]
    node_id = utils.node_id(node_index)

    compute = utils.safe_value(job, "compute")
    secrets_by_node = utils.safe_value(job, "secrets_by_node")
    if not secrets_by_node:
        errors.store_error("unknown node_index={} for job={}".format(node_index, job_id))

    box_secret = utils.safe_value(secrets_by_node, node_id)

    service_info_by_node = utils.safe_value(job, "service_info_by_node")
    node_info = utils.safe_value(service_info_by_node, node_id)

    if compute and node_info:
        backend = core.create_backend(compute)
        cs = backend.get_client_cs(node_info)

    cs_plus = {"cs": cs, "box_secret": box_secret, "job": job}
    return cs_plus
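# Hedged usage sketch (not part of the library): shows the shape of the dict returned by
# get_client_cs() above. `core` is assumed to be the same XT core object the callers in
# this module pass around.
def show_node_connection(core, job_id, node_index):
    cs_plus = get_client_cs(core, job_id, node_index)
    print("connection string:", cs_plus["cs"])
    print("box secret present:", bool(cs_plus["box_secret"]))
    print("job record keys:", sorted(cs_plus["job"].keys()))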
def create_workspace(self, ws_name, description=None):
    ''' create workspace as top level container '''
    self._check_ws_name(ws_name)

    if self.does_workspace_exist(ws_name):
        errors.store_error("workspace already exists: {}".format(ws_name))

    # note: this operation often must retry several times if same container has just been deleted

    # MULTIPROCESS: this is the step that will fail (if any)
    result = self.provider.create_container(ws_name)
    if not result:
        errors.store_error("could not create workspace: " + ws_name)

    # MULTIPROCESS: safe now

    # create a holder file for RUNS directory
    runs_holder_fn = constants.RUNS_DIR + "/" + constants.HOLDER_FILE
    self._create_blob(ws_name, runs_holder_fn, "1", True)

    # create a holder file for EXPERIMENTS directory
    experiments_holder_fn = constants.EXPERIMENTS_DIR + "/" + constants.HOLDER_FILE
    self._create_blob(ws_name, experiments_holder_fn, "1", True)

    # create NEXT_RUN_NAME (for extra safety, ensure file doesn't already exist)
    blob_fn = constants.WORKSPACE_DIR + "/" + constants.WORKSPACE_NEXT
    self._create_blob(ws_name, blob_fn, "1", True)
def ensure_share_exists(self, share_name, flag_as_error=True):
    container_name = utils.make_share_name(share_name)
    self._check_ws_name(container_name)
    exists = self.does_share_exist(share_name)

    if not exists:
        if flag_as_error:
            errors.store_error("Share not found: {}".format(share_name))
        self.create_share(share_name)
def delete_share(self, share_name):
    container_name = utils.make_share_name(share_name)
    self._check_ws_name(container_name)

    result = self.provider.delete_container(container_name)
    if not result:
        errors.store_error("could not delete share: " + share_name)

    return result
def validate_job_name_with_ws(store, job_name, validate):
    job_name = job_name.lower()

    if not is_job_id(job_name):
        return errors.syntax_error("Illegal job name: {}".format(job_name))

    ws = store.get_job_workspace(job_name)
    if validate and not ws:
        errors.store_error("job '{}' does not exist".format(job_name))

    return ws
def create_share(self, share_name, description=None):
    ''' create share as top level container '''
    container_name = utils.make_share_name(share_name)
    self._check_ws_name(container_name)

    # note: this operation often must retry several times if same container has just been deleted

    # MULTIPROCESS: this is the step that will fail (if any)
    result = self.provider.create_container(container_name)
    if not result:
        errors.store_error("could not create share: " + share_name)
def download_file(self, fn, dest_fn, progress_callback=None, use_snapshot=False):
    container, path, wc_target = self._get_container_path_target(fn)

    # ensure blob exists ourselves so we can issue a friendly error
    if not self.store.provider.does_blob_exist(container, path):
        errors.store_error("Blob not found: container={}, path={}".format(container, path))

    # ensure the directory of the dest_fn exists
    file_utils.ensure_dir_exists(file=dest_fn)

    if use_snapshot:
        # create temp. snapshot
        if progress_callback:
            progress_callback(status="creating-snapshot")

        props = self.store.provider.snapshot_blob(container, path)
        snapshot_id = props.snapshot

        # download the snapshot
        if progress_callback:
            progress_callback(status="downloading-snapshot")

        text = self.store.provider.get_blob_to_path(container, path, dest_fn,
            snapshot=snapshot_id, progress_callback=progress_callback)

        # delete the snapshot
        if progress_callback:
            progress_callback(status="deleting-snapshot")

        self.store.provider.delete_blob(container, path, snapshot=snapshot_id)

        if progress_callback:
            progress_callback(status="deleted-snapshot")
    else:
        # normal download
        text = self.store.provider.get_blob_to_path(container, path, dest_fn,
            progress_callback=progress_callback)

    return text
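# Hedged usage sketch (not part of the library): use_snapshot=True makes download_file()
# read from a temporary blob snapshot, which is useful for blobs that are still being
# appended to. `fs` is assumed to be an instance of the file-accessor class that defines
# download_file() above; the callback only relies on the status keyword that
# download_file() itself passes, and accepts/ignores any other provider arguments.
def fetch_blob_copy(fs, blob_fn, dest_fn):
    def on_progress(*args, status=None, **kwargs):
        if status:
            print("download:", status)
    return fs.download_file(blob_fn, dest_fn, progress_callback=on_progress, use_snapshot=True)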
def _read_blob(self, ws_name, blob_path):
    console.diag("_read_blob: ws_name={}, blob_path={}".format(ws_name, blob_path))

    if not self.does_workspace_exist(ws_name):
        # avoid 10 retries and unfriendly storage errors
        errors.store_error("container doesn't exist: " + ws_name)

    if not self.provider.does_blob_exist(ws_name, blob_path):
        # avoid 10 retries and unfriendly storage errors
        errors.store_error("blob doesn't exist: container={}, path={}".format(ws_name, blob_path))

    blob_text = self.provider.get_blob_text(ws_name, blob_path)
    return blob_text
def copy_run(self, source_workspace_name, source_run_name, dest_workspace_name, dest_run_name):
    if self.does_run_exist(dest_workspace_name, dest_run_name):
        errors.store_error("destination run already exists: ws={}, run={}".format(
            dest_workspace_name, dest_run_name))

    # copy a single blob at a time
    for source_blob in self.provider.list_blobs(source_workspace_name,
            path=self._run_path(source_run_name) + "/"):

        dest_blob_path = self._run_path(dest_run_name) + "/" + self._remove_first_node(source_blob)

        # copy single blob within same storage service
        self.provider.copy_blob(source_workspace_name, source_blob, dest_workspace_name,
            dest_blob_path)
def download_runs(self, store, ws_name, run_group_name, run_group_type, hp_config_cloud_path,
        hp_config_local_dir):
    # download the all_runs file
    local_cache_path = "{}/{}/{}/".format(hp_config_local_dir, ws_name, run_group_type)
    local_config_file_path = "{}{}".format(local_cache_path, "hp-config.yaml")

    if run_group_name == "experiment":
        console.print("downloading runs for EXPERIMENT={}...".format(run_group_type))

        # files are at EXPERIMENT LEVEL
        # read SWEEPS file
        if not store.does_experiment_file_exist(ws_name, run_group_type, hp_config_cloud_path):
            errors.store_error("missing experiment hp_config file (ws={}, exper={}, fn={})".format(
                ws_name, run_group_type, hp_config_cloud_path))

        store.download_file_from_experiment(ws_name, run_group_type, hp_config_cloud_path,
            local_config_file_path)

        # read ALLRUNS info aggregated in EXPERIMENT
        allrun_records = store.get_all_runs(run_group_name, ws_name, run_group_type)
    else:
        console.print("downloading runs for JOB={}...".format(run_group_type))

        # files are at JOB LEVEL
        # read SWEEPS file
        if not store.does_job_file_exist(run_group_type, hp_config_cloud_path):
            errors.store_error("missing job hp_config file (job={}, fn={})".format(
                run_group_type, hp_config_cloud_path))

        store.download_file_from_job(run_group_type, hp_config_cloud_path, local_config_file_path)

        # read ALLRUNS info aggregated in JOB
        allrun_records = store.get_all_runs(run_group_name, ws_name, run_group_type)

    console.diag("after downloading all runs")

    return local_config_file_path, allrun_records
def get_client_cs(core, ws, run_name):
    cs = None
    box_secret = None

    filter = {"_id": run_name}
    runs = core.store.mongo.get_info_for_runs(ws, filter, {"run_logs": 0})
    if not runs:
        errors.store_error("Unknown run: {}/{}".format(ws, run_name))

    if runs:
        from xtlib import job_helper

        run = runs[0]
        job_id = utils.safe_value(run, "job_id")
        node_index = utils.safe_value(run, "node_index")

        cs_plus = job_helper.get_client_cs(core, job_id, node_index)
        cs = cs_plus["cs"]
        box_secret = cs_plus["box_secret"]

    return cs, box_secret
def validate_run_name(store, ws, run_name, error_if_invalid=True, parse_only=False):
    run_name = correct_slash(run_name)

    if "/" in run_name:
        parts = run_name.split("/")
        if len(parts) != 2:
            errors.syntax_error("invalid format for run name: " + run_name)
        ws, run_name = parts

    run_name = run_name.lower()

    if not parse_only and not "*" in run_name:
        if not store.mongo.does_run_exist(ws, run_name):
            if error_if_invalid:
                errors.store_error("run '{}' does not exist in workspace '{}'".format(run_name, ws))
            else:
                return None, None, None

    return ws, run_name, ws + "/" + run_name
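# Hedged usage sketch (not part of the library): validate_run_name() accepts either "run23"
# or "ws5/run23" (hypothetical names) and returns (ws, run_name, "ws/run_name"); with
# error_if_invalid=False a missing run yields (None, None, None) instead of being flagged
# through errors.store_error(). `store` is assumed to be the same store object the callers
# in this module use.
def try_resolve_run(store, default_ws, run_spec):
    return validate_run_name(store, default_ws, run_spec, error_if_invalid=False)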
def download(self, store_path, local_path, share, workspace, experiment, job, run, feedback,
        snapshot, show_output=True):
    use_blobs = True
    use_multi = True    # default until we test if store_path exists as a file/blob
    download_count = 0

    fs = self.create_file_accessor(use_blobs, share, workspace, experiment, job, run)

    # test for existence of store_path as a blob/file
    if not "*" in store_path and not "?" in store_path:
        if fs.does_file_exist(store_path):
            use_multi = False

    if local_path:
        # expand ~/ in front of local path
        local_path = os.path.expanduser(local_path)
    else:
        # path not specified for local
        if use_multi:
            local_path = "."
        else:
            local_path = "./" + os.path.basename(store_path)

    uri = fs.get_uri(store_path)

    # default store folder to recursive
    if use_multi and not "*" in store_path and not "?" in store_path:
        store_path += "/**"

    use_snapshot = snapshot

    feedback_progress = FeedbackProgress(feedback, show_output)
    progress_callback = feedback_progress.progress if feedback else None

    if use_multi:
        # download MULTIPLE blobs/files
        what = "blobs" if use_blobs else "files"
        single_what = what[0:-1]

        if show_output:
            console.print("collecting {} names from: {}...".format(single_what, uri), end="")

        _, blob_names = fs.get_filenames(store_path, full_paths=False)

        if show_output:
            console.print()

        if len(blob_names) == 0:
            console.print("no matching {} found in: {}".format(what, uri))
            return 0
        elif len(blob_names) == 1:
            what = "blob" if use_blobs else "file"

        if show_output:
            console.print("\ndownloading {} {}...:".format(len(blob_names), what))

        file_utils.ensure_dir_exists(local_path)
        max_name_len = max([len(local_path + "/" + name) for name in blob_names])
        name_width = 1 + max_name_len

        for f, bn in enumerate(blob_names):
            dest_fn = file_utils.fix_slashes(local_path + "/" + bn)

            if show_output:
                file_msg = "file {}/{}".format(1 + f, len(blob_names))
                console.print(" {2:}: {1:<{0:}} ".format(name_width, dest_fn + ":", file_msg),
                    end="", flush=True)

            feedback_progress.start()
            full_bn = uri + "/" + bn if uri else bn
            fs.download_file(full_bn, dest_fn, progress_callback=progress_callback,
                use_snapshot=use_snapshot)
            feedback_progress.end()

            download_count += 1
    else:
        # download a SINGLE blob/file
        what = "blob" if use_blobs else "file"
        if not fs.does_file_exist(store_path):
            errors.store_error("{} not found: {}".format(what, uri))

        local_path = file_utils.fix_slashes(local_path)

        if show_output:
            console.print("\nfrom {}, downloading {}:".format(uri, what))
            console.print(" {}: ".format(local_path), end="", flush=True)

        feedback_progress.start()
        fs.download_file(store_path, local_path, progress_callback=progress_callback,
            use_snapshot=use_snapshot)
        feedback_progress.end()

        download_count += 1

    return download_count
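# Hedged usage sketch (not part of the library): one way to call download() above, pulling
# everything under a run's store folder into ./results. The `impl` object and the
# "after_files" store path are hypothetical placeholders.
def download_run_outputs(impl, ws_name, run_name):
    return impl.download(store_path="after_files", local_path="./results", share=None,
        workspace=ws_name, experiment=None, job=None, run=run_name, feedback=True,
        snapshot=True, show_output=True)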
def get_job_record(store, job_id, fields_dict=None):
    job_records = get_job_records(store, [job_id], fields_dict)
    if not job_records:
        errors.store_error("job {} does not exist".format(job_id))

    jr = job_records[0]
    return jr