def get_docker_params(image_name, image_url, image_mappings, root_work_dir, job_dir, runtime_options=None):
    """Build docker params.

    :param image_name: name of the docker image to run
    :param image_url: URL the image can be pulled/loaded from
    :param image_mappings: dict mapping host paths to container paths;
        a value may be a string, or a list of [path] / [path, mode]
    :param root_work_dir: root of the work directory tree (jobs/tasks/workers/cache)
    :param job_dir: job working directory; becomes the container working dir
    :param runtime_options: optional dict of docker runtime options (e.g. "gpus")
    :return: dict of docker launch parameters (image, uid/gid, working dir,
        volume mounts, runtime options)
    :raises RuntimeError: on an empty-list image mapping value
    """

    # work on a copy so default/k8s mappings added below don't mutate the
    # caller's dict (the original implementation modified it in place)
    image_mappings = dict(image_mappings)

    # get dirs to mount
    root_jobs_dir = os.path.join(root_work_dir, 'jobs')
    root_tasks_dir = os.path.join(root_work_dir, 'tasks')
    root_workers_dir = os.path.join(root_work_dir, 'workers')
    root_cache_dir = os.path.join(root_work_dir, 'cache')

    # docker params dict
    params = {
        "image_name": image_name,
        "image_url": image_url,
        "uid": os.getuid(),
        "gid": os.getgid(),
        "working_dir": job_dir,
        "volumes": [
            # expose the host docker daemon for docker-in-docker style jobs
            ("/var/run/docker.sock", "/var/run/docker.sock"),
            (root_jobs_dir, root_jobs_dir),
            (root_tasks_dir, root_tasks_dir),
            (root_workers_dir, root_workers_dir),
            # cache dir is mounted read-only
            (root_cache_dir, "{}:ro".format(root_cache_dir)),
        ]
    }

    # add default image mappings unless the caller already mapped them
    celery_cfg_file = os.environ.get('HYSDS_CELERY_CFG', app.conf.__file__)
    if celery_cfg_file not in image_mappings and "celeryconfig.py" not in image_mappings.values():
        image_mappings[celery_cfg_file] = "celeryconfig.py"
    dsets_cfg_file = os.environ.get(
        'HYSDS_DATASETS_CFG',
        os.path.normpath(os.path.join(os.path.dirname(sys.executable), '..', 'etc', 'datasets.json')))
    if dsets_cfg_file not in image_mappings and "datasets.json" not in image_mappings.values():
        image_mappings[dsets_cfg_file] = "datasets.json"

    # if running on k8s add hosts and resolv.conf; create mount directory
    blacklist = app.conf.WORKER_MOUNT_BLACKLIST
    mnt_dir = None
    on_k8s = int(app.conf.get('K8S', 0))
    if on_k8s:
        for f in ("/etc/hosts", "/etc/resolv.conf"):
            if f not in image_mappings and f not in image_mappings.values():
                image_mappings[f] = f
        # /etc must be mountable on k8s
        blacklist = [i for i in blacklist if i != "/etc"]
        mnt_dir = mkdtemp(prefix=".container_mounts-", dir=job_dir)

    # add user-defined image mappings
    for k, v in image_mappings.items():
        k = os.path.expandvars(k)
        verify_docker_mount(k, blacklist)
        mode = "ro"  # mounts default to read-only
        if isinstance(v, list):
            if len(v) > 1:
                v, mode = v[0:2]
            elif len(v) == 1:
                v = v[0]
            else:
                raise RuntimeError("Invalid image mapping: %s:%s" % (k, v))
        # absolute container paths are used as-is; relative ones land in job_dir
        if v.startswith('/'):
            mnt = v
        else:
            mnt = os.path.join(job_dir, v)
        if mnt_dir is not None:
            # on k8s, stage the host file into the job-local mount dir first
            k = copy_mount(k, mnt_dir)
        params['volumes'].append((k, "%s:%s" % (mnt, mode)))

    # add runtime resources
    params['runtime_options'] = dict()
    if runtime_options is None:
        runtime_options = dict()
    for k, v in runtime_options.items():
        # validate we have GPUs before passing the "gpus" option through
        if k == "gpus" and int(os.environ.get("HYSDS_GPU_AVAILABLE", 0)) == 0:
            logger.warning("Job specified runtime option 'gpus' but no GPUs were detected. Skipping this option.")
            continue
        params['runtime_options'][k] = v

    return params
def triage(job, ctx):
    """Triage failed job's context and job json as well as _run.sh.

    Creates a triage dataset directory under the job dir, copies job
    files, logs, and any user-requested globs into it, then publishes it.

    :param job: job dict; reads ``job["job_info"]`` (status, id, job_dir, ...)
    :param ctx: job context dict; honors ``_triage_disabled``,
        ``_triage_id_format`` and ``_triage_additional_globs``
    :return: True (signals run_job() to continue)
    """

    # set time_start if not defined (job failed prior to setting it)
    if "time_start" not in job["job_info"]:
        job["job_info"]["time_start"] = "{}Z".format(
            datetime.utcnow().isoformat("T"))

    # default triage id
    default_triage_id_format = "triaged_job-{job_id}_task-{job[task_id]}"
    # BUGFIX: task_id group previously matched a single char ([-\w]); use
    # [-\w]+ so the whole task id is captured
    default_triage_id_regex = "triaged_job-(?P<job_id>.+)_task-(?P<task_id>[-\\w]+)"

    # if exit code of job command is zero, don't triage anything
    exit_code = job["job_info"]["status"]
    if exit_code == 0:
        logger.info("Job exited with exit code %s. No need to triage." % exit_code)
        return True

    # disable triage
    if ctx.get("_triage_disabled", False):
        logger.info(
            "Flag _triage_disabled set to True. Not performing triage.")
        return True

    # Check if custom triage id format was provided
    if "_triage_id_format" in ctx:
        triage_id_format = ctx["_triage_id_format"]
    else:
        triage_id_format = default_triage_id_format

    # get job info
    job_dir = job["job_info"]["job_dir"]
    job_id = job["job_info"]["id"]
    logger.info("job id: {}".format(job_id))

    # Check if the job_id is a triaged dataset. If so, let's parse out the job_id
    logger.info("Checking to see if the job_id matches the regex: {}".format(
        default_triage_id_regex))
    match = re.search(default_triage_id_regex, job_id)
    if match:
        logger.info(
            "job_id matches the triage dataset regex. Parsing out job_id")
        parsed_job_id = match.groupdict()["job_id"]
        logger.info("extracted job_id: {}".format(parsed_job_id))
    else:
        logger.info(
            "job_id does not match the triage dataset regex: {}".format(
                default_triage_id_regex))
        parsed_job_id = job_id

    # create triage dataset
    # Attempt to first use triage id format from user, but if there is any problem use the default id format instead
    try:
        triage_id = triage_id_format.format(job_id=parsed_job_id, job=job, job_context=ctx)
    except Exception as e:
        logger.warning(
            "Failed to apply custom triage id format because of {}: {}. Falling back to default triage id"
            .format(e.__class__.__name__, e))
        triage_id = default_triage_id_format.format(job_id=parsed_job_id, job=job, job_context=ctx)
    triage_dir = os.path.join(job_dir, triage_id)
    makedirs(triage_dir)

    # create dataset json
    ds_file = os.path.join(triage_dir, "{}.dataset.json".format(triage_id))
    ds = {
        "version": "v{}".format(hysds.__version__),
        "label": "triage for job {}".format(parsed_job_id),
    }
    if "cmd_start" in job["job_info"]:
        ds["starttime"] = job["job_info"]["cmd_start"]
    if "cmd_end" in job["job_info"]:
        ds["endtime"] = job["job_info"]["cmd_end"]
    with open(ds_file, "w") as f:
        json.dump(ds, f, sort_keys=True, indent=2)

    # create met json
    met_file = os.path.join(triage_dir, "{}.met.json".format(triage_id))
    with open(met_file, "w") as f:
        json.dump(job["job_info"], f, sort_keys=True, indent=2)

    # triage job-related files
    for f in glob(os.path.join(job_dir, "_*")):
        if os.path.isdir(f):
            shutil.copytree(f, os.path.join(triage_dir, os.path.basename(f)))
        else:
            shutil.copy(f, triage_dir)

    # triage log files
    for f in glob(os.path.join(job_dir, "*.log")):
        if os.path.isdir(f):
            shutil.copytree(f, os.path.join(triage_dir, os.path.basename(f)))
        else:
            shutil.copy(f, triage_dir)

    # triage additional globs
    for g in ctx.get("_triage_additional_globs", []):
        for f in glob(os.path.join(job_dir, g)):
            f = os.path.normpath(f)
            dst = os.path.join(triage_dir, os.path.basename(f))
            # avoid clobbering an existing destination: suffix with a timestamp
            if os.path.exists(dst):
                dst = "{}.{}Z".format(dst, datetime.utcnow().isoformat("T"))
            try:
                if os.path.isdir(f):
                    shutil.copytree(f, dst)
                else:
                    shutil.copy(f, dst)
            except Exception as e:
                # best-effort: log and skip files that fail to copy
                tb = traceback.format_exc()
                logger.error(
                    "Skipping copying of {}. Got exception: {}\n{}".format(
                        f, str(e), tb))

    # publish
    prod_json = publish_dataset(triage_dir, ds_file, job, ctx)

    # write published triage to file
    pub_triage_file = os.path.join(job_dir, "_triaged.json")
    with open(pub_triage_file, "w") as f:
        json.dump(prod_json, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True
def triage(job, ctx):
    """Triage failed job's context and job json as well as _run.sh.

    NOTE(review): this file defines ``triage`` twice; as the later
    definition, this simpler version shadows the earlier, more featureful
    one — confirm which implementation is intended to win.
    """

    default_triage_id_format = "triaged_job-{job[job_info][id]}-{job[task_id]}"

    # nothing to triage when the job command succeeded
    exit_code = job['job_info']['status']
    if exit_code == 0:
        logger.info("Job exited with exit code %s. No need to triage." % exit_code)
        return True

    # honor the triage kill-switch from the job context
    if ctx.get('_triage_disabled', False):
        logger.info(
            "Flag _triage_disabled set to True. Not performing triage.")
        return True

    # prefer a caller-supplied triage id format over the default
    triage_id_format = ctx.get('_triage_id_format', default_triage_id_format)

    # pull job info
    job_dir = job['job_info']['job_dir']
    job_id = job['job_info']['id']

    # build the triage id; on any formatting error fall back to the default
    try:
        triage_id = triage_id_format.format(job=job, job_context=ctx)
    except Exception as e:
        logger.warning(
            "Failed to apply custom triage id format because of {}: {}. Falling back to default triage id"
            .format(e.__class__.__name__, e))
        triage_id = default_triage_id_format.format(job=job, job_context=ctx)
    triage_dir = os.path.join(job_dir, triage_id)
    makedirs(triage_dir)

    # write the triage dataset json
    ds = {
        'version': 'v{}'.format(hysds.__version__),
        'label': 'triage for job {}'.format(job_id),
    }
    for src_key, dst_key in (('cmd_start', 'starttime'), ('cmd_end', 'endtime')):
        if src_key in job['job_info']:
            ds[dst_key] = job['job_info'][src_key]
    ds_file = os.path.join(triage_dir, '{}.dataset.json'.format(triage_id))
    with open(ds_file, 'w') as f:
        json.dump(ds, f, sort_keys=True, indent=2)

    # write the met json from the job info
    met_file = os.path.join(triage_dir, '{}.met.json'.format(triage_id))
    with open(met_file, 'w') as f:
        json.dump(job['job_info'], f, sort_keys=True, indent=2)

    # copy job-related files (_*) then log files (*.log) into the triage dir
    for pattern in ('_*', '*.log'):
        for f in glob(os.path.join(job_dir, pattern)):
            shutil.copy(f, triage_dir)

    # copy any additional user-requested globs (directories copied recursively)
    for g in ctx.get('_triage_additional_globs', []):
        for f in glob(os.path.join(job_dir, g)):
            if os.path.isdir(f):
                shutil.copytree(f, os.path.join(triage_dir, os.path.basename(f)))
            else:
                shutil.copy(f, triage_dir)

    # publish the triage dataset
    prod_json = publish_dataset(triage_dir, ds_file, job, ctx)

    # record what was published
    pub_triage_file = os.path.join(job_dir, '_triaged.json')
    with open(pub_triage_file, 'w') as f:
        json.dump(prod_json, f, indent=2, sort_keys=True)

    # signal run_job() to continue
    return True