def setup_batch_jobs():
    """Prepare the job registry and start background job tracking.

    Nothing happens when ``ConfigParams().is_ci_context`` is truthy.
    Otherwise the registry paths are ensured and a daemon thread is started
    that runs ``JobTracker.loop_update_statuses``.
    """
    if ConfigParams().is_ci_context:
        return

    with JobRegistry() as registry:
        registry.ensure_paths()

    # The tracker receives the registry class itself plus empty credentials.
    tracker = JobTracker(JobRegistry, principal="", keytab="")
    status_thread = threading.Thread(
        target=tracker.loop_update_statuses,
        daemon=True,
    )
    status_thread.start()
def cancel_job(self, job_id: str, user_id: str):
    """Kill the YARN application recorded for a batch job.

    The job's ``application_id`` is read from the job registry and passed to
    ``yarn application -kill``.

    Raises:
        subprocess.CalledProcessError: the ``yarn`` command exited non-zero.
        subprocess.TimeoutExpired: the command took longer than 20 seconds.
    """
    with JobRegistry() as job_registry:
        yarn_application_id = job_registry.get_job(job_id, user_id)['application_id']

    # TODO: better logging of this kill.
    kill_command = ["yarn", "application", "-kill", yarn_application_id]
    subprocess.run(kill_command, timeout=20, check=True)
def create_job(self, user_id: str, job_specification: dict, api_version: str) -> BatchJobMetadata:
    """Register a new batch job and return its metadata.

    A random UUID4 string becomes the job id. The status and creation time of
    the returned ``BatchJobMetadata`` come from the record that the registry
    hands back on registration.
    """
    new_job_id = str(uuid.uuid4())

    with JobRegistry() as job_registry:
        registered = job_registry.register(
            job_id=new_job_id,
            user_id=user_id,
            api_version=api_version,
            specification=job_specification,
        )

    status = registered["status"]
    created_at = parse_rfc3339(registered["created"])
    return BatchJobMetadata(
        id=new_job_id,
        process=job_specification,
        status=status,
        created=created_at,
    )
def when_ready(server):
    """Start-up hook: announce logging, prepare the registry, start job tracking.

    Args:
        server: object handed in by the caller; it is only printed here.
    """
    print(server)

    from pyspark import SparkContext

    # Kerberos credentials come from the Spark configuration (may be None).
    spark_conf = SparkContext.getOrCreate().getConf()
    principal = spark_conf.get("spark.yarn.principal")
    keytab = spark_conf.get("spark.yarn.keytab")

    gunicorn_logger = logging.getLogger('gunicorn.error')
    gunicorn_logger.info('Gunicorn info logging enabled!')
    flask_logger = logging.getLogger('flask')
    flask_logger.info('Flask info logging enabled!')

    with JobRegistry() as registry:
        registry.ensure_paths()

    tracker = JobTracker(JobRegistry, principal, keytab)
    status_thread = threading.Thread(target=tracker.update_statuses, daemon=True)
    status_thread.start()
def update_statuses(self) -> None:
    """Make one pass over all running jobs and sync their registry entries.

    For every job returned by ``registry.get_running_jobs()`` that has an
    application id, the current state is fetched (Kubernetes or YARN, depending
    on ``ConfigParams().is_kube_deploy``), written back with ``registry.patch``
    and, once the job has reached a final state, enriched with the results
    metadata and marked as done.

    A failure while handling one job is logged, the job is set to status
    ``'error'`` and marked done, and the loop continues with the next job.
    """
    with self._job_registry() as registry:
        registry.ensure_paths()
        jobs_to_track = registry.get_running_jobs()
        for job_info in jobs_to_track:
            try:
                # NOTE(review): job_id/user_id are bound inside this try; if one
                # of these lookups raises, the handler below would see the values
                # of the previous iteration (or no binding at all) - confirm
                # that job_info always carries these keys.
                job_id, user_id = job_info['job_id'], job_info['user_id']
                application_id, current_status = job_info[
                    'application_id'], job_info['status']
                # Jobs without an application id are skipped.
                if application_id:
                    try:
                        if ConfigParams().is_kube_deploy:
                            # Kubernetes deployment: state comes from _kube_status.
                            from openeogeotrellis.utils import s3_client, download_s3_dir
                            state, start_time, finish_time = JobTracker._kube_status(
                                job_id, user_id)
                            new_status = JobTracker._kube_status_parser(
                                state)
                            registry.patch(job_id, user_id,
                                           status=new_status,
                                           started=start_time,
                                           finished=finish_time)
                            if current_status != new_status:
                                _log.info(
                                    "changed job %s status from %s to %s" %
                                    (job_id, current_status, new_status),
                                    extra={'job_id': job_id})
                            if state == "COMPLETED":
                                # TODO: do we support SHub batch processes in this environment? The AWS
                                # credentials conflict.
                                download_s3_dir(
                                    "OpenEO-data", "batch_jobs/{j}".format(j=job_id))
                                result_metadata = self._batch_jobs.get_results_metadata(
                                    job_id, user_id)
                                registry.patch(job_id, user_id, **result_metadata)
                                registry.mark_done(job_id, user_id)
                                _log.info("marked %s as done" % job_id, extra={'job_id': job_id})
                        else:
                            # YARN deployment: state and resource usage come from _yarn_status.
                            state, final_state, start_time, finish_time, aggregate_resource_allocation =\
                                JobTracker._yarn_status(application_id)
                            memory_time_megabyte_seconds, cpu_time_seconds =\
                                JobTracker._parse_resource_allocation(aggregate_resource_allocation)
                            new_status = JobTracker._to_openeo_status(
                                state, final_state)
                            registry.patch(
                                job_id, user_id,
                                status=new_status,
                                started=JobTracker.
                                _to_serializable_datetime(start_time),
                                finished=JobTracker.
                                _to_serializable_datetime(finish_time),
                                memory_time_megabyte_seconds=
                                memory_time_megabyte_seconds,
                                cpu_time_seconds=cpu_time_seconds)
                            if current_status != new_status:
                                _log.info(
                                    "changed job %s status from %s to %s" %
                                    (job_id, current_status, new_status),
                                    extra={'job_id': job_id})
                            # Any final state other than "UNDEFINED" ends the tracking of this job.
                            if final_state != "UNDEFINED":
                                result_metadata = self._batch_jobs.get_results_metadata(
                                    job_id, user_id)
                                # TODO: skip patching the job znode and read from this file directly?
                                registry.patch(job_id, user_id, **result_metadata)
                                if new_status == 'finished':
                                    # Only for finished jobs: drop the dependencies and
                                    # schedule deletion of their sources, if there are any.
                                    registry.remove_dependencies(
                                        job_id, user_id)
                                    dependency_sources = JobRegistry.get_dependency_sources(
                                        job_info)
                                    if dependency_sources:
                                        async_task.schedule_delete_batch_process_dependency_sources(
                                            job_id, dependency_sources)
                                registry.mark_done(job_id, user_id)
                                _log.info("marked %s as done" % job_id, extra={
                                    'job_id': job_id,
                                    'area': result_metadata.get('area'),
                                    'unique_process_ids': result_metadata.get(
                                        'unique_process_ids'),
                                    'cpu_time_seconds': cpu_time_seconds
                                })
                    except JobTracker._UnknownApplicationIdException:
                        # The application id is not known: stop tracking this job.
                        registry.mark_done(job_id, user_id)
            except Exception:
                # Deliberately broad: one failing job must not stop the others.
                _log.warning(
                    "resuming with remaining jobs after failing to handle batch job {j}:\n{e}"
                    .format(j=job_id, e=traceback.format_exc()),
                    extra={'job_id': job_id})
                registry.set_status(job_id, user_id, 'error')
                registry.mark_done(job_id, user_id)
def main():
    """Run one asynchronous task described by the ``--task`` JSON argument.

    The task JSON must contain a ``task_id`` and may contain ``arguments``.
    Supported task ids:

    * ``TASK_DELETE_BATCH_PROCESS_RESULTS`` and
      ``TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES``: delete the dependency
      sources of a batch job (taken from ``dependency_sources`` or derived
      from ``subfolders``).
    * ``TASK_POLL_SENTINELHUB_BATCH_PROCESSES``: sleep/poll until the job's
      ``dependency_status`` is no longer ``awaiting``/``awaiting_retry``.

    Raises:
        ValueError: the task id is not one of the supported ones.
        Exception: anything raised while running the task is logged and
            re-raised.
    """
    import argparse

    logging.basicConfig(level=logging.INFO)
    openeogeotrellis.backend.logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(stream=sys.stdout)
    handler.formatter = JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s",
                                      datefmt="%Y-%m-%dT%H:%M:%S%z")

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)

    _log.info("argv: {a!r}".format(a=sys.argv))
    _log.info("ConfigParams(): {c}".format(c=ConfigParams()))

    # FIXME: there's no Java output because Py4J redirects the JVM's stdout/stderr to /dev/null unless JavaGateway's
    #  redirect_stdout/redirect_stderr are set (EP-4018)

    try:
        parser = argparse.ArgumentParser(usage="OpenEO AsyncTask --task <task>",
                                         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("--py4j-jarpath", default="venv/share/py4j/py4j0.10.7.jar",
                            help='Path to the Py4J jar')
        parser.add_argument("--py4j-classpath", default="geotrellis-extensions-2.2.0-SNAPSHOT.jar",
                            help='Classpath used to launch the Java Gateway')
        parser.add_argument("--principal", default="*****@*****.**",
                            help="Principal to be used to login to KDC")
        parser.add_argument("--keytab", default="openeo-deploy/mep/openeo.keytab",
                            help="The full path to the file that contains the keytab for the principal")
        parser.add_argument("--task", required=True, dest="task_json",
                            help="The task description in JSON")

        args = parser.parse_args()

        task = json.loads(args.task_json)
        task_id = task['task_id']
        if task_id not in [TASK_DELETE_BATCH_PROCESS_RESULTS, TASK_POLL_SENTINELHUB_BATCH_PROCESSES,
                           TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES]:
            raise ValueError(f'unsupported task_id "{task_id}"')

        arguments: dict = task.get('arguments', {})

        def batch_jobs() -> GpsBatchJobs:
            # Every call launches a fresh JVM and builds a new layer catalog,
            # so callers should hold on to the returned instance.
            java_opts = [
                "-client",
                "-Dsoftware.amazon.awssdk.http.service.impl=software.amazon.awssdk.http.urlconnection.UrlConnectionSdkHttpService"
            ]

            java_gateway = JavaGateway.launch_gateway(jarpath=args.py4j_jarpath,
                                                      classpath=args.py4j_classpath,
                                                      javaopts=java_opts,
                                                      die_on_exit=True)

            return GpsBatchJobs(get_layer_catalog(opensearch_enrich=True), java_gateway.jvm, args.principal,
                                args.keytab)

        if task_id in [TASK_DELETE_BATCH_PROCESS_RESULTS, TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES]:
            batch_job_id = arguments['batch_job_id']
            # Explicit sources win; otherwise they are derived from the subfolders.
            dependency_sources = (arguments.get('dependency_sources') or
                                  [f"s3://{sentinel_hub.OG_BATCH_RESULTS_BUCKET}/{subfolder}"
                                   for subfolder in arguments['subfolders']])

            _log.info(f"removing dependency sources {dependency_sources} for batch job {batch_job_id}...",
                      extra={'job_id': batch_job_id})
            batch_jobs().delete_batch_process_dependency_sources(job_id=batch_job_id,
                                                                 dependency_sources=dependency_sources,
                                                                 propagate_errors=True)
        elif task_id == TASK_POLL_SENTINELHUB_BATCH_PROCESSES:
            batch_job_id = arguments['batch_job_id']
            user_id = arguments['user_id']

            # Created on the first poll that needs it and then reused, so that
            # the loop does not spawn a new JVM on every iteration.
            poller = None

            while True:
                time.sleep(SENTINEL_HUB_BATCH_PROCESSES_POLL_INTERVAL_S)

                with JobRegistry() as registry:
                    job_info = registry.get_job(batch_job_id, user_id)

                if job_info.get('dependency_status') not in ['awaiting', "awaiting_retry"]:
                    break

                try:
                    if poller is None:
                        poller = batch_jobs()
                    poller.poll_sentinelhub_batch_processes(job_info)
                except Exception:
                    # TODO: retry in Nifi? How to mark this job as 'error' then?
                    _log.error("failed to handle polling batch processes for batch job {j}:\n{e}"
                               .format(j=batch_job_id, e=traceback.format_exc()),
                               extra={'job_id': batch_job_id})

                    with JobRegistry() as registry:
                        registry.set_status(batch_job_id, user_id, 'error')
                        registry.mark_done(batch_job_id, user_id)

                    raise
        else:
            # Defensive: the membership check above should make this unreachable.
            raise AssertionError(f'unexpected task_id "{task_id}"')
    except Exception as e:
        _log.error(e, exc_info=True)
        # Bare raise keeps the original traceback intact.
        raise
def start_job(self, job_id: str, user_id: str):
    """Submit a batch job through ``submit_batch_job.sh`` and record its application id.

    Returns immediately when the job is already ``'queued'`` or ``'running'``.
    A job in any status other than ``'created'`` is first reset in the
    registry. The job specification is written to the job's output directory
    and the submit script is run with the paths, credentials, API version and
    memory settings as positional arguments.

    Raises:
        CalledProcessError: the submit script failed, or no application id
            could be extracted from its output.
    """
    from pyspark import SparkContext

    with JobRegistry() as registry:
        job_info = registry.get_job(job_id, user_id)
        api_version = job_info.get('api_version')

        status = job_info['status']
        if status in ('queued', 'running'):
            return
        if status != 'created':
            # TODO: is this about restarting a job?
            registry.mark_ongoing(job_id, user_id)
            registry.set_application_id(job_id, user_id, None)
            registry.set_status(job_id, user_id, 'created')

        job_options = json.loads(job_info.get('specification')).get('job_options', {})
        driver_memory = job_options.get("driver-memory", "22G")
        executor_memory = job_options.get("executor-memory", "5G")

        kerberos()

        job_dir = self._get_job_output_dir(job_id)
        input_file = job_dir / "in"
        # TODO: how support multiple output files?
        output_file = job_dir / "out"
        log_file = job_dir / "log"

        with input_file.open('w') as spec_file:
            spec_file.write(job_info['specification'])

        spark_conf = SparkContext.getOrCreate().getConf()
        principal = spark_conf.get("spark.yarn.principal")
        key_tab = spark_conf.get("spark.yarn.keytab")

        submit_script = pkg_resources.resource_filename('openeogeotrellis.deploy', 'submit_batch_job.sh')

        # Positional arguments, in the order the submit script receives them.
        command = [
            submit_script,
            "OpenEO batch job {j} user {u}".format(j=job_id, u=user_id),
            str(input_file),
            str(output_file),
            str(log_file),
        ]
        if principal is None or key_tab is None:
            command.extend(["no_principal", "no_keytab"])
        else:
            command.extend([principal, key_tab])
        command.append(api_version if api_version else "0.4.0")
        command.extend([driver_memory, executor_memory])

        try:
            output_string = subprocess.check_output(command, stderr=subprocess.STDOUT, universal_newlines=True)
        except CalledProcessError as e:
            logger.exception(e)
            logger.error(e.stdout)
            logger.error(e.stderr)
            raise e

        try:
            # note: a job_id is returned as soon as an application ID is found in stderr, not when the job is finished
            logger.info(output_string)
            application_id = self._extract_application_id(output_string)
            print("mapped job_id %s to application ID %s" % (job_id, application_id))
            registry.set_application_id(job_id, user_id, application_id)
        except _BatchJobError:
            traceback.print_exc(file=sys.stderr)
            # TODO: why reraise as CalledProcessError?
            raise CalledProcessError(1, str(command), output=output_string)
def get_user_jobs(self, user_id: str) -> List[BatchJobMetadata]:
    """Return the parsed metadata of every job the registry lists for ``user_id``."""
    parsed_jobs = []
    with JobRegistry() as job_registry:
        for raw_job in job_registry.get_user_jobs(user_id):
            parsed_jobs.append(self._parse_job_info(raw_job))
        return parsed_jobs
def get_job_info(self, job_id: str, user_id: str) -> BatchJobMetadata:
    """Fetch one job from the registry and return it as parsed metadata."""
    with JobRegistry() as job_registry:
        raw_job = job_registry.get_job(job_id, user_id)

    return self._parse_job_info(raw_job)
def setup_batch_jobs() -> None:
    """Make sure the paths used by the job registry exist."""
    with JobRegistry() as registry:
        registry.ensure_paths()
# Ad-hoc report: estimated CPU and memory cost per user and month, based on
# the resource usage recorded in the job registry.
import datetime

import pandas as pd

from openeogeotrellis.job_registry import JobRegistry

with JobRegistry() as registry:
    jobs_before = registry.get_all_jobs_before(datetime.datetime.now())
    df = pd.DataFrame(jobs_before)

# Index the jobs by creation time so that a year-month key can be derived.
df.created = pd.to_datetime(df.created)
df.index = df.created

print(df.status.unique())

# Keep only jobs in one of these three statuses and drop the two test accounts.
df = df[df.status.isin(['finished', 'error', 'canceled'])]
df = df[~df.user_id.isin(['jenkins', 'geopyspark-integrationtester'])]

df['yearmonth'] = df.index.strftime('%Y%m')

# CPU: seconds -> hours, at 0.01 per CPU hour.
df['cpuhour'] = df.cpu_time_seconds / 3600.0
df['cost'] = df.cpuhour * 0.01

# Memory: megabyte-seconds divided by (3600 * 1024), at 0.005 per unit.
df['memoryhour'] = df.memory_time_megabyte_seconds / (3600 * 1024)
df['memorycost'] = df.memoryhour * 0.005
df['totalcost'] = df.memorycost + df.cost

# Select the column *before* summing: summing every column first fails on
# pandas >= 2.0 for non-numeric columns such as the datetime `created`.
by_user_month = df.groupby(['user_id', 'yearmonth'])

cost_by_user_month = by_user_month['cost'].sum()
cost_by_user_month = cost_by_user_month[cost_by_user_month > 1.0]

memorycost_by_user_month = by_user_month['memorycost'].sum()
memorycost_by_user_month = memorycost_by_user_month[memorycost_by_user_month > 1.0]

# NOTE(review): both series are filtered independently, so a (user, month)
# that passes only one of the two thresholds becomes NaN in this index-aligned
# sum - confirm that this is intended.
total_cost = (memorycost_by_user_month + cost_by_user_month).round()