def __init__(self, job_registry: Callable[[], JobRegistry], principal: str, keytab: str):
    """Store the registry factory and Kerberos credentials and create the batch jobs helper.

    :param job_registry: zero-argument callable returning a JobRegistry
    :param principal: Kerberos principal, also passed on to GpsBatchJobs
    :param keytab: Kerberos keytab, also passed on to GpsBatchJobs as ``key_tab``
    """
    self._keytab = keytab
    self._principal = principal
    self._job_registry = job_registry

    # No catalog or JVM is handed to the helper here; only the credentials are.
    credentials = {"principal": principal, "key_tab": keytab}
    self._batch_jobs = GpsBatchJobs(catalog=None, jvm=None, **credentials)
def test_get_submit_py_files_basic(tmp_path, caplog):
    """Listed py-files that exist under `cwd` are kept; a listed but missing one is skipped with a warning."""
    pyfiles_dir = tmp_path / "__pyfiles__"
    pyfiles_dir.mkdir()
    # "zop.zip" exists on disk but is not listed in the env variable.
    for present in (tmp_path / "lib.whl", tmp_path / "zop.zip", pyfiles_dir / "stuff.py"):
        present.touch()

    result = GpsBatchJobs.get_submit_py_files(
        env={"OPENEO_SPARK_SUBMIT_PY_FILES": "stuff.py,lib.whl,foo.py"},
        cwd=tmp_path,
    )
    assert result == "__pyfiles__/stuff.py,lib.whl"

    warnings = [record.message for record in caplog.records if record.levelname == "WARNING"]
    assert warnings == ["Could not find 'py-file' foo.py: skipping"]
def batch_jobs() -> GpsBatchJobs:
    """Launch a Py4J Java gateway and build a GpsBatchJobs around its JVM view.

    Gateway paths and the Kerberos principal/keytab are read from the surrounding `args`.
    """
    gateway = JavaGateway.launch_gateway(
        jarpath=args.py4j_jarpath,
        classpath=args.py4j_classpath,
        javaopts=[
            "-client",
            "-Dsoftware.amazon.awssdk.http.service.impl=software.amazon.awssdk.http.urlconnection.UrlConnectionSdkHttpService",
        ],
        die_on_exit=True,
    )

    catalog = get_layer_catalog(opensearch_enrich=True)
    return GpsBatchJobs(catalog, gateway.jvm, args.principal, args.keytab)
def test_get_submit_py_files_deep_paths(tmp_path, caplog):
    """Py-files listed with directory prefixes are expected to resolve by file name under `cwd`, without warnings."""
    # Resources of flask app job.
    pyfiles_dir = tmp_path / "__pyfiles__"
    pyfiles_dir.mkdir()
    for present in (tmp_path / "lib.whl", pyfiles_dir / "stuff.py"):
        present.touch()

    # Originally submitted py-files
    submitted = {"OPENEO_SPARK_SUBMIT_PY_FILES": "data/deps/stuff.py,data/deps/lib.whl"}

    result = GpsBatchJobs.get_submit_py_files(env=submitted, cwd=tmp_path)
    assert result == "__pyfiles__/stuff.py,lib.whl"

    warnings = [record.message for record in caplog.records if record.levelname == "WARNING"]
    assert warnings == []
def remove_batch_jobs_before(upper: datetime, jvm: JVMView) -> None:
    """Delete batch jobs before `upper` by delegating to GpsBatchJobs.delete_jobs_before.

    :param upper: cut-off passed to ``delete_jobs_before``
    :param jvm: JVM view handed to the GpsBatchJobs instance
    """
    message = "removing batch jobs before {d}...".format(d=upper)
    _log.info(message)

    # TODO: how to cope with unneeded arguments?
    GpsBatchJobs(catalog=None, jvm=jvm, principal="", key_tab="").delete_jobs_before(upper)
class JobTracker:
    """Tracks running batch jobs and writes their status to the job registry.

    Depending on ``ConfigParams().is_kube_deploy``, the state of a job is read either from a
    Kubernetes ``sparkapplications`` custom object or from ``yarn application -status``.
    """

    class _UnknownApplicationIdException(ValueError):
        """Raised by `_yarn_status` when YARN reports that the application ID doesn't exist."""
        pass

    _YarnStatus = namedtuple('YarnStatus', [
        'state', 'final_state', 'start_time', 'finish_time', 'aggregate_resource_allocation'
    ])
    _KubeStatus = namedtuple('KubeStatus', ['state', 'start_time', 'finish_time'])

    # The annotation of `job_registry` is a string (like the other string annotations in this
    # class) so that it is not evaluated when the class body is executed.
    def __init__(self, job_registry: 'Callable[[], JobRegistry]', principal: str, keytab: str):
        """
        :param job_registry: zero-argument callable returning a JobRegistry usable as a context manager
        :param principal: Kerberos principal, used for `kinit` and passed on to GpsBatchJobs
        :param keytab: Kerberos keytab, used for `kinit` and passed on to GpsBatchJobs
        """
        self._job_registry = job_registry
        self._principal = principal
        self._keytab = keytab
        self._batch_jobs = GpsBatchJobs(catalog=None, jvm=None, principal=principal, key_tab=keytab)

    def loop_update_statuses(self, interval_s: int = 60):
        """Call `update_statuses` in an endless loop until interrupted with KeyboardInterrupt.

        The Kerberos TGT is refreshed on the first iteration and every 60th iteration after that.
        A failing iteration is logged as a warning and does not stop the loop.

        :param interval_s: seconds to sleep between two iterations
        """
        with self._job_registry() as registry:
            registry.ensure_paths()

        try:
            i = 0

            while True:
                try:
                    _log.info("tracking statuses...")

                    if i % 60 == 0:
                        self._refresh_kerberos_tgt()

                    self.update_statuses()
                except Exception:
                    _log.warning("scheduling new run after failing to track batch jobs:\n{e}"
                                 .format(e=traceback.format_exc()))

                time.sleep(interval_s)

                i += 1
        except KeyboardInterrupt:
            pass

    def update_statuses(self) -> None:
        """Fetch the current state of every running job and patch it into the registry.

        Jobs that reached a final state get their results metadata patched in as well and are
        marked as done. A job whose handling raises is set to status 'error' and marked as done,
        after which the remaining jobs are still processed.
        """
        with self._job_registry() as registry:
            registry.ensure_paths()

            jobs_to_track = registry.get_running_jobs()

            for job_info in jobs_to_track:
                try:
                    job_id, user_id = job_info['job_id'], job_info['user_id']
                    application_id, current_status = job_info['application_id'], job_info['status']

                    if application_id:
                        try:
                            if ConfigParams().is_kube_deploy:
                                from openeogeotrellis.utils import s3_client, download_s3_dir

                                state, start_time, finish_time = JobTracker._kube_status(job_id, user_id)

                                new_status = JobTracker._kube_status_parser(state)

                                registry.patch(job_id, user_id,
                                               status=new_status,
                                               started=start_time,
                                               finished=finish_time)

                                if current_status != new_status:
                                    _log.info("changed job %s status from %s to %s" %
                                              (job_id, current_status, new_status), extra={'job_id': job_id})

                                if state == "COMPLETED":
                                    # TODO: do we support SHub batch processes in this environment? The AWS
                                    #  credentials conflict.
                                    download_s3_dir("OpenEO-data", "batch_jobs/{j}".format(j=job_id))

                                    result_metadata = self._batch_jobs.get_results_metadata(job_id, user_id)
                                    registry.patch(job_id, user_id, **result_metadata)

                                    registry.mark_done(job_id, user_id)
                                    _log.info("marked %s as done" % job_id, extra={'job_id': job_id})
                            else:
                                state, final_state, start_time, finish_time, aggregate_resource_allocation =\
                                    JobTracker._yarn_status(application_id)

                                memory_time_megabyte_seconds, cpu_time_seconds =\
                                    JobTracker._parse_resource_allocation(aggregate_resource_allocation)

                                new_status = JobTracker._to_openeo_status(state, final_state)

                                registry.patch(job_id, user_id,
                                               status=new_status,
                                               started=JobTracker._to_serializable_datetime(start_time),
                                               finished=JobTracker._to_serializable_datetime(finish_time),
                                               memory_time_megabyte_seconds=memory_time_megabyte_seconds,
                                               cpu_time_seconds=cpu_time_seconds)

                                if current_status != new_status:
                                    _log.info("changed job %s status from %s to %s" %
                                              (job_id, current_status, new_status), extra={'job_id': job_id})

                                # Any final state other than "UNDEFINED" means the application has ended.
                                if final_state != "UNDEFINED":
                                    result_metadata = self._batch_jobs.get_results_metadata(job_id, user_id)
                                    # TODO: skip patching the job znode and read from this file directly?
                                    registry.patch(job_id, user_id, **result_metadata)

                                    if new_status == 'finished':
                                        registry.remove_dependencies(job_id, user_id)

                                        dependency_sources = JobRegistry.get_dependency_sources(job_info)

                                        if dependency_sources:
                                            async_task.schedule_delete_batch_process_dependency_sources(
                                                job_id, dependency_sources)

                                    registry.mark_done(job_id, user_id)

                                    _log.info("marked %s as done" % job_id, extra={
                                        'job_id': job_id,
                                        'area': result_metadata.get('area'),
                                        'unique_process_ids': result_metadata.get('unique_process_ids'),
                                        'cpu_time_seconds': cpu_time_seconds
                                    })
                        except JobTracker._UnknownApplicationIdException:
                            # YARN no longer knows this application: stop tracking the job.
                            registry.mark_done(job_id, user_id)
                except Exception:
                    _log.warning("resuming with remaining jobs after failing to handle batch job {j}:\n{e}"
                                 .format(j=job_id, e=traceback.format_exc()), extra={'job_id': job_id})
                    registry.set_status(job_id, user_id, 'error')
                    registry.mark_done(job_id, user_id)

    @staticmethod
    def yarn_available() -> bool:
        """Check if YARN tools are available."""
        try:
            _log.info("Checking if Hadoop 'yarn' tool is available")
            output = subprocess.check_output(["yarn", "version"]).decode("ascii")
            hadoop_yarn_available = "hadoop" in output.lower()
            _log.info("Hadoop yarn available: {a}".format(a=hadoop_yarn_available))
            return hadoop_yarn_available
        except Exception as e:
            _log.info("Failed to run 'yarn': {e!r}".format(e=e))
            return False

    @staticmethod
    def _kube_status(job_id: str, user_id: str) -> '_KubeStatus':
        """Look up the state and start/termination time of a job's Spark application in Kubernetes.

        The custom object name is built from the part of `job_id` before the first '-' and the
        part of `user_id` before the first '@', cut off at 20 characters.

        :return: a `_KubeStatus`, or None if the lookup raised (the exception is logged at info level)
        """
        from openeogeotrellis.utils import kube_client

        user_id_truncated = user_id.split('@')[0][:20]
        job_id_truncated = job_id.split('-')[0]

        try:
            api_instance = kube_client()
            status = api_instance.get_namespaced_custom_object(
                "sparkoperator.k8s.io",
                "v1beta2",
                "spark-jobs",
                "sparkapplications",
                "job-{id}-{user}".format(id=job_id_truncated, user=user_id_truncated))

            return JobTracker._KubeStatus(
                status['status']['applicationState']['state'],
                status['status']['lastSubmissionAttemptTime'],
                status['status']['terminationTime'])
        except Exception as e:
            _log.info(e)
            # NOTE(review): `update_statuses` unpacks the result into three names, so this None
            # surfaces there as a TypeError and is handled as a failure for that job.
            return None

    @staticmethod
    def _yarn_status(application_id: str) -> '_YarnStatus':
        """Return the status of an application as reported by `yarn application -status`.

        :return: a `_YarnStatus` built from the "State", "Final-State", "Start-Time", "Finish-Time"
            and "Aggregate Resource Allocation" properties of the report (all as strings)
        :raises JobTracker._UnknownApplicationIdException: if the command fails and its output says
            that the application doesn't exist in RM or Timeline Server
        :raises CalledProcessError: if the command fails for any other reason
        """
        try:
            application_report = subprocess.check_output(
                ["yarn", "application", "-status", application_id]).decode()

            # Report lines look like "<tab><name> : <value>".
            props = re.findall(r"\t(.+) : (.+)", application_report)

            def prop_value(name: str) -> str:
                return next(value for key, value in props if key == name)

            return JobTracker._YarnStatus(
                prop_value("State"),
                prop_value("Final-State"),
                prop_value("Start-Time"),
                prop_value("Finish-Time"),
                prop_value("Aggregate Resource Allocation"))
        except CalledProcessError as e:
            stdout = e.stdout.decode()

            if "doesn't exist in RM or Timeline Server" in stdout:
                raise JobTracker._UnknownApplicationIdException(stdout)
            else:
                raise

    @staticmethod
    def _parse_resource_allocation(aggregate_resource_allocation) -> (int, int):
        """Parse a string like "123 MB-seconds, 45 vcore-seconds" into (123, 45).

        :return: (memory time in MB-seconds, CPU time in vcore-seconds), or (None, None) if the
            string does not have the expected format
        """
        match = re.fullmatch(r"^(\d+) MB-seconds, (\d+) vcore-seconds$", aggregate_resource_allocation)

        # The parentheses around the tuple are essential: without them the conditional expression
        # only applies to the second element and `match.group(1)` is evaluated even when there is
        # no match.
        return (int(match.group(1)), int(match.group(2))) if match else (None, None)

    @staticmethod
    def _to_openeo_status(state: str, final_state: str) -> str:
        """Map a YARN state/final state pair to a job status; a known final state takes precedence."""
        # TODO: encapsulate status
        if state == 'ACCEPTED':
            new_status = 'queued'
        elif state == 'RUNNING':
            new_status = 'running'
        else:
            new_status = 'created'

        if final_state == 'KILLED':
            new_status = 'canceled'
        elif final_state == 'SUCCEEDED':
            new_status = 'finished'
        elif final_state == 'FAILED':
            new_status = 'error'

        return new_status

    @staticmethod
    def _kube_status_parser(state: str) -> str:
        """Map a Kubernetes Spark application state to a job status ('created' for unknown states)."""
        if state == 'PENDING':
            new_status = 'queued'
        elif state == 'RUNNING':
            new_status = 'running'
        elif state == 'COMPLETED':
            new_status = 'finished'
        elif state == 'FAILED':
            new_status = 'error'
        else:
            new_status = 'created'

        return new_status

    @staticmethod
    def _to_serializable_datetime(epoch_millis: str) -> Union[str, None]:
        """Convert epoch milliseconds (as a string) to an RFC 3339 string; "0" yields None."""
        if epoch_millis == "0":
            return None

        utc_datetime = datetime.utcfromtimestamp(int(epoch_millis) / 1000)
        return date_to_rfc3339(utc_datetime)

    def _refresh_kerberos_tgt(self):
        """Run `kinit` with the configured keytab and principal, logging its output line by line.

        Only logs a warning if no principal/keytab is configured or if `kinit` exits with a
        non-zero code; nothing is raised in those cases.
        """
        if self._keytab and self._principal:
            cmd = ["kinit", "-V", "-kt", self._keytab, self._principal]

            # Leaving the `with` block closes the stdout pipe and waits for the process to end,
            # so `returncode` is set afterwards.
            with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
                for line in p.stdout:
                    _log.info(line.rstrip().decode())

            if p.returncode:
                _log.warning("{c} returned exit code {r}".format(c=" ".join(cmd), r=p.returncode))
        else:
            _log.warning("No Kerberos principal/keytab: will not refresh TGT")
# Checks that GpsBatchJobs._extract_application_id returns "application_1562328661428_5542" for the
# client log below. The log also mentions "application_1562328661428_5538" (in local appcache paths),
# which is not the expected result.
def test_extract_application_id(): yarn_log = """ 19/07/10 15:56:39 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded. 19/07/10 15:56:39 INFO Client: Attempting to login to the Kerberos using principal: [email protected] and keytab: jenkins.keytab-2322e03c-bf97-4f59-b9ad-7c2ecb2d1c70 19/07/10 15:56:39 INFO RequestHedgingRMFailoverProxyProvider: Created wrapped proxy for [rm1, rm2] 19/07/10 15:56:39 INFO RequestHedgingRMFailoverProxyProvider: Looking for the active RM in [rm1, rm2]... 19/07/10 15:56:39 INFO RequestHedgingRMFailoverProxyProvider: Found active RM [rm2] 19/07/10 15:56:39 INFO Client: Requesting a new application from cluster with 99 NodeManagers 19/07/10 15:56:39 INFO Configuration: resource-types.xml not found 19/07/10 15:56:39 INFO ResourceUtils: Unable to find 'resource-types.xml'. 19/07/10 15:56:39 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (55296 MB per container) 19/07/10 15:56:39 INFO Client: Will allocate AM container, with 1408 MB memory including 384 MB overhead 19/07/10 15:56:39 INFO Client: Setting up container launch context for our AM 19/07/10 15:56:39 INFO Client: Setting up the launch environment for our AM container 19/07/10 15:56:39 INFO Client: Credentials file set to: credentials-4bfb4d79-eb95-4578-bd0a-cbfa2bf7d298 19/07/10 15:56:39 INFO Client: Preparing resources for our AM container 19/07/10 15:56:39 INFO HadoopFSDelegationTokenProvider: getting token for: DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_885268276_1, [email protected] (auth:KERBEROS)]] 19/07/10 15:56:39 INFO DFSClient: Created token for jenkins: HDFS_DELEGATION_TOKEN [email protected], renewer=yarn, realUser=, issueDate=1562766999634, maxDate=1563371799634, sequenceNumber=1296276, masterKeyId=1269 on ha-hdfs:hacluster 19/07/10 15:56:39 INFO HadoopFSDelegationTokenProvider: getting token for: 
DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_885268276_1, [email protected] (auth:KERBEROS)]] 19/07/10 15:56:39 INFO DFSClient: Created token for jenkins: HDFS_DELEGATION_TOKEN [email protected], renewer=jenkins, realUser=, issueDate=1562766999721, maxDate=1563371799721, sequenceNumber=1296277, masterKeyId=1269 on ha-hdfs:hacluster 19/07/10 15:56:39 INFO HadoopFSDelegationTokenProvider: Renewal interval is 86400059 for token HDFS_DELEGATION_TOKEN 19/07/10 15:56:40 INFO Client: To enable the AM to login from keytab, credentials are being copied over to the AM via the YARN Secure Distributed Cache. 19/07/10 15:56:40 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/jenkins.keytab-2322e03c-bf97-4f59-b9ad-7c2ecb2d1c70 -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/jenkins.keytab-2322e03c-bf97-4f59-b9ad-7c2ecb2d1c70 19/07/10 15:56:41 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. 
19/07/10 15:56:46 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/spark-ad3a2402-36d5-407a-8b30-392033d45899/__spark_libs__4608991107087829959.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/__spark_libs__4608991107087829959.zip 19/07/10 15:56:51 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/geotrellis-extensions-1.3.0-SNAPSHOT.jar -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/geotrellis-extensions-1.3.0-SNAPSHOT.jar 19/07/10 15:56:52 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/geotrellis-backend-assembly-0.4.5-openeo.jar -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/geotrellis-backend-assembly-0.4.5-openeo.jar 19/07/10 15:56:54 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/layercatalog.json -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/layercatalog.json 19/07/10 15:56:54 INFO Client: Uploading resource file:/mnt/ceph/Projects/OpenEO/f5ddcb98-a9ca-440e-a705-da6d71aaab44/in -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/in 19/07/10 15:56:54 INFO Client: Uploading resource https://artifactory.vgt.vito.be/auxdata-public/openeo/venv.zip#venv -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/venv.zip 19/07/10 15:57:01 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/venv/lib64/python3.5/site-packages/openeogeotrellis/deploy/batch_job.py -> 
hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/batch_job.py 19/07/10 15:57:01 INFO Client: Uploading resource file:/usr/hdp/3.0.0.0-1634/spark2/python/lib/pyspark.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/pyspark.zip 19/07/10 15:57:01 INFO Client: Uploading resource file:/usr/hdp/3.0.0.0-1634/spark2/python/lib/py4j-0.10.7-src.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/py4j-0.10.7-src.zip 19/07/10 15:57:02 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/spark-ad3a2402-36d5-407a-8b30-392033d45899/__spark_conf__2177799938793019578.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/__spark_conf__.zip 19/07/10 15:57:02 INFO SecurityManager: Changing view acls to: jenkins 19/07/10 15:57:02 INFO SecurityManager: Changing modify acls to: jenkins 19/07/10 15:57:02 INFO SecurityManager: Changing view acls groups to: 19/07/10 15:57:02 INFO SecurityManager: Changing modify acls groups to: 19/07/10 15:57:02 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(jenkins); groups with view permissions: Set(); users with modify permissions: Set(jenkins); groups with modify permissions: Set() 19/07/10 15:57:02 INFO Client: Submitting application application_1562328661428_5542 to ResourceManager 19/07/10 15:57:02 INFO YarnClientImpl: Submitted application application_1562328661428_5542 19/07/10 15:57:03 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:03 INFO Client: client token: Token { kind: YARN_CLIENT_TOKEN, service: } diagnostics: AM container is launched, waiting for AM container to Register with RM ApplicationMaster host: N/A ApplicationMaster RPC port: -1 queue: default start time: 1562767022250 final status: UNDEFINED tracking URL: 
http://epod17.vgt.vito.be:8088/proxy/application_1562328661428_5542/ user: jenkins 19/07/10 15:57:04 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:05 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:06 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:07 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:08 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:13 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:57:59 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:58:00 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:58:01 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED) 19/07/10 15:58:02 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:02 INFO Client: client token: Token { kind: YARN_CLIENT_TOKEN, service: } diagnostics: N/A ApplicationMaster host: 192.168.207.182 ApplicationMaster RPC port: 0 queue: default start time: 1562767022250 final status: UNDEFINED tracking URL: http://epod17.vgt.vito.be:8088/proxy/application_1562328661428_5542/ user: jenkins 19/07/10 15:58:03 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:04 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:05 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:06 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:07 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:08 INFO Client: 
Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:09 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:10 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) 19/07/10 15:58:11 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING) """ assert GpsBatchJobs._extract_application_id( yarn_log) == "application_1562328661428_5542"
def test_get_submit_py_files_empty(tmp_path):
    """An empty OPENEO_SPARK_SUBMIT_PY_FILES value yields an empty py-files string."""
    result = GpsBatchJobs.get_submit_py_files(
        env={"OPENEO_SPARK_SUBMIT_PY_FILES": ""},
        cwd=tmp_path,
    )
    assert result == ""
def test_get_submit_py_files_no_env(tmp_path):
    """Without OPENEO_SPARK_SUBMIT_PY_FILES in the environment the py-files string is empty."""
    empty_env = {}
    result = GpsBatchJobs.get_submit_py_files(env=empty_env, cwd=tmp_path)
    assert result == ""