Example #1
def __init__(self, job_registry: Callable[[], JobRegistry], principal: str,
             keytab: str):
    self._job_registry = job_registry
    self._principal = principal
    self._keytab = keytab
    self._batch_jobs = GpsBatchJobs(catalog=None,
                                    jvm=None,
                                    principal=principal,
                                    key_tab=keytab)
Example #2
def test_get_submit_py_files_basic(tmp_path, caplog):
    (tmp_path / "lib.whl").touch()
    (tmp_path / "zop.zip").touch()
    (tmp_path / "__pyfiles__").mkdir()
    (tmp_path / "__pyfiles__" / "stuff.py").touch()
    env = {"OPENEO_SPARK_SUBMIT_PY_FILES": "stuff.py,lib.whl,foo.py"}
    py_files = GpsBatchJobs.get_submit_py_files(env=env, cwd=tmp_path)
    assert py_files == "__pyfiles__/stuff.py,lib.whl"
    warn_logs = [r.message for r in caplog.records if r.levelname == "WARNING"]
    assert warn_logs == ["Could not find 'py-file' foo.py: skipping"]
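
The implementation of get_submit_py_files is not shown on this page; a minimal sketch consistent with the assertions above (the lookup order and the helper name are assumptions, only the env var and the __pyfiles__ convention come from the tests) could look like:

import logging
from pathlib import Path

_log = logging.getLogger(__name__)

def get_submit_py_files_sketch(env: dict, cwd) -> str:
    """Resolve originally submitted py-files against the batch job's working dir."""
    listed = env.get("OPENEO_SPARK_SUBMIT_PY_FILES", "")
    cwd = Path(cwd)
    resolved = []
    for py_file in (p for p in listed.split(",") if p):
        name = py_file.split("/")[-1]  # only the basename survives staging
        if (cwd / "__pyfiles__" / name).exists():
            resolved.append("__pyfiles__/" + name)
        elif (cwd / name).exists():
            resolved.append(name)
        else:
            _log.warning("Could not find 'py-file' {f}: skipping".format(f=py_file))
    return ",".join(resolved)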
Example #3
        def batch_jobs() -> GpsBatchJobs:
            java_opts = [
                "-client",
                "-Dsoftware.amazon.awssdk.http.service.impl=software.amazon.awssdk.http.urlconnection.UrlConnectionSdkHttpService"
            ]

            java_gateway = JavaGateway.launch_gateway(jarpath=args.py4j_jarpath,
                                                      classpath=args.py4j_classpath,
                                                      javaopts=java_opts,
                                                      die_on_exit=True)

            return GpsBatchJobs(get_layer_catalog(opensearch_enrich=True), java_gateway.jvm, args.principal,
                                args.keytab)
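
A hypothetical use of the factory (the job and user ids are placeholders; get_results_metadata is the same call the job tracker makes in example #6):

jobs = batch_jobs()
metadata = jobs.get_results_metadata("j-20220713-abc123", "alice")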
Example #4
def test_get_submit_py_files_deep_paths(tmp_path, caplog):
    # Originally submitted py-files
    env = {
        "OPENEO_SPARK_SUBMIT_PY_FILES": "data/deps/stuff.py,data/deps/lib.whl"
    }
    # Resources present in the Flask app's working directory.
    (tmp_path / "lib.whl").touch()
    (tmp_path / "__pyfiles__").mkdir()
    (tmp_path / "__pyfiles__" / "stuff.py").touch()
    py_files = GpsBatchJobs.get_submit_py_files(env=env, cwd=tmp_path)
    assert py_files == "__pyfiles__/stuff.py,lib.whl"
    warn_logs = [r.message for r in caplog.records if r.levelname == "WARNING"]
    assert warn_logs == []
Example #5
def remove_batch_jobs_before(upper: datetime, jvm: JVMView) -> None:
    _log.info("removing batch jobs before {d}...".format(d=upper))

    # TODO: how to cope with unneeded arguments?
    batch_jobs = GpsBatchJobs(catalog=None, jvm=jvm, principal="", key_tab="")
    batch_jobs.delete_jobs_before(upper)
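
A hedged example of invoking this cleanup, assuming a py4j gateway launched as in example #3 and an arbitrary 60-day retention window:

from datetime import datetime, timedelta

remove_batch_jobs_before(upper=datetime.now() - timedelta(days=60), jvm=java_gateway.jvm)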
Example #6
class JobTracker:
    class _UnknownApplicationIdException(ValueError):
        pass

    _YarnStatus = namedtuple('YarnStatus', [
        'state', 'final_state', 'start_time', 'finish_time',
        'aggregate_resource_allocation'
    ])
    _KubeStatus = namedtuple('KubeStatus',
                             ['state', 'start_time', 'finish_time'])

    def __init__(self, job_registry: Callable[[], JobRegistry], principal: str,
                 keytab: str):
        self._job_registry = job_registry
        self._principal = principal
        self._keytab = keytab
        self._batch_jobs = GpsBatchJobs(catalog=None,
                                        jvm=None,
                                        principal=principal,
                                        key_tab=keytab)

    def loop_update_statuses(self, interval_s: int = 60):
        with self._job_registry() as registry:
            registry.ensure_paths()

        try:
            i = 0

            while True:
                try:
                    _log.info("tracking statuses...")

                    if i % 60 == 0:
                        self._refresh_kerberos_tgt()

                    self.update_statuses()
                except Exception:
                    _log.warning(
                        "scheduling new run after failing to track batch jobs:\n{e}"
                        .format(e=traceback.format_exc()))

                time.sleep(interval_s)

                i += 1
        except KeyboardInterrupt:
            pass

    def update_statuses(self) -> None:
        with self._job_registry() as registry:
            registry.ensure_paths()

            jobs_to_track = registry.get_running_jobs()

            for job_info in jobs_to_track:
                try:
                    job_id, user_id = job_info['job_id'], job_info['user_id']
                    application_id = job_info['application_id']
                    current_status = job_info['status']

                    if application_id:
                        try:
                            if ConfigParams().is_kube_deploy:
                                from openeogeotrellis.utils import s3_client, download_s3_dir
                                state, start_time, finish_time = JobTracker._kube_status(
                                    job_id, user_id)

                                new_status = JobTracker._kube_status_parser(
                                    state)

                                registry.patch(job_id,
                                               user_id,
                                               status=new_status,
                                               started=start_time,
                                               finished=finish_time)

                                if current_status != new_status:
                                    _log.info(
                                        "changed job %s status from %s to %s" %
                                        (job_id, current_status, new_status),
                                        extra={'job_id': job_id})

                                if state == "COMPLETED":
                                    # TODO: do we support SHub batch processes in this environment? The AWS
                                    #  credentials conflict.
                                    download_s3_dir(
                                        "OpenEO-data",
                                        "batch_jobs/{j}".format(j=job_id))

                                    result_metadata = self._batch_jobs.get_results_metadata(
                                        job_id, user_id)
                                    registry.patch(job_id, user_id,
                                                   **result_metadata)

                                    registry.mark_done(job_id, user_id)
                                    _log.info("marked %s as done" % job_id,
                                              extra={'job_id': job_id})
                            else:
                                state, final_state, start_time, finish_time, aggregate_resource_allocation =\
                                    JobTracker._yarn_status(application_id)

                                memory_time_megabyte_seconds, cpu_time_seconds =\
                                    JobTracker._parse_resource_allocation(aggregate_resource_allocation)

                                new_status = JobTracker._to_openeo_status(
                                    state, final_state)

                                registry.patch(
                                    job_id,
                                    user_id,
                                    status=new_status,
                                    started=JobTracker._to_serializable_datetime(start_time),
                                    finished=JobTracker._to_serializable_datetime(finish_time),
                                    memory_time_megabyte_seconds=memory_time_megabyte_seconds,
                                    cpu_time_seconds=cpu_time_seconds)

                                if current_status != new_status:
                                    _log.info(
                                        "changed job %s status from %s to %s" %
                                        (job_id, current_status, new_status),
                                        extra={'job_id': job_id})

                                if final_state != "UNDEFINED":
                                    result_metadata = self._batch_jobs.get_results_metadata(
                                        job_id, user_id)
                                    # TODO: skip patching the job znode and read from this file directly?
                                    registry.patch(job_id, user_id,
                                                   **result_metadata)

                                    if new_status == 'finished':
                                        registry.remove_dependencies(
                                            job_id, user_id)

                                        dependency_sources = JobRegistry.get_dependency_sources(
                                            job_info)

                                        if dependency_sources:
                                            async_task.schedule_delete_batch_process_dependency_sources(
                                                job_id, dependency_sources)

                                    registry.mark_done(job_id, user_id)

                                    _log.info("marked %s as done" % job_id,
                                              extra={
                                                  'job_id':
                                                  job_id,
                                                  'area':
                                                  result_metadata.get('area'),
                                                  'unique_process_ids':
                                                  result_metadata.get(
                                                      'unique_process_ids'),
                                                  'cpu_time_seconds':
                                                  cpu_time_seconds
                                              })
                        except JobTracker._UnknownApplicationIdException:
                            registry.mark_done(job_id, user_id)
                except Exception:
                    _log.warning(
                        "resuming with remaining jobs after failing to handle batch job {j}:\n{e}"
                        .format(j=job_id, e=traceback.format_exc()),
                        extra={'job_id': job_id})
                    registry.set_status(job_id, user_id, 'error')
                    registry.mark_done(job_id, user_id)

    @staticmethod
    def yarn_available() -> bool:
        """Check if YARN tools are available."""
        try:
            _log.info("Checking if Hadoop 'yarn' tool is available")
            output = subprocess.check_output(["yarn",
                                              "version"]).decode("ascii")
            hadoop_yarn_available = "hadoop" in output.lower()
            _log.info(
                "Hadoop yarn available: {a}".format(a=hadoop_yarn_available))
            return hadoop_yarn_available
        except Exception as e:
            _log.info("Failed to run 'yarn': {e!r}".format(e=e))
            return False

    @staticmethod
    def _kube_status(job_id: str, user_id: str) -> '_KubeStatus':
        from openeogeotrellis.utils import kube_client

        user_id_truncated = user_id.split('@')[0][:20]
        job_id_truncated = job_id.split('-')[0]
        try:
            api_instance = kube_client()
            status = api_instance.get_namespaced_custom_object(
                "sparkoperator.k8s.io", "v1beta2", "spark-jobs",
                "sparkapplications",
                "job-{id}-{user}".format(id=job_id_truncated,
                                         user=user_id_truncated))

            return JobTracker._KubeStatus(
                status['status']['applicationState']['state'],
                status['status']['lastSubmissionAttemptTime'],
                status['status']['terminationTime'])

        except Exception as e:
            # Returning None here (as swallowing the exception implicitly did)
            # makes the caller's tuple-unpacking fail with an opaque TypeError;
            # log and re-raise so the tracker's per-job error handling kicks in.
            _log.info(e)
            raise

    @staticmethod
    def _yarn_status(application_id: str) -> '_YarnStatus':
        """Returns (State, Final-State) of a job as reported by YARN."""

        try:
            application_report = subprocess.check_output(
                ["yarn", "application", "-status", application_id]).decode()

            props = re.findall(r"\t(.+) : (.+)", application_report)

            def prop_value(name: str) -> str:
                return next(value for key, value in props if key == name)

            return JobTracker._YarnStatus(
                prop_value("State"), prop_value("Final-State"),
                prop_value("Start-Time"), prop_value("Finish-Time"),
                prop_value("Aggregate Resource Allocation"))
        except CalledProcessError as e:
            stdout = e.stdout.decode()
            if "doesn't exist in RM or Timeline Server" in stdout:
                raise JobTracker._UnknownApplicationIdException(stdout)
            else:
                raise

    @staticmethod
    def _parse_resource_allocation(
            aggregate_resource_allocation: str) -> Tuple[Optional[int], Optional[int]]:
        # re.fullmatch already anchors the pattern, so ^...$ is redundant.
        match = re.fullmatch(r"(\d+) MB-seconds, (\d+) vcore-seconds",
                             aggregate_resource_allocation)

        # Parenthesize the tuple: without it, int(match.group(1)) is evaluated
        # unconditionally and raises AttributeError when there is no match.
        # E.g. "1234 MB-seconds, 56 vcore-seconds" parses to (1234, 56).
        return (int(match.group(1)), int(match.group(2))) if match else (None, None)

    @staticmethod
    def _to_openeo_status(state: str, final_state: str) -> str:
        # TODO: encapsulate status
        if state == 'ACCEPTED':
            new_status = 'queued'
        elif state == 'RUNNING':
            new_status = 'running'
        else:
            new_status = 'created'

        if final_state == 'KILLED':
            new_status = 'canceled'
        elif final_state == 'SUCCEEDED':
            new_status = 'finished'
        elif final_state == 'FAILED':
            new_status = 'error'

        return new_status

    @staticmethod
    def _kube_status_parser(state: str) -> str:
        if state == 'PENDING':
            new_status = 'queued'
        elif state == 'RUNNING':
            new_status = 'running'
        elif state == 'COMPLETED':
            new_status = 'finished'
        elif state == 'FAILED':
            new_status = 'error'
        else:
            new_status = 'created'

        return new_status
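
    # Note: this parser could equally be table-driven, e.g.
    #   {'PENDING': 'queued', 'RUNNING': 'running', 'COMPLETED': 'finished',
    #    'FAILED': 'error'}.get(state, 'created')
    # which is behaviorally equivalent for the Kubernetes variant.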

    @staticmethod
    def _to_serializable_datetime(epoch_millis: str) -> Union[str, None]:
        if epoch_millis == "0":
            return None

        utc_datetime = datetime.utcfromtimestamp(int(epoch_millis) / 1000)
        return date_to_rfc3339(utc_datetime)
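
    # Example: _to_serializable_datetime("0") returns None (YARN reports "0"
    # for a time that is not set), while "1562767022250" maps to an RFC 3339
    # string for 2019-07-10T13:57:02 UTC (exact formatting is up to
    # date_to_rfc3339).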

    def _refresh_kerberos_tgt(self):
        if self._keytab and self._principal:
            cmd = ["kinit", "-V", "-kt", self._keytab, self._principal]

            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)

            for line in p.stdout:
                _log.info(line.rstrip().decode())

            p.wait()
            if p.returncode:
                _log.warning("{c} returned exit code {r}".format(
                    c=" ".join(cmd), r=p.returncode))
        else:
            _log.warning("No Kerberos principal/keytab: will not refresh TGT")
Example #7
def test_extract_application_id():
    yarn_log = """
19/07/10 15:56:39 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
19/07/10 15:56:39 INFO Client: Attempting to login to the Kerberos using principal: [email protected] and keytab: jenkins.keytab-2322e03c-bf97-4f59-b9ad-7c2ecb2d1c70
19/07/10 15:56:39 INFO RequestHedgingRMFailoverProxyProvider: Created wrapped proxy for [rm1, rm2]
19/07/10 15:56:39 INFO RequestHedgingRMFailoverProxyProvider: Looking for the active RM in [rm1, rm2]...
19/07/10 15:56:39 INFO RequestHedgingRMFailoverProxyProvider: Found active RM [rm2]
19/07/10 15:56:39 INFO Client: Requesting a new application from cluster with 99 NodeManagers
19/07/10 15:56:39 INFO Configuration: resource-types.xml not found
19/07/10 15:56:39 INFO ResourceUtils: Unable to find 'resource-types.xml'.
19/07/10 15:56:39 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (55296 MB per container)
19/07/10 15:56:39 INFO Client: Will allocate AM container, with 1408 MB memory including 384 MB overhead
19/07/10 15:56:39 INFO Client: Setting up container launch context for our AM
19/07/10 15:56:39 INFO Client: Setting up the launch environment for our AM container
19/07/10 15:56:39 INFO Client: Credentials file set to: credentials-4bfb4d79-eb95-4578-bd0a-cbfa2bf7d298
19/07/10 15:56:39 INFO Client: Preparing resources for our AM container
19/07/10 15:56:39 INFO HadoopFSDelegationTokenProvider: getting token for: DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_885268276_1, [email protected] (auth:KERBEROS)]]
19/07/10 15:56:39 INFO DFSClient: Created token for jenkins: HDFS_DELEGATION_TOKEN [email protected], renewer=yarn, realUser=, issueDate=1562766999634, maxDate=1563371799634, sequenceNumber=1296276, masterKeyId=1269 on ha-hdfs:hacluster
19/07/10 15:56:39 INFO HadoopFSDelegationTokenProvider: getting token for: DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_885268276_1, [email protected] (auth:KERBEROS)]]
19/07/10 15:56:39 INFO DFSClient: Created token for jenkins: HDFS_DELEGATION_TOKEN [email protected], renewer=jenkins, realUser=, issueDate=1562766999721, maxDate=1563371799721, sequenceNumber=1296277, masterKeyId=1269 on ha-hdfs:hacluster
19/07/10 15:56:39 INFO HadoopFSDelegationTokenProvider: Renewal interval is 86400059 for token HDFS_DELEGATION_TOKEN
19/07/10 15:56:40 INFO Client: To enable the AM to login from keytab, credentials are being copied over to the AM via the YARN Secure Distributed Cache.
19/07/10 15:56:40 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/jenkins.keytab-2322e03c-bf97-4f59-b9ad-7c2ecb2d1c70 -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/jenkins.keytab-2322e03c-bf97-4f59-b9ad-7c2ecb2d1c70
19/07/10 15:56:41 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
19/07/10 15:56:46 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/spark-ad3a2402-36d5-407a-8b30-392033d45899/__spark_libs__4608991107087829959.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/__spark_libs__4608991107087829959.zip
19/07/10 15:56:51 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/geotrellis-extensions-1.3.0-SNAPSHOT.jar -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/geotrellis-extensions-1.3.0-SNAPSHOT.jar
19/07/10 15:56:52 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/geotrellis-backend-assembly-0.4.5-openeo.jar -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/geotrellis-backend-assembly-0.4.5-openeo.jar
19/07/10 15:56:54 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/layercatalog.json -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/layercatalog.json
19/07/10 15:56:54 INFO Client: Uploading resource file:/mnt/ceph/Projects/OpenEO/f5ddcb98-a9ca-440e-a705-da6d71aaab44/in -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/in
19/07/10 15:56:54 INFO Client: Uploading resource https://artifactory.vgt.vito.be/auxdata-public/openeo/venv.zip#venv -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/venv.zip
19/07/10 15:57:01 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/container_e3344_1562328661428_5538_01_000001/venv/lib64/python3.5/site-packages/openeogeotrellis/deploy/batch_job.py -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/batch_job.py
19/07/10 15:57:01 INFO Client: Uploading resource file:/usr/hdp/3.0.0.0-1634/spark2/python/lib/pyspark.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/pyspark.zip
19/07/10 15:57:01 INFO Client: Uploading resource file:/usr/hdp/3.0.0.0-1634/spark2/python/lib/py4j-0.10.7-src.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/py4j-0.10.7-src.zip
19/07/10 15:57:02 INFO Client: Uploading resource file:/data1/hadoop/yarn/local/usercache/jenkins/appcache/application_1562328661428_5538/spark-ad3a2402-36d5-407a-8b30-392033d45899/__spark_conf__2177799938793019578.zip -> hdfs://hacluster/user/jenkins/.sparkStaging/application_1562328661428_5542/__spark_conf__.zip
19/07/10 15:57:02 INFO SecurityManager: Changing view acls to: jenkins
19/07/10 15:57:02 INFO SecurityManager: Changing modify acls to: jenkins
19/07/10 15:57:02 INFO SecurityManager: Changing view acls groups to: 
19/07/10 15:57:02 INFO SecurityManager: Changing modify acls groups to: 
19/07/10 15:57:02 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(jenkins); groups with view permissions: Set(); users  with modify permissions: Set(jenkins); groups with modify permissions: Set()
19/07/10 15:57:02 INFO Client: Submitting application application_1562328661428_5542 to ResourceManager
19/07/10 15:57:02 INFO YarnClientImpl: Submitted application application_1562328661428_5542
19/07/10 15:57:03 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:03 INFO Client: 
	 client token: Token { kind: YARN_CLIENT_TOKEN, service:  }
	 diagnostics: AM container is launched, waiting for AM container to Register with RM
	 ApplicationMaster host: N/A
	 ApplicationMaster RPC port: -1
	 queue: default
	 start time: 1562767022250
	 final status: UNDEFINED
	 tracking URL: http://epod17.vgt.vito.be:8088/proxy/application_1562328661428_5542/
	 user: jenkins
19/07/10 15:57:04 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:05 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:06 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:07 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:08 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:13 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:57:59 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:58:00 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:58:01 INFO Client: Application report for application_1562328661428_5542 (state: ACCEPTED)
19/07/10 15:58:02 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:02 INFO Client: 
	 client token: Token { kind: YARN_CLIENT_TOKEN, service:  }
	 diagnostics: N/A
	 ApplicationMaster host: 192.168.207.182
	 ApplicationMaster RPC port: 0
	 queue: default
	 start time: 1562767022250
	 final status: UNDEFINED
	 tracking URL: http://epod17.vgt.vito.be:8088/proxy/application_1562328661428_5542/
	 user: jenkins
19/07/10 15:58:03 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:04 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:05 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:06 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:07 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:08 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:09 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:10 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
19/07/10 15:58:11 INFO Client: Application report for application_1562328661428_5542 (state: RUNNING)
    """
    assert GpsBatchJobs._extract_application_id(
        yarn_log) == "application_1562328661428_5542"
Example #8
def test_get_submit_py_files_empty(tmp_path):
    env = {"OPENEO_SPARK_SUBMIT_PY_FILES": ""}
    py_files = GpsBatchJobs.get_submit_py_files(env=env, cwd=tmp_path)
    assert py_files == ""
Example #9
def test_get_submit_py_files_no_env(tmp_path):
    py_files = GpsBatchJobs.get_submit_py_files(env={}, cwd=tmp_path)
    assert py_files == ""