Python EmrJobRunner.add_job_flow_steps示例

编程语言: Python

命名空间/包名称: dagster_aws.emr

类/类型: EmrJobRunner

方法/功能: add_job_flow_steps

hotexamples.com的示例: 5

Python EmrJobRunner.add_job_flow_steps - 共找到 5 个示例。以下是从开源项目中提取的、评价最高的 dagster_aws.emr.EmrJobRunner.add_job_flow_steps 真实 Python 示例。您可以对示例进行评价，以帮助我们提高示例质量。

常用方法

显示隐藏

EmrJobRunner(20)

run_job_flow(16)

construct_step_dict_for_command(7)

add_job_flow_steps(5)

describe_cluster(4)

is_emr_step_complete(4)

log_location_for_cluster(4)

retrieve_logs_for_step_id(4)

add_tags(2)

wait_for_emr_steps_to_complete(2)

wait_for_log(2)

cluster_id_from_name(1)

wait_for_steps_to_complete(1)

示例#1

显示文件

文件： test_emr.py 项目： yingjiebyron/dagster

def test_is_emr_step_complete(emr_cluster_config):
    """Check how ``is_emr_step_complete`` reacts to each reported EMR step state.

    ``describe_step`` is patched to report PENDING, RUNNING, COMPLETED and FAILED
    on successive calls. The first two calls must return a falsy value, the
    third a truthy one, and the fourth must raise ``EmrError``.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = "test_step"
    step_cmd = ["ls", "/"]
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id,
        [emr.construct_step_dict_for_command(step_name, step_cmd)])

    def get_step_dict(step_id, step_state):
        # Canned describe_step payload for `step_id`, reporting `step_state`.
        return {
            "Step": {
                "Id": step_id,
                "Name": step_name,
                "Config": {
                    "Jar": "command-runner.jar",
                    "Properties": {},
                    "Args": step_cmd
                },
                "ActionOnFailure": "CONTINUE",
                "Status": {
                    "State": step_state,
                    "StateChangeReason": {
                        "Message": "everything is hosed"
                    },
                    "Timeline": {
                        "StartDateTime": _boto3_now()
                    },
                },
            },
        }

    emr_step_id = step_ids[0]
    # One canned response per expected describe_step call, consumed in order.
    describe_step_returns = [
        get_step_dict(emr_step_id, "PENDING"),
        get_step_dict(emr_step_id, "RUNNING"),
        get_step_dict(emr_step_id, "COMPLETED"),
        get_step_dict(emr_step_id, "FAILED"),
    ]
    with mock.patch.object(EmrJobRunner,
                           "describe_step",
                           side_effect=describe_step_returns):
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        # Only the raising call goes inside the `raises` block: statements placed
        # after it in that block are skipped once the exception propagates.
        with pytest.raises(EmrError) as exc_info:
            emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        assert "step failed" in str(exc_info.value)

示例#2

显示文件

文件： test_emr.py 项目： varokas/dagster-1

def test_is_emr_step_complete(emr_cluster_config):
    '''Check how ``is_emr_step_complete`` reacts to each reported EMR step state.

    ``describe_step`` is patched to report PENDING, RUNNING, COMPLETED and FAILED
    on successive calls. The first two calls must return a falsy value, the
    third a truthy one, and the fourth must raise ``EmrError``.
    '''
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id,
        [emr.construct_step_dict_for_command(step_name, step_cmd)])

    def get_step_dict(step_id, step_state):
        # Canned describe_step payload for `step_id`, reporting `step_state`.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {
                    'Jar': 'command-runner.jar',
                    'Properties': {},
                    'Args': step_cmd
                },
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {
                        'Message': 'everything is hosed'
                    },
                    'Timeline': {
                        'StartDateTime': _boto3_now()
                    },
                },
            },
        }

    emr_step_id = step_ids[0]
    # One canned response per expected describe_step call, consumed in order.
    describe_step_returns = [
        get_step_dict(emr_step_id, 'PENDING'),
        get_step_dict(emr_step_id, 'RUNNING'),
        get_step_dict(emr_step_id, 'COMPLETED'),
        get_step_dict(emr_step_id, 'FAILED'),
    ]
    with mock.patch.object(EmrJobRunner,
                           'describe_step',
                           side_effect=describe_step_returns):
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert not emr.is_emr_step_complete(context.log, cluster_id,
                                            emr_step_id)
        assert emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        # Only the raising call goes inside the `raises` block: statements placed
        # after it in that block are skipped once the exception propagates.
        with pytest.raises(EmrError) as exc_info:
            emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        assert 'step failed' in str(exc_info.value)

示例#3

显示文件

def test_emr_wait_for_step(emr_cluster_config):
    '''Check that ``wait_for_emr_steps_to_complete`` polls until a terminal step state.

    ``describe_step`` is replaced by a stub that reports PENDING, then RUNNING,
    then a configurable final state. With a final state of COMPLETED the wait
    must return normally; with FAILED it must raise ``EmrError``.
    '''
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id, [emr.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Canned describe_step payload for `step_id`, reporting `step_state`.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {'Message': 'everything is hosed'},
                    'Timeline': {'StartDateTime': _boto3_now()},
                },
            },
        }

    # Mutable state shared with the stub so the test can reset the call counter
    # and switch the terminal state between the two scenarios.
    calls = {'num_calls': 0, 'final_state': 'COMPLETED'}

    def new_describe_step(_, cluster_id, step_id):
        # Patched onto the class, so the first positional argument is the runner
        # instance; it is not needed here.
        calls['num_calls'] += 1

        if calls['num_calls'] == 1:
            return get_step_dict(step_id, 'PENDING')
        elif calls['num_calls'] == 2:
            return get_step_dict(step_id, 'RUNNING')
        else:
            return get_step_dict(step_id, calls['final_state'])

    with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
        emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)

    calls['num_calls'] = 0
    calls['final_state'] = 'FAILED'
    with pytest.raises(EmrError) as exc_info:
        with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
            emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)
    assert 'step failed' in str(exc_info.value)

示例#4

显示文件

文件： pyspark_step_launcher.py 项目： punneng/dagster

class EmrPySparkStepLauncher(StepLauncher):
    '''Step launcher that submits a step to an existing EMR cluster via spark-submit.

    Artifacts needed by the remote process are staged under
    ``s3://<staging_bucket>/<staging_prefix>/<run_id>/<step_key>/``. While the EMR step
    runs, events that the remote process pickles back to that location are yielded to
    the caller.
    '''

    def __init__(
        self,
        region_name,
        staging_bucket,
        staging_prefix,
        wait_for_logs,
        action_on_failure,
        cluster_id,
        spark_config,
        local_pipeline_package_path,
        deploy_local_pipeline_package,
        s3_pipeline_package_path=None,
    ):
        '''Validate and store the launcher configuration.

        Args:
            region_name: AWS region used for the S3 clients and the EMR job runner.
            staging_bucket: S3 bucket that receives the staged artifacts.
            staging_prefix: Key prefix inside ``staging_bucket``.
            wait_for_logs: Whether to fetch and log the EMR step logs once the step
                has completed.
            action_on_failure: Passed through to the EMR step definition.
            cluster_id: Id of the EMR cluster that steps are added to.
            spark_config: Spark configuration; flattened into spark-submit options.
            local_pipeline_package_path: Local package that is zipped and uploaded when
                ``deploy_local_pipeline_package`` is true.
            deploy_local_pipeline_package: Whether to zip and upload the local package
                on every launch.
            s3_pipeline_package_path: Optional location of an already-staged package,
                handed to ``--py-files``. Must not be combined with
                ``deploy_local_pipeline_package``.
        '''
        self.region_name = check.str_param(region_name, 'region_name')
        self.staging_bucket = check.str_param(staging_bucket, 'staging_bucket')
        self.staging_prefix = check.str_param(staging_prefix, 'staging_prefix')
        self.wait_for_logs = check.bool_param(wait_for_logs, 'wait_for_logs')
        self.action_on_failure = check.str_param(action_on_failure,
                                                 'action_on_failure')
        self.cluster_id = check.str_param(cluster_id, 'cluster_id')
        self.spark_config = spark_config

        check.invariant(
            not deploy_local_pipeline_package or not s3_pipeline_package_path,
            'If deploy_local_pipeline_package is set to True, s3_pipeline_package_path should not '
            'also be set.',
        )

        self.local_pipeline_package_path = check.str_param(
            local_pipeline_package_path, 'local_pipeline_package_path')
        self.deploy_local_pipeline_package = check.bool_param(
            deploy_local_pipeline_package, 'deploy_local_pipeline_package')
        self.s3_pipeline_package_path = check.opt_str_param(
            s3_pipeline_package_path, 's3_pipeline_package_path')

        self.emr_job_runner = EmrJobRunner(region=self.region_name)

    def _post_artifacts(self, log, step_run_ref, run_id, step_key):
        '''
        Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.

        For the zip file, consider the following toy example:

            # Folder: my_pyspark_project/
            # a.py
            def foo():
                print(1)

            # b.py
            def bar():
                print(2)

            # main.py
            from a import foo
            from b import bar

            foo()
            bar()

        This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. Then, when running
        `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will
        print 1, 2.
        '''

        with seven.TemporaryDirectory() as temp_dir:
            s3 = boto3.client('s3', region_name=self.region_name)

            def _upload_file_to_s3(local_path, s3_filename):
                # Upload one local file to this run/step's staging location.
                key = self._artifact_s3_key(run_id, step_key, s3_filename)
                s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)
                log.debug('Uploading file {local_path} to {s3_uri}'.format(
                    local_path=local_path, s3_uri=s3_uri))
                s3.upload_file(Filename=local_path,
                               Bucket=self.staging_bucket,
                               Key=key)

            # Upload main file.
            # The remote Dagster installation should also have the file, but locating it there
            # could be a pain.
            main_local_path = self._main_file_local_path()
            _upload_file_to_s3(main_local_path, self._main_file_name())

            if self.deploy_local_pipeline_package:
                # Zip and upload package containing pipeline
                zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
                build_pyspark_zip(zip_local_path,
                                  self.local_pipeline_package_path)
                _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)

            # Create step run ref pickle file
            step_run_ref_local_path = os.path.join(
                temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
            with open(step_run_ref_local_path, 'wb') as step_pickle_file:
                pickle.dump(step_run_ref, step_pickle_file)

            # Upload step run ref
            _upload_file_to_s3(step_run_ref_local_path,
                               PICKLED_STEP_RUN_REF_FILE_NAME)

    def launch_step(self, step_context, prior_attempts_count):
        '''Stage artifacts, add the EMR step and yield its events as they arrive.

        Each event read back from S3 is logged through ``log_step_event`` before being
        yielded. When ``wait_for_logs`` is set, the EMR step logs are fetched and logged
        after the step has completed.
        '''
        step_run_ref = step_context_to_step_run_ref(
            step_context, prior_attempts_count,
            self.local_pipeline_package_path)

        run_id = step_context.pipeline_run.run_id
        log = step_context.log

        step_key = step_run_ref.step_key
        self._post_artifacts(log, step_run_ref, run_id, step_key)

        emr_step_def = self._get_emr_step_def(run_id, step_key,
                                              step_context.solid.name)
        # Exactly one step definition is submitted, so exactly one id is expected back.
        emr_step_id = self.emr_job_runner.add_job_flow_steps(
            log, self.cluster_id, [emr_step_def])[0]

        s3 = boto3.resource('s3', region_name=self.region_name)
        for event in self.wait_for_completion(log, s3, run_id, step_key,
                                              emr_step_id):
            log_step_event(step_context, event)
            yield event

        if self.wait_for_logs:
            self._log_logs_from_s3(log, emr_step_id)

    def wait_for_completion(self,
                            log,
                            s3,
                            run_id,
                            step_key,
                            emr_step_id,
                            check_interval=15):
        ''' We want to wait for the EMR steps to complete, and while that's happening, we want to
        yield any events that have been written to S3 for us by the remote process.
        After the EMR steps complete, we want a final chance to fetch events before finishing
        the step.

        ``check_interval`` is the number of seconds slept before each poll.
        '''
        done = False
        all_events = []
        while not done:
            time.sleep(
                check_interval)  # AWS rate-limits us if we poll it too often
            done = self.emr_job_runner.is_emr_step_complete(
                log, self.cluster_id, emr_step_id)

            # Reading after the completion check gives the final chance to pick up
            # events written just before the step finished.
            all_events_new = self.read_events(s3, run_id, step_key)
            if len(all_events_new) > len(all_events):
                # Yield only the events that were not present on the previous poll.
                for event in all_events_new[len(all_events):]:
                    yield event
                all_events = all_events_new

    def read_events(self, s3, run_id, step_key):
        '''Return the events currently stored in S3 for this run/step, or ``[]`` if the
        events file does not exist yet. Any other S3 client error is re-raised.'''
        events_s3_obj = s3.Object(  # pylint: disable=no-member
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME))

        try:
            events_data = events_s3_obj.get()['Body'].read()
            # NOTE(review): unpickling trusts whatever is stored under this key, so the
            # staging location must not be writable by untrusted parties.
            return pickle.loads(events_data)
        except ClientError as ex:
            # The file might not be there yet, which is fine
            if ex.response['Error']['Code'] == 'NoSuchKey':
                return []
            # Bare raise re-raises the active exception unchanged.
            raise

    def _log_logs_from_s3(self, log, emr_step_id):
        '''Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
        them to the given log.'''
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            log, self.cluster_id, emr_step_id)
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            log._log(  # pylint: disable=protected-access
                record.level, record.logger + ': ' + record.message, {})
        log.info(stdout_log)

    def _get_emr_step_def(self, run_id, step_key, solid_name):
        '''From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.

        The ``--py-files`` argument is ``s3_pipeline_package_path`` when one was configured,
        otherwise the staged location of the code zip for this run/step.
        '''
        action_on_failure = self.action_on_failure

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.spark_config))
        conf['spark.app.name'] = conf.get('spark.app.name', solid_name)

        check.invariant(
            conf.get('spark.master', 'yarn') == 'yarn',
            desc='spark.master is configured as %s; cannot set Spark master on EMR to anything '
            'other than "yarn"' % conf.get('spark.master'),
        )

        # The per-run zip is only uploaded when deploy_local_pipeline_package is set; a
        # configured s3_pipeline_package_path is the alternative source of user code and
        # the constructor rejects setting both.
        py_files = self.s3_pipeline_package_path or self._artifact_s3_uri(
            run_id, step_key, CODE_ZIP_NAME)

        command = ([
            EMR_SPARK_HOME + 'bin/spark-submit',
            '--master',
            'yarn',
            '--deploy-mode',
            conf.get('spark.submit.deployMode', 'client'),
        ] + format_for_cli(list(flatten_dict(conf))) + [
            '--py-files',
            py_files,
            self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key,
                                  PICKLED_STEP_RUN_REF_FILE_NAME),
        ])

        return EmrJobRunner.construct_step_dict_for_command(
            'Execute Solid %s' % solid_name,
            command,
            action_on_failure=action_on_failure)

    def _main_file_name(self):
        '''Return the base file name of the EMR entry-point script.'''
        return os.path.basename(self._main_file_local_path())

    def _main_file_local_path(self):
        '''Return the local path of the ``emr_step_main`` module file.'''
        return emr_step_main.__file__

    def _artifact_s3_uri(self, run_id, step_key, filename):
        '''Return the ``s3://`` URI of a staged artifact for the given run and step.'''
        key = self._artifact_s3_key(run_id, step_key, filename)
        return 's3://{bucket}/{key}'.format(bucket=self.staging_bucket,
                                            key=key)

    def _artifact_s3_key(self, run_id, step_key, filename):
        '''Return the S3 key of a staged artifact: prefix, run id, step key and the base
        name of ``filename`` joined with ``/``.'''
        return '/'.join([
            self.staging_prefix, run_id, step_key,
            os.path.basename(filename)
        ])

示例#5

显示文件

文件： pyspark_step_launcher.py 项目： helloworld/dagster

class EmrPySparkStepLauncher(StepLauncher):
    """Step launcher that submits a step to an existing EMR cluster via spark-submit.

    Artifacts needed by the remote process are staged under
    ``s3://<staging_bucket>/<staging_prefix>/<run_id>/<step_key>/``. While the EMR step
    runs, events that the remote process pickles back to that location are yielded to
    the caller.
    """

    def __init__(
        self,
        region_name,
        staging_bucket,
        staging_prefix,
        wait_for_logs,
        action_on_failure,
        cluster_id,
        spark_config,
        local_job_package_path,
        deploy_local_job_package,
        s3_job_package_path=None,
    ):
        """Validate and store the launcher configuration.

        Args:
            region_name: AWS region used for the S3 clients and the EMR job runner.
            staging_bucket: S3 bucket that receives the staged artifacts.
            staging_prefix: Key prefix inside ``staging_bucket``.
            wait_for_logs: Whether to fetch and log the EMR step logs once the step
                has completed or failed.
            action_on_failure: Passed through to the EMR step definition.
            cluster_id: Id of the EMR cluster that steps are added to.
            spark_config: Spark configuration; flattened into spark-submit options.
            local_job_package_path: Local package that is zipped and uploaded when
                ``deploy_local_job_package`` is true.
            deploy_local_job_package: Whether to zip and upload the local package on
                every launch.
            s3_job_package_path: Optional location of an already-staged package, handed
                to ``--py-files``. Must not be combined with ``deploy_local_job_package``.
        """
        self.region_name = check.str_param(region_name, "region_name")
        self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")
        self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")
        self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")
        self.action_on_failure = check.str_param(action_on_failure,
                                                 "action_on_failure")
        self.cluster_id = check.str_param(cluster_id, "cluster_id")
        self.spark_config = spark_config

        check.invariant(
            not deploy_local_job_package or not s3_job_package_path,
            "If deploy_local_job_package is set to True, s3_job_package_path should not "
            "also be set.",
        )

        self.local_job_package_path = check.str_param(
            local_job_package_path, "local_job_package_path")
        self.deploy_local_job_package = check.bool_param(
            deploy_local_job_package, "deploy_local_job_package")
        self.s3_job_package_path = check.opt_str_param(s3_job_package_path,
                                                       "s3_job_package_path")

        self.emr_job_runner = EmrJobRunner(region=self.region_name)

    def _post_artifacts(self, log, step_run_ref, run_id, step_key):
        """
        Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.

        For the zip file, consider the following toy example:

            # Folder: my_pyspark_project/
            # a.py
            def foo():
                print(1)

            # b.py
            def bar():
                print(2)

            # main.py
            from a import foo
            from b import bar

            foo()
            bar()

        This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. Then, when running
        `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will
        print 1, 2.
        """
        from dagster_pyspark.utils import build_pyspark_zip

        with tempfile.TemporaryDirectory() as temp_dir:
            s3 = boto3.client("s3", region_name=self.region_name)

            def _upload_file_to_s3(local_path, s3_filename):
                # Upload one local file to this run/step's staging location.
                key = self._artifact_s3_key(run_id, step_key, s3_filename)
                s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)
                log.debug("Uploading file {local_path} to {s3_uri}".format(
                    local_path=local_path, s3_uri=s3_uri))
                s3.upload_file(Filename=local_path,
                               Bucket=self.staging_bucket,
                               Key=key)

            # Upload main file.
            # The remote Dagster installation should also have the file, but locating it there
            # could be a pain.
            main_local_path = self._main_file_local_path()
            _upload_file_to_s3(main_local_path, self._main_file_name())

            if self.deploy_local_job_package:
                # Zip and upload package containing job
                zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)

                build_pyspark_zip(zip_local_path, self.local_job_package_path)
                _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)

            # Create step run ref pickle file
            step_run_ref_local_path = os.path.join(
                temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
            with open(step_run_ref_local_path, "wb") as step_pickle_file:
                pickle.dump(step_run_ref, step_pickle_file)

            # Upload step run ref
            _upload_file_to_s3(step_run_ref_local_path,
                               PICKLED_STEP_RUN_REF_FILE_NAME)

    def launch_step(self, step_context, prior_attempts_count):
        """Stage artifacts, add the EMR step and yield its events as they arrive."""
        step_run_ref = step_context_to_step_run_ref(
            step_context, prior_attempts_count, self.local_job_package_path)

        run_id = step_context.pipeline_run.run_id
        log = step_context.log

        step_key = step_run_ref.step_key
        self._post_artifacts(log, step_run_ref, run_id, step_key)

        emr_step_def = self._get_emr_step_def(run_id, step_key,
                                              step_context.solid.name)
        # Exactly one step definition is submitted, so exactly one id is expected back.
        emr_step_id = self.emr_job_runner.add_job_flow_steps(
            log, self.cluster_id, [emr_step_def])[0]

        yield from self.wait_for_completion_and_log(log, run_id, step_key,
                                                    emr_step_id, step_context)

    def wait_for_completion_and_log(self, log, run_id, step_key, emr_step_id,
                                    step_context):
        """Yield the step's events, then log the EMR step logs if ``wait_for_logs`` is set.

        Each event is logged through ``log_step_event`` before being yielded. The EMR
        logs are fetched both when the step finishes normally and when waiting raises
        ``EmrError``; in the latter case the error is re-raised afterwards.
        """
        s3 = boto3.resource("s3", region_name=self.region_name)
        try:
            for event in self.wait_for_completion(log, s3, run_id, step_key,
                                                  emr_step_id):
                log_step_event(step_context, event)
                yield event
        except EmrError:
            if self.wait_for_logs:
                self._log_logs_from_s3(log, emr_step_id)
            # Bare raise re-raises the active exception unchanged.
            raise

        if self.wait_for_logs:
            self._log_logs_from_s3(log, emr_step_id)

    def wait_for_completion(self,
                            log,
                            s3,
                            run_id,
                            step_key,
                            emr_step_id,
                            check_interval=15):
        """We want to wait for the EMR steps to complete, and while that's happening, we want to
        yield any events that have been written to S3 for us by the remote process.
        After the EMR steps complete, we want a final chance to fetch events before finishing
        the step.

        ``check_interval`` is the number of seconds slept before each poll.
        """
        done = False
        all_events = []
        # If this is being called within a `capture_interrupts` context, allow interrupts
        # while waiting for the pyspark execution to complete, so that we can terminate slow or
        # hanging steps
        while not done:
            with raise_execution_interrupts():
                time.sleep(check_interval
                           )  # AWS rate-limits us if we poll it too often
                done = self.emr_job_runner.is_emr_step_complete(
                    log, self.cluster_id, emr_step_id)

                # Reading after the completion check gives the final chance to pick up
                # events written just before the step finished.
                all_events_new = self.read_events(s3, run_id, step_key)

            if len(all_events_new) > len(all_events):
                # Yield only the events that were not present on the previous poll.
                yield from all_events_new[len(all_events):]
                all_events = all_events_new

    def read_events(self, s3, run_id, step_key):
        """Return the events currently stored in S3 for this run/step, or ``[]`` if the
        events file does not exist yet. Any other S3 client error is re-raised."""
        events_s3_obj = s3.Object(  # pylint: disable=no-member
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME))

        try:
            events_data = events_s3_obj.get()["Body"].read()
            # NOTE(review): unpickling trusts whatever is stored under this key, so the
            # staging location must not be writable by untrusted parties.
            return pickle.loads(events_data)
        except ClientError as ex:
            # The file might not be there yet, which is fine
            if ex.response["Error"]["Code"] == "NoSuchKey":
                return []
            # Bare raise re-raises the active exception unchanged.
            raise

    def _log_logs_from_s3(self, log, emr_step_id):
        """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
        them to the given log."""
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            log, self.cluster_id, emr_step_id)
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            if record.level:
                log.log(
                    level=record.level,
                    msg="".join([
                        "Spark Driver stderr: ", record.logger, ": ",
                        record.message
                    ]),
                )
            else:
                # Records without a level are logged at debug level.
                log.debug(f"Spark Driver stderr: {record.message}")

        # Driver stdout is written to this process's stdout between marker lines,
        # not sent through `log`.
        sys.stdout.write("---------- Spark Driver stdout: ----------\n" +
                         stdout_log + "\n" +
                         "---------- End of Spark Driver stdout ----------\n")

    def _get_emr_step_def(self, run_id, step_key, solid_name):
        """From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.

        The ``--py-files`` argument is ``s3_job_package_path`` when one was configured,
        otherwise the staged location of the code zip for this run/step.
        """
        from dagster_spark.utils import flatten_dict, format_for_cli

        action_on_failure = self.action_on_failure

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.spark_config))
        conf["spark.app.name"] = conf.get("spark.app.name", solid_name)

        check.invariant(
            conf.get("spark.master", "yarn") == "yarn",
            desc="spark.master is configured as %s; cannot set Spark master on EMR to anything "
            'other than "yarn"' % conf.get("spark.master"),
        )

        # The per-run zip is only uploaded when deploy_local_job_package is set; a
        # configured s3_job_package_path is the alternative source of user code and the
        # constructor rejects setting both.
        py_files = self.s3_job_package_path or self._artifact_s3_uri(
            run_id, step_key, CODE_ZIP_NAME)

        command = ([
            EMR_SPARK_HOME + "bin/spark-submit",
            "--master",
            "yarn",
            "--deploy-mode",
            conf.get("spark.submit.deployMode", "client"),
        ] + format_for_cli(list(flatten_dict(conf))) + [
            "--py-files",
            py_files,
            self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key,
                                  PICKLED_STEP_RUN_REF_FILE_NAME),
        ])

        return EmrJobRunner.construct_step_dict_for_command(
            "Execute Solid/Op %s" % solid_name,
            command,
            action_on_failure=action_on_failure)

    def _main_file_name(self):
        """Return the base file name of the EMR entry-point script."""
        return os.path.basename(self._main_file_local_path())

    def _main_file_local_path(self):
        """Return the local path of the ``emr_step_main`` module file."""
        return emr_step_main.__file__

    def _artifact_s3_uri(self, run_id, step_key, filename):
        """Return the ``s3://`` URI of a staged artifact for the given run and step."""
        key = self._artifact_s3_key(run_id, step_key, filename)
        return "s3://{bucket}/{key}".format(bucket=self.staging_bucket,
                                            key=key)

    def _artifact_s3_key(self, run_id, step_key, filename):
        """Return the S3 key of a staged artifact: prefix, run id, step key and the base
        name of ``filename`` joined with ``/``."""
        return "/".join([
            self.staging_prefix, run_id, step_key,
            os.path.basename(filename)
        ])