Example #1
    def _get_execute_steps(self, context, solid_name):
        '''From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.
        '''
        action_on_failure = self.config['action_on_failure']
        staging_bucket = self.config['staging_bucket']

        run_id = context.run_id
        local_root = os.path.dirname(
            os.path.abspath(self.config['pipeline_file']))

        steps = []

        # Install Python dependencies if a requirements file exists
        requirements_file = self.config.get('requirements_file_path')
        if requirements_file and not os.path.exists(requirements_file):
            raise DagsterInvalidDefinitionError(
                'The requirements file specified at %s does not exist' %
                requirements_file)

        if not requirements_file:
            requirements_file = os.path.join(local_root, 'requirements.txt')

        if os.path.exists(requirements_file):
            with open(requirements_file, 'rb') as f:
                python_dependencies = six.ensure_str(f.read()).split('\n')
                steps.append(
                    get_install_requirements_step(python_dependencies,
                                                  action_on_failure))

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.config.get('spark_conf', {})))
        conf['spark.app.name'] = conf.get('spark.app.name', solid_name)

        check.invariant(
            conf.get('spark.master', 'yarn') == 'yarn',
            desc='spark.master is configured as %s; cannot set Spark master on EMR to '
            'anything other than "yarn"' % conf.get('spark.master'),
        )

        steps.append({
            'Name': 'Execute Solid %s' % solid_name,
            'ActionOnFailure': action_on_failure,
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': [
                    EMR_SPARK_HOME + 'bin/spark-submit',
                    '--master',
                    'yarn',
                    '--deploy-mode',
                    conf.get('spark.submit.deployMode', 'client'),
                ] + format_for_cli(list(flatten_dict(conf))) + [
                    '--py-files',
                    's3://%s/%s/pyspark.zip' % (staging_bucket, run_id),
                    's3://%s/%s/main.py' % (staging_bucket, run_id),
                ],
            },
        })
        return steps
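
The EMR examples here lean on two helpers, flatten_dict and format_for_cli, from dagster_spark.utils. A minimal sketch of the behavior the call sites assume (the real implementations may differ):

def flatten_dict(d):
    '''Flatten a nested dict into (dotted.key, value) pairs, e.g.
    {'spark': {'app': {'name': 'foo'}}} -> [('spark.app.name', 'foo')].'''
    def _flatten(prefix, subdict):
        for key, value in subdict.items():
            full_key = '%s.%s' % (prefix, key) if prefix else key
            if isinstance(value, dict):
                for pair in _flatten(full_key, value):
                    yield pair
            else:
                yield (full_key, value)
    return list(_flatten('', d or {}))

def format_for_cli(flattened_conf):
    '''Render flattened conf pairs as spark-submit arguments, e.g.
    [('spark.app.name', 'foo')] -> ['--conf', 'spark.app.name=foo'].'''
    args = []
    for key, value in flattened_conf:
        args += ['--conf', '%s=%s' % (key, value)]
    return args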
Example #2
# Imports assumed from the surrounding codebase (cf. the import in Example #4)
from dagster import check
from dagster_spark.utils import flatten_dict
from pyspark.sql import SparkSession


def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    flat = flatten_dict(spark_conf)
    # flatten_dict yields (dotted.key, value) pairs, e.g. ('spark.app.name', ...)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate()
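
Example usage of the function above; the nested shape of spark_conf is an assumption about what flatten_dict accepts:

spark = spark_session_from_config({
    'spark': {
        'app': {'name': 'my_app'},
        'executor': {'memory': '2g'},
    }
})
print(spark.range(10).count())  # 10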
Example #3
# Imports assumed from the surrounding codebase (cf. the import in Example #4)
from dagster_spark.utils import flatten_dict
from pyspark.sql import SparkSession


# Presumably wrapped with dagster's @resource decorator in the original
# source, which is what supplies init_context.
def spark_session_resource(init_context):
    builder = SparkSession.builder
    flat = flatten_dict(init_context.resource_config['spark_conf'])
    for key, value in flat:
        builder = builder.config(key, value)

    spark = builder.getOrCreate()
    try:
        yield spark
    finally:
        # Stop the session when the resource is torn down at the end of the run
        spark.stop()
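
A hypothetical wiring sketch for the resource above, assuming the legacy dagster mode/resource API; the decorator, solid, and pipeline names here are illustrative, not part of the snippet:

from dagster import ModeDefinition, Permissive, pipeline, resource, solid

spark_resource = resource(config_schema={'spark_conf': Permissive()})(
    spark_session_resource
)

@solid(required_resource_keys={'spark'})
def row_count(context):
    return context.resources.spark.range(100).count()

@pipeline(mode_defs=[ModeDefinition(resource_defs={'spark': spark_resource})])
def count_pipeline():
    row_count()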
Example #4
    def _get_emr_step_def(self, run_id, step_key, solid_name):
        """From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.
        """
        from dagster_spark.utils import flatten_dict, format_for_cli

        action_on_failure = self.action_on_failure

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.spark_config))
        conf["spark.app.name"] = conf.get("spark.app.name", solid_name)

        check.invariant(
            conf.get("spark.master", "yarn") == "yarn",
            desc="spark.master is configured as %s; cannot set Spark master on EMR to anything "
            'other than "yarn"' % conf.get("spark.master"),
        )

        command = (
            [
                EMR_SPARK_HOME + "bin/spark-submit",
                "--master",
                "yarn",
                "--deploy-mode",
                conf.get("spark.submit.deployMode", "client"),
            ]
            + format_for_cli(list(flatten_dict(conf)))
            + [
                "--py-files",
                self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),
                self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
                self.staging_bucket,
                self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
            ]
        )

        return EmrJobRunner.construct_step_dict_for_command(
            "Execute Solid %s" % solid_name, command, action_on_failure=action_on_failure
        )
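
The step dict that EmrJobRunner.construct_step_dict_for_command returns presumably matches the one Example #1 builds inline; a sketch of that shape, not the real dagster_aws implementation:

def construct_step_dict_for_command(name, command, action_on_failure='CONTINUE'):
    # Mirrors the inline step dict from Example #1
    return {
        'Name': name,
        'ActionOnFailure': action_on_failure,
        'HadoopJarStep': {'Jar': 'command-runner.jar', 'Args': command},
    }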
Example #5
    def _get_emr_step_def(self, run_id, step_key, solid_name):
        '''From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.
        '''
        action_on_failure = self.action_on_failure

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.spark_config))
        conf['spark.app.name'] = conf.get('spark.app.name', solid_name)

        check.invariant(
            conf.get('spark.master', 'yarn') == 'yarn',
            desc='spark.master is configured as %s; cannot set Spark master on EMR to '
            'anything other than "yarn"' % conf.get('spark.master'),
        )

        command = (
            [
                EMR_SPARK_HOME + 'bin/spark-submit',
                '--master',
                'yarn',
                '--deploy-mode',
                conf.get('spark.submit.deployMode', 'client'),
            ]
            + format_for_cli(list(flatten_dict(conf)))
            + [
                '--py-files',
                self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),
                self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
                self.staging_bucket,
                self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
            ]
        )

        return EmrJobRunner.construct_step_dict_for_command(
            'Execute Solid %s' % solid_name,
            command,
            action_on_failure=action_on_failure)
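
For illustration, the command list the method above assembles, using the helper sketches from Example #1, placeholder S3 URIs, and the assumption that EMR_SPARK_HOME is '/usr/lib/spark/':

conf = {'spark.app.name': 'my_solid', 'spark.executor.memory': '2g'}
command = (
    ['/usr/lib/spark/bin/spark-submit', '--master', 'yarn', '--deploy-mode', 'client']
    + format_for_cli(list(flatten_dict(conf)))
    + ['--py-files', 's3://bucket/run_id/step_key/code.zip',
       's3://bucket/run_id/step_key/main.py']
)
# -> ['/usr/lib/spark/bin/spark-submit', '--master', 'yarn', '--deploy-mode',
#     'client', '--conf', 'spark.app.name=my_solid',
#     '--conf', 'spark.executor.memory=2g', '--py-files', ...]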