    def stop(self):
        self._spark_session.stop()

    @abc.abstractmethod
    def get_compute_fn(self, fn, solid_name):
        pass


class SystemPySparkResource(PySparkResourceDefinition):
    def get_compute_fn(self, fn, solid_name):
        return fn


@resource({
    'spark_conf': spark_config(),
    'stop_session': Field(
        bool,
        is_optional=True,
        default_value=True,
        description='Whether to stop the Spark session on pipeline completion. '
        'Defaults to True.',
    ),
})
def pyspark_resource(init_context):
    pyspark = SystemPySparkResource(init_context.resource_config['spark_conf'])
    try:
        yield pyspark
    finally:
        if init_context.resource_config['stop_session']:
            pyspark.stop()
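# Usage sketch (illustrative, not from the source): wiring pyspark_resource into a
# pipeline. `make_people` and `my_pipeline` are hypothetical names, and the sketch
# assumes the resource object exposes a `spark_session` property (as the
# PySparkResource class further below does); exact solid/pipeline APIs vary across
# the legacy Dagster versions these snippets span.
from dagster import ModeDefinition, pipeline, solid


@solid(required_resource_keys={'pyspark'})
def make_people(context):
    rows = [('Alice', 30), ('Bob', 25)]
    df = context.resources.pyspark.spark_session.createDataFrame(rows, ['name', 'age'])
    context.log.info('Row count: {}'.format(df.count()))


@pipeline(mode_defs=[ModeDefinition(resource_defs={'pyspark': pyspark_resource})])
def my_pipeline():
    make_people()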
}))
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = as_dagster_type(RDD, 'SparkRDD', input_schema=load_rdd, output_schema=write_rdd)


@resource(config_field=Field(Dict({'spark_conf': spark_config()})))
def spark_session_resource(init_context):
    builder = SparkSession.builder
    flat = flatten_dict(init_context.resource_config['spark_conf'])
    for key, value in flat:
        builder = builder.config(key, value)

    spark = builder.getOrCreate()
    try:
        yield spark
    finally:
        spark.stop()
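# Usage sketch (illustrative, not from the source): a solid that emits a SparkRDD
# built from the shared SparkSession. `make_words_rdd` is a hypothetical name; the
# solid/OutputDefinition calls follow the legacy Dagster API used by the code above.
from dagster import OutputDefinition, solid


@solid(required_resource_keys={'spark'}, output_defs=[OutputDefinition(SparkRDD)])
def make_words_rdd(context):
    # context.resources.spark is the SparkSession yielded by spark_session_resource.
    return context.resources.spark.sparkContext.parallelize(['hello', 'world'])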
class PySparkResource:
    def __init__(self, spark_conf):
        self._spark_session = spark_session_from_config(spark_conf)

    @property
    def spark_session(self):
        return self._spark_session

    @property
    def spark_context(self):
        return self.spark_session.sparkContext


@resource({"spark_conf": spark_config()})
def pyspark_resource(init_context):
    """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.

    Example:
        .. code-block:: python

            @solid(required_resource_keys={"pyspark"})
            def my_solid(context):
                spark_session = context.resources.pyspark.spark_session
                dataframe = spark_session.read.json("examples/src/main/resources/people.json")

            my_pyspark_resource = pyspark_resource.configured(
                {"spark_conf": {"spark.executor.memory": "2g"}}
    'SparkRDD',
    input_hydration_config=load_rdd,
    output_materialization_config=write_rdd,
)


def spark_session_from_config(spark_conf=None):
    spark_conf = check.opt_dict_param(spark_conf, 'spark_conf')
    builder = SparkSession.builder
    flat = flatten_dict(spark_conf)
    for key, value in flat:
        builder = builder.config(key, value)

    return builder.getOrCreate()


@resource({'spark_conf': spark_config()})
def spark_session_resource(init_context):
    spark = spark_session_from_config(init_context.resource_config['spark_conf'])
    try:
        yield spark
    finally:
        spark.stop()


@output_selector_schema(
    Selector({
        'csv': Field(
            Dict({
                'path': Field(Path),
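# Usage sketch (illustrative, not from the source): spark_session_from_config can
# also be called directly. This assumes flatten_dict turns the nested dict into
# dotted Spark keys such as 'spark.executor.memory'; the values are examples only.
spark = spark_session_from_config({'spark': {'master': 'local[2]', 'executor': {'memory': '1g'}}})
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
df.show()
spark.stop()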
        return steps

    @property
    def running_on_emr(self):
        '''Detects whether we are running on the EMR cluster.'''
        if os.path.exists('/mnt/var/lib/info/job-flow.json'):
            return True
        return False


@resource(
    {
        'pipeline_file': Field(str, description='Path to the file where the pipeline is defined'),
        'pipeline_fn_name': Field(str),
        'spark_config': spark_config(),
        'cluster_id': Field(str, description='Name of the job flow (cluster) on which to execute'),
        'region_name': Field(str),
        'action_on_failure': Field(str, is_required=False, default_value='CANCEL_AND_WAIT'),
        'staging_bucket': Field(
            str,
            is_required=True,
            description='S3 staging bucket to use for staging the produced main.py and zip file of'
            ' Python code',
        ),
        'requirements_file_path': Field(
            str,
            is_required=False,
            description='Path to a requirements.txt file; the current directory is searched if none'
            ' is specified.',
        ),
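# Config sketch (illustrative, not from the source): example values for the fields
# declared in the resource schema above. Every value here is a placeholder; only
# the field names come from the schema.
EXAMPLE_EMR_RESOURCE_CONFIG = {
    'pipeline_file': 'path/to/pipeline_definitions.py',
    'pipeline_fn_name': 'my_pipeline',
    'spark_config': {},
    'cluster_id': 'j-EXAMPLECLUSTER',
    'region_name': 'us-west-2',
    'action_on_failure': 'CANCEL_AND_WAIT',
    'staging_bucket': 'my-staging-bucket',
    'requirements_file_path': 'requirements.txt',
}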