def generate_solid(solid_id, num_inputs, num_outputs, num_cfg):
    def compute_fn(_context, **_kwargs):
        for i in range(num_outputs):
            yield Output(i, "out_{}".format(i))

    config = {}
    for i in range(num_cfg):
        config[f"field_{i}"] = Field(str, is_required=False)

    return SolidDefinition(
        name=solid_id,
        input_defs=[
            InputDefinition(name="in_{}".format(i), default_value="default")
            for i in range(num_inputs)
        ],
        output_defs=[
            OutputDefinition(name="out_{}".format(i)) for i in range(num_outputs)
        ],
        compute_fn=compute_fn,
        config_schema=config,
    )
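# A hedged usage sketch of the generate_solid factory above. The solid name and
# counts are illustrative, and the dagster names used by the factory (Output,
# Field, InputDefinition, OutputDefinition, SolidDefinition) are assumed to be
# imported as in the surrounding snippets.
example_solid = generate_solid("solid_0", num_inputs=2, num_outputs=3, num_cfg=1)
assert example_solid.name == "solid_0"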
def test_basic_solid_with_config():
    did_get = {}

    def _t_fn(context, _inputs):
        did_get['yep'] = context.solid_config

    solid = SolidDefinition(
        name='solid_with_context',
        inputs=[],
        outputs=[],
        config_field=Field(Dict({'some_config': Field(String)})),
        transform_fn=_t_fn,
    )

    pipeline = PipelineDefinition(solids=[solid])

    execute_pipeline(
        pipeline, {'solids': {'solid_with_context': {'config': {'some_config': 'foo'}}}}
    )

    assert 'yep' in did_get
    assert 'some_config' in did_get['yep']
def test_wrong_solid_name():
    pipeline_def = PipelineDefinition(
        name='pipeline_wrong_solid_name',
        solid_defs=[
            SolidDefinition(
                name='some_solid',
                input_defs=[],
                output_defs=[],
                config_field=Field(Int),
                compute_fn=lambda *_args: None,
            )
        ],
    )

    env_config = {'solids': {'another_name': {'config': {}}}}

    with pytest.raises(DagsterInvalidConfigError) as pe_info:
        execute_pipeline(pipeline_def, env_config)

    pe = pe_info.value
    assert 'Undefined field "another_name" at path root:solids' in str(pe)
def test_provided_default_on_resources_config():
    pipeline_def = PipelineDefinition(
        mode_definitions=[
            ModeDefinition(
                name='some_mode',
                resources={
                    'some_resource': ResourceDefinition(
                        resource_fn=lambda: None,
                        config_field=Field(
                            Dict(
                                {
                                    'with_default_int': Field(
                                        Int, is_optional=True, default_value=23434
                                    )
                                }
                            )
                        ),
                    )
                },
            )
        ],
        solids=[
            SolidDefinition(
                name='some_solid', inputs=[], outputs=[], compute_fn=lambda *args: None
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)
    assert env_type.type_attributes.is_system_config

    some_resource_field = env_type.fields['resources'].config_type.fields['some_resource']
    assert some_resource_field.is_optional

    some_resource_config_field = some_resource_field.config_type.fields['config']
    assert some_resource_config_field.is_optional
    assert some_resource_config_field.default_value == {'with_default_int': 23434}

    assert some_resource_field.default_value == {'config': {'with_default_int': 23434}}

    value = EnvironmentConfig.from_dict(throwing_evaluate_config_value(env_type, {}))
    assert value.resources == {'some_resource': {'config': {'with_default_int': 23434}}}
def test_solid_not_found():
    def _t_fn(*_args):
        raise Exception('should not reach')

    solid = SolidDefinition(
        name='find_me_solid',
        inputs=[],
        outputs=[],
        transform_fn=_t_fn,
    )

    pipeline = PipelineDefinition(solids=[solid])

    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline(
            pipeline,
            config.Environment(solids={
                'not_found': config.Solid({
                    'some_config': 1,
                }),
            }),
        )
def test_required_solid_with_required_subfield():
    pipeline_def = PipelineDefinition(
        name="some_pipeline",
        solid_defs=[
            SolidDefinition(
                name="int_config_solid",
                config_schema={"required_field": String},
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *_args: None,
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)

    assert env_type.fields["solids"].is_required is True
    assert env_type.fields["solids"].config_type

    solids_type = env_type.fields["solids"].config_type
    assert solids_type.fields["int_config_solid"].is_required is True

    int_config_solid_type = solids_type.fields["int_config_solid"].config_type
    assert int_config_solid_type.fields["config"].is_required is True

    assert env_type.fields["execution"].is_required is False

    env_obj = EnvironmentConfig.build(
        pipeline_def,
        {"solids": {"int_config_solid": {"config": {"required_field": "foobar"}}}},
    )

    assert env_obj.solids["int_config_solid"].config["required_field"] == "foobar"

    res = process_config(env_type, {"solids": {}})
    assert not res.success

    res = process_config(env_type, {})
    assert not res.success
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config=None,
    required_resource_keys=None,
):
    '''Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[list[:class:`dagster.InputDefinition`]]): The solid's inputs.
        output_defs (Optional[list[:class:`dagster.OutputDefinition`]]): The solid's outputs.
        required_resource_keys (Optional[set[str]]): The string names of any required resources.

    Returns:
        :class:`dagster.SolidDefinition`
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    input_defs = check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, 'output_defs', of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name, notebook_path),
        output_defs=output_defs,
        config=check_user_facing_opt_config_param(config, 'config'),
        required_resource_keys=required_resource_keys,
        description='This solid is backed by the notebook at {path}'.format(path=notebook_path),
        tags={'notebook_path': notebook_path, 'kind': 'ipynb'},
    )
def test_config_arg_mismatch():
    def _t_fn(*_args):
        raise Exception('should not reach')

    solid = SolidDefinition(
        name='solid_with_context',
        inputs=[],
        outputs=[],
        config_def=ConfigDefinition.config_dict({
            'some_config': Field(types.String)
        }),
        transform_fn=_t_fn,
    )

    pipeline = PipelineDefinition(solids=[solid])

    with pytest.raises(DagsterTypeError):
        execute_pipeline(
            pipeline,
            config.Environment(solids={'solid_with_context': config.Solid({
                'some_config': 1
            })}),
        )
def define_more_complicated_config():
    return PipelineDefinition(
        name='more_complicated_config',
        solids=[
            SolidDefinition(
                name='a_solid_with_three_field_config',
                inputs=[],
                outputs=[],
                transform_fn=lambda *_args: None,
                config_field=Field(
                    Dict(
                        {
                            'field_one': Field(String),
                            'field_two': Field(String, is_optional=True),
                            'field_three': Field(
                                String, is_optional=True, default_value='some_value'
                            ),
                        }
                    )
                ),
            )
        ],
    )
def create_templated_sql_transform_solid(name, sql, table_arguments, dependant_solids=None):
    check.str_param(name, 'name')
    check.str_param(sql, 'sql')
    check.list_param(table_arguments, 'table_arguments', of_type=str)

    dependant_solids = check.opt_list_param(
        dependant_solids, 'dependant_solids', of_type=SolidDefinition
    )

    field_dict = {}
    for table in table_arguments:
        field_dict[table] = Field(String)

    return SolidDefinition(
        name=name,
        inputs=[InputDefinition(solid.name) for solid in dependant_solids],
        config_field=Field(Dict(field_dict)),
        transform_fn=_create_templated_sql_transform_with_output(sql),
        outputs=[
            OutputDefinition(name='result', dagster_type=Any),
            OutputDefinition(name='sql_text', dagster_type=SqlTextType),
        ],
    )
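# An illustrative call to the create_templated_sql_transform_solid factory
# above; the solid name, SQL template, and table argument are hypothetical and
# only show the shape of the arguments the factory validates.
sum_sql_solid = create_templated_sql_transform_solid(
    name='sum_table_transform',
    sql='CREATE TABLE {{sum_table}} AS SELECT num1 + num2 AS sum FROM num_table',
    table_arguments=['sum_table'],
)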
def test_execution_plan_create_metadata():
    solid_def = SolidDefinition(
        name='solid_metadata_creation',
        input_defs=[],
        output_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        config_field=Field(Dict({'str_value': Field(String)})),
        step_metadata_fn=lambda env_config: {
            'computed': env_config.solids['solid_metadata_creation'].config['str_value'] + '1'
        },
    )
    p_def = PipelineDefinition(name='test_metadata', solid_defs=[solid_def])
    execution_plan = create_execution_plan(
        p_def,
        environment_dict={
            'solids': {'solid_metadata_creation': {'config': {'str_value': 'foobar'}}}
        },
    )
    compute_step = execution_plan.get_step_by_key('solid_metadata_creation.compute')
    assert compute_step
    assert compute_step.metadata == {'computed': 'foobar1'}
def create_templated_sql_transform_solid(name, sql, table_arguments, dependant_solids=None):
    check.str_param(name, 'name')
    check.str_param(sql, 'sql')
    check.list_param(table_arguments, 'table_arguments', of_type=str)

    dependant_solids = check.opt_list_param(
        dependant_solids, 'dependant_solids', of_type=SolidDefinition
    )

    field_dict = {}
    for table in table_arguments:
        field_dict[table] = Field(types.String)

    return SolidDefinition(
        name=name,
        inputs=[InputDefinition(solid.name) for solid in dependant_solids],
        config_def=ConfigDefinition.config_dict(field_dict),
        transform_fn=_create_templated_sql_transform_with_output(sql),
        outputs=[OutputDefinition()],
    )
def test_required_solid_with_required_subfield():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config_schema={'required_field': String},
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *_args: None,
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)

    assert env_type.fields['solids'].is_required is True
    assert env_type.fields['solids'].config_type

    solids_type = env_type.fields['solids'].config_type
    assert solids_type.fields['int_config_solid'].is_required is True

    int_config_solid_type = solids_type.fields['int_config_solid'].config_type
    assert int_config_solid_type.fields['config'].is_required is True

    assert env_type.fields['execution'].is_required is False

    env_obj = EnvironmentConfig.build(
        pipeline_def,
        {'solids': {'int_config_solid': {'config': {'required_field': 'foobar'}}}},
    )

    assert env_obj.solids['int_config_solid'].config['required_field'] == 'foobar'

    res = process_config(env_type, {'solids': {}})
    assert not res.success

    res = process_config(env_type, {})
    assert not res.success
def test_required_solid_with_required_subfield():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config={'required_field': String},
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *_args: None,
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)

    assert env_type.fields['solids'].is_required is True
    assert env_type.fields['solids'].config_type

    solids_type = env_type.fields['solids'].config_type
    assert solids_type.fields['int_config_solid'].is_required is True

    int_config_solid_type = solids_type.fields['int_config_solid'].config_type
    assert int_config_solid_type.fields['config'].is_required is True

    assert env_type.fields['execution'].is_required is False

    env_obj = EnvironmentConfig.build(
        pipeline_def,
        {'solids': {'int_config_solid': {'config': {'required_field': 'foobar'}}}},
    )

    assert env_obj.solids['int_config_solid'].config['required_field'] == 'foobar'

    with pytest.raises(DagsterEvaluateConfigValueError):
        throwing_validate_config_value(env_type, {'solids': {}})

    with pytest.raises(DagsterEvaluateConfigValueError):
        throwing_validate_config_value(env_type, {})
def test_config_arg_mismatch():
    def _t_fn(*_args):
        raise Exception('should not reach')

    solid = SolidDefinition(
        name='solid_with_context',
        inputs=[],
        outputs=[],
        config_field=Field(Dict({'some_config': Field(String)})),
        compute_fn=_t_fn,
    )

    pipeline = PipelineDefinition(solids=[solid])

    with pytest.raises(PipelineConfigEvaluationError):
        execute_pipeline(
            pipeline,
            {'solids': {'solid_with_context': {'config': {'some_config': 1}}}},
        )
def test_provided_default_config():
    pipeline_def = PipelineDefinition(
        context_definitions={
            'some_context': PipelineContextDefinition(
                config_field=Field(
                    Dict({'with_default_int': Field(Int, is_optional=True, default_value=23434)})
                ),
                context_fn=lambda *args: None,
            )
        },
        solids=[
            SolidDefinition(
                name='some_solid', inputs=[], outputs=[], transform_fn=lambda *args: None
            )
        ],
    )

    env_type = pipeline_def.environment_type

    some_context_field = env_type.fields['context'].config_type.fields['some_context']
    assert some_context_field.is_optional

    some_context_config_field = some_context_field.config_type.fields['config']
    assert some_context_config_field.is_optional
    assert some_context_config_field.default_value == {'with_default_int': 23434}

    assert some_context_field.default_value == {
        'config': {'with_default_int': 23434},
        'resources': {},
        'persistence': {'file': {}},
    }

    value = construct_environment_config(
        throwing_evaluate_config_value(pipeline_def.environment_type, {})
    )
    assert value.context.name == 'some_context'
    assert env_type.type_attributes.is_system_config
# pylint: disable=unused-argument

from dagster import Int, Output, OutputDefinition, SolidDefinition, solid


# start_solid_definition_marker_0
@solid
def my_solid(context):
    return 1


# end_solid_definition_marker_0


# start_solid_definition_marker_1
def _return_one(_context, inputs):
    yield Output(1)


solid = SolidDefinition(
    name="my_solid",
    input_defs=[],
    output_defs=[OutputDefinition(Int)],
    compute_fn=_return_one,
)
# end_solid_definition_marker_1
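# A minimal execution sketch for the manually constructed SolidDefinition
# above, using the pre-0.13 execute_pipeline API that the other snippets in
# this collection rely on; the pipeline name is illustrative.
from dagster import PipelineDefinition, execute_pipeline

my_pipeline = PipelineDefinition(name="my_pipeline", solid_defs=[solid])
result = execute_pipeline(my_pipeline)
assert result.success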
def get_duplicate_solids():
    return (
        SolidDefinition("a_solid", [], lambda: None, []),
        SolidDefinition("a_solid", [], lambda: None, []),
    )
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g., if
            :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys", of_type=str
    )
    if output_notebook is not None:
        required_resource_keys.add("file_manager")
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(
            name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix
        ),
        output_defs=output_defs
        + (
            [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
            if output_notebook
            else []
        ),
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description="This solid is backed by the notebook at {path}".format(path=notebook_path),
        tags={"notebook_path": notebook_path, "kind": "ipynb"},
    )
def get_context(self, solid_config=None, mode_def=None, environment_dict=None):
    '''Get a dagstermill execution context for interactive exploration and development.

    Args:
        solid_config (Optional[Any]): If specified, this value will be made available on the
            context as its ``solid_config`` property.
        mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
            use to construct the context. Specify this if you would like a context constructed
            with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
            with a console logger will be constructed.
        environment_dict(Optional[dict]): The environment config dict with which to construct
            the context.

    Returns:
        :class:`dagstermill.DagstermillExecutionContext`
    '''
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)

    solid_def = SolidDefinition(
        name='this_solid',
        input_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        output_defs=[],
        description='Ephemeral solid constructed by dagstermill.get_context()',
    )

    if not mode_def:
        mode_def = ModeDefinition(logger_defs={'dagstermill': colored_console_logger})
        environment_dict['loggers'] = {'dagstermill': {}}

    pipeline_def = PipelineDefinition(
        [solid_def], mode_defs=[mode_def], name='ephemeral_dagstermill_pipeline'
    )

    run_id = str(uuid.uuid4())

    # construct stubbed PipelineRun for notebook exploration...
    # The actual pipeline run during pipeline execution will be serialized and reconstituted
    # in the `reconstitute_pipeline_context` call
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=run_id,
        environment_dict=environment_dict,
        mode=mode_def.name,
        reexecution_config=None,
        selector=None,
        step_keys_to_execute=None,
        status=PipelineRunStatus.NOT_STARTED,
        tags=None,
    )

    self.in_pipeline = False
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        pipeline_run,
        instance=DagsterInstance.ephemeral(),
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(pipeline_context, solid_config)

    return self.context
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
    description=None,
    tags=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g., if
            :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys", of_type=str
    )
    if output_notebook is not None:
        required_resource_keys.add("file_manager")
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    default_description = f"This solid is backed by the notebook at {notebook_path}"
    description = check.opt_str_param(description, "description", default=default_description)

    user_tags = validate_tags(tags)
    if tags is not None:
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(
            name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix
        ),
        output_defs=output_defs
        + (
            [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
            if output_notebook
            else []
        ),
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={**user_tags, **default_tags},
    )
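# A hedged usage sketch of the define_dagstermill_solid factory defined above.
# The notebook path, input/output names, and user tag are hypothetical;
# InputDefinition and OutputDefinition are assumed to be imported from dagster
# as in the surrounding snippets.
clean_data_solid = define_dagstermill_solid(
    name="clean_data",
    notebook_path="notebooks/clean_data.ipynb",  # hypothetical path
    input_defs=[InputDefinition("raw_data")],
    # The notebook is expected to call dagstermill.yield_result for this output.
    output_defs=[OutputDefinition(name="cleaned")],
    tags={"team": "data-eng"},  # merged with the reserved notebook_path/kind tags
)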
def test_basic_solids_config():
    pipeline_def = PipelineDefinition(
        name='BasicSolidsConfigPipeline',
        solids=[
            SolidDefinition(
                name='required_field_solid',
                inputs=[],
                outputs=[],
                config_field=Field(Dict(fields={'required_int': Field(Int)})),
                compute_fn=lambda *_args: fail_me(),
            )
        ],
    )

    env_config_type = create_environment_type(pipeline_def)

    assert env_config_type.fields['solids'].is_optional is False
    solids_config_type = env_config_type.fields['solids'].config_type
    assert solids_config_type.fields['required_field_solid'].is_optional is False
    required_solid_config_type = solids_config_type.fields['required_field_solid'].config_type
    assert required_solid_config_type.fields['config'].is_optional is False

    assert set(env_config_type.fields['loggers'].config_type.fields.keys()) == set(['console'])

    console_logger_config_type = env_config_type.fields['loggers'].config_type.fields['console']
    assert set(console_logger_config_type.config_type.fields.keys()) == set(['config'])

    assert console_logger_config_type.config_type.fields['config'].is_optional
    console_logger_config_config_type = console_logger_config_type.config_type.fields[
        'config'
    ].config_type
    assert set(console_logger_config_config_type.fields.keys()) == set(['log_level', 'name'])

    assert scaffold_pipeline_config(pipeline_def, skip_optional=False) == {
        'loggers': {'console': {'config': {'log_level': '', 'name': ''}}},
        'solids': {'required_field_solid': {'config': {'required_int': 0}}},
        'expectations': {'evaluate': True},
        'execution': {},
        'resources': {},
        'storage': {
            'filesystem': {'base_dir': ''},
            'in_memory': {},
            's3': {'s3_bucket': ''},
        },
    }
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config=None,
    required_resource_keys=None,
    output_notebook=None,
    config_schema=None,
):
    '''Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.Materialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline system storage, so, e.g., if :py:class:`~dagster_aws.s3.s3_system_storage`
            is configured, the output will be a :py:class:`~dagster_aws.s3.S3FileHandle`.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    input_defs = check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, 'output_defs', of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name, notebook_path, output_notebook),
        output_defs=output_defs
        + (
            [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
            if output_notebook
            else []
        ),
        config_schema=canonicalize_backcompat_args(
            check_user_facing_opt_config_param(config_schema, 'config_schema'),
            'config_schema',
            check_user_facing_opt_config_param(config, 'config'),
            'config',
            '0.9.0',
        ),
        required_resource_keys=required_resource_keys,
        description='This solid is backed by the notebook at {path}'.format(path=notebook_path),
        tags={'notebook_path': notebook_path, 'kind': 'ipynb'},
    )
def sql_solid(name, select_statement, materialization_strategy, table_name=None, input_defs=None):
    '''Return a new solid that executes and materializes a SQL select statement.

    Args:
        name (str): The name of the new solid.
        select_statement (str): The select statement to execute.
        materialization_strategy (str): Must be 'table', the only currently supported
            materialization strategy. If 'table', the kwarg `table_name` must also be passed.

    Kwargs:
        table_name (str): The name of the new table to create, if the materialization strategy
            is 'table'. Default: None.
        input_defs (list[InputDefinition]): Inputs, if any, for the new solid. Default: None.

    Returns:
        function: The new SQL solid.
    '''
    input_defs = check.opt_list_param(input_defs, 'input_defs', InputDefinition)

    materialization_strategy_output_types = {  # pylint:disable=C0103
        'table': SqlTableName,
        # 'view': String,
        # 'query': SqlAlchemyQueryType,
        # 'subquery': SqlAlchemySubqueryType,
        # 'result_proxy': SqlAlchemyResultProxyType,
        # could also materialize as a Pandas table, as a Spark table, as an intermediate file, etc.
    }

    if materialization_strategy not in materialization_strategy_output_types:
        raise Exception(
            'Invalid materialization strategy {materialization_strategy}, must '
            'be one of {materialization_strategies}'.format(
                materialization_strategy=materialization_strategy,
                materialization_strategies=str(list(materialization_strategy_output_types.keys())),
            )
        )

    if materialization_strategy == 'table':
        if table_name is None:
            raise Exception('Missing table_name: required for materialization strategy \'table\'')

    output_description = (
        'The string name of the new table created by the solid'
        if materialization_strategy == 'table'
        else 'The materialized SQL statement. If the materialization_strategy is '
        '\'table\', this is the string name of the new table created by the solid.'
    )

    description = '''This solid executes the following SQL statement:
    {select_statement}'''.format(select_statement=select_statement)

    # n.b., we will eventually want to make this resources key configurable
    sql_statement = (
        'drop table if exists {table_name};\n' 'create table {table_name} as {select_statement};'
    ).format(table_name=table_name, select_statement=select_statement)

    def compute_fn(context, _inputs):
        '''Inner function defining the new solid.

        Args:
            context (ComputeExecutionContext): Must expose a `db` resource with an `execute`
                method, like a SQLAlchemy engine, that can execute raw SQL against a database.

        Returns:
            str: The table name of the newly materialized SQL select statement.
        '''
        context.log.info(
            'Executing sql statement:\n{sql_statement}'.format(sql_statement=sql_statement)
        )
        context.resources.db_info.engine.execute(text(sql_statement))
        yield Output(value=table_name, output_name='result')

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        output_defs=[
            OutputDefinition(
                materialization_strategy_output_types[materialization_strategy],
                description=output_description,
            )
        ],
        compute_fn=compute_fn,
        description=description,
        metadata={'kind': 'sql', 'sql': sql_statement},
    )
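# A hypothetical invocation of the sql_solid factory above; the statement and
# table name are illustrative. The resulting solid expects a `db_info` resource
# exposing a SQLAlchemy engine, as noted in compute_fn.
sum_table_solid = sql_solid(
    name='sum_table',
    select_statement='select num1 + num2 as sum from num_table',
    materialization_strategy='table',
    table_name='sum_table',
)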
def test_whole_environment():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        mode_defs=[
            ModeDefinition(
                name='test_mode',
                resource_defs={
                    'test_resource': ResourceDefinition(
                        resource_fn=lambda: None, config_field=Field(Any)
                    )
                },
            )
        ],
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config_field=Field(Int),
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *args: None,
            ),
            SolidDefinition(
                name='no_config_solid',
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *args: None,
            ),
        ],
    )

    environment_type = create_environment_type(pipeline_def)

    assert (
        environment_type.fields['resources'].config_type.name
        == 'SomePipeline.Mode.TestMode.Resources'
    )
    solids_type = environment_type.fields['solids'].config_type
    assert solids_type.name == 'SomePipeline.SolidsConfigDictionary'
    assert (
        solids_type.fields['int_config_solid'].config_type.name
        == 'SomePipeline.SolidConfig.IntConfigSolid'
    )

    env = EnvironmentConfig.from_config_value(
        throwing_evaluate_config_value(
            environment_type,
            {
                'resources': {'test_resource': {'config': 1}},
                'solids': {'int_config_solid': {'config': 123}},
            },
        ),
        {
            'resources': {'test_resource': {'config': 1}},
            'solids': {'int_config_solid': {'config': 123}},
        },
    )

    assert isinstance(env, EnvironmentConfig)
    assert env.solids == {'int_config_solid': SolidConfig(123)}
    assert env.resources == {'test_resource': {'config': 1}}
def test_multiple_outputs_only_emit_one():
    def _t_fn(*_args):
        yield Result(output_name='output_one', value='foo')

    solid = SolidDefinition(
        name='multiple_outputs',
        inputs=[],
        outputs=[
            OutputDefinition(name='output_one'),
            OutputDefinition(name='output_two'),
        ],
        transform_fn=_t_fn,
    )

    called = {}

    def _transform_fn_one(*_args, **_kwargs):
        called['one'] = True

    downstream_one = SolidDefinition(
        name='downstream_one',
        inputs=[InputDefinition('some_input')],
        outputs=[],
        transform_fn=_transform_fn_one,
    )

    def _transform_fn_two(*_args, **_kwargs):
        raise Exception('do not call me')

    downstream_two = SolidDefinition(
        name='downstream_two',
        inputs=[InputDefinition('some_input')],
        outputs=[],
        transform_fn=_transform_fn_two,
    )

    pipeline = PipelineDefinition(
        solids=[solid, downstream_one, downstream_two],
        dependencies={
            'downstream_one': {
                'some_input': DependencyDefinition(solid.name, output='output_one')
            },
            'downstream_two': {
                'some_input': DependencyDefinition(solid.name, output='output_two')
            },
        },
    )

    result = execute_pipeline(pipeline)

    assert result.success
    assert called['one']

    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.transformed_values.keys()) == set(['output_one'])

    with pytest.raises(
        DagsterInvariantViolationError, match='not_defined not defined in solid multiple_outputs'
    ):
        solid_result.transformed_value('not_defined')

    with pytest.raises(DagsterInvariantViolationError, match='Did not find result output_two'):
        solid_result.transformed_value('output_two')

    with pytest.raises(
        DagsterInvariantViolationError,
        match='Try to get result for solid not_present in <<unnamed>>. No such solid.',
    ):
        result.result_for_solid('not_present')

    with pytest.raises(
        DagsterInvariantViolationError,
        match='Did not find result for solid downstream_two in pipeline execution result',
    ):
        result.result_for_solid('downstream_two')
def create_solid_with_deps(name, *solid_deps):
    inputs = [InputDefinition(solid_dep.name) for solid_dep in solid_deps]

    return SolidDefinition(
        name=name,
        inputs=inputs,
        transform_fn=_transform_fn,
        outputs=[OutputDefinition()],
    )
def get_context(self, solid_config=None, mode_def=None, run_config=None):
    """Get a dagstermill execution context for interactive exploration and development.

    Args:
        solid_config (Optional[Any]): If specified, this value will be made available on the
            context as its ``solid_config`` property.
        mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
            use to construct the context. Specify this if you would like a context constructed
            with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
            with a console logger will be constructed.
        run_config(Optional[dict]): The environment config dict with which to construct
            the context.

    Returns:
        :py:class:`~dagstermill.DagstermillExecutionContext`
    """
    check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)

    # If we are running non-interactively, and there is already a context reconstituted, return
    # that context rather than overwriting it.
    if self.context is not None and isinstance(self.context, DagstermillRuntimeExecutionContext):
        return self.context

    if not mode_def:
        mode_def = ModeDefinition(logger_defs={"dagstermill": colored_console_logger})
        run_config["loggers"] = {"dagstermill": {}}

    solid_def = SolidDefinition(
        name="this_solid",
        input_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        output_defs=[],
        description="Ephemeral solid constructed by dagstermill.get_context()",
        required_resource_keys=mode_def.resource_key_set,
    )

    pipeline_def = PipelineDefinition(
        [solid_def], mode_defs=[mode_def], name="ephemeral_dagstermill_pipeline"
    )

    run_id = make_new_run_id()

    # construct stubbed PipelineRun for notebook exploration...
    # The actual pipeline run during pipeline execution will be serialized and reconstituted
    # in the `reconstitute_pipeline_context` call
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=run_id,
        run_config=run_config,
        mode=mode_def.name,
        step_keys_to_execute=None,
        status=PipelineRunStatus.NOT_STARTED,
        tags=None,
    )

    self.in_pipeline = False
    self.solid_def = solid_def
    self.pipeline = pipeline_def

    environment_config = EnvironmentConfig.build(pipeline_def, run_config, mode=mode_def.name)

    pipeline = InMemoryPipeline(pipeline_def)
    execution_plan = ExecutionPlan.build(pipeline, environment_config)

    with scoped_pipeline_context(
        execution_plan,
        pipeline,
        run_config,
        pipeline_run,
        DagsterInstance.ephemeral(),
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(
            pipeline_context=pipeline_context,
            pipeline_def=pipeline_def,
            solid_config=solid_config,
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                environment_config,
                pipeline_context.intermediate_storage_def,
            ),
            solid_name=solid_def.name,
        )

    return self.context
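# A sketch of interactive use of get_context from a notebook session (assumes
# the dagstermill package is importable); the solid_config value and log
# message are illustrative.
import dagstermill

context = dagstermill.get_context(solid_config={"date": "2020-01-01"})
context.log.info("solid_config is {}".format(context.solid_config))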
def test_basic_solids_config():
    pipeline_def = PipelineDefinition(
        name='BasicSolidsConfigPipeline',
        solids=[
            SolidDefinition(
                name='required_field_solid',
                inputs=[],
                outputs=[],
                config_field=Field(Dict(fields={'required_int': Field(Int)})),
                transform_fn=lambda *_args: fail_me(),
            )
        ],
    )

    env_config_type = pipeline_def.environment_type

    assert env_config_type.fields['solids'].is_optional is False
    solids_config_type = env_config_type.fields['solids'].config_type
    assert solids_config_type.fields['required_field_solid'].is_optional is False
    required_solid_config_type = solids_config_type.fields['required_field_solid'].config_type
    assert required_solid_config_type.fields['config'].is_optional is False

    context_config_type = env_config_type.fields['context'].config_type
    assert 'default' in context_config_type.fields
    assert context_config_type.fields['default'].is_optional

    default_context_config_type = context_config_type.fields['default'].config_type
    assert set(default_context_config_type.fields.keys()) == set(
        ['config', 'resources', 'persistence']
    )

    default_context_user_config_type = default_context_config_type.fields['config'].config_type
    assert set(default_context_user_config_type.fields.keys()) == set(['log_level'])

    assert scaffold_pipeline_config(pipeline_def, skip_optional=False) == {
        'context': {
            'default': {
                'config': {'log_level': ''},
                'persistence': {'file': {}},
                'resources': {},
            }
        },
        'solids': {'required_field_solid': {'config': {'required_int': 0}}},
        'expectations': {'evaluate': True},
        'execution': {},
    }
def define_dagstermill_solid(
    name,
    notebook_path,
    inputs=None,
    outputs=None,
    config_def=None,
):
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    inputs = check.opt_list_param(inputs, 'input_defs', of_type=InputDefinition)
    outputs = check.opt_list_param(outputs, 'output_defs', of_type=OutputDefinition)

    do_cleanup = False  # for now

    def _t_fn(info, inputs):
        if not os.path.exists('/tmp/dagstermill/'):
            os.mkdir('/tmp/dagstermill/')

        temp_path = '/tmp/dagstermill/{prefix}-out.ipynb'.format(prefix=str(uuid.uuid4()))

        try:
            _source_nb = pm.execute_notebook(
                notebook_path,
                temp_path,
                parameters=dict(
                    inputs=serialize_dm_object(inputs),
                    config=serialize_dm_object(info.config),
                ),
            )

            output_nb = pm.read_notebook(temp_path)

            info.context.debug(
                'Notebook execution complete for {name}. Data is {data}'.format(
                    name=name,
                    data=output_nb.data,
                )
            )

            for output_def in info.solid_def.output_defs:
                if output_def.name in output_nb.data:
                    yield Result(
                        deserialize_dm_object(output_nb.data[output_def.name]),
                        output_def.name,
                    )
        finally:
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)

    return SolidDefinition(
        name=name,
        inputs=inputs,
        transform_fn=_t_fn,
        outputs=outputs,
        config_def=config_def,
        description='This solid is backed by the notebook at {path}'.format(path=notebook_path),
        metadata={'notebook_path': notebook_path},
    )