def test_cycle_detect():
    """Both PipelineDefinition and CompositeSolidDefinition must reject a
    dependency graph that contains a cycle."""

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add(a, b):
        return a + b

    def make_circular_deps():
        # 'first' consumes 'second' and 'second' consumes 'first' -> cycle.
        return {
            SolidInvocation('add', alias='first'): {
                'a': DependencyDefinition('return_one'),
                'b': DependencyDefinition('second'),
            },
            SolidInvocation('add', alias='second'): {
                'a': DependencyDefinition('first'),
                'b': DependencyDefinition('return_one'),
            },
        }

    with pytest.raises(DagsterInvalidDefinitionError, match="Circular dependencies exist"):
        PipelineDefinition(
            solid_defs=[return_one, add],
            dependencies=make_circular_deps(),
        )

    with pytest.raises(DagsterInvalidDefinitionError, match="Circular dependencies exist"):
        CompositeSolidDefinition(
            name='circletron',
            solid_defs=[return_one, add],
            dependencies=make_circular_deps(),
        )
def test_cycle_detect():
    """Circular dependencies are rejected by both pipeline and composite-solid
    constructors."""

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add(a, b):
        return a + b

    def make_circular_deps():
        # Mutually-dependent aliases of 'add' form the cycle under test.
        return {
            SolidInvocation("add", alias="first"): {
                "a": DependencyDefinition("return_one"),
                "b": DependencyDefinition("second"),
            },
            SolidInvocation("add", alias="second"): {
                "a": DependencyDefinition("first"),
                "b": DependencyDefinition("return_one"),
            },
        }

    with pytest.raises(DagsterInvalidDefinitionError, match="Circular dependencies exist"):
        PipelineDefinition(
            solid_defs=[return_one, add],
            dependencies=make_circular_deps(),
        )

    with pytest.raises(DagsterInvalidDefinitionError, match="Circular dependencies exist"):
        CompositeSolidDefinition(
            name="circletron",
            solid_defs=[return_one, add],
            dependencies=make_circular_deps(),
        )
def test_solid_with_input():
    """A solid's declared input is satisfied via a dependency on a stub solid."""

    @lambda_solid(inputs=[InputDefinition(name="foo_to_foo")])
    def hello_world(foo_to_foo):
        # Pass the upstream value straight through.
        return foo_to_foo

    pipe = PipelineDefinition(
        solids=[define_stub_solid('test_value', {'foo': 'bar'}), hello_world],
        dependencies={'hello_world': {'foo_to_foo': DependencyDefinition('test_value')}},
    )

    solid_result = execute_pipeline(pipe).result_for_solid('hello_world')
    assert solid_result.success
    assert solid_result.transformed_value()['foo'] == 'bar'
def test_single_node_passing_json_config_expectations():
    """The expectation-configured sum solid still produces the expected sum column."""
    in_df = pd.DataFrame.from_dict({'num1': [1, 3], 'num2': [2, 4]})

    pipe = PipelineDefinition(
        solids=[define_stub_solid('value', in_df), sum_solid_expectations_config],
        dependencies={
            sum_solid_expectations_config.name: {'num_df': DependencyDefinition('value')}
        },
    )

    pipeline_result = execute_pipeline(pipe)
    assert pipeline_result.success

    results = pipeline_result.result_list
    assert len(results) == 2

    sum_result = results[1]
    assert sum_result.success
    expected = {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }
    assert sum_result.transformed_value().to_dict('list') == expected
def test_invalid_input_dependency():
    """Wiring a Nothing-typed output into an Int-typed input fails at definition time."""

    @lambda_solid(output_def=OutputDefinition(Nothing))
    def do_nothing():
        pass

    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    # Type mismatch (Nothing -> Int) must be rejected eagerly.
    with pytest.raises(DagsterInvalidDefinitionError):
        PipelineDefinition(
            name='bad_dep',
            solid_defs=[do_nothing, add_one],
            dependencies={'add_one': {'num': DependencyDefinition('do_nothing')}},
        )
def define_rerun_pipeline():
    """Pipeline that re-populates the derived sum tables; assumes setup already ran."""
    sum_table = _get_project_solid('insert_into_sum_table', inputs=None)
    # The sum-of-squares table consumes the sum table's output as a dep-only input.
    sum_sq_table = _get_project_solid(
        'insert_into_sum_sq_table',
        inputs=[InputDefinition(sum_table.name)],
    )

    return dagster.PipelineDefinition(
        name='rerun_pipeline',
        description='Rerun the pipeline, populating the derived tables. Assumes pipeline is setup',
        solids=[sum_table, sum_sq_table],
        dependencies={
            sum_sq_table.name: {sum_table.name: DependencyDefinition(sum_table.name)}
        },
    )
def test_invalid_input_dependency():
    """A Nothing output feeding a typed (Int) input is an invalid definition."""

    @lambda_solid(output_def=OutputDefinition(Nothing))
    def do_nothing():
        pass

    @lambda_solid(input_defs=[InputDefinition("num", Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    # Definition-time validation must catch the Nothing -> Int mismatch.
    with pytest.raises(DagsterInvalidDefinitionError):
        PipelineDefinition(
            name="bad_dep",
            solid_defs=[do_nothing, add_one],
            dependencies={"add_one": {"num": DependencyDefinition("do_nothing")}},
        )
def test_user_error_propogation():
    """A user exception raised in a solid surfaces as DagsterExecutionStepExecutionError
    with the original exception chained as __cause__ and step/solid metadata attached.

    NOTE(review): 'propogation' is a typo for 'propagation' in the test and
    pipeline names; left unchanged because the names are part of the test's identity.
    """
    err_msg = 'the user has errored'

    class UserError(Exception):
        pass

    @lambda_solid
    def throws_user_error():
        raise UserError(err_msg)

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition('num')])
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        name='test_user_error_propogation',
        solid_defs=[throws_user_error, return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )

    with pytest.raises(DagsterExecutionStepExecutionError) as e_info:
        execute_pipeline(pipeline_def)

    err = e_info.value
    assert isinstance(err.__cause__, UserError)

    # Metadata carried on the exception object itself.
    assert err.step_key == 'throws_user_error.compute'
    assert err.solid_name == 'throws_user_error'
    assert err.solid_def_name == 'throws_user_error'

    # ...and echoed in the rendered message.
    message = str(err)
    assert 'step key: "throws_user_error.compute"' in message
    assert 'solid invocation: "throws_user_error"' in message
    assert 'solid definition: "throws_user_error"' in message

    # Python 2 has no exception chaining, so the cause text must appear inline.
    if sys.version_info[0] == 2:
        assert err_msg in message
def retry_pipeline():
    """Two-solid pipeline whose first step can be told to fail via solid config."""

    @solid(
        config_schema={"fail": Field(bool, is_required=False, default_value=False)},
    )
    def return_one(context):
        # Failure knob for retry tests; defaults to succeeding.
        if context.solid_config["fail"]:
            raise Exception("FAILURE")
        return 1

    @solid
    def add_one(num):
        return num + 1

    return PipelineDefinition(
        name="test",
        solid_defs=[return_one, add_one],
        dependencies={"add_one": {"num": DependencyDefinition("return_one")}},
        mode_defs=[default_mode_def_for_test],
    )
def test_single_solid_with_single():
    """Execute a single solid from a two-solid pipeline, supplying its input directly."""

    @lambda_solid
    def solid_one():
        return 1

    @lambda_solid(inputs=[InputDefinition(name='num')])
    def add_one_solid(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solids=[solid_one, add_one_solid],
        dependencies={'add_one_solid': {'num': DependencyDefinition('solid_one')}},
    )

    # The explicitly provided input value (2) is what the solid computes from.
    solid_result = execute_solid(pipeline_def, 'add_one_solid', inputs={'num': 2})
    assert solid_result.success
    assert solid_result.transformed_value() == 3
def construct_graph_with_yaml(yaml_file, op_defs) -> GraphDefinition:
    """Build a GraphDefinition from a YAML spec describing ops, aliases, and deps."""
    yaml_data = load_yaml_from_path(yaml_file)

    deps = {}
    for op_yaml_data in yaml_data["ops"]:
        def_name = op_yaml_data["def"]
        alias = op_yaml_data.get("alias", def_name)
        # Map each input name to its upstream op; output defaults to "result".
        deps[NodeInvocation(name=def_name, alias=alias)] = {
            input_name: DependencyDefinition(
                solid=input_data["op"],
                output=input_data.get("output", "result"),
            )
            for input_name, input_data in op_yaml_data.get("deps", {}).items()
        }

    return GraphDefinition(
        name=yaml_data["name"],
        description=yaml_data.get("description"),
        node_defs=op_defs,
        dependencies=deps,
    )
def test_spark_data_frame_serialization_file_system():
    """A Spark DataFrame output round-trips through filesystem intermediate storage."""
    csv_path = os.path.join(os.path.dirname(__file__), 'data/test.csv')
    with open(csv_path, 'rb') as fd:
        input_csv_file = fd.read()

    @lambda_solid
    def nonce():
        # Source solid that hands the raw CSV bytes downstream.
        return input_csv_file

    pipeline_def = PipelineDefinition(
        [nonce, ingest_csv_to_spark],
        dependencies={'ingest_csv_to_spark': {'input_csv_file': DependencyDefinition('nonce')}},
        mode_definitions=[spark_mode],
    )

    run_id = str(uuid.uuid4())
    intermediate_store = FileSystemIntermediateStore(run_id=run_id)

    result = execute_pipeline(
        pipeline_def,
        run_config=RunConfig(
            run_id=run_id,
            storage_mode=RunStorageMode.FILESYSTEM,
            mode='spark',
        ),
    )
    assert result.success

    # Spark writes a _SUCCESS marker alongside the parquet result files.
    result_dir = os.path.join(
        intermediate_store.root, 'intermediates', 'ingest_csv_to_spark.compute', 'result'
    )
    assert '_SUCCESS' in os.listdir(result_dir)

    spark = spark_mode.resource_defs['spark'].resource_fn(None)
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == '1'
def test_reexecution():
    """A pipeline can be re-executed reusing a prior run's stored step output."""

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )

    first_result = execute_pipeline(
        pipeline_def, environment_dict={'storage': {'filesystem': {}}}
    )
    assert first_result.success
    assert first_result.result_for_solid('add_one').output_value() == 2

    # Reuse the stored output of return_one.compute from the first run.
    rerun_config = RunConfig(
        reexecution_config=ReexecutionConfig(
            previous_run_id=first_result.run_id,
            step_output_handles=[StepOutputHandle('return_one.compute')],
        )
    )
    second_result = execute_pipeline(
        pipeline_def,
        environment_dict={'storage': {'filesystem': {}}},
        run_config=rerun_config,
    )
    assert second_result.success
    assert len(second_result.solid_result_list) == 2
    assert second_result.result_for_solid('return_one').output_value() == 1
    assert second_result.result_for_solid('add_one').output_value() == 2
def construct_lakehouse_pipeline(name, lakehouse_tables, resources, preset_defs=None):
    '''Dynamically construct a pipeline from table definitions, wiring tables
    together by matching input table types to the solids that produce them.'''
    check.list_param(lakehouse_tables, 'lakehouse_tables', of_type=LakehouseTableDefinition)
    check.dict_param(resources, 'resources')

    # Map each table's (unique) output type name to the solid producing it.
    type_to_solid = {}
    for table in lakehouse_tables:
        output_type_name = table.output_defs[0].runtime_type.name
        check.invariant(
            output_type_name not in type_to_solid,
            'Duplicate Lakehouse output names "{}"'.format(output_type_name),
        )
        type_to_solid[output_type_name] = table

    # Each table input depends on the solid whose output type matches it.
    dependencies = defaultdict(dict)
    for table in lakehouse_tables:
        for input_def in table.input_tables:
            input_type_name = input_def.runtime_type.name
            check.invariant(input_type_name in type_to_solid)
            dependencies[table.name][input_def.name] = DependencyDefinition(
                type_to_solid[input_type_name].name
            )

    # Pass ResourceDefinitions through; wrap bare values as hardcoded resources.
    resource_defs = {
        key: resource
        if isinstance(resource, ResourceDefinition)
        else ResourceDefinition.hardcoded_resource(resource)
        for key, resource in resources.items()
    }

    return PipelineDefinition(
        name=name,
        mode_defs=[ModeDefinition(resource_defs=resource_defs)],
        solid_defs=lakehouse_tables,
        dependencies=dependencies,
        preset_defs=preset_defs,
    )
def test_reexecution():
    """Re-execute a run on the same ephemeral instance via previous_run_id."""

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )
    instance = DagsterInstance.ephemeral()

    first_result = execute_pipeline(
        pipeline_def,
        environment_dict={'storage': {'filesystem': {}}},
        instance=instance,
    )
    assert first_result.success
    assert first_result.result_for_solid('add_one').output_value() == 2

    # Point the second run at the first via previous_run_id on the same instance.
    second_result = execute_pipeline(
        pipeline_def,
        environment_dict={'storage': {'filesystem': {}}},
        run_config=RunConfig(previous_run_id=first_result.run_id),
        instance=instance,
    )
    assert second_result.success
    assert len(second_result.solid_result_list) == 2
    assert second_result.result_for_solid('return_one').output_value() == 1
    assert second_result.result_for_solid('add_one').output_value() == 2
def test_execution_plan():
    """A Nothing-typed dependency still orders steps in the execution plan."""

    @solid(outputs=[OutputDefinition(Nothing)])
    def emit_nothing(_context):
        yield Materialization(path='/path/')

    @lambda_solid(inputs=[InputDefinition('ready', Nothing)])
    def consume_nothing():
        pass

    pipe = PipelineDefinition(
        name='execution_plan_test',
        solids=[emit_nothing, consume_nothing],
        dependencies={'consume_nothing': {'ready': DependencyDefinition('emit_nothing')}},
    )

    levels = create_execution_plan(pipe).topological_step_levels()
    # Producer must be scheduled strictly before the consumer.
    assert 'emit_nothing' in levels[0][0].key
    assert 'consume_nothing' in levels[1][0].key

    assert execute_pipeline(pipe).success
def test_single_step_reexecution():
    """Re-executing only 'add_one.compute' runs just that step.

    The skipped solid ('return_one') reports no output value in the
    re-execution result; 'add_one' reuses the stored intermediate.
    """

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.ephemeral()

    pipeline_result = execute_pipeline(pipeline_def, environment_dict, instance=instance)
    assert pipeline_result.success
    assert pipeline_result.result_for_solid('add_one').output_value() == 2

    # This is how this is actually done in
    # dagster_graphql.implementation.pipeline_execution_manager
    reexecution_pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        step_keys_to_execute=['add_one.compute'],
        parent_run_id=pipeline_result.run_id,
        root_run_id=pipeline_result.run_id,
    )

    reexecution_result = execute_run(pipeline_def, reexecution_pipeline_run, instance)

    assert reexecution_result.success
    # Fixed: compare against None with `is`, not `==` (PEP 8 / E711).
    assert reexecution_result.result_for_solid('return_one').output_value() is None
    assert reexecution_result.result_for_solid('add_one').output_value() == 2
def test_execution_plan():
    """Nothing-typed dependency orders emit before consume in the plan levels."""

    @solid(output_defs=[OutputDefinition(Nothing)])
    def emit_nothing(_context):
        yield AssetMaterialization.file(path="/path/")

    @lambda_solid(input_defs=[InputDefinition("ready", Nothing)])
    def consume_nothing():
        pass

    pipe = PipelineDefinition(
        name="execution_plan_test",
        solid_defs=[emit_nothing, consume_nothing],
        dependencies={"consume_nothing": {"ready": DependencyDefinition("emit_nothing")}},
    )

    levels = create_execution_plan(pipe).get_steps_to_execute_by_level()
    # Producer must come strictly before the consumer.
    assert "emit_nothing" in levels[0][0].key
    assert "consume_nothing" in levels[1][0].key

    assert execute_pipeline(pipe).success
def test_solid_with_input():
    """A declared input is satisfied through a dependency on a stub solid."""

    @lambda_solid(input_defs=[InputDefinition(name="foo_to_foo")])
    def hello_world(foo_to_foo):
        # Identity pass-through of the upstream value.
        return foo_to_foo

    pipeline = PipelineDefinition(
        solid_defs=[define_stub_solid("test_value", {"foo": "bar"}), hello_world],
        dependencies={"hello_world": {"foo_to_foo": DependencyDefinition("test_value")}},
    )

    solid_result = execute_pipeline(pipeline).result_for_solid("hello_world")
    assert solid_result.success
    assert solid_result.output_value()["foo"] == "bar"
def test_wrong_output_value():
    """Returning a non-DataFrame from a DataFrame-typed output fails the type check."""
    csv_input = InputDefinition('num_csv', dagster_pd.DataFrame)

    @lambda_solid(
        name="test_wrong_output",
        inputs=[csv_input],
        output=OutputDefinition(dagster_pd.DataFrame),
    )
    def df_solid(num_csv):
        # Deliberately violates the declared DataFrame output type.
        return 'not a dataframe'

    pass_solid = define_stub_solid('pass_solid', pd.DataFrame())

    pipeline = PipelineDefinition(
        solid_defs=[pass_solid, df_solid],
        dependencies={'test_wrong_output': {'num_csv': DependencyDefinition('pass_solid')}},
    )

    with pytest.raises(DagsterTypeCheckError):
        execute_pipeline(pipeline)
def test_execute_solid_with_dep_only_inputs_no_api():
    """Dep-only (no data) inputs: both transforms run and record their side effects."""
    did_run_dict = {}

    step_one_solid = single_output_transform(
        name='step_one_solid',
        inputs=[],
        transform_fn=lambda context, args: _set_key_value(did_run_dict, 'step_one', True),
        output=OutputDefinition(),
    )
    step_two_solid = single_output_transform(
        name='step_two_solid',
        inputs=[InputDefinition('step_one_solid')],
        transform_fn=lambda context, args: _set_key_value(did_run_dict, 'step_two', True),
        output=OutputDefinition(),
    )

    pipeline = PipelineDefinition(
        solids=[step_one_solid, step_two_solid],
        dependencies={
            'step_two_solid': {'step_one_solid': DependencyDefinition('step_one_solid')}
        },
    )

    pipeline_result = execute_pipeline(pipeline)

    assert pipeline_result.success
    assert all(result.success for result in pipeline_result.solid_result_list)
    # Side effects prove both transform functions actually executed.
    assert did_run_dict['step_one'] is True
    assert did_run_dict['step_two'] is True
def execute_solid(
    solid_def, mode_def=None, input_values=None, environment_dict=None, run_config=None
):
    '''Independently execute an individual solid without having to specify a pipeline.

    In-memory input values may be passed in directly, which makes this very
    useful for unit test cases.
    '''
    check.inst_param(solid_def, 'solid_def', ISolidDefinition)
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    input_values = check.opt_dict_param(input_values, 'input_values', key_type=str)

    def create_value_solid(input_name, input_value):
        # One-off source solid, named after the input it feeds, emitting the value.
        @lambda_solid(name=input_name)
        def input_solid():
            return input_value

        return input_solid

    solid_defs = [solid_def]
    dependencies = defaultdict(dict)
    for input_name, input_value in input_values.items():
        dependencies[solid_def.name][input_name] = DependencyDefinition(input_name)
        solid_defs.append(create_value_solid(input_name, input_value))

    result = execute_pipeline(
        PipelineDefinition(
            name='ephemeral_{}_solid_pipeline'.format(solid_def.name),
            solid_defs=solid_defs,
            dependencies=dependencies,
            mode_defs=[mode_def] if mode_def else None,
        ),
        environment_dict=environment_dict,
        run_config=run_config,
    )
    return result.result_for_handle(solid_def.name)
def _get_solid_deps_and_defs(self, assets_to_update, include_nothing_input=False):
    """Return (solid defs keyed by asset path, dependency dict) for the given assets.

    Every asset must carry a computation; dependencies are restricted to
    assets that are themselves in assets_to_update.
    """
    solid_defs = {}
    for asset in assets_to_update:
        if not asset.computation:
            check.failed("All elements of assets_to_update must have computations")
        solid_defs[asset.path] = self.get_computed_asset_solid_def(
            asset, assets_to_update, include_nothing_input
        )

    solid_deps = {}
    for asset in assets_to_update:
        if not asset.computation:
            continue
        entry = {}
        for dep in asset.computation.deps.values():
            # Only wire deps whose upstream asset is also being updated.
            if dep.asset in assets_to_update:
                upstream_name = solid_defs[dep.asset.path].name
                entry[upstream_name] = DependencyDefinition(upstream_name)
        solid_deps[solid_defs[asset.path].name] = entry

    return solid_defs, solid_deps
def define_inty_pipeline():
    """Pipeline of Int-typed solids plus one solid that always raises."""

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @lambda_solid
    def user_throw_exception():
        raise Exception('whoops')

    return PipelineDefinition(
        name='basic_external_plan_execution',
        solid_defs=[return_one, add_one, user_throw_exception],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )
def test_sql_populate_tables():
    """create_all_tables then populate_num_table; verify rows landed in num_table."""
    create_solid = _get_project_solid('create_all_tables')
    populate_solid = _get_project_solid(
        'populate_num_table',
        inputs=[InputDefinition(create_solid.name)],
    )

    pipeline = create_mem_sql_pipeline_context_tuple(
        solids=[create_solid, populate_solid],
        dependencies={
            populate_solid.name: {create_solid.name: DependencyDefinition(create_solid.name)}
        },
    )

    pipeline_result = execute_pipeline(pipeline)
    assert pipeline_result.success

    rows = pipeline_engine(pipeline_result).execute('SELECT * FROM num_table').fetchall()
    assert rows == [(1, 2), (3, 4)]
def define_inty_pipeline():
    """Int pipeline: return_one -> add_one, plus an always-raising solid."""

    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition("num", Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @lambda_solid
    def user_throw_exception():
        raise Exception("whoops")

    return PipelineDefinition(
        name="basic_external_plan_execution",
        solid_defs=[return_one, add_one, user_throw_exception],
        dependencies={"add_one": {"num": DependencyDefinition("return_one")}},
    )
def define_pipeline_from_yaml(pipeline_config):
    """Build a celery-docker PipelineDefinition from a YAML path or parsed dict."""
    if isinstance(pipeline_config, str):
        pipeline_config = load_yaml_from_path(pipeline_config)

    deps = {}
    solid_defs = set()
    solid_aliases = set()
    for solid_config in pipeline_config['solids']:
        solid_def_name = solid_config['def']
        solid_defs.add(get_solid_def(solid_def_name))
        solid_alias = solid_config.get('alias', solid_def_name)
        _check_duplicate_alias(solid_alias, solid_aliases)

        solid_deps = {}
        for input_name, input_solid in solid_config.get('deps', {}).items():
            # 'solid.output' selects a named output; a bare name means 'result'.
            if '.' in input_solid:
                upstream, output_name = input_solid.split('.')
            else:
                upstream, output_name = input_solid, 'result'
            solid_deps[input_name] = DependencyDefinition(solid=upstream, output=output_name)

        deps[SolidInvocation(name=solid_def_name, alias=solid_alias)] = solid_deps

    return PipelineDefinition(
        name=pipeline_config['name'],
        description=pipeline_config.get('description'),
        solid_defs=list(solid_defs),
        dependencies=deps,
        mode_defs=[celery_docker_mode],
        preset_defs=[celery_docker_preset],
    )
def test_pipeline_types():
    """The run-config schema contains the types referenced by solid configs and inputs."""

    @lambda_solid
    def produce_string():
        return "foo"

    @solid(
        input_defs=[InputDefinition("input_one", String)],
        output_defs=[OutputDefinition(Any)],
        config_schema={"another_field": Int},
    )
    def solid_one(_context, input_one):
        # Schema construction only; this solid must never run.
        raise Exception("should not execute")

    pipeline_def = PipelineDefinition(
        solid_defs=[produce_string, solid_one],
        dependencies={"solid_one": {"input_one": DependencyDefinition("produce_string")}},
    )

    schema = create_run_config_schema(pipeline_def)
    assert schema.has_config_type("String")
    assert schema.has_config_type("Int")
    assert not schema.has_config_type("SomeName")
def test_only_aliased_solids():
    """A pipeline whose solids are all invoked under aliases executes correctly."""

    @lambda_solid()
    def first():
        return ['first']

    @lambda_solid(inputs=[InputDefinition(name="prev")])
    def not_first(prev):
        return prev + ['not_first']

    pipeline = PipelineDefinition(
        solids=[first, not_first],
        dependencies={
            # Dependencies reference the aliases, not the underlying solid names.
            SolidInstance('first', alias='the_root'): {},
            SolidInstance('not_first', alias='the_consequence'): {
                'prev': DependencyDefinition('the_root')
            },
        },
    )

    result = execute_pipeline(pipeline)
    assert result.success

    aliased_result = result.result_for_solid('the_consequence')
    assert aliased_result.transformed_value() == ['first', 'not_first']
def test_pipeline_types():
    """The environment schema contains the types used by solid configs and inputs."""

    @lambda_solid
    def produce_string():
        return 'foo'

    @solid(
        input_defs=[InputDefinition('input_one', String)],
        output_defs=[OutputDefinition(Any)],
        config={'another_field': Int},
    )
    def solid_one(_context, input_one):
        # Only the schema is inspected; execution would be a bug.
        raise Exception('should not execute')

    pipeline_def = PipelineDefinition(
        solid_defs=[produce_string, solid_one],
        dependencies={'solid_one': {'input_one': DependencyDefinition('produce_string')}},
    )

    schema = create_environment_schema(pipeline_def)
    assert schema.has_config_type('String')
    assert schema.has_config_type('Int')
    assert not schema.has_config_type('SomeName')