def _dataframe_loader_config():
    read_fields = {
        read_from: Permissive(
            {
                option_name: Field(
                    option_args[0], is_required=option_args[1], description=option_args[2]
                )
                for option_name, option_args in read_opts["options"].items()
            }
        )
        for read_from, read_opts in DataFrameReadTypes.items()
    }

    return Selector(
        {
            "read": Field(
                Selector(read_fields),
                is_required=False,
            ),
            # https://github.com/dagster-io/dagster/issues/2872
            **{
                field_name: Field(
                    field_config,
                    is_required=False,
                )
                for field_name, field_config in read_fields.items()
            },
        }
    )

def _dataframe_materializer_config():
    to_fields = {
        write_to: Permissive(
            {
                option_name: Field(
                    option_args[0], is_required=option_args[1], description=option_args[2]
                )
                for option_name, option_args in to_opts["options"].items()
            }
        )
        for write_to, to_opts in DataFrameToTypes.items()
    }

    return Selector(
        {
            "to": Field(
                Selector(to_fields),
                is_required=False,
            ),
            # https://github.com/dagster-io/dagster/issues/2872
            **{
                field_name: Field(
                    field_config,
                    is_required=False,
                )
                for field_name, field_config in to_fields.items()
            },
        }
    )

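A minimal sketch of the shared pattern, with toy Permissive shapes standing in for the option schemas generated from DataFrameReadTypes/DataFrameToTypes; it illustrates how the issue-2872 workaround lets config select a format either nested under the "read"/"to" key or flattened to the top level:

from dagster import Field, Permissive, Selector
from dagster.config.validate import validate_config  # import path may vary across Dagster versions

# Toy stand-ins for the per-format option shapes built from DataFrameReadTypes.
read_fields = {
    "csv": Permissive({"sep": Field(str, is_required=False)}),
    "parquet": Permissive(),
}

schema = Selector(
    {
        # Nested form: {"read": {"csv": {...}}}.
        "read": Field(Selector(read_fields), is_required=False),
        # Flattened form enabled by the issue-2872 workaround: {"csv": {...}}.
        **{name: Field(config, is_required=False) for name, config in read_fields.items()},
    }
)

assert validate_config(schema, {"read": {"csv": {"sep": "|"}}}).success
assert validate_config(schema, {"csv": {"sep": "|"}}).success
# A Selector still admits only one top-level choice.
assert not validate_config(schema, {"csv": {}, "parquet": {}}).success
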
def test_construct_same_selectors():
    int_selector_1 = Selector(fields={'an_int': Field(int)})
    int_selector_2 = Selector(fields={'an_int': Field(int)})

    # assert identical object
    assert int_selector_1 is int_selector_2
    # assert equivalent key
    assert int_selector_1.key == int_selector_2.key

def test_custom_dagster_dataframe_parametrizable_input():
    @input_selector_schema(
        Selector({'door_a': Field(str), 'door_b': Field(str), 'door_c': Field(str)})
    )
    def silly_hydrator(_, which_door, _field):
        if which_door == 'door_a':
            return DataFrame({'foo': ['goat']})
        elif which_door == 'door_b':
            return DataFrame({'foo': ['car']})
        elif which_door == 'door_c':
            return DataFrame({'foo': ['goat']})
        raise DagsterInvariantViolationError(
            'You did not pick a door. You chose: {which_door}'.format(which_door=which_door)
        )

    @output_selector_schema(Selector({'devnull': Field(str), 'nothing': Field(str)}))
    def silly_materializer(_, _location, _field, _value):
        return Materialization(label='did nothing', description='just one of those days')

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn.exists('foo')],
        input_hydration_config=silly_hydrator,
        output_materialization_config=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition('df', TestDataFrame)],
        output_defs=[OutputDefinition(TestDataFrame)],
    )
    def did_i_win(_, df):
        return df

    solid_result = execute_solid(
        did_i_win,
        run_config={
            'solids': {
                'did_i_win': {
                    'inputs': {'df': {'door_a': 'bar'}},
                    'outputs': [{'result': {'devnull': 'baz'}}],
                }
            }
        },
    )
    assert solid_result.success
    output_df = solid_result.output_value()
    assert isinstance(output_df, DataFrame)
    assert output_df['foo'].tolist() == ['goat']
    materialization_events = solid_result.materialization_events_during_compute
    assert len(materialization_events) == 1
    assert materialization_events[0].event_specific_data.materialization.label == 'did nothing'

def _construct_selector_from_snap(config_type_snap, config_snap_map):
    check.list_param(config_type_snap.fields, "config_field_snap", ConfigFieldSnap)

    return Selector(
        fields=_construct_fields(config_type_snap, config_snap_map),
        description=config_type_snap.description,
    )

def test_scalar_or_selector():
    int_or_selector = ScalarUnion(
        scalar_type=int,
        non_scalar_schema=Selector({"a_string": str, "an_int": int}),
    )

    assert validate_config(int_or_selector, 2).success
    assert not validate_config(int_or_selector, "2").success
    assert not validate_config(int_or_selector, False).success

    assert validate_config(int_or_selector, {"a_string": "kjdfk"}).success
    assert validate_config(int_or_selector, {"an_int": 2}).success

    assert not validate_config(int_or_selector, {}).success
    assert not validate_config(int_or_selector, {"a_string": "kjdfk", "an_int": 2}).success
    assert not validate_config(int_or_selector, {"wrong_key": "kjdfd"}).success
    assert not validate_config(int_or_selector, {"a_string": 2}).success
    assert not validate_config(int_or_selector, {"a_string": "kjdfk", "extra_field": "kd"}).success

def test_kitchen_sink_break_out():
    @solid(
        config_schema=[
            {
                "opt_list_of_int": Field([int], is_required=False),
                "nested_dict": {
                    "list_list": [[int]],
                    "nested_selector": Selector(
                        {"some_field": int, "noneable_list": Noneable([bool])}
                    ),
                },
                "map": {
                    str: {"map_a": int, "map_b": [str]},
                },
            }
        ]
    )
    def solid_with_kitchen_sink_config(_):
        pass

    @pipeline
    def single_solid_pipeline():
        solid_with_kitchen_sink_config()

    config_snaps = build_config_schema_snapshot(single_solid_pipeline).all_config_snaps_by_key

    solid_config_key = solid_with_kitchen_sink_config.config_schema.config_type.key
    assert solid_config_key in config_snaps
    solid_config_snap = config_snaps[solid_config_key]

    assert solid_config_snap.kind == ConfigTypeKind.ARRAY

    dict_within_list = config_snaps[solid_config_snap.inner_type_key]
    assert len(dict_within_list.fields) == 3

    opt_field = dict_within_list.get_field("opt_list_of_int")
    assert opt_field.is_required is False
    assert config_snaps[opt_field.type_key].kind == ConfigTypeKind.ARRAY

    nested_dict = config_snaps[dict_within_list.get_field("nested_dict").type_key]
    assert len(nested_dict.fields) == 2

    nested_selector = config_snaps[nested_dict.get_field("nested_selector").type_key]
    noneable_list_bool = config_snaps[nested_selector.get_field("noneable_list").type_key]
    assert noneable_list_bool.kind == ConfigTypeKind.NONEABLE
    list_bool = config_snaps[noneable_list_bool.inner_type_key]
    assert list_bool.kind == ConfigTypeKind.ARRAY

    map = config_snaps[dict_within_list.get_field("map").type_key]
    assert map.kind == ConfigTypeKind.MAP
    map_dict = config_snaps[map.inner_type_key]
    assert len(map_dict.fields) == 2
    map_a = config_snaps[map_dict.get_field("map_a").type_key]
    assert map_a.kind == ConfigTypeKind.SCALAR

def test_kitchen_sink():
    kitchen_sink = resolve_to_config_type(
        [
            {
                "opt_list_of_int": Field(int, is_required=False),
                "nested_dict": {
                    "list_list": [[int]],
                    "nested_selector": Field(
                        Selector({"some_field": int, "more_list": Noneable([bool])})
                    ),
                },
                "map": {
                    str: {"map_a": int, "map_b": [str]},
                },
            }
        ]
    )

    kitchen_sink_snap = snap_from_dagster_type(kitchen_sink)

    rehydrated_snap = deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_snap)
    )
    assert kitchen_sink_snap == rehydrated_snap

def test_kitchen_sink_break_out():
    nested_dict_cls = resolve_to_config_type(
        {
            'list_list': [[int]],
            'nested_selector': Selector({'some_field': int, 'list': Noneable([bool])}),
        }
    )

    dict_within_list_cls = resolve_to_config_type(
        {'opt_list_of_int': Field([int], is_optional=True), 'nested_dict': Field(nested_dict_cls)}
    )

    kitchen_sink = Array(dict_within_list_cls)

    dict_within_list_key = dict_within_list_cls.key

    kitchen_sink_meta = meta_from_dagster_type(kitchen_sink)

    assert len(kitchen_sink_meta.type_param_refs) == 1
    assert kitchen_sink_meta.type_param_refs[0].key == dict_within_list_key
    assert len(kitchen_sink_meta.inner_type_refs) == 1
    assert kitchen_sink_meta.inner_type_refs[0].key == dict_within_list_key

    dict_within_list_meta = meta_from_dagster_type(dict_within_list_cls)
    assert dict_within_list_meta.type_param_refs is None
    # List[int], Int, Shape.XXX
    assert len(dict_within_list_meta.inner_type_refs) == 3
    assert sorted(
        [type_ref.key for type_ref in dict_within_list_meta.inner_type_refs]
    ) == sorted([nested_dict_cls.key, 'Int', 'Array.Int'])

def test_resource_invocation_kitchen_sink_config():
    @resource(
        config_schema={
            "str_field": str,
            "int_field": int,
            "list_int": [int],
            "list_list_int": [[int]],
            "dict_field": {"a_string": str},
            "list_dict_field": [{"an_int": int}],
            "selector_of_things": Selector(
                {"select_list_dict_field": [{"an_int": int}], "select_int": int}
            ),
            "optional_list_of_optional_string": Noneable([Noneable(str)]),
        }
    )
    def kitchen_sink(context):
        return context.resource_config

    resource_config = {
        "str_field": "kjf",
        "int_field": 2,
        "list_int": [3],
        "list_list_int": [[1], [2, 3]],
        "dict_field": {"a_string": "kdjfkd"},
        "list_dict_field": [{"an_int": 2}, {"an_int": 4}],
        "selector_of_things": {"select_int": 3},
        "optional_list_of_optional_string": ["foo", None],
    }

    assert kitchen_sink(build_init_resource_context(config=resource_config)) == resource_config

def test_scalar_or_selector():
    int_or_selector = ScalarUnion(
        scalar_type=int,
        non_scalar_schema=Selector({'a_string': str, 'an_int': int}),
    )

    assert validate_config(int_or_selector, 2).success
    assert not validate_config(int_or_selector, '2').success
    assert not validate_config(int_or_selector, False).success

    assert validate_config(int_or_selector, {'a_string': 'kjdfk'}).success
    assert validate_config(int_or_selector, {'an_int': 2}).success

    assert not validate_config(int_or_selector, {}).success
    assert not validate_config(int_or_selector, {'a_string': 'kjdfk', 'an_int': 2}).success
    assert not validate_config(int_or_selector, {'wrong_key': 'kjdfd'}).success
    assert not validate_config(int_or_selector, {'a_string': 2}).success
    assert not validate_config(int_or_selector, {'a_string': 'kjdfk', 'extra_field': 'kd'}).success

def test_kitchen_sink():
    kitchen_sink = List[
        Dict(
            {
                'opt_list_of_int': Field(List[int], is_optional=True),
                'tuple_of_things': Field(Tuple[int, str]),
                'nested_dict': Field(
                    Dict(
                        {
                            'list_list': Field(List[List[int]]),
                            'nested_selector': Field(
                                Selector(
                                    {'some_field': Field(int), 'set': Field(Optional[Set[bool]])}
                                )
                            ),
                        }
                    )
                ),
            }
        )
    ]

    kitchen_sink_meta = meta_from_dagster_type(kitchen_sink)

    rehydrated_meta = deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_meta)
    )
    assert kitchen_sink_meta == rehydrated_meta

def test_invalid_selector_field():
    with pytest.raises(DagsterInvalidDefinitionError) as exc_info:
        Selector({'val': Int})

    assert str(exc_info.value) == (
        'You have passed a config type "Int" in the parameter "fields" and it is '
        'in the "val" entry of that dict. It is from a Selector with fields '
        "['val']. You have likely forgot to wrap this type in a Field."
    )

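For contrast, a sketch of the constructions the error message points to: wrapping the raw config type in a Field, or passing a bare Python type that the config system resolves itself (as in the tests above):

from dagster import Field, Int, Selector

# Both of these construct without raising.
Selector({'val': Field(Int)})
Selector({'val': int})
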
def test_selector_of_things():
    selector_snap = snap_from_dagster_type(Selector({"bar": Field(int)}))
    assert selector_snap.key.startswith("Selector")
    assert selector_snap.kind == ConfigTypeKind.SELECTOR
    assert selector_snap.fields and len(selector_snap.fields) == 1
    field_snap = selector_snap.fields[0]
    assert field_snap.name == "bar"
    assert field_snap.type_key == "Int"

def test_selector_of_things():
    selector_meta = meta_from_dagster_type(Selector({'bar': Field(int)}))
    assert selector_meta.key.startswith('Selector')
    assert selector_meta.kind == ConfigTypeKind.SELECTOR
    assert selector_meta.fields and len(selector_meta.fields) == 1
    field_meta = selector_meta.fields[0]
    assert field_meta.name == 'bar'
    assert field_meta.type_ref.key == 'Int'

def get_retries_config():
    return Field(
        Selector({'enabled': {}, 'disabled': {}}),
        is_required=False,
        default_value={'enabled': {}},
    )

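A sketch of what this schema accepts, assuming validate_config and Field.config_type behave as in the tests above:

from dagster.config.validate import validate_config  # import path may vary across Dagster versions

retries_type = get_retries_config().config_type

# Exactly one of the selector's keys may be set; the empty-dict bodies carry no options.
assert validate_config(retries_type, {'enabled': {}}).success
assert validate_config(retries_type, {'disabled': {}}).success
assert not validate_config(retries_type, {'enabled': {}, 'disabled': {}}).success
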
def test_kitchen_sink():
    @solid(
        config_schema={
            'str_field': str,
            'int_field': int,
            'list_int': [int],
            'list_list_int': [[int]],
            'dict_field': {'a_string': str},
            'list_dict_field': [{'an_int': int}],
            'selector_of_things': Selector(
                {'select_list_dict_field': [{'an_int': int}], 'select_int': int}
            ),
            # this is a good argument to use () instead of [] for type parameterization in
            # the config system
            'optional_list_of_optional_string': Noneable([Noneable(str)]),
        }
    )
    def kitchen_sink(context):
        return context.solid_config

    solid_config_one = {
        'str_field': 'kjf',
        'int_field': 2,
        'list_int': [3],
        'list_list_int': [[1], [2, 3]],
        'dict_field': {'a_string': 'kdjfkd'},
        'list_dict_field': [{'an_int': 2}, {'an_int': 4}],
        'selector_of_things': {'select_int': 3},
        'optional_list_of_optional_string': ['foo', None],
    }

    assert (
        execute_solid(
            kitchen_sink,
            run_config={'solids': {'kitchen_sink': {'config': solid_config_one}}},
        ).output_value()
        == solid_config_one
    )

    solid_config_two = {
        'str_field': 'kjf',
        'int_field': 2,
        'list_int': [3],
        'list_list_int': [[1], [2, 3]],
        'dict_field': {'a_string': 'kdjfkd'},
        'list_dict_field': [{'an_int': 2}, {'an_int': 4}],
        'selector_of_things': {'select_list_dict_field': [{'an_int': 5}]},
        'optional_list_of_optional_string': None,
    }

    assert (
        execute_solid(
            kitchen_sink,
            run_config={'solids': {'kitchen_sink': {'config': solid_config_two}}},
        ).output_value()
        == solid_config_two
    )

def test_kitchen_sink():
    @solid(
        config_schema={
            "str_field": str,
            "int_field": int,
            "list_int": [int],
            "list_list_int": [[int]],
            "dict_field": {"a_string": str},
            "list_dict_field": [{"an_int": int}],
            "selector_of_things": Selector(
                {"select_list_dict_field": [{"an_int": int}], "select_int": int}
            ),
            # this is a good argument to use () instead of [] for type parameterization in
            # the config system
            "optional_list_of_optional_string": Noneable([Noneable(str)]),
        }
    )
    def kitchen_sink(context):
        return context.solid_config

    solid_config_one = {
        "str_field": "kjf",
        "int_field": 2,
        "list_int": [3],
        "list_list_int": [[1], [2, 3]],
        "dict_field": {"a_string": "kdjfkd"},
        "list_dict_field": [{"an_int": 2}, {"an_int": 4}],
        "selector_of_things": {"select_int": 3},
        "optional_list_of_optional_string": ["foo", None],
    }

    assert (
        execute_solid(
            kitchen_sink,
            run_config={"solids": {"kitchen_sink": {"config": solid_config_one}}},
        ).output_value()
        == solid_config_one
    )

    solid_config_two = {
        "str_field": "kjf",
        "int_field": 2,
        "list_int": [3],
        "list_list_int": [[1], [2, 3]],
        "dict_field": {"a_string": "kdjfkd"},
        "list_dict_field": [{"an_int": 2}, {"an_int": 4}],
        "selector_of_things": {"select_list_dict_field": [{"an_int": 5}]},
        "optional_list_of_optional_string": None,
    }

    assert (
        execute_solid(
            kitchen_sink,
            run_config={"solids": {"kitchen_sink": {"config": solid_config_two}}},
        ).output_value()
        == solid_config_two
    )

def mysql_config():
    return Selector(
        {
            "mysql_url": StringSource,
            "mysql_db": {
                "username": StringSource,
                "password": StringSource,
                "hostname": StringSource,
                "db_name": StringSource,
                "port": Field(IntSource, is_required=False, default_value=3306),
            },
        }
    )

def pg_config():
    return Selector(
        {
            "postgres_url": StringSource,
            "postgres_db": {
                "username": StringSource,
                "password": StringSource,
                "hostname": StringSource,
                "db_name": StringSource,
                "port": Field(IntSource, is_required=False, default_value=5432),
            },
        }
    )

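A sketch of the two mutually exclusive shapes either schema accepts; the connection values and env var names here are made up, and StringSource accepts either a literal string or an {"env": ...} reference:

# Two shapes for pg_config(), of which config may supply exactly one
# (mysql_config() is analogous):
pg_via_url = {"postgres_url": "postgresql://user:pass@localhost:5432/dagster"}
pg_via_parts = {
    "postgres_db": {
        "username": {"env": "PG_USERNAME"},  # hypothetical env var names
        "password": {"env": "PG_PASSWORD"},
        "hostname": "localhost",
        "db_name": "dagster",
        # "port" is optional and defaults to 5432.
    }
}
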
def _define_task():
    return Field(
        Selector(
            {
                "notebook_task": _define_notebook_task(),
                "spark_jar_task": _define_spark_jar_task(),
                "spark_python_task": _define_spark_python_task(),
                "spark_submit_task": _define_spark_submit_task(),
            }
        ),
        description="The task to run.",
        is_required=True,
    )

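A minimal sketch of how this Selector behaves, with Permissive() standing in for the _define_*_task() shapes, which are not shown here:

from dagster import Permissive, Selector
from dagster.config.validate import validate_config  # import path may vary across Dagster versions

# Hypothetical stand-ins; the real task shapes constrain their own fields.
task_selector = Selector({"notebook_task": Permissive(), "spark_jar_task": Permissive()})

assert validate_config(task_selector, {"notebook_task": {"notebook_path": "/Users/me/nb"}}).success
# Exactly one task type may be specified per run.
assert not validate_config(task_selector, {"notebook_task": {}, "spark_jar_task": {}}).success
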
def pg_config():
    return Selector(
        {
            'postgres_url': str,
            'postgres_db': {
                'username': StringSource,
                'password': StringSource,
                'hostname': StringSource,
                'db_name': StringSource,
                'port': Field(IntSource, is_required=False, default_value=5432),
            },
        }
    )

def _define_task():
    return Field(
        Selector(
            {
                'notebook_task': _define_notebook_task(),
                'spark_jar_task': _define_spark_jar_task(),
                'spark_python_task': _define_spark_python_task(),
                'spark_submit_task': _define_spark_submit_task(),
            }
        ),
        description='The task to run.',
        is_required=True,
    )

def get_retries_config():
    return Field(
        Selector({'enabled': {}, 'disabled': {}, 'deferred': {'previous_attempts': Permissive()}}),
        is_required=False,
        default_value={'enabled': {}},
    )

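The 'deferred' variant nests a Permissive dict under 'previous_attempts', so arbitrary keys pass validation; a sketch (reading the keys as step names is an assumption here):

from dagster.config.validate import validate_config  # import path may vary across Dagster versions

retries_type = get_retries_config().config_type

# Permissive() places no constraints on the keys under 'previous_attempts', so an
# arbitrary mapping (hypothetical step keys to attempt counts) validates.
assert validate_config(retries_type, {'deferred': {'previous_attempts': {'my_step': 1}}}).success
assert validate_config(retries_type, {'enabled': {}}).success
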
def define_databricks_storage_config():
    return Field(
        Selector(
            {
                "s3": _define_s3_storage_credentials(),
                "adls2": _define_adls2_storage_credentials(),
            }
        ),
        description="Databricks storage configuration for either S3 or ADLS2. If access "
        "credentials for your Databricks storage are stored in Databricks secrets, this config "
        "indicates the secret scope and the secret keys used to access either S3 or ADLS2.",
        is_required=False,
    )

def _define_size():
    num_workers = Field(
        Int,
        description='If num_workers, number of worker nodes that this cluster should have. '
        'A cluster has one Spark Driver and num_workers Executors for a total of '
        'num_workers + 1 Spark nodes.',
        is_required=True,
    )
    return Selector({'autoscale': _define_autoscale(), 'num_workers': num_workers})

def define_databricks_storage_config():
    return Field(
        Selector(
            {
                "s3": _define_s3_storage_credentials(),
                "adls2": _define_adls2_storage_credentials(),
            }
        ),
        description="Databricks storage configuration. Solids using the "
        "DatabricksPySparkStepLauncher to execute pipeline steps in Databricks MUST configure "
        "storage using this config (either S3 or ADLS2 can be used). Access credentials for the "
        "storage must be stored in Databricks secrets; this config indicates the secret scope "
        "and the secret keys used to access either S3 or ADLS2.",
        is_required=True,
    )

def _define_size():
    num_workers = Field(
        Int,
        description="If num_workers, number of worker nodes that this cluster should have. "
        "A cluster has one Spark Driver and num_workers Executors for a total of "
        "num_workers + 1 Spark nodes.",
        is_required=True,
    )
    return Selector({"autoscale": _define_autoscale(), "num_workers": num_workers})

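A sketch of the choice this Selector exposes, with a hypothetical stand-in for the real _define_autoscale() shape, built from bare Python types so the config system wraps them in Fields itself:

from dagster import Selector
from dagster.config.validate import validate_config  # import path may vary across Dagster versions

# Hypothetical autoscale shape; the real _define_autoscale() is not shown here.
size_selector = Selector(
    {"autoscale": {"min_workers": int, "max_workers": int}, "num_workers": int}
)

assert validate_config(size_selector, {"num_workers": 8}).success
assert validate_config(size_selector, {"autoscale": {"min_workers": 2, "max_workers": 8}}).success
# A fixed size and autoscaling are mutually exclusive.
assert not validate_config(size_selector, {"num_workers": 8, "autoscale": {}}).success
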
def _define_cluster_log_conf():
    return Field(
        Selector({"dbfs": _define_dbfs_storage_info(), "s3": _define_s3_storage_info()}),
        description="Recommended! The configuration for delivering Spark logs to a long-term "
        "storage destination. Only one destination can be specified for one cluster. If the conf "
        "is given, the logs will be delivered to the destination every 5 mins. "
        "The destination of driver logs is <destination>/<cluster-id>/driver, while the "
        "destination of executor logs is <destination>/<cluster-id>/executor.",
        is_required=False,
    )

def _define_cluster():
    existing_cluster_id = Field(
        String,
        description="The ID of an existing cluster that will be used for all runs "
        "of this job. When running jobs on an existing cluster, you may "
        "need to manually restart the cluster if it stops responding. "
        "Databricks suggests running jobs on new clusters for "
        "greater reliability.",
        is_required=True,
    )

    return Selector({"new": _define_new_cluster(), "existing": existing_cluster_id})

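The resulting Selector admits exactly one of two run config fragments; a sketch with a made-up cluster id (the "new" shape comes from _define_new_cluster() and is omitted here):

# Config fragments this Selector would accept, one at a time:
use_existing = {"existing": "1234-567890-abcde123"}  # hypothetical cluster id
# use_new = {"new": {...}}  # shape defined by _define_new_cluster(), not shown here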