def test_basic_even_type():
    # start_test_basic_even_type
    EvenDagsterType = DagsterType(
        name="EvenDagsterType",
        type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
    )
    # end_test_basic_even_type

    # start_test_basic_even_type_with_annotations
    @solid
    def double_even(_, num: EvenDagsterType) -> EvenDagsterType:
        # These type annotations are a shorthand for constructing InputDefinitions
        # and OutputDefinitions, and are not mypy compliant
        return num  # at runtime this is a python int

    # end_test_basic_even_type_with_annotations

    assert execute_solid(double_even, input_values={"num": 2}).success

    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"num": 3})

    assert not execute_solid(double_even, input_values={"num": 3}, raise_on_error=False).success
def pandera_schema_to_dagster_type(schema, name, description):
    def type_check_fn(_context, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description=f"Must be pandas.DataFrame, not {type(value).__name__}.",
            )
        try:
            # `lazy` instructs pandera to capture every (not just the first) validation error
            schema.validate(value, lazy=True)
        except pa.errors.SchemaErrors as e:
            return TypeCheck(
                success=False,
                description=str(e),
                metadata={
                    "num_violations": len(e.failure_cases),
                },
            )
        return TypeCheck(success=True)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=description,
    )
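# A minimal usage sketch for the converter above, assuming `import pandera as pa`
# is in scope. The schema and the `StockPrices` name are illustrative, not from
# the original source.
example_schema = pa.DataFrameSchema(
    {
        "ticker": pa.Column(str),
        "price": pa.Column(float, checks=pa.Check.ge(0)),
    }
)

StockPrices = pandera_schema_to_dagster_type(
    example_schema,
    name="StockPrices",
    description="Stock prices validated by a pandera schema.",
)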
def __init__(self, storage_key, path, computation):
    self._storage_key = check.str_param(storage_key, "storage_key")
    self._path = canonicalize_path(path)
    self._computation = check.opt_inst_param(computation, "computation", Computation)
    self._dagster_type = DagsterType(type_check_fn=lambda a, b: True, name=".".join(self.path))
def test_basic_even_type_no_annotations():
    EvenDagsterType = DagsterType(
        name="EvenDagsterType",
        type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
    )

    # start_test_basic_even_type_no_annotations
    @solid(
        input_defs=[InputDefinition("num", EvenDagsterType)],
        output_defs=[OutputDefinition(EvenDagsterType)],
    )
    def double_even(_, num):
        return num

    # end_test_basic_even_type_no_annotations

    assert execute_solid(double_even, input_values={"num": 2}).success

    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"num": 3})

    assert not execute_solid(double_even, input_values={"num": 3}, raise_on_error=False).success
def test_inner_inputs_connected_to_nested_outer_dependency():
    my_dagster_type = DagsterType(name="foo", type_check_fn=lambda _, _a: True)

    @solid(input_defs=[InputDefinition("data", my_dagster_type)])
    def inner_solid(data):
        return data

    @composite_solid(input_defs=[InputDefinition("data_1", my_dagster_type)])
    def inner_composite(data_1):
        # source output handle should be top_level solid
        return inner_solid(data_1)

    @composite_solid(input_defs=[InputDefinition("data_2", my_dagster_type)])
    def middle_composite(data_2):
        return inner_composite(data_2)

    @composite_solid(input_defs=[InputDefinition("data_3", my_dagster_type)])
    def outer_composite(data_3):
        return middle_composite(data_3)

    @solid
    def top_level_solid():
        return "from top_level_solid"

    @pipeline
    def my_pipeline():
        # inner_solid should be connected to top_level_solid
        outer_composite(top_level_solid())

    result = execute_pipeline(my_pipeline)
    assert result.success
    assert (
        result.output_for_solid("outer_composite.middle_composite.inner_composite.inner_solid")
        == "from top_level_solid"
    )
def __init__(self, storage_key, path, computation):
    self._storage_key = check.str_param(storage_key, 'storage_key')
    self._path = check.tuple_param(path, 'path', of_type=str)
    self._computation = check.opt_inst_param(computation, 'computation', Computation)
    self._dagster_type = DagsterType(type_check_fn=lambda a, b: True, name='.'.join(self.path))
def test_type_materializer_and_configurable_output_manager():
    @dagster_type_materializer(config_schema={"type_materializer_path": str})
    def my_materializer(_, _config, _value):
        assert False, "shouldn't get here"

    adict = {}

    @output_manager(output_config_schema={"output_manager_path": str})
    def my_output_manager(_context, _resource_config, obj):
        adict["result"] = obj

    my_type = DagsterType(lambda _, _val: True, name="my_type", materializer=my_materializer)

    @solid(
        output_defs=[
            OutputDefinition(name="output1", manager_key="my_output_manager", dagster_type=my_type),
            OutputDefinition(name="output2", dagster_type=my_type),
        ]
    )
    def my_solid(_):
        yield Output(5, "output1")
        yield Output(7, "output2")

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"my_output_manager": my_output_manager})])
    def my_pipeline():
        my_solid()

    execute_pipeline(
        my_pipeline,
        run_config={"solids": {"my_solid": {"outputs": {"output1": {"output_manager_path": "a"}}}}},
    )

    assert adict["result"] == 5
def pandera_schema_to_dagster_type(
    schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],
) -> DagsterType:
    """
    Convert a Pandera dataframe schema to a `DagsterType`.

    The generated Dagster type will be given an automatically generated `name`. The schema's
    `title` property, `name` property, or class name (in that order) will be used. If neither
    `title` nor `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.

    Additional metadata is also extracted from the Pandera schema and attached to the returned
    `DagsterType` in a `MetadataEntry` object. The extracted metadata includes:

    - Descriptions on the schema and constituent columns and checks.
    - Data types for each column.
    - String representations of all column-wise checks.
    - String representations of all row-wise (i.e. "wide") checks.

    The returned `DagsterType` will call the Pandera schema's `validate()` method in its type
    check function. Validation is done in `lazy` mode, i.e. pandera will attempt to validate all
    values in the dataframe, rather than stopping on the first error.

    If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:

    - `num_failures` total number of validation errors.
    - `failure_sample` a table containing up to the first 10 validation errors.

    Args:
        schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]): The Pandera schema to convert.

    Returns:
        DagsterType: Dagster Type constructed from the Pandera schema.
    """
    if not (
        isinstance(schema, pa.DataFrameSchema)
        or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))
    ):
        raise TypeError(
            "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"
        )

    name = _extract_name_from_pandera_schema(schema)
    norm_schema = (
        schema.to_schema()  # type: ignore[attr-defined]
        if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)
        else schema
    )
    tschema = _pandera_schema_to_table_schema(norm_schema)
    type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=norm_schema.description,
        metadata_entries=[
            MetadataEntry("schema", value=tschema),
        ],
    )
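# A hedged sketch of the class-based path through the converter above: a pandera
# `SchemaModel` subclass is passed as the class itself, and the type's name is
# derived from it. `TripsModel` is hypothetical; assumes `import pandera as pa`
# and `from pandera.typing import Series`.
from pandera.typing import Series


class TripsModel(pa.SchemaModel):
    trip_id: Series[int] = pa.Field(ge=0)
    distance_km: Series[float] = pa.Field(ge=0.0)


TripsDagsterType = pandera_schema_to_dagster_type(TripsModel)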
def as_dagster_type(self, *args, **kwargs):
    if self.raise_or_typecheck:
        raise Exception(
            "Dagster types can only be constructed from constraints that return typechecks"
        )
    return DagsterType(
        name=self.name,
        description="A Pandas DataFrame with the following validation: {}".format(self.description),
        type_check_fn=lambda x: self.validate(x, *args),
        **kwargs,
    )
def test_register_after_solid_definition():
    class MyClass:
        pass

    @solid
    def _my_solid(_) -> MyClass:
        return MyClass()

    my_dagster_type = DagsterType(name="aaaa", type_check_fn=lambda _, _a: True)

    with pytest.raises(DagsterInvalidDefinitionError):
        make_python_type_usable_as_dagster_type(MyClass, my_dagster_type)
def test_basic_even_type():
    EvenDagsterType = DagsterType(
        name="EvenDagsterType",
        type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
    )

    @solid
    def double_even(_, num: EvenDagsterType) -> EvenDagsterType:
        return num  # at runtime this is a python int

    assert execute_solid(double_even, input_values={"num": 2}).success

    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"num": 3})

    assert not execute_solid(double_even, input_values={"num": 3}, raise_on_error=False).success
def test_type_missing_resource_fails():
    def resource_based_type_check(context, value):
        return context.resources.a == value

    CustomType = DagsterType(
        name="NeedsA",
        type_check_fn=resource_based_type_check,
        required_resource_keys={"a"},
    )

    @solid(output_defs=[OutputDefinition(CustomType, "custom_type")])
    def custom_type_solid(_):
        return "A"

    with pytest.raises(DagsterInvalidDefinitionError, match='required by type "NeedsA"'):

        @pipeline
        def _type_check_pipeline():
            custom_type_solid()
def create_dagster_pandas_dataframe_type(
    name=None, description=None, columns=None, event_metadata_fn=None, dataframe_constraints=None
):
    event_metadata_fn = check.opt_callable_param(event_metadata_fn, 'event_metadata_fn')
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, 'description', default=''),
        check.opt_list_param(columns, 'columns', of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )
        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))
        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=dataframe_input_schema,
        output_materialization_config=dataframe_output_schema,
        description=description,
    )
def define_type_check_pipeline(should_require_resources):
    @resource
    def resource_a(_):
        yield 'A'

    def resource_based_type_check(context, value):
        return context.resources.a == value

    CustomType = DagsterType(
        name='NeedsA',
        type_check_fn=resource_based_type_check,
        required_resource_keys={'a'} if should_require_resources else None,
    )

    @solid(output_defs=[OutputDefinition(CustomType, 'custom_type')])
    def custom_type_solid(_):
        return 'A'

    @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
    def type_check_pipeline():
        custom_type_solid()

    return type_check_pipeline
def less_simple_data_frame_type_check(_, value):
    fields = [field for field in value[0].keys()]
    for i in range(len(value)):
        row = value[i]
        if not isinstance(row, dict):
            return False
        row_fields = [field for field in row.keys()]
        if fields != row_fields:
            return False
    return True


LessSimpleDataFrame = DagsterType(
    name='LessSimpleDataFrame',
    description='A more sophisticated data frame that type checks its structure.',
    type_check_fn=less_simple_data_frame_type_check,
)


@solid
def bad_read_csv(context, csv_path: str) -> LessSimpleDataFrame:
    csv_path = os.path.join(os.path.dirname(__file__), csv_path)
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]
    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return ["not_a_dict"]


@solid
import csv

from dagster import (
    DagsterType,
    InputDefinition,
    OutputDefinition,
    String,
    execute_pipeline,
    pipeline,
    solid,
)

SimpleDataFrame = DagsterType(
    name='SimpleDataFrame',
    type_check_fn=lambda _, value: isinstance(value, list),
    description='A naive representation of a data frame, e.g., as returned by csv.DictReader.',
)


@solid(
    input_defs=[InputDefinition('csv_path', String)],
    output_defs=[OutputDefinition(SimpleDataFrame)],
)
def read_csv(context, csv_path: str) -> list:
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]
    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return lines
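# A short, hedged sketch wiring the solid above into a pipeline. The pipeline
# name is hypothetical; the csv path is supplied through run config, and
# "cereal.csv" is the file used elsewhere in these examples.
@pipeline
def simple_dataframe_pipeline():
    read_csv()


execute_pipeline(
    simple_dataframe_pipeline,
    run_config={"solids": {"read_csv": {"inputs": {"csv_path": {"value": "cereal.csv"}}}}},
)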
    )
    if timeseries_length != output_vector_length:
        return TypeCheck(
            success=False, description="Every timeseries must have as many snapshots as outputs"
        )
    return TypeCheck(
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(num_timeseries), "num_ts", "Number of parallel timeseries."),
            EventMetadataEntry.text(
                str(timeseries_length), "timeseries_length", "Length of each timeseries."
            ),
            EventMetadataEntry.text(
                str(snapshot_length),
                "snapshot_length",
                "Number of past observations for each input.",
            ),
        ],
    )


TrainingSet = DagsterType(
    name="TrainingSet",
    description="Final training set ready for the ml pipeline",
    type_check_fn=validate_snapshot_timeseries,
)
def create_dagster_pandas_dataframe_type(
    name,
    description=None,
    columns=None,
    event_metadata_fn=None,
    dataframe_constraints=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """
    Constructs a custom pandas dataframe dagster type.

    Args:
        name (str): Name of the dagster pandas type.
        description (Optional[str]): A markdown-formatted string, displayed in tooling.
        columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
            which express dataframe column schemas and constraints.
        event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list
            of EventMetadata which allow you to express things like summary statistics during
            runtime.
        dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit
            from :py:class:`~dagster.DataFrameConstraint`. This allows you to express
            dataframe-level constraints.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class that
            inherits from :py:class:`~dagster.InputHydrationConfig`. If None, we will default to
            using the `dataframe_input_schema` input_hydration_config.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a
            class that inherits from :py:class:`~dagster.OutputMaterializationConfig`. If None, we
            will default to using the `dataframe_output_schema` output_materialization_config.
    """
    # We allow input_hydration_configs/output_materialization_configs to be plugged in so that
    # users can hydrate and persist their custom dataframes via configuration in their own way
    # if the default configs don't suffice. This is purely optional.
    check.str_param(name, 'name')
    event_metadata_fn = check.opt_callable_param(event_metadata_fn, 'event_metadata_fn')
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, 'description', default=''),
        check.opt_list_param(columns, 'columns', of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )
        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))
        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=input_hydration_config
        if input_hydration_config
        else dataframe_input_schema,
        output_materialization_config=output_materialization_config
        if output_materialization_config
        else dataframe_output_schema,
        description=description,
    )
    )
    if not safe_isfile(value):
        return TypeCheck(
            success=False,
            description=(
                'FileExistsAtPath must be a path that points to a file that '
                'exists. "{value}" does not exist on disk'
            ).format(value=value),
        )
    return True


FileExistsAtPath = DagsterType(
    name='FileExistsAtPath',
    description='A path at which a file actually exists',
    type_check_fn=file_exists_at_path_type_check,
)


def _download_from_s3_to_file(session, context, bucket, key, target_folder, skip_if_present):
    # TODO: remove context argument once we support resource logging
    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file
            )
def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """
    Args:
        name (str): the name of the new type
        description (Optional[str]): the description of the new type
        columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):
            what column-level row by row validation you want to have applied.
            Leave empty for no column-level row by row validation.
        columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata, MultiAggregateConstraintWithMetadata]]):
            what column-level aggregate validation you want to have applied.
            Leave empty for no column-level aggregate validation.
        dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):
            what dataframe-wide validation you want to have applied.
            Leave empty for no dataframe-wide validation.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class that
            inherits from :py:class:`~dagster.InputHydrationConfig`. If None, we will default to
            using the `dataframe_input_schema` input_hydration_config.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a
            class that inherits from :py:class:`~dagster.OutputMaterializationConfig`. If None, we
            will default to using the `dataframe_output_schema` output_materialization_config.

    Returns:
        a DagsterType with the corresponding name and packaged validation.
    """

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)
        if columns_aggregate_validator is not None:
            individual_result_dict["column aggregates"] = columns_aggregate_validator.validate(
                value
            )
        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)

        typechecks_succeeded = True
        metadata = []
        overall_description = ""
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(
                    result_dict,
                    '{}-constraint-metadata'.format(key),
                )
            )
            overall_description += "{} failing constraints, requiring {}".format(
                key, result.description
            )
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description,
            metadata_entries=metadata,
        )

    description = check.opt_str_param(description, 'description', default='')
    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=input_hydration_config
        if input_hydration_config
        else dataframe_input_schema,
        output_materialization_config=output_materialization_config
        if output_materialization_config
        else dataframe_output_schema,
        description=description,
    )
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ],
    )


DataFrame = DagsterType(
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    type_check_fn=df_type_check,
)


def _construct_constraint_list(constraints):
    def add_bullet(constraint_list, constraint_description):
        return constraint_list + "+ {constraint_description}\n".format(
            constraint_description=constraint_description
        )

    constraint_list = ""
    for constraint in constraints:
        if constraint.__class__ not in CONSTRAINT_BLACKLIST:
            constraint_list = add_bullet(constraint_list,
@input_hydration_config(Selector({"csv": Field(String)}))
def less_simple_data_frame_input_hydration_config(context, selector):
    lines = []
    with open(selector["csv"], "r") as fd:
        for row in csv.DictReader(fd):
            row["calories"] = int(row["calories"])
            lines.append(row)

    context.log.info("Read {n_lines} lines".format(n_lines=len(lines)))
    return lines


LessSimpleDataFrame = DagsterType(
    name="LessSimpleDataFrame",
    description="A more sophisticated data frame that type checks its structure.",
    type_check_fn=less_simple_data_frame_type_check,
    input_hydration_config=less_simple_data_frame_input_hydration_config,
)


def expect_column_to_be_integers(
    data_frame: LessSimpleDataFrame, column_name: str
) -> ExpectationResult:
    bad_values = []
    for idx in range(len(data_frame)):
        line = data_frame[idx]
        if not isinstance(line[column_name], int):
            bad_values.append((idx, str(line[column_name])))
    return ExpectationResult(
        success=(not bad_values),
        label="col_{column_name}_is_int".format(column_name=column_name),
    OutputDefinition,
    execute_pipeline,
    pipeline,
    solid,
)


# start_custom_types_2_marker_0
def is_list_of_dicts(_, value):
    return isinstance(value, list) and all(isinstance(element, dict) for element in value)


SimpleDataFrame = DagsterType(
    name="SimpleDataFrame",
    type_check_fn=is_list_of_dicts,
    description="A naive representation of a data frame, e.g., as returned by csv.DictReader.",
)
# end_custom_types_2_marker_0


# start_custom_types_2_marker_1
@solid(output_defs=[OutputDefinition(SimpleDataFrame)])
def bad_read_csv(context):
    csv_path = os.path.join(os.path.dirname(__file__), "cereal.csv")
    with open(csv_path, "r") as fd:
        lines = [row for row in csv.DictReader(fd)]
    context.log.info(f"Read {len(lines)} lines")
    return ["not_a_dict"]
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )


DataFrame = DagsterType(
    name="PandasDataFrame",
    description="""Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). See http://pandas.pydata.org/""",
    loader=dataframe_loader,
    materializer=dataframe_materializer,
    type_check_fn=df_type_check,
)


def _construct_constraint_list(constraints):
    def add_bullet(constraint_list, constraint_description):
        return constraint_list + "+ {constraint_description}\n".format(
            constraint_description=constraint_description
        )

    constraint_list = ""
    for constraint in constraints:
        if constraint.__class__ not in CONSTRAINT_BLACKLIST:
            constraint_list = add_bullet(constraint_list,
def less_simple_data_frame_loader(context, selector):
    csv_path = os.path.join(os.path.dirname(__file__), selector["csv"])
    with open(csv_path, "r") as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info("Read {n_lines} lines".format(n_lines=len(lines)))
    return lines


# end_custom_types_3_marker_0

# start_custom_types_3_marker_1
LessSimpleDataFrame = DagsterType(
    name="LessSimpleDataFrame",
    description="A more sophisticated data frame that type checks its structure.",
    type_check_fn=less_simple_data_frame_type_check,
    loader=less_simple_data_frame_loader,
)
# end_custom_types_3_marker_1


@solid
def sort_by_calories(context, cereals: LessSimpleDataFrame):
    sorted_cereals = sorted(cereals, key=lambda cereal: cereal["calories"])
    context.log.info(
        "Least caloric cereal: {least_caloric}".format(least_caloric=sorted_cereals[0]["name"])
    )
    context.log.info(
        "Most caloric cereal: {most_caloric}".format(most_caloric=sorted_cereals[-1]["name"])
    )
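# A hedged sketch of how the loader above is driven by run config: because
# `LessSimpleDataFrame` has a `loader`, an unsatisfied `cereals` input can be
# hydrated from config via the "csv" selector. The pipeline name below is
# illustrative.
@pipeline
def calories_pipeline():
    sort_by_calories()


execute_pipeline(
    calories_pipeline,
    run_config={"solids": {"sort_by_calories": {"inputs": {"cereals": {"csv": "cereal.csv"}}}}},
)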
        to_function(dask_df, *to_args, **to_kwargs)

    if to_path:
        yield AssetMaterialization.file(to_path)


def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )


DataFrame = DagsterType(
    name="DaskDataFrame",
    description="""A Dask DataFrame is a large parallel DataFrame composed of many smaller Pandas
    DataFrames, split along the index. These Pandas DataFrames may live on disk for
    larger-than-memory computing on a single machine, or on many different machines in a cluster.
    One Dask DataFrame operation triggers many operations on the constituent Pandas DataFrames.
    See https://docs.dask.org/en/latest/dataframe.html""",
    loader=dataframe_loader,
    materializer=dataframe_materializer,
    type_check_fn=df_type_check,
)
    assert result.success

    result = basic.to_job(
        config={"ops": {"my_op": {"config": {"conf_str": "foo"}}}}
    ).execute_in_process()
    assert result.success


even_type = DagsterType(
    name="EvenDagsterType",
    type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
)


# Test typing override between out and annotation. Should they just match?
def test_out_dagster_type():
    @op(out=Out(dagster_type=even_type))
    def basic() -> int:
        return 6

    assert basic.output_defs[0].dagster_type == even_type
    assert basic() == 6


def test_multiout_dagster_type():
    @op(out={
# end_configured_op_marker


# start_input_op_marker
@op
def my_input_op(abc, xyz):
    pass


# end_input_op_marker


# start_typed_input_op_marker
MyDagsterType = DagsterType(type_check_fn=lambda _, value: value % 2 == 0, name="MyDagsterType")


@op(ins={"abc": In(dagster_type=MyDagsterType)})
def my_typed_input_op(abc):
    pass


# end_typed_input_op_marker


# start_output_op_marker
@op
def my_output_op():
    return 5
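# A minimal sketch (not from the original docs) exercising `my_typed_input_op`
# above via direct invocation: invoking an op directly runs its Dagster type
# checks, so an odd value should raise. Assumes pytest and dagster are
# importable.
import pytest
from dagster import DagsterTypeCheckDidNotPass

my_typed_input_op(4)  # passes: 4 % 2 == 0

with pytest.raises(DagsterTypeCheckDidNotPass):
    my_typed_input_op(3)  # fails MyDagsterType's even-number check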
def create_dagster_pandas_dataframe_type(
    name,
    description=None,
    columns=None,
    event_metadata_fn=None,
    dataframe_constraints=None,
    loader=None,
    materializer=None,
):
    """
    Constructs a custom pandas dataframe dagster type.

    Args:
        name (str): Name of the dagster pandas type.
        description (Optional[str]): A markdown-formatted string, displayed in tooling.
        columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
            which express dataframe column schemas and constraints.
        event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list
            of EventMetadata which allow you to express things like summary statistics during
            runtime.
        dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit
            from :py:class:`~dagster.DataFrameConstraint`. This allows you to express
            dataframe-level constraints.
        loader (Optional[DagsterTypeLoader]): An instance of a class that inherits from
            :py:class:`~dagster.DagsterTypeLoader`. If None, we will default to using
            `dataframe_loader`.
        materializer (Optional[DagsterTypeMaterializer]): An instance of a class that inherits from
            :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will default to using
            `dataframe_materializer`.
    """
    # We allow dagster_type_loaders/materializers to be plugged in so that users can load and
    # materialize their custom dataframes via configuration in their own way if the defaults
    # don't suffice. This is purely optional.
    check.str_param(name, "name")
    event_metadata_fn = check.opt_callable_param(event_metadata_fn, "event_metadata_fn")
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, "description", default=""),
        check.opt_list_param(columns, "columns", of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )
        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))
        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        loader=loader if loader else dataframe_loader,
        # note: the fallback must key off `materializer`, not `loader`
        materializer=materializer if materializer else dataframe_materializer,
        description=description,
    )
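# A hedged usage sketch for the factory above: build a typed dataframe with one
# column constraint. `TradesDataFrame` and its column spec are illustrative,
# assuming `PandasColumn.integer_column` accepts a `non_nullable` flag in this
# version of dagster-pandas.
TradesDataFrame = create_dagster_pandas_dataframe_type(
    name="TradesDataFrame",
    description="Trades with a non-nullable integer quantity column.",
    columns=[
        PandasColumn.integer_column("quantity", non_nullable=True),
    ],
)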
def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
    loader=None,
    materializer=None,
):
    """
    Args:
        name (str): the name of the new type
        description (Optional[str]): the description of the new type
        columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):
            what column-level row by row validation you want to have applied.
            Leave empty for no column-level row by row validation.
        columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata, MultiAggregateConstraintWithMetadata]]):
            what column-level aggregate validation you want to have applied.
            Leave empty for no column-level aggregate validation.
        dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):
            what dataframe-wide validation you want to have applied.
            Leave empty for no dataframe-wide validation.
        loader (Optional[DagsterTypeLoader]): An instance of a class that inherits from
            :py:class:`~dagster.DagsterTypeLoader`. If None, we will default to using
            `dataframe_loader`.
        materializer (Optional[DagsterTypeMaterializer]): An instance of a class that inherits from
            :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will default to using
            `dataframe_materializer`.

    Returns:
        a DagsterType with the corresponding name and packaged validation.
    """

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}
        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)
        if columns_aggregate_validator is not None:
            individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(
                value
            )

        typechecks_succeeded = True
        metadata = []
        overall_description = "Failed Constraints: {}"
        constraint_clauses = []
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(
                    result_dict,
                    "{}-constraint-metadata".format(key),
                )
            )
            constraint_clauses.append("{} failing constraints, {}".format(key, result.description))
        # metadata entries sorted by label: aggregates, then columns, then dataframe
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description.format(constraint_clauses),
            metadata_entries=sorted(metadata, key=lambda x: x.label),
        )

    description = check.opt_str_param(description, "description", default="")
    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        loader=loader if loader else dataframe_loader,
        # note: the fallback must key off `materializer`, not `loader`
        materializer=materializer if materializer else dataframe_materializer,
        description=description,
    )
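# A minimal sketch for the factory above: with no validators supplied, the
# resulting type only checks that the value is a pandas DataFrame. The name is
# illustrative.
BareStructuredDataFrame = create_structured_dataframe_type(
    name="BareStructuredDataFrame",
    description="A structured dataframe type with no extra validation.",
)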