def test_column_type_constraint():
    """Check ColumnDTypeInSetConstraint against a single string-valued column.

    The ``{"object"}`` constraint is expected to validate column ``foo``
    (returning None), while the ``{"int64"}`` constraint is expected to raise
    ConstraintViolationException for the same column.
    """
    column = "foo"
    frame = DataFrame({column: ["baz"]})

    # Accepted dtype: validate() signals success by returning None.
    accepted = ColumnDTypeInSetConstraint({"object"}).validate(frame, column)
    assert accepted is None

    # Rejected dtype: the constraint is built and run inside the raises block.
    with pytest.raises(ConstraintViolationException):
        ColumnDTypeInSetConstraint({"int64"}).validate(frame, column)
def datetime_column(
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
    tz=None,
):
    """
    Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.

    Args:
        name (str): Name of the column. This must match up with the column name in the dataframe you
            expect to receive.
        min_datetime (Optional[pandas.Timestamp]): The lower bound for values you expect in this
            column. Defaults to pandas.Timestamp.min. When ``tz`` is given, any value accepted by
            the ``pandas.Timestamp`` constructor is coerced to a Timestamp first.
        max_datetime (Optional[pandas.Timestamp]): The upper bound for values you expect in this
            column. Defaults to pandas.Timestamp.max. Coerced the same way as ``min_datetime``
            when ``tz`` is given.
        non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values
            in the column ought to be non null values.
        unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the
            column values.
        ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true,
            the constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable
            cannot both be True.
        is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
            If the column exists the validate function will validate the column. Forwarded to
            PandasColumn unchanged; this function's own default is None (previously documented as
            behaving like True -- confirm against PandasColumn).
        tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin',
            tz='US/Eastern'. Defaults to None, meaning naive datetime values.

    Returns:
        PandasColumn: A column carrying a dtype constraint, an in-range constraint built from the
        (possibly tz-adjusted) bounds, and whatever _construct_keyword_constraints adds.
    """
    if tz is None:
        datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})
    else:
        datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})

        # Normalise the bounds once so the checks below work for any Timestamp-compatible input
        # (e.g. ISO strings), not only for objects that already expose Timestamp's
        # ``replace(tzinfo=...)`` signature.
        min_datetime = Timestamp(min_datetime)
        max_datetime = Timestamp(max_datetime)

        # The absolute min/max are swapped for bounds pulled in from the extremes to prevent
        # OutOfBoundsDatetime errors when converting min/max to be tz aware.
        if min_datetime.replace(tzinfo=None) == Timestamp.min:
            min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")
        if max_datetime.replace(tzinfo=None) == Timestamp.max:
            max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")

        # Naive bounds are interpreted in the column's timezone; bounds that already carry a
        # timezone are left untouched.
        if min_datetime.tz is None:
            min_datetime = min_datetime.tz_localize(tz)
        if max_datetime.tz is None:
            max_datetime = max_datetime.tz_localize(tz)

    return PandasColumn(
        name=check.str_param(name, "name"),
        constraints=[
            datetime_constraint,
            InRangeColumnConstraint(
                min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals
            ),
        ]
        + _construct_keyword_constraints(
            non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals
        ),
        is_required=is_required,
    )
def test_validate_constraints_ok():
    """validate_constraints is expected to return None when column ``foo`` satisfies its constraint."""
    foo_column = PandasColumn(
        name="foo",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
    )
    frame = DataFrame({"foo": ["bar", "baz"]})

    outcome = validate_constraints(frame, pandas_columns=[foo_column])
    assert outcome is None
def test_missing_column_validation_with_optional_column():
    """A column declared with ``is_required=False`` may be absent from the dataframe.

    The dataframe only has ``foo``; the constrained column ``qux`` is missing,
    and validate_constraints is still expected to return None.
    """
    optional_qux = PandasColumn(
        name="qux",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
        is_required=False,
    )
    frame_without_qux = DataFrame({"foo": ["bar", "baz"]})

    outcome = validate_constraints(frame_without_qux, pandas_columns=[optional_qux])
    assert outcome is None
def test_missing_column_validation():
    """A column declared without ``is_required`` must be present in the dataframe.

    The dataframe lacks ``qux``, so validate_constraints is expected to raise
    ConstraintViolationException with a message matching the pattern below.
    """
    required_qux = PandasColumn(
        name="qux",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
    )
    frame_without_qux = DataFrame({"foo": ["bar", "baz"]})
    expected_message = "Required column qux not in dataframe with columns"

    with pytest.raises(ConstraintViolationException, match=expected_message):
        validate_constraints(frame_without_qux, pandas_columns=[required_qux])
def test_missing_column_validation_with_optional_column():
    """Validation is expected to pass (return None) when an optional column is missing."""
    # ``qux`` is constrained but flagged as not required.
    dtype_check = ColumnDTypeInSetConstraint({"object"})
    columns = [PandasColumn(name="qux", constraints=[dtype_check], is_required=False)]

    # The frame deliberately contains only ``foo``.
    frame = DataFrame({"foo": ["bar", "baz"]})

    assert validate_constraints(frame, pandas_columns=columns) is None
def test_dataframe_description_generation_just_type_constraint():
    """The generated type description for a lone dtype constraint must match the expected markdown."""
    foo_column = PandasColumn(
        name="foo",
        constraints=[ColumnDTypeInSetConstraint({"int64"})],
    )
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )

    expected_description = "\n### Columns\n**foo**: `int64`\n\n"
    assert TestDataFrame.description == expected_description
def test_create_pandas_dataframe_dagster_type():
    """create_dagster_pandas_dataframe_type is expected to hand back a DagsterType instance."""
    foo_column = PandasColumn(
        name="foo",
        constraints=[ColumnDTypeInSetConstraint({"int64"})],
    )
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )

    assert isinstance(TestDataFrame, DagsterType)
def test_missing_column_validation():
    """Validation is expected to raise when a column without ``is_required`` is absent."""
    dtype_check = ColumnDTypeInSetConstraint({"object"})
    columns = [PandasColumn(name="qux", constraints=[dtype_check])]

    # Only ``foo`` exists, so ``qux`` cannot be found.
    frame = DataFrame({"foo": ["bar", "baz"]})

    with pytest.raises(
        ConstraintViolationException,
        match="Required column qux not in dataframe with columns",
    ):
        validate_constraints(frame, pandas_columns=columns)
def test_dataframe_description_generation_multi_constraints():
    """The generated description must list the dtype followed by one bullet per extra constraint."""
    foo_constraints = [
        ColumnDTypeInSetConstraint({"int64"}),
        InRangeColumnConstraint(0, 100, ignore_missing_vals=False),
        NonNullableColumnConstraint(),
    ]
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[PandasColumn(name="foo", constraints=foo_constraints)],
    )

    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert TestDataFrame.description == expected_description
def datetime_column(
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
):
    """
    Build a PandasColumn that constrains a column to the 'datetime64[ns]' dtype and a value range.

    Args:
        name (str): Name of the column; checked with ``check.str_param``. It should match the
            column name in the dataframe you expect to receive.
        min_datetime: Lower bound handed to InRangeColumnConstraint. Defaults to
            pandas.Timestamp.min.
        max_datetime: Upper bound handed to InRangeColumnConstraint. Defaults to
            pandas.Timestamp.max.
        non_nullable (Optional[bool]): Forwarded to _construct_keyword_constraints; requests a
            constraint that all values in the column be non-null.
        unique (Optional[bool]): Forwarded to _construct_keyword_constraints; requests a
            uniqueness constraint on the column values.
        ignore_missing_vals (Optional[bool]): Passed both to the range constraint and to
            _construct_keyword_constraints, so that constraints only evaluate non-null data.
            Ignore_missing_vals and non_nullable cannot both be True.
        is_required (Optional[bool]): Flag indicating the optional/required presence of the column,
            forwarded to PandasColumn unchanged. This function's own default is None.

    Returns:
        PandasColumn: The column with its dtype, range and keyword-derived constraints.
    """
    column_name = check.str_param(name, "name")

    # Fixed constraints first, then the optional ones derived from the keyword flags.
    base_constraints = [
        ColumnDTypeInSetConstraint({"datetime64[ns]"}),
        InRangeColumnConstraint(
            min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals
        ),
    ]
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
        ignore_missing_vals=ignore_missing_vals,
    )

    return PandasColumn(
        name=column_name,
        constraints=base_constraints + keyword_constraints,
        is_required=is_required,
    )
def categorical_column(
    name,
    categories,
    of_types=frozenset({"category", "object"}),
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
):
    """
    Build a PandasColumn that constrains a column to given dtypes and a fixed set of categories.

    Args:
        name (str): Name of the column; checked with ``check.str_param``. It should match the
            column name in the dataframe you expect to receive.
        categories (List[Any]): The valid set of buckets handed to CategoricalColumnConstraint.
        of_types (Optional[Union[str, Set[str]]]): The expected dtype[s]. A single string is
            wrapped into a one-element set; anything else is passed through as given.
        non_nullable (Optional[bool]): Forwarded to _construct_keyword_constraints; requests a
            constraint that all values in the column be non-null.
        unique (Optional[bool]): Forwarded to _construct_keyword_constraints; requests a
            uniqueness constraint on the column values.
        ignore_missing_vals (Optional[bool]): Passed both to the categorical constraint and to
            _construct_keyword_constraints, so that constraints only evaluate non-null data.
            Ignore_missing_vals and non_nullable cannot both be True.
        is_required (Optional[bool]): Flag indicating the optional/required presence of the column,
            forwarded to PandasColumn unchanged. This function's own default is None.

    Returns:
        PandasColumn: The column with its dtype, categorical and keyword-derived constraints.
    """
    # A bare dtype name is shorthand for a set containing just that name.
    if isinstance(of_types, str):
        of_types = {of_types}

    column_name = check.str_param(name, "name")

    base_constraints = [
        ColumnDTypeInSetConstraint(of_types),
        CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),
    ]
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
        ignore_missing_vals=ignore_missing_vals,
    )

    return PandasColumn(
        name=column_name,
        constraints=base_constraints + keyword_constraints,
        is_required=is_required,
    )
def test_column_type_constraint():
    """The dtype constraint should accept ``{"object"}`` and reject ``{"int64"}`` for column ``foo``."""
    test_frame = DataFrame({"foo": ["baz"]})

    # Matching dtype set: validate() is expected to return None.
    result = ColumnDTypeInSetConstraint({"object"}).validate(test_frame, "foo")
    assert result is None

    # Non-matching dtype set: validate() is expected to raise.
    with pytest.raises(ConstraintViolationException):
        ColumnDTypeInSetConstraint({"int64"}).validate(test_frame, "foo")
def test_missing_column_validation_with_optional_column(): column_constraints = [ PandasColumn( name="qux", constraints=[ColumnDTypeInSetConstraint({"object"})], is_required=False ), ] dataframe = DataFrame({"foo": ["bar", "baz"]}) assert validate_constraints(dataframe, pandas_columns=column_constraints) is None @pytest.mark.parametrize( "column_constraints, dataframe", [ ( [PandasColumn(name="foo", constraints=[ColumnDTypeInSetConstraint({"int64"})])], DataFrame({"foo": ["bar", "baz"]}), ), ( [PandasColumn(name="foo", constraints=[ColumnDTypeInSetConstraint({"object"})])], DataFrame({"bar": ["bar", "baz"]}), ), ], ) def test_validate_constraints_throw_error(column_constraints, dataframe): with pytest.raises(ConstraintViolationException): validate_constraints(dataframe, pandas_columns=column_constraints) def test_shape_validation_ok(): assert (
constraints=[ColumnDTypeInSetConstraint({'object'})], is_required=False), ] dataframe = DataFrame({'foo': ['bar', 'baz']}) assert validate_constraints(dataframe, pandas_columns=column_constraints) is None @pytest.mark.parametrize( 'column_constraints, dataframe', [ ( [ PandasColumn( name='foo', constraints=[ColumnDTypeInSetConstraint({'int64'})]) ], DataFrame({'foo': ['bar', 'baz']}), ), ( [ PandasColumn( name='foo', constraints=[ColumnDTypeInSetConstraint({'object'})]) ], DataFrame({'bar': ['bar', 'baz']}), ), ], ) def test_validate_constraints_throw_error(column_constraints, dataframe): with pytest.raises(ConstraintViolationException):
def test_validate_constraints_ok():
    """A dataframe whose ``foo`` column meets its dtype constraint should validate to None."""
    dtype_check = ColumnDTypeInSetConstraint({"object"})
    columns = [PandasColumn(name="foo", constraints=[dtype_check])]
    frame = DataFrame({"foo": ["bar", "baz"]})

    assert validate_constraints(frame, pandas_columns=columns) is None
if not rows_with_unexpected_buckets.empty: raise ColumnConstraintViolationException( constraint_name=self.name, constraint_description=self.error_description, column_name=column_name, offending_rows=rows_with_unexpected_buckets, ) CustomTripDataFrame = create_dagster_pandas_dataframe_type( name="CustomTripDataFrame", columns=[ PandasColumn( "amount_paid", constraints=[ ColumnDTypeInSetConstraint({"int64"}), DivisibleByFiveConstraint() ], ) ], ) # end_custom_col @solid( output_defs=[ OutputDefinition(name="custom_trip_dataframe", dagster_type=CustomTripDataFrame) ], ) def load_custom_trip_dataframe(_) -> DataFrame: return read_csv(
rows_with_unexpected_buckets = dataframe[dataframe[column_name].apply(lambda x: x % 5 != 0)] if not rows_with_unexpected_buckets.empty: raise ColumnConstraintViolationException( constraint_name=self.name, constraint_description=self.error_description, column_name=column_name, offending_rows=rows_with_unexpected_buckets, ) CustomTripDataFrame = create_dagster_pandas_dataframe_type( name='CustomTripDataFrame', columns=[ PandasColumn( 'amount_paid', constraints=[ColumnDTypeInSetConstraint({'int64'}), DivisibleByFiveConstraint()], ) ], ) @solid( output_defs=[OutputDefinition(name='custom_trip_dataframe', dagster_type=CustomTripDataFrame)], ) def load_custom_trip_dataframe(_) -> DataFrame: return read_csv( script_relative_path('./ebike_trips.csv'), parse_dates=['start_time', 'end_time'], date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'), )