def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    # Metadata hook: surface the largest pid seen in the validated frame.
    def compute_event_metadata(dataframe):
        max_pid_entry = EventMetadataEntry.text(
            str(max(dataframe['pid'])), 'max_pid', 'maximum pid'
        )
        return [max_pid_entry]

    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({'pid': [1, 2, 3], 'names': ['foo', 'bar', 'baz']}),
            output_name='basic_dataframe',
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success
    # The custom metadata entry must appear on the STEP_OUTPUT type-check data.
    for event in result.event_list:
        if event.event_type_value == 'STEP_OUTPUT':
            entries = event.event_specific_data.type_check_data.metadata_entries
            assert len(entries) == 1
            assert any(entry.label == 'max_pid' for entry in entries)
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    # Metadata hook: report the largest pid via the dict-style metadata API.
    def compute_event_metadata(dataframe):
        return {"max_pid": str(max(dataframe["pid"]))}

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @op(out={"basic_dataframe": Out(dagster_type=BasicDF)})
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @graph
    def basic_graph():
        return create_dataframe()

    result = basic_graph.execute_in_process()
    assert result.success
    # The custom metadata entry must appear on the STEP_OUTPUT type-check data.
    for event in result.all_node_events:
        if event.event_type_value == "STEP_OUTPUT":
            entries = event.event_specific_data.type_check_data.metadata_entries
            assert len(entries) == 1
            assert any(entry.label == "max_pid" for entry in entries)
def test_datetime_column_with_min_max_constraints_ok():
    """Min/max datetime bounds validate whether the bound carries no tz,
    the column's own tz, or a different (but convertible) tz."""
    frame = DataFrame(
        {
            "datetime": [Timestamp("2021-03-14T12:34:56")],
            "datetime_utc_min_max_no_tz": [Timestamp("2021-03-14T12:34:56Z")],
            "datetime_utc_min_max_same_tz": [Timestamp("2021-03-14T12:34:56Z")],
            "datetime_utc_min_max_from_different_tz": [Timestamp("2021-03-14T12:34:56Z")],
        }
    )
    columns = [
        PandasColumn.datetime_column(
            "datetime_utc_min_max_no_tz",
            tz="UTC",
            min_datetime=Timestamp.min,
            max_datetime=Timestamp.max,
        ),
        PandasColumn.datetime_column(
            "datetime_utc_min_max_same_tz",
            tz="UTC",
            min_datetime=Timestamp("2021-01-01T00:00:00Z"),
            max_datetime=Timestamp("2021-12-01T00:00:00Z"),
        ),
        PandasColumn.datetime_column(
            "datetime_utc_min_max_from_different_tz",
            tz="UTC",
            min_datetime=Timestamp("2021-01-01T00:00:00Z", tz="US/Eastern"),
            max_datetime=Timestamp("2021-12-01T00:00:00Z"),
        ),
    ]
    # validate_constraints returns None on success.
    assert validate_constraints(frame, pandas_columns=columns) is None
def test_shape_validation_throw_error():
    # A one-row frame against a two-row requirement must raise.
    frame = DataFrame({"foo": [2], "bar": ["hello"]})
    columns = [
        PandasColumn.integer_column("foo", min_value=0),
        PandasColumn.string_column("bar"),
    ]
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            frame,
            pandas_columns=columns,
            dataframe_constraints=[RowCountConstraint(2)],
        )
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """A dataframe type built with event_metadata_fn=None still type-checks."""
    df_type = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=None,
    )
    assert isinstance(df_type, DagsterType)
    type_check = check_dagster_type(df_type, DataFrame({"pid": [1], "names": ["foo"]}))
    assert type_check.success
def test_shape_validation_ok():
    # Exactly-one-row constraint is satisfied; None signals success.
    frame = DataFrame({'foo': [2], 'bar': ['hello']})
    columns = [
        PandasColumn.integer_column('foo', min_value=0),
        PandasColumn.string_column('bar'),
    ]
    outcome = validate_constraints(
        frame,
        pandas_columns=columns,
        dataframe_constraints=[RowCountConstraint(1)],
    )
    assert outcome is None
def test_shape_validation_ok():
    # The single-row frame meets the RowCountConstraint(1); success is None.
    outcome = validate_constraints(
        DataFrame({"foo": [2], "bar": ["hello"]}),
        pandas_columns=[
            PandasColumn.integer_column("foo", min_value=0),
            PandasColumn.string_column("bar"),
        ],
        dataframe_constraints=[RowCountConstraint(1)],
    )
    assert outcome is None
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """Passing event_metadata_fn=None still yields a working DagsterType."""
    df_type = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=None,
    )
    assert isinstance(df_type, DagsterType)
    type_check = check_dagster_type(df_type, DataFrame({'pid': [1], 'names': ['foo']}))
    assert type_check.success
def test_shape_validation_throw_error():
    # RowCountConstraint(2) cannot be met by a single-row frame.
    frame = DataFrame({'foo': [2], 'bar': ['hello']})
    columns = [
        PandasColumn.integer_column('foo', min_value=0),
        PandasColumn.string_column('bar'),
    ]
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            frame,
            pandas_columns=columns,
            dataframe_constraints=[RowCountConstraint(2)],
        )
def test_validate_constraints_ok():
    # An object-dtype column satisfies its ColumnTypeConstraint('object').
    frame = DataFrame({'foo': ['bar', 'baz']})
    constraints = [PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')])]
    assert validate_constraints(frame, pandas_columns=constraints) is None
def test_custom_dagster_dataframe_hydration_ok():
    """Round-trip a custom dataframe type through csv input hydration and
    csv output materialization."""
    source_df = DataFrame({'foo': [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp, safe_tempfile_path() as output_csv_fp:
        source_df.to_csv(input_csv_fp)
        TestDataFrame = create_dagster_pandas_dataframe_type(
            name='TestDataFrame',
            columns=[PandasColumn.exists('foo')],
        )

        @solid(
            input_defs=[InputDefinition('test_df', TestDataFrame)],
            output_defs=[OutputDefinition(TestDataFrame)],
        )
        def use_test_dataframe(_, test_df):
            # Mutate the hydrated frame so the materialized csv differs from input.
            test_df['bar'] = [2, 4, 6]
            return test_df

        run_config = {
            'solids': {
                'use_test_dataframe': {
                    'inputs': {'test_df': {'csv': {'path': input_csv_fp}}},
                    'outputs': [{'result': {'csv': {'path': output_csv_fp}}}],
                }
            }
        }
        solid_result = execute_solid(use_test_dataframe, run_config=run_config)
        assert solid_result.success
        solid_output_df = read_csv(output_csv_fp)
        assert all(solid_output_df['bar'] == [2, 4, 6])
def test_missing_column_validation_with_optional_column():
    # A non-required column may be absent from the frame without raising.
    optional_column = PandasColumn(
        name="qux",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
        is_required=False,
    )
    frame = DataFrame({"foo": ["bar", "baz"]})
    assert validate_constraints(frame, pandas_columns=[optional_column]) is None
def test_create_pandas_dataframe_dagster_type():
    # The factory should hand back a RuntimeType instance.
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])
    df_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )
    assert isinstance(df_type, RuntimeType)
def test_missing_column_validation_with_optional_column():
    # is_required=False means the absent 'qux' column is tolerated.
    frame = DataFrame({'foo': ['bar', 'baz']})
    columns = [
        PandasColumn(
            name='qux',
            constraints=[ColumnDTypeInSetConstraint({'object'})],
            is_required=False,
        ),
    ]
    assert validate_constraints(frame, pandas_columns=columns) is None
def test_dataframe_description_generation_just_type_constraint():
    # A lone dtype constraint renders as a single markdown column entry.
    df_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[PandasColumn(name="foo", constraints=[ColumnDTypeInSetConstraint({"int64"})])],
    )
    expected = "\n### Columns\n**foo**: `int64`\n\n"
    assert df_type.description == expected
def test_missing_column_validation():
    # A required column missing from the frame raises with a helpful message.
    required_column = PandasColumn(
        name="qux", constraints=[ColumnDTypeInSetConstraint({"object"})]
    )
    frame = DataFrame({"foo": ["bar", "baz"]})
    with pytest.raises(
        ConstraintViolationException,
        match="Required column qux not in dataframe with columns",
    ):
        validate_constraints(frame, pandas_columns=[required_column])
def test_create_pandas_dataframe_dagster_type():
    # The factory should hand back a DagsterType instance.
    foo_column = PandasColumn(name="foo", constraints=[ColumnDTypeInSetConstraint({"int64"})])
    df_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )
    assert isinstance(df_type, DagsterType)
def test_dataframe_description_generation_just_type_constraint():
    # A single ColumnTypeConstraint renders as one markdown column entry.
    df_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])],
    )
    expected = "\n### Columns\n**foo**: `int64`\n\n"
    assert df_type.description == expected
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    # Metadata hook: attach the largest pid to the type-check event.
    def compute_event_metadata(dataframe):
        max_pid_entry = EventMetadataEntry.text(
            str(max(dataframe["pid"])), "max_pid", "maximum pid"
        )
        return [max_pid_entry]

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name="basic_dataframe", dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success
    # Every STEP_OUTPUT event must carry exactly our one metadata entry.
    for event in result.event_list:
        if event.event_type_value == "STEP_OUTPUT":
            entries = event.event_specific_data.type_check_data.metadata_entries
            assert len(entries) == 1
            assert any(entry.label == "max_pid" for entry in entries)
def test_missing_column_validation():
    # 'qux' is required but absent, so validation must raise.
    frame = DataFrame({'foo': ['bar', 'baz']})
    columns = [
        PandasColumn(name='qux', constraints=[ColumnDTypeInSetConstraint({'object'})]),
    ]
    with pytest.raises(
        ConstraintViolationException,
        match="Required column qux not in dataframe with columns",
    ):
        validate_constraints(frame, pandas_columns=columns)
def test_datetime_column_with_tz_validation_fails_when_incorrect_tz():
    # A tz-naive timestamp cannot satisfy a tz="UTC" column constraint.
    naive_frame = DataFrame({"datetime_utc": [Timestamp("2021-03-14T12:34:56")]})
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            naive_frame,
            pandas_columns=[PandasColumn.datetime_column("datetime_utc", tz="UTC")],
        )
def test_custom_dagster_dataframe_parametrizable_input():
    """Wire custom input-selector and output-selector schemas into a
    dagster-pandas dataframe type and run them through a solid."""

    @input_selector_schema(
        Selector({'door_a': Field(str), 'door_b': Field(str), 'door_c': Field(str)})
    )
    def silly_hydrator(_, which_door, _field):
        # Dispatch table replaces the original if/elif chain; same outputs.
        prizes = {
            'door_a': DataFrame({'foo': ['goat']}),
            'door_b': DataFrame({'foo': ['car']}),
            'door_c': DataFrame({'foo': ['goat']}),
        }
        if which_door not in prizes:
            raise DagsterInvariantViolationError(
                'You did not pick a door. You chose: {which_door}'.format(which_door=which_door)
            )
        return prizes[which_door]

    @output_selector_schema(Selector({'devnull': Field(str), 'nothing': Field(str)}))
    def silly_materializer(_, _location, _field, _value):
        return Materialization(label='did nothing', description='just one of those days')

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn.exists('foo')],
        input_hydration_config=silly_hydrator,
        output_materialization_config=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition('df', TestDataFrame)],
        output_defs=[OutputDefinition(TestDataFrame)],
    )
    def did_i_win(_, df):
        return df

    run_config = {
        'solids': {
            'did_i_win': {
                'inputs': {'df': {'door_a': 'bar'}},
                'outputs': [{'result': {'devnull': 'baz'}}],
            }
        }
    }
    solid_result = execute_solid(did_i_win, run_config=run_config)
    assert solid_result.success
    output_df = solid_result.output_value()
    assert isinstance(output_df, DataFrame)
    # door_a hydrates to the 'goat' frame.
    assert output_df['foo'].tolist() == ['goat']
    materializations = solid_result.materialization_events_during_compute
    assert len(materializations) == 1
    assert materializations[0].event_specific_data.materialization.label == 'did nothing'
def test_datetime_column_with_tz_validation_ok():
    """Datetime columns validate across a range of timezones, including
    tz-naive and fractional-offset (Pacific/Chatham) zones."""
    frame = DataFrame(
        {
            "datetime": [Timestamp("2021-03-14T12:34:56")],
            "datetime_utc": [Timestamp("2021-03-14T12:34:56Z")],
            "datetime_dublin": [Timestamp("2021-03-14T12:34:56", tz="Europe/Dublin")],
            "datetime_est": [Timestamp("2021-03-14T12:34:56", tz="US/Eastern")],
            "datetime_chatham": [Timestamp("2021-03-14T12:34:56", tz="Pacific/Chatham")],
            "datetime_utc_with_min_max": [Timestamp("2021-03-14T12:34:56Z")],
        }
    )
    columns = [
        PandasColumn.datetime_column("datetime"),
        PandasColumn.datetime_column("datetime_utc", tz="UTC"),
        PandasColumn.datetime_column("datetime_dublin", tz="Europe/Dublin"),
        PandasColumn.datetime_column("datetime_est", tz="US/Eastern"),
        PandasColumn.datetime_column("datetime_chatham", tz="Pacific/Chatham"),
    ]
    # None indicates every column constraint passed.
    assert validate_constraints(frame, pandas_columns=columns) is None
def test_custom_dagster_dataframe_loading_ok():
    """Load a custom dataframe type from csv, mutate it in an op, and
    materialize the result back to csv."""
    source_df = DataFrame({"foo": [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp, safe_tempfile_path() as output_csv_fp:
        source_df.to_csv(input_csv_fp)
        TestDataFrame = create_dagster_pandas_dataframe_type(
            name="TestDataFrame",
            columns=[PandasColumn.exists("foo")],
        )

        @op(ins={"test_df": In(TestDataFrame)}, out=Out(TestDataFrame))
        def use_test_dataframe(_, test_df):
            # Mutate the loaded frame so the materialized csv differs from input.
            test_df["bar"] = [2, 4, 6]
            return test_df

        @graph
        def basic_graph():
            use_test_dataframe()

        run_config = {
            "ops": {
                "use_test_dataframe": {
                    "inputs": {"test_df": {"csv": {"path": input_csv_fp}}},
                    "outputs": [{"result": {"csv": {"path": output_csv_fp}}}],
                }
            }
        }
        result = basic_graph.execute_in_process(run_config=run_config)
        assert result.success
        output_df = read_csv(output_csv_fp)
        assert all(output_df["bar"] == [2, 4, 6])
def test_custom_dagster_dataframe_loading_ok():
    """csv round-trip of a custom dataframe type through a solid: hydrate
    from an input csv, add a column, materialize to an output csv."""
    source_df = DataFrame({"foo": [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp, safe_tempfile_path() as output_csv_fp:
        source_df.to_csv(input_csv_fp)
        TestDataFrame = create_dagster_pandas_dataframe_type(
            name="TestDataFrame",
            columns=[PandasColumn.exists("foo")],
        )

        @solid(
            input_defs=[InputDefinition("test_df", TestDataFrame)],
            output_defs=[OutputDefinition(TestDataFrame)],
        )
        def use_test_dataframe(_, test_df):
            test_df["bar"] = [2, 4, 6]
            return test_df

        run_config = {
            "solids": {
                "use_test_dataframe": {
                    "inputs": {"test_df": {"csv": {"path": input_csv_fp}}},
                    "outputs": [{"result": {"csv": {"path": output_csv_fp}}}],
                }
            }
        }
        solid_result = execute_solid(use_test_dataframe, run_config=run_config)
        assert solid_result.success
        solid_output_df = read_csv(output_csv_fp)
        assert all(solid_output_df["bar"] == [2, 4, 6])
def test_dataframe_description_generation_multi_constraints():
    # Multiple constraints on one column render as markdown bullet points.
    foo_constraints = [
        ColumnTypeConstraint('int64'),
        InRangeColumnConstraint(0, 100),
        NonNullableColumnConstraint(),
    ]
    df_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn(name='foo', constraints=foo_constraints)],
    )
    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert df_type.description == expected_description
def test_dataframe_description_generation_multi_constraints():
    # Multiple constraints on one column render as markdown bullet points.
    foo_constraints = [
        ColumnDTypeInSetConstraint({"int64"}),
        InRangeColumnConstraint(0, 100, ignore_missing_vals=False),
        NonNullableColumnConstraint(),
    ]
    df_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[PandasColumn(name="foo", constraints=foo_constraints)],
    )
    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert df_type.description == expected_description
EventMetadataEntry.text( str(min(dataframe["start_time"])), "min_start_time", "Date data collection started", ), EventMetadataEntry.text(str(max(dataframe["end_time"])), "max_end_time", "Timestamp of last trip"), EventMetadataEntry.text(str(len(dataframe)), "n_rows", "Number of rows seen in the dataframe"), EventMetadataEntry.text(str(dataframe.columns), "columns", "Keys of columns seen in the dataframe"), ] TripDataFrameSchema = [ PandasColumn.integer_column("bike_id", min_value=0), PandasColumn.datetime_column( "start_time", min_datetime=Timestamp(year=2017, month=1, day=1), ), PandasColumn.datetime_column( "end_time", min_datetime=Timestamp(year=2017, month=1, day=1), ), PandasColumn.string_column("interval_date"), ] RawTripDataFrame = create_dagster_pandas_dataframe_type( name="RawTripDataFrame", columns=[ PandasColumn(column.name) for column in TripDataFrameSchema
EventMetadataEntry.text( str(min(dataframe['start_time'])), 'min_start_time', 'Date data collection started', ), EventMetadataEntry.text(str(max(dataframe['end_time'])), 'max_end_time', 'Timestamp of last trip'), EventMetadataEntry.text(str(len(dataframe)), 'n_rows', 'Number of rows seen in the dataframe'), EventMetadataEntry.text(str(dataframe.columns), 'columns', 'Keys of columns seen in the dataframe'), ] TripDataFrameSchema = [ PandasColumn.integer_column('bike_id', min_value=0), PandasColumn.datetime_column( 'start_time', min_datetime=Timestamp(year=2018, month=1, day=1), ), PandasColumn.datetime_column( 'end_time', min_datetime=Timestamp(year=2018, month=1, day=1), ), PandasColumn.string_column('interval_date'), ] RawTripDataFrame = create_dagster_pandas_dataframe_type( name='RawTripDataFrame', columns=[ PandasColumn(column.name) for column in TripDataFrameSchema
PandasColumn(name='qux', constraints=[ColumnDTypeInSetConstraint({'object'})], is_required=False), ] dataframe = DataFrame({'foo': ['bar', 'baz']}) assert validate_constraints(dataframe, pandas_columns=column_constraints) is None @pytest.mark.parametrize( 'column_constraints, dataframe', [ ( [ PandasColumn( name='foo', constraints=[ColumnDTypeInSetConstraint({'int64'})]) ], DataFrame({'foo': ['bar', 'baz']}), ), ( [ PandasColumn( name='foo', constraints=[ColumnDTypeInSetConstraint({'object'})]) ], DataFrame({'bar': ['bar', 'baz']}), ), ], ) def test_validate_constraints_throw_error(column_constraints, dataframe):