"Date data collection ended", ), EventMetadataEntry.text( str(dataframe["bike_id"].nunique()), "num_unique_bikes", "Number of unique bikes that took trips", ), EventMetadataEntry.text(str(len(dataframe)), "n_rows", "Number of rows seen in the dataframe"), EventMetadataEntry.text(str(dataframe.columns), "columns", "Keys of columns seen in the dataframe"), ] SummaryStatsTripDataFrame = create_dagster_pandas_dataframe_type( name="SummaryStatsTripDataFrame", event_metadata_fn=compute_trip_dataframe_summary_statistics) @solid(output_defs=[ OutputDefinition(name="summary_stats_trip_dataframe", dagster_type=SummaryStatsTripDataFrame) ]) def load_summary_stats_trip_dataframe(_) -> DataFrame: return read_csv( script_relative_path("./ebike_trips.csv"), parse_dates=["start_time", "end_time"], date_parser=lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f"), )
# NOTE(review): this chunk begins mid-expression — the tail of a custom
# constraint's validate method (it reads self.name / self.error_description);
# the method header and enclosing class are not visible here.
lambda x: x % 5 != 0)]
        if not rows_with_unexpected_buckets.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_unexpected_buckets,
            )


# "amount_paid" must both be int64 dtype AND pass the custom
# divisible-by-five constraint defined above.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    name="CustomTripDataFrame",
    columns=[
        PandasColumn(
            "amount_paid",
            constraints=[
                ColumnDTypeInSetConstraint({"int64"}),
                DivisibleByFiveConstraint()
            ],
        )
    ],
)
# end_custom_col


@solid(
    output_defs=[
        OutputDefinition(name="custom_trip_dataframe",
                         dagster_type=CustomTripDataFrame)
    ],
)
def load_custom_trip_dataframe(_) -> DataFrame:
    # NOTE(review): chunk ends mid-call — the read_csv arguments are not
    # visible here.
    return read_csv(
# Dagster-pandas dataframe type describing the expected schema of metadata.csv.
# Columns are matched by name; several candidate columns are commented out
# below — presumably deliberately excluded from validation, TODO confirm.
metadata_dataframe_types = dp.create_dagster_pandas_dataframe_type(
    name="metadata_dataframe_types",
    description="Dataframe type to validate the metadata.csv",
    columns=[
        # "Source ID" is required, unique and non-null — it acts as the row key.
        dp.PandasColumn.string_column("Source ID", unique=True, non_nullable=True),
        # SSID must be unique when present, but missing values are tolerated.
        dp.PandasColumn.string_column("SSID", unique=True, ignore_missing_vals=True),
        dp.PandasColumn.string_column("Title"),
        # dp.PandasColumn.string_column(
        #     "Description (English)"),
        dp.PandasColumn.string_column("Description (Portuguese)"),
        dp.PandasColumn.string_column("Date"),
        # Year bounds are validated by a custom integer-column constraint.
        dp.PandasColumn("First Year", constraints=[int_column()]),
        dp.PandasColumn("Last Year", constraints=[int_column()]),
        dp.PandasColumn.string_column("Type"),
        dp.PandasColumn.string_column("Collections"),
        dp.PandasColumn.string_column("Source"),
        dp.PandasColumn("Source URL", constraints=[url_column()]),
        dp.PandasColumn.string_column("Materials"),
        dp.PandasColumn.string_column("Fabrication Method"),
        dp.PandasColumn.string_column("Rights", ignore_missing_vals=True),
        # dp.PandasColumn.string_column(
        #     "License", ignore_missing_vals=True),
        # dp.PandasColumn.string_column(
        #     "Attribution", ignore_missing_vals=True),
        # Physical dimensions and geolocation of the depicted item.
        dp.PandasColumn("Width (mm)", constraints=[int_column()]),
        dp.PandasColumn("Height (mm)", constraints=[int_column()]),
        dp.PandasColumn("Latitude", constraints=[float_column()]),
        dp.PandasColumn("Longitude", constraints=[float_column()]),
        dp.PandasColumn.string_column("Depicts"),
        # NOTE(review): "Wikidata ID" is validated as a URL, not an ID —
        # presumably the column holds full Wikidata URLs; confirm against data.
        dp.PandasColumn("Wikidata ID", constraints=[url_column()]),
        dp.PandasColumn("Smapshot ID", constraints=[int_column()]),
        dp.PandasColumn("Media URL", constraints=[url_column()]),
    ],
)
from datetime import datetime

from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path

# Earliest timestamp accepted for both the trip start and end columns.
_MIN_TRIP_DATETIME = datetime(year=2020, month=2, day=10)

# Validated trip frame: non-negative bike ids, a known color, bounded
# timestamps, a station name, an "amount_paid" column of any dtype, and a
# boolean membership flag.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name="TripDataFrame",
    columns=[
        PandasColumn.integer_column("bike_id", min_value=0),
        PandasColumn.categorical_column("color", categories={"blue", "green", "red"}),
        PandasColumn.datetime_column("start_time", min_datetime=_MIN_TRIP_DATETIME),
        PandasColumn.datetime_column("end_time", min_datetime=_MIN_TRIP_DATETIME),
        PandasColumn.string_column("station"),
        PandasColumn.exists("amount_paid"),
        PandasColumn.boolean_column("was_member"),
    ],
)


@solid(output_defs=[OutputDefinition(name="trip_dataframe", dagster_type=TripDataFrame)])
def load_trip_dataframe(_) -> DataFrame:
    """Read ebike_trips.csv (resolved relative to this script) into a pandas
    DataFrame, parsing the start/end time columns with an explicit
    microsecond-precision strptime format."""
    parse_timestamp = lambda raw: datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f")
    return read_csv(
        script_relative_path("./ebike_trips.csv"),
        parse_dates=["start_time", "end_time"],
        date_parser=parse_timestamp,
    )
from datetime import datetime

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path
from dagster_pandas import RowCountConstraint, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

# start_create_type
# A dataframe type with no per-column checks: validation only asserts that the
# frame contains exactly four rows.
ShapeConstrainedTripDataFrame = create_dagster_pandas_dataframe_type(
    name="ShapeConstrainedTripDataFrame",
    dataframe_constraints=[RowCountConstraint(4)],
)
# end_create_type


@solid(
    output_defs=[
        OutputDefinition(
            name="shape_constrained_trip_dataframe",
            dagster_type=ShapeConstrainedTripDataFrame,
        )
    ]
)
def load_shape_constrained_trip_dataframe(_) -> DataFrame:
    """Read ebike_trips.csv (relative to this script) into a DataFrame,
    parsing start/end times with a microsecond-precision strptime format."""
    return read_csv(
        script_relative_path("./ebike_trips.csv"),
        parse_dates=["start_time", "end_time"],
        date_parser=lambda ts: datetime.strptime(ts, "%Y-%m-%d %H:%M:%S.%f"),
    )


@pipeline
def shape_constrained_pipeline():
    """Single-solid pipeline exercising the shape-constrained dataframe type."""
    load_shape_constrained_trip_dataframe()
# NOTE(review): this chunk appears to have lost its class indentation —
# `validate` reads like a method of a custom column constraint (it uses
# self.name / self.error_description); the enclosing class header is not
# visible here.
def validate(self, dataframe, column_name):
    """Raise ColumnConstraintViolationException if any value in
    `column_name` is not evenly divisible by five, attaching the offending
    rows to the exception."""
    rows_with_unexpected_buckets = dataframe[dataframe[column_name].apply(lambda x: x % 5 != 0)]
    if not rows_with_unexpected_buckets.empty:
        raise ColumnConstraintViolationException(
            constraint_name=self.name,
            constraint_description=self.error_description,
            column_name=column_name,
            offending_rows=rows_with_unexpected_buckets,
        )


# "amount_paid" must both be int64 dtype AND pass the custom
# divisible-by-five constraint above.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    name='CustomTripDataFrame',
    columns=[
        PandasColumn(
            'amount_paid',
            constraints=[ColumnDTypeInSetConstraint({'int64'}), DivisibleByFiveConstraint()],
        )
    ],
)


@solid(
    output_defs=[OutputDefinition(name='custom_trip_dataframe', dagster_type=CustomTripDataFrame)],
)
def load_custom_trip_dataframe(_) -> DataFrame:
    """Load ebike_trips.csv (relative to this script), parsing start/end
    times with a microsecond-precision strptime format."""
    return read_csv(
        script_relative_path('./ebike_trips.csv'),
        parse_dates=['start_time', 'end_time'],
        date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'),
    )
from dagster import solid, SolidExecutionContext, Field, Array, String
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame
from typing import Any, Optional, List, TYPE_CHECKING

from azmeta.access.resource_graph import query_dataframe

# For the static type checker the runtime dagster-pandas type is replaced
# with Any, since pandas ships no type information for frame contents.
if TYPE_CHECKING:
    ResourcesDataFrame = Any  # DataFrame # Pandas has no type info yet.
else:
    # At runtime, validate that resource frames carry string
    # resource_id / subscription_id columns.
    ResourcesDataFrame = create_dagster_pandas_dataframe_type(
        name='ResourcesDataFrame',
        columns=[
            PandasColumn.string_column('resource_id'),
            PandasColumn.string_column('subscription_id'),
        ],
    )


# NOTE(review): chunk ends mid-decorator — the decorated solid's signature
# and body are not visible here.
@solid(
    config_schema={
        'subscriptions': Field(Array(String),
                               description='The subscriptions to query in the Resource Graph.'),
        'filters': Field(String, is_required=False,
                         description='Conditions for a KQL where operator.'),
        'custom_projections': Field(String, is_required=False,
                                    description='Assignments for a KQL project operator.'),
# NOTE(review): this chunk begins mid-import — the tail of a
# `from ... import (...)` whose module path is not visible here.
build_perf_counter_percentile_query, build_disk_percentile_query)
from azmeta.access.utils.chunking import build_grouped_chunk_list
from .resources import ResourcesDataFrame
from .specifications import AzureComputeSpecifications

# Same TYPE_CHECKING pattern as the resources module: Any for the checker,
# a validated dagster-pandas type at runtime.
if TYPE_CHECKING:
    UtilizationDataFrame = Any  # DataFrame # Pandas has no type info yet.
else:
    # Per-resource utilization percentiles plus the max and sample count.
    UtilizationDataFrame = create_dagster_pandas_dataframe_type(
        name='UtilizationDataFrame',
        columns=[
            PandasColumn.string_column('resource_id'),
            PandasColumn.float_column('percentile_50th'),
            PandasColumn.float_column('percentile_80th'),
            PandasColumn.float_column('percentile_90th'),
            PandasColumn.float_column('percentile_95th'),
            PandasColumn.float_column('percentile_99th'),
            PandasColumn.float_column('max'),
            PandasColumn.integer_column('samples'),
        ],
    )


@solid(required_resource_keys={'azure_monitor'})
def query_cpu_utilization(
        context: SolidExecutionContext,
        resources: ResourcesDataFrame) -> UtilizationDataFrame:
    # Pre-bind the CPU "% Processor Time" counter spec; the builder is
    # presumably completed with per-call arguments further down — the rest
    # of this solid is not visible in this chunk.
    builder = functools.partial(build_perf_counter_percentile_query,
                                spec=PerformanceCounterSpec(
                                    "Processor", "% Processor Time", "_Total"))
# NOTE(review): this chunk calls `datetime(...)` below but no
# `from datetime import datetime` is visible — presumably the import line
# was cut above this chunk; confirm against the full file.
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path

# Validated trip frame: non-negative bike ids, a known color, timestamps
# bounded below, a station name, an "amount_paid" column of any dtype, and
# a boolean membership flag.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    columns=[
        PandasColumn.integer_column('bike_id', min_value=0),
        PandasColumn.categorical_column('color', categories={'red', 'green', 'blue'}),
        PandasColumn.datetime_column('start_time', min_datetime=datetime(year=2020, month=2, day=10)),
        PandasColumn.datetime_column('end_time', min_datetime=datetime(year=2020, month=2, day=10)),
        PandasColumn.string_column('station'),
        PandasColumn.exists('amount_paid'),
        PandasColumn.boolean_column('was_member'),
    ],
)


@solid(output_defs=[
    OutputDefinition(name='trip_dataframe', dagster_type=TripDataFrame)
])
# NOTE(review): chunk ends at the def line — the solid body is not visible.
def load_trip_dataframe(_) -> DataFrame:
# NOTE(review): this chunk begins mid-list — these EventMetadataEntry items
# are the tail of compute_exchange_rate_dataframe_summary_statistics, whose
# header is not visible here.
    EventMetadataEntry.text(
        # "day" is declared a string column below, so max() already yields a
        # str and needs no str() wrapper like the numeric entries do.
        max(dataframe["day"]),
        "max_day",
        "Maximum date of exchange rates",
    ),
    EventMetadataEntry.text(
        str(dataframe["day"].nunique()),
        "num_unique_day",
        "Total unique dates of exchange rates",
    ),
    EventMetadataEntry.text(
        str(dataframe["currency"].nunique()),
        "num_unique_currency",
        "Total unique currencies of exchange rates",
    ),
    EventMetadataEntry.text(str(len(dataframe)), "n_rows", "Number of rows seen in the dataframe"),
]


# Exchange-rate frame: string id/day/currency columns plus a numeric rate;
# the summary-statistics metadata above is emitted on each materialization.
ExchangeRateDataFrame = create_dagster_pandas_dataframe_type(
    name="ExchangeRateDataFrame",
    columns=[
        PandasColumn.string_column("id"),
        PandasColumn.string_column("day"),
        PandasColumn.string_column("currency"),
        PandasColumn.numeric_column("rate"),
    ],
    event_metadata_fn=compute_exchange_rate_dataframe_summary_statistics,
)