예제 #1
0
            "Date data collection ended",
        ),
        EventMetadataEntry.text(
            str(dataframe["bike_id"].nunique()),
            "num_unique_bikes",
            "Number of unique bikes that took trips",
        ),
        EventMetadataEntry.text(str(len(dataframe)), "n_rows",
                                "Number of rows seen in the dataframe"),
        EventMetadataEntry.text(str(dataframe.columns), "columns",
                                "Keys of columns seen in the dataframe"),
    ]


# Dataframe type with no column constraints; it registers
# compute_trip_dataframe_summary_statistics as its event_metadata_fn.
SummaryStatsTripDataFrame = create_dagster_pandas_dataframe_type(
    name="SummaryStatsTripDataFrame",
    event_metadata_fn=compute_trip_dataframe_summary_statistics,
)


# Solid that reads ./ebike_trips.csv (path resolved through
# script_relative_path) and emits it on the "summary_stats_trip_dataframe"
# output, which is typed as SummaryStatsTripDataFrame.
@solid(
    output_defs=[
        OutputDefinition(
            name="summary_stats_trip_dataframe",
            dagster_type=SummaryStatsTripDataFrame,
        )
    ]
)
def load_summary_stats_trip_dataframe(_) -> DataFrame:
    def _parse_timestamp(raw):
        # The expected format includes fractional seconds (%f).
        return datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f")

    csv_path = script_relative_path("./ebike_trips.csv")
    return read_csv(
        csv_path,
        parse_dates=["start_time", "end_time"],
        date_parser=_parse_timestamp,
    )

            lambda x: x % 5 != 0)]
        if not rows_with_unexpected_buckets.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_unexpected_buckets,
            )


# Dataframe type that constrains a single column, "amount_paid": its dtype
# must be in {"int64"} and it must pass DivisibleByFiveConstraint.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    name="CustomTripDataFrame",
    columns=[
        PandasColumn(
            "amount_paid",
            constraints=[ColumnDTypeInSetConstraint({"int64"}), DivisibleByFiveConstraint()],
        ),
    ],
)
# end_custom_col


@solid(
    output_defs=[
        OutputDefinition(name="custom_trip_dataframe",
                         dagster_type=CustomTripDataFrame)
    ], )
def load_custom_trip_dataframe(_) -> DataFrame:
    return read_csv(
예제 #3
0
# Dagster dataframe type listing the columns expected in metadata.csv.
# Columns are declared in the same order as before; runs of columns that share
# the same kind of constraint are generated from a tuple of labels.
metadata_dataframe_types = dp.create_dagster_pandas_dataframe_type(
    name="metadata_dataframe_types",
    description="Dataframe type to validate the metadata.csv",
    columns=[
        # Identifier columns: both must be unique; only "Source ID" is
        # declared non-nullable, "SSID" ignores missing values.
        dp.PandasColumn.string_column("Source ID", unique=True, non_nullable=True),
        dp.PandasColumn.string_column("SSID", unique=True, ignore_missing_vals=True),
        dp.PandasColumn.string_column("Title"),
        # Disabled column declaration, kept for reference:
        # dp.PandasColumn.string_column(
        #     "Description (English)"),
        *(
            dp.PandasColumn.string_column(label)
            for label in ("Description (Portuguese)", "Date")
        ),
        *(
            dp.PandasColumn(label, constraints=[int_column()])
            for label in ("First Year", "Last Year")
        ),
        *(
            dp.PandasColumn.string_column(label)
            for label in ("Type", "Collections", "Source")
        ),
        dp.PandasColumn("Source URL", constraints=[url_column()]),
        *(
            dp.PandasColumn.string_column(label)
            for label in ("Materials", "Fabrication Method")
        ),
        dp.PandasColumn.string_column("Rights", ignore_missing_vals=True),
        # Disabled column declarations, kept for reference:
        # dp.PandasColumn.string_column(
        #     "License", ignore_missing_vals=True),
        # dp.PandasColumn.string_column(
        #    "Attribution", ignore_missing_vals=True),
        *(
            dp.PandasColumn(label, constraints=[int_column()])
            for label in ("Width (mm)", "Height (mm)")
        ),
        *(
            dp.PandasColumn(label, constraints=[float_column()])
            for label in ("Latitude", "Longitude")
        ),
        dp.PandasColumn.string_column("Depicts"),
        dp.PandasColumn("Wikidata ID", constraints=[url_column()]),
        dp.PandasColumn("Smapshot ID", constraints=[int_column()]),
        dp.PandasColumn("Media URL", constraints=[url_column()]),
    ],
)
예제 #4
0
from datetime import datetime

from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path

# Dataframe type describing the trip data columns and their constraints.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name="TripDataFrame",
    columns=[
        PandasColumn.integer_column("bike_id", min_value=0),
        PandasColumn.categorical_column(
            "color",
            categories={"red", "green", "blue"},
        ),
        # Both timestamp columns share the same lower bound (2020-02-10).
        *(
            PandasColumn.datetime_column(column, min_datetime=datetime(2020, 2, 10))
            for column in ("start_time", "end_time")
        ),
        PandasColumn.string_column("station"),
        PandasColumn.exists("amount_paid"),
        PandasColumn.boolean_column("was_member"),
    ],
)


# Solid that reads ./ebike_trips.csv (path resolved through
# script_relative_path) and emits it on the TripDataFrame-typed
# "trip_dataframe" output.
@solid(
    output_defs=[
        OutputDefinition(name="trip_dataframe", dagster_type=TripDataFrame),
    ]
)
def load_trip_dataframe(_) -> DataFrame:
    timestamp_format = "%Y-%m-%d %H:%M:%S.%f"

    def _to_datetime(text):
        # Parse one timestamp string using the fixed format above.
        return datetime.strptime(text, timestamp_format)

    trips_csv = script_relative_path("./ebike_trips.csv")
    return read_csv(
        trips_csv,
        parse_dates=["start_time", "end_time"],
        date_parser=_to_datetime,
    )
예제 #5
0
from datetime import datetime

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path
from dagster_pandas import RowCountConstraint, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

# start_create_type
ShapeConstrainedTripDataFrame = create_dagster_pandas_dataframe_type(
    name="ShapeConstrainedTripDataFrame",
    # Dataframe-level (not per-column) constraint on the row count.
    dataframe_constraints=[
        RowCountConstraint(4),
    ],
)
# end_create_type


# Solid that reads ./ebike_trips.csv (path resolved through
# script_relative_path) and emits it on the output typed as
# ShapeConstrainedTripDataFrame.
@solid(
    output_defs=[
        OutputDefinition(
            name="shape_constrained_trip_dataframe",
            dagster_type=ShapeConstrainedTripDataFrame,
        )
    ]
)
def load_shape_constrained_trip_dataframe(_) -> DataFrame:
    def _parse_timestamp(raw):
        # The expected format includes fractional seconds (%f).
        return datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f")

    date_columns = ["start_time", "end_time"]
    source_csv = script_relative_path("./ebike_trips.csv")
    return read_csv(source_csv, parse_dates=date_columns, date_parser=_parse_timestamp)


# Pipeline whose body consists of a single solid invocation: the loader for
# the shape-constrained trip dataframe.
@pipeline
def shape_constrained_pipeline():
    # Calling the solid here is the pipeline's only step.
    load_shape_constrained_trip_dataframe()
    def validate(self, dataframe, column_name):
        """Raise if any value in ``column_name`` is not a multiple of five.

        Args:
            dataframe: Dataframe under validation.
            column_name: Name of the column whose values are checked.

        Raises:
            ColumnConstraintViolationException: When at least one value has a
                non-zero remainder modulo 5. The offending rows are passed to
                the exception.
        """
        # Boolean mask, True where the value is NOT divisible by five.
        not_multiple_of_five = dataframe[column_name].apply(
            lambda value: value % 5 != 0
        )
        offending = dataframe[not_multiple_of_five]
        if offending.empty:
            return
        raise ColumnConstraintViolationException(
            constraint_name=self.name,
            constraint_description=self.error_description,
            column_name=column_name,
            offending_rows=offending,
        )


# Dataframe type that constrains a single column, 'amount_paid': its dtype
# must be in {'int64'} and it must pass DivisibleByFiveConstraint.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    name='CustomTripDataFrame',
    columns=[
        PandasColumn(
            'amount_paid',
            constraints=[
                ColumnDTypeInSetConstraint({'int64'}),
                DivisibleByFiveConstraint(),
            ],
        ),
    ],
)


# Solid that reads ./ebike_trips.csv (path resolved through
# script_relative_path) and emits it on the CustomTripDataFrame-typed
# 'custom_trip_dataframe' output.
@solid(
    output_defs=[
        OutputDefinition(
            name='custom_trip_dataframe',
            dagster_type=CustomTripDataFrame,
        ),
    ],
)
def load_custom_trip_dataframe(_) -> DataFrame:
    def _parse_timestamp(raw):
        # The expected format includes fractional seconds (%f).
        return datetime.strptime(raw, '%Y-%m-%d %H:%M:%S.%f')

    csv_location = script_relative_path('./ebike_trips.csv')
    return read_csv(
        csv_location,
        parse_dates=['start_time', 'end_time'],
        date_parser=_parse_timestamp,
    )
예제 #7
0
from dagster import solid, SolidExecutionContext, Field, Array, String
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame
from typing import Any, Optional, List, TYPE_CHECKING
from azmeta.access.resource_graph import query_dataframe

# Static type checkers get a permissive alias; at runtime the name is bound to
# a Dagster dataframe type that declares two string columns.
if TYPE_CHECKING:
    # Would ideally be DataFrame, but pandas has no type info yet.
    ResourcesDataFrame = Any
else:
    ResourcesDataFrame = create_dagster_pandas_dataframe_type(
        name='ResourcesDataFrame',
        columns=[
            PandasColumn.string_column(column)
            for column in ('resource_id', 'subscription_id')
        ],
    )


@solid(
    config_schema={
        'subscriptions':
        Field(Array(String),
              description='The subscriptions to query in the Resource Graph.'),
        'filters':
        Field(String,
              is_required=False,
              description='Conditions for a KQL where operator.'),
        'custom_projections':
        Field(String,
              is_required=False,
              description='Assignments for a KQL project operator.'),
예제 #8
0
                                        build_perf_counter_percentile_query,
                                        build_disk_percentile_query)
from azmeta.access.utils.chunking import build_grouped_chunk_list
from .resources import ResourcesDataFrame
from .specifications import AzureComputeSpecifications

# Static type checkers get a permissive alias; at runtime the name is bound to
# a Dagster dataframe type describing the utilization columns.
if TYPE_CHECKING:
    # Would ideally be DataFrame, but pandas has no type info yet.
    UtilizationDataFrame = Any
else:
    UtilizationDataFrame = create_dagster_pandas_dataframe_type(
        name='UtilizationDataFrame',
        columns=[
            PandasColumn.string_column('resource_id'),
            # Percentile columns and the maximum are all float columns.
            *(
                PandasColumn.float_column(column)
                for column in (
                    'percentile_50th',
                    'percentile_80th',
                    'percentile_90th',
                    'percentile_95th',
                    'percentile_99th',
                    'max',
                )
            ),
            PandasColumn.integer_column('samples'),
        ],
    )


@solid(required_resource_keys={'azure_monitor'})
def query_cpu_utilization(
        context: SolidExecutionContext,
        resources: ResourcesDataFrame) -> UtilizationDataFrame:
    builder = functools.partial(build_perf_counter_percentile_query,
                                spec=PerformanceCounterSpec(
                                    "Processor", "% Processor Time", "_Total"))
예제 #9
0
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path

# Dataframe type describing the trip data columns and their constraints.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    columns=[
        PandasColumn.integer_column('bike_id', min_value=0),
        PandasColumn.categorical_column(
            'color',
            categories={'red', 'green', 'blue'},
        ),
        # Both timestamp columns share the same lower bound (2020-02-10).
        *(
            PandasColumn.datetime_column(column, min_datetime=datetime(2020, 2, 10))
            for column in ('start_time', 'end_time')
        ),
        PandasColumn.string_column('station'),
        PandasColumn.exists('amount_paid'),
        PandasColumn.boolean_column('was_member'),
    ],
)


@solid(output_defs=[
    OutputDefinition(name='trip_dataframe', dagster_type=TripDataFrame)
])
def load_trip_dataframe(_) -> DataFrame:
        EventMetadataEntry.text(
            max(dataframe["day"]),
            "max_day",
            "Maximum date of exchange rates",
        ),
        EventMetadataEntry.text(
            str(dataframe["day"].nunique()),
            "num_unique_day",
            "Total unique dates of exchange rates",
        ),
        EventMetadataEntry.text(
            str(dataframe["currency"].nunique()),
            "num_unique_currency",
            "Total unique currencies of exchange rates",
        ),
        EventMetadataEntry.text(str(len(dataframe)), "n_rows",
                                "Number of rows seen in the dataframe"),
    ]


# Dataframe type for exchange rates: three string columns plus a numeric
# "rate" column, with compute_exchange_rate_dataframe_summary_statistics
# registered as its event_metadata_fn.
ExchangeRateDataFrame = create_dagster_pandas_dataframe_type(
    name="ExchangeRateDataFrame",
    columns=[
        *(
            PandasColumn.string_column(column)
            for column in ("id", "day", "currency")
        ),
        PandasColumn.numeric_column("rate"),
    ],
    event_metadata_fn=compute_exchange_rate_dataframe_summary_statistics,
)