예제 #1
0
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame, read_csv

from dagster import OutputDefinition, pipeline, solid
from dagster.utils import script_relative_path

# Dagster-pandas dataframe type for the e-bike trip data. Each PandasColumn
# entry declares the constraint applied to one named column.
TripDataFrame = create_dagster_pandas_dataframe_type(
    columns=[
        # Integer column declared with min_value=0.
        PandasColumn.integer_column("bike_id", min_value=0),
        # Categorical column limited to the three listed colour names.
        PandasColumn.categorical_column(
            "color",
            categories={"red", "green", "blue"},
        ),
        # Both timestamp columns share the same min_datetime, 2020-02-10.
        PandasColumn.datetime_column("start_time", min_datetime=datetime(2020, 2, 10)),
        PandasColumn.datetime_column("end_time", min_datetime=datetime(2020, 2, 10)),
        PandasColumn.string_column("station"),
        # Declared through `exists` only; no dtype constraint is given here.
        PandasColumn.exists("amount_paid"),
        PandasColumn.boolean_column("was_member"),
    ],
    name="TripDataFrame",
)


# Solid with a single output, "trip_dataframe", declared with
# dagster_type=TripDataFrame. It returns the contents of ebike_trips.csv.
# NOTE(review): documented with comments rather than a docstring so the
# decorated function object is unchanged -- some dagster versions may surface
# a docstring as the solid description; confirm before converting.
@solid(
    output_defs=[
        OutputDefinition(name="trip_dataframe", dagster_type=TripDataFrame),
    ],
)
def load_trip_dataframe(_) -> DataFrame:
    # strptime format applied to the values of the two date columns.
    timestamp_format = "%Y-%m-%d %H:%M:%S.%f"

    def parse_timestamp(raw):
        return datetime.strptime(raw, timestamp_format)

    # Presumably resolved relative to this script's directory -- see
    # dagster.utils.script_relative_path.
    csv_path = script_relative_path("./ebike_trips.csv")
    return read_csv(
        csv_path,
        parse_dates=["start_time", "end_time"],
        date_parser=parse_timestamp,
    )

예제 #2
0
from dagster import solid, SolidExecutionContext, Field, Array, String
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type
from pandas import DataFrame
from typing import Any, Optional, List, TYPE_CHECKING
from azmeta.access.resource_graph import query_dataframe

# ResourcesDataFrame has two definitions:
#   * while type checking it is plain Any (pandas has no type info yet, so
#     DataFrame itself cannot be used);
#   * at runtime it is a dagster-pandas dataframe type that declares two
#     string columns, 'resource_id' and 'subscription_id'.
if TYPE_CHECKING:
    ResourcesDataFrame = Any  # stand-in for DataFrame
else:
    ResourcesDataFrame = create_dagster_pandas_dataframe_type(
        columns=[
            PandasColumn.string_column('resource_id'),
            PandasColumn.string_column('subscription_id'),
        ],
        name='ResourcesDataFrame',
    )


@solid(
    config_schema={
        'subscriptions':
        Field(Array(String),
              description='The subscriptions to query in the Resource Graph.'),
        'filters':
        Field(String,
              is_required=False,
              description='Conditions for a KQL where operator.'),
        'custom_projections':
        Field(String,
              is_required=False,
              description='Assignments for a KQL project operator.'),
예제 #3
0
# Dagster-pandas dataframe type for the e-bike trip data; one PandasColumn
# declaration per constrained column.
TripDataFrame = create_dagster_pandas_dataframe_type(
    columns=[
        # Integer column declared with min_value=0.
        PandasColumn.integer_column('bike_id', min_value=0),
        # Categorical column limited to the three listed colour names.
        PandasColumn.categorical_column(
            'color', categories={'red', 'green', 'blue'}),
        # Start and end timestamps share the min_datetime 2020-02-10.
        PandasColumn.datetime_column(
            'start_time', min_datetime=datetime(2020, 2, 10)),
        PandasColumn.datetime_column(
            'end_time', min_datetime=datetime(2020, 2, 10)),
        PandasColumn.string_column('station'),
        # Declared through `exists` only; no dtype constraint is given here.
        PandasColumn.exists('amount_paid'),
        PandasColumn.boolean_column('was_member'),
    ],
    name='TripDataFrame',
)


# Solid with a single output, 'trip_dataframe', declared with
# dagster_type=TripDataFrame. It returns the contents of ebike_trips.csv.
@solid(
    output_defs=[
        OutputDefinition(name='trip_dataframe', dagster_type=TripDataFrame),
    ],
)
def load_trip_dataframe(_) -> DataFrame:
    # Presumably resolved relative to this script's directory -- see
    # dagster.utils.script_relative_path.
    csv_path = script_relative_path('./ebike_trips.csv')

    # Keyword options for read_csv: which columns hold dates, and the
    # strptime-based callable used to parse their values.
    csv_options = dict(
        parse_dates=['start_time', 'end_time'],
        date_parser=lambda text: datetime.strptime(
            text, '%Y-%m-%d %H:%M:%S.%f'),
    )
    return read_csv(csv_path, **csv_options)
예제 #4
0
import functools
from azmeta.access.monitor_logs import (PerformanceCounterSpec,
                                        query_dataframe_by_workspace_chunk,
                                        build_perf_counter_percentile_query,
                                        build_disk_percentile_query)
from azmeta.access.utils.chunking import build_grouped_chunk_list
from .resources import ResourcesDataFrame
from .specifications import AzureComputeSpecifications

# UtilizationDataFrame has two definitions:
#   * while type checking it is plain Any (pandas has no type info yet, so
#     DataFrame itself cannot be used);
#   * at runtime it is a dagster-pandas dataframe type declaring the columns
#     listed below.
if TYPE_CHECKING:
    UtilizationDataFrame = Any  # stand-in for DataFrame
else:
    UtilizationDataFrame = create_dagster_pandas_dataframe_type(
        columns=[
            PandasColumn.string_column('resource_id'),
            # Float columns: five percentile columns followed by 'max'.
            PandasColumn.float_column('percentile_50th'),
            PandasColumn.float_column('percentile_80th'),
            PandasColumn.float_column('percentile_90th'),
            PandasColumn.float_column('percentile_95th'),
            PandasColumn.float_column('percentile_99th'),
            PandasColumn.float_column('max'),
            # Integer column named 'samples'.
            PandasColumn.integer_column('samples'),
        ],
        name='UtilizationDataFrame',
    )


@solid(required_resource_keys={'azure_monitor'})
def query_cpu_utilization(
        context: SolidExecutionContext,
        resources: ResourcesDataFrame) -> UtilizationDataFrame:
        EventMetadataEntry.text(
            max(dataframe["day"]),
            "max_day",
            "Maximum date of exchange rates",
        ),
        EventMetadataEntry.text(
            str(dataframe["day"].nunique()),
            "num_unique_day",
            "Total unique dates of exchange rates",
        ),
        EventMetadataEntry.text(
            str(dataframe["currency"].nunique()),
            "num_unique_currency",
            "Total unique currencies of exchange rates",
        ),
        EventMetadataEntry.text(str(len(dataframe)), "n_rows",
                                "Number of rows seen in the dataframe"),
    ]


# Dagster-pandas dataframe type for exchange-rate data. Three string columns
# ("id", "day", "currency") and one numeric column ("rate") are declared, and
# compute_exchange_rate_dataframe_summary_statistics is registered as the
# event_metadata_fn for the type.
ExchangeRateDataFrame = create_dagster_pandas_dataframe_type(
    event_metadata_fn=compute_exchange_rate_dataframe_summary_statistics,
    columns=[
        PandasColumn.string_column("id"),
        PandasColumn.string_column("day"),
        PandasColumn.string_column("currency"),
        PandasColumn.numeric_column("rate"),
    ],
    name="ExchangeRateDataFrame",
)