Example #1
def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # read the files picked up for this asset in the current ingestion run
    student_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )

    # apply the column renames declared for this asset
    student_data = asset.rename_fields_as_declared(student_data)

    PandasDataAssetIO.write_data_asset(asset=asset, data=student_data)
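A script like this is not invoked directly; a DAG task runs it for the asset. A minimal sketch of that wiring, assuming, as in the Example #7 imports below, that PandasTransformationOperator just takes the asset (the dag_id and task layout here are illustrative only):

from datetime import datetime

from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.operators.transformation import PandasTransformationOperator

with DAG(
    dag_id="student_transform_only",
    schedule_interval=None,
    start_date=datetime(year=2019, month=9, day=1),
) as dag:
    # airtunnel resolves the "student" asset's Python script and
    # invokes its rebuild_for_store during task execution
    transform_student = PandasTransformationOperator(asset=PandasDataAsset("student"))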
Example #2
def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    student = PandasDataAsset(name="student")
    programme = PandasDataAsset(name="programme")
    enrollment = PandasDataAsset(name="enrollment")

    student_df = student.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )
    programme_df = programme.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )

    enrollment_df = enrollment.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )

    # join enrollments to students and programmes on their declared key columns
    enrollment_summary: pd.DataFrame = enrollment_df.merge(
        right=student_df, on=student.declarations.key_columns
    ).merge(right=programme_df, on=programme.declarations.key_columns)

    # count students per (major, programme) combination
    enrollment_summary = (
        enrollment_summary.loc[:, ["student_major", "programme_name", "student_id"]]
        .groupby(by=["student_major", "programme_name"])
        .count()
    )

    PandasDataAssetIO.write_data_asset(asset=asset, data=enrollment_summary)
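To make the final aggregation concrete, here is the same groupby on a toy frame; the column names follow the example above, the values are purely illustrative:

import pandas as pd

df = pd.DataFrame(
    {
        "student_major": ["math", "math", "cs"],
        "programme_name": ["BSc", "BSc", "MSc"],
        "student_id": [1, 2, 3],
    }
)

summary = (
    df.loc[:, ["student_major", "programme_name", "student_id"]]
    .groupby(by=["student_major", "programme_name"])
    .count()
)

# the student_id column now holds the number of students
# per (student_major, programme_name) combination:
#                               student_id
# student_major programme_name
# cs            MSc                      1
# math          BSc                      2
print(summary)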
Example #3
def test_data_asset_paths(
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
    fake_airflow_context: Dict,
) -> None:

    # test various path getters/properties:
    test_path = test_parquet_asset.staging_pickedup_path(fake_airflow_context)
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.ingest_archive_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.ready_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.staging_ready_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.landing_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.ready_archive_path(fake_airflow_context)
    assert isinstance(test_path, str)
Example #4
def test_pandas_data_asset(
    fake_airflow_context: Dict,
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
) -> None:

    # neither parameter set
    d1 = test_parquet_asset.retrieve_from_store()

    # only airflow_context
    d2 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context)

    # only consuming asset
    d3 = test_parquet_asset.retrieve_from_store(consuming_asset=ShellDataAsset(
        name="test_consumer"))

    # both parameters set
    d4 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context,
        consuming_asset=ShellDataAsset(name="test_consumer"),
    )

    assert d1.equals(d2) and d2.equals(d3) and d3.equals(d4)
Example #5
def test_read_write_parquet(test_parquet_in_asset: PandasDataAsset,
                            iris: pd.DataFrame, fake_airflow_context) -> None:
    p = path.join(
        test_parquet_in_asset.staging_pickedup_path(fake_airflow_context),
        "test_parquet_in.parquet",
    )
    os.makedirs(path.dirname(p), exist_ok=True)
    iris.to_parquet(p)

    # try without any extra kwargs:
    PandasDataAssetIO.read_data_asset(test_parquet_in_asset, source_files=[p])

    # try with additional kwargs:
    PandasDataAssetIO.read_data_asset(asset=test_parquet_in_asset,
                                      source_files=[p],
                                      engine="auto")
Example #6
def test_read_write_xlsx(test_xlsx_in_asset: PandasDataAsset,
                         iris: pd.DataFrame, fake_airflow_context) -> None:
    p = path.join(
        test_xlsx_in_asset.staging_pickedup_path(fake_airflow_context),
        "test_xlsx_in.xls",
    )
    os.makedirs(path.dirname(p), exist_ok=True)
    iris.to_excel(p)

    # try without any extra kwargs:
    PandasDataAssetIO.read_data_asset(asset=test_xlsx_in_asset,
                                      source_files=[p])
    # try with additional kwargs:
    PandasDataAssetIO.read_data_asset(asset=test_xlsx_in_asset,
                                      source_files=[p],
                                      sheet_name=0)
Example #7
from datetime import datetime

from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.operators.archival import DataAssetArchiveOperator, IngestArchiveOperator
from airtunnel.operators.ingestion import IngestOperator
from airtunnel.operators.loading import StagingToReadyOperator
from airtunnel.operators.transformation import PandasTransformationOperator
from airtunnel.sensors.ingestion import SourceFileIsReadySensor

student = PandasDataAsset("student")
programme = PandasDataAsset("programme")
enrollment = PandasDataAsset("enrollment")
enrollment_summary = PandasDataAsset("enrollment_summary")

with DAG(
        dag_id="university",
        schedule_interval=None,
        start_date=datetime(year=2019, month=9, day=1),
) as dag:
    ingested_ready_tasks = set()

    # a common stream of tasks for all ingested assets:
    for ingested_asset in (student, programme, enrollment):
        source_is_ready = SourceFileIsReadySensor(
            # we reduce the poke interval to only 3 seconds so that our example
            # runs complete faster; do not do this in production! :)
            asset=ingested_asset,
            poke_interval=3,
            no_of_required_static_pokes=2,
        )
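        # NOTE: the example is truncated at this point. What follows is a sketch
        # of how the remaining per-asset wiring might look, using only the
        # operators imported above; the exact task flow is an assumption:
        ingest = IngestOperator(asset=ingested_asset)
        transform = PandasTransformationOperator(asset=ingested_asset)
        archive = DataAssetArchiveOperator(asset=ingested_asset)
        staging_to_ready = StagingToReadyOperator(asset=ingested_asset)
        ingest_archival = IngestArchiveOperator(asset=ingested_asset)

        source_is_ready >> ingest >> transform >> archive >> staging_to_ready >> ingest_archival
        ingested_ready_tasks.add(staging_to_ready)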
Example #8
@pytest.fixture
def test_pandas_asset() -> PandasDataAsset:
    return PandasDataAsset("test_parquet_in_asset")
Example #9
def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    programme_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )
    # drop duplicate records based on the asset's declared key columns
    programme_data = programme_data.drop_duplicates(
        subset=asset.declarations.key_columns
    )
    PandasDataAssetIO.write_data_asset(asset=asset, data=programme_data)
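The deduplication step keeps one row per declared key. A quick standalone illustration; the key column name "programme_id" here is an assumption, not the asset's actual declaration:

import pandas as pd

programme_data = pd.DataFrame(
    {
        "programme_id": [1, 1, 2],
        "programme_name": ["BSc Mathematics", "BSc Mathematics", "MSc CS"],
    }
)

# keeps the first occurrence of each key, dropping the duplicated row
deduplicated = programme_data.drop_duplicates(subset=["programme_id"])
assert len(deduplicated) == 2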
Example #10
from datetime import datetime, timedelta

import pytest
from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.sensors.metadata import (
    AwaitLoadStatusSensor,
    AwaitAssetAncestorsUpdatedSensor,
)

enrollment_summary = PandasDataAsset("enrollment_summary")

with DAG(
    dag_id="metadata_sensors",
    schedule_interval=None,
    start_date=datetime(year=2019, month=9, day=1),
) as dag:
    await_load_status = AwaitLoadStatusSensor(
        asset=enrollment_summary,
        refreshed_within=timedelta(days=1),
        poke_interval=5,
        timeout=120,
    )

    await_load_status_refreshed_after = AwaitLoadStatusSensor(
        asset=enrollment_summary,
        task_id="enrollment_summary_load_status_2",
        refreshed_after=datetime.now() - timedelta(days=1),
        poke_interval=5,
        timeout=120,
    )
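    # NOTE: the snippet is truncated here; AwaitAssetAncestorsUpdatedSensor is
    # imported above but its usage was cut off. A hypothetical continuation:
    # the asset parameter mirrors the other airtunnel sensors, while
    # poke_interval and timeout are standard Airflow sensor kwargs.
    await_ancestors_updated = AwaitAssetAncestorsUpdatedSensor(
        asset=enrollment_summary,
        poke_interval=5,
        timeout=120,
    )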
Example #11
def test_pandas_data_asset_exceptions(
        fake_airflow_context: Dict,
        test_parquet_asset: PandasDataAsset) -> None:
    with pytest.raises(Exception):
        test_parquet_asset.name = "fail"
        test_parquet_asset.rebuild_for_store(fake_airflow_context)
Example #12
@pytest.fixture
def test_xlsx_in_asset() -> PandasDataAsset:
    return PandasDataAsset("test_xlsx_in_asset")
Example #13
@pytest.fixture
def test_csv_asset() -> PandasDataAsset:
    return PandasDataAsset("test_csv_out_asset_pandas")
Example #14
def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    enrollment_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )

    # enrollment needs no transformation; write the picked-up data through unchanged
    PandasDataAssetIO.write_data_asset(asset=asset, data=enrollment_data)