params=dict(tablename=f"{dag_id}_{subject}_new"),
        )
        for subject in files_to_proces.keys()
    ]

    # 8. Prepare the checks and add them per source to a dictionary
    for subject in files_to_proces.keys():

        total_checks.clear()
        count_checks.clear()
        geo_checks.clear()

        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{subject}",
                pass_value=2,
                params=dict(table_name=f"{dag_id}_{subject}_new"),
                result_checker=operator.ge,
            )
        )

        geo_checks.append(
            GEO_CHECK.make_check(
                check_id=f"geo_check_{subject}",
                params=dict(
                    table_name=f"{dag_id}_{subject}_new",
                    geotype=["POLYGON", "MULTIPOLYGON"],
                ),
                pass_value=1,
            )
        )
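
        # Assumed tail of this loop (mirroring Example #2 below): the checks
        # are presumably bundled per subject into a dictionary; `check_name`
        # is an assumption here, not part of the scraped excerpt.
        total_checks = count_checks + geo_checks
        check_name[subject] = total_checks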
Example #2
            old_table_name=key,
            new_table_name=f"{dag_id}_{key}",
        ) for key in files_to_download.keys()
    ]

    # Prepare the checks and add them per source to a dictionary
    for key in files_to_download.keys():

        total_checks.clear()
        count_checks.clear()
        geo_checks.clear()

        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{key}",
                pass_value=2,
                params=dict(table_name=key),
                result_checker=operator.ge,
            ))

        geo_checks.append(
            GEO_CHECK.make_check(
                check_id=f"geo_check_{key}",
                params=dict(
                    table_name=key,
                    geotype=["POINT", "MULTIPOLYGON"],
                ),
                pass_value=1,
            ))

        total_checks = count_checks + geo_checks
        check_name[key] = total_checks
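
    # An assumed sketch (not part of the scraped example) of how the check
    # dictionary built above is typically consumed: one bundled multi-check
    # task per source, using the PostgresMultiCheckOperator seen in the
    # examples further down.
    multi_checks = [
        PostgresMultiCheckOperator(
            task_id=f"multi_check_{key}",
            checks=check_name[key],
        )
        for key in files_to_download.keys()
    ]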
Example #3
        old_table_name=dag_id,
        new_table_name=f"{dag_id}_{dag_id}",
    )

    # 8. ADD missing COLUMNS in source
    add_category = PostgresOperator(
        task_id="add_columns",
        sql=ADD_CATEGORIE_CATEGORIENAAM,
        params=dict(tablename=f"{dag_id}_{dag_id}"),
    )

    # 9. PREPARE CHECKS
    count_checks.append(
        COUNT_CHECK.make_check(
            check_id="count_check",
            pass_value=75,
            params=dict(table_name=f"{dag_id}_{dag_id}"),
            result_checker=operator.ge,
        ))

    # Data shows that 17 / 132 polygons are invalid; to avoid crashing the flow, this check is temporarily turned off
    # geo_checks.append(
    #     GEO_CHECK.make_check(
    #         check_id="geo_check",
    #         params=dict(
    #             table_name=f"{dag_id}_{dag_id}",
    #             geotype=["POLYGON", "MULTIPOLYGON"],
    #         ),
    #         pass_value=1,
    #     )
    # )
    # total_checks = count_checks + geo_checks
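
    # NOTE (inferred from its arguments, not from the operator's docs):
    # ProvenanceRenameOperator renames the "{schema_name}_*_new" staging
    # tables to the final names recorded in the dataset's schema definition.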
    provenance_trans = ProvenanceRenameOperator(
        task_id="provenance_rename",
        dataset_name=schema_name,
        prefix_table_name=f"{schema_name}_",
        postfix_table_name="_new",
        rename_indexes=False,
        pg_schema="public",
    )

    # Prepare the checks
    count_checks.clear()

    count_checks.append(
        COUNT_CHECK.make_check(
            check_id="count_check",
            pass_value=1,
            params=dict(table_name=f"{schema_name}_{table_name}_new"),
            result_checker=operator.ge,
        )
    )

    check_name[dag_id] = count_checks

    # 8. Execute bundled checks on database (in this case just a count check)
    count_check = PostgresMultiCheckOperator(task_id="count_check", checks=check_name[dag_id])

    # 9. Create the DB target table (as specified in the JSON data schema)
    # if the table does not exist yet
    create_target_table = SqlAlchemyCreateObjectOperator(
        task_id="create_target_table_based_upon_schema",
        data_schema_name=schema_name,
        data_table_name=f"{schema_name}_{table_name}",
            180000,
            "POINT",
            {"objnr", "knoopnr", "objectsoor", "type_funde", "geometrie",},
        ),
        (
            "kel_rioolleidingen",
            194000,
            ["MULTILINESTRING", "LINESTRING"],
            {"objnr", "leidingnaa", "br_diamete", "vorm"},
        ),
    ):

        checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{table_name}",
                pass_value=count,
                params=dict(table_name=f"pte.{table_name}"),
                result_checker=operator.ge,
            )
        )

        # XXX Get colnames from schema (provenance info)
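        # With sets, operator.ge is a superset test, so this check presumably
        # passes when the table's actual column names include at least the
        # expected field_names.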
        checks.append(
            COLNAMES_CHECK.make_check(
                check_id=f"colname_check_{table_name}",
                parameters=["pte", table_name],
                pass_value=field_names,
                result_checker=operator.ge,
            )
        )

        checks.append(
Example #6
            },
        ),
    )

    # 7. Dummy operator acts as an interface between parallel tasks
    # and other parallel tasks (i.e. lists or tuples) with a different
    # number of lanes (without this intermediary, Airflow will raise an error)
    Interface2 = DummyOperator(task_id="interface2")
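
    # A minimal sketch (hypothetical task ids, added for illustration) of the
    # wiring this pattern enables: Airflow cannot chain two task lists of
    # different lengths directly, but list >> task and task >> list both
    # work, so the dummy task bridges the two lists.
    upstream = [DummyOperator(task_id=f"up_{i}") for i in range(2)]
    downstream = [DummyOperator(task_id=f"down_{i}") for i in range(3)]
    upstream >> Interface2 >> downstream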

    # 8. Check minimum number of records
    # PREPARE CHECKS
    for resource in variables:
        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{resource}",
                pass_value=50,
                params=dict(table_name=f"{dag_id}_{resource}_new"),
                result_checker=operator.ge,
            ))

        geo_checks.append(
            GEO_CHECK.make_check(
                check_id=f"geo_check_{resource}",
                params=dict(
                    table_name=f"{dag_id}_{resource}_new",
                    geotype=["POINT"],
                    geo_column="geometrie",
                ),
                pass_value=1,
            ))

    total_checks = count_checks + geo_checks
Example #7
        task_id="drop_imported_table",
        sql="DROP TABLE IF EXISTS pte.beschermde_stadsdorpsgezichten CASCADE",
    )

    swift_load_task = SwiftLoadSqlOperator(
        task_id="swift_load_task",
        container="Dataservices",
        object_id=f"beschermde_stads_en_dorpsgezichten/{DATASTORE_TYPE}/"
        "beschermde_stadsdorpsgezichten.zip",
        swift_conn_id="objectstore_dataservices",
    )

    checks.append(
        COUNT_CHECK.make_check(
            check_id="count_check",
            pass_value=10,
            params=dict(table_name="pte.beschermde_stadsdorpsgezichten"),
            result_checker=operator.ge,
        )
    )

    checks.append(
        COLNAMES_CHECK.make_check(
            check_id="colname_check",
            parameters=["pte", "beschermde_stadsdorpsgezichten"],
            pass_value={
                "id",
                "naam",
                "status",
                "aanwijzingsdatum",
                "intrekkingsdatum",
                "geometry",
        sql=SQL_CREATE_TEMP_TABLES,
        params=dict(base_table=f"{dag_id}_{dag_id}"),
    )

    run_import_task = PythonOperator(
        task_id="run_import_task",
        python_callable=run_imports,
        dag=dag,
    )

    count_check = PostgresMultiCheckOperator(
        task_id="count_check",
        checks=[
            COUNT_CHECK.make_check(
                check_id="non_zero_check",
                pass_value=10,
                params=dict(table_name=f"{dag_id}_{dag_id}_temp"),
                result_checker=operator.ge,
            )
        ])

    rename_temp_tables = PostgresOperator(
        task_id="rename_temp_tables",
        sql=SQL_RENAME_TEMP_TABLES,
        params=dict(base_table=f"{dag_id}_{dag_id}"),
    )

    # Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(task_id="grants",
                                                       dag_name=dag_id)

(mk_tmp_dir >> download_and_extract_zip >> download_and_extract_nietfiscaal_zip
Example #9
        task_id="drop_old_tables",
        sql=DROP_TMPL,
        params=dict(tablenames=TABLES_TO_DROP),
    )

    import_geojson = PythonOperator(
        task_id="import_geojson",
        python_callable=_load_geojson,
        op_args=[default_args.get("postgres_conn_id", "postgres_default")],
    )

    for route in ROUTES:
        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{route.name}",
                pass_value=3,
                params=dict(table_name=route.tmp_db_table_name),
                result_checker=operator.ge,
            ))

        colname_checks.append(
            COLNAMES_CHECK.make_check(
                check_id=f"colname_check_{route.name}",
                parameters=["public", route.tmp_db_table_name],
                pass_value=set(route.columns),
                result_checker=operator.ge,
            ))

        geo_checks.append(
            GEO_CHECK.make_check(
                check_id=f"geo_check_{route.name}",
                params=dict(
Example #10
    swift_task = SwiftOperator(
        task_id="swift_task",
        container="Dataservices",
        object_id="beschermde_stads_en_dorpsgezichten/acceptance/"
        "beschermde_stadsdorpsgezichten.zip",
        output_path="/tmp/bsd.zip",
        # container="afval",
        # object_id="acceptance/afval_cluster.zip",
        # output_path="/tmp/blaat/out2.zip",
        # conn_id="afval",
        swift_conn_id="objectstore_dataservices",
    )

    count_check = COUNT_CHECK.make_check(
        check_id="count_check",
        pass_value=1587,
        params=dict(table_name="fietspaaltjes"),
        result_checker=operator.ge,
    )

    colname_check = COLNAMES_CHECK.make_check(
        check_id="colname_check",
        parameters=["fietspaaltjes"],
        pass_value={"id"},
        result_checker=operator.ge,
    )

    geo_check = GEO_CHECK.make_check(
        check_id="geo_check",
        params=dict(table_name="fietspaaltjes", geotype="POINT"),
        pass_value=1,
    )
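
    # An assumed continuation (not shown in this excerpt): the three checks
    # above are typically bundled into one database task, as earlier examples
    # do with PostgresMultiCheckOperator.
    multi_check = PostgresMultiCheckOperator(
        task_id="multi_check",
        checks=[count_check, colname_check, geo_check],
    )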