params=dict(tablename=f"{dag_id}_{subject}_new"), ) for subject in files_to_proces.keys() ] # 8. Prepare the checks and added them per source to a dictionary for subject in files_to_proces.keys(): total_checks.clear() count_checks.clear() geo_checks.clear() count_checks.append( COUNT_CHECK.make_check( check_id=f"count_check_{subject}", pass_value=2, params=dict(table_name=f"{dag_id}_{subject}_new"), result_checker=operator.ge, ) ) geo_checks.append( GEO_CHECK.make_check( check_id=f"geo_check_{subject}", params=dict( table_name=f"{dag_id}_{subject}_new", geotype=["POLYGON", "MULTIPOLYGON"], ), pass_value=1, ) )
        old_table_name=key,
        new_table_name=f"{dag_id}_{key}",
    )
    for key in files_to_download.keys()
]

# Prepare the checks and add them per source to a dictionary
for key in files_to_download.keys():
    total_checks.clear()
    count_checks.clear()
    geo_checks.clear()

    count_checks.append(
        COUNT_CHECK.make_check(
            check_id=f"count_check_{key}",
            pass_value=2,
            params=dict(table_name=key),
            result_checker=operator.ge,
        )
    )

    geo_checks.append(
        GEO_CHECK.make_check(
            check_id=f"geo_check_{key}",
            params=dict(
                table_name=key,
                geotype=["POINT", "MULTIPOLYGON"],
            ),
            pass_value=1,
        )
    )

    total_checks = count_checks + geo_checks
    check_name[key] = total_checks
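# A minimal sketch of how such a per-source dictionary of checks is typically
# consumed, one bundled task per key; the task ids and the list comprehension
# are assumptions, mirroring the PostgresMultiCheckOperator usage in the
# provenance snippet further down.
multi_checks = [
    PostgresMultiCheckOperator(task_id=f"multi_check_{key}", checks=check_name[key])
    for key in files_to_download.keys()
]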
    old_table_name=dag_id,
    new_table_name=f"{dag_id}_{dag_id}",
)

# 8. ADD missing COLUMNS in source
add_category = PostgresOperator(
    task_id="add_columns",
    sql=ADD_CATEGORIE_CATEGORIENAAM,
    params=dict(tablename=f"{dag_id}_{dag_id}"),
)

# 9. PREPARE CHECKS
count_checks.append(
    COUNT_CHECK.make_check(
        check_id="count_check",
        pass_value=75,
        params=dict(table_name=f"{dag_id}_{dag_id}"),
        result_checker=operator.ge,
    )
)

# Data shows that 17 of 132 polygons are invalid; to avoid crashing the flow,
# the geo check is temporarily turned off.
# geo_checks.append(
#     GEO_CHECK.make_check(
#         check_id="geo_check",
#         params=dict(
#             table_name=f"{dag_id}_{dag_id}",
#             geotype=["POLYGON", "MULTIPOLYGON"],
#         ),
#         pass_value=1,
#     )
# )
# total_checks = count_checks + geo_checks
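# One possible remedy before re-enabling the geo check is to repair the invalid
# polygons first; a minimal sketch using the PostGIS ST_MakeValid function (this
# task is an assumption and not part of the original DAG). Note that
# ST_MakeValid can change the geometry type (e.g. return a GEOMETRYCOLLECTION),
# so the geotype list may need widening afterwards.
fix_geometries = PostgresOperator(
    task_id="fix_geometries",
    # Repair only rows whose geometry fails the PostGIS validity test
    sql="UPDATE {{ params.tablename }} "
    "SET geometrie = ST_MakeValid(geometrie) "
    "WHERE NOT ST_IsValid(geometrie)",
    params=dict(tablename=f"{dag_id}_{dag_id}"),
)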
provenance_trans = ProvenanceRenameOperator(
    task_id="provenance_rename",
    dataset_name=schema_name,
    prefix_table_name=f"{schema_name}_",
    postfix_table_name="_new",
    rename_indexes=False,
    pg_schema="public",
)

# Prepare the checks
count_checks.clear()
count_checks.append(
    COUNT_CHECK.make_check(
        check_id="count_check",
        pass_value=1,
        params=dict(table_name=f"{schema_name}_{table_name}_new"),
        result_checker=operator.ge,
    )
)
check_name[dag_id] = count_checks

# 8. Execute bundled checks on database (in this case just a count check)
count_check = PostgresMultiCheckOperator(task_id="count_check", checks=check_name[dag_id])

# 9. Create the DB target table (as specified in the JSON data schema),
#    if the table does not exist yet
create_target_table = SqlAlchemyCreateObjectOperator(
    task_id="create_target_table_based_upon_schema",
    data_schema_name=schema_name,
    data_table_name=f"{schema_name}_{table_name}",
        180000,
        "POINT",
        {"objnr", "knoopnr", "objectsoor", "type_funde", "geometrie"},
    ),
    (
        "kel_rioolleidingen",
        194000,
        ["MULTILINESTRING", "LINESTRING"],
        {"objnr", "leidingnaa", "br_diamete", "vorm"},
    ),
):
    checks.append(
        COUNT_CHECK.make_check(
            check_id=f"count_check_{table_name}",
            pass_value=count,
            params=dict(table_name=f"pte.{table_name}"),
            result_checker=operator.ge,
        )
    )

    # XXX Get colnames from schema (provenance info)
    checks.append(
        COLNAMES_CHECK.make_check(
            check_id=f"colname_check_{table_name}",
            parameters=["pte", table_name],
            pass_value=field_names,
            result_checker=operator.ge,
        )
    )

    checks.append(
        },
    ),
)

# 7. Dummy operator acts as an interface between one set of parallel tasks
#    and another set of parallel tasks (i.e. lists or tuples) with a different
#    number of lanes (without this intermediary, Airflow will raise an error)
Interface2 = DummyOperator(task_id="interface2")

# 8. Check minimum number of records
#    PREPARE CHECKS
for resource in variables:
    count_checks.append(
        COUNT_CHECK.make_check(
            check_id=f"count_check_{resource}",
            pass_value=50,
            params=dict(table_name=f"{dag_id}_{resource}_new"),
            result_checker=operator.ge,
        )
    )

    geo_checks.append(
        GEO_CHECK.make_check(
            check_id=f"geo_check_{resource}",
            params=dict(
                table_name=f"{dag_id}_{resource}_new",
                geotype=["POINT"],
                geo_column="geometrie",
            ),
            pass_value=1,
        )
    )

    total_checks = count_checks + geo_checks
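# Because the lists are not cleared inside the loop, total_checks ends up
# holding the checks for every resource. A minimal sketch of executing them as
# one bundled task; the task id is an assumption, mirroring the count_check
# task elsewhere in this section.
multi_check = PostgresMultiCheckOperator(task_id="multi_check", checks=total_checks)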
task_id="drop_imported_table", sql="DROP TABLE IF EXISTS pte.beschermde_stadsdorpsgezichten CASCADE", ) swift_load_task = SwiftLoadSqlOperator( task_id="swift_load_task", container="Dataservices", object_id=f"beschermde_stads_en_dorpsgezichten/{DATASTORE_TYPE}/" "beschermde_stadsdorpsgezichten.zip", swift_conn_id="objectstore_dataservices", ) checks.append( COUNT_CHECK.make_check( check_id="count_check", pass_value=10, params=dict(table_name="pte.beschermde_stadsdorpsgezichten"), result_checker=operator.ge, ) ) checks.append( COLNAMES_CHECK.make_check( check_id="colname_check", parameters=["pte", "beschermde_stadsdorpsgezichten"], pass_value={ "id", "naam", "status", "aanwijzingsdatum", "intrekkingsdatum", "geometry",
    sql=SQL_CREATE_TEMP_TABLES,
    params=dict(base_table=f"{dag_id}_{dag_id}"),
)

run_import_task = PythonOperator(
    task_id="run_import_task",
    python_callable=run_imports,
    dag=dag,
)

count_check = PostgresMultiCheckOperator(
    task_id="count_check",
    checks=[
        COUNT_CHECK.make_check(
            check_id="non_zero_check",
            pass_value=10,
            params=dict(table_name=f"{dag_id}_{dag_id}_temp"),
            result_checker=operator.ge,
        )
    ],
)

rename_temp_tables = PostgresOperator(
    task_id="rename_temp_tables",
    sql=SQL_RENAME_TEMP_TABLES,
    params=dict(base_table=f"{dag_id}_{dag_id}"),
)

# Grant database permissions
grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id)

(
    mk_tmp_dir
    >> download_and_extract_zip
    >> download_and_extract_nietfiscaal_zip
task_id="drop_old_tables", sql=DROP_TMPL, params=dict(tablenames=TABLES_TO_DROP), ) import_geojson = PythonOperator( task_id="import_geojson", python_callable=_load_geojson, op_args=[default_args.get("postgres_conn_id", "postgres_default")], ) for route in ROUTES: count_checks.append( COUNT_CHECK.make_check( check_id=f"count_check_{route.name}", pass_value=3, params=dict(table_name=route.tmp_db_table_name), result_checker=operator.ge, )) colname_checks.append( COLNAMES_CHECK.make_check( check_id=f"colname_check_{route.name}", parameters=["public", route.tmp_db_table_name], pass_value=set(route.columns), result_checker=operator.ge, )) geo_checks.append( GEO_CHECK.make_check( check_id=f"geo_check_{route.name}", params=dict(
swift_task = SwiftOperator(
    task_id="swift_task",
    container="Dataservices",
    object_id="beschermde_stads_en_dorpsgezichten/acceptance/"
    "beschermde_stadsdorpsgezichten.zip",
    output_path="/tmp/bsd.zip",
    # container="afval",
    # object_id="acceptance/afval_cluster.zip",
    # output_path="/tmp/blaat/out2.zip",
    # conn_id="afval",
    swift_conn_id="objectstore_dataservices",
)

count_check = COUNT_CHECK.make_check(
    check_id="count_check",
    pass_value=1587,
    params=dict(table_name="fietspaaltjes"),
    result_checker=operator.ge,
)

colname_check = COLNAMES_CHECK.make_check(
    check_id="colname_check",
    parameters=["fietspaaltjes"],
    pass_value={"id"},
    result_checker=operator.ge,
)

geo_check = GEO_CHECK.make_check(
    check_id="geo_check",
    params=dict(table_name="fietspaaltjes", geotype="POINT"),
    pass_value=1,
)
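# A minimal sketch of bundling the three fietspaaltjes checks into a single
# task, assuming the PostgresMultiCheckOperator pattern used in the other
# snippets; the task id is an assumption.
multi_check = PostgresMultiCheckOperator(
    task_id="multi_check",
    checks=[count_check, colname_check, geo_check],
)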