def main(): """ starting import """ # get argument parser = argparse.ArgumentParser() parser.add_argument("input_json", type=str, help="JSON file to process") args = parser.parse_args() # transform data into dataframe for row in data_iter(args.input_json): subset_amterdam.append(row) result = pd.DataFrame(subset_amterdam) # lower all column names result.columns = result.columns.str.lower() # aggregating numbers per group result = result.groupby( ['date_of_publication', 'municipality_code', 'municipality_name', 'province']).agg( total_reported = ('total_reported','sum'), hospital_admission = ('hospital_admission','sum'), deceased = ('deceased','sum'), ).reset_index() log.info(f"Starting import {args.input_json}") engine = get_engine() result.to_sql( "corona_gevallen_new", engine, dtype={ "index": Integer(), "date_of_publication": Date(), "municipality_code": String(), "municipality_name": String(), "province": String(), "total_reported": Integer(), "hospital_admission": Integer(), "deceased": Integer(), }, ) log.info("Data loaded into DB") ProvenanceRenameOperator( task_id="provenance_rename", dataset_name="corona", prefix_table_name="corona_", postfix_table_name="_new", rename_indexes=False, pg_schema="public", ).execute() log.info("Renamed columns based on provenance.")
tmp_dir / "cmsa_sensor_new.sql", tmp_dir / "cmsa_locatie_new.sql", ], ) # 8. Create target tables: Markering (join between Sensor en Locatie) fill_markering = PostgresFilesOperator( task_id="insert_into_table_markering", sql_files=[f"{sql_path}/cmsa_data_insert_markering.sql"], ) # 9. RENAME columns based on PROVENANCE provenance_translation = ProvenanceRenameOperator( task_id="provenance_rename", dataset_name=dag_id, prefix_table_name=f"{dag_id}_", postfix_table_name="_new", rename_indexes=False, pg_schema="public", ) # 10. Rename temp named tables to final names rename_tables = PostgresOperator(task_id="rename_tables", sql=SQL_TABLE_RENAMES) # 11. Grant database permissions grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id) ( slack_at_start >> mkdir >> download_geojson
task_id="load_dump_file", container="Dataservices", object_id=f"afval_huishoudelijk/{DATASTORE_TYPE}/" "afval_api.zip", swift_conn_id="objectstore_dataservices", # optionals # db_target_schema will create the schema if not present db_target_schema="pte", ) # 4. DUMP FILE SOURCE # Make the provenance translations provenance_file_data = ProvenanceRenameOperator( task_id="provenance_file", dataset_name="huishoudelijkafval", subset_tables=f"{tables['dump_file']}", pg_schema="pte", rename_indexes=True, ) # 5. DUMP FILE SOURCE # Swap tables to target schema public swap_schema = SwapSchemaOperator(task_id="swap_schema", dataset_name="huishoudelijkafval", subset_tables=tables['dump_file']) # 6. DWH STADSDELEN SOURCE # Load voertuigenplanning data into DB load_dwh = PythonOperator( task_id="load_from_dwh_stadsdelen", python_callable=load_from_dwh,
    create_table = BashOperator(
        task_id="create_table",
        bash_command=f"psql {pg_params()} < {tmp_dir}/{dag_id}.utf8.sql",
    )

    # 6. DROP existing TABLE
    drop_table = PostgresOperator(
        task_id="drop_existing_table",
        sql=[
            f"DROP TABLE IF EXISTS {dag_id}_{dag_id} CASCADE",
        ],
    )

    # 7. RENAME COLUMNS based on PROVENANCE
    provenance_translation = ProvenanceRenameOperator(
        task_id="rename_columns",
        dataset_name=dag_id,
        pg_schema="public",
    )

    # 8. RENAME TABLE
    rename_table = PostgresTableRenameOperator(
        task_id="rename_table",
        old_table_name=dag_id,
        new_table_name=f"{dag_id}_{dag_id}",
    )

    # 9. ADD missing COLUMNS in source
    add_category = PostgresOperator(
        task_id="add_columns",
        sql=ADD_CATEGORIE_CATEGORIENAAM,
        params=dict(tablename=f"{dag_id}_{dag_id}"),
    )
    # 3. Load the dump file
    swift_load_task = SwiftLoadSqlOperator(
        task_id="swift_load_task",
        container="Dataservices",
        object_id=f"varen/{DATASTORE_TYPE}/varen.zip",
        swift_conn_id="objectstore_dataservices",
        # optionals
        # db_target_schema will create the schema if not present
        db_target_schema="pte",
    )

    # 4. Make the provenance translations
    provenance_renames = ProvenanceRenameOperator(
        task_id="provenance_renames",
        dataset_name="varen",
        pg_schema="pte",
        rename_indexes=True,
    )

    # 5. Swap tables to target schema public
    swap_schema = SwapSchemaOperator(task_id="swap_schema", dataset_name="varen")

    # 6. Grant database permissions
    grant_db_permissions = PostgresPermissionsOperator(
        task_id="grants",
        dag_name=dag_id,
    )

    # FLOW
    slack_at_start >> drop_tables >> swift_load_task >> provenance_renames >> swap_schema >> grant_db_permissions
        )
        for key in files_to_download.keys()
    ]

    # 7. Remove unnecessary cols
    remove_cols = [
        PostgresOperator(
            task_id=f"remove_cols_{key}",
            sql=REMOVE_COLS,
            params=dict(tablename=f"{key}"),
        )
        for key in files_to_download.keys()
    ]

    # 8. Rename COLUMNS based on Provenance
    provenance_translation = ProvenanceRenameOperator(
        task_id="rename_columns",
        dataset_name=f"{dag_id}",
        rename_indexes=False,
        pg_schema="public",
    )

    # 9. Add PDF hyperlink only for tables bominslag and verdachtgebied
    add_hyperlink_pdf = [
        PostgresOperator(
            task_id=f"add_hyperlink_pdf_{table}",
            sql=ADD_HYPERLINK_PDF,
            params=dict(tablename=f"{table}"),
        )
        for table in ("bominslag", "verdachtgebied")
    ]

    # 10. Dummy operator that acts as an interface between one group of parallel
    # tasks and another group with a different number of lanes
    # (without this intermediary, Airflow raises an error)
    Interface2 = DummyOperator(task_id="interface2")
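
    # A minimal sketch of how such an interface dummy is typically wired
    # (assumed flow; the actual chain in this DAG is defined elsewhere):
    #
    #   remove_cols >> Interface2 >> add_hyperlink_pdf
    #
    # A list of tasks can be chained to a single task and vice versa, but two
    # lists of different lengths cannot be chained directly list-to-list.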
def create_basiskaart_dag(is_first, table_name, select_statement):
    """DAG generator: generates a DAG for each table.

    The table_name is the target table in the masterDB where the data will be
    inserted. The select_statement is one of the imported SQL select queries
    (see above) that will be executed on the source DB.
    """
    # Start time of the first DAG.
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but
    # invariant time between 3 and 5 a.m. Because of this, the first DAG
    # starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={"owner": owner, **default_args},
        # The first DAG has the is_first boolean set to True; the other DAGs
        # are triggered to start when the previous DAG has finished
        # (estafette run / relay run).
        schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None,
        description="""basisregistratie grootschalige topografie (BGT) en
        kleinschalige basiskaart (KBK10 en 50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:
        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in temp table to merge into target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename COLUMNS based on Provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=f"{dag_id}",
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

        # Flow
        (
            slack_at_start
            >> create_tables
            >> copy_data
            >> change_data_capture
            >> create_mviews
            >> provenance_translation
            >> clean_up
            >> trigger_next_dag
        )

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie),
    KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data.
    #### Mission Critical
    Classified as 2 (beschikbaarheid [range: 1,2,3]).
    #### On Failure Actions
    Fix issues and rerun dag on working days.
    #### Point of Contact
    Inform the businessowner at [businessowner]@amsterdam.nl.
    #### Business Use Case / process / origin
    NA
    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore and processed
    in the basiskaart DB, which is the source for this DAG.
    """

    return dag
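
# A minimal sketch of how the generator above is typically used to register one
# DAG per target table (assumed mapping; the real table/select pairs are defined
# elsewhere in the module, and the names below are hypothetical):
table_selects = {
    "bgt": "SELECT_BGT",
    "kbk10": "SELECT_KBK10",
    "kbk50": "SELECT_KBK50",
}
for index, (table, select_name) in enumerate(table_selects.items()):
    # Only the first DAG gets a schedule; the rest run as a relay (estafette),
    # each started by the TriggerDynamicDagRunOperator of the previous DAG.
    globals()[f"{dag_id}_{table}"] = create_basiskaart_dag(index == 0, table, select_name)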
            COLNAMES_CHECK.make_check(
                check_id=f"colname_check_{table_name}",
                parameters=["pte", table_name],
                pass_value=field_names,
                result_checker=operator.ge,
            )
        )

        checks.append(
            GEO_CHECK.make_check(
                check_id=f"geo_check_{table_name}",
                params=dict(
                    table_name=f"pte.{table_name}",
                    geo_column="geometrie",
                    geotype=geo_type,
                ),
                pass_value=1,
            )
        )

    multi_check = PostgresMultiCheckOperator(task_id="multi_check", checks=checks)

    rename_columns = ProvenanceRenameOperator(
        task_id="rename_columns",
        dataset_name="rioolnetwerk",
        pg_schema="pte",
    )

    rename_tables = PostgresOperator(task_id="rename_tables", sql=RENAME_TABLES_SQL)

    slack_at_start >> drop_tables >> swift_load_task >> multi_check >> rename_columns >> rename_tables
        python_callable=load_from_dwh,
        op_args=[f"{dag_id}_new"],
    )

    # 3. Check minimum number of records
    check_count = PostgresCheckOperator(
        task_id="check_count",
        sql=SQL_CHECK_COUNT,
        params=dict(tablename=f"{dag_id}_new", mincount=5000),
    )

    # 4. Rename COLUMNS based on provenance (if specified)
    provenance_dwh_data = ProvenanceRenameOperator(
        task_id="provenance_dwh",
        dataset_name=data_schema_id,
        prefix_table_name=f"{data_schema_id}_",
        postfix_table_name="_new",
        rename_indexes=False,
        pg_schema="public",
    )

    # 5. Create the DB target table (as specified in the JSON data schema)
    #    if it does not exist yet
    create_tables = SqlAlchemyCreateObjectOperator(
        task_id="create_table",
        data_schema_name=data_schema_id,
        data_table_name=dag_id,
        ind_table=True,
        # when set to false, it doesn't create indexes; only tables
        ind_extra_index=True,
    )
input_file=f"{tmp_dir}/{shp_file}", s_srs=None, t_srs="EPSG:28992", auto_detect_type="YES", geometry_name="geometrie", mode="PostgreSQL", fid="id", db_conn=db_conn, ) # 6. RENAME columns based on PROVENANCE provenance_trans = ProvenanceRenameOperator( task_id="provenance_rename", dataset_name=schema_name, prefix_table_name=f"{schema_name}_", postfix_table_name="_new", subset_tables=["".join(table_name)], rename_indexes=False, pg_schema="public", ) # Prepare the checks and added them per source to a dictionary total_checks.clear() count_checks.clear() count_checks.append( COUNT_CHECK.make_check( check_id="count_check", pass_value=1, params=dict(table_name=f"{schema_name}_{table_name}_new"), result_checker=operator.ge,
pg_schema="pte", ) # 3. load the dump file swift_load_task = SwiftLoadSqlOperator( task_id="swift_load_task", container="Dataservices", object_id=f"afval_huishoudelijk/{DATASTORE_TYPE}/" "afval_api.zip", swift_conn_id="objectstore_dataservices", # optionals # db_target_schema will create the schema if not present db_target_schema="pte", ) # 4. Make the provenance translations provenance_renames = ProvenanceRenameOperator( task_id="provenance_renames", dataset_name="huishoudelijkafval", pg_schema="pte", rename_indexes=True, ) # 5. Swap tables to target schema public swap_schema = SwapSchemaOperator(task_id="swap_schema", dataset_name="huishoudelijkafval") # FLOW slack_at_start >> drop_tables >> swift_load_task >> provenance_renames >> swap_schema