# 9. RENAME columns based on PROVENANCE provenance_translation = ProvenanceRenameOperator( task_id="provenance_rename", dataset_name=dag_id, prefix_table_name=f"{dag_id}_", postfix_table_name="_new", rename_indexes=False, pg_schema="public", ) # 10. Rename temp named tables to final names rename_tables = PostgresOperator(task_id="rename_tables", sql=SQL_TABLE_RENAMES) # 11. Grant database permissions grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id) ( slack_at_start >> mkdir >> download_geojson >> fetch_files >> proces_cmsa >> create_tables >> import_data >> fill_markering >> provenance_translation >> rename_tables >> grant_db_permissions )
dag_id = "airflow_db_permissions" logger = logging.getLogger(__name__) with DAG( dag_id, default_args=default_args, schedule_interval="*/15 * * * *", catchup=False, description= "set grants on database objects to database roles based upon schema auth definition, based upon successfully executed dags within specified time window.", ) as dag: # 1. Post info message on slack slack_at_start = MessageOperator( task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Add grants (in batch mode) # seeks for dags that successfully executed within time window # default time window is set on 30 min. # beware: currently the logic assumes dag_id == dataset name # TODO look for possibility to use other then dag_id as dataset name grants = PostgresPermissionsOperator(task_id="grants", batch_ind=True) # FLOW slack_at_start >> grants
def create_gob_dag(is_first: bool, gob_dataset_id: str, gob_table_id: str) -> DAG: dataset_table_id = f"{gob_dataset_id}_{gob_table_id}" graphql_dir_path = graphql_path / f"{gob_dataset_id}-{gob_table_id}" graphql_params_path = graphql_dir_path / "args.json" extra_kwargs = {} schedule_start_hour = 6 if graphql_params_path.exists(): with graphql_params_path.open() as json_file: args_from_file = json.load(json_file) extra_kwargs = args_from_file.get("extra_kwargs", {}) protected = extra_kwargs.get("protected", False) if protected: extra_kwargs["endpoint"] = GOB_SECURE_ENDPOINT dag = DAG( f"{dag_id}_{dataset_table_id}", default_args={ "owner": owner, **default_args }, schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None, tags=["gob"], ) kwargs = dict( task_id=f"load_{dataset_table_id}", endpoint=GOB_PUBLIC_ENDPOINT, retries=3, graphql_query_path=graphql_dir_path / "query.graphql", max_records=MAX_RECORDS, http_conn_id="gob_graphql", token_expires_margin=OAUTH_TOKEN_EXPIRES_MARGIN, xcom_table_info_task_ids=f"mkinfo_{dataset_table_id}", ) with dag: # 1. Post info message on slack slack_at_start = MessageOperator( task_id=f"slack_at_start_{dataset_table_id}", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) def _create_dataset_info(dataset_id: str, table_id: str) -> DatasetInfo: dataset = schema_def_from_url(SCHEMA_URL, dataset_id, prefetch_related=True) # Fetch the db_name for this dataset and table db_table_name = dataset.get_table_by_id(table_id).db_name() # We do not pass the dataset through xcom, but only the id. # The methodtools.lru_cache decorator is not pickleable # (Airflow uses pickle for (de)serialization). # provide the dataset_table_id as fully qualified name, for convenience dataset_table_id = f"{dataset_id}_{table_id}" return DatasetInfo(SCHEMA_URL, dataset_id, table_id, dataset_table_id, db_table_name) # 2. Create Dataset info to put on the xcom channel for later use # by operators create_dataset_info = PythonOperator( task_id=f"mkinfo_{dataset_table_id}", python_callable=_create_dataset_info, op_args=(gob_dataset_id, gob_table_id), ) def init_assigner(o: Any, x: Any) -> None: o.table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}" # 3. drop temp table if exists init_table = PostgresTableInitOperator( task_id=f"init_{dataset_table_id}", table_name=None, xcom_task_ids=f"mkinfo_{dataset_table_id}", xcom_attr_assigner=init_assigner, drop_table=True, ) # 4. load data into temp table load_data = HttpGobOperator(**{**kwargs, **extra_kwargs}) def copy_assigner(o: Any, x: Any) -> None: o.source_table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}" o.target_table_name = x.db_table_name # 5. truncate target table and insert data from temp table copy_table = PostgresTableCopyOperator( task_id=f"copy_{dataset_table_id}", source_table_name=None, target_table_name=None, xcom_task_ids=f"mkinfo_{dataset_table_id}", xcom_attr_assigner=copy_assigner, ) def index_assigner(o: Any, x: Any) -> None: o.data_table_name = x.db_table_name # 6. create an index on the identifier fields (as specified in the JSON data schema) create_extra_index = SqlAlchemyCreateObjectOperator( task_id=f"create_extra_index_{dataset_table_id}", data_schema_name=gob_dataset_id, data_table_name=None, # when set to false, it doesn't create the tables; only the index ind_table=False, ind_extra_index=True, xcom_task_ids=f"mkinfo_{dataset_table_id}", xcom_attr_assigner=index_assigner, ) # 7. trigger next DAG (estafette / relay run) trigger_next_dag = TriggerDynamicDagRunOperator( task_id="trigger_next_dag", dag_id_prefix="gob_", trigger_rule="all_done", ) # 9. Grant database permissions grant_db_permissions = PostgresPermissionsOperator( task_id="grants", dag_name=f"{dag_id}_{dataset_table_id}") # FLOW (slack_at_start >> create_dataset_info >> init_table >> load_data >> copy_table >> create_extra_index >> trigger_next_dag >> grant_db_permissions) return dag
def create_basiskaart_dag(is_first: bool, table_name: str, select_statement: str) -> DAG: """Generates a DAG for each table. The table_name is the target table in de masterDB where the data will be inserted. The select_statement is one of the imported SQL query selects (see above) that will be executed on the source DB. """ # start time first DAG # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time between # 3 and 5 a.m. Because of this, the first DAG starts running at 7 a.m. schedule_start_hour = 7 dag = DAG( f"{dag_id}_{table_name}", default_args={ "owner": owner, **default_args }, # the first DAG will have the is_first boolean set to True # the other DAG's will be triggered to start when the previous DAG is finished # (estafette run / relay run) schedule_interval=f"0 {schedule_start_hour} * * *" if is_first else None, description=""" basisregistratie grootschalige topologie (BGT) en kleinschalige basiskaart (KBK10 en 50). The basiskaart data is collected from basiskaart DB.""", tags=["basiskaart"], ) with dag: # 1. Post info message on slack slack_at_start = MessageOperator( task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp and target table create_tables = PostgresOperator( task_id="create_tables", sql=CREATE_TABLES, params=dict(base_table=table_name, dag_id=dag_id), ) # 3. Copy data into temp table copy_data = PythonOperator( task_id="insert_data", python_callable=create_tables_from_basiskaartdb_to_masterdb, op_kwargs={ "source_connection": source_connection, "source_select_statement": globals()[select_statement], "target_base_table": f"{dag_id}_{table_name}_temp", }, dag=dag, ) # 4. Check for changes in temp table to merge in target table change_data_capture = PgComparatorCDCOperator( task_id="change_data_capture", source_table=f"{dag_id}_{table_name}_temp", target_table=f"{dag_id}_{table_name}", ) # 5. Create mviews for T-REX tile server create_mviews = PostgresOperator( task_id="create_mviews", sql=CREATE_MVIEWS, params=dict(base_table=table_name, dag_id=dag_id), ) # 6. Rename COLUMNS based on Provenance provenance_translation = ProvenanceRenameOperator( task_id="rename_columns", dataset_name=dag_id, prefix_table_name=f"{dag_id}_", rename_indexes=False, pg_schema="public", ) # 7. Drop temp table clean_up = PostgresOperator( task_id="drop_temp_table", sql=[ f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE", ], ) # 8. Trigger next DAG to run (estafette) trigger_next_dag = TriggerDynamicDagRunOperator( task_id="trigger_next_dag", dag_id_prefix=f"{dag_id}_", trigger_rule="all_done", ) # 9. Grant database permissions grant_db_permissions = PostgresPermissionsOperator(task_id="grants", dag_name=dag_id) # Flow (slack_at_start >> create_tables >> copy_data >> change_data_capture >> create_mviews >> provenance_translation >> clean_up >> trigger_next_dag >> grant_db_permissions) dag.doc_md = """ #### DAG summary This DAG contains BGT (basisregistratie grootschalige topografie) i and KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data #### Mission Critical Classified as 2 (beschikbaarheid [range: 1,2,3]) #### On Failure Actions Fix issues and rerun dag on working days #### Point of Contact Inform the businessowner at [businessowner]@amsterdam.nl #### Business Use Case / process / origin NA #### Prerequisites/Dependencies/Resourcing https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html Note: The basiskaart data is collected from the GOB objectstore and processed in the basiskaart DB => which is the source for this DAG. """ return dag