) as dag:

    tmp_dir = f"/tmp/{dag_id}"
    colnames = [
        "ogc_fid",
        "wkb_geometry",
        "oov_naam",
        "type",
        "url",
    ]
    fetch_shp_files = []

    slack_at_start = MessageOperator(
        task_id="slack_at_start",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
        username="******",
    )

    for ext in ("dbf", "prj", "shp", "shx"):
        file_name = f"OOV_gebieden_totaal.{ext}"
        fetch_shp_files.append(
            SwiftOperator(
                task_id=f"fetch_shp_{ext}",
                container=dag_id,
                object_id=file_name,
                output_path=f"/tmp/{dag_id}/{file_name}",
            ))

    extract_shp = BashOperator(
Example #2
def create_gob_dag(is_first, gob_dataset_name, gob_table_name):

    gob_db_table_name = f"{gob_dataset_name}_{gob_table_name}"
    graphql_dir_path = graphql_path / f"{gob_dataset_name}-{gob_table_name}"
    graphql_params_path = graphql_dir_path / "args.json"
    extra_kwargs = {}
    schedule_start_hour = 6
    if graphql_params_path.exists():
        with graphql_params_path.open() as json_file:
            args_from_file = json.load(json_file)
            extra_kwargs = args_from_file.get("extra_kwargs", {})
            protected = extra_kwargs.get("protected", False)
            if protected:
                extra_kwargs["endpoint"] = GOB_SECURE_ENDPOINT

    dag = DAG(
        f"{dag_id}_{gob_db_table_name}",
        default_args={
            "owner": owner,
            **default_args
        },
        schedule_interval=f"0 {schedule_start_hour} * * *"
        if is_first else None,
        tags=["gob"],
    )

    kwargs = dict(
        task_id=f"load_{gob_db_table_name}",
        endpoint=GOB_PUBLIC_ENDPOINT,
        dataset=gob_dataset_name,
        schema=gob_table_name,
        retries=3,
        graphql_query_path=graphql_dir_path / "query.graphql",
        max_records=MAX_RECORDS,
        http_conn_id="gob_graphql",
        token_expires_margin=OAUTH_TOKEN_EXPIRES_MARGIN,
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id=f"slack_at_start_{gob_db_table_name}",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. drop temp table if exists
        init_table = PostgresTableInitOperator(
            task_id=f"init_{gob_db_table_name}",
            table_name=f"{gob_db_table_name}{TMP_TABLE_POSTFIX}",
            drop_table=True,
        )

        # 3. load data into temp table
        load_data = HttpGobOperator(**{**kwargs, **extra_kwargs})

        # 4. truncate target table and insert data from temp table
        copy_table = PostgresTableCopyOperator(
            task_id=f"copy_{gob_db_table_name}",
            source_table_name=f"{gob_db_table_name}{TMP_TABLE_POSTFIX}",
            target_table_name=gob_db_table_name,
        )

        # 5. create an index on the identifier fields (as specified in the JSON data schema)
        create_extra_index = SqlAlchemyCreateObjectOperator(
            task_id=f"create_extra_index_{gob_db_table_name}",
            data_schema_name=kwargs.get("dataset", None),
            data_table_name=f"{gob_db_table_name}",
            # ind_table=False ensures only the index is created, not the table itself
            ind_table=False,
            ind_extra_index=True,
        )

        # 6. trigger next DAG (estafette / relay run)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix="gob_",
            trigger_rule="all_done",
        )

        # FLOW
        (slack_at_start >> init_table >> load_data >> copy_table >>
         create_extra_index >> trigger_next_dag)

    return dag
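
The args.json handling at the top of create_gob_dag reads optional per-table overrides from the GraphQL query directory; only the "protected" flag is acted on, switching the loader to GOB_SECURE_ENDPOINT. A minimal sketch of how such a file could be produced, assuming a placeholder dataset-table directory name:

import json
from pathlib import Path

# Placeholder directory; real directories follow the
# "<gob_dataset_name>-<gob_table_name>" naming used above.
graphql_dir = Path("graphql/gebieden-buurten")
graphql_dir.mkdir(parents=True, exist_ok=True)

# Only the "protected" key is read by create_gob_dag; when True, the
# operator kwargs are extended with the secure GOB endpoint.
(graphql_dir / "args.json").write_text(
    json.dumps({"extra_kwargs": {"protected": True}}, indent=2))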
Example #3
def create_gob_dag(is_first: bool, gob_dataset_id: str,
                   gob_table_id: str) -> DAG:

    dataset_table_id = f"{gob_dataset_id}_{gob_table_id}"
    graphql_dir_path = graphql_path / f"{gob_dataset_id}-{gob_table_id}"
    graphql_params_path = graphql_dir_path / "args.json"
    extra_kwargs = {}
    schedule_start_hour = 6
    if graphql_params_path.exists():
        with graphql_params_path.open() as json_file:
            args_from_file = json.load(json_file)
            extra_kwargs = args_from_file.get("extra_kwargs", {})
            protected = extra_kwargs.get("protected", False)
            if protected:
                extra_kwargs["endpoint"] = GOB_SECURE_ENDPOINT

    dag = DAG(
        f"{dag_id}_{dataset_table_id}",
        default_args={
            "owner": owner,
            **default_args
        },
        schedule_interval=f"0 {schedule_start_hour} * * *"
        if is_first else None,
        tags=["gob"],
    )

    kwargs = dict(
        task_id=f"load_{dataset_table_id}",
        endpoint=GOB_PUBLIC_ENDPOINT,
        retries=3,
        graphql_query_path=graphql_dir_path / "query.graphql",
        max_records=MAX_RECORDS,
        http_conn_id="gob_graphql",
        token_expires_margin=OAUTH_TOKEN_EXPIRES_MARGIN,
        xcom_table_info_task_ids=f"mkinfo_{dataset_table_id}",
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id=f"slack_at_start_{dataset_table_id}",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        def _create_dataset_info(dataset_id: str,
                                 table_id: str) -> DatasetInfo:
            dataset = schema_def_from_url(SCHEMA_URL,
                                          dataset_id,
                                          prefetch_related=True)
            # Fetch the db_name for this dataset and table
            db_table_name = dataset.get_table_by_id(table_id).db_name()

            # We do not pass the dataset through xcom, but only the id.
            # The methodtools.lru_cache decorator is not pickleable
            # (Airflow uses pickle for (de)serialization).
            # Provide the dataset_table_id as a fully qualified name, for convenience.
            dataset_table_id = f"{dataset_id}_{table_id}"
            return DatasetInfo(SCHEMA_URL, dataset_id, table_id,
                               dataset_table_id, db_table_name)

        # 2. Create Dataset info to put on the xcom channel for later use
        # by operators
        create_dataset_info = PythonOperator(
            task_id=f"mkinfo_{dataset_table_id}",
            python_callable=_create_dataset_info,
            op_args=(gob_dataset_id, gob_table_id),
        )

        def init_assigner(o: Any, x: Any) -> None:
            o.table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}"

        # 3. drop temp table if exists
        init_table = PostgresTableInitOperator(
            task_id=f"init_{dataset_table_id}",
            table_name=None,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=init_assigner,
            drop_table=True,
        )

        # 4. load data into temp table
        load_data = HttpGobOperator(**{**kwargs, **extra_kwargs})

        def copy_assigner(o: Any, x: Any) -> None:
            o.source_table_name = f"{x.db_table_name}{TMP_TABLE_POSTFIX}"
            o.target_table_name = x.db_table_name

        # 5. truncate target table and insert data from temp table
        copy_table = PostgresTableCopyOperator(
            task_id=f"copy_{dataset_table_id}",
            source_table_name=None,
            target_table_name=None,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=copy_assigner,
        )

        def index_assigner(o: Any, x: Any) -> None:
            o.data_table_name = x.db_table_name

        # 6. create an index on the identifier fields (as specified in the JSON data schema)
        create_extra_index = SqlAlchemyCreateObjectOperator(
            task_id=f"create_extra_index_{dataset_table_id}",
            data_schema_name=gob_dataset_id,
            data_table_name=None,
            # ind_table=False ensures only the index is created, not the table itself
            ind_table=False,
            ind_extra_index=True,
            xcom_task_ids=f"mkinfo_{dataset_table_id}",
            xcom_attr_assigner=index_assigner,
        )

        # 7. trigger next DAG (estafette / relay run)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix="gob_",
            trigger_rule="all_done",
        )

        # 8. Grant database permissions
        grant_db_permissions = PostgresPermissionsOperator(
            task_id="grants", dag_name=f"{dag_id}_{dataset_table_id}")

        # FLOW
        (slack_at_start >> create_dataset_info >> init_table >> load_data >>
         copy_table >> create_extra_index >> trigger_next_dag >>
         grant_db_permissions)

    return dag
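
The init_assigner, copy_assigner and index_assigner callbacks above defer table names to runtime: the operator pulls the DatasetInfo pushed by the mkinfo_* task from XCom and lets the callback set attributes on itself. A minimal sketch of that pattern, with an illustrative operator name rather than the real PostgresTableInitOperator implementation:

from typing import Any, Callable, Optional

from airflow.models.baseoperator import BaseOperator


class ExampleTableInitOperator(BaseOperator):
    """Illustrative operator showing the xcom_attr_assigner pattern."""

    def __init__(self,
                 table_name: Optional[str] = None,
                 drop_table: bool = False,
                 xcom_task_ids: Optional[str] = None,
                 xcom_attr_assigner: Callable[[Any, Any], None] = lambda o, x: None,
                 **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.table_name = table_name
        self.drop_table = drop_table
        self.xcom_task_ids = xcom_task_ids
        self.xcom_attr_assigner = xcom_attr_assigner

    def execute(self, context: dict) -> None:
        if self.xcom_task_ids is not None:
            # Pull the DatasetInfo object produced by the mkinfo_* task and
            # let the assigner callback fill in the runtime table name(s).
            dataset_info = context["ti"].xcom_pull(task_ids=self.xcom_task_ids)
            self.xcom_attr_assigner(self, dataset_info)
        # The actual table (drop and) create logic is omitted here.
        self.log.info("Would now initialize table %s", self.table_name)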
def create_basiskaart_dag(is_first, table_name, select_statement):
    """ 
    DAG generator: Generates a DAG for each table.
    The table_name is the target table in de masterDB where the data will be inserted.
    The select_statement is one of the imported SQL query selects (see above) that will be executed on the source DB.    
    """
    # Start time of the first DAG
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time between 3 and 5 a.m.
    # Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={
            "owner": owner,
            **default_args
        },
        # the first DAG will have the is_first boolean set to True
        # the other DAGs will be triggered to start when the previous DAG has finished (estafette run / relay run)
        schedule_interval=f"0 {schedule_start_hour} * * *"
        if is_first else None,
        description="""Basisregistratie grootschalige topografie (BGT) and
        kleinschalige basiskaart (KBK10 and KBK50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in the temp table and merge them into the target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}")

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename columns based on provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=f"{dag_id}",
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

    # Flow
    (slack_at_start >> create_tables >> copy_data >> change_data_capture >>
     create_mviews >> provenance_translation >> clean_up >> trigger_next_dag)

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie), KBK10 (kleinschalige basiskaart 10) and KBK50 (kleinschalige basiskaart 50) data
    #### Mission Critical
    Classified as 2 (availability [range: 1,2,3])
    #### On Failure Actions
    Fix issues and rerun the DAG on working days
    #### Point of Contact
    Inform the business owner at [businessowner]@amsterdam.nl
    #### Business Use Case / process / origin
    NA
    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore and processed in the basiskaart DB, which is the source for this DAG.
    """

    return dag
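
A DAG file typically registers one DAG per table by calling the factory in a loop and putting the result in globals(); a sketch of how create_basiskaart_dag could be invoked, with placeholder table and select-statement names:

# Placeholder (table_name, select_statement) pairs; the real names come from
# the imported SQL selects referenced in the docstring above.
basiskaart_tables = [
    ("gebouwvlak", "SELECT_GEBOUWVLAK_SQL"),
    ("wegvlak", "SELECT_WEGVLAK_SQL"),
]

for i, (table_name, select_statement) in enumerate(basiskaart_tables):
    # Only the first DAG gets a schedule; the others are started by the
    # TriggerDynamicDagRunOperator at the end of the previous run (estafette).
    globals()[f"{dag_id}_{table_name}"] = create_basiskaart_dag(
        is_first=(i == 0),
        table_name=table_name,
        select_statement=select_statement,
    )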