task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{file}", # Default swift = Various Small Datasets objectstore # swift_conn_id="SWIFT_DEFAULT", container="overlastgebieden", object_id=f"{file}", output_path=f"{tmp_dir}/{file}", ) for file in files_to_download ] # 4. Dummy operator acts as an interface between parallel tasks to another parallel tasks with different number of lanes # (without this intermediar, Airflow will give an error) Interface = DummyOperator(task_id="interface") # 5. Create SQL SHP_to_SQL = [ BashOperator( task_id=f"create_SQL_{key}", bash_command=f"ogr2ogr -f 'PGDump' " f"-s_srs EPSG:28992 -t_srs EPSG:28992 "
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. create download temp directory to store the data mk_tmp_dir = BashOperator(task_id="mk_tmp_dir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{file_name}", swift_conn_id="objectstore_dataservices", container="Dataservices", object_id=url, output_path=f"{tmp_dir}/{url}", ) for file_name, url in data_endpoints.items() ] # 4. Cleanse the downloaded data (remove the space hyphen characters) clean_up_data = [ PythonOperator( task_id=f"clean_data_{file_name}", python_callable=clean_data, op_args=[f"{tmp_dir}/{url}"], ) for file_name, url in data_endpoints.items() ]
dag_id = "reclamebelasting" dag_config = Variable.get(dag_id, deserialize_json=True) with DAG( "reclamebelasting", default_args=default_args, description="reclamebelasting", ) as dag: zip_file = dag_config["zip_file"] shp_file = dag_config["shp_file"] tmp_dir = f"/tmp/{dag_id}" mk_tmp_dir = BashOperator(task_id="mk_tmp_dir", bash_command=f"mkdir -p {tmp_dir}") fetch_zip = SwiftOperator( task_id="fetch_zip", container="reclame", object_id=zip_file, output_path=f"{tmp_dir}/{zip_file}", ) extract_zip = BashOperator( task_id="extract_zip", bash_command=f"unzip -o {tmp_dir}/{zip_file} -d {tmp_dir}", ) extract_shp = BashOperator( task_id="extract_shp", bash_command=f"ogr2ogr -f 'PGDump' -t_srs EPSG:28992 -nln {dag_id}_new " f"{tmp_dir}/{dag_id}.sql {tmp_dir}/Reclame_tariefgebieden.shp", ) convert_shp = BashOperator(
slack_at_start = MessageOperator(
    task_id="slack_at_start",
    http_conn_id="slack",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

# 2. Create temp directory to store files
mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}")

# 3. Download data
download_data = SwiftOperator(
    task_id=f"download_{files_to_download[0]}",
    # when swift_conn_id is omitted, the connection defaults to the VSD objectstore
    # swift_conn_id="SWIFT_DEFAULT",
    container="vastgoed",
    object_id=files_to_download[0],
    output_path=f"{tmp_dir}/{files_to_download[0]}",
)

# 4. Convert data to the UTF-8 character set
convert_to_UTF8 = BashOperator(
    task_id="convert_to_UTF8",
    bash_command=f"iconv -f iso-8859-1 -t utf-8 {tmp_dir}/{files_to_download[0]} > "
    f"{tmp_dir}/{dag_id}_utf8.csv",
)

# 5. Create TABLE from CSV
# The source has no spatial data, but ogr2ogr is used to create the SQL insert statements.
CSV_to_SQL = BashOperator(
    task_id="CSV_to_SQL",
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. create download temp directory to store the data mk_tmp_dir = BashOperator(task_id="mk_tmp_dir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id="download_file", # Default swift = Various Small Datasets objectstore # swift_conn_id="SWIFT_DEFAULT", container=f"{dag_id}", object_id=f"{files_to_download}", output_path=f"{tmp_dir}/{file}", ) for file in files_to_download ] # 3. Unzip extract_zip = [ BashOperator( task_id="extract_zip_file", bash_command=f'unzip -o "{tmp_dir}/{file}" -d {tmp_dir}', ) for file in files_to_download ]
with DAG(
    "grex",
    default_args=default_args,
    description="GrondExploitatie",
) as dag:
    csv_file = dag_config["csv_file"]
    tmp_dir = f"/tmp/{dag_id}"

    mk_tmp_dir = BashOperator(task_id="mk_tmp_dir", bash_command=f"mkdir -p {tmp_dir}")

    fetch_csv = SwiftOperator(
        task_id="fetch_csv",
        container="grex",
        object_id=csv_file,
        output_path=f"{tmp_dir}/{csv_file}",
    )

    load_data = PythonOperator(
        task_id="load_data",
        python_callable=load_grex,
        op_args=[f"{tmp_dir}/{csv_file}", table_name_new],
    )

    check_count = PostgresCheckOperator(
        task_id="check_count",
        sql=SQL_CHECK_COUNT,
        params=dict(tablename=table_name_new, mincount=400),
    )
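    # SQL_CHECK_COUNT is imported elsewhere in this file; a minimal sketch of
    # the kind of templated query it plausibly holds (the exact SQL is an
    # assumption). PostgresCheckOperator fails the task when the first row of
    # the result is falsy:
    SQL_CHECK_COUNT_SKETCH = """
        SELECT COUNT(*) >= {{ params.mincount }} FROM {{ params.tablename }}
    """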
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{file}", # when swift_conn_id is ommitted then the default connection will be the VSD objectstore # swift_conn_id="SWIFT_DEFAULT", container="aardgasvrij", object_id=file, output_path=f"{tmp_dir}/{file}", ) for files in files_to_download.values() for file in files ] # 4. Dummy operator acts as an interface between parallel tasks to another parallel tasks with different number of lanes # (without this intermediar, Airflow will give an error) Interface = DummyOperator(task_id="interface") # 5. Create SQL SHP_to_SQL = [ BashOperator( task_id=f"create_SQL_{key}", bash_command="ogr2ogr -f 'PGDump' " "-t_srs EPSG:28992 -s_srs EPSG:28992 "
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{file}", swift_conn_id="OBJECTSTORE_MILIEUTHEMAS", container="Bommenkaart", object_id=file, output_path=f"{tmp_dir}/{file}", ) for files in files_to_download.values() for file in files ] # 4. Dummy operator acts as an interface between parallel tasks to another parallel tasks with different number of lanes # (without this intermediar, Airflow will give an error) Interface = DummyOperator(task_id="interface") # 5. Create SQL SHP_to_SQL = [ BashOperator( task_id=f"create_SQL_{key}", bash_command="ogr2ogr -f 'PGDump' " "-s_srs EPSG:28992 -t_srs EPSG:28992 "
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_file_{key}", swift_conn_id="OBJECTSTORE_MILIEUTHEMAS", container="Milieuthemas", object_id=file, output_path=f"{tmp_dir}/{file}", ) for key, file in files_to_download.items() ] # 4. Transform seperator from pipeline to semicolon and set code schema to UTF-8 change_seperator = [ BashOperator( task_id=f"change_seperator_{key}", bash_command= f"cat {tmp_dir}/{file} | sed 's/|/;/g' > {tmp_dir}/seperator_{file} ;" f"iconv -f iso-8859-1 -t utf-8 {tmp_dir}/seperator_{file} > " f"{tmp_dir}/utf-8_{file}", ) for key, file in files_to_download.items() ]
# 1. Post message on slack
slack_at_start = MessageOperator(
    task_id="slack_at_start",
    http_conn_id="slack",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

# 2. Download data from the objectstore and store it in the tmp dir
download_data = [
    SwiftOperator(
        task_id=f"download_{file}",
        # when swift_conn_id is omitted, the connection defaults to the Various Small Datasets objectstore
        container="bed_and_breakfast",
        object_id=f"{DATAPUNT_ENVIRONMENT}/{file}",
        output_path=f"{tmp_dir}/{file}",
    )
    for file in files_to_download
]

# 3. Modify data: keep only the INSERT statements and close the transaction
remove_owner_alters = [
    BashOperator(
        task_id=f"get_SQL_inserts_{file}",
        bash_command=f"sed -i -r '/INSERT INTO/!d' {tmp_dir}/{file} && "
        f"echo 'COMMIT;' >> {tmp_dir}/{file}",
    )
    for file in files_to_download
]

# 4. Modify data: change table name to tmp name
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{file}", # Default swift = Various Small Datasets objectstore # swift_conn_id="SWIFT_DEFAULT", container="milieuzones", object_id=f"{file}", output_path=f"{tmp_dir}/{file}", ) for file in files_to_download ] # 4. Convert data to geojson convert_to_geojson = [ PythonOperator( task_id=f"convert_{file}_to_geojson", python_callable=import_milieuzones, op_args=[f"{tmp_dir}/{file}", f"{tmp_dir}/geojson_{file}",], ) for file in files_to_download ]
slack_at_start = MessageOperator(
    task_id="slack_at_start",
    http_conn_id="slack",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

# 2. Create temp directory to store files
mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}")

# 3. Download data
download_data = SwiftOperator(
    task_id=f"download_{zip_file}",
    # Default swift = Various Small Datasets objectstore
    # swift_conn_id="SWIFT_DEFAULT",
    container="reclame",
    object_id=zip_file,
    output_path=f"{tmp_dir}/{zip_file}",
)

# 4. Extract zip file
extract_zip = BashOperator(
    task_id="extract_zip",
    bash_command=f"unzip -o {tmp_dir}/{zip_file} -d {tmp_dir}",
)

# 5. Load data
load_data = Ogr2OgrOperator(
    task_id=f"import_{shp_file}",
    target_table_name=f"{schema_name}_{table_name}_new",
    input_file=f"{tmp_dir}/{shp_file}",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

# 2. Create temp directory to store files
mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}")

# 3. Download data
download_data = PythonOperator(task_id="download_data", python_callable=get_data)

# 4. Upload data to objectstore
upload_to_obs = SwiftOperator(
    task_id="upload_to_obs",
    swift_conn_id="OBJECTSTORE_VICTOR",
    action_type="upload",
    container="WIOR",
    output_path=f"{tmp_dir}/{dag_id}.geojson",
    object_id=f"{datetime.now(timezone.utc).astimezone(to_zone).strftime('%Y-%m-%d')}_{dag_id}.geojson",  # noqa: E501
)

# 5. Delete files from the objectstore that fall outside the given time window
delete_from_obs = SwiftOperator(
    task_id="delete_from_obs",
    swift_conn_id="OBJECTSTORE_VICTOR",
    action_type="delete",
    container="WIOR",
    time_window_in_days=100,
)

# 6. Import data
import_data = Ogr2OgrOperator(
sql_file_new = f"{sql_file_new_base}.sql"
tmp_dir = f"/tmp/{dag_id}"
sql_file_path = f"{tmp_dir}/{DATAPUNT_ENVIRONMENT}/{sql_file}"
sql_file_new_path = f"{tmp_dir}/{sql_file_new}"

slack_at_start = MessageOperator(
    task_id="slack_at_start",
    http_conn_id="slack",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

fetch_sql = SwiftOperator(
    task_id="fetch_sql",
    container=dag_id,
    object_id=f"{DATAPUNT_ENVIRONMENT}/{sql_file}",
    output_path=sql_file_path,
)

remove_owner_alters = BashOperator(
    task_id="remove_owner_alters",
    bash_command=f'egrep -v "^ALTER TABLE.*OWNER TO" {sql_file_path} '
    f'| egrep -v "^GRANT SELECT ON" > "{sql_file_new_path}"',
)

replace_tablename = BashOperator(
    task_id="replace_tablename",
    bash_command=f'perl -pi -e "s/quota_bbkaartlaagexport/bb_quotum_new/g" '
    f"{sql_file_new_path}",
)
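# For reference: the two egrep filters above drop the ALTER TABLE ... OWNER TO
# and GRANT SELECT ON statements from the dump so it can be replayed under the
# database role Airflow connects with, and the perl substitution points the
# remaining statements at the bb_quotum_new staging table.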
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{key}", swift_conn_id="OBJECTSTORE_MILIEUTHEMAS", container="Milieuthemas", object_id=f"{file}", output_path=f"{tmp_dir}/{key}_{file}", ) for key, file in files_to_download.items() ] # 4. Transform seperator from pipeline to semicolon and set code schema to UTF-8 change_seperators = [ BashOperator( task_id=f"change_seperator_{key}", bash_command=f"cat {tmp_dir}/{key}_{file} | " f"sed 's/|/;/g' > {tmp_dir}/seperator_{key} ;" f"iconv -f iso-8859-1 -t utf-8 {tmp_dir}/seperator_{key} > " f"{tmp_dir}/utf_8_{key}.csv", )
shp_files = dag_config["shp_files"]
tables = dag_config["tables"]
rename_tablenames = dag_config["rename_tablenames"]
tmp_dir = f"/tmp/{dag_id}"

slack_at_start = MessageOperator(
    task_id="slack_at_start",
    http_conn_id="slack",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

fetch_zip = SwiftOperator(
    task_id="fetch_zip",
    container=dag_id,
    object_id=zip_file,
    output_path=f"{tmp_dir}/{zip_file}",
)

extract_zip = BashOperator(
    task_id="extract_zip",
    bash_command=f'unzip -o "{tmp_dir}/{zip_file}" -d {tmp_dir}',
)

for shp_filename, tablename in zip(shp_files, tables):
    extract_shps.append(
        BashOperator(
            task_id=f"extract_{shp_filename}",
            bash_command=f"ogr2ogr -f 'PGDump' -t_srs EPSG:28992 "
            f"-nln {tablename} "
            f"{tmp_dir}/{tablename}.sql {tmp_dir}/Shape/{shp_filename}",
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{data_file}", swift_conn_id="objectstore_dataruimte", container="ondergrond", object_id=f"historische_onderzoeken/{data_file}", output_path=f"{tmp_dir}/{data_file}", ) for _, data_file in files_to_download.items() ] # 4. Create the DB target table (as specified in the JSON data schema) # if table not exists yet create_tables = [ SqlAlchemyCreateObjectOperator( task_id=f"create_{table_name}_based_upon_schema", data_schema_name=dag_id, data_table_name=f"{dag_id}_{table_name}", ind_table=True, # when set to false, it doesn't create indexes; only tables
print("Duplicates found: {}".format(", ".join(duplicates))) with DAG(dag_id, default_args=default_args, description="Parkeervakken") as dag: last_date = find_export_date() zip_file = "nivo_{}.zip".format(last_date) source = pathlib.Path(TMP_DIR) mk_tmp_dir = BashOperator(task_id="mk_tmp_dir", bash_command=f"mkdir -p {TMP_DIR}") fetch_zip = SwiftOperator( task_id="fetch_zip", container="tijdregimes", object_id=zip_file, output_path=f"{TMP_DIR}/{zip_file}", conn_id="parkeervakken_objectstore", ) extract_zip = BashOperator( task_id="extract_zip", bash_command=f'unzip -o "{TMP_DIR}/{zip_file}" -d {TMP_DIR}', ) create_temp_tables = PostgresOperator( task_id="create_temp_tables", postgres_conn_id=postgres_conn_id, sql=SQL_CREATE_TEMP_TABLES, params=dict(base_table=f"{dag_id}_{dag_id}"), )
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{file}", # if conn is ommitted, it defaults to Objecstore Various Small Datasets # swift_conn_id="SWIFT_DEFAULT", container="spoorlijnen", object_id=str(file), output_path=f"{tmp_dir}/{file}", ) for files in files_to_download.values() for file in files ] # 4. Dummy operator acts as an interface between parallel tasks to another parallel tasks with different number of lanes # (without this intermediar, Airflow will give an error) Interface = DummyOperator(task_id="interface") # 5. Create SQL SHP_to_SQL = [ BashOperator( task_id=f"create_SQL_{key}", bash_command=f"ogr2ogr -f 'PGDump' " f"-s_srs EPSG:28992 -t_srs EPSG:28992 "
mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}")

# 3. Download sensor data (geojson) from maps.amsterdam.nl
download_geojson = HttpFetchOperator(
    task_id="download_geojson",
    endpoint="open_geodata/geojson.php?KAARTLAAG=CROWDSENSOREN&THEMA=cmsa",
    http_conn_id="ams_maps_conn_id",
    tmp_file=tmp_dir / "sensors.geojson",
)

# 4. Download additional data (beacons.csv, cameras.xlsx)
fetch_files = [
    SwiftOperator(
        task_id=f"download_{file}",
        # if the conn is omitted, it defaults to the Various Small Datasets objectstore
        # swift_conn_id="SWIFT_DEFAULT",
        container="cmsa",
        object_id=file,
        output_path=tmp_dir / file,
    )
    for file in files_to_download
]

# 5. Create SQL insert statements out of the downloaded data
proces_cmsa = PythonOperator(
    task_id="proces_sensor_data",
    python_callable=import_cmsa,
    op_args=[
        tmp_dir / "cameras.xlsx",
        tmp_dir / "beacons.csv",
        tmp_dir / "sensors.geojson",
        tmp_dir,
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) # 2. Create temp directory to store files mkdir = BashOperator(task_id="mkdir", bash_command=f"mkdir -p {tmp_dir}") # 3. Download data download_data = [ SwiftOperator( task_id=f"download_{data_file}", swift_conn_id="objectstore_dataservices", container="Dataservices", object_id=f"{dag_id}/{data_file}", output_path=f"{tmp_dir}/{data_file}", ) for _, data_file in files_to_download.items() ] # 4. Create the DB target table (as specified in the JSON data schema) # if table not exists yet create_tables = [ SqlAlchemyCreateObjectOperator( task_id=f"create_{table_name}_based_upon_schema", data_schema_name=f"{dag_id}", data_table_name=f"{dag_id}_{table_name}", ind_table=True, # when set to false, it doesn't create indexes; only tables
fetch_shp_files = []

slack_at_start = MessageOperator(
    task_id="slack_at_start",
    http_conn_id="slack",
    webhook_token=slack_webhook_token,
    message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
    username="******",
)

for ext in ("dbf", "prj", "shp", "shx"):
    file_name = f"OOV_gebieden_totaal.{ext}"
    fetch_shp_files.append(
        SwiftOperator(
            task_id=f"fetch_shp_{ext}",
            container=dag_id,
            object_id=file_name,
            output_path=f"/tmp/{dag_id}/{file_name}",
        )
    )

extract_shp = BashOperator(
    task_id="extract_shp",
    bash_command=f"ogr2ogr -f 'PGDump' -t_srs EPSG:28992 -skipfailures -nln {dag_id}_new "
    f"{tmp_dir}/{dag_id}.sql {tmp_dir}/OOV_gebieden_totaal.shp",
)

convert_shp = BashOperator(
    task_id="convert_shp",
    bash_command=f"iconv -f iso-8859-1 -t utf-8 {tmp_dir}/{dag_id}.sql > "
    f"{tmp_dir}/{dag_id}.utf8.sql",
)
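# A plausible wiring (a sketch; not part of this excerpt): ogr2ogr needs all
# four shapefile members (.shp, .shx, .dbf, .prj) on disk, so every fetch
# task gates the extract step, which in turn feeds the encoding fix:
fetch_shp_files >> extract_shp >> convert_shp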
def create_error(*args, **kwargs):
    raise Exception


with DAG(
    "testdag",
    default_args=default_args,
) as dag:
    swift_task = SwiftOperator(
        task_id="swift_task",
        container="Dataservices",
        object_id="beschermde_stads_en_dorpsgezichten/acceptance/"
        "beschermde_stadsdorpsgezichten.zip",
        output_path="/tmp/bsd.zip",
        # container="afval",
        # object_id="acceptance/afval_cluster.zip",
        # output_path="/tmp/blaat/out2.zip",
        # conn_id="afval",
        swift_conn_id="objectstore_dataservices",
    )

    count_check = COUNT_CHECK.make_check(
        check_id="count_check",
        pass_value=1587,
        params=dict(table_name="fietspaaltjes"),
        result_checker=operator.ge,
    )

    colname_check = COLNAMES_CHECK.make_check(
        check_id="colname_check",