        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
    dag_id,
    default_args=airflow_utils.get_default_args({
        "owner": "Gary Qi",
        "depends_on_past": False,
        "email": ["*****@*****.**"],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 0,
        "on_failure_callback": task_failure_slack_alert,
        "start_date": days_ago(1),
    }),
    description="Take new earlyon.json from opendata.toronto.ca and put into datastore",
    schedule_interval="0 18 * * *",
    catchup=False,
    tags=["earlyon", "datasets"],
) as dag:
return "\n".join(message_lines) def return_branch(**kwargs): msg = kwargs.pop("ti").xcom_pull(task_ids="build_message") if msg is None: return "no_need_for_notification" return "send_notification" default_args = airflow_utils.get_default_args({ "on_failure_callback": send_failure_msg, "start_date": job_settings["start_date"] }) with DAG( job_name, default_args=default_args, description=job_settings["description"], schedule_interval=job_settings["schedule"], tags=["sustainment"], catchup=False, ) as dag: load_files = PythonOperator( task_id="load_file_list", python_callable=load_remote_files,
def create_dag(d):
    def send_success_msg(**kwargs):
        msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")
        airflow_utils.message_slack(
            name=PACKAGE_ID,
            message_type="success",
            msg=msg,
            prod_webhook=active_env == "prod",
            active_env=active_env,
        )

    def send_failure_msg(context):  # Airflow failure callbacks are invoked with the task context
        airflow_utils.message_slack(
            name=PACKAGE_ID,
            message_type="error",
            msg="Job not finished",
            prod_webhook=active_env == "prod",
            active_env=active_env,
        )

    def is_resource_new(**kwargs):
        package = kwargs.pop("ti").xcom_pull(task_ids="get_package")
        resource_name = kwargs.pop("resource_name")
        resource = [r for r in package["resources"] if r["name"] == resource_name]

        assert (
            len(resource) <= 1
        ), f"Found {len(resource)} named {resource_name}. Must be 1 or 0."

        if len(resource) == 1:
            return "do_not_create_new_resource"

        return "create_new_resource"

    def get_resource(**kwargs):
        package = ckan_utils.get_package(ckan=ckan, package_id=PACKAGE_ID)
        resource_name = kwargs.pop("resource_name")
        resource = [r for r in package["resources"] if r["name"] == resource_name][0]

        return resource

    def create_new_resource(**kwargs):
        ti = kwargs.pop("ti")
        package = ti.xcom_pull(task_ids="get_package")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        resource_name = kwargs.pop("resource_name")
        save_path = tmp_dir / f"{resource_name}.zip"

        # write an empty zip archive to upload alongside the new resource
        with zipfile.ZipFile(save_path, "w"):
            pass

        logging.info(
            "New resource. Creating empty placeholder Zip file to upload with resource"
        )

        res = ckan.action.resource_create(
            package_id=package["name"],
            name=resource_name,
            is_preview=False,
            format="ZIP",
            extract_job=f"Airflow: {kwargs['dag'].dag_id}",
            upload=open(save_path, "rb"),
        )

        logging.info(res)

        return save_path

    def download_data(**kwargs):
        ti = kwargs.pop("ti")
        resource = ti.xcom_pull(task_ids="get_resource")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        r = requests.get(resource["url"], stream=True)
        save_path = tmp_dir / f'src{Path(resource["url"]).suffix}'

        with open(save_path, "wb") as fd:
            for chunk in r.iter_content(chunk_size=128):  # to-do: read up on chunk size here
                fd.write(chunk)

        return save_path

    def unzip_data(**kwargs):
        ti = kwargs.pop("ti")
        fp = ti.xcom_pull(task_ids="download_data")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        target_dir = tmp_dir / "src"

        with zipfile.ZipFile(fp, "r") as f:
            f.extractall(target_dir)

        if not target_dir.exists():
            target_dir.mkdir()

        return target_dir

    def get_filename_date_format(**kwargs):
        period_range = kwargs["period_range"]

        if period_range == "weekly":
            filename_date_format = "%Y%m%d"
        elif period_range == "monthly":
            filename_date_format = "%Y%m"
        elif period_range == "yearly":
            filename_date_format = "%Y"
        else:
            raise ValueError(f"Unsupported period_range: {period_range}")

        return filename_date_format

    def determine_latest_period_loaded(**kwargs):
        ti = kwargs.pop("ti")
        data_fp = Path(ti.xcom_pull(task_ids="unzip_data"))
        filename_date_format = ti.xcom_pull(task_ids="get_filename_date_format")

        dates_loaded = [
            datetime.strptime(p.name, filename_date_format)
            for p in data_fp.iterdir()
            if not p.is_file()
        ]

        if not dates_loaded:
            return datetime(2018, 12, 30)

        return max(dates_loaded)

    def calculate_periods_to_load(**kwargs):
        ti = kwargs.pop("ti")
        latest_loaded = ti.xcom_pull(task_ids="determine_latest_period_loaded")
        period_range = kwargs["period_range"]

        def weeks(latest_loaded):
            logging.info("Calculating weeks to load")
            periods_to_load = []
            begin = latest_loaded + timedelta(days=1)
            end = begin + timedelta(days=6)

            while end < datetime.now():
                periods_to_load.append(
                    {
                        "begin": datetime.strftime(begin, "%Y/%m/%d/0"),
                        "end": datetime.strftime(end, "%Y/%m/%d/23"),
                    }
                )
                begin = end + timedelta(days=1)
                end = begin + timedelta(days=6)

            return periods_to_load

        def months(latest_loaded):
            logging.info("Calculating months to load")
            periods_to_load = []
            begin = latest_loaded + timedelta(days=32)
            month_end_day = calendar.monthrange(begin.year, begin.month)[1]
            end = datetime(begin.year, begin.month, month_end_day)

            while end < datetime.now():
                periods_to_load.append(
                    {
                        "begin": datetime.strftime(begin, "%Y/%m/1/0"),
                        "end": datetime.strftime(end, "%Y/%m/%d/23"),
                    }
                )
                begin = begin + timedelta(days=32)
                month_end_day = calendar.monthrange(begin.year, begin.month)[1]
                end = datetime(begin.year, begin.month, month_end_day)

            return periods_to_load

        def years(latest_loaded):
            logging.info("Calculating years to load")
            periods_to_load = []
            begin = datetime(latest_loaded.year + 1, 1, 1)
            end = datetime(begin.year, 12, 31)

            while end < datetime.now():
                periods_to_load.append(
                    {
                        "begin": datetime.strftime(begin, "%Y/1/1/0"),
                        "end": datetime.strftime(end, "%Y/12/31/23"),
                    }
                )
                begin = datetime(begin.year + 1, 1, 1)
                end = datetime(begin.year, 12, 31)

            return periods_to_load

        if period_range == "weekly":
            return weeks(latest_loaded)
        elif period_range == "monthly":
            return months(latest_loaded)
        elif period_range == "yearly":
            return years(latest_loaded)

    def make_new_extract_folders(**kwargs):
        logging.info("Created directory for storing extracts")
        ti = kwargs.pop("ti")
        filename_date_format = ti.xcom_pull(task_ids="get_filename_date_format")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")
        dest_path = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        dirs = []
        for period in periods_to_load:
            period_path_name = datetime.strptime(period["end"], "%Y/%m/%d/%H").strftime(
                filename_date_format
            )
            period_path = dest_path / period_path_name

            if not period_path.exists():
                period_path.mkdir()

            dirs.append(period_path)
            logging.info(period_path)

        return dirs

    def extract_new_report(**kwargs):
        ti = kwargs.pop("ti")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")
        filename_date_format = ti.xcom_pull(task_ids="get_filename_date_format")
        dest_path = Path(ti.xcom_pull(task_ids="make_staging_folder"))
        account_id = Variable.get("oracle_infinity_account_id")
        user = Variable.get("oracle_infinity_user")
        password = Variable.get("oracle_infinity_password")
        report_name = kwargs["report_name"]
        report_id = reports[report_name]

        logging.info(
            f"Getting report {report_name} ({report_id}) for {len(periods_to_load)} period(s)"
        )

        file_paths = []
        for period in periods_to_load:
            period_path_name = datetime.strptime(period["end"], "%Y/%m/%d/%H").strftime(
                filename_date_format
            )
            period_path = dest_path / period_path_name
            fpath = period_path / (report_name + ".csv")
            file_paths.append(fpath)

            response = generate_report(
                report_name=report_name,
                report_id=report_id,
                begin=period["begin"],
                end=period["end"],
                account_id=account_id,
                user=user,
                password=password,
            )

            with open(fpath, "wb") as f:
                f.write(response.content)

        return file_paths

    def are_there_new_periods(**kwargs):
        ti = kwargs.pop("ti")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")

        if len(periods_to_load) > 0:
            return "new_periods_to_load"

        return "no_new_periods_to_load"

    def make_staging_folder(**kwargs):
        ti = kwargs.pop("ti")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        resource_name = kwargs["resource_name"]
        staging = tmp_dir / resource_name
        staging.mkdir(parents=True, exist_ok=True)

        return staging

    def zip_files(**kwargs):
        ti = kwargs.pop("ti")
        resource_name = kwargs["resource_name"]
        dest_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        staging_dir = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        return shutil.make_archive(
            base_name=dest_dir / resource_name, format="zip", root_dir=staging_dir
        )

    def copy_previous_to_staging(**kwargs):
        ti = kwargs.pop("ti")
        from_dir = Path(ti.xcom_pull(task_ids="unzip_data"))
        dest_dir = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        copy_tree(str(from_dir.absolute()), str(dest_dir.absolute()))

        return dest_dir

    def upload_zip(**kwargs):
        ti = kwargs.pop("ti")
        path = Path(ti.xcom_pull(task_ids="zip_files"))
        resource = ti.xcom_pull(task_ids="get_resource")

        res = ckan.action.resource_patch(id=resource["id"], upload=open(path, "rb"))

        return res

    def build_message(**kwargs):
        ti = kwargs.pop("ti")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")

        msg = [f"Loaded {d['period_range']} data:", ""]

        for p in periods_to_load:
            begin = "-".join(p["begin"].split("/")[:-1])
            end = "-".join(p["end"].split("/")[:-1])
            msg.append(f"- {begin} to {end}")

        return "\n".join(msg)

    dag = DAG(
        d["dag_id"],
        default_args=airflow_utils.get_default_args(
            {
                "on_failure_callback": task_failure_slack_alert,
                "start_date": d["start_date"],
                "retries": 5,
                "retry_delay": timedelta(minutes=15),
            }
        ),
        description=d["description"],
        schedule_interval=d["schedule"],
        tags=["dataset"],
        catchup=False,
    )

    with dag:
        package = PythonOperator(
            task_id="get_package",
            op_kwargs={"ckan": ckan, "package_id": PACKAGE_ID},
            python_callable=ckan_utils.get_package,
        )

        create_tmp_dir = PythonOperator(
            task_id="create_tmp_data_dir",
            python_callable=airflow_utils.create_dir_with_dag_name,
            op_kwargs={"dag_id": d["dag_id"], "dir_variable_name": "tmp_dir"},
        )

        is_resource_new_branch = BranchPythonOperator(
            task_id="is_resource_new",
            python_callable=is_resource_new,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        create_resource = PythonOperator(
            task_id="create_new_resource",
            python_callable=create_new_resource,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        no_new_resource = DummyOperator(task_id="do_not_create_new_resource")

        resource = PythonOperator(
            task_id="get_resource",
            python_callable=get_resource,
            trigger_rule="none_failed",
            op_kwargs={"resource_name": d["resource_name"]},
        )

        get_data = PythonOperator(task_id="download_data", python_callable=download_data)

        unzip_files = PythonOperator(task_id="unzip_data", python_callable=unzip_data)

        filename_date_format = PythonOperator(
            task_id="get_filename_date_format",
            python_callable=get_filename_date_format,
            op_kwargs={"period_range": d["period_range"]},
        )

        latest_loaded = PythonOperator(
            task_id="determine_latest_period_loaded",
            python_callable=determine_latest_period_loaded,
        )

        periods_to_load = PythonOperator(
            task_id="calculate_periods_to_load",
            python_callable=calculate_periods_to_load,
            op_kwargs={"period_range": d["period_range"]},
        )

        no_new_periods_to_load = DummyOperator(task_id="no_new_periods_to_load")

        new_periods_to_load = DummyOperator(task_id="new_periods_to_load")

        new_data_to_load = BranchPythonOperator(
            task_id="are_there_new_periods",
            python_callable=are_there_new_periods,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        staging_folder = PythonOperator(
            task_id="make_staging_folder",
            python_callable=make_staging_folder,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        extract_complete = DummyOperator(task_id="extract_complete")

        extract_new = PythonOperator(
            task_id="extract_new", python_callable=make_new_extract_folders,
        )

        key_metrics = PythonOperator(
            task_id="key_metrics",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Key Metrics"},
        )

        new_v_return_visitors = PythonOperator(
            task_id="new_v_return_visitors",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "New vs. Return Visitors"},
        )

        hits_by_hour = PythonOperator(
            task_id="hits_by_hour",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Hits by Hour of Day"},
        )

        visits_by_day = PythonOperator(
            task_id="visits_by_day",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Visits by Day of Week"},
        )

        operating_system = PythonOperator(
            task_id="operating_system",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Operating System Platform"},
        )

        browser = PythonOperator(
            task_id="browser",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Browser"},
        )

        screen_resolution = PythonOperator(
            task_id="screen_resolution",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Screen Resolution"},
        )

        mobile_devices = PythonOperator(
            task_id="mobile_devices",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Mobile Devices"},
        )

        mobile_browser = PythonOperator(
            task_id="mobile_browser",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Mobile Browser"},
        )

        referring_site = PythonOperator(
            task_id="referring_site",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Referring Site"},
        )

        search_engines = PythonOperator(
            task_id="search_engines",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Search Engines"},
        )

        countries = PythonOperator(
            task_id="countries",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Countries"},
        )

        cities = PythonOperator(
            task_id="cities",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Cities"},
        )

        top_pages = PythonOperator(
            task_id="top_pages",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Top Pages"},
        )

        entry_pages = PythonOperator(
            task_id="entry_pages",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Entry Pages"},
        )

        exit_pages = PythonOperator(
            task_id="exit_pages",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Exit Pages"},
        )

        file_downloads = PythonOperator(
            task_id="file_downloads",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "File Downloads"},
        )

        email_address = PythonOperator(
            task_id="email_address",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Email Address"},
        )

        offsite_links = PythonOperator(
            task_id="offsite_links",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Offsite Links"},
        )

        anchor_tags = PythonOperator(
            task_id="anchor_tags",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Anchor Tags"},
        )

        copy_previous = PythonOperator(
            task_id="copy_previous", python_callable=copy_previous_to_staging,
        )

        zip_resource_files = PythonOperator(
            task_id="zip_files",
            python_callable=zip_files,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        upload_data = PythonOperator(task_id="upload_zip", python_callable=upload_zip)

        msg = PythonOperator(task_id="build_message", python_callable=build_message)

        send_notification = PythonOperator(
            task_id="send_success_msg", python_callable=send_success_msg,
        )

        delete_tmp_dir = PythonOperator(
            task_id="delete_tmp_dir",
            python_callable=airflow_utils.delete_tmp_data_dir,
            op_kwargs={"dag_id": d["dag_id"], "recursively": True},
            trigger_rule="none_failed",
        )

        package >> is_resource_new_branch
        is_resource_new_branch >> create_resource
        is_resource_new_branch >> no_new_resource
        [create_resource, no_new_resource] >> resource >> get_data >> unzip_files
        [unzip_files, filename_date_format] >> latest_loaded
        latest_loaded >> periods_to_load >> new_data_to_load
        [copy_previous, extract_complete] >> zip_resource_files >> upload_data >> msg
        create_tmp_dir >> get_data
        new_data_to_load >> no_new_periods_to_load
        new_data_to_load >> new_periods_to_load >> staging_folder >> [
            extract_new,
            copy_previous,
        ]
        extract_new >> [
            key_metrics,
            new_v_return_visitors,
            hits_by_hour,
            visits_by_day,
            operating_system,
            browser,
            screen_resolution,
            mobile_devices,
            mobile_browser,
            referring_site,
            search_engines,
            countries,
            cities,
            top_pages,
            entry_pages,
            exit_pages,
            file_downloads,
            email_address,
            offsite_links,
            anchor_tags,
        ] >> extract_complete
        msg >> send_notification
        [send_notification, no_new_periods_to_load] >> delete_tmp_dir

    return dag
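# Usage sketch (not from the source): create_dag() reads the keys shown below from its
# config dict -- d["dag_id"], d["start_date"], d["description"], d["schedule"],
# d["resource_name"] and d["period_range"]. All values here are illustrative only.
sample_config = {
    "dag_id": "web_analytics_weekly",                 # hypothetical DAG id
    "start_date": datetime(2021, 1, 1),
    "description": "Weekly web analytics extracts",   # hypothetical description
    "schedule": "0 6 * * 1",                          # hypothetical cron schedule
    "resource_name": "web-analytics-weekly-report",   # hypothetical resource name
    "period_range": "weekly",                         # one of: weekly, monthly, yearly
}
# globals()[sample_config["dag_id"]] = create_dag(sample_config)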
        ] >> job_failed >> restore_backup >> message_slack_recover

    return dag


# build a DAG for each dataset
for dataset in datasets:
    dag_id = dataset['package_id'] + '-' + dataset['tps_table_code']
    schedule = '@once'
    default_args = airflow_utils.get_default_args({
        "owner": "Gary",
        "depends_on_past": False,
        "email": ["*****@*****.**"],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 0,
        "on_failure_callback": task_failure_slack_alert,
        "start_date": common_job_settings["start_date"],
    })

    globals()[dag_id] = create_dag(dag_id, dataset, schedule, default_args)
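# Illustrative sketch (not from the source) of the shape of a `datasets` entry consumed
# by the loop above; only 'package_id' and 'tps_table_code' are referenced there, and
# both values below are hypothetical.
example_dataset = {
    "package_id": "example-package",   # hypothetical CKAN package id
    "tps_table_code": "TBL-001",       # hypothetical table code
}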
    airflow_utils.message_slack(
        name=PACKAGE_NAME,
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
    PACKAGE_NAME,
    default_args=airflow_utils.get_default_args({
        "on_failure_callback": task_failure_slack_alert,
        "start_date": datetime(2020, 11, 24, 13, 35, 0),
        "retries": 0,
        # "retry_delay": timedelta(minutes=3),
    }),
    description="Take data from opendata.toronto.ca (CSV) and put into datastore",
    schedule_interval="0 17 * * *",
    catchup=False,
    tags=["dataset"],
) as dag:

    def is_resource_new(**kwargs):
        package = kwargs["ti"].xcom_pull(task_ids="get_package")
        logging.info(
            f"resources found: {[r['name'] for r in package['resources']]}")
        get_resource_id >> new_or_existing >> [new_resource, existing_resource]
        new_resource >> join_or >> join_and
        existing_resource >> backup_resource >> delete_resource >> join_or >> join_and
        join_and >> insert_records >> modify_metadata >> job_success >> delete_tmp_dir >> message_slack
        [get_agol_data, get_resource_id] >> job_failed
        [insert_records] >> job_failed >> restore_backup

    return dag


for dataset in DATASETS:
    dag_id = dataset['package_id']
    schedule = '@once'
    default_args = airflow_utils.get_default_args(
        {
            "owner": "Mackenzie",
            "depends_on_past": False,
            "email": ["*****@*****.**"],
            "email_on_failure": False,
            "email_on_retry": False,
            "retries": 0,
            "on_failure_callback": task_failure_slack_alert,
            "start_date": COMMON_JOB_SETTINGS["start_date"],
        }
    )

    globals()[dag_id] = create_dag(dag_id, dataset, schedule, default_args)
from datetime import datetime

from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash import BashOperator

from utils import airflow_utils

with DAG(
    "pull_latest_code",
    default_args=airflow_utils.get_default_args(
        {"retries": 0, "start_date": datetime(2020, 11, 10, 0, 30, 0)}
    ),
    description="Pulls repo code. Updated dags must be deleted and restarted.",
    schedule_interval="*/5 * * * *",
    tags=["sustainment"],
    catchup=False,
) as dag:
    pull_repo = BashOperator(
        task_id="pull_repo",
        bash_command=f"git -C {Variable.get('repo_dir')} pull; echo $?",
    )

    pull_repo
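# The BashOperator above assumes an Airflow Variable named "repo_dir" that points at the
# local clone of the DAGs repo. With the Airflow 2.x CLI it could be set, for example, as
# follows (the path is a placeholder):
#   airflow variables set repo_dir /path/to/dags-repo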
from ckan_operators.package_operator import (
    GetAllPackagesOperator,
    AssertIdenticalPackagesOperator,
)
from utils import airflow_utils

CONTRIB_ADDRESS = "https://ckanadmin0.intra.prod-toronto.ca/"
DELIVERY_ADDRESS = "https://ckan0.cf.opendata.inter.prod-toronto.ca/"

DEFAULT_ARGS = airflow_utils.get_default_args({
    "owner": "Mackenzie",
    "depends_on_past": False,
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    # "on_failure_callback": task_failure_slack_alert,
    "retries": 0,
    "start_date": datetime(2021, 9, 1, 0, 0, 0),
})

DESCRIPTION = "Compares CKAN Contrib to Delivery and sends a Slack message if they aren't identical"
SCHEDULE = "1 0,12 * * 1-5"  # minute 1 at noon and midnight on weekdays
TAGS = ["sustainment"]

with DAG(
    "check_contrib_delivery_sync",
    description=DESCRIPTION,
    default_args=DEFAULT_ARGS,
def send_failure_msg():
    airflow_utils.message_slack(
        name=job_name,
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
    job_name,
    default_args=airflow_utils.get_default_args({
        "on_failure_callback": send_failure_msg,
        "start_date": datetime(2020, 11, 9, 0, 30, 0),
    }),
    description="Identifies empty datastore resources and sends them to Slack",
    schedule_interval="5 15,18,21,0,3 * * *",
    tags=["sustainment"],
    catchup=False,
) as dag:
    ckan_creds = Variable.get("ckan_credentials_secret", deserialize_json=True)
    active_env = Variable.get("active_env")
    ckan_address = ckan_creds[active_env]["address"]
    ckan_apikey = ckan_creds[active_env]["apikey"]

    def send_success_msg(**kwargs):
        msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")
    resources_to_load = kwargs["ti"].xcom_pull(
        task_ids="identify_resources_to_load")

    if len(resources_to_load) == 0:
        return "no_files_are_not_new"

    return "yes_continue_with_refresh"


def get_package():
    return CKAN.action.package_show(id=PACKAGE_ID)


default_args = airflow_utils.get_default_args({
    "on_failure_callback": task_failure_slack_alert,
    "start_date": job_settings["start_date"],
})

with DAG(
    PACKAGE_ID,
    default_args=default_args,
    description=job_settings["description"],
    schedule_interval=job_settings["schedule"],
    catchup=False,
    tags=["dataset"],
) as dag:
    create_tmp_dir = PythonOperator(
        task_id="create_tmp_dir",
        python_callable=airflow_utils.create_dir_with_dag_name,
message_type="error", msg="Job not finished", active_env=Variable.get("active_env"), prod_webhook=Variable.get("active_env") == "prod", ) with DAG( PACKAGE_NAME, default_args=airflow_utils.get_default_args( { "owner": "Gary", "depends_on_past": False, "email": ["*****@*****.**"], "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(seconds=600), "on_failure_callback": task_failure_slack_alert, "start_date": days_ago(1), "retries": 0, } ), description="Take tpp json and narratives from progress portal", schedule_interval="0 22 * * 1-5", catchup=False, tags=["dataset"], ) as dag: def is_resource_new(**kwargs): package = kwargs["ti"].xcom_pull(task_ids="get_package") logging.info(f"resources found: {[r['name'] for r in package['resources']]}")
def send_failure_message():
    airflow_utils.message_slack(
        name=PACKAGE_ID,
        message_type="error",
        msg="Job not finished",
        active_env=ACTIVE_ENV,
        prod_webhook=ACTIVE_ENV == "prod",
    )


with DAG(
    PACKAGE_ID,
    default_args=airflow_utils.get_default_args({
        "on_failure_callback": task_failure_slack_alert,
        "start_date": datetime(2020, 11, 10, 13, 35, 0),
    }),
    description="Get rain gauge data from the last time it was loaded to now",
    schedule_interval="30 14 * * *",
    catchup=False,
    tags=["dataset"],
) as dag:
    CKAN_CREDS = Variable.get("ckan_credentials_secret", deserialize_json=True)
    CKAN = ckanapi.RemoteCKAN(**CKAN_CREDS[ACTIVE_ENV])

    def send_success_msg(**kwargs):
        msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")
        airflow_utils.message_slack(
def send_failure_message():
    airflow_utils.message_slack(
        name=PACKAGE_NAME,
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
    PACKAGE_NAME,
    default_args=airflow_utils.get_default_args({
        "on_failure_callback": task_failure_slack_alert,
        "start_date": days_ago(1),
        "retries": 0,
        # "retry_delay": timedelta(minutes=3),
    }),
    description="",
    schedule_interval="0 17 * * *",
    catchup=False,
    tags=["dataset"],
) as dag:

    def is_resource_new(**kwargs):
        pkg = kwargs["ti"].xcom_pull(task_ids="get_package")
        resource_name = kwargs["resource_name"]
        logging.info(
            f"looking for: {resource_name} | resources found: {[r['name'] for r in pkg['resources']]}"
        )