subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    dag_name=task_id,
    job_name="firefox-android-beta-adjust-import",
    cluster_name="firefox-android-beta-adjust-import-{{ ds_nodash }}",
    idle_delete_ttl="600",
    num_workers=40,
    worker_machine_type="n1-standard-8",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
    },
    additional_metadata={"PIP_PACKAGES": "click==7.1.2"},
    python_driver_code="gs://{}/jobs/adjust_import.py".format(params.artifact_bucket),
    py_args=[
        "--pbkdf2",
        "--salt",
        "org.mozilla.fenix-salt",
        "--project",
        project,
        "--input_table",
        "tmp.adjust_firefox_preview",
        "--output_table",
        "firefox_android_beta_external.adjust_install_time_v1",
        "--bucket",
        params.storage_bucket,
    ],
    gcp_conn_id=params.conn_id,
    service_account=params.client_email,
    artifact_bucket=params.artifact_bucket,
    storage_bucket=params.storage_bucket,
    default_args=subdag_args,
),
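The bare `subdag=` fragments in this section are keyword arguments to an enclosing `SubDagOperator`; the complete pattern appears in the crash_report_parquet and taar_locale snippets further down. As a minimal sketch of that wrapping (the task id "adjust_import", the parent `dag`, and `subdag_args` are illustrative assumptions, not values taken from the fragment above, and the import assumes an Airflow 1.10-style module path):

# Hedged sketch of the SubDagOperator wrapping assumed around the fragment above.
# "adjust_import", `dag`, and `subdag_args` are assumptions for illustration;
# moz_dataproc_pyspark_runner is assumed to be imported from the DAG utilities module.
from airflow.operators.subdag_operator import SubDagOperator

adjust_import = SubDagOperator(
    task_id="adjust_import",  # the runner's dag_name must match this task_id
    dag=dag,                  # parent DAG defined earlier in the file
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="adjust_import",
        # cluster, driver, and py_args as in the fragment above
        default_args=subdag_args,
    ),
)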
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    dag_name=task_id,
    job_name="ltv-daily",
    cluster_name="ltv-daily-{{ ds_nodash }}",
    idle_delete_ttl="600",
    num_workers=5,
    worker_machine_type="n1-standard-8",
    optional_components=["ANACONDA"],
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
    },
    additional_metadata={"PIP_PACKAGES": "lifetimes==0.11.1"},
    python_driver_code="gs://{}/jobs/ltv_daily.py".format(params.artifact_bucket),
    py_args=[
        "--submission-date",
        "{{ ds }}",
        "--prediction-days",
        "364",
        "--project-id",
        project,
        "--source-qualified-table-id",
        "{project}.search.search_rfm".format(project=project),
        "--dataset-id",
        "analysis",
        "--intermediate-table-id",
        "ltv_daily_temporary_search_rfm_day",
        "--model-input-table-id",
        "ltv_daily_model_perf",
        "--model-output-table-id",
        "ltv_daily",
        "--temporary-gcs-bucket",
        params.storage_bucket,
    ],
    gcp_conn_id=params.conn_id,
    service_account=params.client_email,
    artifact_bucket=params.artifact_bucket,
    storage_bucket=params.storage_bucket,
    default_args=subdag_args,
),
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    dag_name=task_id,
    job_name="prerelease_aggregates",
    cluster_name="prerelease-telemetry-aggregates-{{ ds_nodash }}",
    idle_delete_ttl="600",
    num_workers=10,
    worker_machine_type="n1-standard-8",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
        "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
    },
    additional_metadata={
        "PIP_PACKAGES": "git+https://github.com/mozilla/python_mozaggregator.git"
    },
    python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(artifact_bucket),
    py_args=[
        "aggregator",
        "--date",
        "{{ ds_nodash }}",
        "--channels",
        "nightly,aurora,beta",
        "--postgres-db",
        "telemetry",
        "--postgres-user",
        "root",
        "--postgres-pass",
        "{{ var.value.mozaggregator_postgres_pass }}",
        "--postgres-host",
        "{{ var.value.mozaggregator_postgres_host }}",
        "--postgres-ro-host",
        "{{ var.value.mozaggregator_postgres_ro_host }}",
        "--num-partitions",
        str(10 * 32),
    ]
    + (
        ["--source", "bigquery", "--project-id", "moz-fx-data-shared-prod"]
        if not EXPORT_TO_AVRO
        else [
            "--source",
            "avro",
            "--avro-prefix",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
        ]
    ),
    gcp_conn_id=gcp_conn.gcp_conn_id,
    service_account=client_email,
    artifact_bucket=artifact_bucket,
    storage_bucket=storage_bucket,
    default_args=subdag_args,
),
task_id="public_data_hardware_report", dag=dag, subdag = moz_dataproc_pyspark_runner( parent_dag_name=dag.dag_id, dag_name="public_data_hardware_report", default_args=default_args, cluster_name="public-data-hardware-report-{{ ds }}", job_name="Firefox_Public_Data_Hardware_Report-{{ ds }}", python_driver_code="gs://{}/jobs/moz_dataproc_runner.py".format(params.artifact_bucket), init_actions_uris=["gs://dataproc-initialization-actions/python/pip-install.sh"], additional_metadata={'PIP_PACKAGES': "git+https://github.com/mozilla/firefox-public-data-report-etl.git"}, additional_properties={"spark:spark.jars":"gs://spark-lib/bigquery/spark-bigquery-latest.jar", "spark-env:AWS_ACCESS_KEY_ID": aws_access_key, "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key}, py_args=[ "public_data_report", "hardware_report", "--date_from", "{{ ds }}", "--bq_table", "moz-fx-data-shared-prod.telemetry_derived.public_data_report_hardware", "--temporary_gcs_bucket", params.storage_bucket, "--s3_bucket", "telemetry-public-analysis-2", "--s3_path", "public-data-report/hardware/", ], idle_delete_ttl='14400', num_workers=2, worker_machine_type='n1-standard-4', gcp_conn_id=params.conn_id, service_account=params.client_email, storage_bucket=params.storage_bucket, ) )
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    dag_name=task_id,
    job_name="bgbb_pred_dataproc",
    cluster_name="bgbb-pred-{{ ds_nodash }}",
    idle_delete_ttl="600",
    num_workers=10,
    worker_machine_type="n1-standard-8",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
    },
    additional_metadata={
        "PIP_PACKAGES": "git+https://github.com/wcbeard/bgbb_airflow.git"
    },
    python_driver_code="gs://{}/jobs/bgbb_runner.py".format(params.artifact_bucket),
    py_args=[
        "bgbb_pred",
        "--submission-date",
        "{{ ds }}",
        "--model-win",
        "90",
        "--sample-ids",
        "[42]" if params.is_dev else "[]",
        "--source",
        "bigquery",
        "--view-materialization-project",
        params.project_id if params.is_dev else "moz-fx-data-shared-prod",
        "--view-materialization-dataset",
        "analysis",
        "--bucket-protocol",
        "gs",
        "--param-bucket",
        params.output_bucket,
        "--param-prefix",
        "bgbb/params/v1",
        "--pred-bucket",
        params.output_bucket,
        "--pred-prefix",
        "bgbb/active_profiles/v1",
    ],
    gcp_conn_id=params.conn_id,
    service_account=params.client_email,
    artifact_bucket=params.artifact_bucket,
    storage_bucket=params.storage_bucket,
    default_args=subdag_args,
),
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    dag_name="hardware_report",
    default_args=default_args,
    cluster_name=cluster_name,
    job_name="Firefox_Hardware_Report",
    python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/hardware_report.py",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_metadata={
        "PIP_PACKAGES": "google-cloud-bigquery==1.21.0 python_moztelemetry==0.10.2 boto3==1.9.87 click==6.7 click_datetime==0.2 requests-toolbelt==0.8.0 requests==2.20.1 typing==3.6.4"
    },
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
        "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
        "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    py_args=[
        "--start_date",
        DS_WEEKLY,
        "--bucket",
        "telemetry-public-analysis-2",
        "--spark-provider",
        "dataproc",
    ],
    idle_delete_ttl="14400",
    num_workers=15,
    worker_machine_type="n1-standard-4",
    gcp_conn_id=gcp_conn_id,
))
# Spark job reads gcs json and writes gcs parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet",
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="Socorro_Crash_Reports_to_Parquet",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/socorro_import_crash_data.py",
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--source-gcs-path",
            "gs://{}/v1/crash_report".format(gcs_data_bucket),
            "--dest-gcs-path",
            "gs://{}/{}".format(gcs_data_bucket, dataset),
        ],
        idle_delete_ttl="14400",
        num_workers=8,
        worker_machine_type="n1-standard-8",
        aws_conn_id=read_aws_conn_id,
        gcp_conn_id=gcp_conn_id,
    ),
)

bq_gcp_conn_id = "google_cloud_derived_datasets"
bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id)
    dag=dag)

taar_dynamo_job = SubDagOperator(
    task_id="taar_dynamo_job",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_dynamo_job",
        default_args=default_args,
        master_machine_type="n1-standard-32",
        worker_machine_type="n1-standard-32",
        cluster_name=taar_dynamo_cluster_name,
        job_name="TAAR_Dynamo",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_dynamo.py",
        num_workers=12,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            taar_aws_access_key,
            "--aws_secret_access_key",
            taar_aws_secret_key,
        ],
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
        master_disk_type="pd-ssd",
        worker_disk_type="pd-ssd",
    ),
    dag=dag,
)
taar_locale = SubDagOperator(
    task_id="taar_locale",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_locale",
        default_args=default_args,
        cluster_name=taar_locale_cluster_name,
        job_name="TAAR_Locale",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_locale.py",
        num_workers=12,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            taar_aws_access_key,
            "--aws_secret_access_key",
            taar_aws_secret_key,
            "--bucket",
            "telemetry-private-analysis-2",
            "--prefix",
            "taar/locale/",
        ],
        aws_conn_id=taar_aws_conn_id,
        gcp_conn_id=taar_gcpdataproc_conn_id,
    ),
    dag=dag,
)

taar_similarity_args = default_args.copy()
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    image_version="1.5",
    dag_name="graphics_trends",
    default_args=default_args,
    cluster_name="graphics-trends-{{ ds }}",
    job_name="graphics-trends",
    python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/graphics/graphics_telemetry_trends.py",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)},
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
        "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
        "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    py_args=[
        "--s3-bucket",
        S3_BUCKET,
        "--s3-prefix",
        S3_PREFIX,
        "--weekly-fraction",
        "0.003",
    ],
    idle_delete_ttl="14400",
    num_workers=2,
    worker_machine_type="n1-standard-4",
    gcp_conn_id=params.conn_id,
    service_account=params.client_email,
    storage_bucket=params.storage_bucket,
))
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=taar_weekly.dag_id,
    dag_name="taar_ensemble",
    default_args=default_args_weekly,
    cluster_name=taar_ensemble_cluster_name,
    job_name="TAAR_ensemble",
    python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_ensemble.py",
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
        "spark:spark.hadoop.fs.s3a.access.key": taar_aws_access_key,
        "spark:spark.hadoop.fs.s3a.secret.key": taar_aws_secret_key,
        "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
        "spark:spark.python.profile": "true",
    },
    num_workers=35,
    worker_machine_type="n1-standard-8",
    master_machine_type="n1-standard-8",
    init_actions_uris=[
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/pip-install.sh"
    ],
    additional_metadata={
        "PIP_PACKAGES": "mozilla-taar3==0.4.12 mozilla-srgutil==0.2.1 python-decouple==3.1 click==7.0 boto3==1.7.71 dockerflow==2018.4.0"
    },
    optional_components=["ANACONDA", "JUPYTER"],
    py_args=[
        "--date",
        "{{ ds_nodash }}",
        "--aws_access_key_id",
        taar_aws_access_key,
        "--aws_secret_access_key",
        taar_aws_secret_key,
        "--sample_rate",
        "0.005",
    ],
    aws_conn_id=taar_aws_conn_id,
    gcp_conn_id=taar_gcpdataproc_conn_id,
    master_disk_type="pd-ssd",
    worker_disk_type="pd-ssd",
    master_disk_size=1024,
    worker_disk_size=1024,
    master_num_local_ssds=2,
    worker_num_local_ssds=2,
),
# Required to write json output back to s3://telemetry-public-analysis-2/app-update/data/out-of-date/
write_aws_conn_id = "aws_dev_telemetry_public_analysis_2_rw"
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()

update_orphaning_dashboard_etl = SubDagOperator(
    task_id="update_orphaning_dashboard_etl",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="update_orphaning_dashboard_etl",
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="update_orphaning_dashboard_etl",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/update_orphaning_dashboard_etl.py",
        init_actions_uris=["gs://dataproc-initialization-actions/python/pip-install.sh"],
        additional_metadata={
            "PIP_PACKAGES": "google-cloud-bigquery==1.20.0 google-cloud-storage==1.19.1 boto3==1.9.253"
        },
        additional_properties={
            "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.3"
        },
        py_args=[
            "--run-date",
            DS_WEEKLY,
            "--gcs-bucket",
            "moz-fx-data-derived-datasets-analysis",
            "--gcs-prefix",
            "update-orphaning-airflow",
            "--s3-output-bucket",
            "telemetry-public-analysis-2",
            "--s3-output-path",
            "app-update/data/out-of-date/",
            "--aws-access-key-id",
            aws_access_key,
            "--aws-secret-access-key",
            aws_secret_key,
        ],
        idle_delete_ttl="14400",
        num_workers=20,
        worker_machine_type="n1-standard-8",
        gcp_conn_id=gcp_conn_id,
    ),
)
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=taar_weekly.dag_id,
    dag_name="taar_ensemble",
    default_args=default_args_weekly,
    cluster_name=taar_ensemble_cluster_name,
    job_name="TAAR_ensemble",
    # GCS bucket for testing is located in `cfr-personalization-experiment` project
    # python_driver_code="gs://taar_models/tmp/jobs/taar_ensemble.py",
    python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_ensemble.py",
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
        "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
        "spark:spark.python.profile": "true",
    },
    num_workers=35,
    worker_machine_type="n1-standard-8",
    master_machine_type="n1-standard-8",
    init_actions_uris=[
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/pip-install.sh"
    ],
    additional_metadata={
        "PIP_PACKAGES": "mozilla-taar3==1.0.7 python-decouple==3.1 click==7.0 "
        "google-cloud-storage==1.19.1"
    },
    optional_components=["ANACONDA", "JUPYTER"],
    py_args=[
        "--date",
        "{{ ds_nodash }}",
        "--gcs_model_bucket",
        TAAR_ETL_MODEL_STORAGE_BUCKET,
        "--sample_rate",
        "0.005",
    ],
    gcp_conn_id=taar_gcpdataproc_conn_id,
    master_disk_type="pd-ssd",
    worker_disk_type="pd-ssd",
    master_disk_size=1024,
    worker_disk_size=1024,
    master_num_local_ssds=2,
    worker_num_local_ssds=2,
),
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    dag_name=task_id,
    job_name="prerelease_aggregates_cloudsql",
    cluster_name="prerelease-telemetry-aggregates-cloudsql-{{ ds_nodash }}",
    idle_delete_ttl="600",
    zone="us-west2-a",
    subnetwork_uri="default",
    internal_ip_only=True,
    num_workers=10,
    worker_machine_type="n1-standard-8",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
    },
    additional_metadata={
        "PIP_PACKAGES": "git+https://github.com/mozilla/python_mozaggregator.git"
    },
    python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(artifact_bucket),
    py_args=[
        "aggregator",
        "--date",
        "{{ ds_nodash }}",
        "--channels",
        "nightly,aurora,beta",
        "--postgres-db",
        "telemetry",
        "--postgres-user",
        "root",
        "--postgres-pass",
        "{{ var.value.mozaggregator_cloudsql_pass }}",
        "--postgres-host",
        "{{ var.value.mozaggregator_cloudsql_host }}",
        "--postgres-ro-host",
        "{{ var.value.mozaggregator_cloudsql_ro_host }}",
        "--num-partitions",
        str(10 * 32),
        "--source",
        "bigquery",
        "--project-id",
        "moz-fx-data-shared-prod",
    ],
    gcp_conn_id=gcp_conn.gcp_conn_id,
    service_account=client_email,
    artifact_bucket=artifact_bucket,
    storage_bucket=storage_bucket,
    default_args=subdag_args,
),
    dag=dag,
)

taar_lite = SubDagOperator(
    task_id="taar_lite",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name="taar_amodump",
        dag_name="taar_lite",
        default_args=default_args,
        cluster_name=taarlite_cluster_name,
        job_name="TAAR_Lite_GUID_GUID",
        python_driver_code="gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_lite_guidguid.py",
        # python_driver_code="gs://temp-hwoo-removemelater/taar_lite_guidguid.py",
        num_workers=8,
        py_args=[
            "--date",
            "{{ ds_nodash }}",
            "--aws_access_key_id",
            aws_access_key,
            "--aws_secret_access_key",
            aws_secret_key,
        ],
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcpdataproc_conn_id,
    ),
    dag=dag,
)

# Set a dependency on amodump from amowhitelist
amowhitelist.set_upstream(amodump)
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    image_version="1.5",
    dag_name="modules_with_missing_symbols",
    default_args=default_args,
    cluster_name="modules-with-missing-symbols-{{ ds }}",
    job_name="modules-with-missing-symbols",
    python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/modules_with_missing_symbols.py",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)},
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
        "spark-env:AWS_ACCESS_KEY_ID": ses_access_key,
        "spark-env:AWS_SECRET_ACCESS_KEY": ses_secret_key,
    },
    py_args=[
        "--run-on-days",
        "0",  # run monday
        "--date",
        "{{ ds }}",
    ],
    idle_delete_ttl="14400",
    num_workers=2,
    worker_machine_type="n1-standard-4",
    gcp_conn_id=params.conn_id,
    service_account=params.client_email,
    storage_bucket=params.storage_bucket,
),
subdag=moz_dataproc_pyspark_runner(
    parent_dag_name=dag.dag_id,
    image_version="1.5",
    dag_name="bhr_collection",
    default_args=default_args,
    cluster_name="bhr-collection-{{ ds }}",
    job_name="bhr-collection",
    python_driver_code="https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/bhr_collection/bhr_collection.py",
    init_actions_uris=[
        "gs://dataproc-initialization-actions/python/pip-install.sh"
    ],
    additional_metadata={
        "PIP_PACKAGES": "boto3==1.16.20 click==7.1.2"
    },
    additional_properties={
        "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
        "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
        "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key,
    },
    py_args=[
        "--date",
        "{{ ds }}",
        "--sample-size",
        "0.5",
    ],
    idle_delete_ttl="14400",
    num_workers=6,
    worker_machine_type="n1-highmem-4",
    gcp_conn_id=params.conn_id,
    service_account=params.client_email,
    storage_bucket=params.storage_bucket,
))
pool="DATA_ENG_EXTERNALTASKSENSOR", email_on_retry=False, dag=dag) taar_locale = SubDagOperator( task_id="taar_locale", subdag=moz_dataproc_pyspark_runner( parent_dag_name=dag.dag_id, dag_name="taar_locale", default_args=default_args, cluster_name=taar_locale_cluster_name, job_name="TAAR_Locale", python_driver_code= "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/taar_locale.py", # GCS bucket for testing is located in `cfr-personalization-experiment` project # python_driver_code="gs://taar_models/tmp/jobs/taar_locale.py", num_workers=12, py_args=[ "--date", "{{ ds_nodash }}", "--bucket", TAAR_ETL_MODEL_STORAGE_BUCKET, "--prefix", "taar/locale", ], gcp_conn_id=taar_gcpdataproc_conn_id), dag=dag) taar_similarity = SubDagOperator( task_id="taar_similarity", subdag=moz_dataproc_pyspark_runner( parent_dag_name=dag.dag_id,