        'source': {'image_uri': GCP_VISION_ANNOTATE_IMAGE_URL}
    },
    'features': [{'type': enums.Feature.Type.LOGO_DETECTION}],
}
# [END howto_operator_vision_annotate_image_request]

# [START howto_operator_vision_detect_image_param]
DETECT_IMAGE = {"source": {"image_uri": GCP_VISION_ANNOTATE_IMAGE_URL}}
# [END howto_operator_vision_detect_image_param]

with models.DAG(
    'example_gcp_vision_autogenerated_id', default_args=default_args, schedule_interval=None
) as dag_autogenerated_id:
    # ################################## #
    # ### Autogenerated IDs examples ### #
    # ################################## #

    # [START howto_operator_vision_product_set_create]
    product_set_create = CloudVisionCreateProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set=product_set,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id='product_set_create',
    )
    # [END howto_operator_vision_product_set_create]
# Example dataset
DATASET = {
    "display_name": "test_video_dataset",
    "video_classification_dataset_metadata": {},
}

IMPORT_INPUT_CONFIG = {"gcs_source": {"input_uris": [GCP_AUTOML_VIDEO_BUCKET]}}

default_args = {"start_date": days_ago(1)}

extract_object_id = CloudAutoMLHook.extract_object_id

# Example DAG for AutoML Video Intelligence Classification
with models.DAG(
    "example_automl_video",
    default_args=default_args,
    schedule_interval=None,  # Override to match your needs
    user_defined_macros={"extract_object_id": extract_object_id},
    tags=['example'],
) as example_dag:
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task", dataset=DATASET, location=GCP_AUTOML_LOCATION
    )

    dataset_id = (
        '{{ task_instance.xcom_pull("create_dataset_task", key="dataset_id") }}'
    )

    import_dataset_task = AutoMLImportDataOperator(
        task_id="import_dataset_task",
        dataset_id=dataset_id,
# under the License.
import os

from airflow import models
from airflow.providers.google.cloud.transfers.sheets_to_gcs import GoogleSheetsToGCSOperator
from airflow.providers.google.suite.transfers.gcs_to_sheets import GCSToGoogleSheetsOperator
from airflow.utils.dates import days_ago

BUCKET = os.environ.get("GCP_GCS_BUCKET", "example-test-bucket3")
SPREADSHEET_ID = os.environ.get("SPREADSHEET_ID", "example-spreadsheetID")
NEW_SPREADSHEET_ID = os.environ.get("NEW_SPREADSHEET_ID", "1234567890qwerty")

with models.DAG(
    "example_gcs_to_sheets",
    start_date=days_ago(1),
    schedule_interval='@once',  # Override to match your needs
    tags=["example"],
) as dag:
    upload_sheet_to_gcs = GoogleSheetsToGCSOperator(
        task_id="upload_sheet_to_gcs",
        destination_bucket=BUCKET,
        spreadsheet_id=SPREADSHEET_ID,
    )

    # [START upload_gcs_to_sheets]
    upload_gcs_to_sheet = GCSToGoogleSheetsOperator(
        task_id="upload_gcs_to_sheet",
        bucket_name=BUCKET,
        object_name="{{ task_instance.xcom_pull('upload_sheet_to_gcs')[0] }}",
        spreadsheet_id=NEW_SPREADSHEET_ID,
    'owner': '*****@*****.**',
    'start_date': datetime.datetime(2019, 5, 12),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'depends_on_past': False,
    # If a task fails, retry it once after waiting at least 10 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'kpi_dashboard'

with models.DAG(
        dag_name,
        # KPI dashboard refreshes at 16:00 UTC, so run this 15 minutes beforehand.
        schedule_interval='45 15 * * *',
        default_args=default_args) as dag:

    kpi_dashboard = bigquery_etl_query(
        destination_table='firefox_kpi_dashboard_v1',
        dataset_id='telemetry',
        date_partition_parameter=None,
        email=[
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
        ])

    smoot_usage_new_profiles_v2 = bigquery_etl_query(
        task_id='smoot_usage_new_profiles_v2',
}

PARAMETERS = {"dataRange": "LAST_14_DAYS", "timezoneCode": "America/New_York"}

CREATE_SDF_DOWNLOAD_TASK_BODY_REQUEST: Dict = {
    "version": SDF_VERSION,
    "advertiserId": ADVERTISER_ID,
    "inventorySourceFilter": {"inventorySourceIds": []},
}

DOWNLOAD_LINE_ITEMS_REQUEST: Dict = {"filterType": ADVERTISER_ID, "format": "CSV", "fileSpec": "EWF"}
# [END howto_display_video_env_variables]

with models.DAG(
    "example_display_video",
    schedule_interval='@once',  # Override to match your needs
    start_date=dates.days_ago(1),
) as dag1:
    # [START howto_google_display_video_createquery_report_operator]
    create_report = GoogleDisplayVideo360CreateReportOperator(body=REPORT, task_id="create_report")
    report_id = create_report.output["report_id"]
    # [END howto_google_display_video_createquery_report_operator]

    # [START howto_google_display_video_runquery_report_operator]
    run_report = GoogleDisplayVideo360RunReportOperator(
        report_id=report_id, parameters=PARAMETERS, task_id="run_report"
    )
    # [END howto_google_display_video_runquery_report_operator]

    # [START howto_google_display_video_wait_report_operator]
    wait_for_report = GoogleDisplayVideo360ReportSensor(task_id="wait_for_report", report_id=report_id)
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import os
from datetime import datetime

from airflow import models
from airflow.providers.amazon.aws.transfers.sftp_to_s3 import SFTPToS3Operator

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")

with models.DAG(
    "example_sftp_to_s3",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
) as dag:
    # [START howto_transfer_sftp_to_s3]
    create_sftp_to_s3_job = SFTPToS3Operator(
        task_id="create_sftp_to_s3_job",
        sftp_path="/tmp/sftp_path",
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
    )
    # [END howto_transfer_sftp_to_s3]
"location={location}&" \ "instance={instance}&" \ "use_proxy=False&" \ "use_ssl=True&" \ "sslcert={client_cert_file}&" \ "sslkey={client_key_file}&" \ "sslrootcert={server_ca_file}".format(**mysql_kwargs) # [END howto_operator_cloudsql_query_connections] # [START howto_operator_cloudsql_query_operators] connection_names = [ "proxy_postgres_tcp", "proxy_postgres_socket", "public_postgres_tcp", "public_postgres_tcp_ssl", "proxy_mysql_tcp", "proxy_mysql_socket", "public_mysql_tcp", "public_mysql_tcp_ssl" ] tasks = [] with models.DAG(dag_id='example_gcp_sql_query', default_args=default_args, schedule_interval=None) as dag: for connection_name in connection_names: tasks.append( CloudSqlQueryOperator(gcp_cloudsql_conn_id=connection_name, task_id="example_gcp_sql_task_" + connection_name, sql=SQL)) # [END howto_operator_cloudsql_query_operators]
from utils.amplitude import export_to_amplitude

default_args = {
    'owner': '*****@*****.**',
    'start_date': datetime.datetime(2019, 6, 27),
    'email': ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'bq_events_to_amplitude'

with models.DAG(
        dag_name,
        default_args=default_args,
        schedule_interval='0 1 * * *') as dag:

    fenix_task_id = 'fenix_amplitude_export'
    SubDagOperator(
        subdag=export_to_amplitude(
            dag_name=fenix_task_id,
            parent_dag_name=dag_name,
            default_args=default_args,
            project='moz-fx-data-shared-prod',
            dataset='telemetry',
            table_or_view='fenix_events_v1',
            s3_prefix='fenix',
        ),
        task_id=fenix_task_id,
    )
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG that shows the complex DAG structure.
"""
from datetime import datetime

from airflow import models
from airflow.models.baseoperator import chain
from airflow.operators.bash import BashOperator

with models.DAG(
    dag_id="example_complex",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=['example', 'example2', 'example3'],
) as dag:
    # Create
    create_entry_group = BashOperator(task_id="create_entry_group", bash_command="echo create_entry_group")

    create_entry_group_result = BashOperator(
        task_id="create_entry_group_result", bash_command="echo create_entry_group_result"
    )

    create_entry_group_result2 = BashOperator(
        task_id="create_entry_group_result2", bash_command="echo create_entry_group_result2"
    )
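    # Illustrative only, not from the original file: the `chain` helper imported above
    # can wire tasks linearly; for the three tasks defined so far it would look like this.
    chain(create_entry_group, create_entry_group_result, create_entry_group_result2)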
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import os

from airflow import models
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.utils import dates

# [START howto_gcs_environment_variables]
BUCKET_NAME = os.environ.get('GCP_GCS_BUCKET', 'example-bucket-name')
PATH_TO_UPLOAD_FILE = os.environ.get('GCP_GCS_PATH_TO_UPLOAD_FILE', 'example-text.txt')
DESTINATION_FILE_LOCATION = os.environ.get('GCP_GCS_DESTINATION_FILE_LOCATION', 'example-text.txt')
# [END howto_gcs_environment_variables]

with models.DAG(
    'example_local_to_gcs',
    default_args=dict(start_date=dates.days_ago(1)),
    schedule_interval=None,
    tags=['example'],
) as dag:
    # [START howto_operator_local_filesystem_to_gcs]
    upload_file = LocalFilesystemToGCSOperator(
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=DESTINATION_FILE_LOCATION,
        bucket=BUCKET_NAME,
    )
    # [END howto_operator_local_filesystem_to_gcs]
    },
}

IMPORT_INPUT_CONFIG = {"gcs_source": {"input_uris": [GCP_AUTOML_TEXT_CLS_BUCKET]}}

default_args = {"start_date": days_ago(1)}

extract_object_id = CloudAutoMLHook.extract_object_id

# Example DAG for AutoML Natural Language Text Classification
with models.DAG(
    "example_automl_text_cls",
    default_args=default_args,
    schedule_interval=None,  # Override to match your needs
    tags=['example'],
) as example_dag:
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task", dataset=DATASET, location=GCP_AUTOML_LOCATION
    )

    dataset_id = (
        '{{ task_instance.xcom_pull("create_dataset_task", key="dataset_id") }}'
    )

    import_dataset_task = AutoMLImportDataOperator(
        task_id="import_dataset_task",
        dataset_id=dataset_id,
        location=GCP_AUTOML_LOCATION,
"proxy_postgres_tcp", "proxy_postgres_socket", "public_postgres_tcp", "public_postgres_tcp_ssl", "proxy_mysql_tcp", "proxy_mysql_socket", "public_mysql_tcp", "public_mysql_tcp_ssl", "public_mysql_tcp_ssl_no_project_id", ] tasks = [] with models.DAG( dag_id='example_gcp_sql_query', schedule_interval=None, start_date=days_ago(1), tags=['example'], ) as dag: prev_task = None for connection_name in connection_names: task = CloudSQLExecuteQueryOperator( gcp_cloudsql_conn_id=connection_name, task_id="example_gcp_sql_task_" + connection_name, sql=SQL) tasks.append(task) if prev_task: prev_task >> task prev_task = task # [END howto_operator_cloudsql_query_operators]
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
""" Using column name returns spec of the column. """ for column in columns_specs: if column["displayName"] == column_name: return extract_object_id(column) return "" # Example DAG to create dataset, train model_id and deploy it. with models.DAG( "example_create_and_deploy", default_args=default_args, schedule_interval=None, # Override to match your needs user_defined_macros={ "get_target_column_spec": get_target_column_spec, "target": TARGET, "extract_object_id": extract_object_id, }, tags=['example'], ) as create_deploy_dag: # [START howto_operator_automl_create_dataset] create_dataset_task = AutoMLCreateDatasetOperator( task_id="create_dataset_task", dataset=DATASET, location=GCP_AUTOML_LOCATION, project_id=GCP_PROJECT_ID, ) dataset_id = ( "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2019, 7, 25),
    "email": ["*****@*****.**", "*****@*****.**"],
    "email_on_failure": True,
    "email_on_retry": True,
    "depends_on_past": False,
    # If a task fails, retry it once after waiting at least 5 minutes
    "retries": 1,
    "retry_delay": datetime.timedelta(minutes=5),
}

dag_name = "copy_deduplicate"

with models.DAG(dag_name, schedule_interval="0 1 * * *", default_args=default_args) as dag:

    # This single task is responsible for sequentially running copy queries
    # over all the tables in _live datasets into _stable datasets except those
    # that are specifically used in another DAG.
    copy_deduplicate_all = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_all",
        target_project_id="moz-fx-data-shared-prod",
        # Any table listed here under except_tables _must_ have a corresponding
        # copy_deduplicate job in another DAG.
        except_tables=["telemetry_live.main_v4"])

    # Events.
    event_events = bigquery_etl_query(
DF_TEMP = models.Variable.get('df_temp_location')
COMPOSER_BUCKET_NAME = models.Variable.get('COMPOSER_BUCKET_NAME')

# [START dag_predict_serve]
default_dag_args = {
    'start_date': datetime.datetime(2050, 1, 1),
    'schedule_interval': None,
    'provide_context': True,
    'dataflow_default_options': {
        'project': PROJECT,
        'zone': DF_ZONE,
        'tempLocation': DF_TEMP
    }
}

dag = models.DAG('predict_serve', default_args=default_dag_args)
# [END dag_predict_serve]

#
# Runs prediction.
#
job_id = 'clv-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M'))


def do_predict_clv(**kwargs):
    """
    Runs a batch prediction on new data and saves the results as CSV into
    output_path.
    """
    gcs_prediction_intput = 'gs://{}/predictions/to_predict.csv'.format(
        COMPOSER_BUCKET_NAME)
DATASET_NAME = os.environ.get("GCP_BIGQUERY_DATASET_NAME", "test_dataset_operations")
LOCATION_DATASET_NAME = f"{DATASET_NAME}_location"
DATA_SAMPLE_GCS_URL = os.environ.get(
    "GCP_BIGQUERY_DATA_GCS_URL",
    "gs://cloud-samples-data/bigquery/us-states/us-states.csv",
)

DATA_SAMPLE_GCS_URL_PARTS = urlparse(DATA_SAMPLE_GCS_URL)
DATA_SAMPLE_GCS_BUCKET_NAME = DATA_SAMPLE_GCS_URL_PARTS.netloc
DATA_SAMPLE_GCS_OBJECT_NAME = DATA_SAMPLE_GCS_URL_PARTS.path[1:]

with models.DAG(
    "example_bigquery_operations",
    schedule_interval=None,  # Override to match your needs
    start_date=days_ago(1),
    tags=["example"],
) as dag:
    # [START howto_operator_bigquery_create_table]
    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create_table",
        dataset_id=DATASET_NAME,
        table_id="test_table",
        schema_fields=[
            {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
            {"name": "salary",
    'owner': '*****@*****.**',
    'start_date': datetime.datetime(2019, 3, 1),
    'email': ['*****@*****.**', '*****@*****.**', '*****@*****.**',
              '*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'depends_on_past': False,
    # If a task fails, retry it once after waiting at least 10 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'fxa_events'

with models.DAG(
        dag_name,
        # Continue to run DAG once per day
        schedule_interval='0 10 * * *',
        default_args=default_args) as dag:

    fxa_auth_events = bigquery_etl_query(
        task_id='fxa_auth_events',
        destination_table='fxa_auth_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    fxa_auth_bounce_events = bigquery_etl_query(
        task_id='fxa_auth_bounce_events',
        destination_table='fxa_auth_bounce_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
def build_load_dag(
        dag_id,
        output_bucket,
        destination_dataset_project_id,
        copy_dataset_project_id=None,
        copy_dataset_name=None,
        chain='ethereum',
        notification_emails=None,
        load_start_date=datetime(2018, 7, 1),
        schedule_interval='0 0 * * *'):
    # The following datasets must be created in BigQuery:
    # - crypto_{chain}_raw
    # - crypto_{chain}_temp
    # - crypto_{chain}
    # Environment variable OUTPUT_BUCKET must be set and point to the GCS bucket
    # where files exported by export_dag.py are located

    dataset_name = f'crypto_{chain}'
    dataset_name_raw = f'crypto_{chain}_raw'
    dataset_name_temp = f'crypto_{chain}_temp'

    if not destination_dataset_project_id:
        raise ValueError('destination_dataset_project_id is required')

    environment = {
        'DATASET_NAME': dataset_name,
        'DATASET_NAME_RAW': dataset_name_raw,
        'DATASET_NAME_TEMP': dataset_name_temp,
        'DESTINATION_DATASET_PROJECT_ID': destination_dataset_project_id
    }

    def read_bigquery_schema_from_file(filepath):
        result = []
        file_content = read_file(filepath)
        json_content = json.loads(file_content)
        for field in json_content:
            result.append(
                bigquery.SchemaField(
                    name=field.get('name'),
                    field_type=field.get('type', 'STRING'),
                    mode=field.get('mode', 'NULLABLE'),
                    description=field.get('description')))
        return result

    def read_file(filepath):
        with open(filepath) as file_handle:
            content = file_handle.read()
            for key, value in environment.items():
                # each bracket should be doubled to be escaped
                # we need two escaped and one unescaped
                content = content.replace('{{{{{key}}}}}'.format(key=key), value)
            return content

    def submit_bigquery_job(job, configuration):
        try:
            logging.info('Creating a job: ' + json.dumps(configuration.to_api_repr()))
            result = job.result()
            logging.info(result)
            assert job.errors is None or len(job.errors) == 0
            return result
        except Exception:
            logging.info(job.errors)
            raise

    default_dag_args = {
        'depends_on_past': False,
        'start_date': load_start_date,
        'email_on_failure': True,
        'email_on_retry': True,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    # Define a DAG (directed acyclic graph) of tasks.
    dag = models.DAG(
        dag_id,
        catchup=False,
        schedule_interval=schedule_interval,
        default_args=default_dag_args)

    dags_folder = os.environ.get('DAGS_FOLDER', '/home/airflow/gcs/dags')

    def add_load_tasks(task, file_format, allow_quoted_newlines=False):
        wait_sensor = GoogleCloudStorageObjectSensor(
            task_id='wait_latest_{task}'.format(task=task),
            timeout=60 * 60,
            poke_interval=60,
            bucket=output_bucket,
            object='export/{task}/block_date={datestamp}/{task}.{file_format}'.format(
                task=task, datestamp='{{ds}}', file_format=file_format),
            dag=dag)

        def load_task():
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(
                dags_folder, 'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = bigquery.SourceFormat.CSV if file_format == 'csv' \
                else bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines

            export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri, task=task, file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'

        load_operator = PythonOperator(
            task_id='load_{task}'.format(task=task),
            python_callable=load_task,
            execution_timeout=timedelta(minutes=30),
            dag=dag)

        wait_sensor >> load_operator
        return load_operator

    def add_enrich_tasks(task, time_partitioning_field='block_timestamp', dependencies=None):
        def enrich_task():
            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

            schema_path = os.path.join(
                dags_folder, 'resources/stages/enrich/schemas/{task}.json'.format(task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder, 'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path)
            query_job = client.query(sql, location='US', job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = bigquery.CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            all_destination_projects = [(destination_dataset_project_id, dataset_name)]
            if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
                    and copy_dataset_name is not None and len(copy_dataset_name) > 0:
                all_destination_projects.append(
                    (copy_dataset_project_id, copy_dataset_name))

            for dest_project, dest_dataset_name in all_destination_projects:
                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dest_dataset_name, project=dest_project).table(dest_table_name)
                copy_job = client.copy_table(
                    temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)

        enrich_operator = PythonOperator(
            task_id='enrich_{task}'.format(task=task),
            python_callable=enrich_task,
            execution_timeout=timedelta(minutes=60),
            dag=dag)

        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                dependency >> enrich_operator
        return enrich_operator

    def add_verify_tasks(task, dependencies=None):
        # The queries in verify/sqls will fail when the condition is not met
        # Have to use this trick since the Python 2 version of BigQueryCheckOperator doesn't support standard SQL
        # and legacy SQL can't be used to query partitioned tables.
        sql_path = os.path.join(
            dags_folder, 'resources/stages/verify/sqls/{task}.sql'.format(task=task))
        sql = read_file(sql_path)
        verify_task = BigQueryOperator(
            task_id='verify_{task}'.format(task=task),
            bql=sql,
            use_legacy_sql=False,
            dag=dag)
        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                dependency >> verify_task
        return verify_task

    load_blocks_task = add_load_tasks('blocks', 'csv')
    load_transactions_task = add_load_tasks('transactions', 'csv')
    load_receipts_task = add_load_tasks('receipts', 'csv')
    load_logs_task = add_load_tasks('logs', 'json')
    load_contracts_task = add_load_tasks('contracts', 'json')
    load_tokens_task = add_load_tasks('tokens', 'csv', allow_quoted_newlines=True)
    load_token_transfers_task = add_load_tasks('token_transfers', 'csv')
    load_traces_task = add_load_tasks('traces', 'csv')

    enrich_blocks_task = add_enrich_tasks(
        'blocks', time_partitioning_field='timestamp', dependencies=[load_blocks_task])
    enrich_transactions_task = add_enrich_tasks(
        'transactions',
        dependencies=[load_blocks_task, load_transactions_task, load_receipts_task])
    enrich_logs_task = add_enrich_tasks(
        'logs', dependencies=[load_blocks_task, load_logs_task])
    enrich_tokens_task = add_enrich_tasks(
        'tokens', time_partitioning_field=None, dependencies=[load_tokens_task])
    enrich_token_transfers_task = add_enrich_tasks(
        'token_transfers', dependencies=[load_blocks_task, load_token_transfers_task])
    enrich_traces_task = add_enrich_tasks(
        'traces', dependencies=[load_blocks_task, load_traces_task])
    enrich_contracts_task = add_enrich_tasks(
        'contracts', dependencies=[load_contracts_task, enrich_traces_task])

    verify_blocks_count_task = add_verify_tasks('blocks_count', [enrich_blocks_task])
    verify_blocks_have_latest_task = add_verify_tasks('blocks_have_latest', [enrich_blocks_task])
    verify_transactions_count_task = add_verify_tasks(
        'transactions_count', [enrich_blocks_task, enrich_transactions_task])
    verify_transactions_have_latest_task = add_verify_tasks(
        'transactions_have_latest', [enrich_transactions_task])
    verify_logs_have_latest_task = add_verify_tasks('logs_have_latest', [enrich_logs_task])
    verify_token_transfers_have_latest_task = add_verify_tasks(
        'token_transfers_have_latest', [enrich_token_transfers_task])
    verify_traces_blocks_count_task = add_verify_tasks(
        'traces_blocks_count', [enrich_blocks_task, enrich_traces_task])
    verify_traces_transactions_count_task = add_verify_tasks(
        'traces_transactions_count', [enrich_transactions_task, enrich_traces_task])
    verify_traces_contracts_count_task = add_verify_tasks(
        'traces_contracts_count',
        [enrich_transactions_task, enrich_traces_task, enrich_contracts_task])

    if notification_emails and len(notification_emails) > 0:
        send_email_task = EmailOperator(
            task_id='send_email',
            to=[email.strip() for email in notification_emails.split(',')],
            subject='Ethereum ETL Airflow Load DAG Succeeded',
            html_content='Ethereum ETL Airflow Load DAG Succeeded - {}'.format(chain),
            dag=dag)
        verify_blocks_count_task >> send_email_task
        verify_blocks_have_latest_task >> send_email_task
        verify_transactions_count_task >> send_email_task
        verify_transactions_have_latest_task >> send_email_task
        verify_logs_have_latest_task >> send_email_task
        verify_token_transfers_have_latest_task >> send_email_task
        verify_traces_blocks_count_task >> send_email_task
        verify_traces_transactions_count_task >> send_email_task
        verify_traces_contracts_count_task >> send_email_task
        enrich_tokens_task >> send_email_task

    return dag
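# A minimal usage sketch, not part of the original module: a DAG definition file would
# typically call build_load_dag at import time and expose the result at module level so
# the Airflow scheduler discovers it. The bucket, project ID, and environment variable
# names below are placeholders, not values taken from this repository.
ethereum_load_dag = build_load_dag(
    dag_id='ethereum_load_dag',
    output_bucket=os.environ.get('OUTPUT_BUCKET', 'example-export-bucket'),
    destination_dataset_project_id=os.environ.get('DESTINATION_DATASET_PROJECT_ID', 'example-project'),
    chain='ethereum',
    notification_emails=os.environ.get('NOTIFICATION_EMAILS'),
)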
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG that translates text using the Google Cloud Translate service.
"""
from airflow import models
from airflow.operators.bash import BashOperator
from airflow.providers.google.cloud.operators.translate import CloudTranslateTextOperator
from airflow.utils.dates import days_ago

with models.DAG(
    'example_gcp_translate',
    schedule_interval='@once',  # Override to match your needs
    start_date=days_ago(1),
    tags=['example'],
) as dag:
    # [START howto_operator_translate_text]
    translate_text = CloudTranslateTextOperator(
        task_id='translate',
        values=['zażółć gęślą jaźń'],
        target_language='en',
        format_='text',
        source_language=None,
        model='base',
    )
    # [END howto_operator_translate_text]
    # [START howto_operator_translate_access]
    translation_access = BashOperator(
        task_id='access',
# [START howto_operator_cloudsql_db_create_body]
db_create_body = {"instance": INSTANCE_NAME, "name": DB_NAME, "project": GCP_PROJECT_ID}
# [END howto_operator_cloudsql_db_create_body]
# [START howto_operator_cloudsql_db_patch_body]
db_patch_body = {"charset": "utf16", "collation": "utf16_general_ci"}
# [END howto_operator_cloudsql_db_patch_body]

default_args = {'start_date': airflow.utils.dates.days_ago(1)}

with models.DAG(
    'example_gcp_sql',
    default_args=default_args,
    schedule_interval=None  # Override to match your needs
) as dag:
    def next_dep(task, prev):
        prev >> task
        return task

    # ############################################## #
    # ### INSTANCES SET UP ######################### #
    # ############################################## #

    # [START howto_operator_cloudsql_create]
    sql_instance_create_task = CloudSqlInstanceCreateOperator(
        project_id=GCP_PROJECT_ID,
        body=body,
GCS_PYTHON = os.environ.get('GCP_DATAFLOW_PYTHON', 'gs://INVALID BUCKET NAME/wordcount_debugging.py')

GCS_JAR_PARTS = urlparse(GCS_JAR)
GCS_JAR_BUCKET_NAME = GCS_JAR_PARTS.netloc
GCS_JAR_OBJECT_NAME = GCS_JAR_PARTS.path[1:]

default_args = {
    'dataflow_default_options': {
        'tempLocation': GCS_TMP,
        'stagingLocation': GCS_STAGING,
    }
}

with models.DAG(
    "example_gcp_dataflow_native_java",
    schedule_interval='@once',  # Override to match your needs
    start_date=days_ago(1),
    tags=['example'],
) as dag_native_java:
    # [START howto_operator_start_java_job_jar_on_gcs]
    start_java_job = BeamRunJavaPipelineOperator(
        task_id="start-java-job",
        jar=GCS_JAR,
        pipeline_options={
            'output': GCS_OUTPUT,
        },
        job_class='org.apache.beam.examples.WordCount',
        dataflow_config={
            "check_if_running": CheckJobRunning.IgnoreJob,
            "location": 'europe-west3',
            "poll_sleep": 10,
                ],
            },
        ],
        "resources": {
            "regions": [f"{LOCATION}"],
            "virtualMachine": {
                "machineType": "n1-standard-1",
            },
        },
    }
}
# [END howto_configure_multiple_action_pipeline]

with models.DAG(
    "example_gcp_life_sciences",
    default_args=dict(start_date=dates.days_ago(1)),
    schedule_interval=None,
    tags=['example'],
) as dag:
    # [START howto_run_pipeline]
    simple_life_science_action_pipeline = LifeSciencesRunPipelineOperator(
        task_id='simple-action-pipeline',
        body=SIMPLE_ACTION_PIEPELINE,
        project_id=PROJECT_ID,
        location=LOCATION,
    )
    # [END howto_run_pipeline]

    multiple_life_science_action_pipeline = LifeSciencesRunPipelineOperator(
        task_id='multi-action-pipeline',
        body=MULTI_ACTION_PIPELINE,
        project_id=PROJECT_ID,
        location=LOCATION,
    )
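    # Illustrative dependency wiring, assuming the simple pipeline should run before the
    # multi-action one; this ordering is not shown in the excerpt above.
    simple_life_science_action_pipeline >> multiple_life_science_action_pipeline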
    # The name of the environment variable, since deploy_type is `env` rather
    # than `volume`.
    deploy_target='SQL_CONN',
    # Name of the Kubernetes Secret
    secret='airflow-secrets',
    # Key of a secret stored in this Secret object
    key='sql_alchemy_conn')
# [END composer_kubernetespodoperator_secretobject]

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

# If a Pod fails to launch, or has an error occur in the container, Airflow
# will show the task as failed, as well as contain all of the task logs
# required to debug.
with models.DAG(
        dag_id='composer_sample_kubernetes_pod',
        schedule_interval=datetime.timedelta(days=1),
        start_date=YESTERDAY) as dag:
    # Only name, namespace, image, and task_id are required to create a
    # KubernetesPodOperator. In Cloud Composer, currently the operator defaults
    # to using the config file found at `/home/airflow/composer_kube_config` if
    # no `config_file` parameter is specified. By default it will contain the
    # credentials for Cloud Composer's Google Kubernetes Engine cluster that is
    # created upon environment creation.
    # [START composer_kubernetespodoperator_minconfig]
    kubernetes_min_pod = kubernetes_pod_operator.KubernetesPodOperator(
        # The ID specified for the task.
        task_id='pod-ex-minimum',
        # Name of task you want to run, used to generate Pod ID.
        name='pod-ex-minimum',
        # Entrypoint of the container, if not specified the Docker container's
"metrics": ["METRIC_IMPRESSIONS", "METRIC_CLICKS"], "includeInviteData": True, }, "schedule": { "frequency": "ONE_TIME" }, } PARAMS = {"dataRange": "LAST_14_DAYS", "timezoneCode": "America/New_York"} # [END howto_display_video_env_variables] default_args = {"start_date": dates.days_ago(1)} with models.DAG( "example_display_video", default_args=default_args, schedule_interval=None, # Override to match your needs ) as dag: # [START howto_google_display_video_createquery_report_operator] create_report = GoogleDisplayVideo360CreateReportOperator( body=REPORT, task_id="create_report") report_id = "{{ task_instance.xcom_pull('create_report', key='report_id') }}" # [END howto_google_display_video_createquery_report_operator] # [START howto_google_display_video_runquery_report_operator] run_report = GoogleDisplayVideo360RunReportOperator(report_id=report_id, params=PARAMS, task_id="run_report") # [END howto_google_display_video_runquery_report_operator] # [START howto_google_display_video_wait_report_operator]
"/global/instanceTemplates/" + GCE_NEW_TEMPLATE_NAME) UPDATE_POLICY = { "type": "OPPORTUNISTIC", "minimalAction": "RESTART", "maxSurge": { "fixed": 1 }, "minReadySec": 1800 } # [END howto_operator_compute_igm_update_template_args] with models.DAG( 'example_gcp_compute_igm', default_args=default_args, schedule_interval=None, # Override to match your needs tags=['example'], ) as dag: # [START howto_operator_gce_igm_copy_template] gce_instance_template_copy = ComputeEngineCopyInstanceTemplateOperator( project_id=GCP_PROJECT_ID, resource_id=GCE_TEMPLATE_NAME, body_patch=GCE_INSTANCE_TEMPLATE_BODY_UPDATE, task_id='gcp_compute_igm_copy_template_task') # [END howto_operator_gce_igm_copy_template] # Added to check for idempotence # [START howto_operator_gce_igm_copy_template_no_project_id] gce_instance_template_copy2 = ComputeEngineCopyInstanceTemplateOperator( resource_id=GCE_TEMPLATE_NAME, body_patch=GCE_INSTANCE_TEMPLATE_BODY_UPDATE, task_id='gcp_compute_igm_copy_template_task_2')
default_args = {"email": ["*****@*****.**"]} YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1) secret_file = secret.Secret( deploy_type="volume", deploy_target="/tmp/secrets/google", secret="gc-storage-rw-key", key="key.json", ) dag_id = "data_tagger_pipeline" dag_number = dag_id dag = models.DAG( dag_id, schedule_interval=datetime.timedelta(days=1), default_args=default_args, start_date=YESTERDAY, ) with dag: kubernetes_list_bucket_pod = kubernetes_pod_operator.KubernetesPodOperator( task_id="data-tagger", name="data-tagger", cmds=[ "python", "-m", "src.scripts.data_tagger", "cluster", bucket_name, "data/audiotospeech/config/datatagger/config.yaml", ], # namespace='composer-1-10-4-airflow-1-10-6-3b791e93',
# under the License.
"""
Example DAG using GoogleCloudStorageToBigQueryOperator.
"""
from airflow import models
from airflow.operators.bash import BashOperator
from airflow.operators.gcs_to_bq import GCSToBigQueryOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

dag = models.DAG(
    dag_id='example_gcs_to_bq_operator',
    default_args=args,
    schedule_interval=None,
    tags=['example'])

create_test_dataset = BashOperator(
    task_id='create_airflow_test_dataset',
    bash_command='bq mk airflow_test',
    dag=dag)

# [START howto_operator_gcs_to_bq]
load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket='cloud-samples-data',
    source_objects=['bigquery/us-states/us-states.csv'],
    destination_project_dataset_table='airflow_test.gcs_to_bq_table',
    schema_fields=[
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
    'start_date': datetime(2019, 2, 1),
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

notification_emails = os.environ.get('NOTIFICATION_EMAILS')
if notification_emails and len(notification_emails) > 0:
    default_dag_args['email'] = [
        email.strip() for email in notification_emails.split(',')
    ]

dag = models.DAG(
    dag_id='redshift_refresh_aggregates',
    schedule_interval=None,
    concurrency=1,
    max_active_runs=1,
    default_args=default_dag_args)

sql_folder = os.environ.get('REDSHIFT_SQL_FOLDER', "/usr/local/airflow/dags/redshift/sql")
if sql_folder is None:
    raise ValueError("You must set REDSHIFT_SQL_FOLDER environment variable")


def run_sql(**kwargs):
    conn_id = kwargs.get('conn_id')
    sql_file_path = kwargs.get('sql_file_path')
    pg_hook = PostgresHook(conn_id)
    with open(sql_file_path, 'r') as sql_file:
)
from airflow.utils import dates

ACCOUNT_ID = os.environ.get("GA_ACCOUNT_ID", "123456789")

BUCKET = os.environ.get("GMP_ANALYTICS_BUCKET", "test-airflow-analytics-bucket")
BUCKET_FILENAME = "data.csv"
WEB_PROPERTY_ID = os.environ.get("GA_WEB_PROPERTY", "UA-12345678-1")
WEB_PROPERTY_AD_WORDS_LINK_ID = os.environ.get(
    "GA_WEB_PROPERTY_AD_WORDS_LINK_ID", "rQafFTPOQdmkx4U-fxUfhj")
DATA_ID = "kjdDu3_tQa6n8Q1kXFtSmg"

with models.DAG(
    "example_google_analytics",
    schedule_interval='@once',  # Override to match your needs
    start_date=dates.days_ago(1),
) as dag:
    # [START howto_marketing_platform_list_accounts_operator]
    list_account = GoogleAnalyticsListAccountsOperator(task_id="list_account")
    # [END howto_marketing_platform_list_accounts_operator]

    # [START howto_marketing_platform_get_ads_link_operator]
    get_ad_words_link = GoogleAnalyticsGetAdsLinkOperator(
        web_property_ad_words_link_id=WEB_PROPERTY_AD_WORDS_LINK_ID,
        web_property_id=WEB_PROPERTY_ID,
        account_id=ACCOUNT_ID,
        task_id="get_ad_words_link",
    )
    # [END howto_marketing_platform_get_ads_link_operator]