default_dag_args = {
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    # 'retry_delay': datetime.timedelta(minutes=5),
    'start_date': datetime.datetime.today() - datetime.timedelta(days=1)
}

with models.DAG('lastfm-1k-ingest',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    dataflow = dataflow_operator.DataFlowPythonOperator(
        task_id='ingest-users-dataflow',
        py_file='gs://{}/lastfm-dataset-1K/code/ingest-users.py'.format(PROJECT),
        job_name='ingest-users-dataflow',
        py_options=[],
        dataflow_default_options={
            'project': PROJECT,
            'region': 'europe-west1'
        },
        options={},
        poll_sleep=30)

    start = bash_operator.BashOperator(task_id='start',
                                       bash_command='echo "Start"')
    end = bash_operator.BashOperator(task_id='end',
                                     bash_command='echo "End"')

    start >> dataflow >> end
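# The excerpts in this section all target the Airflow 1.x contrib operators that
# ship with Cloud Composer. A minimal sketch of the imports they rely on, assuming
# that module layout (Airflow 2.x moved these operators into provider packages):
import datetime
import os

from airflow import models
from airflow.contrib.operators import dataflow_operator
from airflow.contrib.operators import slack_webhook_operator
from airflow.operators import bash_operator
from airflow.operators import email_operator
from airflow.operators import python_operator
from airflow.utils.trigger_rule import TriggerRule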
# Args required for the Dataflow job.
job_args = {
    'input': INPUT_BUCKET_CSV,
    # TODO: Populate the models.Variable.get() with the variable name for BQ table
    'output': models.Variable.get('bq_output_table'),
    # TODO: Populate the models.Variable.get() with the variable name for input field names
    'fields': models.Variable.get('input_field_names'),
    'load_dt': DS_TAG
}

# Main Dataflow task that will process and load the input delimited file.
# TODO: Specify the type of operator we need to call to invoke DataFlow
dataflow_task = dataflow_operator.DataFlowPythonOperator(
    task_id="process-delimited-and-push",
    py_file=DATAFLOW_FILE,
    options=job_args)

# Here we create two conditional tasks, one of which will be executed
# based on whether the dataflow_task was a success or a failure.
success_move_task = python_operator.PythonOperator(
    task_id='success-move-to-completion',
    python_callable=move_to_completion_bucket,
    # A success_tag is used to move the input file to a
    # success-prefixed folder.
    op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_SUCCESS)

failure_move_task = python_operator.PythonOperator(
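# The PythonOperator tasks above (and in the later excerpts) call a shared helper,
# move_to_completion_bucket, defined elsewhere in the source DAG file. The following
# is only a hypothetical sketch of such a callable, assuming it relocates the
# triggering GCS object into the completion bucket under the given success/failure
# prefix; it is not the original implementation.
from google.cloud import storage


def move_to_completion_bucket(target_bucket, target_prefix, **kwargs):
    """Hypothetical: copies the triggering object and removes the original."""
    client = storage.Client()
    source_bucket = client.bucket(kwargs['dag_run'].conf['bucket'])
    object_name = kwargs['dag_run'].conf['name']
    blob = source_bucket.blob(object_name)
    destination_bucket = client.bucket(target_bucket)
    # Lands as e.g. success/<object_name> or failure/<object_name>.
    source_bucket.copy_blob(blob, destination_bucket,
                            '{}/{}'.format(target_prefix, object_name))
    blob.delete()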
                default_args=DEFAULT_DAG_ARGS) as dag:

    # Args required for the Dataflow job.
    job_args = {
        'input': 'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
        'output': 'compose-test-291802:lake.SALES_DATA',
        'runner': 'DataflowRunner',
        'project': 'compose-test-291802',
        'job_name': 'job-name-001',
        'temp_location': 'gs://sanch-test-bucket12/tmp/',
        'staging_location': 'gs://sanch-test-bucket12/stg',
        'load_dt': DS_TAG
    }

    # Main Dataflow task that will process and load the input delimited file.
    dataflow_task = dataflow_operator.DataFlowPythonOperator(
        task_id="dataflowstoragebq1",
        py_file=DATAFLOW_FILE,
        options=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
    success_move_task = python_operator.PythonOperator(
        task_id='success-move-to-completion',
        python_callable=move_to_completion_bucket,
        # A success_tag is used to move the input file to a
        # success-prefixed folder.
        op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_move_task = python_operator.PythonOperator(
        task_id='failure-move-to-completion',
job_args = {
    'input': 'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
    # Builds the table name from the file name, removing ".csv" from it.
    'output_raw': 'RAW_{{ dag_run.conf["name"][:dag_run.conf["name"].rfind("/")] }}.{{ "_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:]) }}',
    'output_err': 'PRZ_{{ dag_run.conf["name"][:dag_run.conf["name"].rfind("/")] }}.{{ "_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:]) }}_ERR',
    'output_prz': 'PRZ_{{ dag_run.conf["name"][:dag_run.conf["name"].rfind("/")] }}.{{ "_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:]) }}',
    'fields': g_fields,
    'load_dt': '{{ dag_run.conf["bqTimestamp"] }}',
    'op_dict': g_operations_dict
}

# Main Dataflow task
TSK_dataflow_file_ingestion = dataflow_operator.DataFlowPythonOperator(
    task_id="tsk-dataflow-file-ingestion",
    py_file=DATAFLOW_FILE,
    options=job_args)

# Upon Dataflow task success, TSK_move_into_arc_bucket starts.
TSK_move_into_arc_bucket = python_operator.PythonOperator(
    task_id='TSK_move_into_arc_bucket',
    python_callable=DPLF_move_into_arc_bucket,
    op_args=[g_output_bucket],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_SUCCESS)

# Upon Dataflow task failure, TSK_move_into_inv_bucket starts.
TSK_move_into_inv_bucket = python_operator.PythonOperator(
    task_id='TSK_move_into_inv_bucket',
    python_callable=DPLF_move_into_inv_bucket,
    op_args=[g_failed_bucket],
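# The table-name templates above are dense, so here is a plain-Python rendering of
# what they evaluate to for a hypothetical object name (the file name, dataset and
# resulting table names below are illustrative only, not from the original DAG):
name = 'nav/SRC_SYS_customer_data.csv'

dataset = name[:name.rfind('/')]                          # 'nav'
table = '_'.join(name[:name.rfind('.')].split('_')[2:])   # 'customer_data'

output_raw = 'RAW_{}.{}'.format(dataset, table)           # 'RAW_nav.customer_data'
output_err = 'PRZ_{}.{}_ERR'.format(dataset, table)       # 'PRZ_nav.customer_data_ERR'
output_prz = 'PRZ_{}.{}'.format(dataset, table)           # 'PRZ_nav.customer_data'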
def create_dag(env_variables):
    """Creates the Airflow directed acyclic graph.

    Args:
      env_variables: Dictionary of Airflow environment variables.

    Returns:
      driblet_dag: An instance of models.DAG.
    """
    driblet_dag = initialize_dag()

    # Clients setup.
    project_id = env_variables['project_id']
    bq_client = bigquery.Client(project=project_id)
    gcs_client = storage.Client(project=project_id)

    # TASK 1: Convert BigQuery CSV to TFRECORD.
    dag_dir = configuration.get('core', 'dags_folder')
    transformer_py = os.path.join(dag_dir, 'tasks/preprocess', 'transformer.py')
    bq_to_tfrecord = dataflow_operator.DataFlowPythonOperator(
        task_id='bq-to-tfrecord',
        py_file=transformer_py,
        options={
            'project': project_id,
            'predict-data': '{}.{}.{}_{}'.format(
                project_id,
                env_variables['bq_dataset'],
                env_variables['bq_input_table'],
                datetime.datetime.now().strftime('%Y%m%d')),
            'data-source': 'bigquery',
            'transform-dir': 'gs://%s/transformer' % env_variables['bucket_name'],
            'output-dir': 'gs://%s/input' % env_variables['bucket_name'],
            'mode': 'predict'
        },
        dataflow_default_options={'project': project_id},
        dag=driblet_dag)

    # TASK 2: Make prediction from CSV in GCS.
    make_predictions = mlengine_operator.MLEngineBatchPredictionOperator(
        task_id='make-predictions',
        project_id=project_id,
        job_id='driblet_run_{}'.format(
            datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')),
        data_format='TF_RECORD',
        input_paths=['gs://%s/input/predict-*' % env_variables['bucket_name']],
        output_path='gs://%s/output' % env_variables['bucket_name'],
        region=env_variables['region'],
        model_name=env_variables['model_name'],
        version_name=env_variables['model_version'],
        gcp_conn_id='google_cloud_default',
        dag=driblet_dag)

    # TASK 3: Export prediction output from Cloud Storage to BigQuery.
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,  # Sets daily partitioned table.
        expiration_ms=env_variables['dataset_expiration'])
    gcs_to_bigquery = GCStoBQOperator(
        task_id='gcs-to-bigquery',
        bq_client=bq_client,
        gcs_client=gcs_client,
        job_config=job_config,
        dataset_id=env_variables['bq_dataset'],
        table_id=env_variables['bq_output_table'],
        gcs_bucket=env_variables['bucket_name'],
        gcs_location=env_variables['location'],
        exclude_prefix='errors_stats',  # Exclude files starting with this name.
        dir_prefix='output',
        dag=driblet_dag)

    # TASK 4: Delete files in Cloud Storage bucket.
    gcs_delete_blob = GCSDeleteBlobOperator(
        task_id='gcs-delete-blob',
        client=gcs_client,
        gcs_bucket=env_variables['bucket_name'],
        prefixes=['input', 'output'],
        dag=driblet_dag)

    make_predictions.set_upstream(bq_to_tfrecord)
    make_predictions.set_downstream(gcs_to_bigquery)
    gcs_delete_blob.set_upstream(gcs_to_bigquery)

    return driblet_dag
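# Airflow only registers DAG objects that are bound at module scope, so the DAG
# returned by create_dag() needs to be assigned to a global. A minimal sketch,
# assuming the environment settings live in a JSON-valued Airflow Variable
# (the variable name 'driblet_env' is hypothetical):
env_variables = models.Variable.get('driblet_env', deserialize_json=True)
dag = create_dag(env_variables)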
# Setting schedule_interval to None as this DAG is externally triggered by a Cloud Function.
with models.DAG(dag_id='mssql_gcs_dataflow_bigquery_dag_2',
                description='A DAG triggered by an external Cloud Function',
                schedule_interval=None,
                default_args=DEFAULT_DAG_ARGS) as dag:

    # Args required for the Dataflow job.
    job_args = {
        'input': 'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
        'output': models.Variable.get('bq_output_table'),
        'load_dt': DS_TAG
    }

    # Main Dataflow task that will process and load the input delimited file.
    dataflow_task = dataflow_operator.DataFlowPythonOperator(
        task_id="process-json-to-dataflow",
        py_file=DATAFLOW_FILE,
        options=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether dataflow_task was a success or a failure.
    success_move_task = email_operator.EmailOperator(
        task_id='success',
        trigger_rule=TriggerRule.ALL_SUCCESS,
        to=models.Variable.get('email'),
        subject='mssql_gcs_dataflow_bigquery_dag_2 Job Succeeded: start_date {{ ds }}',
        html_content="HTML CONTENT")

    failure_move_task = email_operator.EmailOperator(
        task_id='failure',
        trigger_rule=TriggerRule.ALL_FAILED,
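# The Jinja templates in job_args read the payload supplied when the Cloud
# Function triggers the run. A hypothetical example of that run configuration
# (bucket and object names below are illustrative only):
#
#   dag_run.conf = {
#       'bucket': 'example-landing-bucket',
#       'name': 'exports/sales_orders.json'
#   }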
    return df.values[-1][0], df.values[-1][1]


def tell_slack(context):
    o = slack_webhook_operator.SlackWebhookOperator(
        task_id="tell_slack",
        http_conn_id='slack_default',
        message="Number one page today is %s (%s hits)" % find_number_one())
    return o.execute(context)


with models.DAG(
        'ga_daily_reporter',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    benchmark_tally = dataflow_operator.DataFlowPythonOperator(
        task_id='benchmark_tally',
        py_file='/home/airflow/gcs/dags/pipelines/benchmark_tally.py')

    # Note: several task variables below reuse the names of existing callables
    # (combine_tally, tell_slack, generate_graph); the callables are captured
    # before the names are rebound, so this works but is easy to misread.
    combine_tally = python_operator.PythonOperator(
        task_id='combine_tally',
        python_callable=combine_tally,
        on_success_callback=tell_slack)
    # on_success_callback is a hack to delay generating the Slack message:
    # https://stackoverflow.com/questions/52054427/how-to-integrate-apache-airflow-with-slack

    tell_slack = slack_webhook_operator.SlackWebhookOperator(
        task_id="tell_slack",
        http_conn_id='slack_default',
        message="A new report is out: "
                "https://%s/data/tally_69211100_20190425.csv" % (
                    models.Variable.get(
                        'AIRFLOW_BUCKET',
                        'us-east1-dta-airflow-b3415db4-bucket')))

    generate_graph = python_operator.PythonOperator(
        task_id='generate_graph',
        python_callable=generate_graph)
    'email_on_retry': False
}

dag = DAG(
    'twitter_search',
    default_args=default_args,
    description='Load data from GCS to BQ Serving Layer',
    schedule_interval='@daily',
    dagrun_timeout=timedelta(minutes=30)
)

load_raw_data = dataflow_operator.DataFlowPythonOperator(
    task_id='load_raw_data',
    dag=dag,
    py_file='/home/airflow/gcs/dags/dataflow/twitter-google-dataflow.py',
    # py_file='dataflow/twitter-google-dataflow.py',
    job_name='twitter-google-dataflow-{{ ds }}',
    dataflow_default_options={
        'project': os.environ.get('GCP_PROJECT'),
        'region': 'europe-west1',
        'zone': 'europe-west6-a',
        'runner': 'DataflowRunner'
    },
    options={
        'job_date': '{{ ds }}',
        'twitter_bucket': os.environ.get('TWITTER_BUCKET'),
        'dataflow_bucket': os.environ.get('DATAFLOW_BUCKET')
    }
)

# delete_sl_partition = bigquery_operator.BigQueryOperator(  # TODO: change to bq command line
#     task_id='delete_sl_partition',
#     dag=dag,
#     sql='''DELETE FROM dataops_demo_sl_dev.t_twitter_google WHERE c_created = '{{ ds }}' ''',
#     use_legacy_sql=False
# )
delete_sl_partition = bash_operator.BashOperator(
    task_id='delete_sl_partition',
    dag=dag,
job_args = {
    'input': 'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
    'output': 'seu-projeto-nome-no-google:dataNavigationDataSet.RAW_DATA_NAVIGATION',
    'runner': 'DataflowRunner',
    'project': 'seu-projeto-nome-no-google',
    'job_name': 'job-name-001',
    'temp_location': 'gs://seu-projeto-nome-no-google-bucket-navigation/tmp/',
    'load_dt': DS_TAG
}

# Main Dataflow task that will process and load the input delimited file.
dataflow_task = dataflow_operator.DataFlowPythonOperator(
    task_id="dataflow-b2w-raw-nav-data-001",
    py_file=DATAFLOW_FILE,
    options=job_args)

# Here we create two conditional tasks, one of which will be executed
# based on whether the dataflow_task was a success or a failure.
success_move_task = python_operator.PythonOperator(
    task_id='success-move-to-completion',
    python_callable=move_to_completion_bucket,
    # A success_tag is used to move the input file to a
    # success-prefixed folder.
    op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
    provide_context=True,
    trigger_rule=TriggerRule.ALL_SUCCESS)

failure_move_task = python_operator.PythonOperator(