default_args = {
    # Tell airflow to start one day ago, so that it runs as soon as you upload it
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
    # The id you will see in the DAG airflow page
    "dataproc_workflow_dag",
    default_args=default_args,
    # The interval with which to schedule the DAG
    schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:
    start_template_job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        # The task id of your job
        task_id="dataproc_workflow_dag",
        # The template id of your workflow
        template_id="sparkpi",
        project_id=project_id,
        # The region for the template
        region="us-central1",
    )
# [END composer_dataproc_workflow_instantiate_operator_tutorial_airflow_1]
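# The snippet above assumes a preamble earlier in the same DAG file that
# defines datetime, models, dataproc_operator, days_ago, and project_id.
# A minimal sketch of that preamble, using Airflow 1.x contrib import paths
# (an assumption, based on the fuller sample later in this document):
import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils.dates import days_ago

project_id = models.Variable.get("project_id")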
# Assumed imports for this quickstart snippet (Airflow 1.x contrib operators).
import datetime

import airflow
from airflow.contrib.operators import dataproc_operator
from airflow.contrib.operators import gcs_to_bq

# Start one day ago so that the DAG runs as soon as it is uploaded.
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_args = {
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': yesterday,
}

gcs_output_bucket = 'myworkspace'
bq_dataset_name = 'mydataset'
bq_table_id = bq_dataset_name + '.my_table_name'

with airflow.DAG('composer_sample_dag',
                 catchup=False,
                 default_args=default_args,
                 schedule_interval=datetime.timedelta(days=1)) as dag:

    # Instantiate an existing Dataproc workflow template to run the Spark job.
    Spark_Job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        task_id='spark_job',
        template_id='mytemplate',
        project_id='my-project-id')

    # Load the Parquet output written by the Spark job from GCS into BigQuery.
    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BQ',
        bucket=gcs_output_bucket,
        source_objects=['output/*.parquet'],
        destination_project_dataset_table=bq_table_id,
        source_format='PARQUET',
        write_disposition='WRITE_TRUNCATE',
        autodetect=True)

    # Run the BigQuery load only after the Spark job has finished.
    Spark_Job >> GCS_to_BQ
# [END composer_quickstart]
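# The GCS-to-BigQuery load above creates the destination table if needed, but
# the target dataset ('mydataset') must already exist. A minimal sketch of
# creating it with the google-cloud-bigquery client -- an illustrative
# assumption, not part of the sample:
from google.cloud import bigquery

bq_client = bigquery.Client(project='my-project-id')
# exists_ok=True makes this a no-op if the dataset already exists.
bq_client.create_dataset('mydataset', exists_ok=True)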
# Airflow 1.x contrib import paths assumed for this sample.
import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils.dates import days_ago

project_id = models.Variable.get("project_id")

default_args = {
    # Tell airflow to start one day ago, so that it runs as soon as you upload it
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
    # The id you will see in the DAG airflow page
    "dataproc_workflow_dag",
    default_args=default_args,
    # The interval with which to schedule the DAG
    schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:
    start_template_job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        # The task id of your job
        task_id="airports-in-usa",
        # The template id of your workflow
        template_id="airports-main",
        project_id=project_id,
        # The region for the template
        region="us-east1",
    )
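# models.Variable.get("project_id") above raises a KeyError at DAG parse time
# if the Airflow Variable has not been set in the environment. A minimal
# sketch of a fallback using the default_var parameter (the placeholder value
# shown is an assumption):
project_id = models.Variable.get("project_id", default_var="your-project-id")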