import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils.dates import days_ago

project_id = models.Variable.get("project_id")

default_args = {
    # Tell Airflow to start one day ago, so that the DAG runs as soon as it is uploaded
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        # The id you will see in the Airflow DAGs page
        "dataproc_workflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG; override to match your needs
        schedule_interval=datetime.timedelta(days=1),
) as dag:

    start_template_job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        # The task id of your job
        task_id="dataproc_workflow_dag",
        # The template id of your workflow
        template_id="sparkpi",
        project_id=project_id,
        # The region for the template
        region="us-central1",
    )

# [END composer_dataproc_workflow_instantiate_operator_tutorial_airflow_1]
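
The dataproc_operator module above comes from the legacy Airflow 1.10 contrib package. On Airflow 2 with the apache-airflow-providers-google package installed, the same task can be written with the provider operator instead; a minimal sketch, reusing the same template id, project_id Variable, and region from the example above:

from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateWorkflowTemplateOperator,
)

# Same task as above, using the maintained provider operator instead of contrib
start_template_job = DataprocInstantiateWorkflowTemplateOperator(
    task_id="dataproc_workflow_dag",
    template_id="sparkpi",
    project_id=project_id,
    region="us-central1",
)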
Example #2
import datetime

import airflow
from airflow.contrib.operators import dataproc_operator
from airflow.contrib.operators import gcs_to_bq

# Start the DAG yesterday so that it begins running as soon as it is deployed
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_args = {
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': yesterday,
}

gcs_output_bucket = 'myworkspace'
bq_dataset_name = 'mydataset'
bq_table_id = bq_dataset_name + '.my_table_name'

with airflow.DAG('composer_sample_dag',
                 catchup=False,
                 default_args=default_args,
                 schedule_interval=datetime.timedelta(days=1)) as dag:

    # Instantiate the Dataproc workflow template that runs the Spark job
    Spark_Job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        task_id='spark_job',
        template_id='mytemplate',
        project_id='my-project-id',
        dag=dag)

    # Load the job's Parquet output from Cloud Storage into BigQuery
    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BQ',
        bucket=gcs_output_bucket,
        source_objects=['output/*.parquet'],
        destination_project_dataset_table=bq_table_id,
        source_format='PARQUET',
        write_disposition='WRITE_TRUNCATE',
        autodetect=True)

    # Run the Spark job first, then load its output into BigQuery
    Spark_Job >> GCS_to_BQ
# [END composer_quickstart]
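
Spark_Job >> GCS_to_BQ declares the task ordering with Airflow's bitshift syntax. For reference, the same dependency can be expressed with the explicit BaseOperator methods:

# Equivalent to Spark_Job >> GCS_to_BQ:
Spark_Job.set_downstream(GCS_to_BQ)
# ...or, stated from the downstream task's side:
# GCS_to_BQ.set_upstream(Spark_Job)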
Example #3
import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils.dates import days_ago

project_id = models.Variable.get("project_id")

default_args = {
    # Tell Airflow to start one day ago, so that the DAG runs as soon as it is uploaded
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        # The id you will see in the Airflow DAGs page
        "dataproc_workflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG; override to match your needs
        schedule_interval=datetime.timedelta(days=1),
) as dag:

    start_template_job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        # The task id of your job
        task_id="dataproc_workflow_dag",
        # The template id of your workflow
        template_id="sparkpi",
        project_id=project_id,
        # The region where the Dataproc workflow template is stored
        region="us-central1",
    )

# [END composer_dataproc_workflow_instantiate_operator_tutorial_airflow_1]
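
Like Example #1, this DAG reads the project_id Airflow Variable at parse time and fails to import if the Variable is unset. One defensive option, not part of the original sample and shown here only as a sketch (the fallback string is a hypothetical placeholder), is to supply default_var:

from airflow import models

# Hypothetical fallback so the DAG file still parses when the Variable is unset
project_id = models.Variable.get("project_id", default_var="your-project-id")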
Example #4
import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils.dates import days_ago

project_id = models.Variable.get("project_id")

default_args = {
    # Tell Airflow to start one day ago, so that the DAG runs as soon as it is uploaded
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        # The id you will see in the Airflow DAGs page
        "dataproc_workflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG; override to match your needs
        schedule_interval=datetime.timedelta(days=1),
) as dag:

    start_template_job = dataproc_operator.DataprocWorkflowTemplateInstantiateOperator(
        # The task id of your job
        task_id="airports-in-usa",
        # The template id of your workflow
        template_id="airports-main",
        project_id=project_id,
        # The region for the template
        region="us-east1",
    )
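
Before uploading any of these files to the Composer environment's dags folder, it can help to confirm they import cleanly. A minimal local check, assuming the file sits on the DagBag's search path:

from airflow.models import DagBag

dag_bag = DagBag()  # Parses every .py file in the configured dags folder
assert not dag_bag.import_errors, dag_bag.import_errors

# The DAG defined above should be present and contain its single task
dag = dag_bag.get_dag("dataproc_workflow_dag")
assert dag is not None and len(dag.tasks) == 1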