Example #1
        object='flights/{{ execution_date.format("%Y/%m/%d/%H") }}/_SUCCESS')

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='lab8-work-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run Spark job - Popular airports
    run_dataproc_spark_t1 = dataproc_operator.DataProcSparkOperator(
        task_id='run_dataproc_spark_task1',
        dataproc_spark_jars=[TASK1_JAR],
        job_name='popular_airports',
        main_class='com.github.rmdarth.bdpclab6.PopularAirportsDF',
        cluster_name='lab8-work-cluster-{{ ds_nodash }}',
        arguments=task1_args
    )

    # Run Spark job - Canceled flights
    run_dataproc_spark_t2 = dataproc_operator.DataProcSparkOperator(
        task_id='run_dataproc_spark_task2',
        dataproc_spark_jars=[TASK2_JAR],
        job_name='canceled_flights',
        main_class='com.github.rmdarth.bdpclab6.CanceledFlightsDF',
        cluster_name='lab8-work-cluster-{{ ds_nodash }}',
        arguments=task2_args
    )

    # Delete Cloud Dataproc cluster.
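    # The snippet is truncated here. A minimal sketch (assumption, not the
    # original code) of the delete step and task ordering implied by the
    # comment above, following the pattern of the other examples. project_id
    # is assumed to come from default_dag_args in the truncated header.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='lab8-work-cluster-{{ ds_nodash }}',
        # Delete the cluster even if one of the Spark jobs fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # One possible wiring (the GCS sensor whose tail is shown at the top of
    # the example would run before create_dataproc_cluster).
    create_dataproc_cluster >> run_dataproc_spark_t1 >> run_dataproc_spark_t2
    run_dataproc_spark_t2 >> delete_dataproc_cluster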
Example #2
        task_id="create_dataproc_cluster",
        cluster_name="gcp-data-platform",
        num_workers=0,
        zone="us-west1a",
        master_machine_type="n1-highmem-4",
    )

    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id="events_dataproc",
        cluster_name="gcp-data-platform",
        region=REGION,
        main_class="io.dagster.events.EventPipeline",
        dataproc_spark_jars=[
            "%s/events-assembly-%s.jar" %
            (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
        ],
        arguments=[
            "--gcs-input-bucket",
            INPUT_BUCKET,
            "--gcs-output-bucket",
            OUTPUT_BUCKET,
            "--date",
            "{{ ds }}",
        ],
    )

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT_ID,
        task_id="delete_dataproc_cluster",
        cluster_name="gcp-data-platform",
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    )
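The snippet starts mid-file; a minimal sketch of the imports, placeholder constants, and DAG context it assumes might look like this (every value below is a placeholder, not taken from the original project):

# Sketch of the truncated preamble (assumed; all constants are placeholders).
import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils import trigger_rule

PROJECT_ID = "my-gcp-project"                    # placeholder
REGION = "us-west1"                              # placeholder
DEPLOY_BUCKET_PREFIX = "gs://my-deploy-bucket"   # placeholder
LATEST_JAR_HASH = "0123abcd"                     # placeholder
INPUT_BUCKET = "my-input-bucket"                 # placeholder
OUTPUT_BUCKET = "my-output-bucket"               # placeholder

with models.DAG(
        "gcp_data_platform",
        schedule_interval="@daily",
        start_date=datetime.datetime(2019, 1, 1)) as dag:
    # The three operators shown above sit inside this block, typically wired:
    # create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
    pass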
Example #3
        task_id='create_dataproc_cluster',
        cluster_name='gcp-data-platform',
        num_workers=0,
        zone='us-west1-a',
        master_machine_type='n1-highmem-4',
    )

    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id='events_dataproc',
        cluster_name='gcp-data-platform',
        region=REGION,
        main_class='io.dagster.events.EventPipeline',
        dataproc_spark_jars=[
            '%s/events-assembly-%s.jar' %
            (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
        ],
        arguments=[
            '--gcs-input-bucket',
            INPUT_BUCKET,
            '--gcs-output-bucket',
            OUTPUT_BUCKET,
            '--date',
            '{{ ds }}',
        ],
    )

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT_ID,
        task_id='delete_dataproc_cluster',
        cluster_name='gcp-data-platform',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    )
Example #4
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit',
         schedule_interval='0 17 * * *',
         default_args=default_dag_args) as dag:

    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id=default_dag_args['project_id'],
        task_id='create_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        zone=models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id='run_spark_job',
        # main_jar is commented out, so main_class must already be resolvable
        # on the cluster's default classpath (or be supplied via
        # dataproc_spark_jars).
        #main_jar = MAIN_JAR,
        main_class=MAIN_CLASS,
        cluster_name=CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id=default_dag_args['project_id'],
        task_id='delete_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE
    )

    create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
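Example #4's imports and constants are cut off; a minimal sketch of what the truncated header likely provides (the CLUSTER_NAME and MAIN_CLASS values here are placeholders):

# Sketch of the truncated header (assumed; placeholder values).
import datetime as dt

from airflow import DAG
from airflow import models
from airflow.contrib.operators import dataproc_operator as dpo
from airflow.utils import trigger_rule

CLUSTER_NAME = 'dataproc-spark-submit-cluster'   # placeholder
MAIN_CLASS = 'com.example.spark.Job'             # placeholder

default_dag_args = {
    'start_date': dt.datetime(2019, 1, 1),
    # ... the snippet above shows the tail of this dict
    # ('retries', 'retry_delay', 'project_id').
}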
Example #5
        object=full_path,
        timeout=3600)

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')


    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id='run_dataproc_spark',
        main_jar=SPRARK_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=spark_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    gcs_file_sensor >> create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
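The `object=full_path, timeout=3600)` lines at the top of this example are the tail of a GCS file sensor whose definition is truncated; a minimal sketch of what it likely looks like, assuming the contrib GoogleCloudStorageObjectSensor (bucket and path are placeholders):

# Sketch of the truncated sensor (assumed; placeholder bucket/path). In the
# original file this sits inside the DAG's `with` block, before the tasks above.
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor

full_path = 'input/{{ ds_nodash }}/_SUCCESS'   # placeholder object path

gcs_file_sensor = GoogleCloudStorageObjectSensor(
    task_id='gcs_file_sensor',
    bucket='my-input-bucket',                  # placeholder
    object=full_path,
    timeout=3600)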
Example #6
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='macys-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Spark wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id='run_dataproc_spark',
        main_jar=WORDCOUNT_JAR,
        cluster_name='macys-cluster-{{ ds_nodash }}',
        arguments=input_file)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='macys-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster

Example #7
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        storage_bucket='listery-staging',
        num_workers=2,
        master_disk_size=20,
        worker_disk_size=20,
        num_preemptible_workers=1,
        zone='us-east1-c',
        master_machine_type='n1-standard-4',
        worker_machine_type='n1-standard-4')

    run_integration_job = dataproc_operator.DataProcSparkOperator(
        task_id='run_integration_job',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "integration"])

    offer_integration = dataproc_operator.DataProcSparkOperator(
        task_id='offer_integration',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "offerIntegration"])

    es_refresh = dataproc_operator.DataProcSparkOperator(
        task_id='es_refresh',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "refreshEs"])