def create_cluster_with_invalid_internal_ip_only_setup():
    # Given
    create_cluster = DataprocClusterCreateOperator(
        task_id=TASK_ID,
        cluster_name=CLUSTER_NAME,
        project_id=GCP_PROJECT_ID,
        num_workers=NUM_WORKERS,
        zone=GCE_ZONE,
        dag=self.dag,
        internal_ip_only=True)

    # When
    create_cluster._build_cluster_data()
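The excerpt stops before the assertion; in the upstream Airflow test the helper above is nested inside a test method and the check looks roughly like the sketch below (an assumption: the exception type comes from airflow.exceptions and the operator rejects internal_ip_only without a subnetwork_uri).

# Then -- sketch of the missing check; assumes the helper is defined inside a
# test method and that `from airflow.exceptions import AirflowException` is in scope.
with self.assertRaises(AirflowException):
    create_cluster_with_invalid_internal_ip_only_setup()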
 def test_build_cluster_data_with_auto_zone(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT_ID,
         num_workers=NUM_WORKERS,
         master_machine_type=MASTER_MACHINE_TYPE,
         worker_machine_type=WORKER_MACHINE_TYPE
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertNotIn('zoneUri', cluster_data['config']['gceClusterConfig'])
     self.assertEqual(cluster_data['config']['masterConfig']['machineTypeUri'], MASTER_MACHINE_TYPE)
     self.assertEqual(cluster_data['config']['workerConfig']['machineTypeUri'], WORKER_MACHINE_TYPE)
 def test_build_cluster_data_with_autoDeleteTtl(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=GCE_ZONE,
         dag=self.dag,
         auto_delete_ttl=AUTO_DELETE_TTL,
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertEqual(cluster_data['config']['lifecycleConfig']['autoDeleteTtl'],
                      "654s")
 def test_build_cluster_data_with_autoDeleteTime(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=ZONE,
         dag=self.dag,
         auto_delete_time=AUTO_DELETE_TIME,
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertEqual(cluster_data['config']['lifecycleConfig']['autoDeleteTime'],
                      "2017-06-07T00:00:00.000000Z")
 def test_build_single_node_cluster(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT_ID,
         num_workers=0,
         num_preemptible_workers=0,
         zone=GCE_ZONE,
         dag=self.dag
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertEqual(
         cluster_data['config']['softwareConfig']['properties']
         ['dataproc:dataproc.allow.zero.workers'], "true")
Example No. 6
    def test_cluster_name_log_no_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') \
            as mock_hook, patch('logging.info') as l:
            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                project_id=PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=ZONE,
                dag=self.dag
            )

            with self.assertRaises(TypeError) as _:
                dataproc_task.execute(None)
            l.assert_called_with(('Creating cluster: ' + CLUSTER_NAME))
 def test_cluster_name_log_no_sub(self):
     with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
         mock_hook.return_value.get_conn = self.mock_conn
         dataproc_task = DataprocClusterCreateOperator(
             task_id=TASK_ID,
             cluster_name=CLUSTER_NAME,
             project_id=GCP_PROJECT_ID,
             num_workers=NUM_WORKERS,
             zone=GCE_ZONE,
             dag=self.dag
         )
         with patch.object(dataproc_task.log, 'info') as mock_info:
             with self.assertRaises(TypeError):
                 dataproc_task.execute(None)
             mock_info.assert_called_with('Creating cluster: %s', CLUSTER_NAME)
 def test_build_cluster_data_with_autoDeleteTime_and_autoDeleteTtl(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=GCE_ZONE,
         dag=self.dag,
         auto_delete_time=AUTO_DELETE_TIME,
         auto_delete_ttl=AUTO_DELETE_TTL,
     )
     cluster_data = dataproc_operator._build_cluster_data()
     if 'autoDeleteTtl' in cluster_data['config']['lifecycleConfig']:
          self.fail("If 'auto_delete_time' and 'auto_delete_ttl' are set, " +
                    "only `auto_delete_time` is used")
     self.assertEqual(cluster_data['config']['lifecycleConfig']['autoDeleteTime'],
                      "2017-06-07T00:00:00.000000Z")
    def test_cluster_name_log_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                cluster_name='smoke-cluster-{{ ts_nodash }}',
                project_id=PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=ZONE,
                dag=self.dag
            )
            with patch.object(dataproc_task.log, 'info') as mock_info:
                context = { 'ts_nodash' : 'testnodash'}

                rendered = dataproc_task.render_template('cluster_name', getattr(dataproc_task,'cluster_name'), context)
                setattr(dataproc_task, 'cluster_name', rendered)
                with self.assertRaises(TypeError) as _:
                    dataproc_task.execute(None)
                mock_info.assert_called_with('Creating cluster: %s', u'smoke-cluster-testnodash')
    def test_init_with_custom_image(self):
        dataproc_operator = DataprocClusterCreateOperator(
            task_id=TASK_ID,
            cluster_name=CLUSTER_NAME,
            project_id=GCP_PROJECT_ID,
            num_workers=NUM_WORKERS,
            zone=GCE_ZONE,
            dag=self.dag,
            custom_image=CUSTOM_IMAGE
        )

        cluster_data = dataproc_operator._build_cluster_data()
        expected_custom_image_url = \
            'https://www.googleapis.com/compute/beta/projects/' \
            '{}/global/images/{}'.format(GCP_PROJECT_ID, CUSTOM_IMAGE)
        self.assertEqual(cluster_data['config']['masterConfig']['imageUri'],
                         expected_custom_image_url)
        self.assertEqual(cluster_data['config']['workerConfig']['imageUri'],
                         expected_custom_image_url)
 def setUp(self):
     self.dataproc = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=ZONE,
         storage_bucket=STORAGE_BUCKET,
         image_version=IMAGE_VERSION,
         master_machine_type=MASTER_MACHINE_TYPE,
         master_disk_size=MASTER_DISK_SIZE,
         worker_machine_type=WORKER_MACHINE_TYPE,
         worker_disk_size=WORKER_DISK_SIZE,
         num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS)
Example No. 12
 def setUp(self):
     # instantiate two different test cases with different labels.
     self.labels = [LABEL1, LABEL2]
     self.dataproc_operators = []
     self.mock_conn = Mock()
     for labels in self.labels:
         self.dataproc_operators.append(
             DataprocClusterCreateOperator(
                 task_id=TASK_ID,
                 cluster_name=CLUSTER_NAME,
                 project_id=GCP_PROJECT_ID,
                 num_workers=NUM_WORKERS,
                 zone=GCE_ZONE,
                 autoscaling_policy=SCALING_POLICY,
                 network_uri=NETWORK_URI,
                 subnetwork_uri=SUBNETWORK_URI,
                 internal_ip_only=INTERNAL_IP_ONLY,
                 tags=TAGS,
                 storage_bucket=STORAGE_BUCKET,
                 image_version=IMAGE_VERSION,
                 master_machine_type=MASTER_MACHINE_TYPE,
                 master_disk_type=MASTER_DISK_TYPE,
                 master_disk_size=MASTER_DISK_SIZE,
                 worker_machine_type=WORKER_MACHINE_TYPE,
                 worker_disk_type=WORKER_DISK_TYPE,
                 worker_disk_size=WORKER_DISK_SIZE,
                 num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                 labels=deepcopy(labels),
                 service_account_scopes=SERVICE_ACCOUNT_SCOPES,
                 idle_delete_ttl=IDLE_DELETE_TTL,
                 auto_delete_time=AUTO_DELETE_TIME,
                 auto_delete_ttl=AUTO_DELETE_TTL
             )
         )
     self.dag = DAG(
         'test_dag',
         default_args={
             'owner': 'airflow',
             'start_date': DEFAULT_DATE,
             'end_date': DEFAULT_DATE,
         },
         schedule_interval='@daily')
 def setUp(self):
     # instantiate two different test cases with different labels.
     self.labels = [LABEL1, LABEL2]
     self.dataproc_operators = []
     for labels in self.labels:
         self.dataproc_operators.append(
             DataprocClusterCreateOperator(
                 task_id=TASK_ID,
                 cluster_name=CLUSTER_NAME,
                 project_id=PROJECT_ID,
                 num_workers=NUM_WORKERS,
                 zone=ZONE,
                 storage_bucket=STORAGE_BUCKET,
                 image_version=IMAGE_VERSION,
                 master_machine_type=MASTER_MACHINE_TYPE,
                 master_disk_size=MASTER_DISK_SIZE,
                 worker_machine_type=WORKER_MACHINE_TYPE,
                 worker_disk_size=WORKER_DISK_SIZE,
                 num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                 labels=deepcopy(labels)))
class DataprocClusterCreateOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataproc = DataprocClusterCreateOperator(
            task_id=TASK_ID,
            cluster_name=CLUSTER_NAME,
            project_id=PROJECT_ID,
            num_workers=NUM_WORKERS,
            zone=ZONE,
            storage_bucket=STORAGE_BUCKET,
            image_version=IMAGE_VERSION,
            master_machine_type=MASTER_MACHINE_TYPE,
            master_disk_size=MASTER_DISK_SIZE,
            worker_machine_type=WORKER_MACHINE_TYPE,
            worker_disk_size=WORKER_DISK_SIZE,
            num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS)

    def test_init(self):
        """Test DataprocClusterCreateOperator instance is properly initialized."""
        self.assertEqual(self.dataproc.cluster_name, CLUSTER_NAME)
        self.assertEqual(self.dataproc.project_id, PROJECT_ID)
        self.assertEqual(self.dataproc.num_workers, NUM_WORKERS)
        self.assertEqual(self.dataproc.zone, ZONE)
        self.assertEqual(self.dataproc.storage_bucket, STORAGE_BUCKET)
        self.assertEqual(self.dataproc.image_version, IMAGE_VERSION)
        self.assertEqual(self.dataproc.master_machine_type, MASTER_MACHINE_TYPE)
        self.assertEqual(self.dataproc.master_disk_size, MASTER_DISK_SIZE)
        self.assertEqual(self.dataproc.worker_machine_type, WORKER_MACHINE_TYPE)
        self.assertEqual(self.dataproc.worker_disk_size, WORKER_DISK_SIZE)
        self.assertEqual(self.dataproc.num_preemptible_workers, NUM_PREEMPTIBLE_WORKERS)

    def test_build_cluster_data(self):
        cluster_data = self.dataproc._build_cluster_data()
        self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
        self.assertEqual(cluster_data['projectId'], PROJECT_ID)
        self.assertEqual(cluster_data['config']['softwareConfig'], {'imageVersion': IMAGE_VERSION})
        self.assertEqual(cluster_data['config']['configBucket'], STORAGE_BUCKET)
        self.assertEqual(cluster_data['config']['workerConfig']['numInstances'], NUM_WORKERS)
        self.assertEqual(cluster_data['config']['secondaryWorkerConfig']['numInstances'],
                         NUM_PREEMPTIBLE_WORKERS)
Example No. 15
    def _internal(begin_task, end_task):
        task_id = 'cluster-{}'.format(job_name)
        cluster_name = '{}-cluster'.format(job_name)

        cluster = DataprocClusterCreateOperator(
            task_id=task_id,
            project_id=project_id,
            zone=zone,
            cluster_name=cluster_name,
            num_workers=2,
            num_preemptible_workers=2,
            storage_bucket=storage_bucket,
            master_machine_type='n1-standard-2',
            master_disk_size=200,
            worker_machine_type='n1-standard-4',
            worker_disk_size=200,
            init_actions_uris=[
                'gs://dataproc-initialization-actions/connectors/connectors.sh'
            ],
            metadata={
                'gcs-connector-version': '1.9.16',
                'bigquery-connector-version': '0.13.16'
            },
            subnetwork_uri=subnetwork_uri,
            internal_ip_only=True,
            region=region,
            idle_delete_ttl=600,
            **default_args)

        job = DataProcPySparkOperator(task_id=job_name,
                                      main=main_file,
                                      arguments=arguments,
                                      pyfiles=extra_files,
                                      job_name=job_name,
                                      cluster_name=cluster_name,
                                      region=region,
                                      **default_args)

        begin_task >> cluster >> job >> end_task
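The factory that defines job_name, project_id, and the other variables closed over by _internal is not part of this excerpt; presumably it returns _internal so a DAG file can splice the cluster/job pair between two boundary tasks, roughly as in this sketch (the DummyOperator boundary tasks, their task_ids, and the dag object are assumptions):

from airflow.operators.dummy_operator import DummyOperator

# Hypothetical boundary tasks; `dag` is assumed to be defined in the DAG file.
begin = DummyOperator(task_id='begin', dag=dag)
end = DummyOperator(task_id='end', dag=dag)

# Wires begin >> create-cluster >> pyspark-job >> end, as set up inside _internal.
_internal(begin, end)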
     'dfp_data_transfer_unified_impressions_from_hadoop5',
     schedule_interval='@once', 
     #schedule_interval='@once', 
     default_args=default_dag_args) as dag:
  
  # insert overwrite is not working via sparkSQL, and the file is not working in Hive
  # GoogleCloudStorageDeleteOperator will only be supported in v.1.10
  
  
  create_dataproc_cluster = DataprocClusterCreateOperator(
      task_id='create_dataproc_cluster',
      cluster_name=my_cluster_name,
      region=my_region,
      num_workers=2,
      storage_bucket=my_bucket,
      master_machine_type=my_instance,
      master_disk_size=my_disk_size,
      worker_machine_type=my_instance,
      worker_disk_size=my_disk_size,
      num_preemptible_workers=0,  # use scale out/in operator
      zone=my_zone,
      idle_delete_ttl=my_idle_delete_ttl,
      dag=dag)

  drop_if_exists_src_table = DataProcHiveOperator(
      task_id='drop_if_exists_src_table',
      job_name='drop_if_exists_src_table_job_name',
      cluster_name=my_cluster_name,
      region=my_region,
      query=drop_if_exists_src_table
  )
  
    'schedule_interval': "30 2 * * *"
}

with DAG('sqoop-incremental-dag', default_args=DEFAULT_DAG_ARGS) as dag:

    create_cluster = DataprocClusterCreateOperator(
        task_id='create_sqoop_cluster',
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-2',
        init_actions_uris=[
            "gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh"
        ],
        num_workers=2,
        region='asia-southeast2',
        zone='asia-southeast2-a',
        service_account_scopes=[
            "https://www.googleapis.com/auth/sqlservice.admin"
        ],
        properties={
            "hive:hive.metastore.warehouse.dir": BUCKET + "/hive-warehouse"
        },
        metadata={
            "additional-cloud-sql-instances": instance_name + "=tcp:3307",
            "enable-cloud-sql-hive-metastore": "false"
        },
        image_version="1.2")

    sqoop_inc_import = BashOperator(
        task_id='sqoop_incremental_import',
        bash_command=
        "bash /home/airflow/gcs/plugins/sqoop-incremental-imports.sh ephemeral-spark-cluster-{{ds_nodash}}",
Example No. 18
             'email': '*****@*****.**',
             'email_on_failure': True,
             'email_on_retry': False
         }) as dag:

    push_cluster_name = PythonOperator(dag=dag,
                                       task_id="push-cluster-name",
                                       provide_context=True,
                                       python_callable=push_cluster_name)

    # The task of creating a cluster.
    dataproc_create_cluster = DataprocClusterCreateOperator(
        task_id='dataproc-create-cluster',
        project_id='silicon-parity-282607',
        region='us-central1',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}',
        num_workers=2)
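The push_cluster_name callable wired into the PythonOperator above is not shown in this excerpt; a minimal sketch of such an XCom-pushing callable (hypothetical; the cluster-name format is an assumption) could look like:

def push_cluster_name(**context):
    # Push a run-scoped cluster name so downstream tasks can template it via
    # ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name").
    cluster_name = 'loan-cluster-{}'.format(context['ds_nodash'])
    context['ti'].xcom_push(key='cluster_name', value=cluster_name)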

    # The task of running the Spark job.
    dataproc_spark_process = DataProcSparkOperator(
        task_id='dataproc-test',
        dataproc_spark_jars=[
            'gs://lendingclub12/LendingClub-assembly-0.1.jar'
        ],
        main_class='p2p_data_analysis.spark.LoanDataAnalyzer',
        job_name='loan',
        region='us-central1',
        cluster_name=
Example No. 19
pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="airflow-training-postgres",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training-knab-jochem",
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    dag=dag,
)


dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-ea393e48abe0a85089b6b551da",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    auto_delete_ttl=5 * 60,  # Autodelete after 5 minutes
)


df_to_bq = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-ea393e48abe0a85089b6b551da",
        "region": "europe-west1",
    },
    py_file="gs://airflow-training-knab-jochem/dataflow_job.py",
    dag=dag,
)
Example No. 20
          default_args=args,
          on_failure_callback=_on_failure_callback,
          description="Own stuff",
          schedule_interval="0 0 * * *") as dag:





    exchange_to_gcs = HttpToGcsOperator(
        gcs_bucket='land_data_training_jjac_airflow',
        gcs_path='exchange-rates/exchange-rates-{{ds}}.json',
        endpoint='/history?start_at={{ds}}&end_at={{tomorrow_ds}}&symbols=EUR&base=GBP',
        task_id="get_data")

    start_dataproc = DataprocClusterCreateOperator(project_id='airflowbolcomdec-7601d68caa710',
                                                   cluster_name='test-dataproc-jjac-{{ds}}',
                                                   num_workers=4,
                                                   region='europe-west1',
                                                   task_id='start_dataproc')
    proc_dataproc = DataProcPySparkOperator(main=path.join(path.dirname(__file__)) + '/spark/build_statistics.py',
                                            project_id='airflowbolcomdec-7601d68caa710',
                                            cluster_name='test-dataproc-jjac-{{ds}}',
                                            region='europe-west1',
                                            arguments=['inp_prop', 'inp_curren', 'target_path', 'tar_curr', 'tar_date'],
                                            task_id="proc_dataproc")
    delete_dataproc = DataprocClusterDeleteOperator(project_id='airflowbolcomdec-7601d68caa710',
                                                    cluster_name='test-dataproc-jjac-{{ds}}',
                                                    region='europe-west1',
                                                    task_id="delete_dataproc", trigger_rule=TriggerRule.ALL_DONE)

    exchange_to_gcs >> start_dataproc >> proc_dataproc >> delete_dataproc
Example No. 21
)

pgsq_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="postgres_training",
    sql=("SELECT * FROM land_registry_price_paid_uk "
         "WHERE transfer_date = '{{ ds }}'"),
    bucket="airflow_training",
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    dag=dag,
)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-05b583b94256b6965bb8c8119a",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

dataflow_job = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-05b583b94256b6965bb8c8119a",
        "region": "europe-west1",
    },
    py_file="gs://airflow_training/other/dataflow_job.py",
    dag=dag,
)

for currency in {"EUR", "USD"}:
    # Alternatively, this could be set to '@daily' to run the job once a day.
    # more options at https://airflow.apache.org/scheduler.html#dag-runs
}

# Create Directed Acyclic Graph for Airflow
with DAG('ephemeral_dataproc_spark_dag',
         default_args=DEFAULT_DAG_ARGS,
         schedule_interval=None) as dag:  # Here we are using dag as context.
    # Create the Cloud Dataproc cluster.
    # Note: this operator will be flagged a success if the cluster by this name
    # already exists.
    create_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
        # in YYYYMMDD format.
        # See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        image_version='1.5-debian10',
        num_workers=2,
        storage_bucket=Variable.get('dataproc_bucket'),
        zone=Variable.get('gce_zone'))

    # Submit the PySpark job.
    submit_pyspark = DataProcPySparkOperator(
        task_id='run_dataproc_pyspark',
        main='gs://' + Variable.get('gcs_bucket') +
        '/spark-jobs/spark_avg_speed.py',
        # Obviously needs to match the name of the cluster created in the
        # prior operator.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # Let's template our arguments for the pyspark job from the POST
        # payload.
args = {
    "owner": "sacha_roggeveen",
    "schedule_interval": "@daily",
    "start_date": airflow.utils.dates.days_ago(1),
}
dag = DAG(dag_id="daggerd", default_args=args, description="clustertjerunnen")

t_start = BashOperator(task_id="print_execution_date",
                       bash_command="date",
                       dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-may2829-257c0046",
    num_workers=4,
    zone="europe-west4-a",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id="compute_aggregates",
    main="gs://europe-west1-training-airfl-48bde282-bucket/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=[
        "gs://buckster/daily_load_{{ ds }}",
        "gs://buckster/bucketie",
        "gs://buckster/results",
    ],
    dag=dag)
Example No. 24
def reprocess_parquet(parent_dag_name,
                      default_args,
                      reprocess,
                      gcp_conn_id,
                      gcs_buckets,
                      objects_prefix,
                      date_submission_col,
                      dataset,
                      dataset_version,
                      gs_dataset_location=None,
                      dataproc_zone='us-central1-a',
                      dag_name='reprocess_parquet',
                      num_preemptible_workers=10):

    """ Reprocess Parquet datasets to conform with BigQuery Parquet loader.

    This function should be invoked as part of `load_to_bigquery`.

    https://github.com/mozilla-services/spark-parquet-to-bigquery/blob/master/src/main/scala/com/mozilla/dataops/spark/TransformParquet.scala ## noqa

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param dict gcs_buckets:               source and dest gcp buckets for reprocess
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str objects_prefix:             objects location
    :param str date_submission_col:        dataset date submission column
    :param str dataproc_zone:              GCP zone to launch dataproc clusters
    :param str dag_name:                   name of dag
    :param int num_preemptible_workers:    number of dataproc cluster workers to provision
    :param bool reprocess:                 enable dataset reprocessing. defaults to False
    :param str gs_dataset_location:        override source location, defaults to None

    :return airflow.models.DAG
    """

    JAR = [
        'gs://moz-fx-data-derived-datasets-parquet-tmp/jars/spark-parquet-to-bigquery-assembly-1.0.jar' # noqa
    ]

    if gs_dataset_location:
        _gs_dataset_location = gs_dataset_location
    else:
        _gs_dataset_location = 'gs://{}/{}'.format(gcs_buckets['transfer'],
                                                   objects_prefix)

    cluster_name = '{}-{}'.format(dataset.replace('_', '-'),
                                  dataset_version) + '-{{ ds_nodash }}'

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    spark_args = [
        '--files', _gs_dataset_location,
        '--submission-date-col', date_submission_col,
        '--gcp-project-id', connection.project_id,
        '--gcs-bucket', 'gs://{}'.format(gcs_buckets['load']),
    ]

    _dag_name = '%s.%s' % (parent_dag_name, dag_name)

    with models.DAG(
            _dag_name,
            default_args=default_args) as dag:

        if reprocess:
            create_dataproc_cluster = DataprocClusterCreateOperator(
                task_id='create_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                num_workers=2,
                image_version='1.3',
                storage_bucket=gcs_buckets['transfer'],
                zone=dataproc_zone,
                master_machine_type='n1-standard-8',
                worker_machine_type='n1-standard-8',
                num_preemptible_workers=num_preemptible_workers,
                metadata={
                    'gcs-connector-version': '1.9.6',
                    'bigquery-connector-version': '0.13.6'
                    })

            run_dataproc_spark = DataProcSparkOperator(
                task_id='run_dataproc_spark',
                cluster_name=cluster_name,
                dataproc_spark_jars=JAR,
                main_class='com.mozilla.dataops.spark.TransformParquet',
                arguments=spark_args,
                gcp_conn_id=gcp_conn_id)

            delete_dataproc_cluster = DataprocClusterDeleteOperator(
                task_id='delete_dataproc_cluster',
                cluster_name=cluster_name,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

            create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster # noqa

        else:
            DummyOperator(task_id='no_reprocess')

        return dag
Example No. 25
    #https://api.exchangeratesapi.io/history?start_at=2018-01-01&end_at=2018-01-02&symbols=EUR&base=GBP
    http_to_gcs = HttpToGcsOperator(
        task_id="get_currency_" + currency,
        method="GET",
        endpoint=
        f"/history?start_at={{{{ ds }}}}&end_at={{{{ tomorrow_ds }}}}&base=GBP&symbols={currency}",
        http_conn_id="currency-http",
        gcs_conn_id="google_cloud_storage_default",
        gcs_path=f"usecase/currency/{{{{ ds }}}}-{currency}.json",
        gcs_bucket=f"{bucket_name}",
    )

    dataproc_create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc",
        cluster_name="analyse-pricing-{{ ds }}",
        project_id=project_id,
        num_workers=2,
        zone="europe-west4-a",
    )

    compute_aggregates = DataProcPySparkOperator(
        task_id="compute_aggregates",
        main=f"gs://{analytics_bucket_name}/build_statistics.py",
        cluster_name="analyse-pricing-{{ ds }}",
        arguments=[
            f"gs://{bucket_name}/usecase/land_registry_price_paid_uk/*/*.json",
            f"gs://{bucket_name}/usecase/currency/{{{{ ds }}}}-{currency}.json",
            f"gs://{bucket_name}/usecase/results/{{{{ ds }}}}/", currency,
            "{{ tomorrow_ds }}"
        ],
    )
Example No. 26
    project_id="test",
    zone="us-central1-a",
    master_machine_type="n1-standard-4",
    worker_machine_type="n1-standard-4",
    num_workers=2,
    storage_bucket="test",
    init_actions_uris=[path],
    metadata={
        'PIP_PACKAGES': 'pyyaml requests pandas openpyxl'
    },
).make()

create_cluster_operator = DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    cluster_name="test",
    project_id="test",
    region="us-central1",
    cluster_config=CLUSTER_CONFIG,
)
# [END how_to_cloud_dataproc_create_cluster_generate_cluster_config]
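The head of the generator call above is cut off in this excerpt; in the Airflow provider example the dict is built with the ClusterGenerator helper and its .make() method, along these lines (a sketch; the import path and the placeholder init-action path are assumptions inferred from the visible arguments):

from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator

path = "gs://my-bucket/pip-install.sh"  # placeholder for the init action referenced above

CLUSTER_CONFIG = ClusterGenerator(
    project_id="test",
    zone="us-central1-a",
    master_machine_type="n1-standard-4",
    worker_machine_type="n1-standard-4",
    num_workers=2,
    storage_bucket="test",
    init_actions_uris=[path],
    metadata={'PIP_PACKAGES': 'pyyaml requests pandas openpyxl'},
).make()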

# Update options
# [START how_to_cloud_dataproc_updatemask_cluster_operator]
CLUSTER_UPDATE = {
    "config": {
        "worker_config": {
            "num_instances": 3
        },
        "secondary_worker_config": {
            "num_instances": 3
        }
Example No. 27
    task_id="pg_2_gcs",
    postgres_conn_id="my_db_connection",
    sql=
    "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflowbolcom_ghermann_dummybucket",
    filename="mypgdata_{{ ds }}",
    dag=dag)

zone = "europe-west4-a"

dataproc_cluster_name = "my-dp-cluster-{{ ds }}"

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="my_create_dp_cluster",
    cluster_name=dataproc_cluster_name,
    project_id=project_id,
    num_workers=2,
    zone=zone,
    dag=dag,
)

compute_aggregates = DataProcPySparkOperator(
    task_id="my_compute_aggregates",
    main=
    'gs://europe-west1-training-airfl-4cdc0c96-bucket/other/build_statistics_simple.py',
    cluster_name=dataproc_cluster_name,
    arguments=["{{ ds }}"],
    project_id=project_id,
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
#         bucket=bucket,
#         object=filepath,
#         filename="gs://europe-west1-training-airfl-46f2603e-bucket/dags/build_statistics.py"
#     )

# upload_build_statistics = PythonOperator(
#     task_id="upload_build_statistics",
#     python_callable=upload_file(bucket=Variable.get('gs_bucket'),
#                                 filepath="pyspark/build_statistics.py"),
#     provide_context=True,
#     dag=dag, )

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-fc205e26bebb44fa",
    num_workers=2,
    zone="europe-west1-c",
    dag=dag,
)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    # TODO: create operator to upload localfile "build_statistics.py"
    main=
    "gs://europe-west1-training-airfl-46f2603e-bucket/dags/build_statistics.py",
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://" + Variable.get('gs_bucket') +
        "/land_registry_price/{{ ds }}/*.json",
        "gs://" + Variable.get('gs_bucket') + "/currency/{{ ds }}/*.json",
        "gs://" + Variable.get('gs_bucket') + "/average_prices/{{ ds }}/"
Example No. 29
                                                 target_currency="EUR")

    psql_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id="read_postgres",
        postgres_conn_id="postgres_training",
        sql=
        "select * from land_registry_price_paid_uk where transfer_date = '{{ ds }}'::date",
        bucket="airflow-training-simple-dag",
        filename="training-price-paid-uk/{{ ds }}/land_registry.json")

    cluster_name = "cluster-{{ ds }}"
    gcs_project_id = "airflowbolcom-544f36a42f5c0d9d"

    create_cluster = DataprocClusterCreateOperator(task_id="create_cluster",
                                                   cluster_name=cluster_name,
                                                   project_id=gcs_project_id,
                                                   num_workers=2,
                                                   zone="europe-west4-a")

    cloud_analytics = DataProcPySparkOperator(
        task_id="analyze_data",
        main=
        "gs://europe-west1-training-airfl-b3ce8eaa-bucket/other/spark_statistics.py",
        cluster_name=cluster_name,
        arguments=["{{ ds }}"])

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_cluster",
        cluster_name=cluster_name,
        project_id=gcs_project_id,
        trigger_rule=TriggerRule.ALL_DONE)
Example No. 30
 schedule_interval = '@daily',
 max_active_runs = 1,
 concurrency = 1,
 default_args = default_args
 ) as dag:
 # operator definitions and dependencies: 1. push cluster name through XCom ->
 # 2. create cluster -> 3. run credit analysis Spark job -> 4. delete cluster
 push_cluster_name_op = PythonOperator(
   task_id = "push_cluster_name",
   python_callable = push_cluster_name,
   dag = dag
 )
 create_dataproc_cluster = DataprocClusterCreateOperator(
   task_id = 'create_dataproc_cluster',
   project_id = project_id,
   region = 'us-west1',
   master_machine_type = 'n1-standard-2',
   worker_machine_type = 'n1-standard-2',
   num_workers = 2,
   cluster_name = '{{ ti.xcom_pull(key = "cluster_name", task_ids = "push_cluster_name") }}' #get the cluster name from xcom in template
 )
 run_collection_analysis_job = DataProcSparkOperator(
   task_id = 'start_collection_analysis_spark_job',
   main_class = 'com.makoto.spark.LoanAnalyze',
   dataproc_spark_jars = "gs://creditclub/CreditClub-assembly-0.1.jar",
   arguments = [     
     "input_load_stats_csv_path",
     "input_rejection_stats_csv_path",
     "output_path"    
   ],
   job_name = 'creditanalysis',
   region = 'us-west1',
Example No. 31
# unique tasks
push_unique_cluster_name = PythonOperator(
    task_id='generate_unique_cluster_name',
    provide_context=True,
    python_callable=push_unique_cluster_name,
    dag=dag)

# replicate tasks (dataproc_list)

create_cluster_1 = DataprocClusterCreateOperator(
    task_id='create_cluster_1',
    cluster_name=
    '{{ ti.xcom_pull(key="unique_cluster_name", task_ids="generate_unique_cluster_name") }}'
    + '1',
    project_id=Variable.get('project_id', default_var=None),
    region='us-west1',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    num_workers=2,
    execution_timeout=timedelta(minutes=20),
    dag=dag)

delete_cluster_1 = DataprocClusterDeleteOperator(
    task_id='delete_cluster_1',
    cluster_name=
    '{{ ti.xcom_pull(key="unique_cluster_name", task_ids="generate_unique_cluster_name") }}'
    + '1',
    region='us-west1',
    project_id=Variable.get('project_id', default_var=None),
    execution_timeout=timedelta(minutes=20),
    dag=dag)
        postgres_conn_id='postgres_gdd',
        google_cloud_storage_conn_id='google_cloud_storage_default',
    )

    # Fetch exchange rate (average) from previous day until current day, and store
    # the result in GCS
    exchange_rates_to_gcs = HttpToGcsOperator(
        task_id='exchange_rates_to_gcs',
        base_url='https://api.exchangeratesapi.io',
        endpoint='history',
        data={
            'start_at': '{{ yesterday_ds }}',
            'end_at': '{{ ds }}',
            'symbols': 'EUR',
            'base': 'GBP',
        },
        gcs_bucket="gdd_airflow_npage_exchange_rates",
        gcs_path='exchange_rates-{{ ds }}.json',
    )

    # Create the Cloud Dataproc cluster
    create_dataproc_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='ephemeral-real-estate-{{ ds_nodash }}',
        num_workers=2,
        zone=Variable.get('gce_zone'),
    )

    [land_registry_prices_to_gcs, exchange_rates_to_gcs
     ] >> create_dataproc_cluster
        response = http.run(self.endpoint)
        self.log.info(response.text)

        with NamedTemporaryFile() as tmp_file:
            tmp_file.write(response.content)
            tmp_file.flush()

            hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id)
            hook.upload(bucket=self.bucket, object=self.gcs_path, filename=tmp_file.name)

PROJECT_ID = 'airflowbolcom-9362d2a84f6f553b'

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="pricing-analysis-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,)


for target_currency in ['EUR', 'USD']:
    HttpToGcsOperator(
        task_id='get_currency_' + str(target_currency),
        # when there are multiple options (E.g. in a loop), make task_id parameterized
        gcs_conn_id='postgres_conn',
        gcs_path="currency/{{ ds }}/" + target_currency + ".json",
        http_conn_id='http_new',
        bucket='marloes_bucket',
        endpoint="/convert-currency?date={{ ds }}&from=GBP&to=" + str(target_currency),
        dag=dag,
    DataprocClusterDeleteOperator)

default_args = {"start_date": airflow.utils.dates.days_ago(1)}

CLUSTER_NAME = os.environ.get('GCP_DATAPROC_CLUSTER_NAME', 'example-project')
PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'an-id')
REGION = os.environ.get('GCP_LOCATION', 'europe-west1')

with models.DAG(
        "example_gcp_dataproc_pig_operator",
        default_args=default_args,
        schedule_interval=None,
) as dag:
    create_task = DataprocClusterCreateOperator(task_id="create_task",
                                                cluster_name=CLUSTER_NAME,
                                                project_id=PROJECT_ID,
                                                region=REGION,
                                                num_workers=2)

    pig_task = DataProcPigOperator(task_id="pig_task",
                                   query="define sin HiveUDF('sin');",
                                   region=REGION,
                                   cluster_name=CLUSTER_NAME)

    delete_task = DataprocClusterDeleteOperator(task_id="delete_task",
                                                project_id=PROJECT_ID,
                                                cluster_name=CLUSTER_NAME,
                                                region=REGION)

    create_task >> pig_task >> delete_task
Example No. 35
with DAG(
    'bigquery_data_analytics',
    schedule_interval='0 20 * * *',
    catchup=False,
    default_args=default_arguments
) as dag:

    dag.doc_md = __doc__

    create_cluster = DataprocClusterCreateOperator(
        task_id='create_cluster',
        project_id='fsp-airflow',
        cluster_name='spark-cluster-{{ ds_nodash }}',
        num_workers=2,
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1',
        image_version='1.3.89-debian10',
        storage_bucket='fsp-logistics-spark-bucket',
        region='europe-central2'
    )

    create_cluster.doc_md = """## Create Dataproc cluster
    This task creates a Dataproc cluster in your project.
    """

    weekday_or_weekend = BranchPythonOperator(
        task_id='weekday_or_weekend',
        python_callable=assess_day,
        op_kwargs={'execution_date': '{{ ds }}'}
    )
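The assess_day callable behind the BranchPythonOperator is not included in this excerpt; a minimal sketch of a weekday/weekend branch (hypothetical; the downstream task_ids are placeholders) could be:

from datetime import datetime

def assess_day(execution_date, **kwargs):
    # Branch on the templated execution date: Saturday/Sunday -> weekend path.
    # 'weekday_analytics' / 'weekend_analytics' are placeholder task ids.
    day_of_week = datetime.strptime(execution_date, '%Y-%m-%d').weekday()
    return 'weekend_analytics' if day_of_week >= 5 else 'weekday_analytics'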
Example No. 36
    postgres_conn_id="my_database_connection",
    sql=
    "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket='airflow_training_bucket',
    filename='land_registry_price_paid_uk/{{ ds }}/result.json',
    dag=dag)

my_task = PythonOperator(task_id="task_name",
                         python_callable=print_exec_date,
                         provide_context=True,
                         dag=dag)

create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-20165e4959a78c1d',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

comp_aggregate = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main=
    'gs://europe-west1-training-airfl-159310f1-bucket/other/build_statistics_simple.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)

del_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
Example No. 37
# Create Directed Acyclic Graph for Airflow
SPARK_DAG = DAG('Reporting_scheduler_DAG',
                default_args=default_args,
                schedule_interval=timedelta(days=1))

logger.debug('Starting task to create cluster on GCP')

#Start DataProc Cluster
start_cluster = DataprocClusterCreateOperator(
    task_id='start_cluster',
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    num_workers=2,
    worker_machine_type='n1-standard-1',
    properties={
        'spark:spark.executor.cores': '1',
        'spark:spark.executor.memory': '1g',
        # The maximum number of bytes to pack into a single partition when reading files. 256MB
        'spark:spark.files.maxPartitionBytes': '268435456'
    },
    zone=Variable.get('gcp_zone'),
    dag=SPARK_DAG)

logger.debug('Submitting spark job on cluster')
#Submit Spark Job
submit_pyspark = DataProcPySparkOperator(
    task_id='run_dataproc_pyspark_job',
    main=PYSPARK_JOB,
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    arguments=[CONFIG_FILE_ARG],
    dag=SPARK_DAG)
Example No. 38
    'depends_on_past': False,
    "start_date": datetime.utcnow(),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "project_id": PROJECT_ID,
    "scheduled_interval": "30 2 * * *"
}

with DAG("flights_delay_etl", default_args=DEFAULT_DAG_ARGS) as dag:

    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        worker_machine_type="n1-standard-2",
        num_workers=2,
        region="asia-east1",
        zone="asia-east1-a")

    submit_pyspark = DataProcPySparkOperator(
        task_id="run_pyspark_etl",
        main=PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-east1")

    bq_load_delays_by_distance = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_avg_delays_by_distance",
        bucket='enr1qu319-data-engineer-1',
        source_objects=[
            "flights_data_output/" + current_date + "_distance_category/part-*"
Example No. 39
 create_dataproc_cluster = DataprocClusterCreateOperator(
     task_id='create_dataproc_cluster',
     cluster_name="vf-polimi-demo",
     project_id=PROJECT,
     num_workers=2,
     service_account="dataproc-service-account@" + PROJECT +
     ".iam.gserviceaccount.com",
     master_machine_type="n1-highmem-4",
     worker_machine_type="n1-highmem-4",
     worker_disk_size=50,
     master_disk_size=50,
     image_version="1.4-debian9",
     tags=['default-allow-internal', 'default-allow-ssh'],
     region="europe-west1",
     subnetwork_uri="projects/" + PROJECT +
     "/regions/europe-west1/subnetworks/default",
     properties={
         'core:fs.gs.implicit.dir.repair.enable': 'false',
         'core:fs.gs.status.parallel.enable': 'true',
         'core:mapreduce.fileoutputcommitter.marksuccessfuljobs': 'false',
         'core:spark.pyspark.python': 'python3',
         'core:spark.pyspark.driver.python': 'python3'
     },
     metadata={
         'enable-oslogin': '******',
         'PIP_PACKAGES': 'google-cloud-pubsub'
     },
     optional_components=['ANACONDA', 'JUPYTER', 'ZEPPELIN'],
     enable_optional_components=True,
     enable_http_port_access=True,
     zone="europe-west1-b",
     storage_bucket="vf-polimi-batch-data",
     idle_delete_ttl=3601,
     internal_ip_only=False,
     init_actions_uris=[
         'gs://goog-dataproc-initialization-actions-europe-west1/python/pip-install.sh'
     ])
Example No. 40
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
    s3_output_bucket="telemetry-parquet",
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param Optional[str] destination_table:       Output table name, defaults to table,
                                                  will have r'_v[0-9]+$' replaced with
                                                  r'/v[0-9]+'
    :param List[str] arguments:                   Additional pyspark arguments
    :param bool use_storage_api:                  Whether to read from the BigQuery
                                                  Storage API or an AVRO export
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        gcs_to_s3 = DataProcHadoopOperatorWithAws(
            task_id="gcs_to_s3",
            main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-distcp.jar",
            arguments=[
                "-update",
                "-delete",
                "gs://{}/{}".format(gcs_output_bucket, export_prefix),
                "s3a://{}/{}".format(s3_output_bucket, export_prefix),
            ],
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            aws_conn_id=aws_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                gcp_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> gcs_to_s3
        gcs_to_s3 >> delete_dataproc_cluster

        return dag
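Since export_to_parquet returns a DAG, it is meant to be mounted from a parent DAG; a sketch of that wiring (assumed, not shown in the excerpt; the table, task names, enclosing dag, and default_args are placeholders) would be:

from airflow.operators.subdag_operator import SubDagOperator

export_my_table = SubDagOperator(
    task_id='export_my_table',
    # dag_name must match the SubDagOperator task_id so the sub-DAG id becomes
    # '<parent>.export_my_table'.
    subdag=export_to_parquet(
        table='telemetry_derived.my_table_v1',
        parent_dag_name=dag.dag_id,
        dag_name='export_my_table',
        default_args=default_args,
    ),
    dag=dag,
)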
Example No. 41
    "project_id": "bigdata-etl-20201027",
    "scheduled_interval": "30 2 * * *"  # every day at 2:30 am utc
}

with DAG("sqoop_import_full_table", default_args=DEFAULT_DAG_ARGS) as dag:
    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        worker_machine_type="n1-standard-2",
        init_actions_uris=[
            'gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh'
        ],
        num_workers=2,
        region="us-central1",
        zone="us-central1-a",
        service_account_scopes=[
            'https://www.googleapis.com/auth/sqlservice.admin'
        ],
        properties={
            'hive:hive.metastore.warehouse.dir': BUCKET + '/hive-warehouse'
        },
        metadata={
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances': INSTANCE_NAME
        },
        image_version='1.5')

    submit_sqoop = BashOperator(
        task_id="sqoop_full_table_import",
        bash_command=
        'bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}'
Example No. 42
    def test_create_cluster(self):
        # Setup service.projects().regions().clusters().create()
        #              .execute()

        # pylint:disable=attribute-defined-outside-init
        self.operation = {'name': 'operation', 'done': True}
        self.mock_execute = Mock()
        self.mock_execute.execute.return_value = self.operation
        self.mock_clusters = Mock()
        self.mock_clusters.create.return_value = self.mock_execute
        self.mock_regions = Mock()
        self.mock_regions.clusters.return_value = self.mock_clusters
        self.mock_projects = Mock()
        self.mock_projects.regions.return_value = self.mock_regions
        self.mock_conn = Mock()
        self.mock_conn.projects.return_value = self.mock_projects
        # pylint:enable=attribute-defined-outside-init

        with patch(HOOK) as mock_hook:
            hook = mock_hook()
            hook.get_conn.return_value = self.mock_conn
            hook.wait.return_value = None

            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                region=GCP_REGION,
                cluster_name=CLUSTER_NAME,
                project_id=GCP_PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=GCE_ZONE,
                dag=self.dag
            )
            dataproc_task.execute(None)

            project_uri = 'https://www.googleapis.com/compute/v1/projects/test-project-id'
            machine_type_uri = project_uri + '/zones/us-central1-a/machineTypes/n1-standard-4'
            zone_uri = project_uri + '/zones/us-central1-a'

            self.mock_clusters.create.assert_called_once_with(
                region=GCP_REGION,
                projectId=GCP_PROJECT_ID,
                requestId=mock.ANY,
                body={
                    'projectId': 'test-project-id',
                    'clusterName': 'test-cluster-name',
                    'config': {
                        'gceClusterConfig':
                            {'zoneUri': zone_uri},
                        'masterConfig': {
                            'numInstances': 1,
                            'machineTypeUri': machine_type_uri,
                            'diskConfig': {'bootDiskType': 'pd-standard', 'bootDiskSizeGb': 1024}},
                        'workerConfig': {
                            'numInstances': 123,
                            'machineTypeUri': machine_type_uri,
                            'diskConfig': {'bootDiskType': 'pd-standard', 'bootDiskSizeGb': 1024}},
                        'secondaryWorkerConfig': {},
                        'softwareConfig': {},
                        'lifecycleConfig': {},
                        'encryptionConfig': {},
                        'autoscalingConfig': {},
                    },
                    'labels': {'airflow-version': mock.ANY}})
            hook.wait.assert_called_once_with(self.operation)