def test_update_cluster(self):
    """Run a scale operation and verify the exact ``clusters.patch`` request.

    The hook class named by ``HOOK`` is replaced with a mock whose
    ``get_conn`` returns ``self.mock_conn`` and whose ``wait`` returns
    ``None``. After ``execute(None)`` the test asserts that the patch call
    on ``self.mock_clusters`` happened exactly once with the expected
    region, project, cluster name, update mask and body, and that the hook
    waited exactly once on ``self.operation``.
    """
    # presumably self.mock_conn / self.mock_clusters / self.operation are
    # wired together by the test case's setUp -- verify there.
    expected_mask = ("config.worker_config.num_instances,"
                     "config.secondary_worker_config.num_instances")
    expected_body = {
        'config': {
            'workerConfig': {
                'numInstances': NUM_WORKERS
            },
            'secondaryWorkerConfig': {
                'numInstances': NUM_PREEMPTIBLE_WORKERS
            }
        }
    }
    operator_kwargs = dict(
        task_id=TASK_ID,
        region=GCP_REGION,
        project_id=GCP_PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        num_workers=NUM_WORKERS,
        num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
        dag=self.dag,
    )

    with patch(HOOK) as hook_cls:
        # Calling the patched class yields its shared return_value, i.e. the
        # same object the operator gets when it instantiates the hook.
        hook_instance = hook_cls()
        hook_instance.get_conn.return_value = self.mock_conn
        hook_instance.wait.return_value = None

        scale_op = DataprocClusterScaleOperator(**operator_kwargs)
        scale_op.execute(None)

        # requestId is not pinned by the test, so any value is accepted.
        self.mock_clusters.patch.assert_called_once_with(
            region=GCP_REGION,
            projectId=GCP_PROJECT_ID,
            clusterName=CLUSTER_NAME,
            requestId=mock.ANY,
            updateMask=expected_mask,
            body=expected_body,
        )
        hook_instance.wait.assert_called_once_with(self.operation)
def test_cluster_name_log_no_sub(self):
    """Verify a literal cluster name is logged unchanged when scaling.

    ``execute(None)`` is expected to raise ``TypeError`` in this setup; the
    test only checks that the most recent ``log.info`` call was
    ``'Scaling cluster: %s'`` with ``CLUSTER_NAME``.
    """
    # NOTE(review): the hook is patched in the hooks module here, whereas the
    # templated-name test patches it in the operator module -- confirm which
    # path execute() actually resolves.
    hook_target = 'airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook'

    with patch(hook_target) as hook_cls:
        hook_cls.return_value.get_conn = self.mock_conn
        scale_op = DataprocClusterScaleOperator(
            task_id=TASK_ID,
            cluster_name=CLUSTER_NAME,
            project_id=GCP_PROJECT_ID,
            num_workers=NUM_WORKERS,
            num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
            dag=self.dag,
        )

        with patch.object(scale_op.log, 'info') as info_spy:
            # The failure is expected; only the log call made before it
            # is of interest.
            self.assertRaises(TypeError, scale_op.execute, None)
            info_spy.assert_called_with('Scaling cluster: %s', CLUSTER_NAME)
def test_cluster_name_log_no_sub(self):
    """Verify a plain (non-templated) cluster name reaches the log as-is.

    The operator is executed with a mocked hook class; ``execute(None)`` is
    expected to raise ``TypeError``, after which the last ``log.info`` call
    must be ``'Scaling cluster: %s'`` with ``CLUSTER_NAME``.
    """
    operator_kwargs = dict(
        task_id=TASK_ID,
        cluster_name=CLUSTER_NAME,
        project_id=PROJECT_ID,
        num_workers=NUM_WORKERS,
        num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
        dag=self.dag,
    )

    # NOTE(review): patch target is the hooks module rather than the module
    # where the operator looks the hook up -- confirm this is intended.
    with patch('airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook') as hook_cls:
        hook_cls.return_value.get_conn = self.mock_conn
        scale_op = DataprocClusterScaleOperator(**operator_kwargs)

        with patch.object(scale_op.log, 'info') as info_spy:
            with self.assertRaises(TypeError):
                scale_op.execute(None)
            # Checked after the expected failure: the log line must already
            # have been emitted by then.
            info_spy.assert_called_with('Scaling cluster: %s', CLUSTER_NAME)
def test_cluster_name_log_sub(self):
    """Verify a templated cluster name is logged in its rendered form.

    The ``cluster_name`` template is rendered manually against a context
    containing ``ts_nodash`` and written back onto the operator. After
    ``execute(None)`` (expected to raise ``TypeError``) the last
    ``log.info`` call must carry ``'smoke-cluster-testnodash'``.
    """
    hook_target = 'airflow.contrib.operators.dataproc_operator.DataProcHook'

    with patch(hook_target) as hook_cls:
        hook_cls.return_value.get_conn = self.mock_conn
        scale_op = DataprocClusterScaleOperator(
            task_id=TASK_ID,
            cluster_name='smoke-cluster-{{ ts_nodash }}',
            project_id=GCP_PROJECT_ID,
            num_workers=NUM_WORKERS,
            num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
            dag=self.dag,
        )

        with patch.object(scale_op.log, 'info') as info_spy:
            template_context = {'ts_nodash': 'testnodash'}

            # Render the field by hand and store the result on the operator
            # before executing it.
            rendered_name = scale_op.render_template(
                'cluster_name', scale_op.cluster_name, template_context)
            scale_op.cluster_name = rendered_name

            self.assertRaises(TypeError, scale_op.execute, None)
            info_spy.assert_called_with('Scaling cluster: %s',
                                        u'smoke-cluster-testnodash')
def test_cluster_name_log_sub(self):
    """Verify the rendered value of a templated cluster name is logged.

    ``'smoke-cluster-{{ ts_nodash }}'`` is rendered with
    ``ts_nodash='testnodash'`` and assigned back to the operator; the test
    then expects ``execute(None)`` to raise ``TypeError`` and the last
    ``log.info`` call to be ``'Scaling cluster: %s'`` with
    ``'smoke-cluster-testnodash'``.
    """
    # Naming the target avoids a backslash continuation on the with line.
    hook_target = 'airflow.contrib.operators.dataproc_operator.DataProcHook'
    operator_kwargs = dict(
        task_id=TASK_ID,
        cluster_name='smoke-cluster-{{ ts_nodash }}',
        project_id=PROJECT_ID,
        num_workers=NUM_WORKERS,
        num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
        dag=self.dag,
    )

    with patch(hook_target) as hook_cls:
        hook_cls.return_value.get_conn = self.mock_conn
        scale_op = DataprocClusterScaleOperator(**operator_kwargs)

        with patch.object(scale_op.log, 'info') as info_spy:
            template_context = {'ts_nodash': 'testnodash'}
            scale_op.cluster_name = scale_op.render_template(
                'cluster_name', scale_op.cluster_name, template_context)

            with self.assertRaises(TypeError):
                scale_op.execute(None)
            # The log call happens before the expected failure, so it is
            # still the most recent info() call here.
            info_spy.assert_called_with('Scaling cluster: %s',
                                        u'smoke-cluster-testnodash')
region=my_region, query=create_external_src_table ) create_external_dst_table = DataProcHiveOperator( task_id='create_external_dst_table', job_name='create_external_dst_table_job_name', cluster_name=my_cluster_name, region=my_region, query=create_external_dst_table ) dataproc_scale_out = DataprocClusterScaleOperator( task_id='dataproc_scale_out', cluster_name=my_cluster_name, region=my_region, num_workers=2, num_preemptible_workers=num_preemptible_vms, graceful_decommission_timeout='1h', dag=dag) ##notice the insert overwrite was concatenated with set_dynamic_partitions check variable: insert_overwrite_with_transformation_query insert_overwrite_with_transformation_query = DataProcSparkSqlOperator( task_id='insert_overwrite_with_transformation_query', job_name='insert_overwrite_with_transformation_query_job_name', cluster_name=my_cluster_name, region=my_region, query=insert_overwrite_with_transformation_query #query=evya_query # for dev purposes, dummy query )
"example_gcp_dataproc", default_args={"start_date": airflow.utils.dates.days_ago(1)}, schedule_interval=None, ) as dag: create_cluster = DataprocClusterCreateOperator( task_id="create_cluster", cluster_name=CLUSTER_NAME, project_id=PROJECT_ID, num_workers=2, region=REGION, ) scale_cluster = DataprocClusterScaleOperator( task_id="scale_cluster", num_workers=3, cluster_name=CLUSTER_NAME, project_id=PROJECT_ID, region=REGION, ) pig_task = DataProcPigOperator( task_id="pig_task", query="define sin HiveUDF('sin');", region=REGION, cluster_name=CLUSTER_NAME, ) spark_sql_task = DataProcSparkSqlOperator( task_id="spark_sql_task", query="SHOW DATABASES;", region=REGION,