def test_deprecation_warning(self):
    with pytest.warns(DeprecationWarning) as warnings:
        op = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_name="cluster_name",
            num_workers=2,
            zone="zone",
        )
    assert_warning("Passing cluster parameters by keywords", warnings)

    assert op.project_id == GCP_PROJECT
    assert op.cluster_name == "cluster_name"
    assert op.cluster_config['worker_config']['num_instances'] == 2
    assert "zones/zone" in op.cluster_config['master_config']["machine_type_uri"]

    with pytest.warns(DeprecationWarning) as warnings:
        op_default_region = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            project_id=GCP_PROJECT,
            cluster_name="cluster_name",
            cluster_config=op.cluster_config,
        )
    assert_warning("Default region value", warnings)
    assert op_default_region.region == 'global'
def test_execute_if_cluster_exists(self, mock_hook):
    mock_hook.return_value.create_cluster.side_effect = [AlreadyExists("test")]
    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster=CLUSTER,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        request_id=REQUEST_ID,
    )
    op.execute(context={})
    mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID)
    mock_hook.return_value.create_cluster.assert_called_once_with(
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster=CLUSTER,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
    mock_hook.return_value.get_cluster.assert_called_once_with(
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_name=CLUSTER_NAME,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
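Several of these tests pass a full `cluster=CLUSTER` dict (the deprecated form) rather than the newer `cluster_config`/`cluster_name` keywords. The fixture itself is not shown; a minimal sketch of what such a dict typically contains, assuming the constants used above:

# Hypothetical fixture: a full Cluster resource as the deprecated `cluster=` form
# expects, embedding project id and cluster name alongside the config.
CLUSTER = {
    "project_id": GCP_PROJECT,
    "cluster_name": CLUSTER_NAME,
    "config": CONFIG,
}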
def test_deprecation_warning(self):
    with self.assertWarns(DeprecationWarning) as warning:
        op = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_name="cluster_name",
            num_workers=2,
            zone="zone",
        )
    assert_warning("Passing cluster parameters by keywords", warning)

    self.assertEqual(op.project_id, GCP_PROJECT)
    self.assertEqual(op.cluster_name, "cluster_name")
    self.assertEqual(op.cluster_config['worker_config']['num_instances'], 2)
    self.assertIn("zones/zone", op.cluster_config['master_config']["machine_type_uri"])

    with self.assertWarns(DeprecationWarning) as warning:
        op_default_region = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            project_id=GCP_PROJECT,
            cluster_name="cluster_name",
            cluster_config=op.cluster_config,
        )
    assert_warning("Default region value", warning)
    self.assertEqual(op_default_region.region, 'global')
def test_execute(self, mock_hook):
    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        labels=LABELS,
        cluster_name=CLUSTER_NAME,
        project_id=GCP_PROJECT,
        cluster_config=CONFIG,
        request_id=REQUEST_ID,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    op.execute(context={})
    mock_hook.assert_called_once_with(
        gcp_conn_id=GCP_CONN_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    mock_hook.return_value.create_cluster.assert_called_once_with(
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_config=CONFIG,
        labels=LABELS,
        cluster_name=CLUSTER_NAME,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
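Every test that takes a `mock_hook` (or `to_dict_mock`) argument relies on `mock.patch` decorators that these excerpts omit. A minimal sketch of the assumed wiring; the `DATAPROC_PATH` template is an assumption, not taken from the excerpts:

from unittest import mock

# Assumed patch target: the hook as imported inside the operator module.
DATAPROC_PATH = "airflow.providers.google.cloud.operators.dataproc.{}"

@mock.patch(DATAPROC_PATH.format("DataprocHook"))
def test_execute(self, mock_hook):
    ...  # body as above; mock_hook replaces DataprocHook for the duration of the test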
def test_execute_if_cluster_exists_in_error_state(self, mock_hook):
    mock_hook.return_value.create_cluster.side_effect = [AlreadyExists("test")]
    cluster_status = mock_hook.return_value.get_cluster.return_value.status
    # Force the mocked status to compare as ERROR: state == cluster_status.ERROR.
    cluster_status.state = 0
    cluster_status.ERROR = 0
    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_config=CONFIG,
        labels=LABELS,
        cluster_name=CLUSTER_NAME,
        delete_on_error=True,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        request_id=REQUEST_ID,
    )
    with self.assertRaises(AirflowException):
        op.execute(context={})

    mock_hook.return_value.diagnose_cluster.assert_called_once_with(
        region=GCP_LOCATION, project_id=GCP_PROJECT, cluster_name=CLUSTER_NAME
    )
    mock_hook.return_value.delete_cluster.assert_called_once_with(
        region=GCP_LOCATION, project_id=GCP_PROJECT, cluster_name=CLUSTER_NAME
    )
def test_execute_if_cluster_exists_in_deleting_state(
    self, mock_hook, mock_get_cluster, mock_create_cluster, mock_generator
):
    # First get_cluster sees a cluster stuck in DELETING, the retry then gets NotFound;
    # the re-created cluster comes back in ERROR state, so execute() must raise.
    cluster = mock.MagicMock()
    cluster.status.state = 0
    cluster.status.DELETING = 0

    cluster2 = mock.MagicMock()
    cluster2.status.state = 0
    cluster2.status.ERROR = 0

    mock_create_cluster.side_effect = [AlreadyExists("test"), cluster2]
    mock_generator.return_value = [0]
    mock_get_cluster.side_effect = [cluster, NotFound("test")]

    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_config=CONFIG,
        labels=LABELS,
        cluster_name=CLUSTER_NAME,
        delete_on_error=True,
        gcp_conn_id=GCP_CONN_ID,
    )
    with self.assertRaises(AirflowException):
        op.execute(context={})

    calls = [mock.call(mock_hook.return_value), mock.call(mock_hook.return_value)]
    mock_get_cluster.assert_has_calls(calls)
    mock_create_cluster.assert_has_calls(calls)
    mock_hook.return_value.diagnose_cluster.assert_called_once_with(
        region=GCP_LOCATION, project_id=GCP_PROJECT, cluster_name=CLUSTER_NAME
    )
def test_execute_if_cluster_exists_do_not_use(self, mock_hook):
    mock_hook.return_value.create_cluster.side_effect = [AlreadyExists("test")]
    mock_hook.return_value.get_cluster.return_value.status.state = 0
    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster=CLUSTER,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        request_id=REQUEST_ID,
        use_if_exists=False,
    )
    with self.assertRaises(AlreadyExists):
        op.execute(context={})
def test_execute_if_cluster_exists(self, mock_hook, to_dict_mock):
    mock_hook.return_value.create_cluster.side_effect = [AlreadyExists("test")]
    mock_hook.return_value.get_cluster.return_value.status.state = 0
    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_config=CONFIG,
        labels=LABELS,
        cluster_name=CLUSTER_NAME,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        request_id=REQUEST_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    op.execute(context={})
    mock_hook.assert_called_once_with(
        gcp_conn_id=GCP_CONN_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    mock_hook.return_value.create_cluster.assert_called_once_with(
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_config=CONFIG,
        labels=LABELS,
        cluster_name=CLUSTER_NAME,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
    mock_hook.return_value.get_cluster.assert_called_once_with(
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_name=CLUSTER_NAME,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
    to_dict_mock.assert_called_once_with(mock_hook.return_value.get_cluster.return_value)
def test_execute(self, mock_hook):
    op = DataprocCreateClusterOperator(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster=CLUSTER,
        request_id=REQUEST_ID,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
    op.execute(context={})
    mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID)
    mock_hook.return_value.create_cluster.assert_called_once_with(
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster=CLUSTER,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
def test_deprecation_warning(self):
    with self.assertWarns(DeprecationWarning) as warning:
        cluster_operator = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_name="cluster_name",
            num_workers=2,
            zone="zone",
        )
    assert_warning("Passing cluster parameters by keywords", warning)

    cluster = cluster_operator.cluster
    self.assertEqual(cluster['project_id'], GCP_PROJECT)
    self.assertEqual(cluster['cluster_name'], "cluster_name")
    self.assertEqual(cluster['config']['worker_config']['num_instances'], 2)
    self.assertIn("zones/zone", cluster["config"]['master_config']["machine_type_uri"])
def test_deprecation_warning(self, mock_generator, mock_signature):
    mock_signature.return_value.parameters = cluster_params
    with self.assertWarns(DeprecationWarning) as warning:
        DataprocCreateClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_name="cluster_name",
            num_workers=2,
            zone="zone",
        )
    assert_warning("Passing cluster parameters by keywords", warning)
    mock_generator.assert_called_once_with(
        task_id=TASK_ID,
        region=GCP_LOCATION,
        project_id=GCP_PROJECT,
        cluster_name="cluster_name",
        num_workers=2,
        zone="zone",
    )
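The `assert_warning` helper these tests call is not shown in the excerpts. A minimal sketch of what it plausibly does, assuming it receives the recorded-warnings object produced by `pytest.warns` or `assertWarns`:

# Hypothetical helper: passes if any captured warning message contains `msg`.
def assert_warning(msg, warnings):
    assert any(msg in str(w) for w in warnings.warnings)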
"retries": 0, "retry_delay": datetime.timedelta(minutes=5), "project_id": PROJECT, } with models.DAG( "Weekly-ETL-DAG-5", schedule_interval=None, start_date=datetime.datetime.combine(datetime.datetime.today(), datetime.datetime.min.time()), ) as dag: create_dataproc_cluster = DataprocCreateClusterOperator( task_id="create_dataproc_cluster", project_id=PROJECT, region=REGION, cluster=get_dataproc_config(), trigger_rule="all_done", ) create_firewall_rule = PythonOperator( task_id="create_firewall_rule", provide_context=True, python_callable=add_firewall_function, dag=dag, trigger_rule="all_done", ) start_pipelines = [] for x in range(len(DF_PIPELINES)): start_pipelines.append(
gcp_config = Variable.get('gcp_project_1', deserialize_json=True)
dataproc_config = gcp_config['dataproc']
bucket_config = dataproc_config['bucket']
cluster_config = DataprocCreateClusterConfig.make(gcp_config)

default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': datetime(2021, 3, 9, tzinfo=local_tz),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'project_id': gcp_config['project_id'],
    'region': gcp_config['region'],
    'gcp_conn_id': gcp_config['conn_id'],
}

with DAG(
    'create_dataproc',
    default_args=default_args,
    description='create_dataproc',
    schedule_interval='@once',
) as dag:
    create_dataproc = DataprocCreateClusterOperator(
        task_id='create_dataproc',
        cluster_name=dataproc_config['cluster_name'],
        cluster_config=cluster_config,
    )

    create_dataproc
project_id="test", zone="us-central1-a", master_machine_type="n1-standard-4", worker_machine_type="n1-standard-4", num_workers=2, storage_bucket="test", init_actions_uris=[path], metadata={ 'PIP_PACKAGES': 'pyyaml requests pandas openpyxl' }, ).make() create_cluster_operator = DataprocCreateClusterOperator( task_id='create_dataproc_cluster', cluster_name="test", project_id="test", region="us-central1", cluster_config=CLUSTER_GENERATOR_CONFIG, ) # [END how_to_cloud_dataproc_create_cluster_generate_cluster_config] # Update options # [START how_to_cloud_dataproc_updatemask_cluster_operator] CLUSTER_UPDATE = { "config": { "worker_config": { "num_instances": 3 }, "secondary_worker_config": { "num_instances": 3 }
"cluster_name": CLUSTER_NAME }, "hadoop_job": { "main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar", "args": ["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH], }, } with models.DAG( "example_gcp_dataproc", default_args={"start_date": days_ago(1)}, schedule_interval=None, ) as dag: create_cluster = DataprocCreateClusterOperator(task_id="create_cluster", project_id=PROJECT_ID, cluster=CLUSTER, region=REGION) scale_cluster = DataprocUpdateClusterOperator( task_id="scale_cluster", cluster_name=CLUSTER_NAME, cluster=CLUSTER_UPDATE, update_mask=UPDATE_MASK, graceful_decommission_timeout=TIMEOUT, project_id=PROJECT_ID, location=REGION, ) pig_task = DataprocSubmitJobOperator(task_id="pig_task", job=PIG_JOB, location=REGION,
"step_id": "pig_job_1", "pig_job": PIG_JOB["pig_job"] }], } with models.DAG( "example_gcp_dataproc", schedule_interval='@once', start_date=datetime(2021, 1, 1), catchup=False, ) as dag: # [START how_to_cloud_dataproc_create_cluster_operator] create_cluster = DataprocCreateClusterOperator( task_id="create_cluster", project_id=PROJECT_ID, cluster_config=CLUSTER_CONFIG, region=REGION, cluster_name=CLUSTER_NAME, ) # [END how_to_cloud_dataproc_create_cluster_operator] # [START how_to_cloud_dataproc_update_cluster_operator] scale_cluster = DataprocUpdateClusterOperator( task_id="scale_cluster", cluster_name=CLUSTER_NAME, cluster=CLUSTER_UPDATE, update_mask=UPDATE_MASK, graceful_decommission_timeout=TIMEOUT, project_id=PROJECT_ID, region=REGION, )
"properties": {"spark.yarn.queue": "default"} }, } with models.DAG( DAG_ID, schedule_interval=None, default_args=default_dag_args, tags=DAG_TAGS, ) as dag: # Must specify your tenant name and owner of the dag # Create a Cloud Dataproc cluster. create_dataproc_cluster = DataprocCreateClusterOperator( task_id="create_dataproc_cluster", impersonation_chain=CONNECT_SA, cluster_name=CLUSTER_NAME, # include your dataproc cluster name region=REGION, cluster_config=CLUSTER_CONFIG, labels={"tenant": TENANT, "created-by": USER, }, # specify your tenant's name ) # By default you won't have access to use `gcloud dataproc jobs submit` on the cluster that you created. # Running this script would let you submit jobs to the cluster through gcloud. # Be sure to give the correct cluster-name, cluster-region and your group entity assign_permissions = BashOperator( task_id="assign_permissions_for_dataproc_cluster", bash_command=f"bash {DAGS_FOLDER}/dataproc-set-iam.sh {CLUSTER_NAME} {REGION} group:{GROUP_NAME}", ) # BashOperator to hold the Dataproc delete operator for specified sleep time # sleep_task = BashOperator(task_id="sleep_task_to_keep_dataproc_cluster_alive_3h", bash_command="sleep 8h",)
""" ############################################################################### # DAG ############################################################################### with DAG( dag_id="DAF_PIPELINE_FOOD_ONTOLOGY_PREDICT_DAG", catchup=True, schedule_interval='00 6 * * *', max_active_runs=1, default_args=default_args ) as dag_daily: create_cluster = DataprocCreateClusterOperator( task_id="create_cluster", cluster_name=CLUSTER_NAME_DAILY, region=REGION, project_id=PROJECT_ID, cluster_config=CLUSTER_CONFIGURATION, ) pig_job_nltk_stopwords = DataprocSubmitJobOperator( task_id="pig_job_nltk_stopwords", job=get_pig_job_config("sh python -m nltk.downloader stopwords", CLUSTER_NAME_DAILY), location=REGION, project_id=PROJECT_ID ) pig_job_spacy_vocabulary = DataprocSubmitJobOperator( task_id="pig_job_spacy_vocabulary", job=get_pig_job_config("sh python -m spacy download es_core_news_lg",