def test_create_pyspark_job_operator(self, create_pyspark_job_mock, *_):
    operator = DataprocCreatePysparkJobOperator(
        task_id='create_pyspark_job',
        main_python_file_uri='s3a://some-in-bucket/jobs/sources/pyspark-001/main.py',
        python_file_uris=[
            's3a://some-in-bucket/jobs/sources/pyspark-001/geonames.py',
        ],
        file_uris=[
            's3a://some-in-bucket/jobs/sources/data/config.json',
        ],
        archive_uris=[
            's3a://some-in-bucket/jobs/sources/data/country-codes.csv.zip',
        ],
        args=[
            's3a://some-in-bucket/jobs/sources/data/cities500.txt.bz2',
            's3a://some-out-bucket/jobs/results/${{JOB_ID}}',
        ],
        jar_file_uris=[
            's3a://some-in-bucket/jobs/sources/java/dataproc-examples-1.0.jar',
            's3a://some-in-bucket/jobs/sources/java/icu4j-61.1.jar',
            's3a://some-in-bucket/jobs/sources/java/commons-lang-2.6.jar',
        ],
        properties={
            'spark.submit.deployMode': 'cluster',
        },
    )
    context = {'task_instance': MagicMock()}
    context['task_instance'].xcom_pull.return_value = 'my_cluster_id'

    operator.execute(context)

    context['task_instance'].xcom_pull.assert_has_calls([
        call(key='cluster_id'),
        call(key='yandexcloud_connection_id'),
    ])

    create_pyspark_job_mock.assert_called_once_with(
        archive_uris=['s3a://some-in-bucket/jobs/sources/data/country-codes.csv.zip'],
        args=[
            's3a://some-in-bucket/jobs/sources/data/cities500.txt.bz2',
            's3a://some-out-bucket/jobs/results/${{JOB_ID}}',
        ],
        cluster_id='my_cluster_id',
        file_uris=['s3a://some-in-bucket/jobs/sources/data/config.json'],
        jar_file_uris=[
            's3a://some-in-bucket/jobs/sources/java/dataproc-examples-1.0.jar',
            's3a://some-in-bucket/jobs/sources/java/icu4j-61.1.jar',
            's3a://some-in-bucket/jobs/sources/java/commons-lang-2.6.jar',
        ],
        main_python_file_uri='s3a://some-in-bucket/jobs/sources/pyspark-001/main.py',
        name='Pyspark job',
        properties={'spark.submit.deployMode': 'cluster'},
        python_file_uris=['s3a://some-in-bucket/jobs/sources/pyspark-001/geonames.py'],
    )
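# Illustrative sketch (assumptions flagged): how the fake Airflow context used in
# the test above is built with stdlib mocks. MagicMock and call come from
# unittest.mock; the operator's hook call itself is replaced elsewhere by the
# `create_pyspark_job_mock` patch, whose real patch target lives in the provider's
# test module and is not shown here.
from unittest.mock import MagicMock, call

fake_ti = MagicMock()
fake_ti.xcom_pull.return_value = 'my_cluster_id'  # what the operator reads via XCom
context = {'task_instance': fake_ti}

fake_ti.xcom_pull(key='cluster_id')
fake_ti.xcom_pull.assert_has_calls([call(key='cluster_id')])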
create_pyspark_job = DataprocCreatePysparkJobOperator(
    task_id='create_pyspark_job',
    main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
    python_file_uris=[
        's3a://data-proc-public/jobs/sources/pyspark-001/geonames.py',
    ],
    file_uris=[
        's3a://data-proc-public/jobs/sources/data/config.json',
    ],
    archive_uris=[
        's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
    ],
    args=[
        's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
        f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/dataproc/job/results/${{JOB_ID}}',
    ],
    jar_file_uris=[
        's3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
        's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
        's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
    ],
    properties={
        'spark.submit.deployMode': 'cluster',
    },
    packages=['org.slf4j:slf4j-simple:1.7.30'],
    repositories=['https://repo1.maven.org/maven2'],
    exclude_packages=['com.amazonaws:amazon-kinesis-client'],
)
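# Note on the results URI above (illustrative): inside an f-string, doubled braces
# are escape sequences, so `${{JOB_ID}}` renders to the literal text `${JOB_ID}`,
# which is left in the output path as a job-id placeholder. The bucket name below
# is a hypothetical value used only to show the rendering.
_bucket = 'example-bucket'
assert f's3a://{_bucket}/dataproc/job/results/${{JOB_ID}}' == (
    's3a://example-bucket/dataproc/job/results/${JOB_ID}'
)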
    },
)

create_pyspark_job = DataprocCreatePysparkJobOperator(
    task_id='create_pyspark_job',
    main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
    python_file_uris=[
        's3a://data-proc-public/jobs/sources/pyspark-001/geonames.py',
    ],
    file_uris=[
        's3a://data-proc-public/jobs/sources/data/config.json',
    ],
    archive_uris=[
        's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
    ],
    args=[
        's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
        f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/jobs/results/${{JOB_ID}}',
    ],
    jar_file_uris=[
        's3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
        's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
        's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
    ],
    properties={
        'spark.submit.deployMode': 'cluster',
    },
)

delete_cluster = DataprocDeleteClusterOperator(
    task_id='delete_cluster',
)
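# Minimal sketch of the task ordering typically used with the operators above.
# This is an assumption about the surrounding example DAG: a cluster-creation task
# (here hypothetically named `create_cluster`) runs first and pushes the cluster id
# to XCom, the PySpark job picks that id up (as the test above simulates with
# xcom_pull), and the cluster is deleted last:
#
#     create_cluster >> create_pyspark_job >> delete_cluster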