def test_init_container(self):
    # GIVEN
    volume_mounts = [
        k8s.V1VolumeMount(mount_path='/etc/foo', name='test-volume', sub_path=None, read_only=True)
    ]

    init_environments = [
        k8s.V1EnvVar(name='key1', value='value1'),
        k8s.V1EnvVar(name='key2', value='value2'),
    ]

    init_container = k8s.V1Container(
        name="init-container",
        image="ubuntu:16.04",
        env=init_environments,
        volume_mounts=volume_mounts,
        command=["bash", "-cx"],
        args=["echo 10"],
    )

    volume = k8s.V1Volume(
        name='test-volume',
        persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name='test-volume'),
    )

    expected_init_container = {
        'name': 'init-container',
        'image': 'ubuntu:16.04',
        'command': ['bash', '-cx'],
        'args': ['echo 10'],
        'env': [{'name': 'key1', 'value': 'value1'}, {'name': 'key2', 'value': 'value2'}],
        'volumeMounts': [{'mountPath': '/etc/foo', 'name': 'test-volume', 'readOnly': True}],
    }

    k = KubernetesPodOperator(
        namespace='default',
        image="ubuntu:16.04",
        cmds=["bash", "-cx"],
        arguments=["echo 10"],
        labels={"foo": "bar"},
        name="test-" + str(random.randint(0, 1000000)),
        task_id="task" + self.get_current_task_name(),
        volumes=[volume],
        init_containers=[init_container],
        in_cluster=False,
        do_xcom_push=False,
    )

    context = create_context(k)
    k.execute(context)
    actual_pod = self.api_client.sanitize_for_serialization(k.pod)
    self.expected_pod['spec']['initContainers'] = [expected_init_container]
    self.expected_pod['spec']['volumes'] = [
        {'name': 'test-volume', 'persistentVolumeClaim': {'claimName': 'test-volume'}}
    ]
    assert self.expected_pod == actual_pod
def _construct_volume(name, claim, host) -> k8s.V1Volume:
    volume = k8s.V1Volume(name=name)
    if claim:
        volume.persistent_volume_claim = k8s.V1PersistentVolumeClaimVolumeSource(claim_name=claim)
    elif host:
        volume.host_path = k8s.V1HostPathVolumeSource(path=host, type='')
    else:
        volume.empty_dir = {}
    return volume
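A quick usage sketch of the helper above (argument values are illustrative only), showing the three mutually exclusive volume sources it can attach:

# PVC-backed volume (claim takes precedence when both claim and host are given)
pvc_vol = _construct_volume("data", claim="data-pvc", host=None)
# hostPath-backed volume
host_vol = _construct_volume("scratch", claim=None, host="/tmp/scratch")
# fallback: an emptyDir volume
tmp_vol = _construct_volume("cache", claim=None, host=None)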
def test_volume_mount(self):
    with mock.patch.object(PodLauncher, 'log') as mock_logger:
        volume_mount = k8s.V1VolumeMount(
            name='test-volume', mount_path='/tmp/test_volume', sub_path=None, read_only=False
        )

        volume = k8s.V1Volume(
            name='test-volume',
            persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name='test-volume'),
        )

        args = [
            "echo \"retrieved from mount\" > /tmp/test_volume/test.txt "
            "&& cat /tmp/test_volume/test.txt"
        ]

        k = KubernetesPodOperator(
            namespace='default',
            image="ubuntu:16.04",
            cmds=["bash", "-cx"],
            arguments=args,
            labels={"foo": "bar"},
            volume_mounts=[volume_mount],
            volumes=[volume],
            name="test-" + str(random.randint(0, 1000000)),
            task_id="task" + self.get_current_task_name(),
            in_cluster=False,
            do_xcom_push=False,
        )

        context = create_context(k)
        k.execute(context=context)

        mock_logger.info.assert_any_call('retrieved from mount')
        actual_pod = self.api_client.sanitize_for_serialization(k.pod)
        self.expected_pod['spec']['containers'][0]['args'] = args
        self.expected_pod['spec']['containers'][0]['volumeMounts'] = [
            {'name': 'test-volume', 'mountPath': '/tmp/test_volume', 'readOnly': False}
        ]
        self.expected_pod['spec']['volumes'] = [
            {'name': 'test-volume', 'persistentVolumeClaim': {'claimName': 'test-volume'}}
        ]
        assert self.expected_pod == actual_pod
def pipeline_definition(
        hydrosphere_name="local",
        hydrosphere_address="http://hydro-serving-sidecar-serving.kubeflow.svc.cluster.local:8080",
        data_directory='/data/mnist',
        models_directory="/models/mnist",
        learning_rate="0.01",
        learning_steps="5000",
        batch_size="256",
        warmpup_count="100",
        model_name="mnist",
        application_name="mnist-app",
        signature_name="predict",
        acceptable_accuracy="0.90",
):
    data_pvc = k8s.V1PersistentVolumeClaimVolumeSource(claim_name="data")
    models_pvc = k8s.V1PersistentVolumeClaimVolumeSource(claim_name="models")
    data_volume = k8s.V1Volume(name="data", persistent_volume_claim=data_pvc)
    models_volume = k8s.V1Volume(name="models", persistent_volume_claim=models_pvc)
    data_volume_mount = k8s.V1VolumeMount(
        mount_path="{{workflow.parameters.data-directory}}", name="data")
    models_volume_mount = k8s.V1VolumeMount(
        mount_path="{{workflow.parameters.models-directory}}", name="models")

    hydrosphere_address_env = k8s.V1EnvVar(
        name="CLUSTER_ADDRESS", value="{{workflow.parameters.hydrosphere-address}}")
    hydrosphere_name_env = k8s.V1EnvVar(
        name="CLUSTER_NAME", value="{{workflow.parameters.hydrosphere-name}}")
    data_directory_env = k8s.V1EnvVar(
        name="MNIST_DATA_DIR", value="{{workflow.parameters.data-directory}}")
    models_directory_env = k8s.V1EnvVar(
        name="MNIST_MODELS_DIR", value="{{workflow.parameters.models-directory}}")
    model_name_env = k8s.V1EnvVar(name="MODEL_NAME", value="{{workflow.parameters.model-name}}")
    application_name_env = k8s.V1EnvVar(
        name="APPLICATION_NAME", value="{{workflow.parameters.application-name}}")
    signature_name_env = k8s.V1EnvVar(
        name="SIGNATURE_NAME", value="{{workflow.parameters.signature-name}}")
    acceptable_accuracy_env = k8s.V1EnvVar(
        name="ACCEPTABLE_ACCURACY", value="{{workflow.parameters.acceptable-accuracy}}")
    learning_rate_env = k8s.V1EnvVar(
        name="LEARNING_RATE", value="{{workflow.parameters.learning-rate}}")
    learning_steps_env = k8s.V1EnvVar(
        name="LEARNING_STEPS", value="{{workflow.parameters.learning-steps}}")
    batch_size_env = k8s.V1EnvVar(name="BATCH_SIZE", value="{{workflow.parameters.batch-size}}")
    warmup_count_env = k8s.V1EnvVar(
        name="WARMUP_IMAGES_AMOUNT", value="{{workflow.parameters.warmpup-count}}")

    # 1. Download MNIST data
    download = dsl.ContainerOp(
        name="download", image="tidylobster/mnist-pipeline-download:latest")
    download.add_volume(data_volume)
    download.add_volume_mount(data_volume_mount)
    download.add_env_variable(data_directory_env)

    # 2. Train and save an MNIST classifier using Tensorflow
    train = dsl.ContainerOp(name="train", image="tidylobster/mnist-pipeline-train:latest")
    train.after(download)
    train.set_memory_request('2G')
    train.set_cpu_request('1')
    train.add_volume(data_volume)
    train.add_volume(models_volume)
    train.add_volume_mount(data_volume_mount)
    train.add_volume_mount(models_volume_mount)
    train.add_env_variable(data_directory_env)
    train.add_env_variable(models_directory_env)
    train.add_env_variable(learning_rate_env)
    train.add_env_variable(learning_steps_env)
    train.add_env_variable(batch_size_env)

    # 3. Upload trained model to the cluster
    upload = dsl.ContainerOp(
        name="upload",
        image="tidylobster/mnist-pipeline-upload:latest",
        file_outputs={"model_version": "/model_version.txt"})
    upload.after(train)
    upload.add_volume(models_volume)
    upload.add_volume_mount(models_volume_mount)
    upload.add_env_variable(models_directory_env)
    upload.add_env_variable(model_name_env)
    upload.add_env_variable(hydrosphere_name_env)
    upload.add_env_variable(hydrosphere_address_env)

    # 4. Deploy application
    deploy = dsl.ContainerOp(
        name="deploy",
        image="tidylobster/mnist-pipeline-deploy:latest",
        arguments=[upload.outputs["model_version"]])
    deploy.after(upload)
    deploy.add_env_variable(hydrosphere_name_env)
    deploy.add_env_variable(hydrosphere_address_env)
    deploy.add_env_variable(application_name_env)
    deploy.add_env_variable(model_name_env)

    # 5. Test the model
    test = dsl.ContainerOp(name="test", image="tidylobster/mnist-pipeline-test:latest")
    test.after(deploy)
    test.add_volume(data_volume)
    test.add_volume_mount(data_volume_mount)
    test.add_env_variable(data_directory_env)
    test.add_env_variable(hydrosphere_address_env)
    test.add_env_variable(application_name_env)
    test.add_env_variable(signature_name_env)
    test.add_env_variable(warmup_count_env)
    test.add_env_variable(acceptable_accuracy_env)

    # 6. Clean environment
    clean = dsl.ContainerOp(name="clean", image="tidylobster/mnist-pipeline-clean:latest")
    clean.after(test)
    clean.add_volume(data_volume)
    clean.add_volume_mount(data_volume_mount)
    clean.add_env_variable(data_directory_env)
    clean.add_volume(models_volume)
    clean.add_volume_mount(models_volume_mount)
    clean.add_env_variable(models_directory_env)
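A Kubeflow pipeline function like the one above is normally decorated with @dsl.pipeline and compiled before it can be run; a minimal sketch with the KFP v1 SDK (the output filename is an arbitrary choice, and the decorator may simply have been cut off from the snippet):

import kfp.compiler as compiler

# Compile the pipeline function above into an Argo workflow archive.
compiler.Compiler().compile(pipeline_definition, "mnist_pipeline.tar.gz")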
    default_args=ai_training_run_dag_default_args,
    schedule_interval=None,
    start_date=days_ago(2),
    tags=['training']
)

# Define Kubernetes namespace to execute DAG in
namespace = 'airflow'

## Define volume details (change values as necessary to match your environment)

# Dataset volume
dataset_volume_pvc_existing = 'dataset-vol'
dataset_volume = k8s.V1Volume(
    name=dataset_volume_pvc_existing,
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name=dataset_volume_pvc_existing),
)
dataset_volume_mount = k8s.V1VolumeMount(
    name=dataset_volume_pvc_existing, mount_path='/mnt/dataset', sub_path=None, read_only=False
)

# Model volume
model_volume_pvc_existing = 'airflow-model-vol'
model_volume = k8s.V1Volume(
    name=model_volume_pvc_existing,
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name=model_volume_pvc_existing),
)
model_volume_mount = k8s.V1VolumeMount(
secret_all_keys = Secret("env", None, "airflow-secrets-2")
volume_mount = k8s.V1VolumeMount(
    name="test-volume", mount_path="/root/mount_file", sub_path=None, read_only=True
)
configmaps = [
    k8s.V1EnvFromSource(config_map_ref=k8s.V1ConfigMapEnvSource(name="test-configmap-1")),
    k8s.V1EnvFromSource(config_map_ref=k8s.V1ConfigMapEnvSource(name="test-configmap-2")),
]
volume = k8s.V1Volume(
    name="test-volume",
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name="test-volume"),
)
port = k8s.V1ContainerPort(name="http", container_port=80)
init_container_volume_mounts = [
    k8s.V1VolumeMount(mount_path="/etc/foo", name="test-volume", sub_path=None, read_only=True)
]
init_environments = [
    k8s.V1EnvVar(name="key1", value="value1"),
    k8s.V1EnvVar(name="key2", value="value2"),
]
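Objects like these are typically passed straight to a KubernetesPodOperator; a minimal sketch following the pattern in the other snippets here (the init container definition, pod name, image, and task id are illustrative assumptions):

init_container = k8s.V1Container(
    name="init-container",
    image="ubuntu:16.04",
    env=init_environments,
    volume_mounts=init_container_volume_mounts,
    command=["bash", "-cx"],
    args=["echo 10"],
)

k = KubernetesPodOperator(
    namespace="default",
    image="ubuntu:16.04",
    secrets=[secret_all_keys],
    ports=[port],
    volumes=[volume],
    volume_mounts=[volume_mount],
    env_from=configmaps,
    init_containers=[init_container],
    name="airflow-test-pod",
    task_id="task",
    in_cluster=False,
    do_xcom_push=False,
)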
from datetime import datetime, timedelta

import urllib.request
import json

from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.operators.dummy_operator import DummyOperator
from kubernetes.client import models as k8s
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.operators.bash import BashOperator
from airflow.operators.http_operator import SimpleHttpOperator

default_args = {'owner': 'datagap'}

volume = k8s.V1Volume(
    name='data-volume',
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
        claim_name='shared-data-volume'))

volume_mount = k8s.V1VolumeMount(
    name='data-volume', mount_path='/shared-data', sub_path=None, read_only=False)

druidUrl = Variable.get("druid_broker_url")
templateUrl = Variable.get("ntreis_druid_validation_index_url")
ntreisPropDatasource = Variable.get("ntreis_prop_sold_datasource")
validationDatasource = Variable.get("validation_datasource")
login_url = Variable.get("ntreis_login_url")
rets_type = Variable.get("ntreis_rets_type")
search_limit = Variable.get("ntreis_search_limit")
password = Variable.get("ntreis_password")
def pipeline_definition(
        hydrosphere_address="{hydrosphere-instance-address}",  # <-- Replace with correct instance address
        mount_path='/storage',
        learning_rate="0.01",
        learning_steps="10000",
        batch_size="256",
        warmpup_count="100",
        model_name="mnist",
        application_name="mnist-app",
        signature_name="predict",
        acceptable_accuracy="0.90",
        requests_delay="4",
        recurring_run="0",
):
    storage_pvc = k8s.V1PersistentVolumeClaimVolumeSource(claim_name="storage")
    storage_volume = k8s.V1Volume(name="storage", persistent_volume_claim=storage_pvc)
    storage_volume_mount = k8s.V1VolumeMount(
        mount_path="{{workflow.parameters.mount-path}}", name="storage")

    hydrosphere_address_env = k8s.V1EnvVar(
        name="CLUSTER_ADDRESS", value="{{workflow.parameters.hydrosphere-address}}")
    mount_path_env = k8s.V1EnvVar(
        name="MOUNT_PATH", value="{{workflow.parameters.mount-path}}")
    model_name_env = k8s.V1EnvVar(
        name="MODEL_NAME", value="{{workflow.parameters.model-name}}")
    application_name_env = k8s.V1EnvVar(
        name="APPLICATION_NAME", value="{{workflow.parameters.application-name}}")
    signature_name_env = k8s.V1EnvVar(
        name="SIGNATURE_NAME", value="{{workflow.parameters.signature-name}}")
    acceptable_accuracy_env = k8s.V1EnvVar(
        name="ACCEPTABLE_ACCURACY", value="{{workflow.parameters.acceptable-accuracy}}")
    learning_rate_env = k8s.V1EnvVar(
        name="LEARNING_RATE", value="{{workflow.parameters.learning-rate}}")
    learning_steps_env = k8s.V1EnvVar(
        name="LEARNING_STEPS", value="{{workflow.parameters.learning-steps}}")
    batch_size_env = k8s.V1EnvVar(
        name="BATCH_SIZE", value="{{workflow.parameters.batch-size}}")
    warmup_count_env = k8s.V1EnvVar(
        name="WARMUP_IMAGES_AMOUNT", value="{{workflow.parameters.warmpup-count}}")
    requests_delay_env = k8s.V1EnvVar(
        name="REQUESTS_DELAY", value="{{workflow.parameters.requests-delay}}")
    recurring_run_env = k8s.V1EnvVar(
        name="RECURRING_RUN", value="{{workflow.parameters.recurring-run}}")

    # 1. Make a sample of production data for retraining
    sample = dsl.ContainerOp(
        name="sample",
        image="tidylobster/mnist-pipeline-sampling:latest")  # <-- Replace with correct docker image
    sample.add_volume(storage_volume)
    sample.add_volume_mount(storage_volume_mount)
    sample.add_env_variable(mount_path_env)
    sample.add_env_variable(hydrosphere_address_env)
    sample.add_env_variable(application_name_env)

    # 2. Train and save an MNIST classifier using Tensorflow
    train = dsl.ContainerOp(
        name="train",
        image="tidylobster/mnist-pipeline-train:latest",  # <-- Replace with correct docker image
        file_outputs={"accuracy": "/accuracy.txt"})
    train.after(sample)
    train.set_memory_request('2G')
    train.set_cpu_request('1')
    train.add_volume(storage_volume)
    train.add_volume_mount(storage_volume_mount)
    train.add_env_variable(mount_path_env)
    train.add_env_variable(learning_rate_env)
    train.add_env_variable(learning_steps_env)
    train.add_env_variable(batch_size_env)
    train.add_env_variable(recurring_run_env)

    # 3. Upload trained model to the cluster
    upload = dsl.ContainerOp(
        name="upload",
        image="tidylobster/mnist-pipeline-upload:latest",  # <-- Replace with correct docker image
        file_outputs={"model-version": "/model-version.txt"},
        arguments=[train.outputs["accuracy"]])
    upload.after(train)
    upload.add_volume(storage_volume)
    upload.add_volume_mount(storage_volume_mount)
    upload.add_env_variable(mount_path_env)
    upload.add_env_variable(model_name_env)
    upload.add_env_variable(hydrosphere_address_env)
    upload.add_env_variable(learning_rate_env)
    upload.add_env_variable(learning_steps_env)
    upload.add_env_variable(batch_size_env)

    # 4. Pre-deploy application
    predeploy = dsl.ContainerOp(
        name="predeploy",
        image="tidylobster/mnist-pipeline-predeploy:latest",  # <-- Replace with correct docker image
        arguments=[upload.outputs["model-version"]],
        file_outputs={"predeploy-app-name": "/predeploy-app-name.txt"})
    predeploy.after(upload)
    predeploy.add_env_variable(hydrosphere_address_env)
    predeploy.add_env_variable(application_name_env)
    predeploy.add_env_variable(model_name_env)

    # 5. Test the model
    test = dsl.ContainerOp(
        name="test",
        image="tidylobster/mnist-pipeline-test:latest",  # <-- Replace with correct docker image
        arguments=[predeploy.outputs["predeploy-app-name"]])
    test.set_retry(3)
    test.after(predeploy)
    test.add_volume(storage_volume)
    test.add_volume_mount(storage_volume_mount)
    test.add_env_variable(mount_path_env)
    test.add_env_variable(hydrosphere_address_env)
    test.add_env_variable(application_name_env)
    test.add_env_variable(signature_name_env)
    test.add_env_variable(warmup_count_env)
    test.add_env_variable(acceptable_accuracy_env)
    test.add_env_variable(requests_delay_env)
    test.add_env_variable(recurring_run_env)

    # 6. Remove predeploy application
    rm_predeploy = dsl.ContainerOp(
        name="remove-predeploy",
        image="tidylobster/mnist-pipeline-rm-predeploy:latest",  # <-- Replace with correct docker image
        arguments=[predeploy.outputs["predeploy-app-name"]])
    rm_predeploy.after(test)
    rm_predeploy.add_env_variable(hydrosphere_address_env)

    # 7. Deploy application
    deploy = dsl.ContainerOp(
        name="deploy",
        image="tidylobster/mnist-pipeline-deploy:latest",  # <-- Replace with correct docker image
        arguments=[upload.outputs["model-version"]])
    deploy.after(test)
    deploy.add_env_variable(hydrosphere_address_env)
    deploy.add_env_variable(application_name_env)
    deploy.add_env_variable(model_name_env)
def to_k8s_volumes(self):
    return k8s.V1Volume(
        name=self.pv_name,
        persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name=self.pvc_name))
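A minimal sketch of the surrounding class this method assumes (the class name is hypothetical; only the pv_name and pvc_name attributes are implied by the method body):

from dataclasses import dataclass
from kubernetes.client import models as k8s

@dataclass
class VolumeSpec:  # hypothetical holder for the two names the method reads
    pv_name: str
    pvc_name: str

    def to_k8s_volumes(self):
        return k8s.V1Volume(
            name=self.pv_name,
            persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(claim_name=self.pvc_name))

volume = VolumeSpec(pv_name="data", pvc_name="data-claim").to_k8s_volumes()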
import datetime as dt

from kubernetes.client import models as k8s

from airflow import DAG
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import (
    KubernetesPodOperator,
)

with DAG(
    dag_id="02_kubernetes",
    description="Fetches ratings from the Movielens API using kubernetes.",
    start_date=dt.datetime(2019, 1, 1),
    end_date=dt.datetime(2019, 1, 3),
    schedule_interval="@daily",
    default_args={"depends_on_past": True},
) as dag:
    volume_claim = k8s.V1PersistentVolumeClaimVolumeSource(claim_name="data-volume")
    volume = k8s.V1Volume(name="data-volume", persistent_volume_claim=volume_claim)
    volume_mount = k8s.V1VolumeMount(
        name="data-volume", mount_path="/data", sub_path=None, read_only=False)

    fetch_ratings = KubernetesPodOperator(
        task_id="fetch_ratings",
        # Airflow 2.0.0a2 has a bug that results in the pod operator not applying
        # the image pull policy. By default, the k8s SDK uses a policy of always
        # pulling the image when using the latest tag, but only pulling an image if
        # it's not present (what we want) when using a different tag. For now, we
        # use this behaviour to get our desired image pull policy behaviour.
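        # Illustrative continuation (not from the original snippet): any pinned,
        # non-"latest" tag gets the k8s SDK default pull policy of "IfNotPresent",
        # whereas a ":latest" tag would default to "Always".
        image="example/movielens-fetch:1.0",  # hypothetical pinned tag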
# smaller than the number of the actual filters
num_of_pyramid_tasks_per_tile = 10

# Kubernetes config: namespace, resources, volume and volume_mounts
namespace = "default"
compute_resources = {
    "request_cpu": "2000m",
    "request_memory": "1.5Gi",
    "limit_cpu": "2000m",
    "limit_memory": "4.5Gi",
}
dataset_volume = k8s.V1Volume(
    name="eo-data",
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
        claim_name="fonda-datasets"),
)
dataset_volume_mount = k8s.V1VolumeMount(
    name="eo-data", mount_path="/data/input", sub_path=None, read_only=True)
outputs_volume = k8s.V1Volume(
    name="outputs-data",
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
        claim_name="force-airflow"),
)
outputs_volume_mount = k8s.V1VolumeMount(
    name="outputs-data", mount_path=OUTPUTS_DATA_PATH,
# Use this for local testing:
# volume = k8s.V1Volume(
#     name="lyric-wordcloud-volume",
#     host_path=k8s.V1HostPathVolumeSource(
#         path="/data",
#         type="Directory"
#     )
# )

# lyrics-wordcloud-volume is an existing PersistentVolumeClaim in the CephFS storage
volume = k8s.V1Volume(
    name="lyrics-wordcloud-volume",
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
        claim_name='lyrics-wordcloud-volume'))

# describes where to mount the volume in the pod
volume_mount = k8s.V1VolumeMount(
    name="lyrics-wordcloud-volume", mount_path="/results", sub_path=None, read_only=False)

# TODO: Read from secret
env_vars = {'GENIUS_API_KEY': 'XXXXXXXXXXXXXXXXXXXXXXXXXX'}

# List of artists to create lyric wordclouds for
artists = [
    "Rammstein",
    "Die Ärzte",
    "Die Toten Hosen",
    "Peter Maffay",
    "Nimo",
    "Mark Forster",
    "Lea"
def pipeline_definition(
        hydrosphere_address,
        mount_path='/storage',
        learning_rate="0.01",
        epochs="10",
        batch_size="256",
        model_name="mnist",
        acceptable_accuracy="0.90",
):
    storage_pvc = k8s.V1PersistentVolumeClaimVolumeSource(claim_name="storage")
    storage_volume = k8s.V1Volume(name="storage", persistent_volume_claim=storage_pvc)
    storage_volume_mount = k8s.V1VolumeMount(
        mount_path="{{workflow.parameters.mount-path}}", name="storage")

    # 1. Download MNIST data
    download = dsl.ContainerOp(
        name="download",
        image="tidylobster/mnist-pipeline-download:latest",  # <-- Replace with correct docker image
        file_outputs={"data_path": "/data_path.txt"},
        arguments=["--mount-path", mount_path])
    download.add_volume(storage_volume)
    download.add_volume_mount(storage_volume_mount)

    # 2. Train and save an MNIST classifier using Tensorflow
    train = dsl.ContainerOp(
        name="train",
        image="tidylobster/mnist-pipeline-train:latest",  # <-- Replace with correct docker image
        file_outputs={
            "accuracy": "/accuracy.txt",
            "model_path": "/model_path.txt",
        },
        command=[
            "python", "train-estimator.py",
            "--data-path", download.outputs["data_path"],
            "--mount-path", mount_path,
            "--learning-rate", learning_rate,
            "--epochs", epochs,
            "--batch-size", batch_size,
        ])
    train.add_volume(storage_volume)
    train.add_volume_mount(storage_volume_mount)
    train.after(download)
    train.set_memory_request('1G')
    train.set_cpu_request('1')

    # 3. Release trained model to the cluster
    release = dsl.ContainerOp(
        name="release",
        image="tidylobster/mnist-pipeline-release:latest",  # <-- Replace with correct docker image
        file_outputs={"model-version": "/model-version.txt"},
        arguments=[
            "--data-path", download.outputs["data_path"],
            "--mount-path", mount_path,
            "--model-name", model_name,
            "--model-path", train.outputs["model_path"],
            "--accuracy", train.outputs["accuracy"],
            "--hydrosphere-address", hydrosphere_address,
            "--learning-rate", learning_rate,
            "--epochs", epochs,
            "--batch-size", batch_size,
        ])
    release.add_volume(storage_volume)
    release.add_volume_mount(storage_volume_mount)
    release.after(train)

    # 4. Deploy the model to a staging application
    deploy_to_stage = dsl.ContainerOp(
        name="deploy_to_stage",
        image="tidylobster/mnist-pipeline-deploy-to-stage:latest",  # <-- Replace with correct docker image
        file_outputs={"stage-app-name": "/stage-app-name.txt"},
        arguments=[
            "--model-version", release.outputs["model-version"],
            "--hydrosphere-address", hydrosphere_address,
            "--model-name", model_name,
        ],
    )
    deploy_to_stage.after(release)

    # 5. Test the model
    test = dsl.ContainerOp(
        name="test",
        image="tidylobster/mnist-pipeline-test:latest",  # <-- Replace with correct docker image
        arguments=[
            "--stage-app-name", deploy_to_stage.outputs["stage-app-name"],
            "--data-path", download.outputs["data_path"],
            "--mount-path", mount_path,
            "--hydrosphere-address", hydrosphere_address,
            "--acceptable-accuracy", acceptable_accuracy,
            "--model-name", model_name,
        ],
    )
    test.add_volume(storage_volume)
    test.add_volume_mount(storage_volume_mount)
    test.after(deploy_to_stage)
    test.set_retry(3)

    # 6. Deploy the model to the production application
    deploy_to_prod = dsl.ContainerOp(
        name="deploy_to_prod",
        image="tidylobster/mnist-pipeline-deploy-to-prod:latest",  # <-- Replace with correct docker image
        arguments=[
            "--model-version", release.outputs["model-version"],
            "--model-name", model_name,
            "--hydrosphere-address", hydrosphere_address,
        ],
    )
    deploy_to_prod.after(test)
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

spark_application_name = "spark-wordcount-py-{{ ds }}-{{ task_instance.try_number }}"

compute_resources = {
    'request_cpu': '200m',
    'request_memory': '512Mi',
    'limit_cpu': '500m',
    'limit_memory': '1Gi'
}

volume = k8s.V1Volume(
    name="spark-data",
    persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
        claim_name='spark-pvc'))

# this describes where to mount the volume in the pod
volume_mount = k8s.V1VolumeMount(
    name="spark-data", mount_path="/mnt1", sub_path=None, read_only=False)

dag = DAG(
    'example_spark_wordcount_dag',
    default_args=default_args,
    description='Simple wordcount spark job which uses the spark-k8s-operator in a Kubernetes cluster',
    schedule_interval=timedelta(days=1),
)
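The snippet ends before any task consumes these objects; as a rough sketch of how they would typically be wired into the DAG above (the task id, image, and namespace are assumptions; passing resources as a dict of request_*/limit_* keys follows the older KubernetesPodOperator convention seen in example DAGs like this one):

wordcount = KubernetesPodOperator(
    task_id="spark_wordcount",               # assumed task id
    name=spark_application_name,
    namespace="default",                     # assumed namespace
    image="example/spark-wordcount-py:1.0",  # hypothetical image
    resources=compute_resources,
    volumes=[volume],
    volume_mounts=[volume_mount],
    dag=dag,
)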