def test_apply_task_to_dag(self):
    dag = dag_test_utils.create_dag()

    task0 = job_end.JobEndTask(
        task_id='job_end',
        dag=dag,
        pipeline_config={'pipeline': 'my_end_pipeline'},
        task_config={},
        parent=None,
        trigger_rule='all_done',
        liminal_config={
            'metrics': {
                'namespace': 'EndJobNameSpace',
                'backends': ['cloudwatch']
            }
        })
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, 'EndJobNameSpace')
    self.assertEqual(dag_task0.backends, ['cloudwatch'])
    self.assertEqual(dag_task0.task_id, 'end')
def test_apply_task_to_dag(self): dag = dag_test_utils.create_dag() task0 = job_start.JobStartTask( task_id="start_task", dag=dag, liminal_config={ 'metrics': { 'namespace': 'StartJobNameSpace', 'backends': ['cloudwatch'] } }, pipeline_config={'pipeline': 'my_start_pipeline'}, task_config={}, parent=None, trigger_rule='all_success') task0.apply_task_to_dag() self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, 'StartJobNameSpace') self.assertEqual(dag_task0.backends, ['cloudwatch']) self.assertEqual(dag_task0.task_id, 'start')
def setUp(self) -> None:
    self.dag = dag_test_utils.create_dag()
    self.dag.context = {
        'ti': self.dag,
        'ts': datetime.now().timestamp(),
        'execution_date': datetime.now().timestamp()
    }
    self.dag.get_dagrun = MagicMock()

    self.cluster_name = "liminal-cluster-for-tests"

    self.delete_cloudformation_task = \
        DeleteCloudFormationStackTask(
            'delete-emr',
            self.dag,
            [],
            trigger_rule='all_success',
            liminal_config={},
            pipeline_config={},
            task_config={
                'stack_name': self.cluster_name
            })
    self.delete_cloudformation_task.apply_task_to_dag()
def test_apply_task_to_dag(self):
    dag = dag_test_utils.create_dag()

    task0 = self.__create_python_task(dag,
                                      'my_input_task',
                                      None,
                                      'my_python_task_img',
                                      'python -u write_inputs.py',
                                      env_vars={
                                          'NUM_FILES': 10,
                                          'NUM_SPLITS': 3
                                      })
    task0.apply_task_to_dag()

    task1 = self.__create_python_task(dag,
                                      'my_output_task',
                                      dag.tasks[0],
                                      'my_parallelized_python_task_img',
                                      'python -u write_outputs.py',
                                      executors=3)
    task1.apply_task_to_dag()

    for task in dag.tasks:
        print(f'Executing task {task.task_id}')
        task.execute(DummyDag('my_dag', task.task_id).context)

    inputs_dir = os.path.join(self.temp_dir, 'inputs')
    outputs_dir = os.path.join(self.temp_dir, 'outputs')

    self.assertListEqual(sorted(os.listdir(self.temp_dir)),
                         sorted(['outputs', 'inputs']))

    inputs_dir_contents = sorted(os.listdir(inputs_dir))
    self.assertListEqual(inputs_dir_contents, ['0', '1', '2'])

    self.assertListEqual(
        sorted(os.listdir(os.path.join(inputs_dir, '0'))),
        ['input0.json', 'input3.json', 'input6.json', 'input9.json'])
    self.assertListEqual(sorted(os.listdir(os.path.join(inputs_dir, '1'))),
                         ['input1.json', 'input4.json', 'input7.json'])
    self.assertListEqual(sorted(os.listdir(os.path.join(inputs_dir, '2'))),
                         ['input2.json', 'input5.json', 'input8.json'])

    self.assertListEqual(sorted(os.listdir(outputs_dir)), [
        'output0.txt', 'output1.txt', 'output2.txt', 'output3.txt',
        'output4.txt', 'output5.txt', 'output6.txt', 'output7.txt',
        'output8.txt', 'output9.txt'
    ])

    for filename in os.listdir(outputs_dir):
        with open(os.path.join(outputs_dir, filename)) as f:
            expected_file_content = filename.replace('output', 'myval').replace('.txt', '')
            self.assertEqual(f.read(), expected_file_content)
def test_apply_task_to_dag_with_partial_configuration(self):
    conf = {
        'pipeline': 'my_pipeline',
        'metrics': {'namespace': 'StartJobNameSpace'}
    }
    dag = dag_test_utils.create_dag()

    task0 = job_start.JobStartTask(dag, 'my_start_pipeline', None, conf, 'all_success')
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
    self.assertEqual(dag_task0.backends, [])
def test_apply_task_to_dag_missing_metrics(self):
    conf = {'pipeline': 'my_pipeline'}
    dag = dag_test_utils.create_dag()

    task0 = job_end.JobEndTask(dag, {}, {'pipeline': 'my_end_pipeline'}, conf, None, 'all_done')
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, '')
    self.assertEqual(dag_task0.backends, [])
    self.assertEqual(dag_task0.trigger_rule, 'all_done')
def test_apply_task_to_dag_with_partial_configuration(self):
    dag = dag_test_utils.create_dag()

    task0 = job_end.JobEndTask(
        dag,
        {'metrics': {
            'namespace': 'EndJobNameSpace'
        }},
        {'pipeline': 'my_end_pipeline'},
        {},
        None,
        'all_done')
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, 'EndJobNameSpace')
    self.assertEqual(dag_task0.backends, [])
    self.assertEqual(dag_task0.trigger_rule, 'all_done')
def test_apply_task_to_dag(self):
    # TODO: elaborate tests
    dag = dag_test_utils.create_dag()
    task_id = 'my_task'

    config = self.__create_conf(task_id)

    task0 = python.PythonTask(dag, 'my_pipeline', None, config, 'all_success')
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertIsInstance(dag_task0, KubernetesPodOperatorWithInputAndOutput)
    self.assertEqual(dag_task0.task_id, task_id)
def setUp(self) -> None: self.dag = dag_test_utils.create_dag() self.dag.context = { 'ti': self.dag, 'ts': datetime.now().timestamp(), 'execution_date': datetime.now().timestamp() } self.dag.get_dagrun = MagicMock() self.cluster_name = "liminal-cluster-for-tests" self.config = { 'task': 'create_emr', 'type': 'create_cloudformation_stack', 'description': 'create emr', 'stack_name': self.cluster_name, 'properties': { 'OnFailure': 'DO_NOTHING', 'TimeoutInMinutes': 25, 'Capabilities': ['CAPABILITY_NAMED_IAM'], 'TemplateURL': 'https://s3.amazonaws.com/liminal-tests/emr_cluster_creation.yml', 'Parameters': { 'Environment': 'Staging', 'OwnerTeam': 'liminal-team', 'Tenancy': 'Ephemeral', 'MasterServerCount': '1', 'CoreServerCount': '1', 'EmrApplicationRelease': '5.28', 'InstanceTypeMaster': 'm5.xlarge', 'InstanceTypeCore': 'm5.xlarge' } } } self.create_cloudformation_task = \ CreateCloudFormationStackTask( self.config['task'], self.dag, [], trigger_rule='all_success', liminal_config={}, pipeline_config={}, task_config=self.config ) airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag( task=self.create_cloudformation_task)
def setUp(self) -> None:
    self.run_job_flow_args = dict(
        Instances={
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "Placement": {
                "AvailabilityZone": "us-east-1"
            },
            "SlaveInstanceType": "c3.xlarge",
        },
        JobFlowRole="EMR_EC2_DefaultRole",
        LogUri="s3://liminal/log",
        Name="test-emr-cluster",
        ServiceRole="EMR_DefaultRole",
        VisibleToAllUsers=True)

    self.client = boto3.client("emr", region_name="us-east-1")

    args = deepcopy(self.run_job_flow_args)
    self.cluster_id = self.client.run_job_flow(**args)["JobFlowId"]

    self.dag = dag_test_utils.create_dag()
    self.dag.context = DummyDag(dag_id=self.dag.dag_id, task_id="").context

    self.executor_name = 'test-emr-cluster'
    executor_config = {
        'executor': self.executor_name,
        'cluster_name': self.executor_name,
        'aws_conn_id': 'us-east-1',
        'type': 'emr',
        'properties': {
            'ActionOnFailure': 'CONTINUE'
        }
    }

    self.hadoop_task = MagicMock(spec=hadoop.HadoopTask)
    self.hadoop_task.get_runnable_command.return_value = [
        'spark-submit', 'test', 'params', '--param'
    ]
    self.hadoop_task.task_id = 'spark-task'
    self.hadoop_task.dag = self.dag
    self.hadoop_task.trigger_rule = 'all_done'
    self.hadoop_task.parent = None

    self.emr = EMRExecutor(self.executor_name,
                           liminal_config={},
                           executor_config=executor_config)
def test_apply_task_to_dag(self):
    conf = {
        'pipeline': 'my_pipeline',
        'metrics': {
            'namespace': 'StartJobNameSpace',
            'backends': ['cloudwatch']
        },
    }
    dag = dag_test_utils.create_dag()

    task0 = job_start.JobStartTask(dag, 'my_start_pipeline', None, conf, 'all_success')
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
    self.assertEqual(dag_task0.backends, ['cloudwatch'])
    self.assertEqual(dag_task0.task_id, 'start')
def test_apply_task_to_dag(self):
    dag = dag_test_utils.create_dag()

    task0 = job_end.JobEndTask(
        dag,
        {
            'metrics': {
                'namespace': 'EndJobNameSpace',
                'backends': ['cloudwatch']
            }
        },
        {'pipeline': 'my_end_pipeline'},
        {},
        None,
        'all_done')
    task0.apply_task_to_dag()

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, 'EndJobNameSpace')
    self.assertEqual(dag_task0.backends, ['cloudwatch'])
    self.assertEqual(dag_task0.task_id, 'end')
def test_apply_task_to_dag_missing_metrics(self): conf = {'pipeline': 'my_pipeline'} dag = dag_test_utils.create_dag() task0 = job_end.JobEndTask( task_id="job_end", dag=dag, pipeline_config={'pipeline': 'my_end_pipeline'}, liminal_config=conf, parent=None, trigger_rule='all_done', task_config={}) task0.apply_task_to_dag() self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, '') self.assertEqual(dag_task0.backends, []) self.assertEqual(dag_task0.trigger_rule, 'all_done')
def test_apply_task_to_dag_with_partial_configuration(self):
    dag = dag_test_utils.create_dag()

    task0 = job_start.JobStartTask(
        task_id="start_task",
        dag=dag,
        liminal_config={'metrics': {
            'namespace': 'StartJobNameSpace'
        }},
        pipeline_config={'pipeline': 'my_start_pipeline'},
        task_config={},
        parent=None,
        trigger_rule='all_success',
    )
    airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(task=task0)

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
    self.assertEqual(dag_task0.backends, [])
def test_apply_task_to_dag_missing_metrics(self):
    conf = {'pipeline': 'my_pipeline'}
    dag = dag_test_utils.create_dag()

    task0 = job_start.JobStartTask(
        task_id="start_task",
        dag=dag,
        liminal_config=conf,
        task_config={},
        pipeline_config={'pipeline': 'my_end_pipeline'},
        parent=None,
        trigger_rule='all_success')
    airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(task=task0)

    self.assertEqual(len(dag.tasks), 1)
    dag_task0 = dag.tasks[0]

    self.assertEqual(dag_task0.namespace, '')
    self.assertEqual(dag_task0.backends, [])
    self.assertEqual(dag_task0.trigger_rule, 'all_success')
def test_spark_on_k8s(self):
    volume_util.delete_local_volume(self._VOLUME_NAME)
    os.environ['TMPDIR'] = '/tmp'
    self.temp_dir = tempfile.mkdtemp()
    self.liminal_config = {
        'volumes': [
            {
                'volume': self._VOLUME_NAME,
                'local': {
                    'path': self.temp_dir.replace(
                        "/var/folders",
                        "/private/var/folders")
                }
            }
        ]
    }
    volume_util.create_local_volumes(self.liminal_config, None)

    # build spark image
    liminal_apps_builder.build_liminal_apps(
        os.path.join(os.path.dirname(__file__), '../../apps/test_spark_app'))

    outputs_dir = os.path.join(self.temp_dir, 'outputs')

    task_config = {
        'task': "my_spark_task",
        'image': "my_spark_image",
        'application_source': 'wordcount.py',
        'application_arguments': ['words.txt', '/mnt/vol1/outputs/'],
        'env_vars': {},
        'mounts': [
            {
                'mount': 'mymount',
                'volume': self._VOLUME_NAME,
                'path': '/mnt/vol1'
            }
        ]
    }

    dag = dag_test_utils.create_dag()

    task1 = SparkTask(
        task_id="my_spark_task",
        dag=dag,
        liminal_config=self.liminal_config,
        pipeline_config={'pipeline': 'my_pipeline'},
        task_config=task_config,
        parent=None,
        trigger_rule='all_success')

    executor = KubernetesPodExecutor(
        task_id='k8s',
        liminal_config=self.liminal_config,
        executor_config={
            'executor': 'k8s',
            'name': 'mypod'
        })
    executor.apply_task_to_dag(task=task1)

    for task in dag.tasks:
        print(f'Executing task {task.task_id}')
        task.execute(DummyDag('my_dag', task.task_id).context)

    expected_output = '{"word":"my","count":1}\n' \
                      '{"word":"first","count":1}\n' \
                      '{"word":"liminal","count":1}\n' \
                      '{"word":"spark","count":1}\n' \
                      '{"word":"task","count":1}\n'.split("\n")

    actual = ''
    for filename in os.listdir(outputs_dir):
        if filename.endswith(".json"):
            with open(os.path.join(outputs_dir, filename)) as f:
                actual = f.read()

    self.assertEqual(actual.split("\n"), expected_output)