Example #1
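The snippets on this page are unittest methods collected from the Apache Liminal test suite and are shown without their module preambles. A representative set of imports they rely on might look like the following (the exact module paths are reconstructed from the apache/incubator-liminal source layout, so treat them as assumptions):

import os
import tempfile
import unittest
from copy import deepcopy
from datetime import datetime
from unittest.mock import MagicMock

import boto3

# Liminal task modules exercised below; paths are assumed, not quoted from the source.
from liminal.runners.airflow.tasks import job_end, job_start, python
from tests.runners.airflow import dag_test_utils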
    def test_apply_task_to_dag(self):
        dag = dag_test_utils.create_dag()

        task0 = job_end.JobEndTask(
            task_id='job_end',
            dag=dag,
            pipeline_config={'pipeline': 'my_end_pipeline'},
            task_config={},
            parent=None,
            trigger_rule='all_done',
            liminal_config={
                'metrics': {
                    'namespace': 'EndJobNameSpace',
                    'backends': ['cloudwatch']
                }
            })
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'EndJobNameSpace')
        self.assertEqual(dag_task0.backends, ['cloudwatch'])

        self.assertEqual(dag_task0.task_id, 'end')
Example #2
    def test_apply_task_to_dag(self):
        dag = dag_test_utils.create_dag()

        task0 = job_start.JobStartTask(
            task_id="start_task",
            dag=dag,
            liminal_config={
                'metrics': {
                    'namespace': 'StartJobNameSpace',
                    'backends': ['cloudwatch']
                }
            },
            pipeline_config={'pipeline': 'my_start_pipeline'},
            task_config={},
            parent=None,
            trigger_rule='all_success')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
        self.assertEqual(dag_task0.backends, ['cloudwatch'])

        self.assertEqual(dag_task0.task_id, 'start')
Example #3
    def setUp(self) -> None:
        self.dag = dag_test_utils.create_dag()
        self.dag.context = {
            'ti': self.dag,
            'ts': datetime.now().timestamp(),
            'execution_date': datetime.now().timestamp()
        }
        self.dag.get_dagrun = MagicMock()

        self.cluster_name = "liminal-cluster-for-tests"

        self.delete_cloudformation_task = \
            DeleteCloudFormationStackTask(
                'delete-emr',
                self.dag,
                [],
                trigger_rule='all_success',
                liminal_config={},
                pipeline_config={},
                task_config={
                    'stack_name': self.cluster_name
                }
            )

        self.delete_cloudformation_task.apply_task_to_dag()
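The setUp above applies the delete-stack task to the DAG but contains no assertions of its own; a companion test in the same class could verify the wiring along these lines (a sketch only; the assertions are assumptions, not taken from the source):

    def test_apply_task_to_dag(self):
        # setUp should have registered exactly one operator on the DAG
        self.assertEqual(len(self.dag.tasks), 1)
        self.assertEqual(self.dag.tasks[0].trigger_rule, 'all_success')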
Example #4
    def test_apply_task_to_dag(self):
        dag = dag_test_utils.create_dag()

        task0 = self.__create_python_task(dag,
                                          'my_input_task',
                                          None,
                                          'my_python_task_img',
                                          'python -u write_inputs.py',
                                          env_vars={
                                              'NUM_FILES': 10,
                                              'NUM_SPLITS': 3
                                          })
        task0.apply_task_to_dag()

        task1 = self.__create_python_task(dag,
                                          'my_output_task',
                                          dag.tasks[0],
                                          'my_parallelized_python_task_img',
                                          'python -u write_outputs.py',
                                          executors=3)
        task1.apply_task_to_dag()

        for task in dag.tasks:
            print(f'Executing task {task.task_id}')
            task.execute(DummyDag('my_dag', task.task_id).context)

        inputs_dir = os.path.join(self.temp_dir, 'inputs')
        outputs_dir = os.path.join(self.temp_dir, 'outputs')

        self.assertListEqual(sorted(os.listdir(self.temp_dir)),
                             sorted(['outputs', 'inputs']))

        inputs_dir_contents = sorted(os.listdir(inputs_dir))

        self.assertListEqual(inputs_dir_contents, ['0', '1', '2'])

        self.assertListEqual(
            sorted(os.listdir(os.path.join(inputs_dir, '0'))),
            ['input0.json', 'input3.json', 'input6.json', 'input9.json'])

        self.assertListEqual(sorted(os.listdir(os.path.join(inputs_dir, '1'))),
                             ['input1.json', 'input4.json', 'input7.json'])

        self.assertListEqual(sorted(os.listdir(os.path.join(inputs_dir, '2'))),
                             ['input2.json', 'input5.json', 'input8.json'])

        self.assertListEqual(sorted(os.listdir(outputs_dir)), [
            'output0.txt', 'output1.txt', 'output2.txt', 'output3.txt',
            'output4.txt', 'output5.txt', 'output6.txt', 'output7.txt',
            'output8.txt', 'output9.txt'
        ])

        for filename in os.listdir(outputs_dir):
            with open(os.path.join(outputs_dir, filename)) as f:
                expected_file_content = filename.replace('output', 'myval').replace('.txt', '')
                self.assertEqual(f.read(), expected_file_content)
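`__create_python_task` is a private helper of the test class and is not shown here. From its call sites above, a plausible reconstruction looks like this (the signature, the task_config keys, and the self.liminal_config attribute are all assumptions):

    def __create_python_task(self, dag, task_id, parent, image, cmd,
                             env_vars=None, executors=None):
        # assemble a Liminal task config from the helper's arguments
        task_config = {
            'task': task_id,
            'image': image,
            'cmd': cmd,
            'env_vars': env_vars if env_vars is not None else {},
        }
        if executors is not None:
            task_config['executors'] = executors
        return python.PythonTask(
            task_id=task_id,
            dag=dag,
            parent=parent,
            trigger_rule='all_success',
            liminal_config=self.liminal_config,
            pipeline_config={'pipeline': 'my_pipeline'},
            task_config=task_config)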
Example #5
    def test_apply_task_to_dag_with_partial_configuration(self):
        conf = {'pipeline': 'my_pipeline', 'metrics': {'namespace': 'StartJobNameSpace'}}
        dag = dag_test_utils.create_dag()

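        # Older positional JobStartTask signature; the arguments presumably
        # map to (dag, pipeline, parent, liminal_config, trigger_rule).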
        task0 = job_start.JobStartTask(dag, 'my_start_pipeline', None, conf, 'all_success')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
        self.assertEqual(dag_task0.backends, [])
Example #6
    def test_apply_task_to_dag_missing_metrics(self):
        conf = {'pipeline': 'my_pipeline'}
        dag = dag_test_utils.create_dag()

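        # Older positional JobEndTask signature; presumably
        # (dag, liminal_config, pipeline_config, task_config, parent, trigger_rule).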
        task0 = job_end.JobEndTask(dag, {}, {'pipeline': 'my_end_pipeline'}, conf, None, 'all_done')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, '')
        self.assertEqual(dag_task0.backends, [])
        self.assertEqual(dag_task0.trigger_rule, 'all_done')
Example #7
    def test_apply_task_to_dag_with_partial_configuration(self):
        dag = dag_test_utils.create_dag()

        task0 = job_end.JobEndTask(
            dag,
            {'metrics': {'namespace': 'EndJobNameSpace'}},
            {'pipeline': 'my_end_pipeline'},
            {},
            None,
            'all_done')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'EndJobNameSpace')
        self.assertEqual(dag_task0.backends, [])
        self.assertEqual(dag_task0.trigger_rule, 'all_done')
Example #8
    def test_apply_task_to_dag(self):
        # TODO: elaborate tests
        dag = dag_test_utils.create_dag()

        task_id = 'my_task'

        config = self.__create_conf(task_id)

        task0 = python.PythonTask(dag, 'my_pipeline', None, config, 'all_success')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertIsInstance(dag_task0, KubernetesPodOperatorWithInputAndOutput)
        self.assertEqual(dag_task0.task_id, task_id)
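`__create_conf` is likewise a private helper that never appears on this page. Since the resulting operator is asserted to be a KubernetesPodOperatorWithInputAndOutput, a minimal reconstruction could be (every key and value below is an assumption):

    def __create_conf(self, task_id):
        # smallest config that lets PythonTask build a pod operator
        return {
            'task': task_id,
            'image': 'my_image',
            'cmd': 'foo bar',
            'env_vars': {},
        }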
Example #9
    def setUp(self) -> None:
        self.dag = dag_test_utils.create_dag()
        self.dag.context = {
            'ti': self.dag,
            'ts': datetime.now().timestamp(),
            'execution_date': datetime.now().timestamp()
        }
        self.dag.get_dagrun = MagicMock()

        self.cluster_name = "liminal-cluster-for-tests"
        self.config = {
            'task': 'create_emr',
            'type': 'create_cloudformation_stack',
            'description': 'create emr',
            'stack_name': self.cluster_name,
            'properties': {
                'OnFailure': 'DO_NOTHING',
                'TimeoutInMinutes': 25,
                'Capabilities': ['CAPABILITY_NAMED_IAM'],
                'TemplateURL': 'https://s3.amazonaws.com/liminal-tests/emr_cluster_creation.yml',
                'Parameters': {
                    'Environment': 'Staging',
                    'OwnerTeam': 'liminal-team',
                    'Tenancy': 'Ephemeral',
                    'MasterServerCount': '1',
                    'CoreServerCount': '1',
                    'EmrApplicationRelease': '5.28',
                    'InstanceTypeMaster': 'm5.xlarge',
                    'InstanceTypeCore': 'm5.xlarge'
                }
            }
        }

        self.create_cloudformation_task = \
            CreateCloudFormationStackTask(
                self.config['task'],
                self.dag,
                [],
                trigger_rule='all_success',
                liminal_config={},
                pipeline_config={},
                task_config=self.config
            )

        airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(
            task=self.create_cloudformation_task)
Example #10
    def setUp(self) -> None:
        self.run_job_flow_args = dict(
            Instances={
                "InstanceCount": 1,
                "KeepJobFlowAliveWhenNoSteps": True,
                "MasterInstanceType": "c3.medium",
                "Placement": {"AvailabilityZone": "us-east-1"},
                "SlaveInstanceType": "c3.xlarge",
            },
            JobFlowRole="EMR_EC2_DefaultRole",
            LogUri="s3://liminal/log",
            Name="test-emr-cluster",
            ServiceRole="EMR_DefaultRole",
            VisibleToAllUsers=True)

        self.client = boto3.client("emr", region_name="us-east-1")

        args = deepcopy(self.run_job_flow_args)

        self.cluster_id = self.client.run_job_flow(**args)["JobFlowId"]

        self.dag = dag_test_utils.create_dag()
        self.dag.context = DummyDag(dag_id=self.dag.dag_id, task_id="").context
        self.executor_name = 'test-emr-cluster'
        executor_config = {
            'executor': self.executor_name,
            'cluster_name': self.executor_name,
            'aws_conn_id': 'us-east-1',
            'type': 'emr',
            'properties': {
                'ActionOnFailure': 'CONTINUE'
            }
        }
        self.hadoop_task = MagicMock(spec=hadoop.HadoopTask)
        self.hadoop_task.get_runnable_command.return_value = [
            'spark-submit', 'test', 'params', '--param'
        ]
        self.hadoop_task.task_id = 'spark-task'
        self.hadoop_task.dag = self.dag
        self.hadoop_task.trigger_rule = 'all_done'
        self.hadoop_task.parent = None

        self.emr = EMRExecutor(self.executor_name,
                               liminal_config={},
                               executor_config=executor_config)
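The setUp above calls boto3's real EMR client, so running it would hit AWS unless the API is mocked; suites like this are typically wrapped with moto. A sketch of the presumed class decoration (`mock_emr` is the decorator name in moto 4.x, folded into `mock_aws` in moto 5.x; the class name here is an assumption):

from moto import mock_emr  # on moto >= 5: from moto import mock_aws

@mock_emr
class TestEMRExecutor(unittest.TestCase):
    def setUp(self) -> None:
        ...  # body as shown above; run_job_flow now talks to moto's in-memory EMR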
Example #11
    def test_apply_task_to_dag(self):
        conf = {
            'pipeline': 'my_pipeline',
            'metrics': {'namespace': 'StartJobNameSpace', 'backends': ['cloudwatch']},
        }

        dag = dag_test_utils.create_dag()

        task0 = job_start.JobStartTask(dag, 'my_start_pipeline', None, conf, 'all_success')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
        self.assertEqual(dag_task0.backends, ['cloudwatch'])

        self.assertEqual(dag_task0.task_id, 'start')
Example #12
    def test_apply_task_to_dag(self):
        dag = dag_test_utils.create_dag()

        task0 = job_end.JobEndTask(
            dag,
            {'metrics': {'namespace': 'EndJobNameSpace', 'backends': ['cloudwatch']}},
            {'pipeline': 'my_end_pipeline'},
            {},
            None,
            'all_done')
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'EndJobNameSpace')
        self.assertEqual(dag_task0.backends, ['cloudwatch'])

        self.assertEqual(dag_task0.task_id, 'end')
Example #13
    def test_apply_task_to_dag_missing_metrics(self):
        conf = {'pipeline': 'my_pipeline'}
        dag = dag_test_utils.create_dag()

        task0 = job_end.JobEndTask(
            task_id="job_end",
            dag=dag,
            pipeline_config={'pipeline': 'my_end_pipeline'},
            liminal_config=conf,
            parent=None,
            trigger_rule='all_done',
            task_config={})
        task0.apply_task_to_dag()

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, '')
        self.assertEqual(dag_task0.backends, [])
        self.assertEqual(dag_task0.trigger_rule, 'all_done')
Example #14
    def test_apply_task_to_dag_with_partial_configuration(self):
        dag = dag_test_utils.create_dag()

        task0 = job_start.JobStartTask(
            task_id="start_task",
            dag=dag,
            liminal_config={'metrics': {
                'namespace': 'StartJobNameSpace'
            }},
            pipeline_config={'pipeline': 'my_start_pipeline'},
            task_config={},
            parent=None,
            trigger_rule='all_success',
        )
        airflow.AirflowExecutor("airflow-executor", {},
                                {}).apply_task_to_dag(task=task0)

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, 'StartJobNameSpace')
        self.assertEqual(dag_task0.backends, [])
Example #15
    def test_apply_task_to_dag_missing_metrics(self):
        conf = {'pipeline': 'my_pipeline'}

        dag = dag_test_utils.create_dag()

        task0 = job_start.JobStartTask(
            task_id="start_task",
            dag=dag,
            liminal_config=conf,
            task_config={},
            pipeline_config={'pipeline': 'my_end_pipeline'},
            parent=None,
            trigger_rule='all_success')
        airflow.AirflowExecutor("airflow-executor", {},
                                {}).apply_task_to_dag(task=task0)

        self.assertEqual(len(dag.tasks), 1)
        dag_task0 = dag.tasks[0]

        self.assertEqual(dag_task0.namespace, '')
        self.assertEqual(dag_task0.backends, [])
        self.assertEqual(dag_task0.trigger_rule, 'all_success')
Example #16
    def test_spark_on_k8s(self):
        volume_util.delete_local_volume(self._VOLUME_NAME)
        os.environ['TMPDIR'] = '/tmp'
        self.temp_dir = tempfile.mkdtemp()
        self.liminal_config = {
            'volumes': [
                {
                    'volume': self._VOLUME_NAME,
                    'local': {
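                        # macOS: tempfile paths under /var/folders resolve to
                        # /private/var/folders, so normalize before mounting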
                        'path': self.temp_dir.replace(
                            "/var/folders",
                            "/private/var/folders"
                        )
                    }
                }
            ]
        }
        volume_util.create_local_volumes(self.liminal_config, None)

        # build spark image
        liminal_apps_builder.build_liminal_apps(
            os.path.join(os.path.dirname(__file__), '../../apps/test_spark_app'))

        outputs_dir = os.path.join(self.temp_dir, 'outputs')

        task_config = {
            'task': "my_spark_task",
            'image': "my_spark_image",
            'application_source': 'wordcount.py',
            'application_arguments': ['words.txt', '/mnt/vol1/outputs/'],
            'env_vars': {},
            'mounts': [
                {
                    'mount': 'mymount',
                    'volume': self._VOLUME_NAME,
                    'path': '/mnt/vol1'
                }
            ]
        }

        dag = dag_test_utils.create_dag()

        task1 = SparkTask(
            task_id="my_spark_task",
            dag=dag,
            liminal_config=self.liminal_config,
            pipeline_config={
                'pipeline': 'my_pipeline'
            },
            task_config=task_config,
            parent=None,
            trigger_rule='all_success')

        executor = KubernetesPodExecutor(
            task_id='k8s',
            liminal_config=self.liminal_config,
            executor_config={
                'executor': 'k8s',
                'name': 'mypod'
            }
        )
        executor.apply_task_to_dag(task=task1)

        for task in dag.tasks:
            print(f'Executing task {task.task_id}')
            task.execute(DummyDag('my_dag', task.task_id).context)

        expected_output = '{"word":"my","count":1}\n' \
                          '{"word":"first","count":1}\n' \
                          '{"word":"liminal","count":1}\n' \
                          '{"word":"spark","count":1}\n' \
                          '{"word":"task","count":1}\n'.split("\n")

        actual = ''
        for filename in os.listdir(outputs_dir):
            if filename.endswith(".json"):
                with open(os.path.join(outputs_dir, filename)) as f:
                    actual = f.read()

        self.assertEqual(actual.split("\n"), expected_output)
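Several of these tests drive operators directly via `DummyDag(...).context`, but the helper itself never appears on this page. A minimal stand-in consistent with how it is used (everything beyond the `context` attribute is an assumption) might be:

class DummyDag:
    """Test double impersonating both a DAG and a task instance,
    providing just enough context for operator.execute(context)."""

    def __init__(self, dag_id, task_id):
        self.dag_id = dag_id
        self.task_id = task_id
        self.context = {
            'dag_id': dag_id,
            'task_id': task_id,
            'ti': self,  # operators look up task-instance fields here
            'ts': datetime.now().timestamp(),
            'execution_date': datetime.now().timestamp(),
        }

    def xcom_push(self, key, value):
        # discard xcom traffic; these tests assert on filesystem side effects
        pass

    def xcom_pull(self, task_ids=None, key=None):
        return None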