示例#1
0
    def test_bigquery_operator_extra_serialized_field_when_multiple_queries(self):
        with self.dag:
            BigQueryExecuteQueryOperator(
                task_id=TASK_ID,
                sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
            )
        serialized_dag = SerializedDAG.to_dict(self.dag)
        self.assertIn("sql", serialized_dag["dag"]["tasks"][0])

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict[TASK_ID]
        self.assertEqual(getattr(simple_task, "sql"),
                         ['SELECT * FROM test_table', 'SELECT * FROM test_table2'])

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################

        # Check Serialized version of operator link
        self.assertEqual(
            serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
            [
                {'airflow.gcp.operators.bigquery.BigQueryConsoleIndexableLink': {'index': 0}},
                {'airflow.gcp.operators.bigquery.BigQueryConsoleIndexableLink': {'index': 1}}
            ]
        )

        # Check DeSerialized version of operator link
        self.assertIsInstance(list(simple_task.operator_extra_links)[0], BigQueryConsoleIndexableLink)

        ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
        job_id = ['123', '45']
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            {'BigQuery Console #1', 'BigQuery Console #2'},
            simple_task.operator_extra_link_dict.keys()
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=123',
            simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=45',
            simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
        )
    def test_extra_serialized_field_and_operator_links(self):
        """
        Assert extra field exists & OperatorLinks defined in Plugins and inbuilt Operator Links.

        This tests also depends on GoogleLink() registered as a plugin
        in tests/plugins/test_plugin.py

        The function tests that if extra operator links are registered in plugin
        in ``operator_extra_links`` and the same is also defined in
        the Operator in ``BaseOperator.operator_extra_links``, it has the correct
        extra link.
        """
        test_date = datetime(2019, 8, 1)
        dag = DAG(dag_id='simple_dag', start_date=test_date)
        CustomOperator(task_id='simple_task', dag=dag, bash_command="true")

        serialized_dag = SerializedDAG.to_dict(dag)
        assert "bash_command" in serialized_dag["dag"]["tasks"][0]

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict["simple_task"]
        assert getattr(simple_task, "bash_command") == "true"

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################
        # Check Serialized version of operator link only contains the inbuilt Op Link
        assert serialized_dag["dag"]["tasks"][0]["_operator_extra_links"] == [{
            'tests.test_utils.mock_operators.CustomOpLink': {}
        }]

        # Test all the extra_links are set
        assert set(simple_task.extra_links) == {
            'Google Custom', 'airflow', 'github', 'google'
        }

        ti = TaskInstance(task=simple_task, execution_date=test_date)
        ti.xcom_push('search_query', "dummy_value_1")

        # Test Deserialized inbuilt link
        custom_inbuilt_link = simple_task.get_extra_links(
            test_date, CustomOpLink.name)
        assert 'http://google.com/custom_base_link?search=dummy_value_1' == custom_inbuilt_link

        # Test Deserialized link registered via Airflow Plugin
        google_link_from_plugin = simple_task.get_extra_links(
            test_date, GoogleLink.name)
        assert "https://www.google.com" == google_link_from_plugin
示例#3
0
    def test_templated_fields_exist_in_serialized_dag(self, templated_field, expected_field):
        """
        Test that templated_fields exists for all Operators in Serialized DAG

        Since we don't want to inflate arbitrary python objects (it poses a RCE/security risk etc.)
        we want check that non-"basic" objects are turned in to strings after deserializing.
        """

        dag = DAG("test_serialized_template_fields", start_date=datetime(2019, 8, 1))
        with dag:
            BashOperator(task_id="test", bash_command=templated_field)

        serialized_dag = SerializedDAG.to_dict(dag)
        deserialized_dag = SerializedDAG.from_dict(serialized_dag)
        deserialized_test_task = deserialized_dag.task_dict["test"]
        assert expected_field == getattr(deserialized_test_task, "bash_command")
示例#4
0
    def test_task_params_roundtrip(self, val, expected_val):
        """
        Test that params work both on Serialized DAGs & Tasks
        """
        dag = DAG(dag_id='simple_dag')
        BaseOperator(task_id='simple_task', dag=dag, params=val, start_date=datetime(2019, 8, 1))

        serialized_dag = SerializedDAG.to_dict(dag)
        if val:
            assert "params" in serialized_dag["dag"]["tasks"][0]
        else:
            assert "params" not in serialized_dag["dag"]["tasks"][0]

        deserialized_dag = SerializedDAG.from_dict(serialized_dag)
        deserialized_simple_task = deserialized_dag.task_dict["simple_task"]
        assert expected_val == deserialized_simple_task.params
    def test_dag_params_roundtrip(self, val, expected_val):
        """
        Test that params work both on Serialized DAGs & Tasks
        """
        dag = DAG(dag_id='simple_dag', params=val)
        BaseOperator(task_id='simple_task', dag=dag, start_date=datetime(2019, 8, 1))

        serialized_dag = SerializedDAG.to_dict(dag)
        if val:
            self.assertIn("params", serialized_dag["dag"])
        else:
            self.assertNotIn("params", serialized_dag["dag"])

        deserialized_dag = SerializedDAG.from_dict(serialized_dag)
        deserialized_simple_task = deserialized_dag.task_dict["simple_task"]
        self.assertEqual(expected_val, deserialized_dag.params)
        self.assertEqual(expected_val, deserialized_simple_task.params)
示例#6
0
    def test_console_extra_link_serialized_field(self):
        with self.dag:
            training_op = MLEngineStartTrainingJobOperator(
                **self.TRAINING_DEFAULT_ARGS)
        serialized_dag = SerializedDAG.to_dict(self.dag)
        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict[self.TRAINING_DEFAULT_ARGS['task_id']]

        # Check Serialized version of operator link
        self.assertEqual(
            serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
            [{
                "airflow.providers.google.cloud.operators.mlengine.AIPlatformConsoleLink":
                {}
            }],
        )

        # Check DeSerialized version of operator link
        self.assertIsInstance(
            list(simple_task.operator_extra_links)[0], AIPlatformConsoleLink)

        job_id = self.TRAINING_DEFAULT_ARGS['job_id']
        project_id = self.TRAINING_DEFAULT_ARGS['project_id']
        gcp_metadata = {
            "job_id": job_id,
            "project_id": project_id,
        }

        ti = TaskInstance(
            task=training_op,
            execution_date=DEFAULT_DATE,
        )
        ti.xcom_push(key='gcp_metadata', value=gcp_metadata)

        self.assertEqual(
            f"https://console.cloud.google.com/ai-platform/jobs/{job_id}?project={project_id}",
            simple_task.get_extra_links(DEFAULT_DATE,
                                        AIPlatformConsoleLink.name),
        )

        self.assertEqual(
            '',
            simple_task.get_extra_links(datetime.datetime(2019, 1, 1),
                                        AIPlatformConsoleLink.name),
        )
    def test_deserialization_end_date(self, dag_end_date, task_end_date,
                                      expected_task_end_date):
        dag = DAG(dag_id='simple_dag',
                  start_date=datetime(2019, 8, 1),
                  end_date=dag_end_date)
        BaseOperator(task_id='simple_task', dag=dag, end_date=task_end_date)

        serialized_dag = SerializedDAG.to_dict(dag)
        if not task_end_date or dag_end_date <= task_end_date:
            # If dag.end_date < task.end_date -> task.end_date=dag.end_date
            # because of the logic in dag.add_task()
            self.assertNotIn("end_date", serialized_dag["dag"]["tasks"][0])
        else:
            self.assertIn("end_date", serialized_dag["dag"]["tasks"][0])

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict["simple_task"]
        self.assertEqual(simple_task.end_date, expected_task_end_date)
    def test_event_op_dag_read_write(self):
        class TestHandler(EventMetHandler):
            def met(self, ti: TaskInstance, ts: TaskState) -> TaskAction:
                return TaskAction.START

        now = timezone.utcnow()
        dag_id = 'test_add_taskstate_0'
        dag = DAG(dag_id=dag_id, start_date=now)
        task0 = DummyOperator(task_id='backfill_task_0', owner='test', dag=dag)
        task0.add_event_dependency('key', "EVENT")
        task0.set_event_met_handler(TestHandler())
        SDM.write_dag(dag)
        with db.create_session() as session:
            sdag = session.query(SDM).first()
            dag = SerializedDAG.from_dict(sdag.data)
        self.assertEqual(dag_id, dag.dag_id)
        self.assertEqual(
            1, len(dag.task_dict["backfill_task_0"].event_dependencies()))
    def test_deserialization_start_date(self, dag_start_date, task_start_date,
                                        expected_task_start_date):
        dag = DAG(dag_id='simple_dag', start_date=dag_start_date)
        BaseOperator(task_id='simple_task',
                     dag=dag,
                     start_date=task_start_date)

        serialized_dag = SerializedDAG.to_dict(dag)
        if not task_start_date or dag_start_date >= task_start_date:
            # If dag.start_date > task.start_date -> task.start_date=dag.start_date
            # because of the logic in dag.add_task()
            assert "start_date" not in serialized_dag["dag"]["tasks"][0]
        else:
            assert "start_date" in serialized_dag["dag"]["tasks"][0]

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict["simple_task"]
        assert simple_task.start_date == expected_task_start_date
示例#10
0
    def test_deserialization_schedule_interval(self, serialized_schedule_interval, expected):
        serialized = {
            "__version": 1,
            "dag": {
                "default_args": {"__type": "dict", "__var": {}},
                "_dag_id": "simple_dag",
                "fileloc": __file__,
                "tasks": [],
                "timezone": "UTC",
                "schedule_interval": serialized_schedule_interval,
            },
        }

        SerializedDAG.validate_schema(serialized)

        dag = SerializedDAG.from_dict(serialized)

        self.assertEqual(dag.schedule_interval, expected)
示例#11
0
    def test_bigquery_operator_extra_serialized_field_when_single_query(self):
        with self.dag:
            BigQueryExecuteQueryOperator(
                task_id=TASK_ID,
                sql='SELECT * FROM test_table',
            )
        serialized_dag = SerializedDAG.to_dict(self.dag)
        self.assertIn("sql", serialized_dag["dag"]["tasks"][0])

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict[TASK_ID]
        self.assertEqual(getattr(simple_task, "sql"),
                         'SELECT * FROM test_table')

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################

        # Check Serialized version of operator link
        self.assertEqual(
            serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
            [{
                'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleLink':
                {}
            }],
        )

        # Check DeSerialized version of operator link
        self.assertIsInstance(
            list(simple_task.operator_extra_links)[0], BigQueryConsoleLink)

        ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
        ti.xcom_push('job_id', 12345)

        # check for positive case
        url = simple_task.get_extra_links(DEFAULT_DATE,
                                          BigQueryConsoleLink.name)
        self.assertEqual(url,
                         'https://console.cloud.google.com/bigquery?j=12345')

        # check for negative case
        url2 = simple_task.get_extra_links(datetime(2017, 1, 2),
                                           BigQueryConsoleLink.name)
        self.assertEqual(url2, '')
示例#12
0
    def test_dag_on_success_callback_roundtrip(self, passed_success_callback, expected_value):
        """
        Test that when on_success_callback is passed to the DAG, has_on_success_callback is stored
        in Serialized JSON blob. And when it is de-serialized dag.has_on_success_callback is set to True.

        When the callback is not set, has_on_success_callback should not be stored in Serialized blob
        and so default to False on de-serialization
        """
        dag = DAG(dag_id='test_dag_on_success_callback_roundtrip', **passed_success_callback)
        BaseOperator(task_id='simple_task', dag=dag, start_date=datetime(2019, 8, 1))

        serialized_dag = SerializedDAG.to_dict(dag)
        if expected_value:
            assert "has_on_success_callback" in serialized_dag["dag"]
        else:
            assert "has_on_success_callback" not in serialized_dag["dag"]

        deserialized_dag = SerializedDAG.from_dict(serialized_dag)

        assert deserialized_dag.has_on_success_callback is expected_value
示例#13
0
    def test_dagrun_update_state_with_handle_callback_failure(self):
        def on_failure_callable(context):
            self.assertEqual(
                context['dag_run'].dag_id,
                'test_dagrun_update_state_with_handle_callback_failure')

        dag = DAG(
            dag_id='test_dagrun_update_state_with_handle_callback_failure',
            start_date=datetime.datetime(2017, 1, 1),
            on_failure_callback=on_failure_callable,
        )
        dag_task1 = DummyOperator(task_id='test_state_succeeded1', dag=dag)
        dag_task2 = DummyOperator(task_id='test_state_failed2', dag=dag)
        dag_task1.set_downstream(dag_task2)

        initial_task_states = {
            'test_state_succeeded1': State.SUCCESS,
            'test_state_failed2': State.FAILED,
        }

        # Scheduler uses Serialized DAG -- so use that instead of the Actual DAG
        dag = SerializedDAG.from_dict(SerializedDAG.to_dict(dag))

        dag_run = self.create_dag_run(dag=dag,
                                      state=State.RUNNING,
                                      task_states=initial_task_states)

        _, callback = dag_run.update_state(execute_callbacks=False)
        self.assertEqual(State.FAILED, dag_run.state)
        # Callbacks are not added until handle_callback = False is passed to dag_run.update_state()

        assert callback == DagCallbackRequest(
            full_filepath=dag_run.dag.fileloc,
            dag_id="test_dagrun_update_state_with_handle_callback_failure",
            execution_date=dag_run.execution_date,
            is_failure_callback=True,
            msg="task_failure",
        )
示例#14
0
    def test_extra_serialized_field_and_multiple_operator_links(self):
        """
        Assert extra field exists & OperatorLinks defined in Plugins and inbuilt Operator Links.

        This tests also depends on GoogleLink() registered as a plugin
        in tests/plugins/test_plugin.py

        The function tests that if extra operator links are registered in plugin
        in ``operator_extra_links`` and the same is also defined in
        the Operator in ``BaseOperator.operator_extra_links``, it has the correct
        extra link.
        """
        test_date = datetime(2019, 8, 1)
        dag = DAG(dag_id='simple_dag', start_date=test_date)
        CustomOperator(task_id='simple_task',
                       dag=dag,
                       bash_command=["echo", "true"])

        serialized_dag = SerializedDAG.to_dict(dag)
        self.assertIn("bash_command", serialized_dag["dag"]["tasks"][0])

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict["simple_task"]
        self.assertEqual(getattr(simple_task, "bash_command"),
                         ["echo", "true"])

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################
        # Check Serialized version of operator link only contains the inbuilt Op Link
        self.assertEqual(
            serialized_dag["dag"]["tasks"][0]["_operator_extra_links"], [
                {
                    'tests.test_utils.mock_operators.CustomBaseIndexOpLink': {
                        'index': 0
                    }
                },
                {
                    'tests.test_utils.mock_operators.CustomBaseIndexOpLink': {
                        'index': 1
                    }
                },
            ])

        # Test all the extra_links are set
        self.assertCountEqual(simple_task.extra_links, [
            'BigQuery Console #1', 'BigQuery Console #2', 'airflow', 'github',
            'google'
        ])

        ti = TaskInstance(task=simple_task, execution_date=test_date)
        ti.xcom_push('search_query', ["dummy_value_1", "dummy_value_2"])

        # Test Deserialized inbuilt link #1
        custom_inbuilt_link = simple_task.get_extra_links(
            test_date, "BigQuery Console #1")
        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=dummy_value_1',
            custom_inbuilt_link)

        # Test Deserialized inbuilt link #2
        custom_inbuilt_link = simple_task.get_extra_links(
            test_date, "BigQuery Console #2")
        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=dummy_value_2',
            custom_inbuilt_link)

        # Test Deserialized link registered via Airflow Plugin
        google_link_from_plugin = simple_task.get_extra_links(
            test_date, GoogleLink.name)
        self.assertEqual("https://www.google.com", google_link_from_plugin)
示例#15
0
 def _process_message(self, message):
     self.log.debug("Received message of type %s", type(message).__name__)
     if isinstance(message, DagParsingStat):
         self._sync_metadata(message)
     else:
         self._collected_dag_buffer.append(SerializedDAG.from_dict(message))