def test_bigquery_operator_extra_serialized_field_when_multiple_queries(self):
    """Serialization round-trip for BigQueryExecuteQueryOperator with a list of queries.

    Verifies the ``sql`` extra field survives the round-trip and that one
    indexable console link is serialized (and deserialized) per query.
    """
    with self.dag:
        BigQueryExecuteQueryOperator(
            task_id=TASK_ID,
            sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
        )
    serialized_dag = SerializedDAG.to_dict(self.dag)
    self.assertIn("sql", serialized_dag["dag"]["tasks"][0])
    dag = SerializedDAG.from_dict(serialized_dag)
    simple_task = dag.task_dict[TASK_ID]
    self.assertEqual(
        getattr(simple_task, "sql"),
        ['SELECT * FROM test_table', 'SELECT * FROM test_table2'])

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################

    # Check Serialized version of operator link.
    # NOTE(review): the sibling single-query test serializes its link under
    # the ``airflow.providers.google.cloud.operators.bigquery`` module;
    # BigQueryConsoleIndexableLink lives in the same module, so the expected
    # path here must match it rather than the legacy ``airflow.gcp`` one.
    self.assertEqual(
        serialized_dag["dag"]["tasks"][0]["_operator_extra_links"],
        [
            {'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink':
                {'index': 0}},
            {'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink':
                {'index': 1}},
        ]
    )

    # Check DeSerialized version of operator link
    self.assertIsInstance(
        list(simple_task.operator_extra_links)[0], BigQueryConsoleIndexableLink)

    ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
    job_id = ['123', '45']
    ti.xcom_push(key='job_id', value=job_id)

    # One named console link per query in the list.
    self.assertEqual(
        {'BigQuery Console #1', 'BigQuery Console #2'},
        simple_task.operator_extra_link_dict.keys()
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=123',
        simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=45',
        simple_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
    )
def test_extra_serialized_field_and_operator_links(self):
    """
    Assert extra field exists & OperatorLinks defined in Plugins and inbuilt Operator Links.

    This test also depends on GoogleLink() registered as a plugin in
    tests/plugins/test_plugin.py.  It checks that links registered through a
    plugin's ``operator_extra_links`` coexist with the one declared on the
    operator itself via ``BaseOperator.operator_extra_links``.
    """
    execution_date = datetime(2019, 8, 1)
    source_dag = DAG(dag_id='simple_dag', start_date=execution_date)
    CustomOperator(task_id='simple_task', dag=source_dag, bash_command="true")

    serialized = SerializedDAG.to_dict(source_dag)
    assert "bash_command" in serialized["dag"]["tasks"][0]

    round_tripped = SerializedDAG.from_dict(serialized)
    task_after = round_tripped.task_dict["simple_task"]
    assert getattr(task_after, "bash_command") == "true"

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################
    # Only the inbuilt operator link is part of the serialized blob.
    assert serialized["dag"]["tasks"][0]["_operator_extra_links"] == [{
        'tests.test_utils.mock_operators.CustomOpLink': {}
    }]

    # Plugin-provided links are merged in after deserialization.
    assert set(task_after.extra_links) == {
        'Google Custom', 'airflow', 'github', 'google'}

    ti = TaskInstance(task=task_after, execution_date=execution_date)
    ti.xcom_push('search_query', "dummy_value_1")

    # Inbuilt link renders from the pushed XCom value.
    rendered_inbuilt = task_after.get_extra_links(execution_date, CustomOpLink.name)
    assert 'http://google.com/custom_base_link?search=dummy_value_1' == rendered_inbuilt

    # Link registered via an Airflow Plugin.
    rendered_plugin = task_after.get_extra_links(execution_date, GoogleLink.name)
    assert "https://www.google.com" == rendered_plugin
def test_templated_fields_exist_in_serialized_dag(self, templated_field, expected_field):
    """
    Test that templated_fields exists for all Operators in Serialized DAG.

    We never inflate arbitrary python objects on deserialization (RCE /
    security risk), so anything that is not a "basic" type must come back
    as a string after the round-trip.
    """
    source_dag = DAG("test_serialized_template_fields", start_date=datetime(2019, 8, 1))
    with source_dag:
        BashOperator(task_id="test", bash_command=templated_field)

    round_tripped = SerializedDAG.from_dict(SerializedDAG.to_dict(source_dag))
    task_after = round_tripped.task_dict["test"]
    assert expected_field == getattr(task_after, "bash_command")
def test_task_params_roundtrip(self, val, expected_val):
    """Params supplied on a task must survive the serialization round-trip."""
    source_dag = DAG(dag_id='simple_dag')
    BaseOperator(task_id='simple_task', dag=source_dag, params=val,
                 start_date=datetime(2019, 8, 1))

    serialized = SerializedDAG.to_dict(source_dag)
    serialized_task = serialized["dag"]["tasks"][0]
    # Empty / falsy params are omitted from the serialized blob entirely.
    if val:
        assert "params" in serialized_task
    else:
        assert "params" not in serialized_task

    task_after = SerializedDAG.from_dict(serialized).task_dict["simple_task"]
    assert expected_val == task_after.params
def test_dag_params_roundtrip(self, val, expected_val):
    """Params supplied on the DAG must survive the serialization round-trip."""
    source_dag = DAG(dag_id='simple_dag', params=val)
    BaseOperator(task_id='simple_task', dag=source_dag, start_date=datetime(2019, 8, 1))

    serialized = SerializedDAG.to_dict(source_dag)
    # Empty / falsy params are omitted from the serialized blob entirely.
    if val:
        self.assertIn("params", serialized["dag"])
    else:
        self.assertNotIn("params", serialized["dag"])

    dag_after = SerializedDAG.from_dict(serialized)
    task_after = dag_after.task_dict["simple_task"]
    # DAG-level params are visible both on the DAG and on its tasks.
    self.assertEqual(expected_val, dag_after.params)
    self.assertEqual(expected_val, task_after.params)
def test_console_extra_link_serialized_field(self):
    """AIPlatformConsoleLink must survive DAG serialization and render from XCom."""
    with self.dag:
        training_op = MLEngineStartTrainingJobOperator(**self.TRAINING_DEFAULT_ARGS)

    serialized = SerializedDAG.to_dict(self.dag)
    deserialized_dag = SerializedDAG.from_dict(serialized)
    deserialized_task = deserialized_dag.task_dict[self.TRAINING_DEFAULT_ARGS['task_id']]

    # Serialized form carries exactly one (parameterless) console link.
    self.assertEqual(
        serialized["dag"]["tasks"][0]["_operator_extra_links"],
        [{"airflow.providers.google.cloud.operators.mlengine.AIPlatformConsoleLink": {}}],
    )

    # Deserialized operator rebuilds the concrete link class.
    self.assertIsInstance(
        list(deserialized_task.operator_extra_links)[0], AIPlatformConsoleLink)

    job_id = self.TRAINING_DEFAULT_ARGS['job_id']
    project_id = self.TRAINING_DEFAULT_ARGS['project_id']
    ti = TaskInstance(task=training_op, execution_date=DEFAULT_DATE)
    ti.xcom_push(key='gcp_metadata', value={"job_id": job_id, "project_id": project_id})

    # Link is rendered from the metadata pushed above...
    self.assertEqual(
        f"https://console.cloud.google.com/ai-platform/jobs/{job_id}?project={project_id}",
        deserialized_task.get_extra_links(DEFAULT_DATE, AIPlatformConsoleLink.name),
    )
    # ...and is empty for an execution date with no XCom entry.
    self.assertEqual(
        '',
        deserialized_task.get_extra_links(datetime.datetime(2019, 1, 1), AIPlatformConsoleLink.name),
    )
def test_deserialization_end_date(self, dag_end_date, task_end_date, expected_task_end_date):
    """A task end_date equal to (or clipped by) the DAG's is not serialized separately."""
    source_dag = DAG(dag_id='simple_dag', start_date=datetime(2019, 8, 1),
                     end_date=dag_end_date)
    BaseOperator(task_id='simple_task', dag=source_dag, end_date=task_end_date)

    serialized = SerializedDAG.to_dict(source_dag)
    serialized_task = serialized["dag"]["tasks"][0]
    if not task_end_date or dag_end_date <= task_end_date:
        # dag.add_task() clips task.end_date to dag.end_date, so the task
        # value is redundant and gets omitted from the blob.
        self.assertNotIn("end_date", serialized_task)
    else:
        self.assertIn("end_date", serialized_task)

    task_after = SerializedDAG.from_dict(serialized).task_dict["simple_task"]
    self.assertEqual(task_after.end_date, expected_task_end_date)
def test_event_op_dag_read_write(self):
    """Event dependencies on a task survive writing and reading back a serialized DAG."""
    class TestHandler(EventMetHandler):
        def met(self, ti: TaskInstance, ts: TaskState) -> TaskAction:
            return TaskAction.START

    dag_id = 'test_add_taskstate_0'
    source_dag = DAG(dag_id=dag_id, start_date=timezone.utcnow())
    event_task = DummyOperator(task_id='backfill_task_0', owner='test', dag=source_dag)
    event_task.add_event_dependency('key', "EVENT")
    event_task.set_event_met_handler(TestHandler())

    SDM.write_dag(source_dag)
    with db.create_session() as session:
        stored = session.query(SDM).first()
        restored = SerializedDAG.from_dict(stored.data)
        self.assertEqual(dag_id, restored.dag_id)
        self.assertEqual(
            1, len(restored.task_dict["backfill_task_0"].event_dependencies()))
def test_deserialization_start_date(self, dag_start_date, task_start_date, expected_task_start_date):
    """A task start_date equal to (or raised by) the DAG's is not serialized separately."""
    source_dag = DAG(dag_id='simple_dag', start_date=dag_start_date)
    BaseOperator(task_id='simple_task', dag=source_dag, start_date=task_start_date)

    serialized = SerializedDAG.to_dict(source_dag)
    serialized_task = serialized["dag"]["tasks"][0]
    if not task_start_date or dag_start_date >= task_start_date:
        # dag.add_task() raises task.start_date to dag.start_date, so the
        # task value is redundant and gets omitted from the blob.
        assert "start_date" not in serialized_task
    else:
        assert "start_date" in serialized_task

    task_after = SerializedDAG.from_dict(serialized).task_dict["simple_task"]
    assert task_after.start_date == expected_task_start_date
def test_deserialization_schedule_interval(self, serialized_schedule_interval, expected):
    """Hand-built blobs with various schedule_interval encodings deserialize correctly."""
    blob = {
        "__version": 1,
        "dag": {
            "default_args": {"__type": "dict", "__var": {}},
            "_dag_id": "simple_dag",
            "fileloc": __file__,
            "tasks": [],
            "timezone": "UTC",
            "schedule_interval": serialized_schedule_interval,
        },
    }

    # The blob must both pass schema validation and round-trip the interval.
    SerializedDAG.validate_schema(blob)
    restored = SerializedDAG.from_dict(blob)
    self.assertEqual(restored.schedule_interval, expected)
def test_bigquery_operator_extra_serialized_field_when_single_query(self):
    """Serialization round-trip for BigQueryExecuteQueryOperator with a single query string."""
    with self.dag:
        BigQueryExecuteQueryOperator(task_id=TASK_ID, sql='SELECT * FROM test_table')

    serialized = SerializedDAG.to_dict(self.dag)
    self.assertIn("sql", serialized["dag"]["tasks"][0])

    deserialized_task = SerializedDAG.from_dict(serialized).task_dict[TASK_ID]
    self.assertEqual(getattr(deserialized_task, "sql"), 'SELECT * FROM test_table')

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################
    # Serialized form carries one (parameterless) console link.
    self.assertEqual(
        serialized["dag"]["tasks"][0]["_operator_extra_links"],
        [{'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleLink': {}}],
    )

    # Deserialized operator rebuilds the concrete link class.
    self.assertIsInstance(
        list(deserialized_task.operator_extra_links)[0], BigQueryConsoleLink)

    ti = TaskInstance(task=deserialized_task, execution_date=DEFAULT_DATE)
    ti.xcom_push('job_id', 12345)

    # Positive case: link rendered from the pushed job id.
    self.assertEqual(
        deserialized_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
        'https://console.cloud.google.com/bigquery?j=12345')

    # Negative case: no XCom for this execution date -> empty link.
    self.assertEqual(
        deserialized_task.get_extra_links(datetime(2017, 1, 2), BigQueryConsoleLink.name),
        '')
def test_dag_on_success_callback_roundtrip(self, passed_success_callback, expected_value):
    """
    When on_success_callback is set on the DAG, only the boolean flag
    has_on_success_callback is stored in the serialized JSON blob, and it is
    restored as True on deserialization.  Without a callback the flag is not
    serialized at all and defaults to False after deserialization.
    """
    source_dag = DAG(dag_id='test_dag_on_success_callback_roundtrip', **passed_success_callback)
    BaseOperator(task_id='simple_task', dag=source_dag, start_date=datetime(2019, 8, 1))

    serialized = SerializedDAG.to_dict(source_dag)
    if expected_value:
        assert "has_on_success_callback" in serialized["dag"]
    else:
        assert "has_on_success_callback" not in serialized["dag"]

    restored = SerializedDAG.from_dict(serialized)
    assert restored.has_on_success_callback is expected_value
def test_dagrun_update_state_with_handle_callback_failure(self):
    """A failed run updated with execute_callbacks=False yields a failure DagCallbackRequest."""
    def on_failure_callable(context):
        self.assertEqual(
            context['dag_run'].dag_id,
            'test_dagrun_update_state_with_handle_callback_failure')

    source_dag = DAG(
        dag_id='test_dagrun_update_state_with_handle_callback_failure',
        start_date=datetime.datetime(2017, 1, 1),
        on_failure_callback=on_failure_callable,
    )
    upstream = DummyOperator(task_id='test_state_succeeded1', dag=source_dag)
    downstream = DummyOperator(task_id='test_state_failed2', dag=source_dag)
    upstream.set_downstream(downstream)

    initial_task_states = {
        'test_state_succeeded1': State.SUCCESS,
        'test_state_failed2': State.FAILED,
    }

    # Scheduler uses Serialized DAG -- so use that instead of the Actual DAG
    round_tripped = SerializedDAG.from_dict(SerializedDAG.to_dict(source_dag))
    dag_run = self.create_dag_run(dag=round_tripped, state=State.RUNNING,
                                  task_states=initial_task_states)

    _, callback = dag_run.update_state(execute_callbacks=False)
    self.assertEqual(State.FAILED, dag_run.state)

    # Callbacks are not added until handle_callback = False is passed to dag_run.update_state()
    assert callback == DagCallbackRequest(
        full_filepath=dag_run.dag.fileloc,
        dag_id="test_dagrun_update_state_with_handle_callback_failure",
        execution_date=dag_run.execution_date,
        is_failure_callback=True,
        msg="task_failure",
    )
def test_extra_serialized_field_and_multiple_operator_links(self):
    """
    Assert extra field exists & OperatorLinks defined in Plugins and inbuilt Operator Links.

    This test also depends on GoogleLink() registered as a plugin in
    tests/plugins/test_plugin.py.  It checks that links registered through a
    plugin's ``operator_extra_links`` coexist with the indexed ones declared
    on the operator itself via ``BaseOperator.operator_extra_links``.
    """
    execution_date = datetime(2019, 8, 1)
    source_dag = DAG(dag_id='simple_dag', start_date=execution_date)
    CustomOperator(task_id='simple_task', dag=source_dag, bash_command=["echo", "true"])

    serialized = SerializedDAG.to_dict(source_dag)
    self.assertIn("bash_command", serialized["dag"]["tasks"][0])

    deserialized_task = SerializedDAG.from_dict(serialized).task_dict["simple_task"]
    self.assertEqual(getattr(deserialized_task, "bash_command"), ["echo", "true"])

    #########################################################
    # Verify Operator Links work with Serialized Operator
    #########################################################
    # Serialized blob contains only the inbuilt links, one per command element.
    self.assertEqual(
        serialized["dag"]["tasks"][0]["_operator_extra_links"],
        [
            {'tests.test_utils.mock_operators.CustomBaseIndexOpLink': {'index': 0}},
            {'tests.test_utils.mock_operators.CustomBaseIndexOpLink': {'index': 1}},
        ])

    # Plugin-provided links are merged in after deserialization.
    self.assertCountEqual(
        deserialized_task.extra_links,
        ['BigQuery Console #1', 'BigQuery Console #2', 'airflow', 'github', 'google'])

    ti = TaskInstance(task=deserialized_task, execution_date=execution_date)
    ti.xcom_push('search_query', ["dummy_value_1", "dummy_value_2"])

    # Inbuilt link #1 renders from the first pushed XCom value...
    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=dummy_value_1',
        deserialized_task.get_extra_links(execution_date, "BigQuery Console #1"))

    # ...and inbuilt link #2 from the second.
    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=dummy_value_2',
        deserialized_task.get_extra_links(execution_date, "BigQuery Console #2"))

    # Link registered via an Airflow Plugin.
    self.assertEqual(
        "https://www.google.com",
        deserialized_task.get_extra_links(execution_date, GoogleLink.name))
def _process_message(self, message):
    """Dispatch one message from the parsing process: stats vs. serialized DAG dict."""
    self.log.debug("Received message of type %s", type(message).__name__)
    if isinstance(message, DagParsingStat):
        self._sync_metadata(message)
        return
    # Anything that is not a parsing stat is treated as a serialized DAG
    # dict and buffered after deserialization.
    self._collected_dag_buffer.append(SerializedDAG.from_dict(message))