def __init__(self, *args, monitoring_bq_conn_id: str, dag_name: str,
             days_to_live: int, monitoring_dataset: str,
             monitoring_table: str, **kwargs) -> None:
  """Initializes the MonitoringCleanupOperator.

  Note: all monitoring parameters are required keyword-only arguments; the
  signature declares no defaults, so callers must always supply them.

  Args:
    *args: Arguments for the operator.
    monitoring_bq_conn_id: BigQuery connection ID for the monitoring table.
    dag_name: The name of the DAG running the cleanup operator.
    days_to_live: The number of days data can live before being removed.
    monitoring_dataset: Dataset id of the monitoring table.
    monitoring_table: Table name of the monitoring table.
    **kwargs: Other arguments to pass through to the operator or hooks.
  """
  super().__init__(*args, **kwargs)
  self.days_to_live = days_to_live
  # The hook encapsulates all BigQuery access for the monitoring table.
  self.monitoring_hook = monitoring_hook_lib.MonitoringHook(
      bq_conn_id=monitoring_bq_conn_id,
      dag_name=dag_name,
      monitoring_dataset=monitoring_dataset,
      monitoring_table=monitoring_table)
def test_init_handles_bigquery_create_empty_table_errors(self):
  """Init raises MonitoringDatabaseError when table creation fails."""
  self.mock_cursor_obj.create_empty_table.side_effect = (
      exceptions.AirflowException())
  # Patch table_exists via a context manager so the class attribute is
  # restored after the test instead of leaking into other test cases.
  with mock.patch.object(
      monitoring_hook.MonitoringHook, 'table_exists', return_value=False):
    with self.assertRaises(errors.MonitoringDatabaseError):
      monitoring_hook.MonitoringHook(
          bq_conn_id='test_conn',
          monitoring_dataset=self.dataset_id,
          monitoring_table=self.table_id)
def test_init_handles_bigquery_create_empty_dataset_errors(self):
  """Init surfaces dataset-creation failures as MonitoringDatabaseError."""
  # Both the lookup and the creation of the dataset fail.
  self.mock_cursor_obj.get_dataset.side_effect = (
      exceptions.AirflowException())
  self.mock_cursor_obj.create_empty_dataset.side_effect = (
      exceptions.AirflowException())

  with self.assertRaises(errors.MonitoringDatabaseError):
    monitoring_hook.MonitoringHook(
        bq_conn_id='test_conn',
        monitoring_dataset=self.dataset_id,
        monitoring_table=self.table_id)
def __init__(self, *args, input_hook: hook_factory.InputHookType,
             output_hook: hook_factory.OutputHookType, dag_name: str,
             monitoring_dataset: str = '', monitoring_table: str = '',
             monitoring_bq_conn_id: str = '', return_report: bool = False,
             enable_monitoring: bool = True, is_retry: bool = False,
             **kwargs) -> None:
  """Initiates the DataConnectorOperator.

  Args:
    *args: Arguments for the operator.
    input_hook: The type of the input hook.
    output_hook: The type of the output hook.
    dag_name: The ID of the current running dag.
    monitoring_dataset: Dataset id of the monitoring table.
    monitoring_table: Table name of the monitoring table.
    monitoring_bq_conn_id: BigQuery connection ID for the monitoring table.
    return_report: Indicates whether to return a run report or not.
    enable_monitoring: If enabled, data transfer monitoring log will be
      stored in Storage to allow for retry of failed events.
    is_retry: If true, the operator will draw failed events from monitoring
      log and will send them to the output hook.
    **kwargs: Other arguments to pass through to the operator or hooks.

  Raises:
    errors.MonitoringValueError: If monitoring is enabled but any of the
      monitoring parameters is missing or empty.
  """
  super().__init__(*args, **kwargs)
  self.dag_name = dag_name
  self.input_hook = hook_factory.get_input_hook(input_hook, **kwargs)
  self.output_hook = hook_factory.get_output_hook(output_hook, **kwargs)
  self.return_report = return_report
  self.enable_monitoring = enable_monitoring
  self.is_retry = is_retry

  # When monitoring is on, every monitoring parameter must be non-empty.
  monitoring_params = (
      monitoring_dataset, monitoring_table, monitoring_bq_conn_id)
  if enable_monitoring and not all(monitoring_params):
    raise errors.MonitoringValueError(
        msg=('Missing or empty monitoring parameters although monitoring is '
             'enabled.'),
        error_num=errors.ErrorNameIDMap.MONITORING_HOOK_INVALID_VARIABLES)

  self.monitor = monitoring.MonitoringHook(
      bq_conn_id=monitoring_bq_conn_id,
      enable_monitoring=enable_monitoring,
      dag_name=dag_name,
      monitoring_dataset=monitoring_dataset,
      monitoring_table=monitoring_table,
      location=self.input_hook.get_location())
def test_init(self):
  """When the table already exists, init probes but creates nothing.

  get_dataset is made to fail so the hook goes down its dataset-missing
  path; table_exists returning True means no table creation is attempted.
  """
  self.mock_cursor_obj.get_dataset.side_effect = (
      exceptions.AirflowException())
  # Patch table_exists via a context manager so the class attribute is
  # restored after the test instead of leaking into other test cases.
  with mock.patch.object(
      monitoring_hook.MonitoringHook, 'table_exists',
      return_value=True) as mock_table_exists:
    monitoring_hook.MonitoringHook(
        bq_conn_id='test_conn',
        monitoring_dataset=self.dataset_id,
        monitoring_table=self.table_id)

    self.mock_cursor_obj.get_dataset.assert_called_with(
        project_id=self.project_id, dataset_id=self.dataset_id)
    mock_table_exists.assert_called_with(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id)
def test_init_create_monitoring_dataset_and_table(self):
  """When dataset lookup fails and table is absent, init creates both."""
  self.mock_cursor_obj.get_dataset.side_effect = (
      exceptions.AirflowException())
  # Patch table_exists via a context manager so the class attribute is
  # restored after the test instead of leaking into other test cases.
  with mock.patch.object(
      monitoring_hook.MonitoringHook, 'table_exists',
      return_value=False) as mock_table_exists:
    monitoring_hook.MonitoringHook(
        bq_conn_id='test_conn',
        monitoring_dataset=self.dataset_id,
        monitoring_table=self.table_id)

    self.mock_cursor_obj.create_empty_table.assert_called_with(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        schema_fields=monitoring_hook._LOG_SCHEMA_FIELDS)
    self.mock_cursor_obj.create_empty_dataset.assert_called_with(
        project_id=self.project_id, dataset_id=self.dataset_id)
    self.mock_cursor_obj.get_dataset.assert_called_with(
        project_id=self.project_id, dataset_id=self.dataset_id)
    mock_table_exists.assert_called_with(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id)
def setUp(self):
  """Builds a fully mocked BigQuery connection and a hook under test."""
  super().setUp()
  self.dag_name = 'dag'
  self.project_id = 'test_project'
  self.dataset_id = 'test_dataset'
  self.table_id = 'test_table'
  self.conn_id = 'test_conn'

  def _expected_row(type_id, position='', info=''):
    # Every expected monitoring row shares the same dag name, timestamp and
    # input location; only type_id, position and info vary per row kind.
    return {
        'dag_name': self.dag_name,
        'timestamp': '20201103180000',
        'type_id': type_id,
        'location': 'https://input/resource',
        'position': position,
        'info': info
    }

  self.expected_run_row = _expected_row(
      monitoring_hook.MonitoringEntityMap.RUN.value)
  self.expected_blob_row = _expected_row(
      monitoring_hook.MonitoringEntityMap.BLOB.value, '3000', '1500')
  self.expected_event_row = _expected_row(50, '60', json.dumps({'a': 1}))
  self.expected_retry_row = _expected_row(
      monitoring_hook.MonitoringEntityMap.RETRY.value)

  # Fake BigQuery connection whose cursor records every call.
  self.mock_conn_obj = mock.MagicMock()
  self.mock_cursor_obj = mock.MagicMock()
  self.mock_cursor_obj.project_id = self.project_id
  self.mock_conn_obj.cursor = mock.MagicMock(
      return_value=self.mock_cursor_obj)
  self.mock_cursor_obj.create_empty_table = mock.MagicMock()
  self.mock_cursor_obj.create_empty_dataset = mock.MagicMock()
  self.mock_cursor_obj.insert_all = mock.MagicMock()

  # Class-level patches; originals are saved so they can be restored —
  # presumably in tearDown (not visible in this chunk; confirm it exists).
  self.original_get_conn = monitoring_hook.MonitoringHook.get_conn
  monitoring_hook.MonitoringHook.get_conn = mock.MagicMock(
      return_value=self.mock_conn_obj)
  self.original_bigquery_hook_init = bigquery_hook.BigQueryHook.__init__
  bigquery_hook.BigQueryHook.__init__ = mock.MagicMock()

  # Suppress dataset/table bootstrapping while constructing the hook.
  with mock.patch(
      'google3.third_party.gps_building_blocks.tcrm.hooks.monitoring_hook.'
      'MonitoringHook._create_monitoring_dataset_and_table_if_not_exist'):
    self.hook = monitoring_hook.MonitoringHook(
        bq_conn_id=self.conn_id,
        monitoring_dataset=self.dataset_id,
        monitoring_table=self.table_id)
    self.hook.get_conn = mock.MagicMock(return_value=self.mock_conn_obj)