def create_all_jails_bq_refresh_tasks() -> Tuple[str, int]:
    """Enqueue one BigQuery refresh task per exportable JAILS table.

    The tables are taken from the CloudSqlToBQConfig built for
    SchemaType.JAILS; each table's name is passed to
    BQRefreshCloudTaskManager.create_refresh_bq_table_task.

    Returns:
        An empty response body together with HTTPStatus.OK.
    """
    logging.info("Beginning BQ export for jails schema tables.")

    refresh_task_manager = BQRefreshCloudTaskManager()
    jails_export_config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)

    # One refresh task per table, in the order the config yields them.
    for exported_table in jails_export_config.get_tables_to_export():
        refresh_task_manager.create_refresh_bq_table_task(
            exported_table.name, SchemaType.JAILS
        )

    return "", HTTPStatus.OK
def _load_json_object(raw_text: str) -> dict:
    """Parse raw_text as JSON, returning {} unless it decodes to a JSON object.

    Malformed input and valid-but-non-object JSON (null, lists, strings,
    numbers) are all treated as "no data" so callers can safely use dict
    operations on the result.
    """
    try:
        parsed = json.loads(raw_text)
    except (TypeError, json.decoder.JSONDecodeError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def wait_for_ingest_to_create_tasks(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Worker that defers BQ refresh task creation until ingest is idle.

    Steps:
      1. Read "lock_id" from the JSON request body, or generate a fresh UUID
         when the body is missing, malformed, or has no "lock_id" key.
      2. If the Postgres-to-BQ export lock for ``schema_arg`` is not held,
         take it and store the current time and the lock id as its contents.
         If it is already held, require that its stored "lock_id" matches
         this request's lock id.
      3. If any lock with the GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_NAME prefix
         is active, re-enqueue this task with a 60 second delay; otherwise
         call create_all_bq_refresh_tasks_for_schema.

    Args:
        schema_arg: Schema identifier; used for the lock name, the
            re-enqueued task URI/body, and the refresh task creation call.

    Returns:
        An empty response body together with HTTPStatus.OK.

    Raises:
        GCSPseudoLockAlreadyExists: If the export lock is already held and
            its stored lock id differs from this request's lock id.
    """
    task_manager = BQRefreshCloudTaskManager()
    lock_manager = GCSPseudoLockManager()

    request_payload = _load_json_object(request.get_data(as_text=True))
    if "lock_id" in request_payload:
        lock_id = request_payload["lock_id"]
    else:
        lock_id = str(uuid.uuid4())
    logging.info("Request lock id: %s", lock_id)

    # The lock name only depends on schema_arg, so compute it once.
    export_lock_name = postgres_to_bq_lock_name_with_suffix(schema_arg)

    if not lock_manager.is_locked(export_lock_name):
        # `lock_time` rather than `time` to avoid shadowing the stdlib module.
        lock_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        contents = json.dumps({"time": lock_time, "lock_id": lock_id})
        lock_manager.lock(export_lock_name, contents)
    else:
        existing_contents = _load_json_object(
            lock_manager.get_lock_contents(export_lock_name)
        )
        logging.info("Lock contents: %s", existing_contents)
        if lock_id != existing_contents.get("lock_id"):
            raise GCSPseudoLockAlreadyExists(
                f"UUID {lock_id} does not match existing lock's UUID"
            )

    no_regions_running = lock_manager.no_active_locks_with_prefix(
        GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_NAME
    )
    if not no_regions_running:
        logging.info("Regions running, renqueuing this task.")
        task_id = f"renqueue_wait_task-{datetime.utcnow().date()}-{uuid.uuid4()}"
        # Carry the lock id forward so the retried request matches the lock
        # contents written above.
        body = {"schema_type": schema_arg, "lock_id": lock_id}
        task_manager.job_monitor_cloud_task_queue_manager.create_task(
            task_id=task_id,
            body=body,
            relative_uri=f"/cloud_sql_to_bq/create_refresh_bq_tasks/{schema_arg}",
            schedule_delay_seconds=60,
        )
        return "", HTTPStatus.OK

    logging.info("No regions running, calling create_refresh_bq_tasks")
    create_all_bq_refresh_tasks_for_schema(schema_arg)
    return "", HTTPStatus.OK
def monitor_refresh_bq_tasks() -> Tuple[str, int]:
    """Worker that finishes a schema's BQ export once its queue tasks drain.

    Reads "schema", "topic" and "message" from the JSON request body. While
    any task name in the BQ queue contains the schema (looking at the part of
    the name from "/tasks/" onwards), the monitor task is re-enqueued and
    nothing else happens. Once no such task remains, the function publishes
    ``message`` to ``topic`` (only when ``topic`` is truthy), releases the
    Postgres-to-BQ export lock for the schema and kicks all schedulers.

    Returns:
        An empty response body together with HTTPStatus.OK.
    """
    payload = json.loads(request.get_data(as_text=True))
    schema = payload["schema"]
    topic = payload["topic"]
    message = payload["message"]

    task_manager = BQRefreshCloudTaskManager()

    # Collect the ids of every queued task that mentions this schema.
    queued_names = task_manager.get_bq_queue_info().task_names
    schema_task_ids = [
        task_id
        for task_id in (name[name.find("/tasks/"):] for name in queued_names)
        if schema in task_id
    ]

    if schema_task_ids:
        # Still work outstanding for this schema: check again later.
        logging.info("Tasks still in bigquery queue. Re-queuing bq monitor" " task.")
        task_manager.create_bq_refresh_monitor_task(schema, topic, message)
        return "", HTTPStatus.OK

    # An empty topic means there is nothing to announce for this schema.
    if topic:
        pubsub_helper.publish_message_to_topic(message=message, topic=topic)

    # All exports for the schema are done, so release the export lock.
    GCSPseudoLockManager().unlock(postgres_to_bq_lock_name_with_suffix(schema))
    logging.info(
        "Done running export for %s, unlocking Postgres to BigQuery export", schema
    )

    # Kick the schedulers now that the export lock is released.
    kick_all_schedulers()
    return "", HTTPStatus.OK
def create_all_bq_refresh_tasks_for_schema(schema_arg: str) -> None:
    """Enqueue a BQ refresh task per exportable table, then a monitor task.

    ``schema_arg`` is upper-cased and converted to a SchemaType. The function
    returns without doing anything when that conversion raises ValueError, and
    returns after logging when no CloudSqlToBQConfig exists for the schema.

    Otherwise one refresh task is created for every table the config exports,
    followed by a monitor task. For SchemaType.STATE the monitor task carries
    a Pub/Sub topic and completion message; for every other schema both are
    empty strings.

    Args:
        schema_arg: Case-insensitive name of the schema to export.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        # Unknown schema name: nothing to enqueue.
        return

    logging.info("Beginning BQ export for %s schema tables.", schema_type.value)

    refresh_task_manager = BQRefreshCloudTaskManager()
    export_config = CloudSqlToBQConfig.for_schema_type(schema_type)
    if export_config is None:
        logging.info("Cloud SQL to BQ is disabled for: %s", schema_type)
        return

    for exported_table in export_config.get_tables_to_export():
        refresh_task_manager.create_refresh_bq_table_task(
            exported_table.name, schema_type
        )

    # Only the STATE export announces its completion over Pub/Sub.
    topic, message = (
        ("v1.calculator.trigger_daily_pipelines", "State export to BQ complete")
        if schema_type is SchemaType.STATE
        else ("", "")
    )
    refresh_task_manager.create_bq_refresh_monitor_task(
        schema_type.value, topic, message
    )
def test_reattempt_create_refresh_tasks_task(
    self, mock_client: mock.MagicMock, mock_uuid: mock.MagicMock
) -> None:
    """create_reattempt_create_refresh_tasks_task enqueues a delayed POST task.

    Checks the queue/task path lookups against JOB_MONITOR_QUEUE_V2 and that
    the created task is scheduled 60 seconds out, targets the
    create_refresh_bq_schema_task URI for the schema, and carries the lock id
    as its JSON body.
    """
    # Arrange
    expected_schedule_seconds = int(datetime.datetime.now().timestamp()) + 60
    mock_uuid.uuid4.return_value = "random-uuid"

    fake_schema = "fake_schema"
    fake_lock_id = "fake_lock_id"
    expected_task_id = "reenqueue_wait_task-2019-04-13-random-uuid"
    fake_queue_path = f"queue_path/{self.mock_project_id}/{QUEUES_REGION}"
    fake_task_path = f"{fake_queue_path}/{expected_task_id}"

    client = mock_client.return_value
    client.task_path.return_value = fake_task_path
    client.queue_path.return_value = fake_queue_path

    expected_task = tasks_v2.types.task_pb2.Task(
        name=fake_task_path,
        schedule_time=timestamp_pb2.Timestamp(seconds=expected_schedule_seconds),
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": "/cloud_sql_to_bq/create_refresh_bq_schema_task/fake_schema",
            "body": json.dumps({"lock_id": fake_lock_id}).encode(),
        },
    )

    # Act
    manager = BQRefreshCloudTaskManager()
    manager.create_reattempt_create_refresh_tasks_task(
        schema=fake_schema, lock_id=fake_lock_id
    )

    # Assert
    client.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, JOB_MONITOR_QUEUE_V2
    )
    client.task_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, JOB_MONITOR_QUEUE_V2, expected_task_id
    )
    client.create_task.assert_called_with(parent=fake_queue_path, task=expected_task)
def monitor_refresh_bq_tasks() -> Tuple[str, int]:
    """Worker that publishes a Pub/Sub message once the BQ queue is empty.

    Reads 'topic' and 'message' from the JSON request body. If the BQ queue
    still reports a size greater than zero, the monitor task is re-enqueued
    with the same topic and message; otherwise the message is published to
    the topic.

    Returns:
        An empty response body together with HTTPStatus.OK.
    """
    payload = json.loads(request.get_data(as_text=True))
    topic = payload['topic']
    message = payload['message']

    task_manager = BQRefreshCloudTaskManager()

    if task_manager.get_bq_queue_info().size() > 0:
        # Exports are still running: check again later instead of publishing.
        logging.info("Tasks still in bigquery queue. Re-queuing bq monitor"
                     " task.")
        task_manager.create_bq_refresh_monitor_task(topic, message)
    else:
        # Queue drained: announce that all BQ exports are complete.
        pubsub_helper.publish_message_to_topic(message=message, topic=topic)

    return ('', HTTPStatus.OK)
def test_create_bq_refresh_monitor_task(
    self, mock_client: mock.MagicMock, mock_uuid: mock.MagicMock
) -> None:
    """create_bq_refresh_monitor_task enqueues a delayed monitor POST task.

    Checks the queue/task path lookups against JOB_MONITOR_QUEUE_V2 and that
    the created task is scheduled 60 seconds out, targets the
    monitor_refresh_bq_tasks URI, and carries schema, topic and message as
    its JSON body.
    """
    # Arrange
    expected_schedule_seconds = int(datetime.datetime.now().timestamp()) + 60
    mock_uuid.uuid4.return_value = "random-uuid"

    fake_schema = "schema"
    fake_topic = "fake.topic"
    fake_message = "A fake message"
    expected_task_id = "fake-topic-2019-04-13-random-uuid"
    fake_queue_path = f"queue_path/{self.mock_project_id}/{QUEUES_REGION}"
    fake_task_path = f"{fake_queue_path}/{expected_task_id}"

    # Key order matters: the body is compared as serialized JSON bytes.
    expected_body = json.dumps(
        {"schema": fake_schema, "topic": fake_topic, "message": fake_message}
    ).encode()
    expected_task = tasks_v2.types.task_pb2.Task(
        name=fake_task_path,
        schedule_time=timestamp_pb2.Timestamp(seconds=expected_schedule_seconds),
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": "/cloud_sql_to_bq/monitor_refresh_bq_tasks",
            "body": expected_body,
        },
    )

    client = mock_client.return_value
    client.task_path.return_value = fake_task_path
    client.queue_path.return_value = fake_queue_path

    # Act
    manager = BQRefreshCloudTaskManager()
    manager.create_bq_refresh_monitor_task(
        schema="schema", topic=fake_topic, message=fake_message
    )

    # Assert
    client.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, JOB_MONITOR_QUEUE_V2
    )
    client.task_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, JOB_MONITOR_QUEUE_V2, expected_task_id
    )
    client.create_task.assert_called_with(parent=fake_queue_path, task=expected_task)
def test_create_bq_refresh_monitor_task(self,
                                        mock_client: mock.MagicMock,
                                        mock_uuid: mock.MagicMock) -> None:
    """create_bq_refresh_monitor_task enqueues a delayed monitor POST task.

    Uses an explicit project id and checks the queue/task path lookups against
    JOB_MONITOR_QUEUE_V2, plus that the created task is scheduled 60 seconds
    out, targets the monitor_refresh_bq_tasks URI, and carries topic and
    message as its JSON body.
    """
    # Arrange
    expected_schedule_seconds = int(datetime.datetime.now().timestamp()) + 60
    mock_uuid.uuid4.return_value = 'random-uuid'

    fake_project_id = 'recidiviz-456'
    fake_topic = 'fake.topic'
    fake_message = 'A fake message'
    expected_task_id = 'fake-topic-2019-04-13-random-uuid'
    fake_queue_path = f'queue_path/{fake_project_id}/{QUEUES_REGION}'
    fake_task_path = f'{fake_queue_path}/{expected_task_id}'

    # Key order matters: the body is compared as serialized JSON bytes.
    expected_body = json.dumps(
        {'topic': fake_topic, 'message': fake_message}).encode()
    expected_task = tasks_v2.types.task_pb2.Task(
        name=fake_task_path,
        schedule_time=timestamp_pb2.Timestamp(
            seconds=expected_schedule_seconds),
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri': '/cloud_sql_to_bq/monitor_refresh_bq_tasks',
            'body': expected_body,
        })

    client = mock_client.return_value
    client.task_path.return_value = fake_task_path
    client.queue_path.return_value = fake_queue_path

    # Act
    manager = BQRefreshCloudTaskManager(project_id=fake_project_id)
    manager.create_bq_refresh_monitor_task(topic=fake_topic,
                                           message=fake_message)

    # Assert
    client.queue_path.assert_called_with(
        fake_project_id, QUEUES_REGION, JOB_MONITOR_QUEUE_V2)
    client.task_path.assert_called_with(
        fake_project_id, QUEUES_REGION, JOB_MONITOR_QUEUE_V2,
        expected_task_id)
    client.create_task.assert_called_with(
        parent=fake_queue_path, task=expected_task)
def test_create_refresh_bq_schema_task(
    self, mock_client: mock.MagicMock, mock_uuid: mock.MagicMock
) -> None:
    """create_refresh_bq_schema_task enqueues a POST task for the schema.

    Checks the queue/task path lookups against BIGQUERY_QUEUE_V2 and that the
    created task targets the refresh_bq_schema URI for JAILS with an empty
    JSON object as its body.
    """
    # Arrange
    mock_uuid.uuid4.return_value = "random-uuid"

    jails_value = SchemaType.JAILS.value
    expected_task_id = f"{jails_value}-2019-04-12-random-uuid"
    fake_queue_path = f"queue_path/{self.mock_project_id}/{QUEUES_REGION}"
    fake_task_path = f"{fake_queue_path}/{expected_task_id}"

    expected_task = tasks_v2.types.task_pb2.Task(
        name=fake_task_path,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": "/cloud_sql_to_bq/refresh_bq_schema/JAILS",
            "body": json.dumps({}).encode(),
        },
    )

    client = mock_client.return_value
    client.task_path.return_value = fake_task_path
    client.queue_path.return_value = fake_queue_path

    # Act
    manager = BQRefreshCloudTaskManager()
    manager.create_refresh_bq_schema_task(schema_type=SchemaType.JAILS)

    # Assert
    client.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, BIGQUERY_QUEUE_V2
    )
    client.task_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, BIGQUERY_QUEUE_V2, expected_task_id
    )
    client.create_task.assert_called_with(parent=fake_queue_path, task=expected_task)
def test_create_refresh_bq_table_task(self,
                                      mock_client: mock.MagicMock,
                                      mock_uuid: mock.MagicMock) -> None:
    """create_refresh_bq_table_task enqueues a POST task for one table.

    Uses an explicit project id and checks the queue/task path lookups against
    BIGQUERY_QUEUE_V2, plus that the created task targets the refresh_bq_table
    URI and carries the table name and schema type as its JSON body.
    """
    # Arrange
    mock_uuid.uuid4.return_value = 'random-uuid'

    fake_project_id = 'recidiviz-456'
    fake_table_name = 'test_table'
    jails_value = SchemaType.JAILS.value
    expected_task_id = f'test_table-{jails_value}-2019-04-12-random-uuid'
    fake_queue_path = f'queue_path/{fake_project_id}/{QUEUES_REGION}'
    fake_task_path = f'{fake_queue_path}/{expected_task_id}'

    # Key order matters: the body is compared as serialized JSON bytes.
    expected_body = json.dumps(
        {'table_name': fake_table_name, 'schema_type': jails_value}).encode()
    expected_task = tasks_v2.types.task_pb2.Task(
        name=fake_task_path,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri': '/cloud_sql_to_bq/refresh_bq_table',
            'body': expected_body,
        })

    client = mock_client.return_value
    client.task_path.return_value = fake_task_path
    client.queue_path.return_value = fake_queue_path

    # Act
    manager = BQRefreshCloudTaskManager(project_id=fake_project_id)
    manager.create_refresh_bq_table_task(table_name=fake_table_name,
                                         schema_type=SchemaType.JAILS)

    # Assert
    client.queue_path.assert_called_with(
        fake_project_id, QUEUES_REGION, BIGQUERY_QUEUE_V2)
    client.task_path.assert_called_with(
        fake_project_id, QUEUES_REGION, BIGQUERY_QUEUE_V2, expected_task_id)
    client.create_task.assert_called_with(
        parent=fake_queue_path, task=expected_task)
def create_all_state_bq_refresh_tasks() -> Tuple[str, int]:
    """Enqueue a BQ refresh task per exportable STATE table, then a monitor.

    The tables are taken from the CloudSqlToBQConfig built for
    SchemaType.STATE. After the per-table tasks, a monitor task is created
    with the 'v1.calculator.recidivism' topic and a completion message.

    Returns:
        An empty response body together with HTTPStatus.OK.
    """
    logging.info("Beginning BQ export for state schema tables.")

    refresh_task_manager = BQRefreshCloudTaskManager()
    state_export_config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)

    # One refresh task per table, in the order the config yields them.
    for exported_table in state_export_config.get_tables_to_export():
        refresh_task_manager.create_refresh_bq_table_task(
            exported_table.name, SchemaType.STATE)

    # The monitor task receives the topic and message to use for Pub/Sub.
    refresh_task_manager.create_bq_refresh_monitor_task(
        'v1.calculator.recidivism', 'State export to BQ complete')

    return ('', HTTPStatus.OK)
def wait_for_ingest_to_create_tasks(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Worker that queues a schema refresh once the lock manager allows it.

    ``schema_arg`` is upper-cased and converted to a SchemaType; an unknown
    value, or a schema type CloudSqlToBQConfig does not consider valid, yields
    a BAD_REQUEST response. Otherwise the function obtains a lock id, acquires
    the lock for the schema through CloudSqlToBQLockManager, and then either:

    * creates the refresh-BQ-schema task when ``can_proceed`` is true, or
    * creates a reattempt task carrying the same lock id when it is not.

    Args:
        schema_arg: Case-insensitive name of the schema to refresh.

    Returns:
        A (body, status) pair: an error message with HTTPStatus.BAD_REQUEST
        for a rejected schema, otherwise an empty body with HTTPStatus.OK.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        error_message = f"Unexpected value for schema_arg: [{schema_arg}]"
        return error_message, HTTPStatus.BAD_REQUEST

    if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
        error_message = f"Unsuppported schema type: [{schema_type}]"
        return error_message, HTTPStatus.BAD_REQUEST

    lock_id = get_or_create_lock_id()
    logging.info("Request lock id: %s", lock_id)

    # Take the lock before checking whether the refresh may proceed.
    refresh_lock_manager = CloudSqlToBQLockManager()
    refresh_lock_manager.acquire_lock(schema_type=schema_type, lock_id=lock_id)

    cloud_task_manager = BQRefreshCloudTaskManager()
    if refresh_lock_manager.can_proceed(schema_type):
        logging.info("No regions running, triggering BQ refresh.")
        cloud_task_manager.create_refresh_bq_schema_task(schema_type=schema_type)
    else:
        # Retry later with the same lock id so the held lock is recognized.
        logging.info("Regions running, renqueuing this task.")
        cloud_task_manager.create_reattempt_create_refresh_tasks_task(
            lock_id=lock_id, schema=schema_arg
        )

    return "", HTTPStatus.OK