def refresh_bq_schema(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Performs a full refresh of BigQuery data for a given schema, pulling data
    from the appropriate CloudSQL Postgres instance. On completion, triggers
    Dataflow pipelines (when necessary), releases the refresh lock and restarts
    any paused ingest work.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return (
            f"Unexpected value for schema_arg: [{schema_arg}]",
            HTTPStatus.BAD_REQUEST,
        )
    if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
        return (
            f"Unsupported schema type: [{schema_type}]",
            HTTPStatus.BAD_REQUEST,
        )

    lock_manager = CloudSqlToBQLockManager()
    try:
        can_proceed = lock_manager.can_proceed(schema_type)
    except GCSPseudoLockDoesNotExist as e:
        logging.exception(e)
        return (
            f"Expected lock for [{schema_arg}] BQ refresh to already exist.",
            HTTPStatus.EXPECTATION_FAILED,
        )

    if not can_proceed:
        return (
            f"Expected to be able to proceed with refresh before this endpoint was "
            f"called for [{schema_arg}].",
            HTTPStatus.EXPECTATION_FAILED,
        )

    federated_bq_schema_refresh(schema_type=schema_type)

    # Publish a message to the Pub/Sub topic once state BQ export is complete
    if schema_type is SchemaType.STATE:
        pubsub_helper.publish_message_to_topic(
            message="State export to BQ complete",
            topic="v1.calculator.trigger_daily_pipelines",
        )

    # Unlock export lock when all BQ exports complete
    lock_manager.release_lock(schema_type)
    logging.info(
        "Done running refresh for [%s], unlocking Postgres to BigQuery export",
        schema_type.value,
    )

    # Kick scheduler to restart ingest
    kick_all_schedulers()

    return "", HTTPStatus.OK
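
# The sketch below is illustrative only: it shows one way the handler above could
# be exposed over HTTP. The Blueprint name, URL pattern, and the absence of any
# auth decorators are assumptions made for illustration, not the project's actual
# route registration.
from flask import Blueprint

bq_refresh_blueprint = Blueprint("bq_refresh_sketch", __name__)  # hypothetical name


@bq_refresh_blueprint.route("/refresh_bq_schema/<schema_arg>", methods=["POST"])
def refresh_bq_schema_endpoint(schema_arg: str) -> Tuple[str, HTTPStatus]:
    # Delegates to refresh_bq_schema above; schema_arg is taken from the URL path
    # and validated against SchemaType inside the handler.
    return refresh_bq_schema(schema_arg)
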
def handle_dataflow_monitor_task() -> Tuple[str, HTTPStatus]:
    """Worker function to publish a message to a Pub/Sub topic once a Dataflow
    job with the given `job_id` has successfully completed. If the job is
    running, or has another current state that could eventually progress to
    `JOB_STATE_DONE` in the future, a new task is queued to continue to monitor
    the job progress.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    project_id = metadata.project_id()
    job_id = data["job_id"]
    location = data["location"]
    topic_dashed = data["topic"]
    topic = topic_dashed.replace("-", ".")

    job = get_dataflow_job_with_id(project_id, job_id, location)
    if job:
        state = job["currentState"]
        if state == "JOB_STATE_DONE":
            # Job was successful. Publish success message.
            logging.info(
                "Job %s successfully completed. Triggering dashboard export.",
                job_id,
            )
            message = "Dataflow job {} complete".format(job_id)
            pubsub_helper.publish_message_to_topic(message, topic)
        elif state in [
            "JOB_STATE_STOPPED",
            "JOB_STATE_RUNNING",
            "JOB_STATE_PENDING",
            "JOB_STATE_QUEUED",
        ]:
            logging.info(
                "Job %s has state: %s. Continuing to monitor progress.",
                job_id,
                state,
            )
            # Job has not completed yet. Re-queue monitor task.
            CalculateCloudTaskManager().create_dataflow_monitor_task(
                job_id, location, topic_dashed
            )
        else:
            logging.warning(
                "Dataflow job %s has state: %s. Killing the monitor tasks.",
                job_id,
                state,
            )
    else:
        logging.warning("Dataflow job %s not found.", job_id)

    return "", HTTPStatus.OK
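
# Illustrative only: a hypothetical helper showing the JSON body this worker
# expects from its Cloud Task. The field names mirror what the handler reads
# above; the actual task creation lives in CalculateCloudTaskManager and is not
# shown here.
def build_dataflow_monitor_body(job_id: str, location: str, topic_dashed: str) -> bytes:
    # The topic is passed dash-separated (e.g. "v1-calculator-trigger-daily-pipelines")
    # and converted to the dot-separated Pub/Sub topic name inside the handler.
    return json.dumps(
        {"job_id": job_id, "location": location, "topic": topic_dashed}
    ).encode("utf-8")
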
def monitor_refresh_bq_tasks() -> Tuple[str, HTTPStatus]:
    """Worker function to publish a message to a Pub/Sub topic once all tasks in
    the BIGQUERY_QUEUE queue have completed.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    schema = data["schema"]
    topic = data["topic"]
    message = data["message"]

    task_manager = BQRefreshCloudTaskManager()

    # If any task in the queue has a task name containing the schema, consider
    # BQ tasks for that schema to still be in the queue.
    bq_tasks_in_queue = False
    bq_task_list = task_manager.get_bq_queue_info().task_names
    for task_name in bq_task_list:
        task_id = task_name[task_name.find("/tasks/"):]
        if schema in task_id:
            bq_tasks_in_queue = True

    # If there are BQ tasks in the queue, then re-queue this task in a minute
    if bq_tasks_in_queue:
        logging.info("Tasks still in bigquery queue. Re-queuing bq monitor task.")
        task_manager.create_bq_refresh_monitor_task(schema, topic, message)
        return "", HTTPStatus.OK

    # Publish a message to the Pub/Sub topic once the BQ export is complete
    if topic:
        pubsub_helper.publish_message_to_topic(message=message, topic=topic)

    # Unlock export lock when all BQ exports complete
    lock_manager = GCSPseudoLockManager()
    lock_manager.unlock(postgres_to_bq_lock_name_with_suffix(schema))
    logging.info(
        "Done running export for %s, unlocking Postgres to BigQuery export",
        schema,
    )

    # Kick scheduler to restart ingest
    kick_all_schedulers()

    return "", HTTPStatus.OK
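
# Illustrative only: the schema-matching rule used above, pulled out as a pure
# function so it is easy to see and unit test. This helper is hypothetical and
# not part of the original module.
from typing import List


def _queue_has_schema_task(task_names: List[str], schema: str) -> bool:
    for task_name in task_names:
        # Full task names look like
        # "projects/<project>/locations/<region>/queues/<queue>/tasks/<task_id>";
        # only the trailing "/tasks/<task_id>" segment is matched against the schema.
        task_id = task_name[task_name.find("/tasks/"):]
        if schema in task_id:
            return True
    return False
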
def handle_bq_monitor_task() -> Tuple[str, HTTPStatus]:
    """Worker function to publish a message to a Pub/Sub topic once all tasks in
    the BIGQUERY_QUEUE queue have completed.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    topic = data["topic"]
    message = data["message"]

    task_manager = BQExportCloudTaskManager()

    bq_tasks_in_queue = task_manager.get_bq_queue_info().size() > 0

    # If there are BQ tasks in the queue, then re-queue this task in a minute
    if bq_tasks_in_queue:
        logging.info("Tasks still in bigquery queue. Re-queuing bq monitor task.")
        task_manager.create_bq_monitor_task(topic, message)
        return "", HTTPStatus.OK

    # Publish a message to the Pub/Sub topic once all BQ exports are complete
    pubsub_helper.publish_message_to_topic(message=message, topic=topic)

    return "", HTTPStatus.OK