Example #1
def refresh_bq_schema(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Performs a full refresh of BigQuery data for a given schema, pulling data from
    the appropriate CloudSQL Postgres instance.

    On completion, triggers Dataflow pipelines (when necessary), releases the refresh
    lock and restarts any paused ingest work.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return (
            f"Unexpected value for schema_arg: [{schema_arg}]",
            HTTPStatus.BAD_REQUEST,
        )
    if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
        return (
            f"Unsupported schema type: [{schema_type}]",
            HTTPStatus.BAD_REQUEST,
        )

    lock_manager = CloudSqlToBQLockManager()

    try:
        can_proceed = lock_manager.can_proceed(schema_type)
    except GCSPseudoLockDoesNotExist as e:
        logging.exception(e)
        return (
            f"Expected lock for [{schema_arg}] BQ refresh to already exist.",
            HTTPStatus.EXPECTATION_FAILED,
        )

    if not can_proceed:
        return (
            f"Expected to be able to proceed with refresh before this endpoint was "
            f"called for [{schema_arg}].",
            HTTPStatus.EXPECTATION_FAILED,
        )

    federated_bq_schema_refresh(schema_type=schema_type)

    # Publish a message to the Pub/Sub topic once state BQ export is complete
    if schema_type is SchemaType.STATE:
        pubsub_helper.publish_message_to_topic(
            message="State export to BQ complete",
            topic="v1.calculator.trigger_daily_pipelines",
        )

    # Unlock export lock when all BQ exports complete
    lock_manager.release_lock(schema_type)
    logging.info(
        "Done running refresh for [%s], unlocking Postgres to BigQuery export",
        schema_type.value,
    )

    # Kick scheduler to restart ingest
    kick_all_schedulers()

    return "", HTTPStatus.OK
Example #2
def handle_dataflow_monitor_task() -> Tuple[str, HTTPStatus]:
    """Worker function to publish a message to a Pub/Sub topic once a Dataflow
    job with the given `job_id` has successfully completed.

    If the job is running, or has another current state that could eventually
    progress to `JOB_STATE_DONE` in the future, a new task is queued to
    continue to monitor the job progress.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    project_id = metadata.project_id()
    job_id = data["job_id"]
    location = data["location"]
    topic_dashed = data["topic"]
    topic = topic_dashed.replace("-", ".")

    job = get_dataflow_job_with_id(project_id, job_id, location)

    if job:
        state = job["currentState"]

        if state == "JOB_STATE_DONE":
            # Job was successful. Publish success message.
            logging.info(
                "Job %s successfully completed. Triggering dashboard export.",
                job_id,
            )
            message = "Dataflow job {} complete".format(job_id)
            pubsub_helper.publish_message_to_topic(message, topic)

        elif state in [
            "JOB_STATE_STOPPED",
            "JOB_STATE_RUNNING",
            "JOB_STATE_PENDING",
            "JOB_STATE_QUEUED",
        ]:
            logging.info(
                "Job %s has state: %s. Continuing to monitor progress.",
                job_id,
                state,
            )
            # Job has not completed yet. Re-queue monitor task.
            CalculateCloudTaskManager().create_dataflow_monitor_task(
                job_id, location, topic_dashed
            )
        else:
            logging.warning(
                "Dataflow job %s has state: %s. Killing the monitor tasks.",
                job_id,
                state,
            )
    else:
        logging.warning("Dataflow job %s not found.", job_id)

    return "", HTTPStatus.OK
Example #3
def monitor_refresh_bq_tasks() -> Tuple[str, HTTPStatus]:
    """Worker function to publish a message to a Pub/Sub topic once all tasks in
    the BIGQUERY_QUEUE queue have completed.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    schema = data["schema"]
    topic = data["topic"]
    message = data["message"]

    task_manager = BQRefreshCloudTaskManager()

    # If any task in the queue has a task id containing the schema name, BQ
    # refresh tasks for that schema are still in the queue.
    bq_tasks_in_queue = False
    bq_task_list = task_manager.get_bq_queue_info().task_names
    for task_name in bq_task_list:
        # Compare against only the task id portion of the fully qualified name.
        task_id = task_name[task_name.find("/tasks/"):]
        if schema in task_id:
            bq_tasks_in_queue = True
            break

    # If there are BQ tasks in the queue, then re-queue this task in a minute
    if bq_tasks_in_queue:
        logging.info("Tasks still in bigquery queue. Re-queuing bq monitor"
                     " task.")
        task_manager.create_bq_refresh_monitor_task(schema, topic, message)
        return "", HTTPStatus.OK

    # Publish a message to the Pub/Sub topic once the BQ export is complete
    if topic:
        pubsub_helper.publish_message_to_topic(message=message, topic=topic)

    # Unlock export lock when all BQ exports complete
    lock_manager = GCSPseudoLockManager()
    lock_manager.unlock(postgres_to_bq_lock_name_with_suffix(schema))
    logging.info(
        "Done running export for %s, unlocking Postgres to BigQuery export",
        schema,
    )

    # Kick scheduler to restart ingest
    kick_all_schedulers()

    return ("", HTTPStatus.OK)
Example #4
def handle_bq_monitor_task() -> Tuple[str, HTTPStatus]:
    """Worker function to publish a message to a Pub/Sub topic once all tasks in
    the BIGQUERY_QUEUE queue have completed.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    topic = data['topic']
    message = data['message']

    task_manager = BQExportCloudTaskManager()

    bq_tasks_in_queue = task_manager.get_bq_queue_info().size() > 0

    # If there are BQ tasks in the queue, then re-queue this task in a minute
    if bq_tasks_in_queue:
        logging.info("Tasks still in bigquery queue. Re-queuing bq monitor"
                     " task.")
        task_manager.create_bq_monitor_task(topic, message)
        return ('', HTTPStatus.OK)

    # Publish a message to the Pub/Sub topic once all BQ exports are complete
    pubsub_helper.publish_message_to_topic(message=message, topic=topic)

    return ('', HTTPStatus.OK)
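All four examples delegate publishing to pubsub_helper.publish_message_to_topic,
whose implementation is not shown here. A helper of that shape could be written
with the standard google-cloud-pubsub client roughly as follows; this is a
sketch under that assumption, not the project's actual helper, and the
project_id parameter is added only to keep the sketch self-contained:

from google.cloud import pubsub_v1

def publish_message_to_topic(message: str, topic: str, project_id: str) -> None:
    # Resolve the fully qualified topic path and publish the message as
    # UTF-8 bytes, blocking until the publish is acknowledged.
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic)
    future = publisher.publish(topic_path, data=message.encode("utf-8"))
    future.result()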