def _compose_catch_up_union() -> str:
    """
    Compose a UNION ALL clause and secondary query that extend the
    access log with items predating access logging, if the
    configuration contains BIGQUERY.CATCHUP_TABLE. Otherwise, return an
    empty string, which has no effect on any composed query.

    The REGEXP_REPLACE call reformats the object URL the same way the
    audit log formats resourceName: "gs://<bucket>/<object>" becomes
    "projects/_/buckets/<bucket>/objects/<object>".

    Returns:
        str -- The UNION ALL clause, or an empty string.
    """
    config = get_config()
    catchup_table_name = config.get("BIGQUERY", "CATCHUP_TABLE", fallback=None)
    if catchup_table_name:
        catchup_table = Table(catchup_table_name)
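        # "\\\\" in Python source is the 2-character string \\; substituted
        # into {0}1 and {0}2 below, it yields \\1 and \\2 in the SQL string
        # literal, which BigQuery unescapes to the backreferences \1 and \2.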
        return """
            UNION ALL
            SELECT
                REGEXP_REPLACE(url,"gs://(.*)/(.*)","projects/_/buckets/{0}1/objects/{0}2") AS resourceName,
                created AS timestamp
            FROM `{1}`
        """.format("\\\\", catchup_table.get_fully_qualified_name())
    return ""
Example #2
def warmup_command() -> None:
    """
    Evaluate objects in the audit log to see if they should be moved to a
    warmer storage class.
    """
    config = get_config()
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    rows_read = 0

    # Create temp table object. Doesn't need to be initialized, as the
    # query job will do that.
    temp_table = Table(
        config.get('BIGQUERY',
                   'TEMP_TABLE',
                   fallback='smart_archiver_temp_warmup'))

    # Register cleanup as shutdown hook
    def cleanup():
        # Flush any remaining output
        moved_output.flush()
        excluded_output.flush()
        # Delete temp table
        temp_table.drop()
        # Print statistics
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())

    register(cleanup)

    # Run query job
    job = run_query_job(compose_warmup_query(),
                        temp_table.get_fully_qualified_name())

    # evaluate, archive and record
    def archive_worker(row: Row) -> None:
        if should_warm_up(row):
            rewrite_object(row, 'STANDARD', moved_output, excluded_output)

    workers = config.getint('RUNTIME', 'WORKERS')
    size = config.getint('RUNTIME', 'WORK_QUEUE_SIZE') // 2
    with BoundedThreadPoolExecutor(max_workers=workers,
                                   queue_size=size) as executor:
        # get total rows in result, report it
        result = job.result()
        total_rows = result.total_rows
        percentage_reported = 0
        LOG.info("Total rows: %s", total_rows)
        # Start all worker threads
        for row in result:
            rows_read += 1
            executor.submit(archive_worker, row)
            # report progress at each new multiple of 10 percent
            percentage = int(rows_read / total_rows * 100)
            if percentage > percentage_reported and not percentage % 10:
                LOG.info("%s percent complete.", percentage)
                percentage_reported = percentage
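
BoundedThreadPoolExecutor is not part of the standard library. A minimal sketch of the pattern the name suggests, assuming a semaphore-bounded wrapper around concurrent.futures.ThreadPoolExecutor (the project's real implementation may differ):

from concurrent.futures import ThreadPoolExecutor
from threading import BoundedSemaphore

class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    """Sketch: submit() blocks once queue_size tasks are pending."""

    def __init__(self, max_workers=None, queue_size=1000):
        super().__init__(max_workers=max_workers)
        self._slots = BoundedSemaphore(queue_size)

    def submit(self, fn, *args, **kwargs):
        self._slots.acquire()  # block until a slot frees up
        try:
            future = super().submit(fn, *args, **kwargs)
        except Exception:
            self._slots.release()
            raise
        future.add_done_callback(lambda _: self._slots.release())
        return future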
Example #3
    def __init__(self, table: Table):
        self.config = get_config()
        self.lock = Lock()
        self.client = get_bq_client()
        self.rows = []
        self.tablename = table.get_fully_qualified_name()
        self.batch_size = int(
            self.config.get('BIGQUERY', 'BATCH_WRITE_SIZE', fallback=100))
        self.insert_count = 0
        table.initialize()
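
The constructor above implies a batched, thread-safe writer. A minimal sketch of companion methods consistent with it; only flush() and stats() appear in the surrounding examples, so put() and _flush_locked() are assumed names, and insert_rows_json is one plausible client call:

    def put(self, row: dict) -> None:
        # Assumed API: buffer a row, flushing once the batch fills up.
        with self.lock:
            self.rows.append(row)
            if len(self.rows) >= self.batch_size:
                self._flush_locked()

    def flush(self) -> None:
        with self.lock:
            self._flush_locked()

    def _flush_locked(self) -> None:
        # Caller must hold self.lock.
        if not self.rows:
            return
        errors = self.client.insert_rows_json(self.tablename, self.rows)
        if errors:
            LOG.error("BigQuery insert errors: %s", errors)
        self.insert_count += len(self.rows)
        self.rows = []

    def stats(self) -> str:
        return "{} rows inserted into {}.".format(self.insert_count,
                                                  self.tablename)
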
def evaluate_objects() -> None:
    """
    Evaluate objects in the audit log to see if they should be moved to a
    new storage class.
    """
    config = get_config()
    cold_storage_class = config.get('RULES', 'COLD_STORAGE_CLASS')
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    work_queue = Queue(maxsize=3000)

    # evaluate, archive and record
    def archive_worker():
        while True:
            row = work_queue.get()
            if row is None:
                # Sentinel value: mark it done and exit this worker.
                work_queue.task_done()
                break
            if should_warm_up(row):
                rewrite_object(row, 'STANDARD', moved_output, excluded_output)
            elif should_cool_down(row):
                rewrite_object(row, cold_storage_class, moved_output,
                               excluded_output)
            work_queue.task_done()

    # Start all worker threads
    worker_count = 32
    worker_threads = []
    for _ in range(worker_count):
        thread = Thread(target=archive_worker)
        thread.start()
        worker_threads.append(thread)

    # Create temp table object. Doesn't need to be initialized.
    temp_table = Table("smart_archiver_temp")

    rows_read = 0

    # Register cleanup as shutdown hook
    def cleanup():
        # Flush any remaining output
        moved_output.flush()
        excluded_output.flush()
        # Delete temp table
        temp_table.drop()
        # Print statistics
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())

    register(cleanup)

    # Run query job
    job = run_query_job(compose_access_query(),
                        temp_table.get_fully_qualified_name())
    # Enqueue all work
    for row in job.result():
        rows_read += 1
        work_queue.put(row)

    # wait for all of the row jobs to complete
    LOG.info("All work enqueued. Waiting for last jobs to complete.")
    work_queue.join()

    # shutdown workers, one sentinel per thread
    for _ in range(worker_count):
        work_queue.put(None)
    for thread in worker_threads:
        thread.join()
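
rewrite_object is defined elsewhere; presumably it changes the object's storage class via the Cloud Storage API and records the outcome in the two outputs. A minimal sketch of the storage-class change alone, under that assumption (the function name and signature here are illustrative, not the project's actual API):

from google.cloud import storage

def change_storage_class_sketch(bucket_name: str, object_name: str,
                                storage_class: str) -> None:
    """Illustrative only: move one object to a new storage class."""
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(object_name)
    # update_storage_class performs a server-side rewrite of the object.
    blob.update_storage_class(storage_class)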