def _compose_catch_up_union() -> str:
    """
    Compose a UNION ALL statement and secondary query to extend the access
    log with items that predate access logging, if the configuration
    contains BIGQUERY.CATCHUP_TABLE. Otherwise, return an empty string,
    having no effect on any composed queries.

    The REGEXP_REPLACE function serves to format the object URL the same
    way the audit log resourceName is formatted.

    Returns:
        str -- The UNION ALL statement, or empty string.
    """
    config = get_config()
    catchup_table_name = config.get("BIGQUERY", "CATCHUP_TABLE",
                                    fallback=None)
    if catchup_table_name:
        catchup_table = Table(catchup_table_name)
        # [^/]+ stops the bucket group at the first slash, so object names
        # that contain slashes aren't folded into the bucket name.
        return """
        UNION ALL
        SELECT
            REGEXP_REPLACE(url, "gs://([^/]+)/(.*)",
                "projects/_/buckets/{0}1/objects/{0}2") AS resourceName,
            created AS timestamp
        FROM `{1}`
        """.format("\\\\", catchup_table.get_fully_qualified_name())
    return ""
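# A quick, standalone check (not part of the module) of the URL rewrite
# above, using Python's re module as a stand-in for BigQuery's RE2-based
# REGEXP_REPLACE. The gs:// URL and the expected resourceName are
# illustrative values only.
import re

def _demo_catch_up_rewrite() -> None:
    url = "gs://my-bucket/path/to/object.txt"
    resource_name = re.sub(r"gs://([^/]+)/(.*)",
                           r"projects/_/buckets/\1/objects/\2", url)
    # Object names keep their internal slashes; only the bucket is split off.
    assert resource_name == \
        "projects/_/buckets/my-bucket/objects/path/to/object.txt"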
def warmup_command() -> None:
    """
    Evaluate objects in the audit log to see if they should be moved to a
    warmer storage class.
    """
    config = get_config()
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    rows_read = 0

    # Create temp table object. Doesn't need to be initialized, as the
    # query job will do that.
    temp_table = Table(
        config.get('BIGQUERY', 'TEMP_TABLE',
                   fallback='smart_archiver_temp_warmup'))

    # Register cleanup as a shutdown hook.
    def cleanup():
        # Flush any remaining output.
        moved_output.flush()
        excluded_output.flush()
        # Delete the temp table.
        temp_table.drop()
        # Print statistics.
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())

    register(cleanup)

    # Run the query job.
    job = run_query_job(compose_warmup_query(),
                        temp_table.get_fully_qualified_name())

    # Evaluate, archive and record a single row.
    def archive_worker(row: Row) -> None:
        if should_warm_up(row):
            rewrite_object(row, 'STANDARD', moved_output, excluded_output)

    workers = config.getint('RUNTIME', 'WORKERS')
    size = config.getint('RUNTIME', 'WORK_QUEUE_SIZE') // 2
    with BoundedThreadPoolExecutor(max_workers=workers,
                                   queue_size=size) as executor:
        # Get the total row count so progress can be reported.
        result = job.result()
        total_rows = result.total_rows
        percentage_reported = 0
        LOG.info("Total rows: %s", total_rows)

        # Submit one archive job per row; submit() blocks when the work
        # queue is full, so this loop can't outrun the workers.
        for row in result:
            rows_read += 1
            executor.submit(archive_worker, row)

            # Calculate the percentage and show it at each new 10% mark.
            percentage = int(rows_read / total_rows * 100)
            if percentage > percentage_reported and not percentage % 10:
                LOG.info("%s percent complete.", percentage)
                percentage_reported = percentage
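# BoundedThreadPoolExecutor isn't defined in this section. A minimal sketch,
# assuming it follows the common bounded-submit pattern: a ThreadPoolExecutor
# whose submit() blocks once queue_size tasks are pending, so the result loop
# above can't race ahead of the workers. The class name and constructor
# signature match the call site; the body is an assumption, not the
# project's code.
from concurrent.futures import ThreadPoolExecutor
from threading import BoundedSemaphore

class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    def __init__(self, max_workers: int, queue_size: int):
        super().__init__(max_workers=max_workers)
        # One slot per running or queued task.
        self._slots = BoundedSemaphore(max_workers + queue_size)

    def submit(self, fn, *args, **kwargs):
        # Block until a slot is free, then hand it back when the task ends.
        self._slots.acquire()
        try:
            future = super().submit(fn, *args, **kwargs)
        except Exception:
            self._slots.release()
            raise
        future.add_done_callback(lambda _: self._slots.release())
        return future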
def __init__(self, table: Table):
    self.config = get_config()
    self.lock = Lock()
    self.client = get_bq_client()
    self.rows = []
    self.tablename = table.get_fully_qualified_name()
    self.batch_size = self.config.getint('BIGQUERY', 'BATCH_WRITE_SIZE',
                                         fallback=100)
    self.insert_count = 0
    table.initialize()
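# A plausible sketch (an assumption, not the project's code) of the batching
# methods this initializer supports: put() buffers rows under the lock and a
# full batch is streamed with the BigQuery client's insert_rows_json().
# flush() and stats() are called elsewhere in this module, so they exist in
# the source; their real bodies may differ.
def put(self, row: dict) -> None:
    with self.lock:
        self.rows.append(row)
        if len(self.rows) >= self.batch_size:
            self._write_batch()

def flush(self) -> None:
    with self.lock:
        if self.rows:
            self._write_batch()

def _write_batch(self) -> None:
    # insert_rows_json streams the batch and returns per-row errors, if any.
    errors = self.client.insert_rows_json(self.tablename, self.rows)
    if errors:
        LOG.error("BigQuery insert errors: %s", errors)
    self.insert_count += len(self.rows)
    self.rows = []

def stats(self) -> str:
    return "{} rows inserted into {}.".format(self.insert_count,
                                              self.tablename)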
def evaluate_objects() -> None:
    """
    Evaluate objects in the audit log to see if they should be moved to a
    new storage class.
    """
    config = get_config()
    cold_storage_class = config.get('RULES', 'COLD_STORAGE_CLASS')
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    work_queue = Queue(maxsize=3000)
    worker_count = 32
    rows_read = 0

    # Evaluate, archive and record rows until the None sentinel arrives.
    def archive_worker():
        while True:
            row = work_queue.get()
            if row is None:
                break
            if should_warm_up(row):
                rewrite_object(row, 'STANDARD', moved_output,
                               excluded_output)
            elif should_cool_down(row):
                rewrite_object(row, cold_storage_class, moved_output,
                               excluded_output)
            work_queue.task_done()

    # Start all worker threads.
    worker_threads = []
    for _ in range(worker_count):
        thread = Thread(target=archive_worker)
        thread.start()
        worker_threads.append(thread)

    # Create temp table object. Doesn't need to be initialized.
    temp_table = Table("smart_archiver_temp")

    # Register cleanup as a shutdown hook.
    def cleanup():
        # Flush any remaining output.
        moved_output.flush()
        excluded_output.flush()
        # Delete the temp table.
        temp_table.drop()
        # Print statistics.
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())

    register(cleanup)

    # Run the query job.
    job = run_query_job(compose_access_query(),
                        temp_table.get_fully_qualified_name())

    # Enqueue all work.
    for row in job.result():
        rows_read += 1
        work_queue.put(row)

    # Wait for all of the row jobs to complete.
    LOG.info("All work enqueued. Waiting for last jobs to complete.")
    work_queue.join()

    # Shut down the workers with one None sentinel each.
    for _ in range(worker_count):
        work_queue.put(None)
    for thread in worker_threads:
        thread.join()
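# A hedged sketch of what rewrite_object might do, assuming the standard
# google-cloud-storage API. The function name suffix, the resourceName
# parsing, and the row shape are assumptions; the real function also records
# results to moved_output and excluded_output, which is omitted here.
from google.cloud import storage

def _rewrite_object_sketch(resource_name: str, storage_class: str) -> None:
    # resourceName looks like projects/_/buckets/BUCKET/objects/OBJECT;
    # maxsplit=5 keeps slashes inside the object name intact.
    _, _, _, bucket_name, _, blob_name = resource_name.split("/", 5)
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_name)
    # update_storage_class rewrites the object in place to the new class.
    blob.update_storage_class(storage_class)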