def listen_command() -> None:
    """Implementation of the listen command."""
    config = get_config()

    # Call this once to initialize the table.
    _ = BigQueryOutput(
        get_table(TableDefinitions.INVENTORY,
                  config.get("BIGQUERY", "INVENTORY_TABLE")))

    subscriber = pubsub.SubscriberClient()
    topic_name = 'projects/{}/topics/{}'.format(
        config.get("GCP", "PROJECT"),
        config.get("PUBSUB", "TOPIC_SHORT_NAME"))
    subscription_name = 'projects/{}/subscriptions/{}'.format(
        config.get("GCP", "PROJECT"),
        config.get("PUBSUB", "SUBSCRIPTION_SHORT_NAME"))

    LOG.info("Creating or adopting subscription {}.".format(subscription_name))
    try:
        subscriber.create_subscription(name=subscription_name,
                                       topic=topic_name,
                                       ack_deadline_seconds=60)
    except AlreadyExists:
        pass

    # The table was initialized above, so skip initialization here.
    output = BigQueryOutput(
        get_table(TableDefinitions.INVENTORY,
                  config.get("BIGQUERY", "INVENTORY_TABLE")), False)

    def handle(message):
        """Callback for handling new PubSub messages. Effectively, this just
        "partially applies" the output stream above to unpack_and_insert.
        """
        unpack_and_insert(output, message)

    def shutdown(sub_future: StreamingPullFuture) -> None:
        """Close subscriptions and flush rows to BQ."""
        LOG.info("Cancelling subscription pull.")
        sub_future.cancel()
        LOG.info("Flushing rows to BigQuery.")
        output.flush()

    LOG.info("Subscribing...")
    subscription_future = subscriber.subscribe(subscription_name, handle)
    atexit.register(shutdown, subscription_future)

    timeout = config.getint("PUBSUB", "TIMEOUT", fallback=10)
    with subscriber:
        while True:
            try:
                subscription_future.result(timeout=timeout)
            except TimeoutError:
                LOG.debug(
                    "No messages in {} seconds, flushing rows (if any).".
                    format(timeout))
                output.flush()
            except KeyboardInterrupt:
                # A bare except here would also swallow real errors from the
                # streaming pull; let those propagate and only quit cleanly
                # on an interrupt. The atexit handler flushes buffered rows.
                LOG.info("Quitting...")
                break
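# listen_command reads its wiring from the shared INI-style config handled
# by get_config(). A minimal sketch of the keys referenced above, with
# placeholder values (a real deployment's config may carry more options):
#
#     [GCP]
#     PROJECT = my-project
#
#     [PUBSUB]
#     TOPIC_SHORT_NAME = gcs-changes
#     SUBSCRIPTION_SHORT_NAME = gcs-changes-sub
#     TIMEOUT = 10
#
#     [BIGQUERY]
#     INVENTORY_TABLE = inventory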
def _compose_catch_up_union() -> str:
    """Compose a UNION ALL statement and secondary query to extend the
    access log with items that predate access logging, if the configuration
    contains BIGQUERY.CATCHUP_TABLE. Otherwise, return an empty string,
    having no effect on any composed queries.

    The REGEXP_REPLACE function serves to format the object URL the same way
    the audit log resourceName is formatted.

    Returns:
        str -- The UNION ALL statement, or empty string.
    """
    config = get_config()
    catchup_table_name = config.get("BIGQUERY", "CATCHUP_TABLE",
                                    fallback=None)
    if catchup_table_name:
        catchup_table = Table(catchup_table_name)
        # "{0}" renders as two backslash characters, so the query text
        # contains \\1 and \\2, which BigQuery reads as the regex
        # backreferences \1 and \2.
        return """
        UNION ALL
        SELECT
            REGEXP_REPLACE(url, "gs://(.*)/(.*)",
                "projects/_/buckets/{0}1/objects/{0}2") AS resourceName,
            created AS timestamp
        FROM `{1}`
        """.format("\\\\", catchup_table.get_fully_qualified_name())
    return ""
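# A quick illustration (standard library only) of the "\\\\" trick above:
# the format argument is a two-character run of backslashes, so the rendered
# query contains \\1 and \\2, which BigQuery's string-literal parsing in
# turn reduces to the backreferences \1 and \2:
#
#     >>> print('buckets/{0}1/objects/{0}2'.format("\\\\"))
#     buckets/\\1/objects/\\2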
def _get_cold_threshold_days() -> int:
    """Retrieve the cold threshold days from the configuration.

    Returns:
        int -- Cold threshold days.
    """
    config = get_config()
    return config.getint('RULES', 'COLD_THRESHOLD_DAYS')
def _get_warm_threshold_accesses() -> int:
    """Retrieve the warm threshold accesses from the configuration.

    Returns:
        int -- Warm threshold accesses.
    """
    config = get_config()
    return config.getint('RULES', 'WARM_THRESHOLD_ACCESSES')
def __init__(self, table: Table, create_table: bool = True):
    self.config = get_config()
    self.lock = Lock()
    self.rows = list()
    self.tablename = table.get_fully_qualified_name()
    self.batch_size = self.config.getint('BIGQUERY',
                                         'BATCH_WRITE_SIZE',
                                         fallback=100)
    self.insert_count = 0
    self.insert_bytes = 0
    if create_table:
        table.initialize()
def _calculate_day_partitions() -> int:
    """Calculate the daily partitions to query. This is the sum of how far
    you need to look back (COLD_THRESHOLD_DAYS) and how often you look
    (DAYS_BETWEEN_RUNS).

    Returns:
        int -- The sum of cold threshold days and days between runs.
    """
    config = get_config()
    return config.getint('RULES', 'COLD_THRESHOLD_DAYS') + \
        config.getint('RULES', 'DAYS_BETWEEN_RUNS')
def get_client(self) -> bigquery.Client:
    """Get a client.

    Returns:
        bigquery.Client -- A configured BQ client.
    """
    if not self.client:
        LOG.debug("Making new BQ client.")
        config = get_config()
        self.client = bigquery.Client(
            project=config.get('BIGQUERY',
                               'JOB_PROJECT',
                               fallback=config.get('GCP', 'PROJECT')))
    return self.client
def get_fully_qualified_name(self) -> str:
    """Return the table name with project and dataset names prefixed.

    Returns:
        str -- Fully qualified name of the table.
    """
    config = get_config()
    return "{}.{}.{}".format(
        config.get("BIGQUERY",
                   "JOB_PROJECT",
                   fallback=config.get("GCP", "PROJECT")),
        config.get("BIGQUERY", "DATASET_NAME"), self.short_name)
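# Hypothetical example: with BIGQUERY.JOB_PROJECT unset, GCP.PROJECT set to
# "my-project", and BIGQUERY.DATASET_NAME set to "my_dataset", a Table whose
# short_name is "inventory" resolves as:
#
#     >>> table.get_fully_qualified_name()
#     'my-project.my_dataset.inventory'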
def load_command(buckets: List[str] = None, prefix: str = None) -> None:
    """Implementation of the load command. This function dispatches each
    bucket listed into an executor thread for parallel processing of the
    bucket list.

    Keyword Arguments:
        buckets {List[str]} -- A list of buckets to use instead of the
            project-wide bucket listing. (default: {None})
        prefix {str} -- A prefix to use when listing. (default: {None})
    """
    config = get_config()
    gcs = get_gcs_client()

    # Call this once to initialize.
    _ = BigQueryOutput(
        get_table(TableDefinitions.INVENTORY,
                  config.get("BIGQUERY", "INVENTORY_TABLE")))

    # If buckets are given, get each bucket object; otherwise, list all
    # bucket objects in the project.
    if buckets:
        buckets = [gcs.get_bucket(x) for x in buckets]
    else:
        buckets = list(gcs.list_buckets())

    total_buckets = len(buckets)
    buckets_listed = 0
    bucket_blob_counts = dict()

    # Use at most 2 workers for this part, as there won't be many buckets.
    workers = min(config.getint('RUNTIME', 'WORKERS'), 2)
    size = int(config.getint('RUNTIME', 'WORK_QUEUE_SIZE') * .25)
    with BoundedThreadPoolExecutor(max_workers=workers,
                                   queue_size=size) as executor:
        for bucket in buckets:
            buckets_listed += 1
            executor.submit(bucket_lister, config, gcs, bucket, prefix,
                            buckets_listed, total_buckets,
                            bucket_blob_counts)

    LOG.info("Stats: \n\t%s", bucket_blob_counts)
    LOG.info("Total rows: \n\t%s", sum(bucket_blob_counts.values()))
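# Hypothetical invocations (bucket names are placeholders):
#
#     load_command()                                  # every bucket in the project
#     load_command(buckets=["logs", "backups"])       # just these buckets
#     load_command(buckets=["logs"], prefix="2020/")  # one bucket, under a prefix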
def get_client(self) -> storage.Client:
    """Get a client from the pool. Automatically makes new ones until the
    pool is full. Threadsafe.

    Returns:
        storage.Client -- A configured GCS client.
    """
    config = get_config()
    # Use the lock as a context manager so it is released even if client
    # construction raises.
    with self.lock:
        if len(self.clients) < self.pool_size:
            LOG.debug("Making new GCS client.")
            self.clients.append(
                storage.Client(
                    config.get('GCP',
                               'GCS_PROJECT',
                               fallback=config.get('GCP', 'PROJECT'))))
        client = self.clients[self.next_up]
        self.next_up += 1
        # Wrap after the last slot (>= pool_size, not pool_size - 1, which
        # would leave the final client in the pool forever unused).
        if self.next_up >= self.pool_size:
            self.next_up = 0
    return client
def unpack_and_insert(output: BigQueryOutput, message: Message) -> None:
    """Unpack a PubSub message regarding a GCS object change, and insert it
    into a BigQueryOutput.

    Args:
        output (BigQueryOutput): The output to use. In most cases, you will
            want to use a single output object per program.
        message (Message): The PubSub message.
    """
    bq_client = get_bq_client()
    config = get_config()
    table = get_table(TableDefinitions.INVENTORY,
                      config.get("BIGQUERY", "INVENTORY_TABLE"))
    table_name = table.get_fully_qualified_name()
    try:
        LOG.debug("Message data: \n---DATA---\n{}\n---DATA---".format(
            message.data))
        # Decode and deserialize
        message_string = bytes.decode(message.data, "UTF-8")
        object_info = json.loads(message_string)
        LOG.debug(message)
        LOG.debug(object_info)

        # Get important attributes
        event_type = message.attributes['eventType']
        publish_time = message.publish_time.isoformat()
        LOG.info("Got a message: {} {} {}".format(
            publish_time, event_type,
            object_info['bucket'] + "/" + object_info['name']))

        # For deletes, use the publish time to approximate deleted time
        if event_type == "OBJECT_DELETE":
            object_info["timeDeleted"] = publish_time

        # Convert the metadata map into a list of key/value records.
        if object_info.get("metadata"):
            object_info["metadata"] = [{
                "key": k,
                "value": v
            } for k, v in object_info["metadata"].items()]

        if event_type == "OBJECT_METADATA_UPDATE":
            # Metadata updates modify an existing object rather than
            # creating a new one, so update the row in place.

            def generate_structs(arr):
                """Render a list of key/value dicts as a BigQuery array of
                STRUCT literals."""
                res = '['
                for s in arr:
                    res += "STRUCT(\"{key}\" as key, \"{value}\" as value),".format(
                        key=s['key'], value=s['value'])
                res = res[:-1]  # drop the trailing comma
                res += ']'
                return res

            # The metadata map was already converted to a key/value list
            # above; calling .items() on it again would raise.
            querytext = ("UPDATE `{table_name}` "
                         "SET metadata = {new_metadata} "
                         "WHERE id = '{id}'").format(
                             table_name=table_name,
                             new_metadata=generate_structs(
                                 object_info["metadata"]),
                             id=object_info["id"])
            LOG.info("Running query: \n%s", querytext)
            query_job = bq_client.query(querytext)
            LOG.info(query_job.result())
        else:
            # Enqueue for writing
            output.put(object_info)
        message.ack()
    except Exception:
        # Catch Exception rather than using a bare except, which would also
        # trap KeyboardInterrupt and SystemExit.
        LOG.exception(
            "Error processing message! ---DATA---\n{}\n---DATA---".format(
                message.data))
        # TODO: A retry / DLQ policy would be useful, if not already present
        # by default.
        message.nack()
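# For reference, a sketch of the notification shape this callback consumes
# (values are placeholders; see the Cloud Storage Pub/Sub notification docs
# for the authoritative schema). message.attributes carries the event type,
# for example:
#
#     {"eventType": "OBJECT_METADATA_UPDATE", "bucketId": "my-bucket",
#      "objectId": "a.txt"}
#
# and message.data is the JSON-serialized object resource, roughly:
#
#     {"id": "my-bucket/a.txt/1556835845511221", "bucket": "my-bucket",
#      "name": "a.txt", "size": "1024", "metadata": {"color": "red"}}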