def handle_duplicate_notification(bkt: storage.Bucket,
                                  success_blob: storage.Blob, gsurl: str):
    """
    Need to handle potential duplicate Pub/Sub notifications.
    To achieve this we will drop an empty "claimed" file that indicates
    an invocation of this cloud function has picked up the success file
    with a certain creation timestamp. This will support republishing the
    success file as a mechanism of re-running the ingestion while avoiding
    duplicate ingestion due to multiple Pub/Sub messages for a success
    file with the same creation time.
    """
    success_blob.reload()
    success_created_unix_timestamp = success_blob.time_created.timestamp()

    claim_blob: storage.Blob = bkt.blob(
        success_blob.name.replace(
            SUCCESS_FILENAME, f"_claimed_{success_created_unix_timestamp}"))
    try:
        claim_blob.upload_from_string("", if_generation_match=0)
    except google.api_core.exceptions.PreconditionFailed as err:
        raise RuntimeError(
            f"The prefix {gsurl} appears to already have been claimed for "
            f"{gsurl}{SUCCESS_FILENAME} with created timestamp "
            f"{success_created_unix_timestamp}. "
            "This means that another invocation of this cloud function has "
            "claimed the ingestion of this batch. "
            "This may be due to a rare duplicate delivery of the Pub/Sub "
            "storage notification.") from err
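
# A minimal sketch of the claim-file pattern used above, assuming a writable
# bucket named "example-ingestion-bucket" and a hypothetical try_claim()
# helper (neither is part of the original code). With if_generation_match=0
# the upload succeeds only when the claim object does not exist yet, so
# exactly one caller wins and every later attempt raises PreconditionFailed.
import google.api_core.exceptions
from google.cloud import storage


def try_claim(bucket: storage.Bucket, claim_name: str) -> bool:
    """Return True if this caller created the claim object, False if it already existed."""
    claim = bucket.blob(claim_name)
    try:
        claim.upload_from_string("", if_generation_match=0)
        return True
    except google.api_core.exceptions.PreconditionFailed:
        return False


if __name__ == "__main__":
    client = storage.Client()
    bkt = client.bucket("example-ingestion-bucket")  # hypothetical bucket name
    print(try_claim(bkt, "prefix/_claimed_1700000000.0"))  # True for the first caller
    print(try_claim(bkt, "prefix/_claimed_1700000000.0"))  # False for any duplicate
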
def handle_duplicate_notification(
    gcs_client: storage.Client,
    blob_to_claim: storage.Blob,
):
    """
    Need to handle potential duplicate Pub/Sub notifications.
    To achieve this we will drop an empty "claimed" file that indicates
    an invocation of this cloud function has picked up the success file
    with a certain creation timestamp. This will support republishing the
    success file as a mechanism of re-running the ingestion while avoiding
    duplicate ingestion due to multiple Pub/Sub messages for a success
    file with the same creation time.
    """
    blob_to_claim.reload(client=gcs_client)
    created_unix_timestamp = blob_to_claim.time_created.timestamp()

    basename = os.path.basename(blob_to_claim.name)
    claim_blob: storage.Blob = blob_to_claim.bucket.blob(
        blob_to_claim.name.replace(
            basename, f"_claimed_{basename}_created_at_"
            f"{created_unix_timestamp}"))
    try:
        # if_generation_match=0 makes the upload succeed only if the claim
        # object does not exist yet, so exactly one invocation wins the claim.
        claim_blob.upload_from_string("",
                                      if_generation_match=0,
                                      client=gcs_client)
    except google.api_core.exceptions.PreconditionFailed as err:
        blob_to_claim.reload(client=gcs_client)
        raise exceptions.DuplicateNotificationException(
            f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears "
            "to already have been claimed for created timestamp: "
            f"{created_unix_timestamp}. "
            "This means that another invocation of this cloud function has "
            "claimed the work to be done for this file. "
            "This may be due to a rare duplicate delivery of the Pub/Sub "
            "storage notification.") from err
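
# Hedged usage sketch for the second variant above: a hypothetical background
# Cloud Function entry point, triggered by the Pub/Sub notifications that
# Cloud Storage publishes, which claims the notified blob before doing any
# work. The handler name main() and the log-only "ingestion" step are
# assumptions for illustration; handle_duplicate_notification and
# exceptions.DuplicateNotificationException are the pieces defined above.
import logging

from google.cloud import storage

GCS_CLIENT = storage.Client()


def main(event: dict, _context) -> None:
    # Cloud Storage Pub/Sub notifications carry the bucket and object names
    # in the message attributes as bucketId / objectId.
    attributes = event["attributes"]
    blob = GCS_CLIENT.bucket(attributes["bucketId"]).blob(attributes["objectId"])
    try:
        handle_duplicate_notification(GCS_CLIENT, blob)
    except exceptions.DuplicateNotificationException:
        logging.info("Duplicate notification for %s; nothing to do.", blob.name)
        return
    logging.info("Claimed %s; ingestion would start here.", blob.name)
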
def copy_index(index_folder_path: str, build_index_blob: Blob, build_index_generation: str,
               production_bucket: Bucket, build_bucket: Bucket, storage_base_path: str,
               build_bucket_base_path: str):
    """ Copies the build bucket index to the production bucket index path.

    Args:
        index_folder_path (str): index folder full path.
        build_index_blob (Blob): google cloud storage object that represents build index.zip blob.
        build_index_generation (str): downloaded build index generation.
        production_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where index is copied to.
        build_bucket (google.cloud.storage.bucket.Bucket): gcs bucket where index is copied from.
        storage_base_path (str): the path to upload the index to.
        build_bucket_base_path (str): the path in the build bucket of the index.
    """
    try:
        build_index_blob.reload()
        build_current_index_generation = build_index_blob.generation

        # disabling caching for prod index blob
        prod_index_storage_path = os.path.join(storage_base_path, f"{GCPConfig.INDEX_NAME}.zip")
        prod_index_blob = production_bucket.blob(prod_index_storage_path)
        prod_index_blob.cache_control = "no-cache,max-age=0"
        prod_index_json_storage_path = os.path.join(storage_base_path, f"{GCPConfig.INDEX_NAME}.json")
        prod_index_json_blob = production_bucket.blob(prod_index_json_storage_path)
        prod_index_json_blob.cache_control = "no-cache,max-age=0"

        if build_current_index_generation == build_index_generation:
            copied_index = build_bucket.copy_blob(
                blob=build_index_blob, destination_bucket=production_bucket, new_name=prod_index_storage_path
            )
            if copied_index.exists():
                logging.success(f"Finished uploading {GCPConfig.INDEX_NAME}.zip to storage.")
            else:
                logging.error("Failed copying index.zip from build index - blob does not exist.")
                sys.exit(1)

            copied_index_json_blob = build_bucket.blob(
                os.path.join(build_bucket_base_path, f"{GCPConfig.INDEX_NAME}.json")
            )
            copied_index_json = build_bucket.copy_blob(
                blob=copied_index_json_blob, destination_bucket=production_bucket,
                new_name=prod_index_json_storage_path
            )
            if copied_index_json.exists():
                logging.success(f"Finished uploading {GCPConfig.INDEX_NAME}.json to storage.")
            else:
                logging.error("Failed copying index.json from build index - blob does not exist.")
                sys.exit(1)
        else:
            logging.error(f"Failed in uploading {GCPConfig.INDEX_NAME}, mismatch in index file generation")
            logging.error(f"Downloaded build index generation: {build_index_generation}")
            logging.error(f"Current build index generation: {build_current_index_generation}")
            sys.exit(1)
    except Exception as e:
        logging.exception(f"Failed copying {GCPConfig.INDEX_NAME}. Additional Info: {str(e)}")
        sys.exit(1)
    finally:
        shutil.rmtree(index_folder_path)
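
# Hedged usage sketch for copy_index() above. All concrete values (bucket
# names, paths, and the downloaded generation) are hypothetical; in the real
# flow the generation would be captured when the build index was downloaded,
# and a mismatch at copy time aborts the promotion.
from google.cloud import storage

client = storage.Client()
build_bucket = client.bucket("marketplace-ci-build")    # hypothetical build bucket
production_bucket = client.bucket("marketplace-dist")   # hypothetical production bucket

build_index_blob = build_bucket.blob(
    f"content/builds/1234/index/{GCPConfig.INDEX_NAME}.zip")  # hypothetical build path
build_index_blob.reload()

copy_index(
    index_folder_path="/tmp/index",                      # local working folder, removed afterwards
    build_index_blob=build_index_blob,
    build_index_generation=build_index_blob.generation,  # generation captured at download time
    production_bucket=production_bucket,
    build_bucket=build_bucket,
    storage_base_path="content/packs",                   # hypothetical production index path
    build_bucket_base_path="content/builds/1234/index",  # hypothetical build index path
)
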
def up_to_date(input_blob: storage.Blob, output_blob: storage.Blob):
    """
    Checks if the blob is up-to-date.

    :param input_blob: the source blob that the output is derived from.
    :param output_blob: the derived blob to check.
    :return: true if the output blob is up-to-date. If the blob doesn't exist or is
        outdated, returns false.
    """
    if not output_blob.exists():
        return False
    input_blob.reload()
    output_blob.reload()
    assert input_blob.updated is not None, 'input blob should exist'
    if input_blob.updated > output_blob.updated:
        return False
    return True
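
# Hedged sketch of how up_to_date() above might gate a regeneration step:
# only rebuild the output object when the input object has changed since the
# output was last written. The bucket and object names are hypothetical, and
# the "rebuild" here just copies the bytes through; a real pipeline would
# transform them before uploading.
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("example-pipeline-bucket")   # hypothetical bucket

input_blob = bucket.blob("raw/data.csv")            # hypothetical source object
output_blob = bucket.blob("derived/data.copy.csv")  # hypothetical derived object

if not up_to_date(input_blob, output_blob):
    output_blob.upload_from_string(input_blob.download_as_bytes())
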
def _assert_file_uploaded(self, blob: storage.Blob, sleep_time: float, max_sleep_time: float):
    # Poll with exponential backoff until the blob carries an "export_completed"
    # metadata flag, or fail once the next wait would exceed max_sleep_time.
    if sleep_time > max_sleep_time:
        raise UploadPollingException(
            f'Could not verify completed upload for blob {blob.name} within maximum '
            f'wait time of {str(max_sleep_time)} seconds')
    else:
        sleep(sleep_time)
        blob.reload()
        export_completed = blob.metadata is not None and blob.metadata.get(
            "export_completed")
        if export_completed:
            return
        else:
            new_sleep_time = sleep_time * 2
            self.logger.info(
                f'Verifying upload of blob {blob.name}. Waiting for {str(new_sleep_time)} seconds...'
            )
            return self._assert_file_uploaded(blob, new_sleep_time, max_sleep_time)
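
# Hedged sketch of the producer side that the poll above waits on: once the
# export job finishes, the writer stamps an "export_completed" flag into the
# blob's metadata, which _assert_file_uploaded() detects via blob.reload().
# The bucket and object names are hypothetical.
from google.cloud import storage

client = storage.Client()
blob = client.bucket("example-export-bucket").blob("exports/report.json")  # hypothetical
blob.metadata = {"export_completed": "true"}
blob.patch()  # persist the metadata so a concurrent poller's reload() sees it
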