CloudDataCatalogGetTagTemplateOperator, CloudDataCatalogListTagsOperator, CloudDataCatalogLookupEntryOperator, CloudDataCatalogRenameTagTemplateFieldOperator, CloudDataCatalogSearchCatalogOperator, CloudDataCatalogUpdateEntryOperator, CloudDataCatalogUpdateTagOperator, CloudDataCatalogUpdateTagTemplateFieldOperator, CloudDataCatalogUpdateTagTemplateOperator, ) TEST_PROJECT_ID: str = "example_id" TEST_LOCATION: str = "en-west-3" TEST_ENTRY_ID: str = "test-entry-id" TEST_TAG_ID: str = "test-tag-id" TEST_RETRY: Retry = Retry() TEST_TIMEOUT: float = 0.5 TEST_METADATA: Sequence[Tuple[str, str]] = [] TEST_GCP_CONN_ID: str = "test-gcp-conn-id" TEST_IMPERSONATION_CHAIN: Sequence[str] = ["ACCOUNT_1", "ACCOUNT_2", "ACCOUNT_3"] TEST_ENTRY_GROUP_ID: str = "test-entry-group-id" TEST_TAG_TEMPLATE_ID: str = "test-tag-template-id" TEST_TAG_TEMPLATE_FIELD_ID: str = "test-tag-template-field-id" TEST_TAG_TEMPLATE_NAME: str = "test-tag-template-field-name" TEST_FORCE: bool = False TEST_READ_MASK: Dict = {"fields": ["name"]} TEST_RESOURCE: str = "test-resource" TEST_OPTIONS_: Dict = {} TEST_PAGE_SIZE: int = 50 TEST_LINKED_RESOURCE: str = "test-linked-resource" TEST_SQL_RESOURCE: str = "test-sql-resource"
def upload_tiff_and_json_files(logger, filepaths_to_upload, bucket, stats, uncompressed_blob_prefix, extraction_path):
    """Walk ``extraction_path`` and upload selected TIFF and JSON files to ``bucket``.

    One producer task enqueues ``(blob_name, filepath, content_type)`` tuples
    on ``filepaths_to_upload`` while ``NUM_WORKERS`` (environment variable,
    default 3) consumer tasks upload them concurrently.

    Args:
        logger: Logger used for progress and error reporting.
        filepaths_to_upload: Queue-like object providing ``get(timeout=...)``
            and ``put(item)``.
        bucket: Object providing ``blob(name)``; the returned blob must
            provide ``upload_from_filename``.
        stats: Mutable mapping with the integer keys ``'num_files_uploaded'``
            and ``'checkpoint'``; both are updated in place by the workers.
        uncompressed_blob_prefix: Prefix under which blob names are built.
        extraction_path: Directory whose immediate subdirectories are scanned.
    """
    google_retry = Retry(deadline=480, maximum=240)

    def on_google_retry_error(ex: Exception):
        logger.error("Exception when uploading blob to google cloud.")
        logger.exception(ex)

    def google_cloud_uploader():
        start = time.time()
        # A worker ends by letting the queue's timeout exception propagate;
        # the completion loop below treats ``Empty`` as a normal shutdown.
        blob_name, filepath, content_type = filepaths_to_upload.get(timeout=30)
        while True:
            blob = bucket.blob(blob_name)
            # Wrap the bound method itself. Passing the *result* of the call
            # (as the previous code did) performs the upload once, eagerly,
            # and never retries it.
            upload_with_retry = google_retry(blob.upload_from_filename, on_error=on_google_retry_error)
            try:
                upload_with_retry(filepath, content_type=content_type)
            except Exception as ex:
                logger.error("Uncaught exception when uploading blob to google cloud.")
                logger.exception(ex)
                # Put the item back so another worker can pick it up.
                filepaths_to_upload.put((blob.name, filepath, content_type))
                raise

            stats['num_files_uploaded'] += 1
            if stats['num_files_uploaded'] > stats['checkpoint']:
                elapsed = (time.time() - start) / 60
                logger.info(
                    f"Uploaded {stats['num_files_uploaded']} files in {elapsed} minutes, {stats['num_files_uploaded'] / elapsed} files per minute.")
                stats['checkpoint'] += 1000
            blob_name, filepath, content_type = filepaths_to_upload.get(timeout=5)

    def traverse_directory():
        for subdir_name in os.listdir(extraction_path):
            subdir_path = extraction_path + "/" + subdir_name
            if not os.path.isdir(subdir_path):
                continue
            for filename in os.listdir(subdir_path):
                if filename.startswith("._"):
                    continue
                split = filename.rsplit(".")
                filepath = subdir_path + "/" + filename
                # ``len(split) > 1`` guards ``split[-2]`` for a file literally named "tif".
                if split[-1] == "tif" and len(split) > 1 and split[-2].endswith(("B02", "B03", "B04")):
                    # multiple tiff files per subdirectory
                    content_type = "image/tiff"
                    blob_name: str = os.path.join(uncompressed_blob_prefix, "tiff", subdir_name, filename)
                    filepaths_to_upload.put((blob_name, filepath, content_type))
                elif split[-1] == "json":
                    # one json file per subdirectory
                    # Set explicitly: previously this branch reused whatever the
                    # last TIFF left behind, or failed if no TIFF came first.
                    content_type = "application/json"
                    blob_name: str = os.path.join(uncompressed_blob_prefix, "json_metadata", filename)
                    filepaths_to_upload.put((blob_name, filepath, content_type))

    num_workers = int(os.environ.get("NUM_WORKERS", 3))
    # One extra slot so the directory traversal runs alongside the uploaders.
    with ThreadPoolExecutor(max_workers=num_workers + 1) as executor:
        tasks: List[Future] = [executor.submit(google_cloud_uploader) for _ in range(num_workers)]
        tasks.append(executor.submit(traverse_directory))
        logger.info(f"Started {len(tasks)} worker tasks.")
        logger.info("Starting traverse_directory")
        for task in as_completed(tasks):
            error = task.exception()
            if error is None:
                continue
            if isinstance(error, Empty):
                logger.info("Child thread completed")
            else:
                # ``exc_info`` attaches the worker's traceback; ``logger.exception``
                # would find no active exception outside an ``except`` block.
                logger.error("Child thread failed", exc_info=error)
    logger.info("Ending job")
gcs_client = storage.Client()
# Both values are None when the corresponding environment variable is unset.
bucket_name: str = os.environ.get("GCS_BUCKET_NAME")
disk_path: str = os.environ.get("DISK_PATH")

logger = logging.Logger(name='logger', level=logging.INFO)
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)


def on_google_retry_error(ex: Exception):
    """Log an error passed to the retry wrapper's ``on_error`` hook."""
    logger.error("Exception when uploading blob to google cloud.")
    logger.exception(ex)


fs = gcsfs.GCSFileSystem(project='big_earth')
google_retry = Retry(deadline=480, maximum=240)
image_paths = queue.Queue()
# Running totals updated by get_image_sum_from_gcs.
stats = {
    "pixel_sum": 0,
    "num_images": 0,
}


def get_image_sum_from_gcs():
    """Read one queued image path and add its pixel sum to ``stats``.

    Takes the next path from ``image_paths`` (waiting up to 30 seconds, after
    which the queue's timeout exception propagates), reads the object through
    ``fs.cat`` wrapped in ``google_retry``, decodes it as TIFF and updates
    ``stats['pixel_sum']`` and ``stats['num_images']``.
    """
    image_path = image_paths.get(timeout=30)
    # Wrap the callable, then invoke the wrapper. Wrapping the result of
    # ``fs.cat(image_path)`` made ``r`` a function instead of the file bytes.
    # NOTE(review): Retry's default predicate targets google.api_core errors;
    # confirm it matches what gcsfs actually raises.
    r = google_retry(fs.cat, on_error=on_google_retry_error)(image_path)
    img = imageio.core.asarray(imageio.imread(r, 'TIFF'))
    stats['pixel_sum'] += img.sum()
    stats['num_images'] += 1
def test_list_documents_w_retry_timeout():
    """Run ``_list_documents_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    _list_documents_helper(retry=Retry(predicate=object()), timeout=123.0)
async def test_get_all_w_retry_timeout(self):
    """Await ``_get_all_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    call_options = {"retry": Retry(predicate=object()), "timeout": 123.0}
    await self._get_all_helper(**call_options)
def test_collections_w_retry_timeout(self):
    """Run ``_collections_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    custom_retry = Retry(predicate=object())
    self._collections_helper(retry=custom_retry, timeout=123.0)
def test_execute_update_w_timeout_and_retry_params(self):
    """Run ``_execute_update_helper`` with both a retry object and a timeout."""
    retry_policy = Retry(deadline=60)
    self._execute_update_helper(retry=retry_policy, timeout=2.0)
def _transient_string_in_exception_message(exc): # type: (Exception) -> bool """Determines whether an exception's message contains a common message for transient errors. The exception's message containing one of these substrings is sufficient to determine that it is transient, but there can be transient exceptions whose messages do not contain these substrings. """ return ('The job encountered an internal error during execution' in str(exc) or 'Retrying the job may solve the problem' in str(exc)) # Retry object for errors encountered in making API calls (executing jobs, etc.) DEFAULT_RETRY_FOR_API_CALLS = Retry( # The predicate takes an exception and returns whether it is transient. predicate=lambda exc: (bq_retry.DEFAULT_RETRY._predicate(exc) or _transient_string_in_exception_message(exc)), deadline=DEFAULT_TIMEOUT_SEC) # Retry object for errors encountered while polling jobs in progress. # See https://github.com/googleapis/google-cloud-python/issues/6301 DEFAULT_RETRY_FOR_ASYNC_JOBS = Retry( # The predicate takes an exception and returns whether it is transient. predicate=lambda exc: (polling.DEFAULT_RETRY._predicate(exc) or _transient_string_in_exception_message(exc)), deadline=DEFAULT_TIMEOUT_SEC) class BigqueryBaseClient(object): """Stores credentials and pointers to a BigQuery project.
def test_get_w_document_ref_w_retry_timeout(self):
    """Run ``_get_w_document_ref_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    helper_kwargs = dict(retry=Retry(predicate=object()), timeout=123.0)
    self._get_w_document_ref_helper(**helper_kwargs)
def test_execute_update_w_retry_param(self):
    """Run ``_execute_update_helper`` with only a retry object supplied."""
    retry_policy = Retry(deadline=60)
    self._execute_update_helper(retry=retry_policy)
from google.cloud.bigtable.row import ConditionalRow
from google.cloud.bigtable.row import DirectRow
from google.cloud.bigtable.row_data import PartialRowsData
from grpc import StatusCode

# Maximum number of mutations in bulk (MutateRowsRequest message):
# (https://cloud.google.com/bigtable/docs/reference/data/rpc/
#  google.bigtable.v2#google.bigtable.v2.MutateRowRequest)
_MAX_BULK_MUTATIONS = 100000

DEFAULT_RETRY = Retry(
    # ``if_exception_type`` is variadic, so the types are passed directly
    # rather than wrapped in an extra tuple.
    predicate=if_exception_type(Aborted, DeadlineExceeded, ServiceUnavailable),
    initial=1.0,
    maximum=15.0,
    multiplier=2.0,
    deadline=120.0,  # 2 minutes
)
"""The default retry strategy to be used on retry-able errors.

Used by :meth:`~google.cloud.bigtable.table.Table.mutate_rows`.
"""


class TableMismatchError(ValueError):
    """Row from another table."""
async def test_asynctransaction_get_w_document_ref_w_retry_timeout():
    """Await ``_get_w_document_ref_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    await _get_w_document_ref_helper(retry=Retry(predicate=object()), timeout=123.0)
from mediawords.util.log import create_logger from mediawords.workflow.exceptions import McProgrammingError from .config import GCAuthConfig from .transcript import Transcript, UtteranceAlternative, Utterance from .media_info import MediaFileInfoAudioStream log = create_logger(__name__) # Speech API sometimes throws: # # google.api_core.exceptions.ServiceUnavailable: 503 failed to connect to all addresses # # so let it retry for 10 minutes or so. _GOOGLE_API_RETRIES = Retry(initial=5, maximum=60, multiplier=2, deadline=60 * 10) """Google Cloud API's own retry policy.""" def submit_transcribe_operation(gs_uri: str, episode_metadata: MediaFileInfoAudioStream, bcp47_language_code: str, gc_auth_config: Optional[GCAuthConfig] = None) -> str: """ Submit a Speech API long running operation to transcribe a podcast episode. :param gs_uri: Google Cloud Storage URI to a transcoded episode. :param episode_metadata: Metadata derived from the episode while transcoding it. :param bcp47_language_code: Episode's BCP 47 language code guessed from story's title + description. :param gc_auth_config: Google Cloud authentication configuration instance. :return Google Speech API operation ID by which the transcription operation can be referred to.
# [START howto_operator_vision_detect_image_param] DETECT_IMAGE = {"source": {"image_uri": GCP_VISION_ANNOTATE_IMAGE_URL}} # [END howto_operator_vision_detect_image_param] with models.DAG('example_gcp_vision_autogenerated_id', start_date=days_ago(1), schedule_interval=None) as dag_autogenerated_id: # ################################## # # ### Autogenerated IDs examples ### # # ################################## # # [START howto_operator_vision_product_set_create] product_set_create = CloudVisionCreateProductSetOperator( location=GCP_VISION_LOCATION, product_set=product_set, retry=Retry(maximum=10.0), timeout=5, task_id='product_set_create', ) # [END howto_operator_vision_product_set_create] # [START howto_operator_vision_product_set_get] product_set_get = CloudVisionGetProductSetOperator( location=GCP_VISION_LOCATION, product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", task_id='product_set_get', ) # [END howto_operator_vision_product_set_get] # [START howto_operator_vision_product_set_update] product_set_update = CloudVisionUpdateProductSetOperator(
def delete_table(self, mode="staging", bucket_name=None, not_found_ok=False):
    """Delete every blob of this table from storage, sending requests in batches.

    Each batch is attempted up to 100 times with a 5 second pause between
    attempts. A batch that still fails is reported on stderr and skipped;
    no exception is raised for it.

    Args:
        mode (str): Folder of which dataset to update
            [raw|staging|header|auxiliary_files|architecture].
            Defaults to "staging".
        bucket_name (str): The bucket name from which to delete the table.
            If None, defaults to the bucket initialized when instantiating
            the Storage object (you can check it with the Storage().bucket
            property).
        not_found_ok (bool): Optional. When true, return quietly if no blob
            exists under the table prefix instead of raising.

    Raises:
        FileNotFoundError: If no blob exists under the table prefix and
            ``not_found_ok`` is false.
    """
    prefix = f"{mode}/{self.dataset_id}/{self.table_id}/"

    if bucket_name is not None:
        source_bucket = self.client["storage_staging"].bucket(str(bucket_name))
    else:
        source_bucket = self.bucket
    table_blobs = list(source_bucket.list_blobs(prefix=prefix))

    if not table_blobs:
        if not_found_ok:
            return
        raise FileNotFoundError(
            f"Could not find the requested table {self.dataset_id}.{self.table_id}"
        )

    # Divides table_blobs list for maximum batch request size
    table_blobs_chunks = [
        table_blobs[start:start + 999]
        for start in range(0, len(table_blobs), 999)
    ]
    retry = Retry(predicate=_is_retryable)

    for i, source_table in enumerate(
            tqdm(table_blobs_chunks, desc="Delete Table Chunk")):
        for attempt in range(100):
            try:
                with self.client["storage_staging"].batch():
                    for blob in source_table:
                        blob.delete(retry=retry)
                break
            except Exception:
                print(
                    f"Delete Table Chunk {i} | Attempt {attempt}: delete operation starts again in 5 seconds...",
                )
                # Show the cause before pausing rather than after.
                traceback.print_exc(file=sys.stderr)
                time.sleep(5)
        else:
            # Every attempt failed: say so instead of moving on silently.
            print(
                f"Delete Table Chunk {i}: giving up after 100 failed attempts.",
                file=sys.stderr,
            )
# pylint: enable=ungrouped-imports SPANNER_DATA_SCOPE = "https://www.googleapis.com/auth/spanner.data" _DATABASE_NAME_RE = re.compile( r"^projects/(?P<project>[^/]+)/" r"instances/(?P<instance_id>[a-z][-a-z0-9]*)/" r"databases/(?P<database_id>[a-z][a-z0-9_\-]*[a-z0-9])$" ) _DATABASE_METADATA_FILTER = "name:{0}/operations/" DEFAULT_RETRY_BACKOFF = Retry(initial=0.02, maximum=32, multiplier=1.3) class Database(object): """Representation of a Cloud Spanner Database. We can use a :class:`Database` to: * :meth:`create` the database * :meth:`reload` the database * :meth:`update` the database * :meth:`drop` the database :type database_id: str :param database_id: The ID of the database.
def copy_table(
    self,
    source_bucket_name="basedosdados",
    destination_bucket_name=None,
    mode="staging",
):
    """Copy this table's blobs from a source bucket, sending requests in batches.

    Each batch is attempted up to 100 times with a 5 second pause between
    attempts. A batch that still fails is reported on stderr and skipped;
    no exception is raised for it.

    Args:
        source_bucket_name (str): The bucket name from which to copy data.
            You can change it to copy from another external bucket.
        destination_bucket_name (str): Optional. The bucket name where data
            will be copied to. If None, defaults to the bucket initialized
            when instantiating the Storage object (you can check it with the
            Storage().bucket property).
        mode (str): Folder of which dataset to update
            [raw|staging|header|auxiliary_files|architecture].
            Defaults to "staging".

    Raises:
        FileNotFoundError: If the source bucket holds no blob under the
            table prefix.
    """
    storage_client = self.client["storage_staging"]
    source_table_ref = list(
        storage_client.bucket(source_bucket_name).list_blobs(
            prefix=f"{mode}/{self.dataset_id}/{self.table_id}/"))

    if not source_table_ref:
        raise FileNotFoundError(
            f"Could not find the requested table {self.dataset_id}.{self.table_id}"
        )

    if destination_bucket_name is None:
        destination_bucket = self.bucket
    else:
        destination_bucket = self.client["storage_staging"].bucket(
            destination_bucket_name)

    # Divides source_table_ref list for maximum batch request size
    source_table_ref_chunks = [
        source_table_ref[start:start + 999]
        for start in range(0, len(source_table_ref), 999)
    ]
    retry = Retry(predicate=_is_retryable)

    for i, source_table in enumerate(
            tqdm(source_table_ref_chunks, desc="Copy Table Chunk")):
        for attempt in range(100):
            try:
                with self.client["storage_staging"].batch():
                    for blob in source_table:
                        self.bucket.copy_blob(
                            blob,
                            destination_bucket=destination_bucket,
                            retry=retry,
                        )
                break
            except Exception:
                print(
                    f"Copy Table Chunk {i} | Attempt {attempt}: copy operation starts again in 5 seconds...",
                )
                # Show the cause before pausing rather than after.
                traceback.print_exc(file=sys.stderr)
                time.sleep(5)
        else:
            # Every attempt failed: say so instead of moving on silently.
            print(
                f"Copy Table Chunk {i}: giving up after 100 failed attempts.",
                file=sys.stderr,
            )
async def test_asynccollectionreference_list_documents_w_retry_timeout():
    """Await ``_list_documents_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    call_options = {"retry": Retry(predicate=object()), "timeout": 123.0}
    await _list_documents_helper(**call_options)
# [START howto_operator_vision_detect_image_param] DETECT_IMAGE = {"source": {"image_uri": GCP_VISION_ANNOTATE_IMAGE_URL}} # [END howto_operator_vision_detect_image_param] with models.DAG( 'example_gcp_vision_autogenerated_id', default_args=default_args, schedule_interval=None ) as dag_autogenerated_id: # ################################## # # ### Autogenerated IDs examples ### # # ################################## # # [START howto_operator_vision_product_set_create] product_set_create = CloudVisionCreateProductSetOperator( location=GCP_VISION_LOCATION, product_set=product_set, retry=Retry(maximum=10.0), timeout=5, task_id='product_set_create', ) # [END howto_operator_vision_product_set_create] # [START howto_operator_vision_product_set_get] product_set_get = CloudVisionGetProductSetOperator( location=GCP_VISION_LOCATION, product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", task_id='product_set_get', ) # [END howto_operator_vision_product_set_get] # [START howto_operator_vision_product_set_update] product_set_update = CloudVisionUpdateProductSetOperator(
from airflow import version from airflow.exceptions import AirflowException from airflow.providers.google.cloud.hooks.cloud_memorystore import CloudMemorystoreHook from tests.providers.google.cloud.utils.base_gcp_mock import ( GCP_PROJECT_ID_HOOK_UNIT_TEST, mock_base_gcp_hook_default_project_id, mock_base_gcp_hook_no_default_project_id, ) TEST_GCP_CONN_ID = "test-gcp-conn-id" # type: str TEST_DELEGATE_TO = "test-delegate-to" # type: str TEST_LOCATION = "test-location" # type: str TEST_INSTANCE_ID = "test-instance-id" # type: str TEST_PROJECT_ID = "test-project-id" # type: str TEST_RETRY = Retry() # type: Retry TEST_TIMEOUT = 10 # type: float TEST_METADATA = [("KEY", "VALUE")] # type: Sequence[Tuple[str, str]] TEST_PAGE_SIZE = 100 # type: int TEST_UPDATE_MASK = {"paths": ["memory_size_gb"]} # type: Dict TEST_PARENT = "projects/test-project-id/locations/test-location" # type: str TEST_NAME = "projects/test-project-id/locations/test-location/instances/test-instance-id" # type: str TEST_PARENT_DEFAULT_PROJECT_ID = "projects/{}/locations/test-location".format( GCP_PROJECT_ID_HOOK_UNIT_TEST) # type: str TEST_NAME_DEFAULT_PROJECT_ID = "projects/{}/locations/test-location/instances/test-instance-id".format( GCP_PROJECT_ID_HOOK_UNIT_TEST) # type: str class TestCloudMemorystoreWithDefaultProjectIdHook(TestCase): def setUp(self, ): with mock.patch(
def test_documentreference_delete_w_retry_timeout():
    """Run ``_delete_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    custom_retry = Retry(predicate=object())
    _delete_helper(retry=custom_retry, timeout=123.0)
# Maximum number of mutations in bulk (MutateRowsRequest message):
# (https://cloud.google.com/bigtable/docs/reference/data/rpc/
#  google.bigtable.v2#google.bigtable.v2.MutateRowRequest)
_MAX_BULK_MUTATIONS = 100000
VIEW_NAME_ONLY = enums.Table.View.NAME_ONLY


class _BigtableRetryableError(Exception):
    """Retry-able error expected by the default retry strategy."""


DEFAULT_RETRY = Retry(
    # Only the module's own marker exception triggers a retry.
    predicate=if_exception_type(_BigtableRetryableError),
    initial=1.0,
    maximum=15.0,
    multiplier=2.0,
    deadline=120.0,  # 2 minutes
)
"""The default retry strategy to be used on retry-able errors.

Used by :meth:`~google.cloud.bigtable.table.Table.mutate_rows`.
"""


class TableMismatchError(ValueError):
    """Row from another table."""


class TooManyMutationsError(ValueError):
    """The number of mutations for bulk request is too big."""
def test_client_get_all_w_retry_timeout():
    """Run ``_get_all_helper`` with an explicit retry object and timeout."""
    from google.api_core.retry import Retry

    helper_kwargs = dict(retry=Retry(predicate=object()), timeout=123.0)
    _get_all_helper(**helper_kwargs)