def get_gcp_service_account_credentials(gcp_project_id):
    # Retrieve service account information corresponding to the GCP Project ID provided
    #
    bucket, blob_name = get_gcp_service_account_infos(gcp_project_id)

    if (bucket is None) or (blob_name is None):
        return None

    try:
        # Read the credentials from GCS
        #
        gcs_client = Client()
        bucket = gcs_client.get_bucket(bucket)
        blob = Blob(blob_name, bucket)
        json_credentials = json.loads(blob.download_as_string())

        # Build and return GCP Credentials
        #
        return service_account.Credentials.from_service_account_info(json_credentials)

    except Exception as ex:
        print("Cannot retrieve Service Account credentials.")
        print(ex)
        return None
def lock(self):
    """
    This is the best we can do. It is impossible to acquire the lock reliably
    without using any additional services. test-and-set is impossible to implement.
    :return:
    """
    log = self._log
    log.info("Locking the bucket...")
    # Client should be imported here because grpc starts threads during import
    # and if you call fork after that, a child process will hang during exit
    from google.cloud.storage import Client

    if self.credentials:
        client = Client.from_service_account_json(self.credentials)
    else:
        client = Client()
    bucket = client.get_bucket(self.bucket_name)
    self._bucket = bucket
    sentinel = bucket.blob("index.lock")
    try:
        while sentinel.exists():
            log.warning("Failed to acquire the lock, waiting...")
            time.sleep(1)
        sentinel.upload_from_string(b"")
        # Several agents can get here. No test-and-set, sorry!
        yield None
    finally:
        self._bucket = None
        if sentinel is not None:
            try:
                sentinel.delete()
            except Exception:
                pass
def _write_async(self):
    if len(self.pending_records) == 0:
        return
    try:
        client = Client(project=DB_LOGGER_WRITE_TO_GCS_PROJECT_ID)
        bucket_path = f"{self.bucket_inner_path}/{self.filename}"
        if DB_LOGGER_WRITE_TO_GCS_MULTI_FILE_LOG:
            bucket_path = self._compose_progressing_log_file_name(bucket_path)

        bucket = client.bucket(bucket_name=self.bucket_name)
        blob = bucket.blob(blob_name=bucket_path)

        records = self.pending_records
        self.pending_records = []

        if not DB_LOGGER_WRITE_TO_GCS_MULTI_FILE_LOG and blob.exists():
            current_log = blob.download_as_string().decode(encoding="utf-8").strip()
            if current_log:
                records.insert(0, current_log)
            # Reset the blob
            blob = bucket.blob(blob_name=bucket_path)

        blob.upload_from_string("\n".join(records))
    except Exception as err:
        airflow_db_logger_log.error(
            f"Failed to flush to bucket @ {self.bucket_name}/{self.bucket_inner_path}/{self.filename}"
        )
        airflow_db_logger_log.error(err)
def _store_in_thread(self, file):
    file.seek(0)
    from google.cloud.storage import Client

    client = Client(project=self.project_id)
    bucket = client.get_bucket(self.bucket_name)
    blob = bucket.blob(self.blob_name)
    blob.upload_from_file(file, predefined_acl=self.acl)
def open_gcs_url(config, logger, storage, url):
    reader_impl = SourceFile.extract_reader_impl(config)
    use_gcs_service_account = "service_account_json" in config["provider"] and storage == "gs://"
    file_to_close = None
    if reader_impl == "gcsfs":
        if use_gcs_service_account:
            try:
                token_dict = json.loads(config["provider"]["service_account_json"])
            except json.decoder.JSONDecodeError as err:
                logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                raise err
        else:
            token_dict = "anon"
        fs = gcsfs.GCSFileSystem(token=token_dict)
        file_to_close = fs.open(f"gs://{url}")
        result = file_to_close
    else:
        if use_gcs_service_account:
            try:
                credentials = json.dumps(json.loads(config["provider"]["service_account_json"]))
                tmp_service_account = tempfile.NamedTemporaryFile(delete=False)
                with open(tmp_service_account.name, "w") as f:
                    f.write(credentials)
                tmp_service_account.close()
                client = Client.from_service_account_json(tmp_service_account.name)
                result = open(f"gs://{url}", transport_params=dict(client=client))
                os.remove(tmp_service_account.name)
            except json.decoder.JSONDecodeError as err:
                logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                raise err
        else:
            client = Client.create_anonymous_client()
            result = open(f"{storage}{url}", transport_params=dict(client=client))
    return result, file_to_close
async def upload_picture(file: bytes = File(None, media_type="image/jpeg")):
    """
    Uploads image from phone to server and saves it to bucket

    Args:
        file (bytes): the image taken from camera app encoded in bytes

    Returns:
        response (str): JSON Response with uuid of the file uploaded
    """
    try:
        if file is None:
            raise HTTPException(status_code=422, detail="Empty image sent")
        else:
            # Initializes the Storage client
            storage_client = Client(project=creds.project_id)
            bucket = storage_client.get_bucket(creds.bucket_id)
            # Generates a unique identifier for storage
            img_uuid = str(uuid.uuid4())[0:6]
            blob = bucket.blob(img_uuid)
            # Decodes the base64 encoded bytearray of incoming image
            content = base64.b64decode(file)
            # Takes base64 decoded image and converts to image/jpeg
            blob.upload_from_string(data=content, content_type="image/jpeg")
            return {"detail": img_uuid}
    except GoogleCloudError as e:
        raise HTTPException(detail=str(e), status_code=500)
def _get_native_gcp_handle() -> typing.Any:
    if Config.BLOBSTORE_GS_MAX_CUMULATIVE_RETRY is not None:
        google.resumable_media.common.MAX_CUMULATIVE_RETRY = Config.BLOBSTORE_GS_MAX_CUMULATIVE_RETRY

    if Config.BLOBSTORE_CONNECT_TIMEOUT is None and Config.BLOBSTORE_READ_TIMEOUT is None:
        return Client.from_service_account_json(
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'],
        )
    else:
        # GCP has no direct interface to configure retries and timeouts. However, it makes use of
        # the Python `requests` package, which has straightforward timeout usage.
        class SessionWithTimeouts(google.auth.transport.requests.AuthorizedSession):
            def request(self, *args, **kwargs):
                kwargs['timeout'] = (Config.BLOBSTORE_CONNECT_TIMEOUT, Config.BLOBSTORE_READ_TIMEOUT)
                return super().request(*args, **kwargs)

        credentials = service_account.Credentials.from_service_account_file(
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'], scopes=Client.SCOPE)

        # _http is a "private" parameter, and we may need to re-visit GCP timeout retry
        # strategies in the future.
        return Client(_http=SessionWithTimeouts(credentials), credentials=credentials)
def _get_native_gcp_handle() -> typing.Any:
    if Config.BLOBSTORE_CONNECT_TIMEOUT is None and Config.BLOBSTORE_READ_TIMEOUT is None:
        client = Client.from_service_account_json(
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'],
        )
    else:
        # GCP has no direct interface to configure retries and timeouts. However, it makes use of
        # the Python `requests` package, which has straightforward timeout usage.
        class SessionWithTimeouts(AuthorizedSession):
            def request(self, *args, **kwargs):
                kwargs['timeout'] = (Config.BLOBSTORE_CONNECT_TIMEOUT, Config.BLOBSTORE_READ_TIMEOUT)
                return super().request(*args, **kwargs)

        credentials = service_account.Credentials.from_service_account_file(
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'], scopes=Client.SCOPE)

        # _http is a "private" parameter, and we may need to re-visit GCP timeout retry
        # strategies in the future.
        client = Client(_http=SessionWithTimeouts(credentials), credentials=credentials)

    adapter_kwargs = dict(pool_maxsize=max(DEFAULT_POOLSIZE, 20))
    if Config.BLOBSTORE_RETRIES is not None:
        adapter_kwargs['max_retries'] = Retry(
            total=Config.BLOBSTORE_RETRIES,
            backoff_factor=0.3,
            status_forcelist=(500, 502, 504))
    adapter = HTTPAdapter(**adapter_kwargs)

    # Mount the adapter on the client's underlying session for both schemes.
    client._http.mount('https://', adapter)
    client._http.mount('http://', adapter)
    return client
def open_fs(self, fs_url, parse_result, writeable, create, cwd):  # pylint: disable=no-self-use
    path_parts = iteratepath(parse_result.resource)
    bucket_name = path_parts[0]
    root_path = join(*path_parts[1:])

    if not bucket_name:
        raise OpenerError("invalid bucket name in '{}'".format(fs_url))

    if parse_result.params.get("strict") == "False":
        strict = False
    else:
        strict = True

    client = Client()

    project = parse_result.params.get("project")
    if project:
        client.project = project

    api_endpoint = parse_result.params.get("api_endpoint")
    if api_endpoint:
        client.client_options = {"api_endpoint": api_endpoint}

    return GCSFS(bucket_name, root_path=root_path, create=create, client=client, strict=strict)
def __init__(
        self,
        bucketname: str,  # Should be bucket and/or blob name
        filename: str,
        store_user_data: bool = True,
        store_chat_data: bool = True,
        store_bot_data: bool = True,
        single_file: bool = True,  # If False, stores in chatID_user_data.json, chatID_chat_data.json, chatID_bot_data.json
        on_flush: bool = False,
        storage_client: storage.Client = storage.Client()):
    super().__init__(
        store_user_data=store_user_data,
        store_chat_data=store_chat_data,
        store_bot_data=store_bot_data,
    )
    self.bucketname = bucketname
    self.filename = filename
    try:
        self.bucket = storage_client.get_bucket(bucketname)
    except Exception:
        # Bucket does not exist yet: create it and seed the persistence file with an empty JSON object
        self.bucket = storage_client.create_bucket(bucketname)
        blob = self.bucket.blob(filename)
        blob.upload_from_string(json.dumps({}))
    self.storage_client = storage_client
    self.single_file = single_file
    self.on_flush = on_flush
    self.user_data: Optional[DefaultDict[int, Dict]] = None
    self.chat_data: Optional[DefaultDict[int, Dict]] = None
    self.bot_data: Optional[Dict] = None
    self.conversations: Optional[Dict[str, Dict[Tuple, object]]] = None
def _download_from_cloudstorage(self, blob_path: str, local_path: str) -> str:
    client = Client()
    bucket = client.get_bucket(self.BUCKET)
    blob = bucket.blob(blob_path)
    blob.download_to_filename(local_path)
    return local_path
def _remove_from_cloudstorage(self, blob_path: str):
    client = Client()
    bucket = client.bucket(self.BUCKET)
    try:
        # don't fail entire task if this fails
        bucket.delete_blob(blob_path)
    except NotFound:
        print(f"{blob_path} not found")
def hello(**kwargs):
    gcs = Client()
    bucket = gcs.bucket("data.visitdata.org")
    blob = bucket.blob("processed/hello/lastrun")
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    blob.upload_from_string(f"{timestamp}\n")
    print("Successfully wrote timestamp to bucket: {}".format(timestamp))
def _upload(self, payload: bytes, filename: str, bucket: str) -> None:
    """
    Upload a payload to GCS

    """
    client = Client(project=self.project_id)
    count = 0
    while count < self.max_retries:
        try:
            bucket_obj = client.get_bucket(bucket)
            if self.use_encryption:
                payload = self._encrypt(payload)
            content = BytesIO(payload)
            blob = Blob(filename, bucket_obj)
            blob.upload_from_file(content)
            break
        except (
            InvalidResponse,
            GoogleAPICallError,
            InternalServerError,
            SSLError,
        ) as e:
            # Count the failed attempt before checking the limit so the final
            # retry raises instead of silently dropping the payload.
            count += 1
            if count >= self.max_retries:
                raise StoqPluginException(
                    f'Failed to upload {bucket}/{filename} to GCS: {str(e)}'
                )
            sleep(randrange(0, 4))
def __init__(self, bucket_name, project=None, credentials=None):
    """
    Constructor

    :param bucket_name: Name of the bucket that the files are on.
    :param project: the project which the client acts on behalf of. Will be
        passed when creating a bucket. If not passed, falls back to the
        default inferred from the environment.
    :param credentials: (Optional) The OAuth2 Credentials to use for this
        client. If not passed (and if no ``_http`` object is passed), falls
        back to the default inferred from the environment.

    Make sure the credentials have the correct permissions set up on
    Google Cloud or else GoogleStorage will return a 403 FORBIDDEN error.
    """
    if not Client:
        raise ValueError(
            'Could not import google.cloud.storage. You can install '
            'google.cloud.storage by using pip install google-cloud-storage'
        )

    connection = Client(project=project, credentials=credentials)
    self.bucket = connection.bucket(bucket_name)
def test_extract_table(client, to_delete):
    DATASET_ID = 'export_data_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table('person_ages')
    table = client.create_table(bigquery.Table(table_ref, schema=SCHEMA))
    to_delete.insert(0, table)
    client.create_rows(table, ROWS)

    bucket_name = 'extract_person_ages_job_{}'.format(_millis())
    # [START extract_table]
    from google.cloud.storage import Client as StorageClient

    storage_client = StorageClient()
    bucket = storage_client.create_bucket(bucket_name)  # API request
    destination_blob_name = 'person_ages_out.csv'
    destination = bucket.blob(destination_blob_name)

    destination_uri = 'gs://{}/{}'.format(bucket_name, destination_blob_name)
    extract_job = client.extract_table(table_ref, destination_uri)  # API request
    extract_job.result(timeout=100)  # Waits for job to complete.

    got = destination.download_as_string().decode('utf-8')  # API request
    assert 'Bharney Rhubble' in got
    # [END extract_table]

    to_delete.append(bucket)
    to_delete.insert(0, destination)
def bucket_object(storage_client: storage.Client) -> storage.Bucket:
    """ GCS Bucket from .env config """
    if not storage_client.lookup_bucket(TEST_BUCKET):
        bucket = storage_client.create_bucket(TEST_BUCKET)
    else:
        bucket = storage_client.get_bucket(TEST_BUCKET)
    yield bucket
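# Hedged usage sketch (not part of the original fixture): one way such a pytest fixture
# could be consumed. The test name and blob name below are illustrative, and it assumes
# TEST_BUCKET is reachable with the configured storage_client fixture.
def test_blob_roundtrip(bucket_object):
    blob = bucket_object.blob("pytest-roundtrip.txt")   # hypothetical object name
    blob.upload_from_string("hello")                    # write a small payload
    assert blob.download_as_string() == b"hello"        # read it back
    blob.delete()                                       # clean up the test object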
def _create_connection(self):
    client = Client(
        credentials=self.credentials,
        project=self.project,
    )
    return client.bucket(self.bucket)
class GS(Base):
    _creds: service_account.Credentials = None
    _project: str = None
    _bucket: Bucket = None

    def __init__(self, bucket: str, creds_path: Optional[str] = None):
        super().__init__()
        if creds_path is not None:
            self._creds = service_account.Credentials.from_service_account_file(creds_path)
            with open(creds_path, 'rt') as f:
                self._project = json.loads(f.read())['project_id']
            self._bucket = Client(self._project, self._creds).bucket(bucket)
        else:
            self._bucket = Client().bucket(bucket)

    def get(self, path: str) -> bytes:
        return self._bucket.get_blob(path).download_as_string()

    def put(self, path: str, content: bytes):
        self._bucket.blob(path).upload_from_string(content)

    def exists(self, path: str) -> bool:
        return self._bucket.get_blob(path) is not None

    def delete(self, path: str):
        blobs = self._bucket.list_blobs(prefix=path)
        for blob in blobs:
            blob.delete()
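# Hedged usage sketch (not part of the original class): how the GS wrapper above could be
# exercised. "example-bucket" and "service-account.json" are placeholders, and the bucket
# must already exist and be writable with the given credentials.
if __name__ == "__main__":
    store = GS("example-bucket", creds_path="service-account.json")
    store.put("demo/hello.txt", b"hello world")     # upload raw bytes
    assert store.exists("demo/hello.txt")           # blob is now present
    print(store.get("demo/hello.txt").decode())     # download and decode
    store.delete("demo/")                           # delete everything under the prefix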
def gs(self):
    from google.cloud.storage import Client

    return (
        Client.from_service_account_json(self.credentialpath)
        if self.credentialpath
        else Client(self.projectname)
    )
def _client(self):
    from google.cloud.storage import Client

    if isinstance(self._credentials, str):
        return Client.from_service_account_json(self._credentials)
    else:
        return Client(credentials=self._credentials, project=self.project_name)
def _create_default_client(
        self,
        service_account_credentials_path=settings.GCS_STORAGE_SERVICE_ACCOUNT_KEY_PATH):
    if service_account_credentials_path:
        return Client.from_service_account_json(
            service_account_credentials_path)
    else:
        return Client()
def create_client(self):
    # Client should be imported here because grpc starts threads during import
    # and if you call fork after that, a child process will hang during exit
    from google.cloud.storage import Client

    if self.credentials:
        client = Client.from_service_account_json(self.credentials)
    else:
        client = Client()
    return client
def create_auth(self):
    if self.auth_params['service_account']:
        self.client = Client.from_service_account_json(
            self.auth_params['service_account'])
    elif self.auth_params['token']:
        self.sess = Credentials(token=self.auth_params['token'])
        self.client = Client(credentials=self.sess)
    else:
        self.client = None
class RetimoDataset:
    cache_path = '.cache'

    def __init__(self, config_records: List[RetimoDatasetConfigRecord]):
        self.config_records = config_records
        self.storage_client = Client()

    def load(self, to_shuffle=True):
        self.config_records = [self._download(config_record) for config_record in self.config_records]
        self.config_records = [self._unzip(config_record) for config_record in self.config_records]
        arrays = dict(self._load_to_nparray(config_record) for config_record in self.config_records)
        if to_shuffle:
            # Shuffle each record's images and labels together so they stay aligned
            for collector in arrays.values():
                collector['dataset'], collector['label'] = sklearn.utils.shuffle(
                    collector['dataset'], collector['label'])
        return arrays

    def _download(self, config_record: RetimoDatasetConfigRecord) -> RetimoDatasetConfigRecord:
        local_path = f"{self.cache_path}/{config_record.dataset_name}/{config_record.name}"
        os.makedirs(local_path, exist_ok=True)
        raw_path = f"{local_path}/raw"
        if not path.exists(raw_path):
            with open(raw_path, "w") as f:
                f.write("")
            print(f"Downloading raw data from {config_record.gcs_path} to {raw_path}")
            with open(raw_path, 'wb') as file_obj:
                self.storage_client.download_blob_to_file(config_record.gcs_path, file_obj)
        else:
            print(f"Downloading raw data for '{config_record.name}' not needed. Using cache '{raw_path}'")
        return config_record.add_local_path(local_path)

    def _unzip(self, config_record: RetimoDatasetConfigRecord) -> RetimoDatasetConfigRecord:
        unzipped_path = f"{config_record.local_path}/unziped"
        if not path.exists(unzipped_path):
            print(f"Unzipping file {config_record.raw_path} to {unzipped_path}")
            with zipfile.ZipFile(config_record.raw_path, 'r') as zip_ref:
                zip_ref.extractall(unzipped_path)
        else:
            print(f"Unzipping file for '{config_record.name}' not needed. Using cache '{unzipped_path}'")
        return config_record.unzipped()

    def _load_to_nparray(self, config_record: RetimoDatasetConfigRecord) -> Tuple[str, Dict[str, Union[ndarray, Any]]]:
        collector = {}
        for label in config_record.labels.keys():
            directory = f"{os.getcwd()}/{config_record.unzipped_path}/{config_record.name}/{label}/"
            dataset = numpy.asarray(
                [asarray(Image.open(f"{directory}/{image}"), dtype=numpy.uint8)
                 for image in os.listdir(directory) if image.endswith('.jpg')])
            label_list = full((len(dataset)), fill_value=config_record.labels[label], dtype=numpy.uint8)
            collector['dataset'] = numpy.append(collector.get('dataset', numpy.empty(0)), dataset).reshape(
                dataset.shape[0] + collector.get('dataset', numpy.empty(0)).shape[0], *dataset.shape[1:])
            collector['label'] = numpy.append(collector.get('label', numpy.empty(0)), label_list)
        return config_record.name, collector
def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing to avoid
    # numba JIT time if the arguments are invalid or the user asked for --help.
    import thor.utils.logging
    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(
        args.preprocessed_observations,
        index_col=False,
        dtype={"obs_id": str},
    )

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)
    taskqueue_client = TaskQueueClient(bucket, queue)

    manifest = taskqueue_client.launch_job(config, preprocessed_observations, test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)
def __init__(
    self,
    application_credentials: Optional[Union[str, os.PathLike]] = None,
    credentials: Optional["Credentials"] = None,
    project: Optional[str] = None,
    storage_client: Optional["StorageClient"] = None,
    local_cache_dir: Optional[Union[str, os.PathLike]] = None,
):
    """Class constructor. Sets up a [`Storage Client`](https://googleapis.dev/python/storage/latest/client.html).

    Supports the following authentication methods of `Storage Client`.

    - Environment variable `"GOOGLE_APPLICATION_CREDENTIALS"` containing a path to a JSON
      credentials file for a Google service account. See
      [Authenticating as a Service Account](https://cloud.google.com/docs/authentication/production).
    - File path to a JSON credentials file for a Google service account.
    - OAuth2 Credentials object and a project name.
    - Instantiated and already authenticated `Storage Client`.

    If multiple methods are used, priority order is reverse of list above (later in list takes
    priority). If no authentication methods are used, then the client will be instantiated as
    anonymous, which will only have access to public buckets.

    Args:
        application_credentials (Optional[Union[str, os.PathLike]]): Path to Google service
            account credentials file.
        credentials (Optional[Credentials]): The OAuth2 Credentials to use for this client.
            See documentation for
            [`StorageClient`](https://googleapis.dev/python/storage/latest/client.html).
        project (Optional[str]): The project which the client acts on behalf of. See
            documentation for
            [`StorageClient`](https://googleapis.dev/python/storage/latest/client.html).
        storage_client (Optional[StorageClient]): Instantiated
            [`StorageClient`](https://googleapis.dev/python/storage/latest/client.html).
        local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
            for downloaded files. If None, will use a temporary directory.
    """
    if application_credentials is None:
        application_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

    if storage_client is not None:
        self.client = storage_client
    elif credentials is not None:
        self.client = StorageClient(credentials=credentials, project=project)
    elif application_credentials is not None:
        self.client = StorageClient.from_service_account_json(application_credentials)
    else:
        self.client = StorageClient.create_anonymous_client()

    super().__init__(local_cache_dir=local_cache_dir)
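# Hedged usage sketch (not part of the original constructor): the authentication paths
# described in the docstring above. The enclosing class name is not shown here, so
# `GSClient` below is a hypothetical stand-in; file paths and the project name are placeholders.
client_default = GSClient()                                                   # env var, else anonymous
client_from_file = GSClient(application_credentials="service-account.json")  # explicit key file
# client_from_creds = GSClient(credentials=oauth2_credentials, project="my-project")
# client_wrapped = GSClient(storage_client=existing_storage_client)          # takes highest priority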
def __init__(self, bucket: str, account_info: Union[Client, str, Dict], timeout: int = 15):
    self.account_info = account_info
    self.timeout = timeout
    self._client, self._project = (
        (account_info, None)
        if isinstance(account_info, Client)
        else self._generate_client(account_info)
    )
    self.client = Client(credentials=self._client, project=self._project)
    self.bucket = self.client.bucket(bucket)
def __init__(self, bucket: str, creds_path: Optional[str] = None):
    super().__init__()
    if creds_path is not None:
        self._creds = service_account.Credentials.from_service_account_file(creds_path)
        with open(creds_path, 'rt') as f:
            self._project = json.loads(f.read())['project_id']
        self._bucket = Client(self._project, self._creds).bucket(bucket)
    else:
        self._bucket = Client().bucket(bucket)
def setUp(self):
    os.environ['FILESYSTEM_PUBLISH_ENABLED'] = '0'
    os.environ['FILESYSTEM_ENABLED'] = '0'
    os.environ['GOOGLE_PUBLISH_ENABLED'] = '1'
    os.environ['GOOGLE_ENABLED'] = '1'
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'
    os.environ['CONFIG'] = './example/config.yml'
    self.store = Datastore()
    self.publish_config = self.store.config.publish['handlers']['gcloud']
    self.storage_config = self.store.config.storage['gcloud']
    self.dataset = self.store.datasets[0]
    self.client = Client()
def test_load_table_from_storage_then_dump_table(self):
    import csv
    import tempfile
    from google.cloud.storage import Client as StorageClient

    local_id = unique_resource_id()
    BUCKET_NAME = 'bq_load_test' + local_id
    BLOB_NAME = 'person_ages.csv'
    GS_URL = 'gs://%s/%s' % (BUCKET_NAME, BLOB_NAME)
    ROWS = [
        ('Phred Phlyntstone', 32),
        ('Bharney Rhubble', 33),
        ('Wylma Phlyntstone', 29),
        ('Bhettye Rhubble', 27),
    ]
    TABLE_NAME = 'test_table'

    s_client = StorageClient()

    # In the **very** rare case the bucket name is reserved, this
    # fails with a ConnectionError.
    bucket = s_client.create_bucket(BUCKET_NAME)
    self.to_delete.append(bucket)

    blob = bucket.blob(BLOB_NAME)

    with tempfile.TemporaryFile(mode='w+') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(('Full Name', 'Age'))
        writer.writerows(ROWS)
        blob.upload_from_file(
            csv_file, rewind=True, content_type='text/csv')

    self.to_delete.insert(0, blob)

    dataset = Config.CLIENT.dataset(
        _make_dataset_name('load_gcs_then_dump'))

    retry_403(dataset.create)()
    self.to_delete.append(dataset)

    full_name = bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED')
    age = bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED')
    table = dataset.table(TABLE_NAME, schema=[full_name, age])
    table.create()
    self.to_delete.insert(0, table)

    job = Config.CLIENT.load_table_from_storage(
        'bq_load_storage_test_' + local_id, table, GS_URL)
    job.create_disposition = 'CREATE_NEVER'
    job.skip_leading_rows = 1
    job.source_format = 'CSV'
    job.write_disposition = 'WRITE_EMPTY'

    job.begin()

    def _job_done(instance):
        return instance.state in ('DONE', 'done')

    # Allow for 90 seconds of "warm up" before rows visible.  See:
    # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
    # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
    retry = RetryInstanceState(_job_done, max_tries=8)
    retry(job.reload)()

    rows, _, _ = table.fetch_data()
    by_age = operator.itemgetter(1)
    self.assertEqual(sorted(rows, key=by_age), sorted(ROWS, key=by_age))