def get_data():
    """
    Retrieve the data file from GCP Storage, and return the file as a
    dictionary. Create the file, with dummy data, if it doesn't exist.
    """
    # Introduce a delay here.
    do_delay()

    # Start of the actual function
    rtn = None
    storage_client = storage.Client()
    bucket_name = current_app.config.get('DATA_BUCKET_NAME')
    print('bucket_name=%s' % bucket_name)
    try:
        bucket = storage_client.get_bucket(bucket_name)
    except Exception:
        bucket = storage_client.create_bucket(bucket_name)

    # Test if the data file is found in the bucket, and
    # create it if it doesn't exist.
    blob = Blob(current_app.config.get('DATA_FILE_NAME'), bucket)
    if not blob.exists():
        # Open the initial data file
        init_fname = current_app.config.get('INIT_DATA_FILE')
        with open(init_fname) as infile:
            init_data = json.load(infile)
        # Copy it to the storage bucket
        blob.upload_from_string(json.dumps(init_data, indent=4))

    data_str = blob.download_as_string()
    rtn = json.loads(data_str)
    print('GOT BAG DATA:')
    print(json.dumps(rtn, indent=4))
    return rtn, blob

def wait_on_gcs_blob(gcs_client: storage.Client,
                     wait_blob: storage.Blob,
                     polling_timeout: int,
                     polling_interval: int = 1) -> bool:
    """Wait for a GCS object to exist.

    Args:
        gcs_client: storage.Client to use for the existence checks.
        wait_blob: storage.Blob the GCS object to wait on.
        polling_timeout: int number of seconds to poll for the object.
        polling_interval: frequency (in seconds) to check for the object.
    Returns:
        bool: True if the object appeared before the timeout, False otherwise.
    """
    start_poll = time.monotonic()
    while time.monotonic() - start_poll < (polling_timeout - polling_interval):
        if wait_blob.exists(client=gcs_client):
            return True
        print(
            f"waiting on GCS file gs://{wait_blob.bucket.name}/{wait_blob.name}"
        )
        time.sleep(polling_interval)
    return False

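# A minimal usage sketch for wait_on_gcs_blob. The bucket and object names
# here are hypothetical, and the snippet assumes the same google.cloud
# `storage` import used by the functions above.
def _example_wait_for_sentinel():
    client = storage.Client()
    sentinel = storage.Blob("exports/_SUCCESS", client.bucket("example-bucket"))
    # Poll every 5 seconds for up to 2 minutes before giving up.
    if not wait_on_gcs_blob(client, sentinel,
                            polling_timeout=120, polling_interval=5):
        raise TimeoutError("sentinel object never appeared")
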
def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob,
                   next_job_id: Optional[str]):
    """Reclaim the lock blob for the new job id (in-place) or delete the lock
    blob if next_job_id is None."""
    try:
        if next_job_id:
            if lock_blob.exists(client=gcs_client):
                lock_blob.upload_from_string(
                    next_job_id,
                    if_generation_match=lock_blob.generation,
                    client=gcs_client)
            else:
                # This happens when submitting the first job in the backlog
                lock_blob.upload_from_string(next_job_id,
                                             if_generation_match=0,
                                             client=gcs_client)
        else:
            print("releasing lock at: "
                  f"gs://{lock_blob.bucket.name}/{lock_blob.name}")
            lock_blob.delete(
                if_generation_match=lock_blob.generation,
                client=gcs_client,
            )
    except google.api_core.exceptions.PreconditionFailed as err:
        raise exceptions.BacklogException(
            f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} "
            f"was changed by another process.") from err

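# A minimal sketch of how handle_bq_lock might be driven. The lock object
# name and job id are hypothetical; the generation preconditions make the
# claim/release atomic on the GCS side, so a concurrent writer raises
# BacklogException instead of silently overwriting the lock.
def _example_lock_cycle():
    client = storage.Client()
    lock = storage.Blob("_backfill/lock", client.bucket("example-bucket"))
    handle_bq_lock(client, lock, "job-1234")  # claim the lock for a job id
    handle_bq_lock(client, lock, None)        # release it once the job is done
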
def blob_exists(bucket_name, blob_name):
    """Check whether the blob/key exists, retrying on transient errors."""
    tries = 0
    while tries < NUM_TRIES:
        try:
            if bucket_exists(bucket_name):
                client = storage.Client()
                bucket = client.bucket(bucket_name)
                blob = Blob(blob_name, bucket)
                return blob.exists()
            else:
                return False
        except BadRequest:
            return False
        except Exception:
            time.sleep(300)
            tries += 1
    logger.error(
        "Cannot check the status of the blob {} after multiple attempts".format(
            blob_name
        )
    )
    return False

def drop_db(flask_app):
    """ Remove the bucket and object that we used for testing """
    storage_client = storage.Client()
    bucket_name = flask_app.config.get('DATA_BUCKET_NAME')
    bucket = storage_client.get_bucket(bucket_name)
    blob = Blob(flask_app.config.get('DATA_FILE_NAME'), bucket)
    if blob.exists():
        blob.delete()
    bucket.delete()

def _save(self, name, content):
    name = os.path.basename(name)
    new_name = name
    count = 0
    # Find a name that does not collide with an existing object by
    # appending an increasing numeric suffix.
    while True:
        blob = Blob(new_name, self.bucket, chunk_size=1024 * 256)
        if not blob.exists():
            break
        count += 1
        new_name = name + '.%d' % count
    blob.upload_from_file(content)
    blob.make_public()
    return new_name

def create_df_with_yesterday(bucket, interval_date, raw_csv):
    dfm = pd.read_csv(StringIO(raw_csv))
    df_today = fix_dst_issue(dfm)
    yesterday = interval_date - timedelta(days=1)
    blob_name_yesterday = (
        f"{LEMS_STORAGE_PATH_PREFIX}/{yesterday.year}/"
        f"lems_data_{yesterday.strftime('%Y%m%d')}.csv")
    blob_yesterday = Blob(blob_name_yesterday, bucket)
    csv_yesterday = None
    if blob_yesterday.exists():
        csv_yesterday = blob_yesterday.download_as_string().decode('utf-8')
        df_yesterday = fix_dst_issue(pd.read_csv(StringIO(csv_yesterday)))
        return df_today.append(df_yesterday)
    return df_today

def _locate_blob(self, content_hash):
    """Check if a file with the given hash exists in the storage bucket."""
    if content_hash is None:
        return
    prefix = self._get_prefix(content_hash)
    if prefix is None:
        return

    # First, check the standard file name:
    blob = Blob(os.path.join(prefix, 'data'), self.bucket)
    if blob.exists():
        return blob

    # Second, iterate over all file names:
    for blob in self.bucket.list_blobs(max_results=1, prefix=prefix):
        return blob

def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob,
                   next_job_id: Optional[str], table: bigquery.TableReference,
                   retry_attempt_cnt: Optional[int] = None):
    """Reclaim the lock blob for the new job id (in-place) or delete the lock
    blob if next_job_id is None."""
    try:
        if next_job_id:
            lock_blob_contents = json.dumps(
                dict(job_id=next_job_id,
                     table=table.to_api_repr(),
                     retry_attempt_cnt=retry_attempt_cnt))
            logging.log_with_table(
                table,
                f"Writing the following content to lock_blob {lock_blob.name}:"
                f" {dict(job_id=next_job_id, table=table.to_api_repr(), retry_attempt_cnt=retry_attempt_cnt)}"
            )
            if lock_blob.exists(client=gcs_client):
                lock_blob.upload_from_string(
                    lock_blob_contents,
                    if_generation_match=lock_blob.generation,
                    client=gcs_client)
            else:
                # This happens when submitting the first job in the backlog
                lock_blob.upload_from_string(
                    lock_blob_contents,
                    if_generation_match=0,  # noqa: E126
                    client=gcs_client)
        else:
            logging.log_with_table(
                table, "releasing lock at: "
                f"gs://{lock_blob.bucket.name}/{lock_blob.name}")
            lock_blob.delete(
                if_generation_match=lock_blob.generation,
                client=gcs_client,
            )
    except (google.api_core.exceptions.PreconditionFailed,
            google.api_core.exceptions.NotFound) as err:
        if isinstance(err, google.api_core.exceptions.PreconditionFailed):
            raise exceptions.BacklogException(
                f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} "
                f"was changed by another process.") from err
        logging.log_with_table(
            table, "Tried deleting a lock blob that was either already deleted "
            "or never existed.")

def up_to_date(input_blob: storage.Blob, output_blob: storage.Blob):
    """
    Checks if the output blob is up-to-date.

    :param input_blob: the source blob the output was derived from.
    :param output_blob: the derived blob to check.
    :return: True if the output blob is up-to-date. If the output blob
        doesn't exist or is older than the input, returns False.
    """
    if not output_blob.exists():
        return False

    input_blob.reload()
    output_blob.reload()
    assert input_blob.updated is not None, 'input blob should exist'
    if input_blob.updated > output_blob.updated:
        return False
    return True

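# A minimal sketch using up_to_date to skip redundant work. The bucket and
# object names are hypothetical, and the copy step stands in for whatever
# processing actually derives the output from the input.
def _example_incremental_copy():
    client = storage.Client()
    bucket = client.bucket("example-bucket")
    src = bucket.blob("raw/data.csv")
    dst = bucket.blob("processed/data.csv")
    if not up_to_date(src, dst):
        # Reprocess only when the output is missing or older than the input.
        dst.upload_from_string(src.download_as_string())
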
def exists(self, name):
    blob = Blob(name, self.bucket)
    return blob.exists()

def _create_folder_file():
    folder_key = path.join(root, '{0}_$folder$'.format(folder_name))
    blob = Blob(folder_key, gcp_bucket)
    if not blob.exists():
        blob.upload_from_string(data='')

def _locate_key(self, key):
    if key is None:
        return
    blob = Blob(key, self.bucket)
    if blob.exists():
        return blob

def _cache_blob(self, local_path: str, gc_blob: storage.Blob):
    if not gc_blob.exists():
        return
    with open(local_path, 'wb') as file:
        gc_blob.download_to_file(file)