def copy(self, table_obj, dataset_name, table_name, if_exists='fail',
         tmp_gcs_bucket=None, gcs_client=None, job_config=None, **load_kwargs):
    """
    Copy a :ref:`parsons-table` into Google BigQuery via Google Cloud Storage.

    `Args:`
        table_obj: obj
            The Parsons Table to copy into BigQuery.
        dataset_name: str
            The dataset name to load the data into.
        table_name: str
            The table name to load the data into.
        if_exists: str
            If the table already exists, either ``fail``, ``append``, ``drop``
            or ``truncate`` the table.
        tmp_gcs_bucket: str
            The name of the Google Cloud Storage bucket to use to stage the data
            to load into BigQuery. Required if the ``GCS_TEMP_BUCKET`` environment
            variable is not set.
        gcs_client: object
            The GoogleCloudStorage Connector to use for loading data into Google
            Cloud Storage.
        job_config: object
            A LoadJobConfig object to provide to the underlying call to
            load_table_from_uri on the BigQuery client. The function will create
            its own if not provided.
        **load_kwargs: kwargs
            Arguments to pass to the underlying load_table_from_uri call on the
            BigQuery client.
    """
    tmp_gcs_bucket = check_env.check('GCS_TEMP_BUCKET', tmp_gcs_bucket)

    if if_exists not in ['fail', 'truncate', 'append', 'drop']:
        raise ValueError(f'Unexpected value for if_exists: {if_exists}, must be one of '
                         '"append", "drop", "truncate", or "fail"')

    table_exists = self.table_exists(dataset_name, table_name)

    if not job_config:
        job_config = bigquery.LoadJobConfig()
        job_config.autodetect = True

    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED

    dataset_ref = self.client.dataset(dataset_name)

    if table_exists:
        if if_exists == 'fail':
            raise ValueError('Table already exists.')
        elif if_exists == 'drop':
            self.delete_table(dataset_name, table_name)
        elif if_exists == 'append':
            job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
        elif if_exists == 'truncate':
            job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    gcs_client = gcs_client or GoogleCloudStorage()
    temp_blob_name = f'{uuid.uuid4()}.csv'
    temp_blob_uri = gcs_client.upload_table(table_obj, tmp_gcs_bucket, temp_blob_name)

    # Load the CSV from Cloud Storage into BigQuery, cleaning up the staged
    # blob even if the load job fails.
    try:
        load_job = self.client.load_table_from_uri(
            temp_blob_uri,
            dataset_ref.table(table_name),
            job_config=job_config,
            **load_kwargs,
        )
        load_job.result()
    finally:
        gcs_client.delete_blob(tmp_gcs_bucket, temp_blob_name)
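A minimal usage sketch for the copy() method above, not part of the library itself: the dataset, table, and bucket names are hypothetical placeholders, and it assumes credentials are supplied via the GOOGLE_APPLICATION_CREDENTIALS environment variable.

from parsons import Table
from parsons.google.google_bigquery import GoogleBigQuery

bq = GoogleBigQuery()
tbl = Table([{'first': 'Jane', 'last': 'Doe'}, {'first': 'Sam', 'last': 'Smith'}])

# Stage the table through the named GCS bucket, then append it to the
# (hypothetical) my_dataset.my_table in BigQuery.
bq.copy(tbl, 'my_dataset', 'my_table', if_exists='append',
        tmp_gcs_bucket='my-temp-bucket')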
class TestGoogleStorageBuckets(unittest.TestCase):

    def setUp(self):

        self.cloud = GoogleCloudStorage()

        # Running into some issues creating and deleting too many buckets, so
        # check whether the bucket already exists before creating it.
        if not self.cloud.bucket_exists(TEMP_BUCKET_NAME):
            self.cloud.create_bucket(TEMP_BUCKET_NAME)

        # Upload a file
        tmp_file_path = files.string_to_temp_file('A little string', suffix='.txt')
        self.cloud.put_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME, tmp_file_path)

    def test_list_buckets(self):

        # Assert that it finds the correct buckets
        bucket_list = self.cloud.list_buckets()

        # Make sure that my bucket is in the list
        self.assertIn(TEMP_BUCKET_NAME, bucket_list)

    def test_bucket_exists(self):

        # Assert it finds a bucket that exists
        self.assertTrue(self.cloud.bucket_exists(TEMP_BUCKET_NAME))

        # Assert it doesn't find a bucket that doesn't exist
        self.assertFalse(self.cloud.bucket_exists('NOT_A_REAL_BUCKET'))

    def test_get_bucket(self):

        # Assert that a bucket object is returned
        self.assertIsInstance(self.cloud.get_bucket(TEMP_BUCKET_NAME),
                              storage.bucket.Bucket)

    def test_create_bucket(self):

        # The temporary bucket has already been created as part of setUp, so just
        # check that it really exists.
        self.assertTrue(self.cloud.bucket_exists(TEMP_BUCKET_NAME))

    def test_delete_bucket(self):

        # Create another bucket, delete it, and make sure it doesn't exist
        self.cloud.create_bucket(TEMP_BUCKET_NAME + '_2')
        self.cloud.delete_bucket(TEMP_BUCKET_NAME + '_2')
        self.assertFalse(self.cloud.bucket_exists(TEMP_BUCKET_NAME + '_2'))

    def test_list_blobs(self):

        blob_list = self.cloud.list_blobs(TEMP_BUCKET_NAME)

        # Make sure that my file is in the list
        self.assertIn(TEMP_FILE_NAME, blob_list)

        # Make sure that there is only one file in the bucket
        self.assertEqual(len(blob_list), 1)

    def test_blob_exists(self):

        # Assert that it thinks that the blob exists
        self.assertTrue(self.cloud.blob_exists(TEMP_BUCKET_NAME, TEMP_FILE_NAME))

        # Assert that it thinks that a non-existent blob doesn't exist
        self.assertFalse(self.cloud.blob_exists(TEMP_BUCKET_NAME, 'FAKE_BLOB'))

    def test_put_blob(self):

        # Already being tested as part of setUp
        pass

    def test_get_blob(self):

        # Assert that a blob object is returned
        self.assertIsInstance(
            self.cloud.get_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME), storage.blob.Blob)

    def test_download_blob(self):

        # Download the blob and ensure that it is the expected file
        path = self.cloud.download_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME)
        with open(path, 'r') as f:
            self.assertEqual(f.read(), 'A little string')

    def test_delete_blob(self):

        file_name = 'delete_me.txt'

        # Upload a file
        tmp_file_path = files.string_to_temp_file('A little string', suffix='.txt')
        self.cloud.put_blob(TEMP_BUCKET_NAME, file_name, tmp_file_path)

        # Delete the blob and check that it no longer exists
        self.cloud.delete_blob(TEMP_BUCKET_NAME, file_name)
        self.assertFalse(self.cloud.blob_exists(TEMP_BUCKET_NAME, file_name))
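The suite above intentionally leaves the temporary bucket in place between runs (see the setUp comment about bucket churn). If per-test cleanup of the staged blob were wanted, a sketch of a tearDown using only the connector methods already exercised above might look like this:

    def tearDown(self):
        # Remove the staged blob so each test run starts from a clean bucket;
        # the bucket itself is deliberately kept to avoid create/delete churn.
        if self.cloud.blob_exists(TEMP_BUCKET_NAME, TEMP_FILE_NAME):
            self.cloud.delete_blob(TEMP_BUCKET_NAME, TEMP_FILE_NAME)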
def to_gcs_csv(self, bucket_name, blob_name, app_creds=None, project=None, compression=None,
               encoding=None, errors='strict', write_header=True, public_url=False,
               public_url_expires=60, **csvargs):
    """
    Writes the table to a Google Cloud Storage blob as a CSV.

    `Args:`
        bucket_name: str
            The bucket to upload to.
        blob_name: str
            The name to give the blob. If it ends in '.gz' or '.zip', the file
            will be compressed accordingly.
        app_creds: str
            A credentials json string or a path to a json file. Not required
            if the ``GOOGLE_APPLICATION_CREDENTIALS`` env variable is set.
        project: str
            The project which the client is acting on behalf of. If not passed,
            the default inferred environment will be used.
        compression: str
            The compression type for the csv. Currently "None", "zip" and "gzip"
            are supported. If specified, will override the blob name suffix.
        encoding: str
            The CSV encoding type for `csv.writer()
            <https://docs.python.org/2/library/csv.html#csv.writer/>`_
        errors: str
            How encoding errors are handled; with the default ``strict``, an
            error is raised if one is encountered.
        write_header: boolean
            Include header in output.
        public_url: boolean
            Create a public link to the file.
        public_url_expires: int
            The time, in minutes, until the url expires if ``public_url`` is set
            to ``True``.
        \**csvargs: kwargs
            ``csv_writer`` optional arguments

    `Returns:`
        Public url if ``public_url`` is set to ``True``; otherwise ``None``.
    """  # noqa: W605

    compression = compression or files.compression_type_for_path(blob_name)

    csv_name = files.extract_file_name(blob_name, include_suffix=False) + '.csv'

    # Save the CSV as a temp file
    local_path = self.to_csv(temp_file_compression=compression, encoding=encoding,
                             errors=errors, write_header=write_header, csv_name=csv_name,
                             **csvargs)

    from parsons.google.google_cloud_storage import GoogleCloudStorage
    gcs = GoogleCloudStorage(app_creds=app_creds, project=project)
    gcs.put_blob(bucket_name, blob_name, local_path)

    if public_url:
        return gcs.get_url(bucket_name, blob_name, expires_in=public_url_expires)
    else:
        return None
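A brief usage sketch for to_gcs_csv(), assuming a pre-existing bucket named 'my-bucket' and ambient credentials via GOOGLE_APPLICATION_CREDENTIALS; all names are placeholders. The '.gz' suffix lets the compression type be inferred from the blob name.

from parsons import Table

tbl = Table([{'id': 1, 'value': 'a'}, {'id': 2, 'value': 'b'}])

# Upload a gzip-compressed CSV and get back a link that expires in 30 minutes
url = tbl.to_gcs_csv('my-bucket', 'exports/data.csv.gz',
                     public_url=True, public_url_expires=30)
print(url)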