def test_delete_table(storage):
    Storage("br_ibge_pib", "municipio").delete_table(bucket_name="basedosdados-dev")

    with pytest.raises(FileNotFoundError):
        Storage("br_ibge_pib", "municipio").delete_table()
def bq_upload(context, filepath, raw_filepath=None, partitions=None):
    table_id = context.resources.basedosdados_config["table_id"]
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    context.log.info(f"""
    Received inputs:
    raw_filepath = {raw_filepath}, type = {type(raw_filepath)}
    treated_filepath = {filepath}, type = {type(filepath)}
    dataset_id = {dataset_id}, type = {type(dataset_id)}
    table_id = {table_id}, type = {type(table_id)}
    partitions = {partitions}, type = {type(partitions)}
    """)

    # Upload the raw file to storage
    if raw_filepath:
        st = Storage(table_id=table_id, dataset_id=dataset_id)
        context.log.info(
            f"Uploading raw file: {raw_filepath} to bucket {st.bucket_name} "
            f"at {st.bucket_name}/{dataset_id}/{table_id}"
        )
        st.upload(path=raw_filepath, partitions=partitions, mode="raw", if_exists="replace")

    # Create and publish the table if it does not exist; append to it otherwise
    if partitions:
        # If the table is partitioned, get the parent directory in which the partitions are stored
        tb_dir = filepath.split(partitions)[0]
        create_or_append_table(context, dataset_id, table_id, tb_dir)
    else:
        create_or_append_table(context, dataset_id, table_id, filepath)

    # Delete local files
    context.log.info(f"Deleting local files: {raw_filepath}, {filepath}")
    cleanup_local(filepath, raw_filepath)
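# Illustration of the partition handling in bq_upload above: the parent directory of a
# partitioned file is recovered with a plain string split. A minimal sketch with
# hypothetical paths (the filepath and partition value below are made up for
# illustration, not real pipeline values).
def _example_partition_dir():
    filepath = "/tmp/data/staging/br_ibge_pib/municipio/ano=2020/municipios.csv"
    partitions = "ano=2020"
    # Everything before the partition folder is kept, so the whole partitioned
    # directory tree is what gets passed to create_or_append_table.
    tb_dir = filepath.split(partitions)[0]
    assert tb_dir == "/tmp/data/staging/br_ibge_pib/municipio/"
    return tb_dir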
def test_copy_table(storage):
    Storage("br_ibge_pib", "municipio").copy_table()

    with pytest.raises(FileNotFoundError):
        Storage("br_ibge_pib2", "municipio2").copy_table()

    Storage("br_ibge_pib", "municipio").copy_table(
        destination_bucket_name="basedosdados-dev",
    )
def upload(context, filename):
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    table_id = context.resources.basedosdados_config["table_id"]
    st = Storage(dataset_id, table_id)
    context.log.info(
        f"Uploading {filename} to GCS at: {st.bucket_name}/staging/{dataset_id}/{table_id}"
    )
    st.upload(path=filename, mode="staging", if_exists="replace")
    return filename
def test_create_storage_data_exist_table_config_exist(
    table, metadatadir, data_path, sample_data
):
    shutil.rmtree(metadatadir / DATASET_ID / TABLE_ID, ignore_errors=True)

    Dataset(dataset_id=DATASET_ID, metadata_path=metadatadir).create(if_exists="pass")

    Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir).upload(
        data_path, mode="staging", if_exists="replace"
    )

    table.init(
        data_sample_path=data_path,
        if_folder_exists="replace",
        if_table_config_exists="replace",
    )

    for file in TABLE_FILES:
        shutil.copy(sample_data / file, table.table_folder / file)

    table.delete(mode="all")

    table.create(
        data_path,
        if_storage_data_exists="pass",
        if_table_config_exists="pass",
    )
    assert table_exists(table, "staging")
def test_create(table, metadatadir):
    shutil.rmtree(Path(metadatadir) / DATASET_ID / TABLE_ID, ignore_errors=True)

    Dataset(dataset_id=DATASET_ID, metadata_path=metadatadir).create(if_exists="pass")

    Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir).upload(
        "tests/sample_data/municipios.csv", mode="staging", if_exists="replace"
    )

    table.init(data_sample_path="tests/sample_data/municipios.csv", if_exists="replace")

    table.delete(mode="all")

    table.create()
    assert table_exists(table, mode="staging")

    table.create(if_exists="replace")
    assert table_exists(table, mode="staging")

    table.create("tests/sample_data/municipios.csv", if_exists="replace")
def test_create_with_upload(table, metadatadir, data_path):
    table.delete("all")

    Storage(DATASET_ID, TABLE_ID).delete_table(not_found_ok=True)

    table.create(data_path, if_table_config_exists="replace")
    assert table_exists(table, mode="staging")
def test_create_with_path(table, metadatadir, data_path, sample_data):
    table.delete("all")

    Storage(DATASET_ID, TABLE_ID).delete_table(not_found_ok=True)

    shutil.rmtree(metadatadir / DATASET_ID / TABLE_ID, ignore_errors=True)

    table.create(data_path)
    assert table_exists(table, mode="staging")
def test_create_if_storage_data_raise(table, metadatadir, data_path):
    Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir).upload(
        data_path, mode="staging", if_exists="replace"
    )

    with pytest.raises(Exception):
        table.create(
            data_path,
            if_table_exists="replace",
            if_table_config_exists="replace",
            if_storage_data_exists="raise",
        )
def test_create_no_path(table, metadatadir, data_path, sample_data):
    Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir).upload(
        data_path, mode="staging", if_exists="replace"
    )

    table.init(data_sample_path=data_path, if_folder_exists="replace")

    for file in TABLE_FILES:
        shutil.copy(sample_data / file, table.table_folder / file)

    table.create(if_storage_data_exists="pass", if_table_config_exists="pass")
    assert table_exists(table, "staging")
def sync_bucket(
    source_bucket_name,
    dataset_id,
    table_id,
    destination_bucket_name,
    backup_bucket_name,
    mode="staging",
):
    """Copies proposed data between storage buckets.

    Creates a backup of the old data, then deletes it and copies the new data
    into the destination bucket.

    Args:
        source_bucket_name (str): The bucket name from which to copy data.
        dataset_id (str): Dataset id available in basedosdados. It should always come with table_id.
        table_id (str): Table id available in basedosdados.dataset_id. It should always come with dataset_id.
        destination_bucket_name (str): The bucket name to which data will be copied. If None,
            defaults to the bucket initialized when instantiating the Storage object
            (check it with the Storage().bucket property).
        backup_bucket_name (str): The bucket name where backup data will be stored.
        mode (str): Optional. Folder of which dataset to update.

    Raises:
        ValueError: If there are no files corresponding to the given dataset_id and table_id on the source bucket.
    """
    ref = Storage(dataset_id=dataset_id, table_id=table_id)
    prefix = f"{mode}/{dataset_id}/{table_id}/"

    source_ref = (
        ref.client["storage_staging"].bucket(source_bucket_name).list_blobs(prefix=prefix)
    )
    destination_ref = ref.bucket.list_blobs(prefix=prefix)

    if len(list(source_ref)) == 0:
        raise ValueError("No objects found on the source bucket")

    # Make a backup of the old data
    if len(list(destination_ref)):
        print(
            "\n########################################### COPY BACKUP ###########################################\n"
        )
        ref.copy_table(
            source_bucket_name=destination_bucket_name,
            destination_bucket_name=backup_bucket_name,
        )

        print(
            "\n########################################## DELETE OLD DATA ##########################################\n"
        )
        # Delete the old data from prod
        ref.delete_table(not_found_ok=True)

    print(
        "\n########################################### COPY NEW DATA ###########################################\n"
    )
    # Copy the new data to the destination bucket
    ref.copy_table(source_bucket_name=source_bucket_name)
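# Both versions of sync_bucket key everything off the object prefix
# "<mode>/<dataset_id>/<table_id>/". A small sketch of how that prefix can be probed
# directly with the google-cloud-storage client, assuming the library is installed and
# credentials are configured; the bucket name and ids below are hypothetical.
def _example_count_blobs_under_prefix():
    from google.cloud import storage as gcs

    mode, dataset_id, table_id = "staging", "br_ibge_pib", "municipio"
    prefix = f"{mode}/{dataset_id}/{table_id}/"

    client = gcs.Client()
    # list_blobs returns an iterator; materializing it tells us whether any objects
    # exist under the prefix, which is the emptiness check sync_bucket performs
    # before copying anything.
    blobs = list(client.bucket("basedosdados-dev").list_blobs(prefix=prefix))
    return len(blobs), prefix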
def storage(metadatadir):
    return Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir)
def sync_bucket(
    source_bucket_name,
    dataset_id,
    table_id,
    destination_bucket_name,
    backup_bucket_name,
    mode="staging",
):
    """Copies proposed data between storage buckets.

    Creates a backup of the old data, then deletes it and copies the new data
    into the destination bucket.

    Args:
        source_bucket_name (str): The bucket name from which to copy data.
        dataset_id (str): Dataset id available in basedosdados. It should always come with table_id.
        table_id (str): Table id available in basedosdados.dataset_id. It should always come with dataset_id.
        destination_bucket_name (str): The bucket name to which data will be copied. If None,
            defaults to the bucket initialized when instantiating the Storage object
            (check it with the Storage().bucket property).
        backup_bucket_name (str): The bucket name where backup data will be stored.
        mode (str): Optional. Folder of which dataset to update. [raw|staging|header|auxiliary_files|architecture]

    Raises:
        ValueError: If there are no files corresponding to the given dataset_id and table_id on the source bucket.
    """
    ref = Storage(dataset_id=dataset_id, table_id=table_id)
    prefix = f"{mode}/{dataset_id}/{table_id}/"

    source_ref = (
        ref.client["storage_staging"].bucket(source_bucket_name).list_blobs(prefix=prefix)
    )
    destination_ref = ref.bucket.list_blobs(prefix=prefix)

    if len(list(source_ref)) == 0:
        raise ValueError(
            f"No objects found on the source bucket {source_bucket_name}.{prefix}"
        )

    if len(list(destination_ref)):
        backup_bucket_blobs = list(
            ref.client["storage_staging"]
            .bucket(backup_bucket_name)
            .list_blobs(prefix=prefix)
        )
        if len(backup_bucket_blobs):
            tprint(f"{mode.upper()}: DELETE BACKUP DATA")
            ref.delete_table(
                not_found_ok=True, mode=mode, bucket_name=backup_bucket_name
            )

        tprint(f"{mode.upper()}: BACKUP OLD DATA")
        ref.copy_table(
            source_bucket_name=destination_bucket_name,
            destination_bucket_name=backup_bucket_name,
            mode=mode,
        )

        tprint(f"{mode.upper()}: DELETE OLD DATA")
        ref.delete_table(
            not_found_ok=True, mode=mode, bucket_name=destination_bucket_name
        )

    tprint(f"{mode.upper()}: TRANSFER NEW DATA")
    ref.copy_table(
        source_bucket_name=source_bucket_name,
        destination_bucket_name=destination_bucket_name,
        mode=mode,
    )
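# A hypothetical invocation of the sync_bucket helper above, syncing every folder
# listed in its docstring for a single table. The bucket names are placeholders for
# illustration, not the project's real configuration.
def _example_sync_all_modes():
    for mode in ["raw", "staging", "header", "auxiliary_files", "architecture"]:
        sync_bucket(
            source_bucket_name="my-dev-bucket",
            dataset_id="br_ibge_pib",
            table_id="municipio",
            destination_bucket_name="my-prod-bucket",
            backup_bucket_name="my-backup-bucket",
            mode=mode,
        )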