def remove_url(url, recursive=False):
    """Remove the resource at the given URL.

    Handles local paths as well as s3:// and gs:// URLs; when ``recursive``
    is true, everything under the URL's prefix is removed.
    """
    url = url_util.parse(url)

    local_path = url_util.local_file_path(url)
    if local_path:
        if recursive:
            shutil.rmtree(local_path)
        else:
            os.remove(local_path)
        return

    if url.scheme == 's3':
        # Try to find a mirror for potential connection information
        s3 = s3_util.create_s3_session(
            url, connection=s3_util.get_mirror_connection(url))  # noqa: E501
        bucket = url.netloc
        if recursive:
            # Because list_objects_v2 can only return up to 1000 items
            # at a time, we have to paginate to make sure we get it all
            prefix = url.path.strip('/')
            paginator = s3.get_paginator('list_objects_v2')
            pages = paginator.paginate(Bucket=bucket, Prefix=prefix)

            delete_request = {'Objects': []}
            for item in pages.search('Contents'):
                if not item:
                    continue

                delete_request['Objects'].append({'Key': item['Key']})

                # Make sure we do not try to hit S3 with a list of more
                # than 1000 items
                if len(delete_request['Objects']) >= 1000:
                    r = s3.delete_objects(Bucket=bucket, Delete=delete_request)
                    _debug_print_delete_results(r)
                    delete_request = {'Objects': []}

            # Delete any items that remain
            if len(delete_request['Objects']):
                r = s3.delete_objects(Bucket=bucket, Delete=delete_request)
                _debug_print_delete_results(r)
        else:
            s3.delete_object(Bucket=bucket, Key=url.path.lstrip('/'))
        return

    elif url.scheme == 'gs':
        if recursive:
            bucket = gcs_util.GCSBucket(url)
            bucket.destroy(recursive=recursive)
        else:
            blob = gcs_util.GCSBlob(url)
            blob.delete_blob()
        return
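# A standalone sketch (an illustration, not part of the original module) of the
# same batched-delete pattern used in remove_url, written directly against a
# boto3-style S3 client: delete_objects accepts at most 1000 keys per request,
# so keys returned by the list_objects_v2 paginator are flushed in chunks. The
# function name and its arguments are hypothetical placeholders.
def _example_delete_s3_prefix(s3_client, bucket_name, prefix):
    paginator = s3_client.get_paginator('list_objects_v2')
    batch = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for item in page.get('Contents', []):
            batch.append({'Key': item['Key']})
            # Flush before exceeding the 1000-key limit of delete_objects
            if len(batch) >= 1000:
                s3_client.delete_objects(Bucket=bucket_name,
                                         Delete={'Objects': batch})
                batch = []
    # Delete whatever remains after the last full batch
    if batch:
        s3_client.delete_objects(Bucket=bucket_name,
                                 Delete={'Objects': batch})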
def list_url(url, recursive=False):
    """Return the names of the entries stored under ``url``.

    When ``recursive`` is true, list every object below the URL's prefix;
    otherwise list only the top-level entries.
    """
    url = url_util.parse(url)

    local_path = url_util.local_file_path(url)
    if local_path:
        if recursive:
            return list(_iter_local_prefix(local_path))
        return [subpath for subpath in os.listdir(local_path)
                if os.path.isfile(os.path.join(local_path, subpath))]

    if url.scheme == 's3':
        s3 = s3_util.create_s3_session(url)
        if recursive:
            return list(_iter_s3_prefix(s3, url))

        return list(set(
            key.split('/', 1)[0]
            for key in _iter_s3_prefix(s3, url)))

    elif url.scheme == 'gs':
        gcs = gcs_util.GCSBucket(url)
        return gcs.get_all_blobs(recursive=recursive)
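# A minimal usage sketch, not part of the original module: 'example-mirror'
# and the build_cache prefix are hypothetical placeholders, and the S3 calls
# assume credentials for that bucket are already configured.
if __name__ == '__main__':
    # List the top-level entries under the prefix, then everything beneath it.
    print(list_url('s3://example-mirror/build_cache'))
    print(list_url('s3://example-mirror/build_cache', recursive=True))

    # Remove a single object, then the whole prefix.
    remove_url('s3://example-mirror/build_cache/index.json')
    remove_url('s3://example-mirror/build_cache', recursive=True)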