def test_simple(token_restore):
    """A token passed explicitly is cached; a later project-only constructor reuses it."""
    assert not GCSFileSystem.tokens
    fs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    assert fs.ls('')
    # token is now cached, so no explicit credentials are needed this time
    fs = GCSFileSystem(TEST_PROJECT)
    assert fs.ls('')
def test_raise_on_project_mismatch(mock_auth):
    """An explicit project that disagrees with the google default must raise."""
    mock_auth.default.return_value = (requests.Session(), "my_other_project")
    expected = "'my_project' does not match the google default project 'my_other_project'"
    with pytest.raises(ValueError, match=expected):
        GCSFileSystem(project="my_project", token="google_default")
    # omitting the project falls back to the google default one
    fs = GCSFileSystem(token="google_default")
    assert fs.project == "my_other_project"
def test_current():
    """current() returns the active instance; sessions are shared; insecure serialization keeps real credentials."""
    from google.auth import credentials

    with gcs_maker() as gcs:
        assert GCSFileSystem.current() is gcs
        other = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
        # identical constructor args reuse the same underlying session
        assert other.session is gcs.session
        other = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, secure_serialize=False)
        assert isinstance(other.token, credentials.Credentials)
def test_validate_response():
    """validate_response maps HTTP status codes onto the library's exception types."""
    gcs = GCSFileSystem(token="anon")
    # 200 is accepted silently
    gcs.validate_response(200, None, None, "/path")

    # HttpError with no JSON body
    with pytest.raises(HttpError) as exc:
        gcs.validate_response(503, b"", None, "/path")
    assert exc.value.code == 503
    assert exc.value.message == ", 503"

    # HttpError with JSON body
    err_json = {"error": {"code": 503, "message": b"Service Unavailable"}}
    with pytest.raises(HttpError) as exc:
        gcs.validate_response(503, None, err_json, "/path")
    assert exc.value.code == 503
    assert exc.value.message == b"Service Unavailable, 503"

    # 403
    err_json = {"error": {"message": "Not ok"}}
    with pytest.raises(IOError, match="Forbidden: /path\nNot ok"):
        gcs.validate_response(403, None, err_json, "/path")

    # 404
    with pytest.raises(FileNotFoundError):
        gcs.validate_response(404, b"", None, "/path")

    # 502
    with pytest.raises(ProxyError):
        gcs.validate_response(502, b"", None, "/path")

    # ChecksumError: body hash does not match the X-Goog-Hash header
    expected_md5 = repr(base64.b64encode(hashlib.md5(b"foo").digest()))[2:-1]
    with pytest.raises(ChecksumError):
        gcs.validate_response(0, b"f", None, "/path", {"X-Goog-Hash": f"md5={expected_md5}"})
def test_validate_response():
    """Each failing HTTP status raises the matching exception type."""
    gcs = GCSFileSystem(token="anon")
    # success path: nothing raised
    gcs.validate_response(200, None, None, "/path")

    # HttpError with no JSON body
    with pytest.raises(HttpError) as excinfo:
        gcs.validate_response(503, b"", None, "/path")
    assert excinfo.value.code == 503
    assert excinfo.value.message == ""

    # HttpError with JSON body
    payload = {"error": {"code": 503, "message": b"Service Unavailable"}}
    with pytest.raises(HttpError) as excinfo:
        gcs.validate_response(503, None, payload, "/path")
    assert excinfo.value.code == 503
    assert excinfo.value.message == b"Service Unavailable"

    # 403
    payload = {"error": {"message": "Not ok"}}
    with pytest.raises(IOError, match="Forbidden: /path\nNot ok"):
        gcs.validate_response(403, None, payload, "/path")

    # 404
    with pytest.raises(FileNotFoundError):
        gcs.validate_response(404, b"", None, "/path")

    # 502
    with pytest.raises(ProxyError):
        gcs.validate_response(502, b"", None, "/path")
def gcs_maker(populate=False):
    """Yield a fresh GCSFileSystem against TEST_BUCKET, cleaning up afterwards.

    Parameters
    ----------
    populate : bool
        If True, upload the sample file sets (files, csv_files, text_files)
        into the bucket before yielding.
    """
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        # The bucket may already exist; creation failure is non-fatal.
        # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit propagate.
        try:
            gcs.mkdir(TEST_BUCKET, default_acl="authenticatedread", acl="publicReadWrite")
        except Exception:
            pass
        # Remove any keys left over from a previous run (best-effort).
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except Exception:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + '/' + fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        # Best-effort cleanup of everything written under the bucket.
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except Exception:
                pass
def gcs_maker(populate=False, **kwargs):
    """Yield a fresh GCSFileSystem on an emptied TEST_BUCKET; clean up afterwards.

    Parameters
    ----------
    populate : bool
        If True, pipe `allfiles` into the bucket before yielding.
    **kwargs
        Forwarded to the GCSFileSystem constructor.
    """
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, **kwargs)
    gcs.invalidate_cache()
    try:
        # ensure we're empty.
        try:
            gcs.rm(TEST_BUCKET, recursive=True)
        except FileNotFoundError:
            pass
        try:
            gcs.mkdir(
                TEST_BUCKET, default_acl="authenticatedread", acl="publicReadWrite"
            )
        except Exception:
            pass
        if populate:
            gcs.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()})
        gcs.invalidate_cache()
        yield gcs
    finally:
        # Cleanup is best-effort and must never mask the test outcome.
        # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit propagate.
        try:
            gcs.rm(gcs.find(TEST_BUCKET))
        except Exception:
            pass
def gcs_maker(populate=False):
    """Yield a fresh GCSFileSystem on an emptied TEST_BUCKET; clean up afterwards.

    Parameters
    ----------
    populate : bool
        If True, upload the sample file sets (files, csv_files, text_files)
        into the bucket before yielding.
    """
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    gcs.invalidate_cache()
    try:
        # Bucket may already exist; only swallow the API error for that case.
        try:
            gcs.mkdir(TEST_BUCKET, default_acl="authenticatedread", acl="publicReadWrite")
        except gcsfs.utils.HttpError:
            pass
        # ensure we're empty.
        gcs.rm(TEST_BUCKET, recursive=True)
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except FileNotFoundError:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + "/" + fname, "wb") as f:
                        f.write(data)
        gcs.invalidate_cache()
        yield gcs
    finally:
        # Best-effort cleanup; narrowed from bare `except:` so
        # KeyboardInterrupt/SystemExit propagate.
        for f in gcs.find(TEST_BUCKET):
            try:
                gcs.rm(f)
            except Exception:
                pass
def __init__(self, project_id: str, bucket_name: str, service_account_file: str):
    """Set up a google-cloud-storage client and an fsspec filesystem for one bucket.

    Parameters
    ----------
    project_id : str
        GCP project owning the bucket.
    bucket_name : str
        Name of the target bucket.
    service_account_file : str
        Path to the service-account JSON key used for both clients.
    """
    self._bucket_name = bucket_name
    creds = service_account.Credentials.from_service_account_file(service_account_file)
    self._client = storage.Client(project=project_id, credentials=creds)
    self._fs = GCSFileSystem(token=service_account_file, check_connection=True)
def _load_fs_and_path(path, creds=None, session_creds=True, google_cloud_project=""):
    """Given url(path) and creds returns filesystem required for accessing
    that file + url's filepath in that filesystem.

    Parameters
    ----------
    path : str
        Local path, "s3://..." or "gcs://..." URL.
    creds : dict or str, optional
        Credentials for the remote filesystem.
    session_creds : bool
        If True, session credentials may be fetched via _connect and used.
    google_cloud_project : str
        Project passed to GCSFileSystem for gcs:// paths.

    Note: returns None implicitly for schemes it does not recognize
    (preserved from the original behavior).
    """
    # Local paths: expand ~ and strip an optional fs:// scheme.
    # (Idiom: one startswith call with a tuple instead of a chained `or`.)
    if path.startswith(("./", "/", "../", "~/")):
        return fsspec.filesystem("file"), os.path.expanduser(path.replace("fs://", ""))

    # Fetch session credentials unless the URL already names a known scheme.
    if (
        session_creds
        and creds is None
        and not path.startswith("s3://")
        and not path.startswith("gcs://")
    ):
        path, creds = _connect(path)

    if path.startswith("s3://"):
        path = path[5:]  # drop the "s3://" prefix
        if creds is not None and session_creds:
            # Full session credentials, including endpoint/region overrides.
            return (
                fsspec.filesystem(
                    "s3",
                    key=creds["access_key"],
                    secret=creds["secret_key"],
                    token=creds["session_token"],
                    client_kwargs={
                        "endpoint_url": creds["endpoint"],
                        "region_name": creds["region"],
                    },
                ),
                path,
            )
        elif creds is not None:
            # Static credentials only; missing keys fall back to None.
            return (
                fsspec.filesystem(
                    "s3",
                    key=creds.get("access_key"),
                    secret=creds.get("secret_key"),
                ),
                path,
            )
        else:
            # Anonymous / environment-configured S3 access.
            return fsspec.filesystem("s3"), path
    elif path.startswith("gcs://"):
        return (
            GCSFileSystem(project=google_cloud_project, token=creds),
            path[6:],  # drop the "gcs://" prefix
        )
def test_many_connect():
    """Concurrent construction from many threads must all succeed."""
    from multiprocessing.pool import ThreadPool

    # Prime the class-level token cache before spawning workers.
    GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)

    def task(i):
        GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN).ls("")
        return True

    pool = ThreadPool(processes=20)
    results = pool.map(task, range(40))
    assert all(results)
    pool.close()
    pool.join()
def test_request_header():
    """Outgoing requests carry the gcsfs User-Agent header."""
    with gcs_maker():
        gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, requester_pays=True)
        # test directly against `_call` to inspect the result
        response = gcs.call(
            "GET",
            "b/{}/o/",
            TEST_REQUESTER_PAYS_BUCKET,
            delimiter="/",
            prefix="test",
            maxResults=100,
            info_out=True,
        )
        assert response.headers["User-Agent"] == "python-gcsfs/" + version
def test_request_user_project():
    """requester_pays mode adds userProject=<project> to the query string."""
    with gcs_maker():
        gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, requester_pays=True)
        # test directly against `_call` to inspect the result
        response = gcs._call(
            "GET",
            "b/{}/o/",
            TEST_REQUESTER_PAYS_BUCKET,
            delimiter="/",
            prefix="test",
            maxResults=100,
        )
        query = urlparse(response.request.url).query
        params = parse_qs(query)
        assert params["userProject"] == [TEST_PROJECT]
def gcs_maker(populate=False):
    """Yield a GCSFileSystem with TEST_BUCKET available; remove its contents after.

    Parameters
    ----------
    populate : bool
        If True, upload the sample file sets (files, csv_files, text_files)
        into the bucket before yielding.
    """
    gcs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    try:
        if not gcs.exists(TEST_BUCKET):
            gcs.mkdir(TEST_BUCKET)
        # Remove leftover keys from previous runs (best-effort).
        # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit propagate.
        for k in [a, b, c, d]:
            try:
                gcs.rm(k)
            except Exception:
                pass
        if populate:
            for flist in [files, csv_files, text_files]:
                for fname, data in flist.items():
                    with gcs.open(TEST_BUCKET + '/' + fname, 'wb') as f:
                        f.write(data)
        yield gcs
    finally:
        # Plain loop instead of a list comprehension used only for side effects.
        for f in gcs.walk(TEST_BUCKET):
            gcs.rm(f)
parser.add_argument('--firecloud', action='store_true', help="Use logic to generate primary keys for Terra upload via Firecloud") parser.add_argument('--debug', action='store_true', help="Write additional logs for debugging") args = parser.parse_args() log_level = logging.DEBUG if args.debug else logging.INFO logging.basicConfig(level=log_level) log = logging.getLogger(__name__) TERRA_COLUMN_LIMIT = 1000 table_names = args.table or ['cslb', 'hles_cancer_condition', 'hles_dog', 'hles_health_condition', 'hles_owner', 'environment', 'sample', 'eols'] PRIMARY_KEY_PREFIX = 'entity' gcs = GCSFileSystem() # create a service object to handle all aspects of generating a primary key @dataclass class PrimaryKeyGenerator: table_name: str pk_name: str = field(init=False) firecloud: bool # this will calculate pk_name during init def __post_init__(self): # most tables should have "dog_id" as a key if self.table_name in {"hles_dog", "hles_cancer_condition", "hles_health_condition", "environment", "cslb", "eols"}: self.pk_name = 'dog_id' # owner table is linked to hles_dog via "owner_id" elif self.table_name == 'hles_owner':
def task(i):
    """Worker body: construct a filesystem inside the thread and list the root."""
    # first instance is made within thread - creating loop
    fs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    fs.ls("")
    return True
def test_user_project_cat():
    """A requester_pays filesystem can read from a requester-pays bucket."""
    fs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN, requester_pays=True)
    data = fs.cat(TEST_REQUESTER_PAYS_BUCKET + "/foo.csv")
    assert len(data)
def test_user_project_fallback_google_default(mock_auth):
    """When no project is given, the google default project is adopted."""
    mock_auth.default.return_value = (requests.Session(), "my_default_project")
    filesystem = GCSFileSystem(token="google_default")
    assert filesystem.project == "my_default_project"
def test_current():
    """current() returns the active instance; equal constructors share a session."""
    with gcs_maker() as gcs:
        assert GCSFileSystem.current() is gcs
        second = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
        assert second.session is gcs.session
def task(i):
    """Worker body: build a filesystem and list the root, reporting success."""
    fs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    fs.ls("")
    return True
def test_simple():
    """Basic listing works, with or without a leading slash on the bucket."""
    assert not GCSFileSystem.tokens
    fs = GCSFileSystem(TEST_PROJECT, token=GOOGLE_TOKEN)
    fs.ls(TEST_BUCKET)  # no error
    fs.ls("/" + TEST_BUCKET)  # OK to lead with '/'