def update(self, plugin: RepositoryPlugin) -> None:
    require(self.replica is None or self.replica == 'gcp')
    assert self.drs_path is not None
    drs_uri = plugin.drs_uri(self.drs_path)
    drs_client = plugin.drs_client()
    access = drs_client.get_object(drs_uri, access_method=AccessMethod.gs)
    assert access.headers is None
    url = furl(access.url)
    blob_name = '/'.join(url.path.segments)
    # https://github.com/databiosphere/azul/issues/2479#issuecomment-733410253
    if url.fragmentstr:
        blob_name += '#' + unquote(url.fragmentstr)
    else:
        # furl does not differentiate between no fragment and empty
        # fragment
        if access.url.endswith('#'):
            blob_name += '#'
    blob = self._get_blob(bucket_name=url.netloc, blob_name=blob_name)
    expiration = int(time.time() + 3600)
    file_name = self.file_name.replace('"', r'\"')
    assert all(0x1f < ord(c) < 0x80 for c in file_name), file_name
    disposition = f'attachment; filename="{file_name}"'
    signed_url = blob.generate_signed_url(expiration=expiration,
                                          response_disposition=disposition)
    self._location = signed_url
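# A minimal standalone sketch of the signing step above, assuming
# google-cloud-storage is installed and credentials capable of signing
# (e.g. a service account key) are configured; the bucket and blob names
# are hypothetical.
import time

from google.cloud import storage

client = storage.Client()
blob = client.bucket('example-bucket').blob('path/to/example.fastq.gz')
# Same scheme as above: an absolute Unix timestamp one hour out, and a
# Content-Disposition that forces a download under a friendly file name.
url = blob.generate_signed_url(
    expiration=int(time.time() + 3600),
    response_disposition='attachment; filename="example.fastq.gz"')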
def update(self,
           plugin: RepositoryPlugin,
           authentication: Optional[Authentication]
           ) -> None:
    require(self.replica is None or self.replica == 'gcp')
    assert self.drs_path is not None
    drs_uri = plugin.drs_uri(self.drs_path)
    drs_client = plugin.drs_client(authentication)
    access = drs_client.get_object(drs_uri, access_method=AccessMethod.gs)
    require(access.method is AccessMethod.https, access.method)
    require(access.headers is None, access.headers)
    signed_url = access.url
    args = furl(signed_url).args
    require('X-Goog-Signature' in args, args)
    self._location = signed_url
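# A small sketch of the signature check above, applied to a hypothetical
# signed URL; furl(...).args exposes the query parameters as a mapping, so a
# membership test suffices to confirm that the URL carries a signature.
from furl import furl

signed_url = ('https://storage.googleapis.com/bucket/key'
              '?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Signature=abc123')
args = furl(signed_url).args
assert 'X-Goog-Signature' in args, args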
def update(self,
           plugin: RepositoryPlugin,
           authentication: Optional[Authentication]
           ) -> None:
    assert isinstance(plugin, Plugin)
    url = plugin.direct_file_url(file_uuid=self.file_uuid,
                                 file_version=self.file_version,
                                 replica=None)
    self._location = url
def _create_db(self) -> Tuple[JSONs, str]:
    """
    Write hardcoded portal integrations DB to S3.

    :return: Newly created DB and accompanying version.
    """
    catalog = config.default_catalog
    plugin = RepositoryPlugin.load(catalog).create(catalog)
    db = self.demultiplex(plugin.portal_db())
    version = self._write_db(db, None)
    return db, version
def update(self,
           plugin: RepositoryPlugin,
           authentication: Optional[Authentication]
           ) -> None:
    self.drs_path = None  # to shorten the retry URLs
    if self.replica is None:
        self.replica = 'aws'
    assert isinstance(plugin, Plugin)
    dss_url = plugin.direct_file_url(file_uuid=self.file_uuid,
                                     file_version=self.file_version,
                                     replica=self.replica,
                                     token=self.token)
    dss_response = requests.get(dss_url, allow_redirects=False)
    if dss_response.status_code == 301:
        retry_after = int(dss_response.headers.get('Retry-After'))
        location = dss_response.headers['Location']
        location = urllib.parse.urlparse(location)
        query = urllib.parse.parse_qs(location.query, strict_parsing=True)
        self.token = one(query['token'])
        self.replica = one(query['replica'])
        self.file_version = one(query['version'])
        self._retry_after = retry_after
    elif dss_response.status_code == 302:
        location = dss_response.headers['Location']
        # Remove once https://github.com/HumanCellAtlas/data-store/issues/1837 is resolved
        if True:
            location = urllib.parse.urlparse(location)
            query = urllib.parse.parse_qs(location.query, strict_parsing=True)
            expires = int(one(query['Expires']))
            bucket = location.netloc.partition('.')[0]
            dss_endpoint = one(plugin.sources).name
            assert bucket == aws.dss_checkout_bucket(dss_endpoint), bucket
            with aws.direct_access_credentials(dss_endpoint, lambda_name='service'):
                # FIXME: make region configurable
                #        https://github.com/DataBiosphere/azul/issues/1560
                s3 = aws.client('s3', region_name='us-east-1')
                params = {
                    'Bucket': bucket,
                    'Key': location.path[1:],
                    'ResponseContentDisposition': 'attachment;filename=' + self.file_name,
                }
                location = s3.generate_presigned_url(
                    ClientMethod=s3.get_object.__name__,
                    ExpiresIn=round(expires - time.time()),
                    Params=params)
        self._location = location
    else:
        dss_response.raise_for_status()
        assert False
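# A minimal sketch of the 301 retry bookkeeping above, run against a
# hypothetical DSS redirect URL. parse_qs maps each parameter name to a list
# of values, which is why the code unwraps each one with one().
import urllib.parse

from more_itertools import one

location = ('https://dss.example.org/v1/files/some-uuid'
            '?replica=aws&version=2020-01-01T00:00:00.000000Z&token=some-token')
location = urllib.parse.urlparse(location)
query = urllib.parse.parse_qs(location.query, strict_parsing=True)
assert one(query['replica']) == 'aws'
assert one(query['token']) == 'some-token'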
def verify_sources():
    tdr_catalogs = {
        catalog.name
        for catalog in config.catalogs.values()
        if catalog.plugins[RepositoryPlugin.type_name()].name == 'tdr'
    }
    assert tdr_catalogs, tdr_catalogs
    futures = []
    with ThreadPoolExecutor(max_workers=16) as tpe:
        for source in set(chain.from_iterable(map(config.sources, tdr_catalogs))):
            source = TDRSourceSpec.parse(source)
            for check in (tdr.check_api_access, tdr.check_bigquery_access, verify_source):
                futures.append(tpe.submit(check, source))
        for completed_future in as_completed(futures):
            futures.remove(completed_future)
            e = completed_future.exception()
            if e is not None:
                for running_future in futures:
                    running_future.cancel()
                raise e
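# A self-contained sketch of the fail-fast pattern used above: submit
# independent checks, and as soon as one of them raises, cancel every check
# that hasn't started yet and re-raise. The check function is a hypothetical
# stand-in for the TDR access checks.
from concurrent.futures import ThreadPoolExecutor, as_completed

def check(i: int) -> None:
    if i == 3:
        raise RuntimeError(f'check {i} failed')

futures = []
with ThreadPoolExecutor(max_workers=4) as tpe:
    for i in range(8):
        futures.append(tpe.submit(check, i))
    # as_completed copies the iterable internally, so removing completed
    # futures from the list only affects the cancellation loop below
    for completed_future in as_completed(list(futures)):
        futures.remove(completed_future)
        e = completed_future.exception()
        if e is not None:
            for running_future in futures:
                running_future.cancel()
            raise e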
def main(argv):
    configure_script_logging(logger)
    import argparse
    parser = argparse.ArgumentParser(
        description='Subscribe indexer lambda to bundle events from DSS')
    parser.add_argument('--unsubscribe', '-U',
                        dest='subscribe',
                        action='store_false',
                        default=True)
    parser.add_argument('--personal', '-p',
                        dest='shared',
                        action='store_false',
                        default=True,
                        help="Do not use the shared credentials of the Google service account "
                             "that represents the current deployment, but instead use personal "
                             "credentials for authenticating to the DSS. When specifying this "
                             "option you will need to a) run `hca dss login` prior to running "
                             "this script or b) set GOOGLE_APPLICATION_CREDENTIALS to point to "
                             "another service account's credentials. Note that this implies "
                             "that the resulting DSS subscription will be owned by a) you or "
                             "b) the other service account and that only a) you or b) someone "
                             "in possession of those credentials can modify the subscription "
                             "in the future. This is typically not what you'd want.")
    options = parser.parse_args(argv)
    dss_client = azul.dss.client()
    for catalog in config.catalogs:
        plugin = RepositoryPlugin.load(catalog)
        if isinstance(plugin, dss.Plugin):
            if options.shared:
                with aws.service_account_credentials(config.ServiceAccount.indexer):
                    subscription.manage_subscriptions(plugin,
                                                      dss_client,
                                                      subscribe=options.subscribe)
            else:
                subscription.manage_subscriptions(plugin,
                                                  dss_client,
                                                  subscribe=options.subscribe)
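# Hypothetical invocations of the entry point above (the script path is an
# assumption); by default it subscribes using the shared service account:
#
#   python scripts/subscribe.py                # subscribe, shared credentials
#   python scripts/subscribe.py --personal     # subscribe, personal credentials
#   python scripts/subscribe.py --unsubscribe  # remove the subscription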
def main(argv):
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=AzulArgumentHelpFormatter)
    default_catalog = config.default_catalog
    plugin_cls = RepositoryPlugin.load(default_catalog)
    plugin = plugin_cls.create(default_catalog)
    if len(plugin.sources) == 1:
        source_arg = {'default': str(one(plugin.sources))}
    else:
        source_arg = {'required': True}
    parser.add_argument('--source', '-s',
                        **source_arg,
                        help='The repository source containing the bundle')
    parser.add_argument('--uuid', '-b',
                        required=True,
                        help='The UUID of the bundle to can.')
    parser.add_argument('--version', '-v',
                        help='The version of the bundle to can (default: the latest version).')
    parser.add_argument('--output-dir', '-O',
                        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
                        help='The path to the output directory (default: %(default)s).')
    args = parser.parse_args(argv)
    bundle = fetch_bundle(args.source, args.uuid, args.version)
    save_bundle(bundle, args.output_dir)
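# A hypothetical invocation of the entry point above (the script path, source
# spec, and UUID are made-up examples); the canned bundle is written to the
# default output directory under test/indexer/data:
#
#   python scripts/can_bundle.py --source <source-spec> \
#                                --uuid 01234567-89ab-cdef-0123-456789abcdef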
def repository_plugin(self, catalog: CatalogName) -> RepositoryPlugin:
    return RepositoryPlugin.load(catalog).create(catalog)
def plugin_for(catalog):
    return RepositoryPlugin.load(catalog).create(catalog)
def plugin_db(self) -> JSONs:
    # Must be lazy so the mock catalog's repository plugin is used
    catalog = config.default_catalog
    plugin = RepositoryPlugin.load(catalog).create(catalog)
    return plugin.portal_db()
def default_db(self) -> JSONs:
    # FIXME: Parameterize PortalService instances with current catalog
    #        https://github.com/DataBiosphere/azul/issues/2716
    catalog = config.default_catalog
    plugin = RepositoryPlugin.load(catalog).create(catalog)
    return self.demultiplex(plugin.portal_db())
def repository_plugin(self) -> RepositoryPlugin:
    catalog = self.catalog
    return RepositoryPlugin.load(catalog).create(catalog)