class SimpleHTTPResolver(_AbstractResolver):
    '''
    Example resolver that one might use if image files were coming from
    an http image store (like Fedora Commons).

    The first call to `resolve()` copies the source image into a local cache;
    subsequent calls use the local copy from the cache.

    The config dictionary MUST contain

     * `cache_root`, which is the absolute path to the directory where source
       images should be cached.

    The config dictionary MAY contain

     * `source_prefix`, the url up to the identifier.
     * `source_suffix`, the url after the identifier (if applicable).
     * `default_format`, the format of images (will use content-type of
       response if not specified).
     * `head_resolvable` with value True, whether to make HEAD requests to
       verify object existence (don't set if using Fedora Commons prior to
       3.8).
     * `uri_resolvable` with value True, allows one to use full URIs to
       resolve to an image.
     * `user`, the username to make the HTTP request as.
     * `pw`, the password to use for the HTTP request.
     * `ssl_check`, whether to check the validity of the origin server's HTTPS
       certificate. Set to False if you are using an origin server with a
       self-signed certificate.
     * `cert`, path to an SSL client certificate to use for authentication. If
       `cert` and `key` are both present, they take precedence over `user` and
       `pw` for authentication.
     * `key`, path to an SSL client key to use for authentication.
    '''

    def __init__(self, config):
        super(SimpleHTTPResolver, self).__init__(config)
        self.source_prefix = self.config.get('source_prefix', '')
        self.source_suffix = self.config.get('source_suffix', '')
        self.default_format = self.config.get('default_format', None)
        self.head_resolvable = self.config.get('head_resolvable', False)
        self.uri_resolvable = self.config.get('uri_resolvable', False)
        self.user = self.config.get('user', None)
        self.pw = self.config.get('pw', None)
        self.cert = self.config.get('cert', None)
        self.key = self.config.get('key', None)
        self.ssl_check = self.config.get('ssl_check', True)

        self._ident_regex_checker = IdentRegexChecker(
            ident_regex=self.config.get('ident_regex'))
        self._cache_namer = CacheNamer()

        if 'cache_root' in self.config:
            self.cache_root = self.config['cache_root']
        else:
            message = ('Server Side Error: Configuration incomplete and '
                       'cannot resolve. Missing setting for cache_root.')
            logger.error(message)
            raise ResolverException(message)

        if not self.uri_resolvable and self.source_prefix == '':
            message = ('Server Side Error: Configuration incomplete and '
                       'cannot resolve. Must either set uri_resolvable '
                       'or source_prefix settings.')
            logger.error(message)
            raise ResolverException(message)

    def request_options(self):
        # parameters to pass to all head and get requests
        options = {}
        if self.cert is not None and self.key is not None:
            options['cert'] = (self.cert, self.key)
        if self.user is not None and self.pw is not None:
            options['auth'] = (self.user, self.pw)
        options['verify'] = self.ssl_check
        return options

    def is_resolvable(self, ident):
        ident = unquote(ident)

        if not self._ident_regex_checker.is_allowed(ident):
            return False

        fp = self.cache_dir_path(ident=ident)
        if exists(fp):
            return True
        else:
            try:
                (url, options) = self._web_request_url(ident)
            except ResolverException:
                return False

            try:
                if self.head_resolvable:
                    response = requests.head(url, **options)
                    return response.ok
                else:
                    with closing(requests.get(url, stream=True, **options)) as response:
                        return response.ok
            except requests.ConnectionError:
                return False

    def get_format(self, ident, potential_format):
        if self.default_format is not None:
            return self.default_format
        elif potential_format is not None:
            return potential_format
        else:
            return self.format_from_ident(ident)

    def _web_request_url(self, ident):
        if ident.startswith(('http://', 'https://')) and self.uri_resolvable:
            url = ident
        else:
            url = self.source_prefix + ident + self.source_suffix
        if not url.startswith(('http://', 'https://')):
            logger.warn('Bad URL request at %s for identifier: %s.', url, ident)
            raise ResolverException(
                "Bad URL request made for identifier: %r." % ident)
        return (url, self.request_options())

    def cache_dir_path(self, ident):
        return os.path.join(
            self.cache_root,
            CacheNamer.cache_directory_name(ident=ident))

    def raise_404_for_ident(self, ident):
        raise ResolverException("Image not found for identifier: %r." % ident)

    def cached_file_for_ident(self, ident):
        cache_dir = self.cache_dir_path(ident)
        if exists(cache_dir):
            files = glob.glob(join(cache_dir, 'loris_cache.*'))
            if files:
                return files[0]
        return None

    def cache_file_extension(self, ident, response):
        if 'content-type' in response.headers:
            try:
                extension = self.get_format(
                    ident,
                    constants.FORMATS_BY_MEDIA_TYPE[
                        response.headers['content-type']])
            except KeyError:
                logger.warn(
                    'Your server may be responding with incorrect '
                    'content-types. Reported %s for ident %s.',
                    response.headers['content-type'], ident)
                # Attempt without the content-type
                extension = self.get_format(ident, None)
        else:
            extension = self.get_format(ident, None)
        return extension

    def copy_to_cache(self, ident):
        ident = unquote(ident)

        # get source image and write to temporary file
        (source_url, options) = self._web_request_url(ident)
        assert source_url is not None

        cache_dir = self.cache_dir_path(ident)
        mkdir_p(cache_dir)

        with closing(requests.get(source_url, stream=True, **options)) as response:
            if not response.ok:
                logger.warn(
                    "Source image not found at %s for identifier: %s. "
                    "Status code returned: %s.",
                    source_url, ident, response.status_code)
                raise ResolverException(
                    "Source image not found for identifier: %s. "
                    "Status code returned: %s." % (ident, response.status_code))

            extension = self.cache_file_extension(ident, response)
            local_fp = join(cache_dir, "loris_cache." + extension)

            with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tmp_file:
                for chunk in response.iter_content(2048):
                    tmp_file.write(chunk)

        # Now rename the temp file to the desired file name if it still
        # doesn't exist (another process could have created it).
        #
        # Note: This is purely an optimisation; if the file springs into
        # existence between the existence check and the copy, it will be
        # overridden.
        if exists(local_fp):
            logger.info('Another process downloaded src image %s', local_fp)
            remove(tmp_file.name)
        else:
            safe_rename(tmp_file.name, local_fp)
            logger.info("Copied %s to %s", source_url, local_fp)

        # Check for rules file associated with image file.
        # These files are < 2k in size, so fetch in one go.
        # Assumes that the rules will be next to the image;
        # cache_dir is image specific, so this is easy.
        bits = split(source_url)
        fn = bits[1].rsplit('.', 1)[0] + "." + self.auth_rules_ext
        rules_url = bits[0] + '/' + fn
        try:
            resp = requests.get(rules_url)
            if resp.status_code == 200:
                local_rules_fp = join(
                    cache_dir, "loris_cache." + self.auth_rules_ext)
                if not exists(local_rules_fp):
                    with open(local_rules_fp, 'w') as fh:
                        fh.write(resp.text)
        except:
            # No connection available
            pass

        return local_fp

    def resolve(self, app, ident, base_uri):
        cached_file_path = self.cached_file_for_ident(ident)
        if not cached_file_path:
            cached_file_path = self.copy_to_cache(ident)

        format_ = self.get_format(cached_file_path, None)
        uri = self.fix_base_uri(base_uri)

        if self.use_extra_info:
            extra = self.get_extra_info(ident, cached_file_path)
        else:
            extra = {}

        return ImageInfo(app, uri, cached_file_path, format_, extra)
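
# Illustrative sketch (not part of loris itself): a minimal SimpleHTTPResolver
# configuration. The cache path, hostname, and the helper name below are
# assumptions for demonstration only. With this config, an identifier such as
# 'some/ident.jpg' would be fetched from
# https://images.example.org/some/ident.jpg and cached under cache_root.
def _example_simple_http_resolver():
    config = {
        'cache_root': '/var/cache/loris',                # mandatory
        'source_prefix': 'https://images.example.org/',  # url up to the ident
        'source_suffix': '',                             # url after the ident
        'head_resolvable': True,                         # verify via HEAD requests
        'ssl_check': True,
    }
    return SimpleHTTPResolver(config)
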
class S3Resolver(_AbstractResolver):
    """Resolver for image files stored in AWS s3 buckets.

    The first call to `resolve()` copies the source image into a local cache;
    subsequent calls use the local copy from the cache.

    A config example:

        [resolver]
        impl = 'hxloris.s3resolver.S3Resolver'

        # absolute path to dir where source images are downloaded from s3
        # mandatory
        cache_root = '/var/loris_cache_root'

        # subsection to define mappings from :ident to an s3 bucket/key
        # optional
        [[bucket_map]]
            [[[site1]]]
                bucket = 'bucket-for-site1'
                key_prefix = 'loris/images'
            [[[site2]]]
                bucket = 'bucket-for-site2'
                key_prefix = 'loris/other-images'
            ...

    an incoming request url and its corresponding s3 bucket/prefix:

        http://localhost/site1/this/that/image.jpg/:region/:size/:rotation/default.jpg
        s3://bucket-for-site1/loris/images/this/that/image.jpg

    or

        http://localhost/site2/blah/image3.jpg/:region/:size/:rotation/default.jpg
        s3://bucket-for-site2/loris/other-images/blah/image3.jpg

    `bucket_map` is optional (as is `key_prefix`), but the request url must
    always include a bucket segment. For example, the url below is invalid:

        http://localhost/image.jpg

    If it looks too similar to loris.resolver.SimpleHTTPResolver... you're
    right!
    """

    def __init__(self, config):
        super(S3Resolver, self).__init__(config)
        self.default_format = self.config.get("default_format", None)

        self._ident_regex_checker = IdentRegexChecker(
            ident_regex=self.config.get("ident_regex"))
        self._cache_namer = CacheNamer()

        if "cache_root" in self.config:
            self.cache_root = self.config["cache_root"]
        else:
            message = ("Server Side Error: Configuration incomplete and "
                       "cannot resolve. Missing setting for cache_root.")
            logger.error(message)
            raise ResolverException(message)

        self.has_bucket_map = False
        if "bucket_map" in config:
            self.bucket_map = config["bucket_map"]
            self.has_bucket_map = True
            logger.debug("s3 bucket_map: {}".format(self.bucket_map))

        # boto3: if not in us-east-1, set envvar AWS_DEFAULT_REGION to avoid
        # extra requests when downloading from s3
        # thread safe:
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html#multithreading-multiprocessing
        session = boto3.session.Session()
        self.s3 = session.resource("s3")

        logger.info("loaded s3 resolver with config: {}".format(config))

    def raise_404_for_ident(self, ident):
        message = "Source image not found for identifier: %s." % (ident,)
        logger.warn(message)
        raise ResolverException(message)

    def is_resolvable(self, ident):
        """Checks if ident points to a readable s3 object.

        This generates a head request for the s3 object.
        """
        ident = unquote(ident)

        if not self._ident_regex_checker.is_allowed(ident):
            return False

        fp = self.cache_dir_path(ident=ident)
        if os.path.exists(fp):
            return True
        else:
            try:
                (bucketname, keyname) = self.s3bucket_from_ident(ident)
            except ResolverException as e:
                logger.warn(e)
                return False

            # check that we can get to this object on s3;
            # access to the s3obj property generates a head request or 404
            try:
                s3obj = self.s3.Object(bucketname, keyname)
                content_length = s3obj.content_length
            except Exception as e:
                logger.error("unable to access s3 object ({}:{}): {}".format(
                    bucketname, keyname, e))
                return False
            else:
                if content_length > 0:
                    return True
                else:
                    logger.warning("empty s3 object ({}:{})".format(
                        bucketname, keyname))
                    return False

    def get_format(self, ident, potential_format):
        if self.default_format is not None:
            return self.default_format
        elif potential_format is not None:
            return potential_format
        else:
            return self.format_from_ident(ident)

    def s3bucket_from_ident(self, ident):
        """Returns tuple (bucketname, keyname) parsed from ident."""
        key_parts = ident.split("/", 1)
        if len(key_parts) == 2:
            (bucket, partial_key) = key_parts
        else:
            raise ResolverException(
                "Invalid identifier. Expected bucket/ident; got {}".format(
                    key_parts))

        # check if bucketname actually means something different
        if self.has_bucket_map and bucket in self.bucket_map:
            bucketname = self.bucket_map[bucket]["bucket"]
            if "key_prefix" in self.bucket_map[bucket]:
                keyname = os.path.join(
                    self.bucket_map[bucket]["key_prefix"], partial_key)
            else:
                keyname = partial_key
            return (bucketname, keyname)
        else:
            # what came in ident is the actual bucketname
            return (bucket, partial_key)

    def cache_dir_path(self, ident):
        # build dir path for ident file in cache
        return os.path.join(
            self.cache_root,
            CacheNamer.cache_directory_name(ident=ident))

    def cached_file_for_ident(self, ident):
        # recover filepath for ident in cache
        cache_dir = self.cache_dir_path(ident)
        if os.path.exists(cache_dir):
            files = glob.glob(os.path.join(cache_dir, "loris_cache.*"))
            if files:
                return files[0]
        return None

    def cache_file_extension(self, ident, content_type=None):
        if content_type is not None:
            try:
                extension = self.get_format(
                    ident, constants.FORMATS_BY_MEDIA_TYPE[content_type])
            except KeyError:
                logger.warn(
                    "wonky s3 resource content-type (%s) for ident (%s)",
                    content_type, ident)
                # Attempt without the content-type
                extension = self.get_format(ident, None)
        else:
            extension = self.get_format(ident, None)
        return extension

    def copy_to_cache(self, ident):
        """Downloads the image source file from s3, if not in cache already."""
        ident = unquote(ident)

        # get source image and write to temporary file
        (bucketname, keyname) = self.s3bucket_from_ident(ident)
        try:
            s3obj = self.s3.Object(bucketname, keyname)
            content_type = s3obj.content_type
        except Exception as e:
            msg = "no content_type for s3 object ({}:{}): {}".format(
                bucketname, keyname, e)
            logger.error(msg)
            raise ResolverException(msg)

        extension = self.cache_file_extension(ident, content_type)
        cache_dir = self.cache_dir_path(ident)
        os.makedirs(cache_dir, exist_ok=True)
        local_fp = os.path.join(cache_dir, "loris_cache." + extension)

        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tmp_file:
            try:
                self.s3.Bucket(bucketname).download_fileobj(keyname, tmp_file)
            except Exception as e:
                msg = "unable to access or save s3 object ({}:{}): {}".format(
                    bucketname, keyname, e)
                logger.error(msg)
                raise ResolverException(msg)

        # Now rename the temp file to the desired file name if it still
        # doesn't exist (another process could have created it).
        #
        # Note: This is purely an optimisation; if the file springs into
        # existence between the existence check and the copy, it will be
        # overridden.
        if os.path.exists(local_fp):
            logger.info(
                "Another process downloaded src image {}".format(local_fp))
            os.remove(tmp_file.name)
        else:
            safe_rename(tmp_file.name, local_fp)
            logger.info("Copied {}:{} to {}".format(
                bucketname, keyname, local_fp))

        # Check for rules file associated with image file.
        # These files are < 2k in size, so fetch in one go.
        # Assumes that the rules will be next to the image;
        # cache_dir is image specific, so this is easy.
        bits = os.path.split(keyname)  # === bash basename
        fn = bits[1].rsplit(".", 1)[0] + "." + self.auth_rules_ext
        rules_keyname = bits[0] + "/" + fn
        local_rules_fp = os.path.join(
            cache_dir, "loris_cache." + self.auth_rules_ext)
        try:
            self.s3.Object(bucketname, rules_keyname).download_file(
                local_rules_fp)
        except Exception as e:
            # no connection available?
            msg = "ignoring rules file({}/{}) for ident({}): {}".format(
                bucketname, rules_keyname, ident, e)
            logger.warn(msg)

        return local_fp

    def resolve(self, app, ident, base_uri):
        if not self.is_resolvable(ident):
            self.raise_404_for_ident(ident)

        cached_file_path = self.cached_file_for_ident(ident)
        if not cached_file_path:
            cached_file_path = self.copy_to_cache(ident)

        format_ = self.get_format(cached_file_path, None)
        auth_rules = self.get_auth_rules(ident, cached_file_path)

        return ImageInfo(
            app=app,
            src_img_fp=cached_file_path,
            src_format=format_,
            auth_rules=auth_rules,
        )
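
# Illustrative sketch (not part of hxloris): how S3Resolver maps an identifier
# to an s3 bucket/key using a bucket_map like the one in the class docstring.
# The bucket name, key prefix, and helper name are assumptions, not real
# resources; constructing the resolver also assumes boto3 and normal AWS
# region/credential configuration in the environment.
def _example_s3_ident_mapping():
    config = {
        "cache_root": "/var/loris_cache_root",
        "bucket_map": {
            "site1": {
                "bucket": "bucket-for-site1",
                "key_prefix": "loris/images",
            },
        },
    }
    resolver = S3Resolver(config)
    # "site1/this/that/image.jpg" maps to
    # ("bucket-for-site1", "loris/images/this/that/image.jpg")
    return resolver.s3bucket_from_ident("site1/this/that/image.jpg")
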
def test_checker_has_correct_is_allowed(
    self, ident_regex, ident, expected_is_allowed
):
    checker = IdentRegexChecker(ident_regex=ident_regex)
    assert checker.is_allowed(ident=ident) is expected_is_allowed

def test_any_ident_is_allowed_if_regex_is_none(self, ident):
    checker = IdentRegexChecker(ident_regex=None)
    assert checker.is_allowed(ident=ident) is True
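
# Illustrative sketch (not part of the test suite): the two tests above expect
# pytest to inject ident_regex / ident / expected_is_allowed, typically via
# @pytest.mark.parametrize. The regexes and identifiers below are assumed
# example cases, not the project's actual fixtures.
def _example_ident_regex_cases():
    cases = [
        # (ident_regex, ident, expected_is_allowed)
        (r'^[a-z0-9/.\-]+\.jpg$', 'this/that/image.jpg', True),
        (r'^site1/.*$', 'site2/image.jpg', False),
    ]
    for ident_regex, ident, expected in cases:
        checker = IdentRegexChecker(ident_regex=ident_regex)
        assert checker.is_allowed(ident=ident) is expected
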