Пример #1
0
    def __init__(self, config):
        """Configure the resolver from ``config``.

        Reads the optional ``default_format``, ``ident_regex`` and
        ``bucket_map`` settings and the mandatory ``cache_root`` setting,
        then creates the boto3 s3 resource stored on ``self.s3``.

        Raises:
            ResolverException: when ``cache_root`` is not configured.
        """
        super(S3Resolver, self).__init__(config)
        self.default_format = self.config.get("default_format", None)

        ident_regex = self.config.get("ident_regex")
        self._ident_regex_checker = IdentRegexChecker(ident_regex=ident_regex)
        self._cache_namer = CacheNamer()

        # cache_root is the one setting we cannot work without.
        if "cache_root" not in self.config:
            message = ("Server Side Error: Configuration incomplete and "
                       "cannot resolve. Missing setting for cache_root.")
            logger.error(message)
            raise ResolverException(message)
        self.cache_root = self.config["cache_root"]

        # bucket_map is optional; remember whether it was supplied.
        self.has_bucket_map = "bucket_map" in config
        if self.has_bucket_map:
            self.bucket_map = config["bucket_map"]
            logger.debug("s3 bucket_map: {}".format(self.bucket_map))

        # boto3: when not running in us-east-1, set the AWS_DEFAULT_REGION
        # envvar to avoid extra requests when downloading from s3.
        # On thread safety see:
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html#multithreading-multiprocessing
        self.s3 = boto3.session.Session().resource("s3")

        logger.info("loaded s3 resolver with config: {}".format(config))
Пример #2
0
    def __init__(self, config):
        """Configure the resolver from ``config``.

        Copies the optional settings (with their defaults) onto the
        instance, builds the ident regex checker and cache namer, and
        validates the mandatory settings.

        Raises:
            ResolverException: when ``cache_root`` is missing, or when
                neither ``uri_resolvable`` nor ``source_prefix`` is set.
        """
        super(SimpleHTTPResolver, self).__init__(config)
        settings = self.config

        # Where and how to build the source URL.
        self.source_prefix = settings.get('source_prefix', '')
        self.source_suffix = settings.get('source_suffix', '')
        self.default_format = settings.get('default_format', None)
        self.head_resolvable = settings.get('head_resolvable', False)
        self.uri_resolvable = settings.get('uri_resolvable', False)

        # Credentials and TLS behaviour for outgoing requests.
        self.user = settings.get('user', None)
        self.pw = settings.get('pw', None)
        self.cert = settings.get('cert', None)
        self.key = settings.get('key', None)
        self.ssl_check = settings.get('ssl_check', True)

        ident_regex = settings.get('ident_regex')
        self._ident_regex_checker = IdentRegexChecker(ident_regex=ident_regex)
        self._cache_namer = CacheNamer()

        if 'cache_root' not in self.config:
            message = 'Server Side Error: Configuration incomplete and cannot resolve. Missing setting for cache_root.'
            logger.error(message)
            raise ResolverException(message)
        self.cache_root = self.config['cache_root']

        # Without full-uri identifiers a prefix is needed to build a URL.
        lacks_source = (not self.uri_resolvable) and self.source_prefix == ''
        if lacks_source:
            message = ('Server Side Error: Configuration incomplete and cannot resolve. Must either set uri_resolvable'
                       ' or source_prefix settings.')
            logger.error(message)
            raise ResolverException(message)
Пример #3
0
class SimpleHTTPResolver(_AbstractResolver):
    '''
    Example resolver that one might use if image files were coming from
    an http image store (like Fedora Commons). The first call to `resolve()`
    copies the source image into a local cache; subsequent calls use local
    copy from the cache.

    The config dictionary MUST contain
     * `cache_root`, which is the absolute path to the directory where source images
        should be cached.

    The config dictionary MAY contain
     * `source_prefix`, the url up to the identifier.
     * `source_suffix`, the url after the identifier (if applicable).
     * `default_format`, the format of images (will use content-type of response if not specified).
     * `head_resolvable` with value True, whether to make HEAD requests to verify object existence (don't set if using
        Fedora Commons prior to 3.8).
     * `uri_resolvable` with value True, allows one to use full uri's to resolve to an image.
     * `user`, the username to make the HTTP request as.
     * `pw`, the password to make the HTTP request as.
     * `ssl_check`, whether to check the validity of the origin server's HTTPS
     certificate. Set to False if you are using an origin server with a
     self-signed certificate.
     * `cert`, path to an SSL client certificate to use for authentication. If `cert` and `key` are both present, they take precedence over `user` and `pw` for authentication.
     * `key`, path to an SSL client key to use for authentication.
    '''
    def __init__(self, config):
        """Configure the resolver from ``config``.

        Copies the optional settings (with their defaults) onto the
        instance, builds the ident regex checker and cache namer, and
        validates the mandatory settings.

        Raises:
            ResolverException: when ``cache_root`` is missing, or when
                neither ``uri_resolvable`` nor ``source_prefix`` is set.
        """
        super(SimpleHTTPResolver, self).__init__(config)
        settings = self.config

        # Where and how to build the source URL.
        self.source_prefix = settings.get('source_prefix', '')
        self.source_suffix = settings.get('source_suffix', '')
        self.default_format = settings.get('default_format', None)
        self.head_resolvable = settings.get('head_resolvable', False)
        self.uri_resolvable = settings.get('uri_resolvable', False)

        # Credentials and TLS behaviour for outgoing requests.
        self.user = settings.get('user', None)
        self.pw = settings.get('pw', None)
        self.cert = settings.get('cert', None)
        self.key = settings.get('key', None)
        self.ssl_check = settings.get('ssl_check', True)

        ident_regex = settings.get('ident_regex')
        self._ident_regex_checker = IdentRegexChecker(ident_regex=ident_regex)
        self._cache_namer = CacheNamer()

        if 'cache_root' not in self.config:
            message = 'Server Side Error: Configuration incomplete and cannot resolve. Missing setting for cache_root.'
            logger.error(message)
            raise ResolverException(message)
        self.cache_root = self.config['cache_root']

        # Without full-uri identifiers a prefix is needed to build a URL.
        lacks_source = (not self.uri_resolvable) and self.source_prefix == ''
        if lacks_source:
            message = ('Server Side Error: Configuration incomplete and cannot resolve. Must either set uri_resolvable'
                       ' or source_prefix settings.')
            logger.error(message)
            raise ResolverException(message)

    def request_options(self):
        # parameters to pass to all head and get requests;
        options = {}
        if self.cert is not None and self.key is not None:
            options['cert'] = (self.cert, self.key)
        if self.user is not None and self.pw is not None:
            options['auth'] = (self.user, self.pw)
        options['verify'] = self.ssl_check
        return options

    def is_resolvable(self, ident):
        ident = unquote(ident)

        if not self._ident_regex_checker.is_allowed(ident):
            return False

        fp = self.cache_dir_path(ident=ident)
        if exists(fp):
            return True
        else:
            try:
                (url, options) = self._web_request_url(ident)
            except ResolverException:
                return False

            try:
                if self.head_resolvable:
                    response = requests.head(url, **options)
                    return response.ok
                else:
                    with closing(requests.get(url, stream=True,
                                              **options)) as response:
                        return response.ok
            except requests.ConnectionError:
                return False

    def get_format(self, ident, potential_format):
        if self.default_format is not None:
            return self.default_format
        elif potential_format is not None:
            return potential_format
        else:
            return self.format_from_ident(ident)

    def _web_request_url(self, ident):
        if ident.startswith(('http://', 'https://')) and self.uri_resolvable:
            url = ident
        else:
            url = self.source_prefix + ident + self.source_suffix
        if not url.startswith(('http://', 'https://')):
            logger.warn('Bad URL request at %s for identifier: %s.', url,
                        ident)
            raise ResolverException(
                "Bad URL request made for identifier: %r." % ident)
        return (url, self.request_options())

    def cache_dir_path(self, ident):
        """Return the cache directory for ``ident`` below ``cache_root``."""
        ident_subdir = CacheNamer.cache_directory_name(ident=ident)
        return os.path.join(self.cache_root, ident_subdir)

    def raise_404_for_ident(self, ident):
        """Raise a ResolverException reporting that ``ident`` has no image."""
        message = "Image not found for identifier: %r." % ident
        raise ResolverException(message)

    def cached_file_for_ident(self, ident):
        cache_dir = self.cache_dir_path(ident)
        if exists(cache_dir):
            files = glob.glob(join(cache_dir, 'loris_cache.*'))
            if files:
                return files[0]
        return None

    def cache_file_extension(self, ident, response):
        if 'content-type' in response.headers:
            try:
                extension = self.get_format(
                    ident, constants.FORMATS_BY_MEDIA_TYPE[
                        response.headers['content-type']])
            except KeyError:
                logger.warn(
                    'Your server may be responding with incorrect content-types. Reported %s for ident %s.',
                    response.headers['content-type'], ident)
                # Attempt without the content-type
                extension = self.get_format(ident, None)
        else:
            extension = self.get_format(ident, None)
        return extension

    def copy_to_cache(self, ident):
        """Download the source image for ``ident`` into the local cache.

        The image is streamed into a temporary file inside the
        identifier's cache directory and then renamed to
        ``loris_cache.<extension>``. Afterwards a best-effort attempt is
        made to fetch the auth rules file that sits next to the image.

        Returns:
            str: path of the cached image file.

        Raises:
            ResolverException: when the URL cannot be built or the source
                answers with a non-ok status.
        """
        ident = unquote(ident)

        # _web_request_url either returns a usable URL or raises.
        (source_url, options) = self._web_request_url(ident)

        cache_dir = self.cache_dir_path(ident)
        mkdir_p(cache_dir)

        with closing(requests.get(source_url, stream=True,
                                  **options)) as response:
            if not response.ok:
                logger.warning(
                    "Source image not found at %s for identifier: %s. "
                    "Status code returned: %s.", source_url, ident,
                    response.status_code)
                raise ResolverException(
                    "Source image not found for identifier: %s. "
                    "Status code returned: %s." %
                    (ident, response.status_code))

            extension = self.cache_file_extension(ident, response)
            local_fp = join(cache_dir, "loris_cache." + extension)

            tmp_file = tempfile.NamedTemporaryFile(dir=cache_dir,
                                                   delete=False)
            try:
                with tmp_file:
                    for chunk in response.iter_content(2048):
                        tmp_file.write(chunk)
            except Exception:
                # The temp file is created with delete=False, so a failed
                # download would otherwise leave a partial file behind.
                remove(tmp_file.name)
                raise

        # Now rename the temp file to the desired file name if it still
        # doesn't exist (another process could have created it).
        #
        # Note: This is purely an optimisation; if the file springs into
        # existence between the existence check and the copy, it will be
        # overridden.
        if exists(local_fp):
            logger.info('Another process downloaded src image %s', local_fp)
            remove(tmp_file.name)
        else:
            safe_rename(tmp_file.name, local_fp)
            logger.info("Copied %s to %s", source_url, local_fp)

        # Check for rules file associated with image file
        # These files are < 2k in size, so fetch in one go.
        # Assumes that the rules will be next to the image
        # cache_dir is image specific, so this is easy

        bits = split(source_url)
        fn = bits[1].rsplit('.', 1)[0] + "." + self.auth_rules_ext
        rules_url = bits[0] + '/' + fn
        try:
            # Use the same auth / cert / verify options as the image
            # request; the rules file lives on the same origin server.
            resp = requests.get(rules_url, **options)
            if resp.status_code == 200:
                local_rules_fp = join(cache_dir,
                                      "loris_cache." + self.auth_rules_ext)
                if not exists(local_rules_fp):
                    with open(local_rules_fp, 'w') as fh:
                        fh.write(resp.text)
        except Exception as e:
            # Fetching rules stays best-effort (e.g. no connection
            # available), but the failure is no longer silent.
            logger.warning('Ignoring rules file %s for identifier %s: %s',
                           rules_url, ident, e)

        return local_fp

    def resolve(self, app, ident, base_uri):
        """Return the ImageInfo for ``ident``, caching the source if needed.

        The cached copy is used when present; otherwise the source image
        is downloaded via ``copy_to_cache``. Extra info is collected only
        when ``use_extra_info`` is set.
        """
        source_fp = self.cached_file_for_ident(ident)
        if not source_fp:
            source_fp = self.copy_to_cache(ident)

        format_ = self.get_format(source_fp, None)
        uri = self.fix_base_uri(base_uri)
        extra = (self.get_extra_info(ident, source_fp)
                 if self.use_extra_info else {})
        return ImageInfo(app, uri, source_fp, format_, extra)
Пример #4
0
class S3Resolver(_AbstractResolver):
    """Resolver for image files stored on aws s3 buckets.

    The first call to `resolve()` copies the source image into a local cache;
    subsequent calls use local copy from the cache.

    A config example:

        [resolver]
        impl = 'hxloris.s3resolver.S3Resolver'

        # absolute path to dir where source images are downloaded from s3
        # mandatory
        cache_root = '/var/loris_cache_root'

        # subsection to define mappings from :ident to an s3 bucket/key
        # optional
        [[bucket_map]]
          [[[site1]]]
            bucket = 'bucket-for-site1'
            key_prefix = 'loris/images'

          [[[site2]]]
            bucket = 'bucket-for-site2'
            key_prefix = 'loris/other-images'

        ...

    an incoming request url and its corresponding s3 bucket/prefix:
        http://localhost/site1/this/that/image.jpg/:region/:size/:rotation/default.jpg
        s3://bucket-for-site1/loris/images/this/that/image.jpg
    or
        http://localhost/site2/blah/image3.jpg/:region/:size/:rotation/default.jpg
        s3://bucket-for-site2/loris/other-images/blah/image3.jpg

    `bucket_map` is optional (as is `key_prefix`), but will always require a
    `bucket` to be in the request url. For example, the url below is invalid:
        http://localhost/image.jpg

    If it looks too similar to loris.resolver.SimpleHTTPResolver... you're right!
    """
    def __init__(self, config):
        """Configure the resolver from ``config``.

        Reads the optional ``default_format``, ``ident_regex`` and
        ``bucket_map`` settings and the mandatory ``cache_root`` setting,
        then creates the boto3 s3 resource stored on ``self.s3``.

        Raises:
            ResolverException: when ``cache_root`` is not configured.
        """
        super(S3Resolver, self).__init__(config)
        self.default_format = self.config.get("default_format", None)

        ident_regex = self.config.get("ident_regex")
        self._ident_regex_checker = IdentRegexChecker(ident_regex=ident_regex)
        self._cache_namer = CacheNamer()

        # cache_root is the one setting we cannot work without.
        if "cache_root" not in self.config:
            message = ("Server Side Error: Configuration incomplete and "
                       "cannot resolve. Missing setting for cache_root.")
            logger.error(message)
            raise ResolverException(message)
        self.cache_root = self.config["cache_root"]

        # bucket_map is optional; remember whether it was supplied.
        self.has_bucket_map = "bucket_map" in config
        if self.has_bucket_map:
            self.bucket_map = config["bucket_map"]
            logger.debug("s3 bucket_map: {}".format(self.bucket_map))

        # boto3: when not running in us-east-1, set the AWS_DEFAULT_REGION
        # envvar to avoid extra requests when downloading from s3.
        # On thread safety see:
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html#multithreading-multiprocessing
        self.s3 = boto3.session.Session().resource("s3")

        logger.info("loaded s3 resolver with config: {}".format(config))

    def raise_404_for_ident(self, ident):
        """Log and raise a ResolverException saying ``ident`` was not found."""
        message = "Source image not found for identifier: %s." % (ident, )
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning(message)
        raise ResolverException(message)

    def is_resolvable(self, ident):
        """checks if ident contains a readable s3 object.

        this generates a head request for the s3 object
        """
        ident = unquote(ident)

        if not self._ident_regex_checker.is_allowed(ident):
            return False

        fp = self.cache_dir_path(ident=ident)
        if os.path.exists(fp):
            return True
        else:
            try:
                (bucketname, keyname) = self.s3bucket_from_ident(ident)
            except ResolverException as e:
                logger.warn(e)
                return False

            # check that we can get to this object on s3
            # access to s3obj prop generates a head request or 404
            try:
                s3obj = self.s3.Object(bucketname, keyname)
                content_length = s3obj.content_length
            except Exception as e:
                logger.error("unable to access s3 object ({}:{}): {}".format(
                    bucketname, keyname, e))
                return False
            else:
                if content_length > 0:
                    return True
                else:
                    logger.warning("empty s3 object ({}:{})".format(
                        bucketname, keyname))
                    return False

    def get_format(self, ident, potential_format):
        if self.default_format is not None:
            return self.default_format
        elif potential_format is not None:
            return potential_format
        else:
            return self.format_from_ident(ident)

    def s3bucket_from_ident(self, ident):
        """ returns tuple(buckename, keyname) parsed from ident."""
        key_parts = ident.split("/", 1)
        if len(key_parts) == 2:
            (bucket, partial_key) = key_parts
        else:
            raise ResolverException(
                "Invalid identifier. Expected bucket/ident; got {}".format(
                    key_parts))

        # check if bucketname actually means something different
        if self.has_bucket_map and bucket in self.bucket_map:
            bucketname = self.bucket_map[bucket]["bucket"]
            if "key_prefix" in self.bucket_map[bucket]:
                keyname = os.path.join(self.bucket_map[bucket]["key_prefix"],
                                       partial_key)
            else:
                keyname = partial_key
            return (bucketname, keyname)

        else:  # what came in ident is the actual bucketname
            return (bucket, partial_key)

    def cache_dir_path(self, ident):
        """Return the cache directory for ``ident`` below ``cache_root``."""
        ident_subdir = CacheNamer.cache_directory_name(ident=ident)
        return os.path.join(self.cache_root, ident_subdir)

    def cached_file_for_ident(self, ident):
        # recover filepath for ident in cache
        cache_dir = self.cache_dir_path(ident)
        if os.path.exists(cache_dir):
            files = glob.glob(os.path.join(cache_dir, "loris_cache.*"))
            if files:
                return files[0]
        return None

    def cache_file_extension(self, ident, content_type=None):
        if content_type is not None:
            try:
                extension = self.get_format(
                    ident, constants.FORMATS_BY_MEDIA_TYPE[content_type])
            except KeyError:
                logger.warn(
                    "wonky s3 resource content-type({}) for ident({})",
                    content_type,
                    ident,
                )
                # Attempt without the content-type
                extension = self.get_format(ident, None)
        else:
            extension = self.get_format(ident, None)
        return extension

    def copy_to_cache(self, ident):
        """Download the source image for ``ident`` from s3 into the cache.

        The object is downloaded into a temporary file inside the
        identifier's cache directory and then renamed to
        ``loris_cache.<extension>``. Afterwards a best-effort attempt is
        made to download the auth rules file stored next to the image.

        Returns:
            str: path of the cached image file.

        Raises:
            ResolverException: when ``ident`` is malformed, or the s3
                object cannot be inspected or downloaded.
        """
        ident = unquote(ident)

        # get source image and write to temporary file
        (bucketname, keyname) = self.s3bucket_from_ident(ident)

        try:
            s3obj = self.s3.Object(bucketname, keyname)
            content_type = s3obj.content_type
        except Exception as e:
            msg = "no content_type for s3 object ({}:{}): {}".format(
                bucketname, keyname, e)
            logger.error(msg)
            raise ResolverException(msg)

        extension = self.cache_file_extension(ident, content_type)
        cache_dir = self.cache_dir_path(ident)
        os.makedirs(cache_dir, exist_ok=True)
        local_fp = os.path.join(cache_dir, "loris_cache." + extension)

        tmp_file = tempfile.NamedTemporaryFile(dir=cache_dir, delete=False)
        try:
            with tmp_file:
                self.s3.Bucket(bucketname).download_fileobj(keyname, tmp_file)
        except Exception as e:
            # The temp file is created with delete=False, so a failed
            # download would otherwise leave a partial file behind.
            os.remove(tmp_file.name)
            msg = "unable to access or save s3 object ({}:{}): {}".format(
                bucketname, keyname, e)
            logger.error(msg)
            raise ResolverException(msg)

        # Now rename the temp file to the desired file name if it still
        # doesn't exist (another process could have created it).
        #
        # Note: This is purely an optimisation; if the file springs into
        # existence between the existence check and the copy, it will be
        # overridden.
        if os.path.exists(local_fp):
            logger.info(
                "Another process downloaded src image {}".format(local_fp))
            os.remove(tmp_file.name)
        else:
            safe_rename(tmp_file.name, local_fp)
            logger.info("Copied {}:{} to {}".format(bucketname, keyname,
                                                    local_fp))

        # Check for rules file associated with image file
        # These files are < 2k in size, so fetch in one go.
        # Assumes that the rules will be next to the image
        # cache_dir is image specific, so this is easy
        key_dir, slash, key_base = keyname.rpartition("/")
        # Strip only the last extension ("a.v2.jpg" -> "a.v2").
        fn = key_base.rsplit(".", 1)[0] + "." + self.auth_rules_ext
        # Keys without a directory part must not gain a leading "/".
        rules_keyname = key_dir + slash + fn
        local_rules_fp = os.path.join(cache_dir,
                                      "loris_cache." + self.auth_rules_ext)
        try:
            self.s3.Object(bucketname,
                           rules_keyname).download_file(local_rules_fp)
        except Exception as e:
            # no connection available?
            msg = "ignoring rules file({}/{}) for ident({}): {}".format(
                bucketname, rules_keyname, ident, e)
            logger.warning(msg)

        return local_fp

    def resolve(self, app, ident, base_uri):
        """Return the ImageInfo for ``ident``, caching the source if needed.

        Unresolvable identifiers are handed to ``raise_404_for_ident``.
        The cached copy is used when present; otherwise the source image
        is downloaded via ``copy_to_cache``.
        """
        if not self.is_resolvable(ident):
            self.raise_404_for_ident(ident)

        source_fp = self.cached_file_for_ident(ident)
        if not source_fp:
            source_fp = self.copy_to_cache(ident)

        image_kwargs = {
            "app": app,
            "src_img_fp": source_fp,
            "src_format": self.get_format(source_fp, None),
            "auth_rules": self.get_auth_rules(ident, source_fp),
        }
        return ImageInfo(**image_kwargs)
Пример #5
0
 def test_checker_has_correct_is_allowed(self, ident_regex, ident,
                                         expected_is_allowed):
     """A checker built from ``ident_regex`` gives the expected verdict."""
     verdict = IdentRegexChecker(ident_regex=ident_regex).is_allowed(
         ident=ident)
     assert verdict is expected_is_allowed
Пример #6
0
 def test_any_ident_is_allowed_if_regex_is_none(self, ident):
     """Without a regex the checker allows every identifier."""
     verdict = IdentRegexChecker(ident_regex=None).is_allowed(ident=ident)
     assert verdict is True