Example #1
from logging import getLogger
import re

import requests
from flask import Response  # assumed to be Flask's Response (could equally be werkzeug's)

# Project-local helpers; the exact module paths below are assumptions.
from .db import DB
from .utils import fcrepo_path_from_hash


class PublicDomain:

    app = None
    config = None

    fcrepo_id = ""
    dhurl = ""

    is_public_domain = False

    def __init__(self, app, config, fcrepo_id):
        self.logger = getLogger(__name__)
        self.app = app
        self.config = config
        self.fcrepo_id = fcrepo_id
        self.session = requests.Session()
        if "enviro" not in config:
            self.session.proxies = {
                "http": "http://sysprox.artic.edu:3128",
                "https": "http://sysprox.artic.edu:3128",
            }
        if self._valid_fcrepo_id(fcrepo_id):
            self.dhurl = (
                "http://aggregator-data.artic.edu/api/v1/artworks/search"
                "?cache=false"
                "&query[bool][should][][term][image_id]=" + self.fcrepo_id +
                "&query[bool][should][][term][alt_image_ids]=" + self.fcrepo_id +
                "&fields=is_public_domain,id,is_zoomable,max_zoom_window_size,api_link,title,artist_display"
            )
        self._db = DB(app, config["sqlite"]["db"])

        self.logger.debug("fcrepo_id is: {}".format(self.fcrepo_id))
        return

    def get(self):
        self.logger.debug("Fetching public_domain status for: {}".format(
            self.fcrepo_id))
        fs_path = self.get_fs_path()
        if fs_path != "Status404":
            self.logger.debug("Reading: {}".format(fs_path))
            with open(fs_path, "rb") as f:
                imagedata = f.read()
                self.logger.debug("Serving: {}".format(fs_path))
            if imagedata:
                response = Response(imagedata)
                response.headers['Content-Type'] = self.contenttype
                return (response, "200")
            else:
                return ("What? " + fs_path, 404)
        else:
            return ("404 Not Found", 404)

    def get_pd_status(self):
        self.logger.debug(
            "Fetching stored public_domain status for: {}".format(
                self.fcrepo_id))

        if self._valid_fcrepo_id(self.fcrepo_id):
            pd_status = self._pd_desg_get()
            self.logger.debug(
                "Returning public_domain status {} for {}".format(
                    pd_status, self.fcrepo_id))
            if pd_status in ("Status503", "Status404"):
                return pd_status
            return '{ "is_public_domain": ' + str(pd_status).lower() + ' }'
        else:
            return "Status404"

    def _pd_desg_get(self):

        self.pd_desgs_exists = False
        sql_query = "SELECT public_domain FROM pd_designations WHERE fcrepo_image_id = '" + self.fcrepo_id + "' AND last_checked >= datetime('now', '-24 hours');"
        self.logger.debug(
            "Checking for existing pd_status within expiry time: {}".format(
                sql_query))
        pd_desgs = self._db.query(sql_query)
        if pd_desgs is not None:
            self.logger.debug("Found DB entry for {}.".format(self.fcrepo_id))
            self.pd_desgs_exists = True
            if str(pd_desgs[0][0]) == "1":
                self.is_public_domain = True
        else:
            # Must look it up in the datahub, but first we'll make sure it is in lakemichigan
            if self._content_in_fcrepo(self.fcrepo_id):
                self.logger.debug("No DB entry found for {}.".format(
                    self.fcrepo_id))
                self.logger.debug("Checking datahub for {}.".format(
                    self.fcrepo_id))
                try:
                    dhresponse = self.session.get(self.dhurl)
                    dhdata = dhresponse.json()
                    if (len(dhdata["data"]) > 0):
                        if (dhdata["data"][0]["is_public_domain"]):
                            self.is_public_domain = True
                    else:
                        self.logger.debug(
                            "Datahub does not know about {}. Public_domain is true as this may be an Interpretive Resource."
                            .format(self.fcrepo_id))
                        self.is_public_domain = True
                    self._pd_desg_put()
                except Exception:
                    self.logger.debug(
                        "Datahub lookup failed for {}.".format(self.fcrepo_id))
                    return "Status503"
            else:
                return "Status404"
        return self.is_public_domain

    def _pd_desg_put(self):
        self.logger.debug(
            "Public domain status is {} for insert to DB for Asset {}".format(
                self.is_public_domain, self.fcrepo_id))
        pd_status_str = "0"
        if self.is_public_domain:
            pd_status_str = "1"
        sql_query = "SELECT public_domain FROM pd_designations WHERE fcrepo_image_id = '" + self.fcrepo_id + "';"
        self.logger.debug(
            "Checking for existing pd_status regardless of expiry: {}".format(
                sql_query))
        pd_desgs = self._db.query(sql_query)
        if pd_desgs is not None:
            sql_query = "UPDATE pd_designations SET public_domain='" + pd_status_str + "', last_checked=datetime('now') WHERE fcrepo_image_id = '" + self.fcrepo_id + "';"
            etags = self._db.update(sql_query)
        else:
            sql_query = "INSERT INTO pd_designations (fcrepo_image_id, public_domain, last_checked) VALUES ('" + self.fcrepo_id + "', '" + pd_status_str + "', datetime('now'))"
            dbid = self._db.update(sql_query)
        return True

    def _content_in_fcrepo(self, fcrepo_id):
        fcrepo_path = fcrepo_path_from_hash(fcrepo_id)
        fcrepo_url = self.config["httpresolver"][
            "prefix"] + fcrepo_path + self.config["httpresolver"]["postfix"]
        fcrepo_hit = self.session.head(fcrepo_url)
        if fcrepo_hit.status_code == 200:
            return True
        return False

    def _valid_fcrepo_id(self, fcrepo_id):
        regex = re.compile(
            '^[a-z0-9]{8}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{12}$',
            re.I)
        if regex.match(fcrepo_id):
            # The id looks like a UUID; confirm the binary actually exists.
            return self._content_in_fcrepo(fcrepo_id)
        return False
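
Below is a minimal usage sketch, not part of the original example, showing how PublicDomain might be wired into a Flask view. The route path and config values are assumptions; the keys mirror the ones the class reads above ("enviro", the sqlite db, and the httpresolver prefix/postfix).

# Hypothetical wiring of PublicDomain into a Flask route; the route path and
# config values are assumptions, not taken from the original project.
from flask import Flask

app = Flask(__name__)
config = {
    "enviro": "dev",  # present => no outbound proxy is configured (see __init__)
    "sqlite": {"db": "/tmp/pd_cache.sqlite"},  # assumed path
    "httpresolver": {"prefix": "http://fcrepo.example/rest", "postfix": ""},  # assumed
}

@app.route("/public_domain/<fcrepo_id>")
def public_domain_status(fcrepo_id):
    pd = PublicDomain(app, config, fcrepo_id)
    status = pd.get_pd_status()
    if status == "Status404":
        return ("404 Not Found", 404)
    if status == "Status503":
        return ("503 Service Unavailable", 503)
    # Anything else is the small JSON document built by get_pd_status().
    return (status, 200, {"Content-Type": "application/json"})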
Example #2
import errno
import glob
import os
import re
from logging import getLogger

import requests
from flask import Response  # assumed to be Flask's Response (could equally be werkzeug's)

# Project-local helpers; the exact module paths below are assumptions.
from .db import DB
from .utils import fcrepo_path_from_hash


class Content:

    app = None
    config = None

    fcrepo_id = ""
    fcrepo_path = ""
    url = ""

    etag_exists = False
    content_type_extension_map = {
        "image/jp2": "jp2",
        "image/tiff": "tif",
        "image/tif": "tif",
        "audio/mpeg": "mp3",
        "audio/x-wave": "wav",
        "text/plain": "txt",
        "application/pdf": "pdf",
        "video/mp4": "mp4",
        "video/mpeg": "mpeg",
        "video/quicktime": "mov",
        "video/x-flv": "flv",
        "application/x-shockwave-flash": "swf",
        "image/jpeg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
    }
    extension = ""
    contenttype = ""

    def __init__(self, app, config, fcrepo_id):
        self.logger = getLogger(__name__)
        self.app = app
        self.config = config
        self.fcrepo_id = fcrepo_id
        self.session = requests.Session()
        if self._valid_fcrepo_id(fcrepo_id):
            self.fcrepo_path = fcrepo_path_from_hash(fcrepo_id)
        else:
            self.fcrepo_path = '/' + fcrepo_id
        self.url = self.config["httpresolver"][
            "prefix"] + self.fcrepo_path + self.config["httpresolver"][
                "postfix"]
        self._db = DB(app, config["sqlite"]["db"])

        self.logger.debug("fcrepo_id is: {}".format(self.fcrepo_id))
        self.logger.debug("fcrepo_path is: {}".format(self.fcrepo_path))
        return

    def get(self):
        self.logger.debug("Fetching binary for: {}".format(self.fcrepo_id))
        fs_path = self.get_fs_path()
        if fs_path != "Status404":
            self.logger.debug("Reading: {}".format(fs_path))
            with open(fs_path, "rb") as f:
                imagedata = f.read()
                self.logger.debug("Serving: {}".format(fs_path))
            if imagedata:
                response = Response(imagedata)
                response.headers['Content-Type'] = self.contenttype
                return (response, "200")
            else:
                return ("What? " + fs_path, 404)
        else:
            return ("404 Not Found", 404)

    def get_fs_path(self):
        self.logger.debug("Fetching fileystem location for: {}".format(
            self.fcrepo_id))
        headers = {}
        etag = self._etag_get()
        self.logger.debug("Etag is: {}".format(etag))
        if etag:
            headers["If-None-Match"] = etag

        cache_req = self.session.head(self.url, headers=headers)
        self.logger.debug('ETag cache response code: {}'.format(
            cache_req.status_code))
        self.logger.debug("cache_req headers: {}".format(cache_req.headers))

        if cache_req.status_code == 304:
            self.logger.debug('Status was 304.  Looking for cached file.')
            cache_fs_path = self.config["cache"][
                "basedir"] + self.fcrepo_path + "." + self.extension

            # The cached file's extension is not known yet at this point, so
            # glob for the base path followed by any extension.
            file_matches = glob.glob(cache_fs_path + "[0-9a-zA-Z]*")
            if len(file_matches) > 0:
                cached_file_path = file_matches[0]
                if '.' in cached_file_path:
                    self.extension = cached_file_path.split('.')[-1]
                    # Map the recovered extension back to its content type.
                    for contenttype, ext in self.content_type_extension_map.items():
                        if ext == self.extension:
                            self.contenttype = contenttype
                            break
                if self.extension == "":
                    self.extension = "jp2"
                    self.contenttype = "image/jp2"
            else:
                cached_file_path = self._copy_to_cache(cache_req.headers)
        elif cache_req.status_code == 404:
            cached_file_path = "Status404"
        elif cache_req.status_code == 503:
            cached_file_path = "Status503"
        else:
            cached_file_path = self._copy_to_cache(cache_req.headers)

        self.logger.debug(
            "Returning filesystem location: {}".format(cached_file_path))
        return cached_file_path

    def iipimage_redirect_path(self):
        fs_path = self.get_fs_path()
        redirect_file = fs_path.replace(self.config["cache"]["basedir"], '')
        return redirect_file

    def _set_extension_contenttype(self, cache_req_headers):
        if "content-type" in cache_req_headers:
            for key, value in self.content_type_extension_map.items():
                if key == cache_req_headers["content-type"]:
                    self.extension = value
                    self.contenttype = key
                    break
        return

    def _copy_to_cache(self, cache_req_headers):

        if "content-type" in cache_req_headers:
            self._set_extension_contenttype(cache_req_headers)
        else:
            contenthead = self.session.head(self.url)
            self._set_extension_contenttype(contenthead.headers)

        self.logger.debug("Copying to cache.")
        # fcrepo_path_from_hash produces a path like:
        #   /86/bf/14/11/86bf1411-6180-8103-52a1-e4d84f478ec1
        # Stripping the id from it leaves just the directory portion:
        #   /86/bf/14/11/
        # (see the illustrative sketch of the helper after this example)
        cache_dir = self.config["cache"]["basedir"] + self.fcrepo_path.replace(
            self.fcrepo_id, '')
        self.logger.debug("Cache dir is: {}".format(cache_dir))
        self._create_cache_dir(cache_dir)

        cache_fs_path = self.config["cache"][
            "basedir"] + self.fcrepo_path + "." + self.extension
        if os.path.isfile(cache_fs_path):
            os.unlink(cache_fs_path)

        # Mindful of this.  Requests.Session may require non-streamed content
        # or the connection is not released back in to the pool.
        with self.session.get(self.url, stream=True) as r:
            with open(cache_fs_path, 'wb') as f:
                # Increase the chunk size.  Fewer disk writes.
                for chunk in r.iter_content(10240):
                    f.write(chunk)
            # Store ETags.
            self._etag_put(r.headers['etag'])
        '''
        # This code didn't seem to improve matters and, in fact, the memory hit 
        # may have resulted in diminished service.
        # Curious about fewer disk writes.
        r = self.session.get(self.url)
        with open(cache_fs_path, 'wb') as f: 
            f.write(r.content)
        '''
        # Store ETags.
        # self._etag_put(r.headers['etag'])

        return cache_fs_path

    def _create_cache_dir(self, cache_dir):
        try:
            os.makedirs(cache_dir)
        except OSError as ose:
            # The cache directory may already exist; anything else is a real error.
            if ose.errno != errno.EEXIST:
                raise

    def _etag_get(self):
        sql_query = "SELECT etag FROM etags WHERE fcrepoid = '" + self.fcrepo_id + "';"
        etags = self._db.query(sql_query)
        if etags is not None:
            self.etag_exists = True
            return '"' + str(etags[0][0]) + '"'
        else:
            return None

    def _etag_put(self, etag):
        etag = etag.split(',')[0]
        etag = etag.replace('"', '')
        self.logger.debug("Etag for inserting into DB: {}".format(etag))
        if self.etag_exists:
            sql_query = "UPDATE etags SET etag='" + etag + "' WHERE fcrepoid = '" + self.fcrepo_id + "';"
            etags = self._db.update(sql_query)
        else:
            sql_query = "INSERT INTO etags (fcrepoid, etag) VALUES ('" + self.fcrepo_id + "', '" + etag + "')"
            dbid = self._db.update(sql_query)
        return True

    def _valid_fcrepo_id(self, fcrepo_id):
        regex = re.compile(
            '^[a-z0-9]{8}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{12}$',
            re.I)
        match = regex.match(fcrepo_id)
        return bool(match)
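
A similar hedged sketch for Content: the view simply passes through the tuple that get() builds. The route, cache path, and resolver values are assumptions based on the config keys the class reads (the sqlite db, the cache basedir, and the httpresolver prefix/postfix).

# Hypothetical wiring of Content into a Flask route; paths and config values
# are assumptions, not taken from the original project.
from flask import Flask

app = Flask(__name__)
config = {
    "sqlite": {"db": "/tmp/etags.sqlite"},  # assumed path
    "cache": {"basedir": "/var/cache/fcrepo"},  # assumed cache root
    "httpresolver": {"prefix": "http://fcrepo.example/rest", "postfix": ""},  # assumed
}

@app.route("/content/<fcrepo_id>")
def serve_content(fcrepo_id):
    # get() returns (Response, "200") on success, or (message, 404) otherwise.
    return Content(app, config, fcrepo_id).get()

The project-local fcrepo_path_from_hash helper is not shown in either example. Based on the pairtree-style path in the _copy_to_cache comment, a plausible, assumed implementation would look like this:

# Assumed behaviour of fcrepo_path_from_hash, inferred from the
# /86/bf/14/11/86bf1411-... example in _copy_to_cache; the real helper may differ.
def fcrepo_path_from_hash(fcrepo_id):
    # Expand the first four byte pairs of the id into directory segments.
    flat = fcrepo_id.replace("-", "")
    pairs = [flat[i:i + 2] for i in range(0, 8, 2)]
    return "/" + "/".join(pairs) + "/" + fcrepo_id

# fcrepo_path_from_hash("86bf1411-6180-8103-52a1-e4d84f478ec1")
# -> "/86/bf/14/11/86bf1411-6180-8103-52a1-e4d84f478ec1"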