Пример #1
0
    def test_md5s3stash_with_auth(self, mock_s3move, mock_urlopen):
        '''Stash the test file with url_auth and check the open call and
        the returned report.
        '''
        mock_urlopen.return_value = FakeReq('test resp')
        credentials = ('username', 'password')
        stash_report = md5s3stash.md5s3stash(
            self.testfilepath,
            'fake-bucket',
            conn='FAKE CONN',
            url_auth=credentials)

        # Cache contents the open call is expected to receive.
        # NOTE(review): the endinslash entry is not created by this test;
        # presumably it is left in a shared cache by another test - confirm.
        expected_cache = {}
        for cached_url in (self.testfilepath,
                           'https://example.com/endinslash/'):
            expected_cache[cached_url] = {
                u'If-None-Match': "you're it",
                u'md5': '85b5a0deaa11f3a5d1762c55701c03da',
            }

        fixture_path = os.path.join(DIR_FIXTURES, '1x1.png')
        mock_urlopen.assert_called_once_with(
            fixture_path, auth=credentials, cache=expected_cache)

        # mock's file is not an image
        self.assertEqual(stash_report.mime_type, None)
        self.assertEqual(stash_report.md5,
                         '85b5a0deaa11f3a5d1762c55701c03da')
        self.assertEqual(stash_report.url,
                         os.path.join(DIR_FIXTURES, '1x1.png'))
        self.assertEqual(
            stash_report.s3_url,
            's3://fake-bucket/85b5a0deaa11f3a5d1762c55701c03da')
Пример #2
0
 def test_hash_cache(self, mock_s3move, mock_urlopen):
     '''Stash a url with both caches supplied and check the report and the
     hash cache contents.
     '''
     mock_urlopen.return_value = FakeReq('test resp')
     digest = '85b5a0deaa11f3a5d1762c55701c03da'
     stash_report = md5s3stash.md5s3stash(
         'http://example.edu/',
         'fake-bucket',
         conn='FAKE CONN',
         url_cache=self.url_cache,
         hash_cache=self.hash_cache)

     # NOTE(review): the expected values are not written by this test;
     # presumably the fixture pre-populates self.hash_cache - confirm.
     StashReport = namedtuple(
         'StashReport', 'url, md5, s3_url, mime_type, dimensions')
     expected_report = StashReport(
         url='http://example.edu/',
         md5=digest,
         s3_url='s3_url',
         mime_type='mime_type',
         dimensions=(100, 100))
     self.assertEqual(stash_report, expected_report)

     expected_hash_cache = {digest: ('s3_url', 'mime_type', (100, 100))}
     self.assertEqual(self.hash_cache, expected_hash_cache)
     mock_urlopen.reset_mock()
Пример #3
0
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Stash in S3 the image of each doc returned by COUCH_VIEW.

    If collection_key is None, this tries to grab all of the images (not
    recommended).

    Docs whose 'object' value already contains 's3://' are skipped. For the
    others, the 'src' of the doc's isShownBy is handed to md5s3stash and the
    resulting s3 url is saved back to the database in the doc's 'object'
    field.

    collection_key -- key passed to the view pager; a falsy value means the
        pager is called without a key
    url_couchdb -- url of the couchdb server that holds DB_COUCHDB
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    # Only pass `key` to the pager when a collection was requested.
    pager_args = {'view_name': COUCH_VIEW, 'include_docs': 'true'}
    if collection_key:
        pager_args['key'] = collection_key
    # NOTE(review): msg is assembled per doc but is not emitted anywhere in
    # this function as shown - confirm whether it should be printed/logged.
    for r in couchdb_pager(db, **pager_args):
        doc = r.doc
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):  # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        try:
            # NOTE: get_isShownBy(doc) is evaluated even when the doc
            # already has an 'isShownBy' value (dict.get default is eager).
            doc['isShownBy'] = doc.get('isShownBy', get_isShownBy(doc))
        except Exception as e:
            print("ERROR: Can't get isShownBy for {} : {}".format(
                doc['_id'], e))
            continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            # Timestamps around the stash call; not read again in this
            # function as shown.
            dt_start = dt_end = datetime.datetime.now()
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            dt_end = datetime.datetime.now()
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError:
            # Any KeyError raised in the try body lands here, not only a
            # missing 'src'.
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
Пример #4
0
def stash_image_for_doc(doc,
                        url_cache,
                        hash_cache,
                        ignore_content_type,
                        bucket_bases=BUCKET_BASES,
                        auth=None):
    '''Stash the doc's image in s3, using md5s3stash.

    The image is duplicated in every entry of bucket_bases. This gives
    redundancy in case one of the copies gets deleted; it is not tons of
    data, so it is cheap to replicate.

    doc -- mapping providing '_id' and 'isShownBy' (a url, or a list of
        which the first item is used)
    url_cache, hash_cache -- handed through to md5s3stash
    ignore_content_type -- when true, link_is_to_image is not consulted
    bucket_bases -- iterable of 'region:bucket_base' strings
    auth -- handed to link_is_to_image and, as url_auth, to md5s3stash

    Return the list of md5s3stash reports, one per bucket base whose stash
    did not raise TypeError.
    Raise IsShownByError when isShownBy is missing or empty.
    Raise FailsImageTest when the link has no scheme or netloc, or when the
    link is not an image.
    '''
    try:
        url_image = doc['isShownBy']
        if not url_image:
            raise IsShownByError(
                "isShownBy empty for {0}".format(doc['_id']),
                doc_id=doc['_id'])
    except KeyError:
        raise IsShownByError(
            "isShownBy missing for {0}".format(doc['_id']), doc_id=doc['_id'])
    if isinstance(url_image, list):  # need to fix marc map_is_shown_at
        url_image = url_image[0]
    # try to parse url, check values of scheme & netloc at least
    url_parsed = urlparse.urlsplit(url_image)
    if url_parsed.scheme == 'ark':
        # for some OAC objects, the reference image is not a url but a path.
        url_image = '/'.join((URL_OAC_CONTENT_BASE, url_image))
    elif not url_parsed.scheme or not url_parsed.netloc:
        msg = 'Link not http URL for {} - {}'.format(doc['_id'], url_image)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
    reports = []
    # If '--ignore_content_type' set, don't check link_is_to_image: the flag
    # is tested first so that `or` short-circuits and the check is skipped.
    if ignore_content_type or link_is_to_image(doc['_id'], url_image, auth):
        for bucket_base in bucket_bases:
            try:
                logging.getLogger('image_harvest.stash_image').info(
                    'bucket_base:{0} url_image:{1}'.format(bucket_base,
                                                           url_image))
                # Each entry is 'region:bucket_base'.
                region, bucket_base = bucket_base.split(':')
                conn = boto.s3.connect_to_region(region)
                report = md5s3stash.md5s3stash(
                    url_image,
                    bucket_base=bucket_base,
                    conn=conn,
                    url_auth=auth,
                    url_cache=url_cache,
                    hash_cache=hash_cache)
                reports.append(report)
            except TypeError as e:
                # Report the failure and carry on with the next bucket.
                print >> sys.stderr, 'TypeError for doc:{} {} Msg: {} Args:' \
                    ' {}'.format(
                        doc['_id'], url_image, e.message, e.args)
        return reports
    else:
        msg = 'Not an image for {} - {}'.format(doc['_id'], url_image)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
Пример #5
0
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Stash in S3 the image of each doc returned by COUCH_VIEW.

    If collection_key is None, this tries to grab all of the images (not
    recommended).

    Docs whose 'object' value already contains 's3://' are skipped. For the
    others, the 'src' of the doc's isShownBy is handed to md5s3stash and the
    resulting s3 url is saved back to the database in the doc's 'object'
    field.

    collection_key -- key passed to the view pager; a falsy value means the
        pager is called without a key
    url_couchdb -- url of the couchdb server that holds DB_COUCHDB
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    # Only pass `key` to the pager when a collection was requested.
    pager_args = {'view_name': COUCH_VIEW, 'include_docs': 'true'}
    if collection_key:
        pager_args['key'] = collection_key
    # NOTE(review): msg is assembled per doc but is not emitted anywhere in
    # this function as shown - confirm whether it should be printed/logged.
    for r in couchdb_pager(db, **pager_args):
        doc = r.doc
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):  # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        try:
            # NOTE: get_isShownBy(doc) is evaluated even when the doc
            # already has an 'isShownBy' value (dict.get default is eager).
            doc['isShownBy'] = doc.get('isShownBy', get_isShownBy(doc))
        except Exception as e:
            print("ERROR: Can't get isShownBy for {} : {}".format(
                doc['_id'], e))
            continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            # Timestamps around the stash call; not read again in this
            # function as shown.
            dt_start = dt_end = datetime.datetime.now()
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            dt_end = datetime.datetime.now()
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError:
            # Any KeyError raised in the try body lands here, not only a
            # missing 'src'.
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
Пример #6
0
def stash_image_for_doc(doc,
                        url_cache,
                        hash_cache,
                        ignore_content_type,
                        bucket_bases=BUCKET_BASES,
                        auth=None):
    '''Stash the doc's image in s3, using md5s3stash.

    The image is duplicated in every entry of bucket_bases. This gives
    redundancy in case one of the copies gets deleted; it is not tons of
    data, so it is cheap to replicate.

    doc -- mapping providing '_id' and 'isShownBy' (a url, or a list of
        which the first item is used)
    url_cache, hash_cache -- handed through to md5s3stash
    ignore_content_type -- when true, link_is_to_image is not consulted
    bucket_bases -- iterable of 'region:bucket_base' strings
    auth -- handed to link_is_to_image and, as url_auth, to md5s3stash

    Return the list of md5s3stash reports, one per bucket base whose stash
    did not raise TypeError.
    Raise IsShownByError when isShownBy is missing or empty.
    Raise FailsImageTest when the link has no scheme or netloc, or when the
    link is not an image.
    '''
    try:
        url_image = doc['isShownBy']
        if not url_image:
            raise IsShownByError("isShownBy empty for {0}".format(doc['_id']),
                                 doc_id=doc['_id'])
    except KeyError:
        raise IsShownByError("isShownBy missing for {0}".format(doc['_id']),
                             doc_id=doc['_id'])
    if isinstance(url_image, list):  # need to fix marc map_is_shown_at
        url_image = url_image[0]
    # try to parse url, check values of scheme & netloc at least
    url_parsed = urlparse.urlsplit(url_image)
    if url_parsed.scheme == 'ark':
        # for some OAC objects, the reference image is not a url but a path.
        url_image = '/'.join((URL_OAC_CONTENT_BASE, url_image))
    elif not url_parsed.scheme or not url_parsed.netloc:
        msg = 'Link not http URL for {} - {}'.format(doc['_id'], url_image)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
    reports = []
    # If '--ignore_content_type' set, don't check link_is_to_image: the flag
    # is tested first so that `or` short-circuits and the check is skipped.
    if ignore_content_type or link_is_to_image(doc['_id'], url_image, auth):
        for bucket_base in bucket_bases:
            try:
                logging.getLogger('image_harvest.stash_image').info(
                    'bucket_base:{0} url_image:{1}'.format(
                        bucket_base, url_image))
                # Each entry is 'region:bucket_base'.
                region, bucket_base = bucket_base.split(':')
                conn = boto.s3.connect_to_region(region)
                report = md5s3stash.md5s3stash(url_image,
                                               bucket_base=bucket_base,
                                               conn=conn,
                                               url_auth=auth,
                                               url_cache=url_cache,
                                               hash_cache=hash_cache)
                reports.append(report)
            except TypeError as e:
                # Report the failure and carry on with the next bucket.
                print >> sys.stderr, 'TypeError for doc:{} {} Msg: {} Args:' \
                    ' {}'.format(
                        doc['_id'], url_image, e.message, e.args)
        return reports
    else:
        msg = 'Not an image for {} - {}'.format(doc['_id'], url_image)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
Пример #7
0
 def test_md5s3stash_trailing_slash_url(self, mock_s3move, mock_urlopen):
     '''Stash a url that ends with a slash, as the Nuxeo urls do.

     os.path.basename does not work for these urls because it returns a
     blank str ''. A NamedTemporaryFile with delete=False is needed to
     handle all cases.
     '''
     mock_urlopen.return_value = FakeReq('test resp')
     slash_url = 'https://example.com/endinslash/'
     credentials = ('username', 'password')
     # The test passes as long as the call completes without raising.
     report = md5s3stash.md5s3stash(
         slash_url, 'fake-bucket', conn='FAKE CONN', url_auth=credentials)
Пример #8
0
 def test_md5s3stash_trailing_slash_url(self, mock_s3move, mock_urlopen):
     '''Stash a url that ends with a slash, as the Nuxeo urls do.

     os.path.basename does not work for these urls because it returns a
     blank str ''. A NamedTemporaryFile with delete=False is needed to
     handle all cases.
     '''
     mock_urlopen.return_value = FakeReq('test resp')
     slash_url = 'https://example.com/endinslash/'
     credentials = ('username', 'password')
     # The test passes as long as the call completes without raising.
     report = md5s3stash.md5s3stash(
         slash_url, 'fake-bucket', conn='FAKE CONN', url_auth=credentials)
Пример #9
0
 def test_redis_cache_save(self, mock_s3move, mock_urlopen):
     '''Stash a url and check the entries stored in the url cache and the
     hash cache.
     '''
     mock_urlopen.return_value = FakeReq('test resp')
     stashed_url = 'https://example.com/endinslash/'
     digest = '85b5a0deaa11f3a5d1762c55701c03da'
     report = md5s3stash.md5s3stash(
         stashed_url,
         'fake-bucket',
         conn='FAKE CONN',
         url_auth=('username', 'password'),
         url_cache=self.url_cache,
         hash_cache=self.hash_cache)

     # The url cache maps the url to its header info and md5.
     self.assertEqual(
         self.url_cache[stashed_url],
         {u'If-None-Match': "you're it", u'md5': digest})
     # The hash cache maps the md5 to (s3 url, mime type, dimensions).
     self.assertEqual(
         self.hash_cache[digest],
         ('s3://m.fake-bucket/85b5a0deaa11f3a5d1762c55701c03da',
          None,
          (0, 0)))
Пример #10
0
 def test_redis_url_cache_retrieve(self, mock_s3move):
     '''Stash a url served by httpretty and check the conditional headers
     sent with the request.
     '''
     # NOTE(review): httpretty is enabled here but not disabled in this
     # method - confirm that the fixture's teardown does it.
     httpretty.enable()
     target_url = 'https://example.edu'
     httpretty.register_uri(
         httpretty.GET,
         target_url,
         status=200,
         content_type='mime_type',
         body='test body')
     report = md5s3stash.md5s3stash(
         target_url,
         'fake-bucket',
         conn='FAKE CONN',
         url_cache=self.url_cache,
         hash_cache=self.hash_cache)

     # NOTE(review): the expected values presumably come from a url cache
     # entry prepared by the fixture - confirm.
     sent_headers = httpretty.last_request().headers
     expected_headers = (
         ('If-None-Match', 'nice etag'),
         ('If-Modified-Since', 'since test val'),
     )
     for header_name, expected_value in expected_headers:
         self.assertEqual(sent_headers[header_name], expected_value)
Пример #11
0
 def save(self, *args, **kwargs):
     '''Save the item, then move each locally stored push_to_s3 file to S3.

     After the normal save, every field named in self.push_to_s3 whose file
     exists under settings.MEDIA_ROOT is stashed with md5s3stash, the local
     file is deleted from the field's storage, and the field is re-saved
     with the file's md5 as its name.

     *args, **kwargs -- passed unchanged to the first save call
     '''
     super(ExhibitItem, self).save(*args, **kwargs)
     for s3field in self.push_to_s3:
         field_instance = getattr(self, s3field)
         name = field_instance.name
         if not name:
             continue
         url = settings.MEDIA_ROOT + "/" + name
         if not os.path.isfile(url):
             continue
         report = md5s3stash("file://" + url, settings.S3_STASH)
         field_instance.storage.delete(name)
         field_instance.name = report.md5
         model_field = self._meta.get_field(s3field)
         upload_to = model_field.upload_to
         # upload_to is blanked for the second save, presumably so that the
         # md5 name is stored without an upload directory prefix - confirm.
         model_field.upload_to = ''
         try:
             super(ExhibitItem, self).save(update_fields=[s3field])
         finally:
             # The field object belongs to the model's _meta, so restore it
             # even when the save raises.
             model_field.upload_to = upload_to
Пример #12
0
 def save(self, *args, **kwargs):
     '''Save the item, then move each locally stored push_to_s3 file to S3.

     After the normal save, every field named in self.push_to_s3 whose file
     exists under settings.MEDIA_ROOT is stashed with md5s3stash, the local
     file is deleted from the field's storage, and the field is re-saved
     with the file's md5 as its name.

     *args, **kwargs -- passed unchanged to the first save call
     '''
     super(ExhibitItem, self).save(*args, **kwargs)
     for s3field in self.push_to_s3:
         field_instance = getattr(self, s3field)
         name = field_instance.name
         if not name:
             continue
         url = settings.MEDIA_ROOT + "/" + name
         if not os.path.isfile(url):
             continue
         report = md5s3stash("file://" + url, settings.S3_STASH)
         field_instance.storage.delete(name)
         field_instance.name = report.md5
         model_field = self._meta.get_field(s3field)
         upload_to = model_field.upload_to
         # upload_to is blanked for the second save, presumably so that the
         # md5 name is stored without an upload directory prefix - confirm.
         model_field.upload_to = ''
         try:
             super(ExhibitItem, self).save(update_fields=[s3field])
         finally:
             # The field object belongs to the model's _meta, so restore it
             # even when the save raises.
             model_field.upload_to = upload_to
Пример #13
0
 def test_redis_url_cache_retrieve(self, mock_s3move):
     '''Stash a url served by httpretty and check the conditional headers
     sent with the request.
     '''
     # NOTE(review): httpretty is enabled here but not disabled in this
     # method - confirm that the fixture's teardown does it.
     httpretty.enable()
     target_url = 'https://example.edu'
     httpretty.register_uri(
         httpretty.GET,
         target_url,
         status=200,
         content_type='mime_type',
         body='test body')
     report = md5s3stash.md5s3stash(
         target_url,
         'fake-bucket',
         conn='FAKE CONN',
         url_cache=self.url_cache,
         hash_cache=self.hash_cache)

     # NOTE(review): the expected values presumably come from a url cache
     # entry prepared by the fixture - confirm.
     sent_headers = httpretty.last_request().headers
     expected_headers = (
         ('If-None-Match', 'nice etag'),
         ('If-Modified-Since', 'since test val'),
     )
     for header_name, expected_value in expected_headers:
         self.assertEqual(sent_headers[header_name], expected_value)
Пример #14
0
 def test_redis_cache_save(self, mock_s3move, mock_urlopen):
     '''Stash a url and check the entries stored in the url cache and the
     hash cache.
     '''
     mock_urlopen.return_value = FakeReq('test resp')
     stashed_url = 'https://example.com/endinslash/'
     digest = '85b5a0deaa11f3a5d1762c55701c03da'
     report = md5s3stash.md5s3stash(
         stashed_url,
         'fake-bucket',
         conn='FAKE CONN',
         url_auth=('username', 'password'),
         url_cache=self.url_cache,
         hash_cache=self.hash_cache)

     # The url cache maps the url to its header info and md5.
     self.assertEqual(
         self.url_cache[stashed_url],
         {u'If-None-Match': "you're it", u'md5': digest})
     # The hash cache maps the md5 to (s3 url, mime type, dimensions).
     self.assertEqual(
         self.hash_cache[digest],
         ('s3://m.fake-bucket/85b5a0deaa11f3a5d1762c55701c03da',
          None,
          (0, 0)))
Пример #15
0
 def test_redis_hash_cache_retrieve(self, mock_s3move, mock_urlopen):
     '''Seed the hash cache with an entry and check that the report for a
     stashed url carries the seeded values.
     '''
     mock_urlopen.return_value = FakeReq('test resp')
     digest = '85b5a0deaa11f3a5d1762c55701c03da'
     cached_s3_url, cached_mime_type, cached_dimensions = (
         's3_url', 'mime_type', (100, 100))
     self.hash_cache[digest] = (
         cached_s3_url, cached_mime_type, cached_dimensions)

     stash_report = md5s3stash.md5s3stash(
         'http://example.edu/',
         'fake-bucket',
         conn='FAKE CONN',
         url_cache=self.url_cache,
         hash_cache=self.hash_cache)

     StashReport = namedtuple(
         'StashReport', 'url, md5, s3_url, mime_type, dimensions')
     expected_report = StashReport(
         url='http://example.edu/',
         md5=digest,
         s3_url=cached_s3_url,
         mime_type=cached_mime_type,
         dimensions=cached_dimensions)
     self.assertEqual(stash_report, expected_report)
Пример #16
0
    def test_md5s3stash_with_auth(self, mock_s3move, mock_urlopen):
        '''Stash the test file with url_auth and check the open call and
        the returned report.
        '''
        mock_urlopen.return_value = FakeReq('test resp')
        credentials = ('username', 'password')
        stash_report = md5s3stash.md5s3stash(
            self.testfilepath,
            'fake-bucket',
            conn='FAKE CONN',
            url_auth=credentials)

        # Cache contents the open call is expected to receive.
        # NOTE(review): the endinslash entry is not created by this test;
        # presumably it is left in a shared cache by another test - confirm.
        expected_cache = {}
        for cached_url in (self.testfilepath,
                           'https://example.com/endinslash/'):
            expected_cache[cached_url] = {
                u'If-None-Match': "you're it",
                u'md5': '85b5a0deaa11f3a5d1762c55701c03da',
            }

        fixture_path = os.path.join(DIR_FIXTURES, '1x1.png')
        mock_urlopen.assert_called_once_with(
            fixture_path, auth=credentials, cache=expected_cache)

        # mock's file is not an image
        self.assertEqual(stash_report.mime_type, None)
        self.assertEqual(stash_report.md5,
                         '85b5a0deaa11f3a5d1762c55701c03da')
        self.assertEqual(stash_report.url,
                         os.path.join(DIR_FIXTURES, '1x1.png'))
        self.assertEqual(
            stash_report.s3_url,
            's3://fake-bucket/85b5a0deaa11f3a5d1762c55701c03da')
Пример #17
0
 def file_complete(self, file_size):
     '''Stash the uploaded file with md5s3stash and describe the result.

     file_size -- size reported in the returned tuple

     Return an S3UploadedFile namedtuple (name, size, content_type) whose
     name is the md5 from the stash report.
     '''
     S3UploadedFile = namedtuple('S3UploadedFile',
                                 'name, size, content_type')
     # file url of the upload under MEDIA_ROOT/uploads
     upload_dir_url = "file:///" + settings.MEDIA_ROOT + "/uploads/"
     stash_report = md5s3stash(upload_dir_url + self.file_name,
                               "static-ucldc-cdlib-org/harvested_images")
     return S3UploadedFile(
         name=stash_report.md5,
         size=file_size,
         content_type=self.content_type)
Пример #18
0
    elif not url_parsed.scheme or not url_parsed.netloc:
        print >>sys.stderr, "Link not http URL for {} - {}".format(doc["_id"], url_image)
        return None
    reports = []
    if link_is_to_image(url_image, auth):
        for bucket_base in bucket_bases:
            try:
                logging.getLogger("image_harvest.stash_image").info(
                    "bucket_base:{0} url_image:{1}".format(bucket_base, url_image)
                )
                region, bucket_base = bucket_base.split(":")
                conn = boto.s3.connect_to_region(region)
                report = md5s3stash.md5s3stash(
                    url_image,
                    bucket_base=bucket_base,
                    conn=conn,
                    url_auth=auth,
                    url_cache=url_cache,
                    hash_cache=hash_cache,
                )
                reports.append(report)
            except TypeError, e:
                print >>sys.stderr, "TypeError for doc:{} {} Msg: {} Args:" " {}".format(
                    doc["_id"], url_image, e.message, e.args
                )
        return reports
    else:
        print >>sys.stderr, "Not an image for {} - {}".format(doc["_id"], url_image)
        return None


class ImageHarvester(object):