def test_md5s3stash_with_auth(self, mock_s3move, mock_urlopen):
    """md5s3stash forwards auth credentials and the url cache to urlopen."""
    mock_urlopen.return_value = FakeReq('test resp')
    report = md5s3stash.md5s3stash(
        self.testfilepath,
        'fake-bucket',
        conn='FAKE CONN',
        url_auth=('username', 'password'))
    expected_cache = {
        self.testfilepath: {
            u'If-None-Match': "you're it",
            u'md5': '85b5a0deaa11f3a5d1762c55701c03da',
        },
        'https://example.com/endinslash/': {
            u'If-None-Match': "you're it",
            u'md5': '85b5a0deaa11f3a5d1762c55701c03da',
        },
    }
    fixture_url = os.path.join(DIR_FIXTURES, '1x1.png')
    mock_urlopen.assert_called_once_with(
        fixture_url,
        auth=('username', 'password'),
        cache=expected_cache,
    )
    # mock's file is not an image, so no mime type is detected
    self.assertEqual(report.mime_type, None)
    self.assertEqual(report.md5, '85b5a0deaa11f3a5d1762c55701c03da')
    self.assertEqual(report.url, fixture_url)
    self.assertEqual(report.s3_url,
                     's3://fake-bucket/85b5a0deaa11f3a5d1762c55701c03da')
def test_hash_cache(self, mock_s3move, mock_urlopen):
    """A hash_cache entry is recorded under the stashed file's md5."""
    mock_urlopen.return_value = FakeReq('test resp')
    report = md5s3stash.md5s3stash(
        'http://example.edu/', 'fake-bucket', conn='FAKE CONN',
        url_cache=self.url_cache, hash_cache=self.hash_cache)
    StashReport = namedtuple(
        'StashReport', 'url, md5, s3_url, mime_type, dimensions')
    expected = StashReport(
        url='http://example.edu/',
        md5='85b5a0deaa11f3a5d1762c55701c03da',
        s3_url='s3_url',
        mime_type='mime_type',
        dimensions=(100, 100))
    self.assertEqual(report, expected)
    self.assertEqual(
        self.hash_cache,
        {'85b5a0deaa11f3a5d1762c55701c03da':
            ('s3_url', 'mime_type', (100, 100))})
    mock_urlopen.reset_mock()
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Stash an s3 image for every document in the CouchDB view.

    If collection_key is None, try to grab all of the images.
    (Not recommended.)
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    # Page through the view; only pass key= when a collection was requested.
    if collection_key:
        v = couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true',
                          key=collection_key)
    else:
        v = couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true')
    for r in v:
        doc = r.doc
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):
            # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        # BUG FIX: dict.get evaluates its default eagerly, so the original
        # doc.get('isShownBy', get_isShownBy(doc)) ran get_isShownBy() for
        # every doc — even ones that already had the field.  Only call it
        # when the field is actually missing.
        if 'isShownBy' not in doc:
            try:
                doc['isShownBy'] = get_isShownBy(doc)
            except Exception as e:
                print("ERROR: Can't get isShownBy for {} : {}".format(
                    doc['_id'], e))
                continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError as e:
            # doc['isShownBy'] has no 'src' key
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
def stash_image_for_doc(doc, url_cache, hash_cache, ignore_content_type, bucket_bases=BUCKET_BASES, auth=None):
    '''Stash the images in s3, using md5s3stash
    Duplicate it among the "BUCKET_BASES" list. This will give redundancy
    in case some idiot (me) deletes one of the copies. Not tons of data so
    cheap to replicate them.
    Return md5s3stash report if image found
    If link is not an image type, don't stash & raise

    :param doc: CouchDB document dict; 'isShownBy' holds the image url
    :param url_cache: cache mapping url -> conditional-request headers/md5
    :param hash_cache: cache mapping md5 -> (s3_url, mime_type, dimensions)
    :param ignore_content_type: skip the link_is_to_image content-type check
    :param bucket_bases: list of "region:bucket" strings to replicate into
    :param auth: optional (user, password) tuple passed through to fetches
    :returns: list of md5s3stash reports, one per bucket stashed into
    :raises IsShownByError: when 'isShownBy' is missing or empty
    :raises FailsImageTest: when the link is not a usable image url
    '''
    try:
        url_image = doc['isShownBy']
        if not url_image:
            # NOTE(review): raised inside the try — whether the except
            # below re-wraps this depends on IsShownByError's hierarchy
            # (only caught if it subclasses KeyError); confirm intent.
            raise IsShownByError(
                "isShownBy empty for {0}".format(doc['_id']),
                doc_id=doc['_id'])
    except KeyError as e:
        raise IsShownByError(
            "isShownBy missing for {0}".format(doc['_id']),
            doc_id=doc['_id'])
    if isinstance(url_image, list):
        # need to fix marc map_is_shown_at
        url_image = url_image[0]
    # try to parse url, check values of scheme & netloc at least
    url_parsed = urlparse.urlsplit(url_image)
    if url_parsed.scheme == 'ark':
        # for some OAC objects, the reference image is not a url but a path.
        url_image = '/'.join((URL_OAC_CONTENT_BASE, url_image))
    elif not url_parsed.scheme or not url_parsed.netloc:
        # not a fetchable absolute URL
        msg = 'Link not http URL for {} - {}'.format(doc['_id'], url_image)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
    reports = []
    # If '--ignore_content_type' set, don't check link_is_to_image
    if link_is_to_image(doc['_id'], url_image, auth) or ignore_content_type:
        # replicate the image into every configured bucket
        for bucket_base in bucket_bases:
            try:
                logging.getLogger('image_harvest.stash_image').info(
                    'bucket_base:{0} url_image:{1}'.format(
                        bucket_base, url_image))
                # bucket_base entries are "region:bucket" pairs
                region, bucket_base = bucket_base.split(':')
                conn = boto.s3.connect_to_region(region)
                report = md5s3stash.md5s3stash(
                    url_image, bucket_base=bucket_base, conn=conn,
                    url_auth=auth, url_cache=url_cache,
                    hash_cache=hash_cache)
                reports.append(report)
            except TypeError as e:
                # best-effort: log and continue with remaining buckets
                print >> sys.stderr, 'TypeError for doc:{} {} Msg: {} Args:' \
                    ' {}'.format(
                        doc['_id'], url_image, e.message, e.args)
        return reports
    else:
        msg = 'Not an image for {} - {}'.format(doc['_id'], url_image)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Stash an s3 image for every document in the CouchDB view.

    If collection_key is None, try to grab all of the images.
    (Not recommended.)
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    # Only pass key= to the pager when a specific collection was requested.
    if collection_key:
        v = couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true',
                          key=collection_key)
    else:
        v = couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true')
    for r in v:
        doc = r.doc
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):
            # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        # BUG FIX: dict.get evaluates its default argument eagerly, so
        # doc.get('isShownBy', get_isShownBy(doc)) invoked get_isShownBy()
        # unconditionally.  Call it only when the field is missing.
        if 'isShownBy' not in doc:
            try:
                doc['isShownBy'] = get_isShownBy(doc)
            except Exception as e:
                print("ERROR: Can't get isShownBy for {} : {}".format(
                    doc['_id'], e))
                continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError as e:
            # doc['isShownBy'] lacks a 'src' key
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
def stash_image_for_doc(doc, url_cache, hash_cache, ignore_content_type, bucket_bases=BUCKET_BASES, auth=None):
    '''Stash the doc's image in s3 via md5s3stash, replicated across every
    entry of "BUCKET_BASES" (redundancy against accidental deletion; the
    data volume is small, so copies are cheap).

    Returns the list of md5s3stash reports when the link is a stashable
    image; raises IsShownByError when 'isShownBy' is missing/empty and
    FailsImageTest when the link is not a usable image url.
    '''
    try:
        image_url = doc['isShownBy']
        if not image_url:
            raise IsShownByError(
                "isShownBy empty for {0}".format(doc['_id']),
                doc_id=doc['_id'])
    except KeyError:
        raise IsShownByError(
            "isShownBy missing for {0}".format(doc['_id']),
            doc_id=doc['_id'])
    # need to fix marc map_is_shown_at
    if isinstance(image_url, list):
        image_url = image_url[0]
    # sanity-check the url: it needs at least a scheme and a host
    parsed = urlparse.urlsplit(image_url)
    if parsed.scheme == 'ark':
        # for some OAC objects, the reference image is not a url but a path.
        image_url = '/'.join((URL_OAC_CONTENT_BASE, image_url))
    elif not parsed.scheme or not parsed.netloc:
        msg = 'Link not http URL for {} - {}'.format(doc['_id'], image_url)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
    # '--ignore_content_type' bypasses the link_is_to_image check
    if not (link_is_to_image(doc['_id'], image_url, auth)
            or ignore_content_type):
        msg = 'Not an image for {} - {}'.format(doc['_id'], image_url)
        print >> sys.stderr, msg
        raise FailsImageTest(msg, doc_id=doc['_id'])
    stash_reports = []
    log = logging.getLogger('image_harvest.stash_image')
    for base_spec in bucket_bases:
        try:
            log.info(
                'bucket_base:{0} url_image:{1}'.format(base_spec, image_url))
            # each entry is a "region:bucket" pair
            region, bucket_name = base_spec.split(':')
            conn = boto.s3.connect_to_region(region)
            stash_reports.append(
                md5s3stash.md5s3stash(image_url,
                                      bucket_base=bucket_name,
                                      conn=conn,
                                      url_auth=auth,
                                      url_cache=url_cache,
                                      hash_cache=hash_cache))
        except TypeError as e:
            # best-effort: report and keep replicating to other buckets
            print >> sys.stderr, 'TypeError for doc:{} {} Msg: {} Args:' \
                ' {}'.format(doc['_id'], image_url, e.message, e.args)
    return stash_reports
def test_md5s3stash_trailing_slash_url(self, mock_s3move, mock_urlopen):
    '''Nuxeo urls end with a slash, so os.path.basename on them yields an
    empty string ''.  md5s3stash must instead use NamedTemporaryFile with
    delete=False to handle every url shape.
    '''
    mock_urlopen.return_value = FakeReq('test resp')
    report = md5s3stash.md5s3stash(
        'https://example.com/endinslash/',
        'fake-bucket',
        conn='FAKE CONN',
        url_auth=('username', 'password'))
def test_redis_cache_save(self, mock_s3move, mock_urlopen):
    """Both the url cache and the hash cache are populated after a stash."""
    mock_urlopen.return_value = FakeReq('test resp')
    report = md5s3stash.md5s3stash(
        'https://example.com/endinslash/',
        'fake-bucket',
        conn='FAKE CONN',
        url_auth=('username', 'password'),
        url_cache=self.url_cache,
        hash_cache=self.hash_cache)
    expected_url_entry = {
        u'If-None-Match': "you're it",
        u'md5': '85b5a0deaa11f3a5d1762c55701c03da',
    }
    self.assertEqual(
        self.url_cache['https://example.com/endinslash/'],
        expected_url_entry)
    expected_hash_entry = (
        's3://m.fake-bucket/85b5a0deaa11f3a5d1762c55701c03da', None, (0, 0))
    self.assertEqual(
        self.hash_cache['85b5a0deaa11f3a5d1762c55701c03da'],
        expected_hash_entry)
def test_redis_url_cache_retrieve(self, mock_s3move):
    """Cached etag/modified values are sent as conditional request headers."""
    httpretty.enable()
    httpretty.register_uri(
        httpretty.GET,
        'https://example.edu',
        status=200,
        content_type='mime_type',
        body='test body')
    report = md5s3stash.md5s3stash(
        'https://example.edu',
        'fake-bucket',
        conn='FAKE CONN',
        url_cache=self.url_cache,
        hash_cache=self.hash_cache)
    headers_sent = httpretty.last_request().headers
    self.assertEqual(headers_sent['If-None-Match'], 'nice etag')
    self.assertEqual(headers_sent['If-Modified-Since'], 'since test val')
def save(self, *args, **kwargs):
    """Save the model, then mirror each local media file to s3.

    For every field named in push_to_s3 that holds a file present under
    MEDIA_ROOT, the file is stashed via md5s3stash, the local copy is
    deleted from the field's storage, and the field is re-pointed at the
    stash's md5 name, followed by a second targeted save of just that
    field.
    """
    super(ExhibitItem, self).save(*args, **kwargs)
    for s3field in self.push_to_s3:
        name = getattr(self, s3field).name
        if name:
            # local filesystem path of the uploaded file
            # NOTE(review): string concat assumes MEDIA_ROOT has no
            # trailing slash — confirm settings convention
            url = settings.MEDIA_ROOT + "/" + name
            if os.path.isfile(url):
                field_instance = getattr(self, s3field)
                report = md5s3stash("file://" + url, settings.S3_STASH)
                # remove the local copy; the s3 object replaces it
                field_instance.storage.delete(name)
                field_instance.name = report.md5
                # temporarily blank upload_to so the second save keeps the
                # bare md5 name, then restore the original setting
                upload_to = self._meta.get_field(s3field).upload_to
                self._meta.get_field(s3field).upload_to = ''
                super(ExhibitItem, self).save(update_fields=[s3field])
                self._meta.get_field(s3field).upload_to = upload_to
def test_redis_url_cache_retrieve(self, mock_s3move):
    """The url cache's etag / modified-since values make it onto the wire."""
    httpretty.enable()
    httpretty.register_uri(httpretty.GET,
                           'https://example.edu',
                           status=200,
                           content_type='mime_type',
                           body='test body')
    report = md5s3stash.md5s3stash('https://example.edu',
                                   'fake-bucket',
                                   conn='FAKE CONN',
                                   url_cache=self.url_cache,
                                   hash_cache=self.hash_cache)
    sent = httpretty.last_request().headers
    self.assertEqual(sent['If-None-Match'], 'nice etag')
    self.assertEqual(sent['If-Modified-Since'], 'since test val')
def test_redis_cache_save(self, mock_s3move, mock_urlopen):
    """A stash writes entries into both caches keyed by url and md5."""
    mock_urlopen.return_value = FakeReq('test resp')
    report = md5s3stash.md5s3stash('https://example.com/endinslash/',
                                   'fake-bucket',
                                   conn='FAKE CONN',
                                   url_auth=('username', 'password'),
                                   url_cache=self.url_cache,
                                   hash_cache=self.hash_cache)
    self.assertEqual(
        self.url_cache['https://example.com/endinslash/'],
        {u'If-None-Match': "you're it",
         u'md5': '85b5a0deaa11f3a5d1762c55701c03da'})
    self.assertEqual(
        self.hash_cache['85b5a0deaa11f3a5d1762c55701c03da'],
        ('s3://m.fake-bucket/85b5a0deaa11f3a5d1762c55701c03da',
         None,
         (0, 0)))
def test_redis_hash_cache_retrieve(self, mock_s3move, mock_urlopen):
    """A pre-seeded hash_cache entry is returned in the stash report."""
    mock_urlopen.return_value = FakeReq('test resp')
    cached = ('s3_url', 'mime_type', (100, 100))
    self.hash_cache['85b5a0deaa11f3a5d1762c55701c03da'] = cached
    report = md5s3stash.md5s3stash(
        'http://example.edu/', 'fake-bucket', conn='FAKE CONN',
        url_cache=self.url_cache, hash_cache=self.hash_cache)
    StashReport = namedtuple(
        'StashReport', 'url, md5, s3_url, mime_type, dimensions')
    self.assertEqual(
        report,
        StashReport(url='http://example.edu/',
                    md5='85b5a0deaa11f3a5d1762c55701c03da',
                    s3_url='s3_url',
                    mime_type='mime_type',
                    dimensions=(100, 100)))
def test_md5s3stash_with_auth(self, mock_s3move, mock_urlopen):
    """Auth tuple and cache dict are forwarded to the url opener."""
    mock_urlopen.return_value = FakeReq('test resp')
    report = md5s3stash.md5s3stash(self.testfilepath,
                                   'fake-bucket',
                                   conn='FAKE CONN',
                                   url_auth=('username', 'password'))
    cache_entry = {u'If-None-Match': "you're it",
                   u'md5': '85b5a0deaa11f3a5d1762c55701c03da'}
    tdict = {
        self.testfilepath: dict(cache_entry),
        'https://example.com/endinslash/': dict(cache_entry),
    }
    mock_urlopen.assert_called_once_with(
        os.path.join(DIR_FIXTURES, '1x1.png'),
        auth=('username', 'password'),
        cache=tdict)
    # mock's file is not an image
    self.assertEqual(report.mime_type, None)
    self.assertEqual(report.md5, '85b5a0deaa11f3a5d1762c55701c03da')
    self.assertEqual(report.url, os.path.join(DIR_FIXTURES, '1x1.png'))
    self.assertEqual(report.s3_url,
                     's3://fake-bucket/85b5a0deaa11f3a5d1762c55701c03da')
def file_complete(self, file_size):
    """Stash the finished upload in s3 and describe it as the stored file."""
    source_url = ("file:///" + settings.MEDIA_ROOT
                  + "/uploads/" + self.file_name)
    report = md5s3stash(source_url, "static-ucldc-cdlib-org/harvested_images")
    S3UploadedFile = namedtuple(
        'S3UploadedFile', 'name, size, content_type')
    # the stored name is the file's md5, as assigned by md5s3stash
    return S3UploadedFile(report.md5, file_size, self.content_type)
elif not url_parsed.scheme or not url_parsed.netloc: print >>sys.stderr, "Link not http URL for {} - {}".format(doc["_id"], url_image) return None reports = [] if link_is_to_image(url_image, auth): for bucket_base in bucket_bases: try: logging.getLogger("image_harvest.stash_image").info( "bucket_base:{0} url_image:{1}".format(bucket_base, url_image) ) region, bucket_base = bucket_base.split(":") conn = boto.s3.connect_to_region(region) report = md5s3stash.md5s3stash( url_image, bucket_base=bucket_base, conn=conn, url_auth=auth, url_cache=url_cache, hash_cache=hash_cache, ) reports.append(report) except TypeError, e: print >>sys.stderr, "TypeError for doc:{} {} Msg: {} Args:" " {}".format( doc["_id"], url_image, e.message, e.args ) return reports else: print >>sys.stderr, "Not an image for {} - {}".format(doc["_id"], url_image) return None class ImageHarvester(object):