def test_compression(self):
    from databundles.run import get_runconfig, RunConfig
    from databundles.cache import new_cache
    from databundles.util import temp_file_name, md5_for_file, copy_file_or_flo

    rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                        RunConfig.USER_CONFIG))

    comp_cache = new_cache(rc.filesystem('compressioncache'))

    test_file_name = 'test_file'

    fn = temp_file_name()

    print 'orig file ', fn

    # Write a known pattern so the round-trip can be verified by MD5.
    with open(fn, 'wb') as f:
        for i in range(1000):
            f.write("{:03d}:".format(i))

    cf = comp_cache.put(fn, test_file_name)

    with open(cf) as stream:
        from databundles.util.sgzip import GzipFile
        stream = GzipFile(stream)

        uncomp_cache = new_cache(rc.filesystem('fscache'))

        uncomp_stream = uncomp_cache.put_stream('decomp')

        copy_file_or_flo(stream, uncomp_stream)

        uncomp_stream.close()

    dcf = uncomp_cache.get('decomp')

    self.assertEquals(md5_for_file(fn), md5_for_file(dcf))
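Every snippet in this section verifies files with databundles.util.md5_for_file, whose implementation isn't shown here. A minimal sketch of the chunked-MD5 helper it presumably resembles (the real signature and block size may differ):

import hashlib

def md5_for_file(path, block_size=2 ** 20):
    '''Sketch: hash a file in chunks so large bundle files never have to
    fit in memory. Assumes the real helper returns a hex digest string.'''
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(block_size), b''):
            md5.update(chunk)
    return md5.hexdigest()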
def post_dataset(did, library):
    '''Accept a payload that describes a bundle in the remote.
    Download the bundle and install it.'''

    did = did.replace('|', '/')

    from databundles.identity import new_identity, Identity
    from databundles.util import md5_for_file

    payload = request.json
    identity = new_identity(payload['identity'])

    if did not in (identity.id_, identity.vid):
        raise exc.Conflict("Dataset address '{}' doesn't match payload id '{}'".format(did, identity.vid))

    # Need to go directly to the remote, not library.get(), because the
    # dataset hasn't been loaded yet.
    db_path = library.load(identity.cache_key)

    if not db_path:
        logger.error("Failed to get {} from cache while posting dataset".format(identity.cache_key))
        logger.error("  cache  = {}".format(library.cache))
        logger.error("  remote = {}".format(library.remote))
        raise exc.NotFound("Didn't get bundle file for cache key {}".format(identity.cache_key))

    logger.debug("Loading {} for identity {}".format(db_path, identity))

    #b = DbBundle(db_path, logger=logger)

    md5 = md5_for_file(db_path)

    if md5 != payload['md5']:
        logger.debug('MD5 mismatch: {} != {}'.format(md5, payload['md5']))

        # First, try deleting the cached copy and re-fetching,
        # but don't delete it unless there is an intervening cache.
        #if library.remote.path(identity.cache_key).startswith('http'):
        #    raise exc.Conflict("MD5 Mismatch (a)")

        library.remote.remove(identity.cache_key)
        db_path = library.remote.get(identity.cache_key)

        md5 = md5_for_file(db_path)
        if md5 != payload['md5']:
            logger.debug('MD5 mismatch, persisting after refetch: {} != {}'.format(md5, payload['md5']))
            raise exc.Conflict("MD5 Mismatch (b)")

    b = DbBundle(db_path)

    if b.identity.cache_key != identity.cache_key:
        logger.debug("Identity mismatch while posting dataset: {} != {}".format(b.identity.cache_key, identity.cache_key))
        raise exc.Conflict("Identity of downloaded bundle doesn't match request payload")

    library.put(b)

    #library.run_dumper_thread()

    return b.identity.to_dict()
def test_caches(self):
    '''Basic test of put(), get() and has() for all cache types'''
    from functools import partial
    from databundles.run import get_runconfig, RunConfig
    from databundles.filesystem import Filesystem
    from databundles.cache import new_cache
    from databundles.util import md5_for_file
    from databundles.bundle import DbBundle

    self.start_server()  # For the rest-cache

    #fn = '/tmp/1mbfile'
    #with open(fn, 'wb') as f:
    #    f.write('.' * (1024))

    fn = self.bundle.database.path

    # Opening the file might run the database updates in
    # database.sqlite._on_connect_update_schema, which can affect the md5.
    b = DbBundle(fn)

    md5 = md5_for_file(fn)

    print "MD5 {} = {}".format(fn, md5)

    rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                        RunConfig.USER_CONFIG))

    for i, fsname in enumerate(['fscache', 'limitedcache', 'compressioncache',
                                'cached-s3', 'cached-compressed-s3', 'rest-cache']):

        config = rc.filesystem(fsname)
        cache = new_cache(config)
        print '---', fsname, cache

        identity = self.bundle.identity
        relpath = identity.cache_key

        r = cache.put(fn, relpath, identity.to_meta(md5=md5))
        r = cache.get(relpath)

        if not r.startswith('http'):
            self.assertTrue(os.path.exists(r), str(cache))

        self.assertTrue(cache.has(relpath, md5=md5))

        cache.remove(relpath, propagate=True)

        self.assertFalse(os.path.exists(r), str(cache))
        self.assertFalse(cache.has(relpath))

    cache = new_cache(rc.filesystem('s3cache-noupstream'))
    r = cache.put(fn, 'a')
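The caches returned by new_cache() are driven through the same small surface everywhere above: put(), get(), has() and remove(). A minimal directory-backed sketch of that contract, for orientation only; MinimalFsCache and its internals are illustrative, not the real FsCache:

import os
import shutil

class MinimalFsCache(object):
    '''Illustrative stand-in for the put/get/has/remove contract used above.'''

    def __init__(self, root):
        self.root = root

    def _abspath(self, rel_path):
        return os.path.join(self.root, rel_path)

    def put(self, source, rel_path, metadata=None):
        # Copy the source file into the cache and return where it landed.
        dest = self._abspath(rel_path)
        dirname = os.path.dirname(dest)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        shutil.copyfile(source, dest)
        return dest

    def get(self, rel_path):
        # Return a local path for the entry, or None if it isn't cached.
        path = self._abspath(rel_path)
        return path if os.path.exists(path) else None

    def has(self, rel_path, md5=None):
        # The real caches can also compare against the stored md5; this
        # sketch only checks existence.
        return os.path.exists(self._abspath(rel_path))

    def remove(self, rel_path, propagate=False):
        # In the real caches, propagate=True also removes the entry from
        # upstream caches; here there is no upstream.
        path = self._abspath(rel_path)
        if os.path.exists(path):
            os.remove(path)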
def _send(self, package, extract_data, file_):
    import os
    import mimetypes

    _, ext = os.path.splitext(file_)

    mimetypes.init()
    content_type = mimetypes.types_map.get(ext, None)  #@UndefinedVariable

    try:
        _, format = content_type.split('/')
    except AttributeError:
        # content_type is None for unknown extensions.
        format = None

    name = extract_data.get('name', os.path.basename(file_))

    # If the filestore exists, write to S3 first, then upload the URL.
    if self.filestore:
        from databundles.util import md5_for_file

        urlf = self.filestore.public_url_f(public=True)
        path = self.bundle.identity.path + '/' + name

        # Don't upload if S3 already has a file with the same key and md5.
        md5 = md5_for_file(file_)
        if not self.filestore.has(path, md5=md5):
            self.filestore.put(file_, path, metadata={'public': True, 'md5': md5})

        r = self.remote.add_url_resource(package, urlf(path), name,
                                         description=extract_data['description'],
                                         content_type=content_type,
                                         format=format,
                                         hash=md5,
                                         rel_path=path)
    else:
        r = self.remote.add_file_resource(package, file_,
                                          name=name,
                                          description=extract_data['description'],
                                          content_type=content_type,
                                          format=format)

    return r
def upload_file(self, identity, path, ci=None, force=False):
    '''Upload a file to the object store named in the remote's
    object store configuration.'''
    from databundles.util import md5_for_file
    from databundles.dbexceptions import ConfigurationError
    import json

    if ci is None:
        ci = self.remote.info().objectstore().get().object

    if ci['service'] == 's3':
        from databundles.filesystem import S3Cache, FsCompressionCache

        if not self.accounts_config:
            raise ConfigurationError("Remote requires S3 upload, but no accounts_config is set for this api")

        secret = self.accounts_config.get('s3', {}).get(ci['access_key'], False)

        if not secret:
            print self.accounts_config
            raise ConfigurationError("Didn't find key {} in configuration accounts.s3".format(ci['access_key']))

        ci['secret'] = secret
        del ci['service']

        # Wrap the S3 cache in a compression cache so uploads are gzipped.
        fs = FsCompressionCache(S3Cache(**ci))
        #fs = S3Cache(**ci)
    else:
        raise NotImplementedError("No handler for service: {}".format(ci))

    md5 = md5_for_file(path)

    # Skip the upload if the store already has this file, unless forced.
    if fs.has(identity.cache_key, md5) and not force:
        return identity.cache_key
    else:
        metadata = {'id': identity.id_,
                    'identity': json.dumps(identity.to_dict()),
                    'name': identity.name,
                    'md5': md5}
        return fs.put(path, identity.cache_key, metadata=metadata)
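For orientation, the ci dict unpacked into S3Cache(**ci) above comes from the remote's objectstore info. Only the 'service' and 'access_key' keys (and the injected 'secret') are confirmed by the code; the rest of the shape below is an assumption for illustration:

# Hypothetical shape of the object-store config consumed above. 'service'
# is checked and then deleted before S3Cache(**ci); 'access_key' is used to
# look up the secret in accounts_config['s3']; 'secret' is injected by
# upload_file. Any other keys (e.g. a bucket name) are assumed, since
# S3Cache's constructor isn't shown here.
ci = {
    'service': 's3',
    'access_key': 'AKIAEXAMPLE',
    # 'bucket': 'example-bucket',  # assumed: an S3 cache presumably needs a target bucket
}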