Example No. 1
    def test_compression(self):
        import os
        from databundles.run import get_runconfig, RunConfig
        from databundles.cache import new_cache
        from databundles.util import temp_file_name, md5_for_file, copy_file_or_flo
        
        rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'), RunConfig.USER_CONFIG))

        comp_cache = new_cache(rc.filesystem('compressioncache'))
        
        test_file_name = 'test_file'

        fn = temp_file_name()
        print 'orig file', fn
        with open(fn, 'wb') as f:
            for i in range(1000):
                f.write("{:03d}:".format(i))

        cf = comp_cache.put(fn, test_file_name)

        with open(cf, 'rb') as stream:
            from databundles.util.sgzip import GzipFile
            stream = GzipFile(stream)
            
            uncomp_cache = new_cache(rc.filesystem('fscache'))
            
            uncomp_stream = uncomp_cache.put_stream('decomp')
            
            copy_file_or_flo(stream, uncomp_stream)
    
        uncomp_stream.close()
            
        dcf = uncomp_cache.get('decomp')

        self.assertEqual(md5_for_file(fn), md5_for_file(dcf))
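
All five examples hinge on databundles.util.md5_for_file. The library's implementation isn't reproduced on this page, but a chunked-read helper along these lines is enough to follow the tests (a minimal sketch; the name md5_for_file_sketch and the 1 MiB block size are illustrative assumptions, not the library's actual code):

import hashlib

def md5_for_file_sketch(path, block_size=1024 * 1024):
    # Read fixed-size blocks so large bundle files never have to fit in memory.
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            md5.update(block)
    return md5.hexdigest()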
Example No. 2
def post_dataset(did,library): 
    '''Accept a payload that describes a bundle in the remote. Download the
    bundle and install it. '''

    did = did.replace('|','/')

    from databundles.identity import new_identity, Identity
    from databundles.util import md5_for_file
    from databundles.bundle import DbBundle
    
    payload = request.json
    identity = new_identity(payload['identity'])

    if did not in (identity.id_, identity.vid):
        raise exc.Conflict("Dataset address '{}' doesn't match payload id '{}'".format(did, identity.vid))

    # need to go directly to remote, not library.get() because the
    # dataset hasn't been loaded yet. 
    db_path = library.load(identity.cache_key)

    if not db_path:
        logger.error("Failed to get {} from cache while posting dataset".format(identity.cache_key))
        logger.error("  cache =  {}".format(library.cache))
        logger.error("  remote = {}".format(library.remote))
        raise exc.NotFound("Didn't get bundle file for cache key {}".format(identity.cache_key))

    logger.debug("Loading {} for identity {} ".format(db_path, identity))

    #b = DbBundle(db_path, logger=logger)

    md5 = md5_for_file(db_path)
    
    if md5 != payload['md5']:
        logger.debug('MD5 Mismatch: {} != {}'.format(md5, payload['md5']))
        # First, try deleting the cached copy and re-fetching
        # but don't delete it unless there is an intervening cache
        #if library.remote.path(identity.cache_key).startswith('http'):
        #    raise exc.Conflict("MD5 Mismatch (a)")
        
        library.remote.remove(identity.cache_key)
        db_path = library.remote.get(identity.cache_key)
        
        md5 = md5_for_file(db_path)
        if md5 != payload['md5']:
            logger.debug('MD5 Mismatch, persisting after refetch: {} != {}'.format(md5, payload['md5']))
            raise exc.Conflict("MD5 Mismatch (b)")

    b = DbBundle(db_path)

    if b.identity.cache_key != identity.cache_key:
        logger.debug("Identity mismatch while posting dataset: {} != {}".format(b.identity.cache_key, identity.cache_key))
        raise exc.Conflict("Identity of downloaded bundle doesn't match request payload")

    library.put(b)

    #library.run_dumper_thread()

    return b.identity.to_dict()
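
post_dataset() expects the request body to carry the bundle's identity dict and an MD5 digest it can re-verify after download. A client-side sketch of that payload, reusing md5_for_file_sketch from Example No. 1's notes (build_post_payload is a hypothetical helper, not part of databundles):

def build_post_payload(identity, db_path):
    # The server re-computes the MD5 after fetching the bundle and raises
    # 409 Conflict if the digests still differ after one re-fetch.
    return {
        'identity': identity.to_dict(),
        'md5': md5_for_file_sketch(db_path),
    }

# The 'did' URL segment encodes '/' as '|', which post_dataset() reverses
# with did.replace('|', '/') before comparing it to identity.id_ / identity.vid.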
Example No. 3
    def test_caches(self):
        '''Basic test of put(), get() and has() for all cache types'''
        import os
        from functools import partial
        from databundles.run import get_runconfig, RunConfig
        from databundles.filesystem import Filesystem
        from databundles.cache import new_cache
        from databundles.util import md5_for_file
        from databundles.bundle import DbBundle
        
        self.start_server() # For the rest-cache
        
        #fn = '/tmp/1mbfile'
        #with open(fn, 'wb') as f:
        #    f.write('.'*(1024))
      
        fn = self.bundle.database.path
      
        # Opening the file might run the database updates in 
        # database.sqlite._on_connect_update_schema, which can affect the md5.
        b = DbBundle(fn)
      
        md5 = md5_for_file(fn)

        print "MD5 {} = {}".format(fn, md5)

        rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'), RunConfig.USER_CONFIG))
        
        for i, fsname in enumerate(['fscache', 'limitedcache', 'compressioncache', 'cached-s3', 'cached-compressed-s3', 'rest-cache']):

            config = rc.filesystem(fsname)
            cache = new_cache(config)
            print '---', fsname, cache
            identity = self.bundle.identity

            relpath = identity.cache_key

            r = cache.put(fn, relpath, identity.to_meta(md5=md5))
            r = cache.get(relpath)

            if not r.startswith('http'):
                self.assertTrue(os.path.exists(r), str(cache))
                
            self.assertTrue(cache.has(relpath, md5=md5))
            
            cache.remove(relpath, propagate=True)
            
            self.assertFalse(os.path.exists(r), str(cache))
            self.assertFalse(cache.has(relpath))
            

        cache = new_cache(rc.filesystem('s3cache-noupstream'))         
        r = cache.put(fn, 'a')
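
The loop works because every cache flavor honors the same put()/get()/has()/remove() contract. A directory-backed toy with that shape makes the contract explicit (an illustration of the interface only, not the databundles implementation; ToyFsCache is an invented name):

import os
import shutil
import tempfile

class ToyFsCache(object):
    '''Minimal sketch of the cache interface test_caches() exercises.'''

    def __init__(self, root=None):
        self.root = root or tempfile.mkdtemp()

    def _abs(self, rel_path):
        return os.path.join(self.root, rel_path)

    def put(self, source, rel_path, metadata=None):
        # Like the real caches, put() returns a path to the stored copy.
        dest = self._abs(rel_path)
        dir_ = os.path.dirname(dest)
        if dir_ and not os.path.isdir(dir_):
            os.makedirs(dir_)
        shutil.copyfile(source, dest)
        return dest

    def get(self, rel_path):
        path = self._abs(rel_path)
        return path if os.path.exists(path) else None

    def has(self, rel_path, md5=None):
        # The real caches can also verify the stored md5; this sketch ignores it.
        return os.path.exists(self._abs(rel_path))

    def remove(self, rel_path, propagate=False):
        # In the real stack, propagate=True deletes upstream copies as well.
        path = self._abs(rel_path)
        if os.path.exists(path):
            os.remove(path)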
Example No. 4
    def _send(self, package, extract_data, file_):
        import os
        import mimetypes
        
        _, ext = os.path.splitext(file_)
        mimetypes.init()
        content_type = mimetypes.types_map.get(ext, None)
        
        try:
            _, format = content_type.split('/')
        except (AttributeError, ValueError):
            # content_type is None (or malformed) for unknown extensions
            format = None
        
        name = extract_data.get('name', os.path.basename(file_))

        #
        # If the filestore exists, write to S3 first, then upload the URL
        if self.filestore:
            from databundles.util import md5_for_file
            urlf = self.filestore.public_url_f(public=True)
            path = self.bundle.identity.path+'/'+name

            # Don't upload if S3 already has a file with the same key and md5
            md5 = md5_for_file(file_)
            if not self.filestore.has(path, md5=md5):
                self.filestore.put(file_, path, metadata={'public':True, 'md5':md5})

            r = self.remote.add_url_resource(package, urlf(path), name,
                    description=extract_data['description'],
                    content_type=content_type,
                    format=format,
                    hash=md5,
                    rel_path=path
                    )
        else:
            r = self.remote.add_file_resource(package, file_,
                                name=name,
                                description=extract_data['description'],
                                content_type=content_type,
                                format=format
                                )
        
        return r
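
The try/except around content_type.split('/') guards against mimetypes.types_map.get() returning None for unknown extensions. That lookup can be isolated into a small helper (a sketch, not part of databundles; guess_content_type is an invented name):

import mimetypes
import os

def guess_content_type(file_name):
    # Returns (content_type, format), e.g. ('text/csv', 'csv'),
    # or (None, None) when the extension isn't in the mimetypes table.
    _, ext = os.path.splitext(file_name)
    mimetypes.init()
    content_type = mimetypes.types_map.get(ext)
    fmt = content_type.split('/')[1] if content_type else None
    return content_type, fmt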
Example No. 5
    def upload_file(self, identity, path, ci=None, force=False):
        '''Upload a file to the object_store_config's object store'''
        from databundles.util import md5_for_file
        from databundles.dbexceptions import ConfigurationError
        import json

        if ci is None:
            ci = self.remote.info().objectstore().get().object

        if ci['service'] == 's3':
            from databundles.filesystem import S3Cache, FsCompressionCache
            
            if not self.accounts_config:
                raise ConfigurationError("Remote requires S3 upload, but no account_config is set for this api")
            
            secret = self.accounts_config.get('s3', {}).get(ci['access_key'], False)
            
            if not secret:
                print self.accounts_config
                raise ConfigurationError("Didn't find key {} in configuration accounts.s3".format(ci['access_key']))

            ci['secret'] = secret
            
            del ci['service']
            fs = FsCompressionCache(S3Cache(**ci))
            #fs = S3Cache(**ci)
        else:
            raise NotImplementedError("No handler for service: {} ".format(ci))

        md5 = md5_for_file(path)
        
        if fs.has(identity.cache_key, md5=md5) and not force:
            return identity.cache_key
        else:
            
            metadata = {'id': identity.id_, 'identity': json.dumps(identity.to_dict()), 'name': identity.name, 'md5': md5}

            return fs.put(path, identity.cache_key, metadata=metadata)
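
The closing has()-then-put() pair is the same skip-identical-uploads idiom seen in _send() above. Reduced to its essentials, against any store with the has/put interface these examples use (a generic sketch reusing md5_for_file_sketch from Example No. 1's notes):

import json

def upload_if_changed(fs, path, identity, force=False):
    # Skip the transfer when the store already holds an identical copy,
    # identified by cache key plus MD5.
    md5 = md5_for_file_sketch(path)
    if fs.has(identity.cache_key, md5=md5) and not force:
        return identity.cache_key
    metadata = {
        'id': identity.id_,
        'identity': json.dumps(identity.to_dict()),
        'name': identity.name,
        'md5': md5,
    }
    return fs.put(path, identity.cache_key, metadata=metadata)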