예제 #1
0
    def __init__(
            self,
            identifier          = None,
            environment         = None,
            operation           = None,
            hierarchy           = None,
            validate            = False,
            deferred_timeout    = 30,
            lock_timeout        = 1,
            environment_clear   = True,
            connect_clear       = False
            ):
        """
        open the cache store and register the environment key of this cache

        identifier:         name under which the cache file is stored (joined onto cachepath).
                            when falsy, self.identifier is left untouched and must already exist;
                            the original note says it defaults to modulename.objname, but that
                            default is not set in this method
        environment:        object describing the dependencies on global state of the cached operation.
                            when falsy, the result of self.environment() is used instead
        operation:          function to be cached; when falsy, self.operation is left untouched.
                            note that the order of arguments is significant for key reuse
        hierarchy:          key hierarchy; if and how args and kwargs are hierarchically ordered
        validate:           validation mode; stored as self.validate. the original note says all cache
                            retrievals are then checked against a recomputed function call
        deferred_timeout:   time to wait before a deferred object is considered obsolete;
                            stored as self.deferred_timeout
        lock_timeout:       timeout handed to the lock file. the lock is needed for pure db
                            transactions only; this makes one second a long time
        environment_clear:  clear the cache when the environment key is not yet present in the store
        connect_clear:      clear the cache upon every connection
        """
        if identifier: self.identifier = identifier
        if operation: self.operation = operation
        self.hierarchy = hierarchy

        #add some essentials to the environment: platform, interpreter version,
        #and the signature and source code of the cached operation
        import platform
        globalenv               = platform.architecture(), platform.python_version()
        #NOTE(review): inspect.getargspec is deprecated on python 3 and removed in 3.11
        funcenv                 = inspect.getargspec(self.operation), inspect.getsource(self.operation)
        #from here on the instance attribute shadows the environment() method
        self.environment        = globalenv, funcenv, (environment if environment else self.environment())
        estr, ehash             = process_key(self.environment)

        self.validate           = validate
        self.lock_timeout       = lock_timeout
        self.deferred_timeout   = deferred_timeout

        self.filename           = os.path.join(cachepath, self.identifier)
        self.shelve             = Shelve(self.filename, autocommit = True)
        self.lock               = threading.Lock()
        self.lock_file          = lockfile.MkdirLockFile(self.filename, timeout = lock_timeout)

        with self.lock, self.lock_file:
            if connect_clear:
                #this isnt right; we are now invalidating the precomputed envrowid of other processes...
                self.shelve.clear()           #need write lock here

            #write environment key to database and obtain its unique rowid
            try:
                self.envrowid = self.shelve.getrowid(self.environment, estr, ehash)
            except Exception:
                #only a failed lookup may end up here; a bare except would also take this
                #branch (and possibly clear the cache) on KeyboardInterrupt or SystemExit
                #connect to the db with a novel environment; probably wont change back again
                if environment_clear:
                    self.shelve.clear()         #need write lock here
                self.shelve.setitem(self.environment, None, estr, ehash)
                self.envrowid = self.shelve.getrowid(self.environment, estr, ehash)
예제 #2
0
class AbstractCache(object):
    """
    abstract base class of a cache object which gracefully handles large arbitrary key objects

    an instance is callable; calling it looks up the arguments in the store and only invokes
    self.operation when no finished value is found. the defaults of operation() and
    environment() raise NotImplementedError; supply them by overriding, or pass
    operation / environment to the constructor
    """
    def __init__(
            self,
            identifier          = None,
            environment         = None,
            operation           = None,
            hierarchy           = None,
            validate            = False,
            deferred_timeout    = 30,
            lock_timeout        = 1,
            environment_clear   = True,
            connect_clear       = False
            ):
        """
        open the cache store and register the environment key of this cache

        identifier:         name under which the cache file is stored (joined onto cachepath).
                            when falsy, self.identifier is left untouched and must already exist;
                            the original note says it defaults to modulename.objname, but that
                            default is not set in this method
        environment:        object describing the dependencies on global state of the cached operation.
                            when falsy, the result of self.environment() is used instead
        operation:          function to be cached; when falsy, self.operation is left untouched.
                            note that the order of arguments is significant for key reuse
        hierarchy:          key hierarchy; a sequence of levels, each level a sequence of
                            argument positions (ints) and/or keyword names
        validate:           validation mode. if enabled, all cache retrievals are checked
                            against a recomputed function call
        deferred_timeout:   time to wait before a deferred object is considered obsolete.
                            compilation may take a long time; that said, it may also crash your process...
        lock_timeout:       timeout handed to the lock file. the lock is needed for pure db
                            transactions only; this makes one second a long time
        environment_clear:  clear the cache when the environment key is not yet present in the store
        connect_clear:      clear the cache upon every connection
        """
        if identifier: self.identifier = identifier
        if operation: self.operation = operation
        self.hierarchy = hierarchy

        #add some essentials to the environment: platform, interpreter version,
        #and the signature and source code of the cached operation
        import platform
        globalenv               = platform.architecture(), platform.python_version()
        #NOTE(review): inspect.getargspec is deprecated on python 3 and removed in 3.11
        funcenv                 = inspect.getargspec(self.operation), inspect.getsource(self.operation)
        #from here on the instance attribute shadows the environment() method
        self.environment        = globalenv, funcenv, (environment if environment else self.environment())
        estr, ehash             = process_key(self.environment)

        self.validate           = validate
        self.lock_timeout       = lock_timeout
        self.deferred_timeout   = deferred_timeout

        self.filename           = os.path.join(cachepath, self.identifier)
        self.shelve             = Shelve(self.filename, autocommit = True)
        self.lock               = threading.Lock()
        self.lock_file          = lockfile.MkdirLockFile(self.filename, timeout = lock_timeout)

        with self.lock, self.lock_file:
            if connect_clear:
                #this isnt right; we are now invalidating the precomputed envrowid of other processes...
                self.shelve.clear()           #need write lock here

            #write environment key to database and obtain its unique rowid
            try:
                self.envrowid = self.shelve.getrowid(self.environment, estr, ehash)
            except Exception:
                #only a failed lookup may end up here; a bare except would also take this
                #branch (and possibly clear the cache) on KeyboardInterrupt or SystemExit
                #connect to the db with a novel environment; probably wont change back again
                if environment_clear:
                    self.shelve.clear()         #need write lock here
                self.shelve.setitem(self.environment, None, estr, ehash)
                self.envrowid = self.shelve.getrowid(self.environment, estr, ehash)




    def __call__(self, *args, **kwargs):
        """
        look up a hierachical key object
        fill in the missing parts, and perform the computation at the leaf if so required

        returns the stored value for these arguments if a finished one is found; otherwise
        a Deferred placeholder is stored at the leaf, self.operation(*args, **kwargs) is
        computed without holding the file lock, and its result is stored and returned.
        while an unexpired Deferred is found at the leaf, the lookup is retried after a short sleep.

        in validate mode every hit is compared against a recomputed value under
        as_deterministic; on a mismatch a report is printed and quit() is called
        """
        if self.hierarchy:
            #apply the structure in hierarchy to the arguments;
            #positional arguments are addressed by index, keyword arguments by name
            fkey = kwargs.copy()
            fkey.update(enumerate(args))
            hkey = [[fkey.pop(a) for a in level] for level in self.hierarchy]
            if fkey: hkey.append(fkey)  #any arguments not part of the hierarchy spec are placed at the end
        else:
            hkey = [args + ((kwargs,) if kwargs else ())]   #put all args in a single key
        #preprocess subkeys. this minimizes time spent in locked state
        #an actual list is required here; hkey is sliced and indexed below
        hkey = [as_deterministic(subkey) for subkey in hkey]

        with self.lock:     #fairly stupid thread locking. dont use threads; how about that?
            while True:
                try:
                    with self.lock_file:
                        #hierarchical key lookup; first key is prebound environment key
                        previouskey = Partial(self.envrowid)
                        for ikey, subkey in enumerate(hkey[:-1]):
                            partialkey = previouskey, subkey
                            rowid = self.shelve.getrowid(partialkey, *process_key(partialkey))  #read lock?
                            previouskey = Partial(rowid)
                        #leaf iteration
                        ikey = len(hkey)-1
                        leafkey = previouskey, hkey[-1]
                        value = self.shelve[leafkey]                                            #read lock?

                    if isinstance(value, Deferred):
                        if value.expired(self.deferred_timeout):
                            #treat an expired placeholder as a miss; handled by the write branch below
                            raise Exception()
                        sleep(0.01)
                    else:
                        if self.validate:
                            #check if recomputed value is identical under deterministic serialization
                            newvalue = self.operation(*args, **kwargs)

                            #note; new may differ from old in case aliasing in an ndarray was erased
                            #by original serialization. is this an error?
                            #id say so; depending on wether we have a cache hit, downstream code may react diffently
                            #perhaps its best to use custom serializating for values too
                            try:
                                #explicit comparison rather than assert, which is stripped under -O
                                identical = bool(as_deterministic(value) == as_deterministic(newvalue))
                            except Exception:
                                #a comparison that cannot be carried out counts as a mismatch
                                identical = False

                            if not identical:
                                print('Cache returned invalid value!')
                                print('arguments:')
                                print(args)
                                print(kwargs)
                                print('cached value')
                                print(value)
                                print('recomputed value')
                                print(newvalue)
                                #raises SystemExit, which the handler below deliberately lets through
                                quit()

                        #yes! hitting this return is what we are doing this all for!
                        return value

                except Exception:
                    #a failed lookup or an expired deferred. SystemExit and KeyboardInterrupt
                    #are not Exception subclasses, so quit() above and ctrl-c are not swallowed here
                    #lock for the writing branch. multiprocess does not benefit here, but so be it.
                    #worst case we make multiple insertions into db, but this should do no harm for behavior

                    if self.lock_file.is_locked():
                        #if lock not available, better to go back to waiting for a deferred to appear
                        sleep(0.001)
                    else:
                        #NOTE(review): ikey and previouskey hold whatever the lookup above last assigned;
                        #if the lock file could not be acquired before the lookup started, they may be
                        #unset or stale at this point -- confirm
                        with self.lock_file:
                            #hierarchical key insertion, starting at the level where the lookup stopped
                            for subkey in hkey[ikey:-1]:
                                partialkey = previouskey, subkey
                                kstr, khash = process_key(partialkey)
                                self.shelve.setitem(partialkey, None, kstr, khash)      #write lock
                                rowid = self.shelve.getrowid(partialkey, kstr, khash)   #read lock
                                previouskey = Partial(rowid)
                            #insert leaf node; the placeholder marks the value as being computed
                            leafkey = previouskey, hkey[-1]
                            kstr, khash = process_key(leafkey)
                            self.shelve.setitem(leafkey, Deferred(), kstr, khash)       #write lock

                        #dont need lock while doing expensive things
                        value = self.operation(*args, **kwargs)

                        with self.lock_file:
                            #replace the placeholder by the computed value
                            self.shelve.setitem(leafkey, value     , kstr, khash)       #write lock
                            return value




    def operation(self, input):
        """
        implements the cached operation; to be invoked upon a cache miss
        input is a picklable python object
        the returned output should be a picklable python object as well

        note that __call__ forwards its own positional and keyword arguments to this method
        this default raises NotImplementedError
        """
        raise NotImplementedError()
    def environment(self):
        """
        returns a picklable object describing the environment of the cached operation,
        or a description of the state of your computer which may influence the relation
        between the input and output of the cached operation

        only called by __init__ when no environment argument is given
        this default raises NotImplementedError
        """
        raise NotImplementedError()