def read():
    db = DataBase('not_bsd.dat')
    a = db['a']
    b = db['b']
    db.close()
    print('a', a)
    print('b', b)
def __init__(self, filename, flag, key_type='str', dump_method='json',
             cached=True, writeback=False):
    DbfilenameShelf.__init__(self, filename, flag, -1, writeback)
    # Caching is only enabled when the shelf is opened read-only.
    cached = (flag == 'r') and cached
    self._setup_methods(cached, key_type, dump_method)
def load_workers(self, filename):
    """Return a list of workers read from disk as [(id, started, assetid), ...]."""
    shelf = DbfilenameShelf(filename)
    try:
        workers = shelf['workers']
    except KeyError:
        workers = []
    shelf.close()
    return workers
def dump_workers(self, filename, workers):
    """Write a sequence of workers to disk, e.g. [(id, started, assetid), ...],
    and return the sequence."""
    seq = []
    for w in workers:
        seq.append((w['worker'], w['started'], w['args'][1]))
    shelf = DbfilenameShelf(filename)
    shelf['workers'] = seq
    shelf.close()
    return seq
def __init__(self, fname=None, tmpdir=None, persistent=False):
    if fname is None:
        # Get tmp directory
        self._tmpdir = TMPDIR if tmpdir is None else tmpdir
        # Create tmp file for the db
        fd, self.filename = mkstemp('', PREFIX, dir=self._tmpdir)
        os.close(fd)
        os.unlink(self.filename)
        # Create a shelf in the db
        DbfilenameShelf.__init__(self, self.filename, flag='n', protocol=-1)
        if not persistent:
            register_tmp_file(self.filename)
    else:
        self._tmpdir = os.path.dirname(fname)
        self.filename = fname
        DbfilenameShelf.__init__(self, self.filename, flag='w', protocol=-1)
def get_common_words(text_storage: shelve.DbfilenameShelf,
                     amount_of_common_words: int) -> [(str, int)]:
    stop_words = set()  # To hold the set of stop words
    word_frequencies = defaultdict(int)  # To hold the number of occurrences of non-stop words
    # Enchant setup adapted from the tutorial: https://pyenchant.github.io/pyenchant/tutorial.html
    dictionary = enchant.Dict("en_US")  # To validate words

    # Open the stop_words file and read in the set of stop words
    with open("stop_words.txt", "r") as file_input_stream:
        for next_word in file_input_stream:
            stop_words.add(next_word.rstrip())

    # Loop through the text for each webpage
    for next_webpage_text in text_storage.values():
        # Split the webpage text on whitespace, dashes, and hyphens
        for next_word in re.split(r"[\s\-–]", next_webpage_text):
            # Remove special characters from the word (if any)
            next_word = re.sub(r"[.,?:!;()\[\]{}\"]", "", next_word)
            # If the next word contains only alphabetical characters (and apostrophes),
            # is a recognizable English word, and is not a stop word, increment its frequency
            if (re.match(r"^[a-zA-Z']+$", next_word) is not None) and \
                    dictionary.check(next_word) and \
                    (next_word.lower() not in stop_words):
                word_frequencies[next_word.lower()] += 1

    # Sort the words by frequency in descending order and return the most common ones
    words_in_descending_frequency = \
        [(next_word, frequency) for next_word, frequency in
         sorted(word_frequencies.items(), key=lambda x: -x[1])]
    return words_in_descending_frequency[:amount_of_common_words]
def loadfile(
    site: site.Site, src: ContentSrc, bin: bool, filecache: shelve.DbfilenameShelf
) -> List[Tuple[ContentSrc, Optional[bytes]]]:
    curstat = src.stat()
    key = f"{src.package}_::::_{src.srcpath}"
    stat, bodies = filecache.get(key, (None, None))
    if stat:
        if stat == curstat:
            return cast(List[Tuple[ContentSrc, Optional[bytes]]], bodies)

    if not bin:
        assert src.srcpath
        ext = os.path.splitext(src.srcpath)[1]
        loader = FILELOADERS.get(ext, binloader)
    else:
        loader = binloader

    ret: List[Tuple[ContentSrc, Optional[bytes]]] = []
    for contentsrc, body in loader(site, src):
        assert contentsrc.metadata["loader"]
        if isinstance(body, bytes):
            ret.append((contentsrc, body))
        elif isinstance(body, str):
            ret.append((contentsrc, body.encode("utf-8")))
        else:
            ret.append((contentsrc, None))

    filecache[key] = curstat, ret
    return ret
def main():
    global TM
    global LABELMAP
    global CTMAP
    global GENDER_TO_PRONOUN
    global TOKEN_TO_GENDER
    cfg = CONFIG[args.config]
    catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    TM = catpeople['__TOKEN_MAPPER__']
    TM.finalize()
    LABELMAP = util_catpeople.get_labelmap()
    CTMAP = util_catpeople.get_coarse_tagmap()
    GENDER_TO_PRONOUN = get_gender_to_pronoun(TM)
    TOKEN_TO_GENDER = get_token_to_gender(TM)
    if args.print_to_conll:
        # Print CatPeople in CoNLL format
        partial_print_to_conll = functools.partial(print_to_conll, catpeople=catpeople)
        n_jobs = 4
        Parallel(n_jobs=n_jobs)(
            delayed(partial_print_to_conll)(out_fn=out_fn, urls=urls)
            for (out_fn, urls) in itertools.izip(
                (args.out_fn + str(i) for i in range(n_jobs)),
                split(catpeople['__URL_LIST__'], n_jobs)))
        return
    else:
        name = cfg._name
        if name.startswith(UNIGRAM):
            return doc_to_unigrams(cfg, catpeople)
            # doc_to_unigrams
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            # --> get_ngrams_from_catpeople_entity
            # --> yield_ngrams
            # --> catpeople_sentence_iterator
        elif name.startswith(BIGRAM):
            return doc_to_bigrams(cfg, catpeople)
            # doc_to_bigrams
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            # --> get_width_for_bigrams
            # --> entity_list_to_ngram_csr_mat(n=1, width=width)
        elif name.startswith(UNIVEC):
            return doc_to_univec(cfg, catpeople)
            # doc_to_univec
            # --> save_vec_file
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
        elif name.startswith(BIVEC):
            return doc_to_bivec(cfg)
        elif name.startswith(DSCTOK) or name.startswith(DSCSUF):
            return doc_to_dscfeat(cfg, catpeople)
            # --> entity_list_to_dscfeat_csr_mat
            # --> get_dscfeat_from_catpeople_entity
            # --> catpeople_sentence_iterator
            # --> yield_dsctok
        elif name.startswith(DSCTOKVEC):
            return doc_to_dsctokvec(cfg)
        elif name.startswith(UNISUF):
            return doc_to_unisuf(cfg, catpeople)
        else:
            raise NotImplementedError(name)
def __init__(self, name, logger):
    filename = os.environ['HOME'] + '/.lox/.' + name + '.cache'
    DbfilenameShelf.__init__(self, filename, protocol=2, writeback=True)
    api = LoxApi(name)
    api_version = api.version()
    config_dir = config.settings[name]['local_dir']
    try:
        my_dir = self.get('local_dir', None)
        assert config_dir == my_dir
        my_version = self.get('version', None)
        assert api_version == my_version
    except AssertionError:
        # The cache is considered unsafe, so re-initialize it.
        logger.warn("Initializing cache")
        self.clear()
        self[u'local_dir'] = config_dir
        self[u'version'] = api_version
def update_shelf():
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1)
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    n_doc = 10000
    with rasengan.tictoc('Extracting Contexts'):
        df_obj = TextualClueObject(E[:n_doc], url_mention, TM)
        df = defaultdict(int)
        for features in df_obj.features.itervalues():
            for f in features:
                df[f] += 1
        for f in df.keys():
            df[f] = df[f] / float(n_doc)
        url_mention['__DF__'] = dict(df)
    url_mention.close()
    return
def __init__(self, args_sequence: Sequence, as_input_cache=False):
    Cache.__init__(self, args_sequence)
    try:
        if as_input_cache:
            # noinspection PyUnresolvedReferences
            DbfilenameShelf.__init__(self, filename=self.filename, flag='r')
        else:
            # noinspection PyUnresolvedReferences
            DbfilenameShelf.__init__(
                self,
                filename=self.filename,
                flag=self.FILE_CACHE_MODE[self.mode],
                writeback=self.in_memory_cache)
    except IOError as e:
        self._critical('Error while opening cache', ExitCode.FILE_ERROR, e)
    # noinspection PyUnresolvedReferences
    self._debug(f' Initialized', header=f'Cache "{self.filename}"')
def get_friends(cache: DbfilenameShelf = None, name: str = SHLOMO):
    if cache is not None:
        friends = cache.get(name)
        if friends is not None:
            return cache[name]
    response = get_first_request(name)
    first_page_singers = response[0]
    parallel_searcher = response[1]
    pages_range = response[2]
    if not pages_range:
        # only 1 page
        friends = first_page_singers
    else:
        with TPool(INNER_MAX_WORKERS) as pool:
            next_pages_singers = pool.map(parallel_searcher, pages_range)
        friends = first_page_singers.union(*next_pages_singers)
    if cache is not None:
        cache[name] = friends
        cache.sync()
    return friends
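# A hypothetical caller sketch (not from the original source): get_friends expects an
# already-open shelf, so one way to supply it is shelve.open used as a context manager.
# The "friends_cache" filename and the wrapper name are assumptions for illustration only.
import shelve

def fetch_friends_with_cache(name: str = SHLOMO):
    # Open (or create) the on-disk cache, run the lookup, and close it automatically.
    with shelve.open("friends_cache") as cache:
        return get_friends(cache, name)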
def setup():
    '''Load the catpeople data.'''
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    DF = url_mention['__DF__']
    cat_folds = pkl.load(open(args.fold_fn))
    cat2url = util_catpeople.load_cat2url(args.cat2url_fn)
    performance_aggregator = Performance_Aggregator(args=args)
    return (url_mention, TM, E, cat_folds, cat2url, performance_aggregator, DF)
def __init__(self, datacfg, ppcfg, expcfg):
    # Init Part 0
    self.datacfg = datacfg
    self.ppcfg = ppcfg
    self.expcfg = expcfg
    with rasengan.tictoc('Init Part 1 : The Datacfg'):
        self.cp = DbfilenameShelf(
            r'%s/%s' % (uc.get_pfx(), self.datacfg.cp_fn),
            protocol=-1,
            flag='r')
        self.url_list = self.cp['__URL_LIST__']
        self.TM = self.cp['__TOKEN_MAPPER__']
        # self.TM.final must be patched to work with older
        # versions of TokenMapper that are in the pickle.
        if not hasattr(self.TM, 'final'):
            self.TM.final = False
        if self.is_malignull():
            self.TM([self.expcfg.NULL_KEY])
        self.bos_idx = self.TM.finalize()
        self.pa = Aggregator(
            datacfg=datacfg,
            ppcfg=ppcfg,
            expcfg=expcfg,
            url_list=self.url_list,
            TM=self.TM)
        self.cat_folds = pkl.load(uc.proj_open(self.datacfg.fold_fn))
        self.cat2url = uc.load_cat2url(uc.proj_open(self.datacfg.cat2url_fn))
        self.url_to_idx = dict((b, a) for a, b in enumerate(self.url_list))
        self.scratch = {}
        pass
    with rasengan.tictoc('Init Part 2 : The PP CFG'):
        print 'Reading', 'catpeople_pp_%d' % args.ppcfg
        self.smat = io.mmread(uc.proj_open('catpeople_pp_%d' % args.ppcfg))
        assert scipy.sparse.isspmatrix_coo(self.smat)
        if self.pp_prefix_is([UNIVEC, BIVEC, MALIGNER, DSCTOKVEC]):
            self.vectors = np.load(uc.proj_open('catpeople_pp_%d.vec' % args.ppcfg))
            pass
        if self.is_malignull():
            self.NULL_VEC = np.zeros((1, self.vectors.shape[1]))
        if self.exp_prefix_is([NBKERNEL, KERMACH, MALIGNER]):
            assert self.pp_prefix_is([UNIVEC, BIVEC, DSCTOKVEC])
        if self.expcfg.rm_fn_word:
            # Internally manipulates smat
            self.remove_fn_word()
        if self.expcfg.weight_method.endswith('/df'):
            self.populate_idf()
    return
def setUpClass(cls):
    super(TestEntityDescriptors, cls).setUpClass()
    global TM
    global LABELMAP
    global CTMAP
    cls.cpfn = (util_catpeople.get_pfx()
                + '/catpeople_clean_segmented_context.shelf')
    cls.parsefn = (util_catpeople.get_pfx() + '/catpeople.parse.pkl')
    cls.catpeople = DbfilenameShelf(cls.cpfn, protocol=-1, flag='r')
    TM = cls.catpeople['__TOKEN_MAPPER__']
    TM.finalize()
    LABELMAP = util_catpeople.get_labelmap()
    CTMAP = util_catpeople.get_coarse_tagmap()
    # Inject global variables into the module's namespace.
    catpeople_preprocessor.TM = TM
    catpeople_preprocessor.LABELMAP = LABELMAP
    catpeople_preprocessor.CTMAP = CTMAP
    catpeople_preprocessor.GENDER_TO_PRONOUN = catpeople_preprocessor.get_gender_to_pronoun(TM)
    catpeople_preprocessor.TOKEN_TO_GENDER = catpeople_preprocessor.get_token_to_gender(TM)
    catpeople_preprocessor.populate_dsctok_globals()
    cls.testid = 1
    print 'Calling setup'
def get_longest_page(shelf: shelve.DbfilenameShelf) -> (str, int):
    # To hold the longest page and its number of words
    longest_page = ""
    longest_page_count = 0

    # Get all urls from the shelve and loop through them
    keys = shelf.keys()
    for key in keys:
        # Get the words for the current url and count them
        words = re.split(r"[\s\-–]", shelf[key])
        word_count = len(words)
        # If the word count for this url exceeds the previous longest page,
        # update the longest page and its word count
        if word_count > longest_page_count:
            longest_page_count = word_count
            longest_page = key

    # Return the longest page and its word count as a 2-tuple
    return longest_page, longest_page_count
def _get_shelf_data(path):
    with closing(DbfilenameShelf(path, flag='r')) as shelf:
        return dict(shelf)
def get(self, name, default=None):
    key = name.encode('utf8')
    if DbfilenameShelf.has_key(self, key):
        return DbfilenameShelf.__getitem__(self, key)
    else:
        return default
def __setitem__(self, name, value):
    key = name.encode('utf8')
    DbfilenameShelf.__setitem__(self, key, value)
    self.sync()
if len(form_data) != 0:
    try:
        cookie = SimpleCookie()
        http_cookie_header = environ.get('HTTP_COOKIE')
        if not http_cookie_header:
            sid = sha256(repr(time()).encode()).hexdigest()
            cookie['reset'] = sid
        else:
            cookie.load(http_cookie_header)
            if 'reset' not in cookie:
                sid = sha256(repr(time()).encode()).hexdigest()
                cookie['reset'] = sid
            else:
                sid = cookie['reset'].value
        session_store = DbfilenameShelf('../sessions/reset_' + sid, writeback=True)
        if session_store.get('code'):
            code = escape(form_data.getfirst('code', '').strip())
            if code:
                form = """<form action="forgot.py" method="post">
                    <label for="code">Code: </label>
                    <input type="number" name="code" id="code" min="0" max="99999" value="%s" required />
                    <label for="pass1">Enter new password: </label>
                    <input type="password" name="pass1" id="pass1" required />
                    <label for="pass2">Reenter password: </label>
                    <input type="password" name="pass2" id="pass2" required />
                    <input type="submit" />
                </form>""" % code
                if session_store.get('code') == code:
                    pass1 = escape(form_data.getfirst('pass1', '').strip())
                    pass2 = escape(form_data.getfirst('pass2', '').strip())
import os
from pathlib import Path
import shelve
from shelve import Shelf, DbfilenameShelf

data = {'a': 0, 'b': 1, 'c': 'c-string'}
filename = str(Path.home() / "shelf")
# os.remove(filename + ".db")

db = DbfilenameShelf(filename, flag='c', protocol=3, writeback=True)
# db.update(data)
# db.sync()
print(f"shelf: {dict(db)}")
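# A minimal follow-up sketch (an assumption, not part of the original script): write the
# sample data into the shelf and read it back. With writeback=True, mutated entries live
# in the in-memory cache until sync() or close() flushes them to the backing file.
db.update(data)
db.sync()
print(f"after update: {dict(db)}")
db.close()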
def __delitem__(self, key):
    DbfilenameShelf.__delitem__(self, key)
    self.sync_now()
def __init__(self, filename):
    DbfilenameShelf.__init__(self, filename)
    self.filename = filename
def __init__(self, *args, **kwargs):
    DbfilenameShelf.__init__(self, *args, **kwargs)
def __init__(self, filename):
    self.filename = filename
    DbfilenameShelf.__init__(self, filename, flag='r', protocol=-1)
            </form>
            <p><a href="accounts/forgot.py">Forgot password</a></p>""" % username

sha256_password = sha256(password.encode()).hexdigest()
try:
    connection = db.connect('localhost', 'cf26', 'pecah', 'cs6503_cs1106_cf26')
    cursor = connection.cursor(db.cursors.DictCursor)
    cursor.execute("""SELECT * FROM users
                      WHERE username = %s AND password = %s""",
                   (username, sha256_password))
    if cursor.rowcount == 0:
        message = '<p><strong>Error! Incorrect user name or password</strong></p>'
    else:
        cookie = SimpleCookie()
        sid = sha256(repr(time()).encode()).hexdigest()
        cookie['sid'] = sid
        session_store = DbfilenameShelf('sessions/sess_' + sid, writeback=True)
        session_store['authenticated'] = True
        session_store['username'] = username
        session_store.close()
        result = """
            <h2>Welcome back %s!</h2>
            <ul>
                <li><a href="game.py">Play Now</a></li>
                <li><a href="account.py">Your Scores & Account Management</a></li>
                <li><a href="logout.py">Logout</a></li>
            </ul>""" % username
        print(cookie)
    cursor.close()
    connection.close()
except (db.Error, IOError):
    message = '<p>Sorry! We are experiencing problems at the moment. Please try again later.</p>'
import cPickle as pkl
from rasengan import groupby

PFX = get_pfx()
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--in_shelf',
                        default=PFX + '/catpeople_clean_segmented_context.shelf',
                        type=str)
arg_parser.add_argument('--parsefn',
                        default=PFX + '/catpeople.parse.gz',
                        type=str)
arg_parser.add_argument('--parse_pkl',
                        default=PFX + '/catpeople.parse.pkl',
                        type=str)
args = arg_parser.parse_args()
catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
TM = catpeople['__TOKEN_MAPPER__']
labelmap = get_labelmap()
ctmap = get_coarse_tagmap()
ftmap = get_fine_tagmap()
f = gzip.GzipFile(fileobj=proj_open(args.parsefn))


def get(e):
    e = e.split('\t')
    return [e[1], int(e[6]), e[7], e[3], e[4]]


PARSES = {}
for parse in groupby(f):
    token, parent, labels, ctags, ftags = zip(*[get(r) for r in parse])
def sync(self):
    # noinspection PyUnresolvedReferences
    self._debug(f'Synchronizing...', header=f'Cache "{self.filename}"')
    DbfilenameShelf.sync(self)
    # noinspection PyUnresolvedReferences
    self._debug('Synchronized', header=f'Cache "{self.filename}"')
def __init__(self, filename):
    DbfilenameShelf.__init__(self, filename)
def __delitem__(self, name):
    key = name.encode('utf8')
    if DbfilenameShelf.has_key(self, key):
        DbfilenameShelf.__delitem__(self, key)
        self.sync()
result = """
    <section>
        <p>You are not logged in.</p>
        <p>
            <a href="login.py">Login</a> |
            <a href="accounts/register.py">Register</a>
        </p>
    </section>"""
try:
    cookie = SimpleCookie()
    http_cookie_header = environ.get('HTTP_COOKIE')
    if http_cookie_header:
        cookie.load(http_cookie_header)
        if 'sid' in cookie:
            sid = cookie['sid'].value
            session_store = DbfilenameShelf('sessions/sess_' + sid, writeback=False)
            if session_store.get('authenticated'):
                message = ''
                form_data = FieldStorage()
                username = session_store.get('username')
                form = """<p>
                    Hey, %s. Sorry to see you go.
                </p>
                <p>
                    <strong>Warning! This action is permanent.</strong>
                    All of your scores will be lost.
                </p>
                <form action="delete_account.py" method="post">
                    <label for="pass1">Enter password: </label>
                    <input type="password" name="pass1" id="pass1" placeholder="Enter password" required />
                    <label for="pass2">Reenter password: </label>
                    <input type="password" name="pass2" id="pass2" placeholder="Reenter password" required />
def __getitem__(self, name):
    key = name.encode('utf8')
    value = DbfilenameShelf.__getitem__(self, key)
    return value
def _open(self):
    DbfilenameShelf.__init__(
        self,
        os.path.join(self.get_database_path(self.config["path"]), "shelfdb.db"),
    )
def get_eye_position(shelf: shelve.DbfilenameShelf):
    return shelf.get(EYE_POSITION, 0)
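# A hedged companion sketch (not in the original source): persist an updated eye position
# back to the shelf. EYE_POSITION is assumed to be the same key constant used above, and
# the explicit sync() is optional if the shelf is later closed normally.
def set_eye_position(shelf: shelve.DbfilenameShelf, position: int) -> None:
    shelf[EYE_POSITION] = position
    shelf.sync()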
def __setitem__(self, key, value):
    DbfilenameShelf.__setitem__(self, key, value)
    self.sync_now()
def __init__(self, filename, protocol=2, writeback=True):
    DbfilenameShelf.__init__(self, filename, protocol=protocol, writeback=writeback)
def sync_now(self):
    filename = self.filename
    self.close()
    DbfilenameShelf.__init__(self, filename)
def __setitem__(self, key, value):
    if isinstance(key, int):
        key = str(key)
    DbfilenameShelf.__setitem__(self, key, value)
    self.sync()
def _reset(self):
    # noinspection PyUnresolvedReferences
    DbfilenameShelf.__init__(self, filename=self.filename, flag='n',
                             writeback=self.in_memory_cache)
def __getitem__(self, key):
    if isinstance(key, int):
        key = str(key)
    return DbfilenameShelf.__getitem__(self, key)
import argparse
import sys, os

arg_parser = argparse.ArgumentParser(
    description='Remove junk from catpeople wikimic')
arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
arg_parser.add_argument('--MAX_CHAR_IN_SENT', default=1000, type=int)
PDIR = ('/export/b15/prastog3' if os.uname()[1] == 'b15' else 'data/')
arg_parser.add_argument('--in_shelf',
                        default='%s/catpeople_wikilink_mentions.shelf' % PDIR,
                        type=str)
arg_parser.add_argument('--out_shelf',
                        default='%s/catpeople_clean_segmented_context.shelf' % PDIR,
                        type=str)
args = arg_parser.parse_args()
in_shelf = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
out_shelf = DbfilenameShelf(args.out_shelf, protocol=-1)
urls = in_shelf['__URL_LIST__']
PAT_TOKENIZER = get_tokenizer()
TOKEN_MAPPER = TokenMapper()
MAX_CHAR_IN_SENT = args.MAX_CHAR_IN_SENT

import re
MIDDLE_NAME_REGEX = re.compile(r'[A-Z][^ ]*? [A-Z]\. [A-Z]')

for url_idx, url in enumerate(urls):
    print >> sys.stderr, ('Done: %.3f \r' % (float(url_idx) * 100 / len(urls))),
    mentions = in_shelf[url]
    out_mentions = []
    for mention in mentions:
def __delitem__(self, key):
    if isinstance(key, int):
        key = str(key)
    DbfilenameShelf.__delitem__(self, key)
    self.sync()