def test_when_file_does_not_exist(self):
    """Opening with 'n' always starts from an empty database."""
    path = os.path.join(self.tempdir, 'foo.db')
    fresh = semidbm.open(path, 'n')
    fresh['foo'] = 'bar'
    self.assertEqual(fresh['foo'], b'bar')
    fresh.close()
    # Re-opening with 'n' must blank out the existing contents.
    blanked = semidbm.open(path, 'n')
    self.assertEqual(list(blanked.keys()), [])
    blanked.close()
def test_checksum_failure(self):
    """Corrupting the stored checksum raises DBMChecksumError, but only
    when verify_checksums is enabled."""
    db = semidbm.open(self.dbdir, 'c')
    db['key'] = 'value'
    db.close()
    # Change the first digit of the checksum data.
    data_file = self.open_data_file(mode='r')
    # 3:key15:<checksum>value
    # First checksum digit is 9 bytes into the file.
    beginning = data_file.read()
    # Wrap around modulo 10: the original `int(...) + 1` writes the
    # two-character string "10" when the digit is '9', shifting every
    # following byte and corrupting the record framing instead of
    # merely invalidating the checksum.
    new_digit = (int(beginning[8]) + 1) % 10
    data_file.close()
    data_file = self.open_data_file(mode='w')
    data_file.write(beginning[:8])
    data_file.write(str(new_digit))
    data_file.write(beginning[9:])
    data_file.close()
    db = self.open_db_file(verify_checksums=True)
    with self.assertRaises(semidbm.DBMChecksumError):
        db['key']
    # Close handles so the test leaves no open files behind
    # (consistent with the byte-oriented variant of this test).
    db.close()
    # If checksums are not enabled, an exception is not raised.
    db = self.open_db_file(verify_checksums=False)
    try:
        db['key']
    except semidbm.DBMChecksumError:
        self.fail("Checksums were suppose to be disabled.")
    finally:
        db.close()
def __init__(self, csm):
    """Hold a reference to the CSM and open the account-bridge database."""
    self.csm = csm
    # Resolve the configured database location, defaulting to
    # 'account-bridge', then open/create it with semidbm.
    bridge_file = simbase.config.GetString('account-bridge-filename', 'account-bridge')
    self.dbm = semidbm.open(bridge_file, 'c')
def test_load_empty_db(self):
    """A freshly created database has no keys."""
    semidbm.open(self.dbdir, 'c').close()
    empty_db = self.open_db_file()
    contents = empty_db.keys()
    empty_db.close()
    self.assertEqual(contents, [])
def test_key_does_not_exist(self):
    """Reading a missing key from a read-only handle raises KeyError."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer.close()
    read_only = self.open_db_file()
    self.assertRaises(KeyError, read_only.__getitem__, 'bar')
    read_only.close()
def test_when_files_exist(self):
    """'w' mode opens an existing database and sees its data."""
    creator = self.open_db_file()
    creator['foo'] = 'bar'
    creator.close()
    db_write_mode = semidbm.open(self.dbdir, 'w')
    self.assertEqual(db_write_mode['foo'], b'bar')
    db_write_mode.close()
def test_open_read_multiple_times(self):
    """Repeated read-only opens of the same database all see the data."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer.close()
    # A quick open/close cycle must not disturb later readers.
    self.open_db_file().close()
    read_only = self.open_db_file()
    self.assertEqual(read_only['foo'], b'bar')
    read_only.close()
def test_open_read_multiple_times(self):
    """Repeated read-only opens of the same database all see the data."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer.close()
    # A quick open/close cycle must not disturb later readers.
    self.open_db_file().close()
    read_only = self.open_db_file()
    self.assertEqual(read_only['foo'], b'bar')
    read_only.close()
def create(self):
    """Create a new on-disk database.

    @raise anydbm.error: If there's a problem creating the database.
    """
    if not self.filename:
        # No filename configured: fall back to an in-memory dict.
        self.db = {}
        return
    self.db = anydbm.open(self.filename, "n")  # raises anydbm.error
    # Tag the file so open() can later verify the database type.
    self.db["--Reserved--type"] = self.type
    self.db.sync()
def test_can_read_items(self):
    """Every item written by one handle is readable via byte keys."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer['bar'] = 'baz'
    writer['baz'] = 'foo'
    writer.close()
    reader = self.open_db_file()
    self.assertEqual(reader[b'foo'], b'bar')
    self.assertEqual(reader[b'bar'], b'baz')
    self.assertEqual(reader[b'baz'], b'foo')
    reader.close()
def test_can_read_items(self):
    """Every item written by one handle is readable via byte keys."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer['bar'] = 'baz'
    writer['baz'] = 'foo'
    writer.close()
    reader = self.open_db_file()
    self.assertEqual(reader[b'foo'], b'bar')
    self.assertEqual(reader[b'bar'], b'baz')
    self.assertEqual(reader[b'baz'], b'foo')
    reader.close()
def __init__(self, dbdir, check_frequency=20, max_filesize=MAX_DISK_USAGE):
    """Disk-backed cache with a soft size cap.

    :param dbdir: directory for the backing semidbm database.
    :param check_frequency: number of writes between file-size checks.
    :param max_filesize: soft maximum on-disk size of the cache.
    """
    import semidbm
    self._db = semidbm.open(dbdir, 'c')
    self._max_filesize = max_filesize
    # How frequently we check the file size of the cache.
    # If we check every 20 writes, then at worst case we overshoot
    # the max size by MAX_BODY_SIZE * check_frequency, or
    # about 20MB if we use the default values for everything.
    self._check_frequency = check_frequency
    self._counter = 0
    # When we reach the max disk size, we disable
    # writing data to the cache.
    self._writes_enabled = True
def open(self):
    """Open a pre-existing on-disk database.

    @raise anydbm.error: If there's a problem opening the database.
    @raise ValueError: If the database is not of the right type.
    """
    if not self.filename:
        raise ValueError("Can only open on-disk databases")
    self.db = anydbm.open(self.filename, "w")  # raises anydbm.error
    try:
        stored_type = self.db["--Reserved--type"]
    except KeyError:
        # The type marker written by create() is missing entirely.
        raise ValueError("Not a recognized database")
    if stored_type != self.type:
        raise ValueError("Not a %s database" % self.type)
def test_checksum_failure(self):
    """Flipping a byte of a stored value trips checksum verification."""
    writer = semidbm.open(self.dbdir, 'c')
    writer[b'key'] = b'value'
    writer.close()
    data_file = self.open_data_file(mode='rb')
    contents = data_file.read()
    data_file.close()
    # Changing 'value' to 'Value' should cause a checksum failure.
    data_file = self.open_data_file(mode='wb')
    data_file.write(contents.replace(b'value', b'Value'))
    data_file.close()
    db = self.open_db_file(verify_checksums=True)
    with self.assertRaises(semidbm.DBMChecksumError):
        db['key']
    db.close()
    # With verification off, the corrupted value is returned silently.
    db = self.open_db_file(verify_checksums=False)
    try:
        db['key']
    except semidbm.DBMChecksumError:
        self.fail("Checksums were suppose to be disabled.")
    finally:
        db.close()
def test_checksum_failure(self):
    """Flipping a byte of a stored value trips checksum verification."""
    writer = semidbm.open(self.dbdir, 'c')
    writer[b'key'] = b'value'
    writer.close()
    data_file = self.open_data_file(mode='rb')
    contents = data_file.read()
    data_file.close()
    # Changing 'value' to 'Value' should cause a checksum failure.
    data_file = self.open_data_file(mode='wb')
    data_file.write(contents.replace(b'value', b'Value'))
    data_file.close()
    db = self.open_db_file(verify_checksums=True)
    with self.assertRaises(semidbm.DBMChecksumError):
        db['key']
    db.close()
    # With verification off, the corrupted value is returned silently.
    db = self.open_db_file(verify_checksums=False)
    try:
        db['key']
    except semidbm.DBMChecksumError:
        self.fail("Checksums were suppose to be disabled.")
    finally:
        db.close()
def __init__(self):
    """Lazily open the bundled kanwa dictionary on first instantiation."""
    if self._kanwadict is not None:
        return
    # FIXME: no hardcoded filename
    path = resource_filename(__name__, 'kanwadict3.db')
    self._kanwadict = dbm.open(path, 'r')
def open_db_file(self, **kwargs):
    """Open the test database, verifying checksums unless told otherwise."""
    # Default verify_checksums to on when the caller doesn't specify it.
    kwargs.setdefault('verify_checksums', True)
    return semidbm.open(self.dbdir, 'c', **kwargs)
def reader(self):
    """Switch the store to a read-only handle and return self."""
    self.close()
    handle = dbm.open(self._path, 'r')
    self._file = handle
    return self
def writer(self):
    """Return a new writer. Will always create a new file."""
    self.close()
    # 'n' truncates: any previous contents are discarded.
    handle = dbm.open(self._path, 'n')
    self._file = handle
    return self
def __init__(self, name):
    """Open (or create) the named semidbm database under directory()."""
    super(Database, self).__init__()
    db_path = os.path.join(directory(), name)
    self.db = semidbm.open(db_path, 'c')
def open_db_file(self, **kwargs):
    """Open the test database in read-only mode."""
    db_path = self.dbdir
    return semidbm.open(db_path, 'r', **kwargs)
def reader(self):
    """Switch the store to a read-only handle and return self."""
    self.close()
    handle = dbm.open(self._path, 'r')
    self._file = handle
    return self
def appender(self):
    """Return a new writer, preserving the file if it already exists."""
    self.close()
    # 'c' keeps existing contents, creating the file only when missing.
    handle = dbm.open(self._path, 'c')
    self._file = handle
    return self
def open_db_file(self, **kwargs):
    """Open the test database in read-only mode."""
    db_path = self.dbdir
    return semidbm.open(db_path, 'r', **kwargs)
def __init__(self, dbm_file):
    """Open the backing semidbm database read-only."""
    handle = semidbm.open(dbm_file, 'r')
    self._dbm = handle
def __init__(self, dbm_file):
    """Open the backing semidbm database read-only."""
    handle = semidbm.open(dbm_file, 'r')
    self._dbm = handle
def open(self):
    """Open both category index databases; returns self for chaining."""
    self._subcat_index = semidbm.open(self._subcat_index_file, 'c')
    self._supercat_index = semidbm.open(self._supercat_index_file, 'c')
    return self
def test_unicode_chars(self):
    """Non-ASCII byte keys and values round-trip unchanged."""
    db = semidbm.open(self.dbdir, 'c')
    # 'cafe' with the e-accute, UTF-8 encoded.
    utf8_cafe = b'caf\xc3\xa9'
    db[utf8_cafe] = utf8_cafe
    self.assertEqual(db[utf8_cafe], utf8_cafe)
    db.close()
def __init__(self):
    """Lazily open the bundled kanwa dictionary on first instantiation."""
    if self._kanwadict is not None:
        return
    # FIXME: no hardcoded filename
    path = resource_filename(__name__, 'kanwadict3.db')
    self._kanwadict = dbm.open(path, 'r')
def open_db_file(self, **kwargs):
    """Open the test database, verifying checksums unless told otherwise."""
    # Default verify_checksums to on when the caller doesn't specify it.
    kwargs.setdefault('verify_checksums', True)
    return semidbm.open(self.dbdir, 'c', **kwargs)
def appender(self):
    """Return a new writer, preserving the file if it already exists."""
    self.close()
    # 'c' keeps existing contents, creating the file only when missing.
    handle = dbm.open(self._path, 'c')
    self._file = handle
    return self
def shelve_open_semidbm(filename, flag='c', protocol=None, writeback=False):
    """shelve.open() drop-in that stores data in a semidbm database."""
    import semidbm  # pylint: disable=import-error
    backing = semidbm.open(filename, flag)
    return shelve.Shelf(backing, protocol, writeback)
def writer(self):
    """Return a new writer. Will always create a new file."""
    self.close()
    # 'n' truncates: any previous contents are discarded.
    handle = dbm.open(self._path, 'n')
    self._file = handle
    return self
# This file is the starting file for a rhyming program using NLTK.
import semidbm
import random
import time

from nltk.tokenize import RegexpTokenizer

# Open the rhyming databases.
# words.db maps WORD -> "<rhyme-key> <syllable-count>" (space separated);
# rhymes.db maps <rhyme-key> -> space separated rhyming words.
# NOTE(review): layout inferred from the lookups below — confirm.
syllablesDB = semidbm.open('words.db')
rhymesDB = semidbm.open('rhymes.db')


def rhyme(word, count):
    """Returns a list of all the words that rhyme with 'word' with
    'count' number of syllables.  A count of 0 matches any syllable
    count.  Returns [] for unknown words."""
    # start = time.time() #####
    try:
        wordSyllables = syllablesDB[word.upper()].decode().split()[0]
        # print wordSyllables ###
        # Use a distinct loop variable: the original comprehension
        # shadowed `word`, which clobbers it under Python 2 scoping.
        wordRhymes = [w.decode() for w in rhymesDB[wordSyllables].split()]
        wordRhymes.remove(word.upper())
        # print wordRhymes ###
        # Compare syllable counts numerically: the db stores bytes, so
        # the original `bytes == int` comparison was always False for
        # any nonzero count.
        backlist = [x.lower() for x in wordRhymes
                    if count == 0
                    or int(syllablesDB[x].decode().split()[1]) == count]
    except (KeyError, ValueError):
        # KeyError: word/rhyme-key missing from the databases.
        # ValueError: word absent from its own rhyme list, or a
        # malformed syllable count.  Either way: no rhymes.
        backlist = []
    # print 'rhyme: '+str(time.time() - start) #####
    return backlist


def rhymesWith(word1, word2):
    """Determines if two words rhyme."""
def test_unicode_chars(self):
    """Non-ASCII byte keys and values round-trip unchanged."""
    db = semidbm.open(self.dbdir, 'c')
    # 'cafe' with the e-accute, UTF-8 encoded.
    utf8_cafe = b'caf\xc3\xa9'
    db[utf8_cafe] = utf8_cafe
    self.assertEqual(db[utf8_cafe], utf8_cafe)
    db.close()
def __init__(self, filename, flag='c', protocol=pickle.HIGHEST_PROTOCOL):
    """Open a semidbm-backed store, pickling values with `protocol`."""
    handle = semidbm.open(filename, flag=flag)
    self._my_file = handle
    self._protocol = protocol
def __init__(self, db=None):
    """Wrap an existing db object, or open a semidbm db from a path."""
    if not isinstance(db, str):
        # Already a db-like object (or None); use it as-is.
        self.db = db
    else:
        # A string is treated as a filesystem path to open/create.
        self.db = semidbm.open(db, 'c')
def __init__(self, conf=None):
    """Open (creating as needed) the semidbm database at path `conf`.

    :param conf: filesystem path of the database; required.
    :raises Exception: if no path is given.
    """
    if conf is None:
        raise Exception('Path required.')
    parent = os.path.dirname(conf)
    # Only create the parent directory when the path actually has one:
    # for a bare filename os.path.dirname() returns '' and
    # os.makedirs('') raises FileNotFoundError.
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    self._db = semidbm.open(conf, 'c')
# Build the "<city>-Baseline.db" shelf: businesses, reviews, and a
# term -> [(review_id, weight)] index built from tokenized review text.
# NOTE(review): keys are encoded to bytes and then concatenated with
# str prefixes ("c=" + k) — that only works on Python 2, where str is
# bytes.  Confirm the target interpreter before porting.
categories = collections.defaultdict(list)
review_dict = {}
terms_to_collect = set()
businesses_to_collect = set()
with open('Business.json') as f:
    # business_id -> (name, categories minus the generic 'Restaurants', stars)
    businesses = {b['business_id'].encode('utf-8'): (b['name'].encode('utf-8'), [cat.encode('utf-8') for cat in b['categories'] if cat != 'Restaurants'], b['stars']) for b in json.load(f)}
with open(city + '.json') as reviews:
    reviews_list = json.load(reviews)
for review in reviews_list:
    review_dict[review['review_id'].encode('utf-8')] = (review['text'].encode('utf-8'), review['business_id'].encode('utf-8'), review['stars'], review['date'].encode('utf-8'))
    terms = tokenize_regex.findall(review['text'].lower())
    if terms:
        # Each term in a review contributes weight 1/len(terms).
        review_weight = 1.0 / len(terms)
        for term in terms:
            categories[term.encode('utf-8')].append((review['review_id'].encode('utf-8'), review_weight))
            terms_to_collect.add(term.encode('utf-8'))
            businesses_to_collect.add(review['business_id'].encode('utf-8'))
# Persist everything under namespaced keys: c= term index, r= reviews,
# b= businesses (only those with reviews), t= per-term unit vectors.
s = shelve.Shelf(semidbm.open(city + '-Baseline.db', flag='n'), protocol=pickle.HIGHEST_PROTOCOL)
for k, v in categories.items():
    s["c=" + k] = v
for k, v in review_dict.items():
    s["r=" + k] = v
for b in businesses_to_collect:
    s["b=" + b] = businesses[b]
for t in terms_to_collect:
    s["t=" + t] = [(t, 1.0)]
s.close()
def __init__(self, csm):
    """Hold a reference to the CSM and open the account-bridge database."""
    self.csm = csm
    # Resolve the configured database location, defaulting to
    # 'account-bridge', then open/create it with semidbm.
    bridge_file = simbase.config.GetString('account-bridge-filename', 'account-bridge')
    self.dbm = semidbm.open(bridge_file, 'c')
def __init__(self, filename, flag='c', protocol=None, writeback=False):
    """Initialise a shelve.Shelf backed by a semidbm database."""
    import semidbm
    backing = semidbm.open(filename, flag)
    shelve.Shelf.__init__(self, backing, protocol, writeback)
def __init__(self, name):
    """Open (or create) the named semidbm database under directory().

    :param name: database name, joined onto directory().
    """
    super(Database, self).__init__()
    try:
        self.db = semidbm.open(os.path.join(directory(), name), 'c')
    except NotADirectoryError:
        # semidbm stores a db as a directory; a plain file at this path
        # means a pre-semidbm (old-format) database was found.
        # NOTE(review): the error is swallowed, leaving self.db unset —
        # any later attribute access will raise AttributeError.
        # Confirm whether callers handle that, or a re-raise is needed.
        logger.error("Old database type encountered!")
def shelve_open_semidbm(filename, flag='c', protocol=None, writeback=False):
    """shelve.open() drop-in that stores data in a semidbm database."""
    import semidbm
    backing = semidbm.open(filename, flag)
    return shelve.Shelf(backing, protocol, writeback)
def __enter__(self):
    """Open the cache database on context entry and hand it back."""
    cache = semidbm.open(self._cache_dir, flag='c')
    self._cache = cache
    return cache
def open(self):
    """Open (or create) the database at self.path."""
    handle = semidbm.open(self.path, 'c')
    self.db = handle
def open_db_file(self):
    """Create/open the scratch database inside the temp directory."""
    db_path = os.path.join(self.tempdir, 'myfile.db')
    return semidbm.open(db_path, 'c')
def kanwaout(self, out):
    """Dump self.records to `out` as a dbm database, each value
    pickled and compressed."""
    db = dbm.open(out, 'c')
    for key, record in self.records.items():
        db[key] = compress(dumps(record))
    db.close()
def __init__(self, city):
    """Open the read-only shelf for `city` (expects <city>.db to exist)."""
    self.city = city
    backing = semidbm.open(city + '.db', flag='r')
    self.f = backing
    self.db = shelve.Shelf(backing, protocol=pickle.HIGHEST_PROTOCOL)
def kanwaout(self, out):
    """Dump self.records to `out` as a dbm database, each value
    pickled and compressed."""
    db = dbm.open(out, 'c')
    for key, record in self.records.items():
        db[key] = compress(dumps(record))
    db.close()
def run_bench(N, db_tpl) -> Dict[str, Dict[str, float]]:
    """Benchmark N writes then N random reads across several key-value
    stores (lmdbm, pysos, sqlitedict, dbm.dumb, dbm.gnu, semidbm,
    vedis, unqlite), returning {store: {"write"|"read": seconds}}.

    :param N: number of key/value pairs to write and read.
    :param db_tpl: format template producing each store's file path.
    """
    batchsize = 1000
    LMDBM_FILE = db_tpl.format("lmdbm")
    LMDBM_BATCH_FILE = db_tpl.format("lmdbm-batch")
    PYSOS_FILE = db_tpl.format("pysos")
    SQLITEDICT_FILE = db_tpl.format("sqlitedict")
    SQLITEDICT_BATCH_FILE = db_tpl.format("sqlitedict-batch")
    DBM_DUMB_FILE = db_tpl.format("dbm.dumb")
    DBM_GNU_FILE = db_tpl.format("dbm.gnu")
    SEMIDBM_FILE = db_tpl.format("semidbm")
    VEDIS_FILE = db_tpl.format("vedis")
    VEDIS_BATCH_FILE = db_tpl.format("vedis-batch")
    UNQLITE_FILE = db_tpl.format("unqlite")
    UNQLITE_BATCH_FILE = db_tpl.format("unqlite-batch")
    # Remove leftovers from earlier runs so writes start cold.
    # NOTE(review): DBM_GNU_FILE is never removed here — confirm whether
    # a stale gdbm file can skew the dbm.gnu write numbers.
    remove_lmdbm(LMDBM_FILE)
    remove_lmdbm(LMDBM_BATCH_FILE)
    with suppress(FileNotFoundError):
        os.unlink(PYSOS_FILE)
    with suppress(FileNotFoundError):
        os.unlink(SQLITEDICT_FILE)
    with suppress(FileNotFoundError):
        os.unlink(SQLITEDICT_BATCH_FILE)
    remove_dbm(DBM_DUMB_FILE)
    remove_semidbm(SEMIDBM_FILE)
    with suppress(FileNotFoundError):
        os.unlink(VEDIS_FILE)
    with suppress(FileNotFoundError):
        os.unlink(VEDIS_BATCH_FILE)
    with suppress(FileNotFoundError):
        os.unlink(UNQLITE_FILE)
    with suppress(FileNotFoundError):
        os.unlink(UNQLITE_BATCH_FILE)
    ret: DefaultDict[str, Dict[str, float]] = defaultdict(dict)
    # writes
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "c") as db:
            for k, v in data(N):
                db[k] = v
    ret["lmdbm"]["write"] = t.get()
    print("lmdbm write", N, t.get())
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_BATCH_FILE, "c") as db:
            for pairs in batch(data(N), batchsize):
                db.update(pairs)
    ret["lmdbm-batch"]["write"] = t.get()
    print("lmdbm-batch write", N, t.get())
    with open(os.devnull, "w") as devnull:  # mute annoying "free lines" output
        with redirect_stdout(devnull):
            with MeasureTime() as t:
                db = pysos.Dict(PYSOS_FILE)
                for k, v in data(N):
                    db[k] = v
                db.close()
    ret["pysos"]["write"] = t.get()
    print("pysos write", N, t.get())
    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_FILE, autocommit=True) as db:
            for k, v in data(N):
                db[k] = v
    ret["sqlitedict"]["write"] = t.get()
    print("sqlitedict write", N, t.get())
    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_BATCH_FILE, autocommit=False) as db:
            for pairs in batch(data(N), batchsize):
                db.update(pairs)
                # Commit once per batch rather than per item.
                db.commit()
    ret["sqlitedict-batch"]["write"] = t.get()
    print("sqlitedict-batch write", N, t.get())
    with MeasureTime() as t:
        with dbm.dumb.open(DBM_DUMB_FILE, "c") as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["dbm.dumb"]["write"] = t.get()
    print("dbm.dumb write", N, t.get())
    # gdbm is optional; skip its numbers when the module is unavailable.
    if gdbm:
        with MeasureTime() as t:
            with dbm.gnu.open(DBM_GNU_FILE, "c") as db:
                for k, v in data(N):
                    db[k] = json.dumps(v)
        ret["dbm.gnu"]["write"] = t.get()
        print("dbm.gnu write", N, t.get())
    with MeasureTime() as t:
        db = semidbm.open(SEMIDBM_FILE, "c")
        for k, v in data(N):
            db[k] = json.dumps(v)
        db.close()
    ret["semidbm"]["write"] = t.get()
    print("semidbm write", N, t.get())
    with MeasureTime() as t:
        with Vedis(VEDIS_FILE) as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["vedis"]["write"] = t.get()
    print("vedis write", N, t.get())
    with MeasureTime() as t:
        with Vedis(VEDIS_BATCH_FILE) as db:
            for pairs in batch(data(N), batchsize):
                db.update({k: json.dumps(v) for k, v in pairs})
    ret["vedis-batch"]["write"] = t.get()
    print("vedis-batch write", N, t.get())
    with MeasureTime() as t:
        with UnQLite(UNQLITE_FILE) as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["unqlite"]["write"] = t.get()
    print("unqlite write", N, t.get())
    with MeasureTime() as t:
        with UnQLite(UNQLITE_BATCH_FILE) as db:
            for pairs in batch(data(N), batchsize):
                db.update({k: json.dumps(v) for k, v in pairs})
    ret["unqlite-batch"]["write"] = t.get()
    print("unqlite-batch write", N, t.get())
    # reads
    # Sequential-read pass for lmdbm is printed but deliberately not
    # stored (only the random-read timing goes into the result dict).
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "r") as db:
            for k in allkeys(N):
                db[k]
    # ret["lmdbm"]["read"] = t.get()
    print("lmdbm cont read", N, t.get())
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "r") as db:
            for k in randkeys(N, N):
                db[k]
    ret["lmdbm"]["read"] = t.get()
    print("lmdbm rand read", N, t.get())
    with open(os.devnull, "w") as devnull:  # mute annoying "free lines" output
        with redirect_stdout(devnull):
            with MeasureTime() as t:
                db = pysos.Dict(PYSOS_FILE)
                for k in randkeys(N, N):
                    db[k]
                db.close()
    ret["pysos"]["read"] = t.get()
    print("pysos read", N, t.get())
    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_FILE) as db:
            for k in randkeys(N, N):
                db[k]
    ret["sqlitedict"]["read"] = t.get()
    print("sqlitedict read", N, t.get())
    with MeasureTime() as t:
        with dbm.dumb.open(DBM_DUMB_FILE, "r") as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["dbm.dumb"]["read"] = t.get()
    print("dbm.dumb read", N, t.get())
    if gdbm:
        with MeasureTime() as t:
            with dbm.gnu.open(DBM_GNU_FILE, "r") as db:
                for k in randkeys(N, N):
                    json.loads(db[k])
        ret["dbm.gnu"]["read"] = t.get()
        print("dbm.gnu read", N, t.get())
    with MeasureTime() as t:
        db = semidbm.open(SEMIDBM_FILE, "r")
        for k in randkeys(N, N):
            json.loads(db[k])
        db.close()
    ret["semidbm"]["read"] = t.get()
    print("semidbm read", N, t.get())
    with MeasureTime() as t:
        with Vedis(VEDIS_FILE) as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["vedis"]["read"] = t.get()
    print("vedis read", N, t.get())
    with MeasureTime() as t:
        with UnQLite(UNQLITE_FILE) as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["unqlite"]["read"] = t.get()
    print("unqlite read", N, t.get())
    return ret
def __init__(self, csm):
    """Hold a reference to the CSM and open the account-bridge database."""
    self.csm = csm
    # Resolve the configured database location, defaulting to
    # 'account-bridge', then open/create it with semidbm.
    bridge_file = simbase.config.GetString('account-bridge-filename', 'account-bridge')
    self.dbm = semidbm.open(bridge_file, 'c')
import re
import pickle
import shelve
import semidbm
import pandas
from os.path import basename
from collections import defaultdict
from operator import itemgetter
from math import sqrt

# Number of topics used downstream.
# NOTE(review): unused in this chunk — presumably consumed later in the file.
NUM_TOPICS = 40
# Tokens are maximal runs of ASCII letters.
tokenize_regex = re.compile(r'[A-Za-z]+')
# NOTE(review): `city` is not defined in this chunk; it must be bound
# earlier in the file before this code runs.
# Create a fresh shelf ('n' truncates any existing <city>.db).
s = shelve.Shelf(semidbm.open(city + '.db', flag='n'), protocol=pickle.HIGHEST_PROTOCOL)
for b in pandas.read_json('Business.json').itertuples():
    # Businesses are stored under "b=<id>" as (name, categories, stars),
    # dropping the generic 'Restaurants' category.
    s["b=" + str(b.business_id)] = (b.name, [
        cat for cat in b.categories if cat != 'Restaurants'
    ], b.stars)
reviews_body = pandas.read_json('Urbana.json')
# term -> [(review_id, 1), ...] with one entry per occurrence.
reviews_by_term = defaultdict(list)
# term -> total occurrence count across all reviews.
term_frequencies = defaultdict(int)
for r in reviews_body.itertuples():
    for term in tokenize_regex.findall(r.text):
        reviews_by_term[term].append((r.review_id, 1))
        term_frequencies[term] += 1