class TestQuery(unittest.TestCase):
    """Query tests against a small three-key DiscoDB fixture.

    Fixture contents: alice -> blue, bob -> red, carol -> blue and red.
    All checks use ``assertEqual``; the ``assertEquals`` spelling is a
    deprecated alias.
    """

    def setUp(self):
        self.discodb = DiscoDB(
            (('alice', ('blue',)),
             ('bob', ('red',)),
             ('carol', ('blue', 'red'))),
        )

    def q(self, s):
        """Parse the query string *s* and run it against the fixture."""
        return self.discodb.query(Q.parse(s))

    def test_empty(self):
        self.assertEqual(list(self.q('')), [])
        self.assertEqual(len(self.q('')), 0)

    def test_get_len(self):
        self.assertEqual(len(self.discodb.get('alice')), 1)
        self.assertEqual(len(self.discodb.get('bob')), 1)
        self.assertEqual(len(self.discodb.get('carol')), 2)

    def test_query_len(self):
        self.assertEqual(len(self.q('alice')), 1)
        self.assertEqual(len(self.q('bob')), 1)
        self.assertEqual(len(self.q('carol')), 2)
        self.assertEqual(len(self.q('alice & bob')), 0)
        self.assertEqual(len(self.q('alice | bob')), 2)
        self.assertEqual(len(self.q('alice & carol')), 1)
        self.assertEqual(len(self.q('alice | carol')), 2)
        self.assertEqual(len(self.q('alice|bob|carol')), 2)
        self.assertEqual(len(self.q('alice&bob&carol')), 0)

    def test_query_len_doesnt_advance_iter(self):
        # check that calling len() doesn't advance the iterator
        res = self.q('alice')
        self.assertEqual(len(res), 1)
        self.assertEqual(len(res), 1)

    def test_query_results(self):
        self.assertEqual(set(self.q('alice')), set(['blue']))
        self.assertEqual(set(self.q('bob')), set(['red']))
        self.assertEqual(set(self.q('carol')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice & bob')), set())
        self.assertEqual(set(self.q('alice | bob')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice & carol')), set(['blue']))
        self.assertEqual(set(self.q('alice | carol')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice|bob|carol')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice&bob&carol')), set())

    def test_query_len_nonkey(self):
        # A key that is absent from the database matches nothing; its
        # negation is expected to match both fixture values.
        self.assertEqual(len(self.q('nonkey')), 0)
        self.assertEqual(len(self.q('~nonkey')), 2)
        self.assertEqual(len(self.q('nonkey & alice')), 0)
        self.assertEqual(len(self.q('nonkey | alice')), 1)

    def test_query_results_nonkey(self):
        self.assertEqual(set(self.q('nonkey')), set())
        self.assertEqual(set(self.q('~nonkey')), set(['blue', 'red']))
        self.assertEqual(set(self.q('nonkey & alice')), set())
        self.assertEqual(set(self.q('nonkey | alice')), set(['blue']))
class TestUncompressed(TestMappingProtocol, TestSerializationProtocol):
    """Re-run the inherited suites on a DiscoDB built with compression disabled."""

    def setUp(self):
        # `discodb` is built with compression explicitly disabled;
        # `discodb_c` is rebuilt from it with default constructor arguments.
        uncompressed = DiscoDB(k_vs_iter(self.numkeys), disable_compression=True)
        self.discodb = uncompressed
        self.discodb_c = DiscoDB(uncompressed)

    def test_compression(self):
        """Both databases must expose the same key -> values mapping."""
        def materialize(db):
            return dict((key, list(values)) for key, values in db.items())

        self.assertEqual(materialize(self.discodb), materialize(self.discodb_c))
def load(cls, file):
    """Return a deserialized instance of this class read from *file*.

    *file* is either a path (opened here) or an already open file object.
    The first line of the file is a ``version:metalen`` header.  The meta
    database is loaded starting right after the header, and the data
    database starting ``metalen`` bytes after that.

    Raises ``ValueError`` if the header does not consist of exactly two
    ``:``-separated fields or if ``metalen`` is not an integer.
    """
    if isinstance(file, basestring):
        # The payload after the header is binary, so open in binary mode;
        # buffering stays disabled exactly as before.
        file = open(file, 'rb', 0)
    header = file.readline()
    offset = len(header)
    # The version field is parsed (so malformed headers still fail) but is
    # not otherwise used.
    _version, metalen = header.strip().split(':')
    metadb = DiscoDB.load(file, offset)
    offset += int(metalen)
    datadb = DiscoDB.load(file, offset)
    return cls(datadb, metadb)
def reduce(iter, params):
    """Build a DiscoDB from the grouped input and dump it to its partition file.

    Reads ``params['partitions']`` and ``params['name']``.  Yields a single
    ``(partition, None)`` pair after the file has been written, or yields
    nothing when the input produced no keys.
    """
    partitions = params['partitions']
    name = params['name']
    discodb = DiscoDB(kvgroup(iter))
    try:
        # figure out what partition we are in
        # (`iter` is shadowed by the parameter, hence the explicit __iter__)
        key = next(discodb.keys().__iter__())
    except StopIteration:
        # no keys, nothing to write
        return
    partition = util.default_partition(key, partitions, params)
    # Close the output deterministically instead of leaving it to the
    # garbage collector.
    with open(filename(name, partition), 'wb') as out:
        discodb.dump(out)
    yield partition, None
def test_dump_load(self):
    """Dumping to a file and loading it back must round-trip byte-for-byte."""
    from tempfile import NamedTemporaryFile
    # The context manager closes the temporary file even when an
    # assertion fails.
    with NamedTemporaryFile() as handle:
        self.discodb.dump(handle)
        handle.seek(0)
        discodb = DiscoDB.load(handle)
        self.assertEqual(discodb.dumps(), self.discodb.dumps())
def test_leak():
    """Exercise the DiscoDB API in an endless loop.

    This function never returns: the ``while True`` loop has no exit.
    Presumably it is meant to be run by hand while watching the process's
    memory usage -- TODO confirm.
    """
    while True:
        # A fresh database is built on every pass; each result below is
        # bound to `t` and then replaced by the next one.
        d = DiscoDB(zip(letters, ['abc'] * 1000))
        t = len(d.query('a'))
        t = len(d['b'])
        t = 'd' in d
        # In-memory serialization round trip.
        t = d.dumps()
        t = DiscoDB.loads(t)
        # On-disk round trip through a fixed path.
        t = d.dump(open('/tmp/discodb', 'w'))
        t = DiscoDB.load(open('/tmp/discodb'))
        # Compare every key with every value.
        for k in d.keys():
            for v in d.values():
                t = k == v
def maybe_method(datadir, rest, method, xargs=None):
    """Call *method* on the DiscoDB addressed by *rest*, if *rest* names one.

    *rest* is expected to look like ``<file>/<method>/<arg>``.  When the
    ``/<method>/`` marker is found past the first character and ``<file>``
    is an existing file below *datadir*, the database is loaded and the
    method is called: with ``xargs(arg)`` when *xargs* is given, without
    arguments otherwise.  In every other case ``NotMethod`` is raised.
    """
    marker = '/%s/' % method
    if rest.find(marker) <= 0:
        raise NotMethod(method)
    relpath, arg = rest.split(marker, 1)
    path = os.path.join(datadir, relpath)
    if not os.path.isfile(path):
        raise NotMethod(method)
    bound_method = getattr(DiscoDB.load(open(path)), method)
    if xargs:
        return bound_method(xargs(arg))
    return bound_method()
def create_db(numvs, vsize, disable_compression):
    """Build a DiscoDB whose keys all share one generated value list.

    The value list holds at most *numvs* strings, each the join of one
    *vsize*-long permutation drawn from ``letters`` repeated often enough
    to be longer than *vsize*.  Every element of ``letters`` becomes a key
    mapped to that same list.
    """
    from itertools import islice, izip, permutations, repeat
    pool = letters * (vsize / len(letters) + 1)
    values = [''.join(perm) for perm in islice(permutations(pool, vsize), numvs)]
    pairs = izip(letters, repeat(values))
    return DiscoDB(pairs, disable_compression=disable_compression)
class TestSerializationProtocol(unittest.TestCase):
    """Round-trip tests for the in-memory and on-disk serialization APIs."""

    numkeys = 10000

    def setUp(self):
        self.discodb = DiscoDB(k_vs_iter(self.numkeys))

    def test_dumps_loads(self):
        """``loads(dumps())`` must reproduce the same serialized buffer."""
        dbuffer = self.discodb.dumps()
        self.assertEqual(dbuffer, DiscoDB.loads(dbuffer).dumps())

    def test_dump_load(self):
        """Dumping to a file and loading it back must round-trip."""
        from tempfile import NamedTemporaryFile
        # The context manager closes the temporary file even when an
        # assertion fails.
        with NamedTemporaryFile() as handle:
            self.discodb.dump(handle)
            handle.seek(0)
            discodb = DiscoDB.load(handle)
            self.assertEqual(discodb.dumps(), self.discodb.dumps())
def __init__(self, **tables):
    """Register one DiscoTable per keyword argument.

    Each keyword maps a table name to either a ``DiscoDB`` instance or a
    path from which a ``DiscoDB`` is loaded.
    """
    # {'name': DiscoDB}
    self._tables = {}
    for name, path_or_db in tables.items():
        already_loaded = isinstance(path_or_db, DiscoDB)
        db = path_or_db if already_loaded else DiscoDB.load(open(path_or_db))
        self._tables[name] = DiscoTable(self, name, db)
def load(name):
    """Load every partition file of *name* and store the list in ``dbs[name]``.

    The partition count is read from the ``partitions`` file located in
    ``dirname(name)``.  Partitions whose file does not exist all share one
    empty ``DiscoDB`` instance.
    """
    counter_path = os.path.join(dirname(name), 'partitions')
    with open(counter_path) as counter:
        partitions = int(counter.read())
    placeholder = DiscoDB()
    discodbs = []
    for partition in xrange(partitions):
        path = filename(name, partition)
        if os.path.exists(path):
            discodbs.append(DiscoDB.load(open(path)))
        else:
            discodbs.append(placeholder)
    dbs[name] = discodbs
def test_leak():
    """Exercise the DiscoDB API in an endless loop.

    This function never returns: the ``while True`` loop has no exit.
    Presumably it is meant to be run by hand while watching the process's
    memory usage -- TODO confirm.
    """
    while True:
        # A fresh database is built on every pass; each result below is
        # bound to `t` and then replaced by the next one.
        d = DiscoDB(zip(letters, ["abc"] * 1000))
        t = len(d.query("a"))
        t = len(d["b"])
        t = "d" in d
        # In-memory serialization round trip.
        t = d.dumps()
        t = DiscoDB.loads(t)
        # On-disk round trip through a fixed path.
        t = d.dump(open("/tmp/discodb", "w"))
        t = DiscoDB.load(open("/tmp/discodb"))
        # Compare every key with every value.
        for k in d.keys():
            for v in d.values():
                t = k == v
def input_stream(fd, size, url, params):
    """Open the DiscoDB addressed by *url* and optionally call a method on it.

    The URL path may carry a ``!method[/arg]`` suffix.  Without a suffix the
    database itself is returned.  ``query`` and ``metaquery`` receive
    ``Q.urlscan(arg)``; any other method receives *arg* as a single
    positional argument, or no argument when *arg* is missing or empty.

    Returns a ``(result, size, url)`` tuple.
    """
    import os
    from disco import util
    from disco.comm import download
    from discodb import DiscoDB, Q
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    if netloc[0] == Task.netloc[0]:
        # Same host as the task: read the file directly and close the
        # handle deterministically.
        with open(os.path.join(Task.root, path)) as handle:
            discodb = DiscoDB.load(handle)
    else:
        discodb = DiscoDB.loads(download('disco://%s/%s' % (netloc, path)))
    if not rest:
        return discodb, size, url
    method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
    method = getattr(discodb, method_name)
    if method_name in ('metaquery', 'query'):
        return method(Q.urlscan(arg)), size, url
    # `filter(None, arg)` raised TypeError for a missing arg and splatted a
    # string into single characters; pass the argument whole, or not at all.
    args = [arg] if arg else []
    return method(*args), size, url
class TestMappingProtocol(unittest.TestCase):
    """Checks that DiscoDB behaves like a read-only mapping.

    Uses the unittest assertion methods rather than bare ``assert`` so the
    checks are not stripped when Python runs with ``-O``.
    """

    numkeys = 1000

    def setUp(self):
        self.discodb = DiscoDB(k_vs_iter(self.numkeys))

    def test_contains(self):
        self.assertIn("200", self.discodb)
        self.assertNotIn("key", self.discodb)

    def test_length(self):
        self.assertEqual(len(self.discodb), self.numkeys)

    def test_getitem(self):
        for x in xrange(self.numkeys):
            try:
                list(self.discodb[str(x)])
            except KeyError:
                # x never equals numkeys inside this range, so any missing
                # key fails the test.
                self.assertEqual(x, self.numkeys)

    def test_iter(self):
        self.assertEqual(list(self.discodb), list(self.discodb.keys()))

    def test_items(self):
        # Smoke test: every value iterator must be consumable.
        for key, values in self.discodb.items():
            key, list(values)

    def test_keys(self):
        len(list(self.discodb.keys()))

    def test_values(self):
        len(list(self.discodb.values()))

    def test_query(self):
        q = Q.parse('5 & 10 & (15 | 30)')
        list(self.discodb.query(q))

    def test_str(self):
        self.assertNotEqual(str(self.discodb), repr(self.discodb))
def scan_database_dir(state):
    """Load every not-yet-known file in ``$DATA_DB_PATH`` into ``state["dbs"]``.

    Files already present as keys of ``state["dbs"]`` and entries that are
    not regular files are skipped.  A file that raises ``DiscoDBError``
    while loading is logged and deleted.  When at least one database is
    known afterwards, ``state["cache_time"]`` is set to the current time.

    Returns the same *state* dict, mutated in place.  Raises ``KeyError``
    if ``DATA_DB_PATH`` is not set.
    """
    db_path = os.environ["DATA_DB_PATH"]
    for entry in os.listdir(db_path):
        fname = os.path.join(db_path, entry)
        if fname in state["dbs"] or not os.path.isfile(fname):
            continue
        try:
            # The handle is closed before the except clause runs, so the
            # file is no longer open when it gets removed.
            with open(fname) as handle:
                state["dbs"][fname] = DiscoDB.load(handle)
        except DiscoDBError:
            # maybe a corrupt discodb, nuke it; the sync process should
            # fetch a new one later on.  Log first so the failure is
            # recorded even if the removal itself fails.
            logger.exception("Unable to open %s", fname)
            os.remove(fname)
    if state["dbs"]:
        state["cache_time"] = time.time()
    return state
def scan_database_dir(state):
    """Load every not-yet-known file in ``$DATA_DB_PATH`` into ``state['dbs']``.

    Files already present as keys of ``state['dbs']`` and entries that are
    not regular files are skipped.  A file that raises ``DiscoDBError``
    while loading is logged and deleted.  When at least one database is
    known afterwards, ``state['cache_time']`` is set to the current time.

    Returns the same *state* dict, mutated in place.  Raises ``KeyError``
    if ``DATA_DB_PATH`` is not set.
    """
    db_path = os.environ['DATA_DB_PATH']
    for entry in os.listdir(db_path):
        fname = os.path.join(db_path, entry)
        if fname in state['dbs'] or not os.path.isfile(fname):
            continue
        try:
            # The handle is closed before the except clause runs, so the
            # file is no longer open when it gets removed.
            with open(fname) as handle:
                state['dbs'][fname] = DiscoDB.load(handle)
        except DiscoDBError:
            # maybe a corrupt discodb, nuke it; the sync process should
            # fetch a new one later on.  Log first so the failure is
            # recorded even if the removal itself fails.
            logger.exception('Unable to open %s', fname)
            os.remove(fname)
    if state['dbs']:
        state['cache_time'] = time.time()
    return state
def input_stream(fd, size, url, params):
    """Open a node-local DiscoDB named by *url* and optionally call a method.

    The database is stored on ``Task.discodb``.  The URL path may carry a
    ``!method[/arg]`` suffix:

    * no suffix: the database itself is returned;
    * ``query``: runs ``params.discodb_query`` when *params* has that
      attribute, otherwise ``Q.urlscan(arg)``;
    * anything else: the named method is called without arguments.

    Returns ``(result, size, url)``.  Raises ``core.DiscoError`` when the
    URL does not point at the host the task runs on.
    """
    scheme, netloc, rest = util.urlsplit(url)
    if netloc[0] != Task.netloc[0]:
        raise core.DiscoError("Scheme 'discodb' can only be used with force_local=True")
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(os.path.join(Task.root, path)) as handle:
        Task.discodb = DiscoDB.load(handle)
    if not rest:
        return Task.discodb, size, url
    # partition() also accepts a bare method name; split('/', 1) raised
    # ValueError when the suffix contained no '/'.
    method, _, arg = rest.partition('/')
    if method == 'query':
        if hasattr(params, 'discodb_query'):
            return Task.discodb.query(params.discodb_query), size, url
        return Task.discodb.query(Q.urlscan(arg)), size, url
    return getattr(Task.discodb, method)(), size, url
def Open(url, task=None):
    """Open the DiscoDB addressed by *url* and optionally call a method on it.

    The data roots come from *task* when given, otherwise from
    ``DiscoSettings``.  The URL path may carry a ``!method[/arg]`` suffix.
    Without a suffix the database itself is returned.  ``query`` and
    ``metaquery`` receive ``Q.urlscan(arg)``; any other method receives
    *arg* as a single positional argument, or no argument when *arg* is
    missing or empty.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    local_path = util.localize(path, disco_data=disco_data, ddfs_data=ddfs_data)
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(local_path) as handle:
        discodb = DiscoDB.load(handle)
    if not rest:
        return discodb
    method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
    method = getattr(discodb, method_name)
    if method_name in ('metaquery', 'query'):
        return method(Q.urlscan(arg))
    # `filter(None, arg)` raised TypeError for a missing arg and splatted a
    # string into single characters; pass the argument whole, or not at all.
    args = [arg] if arg else []
    return method(*args)
def Open(url, task=None):
    """Open the DiscoDB addressed by *url* and optionally call a method on it.

    The data roots come from *task* when given, otherwise from
    ``DiscoSettings``.  The URL path may carry a ``!method[/arg]`` suffix.
    Without a suffix the database itself is returned.  ``query`` and
    ``metaquery`` receive ``Q.urlscan(arg)``; any other method receives
    *arg* as a single positional argument, or no argument when *arg* is
    missing or empty.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    local_path = util.localize(path,
                               disco_data=disco_data,
                               ddfs_data=ddfs_data)
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(local_path) as handle:
        discodb = DiscoDB.load(handle)
    if not rest:
        return discodb
    method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
    method = getattr(discodb, method_name)
    if method_name in ('metaquery', 'query'):
        return method(Q.urlscan(arg))
    # `filter(None, arg)` raised TypeError for a missing arg and splatted a
    # string into single characters; pass the argument whole, or not at all.
    args = [arg] if arg else []
    return method(*args)
def open_db():
    """Load a DiscoDB into ``app.db``.

    The database file is the first command-line argument when one is
    given; otherwise a file is looked up in S3 and downloaded below
    ``$DATA_DB_PATH`` unless it already exists there.
    """
    # Fixed: the original called len() on the comparison (`len(sys.argv > 1)`)
    # and misspelled `sys.argv` as `sys.arv`.
    if len(sys.argv) > 1:
        fname = sys.argv[1]
    else:
        # fetch some data from s3
        # NOTE(review): this branch is left as found and needs confirming
        # against the boto API in use -- `s3_connectection` looks like a
        # typo, `bucket` is assigned twice with different bucket names, and
        # `get_contents_to_path` may not be the right method name.
        conn = boto.s3_connectection(
            os.environ['AWS_KEY'],
            os.environ['AWS_SECRET']
        )
        bucket = conn.get_bucket('com.mozillalabs.blink')
        bucket = conn.bucket('blink')
        item = bucket.list('data/').next()
        fname = os.path.join(
            os.environ['DATA_DB_PATH'],
            str(item.key)
        )
        if not os.path.exists(fname):
            item.get_contents_to_path(fname)
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(fname) as handle:
        app.db = DiscoDB.load(handle)
class TestMappingProtocol(unittest.TestCase):
    """Checks that DiscoDB behaves like a read-only mapping.

    Uses ``assertEqual`` / ``assertNotEqual`` / ``assertTrue``; the
    ``assertEquals`` / ``assertNotEquals`` / ``assert_`` spellings are
    deprecated aliases.
    """

    numkeys = 1000

    def setUp(self):
        self.discodb = DiscoDB(k_vs_iter(self.numkeys))

    def test_contains(self):
        assert "0" in self.discodb
        assert "key" not in self.discodb

    def test_nonzero(self):
        self.assertFalse(self.discodb.query('NONKEY'))
        self.assertTrue(self.discodb.query('0'))
        self.assertTrue(self.discodb.values())
        self.assertTrue(self.discodb.keys())

    def test_length(self):
        self.assertEqual(len(self.discodb), self.numkeys)

    def test_get(self):
        len(list(self.discodb.get('0')))
        self.assertEqual(self.discodb.get('X'), None)
        self.assertEqual(self.discodb.get('X', 'Y'), 'Y')

    def test_getitem(self):
        for x in xrange(self.numkeys):
            try:
                list(self.discodb[str(x)])
            except KeyError:
                # x never equals numkeys inside this range, so any missing
                # key fails the test.
                self.assertEqual(x, self.numkeys)

    def test_iter(self):
        self.assertEqual(list(self.discodb), list(self.discodb.keys()))

    def test_items(self):
        # Smoke test: every value iterator must be consumable.
        for key, values in self.discodb.items():
            key, list(values)

    def test_keys(self):
        len(list(self.discodb.keys()))

    def test_values(self):
        len(list(self.discodb.values()))

    def test_unique_values(self):
        len(list(self.discodb.unique_values()))

    def test_peek(self):
        self.assertNotEqual(self.discodb.peek('0'), None)
        self.assertEqual(self.discodb.peek('X'), None)
        self.assertTrue(int(self.discodb.peek('0', '1')) >= 0)

    def test_query(self):
        q = Q.parse('5 & 10 & (15 | 30)')
        list(self.discodb.query(q))

    def test_query_results(self):
        q = Q.parse('5')
        self.assertEqual(list(self.discodb.query(q)),
                         list(self.discodb.get('5')))

    def test_query_results_nonkey(self):
        q = Q.parse('nonkey')
        self.assertEqual(list(self.discodb.query(q)), [])

    def test_str(self):
        # Smoke test: both conversions must succeed.
        repr(self.discodb)
        str(self.discodb)
def setUp(self):
    """Build the fixture database from ``k_vs_iter(self.numkeys)``."""
    pairs = k_vs_iter(self.numkeys)
    self.discodb = DiscoDB(pairs)
def create_db(self, name, data):
    """Write *data* as a DiscoDB file named ``<name>.db`` in ``$DATA_DB_PATH``.

    Returns the path of the written file.  Raises ``KeyError`` if
    ``DATA_DB_PATH`` is not set.
    """
    db_path = os.path.join(os.environ['DATA_DB_PATH'], name + '.db')
    discodb = DiscoDB(data)
    # Close the file before returning so callers can open the path right
    # away without depending on garbage collection.
    with open(db_path, 'wb') as out:
        discodb.dump(out)
    return db_path
def setUp(self):
    """Build the three-key fixture: alice -> blue, bob -> red, carol -> both."""
    fixture = (
        ('alice', ('blue',)),
        ('bob', ('red',)),
        ('carol', ('blue', 'red')),
    )
    self.discodb = DiscoDB(fixture)
def setUp(self):
    """Build a database with compression disabled, plus a copy made from it."""
    uncompressed = DiscoDB(k_vs_iter(self.numkeys), disable_compression=True)
    self.discodb = uncompressed
    # The copy is constructed with default arguments.
    self.discodb_c = DiscoDB(uncompressed)
def test_dumps_loads(self):
    """``loads(dumps())`` must reproduce the same serialized buffer."""
    dbuffer = self.discodb.dumps()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(dbuffer, DiscoDB.loads(dbuffer).dumps())
##------------------------------------------------------------------------------------------------------------------------------ ## globals ##------------------------------------------------------------------------------------------------------------------------------ botoConfig = botocore.config.Config(s3={'addressing_style': 'path'}) s3 = boto3.resource('s3', config=botoConfig) dbUsername = os.environ['DBUSERNAME'] dbPassword = os.environ['DBPASSWORD'] dbConnection = os.environ['DBCONNECTION'] dbName = os.environ['DBNAME'] try: #logger.info(f'connecting with c:{dbConnection}, u:{dbUsername}, p:{dbPassword}') db = DiscoDB(username=dbUsername, password=dbPassword, host=dbConnection, dbName=dbName) db.connect() filebase = DiscoFilebase(DISCO_RESOURCE_BUCKET) except pymysql.MySQLError as e: logger.error( "ERROR: Unexpected error: Could not connect to MySQL instance.") logger.error(e) sys.exit() logger.info("SUCCESS: Connection to RDS MySQL instance succeeded") jsonHeaders = { "Content-Type": "application/json", "Access-Control-Allow-Headers": "Content-Type,Authorization,X-Amz-Date,X-Api-Key,X-Amz-Security-Token",
def test(db):
    """Check that view-restricted queries agree with Python-side filtering.

    For every ``(view, sample)`` pair from ``samples(db)``, the result of
    ``db.query('a', view=view)`` must equal the values of ``db['a']`` that
    are members of the sample.  Raises ``Exception`` carrying the index of
    the first mismatching sample.
    """
    # enumerate() supplies the index used in the error message; the loop
    # previously had no `i`, so a mismatch raised NameError instead.
    for i, (v, s) in enumerate(samples(db)):
        if list(db.query('a', view=v)) != [x for x in db['a'] if x in s]:
            raise Exception("no match: %d" % i)
    print("all ok!")


def bmark(db):
    """Yield one row of timings per sample.

    Each row compares plain iteration, iteration with a Python-side
    membership filter, and a view-restricted query, for both listing and
    counting the values of key ``'a'``.
    """
    for i, (v, s) in enumerate(samples(db)):
        yield {
            "sample-size": i * 10,
            "list-baseline": timed(lambda: sum(1 for _ in db['a'])),
            "list-if": timed(lambda: sum(1 for x in db['a'] if x in s)),
            "list-view": timed(lambda: sum(1 for x in db.query('a', view=v))),
            "count-baseline": timed(lambda: len(db['a'])),
            "count-if": timed(lambda: sum(1 for x in db['a'] if x in s)),
            "count-view": timed(lambda: len(db.query('a', view=v)))
        }


db = DiscoDB(items())
test(db)
print("tests pass")
rows = list(bmark(db))
# The context manager closes the CSV file even if writing fails.
with open('bmark.csv', 'w') as f:
    csv = DictWriter(f, sorted(rows[0].keys(), reverse=True))
    csv.writeheader()
    csv.writerows(rows)
#!/usr/bin/python
"""Print the first value stored for each key read from standard input.

Usage: script DISCODB_FILE < keys

Keys are read one per line with trailing whitespace stripped.  Keys that
are missing from the database, or that have no values, print nothing.
"""
import sys
from discodb import DiscoDB

# Close the handle deterministically instead of leaving it to the garbage
# collector; open() replaces the Python 2 only file() builtin.
with open(sys.argv[1], 'rb') as dbfile:
    db = DiscoDB.load(dbfile)

# Iterate stdin lazily so each key is answered as soon as it is read.
for line in sys.stdin:
    key = line.rstrip()
    inq = db.get(key)
    if inq and len(inq) > 0:
        print(next(iter(inq)))
#!/usr/bin/python
"""Build a DiscoDB from tab-separated ``key<TAB>value`` lines.

Usage: script [INPUT|-] [OUTPUT]

Input defaults to standard input (also selected by ``-``); output defaults
to ``out.discodb``.
"""
import sys
from discodb import DiscoDB


def read_data(instream):
    """Yield ``(key, value)`` for every line with exactly two tab-separated fields.

    Lines with any other number of fields are skipped.
    """
    for line in instream:
        try:
            key, value = line.rstrip().split("\t")
        except ValueError:
            # Wrong number of fields. Only the unpacking is guarded: a bare
            # `except` wrapped around the yield also swallowed GeneratorExit
            # and KeyboardInterrupt.
            continue
        yield (key, value)


if len(sys.argv) > 1 and sys.argv[1] != '-':
    source = open(sys.argv[1], 'r')
else:
    source = sys.stdin
try:
    db = DiscoDB(read_data(source))
finally:
    # Only close what this script opened.
    if source is not sys.stdin:
        source.close()

# open() replaces the Python 2 only file() builtin; the context manager
# closes the output once the dump is done.
with open(sys.argv[2] if len(sys.argv) > 2 else 'out.discodb', 'wb') as out:
    db.dump(out)
def test_dumps_loads(self):
    """``loads(dumps())`` must reproduce the same serialized buffer."""
    serialized = self.discodb.dumps()
    roundtripped = DiscoDB.loads(serialized).dumps()
    assert serialized == roundtripped
def discodbparse(iterable, size, fname, params):
    """Yield one :class:`discodb.DiscoDB` per input line.

    Every whitespace-separated field of the line becomes a key with an
    empty list of values.
    """
    from discodb import DiscoDB
    for line in iterable:
        fields = line.split()
        yield DiscoDB((field, []) for field in fields)
def create(path, schema, records):
    """Return a DiscoDB built from ``index(schema, records)``.

    A *schema* given as a dict is first converted with ``Schema(**schema)``.
    *path* is accepted but not used by this function.
    """
    resolved = Schema(**schema) if isinstance(schema, dict) else schema
    return DiscoDB(index(resolved, records))
file, arg = rest.split('/%s/' % method, 1) path = os.path.join(datadir, file) if os.path.isfile(path): bound_method = getattr(DiscoDB.load(open(path)), method) return bound_method(xargs(arg)) if xargs else bound_method() raise NotMethod(method) def input_stream(fd, size, url, params): scheme, rest = url.split('://', 1) host, rest = rest.split('/', 1) if hasattr(params, "discodb_query"): query = lambda x: params.discodb_query else: query = Q.urlscan if host == Task.host or Task.has_flag("resultfs"): datadir = os.path.join(Task.root, "data") try: return maybe_method(datadir, rest, 'query', xargs=query), size, params except NotMethod, e: pass for method in ('keys', 'values'): try: return maybe_method(datadir, rest, method), size, params except NotMethod, e: pass return DiscoDB.load(open(os.path.join(datadir, rest))), size, params raise core.DiscoError("Scheme 'discodb' can only be used with force_local=True")
def create_db(self, name, data):
    """Write *data* as a DiscoDB file named ``<name>.db`` in ``$DATA_DB_PATH``.

    Returns the path of the written file.  Raises ``KeyError`` if
    ``DATA_DB_PATH`` is not set.
    """
    db_path = os.path.join(os.environ["DATA_DB_PATH"], name + ".db")
    discodb = DiscoDB(data)
    # Close the file before returning so callers can open the path right
    # away without depending on garbage collection.
    with open(db_path, "wb") as out:
        discodb.dump(out)
    return db_path
# Map every tag to the list of lfm_ids that carry it.
gnr_song_dict = {}
for r in df_tags.itertuples():
    gnr = r.tag
    gnr_song_dict.setdefault(gnr, []).append(r.lfm_id)

unq_tags = list(np.unique(df_tags['tag']))
# The list of all tags is stored under the extra key 'genres'.
# NOTE(review): this replaces the entry of a tag literally named 'genres',
# if one exists -- confirm that cannot happen.
gnr_song_dict['genres'] = unq_tags

db = DiscoDB(gnr_song_dict)

# Tags ordered by descending song count (ties broken by descending tag).
tag_cnts = [len(gnr_song_dict[i]) for i in unq_tags]
tags_srtd = [tag for _, tag in sorted(zip(tag_cnts, unq_tags), reverse=True)]

# can add restriction here
tags_srt_sub = tags_srtd

tags_srtd_alf = sorted(unq_tags)

# Four independent, initially empty lists.
mp_input = [[] for _ in range(4)]