Пример #1
0
class TestQuery(unittest.TestCase):
    """Exercise ``DiscoDB.query`` against a small three-key fixture.

    The fixture maps ``alice -> blue``, ``bob -> red`` and
    ``carol -> blue, red``.  Each test parses a query string with
    ``Q.parse`` and checks either the length or the set of values of the
    result.
    """

    def setUp(self):
        self.discodb = DiscoDB(
            (('alice', ('blue',)),
             ('bob', ('red',)),
             ('carol', ('blue', 'red'))),
        )

    def q(self, s):
        """Parse the query string *s* and run it against the fixture."""
        return self.discodb.query(Q.parse(s))

    def test_empty(self):
        self.assertEqual(list(self.q('')), [])
        self.assertEqual(len(self.q('')), 0)

    def test_get_len(self):
        self.assertEqual(len(self.discodb.get('alice')), 1)
        self.assertEqual(len(self.discodb.get('bob')), 1)
        self.assertEqual(len(self.discodb.get('carol')), 2)

    def test_query_len(self):
        self.assertEqual(len(self.q('alice')), 1)
        self.assertEqual(len(self.q('bob')), 1)
        self.assertEqual(len(self.q('carol')), 2)
        self.assertEqual(len(self.q('alice & bob')), 0)
        self.assertEqual(len(self.q('alice | bob')), 2)
        self.assertEqual(len(self.q('alice & carol')), 1)
        self.assertEqual(len(self.q('alice | carol')), 2)
        self.assertEqual(len(self.q('alice|bob|carol')), 2)
        self.assertEqual(len(self.q('alice&bob&carol')), 0)

    def test_query_len_doesnt_advance_iter(self):
        # check that calling len() doesn't advance the iterator
        res = self.q('alice')
        self.assertEqual(len(res), 1)
        self.assertEqual(len(res), 1)

    def test_query_results(self):
        self.assertEqual(set(self.q('alice')), set(['blue']))
        self.assertEqual(set(self.q('bob')), set(['red']))
        self.assertEqual(set(self.q('carol')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice & bob')), set())
        self.assertEqual(set(self.q('alice | bob')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice & carol')), set(['blue']))
        self.assertEqual(set(self.q('alice | carol')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice|bob|carol')), set(['blue', 'red']))
        self.assertEqual(set(self.q('alice&bob&carol')), set())

    def test_query_len_nonkey(self):
        # a key that is absent from the database matches nothing, and its
        # negation matches both values of the fixture
        self.assertEqual(len(self.q('nonkey')), 0)
        self.assertEqual(len(self.q('~nonkey')), 2)
        self.assertEqual(len(self.q('nonkey & alice')), 0)
        self.assertEqual(len(self.q('nonkey | alice')), 1)

    def test_query_results_nonkey(self):
        self.assertEqual(set(self.q('nonkey')), set())
        self.assertEqual(set(self.q('~nonkey')), set(['blue', 'red']))
        self.assertEqual(set(self.q('nonkey & alice')), set())
        self.assertEqual(set(self.q('nonkey | alice')), set(['blue']))
Пример #2
0
class TestUncompressed(TestMappingProtocol, TestSerializationProtocol):
    """Run the inherited suites on a DiscoDB built with compression disabled."""

    def setUp(self):
        uncompressed = DiscoDB(k_vs_iter(self.numkeys),
                               disable_compression=True)
        self.discodb = uncompressed
        # second database built from the first one with default settings
        self.discodb_c = DiscoDB(uncompressed)

    def test_compression(self):
        """Both databases must expose the same key -> values mapping."""
        def as_dict(db):
            return dict((key, list(values)) for key, values in db.items())

        self.assertEqual(as_dict(self.discodb), as_dict(self.discodb_c))
Пример #3
0
    def load(cls, file):
        """Return a deserialized instance of this class read from *file*.

        *file* is either a path (opened here) or an object that is used
        as-is.  The first line is a ``version:metalen`` header; one DiscoDB
        is loaded from the offset right after the header and a second one
        from ``metalen`` bytes further on.  The instance is built as
        ``cls(datadb, metadb)``.
        """
        if isinstance(file, basestring):
            file = open(file, 'r', 0)

        header = file.readline()
        offset = len(header)
        # the version field is only unpacked to enforce the header shape
        _version, metalen = header.strip().split(':')
        metadb = DiscoDB.load(file, offset)
        offset += int(metalen)
        datadb = DiscoDB.load(file, offset)
        return cls(datadb, metadb)
Пример #4
0
 def reduce(iter, params):
     """Build a DiscoDB from the grouped input and dump it to a partition file.

     The partition is derived from the first key of the database via
     ``util.default_partition``.  Yields ``(partition, None)`` once when
     the database has at least one key, and nothing otherwise.
     """
     partitions = params['partitions']
     name = params['name']
     discodb = DiscoDB(kvgroup(iter))
     try:
         # figure out what partition we are in; the ``iter`` parameter
         # shadows the builtin, hence the explicit ``__iter__`` call
         key = next(discodb.keys().__iter__())
     except StopIteration:
         # no keys, nothing to write
         return
     partition = util.default_partition(key, partitions, params)
     # close the output file deterministically once the dump is done
     with open(filename(name, partition), 'w') as out:
         discodb.dump(out)
     yield partition, None
Пример #5
0
 def test_dump_load(self):
     """Dump to a temporary file, load it back and compare serializations."""
     from tempfile import NamedTemporaryFile
     # the context manager closes the temporary file when the test is done
     with NamedTemporaryFile() as handle:
         self.discodb.dump(handle)
         handle.seek(0)
         discodb = DiscoDB.load(handle)
         self.assertEqual(discodb.dumps(), self.discodb.dumps())
Пример #6
0
def test_leak():
    """Exercise the DiscoDB API in an endless loop.

    The loop has no exit condition, so this function never returns.
    NOTE(review): judging by the name, it is meant to be run by hand while
    watching the memory use of the process -- confirm.
    """
    while True:
        # a fresh database is built on every iteration
        d = DiscoDB(zip(letters, ['abc'] * 1000))
        # every result is bound to ``t`` and released by the next assignment
        t = len(d.query('a'))
        t = len(d['b'])
        t = 'd' in d
        # string round trip
        t = d.dumps()
        t = DiscoDB.loads(t)
        # file round trip through a fixed path
        t = d.dump(open('/tmp/discodb', 'w'))
        t = DiscoDB.load(open('/tmp/discodb'))
        # compare every key with every item of values()
        for k in d.keys():
            for v in d.values():
                t = k == v
Пример #7
0
def maybe_method(datadir, rest, method, xargs=None):
    """Call *method* on a DiscoDB file when *rest* contains ``/<method>/``.

    *rest* is split at the first ``/<method>/``: the left part is a path
    relative to *datadir*, the right part is the argument.  When *xargs*
    is given, the method is called with ``xargs(arg)``; otherwise it is
    called without arguments.

    Raises:
        NotMethod: when the marker is missing (or at position 0) or the
            path is not an existing file.
    """
    marker = '/%s/' % method
    if rest.find(marker) > 0:
        relpath, arg = rest.split(marker, 1)
        path = os.path.join(datadir, relpath)
        if os.path.isfile(path):
            bound_method = getattr(DiscoDB.load(open(path)), method)
            if xargs:
                return bound_method(xargs(arg))
            return bound_method()
    raise NotMethod(method)
Пример #8
0
def create_db(numvs, vsize, disable_compression):
    """Build a DiscoDB in which every key of ``letters`` shares one value list.

    The value list holds the first *numvs* permutations of length *vsize*
    drawn from ``letters`` repeated often enough to cover *vsize*.
    """
    from itertools import islice, izip, permutations, repeat
    pool = letters * (vsize / len(letters) + 1)
    values = [''.join(chars)
              for chars in islice(permutations(pool, vsize), numvs)]
    pairs = izip(letters, repeat(values))
    return DiscoDB(pairs, disable_compression=disable_compression)
Пример #9
0
class TestSerializationProtocol(unittest.TestCase):
    """Round-trip a DiscoDB through ``dumps``/``loads`` and ``dump``/``load``."""
    numkeys = 10000

    def setUp(self):
        self.discodb = DiscoDB(k_vs_iter(self.numkeys))

    def test_dumps_loads(self):
        dbuffer = self.discodb.dumps()
        self.assertEqual(dbuffer, DiscoDB.loads(dbuffer).dumps())

    def test_dump_load(self):
        from tempfile import NamedTemporaryFile
        # the context manager closes the temporary file when the test is done
        with NamedTemporaryFile() as handle:
            self.discodb.dump(handle)
            handle.seek(0)
            discodb = DiscoDB.load(handle)
            self.assertEqual(discodb.dumps(), self.discodb.dumps())
Пример #10
0
 def __init__(self, **tables):
   """Register one ``DiscoTable`` per keyword argument.

   Each value is either a ``DiscoDB`` instance, used directly, or
   something ``open()`` accepts, from which a ``DiscoDB`` is loaded.
   """
   # {'name': DiscoTable}
   self._tables = {}
   for name in tables:
     source = tables[name]
     db = source if isinstance(source, DiscoDB) else DiscoDB.load(open(source))
     self._tables[name] = DiscoTable(self, name, db)
Пример #11
0
def load(name):
    """Load the per-partition DiscoDBs of *name* into ``dbs[name]``.

    The partition count is read from the ``partitions`` file in
    ``dirname(name)``.  Partitions whose file does not exist keep the
    empty placeholder database.
    """
    base = dirname(name)
    with open(os.path.join(base, 'partitions')) as handle:
        partitions = int(handle.read())
    # every slot initially refers to the same empty DiscoDB instance
    discodbs = [DiscoDB()] * partitions
    for partition in xrange(partitions):
        path = filename(name, partition)
        if not os.path.exists(path):
            continue
        discodbs[partition] = DiscoDB.load(open(path))
    dbs[name] = discodbs
Пример #12
0
def test_leak():
    """Exercise the DiscoDB API in an endless loop.

    The loop has no exit condition, so this function never returns.
    NOTE(review): judging by the name, it is meant to be run by hand while
    watching the memory use of the process -- confirm.
    """
    while True:
        # a fresh database is built on every iteration
        d = DiscoDB(zip(letters, ['abc'] * 1000))
        # every result is bound to ``t`` and released by the next assignment
        t = len(d.query('a'))
        t = len(d['b'])
        t = 'd' in d
        # string round trip
        t = d.dumps()
        t = DiscoDB.loads(t)
        # file round trip through a fixed path
        t = d.dump(open('/tmp/discodb', 'w'))
        t = DiscoDB.load(open('/tmp/discodb'))
        # compare every key with every item of values()
        for k in d.keys():
            for v in d.values():
                t = k == v
Пример #13
0
def test_leak():
    """Exercise the DiscoDB API in an endless loop.

    The loop has no exit condition, so this function never returns.
    NOTE(review): judging by the name, it is meant to be run by hand while
    watching the memory use of the process -- confirm.
    """
    while True:
        # a fresh database is built on every iteration
        d = DiscoDB(zip(letters, ["abc"] * 1000))
        # every result is bound to ``t`` and released by the next assignment
        t = len(d.query("a"))
        t = len(d["b"])
        t = "d" in d
        # string round trip
        t = d.dumps()
        t = DiscoDB.loads(t)
        # file round trip through a fixed path
        t = d.dump(open("/tmp/discodb", "w"))
        t = DiscoDB.load(open("/tmp/discodb"))
        # compare every key with every item of values()
        for k in d.keys():
            for v in d.values():
                t = k == v
Пример #14
0
def input_stream(fd, size, url, params):
    """Open the DiscoDB addressed by *url*, optionally calling a method on it.

    The URL path may carry a ``!method`` or ``!method/arg`` suffix.  For
    ``query`` and ``metaquery`` the argument is parsed with ``Q.urlscan``;
    any other method is called with the non-empty items of ``arg``, or
    with no arguments when there is no ``arg``.

    Returns:
        A ``(result, size, url)`` tuple where ``result`` is the method's
        return value, or the DiscoDB itself when no method was requested.
    """
    import os
    from disco import util
    from disco.comm import download
    from discodb import DiscoDB, Q
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')

    # load from the task's own root when the first netloc component matches,
    # otherwise download the serialized database
    if netloc[0] == Task.netloc[0]:
        discodb = DiscoDB.load(open(os.path.join(Task.root, path)))
    else:
        discodb = DiscoDB.loads(download('disco://%s/%s' % (netloc, path)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg)), size, url
        # ``arg`` is None for a bare ``!method`` suffix; filter(None, None)
        # raises TypeError, so such calls get no arguments instead
        args = filter(None, arg) if arg else ()
        return method(*args), size, url
    return discodb, size, url
Пример #15
0
def input_stream(fd, size, url, params):
    """Open the DiscoDB addressed by *url*, optionally calling a method on it.

    The URL path may carry a ``!method`` or ``!method/arg`` suffix.  For
    ``query`` and ``metaquery`` the argument is parsed with ``Q.urlscan``;
    any other method is called with the non-empty items of ``arg``, or
    with no arguments when there is no ``arg``.

    Returns:
        A ``(result, size, url)`` tuple where ``result`` is the method's
        return value, or the DiscoDB itself when no method was requested.
    """
    import os
    from disco import util
    from disco.comm import download
    from discodb import DiscoDB, Q
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')

    # load from the task's own root when the first netloc component matches,
    # otherwise download the serialized database
    if netloc[0] == Task.netloc[0]:
        discodb = DiscoDB.load(open(os.path.join(Task.root, path)))
    else:
        discodb = DiscoDB.loads(download('disco://%s/%s' % (netloc, path)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg)), size, url
        # ``arg`` is None for a bare ``!method`` suffix; filter(None, None)
        # raises TypeError, so such calls get no arguments instead
        args = filter(None, arg) if arg else ()
        return method(*args), size, url
    return discodb, size, url
Пример #16
0
class TestMappingProtocol(unittest.TestCase):
    """Smoke tests for the mapping-style interface of ``DiscoDB``."""
    numkeys = 1000

    def setUp(self):
        pairs = k_vs_iter(self.numkeys)
        self.discodb = DiscoDB(pairs)

    def test_contains(self):
        db = self.discodb
        assert "200" in db
        assert "key" not in db

    def test_length(self):
        size = len(self.discodb)
        assert size == self.numkeys

    def test_getitem(self):
        # a KeyError is only tolerated for x == numkeys, which the loop
        # never reaches, so any KeyError fails the test
        for x in xrange(self.numkeys):
            name = str(x)
            try:
                list(self.discodb[name])
            except KeyError:
                assert x == self.numkeys

    def test_iter(self):
        via_iter = list(self.discodb)
        via_keys = list(self.discodb.keys())
        assert via_iter == via_keys

    def test_items(self):
        # only checks that every value collection can be consumed
        for _key, values in self.discodb.items():
            list(values)

    def test_keys(self):
        all_keys = list(self.discodb.keys())
        len(all_keys)

    def test_values(self):
        all_values = list(self.discodb.values())
        len(all_values)

    def test_query(self):
        parsed = Q.parse('5 & 10 & (15 | 30)')
        list(self.discodb.query(parsed))

    def test_str(self):
        db = self.discodb
        assert str(db) != repr(db)
Пример #17
0
def scan_database_dir(state):
    """Load every not-yet-known DiscoDB file found in ``$DATA_DB_PATH``.

    Loaded databases are stored in ``state["dbs"]`` keyed by their full
    path.  A file that raises ``DiscoDBError`` on load is logged and
    removed.  ``state["cache_time"]`` is refreshed whenever at least one
    database is known.

    Returns:
        The same *state* dict, updated in place.
    """
    db_path = os.environ["DATA_DB_PATH"]
    for fname in (os.path.join(db_path, f) for f in os.listdir(db_path)):
        if fname not in state["dbs"] and os.path.isfile(fname):
            try:
                state["dbs"][fname] = DiscoDB.load(open(fname))
            except DiscoDBError:
                # Maybe a corrupt discodb: nuke it, the sync process
                # should fetch a new one later on.  Log first so the
                # load failure is recorded even if the removal fails.
                logger.exception("Unable to open %s", fname)
                os.remove(fname)

    if state["dbs"]:
        state["cache_time"] = time.time()
    return state
Пример #18
0
def scan_database_dir(state):
  """Load every not-yet-known DiscoDB file found in ``$DATA_DB_PATH``.

  Loaded databases are stored in ``state['dbs']`` keyed by their full
  path.  A file that raises ``DiscoDBError`` on load is logged and
  removed.  ``state['cache_time']`` is refreshed whenever at least one
  database is known.

  Returns:
    The same *state* dict, updated in place.
  """
  db_path = os.environ['DATA_DB_PATH']
  for fname in (os.path.join(db_path, f) for f in os.listdir(db_path)):
    if fname not in state['dbs'] and os.path.isfile(fname):
      try:
        state['dbs'][fname] = DiscoDB.load(open(fname))
      except DiscoDBError:
        # Maybe a corrupt discodb: nuke it, the sync process
        # should fetch a new one later on.  Log first so the
        # load failure is recorded even if the removal fails.
        logger.exception('Unable to open %s', fname)
        os.remove(fname)

  if state['dbs']:
    state['cache_time'] = time.time()
  return state
Пример #19
0
def input_stream(fd, size, url, params):
    """Load the DiscoDB addressed by *url* into ``Task.discodb``.

    The URL path may carry a ``!method`` or ``!method/arg`` suffix.  For
    ``query``, ``params.discodb_query`` takes precedence over the URL
    argument when present; any other method is called without arguments.

    Returns:
        A ``(result, size, url)`` tuple where ``result`` is the method's
        return value, or the DiscoDB itself when no method was requested.

    Raises:
        core.DiscoError: when the first netloc component does not match
            the task's.
    """
    scheme, netloc, rest = util.urlsplit(url)

    if netloc[0] == Task.netloc[0]:
        path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
        Task.discodb = DiscoDB.load(open(os.path.join(Task.root, path)))

        if rest:
            # partition() also accepts a bare method name without '/',
            # where a two-way split() unpack would raise ValueError
            method, _, arg = rest.partition('/')
            if method == 'query':
                if hasattr(params, 'discodb_query'):
                    return Task.discodb.query(params.discodb_query), size, url
                return Task.discodb.query(Q.urlscan(arg)), size, url
            return getattr(Task.discodb, method)(), size, url
        return Task.discodb, size, url
    raise core.DiscoError("Scheme 'discodb' can only be used with force_local=True")
Пример #20
0
def Open(url, task=None):
    """Open the DiscoDB addressed by *url*, optionally calling a method on it.

    The data directories come from *task* when given, otherwise from
    ``DiscoSettings``.  The URL path may carry a ``!method`` or
    ``!method/arg`` suffix.  For ``query`` and ``metaquery`` the argument
    is parsed with ``Q.urlscan``; any other method is called with the
    non-empty items of ``arg``, or with no arguments when there is no
    ``arg``.

    Returns:
        The method's return value, or the DiscoDB itself when no method
        was requested.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    discodb = DiscoDB.load(open(util.localize(path, disco_data=disco_data,
                                ddfs_data=ddfs_data)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg))
        # ``arg`` is None for a bare ``!method`` suffix; filter(None, None)
        # raises TypeError, so such calls get no arguments instead
        args = filter(None, arg) if arg else ()
        return method(*args)
    return discodb
Пример #21
0
def Open(url, task=None):
    """Open the DiscoDB addressed by *url*, optionally calling a method on it.

    The data directories come from *task* when given, otherwise from
    ``DiscoSettings``.  The URL path may carry a ``!method`` or
    ``!method/arg`` suffix.  For ``query`` and ``metaquery`` the argument
    is parsed with ``Q.urlscan``; any other method is called with the
    non-empty items of ``arg``, or with no arguments when there is no
    ``arg``.

    Returns:
        The method's return value, or the DiscoDB itself when no method
        was requested.
    """
    if task:
        disco_data = task.disco_data
        ddfs_data = task.ddfs_data
    else:
        from disco.settings import DiscoSettings
        settings = DiscoSettings()
        disco_data = settings['DISCO_DATA']
        ddfs_data = settings['DDFS_DATA']
    scheme, netloc, rest = util.urlsplit(url)
    path, rest = rest.split('!', 1) if '!' in rest else (rest, '')
    discodb = DiscoDB.load(
        open(util.localize(path, disco_data=disco_data, ddfs_data=ddfs_data)))

    if rest:
        method_name, arg = rest.split('/', 1) if '/' in rest else (rest, None)
        method = getattr(discodb, method_name)
        if method_name in ('metaquery', 'query'):
            return method(Q.urlscan(arg))
        # ``arg`` is None for a bare ``!method`` suffix; filter(None, None)
        # raises TypeError, so such calls get no arguments instead
        args = filter(None, arg) if arg else ()
        return method(*args)
    return discodb
Пример #22
0
def open_db():
  """Load a DiscoDB into ``app.db``.

  Uses the file named by the first command-line argument when one is
  given.  Otherwise the first key listed under ``data/`` in the S3 bucket
  is downloaded below ``$DATA_DB_PATH`` (unless a file with that name
  already exists there) and loaded.
  """
  if len(sys.argv) > 1:
    fname = sys.argv[1]
  else:
    # fetch some data from s3
    conn = boto.connect_s3(
      os.environ['AWS_KEY'],
      os.environ['AWS_SECRET']
    )
    # NOTE(review): the original then re-assigned the bucket through
    # ``conn.bucket('blink')``, which is not a boto connection method;
    # confirm which bucket name is actually intended.
    bucket = conn.get_bucket('com.mozillalabs.blink')
    # the listing is an iterable, so take its first element via iter()
    item = next(iter(bucket.list('data/')))
    fname = os.path.join(
      os.environ['DATA_DB_PATH'],
      str(item.key)
    )
    if not os.path.exists(fname):
      item.get_contents_to_filename(fname)

  app.db = DiscoDB.load(open(fname))
Пример #23
0
class TestMappingProtocol(unittest.TestCase):
    """Tests for the mapping-style interface of ``DiscoDB``."""
    numkeys = 1000

    def setUp(self):
        self.discodb = DiscoDB(k_vs_iter(self.numkeys))

    def test_contains(self):
        assert "0" in self.discodb
        assert "key" not in self.discodb

    def test_nonzero(self):
        self.assertFalse(self.discodb.query('NONKEY'))
        self.assertTrue(self.discodb.query('0'))
        self.assertTrue(self.discodb.values())
        self.assertTrue(self.discodb.keys())

    def test_length(self):
        self.assertEqual(len(self.discodb), self.numkeys)

    def test_get(self):
        len(list(self.discodb.get('0')))
        self.assertEqual(self.discodb.get('X'), None)
        self.assertEqual(self.discodb.get('X', 'Y'), 'Y')

    def test_getitem(self):
        # a KeyError is only tolerated for x == numkeys, which the loop
        # never reaches, so any KeyError fails the test
        for x in xrange(self.numkeys):
            try:
                list(self.discodb[str(x)])
            except KeyError:
                self.assertEqual(x, self.numkeys)

    def test_iter(self):
        self.assertEqual(list(self.discodb), list(self.discodb.keys()))

    def test_items(self):
        for key, values in self.discodb.items():
            key, list(values)

    def test_keys(self):
        len(list(self.discodb.keys()))

    def test_values(self):
        len(list(self.discodb.values()))

    def test_unique_values(self):
        len(list(self.discodb.unique_values()))

    def test_peek(self):
        self.assertNotEqual(self.discodb.peek('0'), None)
        self.assertEqual(self.discodb.peek('X'), None)
        self.assertTrue(int(self.discodb.peek('0', '1')) >= 0)

    def test_query(self):
        q = Q.parse('5 & 10 & (15 | 30)')
        list(self.discodb.query(q))

    def test_query_results(self):
        q = Q.parse('5')
        self.assertEqual(list(self.discodb.query(q)),
                         list(self.discodb.get('5')))

    def test_query_results_nonkey(self):
        q = Q.parse('nonkey')
        self.assertEqual(list(self.discodb.query(q)), [])

    def test_str(self):
        # only checks that both conversions work without raising
        repr(self.discodb)
        str(self.discodb)
Пример #24
0
 def setUp(self):
     """Build the fixture database from ``k_vs_iter(self.numkeys)``."""
     pairs = k_vs_iter(self.numkeys)
     self.discodb = DiscoDB(pairs)
Пример #25
0
 def create_db(self, name, data):
   """Dump *data* as a DiscoDB to ``$DATA_DB_PATH/<name>.db``; return the path."""
   db_path = os.path.join(os.environ['DATA_DB_PATH'], name + '.db')
   db = DiscoDB(data)
   # close the file deterministically instead of leaving it to the GC
   with open(db_path, 'w') as out:
     db.dump(out)
   return db_path
Пример #26
0
 def setUp(self):
     """Create the three-key fixture: alice, bob and carol with colour values."""
     fixture = (
         ('alice', ('blue',)),
         ('bob', ('red',)),
         ('carol', ('blue', 'red')),
     )
     self.discodb = DiscoDB(fixture)
Пример #27
0
 def create_db(self, name, data):
     """Dump *data* as a DiscoDB to ``$DATA_DB_PATH/<name>.db``; return the path."""
     db_path = os.path.join(os.environ['DATA_DB_PATH'], name + '.db')
     db = DiscoDB(data)
     # close the file deterministically instead of leaving it to the GC
     with open(db_path, 'w') as out:
         db.dump(out)
     return db_path
Пример #28
0
 def setUp(self):
     """Build one DiscoDB with compression disabled and a second one from it."""
     uncompressed = DiscoDB(k_vs_iter(self.numkeys),
                            disable_compression=True)
     self.discodb = uncompressed
     self.discodb_c = DiscoDB(uncompressed)
Пример #29
0
 def test_dumps_loads(self):
     """A dumped buffer must survive a ``loads``/``dumps`` round trip unchanged."""
     dbuffer = self.discodb.dumps()
     self.assertEqual(dbuffer, DiscoDB.loads(dbuffer).dumps())
Пример #30
0
##------------------------------------------------------------------------------------------------------------------------------
## globals
##------------------------------------------------------------------------------------------------------------------------------

botoConfig = botocore.config.Config(s3={'addressing_style': 'path'})
s3 = boto3.resource('s3', config=botoConfig)

# Connection settings are taken from the environment; a missing variable
# raises KeyError while the module is being imported.
dbUsername = os.environ['DBUSERNAME']
dbPassword = os.environ['DBPASSWORD']
dbConnection = os.environ['DBCONNECTION']
dbName = os.environ['DBNAME']

try:
    # Do not log the connection parameters here: they include the password.
    db = DiscoDB(username=dbUsername,
                 password=dbPassword,
                 host=dbConnection,
                 dbName=dbName)
    db.connect()
    filebase = DiscoFilebase(DISCO_RESOURCE_BUCKET)

except pymysql.MySQLError as e:
    logger.error(
        "ERROR: Unexpected error: Could not connect to MySQL instance.")
    logger.error(e)
    # a bare sys.exit() would report success (status 0) to the caller
    sys.exit(1)
logger.info("SUCCESS: Connection to RDS MySQL instance succeeded")

jsonHeaders = {
    "Content-Type": "application/json",
    "Access-Control-Allow-Headers":
    "Content-Type,Authorization,X-Amz-Date,X-Api-Key,X-Amz-Security-Token",
Пример #31
0
def test(db):
    """Check that view-restricted queries agree with manual filtering.

    For every ``(view, sample)`` pair produced by ``samples(db)``, querying
    ``'a'`` with that view must give exactly the items of ``db['a']`` that
    are in the sample, in the same order.

    Raises:
        Exception: on the first mismatch, naming the index of the sample.
    """
    # enumerate() supplies the index that the error message reports
    for i, (v, s) in enumerate(samples(db)):
        if list(db.query('a', view=v)) != [x for x in db['a'] if x in s]:
            raise Exception("no match: %d" % i)
    print("all ok!")


def bmark(db):
    """Yield one dict of ``timed`` measurements per sample of ``samples(db)``.

    Each row compares iterating or counting ``db['a']`` unfiltered, filtered
    in Python against the sample, and filtered through the query view.
    """
    for index, (view, sample) in enumerate(samples(db)):
        row = {}
        row["sample-size"] = index * 10
        row["list-baseline"] = timed(lambda: sum(1 for _ in db['a']))
        row["list-if"] = timed(
            lambda: sum(1 for x in db['a'] if x in sample))
        row["list-view"] = timed(
            lambda: sum(1 for x in db.query('a', view=view)))
        row["count-baseline"] = timed(lambda: len(db['a']))
        row["count-if"] = timed(
            lambda: sum(1 for x in db['a'] if x in sample))
        row["count-view"] = timed(lambda: len(db.query('a', view=view)))
        yield row


# Build the database, verify it, then write the benchmark rows to bmark.csv.
db = DiscoDB(items())
test(db)
print("tests pass")
rows = list(bmark(db))
# the context manager closes the file even when writing fails
with open('bmark.csv', 'w') as f:
    csv = DictWriter(f, sorted(rows[0].keys(), reverse=True))
    csv.writeheader()
    csv.writerows(rows)
Пример #32
0
#!/usr/bin/python

import sys
from discodb import DiscoDB

# Load the database named by the first command-line argument.
db = DiscoDB.load(open(sys.argv[1], 'r'))

# For every key read from stdin (one per line, trailing whitespace removed)
# print the first value stored under it; keys without values are skipped.
for key in map(str.rstrip, sys.stdin):
    inq = db.get(key)
    if inq and len(inq) > 0:
        print(next(iter(inq)))
Пример #33
0
#!/usr/bin/python

import sys
from discodb import DiscoDB


def read_data(instream):
    """Yield ``(key, value)`` pairs parsed from tab-separated lines.

    Trailing whitespace is stripped from each line before it is split on
    tabs.  Lines that do not split into exactly two fields are skipped.

    Args:
        instream: an iterable of text lines.
    """
    for line in instream:
        try:
            key, value = line.rstrip().split("\t")
        except ValueError:
            # wrong number of fields: ignore the line
            continue
        # The yield stays outside the try block so that closing the
        # generator (GeneratorExit) is never caught and swallowed.
        yield (key, value)


# Input: the file named by argv[1], or stdin when it is missing or '-'.
db = DiscoDB(
    read_data(
        open(sys.argv[1], 'r') if (
            len(sys.argv) > 1 and sys.argv[1] != '-') else sys.stdin))

# Output: argv[2], defaulting to 'out.discodb'; closed once the dump is done.
with open(sys.argv[2] if len(sys.argv) > 2 else 'out.discodb', 'w') as out:
    db.dump(out)
Пример #34
0
 def test_dumps_loads(self):
     """A dumped buffer must survive a ``loads``/``dumps`` round trip unchanged."""
     dbuffer = self.discodb.dumps()
     roundtrip = DiscoDB.loads(dbuffer).dumps()
     assert dbuffer == roundtrip
Пример #35
0
def discodbparse(iterable, size, fname, params):
    """Yield one :class:`discodb.DiscoDB` per input line.

    Every line is split on whitespace and each resulting field becomes a
    key with an empty list of values.
    """
    from discodb import DiscoDB
    for line in iterable:
        fields = line.split()
        yield DiscoDB((field, []) for field in fields)
Пример #36
0
def create(path, schema, records):
  """Return a DiscoDB built from ``index(schema, records)``.

  A *schema* given as a dict is first expanded into ``Schema(**schema)``.
  *path* is accepted but not used here.
  """
  resolved = Schema(**schema) if isinstance(schema, dict) else schema
  return DiscoDB(index(resolved, records))
Пример #37
0
        file, arg = rest.split('/%s/' % method, 1)
        path = os.path.join(datadir, file)
        if os.path.isfile(path):
            bound_method = getattr(DiscoDB.load(open(path)), method)
            return bound_method(xargs(arg)) if xargs else bound_method()
    raise NotMethod(method)

def input_stream(fd, size, url, params):
    """Open the local DiscoDB addressed by *url*, optionally calling a method.

    ``query``, ``keys`` and ``values`` are tried in that order through
    ``maybe_method``; when none applies, the DiscoDB itself is loaded.
    For ``query`` the argument comes from ``params.discodb_query`` when
    that attribute exists, otherwise from ``Q.urlscan``.

    Returns:
        A ``(result, size, params)`` tuple.

    Raises:
        core.DiscoError: when the host is not the task's host and the
            ``resultfs`` flag is not set.
    """
    scheme, rest = url.split('://', 1)
    host, rest = rest.split('/', 1)

    if hasattr(params, "discodb_query"):
        query = lambda x: params.discodb_query
    else:
        query = Q.urlscan

    if host == Task.host or Task.has_flag("resultfs"):
        datadir = os.path.join(Task.root, "data")
        # 'query' takes an argument converter, the other methods take none
        for method, xargs in (('query', query), ('keys', None), ('values', None)):
            try:
                return maybe_method(datadir, rest, method, xargs=xargs), size, params
            except NotMethod:
                pass
        return DiscoDB.load(open(os.path.join(datadir, rest))), size, params
    raise core.DiscoError("Scheme 'discodb' can only be used with force_local=True")

Пример #38
0
 def create_db(self, name, data):
     """Dump *data* as a DiscoDB to ``$DATA_DB_PATH/<name>.db``; return the path."""
     db_path = os.path.join(os.environ["DATA_DB_PATH"], name + ".db")
     db = DiscoDB(data)
     # close the file deterministically instead of leaving it to the GC
     with open(db_path, "w") as out:
         db.dump(out)
     return db_path
Пример #39
0
# Map every tag to the list of lfm_ids that carry it, in row order.
gnr_song_dict = {}

for r in df_tags.itertuples():
    gnr = r.tag
    gnr_song_dict.setdefault(gnr, []).append(r.lfm_id)

unq_tags = list(np.unique(df_tags['tag']))

# The list of all tags is stored in the same dict under the 'genres' key.
gnr_song_dict['genres'] = unq_tags

db = DiscoDB(gnr_song_dict)

# Tags ordered by descending (count, tag) and, separately, alphabetically.
tag_cnts = [len(gnr_song_dict[tag]) for tag in unq_tags]
ranked = sorted(zip(tag_cnts, unq_tags), reverse=True)
tags_srtd = [tag for _, tag in ranked]


# can add restriction here
tags_srt_sub = tags_srtd

tags_srtd_alf = sorted(unq_tags)

# Four independent, initially empty work lists.
mp_input = []

for i in range(4):
    mp_input.append([])