Пример #1
0
 def _vid_for_value(self, column, key):
     """Convert ``key`` according to the encoding flags of ``column``.

     * Trie column: the result of ``rtrie.vid_for_value`` for ``key``,
       looked up in the 16-bit buffers (``self.vid16_nodes`` /
       ``self.vid16_kids``) when ``column.rtrie_indicator`` equals
       ``mdb.MDB_UINT_16`` and in ``self.vid_nodes`` / ``self.vid_kids``
       otherwise.
     * LZ4 column: ``clz4.compress(key)``.
     * Any other column: ``key`` unchanged.
     """
     if column.is_trie:
         # Pick the node/kid buffers matching the column's trie width.
         if column.rtrie_indicator == mdb.MDB_UINT_16:
             nodes, kids = self.vid16_nodes, self.vid16_kids
         else:
             nodes, kids = self.vid_nodes, self.vid_kids
         return rtrie.vid_for_value(nodes, kids, key)
     if column.is_lz4:
         return clz4.compress(key)
     return key
Пример #2
0
 def _vid_for_value(self, column, key):
     """Return ``key`` in the encoding that ``column`` is flagged with.

     Non-trie columns yield ``clz4.compress(key)`` when ``column.is_lz4``
     is set and the untouched ``key`` otherwise.  Trie columns yield
     whatever ``rtrie.vid_for_value`` returns for ``key``, using the
     ``vid16_*`` buffers if ``column.rtrie_indicator`` is
     ``mdb.MDB_UINT_16`` and the ``vid_*`` buffers if not.
     """
     if not column.is_trie:
         return clz4.compress(key) if column.is_lz4 else key

     if column.rtrie_indicator == mdb.MDB_UINT_16:
         return rtrie.vid_for_value(self.vid16_nodes, self.vid16_kids, key)
     return rtrie.vid_for_value(self.vid_nodes, self.vid_kids, key)
Пример #3
0
 def test_marble_insert(self):
     """Verify the partition files produced by the insert.

     Checks the overall insert and partition counts, then opens every
     file in ``self.files`` and asserts that its meta database holds the
     trie buffers and the JSON-encoded name, partition, fields and
     partition data.  For each sub database other than ``_count`` that
     has an index database, every album of the partition must have its
     row id (counted from 1 per sub database) in the bitmap stored under
     the album's encoded column value.
     """
     #  overall counts
     self.assertEqual(self.n_inserted, len(_ALBUMS))
     self.assertEqual(_NPARTITIONS, len(self.files))

     for date, path in self.files.iteritems():
         env, txn, dbs, meta = self.marble._open(path)

         #  meta db: trie buffers must be present ...
         for meta_key in ("_vid_nodes", "_vid_kids",
                          "_vid16_nodes", "_vid16_kids"):
             self.assertTrue(meta.contains(txn, meta_key))
         #  ... and the descriptive entries are stored JSON-encoded
         expected_meta = (
             ("name", "Collections"),
             ("partition", "date"),
             ("fields", _FIELDS),
             ("_pdata", date),
         )
         for meta_key, expected in expected_meta:
             self.assertEqual(meta.get(txn, meta_key),
                              ujson.dumps(expected))

         vid_nodes, _ = meta.get_raw(txn, '_vid_nodes')
         vid_kids, _ = meta.get_raw(txn, '_vid_kids')
         vid16_nodes, _ = meta.get_raw(txn, '_vid16_nodes', (None, 0))
         vid16_kids, _ = meta.get_raw(txn, '_vid16_kids', (None, 0))

         #  check subdb, subinddb
         for name, (_db, index_db, _, column, _) in dbs.iteritems():
             if name == "_count":
                 continue
             bitmaps = {}  # encoded value -> BitSet loaded from the index
             #  albums of this partition, numbered from 1 for each sub db
             matching = (album for album in self.albums
                         if date == album[_PARTITIONS])
             for row_id, album in enumerate(matching, 1):
                 raw = album[name]
                 if column.is_trie:
                     if column.rtrie_indicator == mdb.MDB_UINT_16:
                         nodes, kids = vid16_nodes, vid16_kids
                     else:
                         nodes, kids = vid_nodes, vid_kids
                     stored = rtrie.vid_for_value(nodes, kids, raw)
                 elif column.is_lz4:
                     stored = clz4.compress(raw)
                 else:
                     stored = raw
                 #  NOTE: a direct check of the sub db entry for row_id
                 #  is disabled in this test; only the index is verified.
                 if index_db is None:
                     continue
                 #  row_id should be in bitmap too
                 bitmap = bitmaps.get(stored)
                 if bitmap is None:
                     bitmap = BitSet()
                     bitmap.loads(index_db.get(txn, stored))
                     bitmaps[stored] = bitmap
                 self.assertTrue(row_id in bitmap)
         txn.commit()
         env.close()
Пример #4
0
 def test_marble_insert(self):
     """Inspect each partition file after the insert.

     First compares ``self.n_inserted`` with ``len(_ALBUMS)`` and the
     number of files with ``_NPARTITIONS``.  Then, per file: the meta
     database must contain the four trie buffers and the JSON-encoded
     ``name``, ``partition``, ``fields`` and ``_pdata`` entries; and for
     every sub database except ``_count`` whose index database is not
     ``None``, the bitmap stored under each album's encoded value must
     contain that album's row id.  Row ids restart at 1 for every sub
     database and only count albums belonging to the file's partition.
     """
     #  test general information
     self.assertEqual(self.n_inserted, len(_ALBUMS))
     self.assertEqual(_NPARTITIONS, len(self.files))

     #  test that each sub db is fine
     for date, path in self.files.iteritems():
         env, txn, dbs, meta = self.marble._open(path)

         #  check meta db
         self.assertTrue(meta.contains(txn, "_vid_nodes"))
         self.assertTrue(meta.contains(txn, "_vid_kids"))
         self.assertTrue(meta.contains(txn, "_vid16_nodes"))
         self.assertTrue(meta.contains(txn, "_vid16_kids"))
         self.assertEqual(meta.get(txn, "name"), ujson.dumps("Collections"))
         self.assertEqual(meta.get(txn, "partition"), ujson.dumps("date"))
         self.assertEqual(meta.get(txn, "fields"), ujson.dumps(_FIELDS))
         self.assertEqual(meta.get(txn, "_pdata"), ujson.dumps(date))

         #  raw trie buffers; the second element of each pair is unused
         plain_nodes, _ = meta.get_raw(txn, '_vid_nodes')
         plain_kids, _ = meta.get_raw(txn, '_vid_kids')
         short_nodes, _ = meta.get_raw(txn, '_vid16_nodes', (None, 0))
         short_kids, _ = meta.get_raw(txn, '_vid16_kids', (None, 0))

         #  check subdb, subinddb
         for name, (_db, index_db, _, column, _) in dbs.iteritems():
             if name == "_count":
                 continue
             loaded = {}   # encoded value -> BitSet read from the index
             next_row = 1  # row ids restart for every sub database
             for album in self.albums:
                 if date != album[_PARTITIONS]:
                     continue  # album belongs to another partition
                 raw = album[name]
                 row_id = next_row
                 next_row += 1

                 if not column.is_trie:
                     encoded = clz4.compress(raw) if column.is_lz4 else raw
                 elif column.rtrie_indicator == mdb.MDB_UINT_16:
                     encoded = rtrie.vid_for_value(short_nodes, short_kids, raw)
                 else:
                     encoded = rtrie.vid_for_value(plain_nodes, plain_kids, raw)

                 if index_db is None:
                     continue
                 #  row_id should be in bitmap too
                 if encoded not in loaded:
                     fresh = BitSet()
                     fresh.loads(index_db.get(txn, encoded))
                     loaded[encoded] = fresh
                 self.assertTrue(row_id in loaded[encoded])
         txn.commit()
         env.close()
Пример #5
0
    def test_rtrie_in_mdb(self):
        """Store a serialized trie in an MDB file and query it after reopening.

        A small ``Trie`` is built and serialized; its node and kid buffers
        are written with ``put_raw`` into the ``_meta_`` database of
        ``/tmp/test_rtrie``.  The environment is then closed, reopened, and
        ``rtrie.vid_for_value`` is checked against the buffers returned by
        ``get_raw``.  The database file and its lock file are removed at the
        end, whether or not the test passed.
        """
        t = Trie()
        # Adding an already-known value must return its original id.
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        try:
            # First session: create the database and store both buffers.
            env = mdb.Env('/tmp/test_rtrie',
                          flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC
                          | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name='_meta_', flags=mdb.MDB_CREATE)
            db.put_raw(txn, 'nodes', nodeaddr, nodelen)
            db.put_raw(txn, 'kids', kidaddr, kidlen)

            # Read back inside the writing transaction; the results are
            # replaced by the reads of the second session below.
            n, ns = db.get_raw(txn, 'nodes')
            k, ks = db.get_raw(txn, 'kids')
            txn.commit()
            env.close()

            # Second session: reopen the file and query the stored buffers.
            env = mdb.Env('/tmp/test_rtrie',
                          flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name='_meta_')

            n, ns = db.get_raw(txn, 'nodes')
            k, ks = db.get_raw(txn, 'kids')
            self.assertEqual(rtrie.vid_for_value(n, k, 'hello'), 1)
            self.assertEqual(rtrie.vid_for_value(n, k, 'hell'), 2)
            self.assertEqual(rtrie.vid_for_value(n, k, 'goodbye'), 5)
            self.assertEqual(rtrie.vid_for_value(n, k, 'hellsink'), 6)
            self.assertEqual(rtrie.vid_for_value(n, k, 'hellothere'), 3)
            self.assertEqual(rtrie.vid_for_value(n, k, 'good'), 4)
            self.assertIsNone(rtrie.vid_for_value(n, k, 'notthere'))

            txn.commit()
            env.close()
        finally:
            import os
            # Remove only the files that exist.  An unconditional unlink
            # raises OSError when the test failed before the files were
            # created, and that OSError would hide the real failure.
            for path in ('/tmp/test_rtrie', '/tmp/test_rtrie-lock'):
                if os.path.exists(path):
                    os.unlink(path)
Пример #6
0
    def test_stress_wtrie(self):
        ktrie = Trie()
        strie = Trie()
        etrie = Trie()

        keywords = {}
        search_terms = {}
        exchange_ids = {}

        with open(fixture) as f:
            for data in f:
                for word in data.split(' '):
                    vid = ktrie.add(word)
                    actual_vid = keywords.get(word)
                    if actual_vid is not None:
                        self.assertEqual(vid, actual_vid)
                    else:
                        keywords[word] = vid

                vid = strie.add(data)
                actual_vid = search_terms.get(data)
                if actual_vid is not None:
                    self.assertEqual(vid, actual_vid)
                else:
                    search_terms[data] = vid

        nodes, kids, nodelen = etrie.serialize()
        naddr, nlen = nodes.buffer_info()
        kaddr, klen = kids.buffer_info()
        #summarize(naddr, kaddr, nodelen)
        #print_it(naddr, kaddr)

        for dc, vid in exchange_ids.iteritems():
            rvid = etrie.add(dc)
            self.assertEqual(vid, rvid)

            print dc, vid
            value = value_for_vid(naddr, kaddr, vid)
            self.assertEqual(dc, value)
            if dc != value:
                print "      dc=%s adc=%s" % (dc, value)

            avid = vid_for_value(naddr, kaddr, dc)
            #print "vid=%s avid=%s" % (vid, avid)
            self.assertEqual(vid, avid)
Пример #7
0
    def test_stress_wtrie(self):
        ktrie = Trie()
        strie = Trie()
        etrie = Trie()

        keywords = {}
        search_terms = {}
        exchange_ids = {}

        with open(fixture) as f:
            for data in f:
                for word in data.split(" "):
                    vid = ktrie.add(word)
                    actual_vid = keywords.get(word)
                    if actual_vid is not None:
                        self.assertEqual(vid, actual_vid)
                    else:
                        keywords[word] = vid

                vid = strie.add(data)
                actual_vid = search_terms.get(data)
                if actual_vid is not None:
                    self.assertEqual(vid, actual_vid)
                else:
                    search_terms[data] = vid

        nodes, kids, nodelen = etrie.serialize()
        naddr, nlen = nodes.buffer_info()
        kaddr, klen = kids.buffer_info()
        # summarize(naddr, kaddr, nodelen)
        # print_it(naddr, kaddr)

        for dc, vid in exchange_ids.iteritems():
            rvid = etrie.add(dc)
            self.assertEqual(vid, rvid)

            print dc, vid
            value = value_for_vid(naddr, kaddr, vid)
            self.assertEqual(dc, value)
            if dc != value:
                print "      dc=%s adc=%s" % (dc, value)

            avid = vid_for_value(naddr, kaddr, dc)
            # print "vid=%s avid=%s" % (vid, avid)
            self.assertEqual(vid, avid)
Пример #8
0
    def test_rtrie_in_mdb(self):
        """Write a serialized trie to an MDB file, reopen it and look values up.

        The node and kid buffers of a small ``Trie`` are stored with
        ``put_raw`` in the ``_meta_`` database of ``/tmp/test_rtrie``.  After
        closing and reopening the environment, ``rtrie.vid_for_value`` is
        asserted against the buffers obtained from ``get_raw``.  The database
        file and its lock file are deleted afterwards in all cases.
        """
        t = Trie()
        # Re-adding a known value must give back the id it already has.
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hell"), 2)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellothere"), 3)
        self.assertEqual(t.add("good"), 4)
        self.assertEqual(t.add("goodbye"), 5)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellsink"), 6)

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        try:
            # First session: create the database and store both buffers.
            env = mdb.Env("/tmp/test_rtrie", flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name="_meta_", flags=mdb.MDB_CREATE)
            db.put_raw(txn, "nodes", nodeaddr, nodelen)
            db.put_raw(txn, "kids", kidaddr, kidlen)

            # Read back inside the writing transaction; these results are
            # overwritten by the reads of the second session below.
            n, ns = db.get_raw(txn, "nodes")
            k, ks = db.get_raw(txn, "kids")
            txn.commit()
            env.close()

            # Second session: reopen the file and query the stored buffers.
            env = mdb.Env("/tmp/test_rtrie", flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
            txn = env.begin_txn()
            db = env.open_db(txn, name="_meta_")

            n, ns = db.get_raw(txn, "nodes")
            k, ks = db.get_raw(txn, "kids")
            self.assertEqual(rtrie.vid_for_value(n, k, "hello"), 1)
            self.assertEqual(rtrie.vid_for_value(n, k, "hell"), 2)
            self.assertEqual(rtrie.vid_for_value(n, k, "goodbye"), 5)
            self.assertEqual(rtrie.vid_for_value(n, k, "hellsink"), 6)
            self.assertEqual(rtrie.vid_for_value(n, k, "hellothere"), 3)
            self.assertEqual(rtrie.vid_for_value(n, k, "good"), 4)
            self.assertIsNone(rtrie.vid_for_value(n, k, "notthere"))

            txn.commit()
            env.close()
        finally:
            import os

            # Delete only files that are present.  Unlinking a missing file
            # raises OSError, which would replace the exception of a test
            # that failed before the database files were created.
            for path in ("/tmp/test_rtrie", "/tmp/test_rtrie-lock"):
                if os.path.exists(path):
                    os.unlink(path)
Пример #9
0
    def test_rtrie_in_memory(self):

        s = unicode(u"séllsink").encode("utf-8")
        # print "HELLSINK: %s" % s

        t = Trie()
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hell"), 2)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellothere"), 3)
        self.assertEqual(t.add("good"), 4)
        self.assertEqual(t.add("goodbye"), 5)
        self.assertEqual(t.add("hello"), 1)
        self.assertEqual(t.add("hellsink"), 6)
        self.assertEqual(t.add(s), 7)
        t.print_it()

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        print "LENS %s %s" % (nodelen, kidlen)

        for i in range(8):
            val = rtrie.value_for_vid(nodeaddr, kidaddr, i)
            print "Value", i, val

        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hello"), 1)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hell"), 2)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "goodbye"), 5)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellsink"), 6)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "hellothere"), 3)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, "good"), 4)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7)
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "notthere"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "h"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "he"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hel"))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, "hells"))
Пример #10
0
    def test_rtrie_in_memory(self):

        s = unicode(u'séllsink').encode('utf-8')
        #print "HELLSINK: %s" % s

        t = Trie()
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hell'), 2)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellothere'), 3)
        self.assertEqual(t.add('good'), 4)
        self.assertEqual(t.add('goodbye'), 5)
        self.assertEqual(t.add('hello'), 1)
        self.assertEqual(t.add('hellsink'), 6)
        self.assertEqual(t.add(s), 7)
        t.print_it()

        nodes, kids, _ = t.serialize()
        nodeaddr, nodelen = nodes.buffer_info()
        kidaddr, kidlen = kids.buffer_info()
        print "LENS %s %s" % (nodelen, kidlen)

        for i in range(8):
            val = rtrie.value_for_vid(nodeaddr, kidaddr, i)
            print "Value", i, val

        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hello'), 1)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hell'), 2)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'goodbye'), 5)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellsink'), 6)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellothere'),
                         3)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'good'), 4)
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7)
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'notthere'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'h'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'he'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hel'))
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hells'))