def _vid_for_value(self, column, key): if column.is_trie: if column.rtrie_indicator == mdb.MDB_UINT_16: key = rtrie.vid_for_value(self.vid16_nodes, self.vid16_kids, key) else: key = rtrie.vid_for_value(self.vid_nodes, self.vid_kids, key) elif column.is_lz4: key = clz4.compress(key) return key
def test_marble_insert(self):
    """Verify every partition file produced by the insert fixture.

    First checks the overall insert count and the number of partition files.
    Then, for each partition file, checks the entries of the meta db and,
    for every column that has an index db, that each row id is present in
    the bitmap stored under that row's encoded value.  Row ids are counted
    from 1 over ``self.albums`` in order, restricted to the albums whose
    partition value equals the file's partition.
    """
    # general information
    self.assertEqual(self.n_inserted, len(_ALBUMS))
    self.assertEqual(_NPARTITIONS, len(self.files))

    # each partition file must be well formed
    for date, path in self.files.iteritems():
        env, txn, dbs, meta = self.marble._open(path)

        # meta db: the trie buffers exist and the descriptive entries match
        for required in ("_vid_nodes", "_vid_kids", "_vid16_nodes", "_vid16_kids"):
            self.assertTrue(meta.contains(txn, required))
        expected_meta = (
            ("name", "Collections"),
            ("partition", "date"),
            ("fields", _FIELDS),
            ("_pdata", date),
        )
        for meta_key, expected in expected_meta:
            self.assertEqual(meta.get(txn, meta_key), ujson.dumps(expected))

        vid_nodes, _ = meta.get_raw(txn, '_vid_nodes')
        vid_kids, _ = meta.get_raw(txn, '_vid_kids')
        vid16_nodes, _ = meta.get_raw(txn, '_vid16_nodes', (None, 0))
        vid16_kids, _ = meta.get_raw(txn, '_vid16_kids', (None, 0))

        # column dbs and their index dbs
        for name, (_db, ind_db, _, column, _) in dbs.iteritems():
            if name == "_count":
                continue
            bitmaps = {}  # encoded value -> BitSet loaded from the index db
            in_partition = (
                album for album in self.albums if date == album[_PARTITIONS]
            )
            # Row ids restart at 1 for every column.
            for row_id, album in enumerate(in_partition, 1):
                value = album[name]
                if not column.is_trie:
                    val = clz4.compress(value) if column.is_lz4 else value
                elif column.rtrie_indicator == mdb.MDB_UINT_16:
                    val = rtrie.vid_for_value(vid16_nodes, vid16_kids, value)
                else:
                    val = rtrie.vid_for_value(vid_nodes, vid_kids, value)
                # NOTE: the direct comparison of the column db content
                # (db.get(txn, row_id) == val) is disabled in this test.
                if ind_db is None:
                    continue
                # the row id should be in the bitmap too
                bitmap = bitmaps.get(val)
                if bitmap is None:
                    bitmap = BitSet()
                    bitmap.loads(ind_db.get(txn, val))
                    bitmaps[val] = bitmap
                self.assertTrue(row_id in bitmap)

        txn.commit()
        env.close()
def test_marble_insert(self):
    """Check the partition files written by the insert fixture.

    Asserts the global insert and partition counts, then opens each
    partition file and checks its meta db entries.  For every column with
    an index db it confirms that the row id of each album in the partition
    is a member of the bitmap stored under the album's encoded column
    value.  Row ids are numbered from 1 in ``self.albums`` order within the
    partition, independently for each column.
    """
    # test general information
    self.assertEqual(self.n_inserted, len(_ALBUMS))
    self.assertEqual(_NPARTITIONS, len(self.files))

    # test that each sub db is fine
    for date, partition_file in self.files.iteritems():
        env, txn, dbs, meta = self.marble._open(partition_file)

        # check meta db
        self.assertTrue(meta.contains(txn, "_vid_nodes"))
        self.assertTrue(meta.contains(txn, "_vid_kids"))
        self.assertTrue(meta.contains(txn, "_vid16_nodes"))
        self.assertTrue(meta.contains(txn, "_vid16_kids"))
        self.assertEqual(meta.get(txn, "name"), ujson.dumps("Collections"))
        self.assertEqual(meta.get(txn, "partition"), ujson.dumps("date"))
        self.assertEqual(meta.get(txn, "fields"), ujson.dumps(_FIELDS))
        self.assertEqual(meta.get(txn, "_pdata"), ujson.dumps(date))

        vid_nodes, _ = meta.get_raw(txn, '_vid_nodes')
        vid_kids, _ = meta.get_raw(txn, '_vid_kids')
        vid16_nodes, _ = meta.get_raw(txn, '_vid16_nodes', (None, 0))
        vid16_kids, _ = meta.get_raw(txn, '_vid16_kids', (None, 0))

        # check subdb, subinddb
        for name, (_db, ind_db, _, column, _) in dbs.iteritems():
            if name == "_count":
                continue

            loaded = {}  # encoded value -> BitSet read from the index db
            row_id = 0
            for album in self.albums:
                if date != album[_PARTITIONS]:
                    continue  # album belongs to another partition
                row_id += 1
                raw = album[name]

                # Encode the value the same way the column stores it.
                if column.is_trie:
                    use_16bit = column.rtrie_indicator == mdb.MDB_UINT_16
                    if use_16bit:
                        val = rtrie.vid_for_value(vid16_nodes, vid16_kids, raw)
                    else:
                        val = rtrie.vid_for_value(vid_nodes, vid_kids, raw)
                elif column.is_lz4:
                    val = clz4.compress(raw)
                else:
                    val = raw

                # NOTE: comparing db.get(txn, row_id) with val is disabled
                # in this test.
                if ind_db is None:
                    continue

                # row_id should be in bitmap too
                try:
                    bitmap = loaded[val]
                except KeyError:
                    bitmap = BitSet()
                    bitmap.loads(ind_db.get(txn, val))
                    loaded[val] = bitmap
                self.assertTrue(row_id in bitmap)

        txn.commit()
        env.close()
def test_rtrie_in_mdb(self):
    """Round-trip a serialized trie through an MDB file and query it.

    Builds a small ``Trie``, serializes it, stores the node and kid buffers
    in the ``_meta_`` db of an MDB environment with ``put_raw``, then
    reopens the environment and checks that ``rtrie.vid_for_value``
    resolves every value against the buffers returned by ``get_raw``.

    The environment file and its ``-lock`` companion are removed afterwards.
    A file that is already missing is tolerated, so a failure that happens
    before the files exist is reported as itself rather than being replaced
    by an ``OSError`` from the cleanup.
    """
    import errno
    import os

    path = '/tmp/test_rtrie'

    # (value, expected id) in insertion order; re-adding returns the first id
    insertions = (
        ('hello', 1), ('hell', 2), ('hello', 1), ('hellothere', 3),
        ('good', 4), ('goodbye', 5), ('hello', 1), ('hellsink', 6),
    )
    lookups = (
        ('hello', 1), ('hell', 2), ('goodbye', 5),
        ('hellsink', 6), ('hellothere', 3), ('good', 4),
    )

    t = Trie()
    for word, expected_vid in insertions:
        self.assertEqual(t.add(word), expected_vid)

    nodes, kids, _ = t.serialize()
    nodeaddr, nodelen = nodes.buffer_info()
    kidaddr, kidlen = kids.buffer_info()
    try:
        env = mdb.Env(path, flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name='_meta_', flags=mdb.MDB_CREATE)
        db.put_raw(txn, 'nodes', nodeaddr, nodelen)
        db.put_raw(txn, 'kids', kidaddr, kidlen)
        # Read back inside the writing transaction; only the calls matter,
        # the results are not used.
        db.get_raw(txn, 'nodes')
        db.get_raw(txn, 'kids')
        txn.commit()
        env.close()

        # Second pass: reopen (this time without MDB_WRITEMAP) and query.
        env = mdb.Env(path, flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name='_meta_')
        n, _ = db.get_raw(txn, 'nodes')
        k, _ = db.get_raw(txn, 'kids')
        for word, expected_vid in lookups:
            self.assertEqual(rtrie.vid_for_value(n, k, word), expected_vid)
        self.assertIsNone(rtrie.vid_for_value(n, k, 'notthere'))
        txn.commit()
        env.close()
    finally:
        # Attempt both files; only "no such file" is ignored.
        for leftover in (path, path + '-lock'):
            try:
                os.unlink(leftover)
            except OSError as err:
                if err.errno != errno.ENOENT:
                    raise
def test_stress_wtrie(self):
    """Stress ``Trie.add``: re-adding a token must return its first id.

    Every line of the ``fixture`` file is added to one trie as a whole and,
    split on single spaces, word by word to another.  The id returned for a
    token is remembered the first time it is seen and compared with the id
    returned on every later ``add`` of the same token.

    NOTE(review): ``exchange_ids`` is never populated in this method, so the
    final loop body does not execute as written; ``etrie`` is also
    serialized before anything is added to it.  Confirm whether a third
    fixture column was meant to feed it.
    """
    ktrie = Trie()
    strie = Trie()
    etrie = Trie()
    keywords = {}
    search_terms = {}
    exchange_ids = {}

    def check_stable_vid(trie, seen, token):
        # Add token and compare its id with the one recorded on first sight.
        vid = trie.add(token)
        known_vid = seen.get(token)
        if known_vid is not None:
            self.assertEqual(vid, known_vid)
        else:
            seen[token] = vid

    with open(fixture) as f:
        for data in f:
            for word in data.split(' '):
                check_stable_vid(ktrie, keywords, word)
            check_stable_vid(strie, search_terms, data)

    # Only the buffer addresses are used below; the lengths are ignored.
    nodes, kids, _nodelen = etrie.serialize()
    naddr, _nlen = nodes.buffer_info()
    kaddr, _klen = kids.buffer_info()

    for dc, vid in exchange_ids.iteritems():
        rvid = etrie.add(dc)
        self.assertEqual(vid, rvid)
        # Single-argument parenthesised print: same output as a Python 2
        # print statement.
        print("%s %s" % (dc, vid))
        value = value_for_vid(naddr, kaddr, vid)
        # The diagnostic used to be printed after the assertion and could
        # never be reached on a mismatch; it is now the failure message.
        self.assertEqual(dc, value, " dc=%s adc=%s" % (dc, value))
        avid = vid_for_value(naddr, kaddr, dc)
        self.assertEqual(vid, avid)
def test_stress_wtrie(self):
    """Check that ``Trie.add`` hands out stable ids under a large workload.

    Reads the ``fixture`` file line by line.  Each word (split on a single
    space) goes into a keyword trie and each full line into a search-term
    trie; whenever a token is seen again, the id returned by ``add`` must
    equal the id recorded on first sight.

    NOTE(review): nothing in this method ever fills ``exchange_ids``, so the
    last loop is currently dead code, and ``etrie`` is serialized while
    still empty.  Verify the intended data source.
    """
    ktrie = Trie()
    strie = Trie()
    etrie = Trie()
    keywords = {}
    search_terms = {}
    exchange_ids = {}

    def add_and_verify(trie, first_seen, token):
        # Record the id on first sight, otherwise require the same id again.
        vid = trie.add(token)
        actual_vid = first_seen.get(token)
        if actual_vid is None:
            first_seen[token] = vid
            return
        self.assertEqual(vid, actual_vid)

    with open(fixture) as f:
        for data in f:
            for word in data.split(" "):
                add_and_verify(ktrie, keywords, word)
            add_and_verify(strie, search_terms, data)

    # Lengths returned alongside the buffers are not needed here.
    nodes, kids, _ = etrie.serialize()
    naddr = nodes.buffer_info()[0]
    kaddr = kids.buffer_info()[0]

    for dc, vid in exchange_ids.iteritems():
        self.assertEqual(vid, etrie.add(dc))
        # Parenthesised single-argument form prints the same text as the
        # Python 2 statement form.
        print("%s %s" % (dc, vid))
        value = value_for_vid(naddr, kaddr, vid)
        # Carry the mismatch details in the assertion message; a print
        # placed after the assertion could never run on failure.
        self.assertEqual(dc, value, " dc=%s adc=%s" % (dc, value))
        self.assertEqual(vid, vid_for_value(naddr, kaddr, dc))
def test_rtrie_in_mdb(self):
    """Store a serialized trie in an MDB file, reopen it and look values up.

    The node and kid buffers of a small ``Trie`` are written to the
    ``_meta_`` db with ``put_raw``.  After the environment is closed and
    reopened, the buffers obtained from ``get_raw`` are queried with
    ``rtrie.vid_for_value`` and must yield the ids handed out by ``add``;
    an unknown value must yield ``None``.

    Cleanup removes the environment file and its ``-lock`` file.  Missing
    files are ignored, so an error raised before they were created is not
    replaced by an ``OSError`` from the cleanup itself.
    """
    import errno
    import os

    def remove_if_present(filename):
        # Delete filename; a file that does not exist is not an error.
        try:
            os.unlink(filename)
        except OSError as exc:
            if exc.errno != errno.ENOENT:
                raise

    env_path = "/tmp/test_rtrie"
    lock_path = env_path + "-lock"

    t = Trie()
    self.assertEqual(t.add("hello"), 1)
    self.assertEqual(t.add("hell"), 2)
    self.assertEqual(t.add("hello"), 1)
    self.assertEqual(t.add("hellothere"), 3)
    self.assertEqual(t.add("good"), 4)
    self.assertEqual(t.add("goodbye"), 5)
    self.assertEqual(t.add("hello"), 1)
    self.assertEqual(t.add("hellsink"), 6)

    nodes, kids, _ = t.serialize()
    nodeaddr, nodelen = nodes.buffer_info()
    kidaddr, kidlen = kids.buffer_info()
    try:
        # Write phase.
        env = mdb.Env(env_path, flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name="_meta_", flags=mdb.MDB_CREATE)
        db.put_raw(txn, "nodes", nodeaddr, nodelen)
        db.put_raw(txn, "kids", kidaddr, kidlen)
        # Exercise get_raw before the commit; the values are not needed.
        db.get_raw(txn, "nodes")
        db.get_raw(txn, "kids")
        txn.commit()
        env.close()

        # Read phase: the environment is reopened without MDB_WRITEMAP.
        env = mdb.Env(env_path, flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR)
        txn = env.begin_txn()
        db = env.open_db(txn, name="_meta_")
        n, _ = db.get_raw(txn, "nodes")
        k, _ = db.get_raw(txn, "kids")
        self.assertEqual(rtrie.vid_for_value(n, k, "hello"), 1)
        self.assertEqual(rtrie.vid_for_value(n, k, "hell"), 2)
        self.assertEqual(rtrie.vid_for_value(n, k, "goodbye"), 5)
        self.assertEqual(rtrie.vid_for_value(n, k, "hellsink"), 6)
        self.assertEqual(rtrie.vid_for_value(n, k, "hellothere"), 3)
        self.assertEqual(rtrie.vid_for_value(n, k, "good"), 4)
        self.assertIsNone(rtrie.vid_for_value(n, k, "notthere"))
        txn.commit()
        env.close()
    finally:
        # Nested so the lock file is attempted even if the first removal
        # raises.
        try:
            remove_if_present(env_path)
        finally:
            remove_if_present(lock_path)
def test_rtrie_in_memory(self):
    """Query a serialized trie directly from its in-memory buffers.

    Adds a handful of values (including one UTF-8 encoded non-ASCII value)
    to a ``Trie``, serializes it, prints the buffer lengths and the value
    for ids 0 through 7, then checks ``rtrie.vid_for_value`` for every
    added value.  Unknown values and strict prefixes that were never added
    must resolve to ``None``.
    """
    # A unicode literal is already unicode, so it is encoded directly.
    s = u"séllsink".encode("utf-8")

    t = Trie()
    add = t.add
    self.assertEqual(add("hello"), 1)
    self.assertEqual(add("hell"), 2)
    self.assertEqual(add("hello"), 1)
    self.assertEqual(add("hellothere"), 3)
    self.assertEqual(add("good"), 4)
    self.assertEqual(add("goodbye"), 5)
    self.assertEqual(add("hello"), 1)
    self.assertEqual(add("hellsink"), 6)
    self.assertEqual(add(s), 7)
    t.print_it()

    nodes, kids, _ = t.serialize()
    nodeaddr, nodelen = nodes.buffer_info()
    kidaddr, kidlen = kids.buffer_info()

    # Single-argument parenthesised prints emit the same text as the
    # Python 2 statement form.
    print("LENS %s %s" % (nodelen, kidlen))
    for vid in range(8):
        print("Value %s %s" % (vid, rtrie.value_for_vid(nodeaddr, kidaddr, vid)))

    def vid_of(word):
        return rtrie.vid_for_value(nodeaddr, kidaddr, word)

    self.assertEqual(vid_of("hello"), 1)
    self.assertEqual(vid_of("hell"), 2)
    self.assertEqual(vid_of("goodbye"), 5)
    self.assertEqual(vid_of("hellsink"), 6)
    self.assertEqual(vid_of("hellothere"), 3)
    self.assertEqual(vid_of("good"), 4)
    self.assertEqual(vid_of(s), 7)
    self.assertIsNone(vid_of("notthere"))
    self.assertIsNone(vid_of("h"))
    self.assertIsNone(vid_of("he"))
    self.assertIsNone(vid_of("hel"))
    self.assertIsNone(vid_of("hells"))
def test_rtrie_in_memory(self):
    """Exercise ``rtrie`` lookups against a freshly serialized ``Trie``.

    The trie is filled from a table of (value, expected id) pairs, printed,
    and serialized.  The buffer lengths and the values for ids 0-7 are
    printed, then every stored value must map back to its id and every
    value that was never added (including bare prefixes of stored values)
    must map to ``None``.
    """
    # Encoding the unicode literal directly gives the same byte string.
    s = u'séllsink'.encode('utf-8')

    # Insertion order matters: ids are handed out on first sight.
    insertions = [
        ('hello', 1), ('hell', 2), ('hello', 1), ('hellothere', 3),
        ('good', 4), ('goodbye', 5), ('hello', 1), ('hellsink', 6),
        (s, 7),
    ]
    present = [
        ('hello', 1), ('hell', 2), ('goodbye', 5), ('hellsink', 6),
        ('hellothere', 3), ('good', 4), (s, 7),
    ]
    absent = ['notthere', 'h', 'he', 'hel', 'hells']

    t = Trie()
    for word, expected_vid in insertions:
        self.assertEqual(t.add(word), expected_vid)
    t.print_it()

    nodes, kids, _ = t.serialize()
    nodeaddr, nodelen = nodes.buffer_info()
    kidaddr, kidlen = kids.buffer_info()

    # Parenthesised single-argument prints produce the same output as the
    # Python 2 print statement.
    print("LENS %s %s" % (nodelen, kidlen))
    for i in range(8):
        val = rtrie.value_for_vid(nodeaddr, kidaddr, i)
        print("Value %s %s" % (i, val))

    for word, expected_vid in present:
        self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, word), expected_vid)
    for word in absent:
        self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, word))