示例#1
0
 def __commit_terms(self, batch_size=64):
     """Write the terms of ``self.id_term_map`` to ``<data_dir>/term.db``.

     Terms are visited by id, from 0 up to ``len(self.term_id_map)``, and
     collected into lists of ``batch_size`` entries. Each list is pickled,
     passed through ``self.compressHC`` and stored under the encoded
     running batch number. A shorter final list is stored the same way.
     """
     store = leveldb.LevelDB("%s/term.db" % self.data_dir)
     flush_at = batch_size - 1
     pending = []
     next_key = 0
     for term_id in range(len(self.term_id_map)):
         pending.append(self.id_term_map[term_id])
         if term_id % batch_size != flush_at:
             continue
         # The batch is full: persist it and start a fresh one.
         blob = self.compressHC(pickle.dumps(pending))
         store.Put(numencode.encode_uint(next_key), blob)
         pending = []
         next_key += 1
     if pending:
         # Leftover terms that did not fill a whole batch.
         blob = self.compressHC(pickle.dumps(pending))
         store.Put(numencode.encode_uint(next_key), blob)
 def __commit_terms(self, batch_size=64):
     """Store the id-ordered term list in ``<data_dir>/term.db`` in chunks.

     Walks term ids 0 .. ``len(self.term_id_map) - 1``, reading each term
     from ``self.id_term_map``. Every ``batch_size`` terms form one chunk,
     which is pickled, run through ``self.compressHC`` and written under
     the encoded chunk number. Any remaining terms form a last chunk.
     """
     term_store = leveldb.LevelDB("%s/term.db" % self.data_dir)

     def put_chunk(chunk_key, chunk):
         # Serialise one chunk and store it under its sequence number.
         payload = self.compressHC(pickle.dumps(chunk))
         term_store.Put(numencode.encode_uint(chunk_key), payload)

     chunk = []
     chunk_key = 0
     term_id = 0
     while term_id < len(self.term_id_map):
         chunk.append(self.id_term_map[term_id])
         if term_id % batch_size == batch_size - 1:
             put_chunk(chunk_key, chunk)
             chunk, chunk_key = [], chunk_key + 1
         term_id += 1
     if chunk:
         put_chunk(chunk_key, chunk)
 def write_terms(self, id_term_map, batch_size=64):
     """Write ``id_term_map`` to ``<data_dir>/term.db`` in batches.

     ``id_term_map`` is indexed by consecutive ids starting at 0. Every
     ``batch_size`` terms are pickled together, passed through
     ``self.compressHC`` and stored under the encoded running batch
     number. A shorter final batch is stored the same way. The number
     of terms is logged when done.
     """
     store = leveldb.LevelDB("%s/term.db" % self.data_dir)
     flush_at = batch_size - 1
     pending = []
     next_key = 0
     for term_id in range(len(id_term_map)):
         pending.append(id_term_map[term_id])
         if term_id % batch_size != flush_at:
             continue
         # The batch is full: persist it and start a fresh one.
         blob = self.compressHC(pickle.dumps(pending))
         store.Put(numencode.encode_uint(next_key), blob)
         pending = []
         next_key += 1
     if pending:
         # Leftover terms that did not fill a whole batch.
         blob = self.compressHC(pickle.dumps(pending))
         store.Put(numencode.encode_uint(next_key), blob)
     logging.info("wrote %d terms" % len(id_term_map))
示例#4
0
 def write_terms(self, id_term_map, batch_size=64):
     """Store the terms of ``id_term_map`` in ``<data_dir>/term.db`` in chunks.

     Term ids 0 .. ``len(id_term_map) - 1`` are read in order. Every
     ``batch_size`` terms form one chunk, which is pickled, run through
     ``self.compressHC`` and written under the encoded chunk number.
     Any remaining terms form a last chunk. The term count is logged
     at the end.
     """
     term_store = leveldb.LevelDB("%s/term.db" % self.data_dir)

     def put_chunk(chunk_key, chunk):
         # Serialise one chunk and store it under its sequence number.
         payload = self.compressHC(pickle.dumps(chunk))
         term_store.Put(numencode.encode_uint(chunk_key), payload)

     chunk = []
     chunk_key = 0
     term_id = 0
     while term_id < len(id_term_map):
         chunk.append(id_term_map[term_id])
         if term_id % batch_size == batch_size - 1:
             put_chunk(chunk_key, chunk)
             chunk, chunk_key = [], chunk_key + 1
         term_id += 1
     if chunk:
         put_chunk(chunk_key, chunk)
     logging.info("wrote %d terms" % len(id_term_map))
 def write_objects(self, id_object_map):
     """Write ``(obj_id, obj)`` pairs to ``<data_dir>/object.db``.

     Each object is converted with ``self.obj_to_str``, passed through
     ``self.compressHC`` and queued under its encoded id. The queued
     entries are written in one synchronous batch. Afterwards the
     object count is logged and ``self.update_objnum(self.objnum)`` is
     called.
     """
     store = leveldb.LevelDB("%s/object.db" % self.data_dir)
     pending = leveldb.WriteBatch()
     for obj_id, obj in id_object_map:
         blob = self.compressHC(self.obj_to_str(obj))
         pending.Put(numencode.encode_uint(obj_id), blob)
     store.Write(pending, sync=True)
     logging.info("wrote %d objects" % len(id_object_map))
     self.update_objnum(self.objnum)
示例#6
0
 def write_objects(self, id_object_map):
     """Persist the given ``(obj_id, obj)`` pairs in ``<data_dir>/object.db``.

     For every pair the object is serialised via ``self.obj_to_str``
     and ``self.compressHC``, then added to a write batch keyed by the
     encoded object id. The batch is committed with ``sync=True``, the
     number of objects is logged, and ``self.update_objnum`` is called
     with ``self.objnum``.
     """
     object_db = leveldb.LevelDB("%s/object.db" % self.data_dir)
     batch = leveldb.WriteBatch()
     for ident, item in id_object_map:
         serialised = self.obj_to_str(item)
         payload = self.compressHC(serialised)
         batch.Put(numencode.encode_uint(ident), payload)
     object_db.Write(batch, sync=True)
     total = len(id_object_map)
     logging.info("wrote %d objects" % total)
     self.update_objnum(self.objnum)
 def __update_arg_index(self):
     """Flush ``self.arg_cache`` into ``<data_dir>/arg.index``.

     For each ``(term_id, plist)`` in the cache, the stored blob for
     that term is looked up. If there is none, the list is encoded with
     ``self.encode_posting_list``; otherwise it is combined with the
     stored blob via ``self.update_posting_list``. All results are
     written in one synchronous batch.
     """
     pending = leveldb.WriteBatch()
     index_db = leveldb.LevelDB("%s/arg.index" % self.data_dir)
     for term_id, postings in self.arg_cache.iteritems():
         key = numencode.encode_uint(term_id)
         try:
             stored = index_db.Get(key)
         except KeyError:
             # No entry for this term yet.
             stored = None
         if stored is not None:
             blob = self.update_posting_list(stored, postings)
         else:
             blob = self.encode_posting_list(postings)
         pending.Put(key, blob)
     index_db.Write(pending, sync=True)
示例#8
0
 def __update_arg_index(self):
     """Merge the cached posting lists into ``<data_dir>/arg.index``.

     Iterates ``self.arg_cache``. A term with no stored blob gets a
     newly encoded posting list. A term with a stored blob gets that
     blob updated with the cached list. The new blobs are committed
     together with ``sync=True``.
     """
     batch = leveldb.WriteBatch()
     arg_db = leveldb.LevelDB("%s/arg.index" % self.data_dir)

     def fetch_existing(key):
         # A missing key is reported as KeyError; map it to None.
         try:
             return arg_db.Get(key)
         except KeyError:
             return None

     for term_id, cached in self.arg_cache.iteritems():
         key = numencode.encode_uint(term_id)
         existing = fetch_existing(key)
         if existing is None:
             batch.Put(key, self.encode_posting_list(cached))
         else:
             batch.Put(key, self.update_posting_list(existing, cached))
     arg_db.Write(batch, sync=True)
 def search(self, rel_type=None, arg_query=()):
     """Return the triples whose posting lists match every queried term.

     Each item of ``arg_query`` may be:
       * a ``(term, pos)`` list or tuple, where ``term`` is a string or
         a term id;
       * a bare string term (position ``-1``);
       * an int term id (position ``-1``).
     A position of ``-1`` means the position is not checked. String
     terms are looked up in ``self.term_id_map`` (unicode is encoded as
     UTF-8 first). Items whose id is ``None`` or is not found in
     ``self.id_term_map`` are dropped from the query.

     Returns ``()`` when no query item remains. Otherwise returns the
     entries of ``self.id_triple_map`` for the ids common to all
     queried terms, limited to those whose first element equals
     ``rel_type`` when ``rel_type`` is given.
     """

     def resolve(text):
         # Map a (possibly unicode) term string to its id, or None.
         if isinstance(text, unicode):
             text = text.encode("utf-8")
         return self.term_id_map.get(text)

     # Phase 1: normalise the query into (term_id, position) pairs.
     pairs = []
     for item in arg_query:
         if isinstance(item, (list, tuple)):
             key, position = item
             wanted = resolve(key) if isinstance(key, basestring) else key
         elif isinstance(item, basestring):
             position = -1
             wanted = resolve(item)
         elif isinstance(item, int):
             wanted, position = item, -1
         else:
             wanted, position = None, -1
         if wanted is None or wanted not in self.id_term_map:
             continue
         pairs.append((wanted, position))

     # Phase 2: intersect the triple ids of all queried terms.
     matched = None
     for wanted, position in pairs:
         try:
             blob = self.arg_index.Get(numencode.encode_uint(wanted))
             postings = self.index.decode_posting_list(blob)
         except KeyError:
             postings = []
         if position != -1:
             postings = [entry for entry in postings if entry[1] == position]
         triple_ids = set([entry[0] for entry in postings])
         if matched is None:
             matched = triple_ids
         else:
             matched &= triple_ids

     if matched is None:
         return ()

     # Phase 3: materialise the triples and apply the optional type filter.
     triples = [self.id_triple_map[triple_id] for triple_id in matched]
     if rel_type is None:
         return triples
     return [triple for triple in triples if triple[0] == rel_type]
示例#10
0
 def search(self, rel_type=None, arg_query=()):
     """Look up triples that contain all of the queried terms.

     ``arg_query`` items are ``(term, pos)`` lists or tuples, plain
     string terms, or int term ids. Plain strings and ints use position
     ``-1``, which disables the position check. Strings are translated
     to ids through ``self.term_id_map`` after UTF-8 encoding unicode
     values. Any item that yields ``None`` or an id not found in
     ``self.id_term_map`` is ignored.

     The result is ``()`` if nothing is left to query. Otherwise it is
     the list of ``self.id_triple_map`` entries for the intersection of
     the per-term triple ids, keeping only triples whose element ``0``
     equals ``rel_type`` unless ``rel_type`` is ``None``.
     """

     def term_to_id(term):
         # Strings go through the term map; unicode is encoded first.
         if isinstance(term, unicode):
             term = term.encode("utf-8")
         return self.term_id_map.get(term)

     query = []
     for arg in arg_query:
         slot = -1
         if isinstance(arg, (list, tuple)):
             term, slot = arg
             if isinstance(term, basestring):
                 ident = term_to_id(term)
             else:
                 ident = term
         elif isinstance(arg, basestring):
             ident = term_to_id(arg)
         elif isinstance(arg, int):
             ident = arg
         else:
             ident = None
         if ident is not None and ident in self.id_term_map:
             query.append((ident, slot))

     hits = None
     for ident, slot in query:
         try:
             encoded = self.arg_index.Get(numencode.encode_uint(ident))
             entries = self.index.decode_posting_list(encoded)
         except KeyError:
             # Nothing is indexed for this term.
             entries = []
         if slot != -1:
             entries = [el for el in entries if el[1] == slot]
         ids = set([el[0] for el in entries])
         if hits is None:
             hits = ids
         else:
             hits &= ids

     if hits is None:
         return ()
     found = [self.id_triple_map[triple_id] for triple_id in hits]
     if rel_type is not None:
         found = [triple for triple in found if triple[0] == rel_type]
     return found
 def update_posting_lists(self, post_lists):
     """Merge ``post_lists`` into ``<data_dir>/plist.index``.

     ``post_lists`` maps term ids to posting lists. A term with no
     stored blob gets a newly encoded list and is counted as new. A
     term with a stored blob is counted as updated and its blob is
     combined with the new list via ``self.update_posting_list``. All
     blobs are written in one synchronous batch and both counters are
     logged.
     """
     store = leveldb.LevelDB("%s/plist.index" % self.data_dir)
     pending = leveldb.WriteBatch()
     refreshed = 0
     created = 0
     for term_id, postings in post_lists.iteritems():
         key = numencode.encode_uint(term_id)
         try:
             previous = store.Get(key)
         except KeyError:
             created += 1
             previous = None
         else:
             refreshed += 1
         if previous is not None:
             blob = self.update_posting_list(previous, postings)
         else:
             blob = self.encode_posting_list(postings)
         pending.Put(key, blob)
     store.Write(pending, sync=True)
     logging.info("updated %d plists, %d new" % (refreshed, created))
示例#12
0
 def update_posting_lists(self, post_lists):
     """Write the given posting lists to ``<data_dir>/plist.index``.

     Iterates ``post_lists`` as ``(term_id, plist)`` pairs. If the
     store raises ``KeyError`` for a term, the list is encoded with
     ``self.encode_posting_list`` and counted as new. Otherwise the
     lookup is counted as an update and the stored blob is updated
     with ``self.update_posting_list``. The batch is committed with
     ``sync=True`` and the two counts are logged.
     """
     plist_db = leveldb.LevelDB("%s/plist.index" % self.data_dir)
     batch = leveldb.WriteBatch()
     counts = {"updated": 0, "new": 0}

     def fetch_existing(key):
         # Look up the stored blob and record the lookup outcome.
         try:
             found = plist_db.Get(key)
         except KeyError:
             counts["new"] += 1
             return None
         counts["updated"] += 1
         return found

     for term_id, plist in post_lists.iteritems():
         key = numencode.encode_uint(term_id)
         existing = fetch_existing(key)
         if existing is None:
             batch.Put(key, self.encode_posting_list(plist))
         else:
             batch.Put(key, self.update_posting_list(existing, plist))
     plist_db.Write(batch, sync=True)
     logging.info("updated %d plists, %d new" % (counts["updated"], counts["new"]))
 def load_object(self, obj_id, obj_store):
     """Fetch the object stored under ``obj_id`` in ``obj_store``.

     The blob found under the encoded id is passed through
     ``self.decompress`` and then ``self.str_to_obj``. Any exception
     raised by ``obj_store.Get`` propagates to the caller.
     """
     blob = obj_store.Get(numencode.encode_uint(obj_id))
     return self.str_to_obj(self.decompress(blob))
 def load_posting_list(self, term_id, plist_store):
     """Fetch and decode the posting list stored for ``term_id``.

     Reads the blob under the encoded term id from ``plist_store`` and
     returns ``self.decode_posting_list`` applied to it. Any exception
     raised by ``plist_store.Get`` propagates to the caller.
     """
     blob = plist_store.Get(numencode.encode_uint(term_id))
     return self.decode_posting_list(blob)
示例#15
0
 def load_object(self, obj_id, obj_store):
     """Read one object from ``obj_store`` by its numeric id.

     The id is encoded with ``numencode.encode_uint`` to form the key.
     The stored blob is decompressed with ``self.decompress`` and
     converted with ``self.str_to_obj``. Lookup errors from the store
     are not caught here.
     """
     key = numencode.encode_uint(obj_id)
     serialised = self.decompress(obj_store.Get(key))
     return self.str_to_obj(serialised)
示例#16
0
 def load_posting_list(self, term_id, plist_store):
     """Read the posting list of ``term_id`` from ``plist_store``.

     The term id is encoded with ``numencode.encode_uint`` to form the
     key. The stored blob is decoded with ``self.decode_posting_list``.
     Lookup errors from the store are not caught here.
     """
     key = numencode.encode_uint(term_id)
     encoded = plist_store.Get(key)
     return self.decode_posting_list(encoded)