class IndexStore(object): """Index metadata and provide rich query facilities on it. """ def __init__(self): self._database = None self._flush_timeout = None self._pending_writes = 0 root_path=layoutmanager.get_instance().get_root_path() self._index_updated_path = os.path.join(root_path, 'index_updated') self._std_index_path = layoutmanager.get_instance().get_index_path() self._index_path = self._std_index_path # Create an instance of ERS self._ers = ERS() def open_index(self, temp_path=False): # callers to open_index must be able to # handle an exception -- usually caused by # IO errors such as ENOSPC and retry putting # the index on a temp_path if temp_path: try: # mark the on-disk index stale self._set_index_updated(False) except: pass self._index_path = temp_path else: self._index_path = self._std_index_path try: self._database = WritableDatabase(self._index_path, xapian.DB_CREATE_OR_OPEN) except Exception as e: logging.error('Exception opening database') raise def close_index(self): """Close index database if it is open.""" if not self._database: return self._flush(True) try: # does Xapian write in its destructors? 
self._database = None except Exception as e: logging.error('Exception tearing down database') raise def remove_index(self): if not os.path.exists(self._index_path): return for f in os.listdir(self._index_path): os.remove(os.path.join(self._index_path, f)) def contains(self, uid): ''' Check if there is a journal entry with the given UID ''' # Name of the entry entity_name = layoutmanager.get_instance().get_entity_name(uid) # Tells if the UID is in the store or not return self._ers.contains_entity(entity_name) def store(self, uid, properties): ''' Add a document to the index ''' document = Document() document.add_value(_VALUE_UID, uid) term_generator = TermGenerator() term_generator.index_document(document, properties) if not self.contains(uid): self._database.add_document(document) else: self._database.replace_document(_PREFIX_FULL_VALUE + \ _PREFIX_UID + uid, document) self._flush(True) def find(self, query): ''' Get a list of UIDs matching a query ''' logging.warn("Query " + str(query)) offset = query.pop('offset', 0) limit = query.pop('limit', MAX_QUERY_LIMIT) order_by = query.pop('order_by', []) query_string = query.pop('query', None) query_parser = QueryParser() query_parser.set_database(self._database) parsed_query = query_parser.parse_query(query, query_string) logging.warn("Parsed query " + str(parsed_query)) # This will assure that the results count is exact. 
check_at_least = offset + limit + 1 # TODO : Implement order by logging.warn('Unsupported property for sorting: %s', order_by) results = self._ers.search('uid', 'blah') logging.warn("Found " + str(results)) total_count = 0 uids = [] return (uids, total_count) def delete(self, uid): self._database.delete_document(_PREFIX_FULL_VALUE + _PREFIX_UID + uid) self._flush(True) def get_activities(self): activities = [] prefix = _PREFIX_FULL_VALUE + _PREFIX_ACTIVITY for term in self._database.allterms(prefix): activities.append(term.term[len(prefix):]) return activities def flush(self): self._flush(True) def get_index_updated(self): return os.path.exists(self._index_updated_path) index_updated = property(get_index_updated) def _set_index_updated(self, index_updated): if self._std_index_path != self._index_path: # operating from tmpfs return True if index_updated != self.index_updated: if index_updated: index_updated_file = open(self._index_updated_path, 'w') # index_updated = True will happen every # indexstore._FLUSH_TIMEOUT seconds, so it is ok to fsync os.fsync(index_updated_file.fileno()) index_updated_file.close() else: os.remove(self._index_updated_path) def _flush_timeout_cb(self): self._flush(True) return False def _flush(self, force=False): """Called after any database mutation""" logging.debug('IndexStore.flush: force=%r _pending_writes=%r', force, self._pending_writes) self._set_index_updated(False) if self._flush_timeout is not None: GObject.source_remove(self._flush_timeout) self._flush_timeout = None self._pending_writes += 1 if force or self._pending_writes > _FLUSH_THRESHOLD: try: logging.debug("Start database flush") self._database.flush() logging.debug("Completed database flush") except Exception, e: logging.exception(e) logging.error("Exception during database.flush()") # bail out to trigger a reindex sys.exit(1) self._pending_writes = 0 self._set_index_updated(True) else:
class IndexStore(object): """Index metadata and provide rich query facilities on it. """ def __init__(self): self._database = None self._flush_timeout = None self._pending_writes = 0 root_path = layoutmanager.get_instance().get_root_path() self._index_updated_path = os.path.join(root_path, 'index_updated') self._std_index_path = layoutmanager.get_instance().get_index_path() self._index_path = self._std_index_path def open_index(self, temp_path=False): # callers to open_index must be able to # handle an exception -- usually caused by # IO errors such as ENOSPC and retry putting # the index on a temp_path if temp_path: try: # mark the on-disk index stale self._set_index_updated(False) except: pass self._index_path = temp_path else: self._index_path = self._std_index_path try: self._database = WritableDatabase(self._index_path, xapian.DB_CREATE_OR_OPEN) except Exception as e: logging.error('Exception opening database') raise def close_index(self): """Close index database if it is open.""" if not self._database: return self._flush(True) try: # does Xapian write in its destructors? 
self._database = None except Exception as e: logging.error('Exception tearing down database') raise def remove_index(self): if not os.path.exists(self._index_path): return for f in os.listdir(self._index_path): os.remove(os.path.join(self._index_path, f)) def contains(self, uid): postings = self._database.postlist(_PREFIX_FULL_VALUE + \ _PREFIX_UID + uid) try: __ = postings.next() except StopIteration: return False return True def store(self, uid, properties): document = Document() document.add_value(_VALUE_UID, uid) term_generator = TermGenerator() term_generator.index_document(document, properties) if not self.contains(uid): self._database.add_document(document) else: self._database.replace_document(_PREFIX_FULL_VALUE + \ _PREFIX_UID + uid, document) self._flush(True) def find(self, query): offset = query.pop('offset', 0) limit = query.pop('limit', MAX_QUERY_LIMIT) order_by = query.pop('order_by', []) query_string = query.pop('query', None) query_parser = QueryParser() query_parser.set_database(self._database) enquire = Enquire(self._database) enquire.set_query(query_parser.parse_query(query, query_string)) # This will assure that the results count is exact. 
check_at_least = offset + limit + 1 if not order_by: order_by = '+timestamp' else: order_by = order_by[0] if order_by == '+timestamp': enquire.set_sort_by_value(_VALUE_TIMESTAMP, True) elif order_by == '-timestamp': enquire.set_sort_by_value(_VALUE_TIMESTAMP, False) elif order_by == '+title': enquire.set_sort_by_value(_VALUE_TITLE, True) elif order_by == '-title': enquire.set_sort_by_value(_VALUE_TITLE, False) elif order_by == '+filesize': enquire.set_sort_by_value(_VALUE_FILESIZE, True) elif order_by == '-filesize': enquire.set_sort_by_value(_VALUE_FILESIZE, False) elif order_by == '+creation_time': enquire.set_sort_by_value(_VALUE_CREATION_TIME, True) elif order_by == '-creation_time': enquire.set_sort_by_value(_VALUE_CREATION_TIME, False) else: logging.warning('Unsupported property for sorting: %s', order_by) query_result = enquire.get_mset(offset, limit, check_at_least) total_count = query_result.get_matches_estimated() uids = [] for hit in query_result: uids.append(hit.document.get_value(_VALUE_UID)) return (uids, total_count) def delete(self, uid): self._database.delete_document(_PREFIX_FULL_VALUE + _PREFIX_UID + uid) self._flush(True) def get_activities(self): activities = [] prefix = _PREFIX_FULL_VALUE + _PREFIX_ACTIVITY for term in self._database.allterms(prefix): activities.append(term.term[len(prefix):]) return activities def flush(self): self._flush(True) def get_index_updated(self): return os.path.exists(self._index_updated_path) index_updated = property(get_index_updated) def _set_index_updated(self, index_updated): if self._std_index_path != self._index_path: # operating from tmpfs return True if index_updated != self.index_updated: if index_updated: index_updated_file = open(self._index_updated_path, 'w') # index_updated = True will happen every # indexstore._FLUSH_TIMEOUT seconds, so it is ok to fsync os.fsync(index_updated_file.fileno()) index_updated_file.close() else: os.remove(self._index_updated_path) def _flush_timeout_cb(self): 
self._flush(True) return False def _flush(self, force=False): """Called after any database mutation""" logging.debug('IndexStore.flush: force=%r _pending_writes=%r', force, self._pending_writes) self._set_index_updated(False) if self._flush_timeout is not None: GLib.source_remove(self._flush_timeout) self._flush_timeout = None self._pending_writes += 1 if force or self._pending_writes > _FLUSH_THRESHOLD: try: logging.debug("Start database flush") self._database.flush() logging.debug("Completed database flush") except Exception, e: logging.exception(e) logging.error("Exception during database.flush()") # bail out to trigger a reindex sys.exit(1) self._pending_writes = 0 self._set_index_updated(True) else:
class IndexStore(object): """Index metadata and provide rich query facilities on it. """ def __init__(self): self._database = None self._flush_timeout = None self._pending_writes = 0 root_path=layoutmanager.get_instance().get_root_path() self._index_updated_path = os.path.join(root_path, 'index_updated') self._std_index_path = layoutmanager.get_instance().get_index_path() self._index_path = self._std_index_path def open_index(self, temp_path=False): # callers to open_index must be able to # handle an exception -- usually caused by # IO errors such as ENOSPC and retry putting # the index on a temp_path if temp_path: try: # mark the on-disk index stale self._set_index_updated(False) except: pass self._index_path = temp_path else: self._index_path = self._std_index_path try: self._database = WritableDatabase(self._index_path, xapian.DB_CREATE_OR_OPEN) except Exception as e: logging.error('Exception opening database') raise def close_index(self): """Close index database if it is open.""" if not self._database: return self._flush(True) try: # does Xapian write in its destructors? 
self._database = None except Exception as e: logging.error('Exception tearing down database') raise def remove_index(self): if not os.path.exists(self._index_path): return for f in os.listdir(self._index_path): os.remove(os.path.join(self._index_path, f)) def contains(self, uid): postings = self._database.postlist(_PREFIX_FULL_VALUE + \ _PREFIX_UID + uid) try: __ = postings.next() except StopIteration: return False return True def store(self, uid, properties): document = Document() document.add_value(_VALUE_UID, uid) term_generator = TermGenerator() term_generator.index_document(document, properties) if not self.contains(uid): self._database.add_document(document) else: self._database.replace_document(_PREFIX_FULL_VALUE + \ _PREFIX_UID + uid, document) self._flush(True) def find(self, query): offset = query.pop('offset', 0) limit = query.pop('limit', MAX_QUERY_LIMIT) order_by = query.pop('order_by', []) query_string = query.pop('query', None) query_parser = QueryParser() query_parser.set_database(self._database) enquire = Enquire(self._database) enquire.set_query(query_parser.parse_query(query, query_string)) # This will assure that the results count is exact. 
check_at_least = offset + limit + 1 if not order_by: order_by = '+timestamp' else: order_by = order_by[0] if order_by == '+timestamp': enquire.set_sort_by_value(_VALUE_TIMESTAMP, True) elif order_by == '-timestamp': enquire.set_sort_by_value(_VALUE_TIMESTAMP, False) elif order_by == '+title': enquire.set_sort_by_value(_VALUE_TITLE, True) elif order_by == '-title': enquire.set_sort_by_value(_VALUE_TITLE, False) elif order_by == '+filesize': enquire.set_sort_by_value(_VALUE_FILESIZE, True) elif order_by == '-filesize': enquire.set_sort_by_value(_VALUE_FILESIZE, False) elif order_by == '+creation_time': enquire.set_sort_by_value(_VALUE_CREATION_TIME, True) elif order_by == '-creation_time': enquire.set_sort_by_value(_VALUE_CREATION_TIME, False) else: logging.warning('Unsupported property for sorting: %s', order_by) query_result = enquire.get_mset(offset, limit, check_at_least) total_count = query_result.get_matches_estimated() uids = [] for hit in query_result: uids.append(hit.document.get_value(_VALUE_UID)) return (uids, total_count) def delete(self, uid): self._database.delete_document(_PREFIX_FULL_VALUE + _PREFIX_UID + uid) self._flush(True) def get_activities(self): activities = [] prefix = _PREFIX_FULL_VALUE + _PREFIX_ACTIVITY for term in self._database.allterms(prefix): activities.append(term.term[len(prefix):]) return activities def flush(self): self._flush(True) def get_index_updated(self): return os.path.exists(self._index_updated_path) index_updated = property(get_index_updated) def _set_index_updated(self, index_updated): if self._std_index_path != self._index_path: # operating from tmpfs return True if index_updated != self.index_updated: if index_updated: index_updated_file = open(self._index_updated_path, 'w') # index_updated = True will happen every # indexstore._FLUSH_TIMEOUT seconds, so it is ok to fsync os.fsync(index_updated_file.fileno()) index_updated_file.close() else: os.remove(self._index_updated_path) def _flush_timeout_cb(self): 
self._flush(True) return False def _flush(self, force=False): """Called after any database mutation""" logging.debug('IndexStore.flush: force=%r _pending_writes=%r', force, self._pending_writes) self._set_index_updated(False) if self._flush_timeout is not None: GObject.source_remove(self._flush_timeout) self._flush_timeout = None self._pending_writes += 1 if force or self._pending_writes > _FLUSH_THRESHOLD: try: logging.debug("Start database flush") self._database.flush() logging.debug("Completed database flush") except Exception, e: logging.exception(e) logging.error("Exception during database.flush()") # bail out to trigger a reindex sys.exit(1) self._pending_writes = 0 self._set_index_updated(True) else:
class Catalog(object):
    """Catalog of documents kept in a xapian database.

    Per-field bookkeeping lives in ``self._metadata``, a dict mapping a field
    name to an info dict that may hold:

    - ``'value'``: the value slot number, present when the field is stored;
    - ``'prefix'``: the term prefix, present when the field is indexed;
    - ``'from'``: for a per-language field (``<name>_<language>``), the name
      of the field it derives from.

    That dict is serialized with ``dumps`` and saved under the ``'metadata'``
    key of the database.
    """

    # Number of index/unindex calls made on this catalog
    nb_changes = 0
    # CatalogLogger instance, only created when the catalog is opened by path
    logger = None
    # The xapian database, None once the catalog has been closed
    _db = None
    read_only = False

    def __init__(self, ref, fields, read_only=False, asynchronous_mode=True):
        """Open the catalog.

        ref -- an already opened Database/WritableDatabase, or a reference
               that ``lfs.get_absolute_path`` resolves to the database path
        fields -- dict mapping field names to field classes
        read_only -- open the database read-only (only used when 'ref' is
                     not already a database object)
        asynchronous_mode -- when true (and not read-only) a transaction is
                     opened right away; save_changes/abort_changes then
                     delimit the following ones
        """
        self.read_only = read_only
        # Load the database
        if isinstance(ref, (Database, WritableDatabase)):
            path = None
            self._db = ref
        else:
            path = lfs.get_absolute_path(ref)
            if read_only:
                self._db = Database(path)
            else:
                self._db = WritableDatabase(path, DB_OPEN)

        db = self._db
        self._asynchronous = asynchronous_mode
        self._fields = fields
        # FIXME: There's a bug in xapian:
        # We cannot get stored values if DB not flushed
        self.commit_each_transaction = True
        # Asynchronous mode
        if not read_only and asynchronous_mode:
            db.begin_transaction(self.commit_each_transaction)
        # Set XAPIAN_FLUSH_THRESHOLD
        # NOTE(review): this is set after the database has been opened;
        # confirm xapian still takes it into account at this point.
        os.environ["XAPIAN_FLUSH_THRESHOLD"] = "2000"
        # Load the xfields from the database
        self._metadata = {}
        self._value_nb = 0
        self._prefix_nb = 0
        self._load_all_internal()
        if not read_only:
            self._init_all_metadata()
        # Catalog log (only when we know where the catalog lives)
        if path:
            catalog_log = '{}/catalog.log'.format(path)
            self.logger = CatalogLogger(catalog_log)
            register_logger(self.logger, 'itools.catalog')

    def _init_all_metadata(self):
        """Init new metadata (to avoid 'field is not indexed' warning)

        Registers every field of ``self._fields`` missing from the metadata,
        and completes the info of fields that became stored or indexed.
        The metadata is saved only when something changed.
        """
        has_changes = False
        metadata = self._metadata
        for name, field_cls in self._fields.items():
            if name not in metadata:
                print('[Catalog] New field registered: {0}'.format(name))
                has_changes = True
                metadata[name] = self._get_info(field_cls, name)
                continue
            # If the field was in the catalog but is newly stored
            if ('value' not in metadata[name]
                    and getattr(field_cls, 'stored', False)):
                print('[Catalog] Indexed field is now stored: {0}'.format(name))
                has_changes = True
                metadata[name] = merge_dicts(
                    metadata[name], self._get_info_stored())
            # If the field was stored in the catalog but is newly indexed
            if ('prefix' not in metadata[name]
                    and getattr(field_cls, 'indexed', False)):
                print('[Catalog] Stored field is now indexed: {0}'.format(name))
                has_changes = True
                metadata[name] = merge_dicts(
                    metadata[name], self._get_info_indexed())
        if not has_changes:
            return
        self._db.set_metadata('metadata', dumps(metadata))
        if self._asynchronous:
            # A transaction was opened by __init__: close it and open a
            # fresh one
            self._db.commit_transaction()
            self._db.begin_transaction(self.commit_each_transaction)
        else:
            # No transaction is open in synchronous mode, so
            # commit_transaction() would fail; commit directly instead
            self._db.commit()

    #######################################################################
    # API / Public / Transactions
    #######################################################################
    def save_changes(self):
        """Save the last changes to disk.

        Raises ValueError when the catalog is not in asynchronous mode.
        """
        if not self._asynchronous:
            raise ValueError("The transactions are synchronous")
        db = self._db
        db.commit_transaction()
        # FIXME: There's a bug in xapian:
        # We cannot get stored values if DB not flushed, so we commit to
        # disk every time instead of only once 'nb_changes' gets large
        # (that strategy is not working since cancel_transaction() cancels
        # all the transactions not commited to disk).
        db.commit()
        db.begin_transaction(self.commit_each_transaction)

    def abort_changes(self):
        """Abort the last changes made in memory.

        Raises ValueError when the catalog is not in asynchronous mode, and
        NotImplementedError when 'commit_each_transaction' is false.
        """
        if not self._asynchronous:
            raise ValueError("The transactions are synchronous")
        if not self.commit_each_transaction:
            raise NotImplementedError
        db = self._db
        db.cancel_transaction()
        db.begin_transaction(self.commit_each_transaction)
        # The metadata in memory may hold fields registered by the aborted
        # changes: reload it from the database
        self._load_all_internal()

    def close(self):
        """Close the database; calling it again only prints a message.

        The logger, if any, is cleared, except for read-only catalogs.
        """
        if self._db is None:
            msg = 'Catalog is already closed'
            print(msg)
            return
        if self.read_only:
            self._db.close()
            self._db = None
            return
        if self.commit_each_transaction:
            # Drop the pending transaction; the database is closed whether
            # or not this works
            try:
                self._db.cancel_transaction()
            except Exception:
                print('Warning: cannot cancel xapian transaction')
            self._db.close()
            self._db = None
        else:
            # NOTE(review): abort_changes() raises NotImplementedError when
            # 'commit_each_transaction' is false, so this branch cannot
            # complete as written (__init__ always sets the flag to True).
            self.abort_changes()
            self._db.commit_transaction()
            self._db.flush()
            self._db.close()
            self._db = None
        if self.logger:
            self.logger.clear()

    #######################################################################
    # API / Public / (Un)Index
    #######################################################################
    def index_document(self, document):
        """Index the document (a dict of values), replacing any document
        already indexed under the same key term.
        """
        self.nb_changes += 1
        abspath, term, xdoc = self.get_xdoc_from_document(document)
        self._db.replace_document(term, xdoc)
        if self.logger:
            log_info(abspath, domain='itools.catalog')

    def unindex_document(self, abspath):
        """Remove the document that has value stored in its abspath.
           If the document does not exist => no error
        """
        self.nb_changes += 1
        data = _reduce_size(_encode(self._fields['abspath'], abspath))
        self._db.delete_document('Q' + data)
        if self.logger:
            log_info(abspath, domain='itools.catalog')

    def get_xdoc_from_document(self, doc_values):
        """Return (abspath, term, xdoc) from the document values (a dict).

        'term' is the key term ('Q' + the encoded abspath) and 'xdoc' the
        xapian document built from the values. Fields met for the first
        time are registered in the metadata, which is then saved.

        Raises NotImplementedError when 'doc_values' is not a dict, and
        KeyError when it has no 'abspath' entry.
        """
        term = None
        metadata = self._metadata
        # Check the input
        if not isinstance(doc_values, dict):
            raise NotImplementedError('Deprecated: doc_values should be a dict')
        fields = self._fields
        abspath = doc_values['abspath']
        # Make the xapian document
        metadata_modified = False
        xdoc = Document()
        for name, value in doc_values.iteritems():
            if name not in fields:
                warn_not_indexed_nor_stored(name)
                # Skip the field: looking it up in 'fields' would only
                # raise KeyError right after the warning
                continue
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # The key field is always abspath. It gets an extra term with
            # the prefix 'Q', built from the encoded (not split) value,
            # since "_encode != _index"
            if name == 'abspath':
                key_value = _reduce_size(_encode(field_cls, value))
                term = 'Q' + key_value
                xdoc.add_term(term)

            # A multilingual value?
            if isinstance(value, dict):
                for language, lang_value in value.iteritems():
                    lang_name = name + '_' + language
                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]
                    # The value can be None
                    if lang_value is None:
                        continue
                    # Is stored ?
                    if 'value' in lang_info:
                        xdoc.add_value(lang_info['value'],
                                       _encode(field_cls, lang_value))
                    # Is indexed ?
                    if 'prefix' in lang_info:
                        # Index twice: under the prefix of the field and
                        # under the prefix of the per-language field
                        _index(xdoc, field_cls, lang_value, info['prefix'],
                               language)
                        _index(xdoc, field_cls, lang_value,
                               lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')

        # Store metadata ?
        if metadata_modified:
            self._db.set_metadata('metadata', dumps(metadata))
        # Ok
        return abspath, term, xdoc

    #######################################################################
    # API / Public / Search
    #######################################################################
    def get_unique_values(self, name):
        """Return all the terms of a given indexed field

        An unknown field gives a warning and an empty set. A known field
        that has no prefix (not indexed) raises KeyError.
        """
        metadata = self._metadata
        # If there is a problem => an empty result
        if name not in metadata:
            warn_not_indexed(name)
            return set()
        # Ok
        prefix = metadata[name]['prefix']
        prefix_len = len(prefix)
        return {t.term[prefix_len:] for t in self._db.allterms(prefix)}

    #######################################################################
    # API / Private
    #######################################################################
    def _get_info(self, field_cls, name):
        """Build the metadata info of a field: a 'value' entry when the
        field class is stored, a 'prefix' entry when it is indexed.

        Raises ValueError when the key field ('abspath') is not a stored
        and indexed String.
        """
        # The key field ?
        if name == 'abspath':
            if not (issubclass(field_cls, String) and
                    field_cls.stored and
                    field_cls.indexed):
                raise ValueError('the abspath field must be declared as '
                                 'String(stored=True, indexed=True)')
        # Stored ?
        info = {}
        if getattr(field_cls, 'stored', False):
            info = self._get_info_stored()
        # Indexed ?
        if getattr(field_cls, 'indexed', False):
            info = merge_dicts(info, self._get_info_indexed())
        # Ok
        return info

    def _get_info_stored(self):
        """Allocate the next value slot: return {'value': slot number}."""
        value = self._value_nb
        self._value_nb += 1
        return {'value': value}

    def _get_info_indexed(self):
        """Allocate the next term prefix: return {'prefix': prefix}."""
        prefix = _get_prefix(self._prefix_nb)
        self._prefix_nb += 1
        return {'prefix': prefix}

    def _load_all_internal(self):
        """Load the metadata from the database

        The value/prefix counters are reset to the number of fields that
        already have a value slot / a prefix.
        """
        self._value_nb = 0
        self._prefix_nb = 0
        metadata = self._db.get_metadata('metadata')
        if metadata == '':
            self._metadata = {}
            return
        self._metadata = loads(metadata)
        for name, info in self._metadata.iteritems():
            if 'value' in info:
                self._value_nb += 1
            if 'prefix' in info:
                self._prefix_nb += 1

    def _query2xquery(self, query):
        """take a "itools" query and return a "xapian" query

        A query on a field unknown to the metadata gives a warning and an
        empty xapian query. Raises TypeError on an unexpected name/value
        type, ValueError when an indexed field is required (or for a range
        on a multiple field), AttributeError when a stored field is
        required. A query of an unhandled class falls through every test
        and gives None.
        """
        query_class = type(query)
        fields = self._fields
        metadata = self._metadata

        # All Query
        if query_class is AllQuery:
            return Query('')

        # PhraseQuery, the field must be indexed
        if query_class is PhraseQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            try:
                prefix = info['prefix']
            except KeyError:
                raise ValueError('the field "%s" must be indexed' % name)
            field_cls = _get_field_cls(name, fields, info)
            return _make_PhraseQuery(field_cls, query.value, prefix)

        # RangeQuery, the field must be stored
        if query_class is RangeQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            value = info.get('value')
            if value is None:
                raise AttributeError(MSG_NOT_STORED.format(name=name))
            field_cls = _get_field_cls(name, fields, info)
            if field_cls.multiple:
                error = 'range-query not supported on multiple fields'
                raise ValueError(error)

            left = query.left
            if left is not None:
                left = _encode_simple_value(field_cls, left)
            right = query.right
            if right is not None:
                right = _encode_simple_value(field_cls, right)

            # Case 1: no limits, return everything
            if left is None and right is None:
                return Query('')
            # Case 2: left limit only
            if right is None:
                return Query(OP_VALUE_GE, value, left)
            # Case 3: right limit only
            if left is None:
                return Query(OP_VALUE_LE, value, right)
            # Case 4: left and right
            return Query(OP_VALUE_RANGE, value, left, right)

        # StartQuery, the field must be stored
        if query_class is StartQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            value_nb = info.get('value')
            if value_nb is None:
                raise AttributeError(MSG_NOT_STORED.format(name=name))
            field_cls = _get_field_cls(name, fields, info)

            value = query.value
            value = _encode(field_cls, value)

            # If value == '', we return everything
            if not value:
                return Query('')

            # good = {x / x >= value}
            good = Query(OP_VALUE_GE, value_nb, value)

            # Construct the variable end_value:
            # end_value = the word "after" value: toto => totp

            # Delete the '\xff' at the end of value
            end_value = value
            while end_value and ord(end_value[-1]) == 255:
                end_value = end_value[:-1]

            # If end_value is empty (value was made only of '\xff')
            if not end_value:
                # Return {x / x in good}
                return good

            # Normal case: end_value is not empty
            # The word after
            end_value = end_value[:-1] + chr(ord(end_value[-1]) + 1)
            # bad = {x / x >= end_value}
            bad = Query(OP_VALUE_GE, value_nb, end_value)
            # Return {x / x in good but x not in bad}
            return Query(OP_AND_NOT, good, bad)

        # TextQuery, the field must be indexed
        if query_class is TextQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected %s for 'name'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            field_cls = _get_field_cls(name, fields, info)
            try:
                prefix = info['prefix']
            except KeyError:
                raise ValueError('the field "%s" must be indexed' % name)

            # Remove accents from the value
            value = query.value
            if type(value) is not unicode:
                raise TypeError("unexpected %s for 'value'" % type(value))
            value = value.translate(TRANSLATE_MAP)

            qp = QueryParser()
            qp.set_database(self._db)
            return qp.parse_query(_encode(field_cls, value), TQ_FLAGS, prefix)

        i2x = self._query2xquery
        # Multiple query with single atom
        if isinstance(query, _MultipleQuery) and len(query.atoms) == 1:
            return i2x(query.atoms[0])

        # And
        if query_class is _AndQuery:
            return Query(OP_AND, [i2x(q) for q in query.atoms])

        # Or
        if query_class is _OrQuery:
            return Query(OP_OR, [i2x(q) for q in query.atoms])

        # Not: everything but the documents matching the negated query
        if query_class is NotQuery:
            return Query(OP_AND_NOT, Query(''), i2x(query.query))
class Catalog(object):
    """Catalog of documents kept in a xapian database.

    Per-field bookkeeping lives in ``self._metadata``, a dict mapping a field
    name to an info dict that may hold:

    - ``'value'``: the value slot number, present when the field is stored;
    - ``'prefix'``: the term prefix, present when the field is indexed;
    - ``'from'``: for a per-language field (``<name>_<language>``), the name
      of the field it derives from.

    That dict is serialized with ``dumps`` and saved under the ``'metadata'``
    key of the database.
    """

    def __init__(self, ref, fields, read_only=False, asynchronous_mode=True):
        """Open the catalog.

        ref -- an already opened Database/WritableDatabase, or a reference
               that ``lfs.get_absolute_path`` resolves to the database path
        fields -- dict mapping field names to field classes
        read_only -- open the database read-only (only used when 'ref' is
                     not already a database object)
        asynchronous_mode -- when true (and not read-only) a transaction is
                     opened right away; save_changes/abort_changes then
                     delimit the following ones
        """
        # Load the database
        if isinstance(ref, (Database, WritableDatabase)):
            self._db = ref
        else:
            path = lfs.get_absolute_path(ref)
            if read_only:
                self._db = Database(path)
            else:
                self._db = WritableDatabase(path, DB_OPEN)

        db = self._db
        self._asynchronous = asynchronous_mode
        self._fields = fields
        # Asynchronous mode
        if not read_only and asynchronous_mode:
            db.begin_transaction(False)
        # Load the xfields from the database
        self._metadata = {}
        self._value_nb = 0
        self._prefix_nb = 0
        self._load_all_internal()

    #######################################################################
    # API / Public / Transactions
    #######################################################################
    def save_changes(self):
        """Save the last changes to disk.

        Raises ValueError when the catalog is not in asynchronous mode.
        """
        if not self._asynchronous:
            raise ValueError("The transactions are synchronous")
        db = self._db
        db.commit_transaction()
        db.flush()
        db.begin_transaction(False)

    def abort_changes(self):
        """Abort the last changes made in memory.

        Raises ValueError when the catalog is not in asynchronous mode.
        """
        if not self._asynchronous:
            raise ValueError("The transactions are synchronous")
        db = self._db
        db.cancel_transaction()
        # The metadata in memory may hold fields registered by the aborted
        # changes: reload it from the database
        self._load_all_internal()
        db.begin_transaction(False)

    #######################################################################
    # API / Public / (Un)Index
    #######################################################################
    def index_document(self, document):
        """Add a new document.

        'document' is either a dict of values or an object whose
        get_catalog_values() method returns one. Fields met for the first
        time are registered in the metadata, which is then saved.

        When the values include 'abspath', the document replaces any
        document already indexed under the same key, so that two documents
        never share a key field.
        """
        db = self._db
        metadata = self._metadata
        fields = self._fields

        # Check the input
        if type(document) is dict:
            doc_values = document
        else:
            doc_values = document.get_catalog_values()

        # Make the xapian document
        metadata_modified = False
        key_term = None
        xdoc = Document()
        for name, value in doc_values.iteritems():
            if name not in fields:
                warn_not_indexed_nor_stored(name)
                # Skip the field: looking it up in 'fields' would only
                # raise KeyError right after the warning
                continue
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # The key field is always abspath. It gets an extra term with
            # the prefix 'Q', built from the encoded (not split) value,
            # since "_encode != _index"
            if name == 'abspath':
                key_value = _reduce_size(_encode(field_cls, value))
                key_term = 'Q' + key_value
                xdoc.add_term(key_term)

            # A multilingual value?
            if isinstance(value, dict):
                for language, lang_value in value.iteritems():
                    lang_name = name + '_' + language
                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]
                    # The value can be None
                    if lang_value is None:
                        continue
                    # Is stored ?
                    if 'value' in lang_info:
                        xdoc.add_value(lang_info['value'],
                                       _encode(field_cls, lang_value))
                    # Is indexed ?
                    if 'prefix' in lang_info:
                        # Index twice: under the prefix of the field and
                        # under the prefix of the per-language field
                        _index(xdoc, field_cls, lang_value, info['prefix'],
                               language)
                        _index(xdoc, field_cls, lang_value,
                               lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')

        # Save the doc. With a key term we replace instead of adding, which
        # still adds the document when the key is not indexed yet.
        if key_term is None:
            db.add_document(xdoc)
        else:
            db.replace_document(key_term, xdoc)

        # Store metadata ?
        if metadata_modified:
            db.set_metadata('metadata', dumps(metadata))

    def unindex_document(self, abspath):
        """Remove the document that has value stored in its abspath.
           If the document does not exist => no error
        """
        data = _reduce_size(_encode(self._fields['abspath'], abspath))
        self._db.delete_document('Q' + data)

    #######################################################################
    # API / Public / Search
    #######################################################################
    def get_unique_values(self, name):
        """Return all the terms of a given indexed field

        An unknown field gives a warning and an empty set. A known field
        that has no prefix (not indexed) raises KeyError.
        """
        metadata = self._metadata
        # If there is a problem => an empty result
        if name not in metadata:
            warn_not_indexed(name)
            return set()
        # Ok
        prefix = metadata[name]['prefix']
        prefix_len = len(prefix)
        return set(t.term[prefix_len:] for t in self._db.allterms(prefix))

    #######################################################################
    # API / Private
    #######################################################################
    def _get_info(self, field_cls, name):
        """Build the metadata info of a field: a 'value' entry (next free
        slot) when the field class is stored, a 'prefix' entry (next free
        prefix) when it is indexed.

        Raises ValueError when the key field ('abspath') is not a stored
        and indexed String.
        """
        # The key field ?
        if name == 'abspath':
            if not (issubclass(field_cls, String) and
                    field_cls.stored and
                    field_cls.indexed):
                raise ValueError('the abspath field must be declared as '
                                 'String(stored=True, indexed=True)')
        info = {}
        # Stored ?
        if getattr(field_cls, 'stored', False):
            info['value'] = self._value_nb
            self._value_nb += 1
        # Indexed ?
        if getattr(field_cls, 'indexed', False):
            info['prefix'] = _get_prefix(self._prefix_nb)
            self._prefix_nb += 1
        return info

    def _load_all_internal(self):
        """Load the metadata from the database

        The value/prefix counters are reset to the number of fields that
        already have a value slot / a prefix.
        """
        self._value_nb = 0
        self._prefix_nb = 0
        metadata = self._db.get_metadata('metadata')
        if metadata == '':
            self._metadata = {}
            return
        self._metadata = loads(metadata)
        for name, info in self._metadata.iteritems():
            if 'value' in info:
                self._value_nb += 1
            if 'prefix' in info:
                self._prefix_nb += 1

    def _query2xquery(self, query):
        """take a "itools" query and return a "xapian" query

        A query on a field unknown to the metadata gives a warning and an
        empty xapian query. Raises TypeError on an unexpected name/value
        type, ValueError when an indexed field is required (or for a range
        on a multiple field), AttributeError when a stored field is
        required. A query of an unhandled class falls through every test
        and gives None.
        """
        query_class = type(query)
        fields = self._fields
        metadata = self._metadata

        # All Query
        if query_class is AllQuery:
            return Query('')

        # PhraseQuery, the field must be indexed
        if query_class is PhraseQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            try:
                prefix = info['prefix']
            except KeyError:
                raise ValueError('the field "%s" must be indexed' % name)
            field_cls = _get_field_cls(name, fields, info)
            return _make_PhraseQuery(field_cls, query.value, prefix)

        # RangeQuery, the field must be stored
        if query_class is RangeQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            value = info.get('value')
            if value is None:
                raise AttributeError(MSG_NOT_STORED.format(name=name))
            field_cls = _get_field_cls(name, fields, info)
            if field_cls.multiple:
                error = 'range-query not supported on multiple fields'
                raise ValueError(error)

            left = query.left
            if left is not None:
                left = _encode_simple_value(field_cls, left)
            right = query.right
            if right is not None:
                right = _encode_simple_value(field_cls, right)

            # Case 1: no limits, return everything
            if left is None and right is None:
                return Query('')
            # Case 2: left limit only
            if right is None:
                return Query(OP_VALUE_GE, value, left)
            # Case 3: right limit only
            if left is None:
                return Query(OP_VALUE_LE, value, right)
            # Case 4: left and right
            return Query(OP_VALUE_RANGE, value, left, right)

        # StartQuery, the field must be stored
        if query_class is StartQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            value_nb = info.get('value')
            if value_nb is None:
                raise AttributeError(MSG_NOT_STORED.format(name=name))
            field_cls = _get_field_cls(name, fields, info)

            value = query.value
            value = _encode(field_cls, value)

            # If value == '', we return everything
            if not value:
                return Query('')

            # good = {x / x >= value}
            good = Query(OP_VALUE_GE, value_nb, value)

            # Construct the variable end_value:
            # end_value = the word "after" value: toto => totp

            # Delete the '\xff' at the end of value
            end_value = value
            while end_value and ord(end_value[-1]) == 255:
                end_value = end_value[:-1]

            # If end_value is empty (value was made only of '\xff')
            if not end_value:
                # Return {x / x in good}
                return good

            # Normal case: end_value is not empty
            # The word after
            end_value = end_value[:-1] + chr(ord(end_value[-1]) + 1)
            # bad = {x / x >= end_value}
            bad = Query(OP_VALUE_GE, value_nb, end_value)
            # Return {x / x in good but x not in bad}
            return Query(OP_AND_NOT, good, bad)

        # TextQuery, the field must be indexed
        if query_class is TextQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected %s for 'name'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                warn_not_indexed(name)
                return Query()
            info = metadata[name]
            field_cls = _get_field_cls(name, fields, info)
            try:
                prefix = info['prefix']
            except KeyError:
                raise ValueError('the field "%s" must be indexed' % name)

            # Remove accents from the value
            value = query.value
            if type(value) is not unicode:
                raise TypeError("unexpected %s for 'value'" % type(value))
            value = value.translate(TRANSLATE_MAP)

            qp = QueryParser()
            qp.set_database(self._db)
            return qp.parse_query(_encode(field_cls, value), TQ_FLAGS, prefix)

        i2x = self._query2xquery
        # Multiple query with single atom
        if isinstance(query, _MultipleQuery) and len(query.atoms) == 1:
            return i2x(query.atoms[0])

        # And
        if query_class is _AndQuery:
            return Query(OP_AND, [i2x(q) for q in query.atoms])

        # Or
        if query_class is _OrQuery:
            return Query(OP_OR, [i2x(q) for q in query.atoms])

        # Not: everything but the documents matching the negated query
        if query_class is NotQuery:
            return Query(OP_AND_NOT, Query(''), i2x(query.query))
class Catalog(object):
    """Catalog of documents stored in a xapian database.

    The catalog keeps a "metadata" dict (persisted in the database under
    the 'metadata' key) mapping every known field name to its value number
    ('value', for stored fields) and term prefix ('prefix', for indexed
    fields).  One field may be flagged as the key field; its encoded value
    is added to every document as a term prefixed by 'Q'.
    """

    def __init__(self, ref, fields, read_only=False, asynchronous_mode=True):
        """Open the catalog.

        ref -- an already opened Database/WritableDatabase, or a reference
               resolved to a path with lfs.get_absolute_path
        fields -- mapping from field name to field class
        read_only -- open the path with Database instead of
                     WritableDatabase (only used when "ref" is a path)
        asynchronous_mode -- when true (and not read_only) a transaction is
                             begun right away
        """
        # Load the database
        if isinstance(ref, (Database, WritableDatabase)):
            self._db = ref
        else:
            path = lfs.get_absolute_path(ref)
            if read_only:
                self._db = Database(path)
            else:
                self._db = WritableDatabase(path, DB_OPEN)

        db = self._db
        # NOTE(review): the flag is kept as given even when read_only is
        # true, although no transaction is begun in that case.
        self._asynchronous = asynchronous_mode
        self._fields = fields
        # Asynchronous mode
        if not read_only and asynchronous_mode:
            db.begin_transaction(False)
        # Load the xfields from the database
        self._metadata = {}
        self._key_field = None
        self._value_nb = 0
        self._prefix_nb = 0
        self._load_all_internal()


    #######################################################################
    # API / Public / Transactions
    #######################################################################
    def save_changes(self):
        """Save the last changes to disk.

        Commits the current transaction, flushes the database and begins a
        new transaction.  Raises ValueError when the catalog is not in
        asynchronous mode.
        """
        if not self._asynchronous:
            raise ValueError("The transactions are synchronous")
        db = self._db
        db.commit_transaction()
        db.flush()
        db.begin_transaction(False)


    def abort_changes(self):
        """Abort the last changes made in memory.

        Cancels the current transaction, reloads the metadata from the
        database and begins a new transaction.  Raises ValueError when the
        catalog is not in asynchronous mode.
        """
        if not self._asynchronous:
            raise ValueError("The transactions are synchronous")
        db = self._db
        db.cancel_transaction()
        self._load_all_internal()
        db.begin_transaction(False)


    #######################################################################
    # API / Public / (Un)Index
    #######################################################################
    def index_document(self, document):
        """Add a new document.

        document -- a dict of values, or a CatalogAware object (its
                    get_catalog_values() is used)

        Fields seen for the first time are registered in the metadata,
        which is written back to the database when it changed.  A dict
        value is handled as a multilingual value: every language is stored
        and indexed under the name "<field>_<language>".

        Raises ValueError when the document has an unexpected type or when
        the key field value is missing.
        """
        db = self._db
        metadata = self._metadata
        fields = self._fields

        # Check the input
        if type(document) is dict:
            doc_values = document
        elif isinstance(document, CatalogAware):
            doc_values = document.get_catalog_values()
        else:
            raise ValueError('the document must be a CatalogAware object')

        # Make the xapian document
        metadata_modified = False
        xdoc = Document()
        for name, value in doc_values.items():
            field_cls = fields[name]

            # New field ?
            if name not in metadata:
                info = metadata[name] = self._get_info(field_cls, name)
                metadata_modified = True
            else:
                info = metadata[name]

            # A multilingual value ?
            if isinstance(value, dict):
                for language, lang_value in value.items():
                    lang_name = name + '_' + language

                    # New field ?
                    if lang_name not in metadata:
                        lang_info = self._get_info(field_cls, lang_name)
                        lang_info['from'] = name
                        metadata[lang_name] = lang_info
                        metadata_modified = True
                    else:
                        lang_info = metadata[lang_name]

                    # The value can be None
                    if lang_value is not None:
                        # Is stored ?
                        if 'value' in lang_info:
                            xdoc.add_value(lang_info['value'],
                                           _encode(field_cls, lang_value))
                        # Is indexed ?
                        if 'prefix' in lang_info:
                            # Comment: Index twice, under the prefix of the
                            # field and under the prefix of the language
                            _index(xdoc, field_cls, lang_value,
                                   info['prefix'], language)
                            _index(xdoc, field_cls, lang_value,
                                   lang_info['prefix'], language)
            # The value can be None
            elif value is not None:
                # Is stored ?
                if 'value' in info:
                    xdoc.add_value(info['value'], _encode(field_cls, value))
                # Is indexed ?
                if 'prefix' in info:
                    # By default language='en'
                    _index(xdoc, field_cls, value, info['prefix'], 'en')

        # Store the key field with the prefix 'Q'
        # Comment: the key field is indexed twice, but we must do it
        #          one => to index (as the others)
        #          two => to index without split
        #          the problem is that "_encode != _index"
        key_field = self._key_field
        if (key_field is None or key_field not in doc_values or
                doc_values[key_field] is None):
            raise ValueError('the "key_field" value is compulsory')
        data = _reduce_size(_encode(fields[key_field], doc_values[key_field]))
        xdoc.add_term('Q' + data)

        # TODO: Don't store two documents with the same key field!

        # Save the doc
        db.add_document(xdoc)

        # Store metadata ?
        if metadata_modified:
            db.set_metadata('metadata', dumps(metadata))


    def unindex_document(self, value):
        """Remove the document that has value stored in its key_field.

        If the document does not exist => no error.  Nothing is done when
        the catalog has no key field yet.
        """
        key_field = self._key_field
        if key_field is not None:
            data = _reduce_size(_encode(self._fields[key_field], value))
            self._db.delete_document('Q' + data)


    #######################################################################
    # API / Public / Search
    #######################################################################
    def search(self, query=None, **kw):
        """Launch a search in the catalog.

        The query and keyword arguments are handed to _get_xquery; the
        result is wrapped in a SearchResults object.
        """
        xquery = _get_xquery(self, query, **kw)
        return SearchResults(self, xquery)


    def get_unique_values(self, name):
        """Return all the terms of a given indexed field.

        The field prefix is stripped from every term.  An empty set is
        returned when the field is unknown to the catalog metadata or when
        it has no prefix (it is not indexed, so it has no terms).
        """
        metadata = self._metadata
        # If there is a problem => an empty result
        if name not in metadata:
            return set()

        # A field without a prefix is not indexed: this used to escape as a
        # bare KeyError('prefix'), it is now an empty result as well.
        prefix = metadata[name].get('prefix')
        if prefix is None:
            return set()

        # Ok
        prefix_len = len(prefix)
        return set(t.term[prefix_len:] for t in self._db.allterms(prefix))


    #######################################################################
    # API / Private
    #######################################################################
    def _get_info(self, field_cls, name):
        """Build the metadata entry of a new field.

        The returned dict has a 'key_field' key when the field class is
        flagged as key field, a 'value' key (next free value number) when
        it is stored and a 'prefix' key (built from the next free prefix
        number) when it is indexed.  The matching counters, and the key
        field name, are updated on the catalog.

        Raises ValueError when a second key field is declared or when the
        key field is not both stored and indexed.
        """
        info = {}
        is_stored = getattr(field_cls, 'is_stored', False)
        is_indexed = getattr(field_cls, 'is_indexed', False)

        # The key field ?
        if getattr(field_cls, 'is_key_field', False):
            if self._key_field is not None:
                raise ValueError('You must have only one key field, '
                                 'not multiple, not multilingual')
            # Read through getattr, as below, so that a missing attribute
            # gives the ValueError instead of an AttributeError.
            if not (is_stored and is_indexed):
                raise ValueError('the key field must be stored '
                                 'and indexed')
            self._key_field = name
            info['key_field'] = True
        # Stored ?
        if is_stored:
            info['value'] = self._value_nb
            self._value_nb += 1
        # Indexed ?
        if is_indexed:
            info['prefix'] = _get_prefix(self._prefix_nb)
            self._prefix_nb += 1

        return info


    def _load_all_internal(self):
        """Load the metadata from the database.

        Resets the key field and the value/prefix counters, then rebuilds
        them from the loaded entries: the entry holding 'key_field' names
        the key field, and the counters count the entries holding a
        'value' / 'prefix' key.
        """
        self._key_field = None
        self._value_nb = 0
        self._prefix_nb = 0

        metadata = self._db.get_metadata('metadata')
        if not metadata:
            # Nothing stored yet
            self._metadata = {}
            return

        self._metadata = loads(metadata)
        for name, info in self._metadata.items():
            if 'key_field' in info:
                self._key_field = name
            if 'value' in info:
                self._value_nb += 1
            if 'prefix' in info:
                self._prefix_nb += 1


    def _query2xquery(self, query):
        """Take an "itools" query and return a "xapian" query.

        A query on a field that is absent from the metadata gives an empty
        xapian query.

        Raises:
            TypeError: the field name is not a str.
            ValueError: a PhraseQuery targets a field without prefix.
            KeyError: a RangeQuery/StartQuery targets a field without
                value number.
        """
        query_class = query.__class__
        fields = self._fields
        metadata = self._metadata

        # All Query
        if query_class is AllQuery:
            return Query('')

        # PhraseQuery, the field must be indexed
        if query_class is PhraseQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                return Query()
            info = metadata[name]
            try:
                prefix = info['prefix']
            except KeyError:
                raise ValueError('the field "%s" must be indexed' % name)
            field_cls = _get_field_cls(name, fields, info)
            return _make_PhraseQuery(field_cls, query.value, prefix)

        # RangeQuery, the field must be stored
        if query_class is RangeQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                return Query()

            info = metadata[name]
            value = info['value']
            field_cls = _get_field_cls(name, fields, info)

            left = query.left
            right = query.right

            # Case 1: no limits, return everything
            if left is None and right is None:
                return Query('')

            # Case 2: left limit only
            if right is None:
                return Query(OP_VALUE_GE, value, _encode(field_cls, left))

            # Case 3: right limit only
            if left is None:
                return Query(OP_VALUE_LE, value, _encode(field_cls, right))

            # Case 4: left and right
            return Query(OP_VALUE_RANGE, value, _encode(field_cls, left),
                         _encode(field_cls, right))

        # StartQuery, the field must be stored
        if query_class is StartQuery:
            name = query.name
            if type(name) is not str:
                raise TypeError("unexpected '%s'" % type(name))
            # If there is a problem => an empty result
            if name not in metadata:
                return Query()

            info = metadata[name]
            value_nb = info['value']
            field_cls = _get_field_cls(name, fields, info)

            value = query.value
            value = _encode(field_cls, value)

            if value:
                # good = {x / x >= value}
                good = Query(OP_VALUE_GE, value_nb, value)

                # Construct the variable end_value:
                # end_value = the word "after" value: toto => totp

                # Delete the '\xff' at the end of value
                end_value = value
                while end_value and ord(end_value[-1]) == 255:
                    end_value = end_value[:-1]

                # Normal case: end_value is not empty
                if end_value:
                    # The word after
                    end_value = end_value[:-1] + chr(ord(end_value[-1]) + 1)

                    # bad = {x / x >= end_value}
                    bad = Query(OP_VALUE_GE, value_nb, end_value)

                    # Return {x / x in good but x not in bad}
                    return Query(OP_AND_NOT, good, bad)
                # If end_value is empty
                else:
                    # Return {x / x in good}
                    return good
            else:
                # If value == '', we return everything
                return Query('')

        # And
        i2x = self._query2xquery
        if query_class is AndQuery:
            return Query(OP_AND, [ i2x(q) for q in query.atoms ])

        # Or
        if query_class is OrQuery:
            return Query(OP_OR, [ i2x(q) for q in query.atoms ])

        # Not
        if query_class is NotQuery:
            return Query(OP_AND_NOT, Query(''), i2x(query.query))

        # NOTE(review): any other query class falls through here and the
        # method implicitly returns None -- confirm whether callers expect
        # that or whether an explicit error would be better.
