Example #1
# Excerpt from a larger module: the standard-library and GObject-introspection
# imports are shown below, while layoutmanager, the Document / TermGenerator /
# QueryParser / Enquire / WritableDatabase helpers and the _PREFIX_* /
# _VALUE_* / _FLUSH_* / MAX_QUERY_LIMIT constants are assumed to be defined
# elsewhere in that module.
import logging
import os
import sys

import xapian
from gi.repository import GLib


class IndexStore(object):
    """Index metadata and provide rich query facilities on it."""
    def __init__(self):
        self._database = None
        self._flush_timeout = None
        self._pending_writes = 0
        root_path = layoutmanager.get_instance().get_root_path()
        self._index_updated_path = os.path.join(root_path, 'index_updated')
        self._std_index_path = layoutmanager.get_instance().get_index_path()
        self._index_path = self._std_index_path

    def open_index(self, temp_path=False):
        # Callers of open_index() must be able to handle an exception --
        # usually caused by I/O errors such as ENOSPC -- and retry with the
        # index on a temp_path.
        if temp_path:
            try:
                # mark the on-disk index stale
                self._set_index_updated(False)
            except Exception:
                pass
            self._index_path = temp_path
        else:
            self._index_path = self._std_index_path
        try:
            self._database = WritableDatabase(self._index_path,
                                              xapian.DB_CREATE_OR_OPEN)
        except Exception as e:
            logging.error('Exception opening database')
            raise

    def close_index(self):
        """Close index database if it is open."""
        if not self._database:
            return

        self._flush(True)
        try:
            # does Xapian write in its destructors?
            self._database = None
        except Exception as e:
            logging.error('Exception tearing down database')
            raise

    def remove_index(self):
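        """Delete all files from the index directory, if it exists."""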
        if not os.path.exists(self._index_path):
            return
        for f in os.listdir(self._index_path):
            os.remove(os.path.join(self._index_path, f))

    def contains(self, uid):
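        """Return True if a document for uid is already in the index."""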
        postings = self._database.postlist(_PREFIX_FULL_VALUE +
                                           _PREFIX_UID + uid)
        try:
            next(postings)
        except StopIteration:
            return False
        return True

    def store(self, uid, properties):
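        """Add or replace the index document for uid, then flush."""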
        document = Document()
        document.add_value(_VALUE_UID, uid)
        term_generator = TermGenerator()
        term_generator.index_document(document, properties)

        if not self.contains(uid):
            self._database.add_document(document)
        else:
            self._database.replace_document(_PREFIX_FULL_VALUE +
                                            _PREFIX_UID + uid, document)

        self._flush(True)

    def find(self, query):
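        """Query the index; return (matching uids, estimated total count)."""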
        offset = query.pop('offset', 0)
        limit = query.pop('limit', MAX_QUERY_LIMIT)
        order_by = query.pop('order_by', [])
        query_string = query.pop('query', None)

        query_parser = QueryParser()
        query_parser.set_database(self._database)
        enquire = Enquire(self._database)
        enquire.set_query(query_parser.parse_query(query, query_string))

        # This will ensure that the result count is exact.
        check_at_least = offset + limit + 1

        if not order_by:
            order_by = '+timestamp'
        else:
            order_by = order_by[0]

        if order_by == '+timestamp':
            enquire.set_sort_by_value(_VALUE_TIMESTAMP, True)
        elif order_by == '-timestamp':
            enquire.set_sort_by_value(_VALUE_TIMESTAMP, False)
        elif order_by == '+title':
            enquire.set_sort_by_value(_VALUE_TITLE, True)
        elif order_by == '-title':
            enquire.set_sort_by_value(_VALUE_TITLE, False)
        elif order_by == '+filesize':
            enquire.set_sort_by_value(_VALUE_FILESIZE, True)
        elif order_by == '-filesize':
            enquire.set_sort_by_value(_VALUE_FILESIZE, False)
        elif order_by == '+creation_time':
            enquire.set_sort_by_value(_VALUE_CREATION_TIME, True)
        elif order_by == '-creation_time':
            enquire.set_sort_by_value(_VALUE_CREATION_TIME, False)
        else:
            logging.warning('Unsupported property for sorting: %s', order_by)

        query_result = enquire.get_mset(offset, limit, check_at_least)
        total_count = query_result.get_matches_estimated()

        uids = []
        for hit in query_result:
            uids.append(hit.document.get_value(_VALUE_UID))

        return (uids, total_count)

    def delete(self, uid):
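        """Remove the document for uid from the index, then flush."""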
        self._database.delete_document(_PREFIX_FULL_VALUE + _PREFIX_UID + uid)
        self._flush(True)

    def get_activities(self):
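        """Return every activity id that appears as a term in the index."""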
        activities = []
        prefix = _PREFIX_FULL_VALUE + _PREFIX_ACTIVITY
        for term in self._database.allterms(prefix):
            activities.append(term.term[len(prefix):])
        return activities

    def flush(self):
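        """Force an immediate flush of pending writes."""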
        self._flush(True)

    def get_index_updated(self):
        return os.path.exists(self._index_updated_path)

    index_updated = property(get_index_updated)

    def _set_index_updated(self, index_updated):
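        # Create or remove the index_updated marker file so that the next
        # start-up can tell whether the on-disk index is in sync.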
        if self._std_index_path != self._index_path:
            # operating from tmpfs
            return True
        if index_updated != self.index_updated:
            if index_updated:
                # index_updated = True will happen every
                # indexstore._FLUSH_TIMEOUT seconds, so it is ok to fsync
                with open(self._index_updated_path, 'w') as index_updated_file:
                    os.fsync(index_updated_file.fileno())
            else:
                os.remove(self._index_updated_path)

    def _flush_timeout_cb(self):
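        # GLib timeout callback armed by _flush(); returning False stops the
        # timeout from repeating.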
        self._flush(True)
        return False

    def _flush(self, force=False):
        """Called after any database mutation"""
        logging.debug('IndexStore.flush: force=%r _pending_writes=%r', force,
                      self._pending_writes)

        self._set_index_updated(False)

        if self._flush_timeout is not None:
            GLib.source_remove(self._flush_timeout)
            self._flush_timeout = None

        self._pending_writes += 1
        if force or self._pending_writes > _FLUSH_THRESHOLD:
            try:
                logging.debug("Start database flush")
                self._database.flush()
                logging.debug("Completed database flush")
            except Exception as e:
                logging.exception(e)
                logging.error("Exception during database.flush()")
                # bail out to trigger a reindex
                sys.exit(1)
            self._pending_writes = 0
            self._set_index_updated(True)
        else:
            # Defer the flush: arm a timer so _flush_timeout_cb() forces it
            # after _FLUSH_TIMEOUT seconds unless another write re-arms it.
            self._flush_timeout = GLib.timeout_add_seconds(
                _FLUSH_TIMEOUT, self._flush_timeout_cb)