def __index_file(self, filepath):
    """Index the file at ``filepath`` when it needs (re)indexing.

    The file is indexed if the index reports it as updated, or if
    ``FORCE_INDEX_REBUILD`` is set and a database record already exists
    for it. In every other case the call returns without doing anything.
    """
    changed, record = self._index.has_file_updated(filepath)

    # a forced rebuild only applies to files that already have a record
    forced = FORCE_INDEX_REBUILD and record is not None
    if not (forced or changed):
        return

    log.debug('indexing file: %s' % filepath)
    # NOTE(review): assumes has_file_updated() never reports a change
    # together with a None record -- confirm against its implementation
    self._index.index_file(filepath, document_id=record.id)
def __index_dir(self, dpath):
    """Index the contents of the directory at the specified path.

    Entries whose name starts with "." are always skipped. Remaining file
    names are filtered through ``settings.EXCLUDE_FILE_SUFFIX`` and
    ``settings.INCLUDE_FILE_SUFFIX`` before being handed to
    ``__index_file``. When ``self._is_recursive`` is false only the direct
    entries of ``dpath`` are considered; otherwise the whole tree below
    ``dpath`` is walked, without descending into hidden directories.

    Raises:
        Exception: if either suffix setting is neither a tuple nor None.
    """
    log.debug('Checking directory: %s' % dpath)

    # sanity checks
    if not isinstance(settings.EXCLUDE_FILE_SUFFIX, (tuple, type(None))):
        raise Exception(
            "settings.EXCLUDE_FILE_SUFFIX must be a tuple or None, found: %s"
            % type(settings.EXCLUDE_FILE_SUFFIX))
    if not isinstance(settings.INCLUDE_FILE_SUFFIX, (tuple, type(None))):
        raise Exception(
            "settings.INCLUDE_FILE_SUFFIX must be a tuple or None, found: %s"
            % type(settings.INCLUDE_FILE_SUFFIX))

    # nested, reused code block
    def check_name(name):
        """Return True if the item with the specified name can be indexed."""
        # ignore hidden files
        if name.startswith("."):
            return False
        # ignore excluded files (str.endswith accepts a tuple of suffixes)
        if (settings.EXCLUDE_FILE_SUFFIX
                and name.endswith(settings.EXCLUDE_FILE_SUFFIX)):
            return False
        # when an include list is configured, only matching names pass
        if settings.INCLUDE_FILE_SUFFIX:
            return name.endswith(settings.INCLUDE_FILE_SUFFIX)
        return True

    # perform item indexing
    if not self._is_recursive:
        # just check the entries in the target directory
        for item in os.listdir(dpath):
            if check_name(item):
                self.__index_file(os.path.join(dpath, item))
    else:
        # traverse the given path
        for dirpath, dirnames, filenames in os.walk(dpath):
            # Prune hidden dirs in place so os.walk never descends into
            # them. Checking only basename(dirpath) would still index
            # nested content such as ".git/objects", and would skip the
            # root's own files when dpath itself is "." or a dot-directory.
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for name in filenames:
                if check_name(name):
                    self.__index_file(os.path.join(dirpath, name))
def __index_dir(self, dpath):
    """Index the contents of the directory at the specified path.

    Entries whose name starts with "." are always skipped. Remaining file
    names are filtered through ``settings.EXCLUDE_FILE_SUFFIX`` and
    ``settings.INCLUDE_FILE_SUFFIX`` before being handed to
    ``__index_file``. When ``self._is_recursive`` is false only the direct
    entries of ``dpath`` are considered; otherwise the whole tree below
    ``dpath`` is walked, without descending into hidden directories.

    Raises:
        Exception: if either suffix setting is neither a tuple nor None.
    """
    log.debug("Checking directory: %s" % dpath)

    # sanity checks
    if not isinstance(settings.EXCLUDE_FILE_SUFFIX, (tuple, type(None))):
        raise Exception(
            "settings.EXCLUDE_FILE_SUFFIX must be a tuple or None, found: %s"
            % type(settings.EXCLUDE_FILE_SUFFIX)
        )
    if not isinstance(settings.INCLUDE_FILE_SUFFIX, (tuple, type(None))):
        raise Exception(
            "settings.INCLUDE_FILE_SUFFIX must be a tuple or None, found: %s"
            % type(settings.INCLUDE_FILE_SUFFIX)
        )

    # nested, reused code block
    def check_name(name):
        """Return True if the item with the specified name can be indexed."""
        # ignore hidden files
        if name.startswith("."):
            return False
        # ignore excluded files (str.endswith accepts a tuple of suffixes)
        excluded = settings.EXCLUDE_FILE_SUFFIX
        if excluded and name.endswith(excluded):
            return False
        # when an include list is configured, only matching names pass
        included = settings.INCLUDE_FILE_SUFFIX
        if included:
            return name.endswith(included)
        return True

    # perform item indexing
    if not self._is_recursive:
        # just check the entries in the target directory
        for item in os.listdir(dpath):
            if check_name(item):
                self.__index_file(os.path.join(dpath, item))
        return

    # traverse the given path
    for dirpath, dirnames, filenames in os.walk(dpath):
        # Prune hidden dirs in place so os.walk never descends into them.
        # Checking only basename(dirpath) would still index nested content
        # such as ".git/objects", and would skip the root's own files when
        # dpath itself is "." or a dot-directory.
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]
        for name in filenames:
            if check_name(name):
                self.__index_file(os.path.join(dirpath, name))
def __index_file(self, filepath):
    """Index the file at ``filepath`` unless it is already up to date.

    Indexing happens when the index reports the file as updated, or when
    ``FORCE_INDEX_REBUILD`` is set and a database record exists for the
    file. Otherwise nothing is done.
    """
    is_updated, db_row = self._index.has_file_updated(filepath)

    # forcing a rebuild requires an existing record for the file
    needs_indexing = (FORCE_INDEX_REBUILD and db_row is not None) or is_updated
    if needs_indexing:
        log.debug("indexing file: %s" % filepath)
        # NOTE(review): assumes db_row is not None whenever the file is
        # reported as updated -- confirm against has_file_updated()
        self._index.index_file(filepath, document_id=db_row.id)
def clean_index(self):
    """Purge indexed documents whose source file no longer exists.

    Each record returned by ``get_indexed_files()`` is checked against the
    file system. For a missing path the document is deleted from the index
    by its ``path`` term and the record itself is deleted.
    """
    for entry in self.get_indexed_files():
        if os.path.exists(entry.path):
            # file is still on disk, nothing to purge
            continue

        self._index.delete_by_term('path', entry.path)
        entry.delete_instance()
        logger.debug('removed indexed file: %s' % entry)
def __index_dir(self, dpath):
    """Index the contents of the directory at the specified path.

    Entries whose name starts with "." are always skipped. Remaining file
    names are filtered through ``settings.EXCLUDE_FILE_SUFFIX`` and
    ``settings.INCLUDE_FILE_SUFFIX`` before being handed to
    ``__index_file``. When ``self._is_recursive`` is false only the direct
    entries of ``dpath`` are considered; otherwise the whole tree below
    ``dpath`` is walked, without descending into hidden directories.

    Raises:
        Exception: if a suffix setting is not a tuple, list or None, or if
            ``dpath`` has no entries.
    """
    log.debug('Checking directory: %s' % dpath)

    # sanity checks -- the messages name every accepted type
    exclude = settings.EXCLUDE_FILE_SUFFIX
    if not isinstance(exclude, (tuple, list, type(None))):
        raise Exception(
            'settings.EXCLUDE_FILE_SUFFIX must be a tuple, list or None, '
            'found: %s' % type(exclude))
    include = settings.INCLUDE_FILE_SUFFIX
    if not isinstance(include, (tuple, list, type(None))):
        raise Exception(
            'settings.INCLUDE_FILE_SUFFIX must be a tuple, list or None, '
            'found: %s' % type(include))

    # list the directory once; the result is reused in non-recursive mode
    entries = os.listdir(dpath)
    if not entries:
        raise Exception('Directory to index is empty: %s' % dpath)

    def has_suffix(name, suffixes):
        """Return True if name ends with any non-empty suffix."""
        # empty suffixes are ignored: they would match every name
        return any(name.endswith(suffix) for suffix in suffixes if suffix)

    # nested, reused code block
    def check_name(name):
        """Return True if the item with the specified name can be indexed."""
        # ignore hidden files
        if name.startswith('.'):
            return False
        # ignore excluded files
        if exclude and has_suffix(name, exclude):
            return False
        # when an include list is configured, only matching names pass
        if include:
            return has_suffix(name, include)
        return True

    # perform item indexing
    if not self._is_recursive:
        # just check the entries in the target directory
        for item in filter(check_name, entries):
            self.__index_file(os.path.join(dpath, item))
    else:
        # traverse the given path
        for dirpath, dirnames, filenames in os.walk(dpath):
            # prune hidden dirs in place so os.walk does not descend into them
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for name in filter(check_name, filenames):
                self.__index_file(os.path.join(dirpath, name))
def clean_index(self):
    """Purge indexed documents whose source file no longer exists.

    Each record returned by ``get_indexed_files()`` is checked against the
    file system. For a missing path the document with the record's id is
    deleted from the index and the record itself is deleted.
    """
    for entry in self.get_indexed_files():
        if os.path.exists(entry.path):
            # file is still on disk, nothing to purge
            continue

        try:
            self.index.delete_document(entry.id)
        except xapian.DocNotFoundError:
            # the document is already gone from the index; carry on and
            # still drop the record
            pass

        entry.delete_instance()
        logger.debug('removed indexed file: %s' % entry)
def clean_index(self):
    """Purge indexed documents whose source file no longer exists.

    Each record returned by ``get_indexed_files()`` is checked against the
    file system. For a missing path the document is deleted from the index
    by its ``path`` term and the record itself is deleted.
    """
    for entry in self.get_indexed_files():
        if os.path.exists(entry.path):
            # file is still on disk, nothing to purge
            continue

        self._index.delete_by_term('path', entry.path)
        entry.delete_instance()
        logger.debug('removed indexed file: %s' % entry)

    # TODO: find a way to 'purge' deleted documents. Per the original
    # author's note, the docs say the index has a commit() method but it
    # does not; deleted documents drop out of query results, yet their
    # index info is stored until purged.
    # http://packages.python.org/Whoosh/indexing.html#deleting-documents
def open(self, index_path, **kwargs):
    """Create or open the index located under ``index_path``.

    The index lives in a sub-directory of ``index_path`` named after
    ``self._name``; that sub-directory is created when it is missing. A new
    index is created when ``self._rebuild_index`` is set or no index exists
    there yet; otherwise the existing one is opened. The resolved directory
    is stored in ``self._path``. ``**kwargs`` is accepted but not used here.

    Raises:
        Exception: if ``index_path`` is not an existing directory.
    """
    if not os.path.isdir(index_path):
        error = 'Directory `%s` is not a valid index directory.' % index_path
        log.warning(error)
        raise Exception(error)

    # make sure the index's own sub-directory is there
    target = os.path.join(index_path, self._name)
    if not os.path.isdir(target):
        os.mkdir(target)
        log.warning('created index directory at %s' % target)

    # decide between a fresh index and the existing one
    must_create = self._rebuild_index or not self._index.index_exists(target)
    if not must_create:
        log.debug('opening index at %s' % target)
        self._index.open_index(target, writable=self._is_writable)
    else:
        log.debug('creating index at %s' % target)
        self._index.create_index(target)

    # remember where the index lives
    self._path = target
def open(self, index_path, **kwargs):
    """Create or open the index located under ``index_path``.

    The index lives in a sub-directory of ``index_path`` named after
    ``self._name``; that sub-directory is created when it is missing. A new
    index is created when ``self._rebuild_index`` is set or no index exists
    there yet; otherwise the existing one is opened. The resolved directory
    is stored in ``self._path``. ``**kwargs`` is accepted but not used here.

    Raises:
        Exception: if ``index_path`` is not an existing directory.
    """
    base_is_dir = os.path.isdir(index_path)
    if not base_is_dir:
        problem = "Directory `%s` is not a valid index directory." % index_path
        log.warning(problem)
        raise Exception(problem)

    # make sure the index's own sub-directory is there
    index_dir = os.path.join(index_path, self._name)
    if not os.path.isdir(index_dir):
        os.mkdir(index_dir)
        log.warning("created index directory at %s" % index_dir)

    # a requested rebuild or a missing index means starting from scratch
    if self._rebuild_index or not self._index.index_exists(index_dir):
        log.debug("creating index at %s" % index_dir)
        self._index.create_index(index_dir)
    else:
        log.debug("opening index at %s" % index_dir)
        writable = self._is_writable
        self._index.open_index(index_dir, writable=writable)

    # remember where the index lives
    self._path = index_dir
def find_path(self, path):
    """Look up the document at the specified path.

    The lookup is logged and delegated to ``self._searcher.find_path``,
    whose result is returned unchanged.
    """
    log.debug('search for path: %s' % path)
    match = self._searcher.find_path(path)
    return match
def find_text(self, text, pagenum=1, limit=10):
    """Search the internal index for the specified text.

    The query is logged with the current time and delegated to
    ``self._searcher.find_text`` together with ``pagenum`` and ``limit``;
    the searcher's result is returned unchanged.
    """
    log.debug('[%s] searching for: %s' % (datetime.now(), text))
    hits = self._searcher.find_text(text, pagenum, limit)
    return hits
def find_path(self, path):
    """Look up the document at the specified path.

    The lookup is logged and delegated to ``self._searcher.find_path``,
    whose result is returned unchanged.
    """
    log.debug('search for path: %s' % path)
    searcher = self._searcher
    return searcher.find_path(path)
def find_text(self, text, pagenum=1, limit=10):
    """Search the internal index for the specified text.

    The query is logged with the current time and delegated to
    ``self._searcher.find_text`` together with ``pagenum`` and ``limit``;
    the searcher's result is returned unchanged.
    """
    log.debug('[%s] searching for: %s', datetime.now(), text)
    hits = self._searcher.find_text(text, pagenum, limit)
    return hits