def _cleanup(self):
    """Remove all files created by us"""
    if self._generated_file:
        self._generated_file.close()
        file_path = Path(self._generated_file.name)
        if file_path.isfile():
            file_path.remove()
        # end delete file
        self._generated_file = None
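# A hedged sketch of the lifecycle this cleanup assumes: _generated_file would typically be a
# temporary file created with delete=False so that its path outlives close(). The suffix and the
# hand-off to a consumer are illustrative assumptions, not part of this module.
#
#   import tempfile
#   generated_file = tempfile.NamedTemporaryFile(suffix='.yaml', delete=False)
#   generated_file.write(b'...')       # write the generated content
#   generated_file.flush()             # make it visible to whoever consumes the path
#   ...                                # later, _cleanup() closes the handle and removes the path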
def executable(self, env):
    """@return butility.Path to the executable - it is not verified to exist
    @note for now this is uncached, but it's okay for our use
    @note we always resolve environment variables"""
    executables = self.data().executable
    if not executables:
        raise ValueError("no executable set for package '%s'" % self.name())

    error = None
    executable_path = None
    for executable in executables:
        executable_path = Path._expandvars_deep(executable, env)
        try:
            executable_path = self.to_abs_path(executable_path)
        except EnvironmentError as err:
            if '$' in executable_path:
                # give it more time, let them work with it until something breaks
                # Don't have another choice
                executable_path = Path(executable_path)
            else:
                error = err
                continue
        # end handle conversion

        if os.name == 'nt':
            # We assume exe by default, and not com or bat.
            # Even though magic isn't good, I see no point in making this configurable, people
            # can just be explicit about the extension
            win_ext = '.exe'
            if not executable_path.ext():
                executable_path += win_ext
            # end handle extension
        # end handle windows

        # If we have variables in the path, we can't assume anything (nor resolve) as it might be too early
        # for that. In that case, we assume the best. Otherwise, the executable must exist
        if not executable_path.containsvars() and not executable_path.isfile():
            continue
        # end handle existence check
        return executable_path
    # end for each executable to try

    assert executable_path or error, "Should have collected at least one error at this point"
    if error:
        raise error
    return executable_path
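# A minimal usage sketch (hypothetical package object): resolve the executable of a package against
# the current environment. 'pkg' and the way it is obtained are assumptions; only the executable(env)
# signature above is taken from the code.
#
#   import os
#   exe = pkg.executable(os.environ)
#   # e.g. Path('/usr/autodesk/maya/bin/maya') on posix, or Path('C:/.../maya.exe') on windows,
#   # provided the configured candidate exists or still contains unresolved variables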
class CreateFSItemOperation(FSOperationBase):

    """Create a directory or file with the given access permissions and ownership. In case of a file,
    you may specify an initial content.
    For this operation to succeed, the destination path must not exist yet!"""

    __slots__ = ("_path", "_content", "_mode", "_uid", "_gid")

    name = "CreateFSItem"

    def __init__(self, transaction, path, initial_file_content=None, mode=None, uid=None, gid=None):
        """Initialize the operation with a path to create. If initial_file_content is set to a string,
        it will be written in binary mode to a file. If it is unset, a directory will be created.
        Non-existing parent directories will be created.
        After creation, the mode will be set if not None, and uid and gid will be set to the given
        numerical IDs if either of them is not None"""
        super(CreateFSItemOperation, self).__init__(transaction)
        self._assert_posix()
        self._path = Path(path)
        self._content = initial_file_content
        self._mode = mode
        self._uid = uid
        self._gid = gid

    def apply(self):
        if self._content and self._path.isdir() or not self._content and self._path.isfile():
            raise AssertionError(
                "Cannot create directory or file as an equally named item of a different type already exists")
        # END sanity check

        if self._dry_run():
            return

        if self._path.exists():
            return
        # end ignore existing items of the same type

        # we don't do it the most efficient way, as we could specify certain things right
        # at creation. For now, we don't do it though as it shouldn't matter
        if self._content:
            self.log.info("creating file %s", self._path)
            self._path.write_bytes(self._content)
        else:
            self.log.info("creating directory %s", self._path)
            self._path.makedirs()
        # END initial creation
        self._operation_performed = True

        if self._mode is not None:
            self._path.chmod(self._mode)
        # END handle mode
        self.set_user_group(self._path, self._gid, self._uid)

    def rollback(self):
        try:
            if not self._operation_performed or not self._path.exists():
                return

            if self._content:
                self.log.info("Removing file %s", self._path)
                self._path.remove()
            else:
                self.log.info("Removing single directory %s", self._path)
                self._path.rmdir()
            # END handle removal, safely as we don't recursively delete anything
        finally:
            self._reset_state()
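# A hedged usage sketch: operations attach themselves to a Transaction on construction and are applied
# through it (the apply().succeeded()/rollback() pattern mirrors the rsync rollback further below).
# The logger name, path, and content are illustrative assumptions.
#
#   import logging
#   t = Transaction(logging.getLogger('fsops'))
#   CreateFSItemOperation(t, '/tmp/project/config.yaml', initial_file_content=b'key: value', mode=0o644)
#   if not t.apply().succeeded():
#       t.rollback()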
class RsyncOperation(Operation):

    """An operation which allows to safely copy a source file or directory to a given destination file or
    directory. The location of the rsync program is currently assumed.
    @note only works on linux (even though, theoretically, rsync might exist on windows)"""

    __slots__ = ("_source_path", "_destination_path", "_actual_destination_path", "_move_mode",
                 "_current_path", "_total_num_files_transferred", "_num_files_transferred",
                 "_total_transferred_filesize_bytes", "_transferred_filesize_bytes",
                 "_current_total_transferred_filesize_bytes", "_seen_progress_for_current_file",
                 "_current_bandwidth", "_start_time", "_last_time_left_s", "_process",
                 "_destination_existed", "_actual_destination_existed", "_max_bandwidth_kb")

    # -------------------------
    # @name Constants
    # @{

    NUM_FILES = "Number of files transferred: "
    TRANSFERRED_BYTES = "Total file size: "

    # -- End Constants -- @}

    # -------------------------
    # @name Configuration
    # @{

    name = "rsync"
    description = "Synchronize directory structures or copy files"

    re_is_path = re.compile(r"^/?[\w\-]+(?:/[\w\-\.]+)*$")
    re_progress = re.compile(r"(?P<bytes>\d+)\s+(?P<percent>\d+)%\s+(?P<bandwidth>\d+\.\d+\w+/\w)\s+.*")

    rsync_path = "/usr/bin/rsync"
    rm_path = "/bin/rm"

    # -- End Configuration -- @}

    def __init__(self, transaction, source, destination, move=False, max_bandwidth_kb=0):
        """Initialize an rsync operation with a source and destination path. If move is True, the source
        will be deleted after a successful rsync operation. An operation is successful if there were no
        error lines in stderr of the process, and if the return code was 0.
        If the maximum bandwidth is greater than 0, the rsync operation will use no more than the given
        bandwidth in kilobytes."""
        super(RsyncOperation, self).__init__(transaction)

        if os.name != "posix":
            raise AssertionError("This module only works on posix systems")
        # END handle posix

        self._source_path = Path(source).expandvars()
        self._destination_path = self._actual_destination_path = Path(destination).expandvars()
        self._destination_existed = self._destination_path.exists()
        # rsync creates at most one parent directory if it does not exist. It will always put
        # directories into other directories, unless you specify a / at the end of the source.
        #
        # In case of files, it will always put them into existing directories, or rename them to the given
        # name
        if ((self._source_path.isdir() and not self._source_path.tolinuxpath().endswith('/')) or
                (self._source_path.isfile() and not self._destination_existed or self._destination_path.isdir())):
            self._actual_destination_path = self._destination_path / self._source_path.basename() + "/"
            # / as we have to assure it copies the contents of the directory in case of undo
        # END assure destination
        self._actual_destination_existed = self._actual_destination_path.exists()
        self._move_mode = move
        self._max_bandwidth_kb = max_bandwidth_kb

        self._current_path = None
        self._total_num_files_transferred = 0
        self._total_transferred_filesize_bytes = 0
        self._process = None
        self._reset_current_state()

    def _reset_current_state(self):
        """Reset the current values that will be counted in the following invocation"""
        self._start_time = 0
        self._num_files_transferred = 0
        self._transferred_filesize_bytes = 0
        self._current_total_transferred_filesize_bytes = 0
        self._seen_progress_for_current_file = False
        self._current_bandwidth = None
        self._last_time_left_s = None

    def _set_current_file(self, path):
        """Set the path of the file being currently transferred, adjust state"""
        # if there is progress for the previous file, we count it
        # This will make us miss the last file, but it's okay ...
        if self._seen_progress_for_current_file:
            self._num_files_transferred += 1
            self._current_total_transferred_filesize_bytes += self._transferred_filesize_bytes
            self._transferred_filesize_bytes = 0
            self._seen_progress_for_current_file = False
        # END handle count
        self._current_path = path

    def _handle_progress_match(self, match):
        """Check the match regex and adjust our state"""
        self._current_bandwidth = match.group("bandwidth")
        self._transferred_filesize_bytes = int(match.group("bytes"))

    def _update_progress(self, gather_mode=False):
        """Use our state to produce a progress message
        @param gather_mode in gather mode, we will just present a count-up"""
        self._abort_point()
        self._seen_progress_for_current_file = True
        prog = self._progress()
        if gather_mode:
            prog.set(self._num_files_transferred,
                     message="Gathering Files ... %i" % self._num_files_transferred)
        else:
            # remaining bytes
            remaining_mbytes = (
                self._total_transferred_filesize_bytes - self._current_total_transferred_filesize_bytes) / 1024 ** 2
            time_taken = max(1, time.time() - self._start_time)
            time_left_s = self._total_transferred_filesize_bytes / \
                max(1, (self._current_total_transferred_filesize_bytes / time_taken))

            # remaining time
            suffix = "second"
            divisor = 1
            if time_left_s >= 60.0:
                divisor = 60
                suffix = "minute"
            # END handle suffix
            if time_left_s > 1:
                suffix += "s"
            # END handle plural
            if self._last_time_left_s is not None:
                time_left_s = (time_left_s + self._last_time_left_s) / 2.0  # soften jumps a bit
            # END average values
            msg = "Transferring %s at %s - %i files left, done in about %i %s" % (
                self._current_path, self._current_bandwidth,
                self._total_num_files_transferred - self._num_files_transferred,
                int(time_left_s / divisor), suffix)
            prog.set(self._num_files_transferred, message=msg)
            self._last_time_left_s = time_left_s
        # END handle gather mode

    def _parse_output_line(self, line):
        """Parse a single line and adjust our state accordingly
        @return 1 == file, 2 == progress, 3 == stats, False otherwise"""
        # FILENAME
        ###########
        line = line.strip()  # strip trailing newline
        if self.re_is_path.match(line) is not None:
            self._set_current_file(line)
            return 1
        # END check for path

        # PROGRESS
        ###########
        # parse " 27131904 51% 2.44MB/s 0:00:10 "
        m = self.re_progress.match(line)
        if m:
            self._handle_progress_match(m)
            self._update_progress()
            return 2
        # END handle progress

        # STATS
        ###########
        if line.startswith(self.NUM_FILES):
            self._total_num_files_transferred = int(line[len(self.NUM_FILES):])
            return 3
        # END check file count

        if line.startswith(self.TRANSFERRED_BYTES):
            self._total_transferred_filesize_bytes = int(line[len(self.TRANSFERRED_BYTES):].split(" ")[0])
            return 3
        # END check transferred bytes
        return False

    def _force_removal(self, destination):
        """Forcefully delete given directory or file, linux only.
        @throws OSError"""
        self.log.info("about to remove directory at %s ... " % destination)
        rval = subprocess.call([self.rm_path, "-Rf", str(destination)])
        if rval != 0:
            raise OSError("Failed to remove file or directory that we managed to copy previously: %s" % destination)
        self.log.info("... done removing destination path")

    def _parse_output(self, gather_mode=False):
        """Parse the output of the rsync process and set the progress accordingly
        @param gather_mode if True, we will just hang onto the standard output, which may speed up processing.
        This way, we only count the lines we actually parse.
        @return error data, chunk-separated, lines are within the data accordingly"""
        empty_list = list()
        timeout = 0.1
        err_data = list()
        process = self._process

        # GATHER MODE
        ##################
        try:
            if gather_mode:
                while True:
                    line = process.stdout.readline().decode()
                    if not line:
                        return err_data
                    if self._parse_output_line(line) == 1:
                        self._update_progress(gather_mode)
                    # END update progress only if we parsed something
                # END loop forever
                return err_data
            # END handle stderr

            # RUN MODE
            ###########
            # Set stderr to non-blocking to allow simple reads
            fl = fcntl.fcntl(process.stderr.fileno(), fcntl.F_GETFL)
            fcntl.fcntl(process.stderr.fileno(), fcntl.F_SETFL, fl | os.O_NONBLOCK)

            while process.poll() is None:
                # stdout is in blocking mode, so we can read lines accordingly
                # try to read as many as possible
                # as long as there is something
                while select([process.stdout.fileno()], empty_list, empty_list, timeout)[0]:
                    line = process.stdout.readline().decode()
                    if not line:
                        break
                    self._parse_output_line(line)
                # END handle standard output

                try:
                    # from stderr we expect not much output, if at all, so poll it from time to time
                    err = process.stderr.read()
                    if err:
                        err_data.append(err.decode())
                    # END gather errors
                except IOError:
                    # it can happen that the process goes down in the process of reading stdout
                    # Therefore we fail to read - lets just try again in this case
                    continue
                # END handle invalid reads
                time.sleep(timeout)
            # END while process is active
        finally:
            # if we don't close the handles, the process will stay around, even if the handle gets
            # deleted. Will never get used to that, call it a bug !!! Every time I fall for this ...
            if process.stdout:
                process.stdout.close()
            if process.stderr:
                process.stderr.close()
            # END assure pipes are closed !!!
        return err_data

    # -------------------------
    # @name Interface Implementation
    # @{

    def apply(self):
        try:
            # assure that directories will have their content copied - the source is self._source_path
            def_args = ("-a", "--append", "--progress", self._source_path, self._destination_path)

            def proc(args, gather_mode=False):
                self.log.info(" ".join(args))
                return subprocess.Popen(args, stdin=None, stdout=subprocess.PIPE,
                                        stderr=not gather_mode and subprocess.PIPE or None, shell=False)
            # END proc helper

            def handle_process(gather_mode=False):
                # PARSE OUTPUT
                ###############
                try:
                    err_chunks = self._parse_output(gather_mode)
                    if self._current_path is not None:
                        self._set_current_file(None)  # trigger the last file to be registered
                    self._process.wait()
                finally:
                    if self._process.poll() is None:
                        self._process.terminate()
                    # END assure process is terminated
                # END if process is still running here, kill it, as we are likely to be in
                # an exceptional state (user aborted)

                if self._process.returncode != 0 or err_chunks:
                    raise OSError("rsync failed with error code: %i, error was \n%s" %
                                  (self._process.returncode, "".join(err_chunks)))
                # END handle error code
                self._process = None
            # END handle process

            # GATHER RUN
            #############
            # Gather information about the run to determine the required needs
            args = [self.rsync_path, "--dry-run", "--stats"]
            args.extend(def_args)
            self._progress().setup(round_robin=True, relative=False)
            self.log.info("Calculating cost of operation ... ")
            self._process = proc(args, True)
            handle_process(gather_mode=True)
            self._reset_current_state()

            # VERIFY FREE SPACE IN DESTINATION
            ##################################
            # destination doesn't necessarily exist, hence we try the parent path as well
            # prefer the actual destination, in case it's a dir - the parent might already be
            # on another mount
            for item in [self._destination_path, self._destination_path.dirname()]:
                if not item.exists():
                    continue
                # END handle missing items

                if not self._total_transferred_filesize_bytes:
                    self.log.info("Wouldn't do any work - skipping transfer operation")
                    return
                # end abort if nothing to do

                fs_info = os.statvfs(item)
                free_bytes_at_destination = fs_info.f_bsize * fs_info.f_bavail
                if self._total_transferred_filesize_bytes >= free_bytes_at_destination:
                    msg = "Insufficient disk space available at %s to copy %s - require %iMB, have %iMB" % (
                        item, self._source_path,
                        self._total_transferred_filesize_bytes / 1024 ** 2,
                        free_bytes_at_destination / 1024 ** 2)
                    raise OSError(msg)
                # END check free space
            # END for each item to try

            args = [self.rsync_path]
            if self._dry_run():
                args.append("--dry-run")
            # END handle dry-run
            if self._max_bandwidth_kb > 0:
                args.append("--bwlimit=%i" % self._max_bandwidth_kb)
            # END handle bandwidth limit
            args.extend(def_args)

            # START PROCESS
            ################
            self.log.info("Starting %s" % (" ".join(args)))
            self._progress().setup(range=(0, self._total_num_files_transferred), relative=True)
            self._start_time = time.time()
            self._process = proc(args)
            handle_process()

            if self._move_mode and not self._dry_run():
                self._force_removal(self._source_path)
            # END handle move mode
        finally:
            if self._process and self._process.poll() is None:
                self.log.error("Terminating child forcefully")
                try:
                    self._process.kill()
                except OSError:
                    pass
                self._process = None
            # END handle process
        # END assure process is killed on error

    def rollback(self):
        # without a destination, we couldn't fix anything anyway
        if not self._destination_path.exists():
            return

        # have to reproduce source from destination ?
        if not self._source_path.exists():
            if self._destination_existed:
                self.log.warn("Destination at %s existed - rollback might copy more data than expected"
                              % self._destination_path)
            # END info
            self.log.info("Restoring source from destination ...")
            t = Transaction(self.log, progress=self._progress())
            type(self)(t, self._actual_destination_path + "/", self._source_path)
            self.log.info("rolling back rsync from %s to %s", self._actual_destination_path, self._source_path)
            if not t.apply().succeeded():
                raise IOError(
                    "Expected copy operation to succeed - rollback failed, destination data exists at %s"
                    % self._destination_path)
            # END apply sub-transaction
        # END source doesn't exist

        # finally remove destination if possible
        for destination, existed in ((self._actual_destination_path, self._actual_destination_existed),
                                     (self._destination_path, self._destination_existed)):
            if existed:
                self.log.warn(
                    "Refusing deletion of destination during rollback as it existed before the rsync operation at %s"
                    % destination)
                continue
            # END sanity check
            self._force_removal(destination)
        # END for each pair of possible paths

    # -- End Interface Implementation -- @}

    # -------------------------
    # @name Interface
    # @{

    def actual_destination(self):
        """:return: the destination that will actually receive the copy"""
        return self._actual_destination_path
def _update_db(self, args):
    """Update the sqlite database
    @return error code"""
    log = self.log()
    num_sources = bool(args.directories) + bool(args.merge_paths)
    if num_sources > 1:
        raise AssertionError("Cannot use --from-directories and --merge together")
    elif num_sources and args.remove_duplicates:
        raise AssertionError("--remove-duplicate-paths cannot be used in conjunction with any source")
    elif not (args.fast or args.remove_duplicates) and num_sources == 0:
        raise AssertionError("Specify at least one of the flags specifying from where to update the database")
    # end assure consistency

    #############
    # INIT DB ##
    ###########
    path = args.update_db
    engine = create_engine(self._url_from_path(path))
    meta = None
    # Assume file exists
    if is_url(path) or path.isfile():
        meta = MetaData(engine, reflect=True)
    # end handle file exists

    if not meta or args.table_name not in meta.tables:
        if args.fast:
            log.warn("Database didn't exist yet - fast implicitly disabled")
            args.fast = False
            if num_sources == 0:
                raise AssertionError(
                    "Require at least one initial data source, either --from-directories or --merge")
            # end handle logic
        # end handle fast

        if args.remove_duplicates:
            raise AssertionError("Cannot remove duplicates on non-existing table")
        # end handle remove duplicates

        meta = fsstat_schema.meta
        fsstat_schema.record.name = args.table_name
        meta.bind = engine
        meta.create_all()
        log.info("initialized database at %s", path)
        fsitem = fsstat_schema.record
        # assure we have the meta-data with the proper name - renaming the table before we create_all
        # is kind of a hack
        meta = MetaData(engine, reflect=True)
    else:
        if args.with_index:
            log.info("Cannot create index on existing table without additional logic - turning index creation off")
        # end
        args.with_index = False

        fsitem = meta.tables[args.table_name]
        log.info("Updating table '%s' of database at '%s'", args.table_name, path)
    # end initialize table

    strip = str.strip
    basename = os.path.basename
    connection = engine.connect()
    insert = fsitem.insert()
    st = time()
    nr = 0  # num records handled
    records = list()

    ########################
    # REMOVE DUPLICATES ###
    ######################
    if args.remove_duplicates:
        nr = self._remove_duplicates(connection, fsitem)
    ######################
    # FAST UPDATE ####
    ###############
    elif args.fast:
        nr = self._fast_update_database(engine, args)
    ###########################
    ## DIRECTORY CRAWLING ####
    #########################
    elif args.directories:
        streamer = HashStreamer(hashlib.sha1, lz4dumps)
        join = os.path.join
        normalize = os.path.normpath
        totalbcount = 0  # total amount of bytes processed
        lct = time()
        progress_every = 500
        commit_every_fcount = 15000
        commit_every_seconds = 1 * 60  # commit at least once per minute

        def progress():
            elapsed = time() - st
            log.info("Processed %i files with %s in %.2fs (%.2f files/s | %s MB/s)",
                     nr, int_to_size_string(totalbcount), elapsed, nr / elapsed, mb(totalbcount) / elapsed)
        # end

        for directory in args.directories:
            if not os.path.isdir(directory):
                log.error("Skipped non-existing directory '%s'", directory)
                continue
            # end handle failed directory access

            # normalize to prevent extra stuff
            directory = normalize(directory)
            for root, dirs, files in os.walk(directory, followlinks=False):
                # NOTE: We also take directories, as it allows to find directories with many files, or with
                # no files (empty directories). Also, we can optimize updates that way
                # Just to also handle root! It must be in the database, otherwise we can never
                # handle additions correctly, at least not for the root folder
                chains = [files, dirs]
                if root == directory:
                    # an empty string joined with root, is root
                    chains.insert(0, [""])
                # end handle root

                for filename in chain(*chains):
                    nr += 1
                    # only join if we are not seeing the root. Otherwise we get a slash appended,
                    # which is something we really don't want as it could hinder later updates
                    path = filename and join(root, filename) or root
                    stat = self._append_path_record(records, path, streamer, log)
                    if stat:
                        totalbcount += stat.st_size
                        if nr % progress_every == 0:
                            progress()
                        # end show progress
                    # end managed to handle file
                    if time() - lct >= commit_every_seconds or nr % commit_every_fcount == 0:
                        lct = time()
                        progress()
                        self.do_execute_records(connection, insert, records, log, st, nr)
                    # end commit
                # end for each file
            # end for each directory to traverse
        # end for each directory

        # final execute
        progress()
        self.do_execute_records(connection, insert, records, log, st, nr)
    #########################
    ## Database Merges ####
    ######################
    elif args.merge_paths:
        ## Commit this amount of records at once
        commit_count = 100000

        def progress():
            elapsed = time() - st
            log.info("Inserted %i records in %.2fs (%.2f records/s)", nr, elapsed, nr / elapsed)
        # end

        for merge_path in args.merge_paths:
            merge_path = Path(merge_path)
            if not is_url(merge_path) and not merge_path.isfile():
                log.error("Database at '%s' didn't exist - skipping", merge_path)
                continue
            # end for each path

            log.info("Merging DB at '%s' ...", merge_path)
            merge_engine = create_engine(self._url_from_path(merge_path))
            mcon = merge_engine.connect()
            md = MetaData(merge_engine, reflect=True)
            try:
                for table in md.tables.itervalues():
                    # If id is part of it, and we rollback because of a unicode error, the counter
                    # will be offset and we cannot commit anymore. Just let it be done automatically, no
                    # matter what
                    column_names = [c.name for c in table.columns if c.name != "id"]
                    try:
                        cursor = mcon.execute(select([table]))
                        # We assume the cursor deals with the query efficiently, and doesn't really fetch everything
                        while True:
                            fst = time()
                            log.info("Fetching %i '%s' records from '%s' ...", commit_count, table.name, merge_path)
                            rows = cursor.fetchmany(commit_count)
                            records.extend(dict(zip(column_names, row)) for row in rows)
                            elapsed = time() - fst
                            log.info("Fetched %i records in %.2fs (%.2f records/s)",
                                     len(records), elapsed, len(records) / elapsed)
                            nr += len(records)
                            must_break = len(records) < commit_count

                            ##############
                            self.do_execute_records(connection, insert, records, log, st, nr)
                            progress()
                            ##############

                            # Did we get enough ?
                            if must_break:
                                break
                            # end check for end of iteration
                        # end endless loop
                    finally:
                        cursor.close()
                # end for each table to merge
            finally:
                mcon.close()
            # end assure we close resources
        # end for each merge path
    else:
        raise AssertionError("Reached unexpected mode")
    # end handle mode of operation

    ##############################
    # CREATE INDICES AND VIEWS ##
    ############################
    if args.with_index:
        # Create one index per column, which allows fast searches over it
        # Create a custom one that speeds up our common search: group by path, order by path, mtime
        for col in fsitem.columns:
            # id is primary, and thus already indexed
            # path is too big - it needs to be hashed to be useful in an actual index
            # sha1 as well
            if col in (fsitem.c.id, fsitem.c.path, fsitem.c.sha1):
                continue
            # end handle index creation
            ist = time()
            log.info("Creating index for column '%s' ...", col)
            try:
                Index("idx_%s_%s" % (fsitem.name, col.name), col).create(engine)
            except Exception:
                log.error("Index creation failed", exc_info=True)
            else:
                elapsed = time() - ist
                log.info("Created index with %i entries in %.2fs (%.2f entries/s)" % (nr, elapsed, nr / elapsed))
            # end handle creation errors
        # end for each index to create
    # end handle index creation

    if args.sql_directories:
        for sql_dir in args.sql_directories:
            sql_dir = Path(sql_dir)
            for sql_file in sql_dir.files(pattern="*.sql"):
                try:
                    transaction = connection.begin()
                    log.info("Creating view from '%s'", sql_file)
                    connection.execute(sqlite_view_from_file(sql_file))
                    transaction.commit()
                except Exception:
                    transaction.rollback()
                    log.error("Failed to create view for file '%s' - it might have existed - skipping", sql_file)
                    continue
                # end handle transaction per sql view
            # end for each file
        # end for each sql dir
    # end have sql directories

    # FINAL CLEANUP
    ################
    # If there were unicode errors, we end up having a row with a null-path. This breaks our code.
    # Let's keep the data consistent instead of altering code
    dst = time()
    connection.execute(fsitem.delete().where(fsitem.c.path == None))
    log.info("Cleaned dataset after (possible) unicode errors in %fs", time() - dst)

    connection.close()

    ##################
    # FINAL INFO ###
    ###############
    elapsed = time() - st
    log.info("Overall time to process %i records is %.2fs (%.2f records/s)", nr, elapsed, nr / elapsed)
    log.info("File written to %s", Path(args.update_db).abspath())

    return self.SUCCESS
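# A hedged sketch of inspecting the resulting table with the same (older) SQLAlchemy API used above.
# The database file name and table name are illustrative assumptions; only the 'path' column is
# referenced by the code itself.
#
#   from sqlalchemy import create_engine, MetaData, select
#   engine = create_engine('sqlite:///fsstat.db')
#   meta = MetaData(engine, reflect=True)
#   fsitem = meta.tables['fsitems']                  # whatever --table-name was set to
#   with engine.connect() as connection:
#       for row in connection.execute(select([fsitem.c.path]).limit(10)):
#           print(row.path)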