import os
import sys
import traceback


def get_per_file_path(self, htype, filename):
    '''
    return the full path to the file containing the checksum of
    the specified type for the given filename. this is only in txt format
    '''
    dfname = DumpFilename(self.wiki, None)  # fixme check to see if this is right or what
    dfname.new_from_filename(Checksummer.get_checksum_basename_perfile(htype, filename))
    return self.dump_dir.filename_public_path(dfname)
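
# Illustration only: a standalone sketch of how a per-file checksum path could
# compose, assuming a hypothetical "<htype>sums-<filename>.txt" basename; the
# real layout comes from Checksummer.get_checksum_basename_perfile and the
# DumpDir path helpers.
def _demo_per_file_checksum_path(public_dir, date, htype, filename):
    basename = "%ssums-%s.txt" % (htype, filename)  # hypothetical scheme
    return os.path.join(public_dir, date, basename)

# e.g. _demo_per_file_checksum_path("/dumps/public/enwiki", "20240601", "md5",
#                                   "enwiki-20240601-abstract.xml.gz")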
def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                  checkpoint=None, onlyparts=False):
    """
    Remove symlinks from the 'latest' directory for (some) links that point to
    files from runs other than the current one (of 'date_string').
    If dump_name, partnum, checkpoint are False or None, we remove all the old
    symlinks for all values of that arg in the filename.
    Example: if partnum is False or None, we remove old values for all file parts.

    This needs to do more work if there are file parts or checkpoint files
    linked in here from earlier dates. Checkpoint ranges change, and the
    configuration of parallel jobs for file parts changes too, so old files may
    still exist and the links need to be removed because we have newer files
    for the same phase of the dump.

    So we keep symlinks to files from one older run only, and clean up the
    rest. We do this because here at WMF we alternate partial and full runs,
    and we like to keep the links to files from the full runs around until a
    new full run is in place. Really the number of runs to keep should be
    configurable (FIXME later I guess).
    """
    if SymLinks.NAME not in self._enabled:
        return
    latest_dir = self.dump_dir.latest_dir()
    files = os.listdir(latest_dir)
    dates = []
    files_for_cleanup = []
    for filename in files:
        link = os.path.join(latest_dir, filename)
        if os.path.islink(link):
            realfilepath = os.readlink(link)
            dfname = DumpFilename(self.dump_dir._wiki)
            dfname.new_from_filename(os.path.basename(realfilepath))
            files_for_cleanup.append({'link': link, 'dfname': dfname, 'path': realfilepath})
            dates.append(dfname.date)
    # sort so that 'previous run' below really is the chronologically previous one
    dates = sorted(dates)
    try:
        index = dates.index(date_string)
        prev_run_date = dates[index - 1] if index > 0 else None
    except ValueError:
        prev_run_date = dates[-2] if len(dates) >= 2 else None
    for item in files_for_cleanup:
        if item['dfname'].date >= date_string:
            continue
        if dump_name and (item['dfname'].dumpname != dump_name):
            continue
        if prev_run_date is None or item['dfname'].date == prev_run_date:
            # for the previous run, or the only existing run if different
            # from the current one, we are very careful. For all older runs
            # we pretty much want to toss everything.
            # fixme check that these are ok if the value is None
            if (partnum or onlyparts) and (item['dfname'].partnum != partnum):
                continue
            if checkpoint and (item['dfname'].checkpoint != checkpoint):
                continue
        self.debugfn("Removing old symlink %s -> %s" % (item['link'], item['path']))
        os.remove(item['link'])
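
# Standalone sketch of the "keep only the previous run's links" selection used
# above: given the parsed dates of all linked files, find the run immediately
# before the current one. The date strings below are invented for illustration.
def _demo_previous_run_date(link_dates, current_date):
    dates = sorted(link_dates)
    try:
        index = dates.index(current_date)
        return dates[index - 1] if index > 0 else None
    except ValueError:
        return dates[-2] if len(dates) >= 2 else None

# _demo_previous_run_date(["20240401", "20240501", "20240601"], "20240601")
# returns "20240501": links from 20240501 get the careful per-part/checkpoint
# treatment, while links from 20240401 are cleaned up wholesale.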
def write_specialfilesinfo_file(self):
    """
    get info about all files for the most current dump of a given
    wiki, possibly in progress, that don't contain dump job output;
    write this info to an output file
    """
    if SpecialFileInfo.NAME not in self._enabled:
        return

    dump_dir = DumpDir(self.wiki, self.wiki.db_name)
    files = self.get_special_filenames()

    fileinfo = {}
    for filename in files:
        fileinfo[filename] = {}
        path = os.path.join(self.wiki.public_dir(), self.wiki.date, filename)
        fileinfo[filename]['status'] = 'present'
        try:
            size = os.path.getsize(path)
            fileinfo[filename]['size'] = size
        except Exception:
            fileinfo[filename]['status'] = 'missing'
            continue
        dfname = DumpFilename(self.wiki)
        dfname.new_from_filename(os.path.basename(path))
        fileinfo[filename]['url'] = dump_dir.web_path_relative(dfname)

    contents = {}
    contents['files'] = fileinfo
    contents['version'] = SpecialFileInfo.VERSION

    try:
        self.write_contents(contents)
    except Exception:
        if self.verbose:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            sys.stderr.write(repr(traceback.format_exception(
                exc_type, exc_value, exc_traceback)))
        message = "Couldn't write special files info. Continuing anyways"
        if self.error_callback:
            self.error_callback(message)
        else:
            sys.stderr.write("%s\n" % message)
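
# Example of the structure handed to write_contents() above; the filenames,
# sizes, URLs and version string here are invented for illustration (the real
# version comes from SpecialFileInfo.VERSION).
_EXAMPLE_SPECIALFILES_CONTENTS = {
    'files': {
        'dumpruninfo.txt': {'status': 'present', 'size': 1234,
                            'url': 'enwiki/20240601/dumpruninfo.txt'},
        'dumpstatus.json': {'status': 'missing'},
    },
    'version': '0.8',
}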
def command_completion_callback(self, series):
    """
    if the series of commands ran successfully to completion,
    move produced output files from temporary to permanent names

    we write the data into temporary locations initially so that
    as each command series completes, its output files can be made
    available as done immediately, rather than waiting for all the
    parallel processes of a dump step to complete first

    args: CommandSeries for which all commands have completed
    """
    if not series.exited_successfully():
        return

    for commands in self.commands_submitted:
        if commands['series'] == series._command_series:
            if not commands['output_files']:
                return
            for inprogress_filename in commands['output_files']:
                if not inprogress_filename.endswith(DumpFilename.INPROG):
                    continue
                final_dfname = DumpFilename(commands['runner'].wiki)
                final_dfname.new_from_filename(
                    inprogress_filename[:-len(DumpFilename.INPROG)])
                in_progress_path = os.path.join(commands['output_dir'], inprogress_filename)
                final_path = os.path.join(commands['output_dir'], final_dfname.filename)
                try:
                    os.rename(in_progress_path, final_path)
                except Exception:
                    if self.verbose:
                        exc_type, exc_value, exc_traceback = sys.exc_info()
                        sys.stderr.write(repr(traceback.format_exception(
                            exc_type, exc_value, exc_traceback)))
                    continue
                # sanity check of file contents, move if bad
                self.move_if_truncated(commands['runner'], final_dfname)
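
# Minimal sketch of the in-progress -> final rename performed above; the
# ".inprog" suffix is an assumption standing in for DumpFilename.INPROG, and
# the function name is hypothetical.
def _demo_publish_inprogress(output_dir, inprogress_filename, suffix=".inprog"):
    if not inprogress_filename.endswith(suffix):
        return None
    final_name = inprogress_filename[:-len(suffix)]
    os.rename(os.path.join(output_dir, inprogress_filename),
              os.path.join(output_dir, final_name))
    return final_name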
def __init__(self, wiki, prefetch=True, prefetchdate=None, spawn=True, job=None,
             skip_jobs=None, restart=False, notice="", dryrun=False, enabled=None,
             partnum_todo=None, checkpoint_file=None, page_id_range=None,
             skipdone=False, cleanup=False, do_prereqs=False, verbose=False):
    self.wiki = wiki
    self.db_name = wiki.db_name
    self.prefetch = prefetch
    self.prefetchdate = prefetchdate
    self.spawn = spawn
    self.filepart_info = FilePartInfo(wiki, self.db_name, self.log_and_print)
    self.restart = restart
    self.html_notice_file = None
    self.log = None
    self.dryrun = dryrun
    self._partnum_todo = partnum_todo
    self.checkpoint_file = checkpoint_file
    self.page_id_range = page_id_range
    self.skipdone = skipdone
    self.verbose = verbose
    self.enabled = enabled
    self.cleanup_old_files = cleanup
    self.do_prereqs = do_prereqs

    if self.checkpoint_file is not None:
        fname = DumpFilename(self.wiki)
        fname.new_from_filename(checkpoint_file)
        # we should get file partnum if any
        if self._partnum_todo is None and fname.partnum_int:
            self._partnum_todo = fname.partnum_int
        elif (self._partnum_todo is not None and fname.partnum_int and
              self._partnum_todo != fname.partnum_int):
            raise BackupError("specified partnum to do does not match part number "
                              "of checkpoint file %s to redo" % self.checkpoint_file)
        self.checkpoint_file = fname

    if self.enabled is None:
        self.enabled = {}
    for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                    RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                    Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps",
                    "cleanup_old_files", "check_trunc_files"]:
        self.enabled[setting] = True

    if not self.cleanup_old_files:
        if "cleanup_old_files" in self.enabled:
            del self.enabled["cleanup_old_files"]

    if self.dryrun or self._partnum_todo is not None or self.checkpoint_file is not None:
        for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                        RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                        Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps"]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.dryrun:
        for setting in ["check_trunc_files"]:
            if setting in self.enabled:
                del self.enabled[setting]
        if "logging" in self.enabled:
            del self.enabled["logging"]

    self.job_requested = job

    if self.job_requested == "latestlinks":
        for setting in [StatusHtml.NAME, IndexHtml.NAME, RunInfoFile.NAME]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.job_requested == "createdirs":
        for setting in [SymLinks.NAME, Feeds.NAME, RunSettings.NAME]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.job_requested in ("latestlinks", "createdirs"):
        for setting in [Checksummer.NAME, NoticeFile.NAME, "makedir",
                        "clean_old_dumps", "check_trunc_files"]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.job_requested == "noop":
        for setting in ["clean_old_dumps", "check_trunc_files"]:
            if setting in self.enabled:
                del self.enabled[setting]

    self.skip_jobs = skip_jobs
    if skip_jobs is None:
        self.skip_jobs = []

    self.db_server_info = DbServerInfo(self.wiki, self.db_name, self.log_and_print)
    self.dump_dir = DumpDir(self.wiki, self.db_name)

    # these must come after the dumpdir setup so we know which directory we are in
    if "logging" in self.enabled and "makedir" in self.enabled:
        file_obj = DumpFilename(self.wiki)
        file_obj.new_from_filename(self.wiki.config.log_file)
        self.log_filename = self.dump_dir.filename_private_path(file_obj)
        self.make_dir(os.path.join(self.wiki.private_dir(), self.wiki.date))
        self.log = Logger(self.log_filename)
        # thread should die horribly when main script dies. no exceptions.
        self.log.daemon = True
        self.log.start()

    self.dumpjobdata = DumpRunJobData(self.wiki, self.dump_dir, notice,
                                      self.log_and_print, self.debug,
                                      self.enabled, self.verbose)

    # some or all of these dump_items will be marked to run
    self.dump_item_list = DumpItemList(self.wiki, self.prefetch, self.prefetchdate,
                                       self.spawn, self._partnum_todo,
                                       self.checkpoint_file, self.job_requested,
                                       self.skip_jobs, self.filepart_info,
                                       self.page_id_range, self.dumpjobdata,
                                       self.dump_dir, self.verbose)

    # only send email failure notices for full runs
    email = not self.job_requested
    self.failurehandler = FailureHandler(self.wiki, email)
    self.statushtml = StatusHtml(self.wiki, self.dump_dir,
                                 self.dump_item_list.dump_items,
                                 self.dumpjobdata, self.enabled,
                                 self.failurehandler,
                                 self.log_and_print, self.verbose)
    self.indexhtml = IndexHtml(self.wiki, self.dump_dir,
                               self.dump_item_list.dump_items,
                               self.dumpjobdata, self.enabled,
                               self.failurehandler,
                               self.log_and_print, self.verbose)
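
# Sketch of the feature-flag pattern the constructor uses: start with
# everything enabled, then strip the features that don't apply given the
# dryrun flag or the requested job. Only a subset of the real feature names
# is shown, for illustration.
def _demo_enabled_features(job_requested=None, dryrun=False):
    enabled = {name: True for name in
               ("makedir", "clean_old_dumps", "check_trunc_files", "logging")}
    if dryrun:
        for name in ("check_trunc_files", "logging"):
            enabled.pop(name, None)
    if job_requested == "noop":
        for name in ("clean_old_dumps", "check_trunc_files"):
            enabled.pop(name, None)
    return enabled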