def _write_dump_runinfo(self, content):
    """
    Write the dump run info out, one file per known format.

    args:
        content: dict mapping each format in RunInfo.FORMATS to the
                 text to write for that format
    """
    # fix: removed dead commented-out FileUtils.write_file call
    for fmt in RunInfo.FORMATS:
        dump_runinfo_filename = self._get_dump_runinfo_filename(fmt=fmt)
        FileUtils.write_file_in_place(dump_runinfo_filename, content[fmt],
                                      self.wiki.config.fileperms)
def write_status(wiki, message):
    '''
    write the given status message (covering all dump jobs for the
    wiki dump run) into the status html file
    '''
    status_path = StatusHtml.get_statusfile_path(wiki, wiki.date)
    FileUtils.write_file_in_place(status_path, message,
                                  wiki.config.fileperms)
def cp_chksum_tmpfiles_to_permfile(self):
    """
    copy the content of each per-hashtype temporary checksum file
    into its permanent location; noop unless the checksummer job
    is enabled for this run
    """
    if Checksummer.NAME in self._enabled:
        for htype in Checksummer.HASHTYPES:
            tmp_filename = self._get_checksum_filename_tmp(htype)
            real_filename = self._get_checksum_filename(htype)
            text = FileUtils.read_file(tmp_filename)
            # NOTE(review): stages the write via config.temp_dir; a sibling
            # implementation elsewhere in this file stages via the per-wiki
            # tempdir instead -- confirm which is intended
            FileUtils.write_file(self.wiki.config.temp_dir, real_filename, text,
                                 self.wiki.config.fileperms)
def db_info_by_age(self, use_status_time=False):
    """
    Sort wikis in reverse order of last successful dump and return
    tuples of information for each wiki:
      * whether the dump failed,
      * the date of the run as found in dump dir string OR as
        determined by time of status file, if use_status_time is True,
      * age of status file if any,
      * wiki name

    Order is (DumpFailed, Age), and False < True:
    First, wikis whose latest dump was successful, most recent dump first
    Then, wikis whose latest dump failed, most recent dump first.
    Finally, wikis which have never been dumped.

    According to that sort, the last item of this list is, when applicable,
    the oldest failed dump attempt.

    If some error occurs checking a dump status, that dump is put last in the
    list (sort value is (True, maxsize) )

    Note that we now sort this list by the date of the dump directory, not the
    last date that a dump file in that directory may have been touched. This
    allows us to rerun jobs to completion from older runs, for example
    an en pedia history run that failed in the middle, without borking the
    index page links.
    """
    available = []
    today = int(TimeUtils.today())
    for dbname in self.db_list:
        wiki = Wiki(self, dbname)
        # maxsize defaults push error/never-dumped wikis to the end of the sort
        age = sys.maxsize
        date = sys.maxsize
        last = wiki.latest_dump()
        status = ''
        if last:
            dump_status = StatusHtml.get_statusfile_path(wiki, last)
            try:
                if use_status_time:
                    # only use the status file time, not the dir date
                    date = today
                else:
                    date = today - int(last)
                # tack on the file mtime so that if we have multiple wikis
                # dumped on the same day, they get ordered properly
                age = FileUtils.file_age(dump_status)
                status = FileUtils.read_file(dump_status)
            except Exception:
                # fix: exception var was bound but never used
                print("dump dir missing status file %s?" % dump_status)
        dump_failed = (status == '') or ('dump aborted' in status)
        available.append((dump_failed, date, age, dbname))
    return sorted(available)
def update_index_html_and_json(self, dump_status=""):
    '''
    generate the index.html file for the wiki's dump run
    which contains information on each dump step as well as
    links to completed files for download, hash files, etc.
    and links to completed files;
    generate the json file with the same information as well
    '''
    if Report.NAME in self._enabled:
        self.dumpjobdata.notice.refresh_notice()
        status_items = [Report.report_dump_step_status(self.dump_dir, item)
                        for item in self.items]
        status_items_html = [item['html'] for item in status_items]
        # most recent step first on the page
        status_items_html.reverse()
        html = "\n".join(status_items_html)
        checksums = [self.get_checksum_html(htype)
                     for htype in Checksummer.HASHTYPES]
        checksums_html = ", ".join(checksums)
        failed_jobs = sum(1 for item in self.items if item.status() == "failed")
        # fill the per-dump report template with run status + file listings
        txt = self.wiki.config.read_template("report.html") % {
            "db": self.wiki.db_name,
            "date": self.wiki.date,
            "notice": self.dumpjobdata.notice.notice,
            "status": StatusHtml.report_dump_status(failed_jobs, dump_status),
            "previous": self.report_previous_dump_link(dump_status),
            "items": html,
            "checksum": checksums_html,
            "index": self.wiki.config.index}
        # merge the per-step json fragments into one jobs dict
        json_out = {'jobs': {}}
        for item in status_items:
            for jobname in item['json']:
                json_out['jobs'][jobname] = item['json'][jobname]
        try:
            indexpath = os.path.join(self.wiki.public_dir(),
                                     self.wiki.date,
                                     self.wiki.config.perdump_index)
            FileUtils.write_file_in_place(indexpath, txt,
                                          self.wiki.config.fileperms)
            json_filepath = os.path.join(self.wiki.public_dir(),
                                         self.wiki.date,
                                         Report.JSONFILE)
            FileUtils.write_file_in_place(json_filepath, json.dumps(json_out),
                                          self.wiki.config.fileperms)
        except Exception:
            # a failed status page write should not abort the dump run
            if self.verbose:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                sys.stderr.write(repr(traceback.format_exception(
                    exc_type, exc_value, exc_traceback)))
            message = "Couldn't update status files. Continuing anyways"
            if self.error_callback:
                self.error_callback(message)
            else:
                sys.stderr.write("%s\n" % message)
def db_latest_status(self):
    '''
    return list of tuples for each wiki:
    (wiki name, status of latest wiki dump or None if wiki never
    dumped, date string of latest dump or falsy if never dumped)
    '''
    dbinfo = []
    for dbname in self.db_list:
        wiki = Wiki(self, dbname)
        last = wiki.latest_dump()
        status = ''
        if last:
            dump_status = StatusHtml.get_statusfile_path(wiki, last)
            try:
                status = FileUtils.read_file(dump_status)
            except Exception:
                # fix: exception var was bound but never used;
                # unreadable/absent status file means the run failed
                status = 'failed'
            # collapse the raw status text down to a single keyword
            for value in ['missing', 'not yet', 'failed', 'aborted',
                          'progress', 'partial', 'complete']:
                if value in status:
                    status = value
                    break
        else:
            status = None
        dbinfo.append((dbname, status, last))
    return dbinfo
def move_if_truncated(self, runner, dfname, emptycheck=0, tmpdir=False):
    """
    check if the given file (DumpFile) is truncated or empty
    if so, move it out of the way and return True
    return False otherwise

    if emptycheck is set to a number, the file will only be checked
    to see if it is empty, if the file covers a page range with
    more pages than the specific number. Eg a file named
    elwikivoyage-20180618-pages-meta-history2.xml-p140p150.bz2
    would be checked for emptycheck = 8 but not for 12; files
    that don't have page start and end numbers in the filename
    would not be checked at all.

    if emptycheck is left as 0, the file will be checked to see
    if it is empty always.

    if file is located in the temp dir, set tmpdir=True for it
    to be found there; otherwise the public xml/sql dump output
    dir (or private, if the wiki is private), will be checked
    for the file.
    """
    if "check_trunc_files" not in runner.enabled or not self.check_truncation():
        return False

    # locate the file: temp dir, private dump dir, or public dump dir
    if tmpdir:
        path = os.path.join(
            FileUtils.wiki_tempdir(runner.wiki.db_name, runner.wiki.config.temp_dir),
            dfname.filename)
    elif runner.wiki.is_private():
        path = runner.dump_dir.filename_private_path(dfname)
    else:
        path = runner.dump_dir.filename_public_path(dfname)

    dcontents = DumpContents(runner.wiki, path)

    file_truncated = True
    if os.path.exists(dcontents.filename):
        # for some file types we will check that the file has the right closing tag
        last_tag = None
        if ('.xml' in dcontents.filename and
                ('.bz2' in dcontents.filename or '.gz' in dcontents.filename)):
            last_tag = b'</mediawiki>'

        # fixme hardcoded at 200? mmmm. but otoh configurable is kinda dumb
        if (not emptycheck or self.is_larger(dfname, 200)) and dcontents.check_if_empty():
            # file exists and is empty, move it out of the way
            dcontents.rename(dcontents.filename + ".empty")
        elif dcontents.check_if_truncated(last_tag):
            # The file exists and is truncated, move it out of the way
            dcontents.rename(dcontents.filename + ".truncated")
        elif dcontents.check_if_binary_crap():
            # The file exists and has binary junk in it, move it out of the way
            dcontents.rename(dcontents.filename + ".truncated")
        else:
            # The file exists and is not truncated and doesn't have random crap.
            # Heck, it's a good file!
            file_truncated = False
    else:
        # file doesn't exist, move on
        file_truncated = False
    return file_truncated
def refresh_notice(self):
    '''
    re-sync our stashed notice text with the notice file on disk:
    if the file has changed or gone away, we comply.
    '''
    notice_file = self._get_notice_filename()
    if not exists(notice_file):
        self.notice = ""
    else:
        self.notice = FileUtils.read_file(notice_file)
def save_symlink(self, dumpfile):
    """
    maintain a symlink in the 'latest' directory pointing at the given
    dump output file, replacing any existing link that points at an
    older run; noop unless symlinks are enabled for this run
    """
    if SymLinks.NAME in self._enabled:
        self.make_dir(self.dump_dir.latest_dir())
        realfile = self.dump_dir.filename_public_path(dumpfile)
        # same filename but with 'latest' in place of the run date
        latest_filename = dumpfile.new_filename(dumpfile.dumpname,
                                                dumpfile.file_type,
                                                dumpfile.file_ext,
                                                'latest',
                                                dumpfile.partnum,
                                                dumpfile.checkpoint,
                                                dumpfile.temp)
        link = os.path.join(self.dump_dir.latest_dir(), latest_filename)
        if exists(link) or os.path.islink(link):
            if os.path.islink(link):
                oldrealfile = os.readlink(link)
                # format of these links should be...
                # ../20110228/elwikidb-20110228-templatelinks.sql.gz
                rellinkpattern = re.compile(r'^\.\./(20[0-9]+)/')
                dateinlink = rellinkpattern.search(oldrealfile)
                if dateinlink:
                    dateoflinkedfile = dateinlink.group(1)
                    dateinterval = int(self.wiki.date) - int(dateoflinkedfile)
                else:
                    dateinterval = 0
                # no file or it's older than ours... *then* remove the link
                if not exists(os.path.realpath(link)) or dateinterval > 0:
                    self.debugfn("Removing old symlink %s" % link)
                    os.remove(link)
            else:
                self.logfn("What the hell dude, %s is not a symlink" % link)
                raise BackupError("What the hell dude, %s is not a symlink" % link)
        relative = FileUtils.relative_path(realfile, os.path.dirname(link))
        # if we removed the link cause it's obsolete, make the new one
        if exists(realfile) and not exists(link):
            self.debugfn("Adding symlink %s -> %s" % (link, relative))
            os.symlink(relative, link)
def cp_chksum_tmpfiles_to_permfile(self):
    """
    during a dump run, checksum files are written to a temporary
    location and updated there; we copy the content from these
    files into the permanent location after each dump job
    completes
    """
    if Checksummer.NAME in self._enabled:
        # one checksum file per (hash type, output format) pair
        for htype in Checksummer.HASHTYPES:
            for fmt in Checksummer.FORMATS:
                tmp_filename = self._get_checksum_filename_tmp(htype, fmt)
                real_filename = self._get_checksum_path(htype, fmt)
                content = FileUtils.read_file(tmp_filename)
                FileUtils.write_file(
                    FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                    real_filename, content, self.wiki.config.fileperms)
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun, verbose):
    '''
    do a stubs xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    # set up one output record per requested stub flavor; any of the
    # three file args may be None to skip that flavor
    outfiles = {}
    if history_file is not None:
        outfiles['history'] = {'name': history_file}
    if current_file is not None:
        outfiles['current'] = {'name': current_file}
    if articles_file is not None:
        outfiles['articles'] = {'name': articles_file}
    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    command.extend(["--wiki=%s" % wikidb, "--full", "--stub", "--report=1000"])
    if history_file is not None:
        command.append("--output=file:%s" % outfiles['history']['temp'])
    if current_file is not None:
        command.extend(["--output=file:%s" % outfiles['current']['temp'],
                        "--filter=latest"])
    if articles_file is not None:
        command.extend(["--output=file:%s" % outfiles['articles']['temp'],
                        "--filter=latest", "--filter=notalk",
                        "--filter=namespace:!NS_USER"])

    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval
    else:
        callback = None

    # the xml header, the body, and the xml footer should be separate gzipped
    # streams all concatted together
    # note that do_xml_stream exits on failure after cleaning up all output files
    # so the parent process must simply retry later
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback, footer=True)
def refresh_notice(self):
    '''
    if the notice file has changed or gone away, we comply:
    re-read it when present, otherwise clear the stashed text
    '''
    notice_filepath = self._get_notice_filename()
    self.notice = (FileUtils.read_file(notice_filepath)
                   if os.path.exists(notice_filepath) else "")
def write_notice_file(self):
    """
    sync the notice file with self.notice:
    False -> remove any existing notice file and clear self.notice,
    nonempty string -> write it to the notice file,
    empty string -> stash the content of any existing notice file
    """
    if NoticeFile.NAME in self._enabled:
        notice_file = self._get_notice_filename()
        # delnotice. toss any existing file
        if self.notice is False:
            if exists(notice_file):
                os.remove(notice_file)
            self.notice = ""
        # addnotice, stuff notice in a file for other jobs etc
        elif self.notice != "":
            FileUtils.write_file(self.wiki.config.temp_dir, notice_file,
                                 self.notice, self.wiki.config.fileperms)
        # default case. if there is a file get the contents, otherwise
        # we have empty contents, all good
        else:
            if exists(notice_file):
                self.notice = FileUtils.read_file(notice_file)
def do_abstractsbackup(wikidb, output_files, variants, wikiconf,
                       start, end, dryrun, verbose):
    '''
    do an abstracts xml dump one piece at a time, writing into
    uncompressed temporary files and shovelling those into gzip's
    stdin for the concatenated compressed output
    '''
    # pair each variant with its output file; the two lists are
    # presumed parallel (same length and order) -- enforced by caller
    outfiles = {}
    index = 0
    for variant in variants:
        outfiles[variant] = {'name': output_files[index]}
        index += 1

    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version
    filter_path = os.path.join(abstract_cmd_dir,
                               "extensions/ActiveAbstract/AbstractFilter.php")
    # newer layouts of the extension keep the filter under includes/
    if not os.path.exists(filter_path):
        filter_path = os.path.join(
            abstract_cmd_dir,
            "extensions/ActiveAbstract/includes/AbstractFilter.php")
    abstract_filter = ("--plugin=AbstractFilter:" + filter_path)

    command.extend(["--wiki=%s" % wikidb, abstract_cmd_dir,
                    abstract_filter,
                    "--current", "--report=1000", "--namespaces=0"])
    for filetype in outfiles:
        command.extend(["--output=file:%s" % outfiles[filetype]['temp'],
                        "--filter=namespace:NS_MAIN", "--filter=noredirect",
                        "--filter=abstract%s" % filetype])

    # header, body and footer are produced as three separate runs whose
    # gzipped streams are concatenated together
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose,
                  header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose,
                  footer=True)
def update_index_html(self, dump_status=""):
    '''
    generate the index.html file for the wiki's dump run
    which contains information on each dump step as well as
    links to completed files for download, hash files, etc.
    and links to completed files
    '''
    if IndexHtml.NAME in self._enabled:
        self.dumpjobdata.noticefile.refresh_notice()
        status_items = [IndexHtml.report_dump_step_status(self.dump_dir, item)
                        for item in self.items]
        # most recent step first on the page
        status_items.reverse()
        html = "\n".join(status_items)
        checksums = [self.get_checksum_html(htype)
                     for htype in Checksummer.HASHTYPES]
        checksums_html = ", ".join(checksums)
        text = self.wiki.config.read_template("report.html") % {
            "db": self.wiki.db_name,
            "date": self.wiki.date,
            "notice": self.dumpjobdata.noticefile.notice,
            "status": StatusHtml.report_dump_status(
                self.failhandler.failure_count, dump_status),
            "previous": self.report_previous_dump_link(dump_status),
            "items": html,
            "checksum": checksums_html,
            "index": self.wiki.config.index}
        try:
            index = os.path.join(self.wiki.public_dir(),
                                 self.wiki.date,
                                 self.wiki.config.perdump_index)
            FileUtils.write_file_in_place(index, text,
                                          self.wiki.config.fileperms)
        except Exception:
            # fix: exception var was bound but never used.
            # a failed status page write should not abort the dump run
            if self.verbose:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                sys.stderr.write(repr(traceback.format_exception(
                    exc_type, exc_value, exc_traceback)))
            message = "Couldn't update status files. Continuing anyways"
            if self.error_callback:
                self.error_callback(message)
            else:
                sys.stderr.write("%s\n" % message)
def save_feed(self, file_obj):
    """
    write an rss feed file for the given dump output file into the
    'latest' directory; noop unless feeds are enabled for this run
    """
    if Feeds.NAME in self._enabled:
        self.make_dir(self.dump_dir.latest_dir())
        filename_and_path = self.dump_dir.web_path(file_obj)
        web_path = os.path.dirname(filename_and_path)
        rss_text = self.wiki.config.read_template("feed.xml") % {
            "chantitle": file_obj.basename,
            "chanlink": web_path,
            "chandesc": "Wikimedia dump updates for %s" % self.db_name,
            "title": web_path,
            "link": web_path,
            "description": xml_escape("<a href=\"%s\">%s</a>" % (
                filename_and_path, file_obj.filename)),
            "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
        }
        rss_path = os.path.join(self.dump_dir.latest_dir(),
                                self.db_name + "-latest-" + file_obj.basename +
                                "-rss.xml")
        self.debugfn("adding rss feed file %s " % rss_path)
        FileUtils.write_file(self.wiki.config.temp_dir, rss_path, rss_text,
                             self.wiki.config.fileperms)
def _get_checksum_filename_tmp(self, htype, fmt):
    """
    args:
        hashtype ('md5', 'sha1',...)
        format of output ('json', 'txt', ...)

    returns:
        full path of a unique-enough temporary output file
        for wiki and date
    """
    basename = Checksummer.get_checksum_filename_basename(htype, fmt)
    dfname = DumpFilename(self.wiki, None,
                          "%s.%s.tmp" % (basename, self.timestamp))
    tempdir = FileUtils.wiki_tempdir(self.wiki.db_name,
                                     self.wiki.config.temp_dir)
    return os.path.join(tempdir, dfname.filename)
def get_checksum_from_file(path):
    '''
    get the checksum recorded in a file which should have one line,
    consisting of the checksum, two spaces, and the filename that
    was checksummed

    return None on any error
    '''
    try:
        checksum, _filename = FileUtils.read_file(path).split(' ', 1)
    except Exception:
        return None
    return checksum
def save_feed(self, dfname):
    """
    produce an rss feed file for the specified dump output file (dfname)

    If there is already such a feed, update it only if the date of
    the dump output file in the feed is not older than the date of
    dfname, as indicated in the dump dirs/filenames themselves, NOT
    via stat

    args:
        DumpFilename
    """
    if Feeds.NAME in self._enabled:
        rss_path = os.path.join(self.dump_dir.latest_dir(),
                                self.db_name + "-latest-" + dfname.basename +
                                "-rss.xml")
        self.make_dir(self.dump_dir.latest_dir())
        filename_and_path = self.dump_dir.web_path(dfname)
        web_path = os.path.dirname(filename_and_path)
        # leave the existing feed alone if it already covers a newer file
        if self.feed_newer_than_file(rss_path, dfname):
            return
        rss_text = self.wiki.config.read_template("feed.xml") % {
            "chantitle": dfname.basename,
            "chanlink": web_path,
            "chandesc": "Wikimedia dump updates for %s" % self.db_name,
            "title": web_path,
            "link": web_path,
            "description": xml_escape("<a href=\"%s\">%s</a>" % (
                filename_and_path, dfname.filename)),
            "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
        }
        self.debugfn("adding rss feed file %s " % rss_path)
        FileUtils.write_file(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            rss_path, rss_text, self.wiki.config.fileperms)
def status_line(wiki, aborted=False):
    '''
    return the status html for the latest dump run of the wiki, or a
    reasonable fallback status line if there is no run or no readable
    status file

    if 'aborted' is True, don't read in anything but return a line of
    html that the dump was aborted
    '''
    date = wiki.latest_dump()
    if date:
        if aborted:
            return StatusHtml.report_statusline(
                wiki, "<span class=\"failed\">dump aborted</span>")
        status = StatusHtml.get_statusfile_path(wiki, date)
        try:
            return FileUtils.read_file(status)
        except Exception:
            # fix: exception var was bound but never used (and the
            # duplicate of this method elsewhere in the file omits it)
            return StatusHtml.report_statusline(wiki, "missing status record")
    else:
        return StatusHtml.report_statusline(wiki, "has not yet been dumped")
def get_stub_gen_cmd_for_input(self, input_dfname, output_dfnames, runner):
    """
    for the given input dumpfile (stub), build and return the command
    pipeline that writes the requested output files (stubs), or None
    if all requested outputs already exist

    args:
        input_dfname: DumpFilename of the (compressed) input stub file
        output_dfnames: list of DumpFilename for the wanted outputs
        runner: Runner

    raises BackupError if the writeuptopageid command is missing or
    the input file extension is unknown
    """
    if not exists(self.wiki.config.writeuptopageid):
        raise BackupError("writeuptopageid command %s not found" %
                          self.wiki.config.writeuptopageid)

    if runner.wiki.is_private():
        inputfile_path = runner.dump_dir.filename_private_path(input_dfname)
    else:
        inputfile_path = runner.dump_dir.filename_public_path(input_dfname)

    output_dir = FileUtils.wiki_tempdir(self.wiki.db_name,
                                        self.wiki.config.temp_dir)
    argstrings = []

    for output_dfname in output_dfnames:
        output_fname = output_dfname.filename
        # don't generate the file if we already have it (i.e. this is a retry)
        if not os.path.exists(os.path.join(output_dir, output_fname)):
            # fix: local was misleadingly named 'first_age_id'
            first_page_id = output_dfname.first_page_id
            if (output_dfname.last_page_id is not None and
                    output_dfname.last_page_id != "00000"):
                # writeuptopageid's upper bound is exclusive
                last_page_id = str(int(output_dfname.last_page_id) + 1)
            else:
                last_page_id = ""
            argstrings.append("{outfile}:{firstpage}:{lastpage}".format(
                outfile=output_fname, firstpage=first_page_id,
                lastpage=last_page_id))

    # don't generate an output file if there are no filespecs
    if not argstrings:
        return None

    # decompress the input on stdout, according to its extension
    # (fix: removed dead commented-out shell-string variants)
    if input_dfname.file_ext == "gz":
        command1 = [self.wiki.config.gzip, "-dc", inputfile_path]
    elif input_dfname.file_ext == '7z':
        command1 = [self.wiki.config.sevenzip, "e", "-si", inputfile_path]
    elif input_dfname.file_ext == 'bz':
        command1 = [self.wiki.config.bzip2, "-dc", inputfile_path]
    else:
        raise BackupError("unknown stub file extension %s" %
                          input_dfname.file_ext)

    command2 = [self.wiki.config.writeuptopageid, "--odir", output_dir,
                "--fspecs", ";".join(argstrings)]
    return [command1, command2]
def write_notice(self):
    '''
    write notice file if self.notice has contents,
    or remove it if self.notice is False,
    or read existing file and stash contents, if self.notice
    is empty str
    '''
    if Notice.NAME in self._enabled:
        notice_filepath = self._get_notice_filename()
        # delnotice. toss any existing file
        if self.notice is False:
            if os.path.exists(notice_filepath):
                os.remove(notice_filepath)
            self.notice = ""
        # addnotice, stuff notice in a file for other jobs etc
        elif self.notice != "":
            FileUtils.write_file(
                FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                notice_filepath, self.notice, self.wiki.config.fileperms)
        # default case. if there is a file get the contents, otherwise
        # we have empty contents, all good
        else:
            if os.path.exists(notice_filepath):
                self.notice = FileUtils.read_file(notice_filepath)
def has_no_pages(self, xmlfile, runner, tempdir=False):
    '''
    see if it has a page id in it or not. no? then return True

    args:
        xmlfile: dump output file object to inspect
        runner: Runner, used to locate the file in the dump dirs
        tempdir: if True, look for the file in the temp dir instead
                 of the public/private dump output dir
    '''
    if xmlfile.is_temp_file or tempdir:
        path = os.path.join(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            xmlfile.filename)
    else:
        if runner.wiki.is_private():
            path = runner.dump_dir.filename_private_path(xmlfile, self.wiki.date)
        else:
            path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
    dcontents = DumpContents(self.wiki, path, xmlfile, self.verbose)
    # fix: 'is None' already yields a bool; the bool() wrapper was redundant
    return dcontents.find_first_page_id_in_file() is None
def do_main():
    '''
    main entry point, do all the work
    '''
    (configfile, date, dryrun, filenameformat,
     output_dir, overwrite, wikiname, script, basename,
     query, retries, verbose, remainder) = get_args()
    validate_args(date, output_dir, retries, script, query)
    # default to 3 retries when none requested
    if retries is None:
        retries = "3"
    retries = int(retries)
    if configfile:
        config = Config(configfile)
    else:
        config = Config()
    if date is None:
        date = TimeUtils.today()
    # run either an arbitrary script or a (possibly default) query
    if script is not None:
        runner = ScriptRunner(script, remainder, dryrun, verbose)
    else:
        if query is None:
            query = FileUtils.read_file(config.queryfile)
        runner = QueryRunner(query, dryrun, verbose)
    if basename is not None:
        base = Wiki(config, basename)
        base.set_date(date)
        # NOTE(review): base is always non-None right after construction,
        # so this check looks redundant -- confirm before removing
        if base is not None:
            base.config.parse_conffile_per_project(base.db_name)
    else:
        base = None
    # a single named wiki, or a loop over all configured wikis
    if wikiname is not None:
        wiki = Wiki(config, wikiname)
        wiki.set_date(date)
        wikirunner = WikiRunner(runner, wiki, filenameformat,
                                output_dir, base)
        wikirunner.do_one_wiki(overwrite)
    else:
        wikirunner = WikiRunnerLoop(config, runner, filenameformat,
                                    output_dir, base)
        wikirunner.do_all_wikis_til_done(retries, overwrite, date)
def report_file_size_status(dump_dir, file_obj, item_status):
    '''
    produce an html list item for the given dump output file showing
    its pretty-printed size and, when the file is done, a download
    link; a file that cannot be found is reported as "missing"
    '''
    filename = dump_dir.filename_public_path(file_obj)
    if not exists(filename):
        item_status = "missing"
        size = 0
    else:
        size = os.path.getsize(filename)
    size = FileUtils.pretty_size(size)
    if item_status == "in-progress":
        return "<li class='file'>%s %s (written) </li>" % (file_obj.filename, size)
    if item_status == "done":
        webpath_relative = dump_dir.web_path_relative(file_obj)
        return ("<li class='file'><a href=\"%s\">%s</a> %s</li>"
                % (webpath_relative, file_obj.filename, size))
    return "<li class='missing'>%s</li>" % file_obj.filename
def generate_index(config, other_indexhtml=None, sorted_by_db=False):
    """
    build and return the top-level download index html listing the
    dump status line for every configured wiki

    args:
        config: Config instance
        other_indexhtml: filename of the alternately-sorted index
                         page to link to, if any
        sorted_by_db: sort wikis by name rather than by dump age
    """
    running = False
    states = []
    if sorted_by_db:
        dbs = sorted(config.db_list)
    else:
        dbs = config.db_list_by_age()
    for db_name in dbs:
        try:
            wiki = Wiki(config, db_name)
            locker = Locker(wiki)
            # clean up locks left behind by dead runs before checking
            lockfiles = locker.is_stale(all_locks=True)
            if lockfiles:
                locker.cleanup_stale_locks(lockfiles)
            running = running or locker.is_locked(all_locks=True)
            states.append(StatusHtml.status_line(wiki))
        except Exception:
            # if there's a problem with one wiki at least
            # let's show the rest
            if VERBOSE:
                traceback.print_exc(file=sys.stdout)
    if running:
        status = "Dumps are in progress..."
    elif exists("maintenance.txt"):
        status = FileUtils.read_file("maintenance.txt")
    else:
        status = "Dump process is idle."
    if other_indexhtml is None:
        other_index_link = ""
    else:
        if sorted_by_db:
            other_sortedby = "dump date"
        else:
            other_sortedby = "wiki name"
        other_index_link = ('Also view sorted by <a href="%s">%s</a>'
                            % (os.path.basename(other_indexhtml),
                               other_sortedby))
    return config.read_template("download-index.html") % {
        "otherIndexLink": other_index_link,
        "status": status,
        "items": "\n".join(states)}
def build_command(self, runner, stub_dfname, prefetch, output_dfname):
    """
    Build the command line for the dump, minus output and filter options

    args:
        runner: Runner
        stub_dfname: stub DumpFilename used as input
        prefetch: prefetch option string (may be None)
        output_dfname: DumpFilename for the output, passed to the
                       filter builder
    """
    stub_path = os.path.join(
        FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
        stub_dfname.filename)
    if os.path.exists(stub_path):
        # if this is a pagerange stub file in temp dir, use that
        stub_option = "--stub=gzip:%s" % stub_path
    else:
        # use regular stub file
        if runner.wiki.is_private():
            stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_private_path(stub_dfname)
        else:
            stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_dfname)
    if self.jobinfo['spawn']:
        spawn = "--spawn=%s" % (self.wiki.config.php)
    else:
        spawn = ""
    if not exists(self.wiki.config.php):
        raise BackupError("php command %s not found" % self.wiki.config.php)
    script_command = MultiVersion.mw_script_as_array(runner.wiki.config,
                                                     "dumpTextPass.php")
    dump_command = [self.wiki.config.php]
    dump_command.extend(script_command)
    dump_command.extend(["--wiki=%s" % runner.db_name,
                         "%s" % stub_option,
                         "%s" % prefetch,
                         "--report=1000",
                         "%s" % spawn])
    # drop any args that came through as None (e.g. no prefetch)
    dump_command = [entry for entry in dump_command if entry is not None]
    dump_command.extend([self.build_filters(runner, output_dfname), self.build_eta()])
    pipeline = [dump_command]
    # return a command series of one pipeline
    series = [pipeline]
    return series
def report_file_size_status(dump_dir, dfname, item_status):
    """
    produce html and json info describing the given dump output file:
    its name, size and (when done) its download url

    args:
        DumpDir
        DumpFilename
        status ("in-progress", "missing", ...)

    returns: dict with 'txt' (html list item) and 'json' entries
    """
    filename = dump_dir.filename_public_path(dfname)
    size = None
    if exists(filename):
        size = os.path.getsize(filename)
    elif item_status == "in-progress":
        # note that because multiple files may be produced for a single dump
        # job, some may be complete while others are still in progress.
        # therefore we check the normal name first, falling back to the
        # inprogress name.
        filename = filename + DumpFilename.INPROG
        if exists(filename):
            try:
                size = os.path.getsize(filename)
            except Exception:
                # yes, it might be removed in that short interval of time.
                pass
    if size is None:
        item_status = "missing"
        size = 0
    pretty_size = FileUtils.pretty_size(size)
    if item_status == "in-progress":
        txt = "<li class='file'>%s %s (written) </li>" % (dfname.filename, pretty_size)
        json_out = {'name': dfname.filename, 'size': size}
    elif item_status == "done":
        webpath_relative = dump_dir.web_path_relative(dfname)
        txt = ("<li class='file'><a href=\"%s\">%s</a> %s</li>"
               % (webpath_relative, dfname.filename, pretty_size))
        json_out = {'name': dfname.filename, 'size': size, 'url': webpath_relative}
    else:
        txt = "<li class='missing'>%s</li>" % dfname.filename
        json_out = {'name': dfname.filename}
    content = {'txt': txt, 'json': json_out}
    return content
def status_line(wiki, aborted=False):
    '''
    read the status information from the status html file and attempt
    to return it
    on failure, makes a reasonable guess about the dump status and
    returns that
    if 'aborted' is True, don't read in anything but return a line of
    html that dump was aborted
    '''
    date = wiki.latest_dump()
    if date:
        if aborted:
            return StatusHtml.report_statusline(
                wiki, "<span class=\"failed\">dump aborted</span>")
        status = StatusHtml.get_statusfile_path(wiki, date)
        try:
            return FileUtils.read_file(status)
        except Exception:
            # status file unreadable or gone; report it as missing
            return StatusHtml.report_statusline(wiki, "missing status record")
    else:
        return StatusHtml.report_statusline(wiki, "has not yet been dumped")
def lock(self):
    '''
    create lock file for the given wiki and date, also set
    up a watchdog that will update its timestamp every minute.

    returns True on success; propagates exceptions on failure to
    create the private dir or the lock file
    '''
    if not os.path.isdir(self.wiki.private_dir()):
        try:
            os.makedirs(self.wiki.private_dir())
        except Exception:
            # fix: exception var was bound but never used.
            # Maybe it was just created (race condition)?
            if not os.path.isdir(self.wiki.private_dir()):
                raise
    lockf = FileUtils.atomic_create(self.get_lock_file_path(), "w")
    lockf.write("%s %d" % (socket.getfqdn(), os.getpid()))
    lockf.close()

    self.watchdog = LockWatchdog(self.get_lock_file_path())
    # when the main script dies this thread must die too, horribly if needed.
    self.watchdog.daemon = True
    self.watchdog.start()
    return True