def move_if_truncated(self, runner, dfname, emptycheck=0, tmpdir=False):
    """
    check if the given file (DumpFile) is truncated or empty
    if so, move it out of the way and return True
    return False otherwise

    if emptycheck is set to a number, the file will only be checked
    to see if it is empty if the file covers a page range with more
    pages than the specified number. E.g. a file named
    elwikivoyage-20180618-pages-meta-history2.xml-p140p150.bz2
    would be checked for emptycheck = 8 but not for 12; files that
    don't have page start and end numbers in the filename would not
    be checked at all.
    if emptycheck is left as 0, the file will always be checked to
    see if it is empty.

    if the file is located in the temp dir, set tmpdir=True for it to
    be found there; otherwise the public xml/sql dump output dir (or
    private, if the wiki is private) will be checked for the file.
    """
    if "check_trunc_files" not in runner.enabled or not self.check_truncation():
        return False

    if tmpdir:
        path = os.path.join(
            FileUtils.wiki_tempdir(runner.wiki.db_name, runner.wiki.config.temp_dir),
            dfname.filename)
    elif runner.wiki.is_private():
        path = runner.dump_dir.filename_private_path(dfname)
    else:
        path = runner.dump_dir.filename_public_path(dfname)

    dcontents = DumpContents(runner.wiki, path)

    file_truncated = True
    if os.path.exists(dcontents.filename):
        # for some file types we will check that the file has the right closing tag
        last_tag = None
        if ('.xml' in dcontents.filename and
                ('.bz2' in dcontents.filename or '.gz' in dcontents.filename)):
            last_tag = b'</mediawiki>'

        # fixme hardcoded at 200? mmmm. but otoh configurable is kinda dumb
        if (not emptycheck or self.is_larger(dfname, 200)) and dcontents.check_if_empty():
            # file exists and is empty, move it out of the way
            dcontents.rename(dcontents.filename + ".empty")
        elif dcontents.check_if_truncated(last_tag):
            # the file exists and is truncated, move it out of the way
            dcontents.rename(dcontents.filename + ".truncated")
        elif dcontents.check_if_binary_crap():
            # the file exists and has binary junk in it, move it out of the way
            dcontents.rename(dcontents.filename + ".truncated")
        else:
            # the file exists and is not truncated and doesn't have random crap.
            # Heck, it's a good file!
            file_truncated = False
    else:
        # file doesn't exist, move on
        file_truncated = False
    return file_truncated
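
# Usage sketch (illustrative only; the surrounding names are hypothetical):
# a dump job that has just produced an output file might validate it like
# this, treating a True return as "the file was moved aside and must be
# regenerated":
#
#     if self.move_if_truncated(runner, dfname, emptycheck=200):
#         # the bad file now sits on disk as <name>.empty or <name>.truncated
#         raise BackupError("output file %s was empty or truncated" % dfname.filename)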
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun, verbose):
    '''
    do a stubs xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {}
    if history_file is not None:
        outfiles['history'] = {'name': history_file}
    if current_file is not None:
        outfiles['current'] = {'name': current_file}
    if articles_file is not None:
        outfiles['articles'] = {'name': articles_file}
    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    command.extend(["--wiki=%s" % wikidb, "--full", "--stub", "--report=1000"])
    if history_file is not None:
        command.append("--output=file:%s" % outfiles['history']['temp'])
    if current_file is not None:
        command.extend(["--output=file:%s" % outfiles['current']['temp'],
                        "--filter=latest"])
    if articles_file is not None:
        command.extend(["--output=file:%s" % outfiles['articles']['temp'],
                        "--filter=latest", "--filter=notalk",
                        "--filter=namespace:!NS_USER"])

    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval
    else:
        callback = None

    # the xml header, the body, and the xml footer should be separate gzipped
    # streams all concatted together
    # note that do_xml_stream exits on failure after cleaning up all output files
    # so the parent process must simply retry later
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback, footer=True)
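
# Why three separate do_xml_stream() calls work: the gzip format allows
# multiple compressed members in one file, and decompressors read them back
# as a single stream. A minimal standalone demonstration of that property
# (not part of this module), using only the standard library:
#
#     import gzip
#     with open("demo.gz", "wb") as out:
#         for piece in (b"<header/>", b"<body/>", b"<footer/>"):
#             out.write(gzip.compress(piece))  # one gzip member per piece
#     with gzip.open("demo.gz", "rb") as fin:
#         assert fin.read() == b"<header/><body/><footer/>"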
def do_abstractsbackup(wikidb, output_files, variants,
                       wikiconf, start, end, dryrun, verbose):
    '''
    do an abstracts xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {}
    for index, variant in enumerate(variants):
        outfiles[variant] = {'name': output_files[index]}

    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version
    filter_path = os.path.join(abstract_cmd_dir,
                               "extensions/ActiveAbstract/AbstractFilter.php")
    if not os.path.exists(filter_path):
        filter_path = os.path.join(abstract_cmd_dir,
                                   "extensions/ActiveAbstract/includes/AbstractFilter.php")
    abstract_filter = ("--plugin=AbstractFilter:" + filter_path)

    command.extend(["--wiki=%s" % wikidb, abstract_cmd_dir,
                    abstract_filter,
                    "--current", "--report=1000", "--namespaces=0"])

    for filetype in outfiles:
        command.extend(["--output=file:%s" % outfiles[filetype]['temp'],
                        "--filter=namespace:NS_MAIN",
                        "--filter=noredirect",
                        "--filter=abstract%s" % filetype])

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose,
                  header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose,
                  footer=True)
def _get_checksum_filename_tmp(self, htype, fmt):
    """
    args:
        hashtype ('md5', 'sha1', ...)
        format of output ('json', 'txt', ...)
    returns:
        full path of a unique-enough temporary output file for wiki and date
    """
    dfname = DumpFilename(self.wiki, None,
                          Checksummer.get_checksum_filename_basename(htype, fmt) +
                          "." + self.timestamp + ".tmp")
    return os.path.join(FileUtils.wiki_tempdir(
        self.wiki.db_name, self.wiki.config.temp_dir), dfname.filename)
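
# For illustration only (the exact layout depends on wiki_tempdir() and on
# Checksummer.get_checksum_filename_basename()): for htype "md5", fmt "txt"
# and a timestamp of "20180618120000", the returned path might look like
#
#     <tempdir-for-wiki>/elwiki-20180618-md5sums.txt.20180618120000.tmp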
def get_stub_gen_cmd_for_input(self, input_dfname, output_dfnames, runner):
    """
    for the given input dumpfile (stub), write the requested output files (stubs)
    """
    if not exists(self.wiki.config.writeuptopageid):
        raise BackupError("writeuptopageid command %s not found" %
                          self.wiki.config.writeuptopageid)

    if runner.wiki.is_private():
        inputfile_path = runner.dump_dir.filename_private_path(input_dfname)
    else:
        inputfile_path = runner.dump_dir.filename_public_path(input_dfname)

    output_dir = FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir)
    argstrings = []

    for output_dfname in output_dfnames:
        output_fname = output_dfname.filename
        # don't generate the file if we already have it (i.e. this is a retry)
        if not os.path.exists(os.path.join(output_dir, output_fname)):
            first_page_id = output_dfname.first_page_id
            if (output_dfname.last_page_id is not None and
                    output_dfname.last_page_id != "00000"):
                last_page_id = str(int(output_dfname.last_page_id) + 1)
            else:
                last_page_id = ""
            argstrings.append("{outfile}:{firstpage}:{lastpage}".format(
                outfile=output_fname, firstpage=first_page_id, lastpage=last_page_id))

    # don't generate an output file if there are no filespecs
    if not argstrings:
        return None

    if input_dfname.file_ext == "gz":
        command1 = [self.wiki.config.gzip, "-dc", inputfile_path]
    elif input_dfname.file_ext == '7z':
        command1 = [self.wiki.config.sevenzip, "e", "-si", inputfile_path]
    elif input_dfname.file_ext == 'bz2':
        command1 = [self.wiki.config.bzip2, "-dc", inputfile_path]
    else:
        raise BackupError("unknown stub file extension %s" % input_dfname.file_ext)
    command2 = [self.wiki.config.writeuptopageid, "--odir", output_dir,
                "--fspecs", ";".join(argstrings)]

    pipeline = [command1, command2]
    return pipeline
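
# The returned pipeline is roughly equivalent to this shell command (paths
# and page ranges hypothetical):
#
#     gzip -dc .../elwiki-20180618-stub-meta-history2.xml.gz | \
#         writeuptopageid --odir <tempdir> \
#             --fspecs "...p140p150.gz:140:151;...p151p160.gz:151:161"
#
# the "+ 1" applied to last_page_id above suggests that the end page of each
# fspec range is exclusive.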
def has_no_pages(self, xmlfile, runner, tempdir=False):
    '''
    see if it has a page id in it or not. no? then return True
    '''
    if xmlfile.is_temp_file or tempdir:
        path = os.path.join(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            xmlfile.filename)
    else:
        if runner.wiki.is_private():
            path = runner.dump_dir.filename_private_path(xmlfile, self.wiki.date)
        else:
            path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
    dcontents = DumpContents(self.wiki, path, xmlfile, self.verbose)
    return dcontents.find_first_page_id_in_file() is None
def cp_chksum_tmpfiles_to_permfile(self):
    """
    during a dump run, checksum files are written to a temporary
    location and updated there; we copy the content from these
    files into the permanent location after each dump job completes
    """
    if Checksummer.NAME in self._enabled:
        for htype in Checksummer.HASHTYPES:
            for fmt in Checksummer.FORMATS:
                tmp_filename = self._get_checksum_filename_tmp(htype, fmt)
                real_filename = self._get_checksum_path(htype, fmt)
                content = FileUtils.read_file(tmp_filename)
                FileUtils.write_file(
                    FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                    real_filename, content, self.wiki.config.fileperms)
def build_command(self, runner, stub_dfname, prefetch, output_dfname):
    """
    Build the command line for the dump, minus output and filter options

    args: Runner, stub DumpFilename, ....
    """
    stub_path = os.path.join(
        FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
        stub_dfname.filename)
    if os.path.exists(stub_path):
        # if this is a pagerange stub file in the temp dir, use that
        stub_option = "--stub=gzip:%s" % stub_path
    else:
        # use the regular stub file
        if runner.wiki.is_private():
            stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_private_path(stub_dfname)
        else:
            stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_dfname)

    if self.jobinfo['spawn']:
        spawn = "--spawn=%s" % (self.wiki.config.php)
    else:
        spawn = ""

    if not exists(self.wiki.config.php):
        raise BackupError("php command %s not found" % self.wiki.config.php)

    script_command = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
    dump_command = [self.wiki.config.php]
    dump_command.extend(script_command)
    dump_command.extend(["--wiki=%s" % runner.db_name,
                         "%s" % stub_option,
                         "%s" % prefetch,
                         "--report=1000",
                         "%s" % spawn])
    dump_command = [entry for entry in dump_command if entry is not None]
    dump_command.extend([self.build_filters(runner, output_dfname), self.build_eta()])
    pipeline = [dump_command]
    # return a command series of one pipeline
    series = [pipeline]
    return series
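
# Sketch of the assembled command series (all values hypothetical): the
# single pipeline returned here would look something like
#
#     [[['/usr/bin/php', '.../maintenance/dumpTextPass.php', '--wiki=elwiki',
#        '--stub=gzip:/tempdir/elwiki-20180618-stub-meta-history2.xml-p140p150.gz',
#        '<prefetch arg>', '--report=1000', '--spawn=/usr/bin/php',
#        '<output of build_filters()>', '<output of build_eta()>']]]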
def save_feed(self, dfname):
    """
    produce an rss feed file for the specified dump output file (dfname)

    If there is already such a feed, update it only if dfname is not
    older than the dump output file currently recorded in the feed,
    as indicated in the dump dirs/filenames themselves, NOT via stat

    args: DumpFilename
    """
    if Feeds.NAME in self._enabled:
        rss_path = os.path.join(self.dump_dir.latest_dir(),
                                self.db_name + "-latest-" + dfname.basename +
                                "-rss.xml")
        self.make_dir(self.dump_dir.latest_dir())
        filename_and_path = self.dump_dir.web_path(dfname)
        web_path = os.path.dirname(filename_and_path)
        if self.feed_newer_than_file(rss_path, dfname):
            return
        rss_text = self.wiki.config.read_template("feed.xml") % {
            "chantitle": dfname.basename,
            "chanlink": web_path,
            "chandesc": "Wikimedia dump updates for %s" % self.db_name,
            "title": web_path,
            "link": web_path,
            "description": xml_escape("<a href=\"%s\">%s</a>" % (
                filename_and_path, dfname.filename)),
            "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
        }
        self.debugfn("adding rss feed file %s " % rss_path)
        FileUtils.write_file(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            rss_path, rss_text, self.wiki.config.fileperms)
def write_notice(self):
    '''
    write the notice file if self.notice has contents,
    remove it if self.notice is False,
    or read the existing file and stash its contents if self.notice
    is the empty string
    '''
    if Notice.NAME in self._enabled:
        notice_filepath = self._get_notice_filename()
        # delnotice. toss any existing file
        if self.notice is False:
            if os.path.exists(notice_filepath):
                os.remove(notice_filepath)
            self.notice = ""
        # addnotice, stuff notice in a file for other jobs etc
        elif self.notice != "":
            FileUtils.write_file(
                FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                notice_filepath, self.notice, self.wiki.config.fileperms)
        # default case. if there is a file get the contents, otherwise
        # we have empty contents, all good
        else:
            if os.path.exists(notice_filepath):
                self.notice = FileUtils.read_file(notice_filepath)
def write_pagerange_stubs(self, iofile_pairs, runner, batchsize, move_if_truncated):
    """
    put the io file pairs in ascending order (per part if there are parts),
    for each pair write out a stub file corresponding to the page range
    in the output filename, combining up those outputs that require the
    same input file into one command

    args: pairs of (DumpFilename, DumpFilename), Runner
    """
    if not iofile_pairs:
        return

    # split up into batches where the input file is the same
    # and the pairs are ordered by output file name
    in_dfnames = list({pair[0] for pair in iofile_pairs})
    out_dfnames = {}
    output_dfnames_to_check = []
    for in_dfname in in_dfnames:
        out_dfnames[in_dfname.filename] = sorted(
            [pair[1] for pair in iofile_pairs
             if pair[0].filename == in_dfname.filename],
            key=functools.cmp_to_key(DumpFilename.compare))

    commands = []
    for in_dfname in in_dfnames:
        pipeline = self.get_stub_gen_cmd_for_input(
            in_dfname, out_dfnames[in_dfname.filename], runner)
        if pipeline is not None:
            # list of command series. each series is a list of pipelines.
            commands.append([pipeline])
            output_dfnames_to_check.extend(out_dfnames[in_dfname.filename])

    errors = False
    while commands:
        command_batch = commands[:batchsize]
        error, broken = runner.run_command(command_batch)
        if error:
            for series in broken:
                for pipeline in series:
                    failed_cmds_retcodes = pipeline.get_failed_cmds_with_retcode()
                    for cmd_retcode in failed_cmds_retcodes:
                        if cmd_retcode[1] not in (-signal.SIGPIPE, signal.SIGPIPE + 128):
                            runner.log_and_print("error from commands: %s" % " ".join(
                                [entry for entry in pipeline]))
                            errors = True
        commands = commands[batchsize:]
    if errors:
        raise BackupError("failed to write pagerange stub files")

    if runner.dryrun:
        return

    # check the output files to see if we like them;
    # if not, we will move the bad ones out of the way and
    # whine about them
    bad_dfnames = []
    output_dir = FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir)
    for temp_stub_dfname in output_dfnames_to_check:
        if os.path.exists(os.path.join(output_dir, temp_stub_dfname.filename)):
            bad = move_if_truncated(runner, temp_stub_dfname, emptycheck=200, tmpdir=True)
            if bad:
                bad_dfnames.append(temp_stub_dfname)
    if bad_dfnames:
        error_string = " ".join([bad_dfname.filename for bad_dfname in bad_dfnames])
        raise BackupError(
            "failed to write pagerange stub files (bad contents) " + error_string)
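
# Grouping sketch (filenames hypothetical): given iofile_pairs such as
#
#     (stub-meta-history2.xml.gz, stub-meta-history2.xml-p140p150.gz)
#     (stub-meta-history2.xml.gz, stub-meta-history2.xml-p151p160.gz)
#     (stub-meta-history1.xml.gz, stub-meta-history1.xml-p1p10.gz)
#
# both history2 ranges end up in one writeuptopageid invocation, so the
# shared input file is decompressed and scanned only once.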
def run(self):
    """
    mark which dump jobs should run
    clean up old dump run files
    set up directories for the run
    run each dump job
    """
    if self.job_requested:
        if not self.dump_item_list.old_runinfo_retrieved and self.wiki.exists_perdump_index():
            # There was a previous run of all or part of this date, but...
            # There was no old RunInfo to be had (or an error was encountered getting it)
            # so we can't rerun a step and keep all the status information
            # about the old run around.
            # In this case ask the user if they really want to go ahead
            print("No information about the previous run for this date could be retrieved.")
            print("This means that the status information about the old run will be lost, and")
            print("only the information about the current (and future) runs will be kept.")
            reply = input("Continue anyways? [y/N]: ")
            if reply not in ["y", "Y"]:
                raise RuntimeError("No run information available for previous dump, exiting")

        if not self.dump_item_list.mark_dumps_to_run(self.job_requested, self.skipdone):
            # probably no such job
            sys.stderr.write("No job marked to run, exiting")
            return None
        if self.restart:
            # mark all the following jobs to run as well
            self.dump_item_list.mark_following_jobs_to_run(self.skipdone)
    else:
        self.dump_item_list.mark_all_jobs_to_run(self.skipdone)

    Maintenance.exit_if_in_maintenance_mode(
        "In maintenance mode, exiting dump of %s" % self.db_name)

    self.make_dir(os.path.join(self.wiki.public_dir(), self.wiki.date))
    self.make_dir(os.path.join(self.wiki.private_dir(), self.wiki.date))
    FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir, create=True)

    self.show_runner_state("Cleaning up old dumps for %s" % self.db_name)
    self.clean_old_dumps()
    self.clean_old_dumps(private=True)

    # inform about what kind of backup work we are about to do
    if self.job_requested:
        if self.restart:
            self.log_and_print("Preparing for restart from job %s of %s" %
                               (self.job_requested, self.db_name))
        else:
            self.log_and_print("Preparing for job %s of %s" %
                               (self.job_requested, self.db_name))
    else:
        self.show_runner_state("Starting backup of %s" % self.db_name)

    self.dumpjobdata.do_before_dump()

    for item in self.dump_item_list.dump_items:
        prereq_job = self.do_run_item(item)
        if self.do_prereqs and prereq_job is not None:
            doing = []
            doing.append(item)
            # we have the lock so we might as well run the prereq job now.
            # there may be a string of prereqs not met,
            # i.e. articlesrecombine -> articles -> stubs
            # so we're willing to walk back up the list up to five items;
            # assume there's something really broken if it takes more than that
            while prereq_job is not None and len(doing) < 5:
                new_item = self.dump_item_list.find_item_by_name(prereq_job)
                new_item.set_to_run(True)
                prereq_job = self.do_run_item(new_item)
                if prereq_job is not None:
                    # this job has a dependency too, add to the todo stack
                    doing.insert(0, new_item)
            # back up the stack and do the dependents if the stack isn't too long
            if len(doing) < 5:
                for subitem in doing:
                    self.do_run_item(subitem)

    # special case
    if self.job_requested == "createdirs":
        if not os.path.exists(os.path.join(self.wiki.public_dir(), self.wiki.date)):
            os.makedirs(os.path.join(self.wiki.public_dir(), self.wiki.date))
        if not os.path.exists(os.path.join(self.wiki.private_dir(), self.wiki.date)):
            os.makedirs(os.path.join(self.wiki.private_dir(), self.wiki.date))

    # we must do this here before the checksums are used for status reports below
    self.dumpjobdata.checksummer.move_chksumfiles_into_place()

    if self.dump_item_list.all_possible_jobs_done():
        # all jobs are either in status "done", "waiting", "failed", "skipped"
        self.report.update_index_html_and_json("done")
        self.statushtml.update_status_file("done")
    else:
        # This may happen if we start a dump now and abort before all items are
        # done. Then some are left for example in state "waiting". When
        # afterwards running a specific job, all (but one) of the jobs
        # previously in "waiting" are still in status "waiting"
        self.report.update_index_html_and_json("partialdone")
        self.statushtml.update_status_file("partialdone")
    self.runstatus_updater.write_statusapi_file()
    self.specialfiles_updater.write_specialfilesinfo_file()

    self.dumpjobdata.do_after_dump(self.dump_item_list.dump_items)

    # special case
    if (self.job_requested == "latestlinks" and
            self.dump_item_list.all_possible_jobs_done()):
        self.dumpjobdata.do_latest_job()

    # inform about completion
    if self.job_requested:
        if self.restart:
            self.show_runner_state("Completed run restarting from job %s for %s" %
                                   (self.job_requested, self.db_name))
        else:
            self.show_runner_state("Completed job %s for %s" %
                                   (self.job_requested, self.db_name))
    else:
        self.show_runner_state_complete()

    # let the caller know whether this was a successful run
    if any(item.status() == "failed" for item in self.dump_item_list.dump_items):
        return False
    return True