def do_abstractsbackup(wikidb, output_files, variants, wikiconf,
                       start, end, dryrun, verbose):
    '''
    do an abstracts xml dump one piece at a time, writing into
    uncompressed temporary files and shovelling those into gzip's
    stdin for the concatenated compressed output
    '''
    # one output entry per abstract variant, paired positionally
    # with the supplied output file names
    outfiles = {}
    for index, variant in enumerate(variants):
        outfiles[variant] = {'name': output_files[index]}

    for variant in outfiles:
        entry = outfiles[variant]
        entry['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(entry['name']) + "_tmp")
        # 'compr' pairs the compressor callable (None on dry runs)
        # with the final output file name
        if dryrun:
            entry['compr'] = [None, entry['name']]
        else:
            entry['compr'] = [gzippit_append, entry['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command

    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version
    # the filter plugin may sit at the extension top level; if not
    # present there, fall back to the includes/ location
    filter_path = os.path.join(abstract_cmd_dir,
                               "extensions/ActiveAbstract/AbstractFilter.php")
    if not os.path.exists(filter_path):
        filter_path = os.path.join(
            abstract_cmd_dir,
            "extensions/ActiveAbstract/includes/AbstractFilter.php")
    abstract_filter = "--plugin=AbstractFilter:" + filter_path

    command.extend(["--wiki=%s" % wikidb, abstract_cmd_dir,
                    abstract_filter, "--current", "--report=1000",
                    "--namespaces=0"])
    # each --output is followed by the filters intended for it
    for variant in outfiles:
        command.extend(["--output=file:%s" % outfiles[variant]['temp'],
                        "--filter=namespace:NS_MAIN",
                        "--filter=noredirect",
                        "--filter=abstract%s" % variant])

    # header, body and footer are produced as three separate gzipped
    # streams concatenated into the same output
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose, footer=True)
def dump_revs(self):
    '''
    dump revision content corresponding to previously-dumped
    stubs (revision metadata)

    returns True on success (or when the step is disabled / dry run),
    False when the dump command fails
    '''
    if not self.steps['revs']['run']:
        return True
    dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
    outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
    revsfile = RevsFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    outputfile = revsfile.get_filename()
    script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                     "dumpTextPass.php")
    command = [self.wiki.config.php]
    command.extend(script_command)
    # read the previously produced gzipped stub file, write bzip2'ed revs
    stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    stuboutputfile = stubfile.get_filename()
    command.extend(["--wiki=%s" % self.wiki.db_name,
                    "--stub=gzip:%s" % os.path.join(outputdir, stuboutputfile),
                    "--quiet",
                    "--spawn=%s" % self.wiki.config.php,
                    "--output=bzip2:%s" % os.path.join(outputdir, outputfile)])
    if self.dryrun:
        # fixed: was a Python 2 print statement, which is a syntax
        # error under Python 3; use the print() function instead
        print("would run command for revs dump:", command)
    else:
        log.info("running with no output: " + " ".join(command))
        success = RunSimpleCommand.run_with_no_output(
            command, shell=False,
            timeout=self.get_lock_timeout_interval(),
            timeout_callback=self.periodic_callback)
        if not success:
            log.warning("error producing revision text files"
                        " for wiki %s", self.wiki.db_name)
            return False
    return True
def dump_stub(self, start_revid, end_revid):
    '''
    dump stubs (metadata) for revs from start_revid up to but
    not including end_revid

    returns True on success (or when the step is disabled / dry run),
    False when the dump command fails
    '''
    if not self.steps['stubs']['run']:
        return True
    dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
    outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
    stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    outputfile = stubfile.get_filename()
    script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                     "dumpBackup.php")
    command = [self.wiki.config.php]
    command.extend(script_command)
    command.extend(["--wiki=%s" % self.wiki.db_name, "--stub", "--quiet",
                    "--output=gzip:%s" % os.path.join(outputdir, outputfile),
                    "--revrange", "--revstart=%s" % start_revid,
                    "--revend=%s" % end_revid])
    if self.dryrun:
        # fixed: was a Python 2 print statement, which is a syntax
        # error under Python 3; use the print() function instead
        print("would run command for stubs dump:", command)
    else:
        log.info("running with no output: " + " ".join(command))
        success = RunSimpleCommand.run_with_no_output(
            command, shell=False,
            timeout=self.get_lock_timeout_interval(),
            timeout_callback=self.periodic_callback)
        if not success:
            log.warning("error producing stub files for wiki %s",
                        self.wiki.db_name)
            return False
    return True
def run(self, runner):
    '''
    run the Flow content dump step: clean up leftovers from earlier
    runs, build the dumpBackup.php invocation for the Flow extension,
    and execute it, raising BackupError on any failure
    '''
    self.cleanup_old_files(runner.dump_dir, runner)
    dfnames = self.list_outfiles_for_build_command(runner.dump_dir)
    # this step only ever produces a single output file
    if len(dfnames) > 1:
        raise BackupError("flow content step wants to produce more than one output file")
    output_file_obj = dfnames[0]
    if not os.path.exists(runner.wiki.config.php):
        raise BackupError("php command %s not found" % runner.wiki.config.php)

    flow_output_file = runner.dump_dir.filename_public_path(output_file_obj)
    script_command = MultiVersion.mw_script_as_array(
        runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")

    command = [runner.wiki.config.php]
    command.extend(script_command)
    command.extend(["--wiki=%s" % runner.db_name,
                    "--current", "--report=1000",
                    "--output=bzip2:%s" % flow_output_file])
    if self.history:
        # dump full history rather than current content only
        command.append("--full")

    # run_command takes a list of command series; each series is a
    # list of pipelines, each pipeline a list of commands
    pipeline = [command]
    series = [pipeline]
    error = runner.run_command([series],
                               callback_stderr=self.progress_callback,
                               callback_stderr_arg=runner)
    if error:
        raise BackupError("error dumping flow page files")
def dologsbackup(wikidb, outfile, wikiconf, start, end, dryrun):
    '''
    do a logs xml dump one piece at a time, writing into
    uncompressed temporary files and shovelling those into gzip's
    stdin for the concatenated compressed output
    '''
    outfiles = {'logs': {'name': outfile}}

    for filetype in outfiles:
        entry = outfiles[filetype]
        entry['temp'] = os.path.join(
            wikiconf.temp_dir,
            os.path.basename(entry['name']) + "_tmp")
        # no compressor process on dry runs
        entry['compr'] = None if dryrun else gzippit(entry['name'])

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    command.extend(["--wiki=%s" % wikidb,
                    "--logs", "--report=1000",
                    "--output=file:%s" % outfiles['logs']['temp']])

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'log_id', 'logging',
                  50000, 100000, '</logitem>\n')
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun, verbose):
    '''
    do a stubs xml dump one piece at a time, writing into
    uncompressed temporary files and shovelling those into gzip's
    stdin for the concatenated compressed output
    '''
    # only set up output info for the stub flavors actually requested
    outfiles = {}
    if history_file is not None:
        outfiles['history'] = {'name': history_file}
    if current_file is not None:
        outfiles['current'] = {'name': current_file}
    if articles_file is not None:
        outfiles['articles'] = {'name': articles_file}

    for filetype in outfiles:
        entry = outfiles[filetype]
        entry['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(entry['name']) + "_tmp")
        # 'compr' pairs the compressor callable (None on dry runs)
        # with the final output file name
        if dryrun:
            entry['compr'] = [None, entry['name']]
        else:
            entry['compr'] = [gzippit_append, entry['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    command.extend(["--wiki=%s" % wikidb, "--full", "--stub", "--report=1000"])
    # each --output is followed by the filters intended for it
    if history_file is not None:
        command.append("--output=file:%s" % outfiles['history']['temp'])
    if current_file is not None:
        command.extend(["--output=file:%s" % outfiles['current']['temp'],
                        "--filter=latest"])
    if articles_file is not None:
        command.extend(["--output=file:%s" % outfiles['articles']['temp'],
                        "--filter=latest", "--filter=notalk",
                        "--filter=namespace:!NS_USER"])

    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval
    else:
        callback = None

    # the xml header, the body, and the xml footer should be separate gzipped
    # streams all concatted together
    # note that do_xml_stream exits on failure after cleaning up all output files
    # so the parent process must simply retry later
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose,
                  callback=callback, footer=True)
def do_abstractsbackup(wikidb, output_files, variants, wikiconf, start, end, dryrun):
    '''
    do an abstracts xml dump one piece at a time, writing into
    uncompressed temporary files and shovelling those into gzip's
    stdin for the concatenated compressed output
    '''
    # one output entry per abstract variant, paired positionally
    # with the supplied output file names
    outfiles = {}
    index = 0
    for variant in variants:
        outfiles[variant] = {'name': output_files[index]}
        index += 1
    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            wikiconf.temp_dir,
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = None
        else:
            outfiles[filetype]['compr'] = catit(outfiles[filetype]['name'])
    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version
    # fixed: the filter plugin path was hardcoded to the extension top
    # level; consistent with the sibling implementation, fall back to
    # the includes/ location when the top-level file is absent
    filter_path = os.path.join(abstract_cmd_dir,
                               "extensions/ActiveAbstract/AbstractFilter.php")
    if not os.path.exists(filter_path):
        filter_path = os.path.join(
            abstract_cmd_dir,
            "extensions/ActiveAbstract/includes/AbstractFilter.php")
    abstract_filter = "--plugin=AbstractFilter:" + filter_path
    command.extend(["--wiki=%s" % wikidb, abstract_cmd_dir,
                    abstract_filter, "--current", "--report=1000"])
    # each --output is followed by the filters intended for it
    for filetype in outfiles:
        command.extend(["--output=file:%s" % outfiles[filetype]['temp'],
                        "--filter=namespace:NS_MAIN",
                        "--filter=noredirect",
                        "--filter=abstract%s" % filetype])
    do_xml_stream(wikidb, outfiles, command, wikiconf, start, end,
                  dryrun, 'page_id', 'page', 20000, 30000, '</doc>\n')
def get_domain_from_wikidbname(self):
    '''
    given the name of the wiki db, turn this into the fqdn of the
    wiki project (i.e. enwiki -> en.wikipedia.org)

    returns None when the command produces no output
    '''
    script_command = MultiVersion.mw_script_as_array(self.wiki.config, "eval.php")
    # echo $wgCanonicalServer | php "$multiversionscript" eval.php $wiki
    pieces = ["echo", "'echo $wgCanonicalServer;'", "|", self.wiki.config.php]
    pieces.extend(script_command)
    pieces.append(self.wiki.db_name)
    command_text = " ".join(pieces)

    self.log.info("running with no output: %s", command_text)
    output = RunSimpleCommand.run_with_output(command_text, shell=True)
    if not output:
        self.log.warning("error retrieving domain for wiki %s", self.wiki.db_name)
        return None
    # drop the scheme before '//'; rstrip gets rid of any trailing
    # newlines from eval.php
    return output.decode('utf-8').split('//')[1].rstrip()
def get_db_user_and_password(self):
    '''
    return the (db_user, db_password) pair for this wiki

    get these by running a MediaWiki maintenance script; yes, this
    means you need a full installation of MediaWiki (but not web
    service) in order to use these methods
    '''
    command_list = MultiVersion.mw_script_as_array(self.config,
                                                   "getConfiguration.php")
    pull_vars = ["wgDBuser", "wgDBpassword"]
    # shell-escape values interpolated into the shell command line
    command = "{php} {command} --wiki={dbname} --format=json --regex='{vars}'".format(
        php=MiscUtils.shell_escape(self.config.php),
        command=" ".join(command_list),
        dbname=MiscUtils.shell_escape(self.db_name),
        vars="|".join(pull_vars))
    results = RunSimpleCommand.run_with_output(command, shell=True).strip()
    settings = json.loads(results.decode('utf-8'))
    return settings['wgDBuser'], settings['wgDBpassword']
def build_command(self, runner, stub_dfname, prefetch, output_dfname):
    """
    Build the command line for the dump, minus output and filter options

    args:
        Runner, stub DumpFilename, ....
    """
    stub_path = os.path.join(
        FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
        stub_dfname.filename)
    if os.path.exists(stub_path):
        # a pagerange stub file was written to the temp dir; prefer it
        stub_option = "--stub=gzip:%s" % stub_path
    elif runner.wiki.is_private():
        stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_private_path(stub_dfname)
    else:
        stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_dfname)

    spawn = "--spawn=%s" % (self.wiki.config.php) if self.jobinfo['spawn'] else ""

    if not exists(self.wiki.config.php):
        raise BackupError("php command %s not found" % self.wiki.config.php)

    script_command = MultiVersion.mw_script_as_array(runner.wiki.config,
                                                     "dumpTextPass.php")
    dump_command = [self.wiki.config.php]
    dump_command.extend(script_command)
    dump_command.extend(["--wiki=%s" % runner.db_name,
                         "%s" % stub_option,
                         "%s" % prefetch,
                         "--report=1000",
                         "%s" % spawn])
    # defensively drop any None entries before adding filters/eta
    dump_command = [entry for entry in dump_command if entry is not None]
    dump_command.extend([self.build_filters(runner, output_dfname),
                         self.build_eta()])

    # a single pipeline of one command, returned as a command series
    pipeline = [dump_command]
    series = [pipeline]
    return series
def get_command(self, wiki, output_dir, outfile_base, base):
    '''
    given the output directory and filename and the wiki object,
    put together and return an array consisting of the script name,
    args, and any multiversion invocations that need to precede it
    '''
    # default to the wiki itself when no base wiki is supplied
    if base is None:
        base = wiki

    if self.scriptname.endswith('.php'):
        # a MediaWiki maintenance script: php + multiversion wrapper + --wiki
        cmd = MultiVersion.mw_script_as_array(base.config, self.scriptname)
        cmd = [base.config.php] + cmd
        cmd.extend(["--wiki", base.db_name])
    else:
        cmd = [self.scriptname]
    if self.args is not None:
        cmd.extend(self.args)

    # substitute the {DIR}, {FILE} and {w} placeholders in every field
    return [field.format(DIR=output_dir, FILE=outfile_base, w=wiki.db_name)
            for field in cmd]
def build_command(self, runner, output_dfname):
    '''
    build and return the command series for the Flow content dump,
    writing bzip2'ed output to the in-progress name of the output file
    '''
    if not os.path.exists(runner.wiki.config.php):
        raise BackupError("php command %s not found" % runner.wiki.config.php)

    if runner.wiki.is_private():
        flow_output_fpath = runner.dump_dir.filename_private_path(output_dfname)
    else:
        flow_output_fpath = runner.dump_dir.filename_public_path(output_dfname)

    script_command = MultiVersion.mw_script_as_array(
        runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")
    command = [runner.wiki.config.php]
    command.extend(script_command)
    command.extend(["--wiki=%s" % runner.db_name,
                    "--current", "--report=1000",
                    "--output=bzip2:%s" % DumpFilename.get_inprogress_name(flow_output_fpath)])
    if self.history:
        # dump full history rather than current content only
        command.append("--full")

    # a single pipeline of one command, returned as a command series
    pipeline = [command]
    series = [pipeline]
    return series
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun):
    '''
    do a stubs xml dump one piece at a time, writing into
    uncompressed temporary files and shovelling those into gzip's
    stdin for the concatenated compressed output
    '''
    outfiles = {'history': {'name': history_file},
                'current': {'name': current_file},
                'articles': {'name': articles_file}}

    for filetype in outfiles:
        entry = outfiles[filetype]
        entry['temp'] = os.path.join(
            wikiconf.temp_dir,
            os.path.basename(entry['name']) + "_tmp")
        # no compressor process on dry runs
        entry['compr'] = None if dryrun else gzippit(entry['name'])

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command
    # each --output is followed by the filters intended for it
    command.extend(["--wiki=%s" % wikidb, "--full", "--stub", "--report=1000",
                    "--output=file:%s" % outfiles['history']['temp'],
                    "--output=file:%s" % outfiles['current']['temp'],
                    "--filter=latest",
                    "--output=file:%s" % outfiles['articles']['temp'],
                    "--filter=latest", "--filter=notalk",
                    "--filter=namespace:!NS_USER"])

    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval
    else:
        callback = None

    do_xml_stream(wikidb, outfiles, command, wikiconf, start, end,
                  dryrun, 'page_id', 'page', 5000, 100000,
                  '</page>\n', callback)
def build_command(self, runner, stub_file):
    """Build the command line for the dump, minus output and filter options"""
    # we write a temp file, it will be checkpointed every so often
    temp = bool(self._checkpoints_enabled)
    output_file = DumpFilename(
        self.wiki, stub_file.date, self.dumpname,
        self.get_filetype(), self.file_ext, stub_file.partnum,
        DumpFilename.make_checkpoint_string(stub_file.first_page_id,
                                            stub_file.last_page_id),
        temp)

    stub_path = os.path.join(self.wiki.config.temp_dir, stub_file.filename)
    if os.path.exists(stub_path):
        # a partial stub file was written to the temp dir; prefer it
        stub_option = "--stub=gzip:%s" % stub_path
    else:
        stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_file)

    # Try to pull text from the previous run; most stuff hasn't changed
    # Source=$OutputDir/pages_$section.xml.bz2
    sources = []
    possible_sources = None
    if self._prefetch:
        possible_sources = self._find_previous_dump(runner, output_file.partnum)
    if possible_sources:
        for sourcefile in possible_sources:
            # if we are doing a partial stub run, include only the analogous
            # checkpointed prefetch files, if there are checkpointed files;
            # otherwise we'll use all the sourcefiles reported
            if not self.chkptfile_in_pagerange(stub_file, sourcefile):
                continue
            sname = runner.dump_dir.filename_public_path(sourcefile, sourcefile.date)
            if exists(sname):
                sources.append(sname)

    partnum_str = "%s" % stub_file.partnum if output_file.partnum else ""

    if len(sources) > 0:
        source = "bzip2:%s" % (";".join(sources))
        runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..."
                                 % (self._subset, partnum_str, source))
        prefetch = "--prefetch=%s" % (source)
    else:
        runner.show_runner_state("... building %s %s XML dump, no text prefetch..."
                                 % (self._subset, partnum_str))
        prefetch = ""

    spawn = "--spawn=%s" % (self.wiki.config.php) if self._spawn else ""

    if not exists(self.wiki.config.php):
        raise BackupError("php command %s not found" % self.wiki.config.php)

    if self._checkpoints_enabled:
        checkpoint_time = "--maxtime=%s" % (self.wiki.config.checkpoint_time)
        checkpoint_file = "--checkpointfile=%s" % output_file.new_filename(
            output_file.dumpname, output_file.file_type, output_file.file_ext,
            output_file.date, output_file.partnum, "p%sp%s", None)
    else:
        checkpoint_time = ""
        checkpoint_file = ""

    script_command = MultiVersion.mw_script_as_array(runner.wiki.config,
                                                     "dumpTextPass.php")
    dump_command = [self.wiki.config.php]
    dump_command.extend(script_command)
    dump_command.extend(["--wiki=%s" % runner.db_name,
                         "%s" % stub_option,
                         "%s" % prefetch,
                         "%s" % checkpoint_time,
                         "%s" % checkpoint_file,
                         "--report=1000",
                         "%s" % spawn])
    # defensively drop any None entries before adding filters/eta
    dump_command = [entry for entry in dump_command if entry is not None]
    command = dump_command
    command.extend([self.build_filters(runner, output_file),
                    self.build_eta(runner)])

    # a single pipeline of one command, returned as a command series
    pipeline = [command]
    series = [pipeline]
    return series