def build_command(self, runner, output_dfname):
    """
    Build the command series that recompresses a bz2 page content dump
    into a multistream bz2 file plus its stream index file.

    arguments:
       runner: Runner object
       output_dfname: output file (DumpFilename) that will be produced
    raises BackupError if the required binaries are missing
    """
    # fail early if the binaries used in the pipeline are missing, the
    # same way the 7zip recompression job does
    if not exists(self.wiki.config.bzip2):
        raise BackupError("bzip2 command %s not found" % self.wiki.config.bzip2)
    if not exists(self.wiki.config.recompressxml):
        raise BackupError("recompressxml command %s not found" %
                          self.wiki.config.recompressxml)

    # input file has the same name attributes as the output except for
    # the extension of the job being recompressed
    input_dfname = DumpFilename(self.wiki, None, output_dfname.dumpname,
                                output_dfname.file_type,
                                self.item_for_recompression.file_ext,
                                output_dfname.partnum, output_dfname.checkpoint)
    if runner.wiki.is_private():
        outfilepath = runner.dump_dir.filename_private_path(
            self.get_multistream_dfname(output_dfname))
        outfilepath_index = runner.dump_dir.filename_private_path(
            self.get_multistream_index_dfname(output_dfname))
        infilepath = runner.dump_dir.filename_private_path(input_dfname)
    else:
        outfilepath = runner.dump_dir.filename_public_path(
            self.get_multistream_dfname(output_dfname))
        outfilepath_index = runner.dump_dir.filename_public_path(
            self.get_multistream_index_dfname(output_dfname))
        infilepath = runner.dump_dir.filename_public_path(input_dfname)
    # both outputs are written under "in progress" names and moved into
    # place on successful completion
    command_pipe = [["%s -dc %s | %s --pagesperstream 100 --buildindex %s -o %s" %
                     (self.wiki.config.bzip2, infilepath,
                      self.wiki.config.recompressxml,
                      DumpFilename.get_inprogress_name(outfilepath_index),
                      DumpFilename.get_inprogress_name(outfilepath))]]
    return [command_pipe]
def run(self, runner):
    """
    Run the siteinfo properties dump, retrying on failure up to the
    configured maximum number of retries with a short pause in between.

    arguments:
       runner: Runner object
    raises BackupError if more than one output file would be produced,
    or if the command still fails after all retries
    """
    retries = 0
    maxretries = runner.wiki.config.max_retries
    dfnames = self.list_outfiles_for_build_command(runner.dump_dir)
    if len(dfnames) > 1:
        raise BackupError("siteinfo dump %s trying to produce more than one file" %
                          self.dumpname)
    output_dfname = dfnames[0]
    commands = self.build_command(runner)
    # output is written to an "in progress" filename; the completion
    # callback moves it into place once the command succeeds
    if runner.wiki.is_private():
        command_series = runner.get_save_command_series(
            commands, DumpFilename.get_inprogress_name(
                runner.dump_dir.filename_private_path(output_dfname)))
    else:
        command_series = runner.get_save_command_series(
            commands, DumpFilename.get_inprogress_name(
                runner.dump_dir.filename_public_path(output_dfname)))
    self.setup_command_info(runner, command_series, [output_dfname])
    error, _broken = runner.save_command(command_series,
                                         self.command_completion_callback)
    while error and retries < maxretries:
        retries = retries + 1
        time.sleep(5)
        # pass the completion callback on retries too; without it a
        # successful retry would leave the output under its
        # "in progress" name forever
        error, _broken = runner.save_command(command_series,
                                             self.command_completion_callback)
    if error:
        raise BackupError("error dumping siteinfo props %s" %
                          ','.join(self._properties))
def build_command(self, runner, output_dfname):
    """
    Assemble the command series for dumping one sql table, piping the
    table dump through gzip into the in-progress output file.

    arguments:
       runner: Runner object
       output_dfname: DumpFilename for the file to be produced
    """
    commands = runner.db_server_info.build_sqldump_command(
        self._table, runner.wiki.config.gzip)
    # a table marked private goes to the private tree even on a public
    # wiki; private wikis keep everything there
    if self.private or runner.wiki.is_private():
        outpath = runner.dump_dir.filename_private_path(output_dfname)
    else:
        outpath = runner.dump_dir.filename_public_path(output_dfname)
    return runner.get_save_command_series(
        commands, DumpFilename.get_inprogress_name(outpath))
def build_command(self, runner, query, out_dfname):
    """
    Assemble the command series that runs the given sql query and gzips
    its output into the in-progress output file.

    arguments:
       runner: Runner object
       query: sql query text to run
       out_dfname: DumpFilename for the file to be produced
    raises BackupError if the gzip binary is missing
    """
    if not exists(runner.wiki.config.gzip):
        raise BackupError("gzip command %s not found" % runner.wiki.config.gzip)
    series = runner.db_server_info.build_sql_command(query, runner.wiki.config.gzip)
    if runner.wiki.is_private():
        outpath = runner.dump_dir.filename_private_path(out_dfname)
    else:
        outpath = runner.dump_dir.filename_public_path(out_dfname)
    return runner.get_save_command_series(
        series, DumpFilename.get_inprogress_name(outpath))
def build_command(self, runner, output_dfname):
    """
    Build the command series that produces an xml dump of log entries
    via the xmllogs.py wrapper script.

    arguments:
       runner: Runner object
       output_dfname: DumpFilename for the file to be produced; if it
           has a part number, start/end log item id options are added
    raises BackupError if the php binary is missing
    """
    if not os.path.exists(runner.wiki.config.php):
        raise BackupError("php command %s not found" % runner.wiki.config.php)
    if runner.wiki.is_private():
        logging_path = runner.dump_dir.filename_private_path(output_dfname)
    else:
        logging_path = runner.dump_dir.filename_public_path(output_dfname)
    config_file_arg = runner.wiki.config.files[0]
    if runner.wiki.config.override_section:
        config_file_arg = "%s:%s" % (config_file_arg,
                                     runner.wiki.config.override_section)
    command = ["/usr/bin/python3", "xmllogs.py", "--config", config_file_arg,
               "--wiki", runner.db_name,
               "--outfile", DumpFilename.get_inprogress_name(logging_path)]
    if output_dfname.partnum:
        # log item ids start at 1, so this part begins just past the
        # items covered by all earlier parts
        prior = [int(count) for count in self._parts[:output_dfname.partnum_int - 1]]
        command.append("--start=%s" % (sum(prior) + 1))
        # the last part runs through the final log item id, whatever
        # that is, so it gets no end option
        if output_dfname.partnum_int < len(self._parts):
            covered = [int(count) for count in self._parts[:output_dfname.partnum_int]]
            command.append("--end=%s" % (sum(covered) + 1))
    return [[command]]
def build_filters(self, runner, input_dfname):
    """
    Construct the output filter options for dumpTextPass.php

    args: Runner, DumpFilename
    returns the --output option string naming the compressor and the
    in-progress path of the bz2 output file
    raises BackupError if the chosen compressor binary is missing
    """
    if runner.wiki.is_private():
        xmlbz2_path = runner.dump_dir.filename_private_path(input_dfname)
    else:
        xmlbz2_path = runner.dump_dir.filename_public_path(input_dfname)

    # pick the compressor: lbzip2 when configured for history dumps on
    # this wiki, dbzip2 when the configured bzip2 path ends that way,
    # plain bzip2 otherwise
    if 'history' in self.jobinfo['subset'] and runner.wiki.config.lbzip2forhistory:
        if not exists(self.wiki.config.lbzip2):
            raise BackupError("lbzip2 command %s not found" %
                              self.wiki.config.lbzip2)
        bz2mode = "lbzip2"
    elif self.wiki.config.bzip2.endswith("dbzip2"):
        bz2mode = "dbzip2"
    else:
        if not exists(self.wiki.config.bzip2):
            raise BackupError("bzip2 command %s not found" %
                              self.wiki.config.bzip2)
        bz2mode = "bzip2"
    return "--output=%s:%s" % (bz2mode,
                               DumpFilename.get_inprogress_name(xmlbz2_path))
def build_command(self, runner, novariant_dfname, output_dfnames):
    """
    Build the command series for an abstracts dump via xmlabstracts.py,
    producing one output file per language variant in a single pass.

    args: Runner, DumpFilename for output without any language variant,
          list of output DumpFilenames (one per variant)
    """
    config_file_arg = runner.wiki.config.files[0]
    if runner.wiki.config.override_section:
        config_file_arg = "%s:%s" % (config_file_arg,
                                     runner.wiki.config.override_section)
    command = ["/usr/bin/python3", "xmlabstracts.py", "--config", config_file_arg,
               "--wiki", self.db_name]

    output_paths = []
    variants = []
    for dfname in output_dfnames:
        variants.append(self._variant_option(
            self.get_variant_from_dumpname(dfname.dumpname)))
        if runner.wiki.is_private():
            fpath = runner.dump_dir.filename_private_path(dfname)
        else:
            fpath = runner.dump_dir.filename_public_path(dfname)
        output_paths.append(DumpFilename.get_inprogress_name(fpath))
    command.extend(["--outfiles=%s" % ",".join(output_paths),
                    "--variants=%s" % ",".join(variants)])

    if novariant_dfname.partnum:
        # page ids start at 1, so this part begins just past the pages
        # covered by all earlier parts
        prior = [int(count) for count in self._parts[:novariant_dfname.partnum_int - 1]]
        command.append("--start=%s" % (sum(prior) + 1))
        # the last part runs through the final page id, whatever that
        # is, so it gets no end option
        if novariant_dfname.partnum_int < len(self._parts):
            covered = [int(count) for count in self._parts[:novariant_dfname.partnum_int]]
            command.append("--end=%s" % (sum(covered) + 1))
    return [[command]]
def build_command(self, runner, output_dfname, history_dfname, current_dfname):
    """
    Build the command series for an xml stubs dump via xmlstubs.py.

    arguments:
       runner: Runner object
       output_dfname: DumpFilename for the articles stubs file, or None
       history_dfname: DumpFilename for the history stubs file, or None
       current_dfname: DumpFilename for the current stubs file, or None
    raises BackupError if the php binary is missing
    """
    if not os.path.exists(runner.wiki.config.php):
        raise BackupError("php command %s not found" % runner.wiki.config.php)
    config_file_arg = runner.wiki.config.files[0]
    if runner.wiki.config.override_section:
        config_file_arg = "%s:%s" % (config_file_arg,
                                     runner.wiki.config.override_section)
    command = ["/usr/bin/python3", "xmlstubs.py", "--config", config_file_arg,
               "--wiki", runner.db_name]

    output_dir = self.get_output_dir(runner)
    # add an output option for each stubs file actually requested
    for option, dfname in (("--articles", output_dfname),
                           ("--history", history_dfname),
                           ("--current", current_dfname)):
        if dfname is not None:
            command.extend([option, DumpFilename.get_inprogress_name(
                os.path.join(output_dir, dfname.filename))])

    # take the part number from the first output file present
    partnum = None
    for dfname in (output_dfname, history_dfname, current_dfname):
        if dfname is not None:
            partnum = dfname.partnum
            break

    if partnum is not None:
        # page ids start at 1, so this part begins just past the pages
        # covered by all earlier parts
        prior = [int(count) for count in self._parts[:int(partnum) - 1]]
        command.append("--start=%s" % (sum(prior) + 1))
        # the last part runs through the final page id, whatever that
        # is, so it gets no end option
        if int(partnum) < len(self._parts):
            covered = [int(count) for count in self._parts[:int(partnum)]]
            command.append("--end=%s" % (sum(covered) + 1))
    return [[command]]
def cleanup_inprog_files(self, dump_dir, runner):
    """
    Remove leftover "in progress" output files for this job.

    If a specific checkpoint file is set, remove only that file's
    in-progress version (checking the public tree first, then the
    private tree); otherwise remove all in-progress files listed for
    cleanup. In dry-run mode, print what would be removed instead.
    """
    # NOTE(review): structure reconstructed from a flattened one-line
    # source; the "only rerun this one" comment suggests the general
    # cleanup below belongs in the else branch — confirm against VCS.
    if self.checkpoint_file is not None:
        # we only rerun this one, so just remove this one
        pub_path = DumpFilename.get_inprogress_name(
            dump_dir.filename_public_path(self.checkpoint_file))
        priv_path = DumpFilename.get_inprogress_name(
            dump_dir.filename_private_path(self.checkpoint_file))
        if os.path.exists(pub_path):
            if runner.dryrun:
                print("would remove", pub_path)
            else:
                os.remove(pub_path)
        elif os.path.exists(priv_path):
            if runner.dryrun:
                print("would remove", priv_path)
            else:
                os.remove(priv_path)
    else:
        dfnames = self.list_inprog_files_for_cleanup(dump_dir)
        if runner.dryrun:
            print("would remove ", [dfname.filename for dfname in dfnames])
        else:
            for dfname in dfnames:
                self.remove_output_file(dump_dir, dfname)
def build_command(self, runner, output_dfnames):
    """
    arguments:
       runner: Runner object
       output_dfnames: if checkpointing of files is enabled, this should
           be a list of checkpoint files (DumpFilename), otherwise it
           should be a list of the one file that will be produced by the
           dump

    Note that checkpoint files get done one at a time, not in parallel
    raises BackupError if the needed compression binaries are missing
    """
    # FIXME need shell escape
    config = self.wiki.config
    if config.lbzip2threads:
        if not exists(config.lbzip2):
            raise BackupError("lbzip2 command %s not found" % config.lbzip2)
    elif not exists(config.bzip2):
        raise BackupError("bzip2 command %s not found" % config.bzip2)
    if not exists(config.sevenzip):
        raise BackupError("7zip command %s not found" % config.sevenzip)

    command_series = []
    for out_dfname in output_dfnames:
        # input has the same name attributes as the output, but with the
        # extension of the job being recompressed
        input_dfname = DumpFilename(self.wiki, None, out_dfname.dumpname,
                                    out_dfname.file_type,
                                    self.item_for_recompression.file_ext,
                                    out_dfname.partnum, out_dfname.checkpoint)
        if runner.wiki.is_private():
            outfilepath = runner.dump_dir.filename_private_path(out_dfname)
            infilepath = runner.dump_dir.filename_private_path(input_dfname)
        else:
            outfilepath = runner.dump_dir.filename_public_path(out_dfname)
            infilepath = runner.dump_dir.filename_public_path(input_dfname)

        if config.lbzip2threads:
            # one thread only, as these already run in parallel
            decompr_command = "{lbzip2} -dc -n 1 {infile}".format(
                lbzip2=config.lbzip2, infile=infilepath)
        else:
            decompr_command = "{bzip2} -dc {infile}".format(
                bzip2=config.bzip2, infile=infilepath)
        command_series.append([["{decompr} | {sevenzip} a -mx=4 -si {ofile}".format(
            decompr=decompr_command, sevenzip=config.sevenzip,
            ofile=DumpFilename.get_inprogress_name(outfilepath))]])
    return command_series
def build_command(self, runner, output_dfname):
    """
    Build the command series for dumping Flow pages via the Flow
    extension's dumpBackup.php maintenance script, writing bzip2
    output to the in-progress filename.

    arguments:
       runner: Runner object
       output_dfname: DumpFilename for the file to be produced
    raises BackupError if the php binary is missing
    """
    if not os.path.exists(runner.wiki.config.php):
        raise BackupError("php command %s not found" % runner.wiki.config.php)
    if runner.wiki.is_private():
        flow_output_fpath = runner.dump_dir.filename_private_path(output_dfname)
    else:
        flow_output_fpath = runner.dump_dir.filename_public_path(output_dfname)
    command = [runner.wiki.config.php]
    command.extend(MultiVersion.mw_script_as_array(
        runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php"))
    command.extend(["--wiki=%s" % runner.db_name,
                    "--current", "--report=1000",
                    "--output=bzip2:%s" % DumpFilename.get_inprogress_name(
                        flow_output_fpath)])
    if self.history:
        command.append("--full")
    return [[command]]