def chkpt_file_from_page_range(self, page_range, partnum):
    """given a page range and a part number, return the corresponding
    checkpoint output DumpFilename"""
    checkpoint_string = DumpFilename.make_checkpoint_string(
        page_range[0], page_range[1])
    output_file = DumpFilename(self.wiki, self.wiki.date, self.dumpname,
                               self.get_filetype(), self.get_file_ext(),
                               partnum, checkpoint=checkpoint_string,
                               temp=False)
    return output_file
def make_dfname_from_pagerange(self, pagerange, partnum):
    """
    given pagerange, make output file for appropriate
    type of page content dumps
    args: (startpage<str>, endpage<str>), string
    """
    checkpoint_string = DumpFilename.make_checkpoint_string(
        pagerange[0], pagerange[1])
    output_dfname = DumpFilename(self.wiki, self.wiki.date,
                                 self.get_dumpname(),
                                 self.get_filetype(), self.get_file_ext(),
                                 partnum, checkpoint=checkpoint_string,
                                 temp=False)
    return output_dfname
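# Illustration (hypothetical values, not from the original source): given
# the dump filename conventions used above, a call such as
#     self.make_dfname_from_pagerange(("1", "1500"), "1")
# should produce a DumpFilename whose .filename is shaped roughly like
#     enwiki-20210101-pages-articles1.xml-p1p1500.bz2
# where "p1p1500" is the checkpoint string from make_checkpoint_string()
# and the wiki name, date, and extension here are example placeholders.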
def get_pagerange_stub_dfname(self, wanted, runner):
    """
    return the dumpfilename for stub file that would have
    the page range in 'wanted'
    """
    stub_input_dfname = self.get_stub_dfname(wanted['partnum'], runner)
    stub_output_dfname = DumpFilename(
        self.wiki, stub_input_dfname.date, stub_input_dfname.dumpname,
        stub_input_dfname.file_type,
        stub_input_dfname.file_ext,
        stub_input_dfname.partnum,
        DumpFilename.make_checkpoint_string(
            wanted['outfile'].first_page_id,
            wanted['outfile'].last_page_id),
        temp=False)
    return stub_output_dfname
def get_chkptfile_from_pageids(self):
    """return the filename of the checkpoint file covering
    the requested page id range"""
    if ',' in self.page_id_range:
        first_page_id, last_page_id = self.page_id_range.split(',', 1)
    else:
        first_page_id = self.page_id_range
        # indicates no last page id specified, go to end of stub
        last_page_id = "00000"
    checkpoint_string = DumpFilename.make_checkpoint_string(
        first_page_id, last_page_id)
    if self._partnum_todo:
        partnum = self._partnum_todo
    else:
        # fixme is that right? maybe NOT
        partnum = None
    fileobj = DumpFilename(self.wiki, self.wiki.date, self.get_dumpname(),
                           self.get_filetype(), self.get_file_ext(),
                           partnum, checkpoint_string)
    return fileobj.filename
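# Worked example (hypothetical values, not from the original source):
#     self.page_id_range == "100,5000" -> first "100", last "5000",
#         giving a checkpoint string like "p100p5000"
#     self.page_id_range == "4500"     -> first "4500", last "00000",
#         the sentinel meaning "open-ended: continue to the end of the stub"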
def run(self, runner):
    # here we will either clean up or not depending on how we were called
    # FIXME callers should set this appropriately and they don't right now
    self.cleanup_old_files(runner.dump_dir, runner)

    # clean up all tmp output files from previous attempts of this job
    # for this dump wiki and date, otherwise we'll wind up indexing
    # them and hashsumming them etc.
    # they may have been left around from an interrupted or failed
    # earlier run
    self.cleanup_tmp_files(runner.dump_dir, runner)

    # in cases where we have a request for a specific file, do it as asked,
    # no splitting it up into smaller pieces
    do_bitesize = False

    commands = []

    dfnames_todo = []
    if self.jobinfo['pageid_range'] is not None:
        # convert to checkpoint filename, handle the same way
        dfnames_todo = [self.get_pagerange_output_dfname()]
    elif self.checkpoint_file:
        dfnames_todo = [self.checkpoint_file]
    elif self._checkpoints_enabled:
        do_bitesize = True
        stub_pageranges = self.get_ranges_covered_by_stubs(runner)
        stub_pageranges = sorted(stub_pageranges, key=lambda x: int(x[0]))
        dfnames_todo = self.get_dfnames_for_missing_pranges(runner, stub_pageranges)
        # replace stub ranges for output files that cover smaller
        # ranges, with just those numbers
        new_stub_ranges = []
        for dfname in dfnames_todo:
            if dfname.is_checkpoint_file:
                new_stub_ranges.append((dfname.first_page_id,
                                        dfname.last_page_id, dfname.partnum))
            else:
                for srange in stub_pageranges:
                    if srange[2] == dfname.partnum:
                        new_stub_ranges.append(srange)
        stub_pageranges = new_stub_ranges
    else:
        output_dfnames = self.get_reg_files_for_filepart_possible(
            runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
        # at least some page ranges are covered, just do those that aren't
        if runner.wiki.is_private():
            dfnames_todo = [
                dfname for dfname in output_dfnames if not os.path.exists(
                    runner.dump_dir.filename_private_path(dfname))]
        else:
            dfnames_todo = [
                dfname for dfname in output_dfnames if not os.path.exists(
                    runner.dump_dir.filename_public_path(dfname))]

    if self._checkpoints_enabled and do_bitesize:
        dfnames_todo = self.make_bitesize_jobs(dfnames_todo, stub_pageranges)

    prefetcher = None
    if self.jobinfo['prefetch']:
        if runner.wiki.config.sevenzip_prefetch:
            file_exts = ['7z', self.file_ext]
        else:
            file_exts = [self.file_ext]
        prefetcher = PrefetchFinder(
            self.wiki,
            {'name': self.name(), 'desc': self.jobinfo['desc'],
             'dumpname': self.get_dumpname(),
             'ftype': self.file_type, 'fexts': file_exts,
             'subset': self.jobinfo['subset']},
            {'date': self.jobinfo['prefetchdate'], 'parts': self._parts},
            self.verbose)

    wanted = [self.setup_wanted(dfname, runner, prefetcher)
              for dfname in dfnames_todo]

    to_generate = []
    for entry in wanted:
        if entry['generate']:
            to_generate.append((entry['stub_input'], entry['stub']))

    if self._parts:
        batchsize = int(len(self._parts) / 2)
    else:
        batchsize = 1
    self.stubber.write_pagerange_stubs(to_generate, runner, batchsize,
                                       self.move_if_truncated)

    for entry in wanted:
        if entry['generate']:
            if self.stubber.has_no_pages(entry['stub'], runner, tempdir=True):
                # this page range has no pages in it (all deleted?) so we
                # need not keep info on how to generate it
                continue
        # series = self.build_command(runner, entry['stub'], entry['prefetch'])
        output_dfname = DumpFilename(self.wiki, entry['stub'].date,
                                     self.get_dumpname(), self.get_filetype(),
                                     self.file_ext, entry['stub'].partnum,
                                     DumpFilename.make_checkpoint_string(
                                         entry['stub'].first_page_id,
                                         entry['stub'].last_page_id),
                                     False)
        entry['command'] = self.build_command(runner, entry['stub'],
                                              entry['prefetch'], output_dfname)
        self.setup_command_info(runner, entry['command'], [output_dfname])
        commands.append(entry['command'])

    # don't do them all at once, do only up to _parts commands
    # at the same time
    if self._parts:
        batchsize = len(self._parts)
    else:
        batchsize = 1
    errors = False
    failed_commands = []
    max_retries = self.wiki.config.max_retries
    retries = 0
    while commands and (retries < max_retries or retries == 0):
        command_batch = commands[:batchsize]
        error, broken = runner.run_command(
            command_batch, callback_stderr=self.progress_callback,
            callback_stderr_arg=runner,
            callback_on_completion=self.command_completion_callback)
        if error:
            for series in broken:
                for pipeline in series:
                    runner.log_and_print("error from commands: %s" % " ".join(
                        [entry for entry in pipeline]))
            failed_commands.extend(broken)
            errors = True
        commands = commands[batchsize:]
        if not commands and failed_commands:
            retries += 1
            if retries < max_retries:
                # retry failed commands
                commands = failed_commands
                failed_commands = []
                # no instant retries, give the servers a break
                time.sleep(self.wiki.config.retry_wait)
                errors = False

    if errors:
        raise BackupError("error producing xml file(s) %s" % self.get_dumpname())
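# Illustration (hypothetical numbers, not from the original source): with
# self._parts == ['1', '2', '3'] and seven command series queued, the retry
# loop above runs the slices
#     commands[0:3], commands[3:6], commands[6:7]
# three series at a time; series that fail are collected in failed_commands
# and rerun in later rounds, up to max_retries rounds, sleeping
# retry_wait seconds between rounds.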
def build_command(self, runner, stub_file):
    """Build the command line for the dump, minus output and filter options"""

    # we write a temp file, it will be checkpointed every so often.
    temp = bool(self._checkpoints_enabled)

    output_file = DumpFilename(self.wiki, stub_file.date, self.dumpname,
                               self.get_filetype(), self.file_ext,
                               stub_file.partnum,
                               DumpFilename.make_checkpoint_string(
                                   stub_file.first_page_id,
                                   stub_file.last_page_id),
                               temp)

    stub_path = os.path.join(self.wiki.config.temp_dir, stub_file.filename)
    if os.path.exists(stub_path):
        # if this is a partial stub file in temp dir, use that
        stub_option = "--stub=gzip:%s" % stub_path
    else:
        # use regular stub file
        stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_file)

    # Try to pull text from the previous run; most stuff hasn't changed
    # Source=$OutputDir/pages_$section.xml.bz2
    sources = []
    possible_sources = None
    if self._prefetch:
        possible_sources = self._find_previous_dump(runner, output_file.partnum)
    # if we have a list of more than one then we need to check existence
    # for each and put them together in a string
    if possible_sources:
        for sourcefile in possible_sources:
            # if we are doing a partial stub run, include only the analogous
            # checkpointed prefetch files, if there are checkpointed files;
            # otherwise we'll use all the sourcefiles reported
            if not self.chkptfile_in_pagerange(stub_file, sourcefile):
                continue
            sname = runner.dump_dir.filename_public_path(sourcefile, sourcefile.date)
            if exists(sname):
                sources.append(sname)

    if output_file.partnum:
        partnum_str = "%s" % stub_file.partnum
    else:
        partnum_str = ""

    if len(sources) > 0:
        source = "bzip2:%s" % (";".join(sources))
        runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..." %
                                 (self._subset, partnum_str, source))
        prefetch = "--prefetch=%s" % (source)
    else:
        runner.show_runner_state("... building %s %s XML dump, no text prefetch..." %
                                 (self._subset, partnum_str))
        prefetch = ""

    if self._spawn:
        spawn = "--spawn=%s" % (self.wiki.config.php)
    else:
        spawn = ""

    if not exists(self.wiki.config.php):
        raise BackupError("php command %s not found" % self.wiki.config.php)

    if self._checkpoints_enabled:
        checkpoint_time = "--maxtime=%s" % (self.wiki.config.checkpoint_time)
        checkpoint_file = "--checkpointfile=%s" % output_file.new_filename(
            output_file.dumpname, output_file.file_type, output_file.file_ext,
            output_file.date, output_file.partnum, "p%sp%s", None)
    else:
        checkpoint_time = ""
        checkpoint_file = ""

    script_command = MultiVersion.mw_script_as_array(runner.wiki.config,
                                                     "dumpTextPass.php")
    dump_command = [self.wiki.config.php]
    dump_command.extend(script_command)
    dump_command.extend(["--wiki=%s" % runner.db_name,
                         "%s" % stub_option,
                         "%s" % prefetch,
                         "%s" % checkpoint_time,
                         "%s" % checkpoint_file,
                         "--report=1000",
                         "%s" % spawn])

    dump_command = [entry for entry in dump_command if entry is not None]
    command = dump_command
    filters = self.build_filters(runner, output_file)
    eta = self.build_eta(runner)
    command.extend([filters, eta])
    pipeline = [command]
    series = [pipeline]
    return series
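# For reference (paths and values hypothetical, not from the original
# source), the assembled dump command looks something like:
#     /usr/bin/php .../maintenance/dumpTextPass.php --wiki=enwiki \
#         --stub=gzip:/mnt/dumps/temp/enwiki-20210101-stub-articles1.xml.gz \
#         --prefetch=bzip2:/mnt/dumps/public/enwiki-20201201-pages-articles1.xml.bz2 \
#         --maxtime=3600 \
#         --checkpointfile=enwiki-20210101-pages-articles1.xml-p%sp%s.bz2 \
#         --report=1000 --spawn=/usr/bin/php
# with the output filter from build_filters() and the ETA stage from
# build_eta() appended to the same pipeline.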
def run(self, runner):
    # here we will either clean up or not depending on how we were called FIXME
    self.cleanup_old_files(runner.dump_dir, runner)
    commands = []
    todo = []

    if self.page_id_range is not None:
        # convert to checkpoint filename, handle the same way
        self.checkpoint_file = self.get_chkptfile_from_pageids()

    if self.checkpoint_file:
        todo = [self.checkpoint_file]
    else:
        # list all the output files that would be produced w/o
        # checkpoint files on
        outfiles = self.get_reg_files_for_filepart_possible(
            runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
        if self._checkpoints_enabled:
            # get the stub list that would be used for the current run
            stubs = self.get_stub_files(runner)
            stubs = sorted(stubs, key=lambda thing: thing.filename)

            # get the page ranges covered by stubs
            stub_ranges = []
            for stub in stubs:
                fname = DumpFile(self.wiki,
                                 runner.dump_dir.filename_public_path(stub, stub.date),
                                 stub, self.verbose)
                stub_ranges.append((fname.find_first_page_id_in_file(),
                                    self.find_last_page_id(stub, runner),
                                    stub.partnum))

            # get list of existing checkpoint files
            chkpt_files = self.list_checkpt_files(
                runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
            chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)

            # get the page ranges covered by existing checkpoint files
            checkpoint_ranges = [(chkptfile.first_page_id,
                                  chkptfile.last_page_id,
                                  chkptfile.partnum)
                                 for chkptfile in chkpt_files]
            if self.verbose:
                print("checkpoint_ranges is", checkpoint_ranges)
                print("stub_ranges is", stub_ranges)

            if not checkpoint_ranges:
                # no page ranges covered by checkpoints. do all output files
                # the usual way
                todo = outfiles
            else:
                todo = []
                parts = self.get_fileparts_list()
                for partnum in parts:
                    if not any(int(chkpt_range[2]) == int(partnum)
                               for chkpt_range in checkpoint_ranges):
                        # no page ranges covered by checkpoints for a particular
                        # file part (subjob) so do that output file the
                        # regular way
                        todo.extend([outfile for outfile in outfiles
                                     if int(outfile.partnum) == int(partnum)])
                missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                             for (first, last, partnum) in missing])
        else:
            # do the missing files only
            # FIXME public or private depending on the wiki!
            todo = [outfile for outfile in outfiles
                    if not os.path.exists(
                        runner.dump_dir.filename_public_path(outfile))]

    partial_stubs = []
    if self.verbose:
        print("todo is", [to.filename for to in todo])
    for fileobj in todo:
        stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

        if fileobj.first_page_id is None:
            partial_stubs.append(stub_for_file)
        else:
            stub_output_file = DumpFilename(
                self.wiki, fileobj.date, fileobj.dumpname,
                self.item_for_stubs.get_filetype(),
                self.item_for_stubs.get_file_ext(),
                fileobj.partnum,
                DumpFilename.make_checkpoint_string(
                    fileobj.first_page_id, fileobj.last_page_id),
                temp=True)

            self.write_partial_stub(stub_for_file, stub_output_file, runner)
            if not self.has_no_entries(stub_output_file, runner):
                partial_stubs.append(stub_output_file)

    if self.verbose:
        print("partial_stubs is", [ps.filename for ps in partial_stubs])
    if partial_stubs:
        stub_files = partial_stubs
    else:
        return

    for stub_file in stub_files:
        series = self.build_command(runner, stub_file)
        commands.append(series)

    error = runner.run_command(commands, callback_stderr=self.progress_callback,
                               callback_stderr_arg=runner)
    if error:
        raise BackupError("error producing xml file(s) %s" % self.dumpname)
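# Sketch (illustrative only, not the actual implementation) of what
# find_missing_ranges() above is expected to compute: the page ranges
# present in the stubs but not yet covered by any checkpoint file, per
# part. Assumes (first, last, partnum) tuples of ints, with checkpoint
# ranges non-overlapping within a part:
#
#     def find_missing_ranges_sketch(stub_ranges, chkpt_ranges):
#         missing = []
#         for (first, last, part) in stub_ranges:
#             cursor = first
#             for (cfirst, clast, cpart) in sorted(chkpt_ranges):
#                 if cpart != part or clast < cursor or cfirst > last:
#                     continue
#                 if cfirst > cursor:
#                     missing.append((cursor, cfirst - 1, part))
#                 cursor = clast + 1
#             if cursor <= last:
#                 missing.append((cursor, last, part))
#         return missing
#
# e.g. a stub range (1, 100, 1) with one checkpoint (10, 50, 1) yields
# the missing ranges [(1, 9, 1), (51, 100, 1)].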