Example #1
    def chkpt_file_from_page_range(self, page_range, partnum):
        checkpoint_string = DumpFilename.make_checkpoint_string(
            page_range[0], page_range[1])
        output_file = DumpFilename(self.wiki, self.wiki.date, self.dumpname,
                                   self.get_filetype(), self.get_file_ext(),
                                   partnum, checkpoint=checkpoint_string,
                                   temp=False)
        return output_file
    def make_dfname_from_pagerange(self, pagerange, partnum):
        """
        given a pagerange, make an output dfname for the appropriate
        type of page content dump
        args: (startpage<str>, endpage<str>), string
        """
        checkpoint_string = DumpFilename.make_checkpoint_string(
            pagerange[0], pagerange[1])
        output_dfname = DumpFilename(self.wiki, self.wiki.date, self.get_dumpname(),
                                     self.get_filetype(), self.get_file_ext(),
                                     partnum, checkpoint=checkpoint_string,
                                     temp=False)
        return output_dfname
    def get_pagerange_stub_dfname(self, wanted, runner):
        """
        return the dumpfilename for the stub file that would have
        the page range in 'wanted'
        """
        stub_input_dfname = self.get_stub_dfname(wanted['partnum'], runner)
        stub_output_dfname = DumpFilename(
            self.wiki, stub_input_dfname.date, stub_input_dfname.dumpname,
            stub_input_dfname.file_type,
            stub_input_dfname.file_ext,
            stub_input_dfname.partnum,
            DumpFilename.make_checkpoint_string(
                wanted['outfile'].first_page_id, wanted['outfile'].last_page_id),
            temp=False)
        return stub_output_dfname
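
A minimal illustrative sketch (not from the dumps codebase): both helpers above delegate the page-range naming to DumpFilename.make_checkpoint_string. Judging from the "p%sp%s" pattern passed to new_filename() later in this collection, the checkpoint string presumably looks like the stand-in below; the function name is real, the body is an assumption.

def make_checkpoint_string(first_page_id, last_page_id):
    # hypothetical stand-in for DumpFilename.make_checkpoint_string,
    # assuming the "p<first>p<last>" convention seen elsewhere in these examples
    return "p%sp%s" % (first_page_id, last_page_id)

print(make_checkpoint_string("1", "1500"))  # -> p1p1500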
Example #4
    def get_chkptfile_from_pageids(self):
        if ',' in self.page_id_range:
            first_page_id, last_page_id = self.page_id_range.split(',', 1)
        else:
            first_page_id = self.page_id_range
            last_page_id = "00000"  # indicates no last page id specified, go to end of stub
        checkpoint_string = DumpFilename.make_checkpoint_string(first_page_id, last_page_id)
        if self._partnum_todo:
            partnum = self._partnum_todo
        else:
            # fixme is that right? maybe NOT
            partnum = None
        # pass the wiki first, matching the DumpFilename constructor as used elsewhere
        fileobj = DumpFilename(self.wiki, self.wiki.date, self.get_dumpname(),
                               self.get_filetype(), self.get_file_ext(),
                               partnum, checkpoint_string)
        return fileobj.filename
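
The page_id_range parsing convention above is easy to check in isolation. A self-contained sketch of the same branch logic (parse_page_id_range is a hypothetical name, not in the codebase):

def parse_page_id_range(page_id_range):
    # mirrors get_chkptfile_from_pageids: "first,last" or just "first"
    if ',' in page_id_range:
        first_page_id, last_page_id = page_id_range.split(',', 1)
    else:
        first_page_id = page_id_range
        last_page_id = "00000"  # sentinel: no last page id, read to end of stub
    return first_page_id, last_page_id

print(parse_page_id_range("1000,2000"))  # ('1000', '2000')
print(parse_page_id_range("1000"))       # ('1000', '00000')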
    def run(self, runner):
        # here we will either clean up or not depending on how we were called
        # FIXME callers should set this appropriately and they don't right now
        self.cleanup_old_files(runner.dump_dir, runner)

        # clean up all tmp output files from previous attempts of this job
        # for this dump wiki and date, otherwise we'll wind up indexing
        # and hashsumming them etc.; they may have been left around from
        # an interrupted or failed earlier run
        self.cleanup_tmp_files(runner.dump_dir, runner)

        # when a specific file has been requested, do it as asked,
        # with no splitting up into smaller pieces
        do_bitesize = False

        commands = []

        dfnames_todo = []
        if self.jobinfo['pageid_range'] is not None:
            # convert to checkpoint filename, handle the same way
            dfnames_todo = [self.get_pagerange_output_dfname()]
        elif self.checkpoint_file:
            dfnames_todo = [self.checkpoint_file]
        elif self._checkpoints_enabled:
            do_bitesize = True
            stub_pageranges = self.get_ranges_covered_by_stubs(runner)
            stub_pageranges = sorted(stub_pageranges, key=lambda x: int(x[0]))
            dfnames_todo = self.get_dfnames_for_missing_pranges(runner, stub_pageranges)
            # replace stub ranges for output files that cover smaller
            # ranges, with just those numbers
            new_stub_ranges = []
            for dfname in dfnames_todo:
                if dfname.is_checkpoint_file:
                    new_stub_ranges.append((dfname.first_page_id,
                                            dfname.last_page_id, dfname.partnum))
                else:
                    for srange in stub_pageranges:
                        if srange[2] == dfname.partnum:
                            new_stub_ranges.append(srange)
            stub_pageranges = new_stub_ranges
        else:
            output_dfnames = self.get_reg_files_for_filepart_possible(
                runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
            # checkpoints are disabled; do only the output files that
            # don't already exist
            if runner.wiki.is_private():
                dfnames_todo = [
                    dfname for dfname in output_dfnames if not os.path.exists(
                        runner.dump_dir.filename_private_path(dfname))]
            else:
                dfnames_todo = [
                    dfname for dfname in output_dfnames if not os.path.exists(
                        runner.dump_dir.filename_public_path(dfname))]
        if self._checkpoints_enabled and do_bitesize:
            dfnames_todo = self.make_bitesize_jobs(dfnames_todo, stub_pageranges)

        if self.jobinfo['prefetch']:
            if runner.wiki.config.sevenzip_prefetch:
                file_exts = ['7z', self.file_ext]
            else:
                file_exts = [self.file_ext]
            prefetcher = PrefetchFinder(
                self.wiki,
                {'name': self.name(), 'desc': self.jobinfo['desc'],
                 'dumpname': self.get_dumpname(),
                 'ftype': self.file_type, 'fexts': file_exts,
                 'subset': self.jobinfo['subset']},
                {'date': self.jobinfo['prefetchdate'], 'parts': self._parts},
                self.verbose)
        else:
            # without this, the setup_wanted() comprehension below would hit a NameError
            prefetcher = None

        wanted = [self.setup_wanted(dfname, runner, prefetcher) for dfname in dfnames_todo]

        to_generate = []
        for entry in wanted:
            if entry['generate']:
                to_generate.append((entry['stub_input'], entry['stub']))
        if self._parts:
            # use half as many at a time as there are parts, but at least one
            batchsize = max(1, len(self._parts) // 2)
        else:
            batchsize = 1
        self.stubber.write_pagerange_stubs(to_generate, runner, batchsize, self.move_if_truncated)

        for entry in wanted:
            if entry['generate']:
                if self.stubber.has_no_pages(entry['stub'], runner, tempdir=True):
                    # this page range has no pages in it (all deleted?) so we need not
                    # keep info on how to generate it
                    continue
            # series = self.build_command(runner, entry['stub'], entry['prefetch'])
            output_dfname = DumpFilename(self.wiki, entry['stub'].date, self.get_dumpname(),
                                         self.get_filetype(), self.file_ext, entry['stub'].partnum,
                                         DumpFilename.make_checkpoint_string(
                                             entry['stub'].first_page_id,
                                             entry['stub'].last_page_id),
                                         False)
            entry['command'] = self.build_command(runner, entry['stub'],
                                                  entry['prefetch'], output_dfname)
            self.setup_command_info(runner, entry['command'], [output_dfname])
            commands.append(entry['command'])

        # don't do them all at once, do only up to _parts commands at the same time
        if self._parts:
            batchsize = len(self._parts)
        else:
            batchsize = 1
        errors = False
        failed_commands = []
        max_retries = self.wiki.config.max_retries
        retries = 0
        while commands and (retries < max_retries or retries == 0):
            command_batch = commands[:batchsize]
            error, broken = runner.run_command(
                command_batch, callback_stderr=self.progress_callback,
                callback_stderr_arg=runner,
                callback_on_completion=self.command_completion_callback)
            if error:
                for series in broken:
                    for pipeline in series:
                        runner.log_and_print("error from commands: %s" % " ".join(
                            [entry for entry in pipeline]))
                # extend, not append: 'broken' is already a list of command series
                failed_commands.extend(broken)
                errors = True
            commands = commands[batchsize:]
            if not commands and failed_commands:
                retries += 1
                if retries < max_retries:
                    # retry failed commands
                    commands = failed_commands
                    failed_commands = []
                    # no instant retries, give the servers a break
                    time.sleep(self.wiki.config.retry_wait)
                    errors = False
        if errors:
            raise BackupError("error producing xml file(s) %s" % self.get_dumpname())
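
The retry loop at the end of run() is a general pattern: consume the command list batchsize at a time, collect failed series, and when the list is drained, requeue the failures up to max_retries with a pause between rounds. A standalone sketch of just that pattern (run_batch, retry_wait and the command values are hypothetical stand-ins):

import time

def run_batches(commands, run_batch, batchsize, max_retries, retry_wait):
    failed = []
    retries = 0
    while commands and (retries < max_retries or retries == 0):
        # run_batch returns (error, broken) like runner.run_command above
        error, broken = run_batch(commands[:batchsize])
        if error:
            failed.extend(broken)
        commands = commands[batchsize:]
        if not commands and failed:
            retries += 1
            if retries < max_retries:
                # requeue the failures; no instant retries, give the servers a break
                commands, failed = failed, []
                time.sleep(retry_wait)
    return failed  # anything still here failed every retry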
Example #6
    def build_command(self, runner, stub_file):
        """Build the command line for the dump, minus output and filter options"""

        # we write a temp file, it will be checkpointed every so often.
        temp = bool(self._checkpoints_enabled)

        output_file = DumpFilename(self.wiki, stub_file.date, self.dumpname,
                                   self.get_filetype(), self.file_ext, stub_file.partnum,
                                   DumpFilename.make_checkpoint_string(stub_file.first_page_id,
                                                                       stub_file.last_page_id),
                                   temp)

        stub_path = os.path.join(self.wiki.config.temp_dir, stub_file.filename)
        if os.path.exists(stub_path):
            # if this is a partial stub file in temp dir, use that
            stub_option = "--stub=gzip:%s" % stub_path
        else:
            # use regular stub file
            stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_file)

        # Try to pull text from the previous run; most stuff hasn't changed
        # Source=$OutputDir/pages_$section.xml.bz2
        sources = []
        possible_sources = None
        if self._prefetch:
            possible_sources = self._find_previous_dump(runner, output_file.partnum)
            # if we have a list of more than one then
            # we need to check existence for each and put them together in a string
            if possible_sources:
                for sourcefile in possible_sources:
                    # if we are doing a partial stub run, include only the analogous
                    # checkpointed prefetch files, if there are any;
                    # otherwise we'll use all the sourcefiles reported
                    if not self.chkptfile_in_pagerange(stub_file, sourcefile):
                        continue
                    sname = runner.dump_dir.filename_public_path(sourcefile, sourcefile.date)
                    if exists(sname):
                        sources.append(sname)
        if output_file.partnum:
            partnum_str = "%s" % stub_file.partnum
        else:
            partnum_str = ""
        if len(sources) > 0:
            source = "bzip2:%s" % (";".join(sources))
            runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..." %
                                     (self._subset, partnum_str, source))
            prefetch = "--prefetch=%s" % (source)
        else:
            runner.show_runner_state("... building %s %s XML dump, no text prefetch..." %
                                     (self._subset, partnum_str))
            prefetch = ""

        if self._spawn:
            spawn = "--spawn=%s" % (self.wiki.config.php)
        else:
            spawn = ""

        if not exists(self.wiki.config.php):
            raise BackupError("php command %s not found" % self.wiki.config.php)

        if self._checkpoints_enabled:
            checkpoint_time = "--maxtime=%s" % (self.wiki.config.checkpoint_time)
            checkpoint_file = "--checkpointfile=%s" % output_file.new_filename(
                output_file.dumpname, output_file.file_type, output_file.file_ext,
                output_file.date, output_file.partnum, "p%sp%s", None)
        else:
            checkpoint_time = ""
            checkpoint_file = ""
        script_command = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
        dump_command = [self.wiki.config.php]
        dump_command.extend(script_command)
        dump_command.extend(["--wiki=%s" % runner.db_name,
                             "%s" % stub_option,
                             "%s" % prefetch,
                             "%s" % checkpoint_time,
                             "%s" % checkpoint_file,
                             "--report=1000",
                             "%s" % spawn])

        dump_command = [entry for entry in dump_command if entry is not None]
        command = dump_command
        filters = self.build_filters(runner, output_file)
        eta = self.build_eta(runner)
        command.extend([filters, eta])
        pipeline = [command]
        series = [pipeline]
        return series
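
For clarity, the nesting that build_command() returns (and that the error loop in Example #4 walks with "for series in broken: for pipeline in series:"): a command is an argv list, a pipeline is a list of commands whose output feeds the next, and a series is a list of pipelines run in order. Illustration only, with a made-up argv:

command = ["/usr/bin/php", "dumpTextPass.php", "--wiki=somewiki"]  # hypothetical argv
pipeline = [command]   # one-stage pipeline
series = [pipeline]    # one pipeline in the series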
Example #7
    def run(self, runner):
        # here we will either clean up or not depending on how we were called FIXME
        self.cleanup_old_files(runner.dump_dir, runner)
        commands = []

        todo = []

        if self.page_id_range is not None:
            # convert to checkpoint filename, handle the same way
            self.checkpoint_file = self.get_chkptfile_from_pageids()

        if self.checkpoint_file:
            todo = [self.checkpoint_file]
        else:
            # list all the output files that would be produced w/o
            # checkpoint files on
            outfiles = self.get_reg_files_for_filepart_possible(
                runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
            if self._checkpoints_enabled:

                # get the stub list that would be used for the current run
                stubs = self.get_stub_files(runner)
                stubs = sorted(stubs, key=lambda thing: thing.filename)

                # get the page ranges covered by stubs
                stub_ranges = []
                for stub in stubs:
                    fname = DumpFile(self.wiki,
                                     runner.dump_dir.filename_public_path(stub, stub.date),
                                     stub, self.verbose)
                    stub_ranges.append((fname.find_first_page_id_in_file(),
                                        self.find_last_page_id(stub, runner), stub.partnum))

                # get list of existing checkpoint files
                chkpt_files = self.list_checkpt_files(
                    runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
                chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)
                # get the page ranges covered by existing checkpoint files
                checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id,
                                      chkptfile.partnum)
                                     for chkptfile in chkpt_files]
                if self.verbose:
                    print("checkpoint_ranges is", checkpoint_ranges)
                    print("stub_ranges is", stub_ranges)

                if not checkpoint_ranges:
                    # no page ranges covered by checkpoints. do all output files
                    # the usual way
                    todo = outfiles
                else:
                    todo = []
                    parts = self.get_fileparts_list()
                    for partnum in parts:
                        if not any(int(chkpt_range[2]) == int(partnum)
                                   for chkpt_range in checkpoint_ranges):
                            # no page ranges covered by checkpoints for a particular
                            # file part (subjob) so do that output file the
                            # regular way
                            todo.extend([outfile for outfile in outfiles
                                         if int(outfile.partnum) == int(partnum)])

                    missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                    todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                                 for (first, last, partnum) in missing])

            else:
                # do the missing files only
                # FIXME public or private depending on the wiki!
                todo = [outfile for outfile in outfiles
                        if not os.path.exists(runner.dump_dir.filename_public_path(outfile))]

        partial_stubs = []
        if self.verbose:
            print("todo is", [to.filename for to in todo])
        for fileobj in todo:

            stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

            if fileobj.first_page_id is None:
                partial_stubs.append(stub_for_file)
            else:
                stub_output_file = DumpFilename(
                    self.wiki, fileobj.date, fileobj.dumpname,
                    self.item_for_stubs.get_filetype(),
                    self.item_for_stubs.get_file_ext(),
                    fileobj.partnum,
                    DumpFilename.make_checkpoint_string(
                        fileobj.first_page_id, fileobj.last_page_id), temp=True)

                self.write_partial_stub(stub_for_file, stub_output_file, runner)
                if not self.has_no_entries(stub_output_file, runner):
                    partial_stubs.append(stub_output_file)

        if self.verbose:
            print("partial_stubs is", [ps.filename for ps in partial_stubs])
        if partial_stubs:
            stub_files = partial_stubs
        else:
            return

        for stub_file in stub_files:
            series = self.build_command(runner, stub_file)
            commands.append(series)

        error = runner.run_command(commands, callback_stderr=self.progress_callback,
                                   callback_stderr_arg=runner)
        if error:
            raise BackupError("error producing xml file(s) %s" % self.dumpname)
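
find_missing_ranges() is not shown in these examples; below is a plausible sketch of what it must compute, assuming inclusive (first, last, partnum) tuples of stringified ints as built above: the sub-ranges that stubs cover but no checkpoint file does. This is an assumption about its contract, not the codebase's implementation.

def find_missing_ranges(stub_ranges, checkpoint_ranges):
    # for each stub range, walk the checkpoint ranges of the same part
    # in page-id order and emit the gaps between them
    missing = []
    for s_first, s_last, partnum in stub_ranges:
        cursor = int(s_first)
        covered = sorted(
            (int(c_first), int(c_last))
            for c_first, c_last, c_part in checkpoint_ranges
            if int(c_part) == int(partnum))
        for c_first, c_last in covered:
            if c_first > cursor:
                missing.append((str(cursor), str(c_first - 1), partnum))
            cursor = max(cursor, c_last + 1)
        if cursor <= int(s_last):
            missing.append((str(cursor), str(s_last), partnum))
    return missing

# e.g. stubs [("1", "1000", "1")] with checkpoints [("1", "400", "1")]
# -> [("401", "1000", "1")]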