Example #1
 def get_lineno_last_page(self, fileobj, runner):
     if not fileobj.filename or not exists(runner.dump_dir.filename_public_path(fileobj)):
         return None
     dumpfile = DumpFile(self.wiki,
                         runner.dump_dir.filename_public_path(fileobj, self.wiki.date),
                         fileobj, self.verbose)
     pipeline = dumpfile.setup_uncompression_command()
     grep = self.wiki.config.grep
     if not exists(grep):
         raise BackupError("grep command %s not found" % grep)
     pipeline.append([grep, "-n", "<page>"])
     tail = self.wiki.config.tail
     if not exists(tail):
         raise BackupError("tail command %s not found" % tail)
     pipeline.append([tail, "-1"])
     # without shell
     proc = CommandPipeline(pipeline, quiet=True)
     proc.run_pipeline_get_output()
     if (proc.exited_successfully() or
             (proc.get_failed_cmds_with_retcode() ==
              [[-signal.SIGPIPE, pipeline[0]]]) or
             (proc.get_failed_cmds_with_retcode() ==
              [[signal.SIGPIPE + 128, pipeline[0]]])):
         output = proc.output()
         # 339915646:  <page>
         if ':' in output:
             linecount = output.split(':')[0]
             if linecount.isdigit():
                 return linecount
     return None
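The pipeline assembled above is essentially "decompress | grep -n '<page>' | tail -1"; the exit check also accepts a SIGPIPE failure of the first command, since tail closes the pipe as soon as it has the last matching line. Below is a minimal standalone sketch of the same idea with plain subprocess; the gzip decompression and the helper name are assumptions for illustration, not the DumpFile/CommandPipeline API.

    # minimal standalone sketch, not the CommandPipeline class used above;
    # assumes a gzip-compressed dump file (assumption for illustration only)
    import subprocess

    def lineno_of_last_page_tag(path):
        zcat = subprocess.Popen(["gzip", "-dc", path], stdout=subprocess.PIPE)
        grep = subprocess.Popen(["grep", "-n", "<page>"], stdin=zcat.stdout,
                                stdout=subprocess.PIPE)
        tail = subprocess.Popen(["tail", "-1"], stdin=grep.stdout,
                                stdout=subprocess.PIPE)
        zcat.stdout.close()  # let SIGPIPE reach gzip if grep exits early
        grep.stdout.close()  # let SIGPIPE reach grep once tail is done
        output = tail.communicate()[0].decode("utf-8", "replace")
        # output looks like "339915646:  <page>"; keep only the line number
        if ":" in output:
            lineno = output.split(":")[0]
            if lineno.isdigit():
                return lineno
        return None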
Example #2
 def has_no_entries(self, xmlfile, runner):
     '''
     see if it has a page id in it or not. no? then return True
     '''
     if xmlfile.is_temp_file:
         path = os.path.join(self.wiki.config.temp_dir, xmlfile.filename)
     else:
         path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
     fname = DumpFile(self.wiki, path, xmlfile, self.verbose)
     return fname.find_first_page_id_in_file() is None
Example #3
 def checksums(self, file_obj, dumpjobdata):
     """Run checksum for an output file, and append to the list."""
     if Checksummer.NAME in self._enabled:
         for htype in Checksummer.HASHTYPES:
             checksum_filename = self._get_checksum_filename_tmp(htype)
             output = open(checksum_filename, "a")
             dumpjobdata.debugfn("Checksumming %s via %s" % (file_obj.filename, htype))
             dumpfile = DumpFile(self.wiki, dumpjobdata.dump_dir.filename_public_path(file_obj),
                                 None, self.verbose)
             checksum = dumpfile.checksum(htype)
             if checksum is not None:
                 output.write("%s  %s\n" % (checksum, file_obj.filename))
             output.close()
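dumpfile.checksum(htype) presumably streams the output file through a hash; a minimal sketch of that kind of helper with hashlib follows. That the HASHTYPES entries are hashlib algorithm names (e.g. "md5", "sha1") and the chunk size are assumptions, not taken from the DumpFile API.

    # minimal sketch of a streaming checksum helper; htype being a hashlib
    # algorithm name ("md5", "sha1", ...) is an assumption for illustration
    import hashlib

    def checksum_of_file(path, htype, blocksize=64 * 1024):
        hasher = hashlib.new(htype)
        fhandle = open(path, "rb")
        try:
            block = fhandle.read(blocksize)
            while block:
                hasher.update(block)
                block = fhandle.read(blocksize)
        finally:
            fhandle.close()
        return hasher.hexdigest()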
Example #4
    def get_last_lines_from_n(self, fileobj, runner, count):
        if not fileobj.filename or not exists(runner.dump_dir.filename_public_path(fileobj)):
            return None

        dumpfile = DumpFile(self.wiki,
                            runner.dump_dir.filename_public_path(fileobj, self.wiki.date),
                            fileobj, self.verbose)
        pipeline = dumpfile.setup_uncompression_command()

        tail = self.wiki.config.tail
        if not exists(tail):
            raise BackupError("tail command %s not found" % tail)
        pipeline.append([tail, "-n", "+%s" % count])
        # without shell
        proc = CommandPipeline(pipeline, quiet=True)
        proc.run_pipeline_get_output()
        last_lines = None
        if (proc.exited_successfully() or
                (proc.get_failed_cmds_with_retcode() ==
                 [[-signal.SIGPIPE, pipeline[0]]]) or
                (proc.get_failed_cmds_with_retcode() ==
                 [[signal.SIGPIPE + 128, pipeline[0]]])):
            last_lines = proc.output()
        return last_lines
Example #5
    def check_for_truncated_files(self, runner):
        """Returns the number of files that have been detected to be truncated.
        This function expects that all files to check for truncation live in the public dir"""
        ret = 0

        if "check_trunc_files" not in runner.enabled or not self._check_truncation:
            return ret

        for dump_fname in self.list_outfiles_to_check_for_truncation(
                runner.dump_dir):
            dfile = DumpFile(runner.wiki, runner.dump_dir.filename_public_path(
                dump_fname), dump_fname)

            file_truncated = True
            if exists(dfile.filename):
                if dfile.check_if_empty():
                    # file exists and is empty, move it out of the way
                    dfile.rename(dfile.filename + ".empty")
                elif dfile.check_if_truncated():
                    # The file exists and is truncated, we move it out of the way
                    dfile.rename(dfile.filename + ".truncated")

                    # We detected a failure and could abort right now. However,
                    # there might still be further file parts that are good.
                    # Hence, we go on treating the remaining files, and in the end
                    # /all/ truncated files have been moved out of the way. That
                    # way we see which parts (instead of the whole job) need a rerun.
                else:
                    # The file exists and is not truncated. Heck, it's a good file!
                    file_truncated = False

            else:
                # file doesn't exist, move on
                file_truncated = False
            if file_truncated:
                ret += 1

        return ret
Example #6
    def get_relevant_prefetch_files(self, file_list, start_page_id, end_page_id, date, runner):
        possibles = []
        if len(file_list):
            # (a) nasty hack, see below (b)
            maxparts = 0
            for file_obj in file_list:
                if file_obj.is_file_part and file_obj.partnum_int > maxparts:
                    maxparts = file_obj.partnum_int
                if not file_obj.first_page_id:
                    fname = DumpFile(
                        self.wiki, runner.dump_dir.filename_public_path(file_obj, date),
                        file_obj, self.verbose)
                    file_obj.first_page_id = fname.find_first_page_id_in_file()

            # get the files that cover our range
            for file_obj in file_list:
                # If some of the file_objs in file_list could not be parsed properly, some of
                # the (int) conversions below will fail. However, it is of little use to us to
                # know which conversion failed. If /any/ conversion fails, it means that we do
                # not understand how to make sense of the current file_obj. Hence we cannot use
                # it as a prefetch object and we have to drop it, to avoid passing a useless
                # file to the text pass. (Per a comment below, reading through a file could
                # take days; by not passing a likely useless file, we only have to fetch more
                # texts from the database.)
                #
                # Therefore try...except-ing the whole block is sufficient: If whatever error
                # occurs, we do not abort, but skip the file for prefetch.
                try:
                    # If we could properly parse
                    first_page_id_in_file = int(file_obj.first_page_id)

                    # fixme what do we do here? this could be very expensive. is that worth it??
                    if not file_obj.last_page_id:
                        # (b) nasty hack, see (a)
                        # it's not a checkpoint file or we'd have the pageid in the filename
                        # so... temporary hack which will give expensive results
                        # if file part, and it's the last one, put none
                        # if it's not the last part, get the first pageid in the next
                        #  part and subtract 1
                        # if not file part, put none.
                        if file_obj.is_file_part and file_obj.partnum_int < maxparts:
                            for fname in file_list:
                                if fname.partnum_int == file_obj.partnum_int + 1:
                                    # not true!  this could be a few past where it really is
                                    # (because of deleted pages that aren't included at all)
                                    file_obj.last_page_id = str(int(fname.first_page_id) - 1)
                    if file_obj.last_page_id:
                        last_page_id_in_file = int(file_obj.last_page_id)
                    else:
                        last_page_id_in_file = None

                    # FIXME there is no point in including files that have just a
                    # few rev ids in them that we need, and having to read through
                    # the whole file... could take hours or days (later it won't matter,
                    # right? but until a rewrite, this is important)
                    # also be sure that if a critical page is deleted by the time we
                    # try to figure out ranges, that we don't get hosed
                    if ((first_page_id_in_file <= int(start_page_id) and
                         (last_page_id_in_file is None or
                          last_page_id_in_file >= int(start_page_id))) or
                            (first_page_id_in_file >= int(start_page_id) and
                             (end_page_id is None or
                              first_page_id_in_file <= int(end_page_id)))):
                        possibles.append(file_obj)
                except Exception:
                    runner.debug(
                        "Couldn't process %s for prefetch. Format update? Corrupt file?"
                        % file_obj.filename)
        return possibles
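The condition near the end is an interval overlap test: a file whose pages span [first, last] (last may be unknown) is kept if that span overlaps the requested range [start, end] (end may be None for open-ended). A standalone sketch of just that test, with made-up page numbers in the asserts:

    # standalone sketch of the range check above; the page numbers below are
    # made up for illustration
    def covers_requested_range(first, last, start, end):
        # the file reaches the requested start page, or it begins somewhere
        # inside the requested range
        reaches_start = first <= start and (last is None or last >= start)
        starts_inside = first >= start and (end is None or first <= end)
        return reaches_start or starts_inside

    assert covers_requested_range(1000, 4999, 4500, 9000)
    assert covers_requested_range(5000, None, 4500, 9000)
    assert not covers_requested_range(1000, 3999, 4500, 9000)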
Example #7
    def run(self, runner):
        # here we will either clean up or not depending on how we were called FIXME
        self.cleanup_old_files(runner.dump_dir, runner)
        commands = []

        todo = []

        if self.page_id_range is not None:
            # convert to checkpoint filename, handle the same way
            self.checkpoint_file = self.get_chkptfile_from_pageids()

        if self.checkpoint_file:
            todo = [self.checkpoint_file]
        else:
            # list all the output files that would be produced w/o
            # checkpoint files on
            outfiles = self.get_reg_files_for_filepart_possible(
                runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
            if self._checkpoints_enabled:

                # get the stub list that would be used for the current run
                stubs = self.get_stub_files(runner)
                stubs = sorted(stubs, key=lambda thing: thing.filename)

                # get the page ranges covered by stubs
                stub_ranges = []
                for stub in stubs:
                    fname = DumpFile(self.wiki,
                                     runner.dump_dir.filename_public_path(stub, stub.date),
                                     stub, self.verbose)
                    stub_ranges.append((fname.find_first_page_id_in_file(),
                                        self.find_last_page_id(stub, runner), stub.partnum))

                # get list of existing checkpoint files
                chkpt_files = self.list_checkpt_files(
                    runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
                chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)
                # get the page ranges covered by existing checkpoint files
                checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id,
                                      chkptfile.partnum)
                                     for chkptfile in chkpt_files]
                if self.verbose:
                    print "checkpoint_ranges is", checkpoint_ranges
                    print "stub_ranges is", stub_ranges

                if not checkpoint_ranges:
                    # no page ranges covered by checkpoints. do all output files
                    # the usual way
                    todo = outfiles
                else:
                    todo = []
                    parts = self.get_fileparts_list()
                    for partnum in parts:
                        if not [int(chkpt_range[2]) for chkpt_range in checkpoint_ranges
                                if int(chkpt_range[2]) == int(partnum)]:
                            # no page ranges covered by checkpoints for a particular
                            # file part (subjob) so do that output file the
                            # regular way
                            todo.extend([outfile for outfile in outfiles
                                         if int(outfile.partnum) == int(partnum)])

                    missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                    todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                                 for (first, last, partnum) in missing])

            else:
                # do the missing files only
                # FIXME public or private depending on the wiki!
                todo = [outfile for outfile in outfiles
                        if not os.path.exists(runner.dump_dir.filename_public_path(outfile))]

        partial_stubs = []
        if self.verbose:
            print "todo is", [to.filename for to in todo]
        for fileobj in todo:

            stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

            if fileobj.first_page_id is None:
                partial_stubs.append(stub_for_file)
            else:
                stub_output_file = DumpFilename(
                    self.wiki, fileobj.date, fileobj.dumpname,
                    self.item_for_stubs.get_filetype(),
                    self.item_for_stubs.get_file_ext(),
                    fileobj.partnum,
                    DumpFilename.make_checkpoint_string(
                        fileobj.first_page_id, fileobj.last_page_id), temp=True)

                self.write_partial_stub(stub_for_file, stub_output_file, runner)
                if not self.has_no_entries(stub_output_file, runner):
                    partial_stubs.append(stub_output_file)

        if self.verbose:
            print "partial_stubs is", [ps.filename for ps in partial_stubs]
        if partial_stubs:
            stub_files = partial_stubs
        else:
            return

        for stub_file in stub_files:
            series = self.build_command(runner, stub_file)
            commands.append(series)

        error = runner.run_command(commands, callback_stderr=self.progress_callback,
                                   callback_stderr_arg=runner)
        if error:
            raise BackupError("error producing xml file(s) %s" % self.dumpname)
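find_missing_ranges(stub_ranges, checkpoint_ranges) is not shown here; presumably it yields the (first, last, partnum) page ranges that the stubs cover but no existing checkpoint file does. A rough sketch of such a comparison, under the assumption that the tuples hold numeric page-id strings as built above (an illustration, not the actual helper):

    # rough sketch, not the real find_missing_ranges: compute per-part gaps
    # between stub coverage and existing checkpoint coverage
    def missing_ranges(stub_ranges, checkpoint_ranges):
        missing = []
        for (first, last, partnum) in stub_ranges:
            covered = sorted((int(cfirst), int(clast))
                             for (cfirst, clast, cpart) in checkpoint_ranges
                             if cpart == partnum)
            needed_from = int(first)
            for (cfirst, clast) in covered:
                if cfirst > needed_from:
                    missing.append((str(needed_from), str(cfirst - 1), partnum))
                needed_from = max(needed_from, clast + 1)
            if needed_from <= int(last):
                missing.append((str(needed_from), str(int(last)), partnum))
        return missing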