Example #1
    def move_if_truncated(self, runner, dfname, emptycheck=0, tmpdir=False):
        """
        check if the given file (DumpFile) is truncated or empty
        if so, move it out of the way and return True
        return False otherwise

        if emptycheck is set to a nonzero number, the file will only be
        checked to see if it is empty when it covers a page range with more
        pages than that number. E.g. a file named
        elwikivoyage-20180618-pages-meta-history2.xml-p140p150.bz2
        would be checked for emptycheck = 8 but not for 12; files that
        don't have page start and end numbers in the filename would not
        be checked at all.

        if emptycheck is left as 0, the file will always be checked to
        see if it is empty.

        if the file is located in the temp dir, set tmpdir=True for it to
        be found there; otherwise the public xml/sql dump output dir
        (or the private one, if the wiki is private) will be checked for the file.
        """
        if "check_trunc_files" not in runner.enabled or not self.check_truncation():
            return False

        if tmpdir:
            path = os.path.join(
                FileUtils.wiki_tempdir(runner.wiki.db_name, runner.wiki.config.temp_dir),
                dfname.filename)
        elif runner.wiki.is_private():
            path = runner.dump_dir.filename_private_path(dfname)
        else:
            path = runner.dump_dir.filename_public_path(dfname)
        dcontents = DumpContents(runner.wiki, path)

        file_truncated = True
        if os.path.exists(dcontents.filename):
            # for some file types we will check that the file has the right closing tag
            last_tag = None
            if ('.xml' in dcontents.filename and
                    ('.bz2' in dcontents.filename or '.gz' in dcontents.filename)):
                last_tag = b'</mediawiki>'

            # FIXME: the 200 page threshold is hardcoded here; making it configurable hardly seems worth it
            if (not emptycheck or self.is_larger(dfname, 200)) and dcontents.check_if_empty():
                # file exists and is empty, move it out of the way
                dcontents.rename(dcontents.filename + ".empty")
            elif dcontents.check_if_truncated(last_tag):
                # The file exists and is truncated, move it out of the way
                dcontents.rename(dcontents.filename + ".truncated")
            elif dcontents.check_if_binary_crap():
                # The file exists and has binary junk in it, move it out of the way
                dcontents.rename(dcontents.filename + ".truncated")
            else:
                # The file exists and is not truncated and doesn't have random crap.
                # Heck, it's a good file!
                file_truncated = False
        else:
            # file doesn't exist, move on
            file_truncated = False
        return file_truncated
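
The empty/truncation checks themselves are delegated to DumpContents above. As a rough standalone sketch of the same idea using only the standard library (the helper name, the 4 KB tail window, and the rename suffixes mirror the code above but are otherwise assumptions, not the DumpContents API):

import bz2
import gzip
import os

def quarantine_if_bad(path, last_tag=b'</mediawiki>'):
    # decompress the stream, keeping only its tail, then look for the closing tag
    opener = bz2.open if path.endswith('.bz2') else gzip.open
    tail = b''
    try:
        with opener(path, 'rb') as infile:
            for chunk in iter(lambda: infile.read(1024 * 1024), b''):
                tail = (tail + chunk)[-4096:]
    except (EOFError, OSError):
        os.rename(path, path + '.truncated')    # the compressed stream itself is cut off
        return True
    if not tail:
        os.rename(path, path + '.empty')        # empty file, move it out of the way
        return True
    if last_tag not in tail:
        os.rename(path, path + '.truncated')    # no closing tag, treat as truncated
        return True
    return False
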
Example #2
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun, verbose):
    '''
    do a stubs xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {}
    if history_file is not None:
        outfiles['history'] = {'name': history_file}
    if current_file is not None:
        outfiles['current'] = {'name': current_file}
    if articles_file is not None:
        outfiles['articles'] = {'name': articles_file}

    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command

    command.extend(["--wiki=%s" % wikidb,
                    "--full", "--stub", "--report=1000"])
    if history_file is not None:
        command.append("--output=file:%s" % outfiles['history']['temp'])
    if current_file is not None:
        command.extend(["--output=file:%s" % outfiles['current']['temp'],
                        "--filter=latest"])
    if articles_file is not None:
        command.extend(["--output=file:%s" % outfiles['articles']['temp'],
                        "--filter=latest", "--filter=notalk",
                        "--filter=namespace:!NS_USER"])

    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval
    else:
        callback = None

    # the xml header, the body, and the xml footer should be separate gzipped
    # streams, all concatenated together
    # note that do_xml_stream exits on failure after cleaning up all output files
    # so the parent process must simply retry later
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose, callback=callback, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose, callback=callback)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose, callback=callback, footer=True)
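
The three do_xml_stream() calls above work because a gzip file may consist of several members appended one after another, and readers decompress them as one continuous stream. A minimal stdlib demonstration of that property (the file name and content are made up):

import gzip

def append_gzip_member(path, text):
    # each call appends an independent gzip member to the same file
    with open(path, 'ab') as outfile:
        outfile.write(gzip.compress(text.encode('utf-8')))

append_gzip_member('stubs-demo.xml.gz', '<mediawiki>\n')       # header stream
append_gzip_member('stubs-demo.xml.gz', '<page>...</page>\n')  # body stream
append_gzip_member('stubs-demo.xml.gz', '</mediawiki>\n')      # footer stream

with gzip.open('stubs-demo.xml.gz', 'rt') as infile:
    print(infile.read())    # the members read back as one document
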
Example #3
def do_abstractsbackup(wikidb, output_files, variants,
                       wikiconf, start, end, dryrun, verbose):
    '''
    do an abstracts xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {}
    for index, variant in enumerate(variants):
        outfiles[variant] = {'name': output_files[index]}

    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf,
                                                     "dumpBackup.php")
    command = [wikiconf.php] + script_command
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version
    filter_path = os.path.join(abstract_cmd_dir, "extensions/ActiveAbstract/AbstractFilter.php")
    if not os.path.exists(filter_path):
        filter_path = os.path.join(abstract_cmd_dir,
                                   "extensions/ActiveAbstract/includes/AbstractFilter.php")
    abstract_filter = ("--plugin=AbstractFilter:" + filter_path)

    command.extend(["--wiki=%s" % wikidb, abstract_cmd_dir,
                    abstract_filter,
                    "--current", "--report=1000", "--namespaces=0"])

    for filetype in outfiles:
        command.extend(["--output=file:%s" % outfiles[filetype]['temp'],
                        "--filter=namespace:NS_MAIN",
                        "--filter=noredirect",
                        "--filter=abstract%s" % filetype])

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose, footer=True)
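
For a concrete picture of the command line this produces, here is a toy reconstruction of how the per-variant output and filter options stack up (the variant names, paths, and wiki name are invented):

# hypothetical variants and temp files, mirroring the loop above
outfiles = {
    '': {'temp': '/tmp/demo/abstract.xml_tmp'},
    '-nb': {'temp': '/tmp/demo/abstract-nb.xml_tmp'},
}
command = ['php', 'dumpBackup.php', '--wiki=somewiki',
           '--current', '--report=1000', '--namespaces=0']
for filetype in outfiles:
    command.extend(['--output=file:%s' % outfiles[filetype]['temp'],
                    '--filter=namespace:NS_MAIN',
                    '--filter=noredirect',
                    '--filter=abstract%s' % filetype])
print(' '.join(command))
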
Example #4
    def _get_checksum_filename_tmp(self, htype, fmt):
        """
        args:
            hashtype ('md5', 'sha1',...)
            format of output ('json', 'txt', ...)
        returns:
            full path of a unique-enough temporary output file for wiki and date
        """
        dfname = DumpFilename(self.wiki, None,
                              Checksummer.get_checksum_filename_basename(htype, fmt) +
                              "." + self.timestamp + ".tmp")
        return os.path.join(FileUtils.wiki_tempdir(
            self.wiki.db_name, self.wiki.config.temp_dir), dfname.filename)
Example #5
    def get_stub_gen_cmd_for_input(self, input_dfname, output_dfnames, runner):
        """
        build a command pipeline that generates the requested output stub files
        from the given input dump file (stub)
        """
        if not exists(self.wiki.config.writeuptopageid):
            raise BackupError("writeuptopageid command %s not found" %
                              self.wiki.config.writeuptopageid)

        if runner.wiki.is_private():
            inputfile_path = runner.dump_dir.filename_private_path(input_dfname)
        else:
            inputfile_path = runner.dump_dir.filename_public_path(input_dfname)

        output_dir = FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir)
        argstrings = []

        for output_dfname in output_dfnames:
            output_fname = output_dfname.filename
            # don't generate the file if we already have it (i.e. this is a retry)
            if not os.path.exists(os.path.join(output_dir, output_fname)):
                first_page_id = output_dfname.first_page_id
                if (output_dfname.last_page_id is not None and
                        output_dfname.last_page_id != "00000"):
                    last_page_id = str(int(output_dfname.last_page_id) + 1)
                else:
                    last_page_id = ""
                argstrings.append("{outfile}:{firstpage}:{lastpage}".format(
                    outfile=output_fname, firstpage=first_page_id, lastpage=last_page_id))

        # don't generate an output file if there are no filespecs
        if not argstrings:
            return None

        if input_dfname.file_ext == "gz":
            # command1 = "%s -dc %s" % (self.wiki.config.gzip, inputfile_path)
            command1 = [self.wiki.config.gzip, "-dc", inputfile_path]
        elif input_dfname.file_ext == '7z':
            # command1 = "%s e -si %s" % (self.wiki.config.sevenzip, inputfile_path)
            command1 = [self.wiki.config.sevenzip, "e", "-si", inputfile_path]
        elif input_dfname.file_ext == 'bz2':
            # command1 = "%s -dc %s" % (self.wiki.config.bzip2, inputfile_path)
            command1 = [self.wiki.config.bzip2, "-dc", inputfile_path]
        else:
            raise BackupError("unknown stub file extension %s" % input_dfname.file_ext)

        command2 = [self.wiki.config.writeuptopageid, "--odir", output_dir,
                    "--fspecs", ";".join(argstrings)]
        pipeline = [command1]
        pipeline.append(command2)
        return pipeline
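
The return value is a pipeline of two argument vectors meant to be chained stdin to stdout. A hedged sketch of how such a pipeline could be executed with the standard library (the real runner's command machinery is richer than this; the example invocation is invented):

import subprocess

def run_pipeline(command1, command2):
    # command1 decompresses to stdout, command2 reads it on stdin
    producer = subprocess.Popen(command1, stdout=subprocess.PIPE)
    consumer = subprocess.Popen(command2, stdin=producer.stdout)
    producer.stdout.close()   # let the producer get SIGPIPE if the consumer dies
    consumer.wait()
    producer.wait()
    return producer.returncode, consumer.returncode

# hypothetical invocation mirroring the gz branch above:
# run_pipeline(['gzip', '-dc', 'stubs-p1p1000.gz'],
#              ['writeuptopageid', '--odir', '/tmp/demo', '--fspecs', 'out.gz:1:501'])
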
Example #6
    def has_no_pages(self, xmlfile, runner, tempdir=False):
        '''
        check whether the file contains at least one page id;
        if not, return True
        '''
        if xmlfile.is_temp_file or tempdir:
            path = os.path.join(
                FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                xmlfile.filename)
        else:
            if runner.wiki.is_private():
                path = runner.dump_dir.filename_private_path(xmlfile, self.wiki.date)
            else:
                path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
        dcontents = DumpContents(self.wiki, path, xmlfile, self.verbose)
        return dcontents.find_first_page_id_in_file() is None
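
DumpContents.find_first_page_id_in_file() is not shown here. A standalone approximation of the check, assuming the usual stub XML layout where a <page> element is followed by an <id> element (the helper name and file handling are illustrative):

import gzip
import re

def find_first_page_id(path):
    # scan a (possibly gzipped) XML stub for the first <id> that follows a <page> tag
    opener = gzip.open if path.endswith('.gz') else open
    in_page = False
    with opener(path, 'rb') as infile:
        for line in infile:
            if b'<page>' in line:
                in_page = True
            if in_page:
                match = re.search(br'<id>(\d+)</id>', line)
                if match:
                    return int(match.group(1))
    return None
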
Example #7
    def cp_chksum_tmpfiles_to_permfile(self):
        """
        during a dump run, checksum files are written to a temporary
        location and updated there; we copy the content from these
        files into the permanent location after each dump job
        completes
        """
        if Checksummer.NAME in self._enabled:
            for htype in Checksummer.HASHTYPES:
                for fmt in Checksummer.FORMATS:
                    tmp_filename = self._get_checksum_filename_tmp(htype, fmt)
                    real_filename = self._get_checksum_path(htype, fmt)
                    content = FileUtils.read_file(tmp_filename)
                    FileUtils.write_file(
                        FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                        real_filename, content,
                        self.wiki.config.fileperms)
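
FileUtils.write_file() takes the wiki temp dir as its first argument, which suggests a write-to-scratch-then-rename scheme so readers never see a half-written checksum file. A hedged sketch of that pattern (an assumption about FileUtils, not its actual implementation):

import os
import tempfile

def write_file_atomically(tempdir, dest_path, content, fileperms=0o644):
    # write to a scratch file first, then move it into place
    fd, scratch = tempfile.mkstemp(dir=tempdir)
    try:
        with os.fdopen(fd, 'w') as outfile:
            outfile.write(content)
        os.chmod(scratch, fileperms)
        os.replace(scratch, dest_path)   # atomic when both paths are on one filesystem
    finally:
        if os.path.exists(scratch):
            os.remove(scratch)           # only reached if the move never happened
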
Example #8
    def build_command(self, runner, stub_dfname, prefetch, output_dfname):
        """
        Build the command line for the dump, minus output and filter options
        args:
            Runner, stub DumpFilename, ....
        """
        stub_path = os.path.join(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            stub_dfname.filename)
        if os.path.exists(stub_path):
            # if this is a pagerange stub file in temp dir, use that
            stub_option = "--stub=gzip:%s" % stub_path
        else:
            # use regular stub file
            if runner.wiki.is_private():
                stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_private_path(stub_dfname)
            else:
                stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_dfname)
        if self.jobinfo['spawn']:
            spawn = "--spawn=%s" % (self.wiki.config.php)
        else:
            spawn = ""

        if not exists(self.wiki.config.php):
            raise BackupError("php command %s not found" % self.wiki.config.php)

        script_command = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
        dump_command = [self.wiki.config.php]
        dump_command.extend(script_command)
        dump_command.extend(["--wiki=%s" % runner.db_name,
                             "%s" % stub_option,
                             "%s" % prefetch,
                             "--report=1000",
                             "%s" % spawn])

        dump_command = [entry for entry in dump_command if entry is not None]
        dump_command.extend([self.build_filters(runner, output_dfname), self.build_eta()])
        pipeline = [dump_command]
        # return a command series of one pipeline
        series = [pipeline]
        return series
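
A toy illustration of assembling an option list and dropping unset entries, in the spirit of the filtering step above (all paths and values are invented, and the handling of the optional arguments is simplified compared to the %s interpolation in the code):

stub_option = '--stub=gzip:/dumps/somewiki/20240601/somewiki-20240601-stub-articles.xml.gz'
prefetch = None          # no prefetch file available on this run
spawn = '--spawn=/usr/bin/php'

dump_command = ['/usr/bin/php', 'dumpTextPass.php', '--wiki=somewiki',
                stub_option, prefetch, '--report=1000', spawn]
dump_command = [entry for entry in dump_command if entry is not None]
print(dump_command)
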
Example #9
    def save_feed(self, dfname):
        """
        produce an rss feed file for the specified dump output file
        (dfname)

        If there is already such a feed, update it only if
        the date of the dump output file in the feed is not newer
        than the date of dfname, as indicated in the dump dirs/filenames
        themselves, NOT via stat

        args:
            DumpFilename
        """
        if Feeds.NAME in self._enabled:
            rss_path = os.path.join(self.dump_dir.latest_dir(),
                                    self.db_name + "-latest-" + dfname.basename +
                                    "-rss.xml")

            self.make_dir(self.dump_dir.latest_dir())
            filename_and_path = self.dump_dir.web_path(dfname)
            web_path = os.path.dirname(filename_and_path)
            if self.feed_newer_than_file(rss_path, dfname):
                return
            rss_text = self.wiki.config.read_template("feed.xml") % {
                "chantitle": dfname.basename,
                "chanlink": web_path,
                "chandesc": "Wikimedia dump updates for %s" % self.db_name,
                "title": web_path,
                "link": web_path,
                "description": xml_escape("<a href=\"%s\">%s</a>" % (
                    filename_and_path, dfname.filename)),
                "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
            }
            self.debugfn("adding rss feed file %s " % rss_path)
            FileUtils.write_file(
                FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                rss_path,
                rss_text, self.wiki.config.fileperms)
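
A minimal reconstruction of the values substituted into the feed.xml template, using the RFC 822 date format expected by RSS readers (the URL and filenames are invented; xml_escape is assumed here to be the standard library escape):

import time
from xml.sax.saxutils import escape as xml_escape

filename_and_path = 'https://dumps.example.org/somewiki/latest/somewiki-latest-pages-articles.xml.bz2'
web_path = filename_and_path.rsplit('/', 1)[0]
feed_values = {
    'chantitle': 'pages-articles.xml.bz2',
    'chanlink': web_path,
    'chandesc': 'Wikimedia dump updates for somewiki',
    'title': web_path,
    'link': web_path,
    'description': xml_escape('<a href="%s">%s</a>' % (
        filename_and_path, 'somewiki-latest-pages-articles.xml.bz2')),
    'date': time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime()),
}
print(feed_values['description'])
print(feed_values['date'])
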
Example #10
    def write_notice(self):
        '''
        write the notice file if self.notice has contents,
        remove it if self.notice is False,
        or read the existing file and stash its contents if self.notice is the empty string
        '''
        if Notice.NAME in self._enabled:
            notice_filepath = self._get_notice_filename()
            # delnotice: toss any existing file
            if self.notice is False:
                if os.path.exists(notice_filepath):
                    os.remove(notice_filepath)
                self.notice = ""
            # addnotice: stuff the notice in a file for other jobs etc.
            elif self.notice != "":
                FileUtils.write_file(
                    FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                    notice_filepath, self.notice,
                    self.wiki.config.fileperms)
            # default case: if there is a file, get the contents; otherwise
            # we have empty contents, all good
            else:
                if os.path.exists(notice_filepath):
                    self.notice = FileUtils.read_file(notice_filepath)
Example #11
    def write_pagerange_stubs(self, iofile_pairs, runner, batchsize, move_if_truncated):
        """
        put the io file pairs in ascending order (per part if there
        are parts); for each pair, write out a stub file corresponding
        to the page range in the output filename, combining
        those outputs that require the same input file into
        one command

        args: pairs of (DumpFilename, DumpFilename), Runner, batch size,
              callback that moves truncated or empty output files out of the way
        """
        if not iofile_pairs:
            return

        # split up into batches where the input file is the same
        # and the pairs are ordered by output file name
        in_dfnames = list({pair[0] for pair in iofile_pairs})
        out_dfnames = {}
        output_dfnames_to_check = []
        for in_dfname in in_dfnames:
            out_dfnames[in_dfname.filename] = sorted([pair[1] for pair in iofile_pairs
                                                      if pair[0].filename == in_dfname.filename],
                                                     key=functools.cmp_to_key(DumpFilename.compare))
        commands = []
        for in_dfname in in_dfnames:
            pipeline = self.get_stub_gen_cmd_for_input(
                in_dfname, out_dfnames[in_dfname.filename], runner)
            if pipeline is not None:
                # list of command series. each series is a list of pipelines.
                commands.append([pipeline])
                output_dfnames_to_check.extend(out_dfnames[in_dfname.filename])

        errors = False
        while commands:
            command_batch = commands[:batchsize]
            error, broken = runner.run_command(command_batch)
            if error:
                for series in broken:
                    for pipeline in series:
                        failed_cmds_retcodes = pipeline.get_failed_cmds_with_retcode()
                        for cmd_retcode in failed_cmds_retcodes:
                            if (cmd_retcode[1] == -signal.SIGPIPE or
                                    cmd_retcode[1] == signal.SIGPIPE + 128):
                                pass
                            else:
                                runner.log_and_print("error from commands: %s" % " ".join(
                                    [entry for entry in pipeline]))
                                errors = True
            commands = commands[batchsize:]
        if errors:
            raise BackupError("failed to write pagerange stub files")

        if runner.dryrun:
            return

        # check the output files to see if we like them;
        # if not, we will move the bad ones out of the way and
        # whine about them
        bad_dfnames = []
        output_dir = FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir)
        for temp_stub_dfname in output_dfnames_to_check:
            if os.path.exists(os.path.join(output_dir, temp_stub_dfname.filename)):
                bad = move_if_truncated(runner, temp_stub_dfname, emptycheck=200, tmpdir=True)
                if bad:
                    bad_dfnames.append(temp_stub_dfname)
        if bad_dfnames:
            error_string = " ".join([bad_dfname.filename for bad_dfname in bad_dfnames])
            raise BackupError(
                "failed to write pagerange stub files (bad contents) " + error_string)
Example #12
    def run(self):
        """
        mark which dump jobs should run
        clean up old dump run files
        set up directories for the run
        run each dump job
        """
        if self.job_requested:
            if not self.dump_item_list.old_runinfo_retrieved and self.wiki.exists_perdump_index():

                # There was a previous run of all or part of this date, but...
                # There was no old RunInfo to be had (or an error was encountered getting it)
                # so we can't rerun a step and keep all the status information
                # about the old run around.
                # In this case, ask the user if they really want to go ahead
                print("No information about the previous run for this date could be retrieved.")
                print("This means that the status information about the old run will be lost, and")
                print("only the information about the current (and future) runs will be kept.")
                reply = input("Continue anyway? [y/N]: ")
                if reply not in ["y", "Y"]:
                    raise RuntimeError("No run information available for previous dump, exiting")

            if not self.dump_item_list.mark_dumps_to_run(self.job_requested, self.skipdone):
                # probably no such job
                sys.stderr.write("No job marked to run, exiting\n")
                return None
            if self.restart:
                # mark all the following jobs to run as well
                self.dump_item_list.mark_following_jobs_to_run(self.skipdone)
        else:
            self.dump_item_list.mark_all_jobs_to_run(self.skipdone)

        Maintenance.exit_if_in_maintenance_mode(
            "In maintenance mode, exiting dump of %s" % self.db_name)

        self.make_dir(os.path.join(self.wiki.public_dir(), self.wiki.date))
        self.make_dir(os.path.join(self.wiki.private_dir(), self.wiki.date))
        FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir, create=True)

        self.show_runner_state("Cleaning up old dumps for %s" % self.db_name)
        self.clean_old_dumps()
        self.clean_old_dumps(private=True)

        # Inform what kind of backup work we are about to do
        if self.job_requested:
            if self.restart:
                self.log_and_print("Preparing for restart from job %s of %s"
                                   % (self.job_requested, self.db_name))
            else:
                self.log_and_print("Preparing for job %s of %s" %
                                   (self.job_requested, self.db_name))
        else:
            self.show_runner_state("Starting backup of %s" % self.db_name)

        self.dumpjobdata.do_before_dump()

        for item in self.dump_item_list.dump_items:
            prereq_job = self.do_run_item(item)
            if self.do_prereqs and prereq_job is not None:
                doing = []
                doing.append(item)
                # we have the lock so we might as well run the prereq job now.
                # there may be a string of prereqs not met,
                # i.e. articlesrecombine -> articles -> stubs
                # so we're willing to walk back up the list up to five items,
                # assume there's something really broken if it takes more than that
                while prereq_job is not None and len(doing) < 5:
                    new_item = self.dump_item_list.find_item_by_name(prereq_job)
                    new_item.set_to_run(True)
                    prereq_job = self.do_run_item(new_item)
                    if prereq_job is not None:
                        # this job has a dependency too, add to the todo stack
                        doing.insert(0, new_item)
                # back up the stack and do the dependents if stack isn't too long.
                if len(doing) < 5:
                    for subitem in doing:
                        self.do_run_item(subitem)

        # special case
        if self.job_requested == "createdirs":
            if not os.path.exists(os.path.join(self.wiki.public_dir(), self.wiki.date)):
                os.makedirs(os.path.join(self.wiki.public_dir(), self.wiki.date))
            if not os.path.exists(os.path.join(self.wiki.private_dir(), self.wiki.date)):
                os.makedirs(os.path.join(self.wiki.private_dir(), self.wiki.date))

        # we must do this here before the checksums are used for status reports below
        self.dumpjobdata.checksummer.move_chksumfiles_into_place()

        if self.dump_item_list.all_possible_jobs_done():
            # All jobs are either in status "done", "waiting", "failed", "skipped"
            self.report.update_index_html_and_json("done")
            self.statushtml.update_status_file("done")
            self.runstatus_updater.write_statusapi_file()
            self.specialfiles_updater.write_specialfilesinfo_file()
        else:
            # This may happen if we start a dump now and abort before all items are
            # done. Then some are left for example in state "waiting". When
            # afterwards running a specific job, all (but one) of the jobs
            # previously in "waiting" are still in status "waiting"
            self.report.update_index_html_and_json("partialdone")
            self.statushtml.update_status_file("partialdone")
            self.runstatus_updater.write_statusapi_file()
            self.specialfiles_updater.write_specialfilesinfo_file()

        self.dumpjobdata.do_after_dump(self.dump_item_list.dump_items)

        # special case
        if (self.job_requested and self.job_requested == "latestlinks" and
                self.dump_item_list.all_possible_jobs_done()):
            self.dumpjobdata.do_latest_job()

        # Inform about completion
        if self.job_requested:
            if self.restart:
                self.show_runner_state("Completed run restarting from job %s for %s"
                                       % (self.job_requested, self.db_name))
            else:
                self.show_runner_state("Completed job %s for %s"
                                       % (self.job_requested, self.db_name))
        else:
            self.show_runner_state_complete()

        # let caller know if this was a successful run
        if any(item.status() == "failed" for item in self.dump_item_list.dump_items):
            return False
        return True
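
The prerequisite walk in the middle of run() is the least obvious part. Here is a toy model of the same control flow, where running a job either succeeds or reports one unmet prerequisite (the job names and the prereq table are invented; in this toy, reporting a prerequisite once is enough to unblock the job on its retry):

# each job either runs (returns None) or reports one unmet prerequisite
unmet = {'articlesrecombine': 'articles', 'articles': 'stubs'}

def run_job(name):
    prereq = unmet.pop(name, None)
    if prereq:
        print(name, 'blocked on', prereq)
    else:
        print('ran', name)
    return prereq

doing = ['articlesrecombine']
prereq_job = run_job('articlesrecombine')          # blocked on 'articles'
while prereq_job is not None and len(doing) < 5:
    new_prereq = run_job(prereq_job)
    if new_prereq is not None:
        doing.insert(0, prereq_job)                # blocked too, retry it later
    prereq_job = new_prereq
if len(doing) < 5:
    for name in doing:
        run_job(name)                              # replay the stack in dependency order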