Example #1
    def build_command(self, runner, output_dfname):
        '''
        arguments:
        runner: Runner object
        output_dfname: output file that will be produced
        '''

        input_dfname = DumpFilename(self.wiki, None, output_dfname.dumpname,
                                    output_dfname.file_type,
                                    self.item_for_recompression.file_ext,
                                    output_dfname.partnum, output_dfname.checkpoint)
        if runner.wiki.is_private():
            outfilepath = runner.dump_dir.filename_private_path(
                self.get_multistream_dfname(output_dfname))
            outfilepath_index = runner.dump_dir.filename_private_path(
                self.get_multistream_index_dfname(output_dfname))
            infilepath = runner.dump_dir.filename_private_path(input_dfname)
        else:
            outfilepath = runner.dump_dir.filename_public_path(
                self.get_multistream_dfname(output_dfname))
            outfilepath_index = runner.dump_dir.filename_public_path(
                self.get_multistream_index_dfname(output_dfname))
            infilepath = runner.dump_dir.filename_public_path(input_dfname)
        command_pipe = [["%s -dc %s | %s --pagesperstream 100 --buildindex %s -o %s" %
                         (self.wiki.config.bzip2, infilepath, self.wiki.config.recompressxml,
                          DumpFilename.get_inprogress_name(outfilepath_index),
                          DumpFilename.get_inprogress_name(outfilepath))]]
        return [command_pipe]
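
The method above returns a deeply nested value: a list holding one command pipe, where the pipe is a list of pipelines and each pipeline is a list containing a single shell string. A minimal sketch of a helper that walks that structure for inspection; the helper is illustrative only and not part of the dumps codebase:

def iter_shell_commands(commands):
    """Yield every shell string from the nested structure returned above:
    list of command pipes -> list of pipelines -> list of command strings."""
    for command_pipe in commands:
        for pipeline in command_pipe:
            for command in pipeline:
                yield command

# e.g. for command in iter_shell_commands(self.build_command(runner, output_dfname)):
#          print(command)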
Example #2
    def run(self, runner):
        retries = 0
        maxretries = runner.wiki.config.max_retries
        dfnames = self.list_outfiles_for_build_command(runner.dump_dir)
        if len(dfnames) > 1:
            raise BackupError("siteinfo dump %s trying to produce more than one file" %
                              self.dumpname)
        output_dfname = dfnames[0]
        commands = self.build_command(runner)
        if runner.wiki.is_private():
            command_series = runner.get_save_command_series(
                commands, DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_private_path(output_dfname)))
        else:
            command_series = runner.get_save_command_series(
                commands, DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_public_path(output_dfname)))
        self.setup_command_info(runner, command_series, [output_dfname])

        error, _broken = runner.save_command(command_series, self.command_completion_callback)
        while error and retries < maxretries:
            retries = retries + 1
            time.sleep(5)
            error, _broken = runner.save_command(command_series)
        if error:
            raise BackupError("error dumping siteinfo props %s" % ','.join(self._properties))
Example #3
    def build_command(self, runner, output_dfname):
        commands = runner.db_server_info.build_sqldump_command(self._table, runner.wiki.config.gzip)
        if self.private or runner.wiki.is_private():
            command_series = runner.get_save_command_series(
                commands, DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_private_path(output_dfname)))
        else:
            command_series = runner.get_save_command_series(
                commands, DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_public_path(output_dfname)))
        return command_series
Example #4
    def build_command(self, runner, query, out_dfname):
        if not exists(runner.wiki.config.gzip):
            raise BackupError("gzip command %s not found" % runner.wiki.config.gzip)
        series = runner.db_server_info.build_sql_command(query, runner.wiki.config.gzip)
        if runner.wiki.is_private():
            return runner.get_save_command_series(
                series, DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_private_path(out_dfname)))
        return runner.get_save_command_series(
            series, DumpFilename.get_inprogress_name(
                runner.dump_dir.filename_public_path(out_dfname)))
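
Examples 3 and 4 repeat the same branch that appears throughout these jobs: private wikis resolve the output path with filename_private_path, public wikis with filename_public_path, and the result is wrapped in DumpFilename.get_inprogress_name. A hypothetical helper that factors out that branch; it is a sketch, not something the codebase provides:

def inprogress_output_path(runner, dfname, force_private=False):
    """Resolve dfname to its on-disk path and return the in-progress name."""
    if force_private or runner.wiki.is_private():
        path = runner.dump_dir.filename_private_path(dfname)
    else:
        path = runner.dump_dir.filename_public_path(dfname)
    return DumpFilename.get_inprogress_name(path)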
Example #5
    def build_command(self, runner, output_dfname):
        if not os.path.exists(runner.wiki.config.php):
            raise BackupError("php command %s not found" % runner.wiki.config.php)

        if runner.wiki.is_private():
            logging_path = runner.dump_dir.filename_private_path(output_dfname)
        else:
            logging_path = runner.dump_dir.filename_public_path(output_dfname)

        config_file_arg = runner.wiki.config.files[0]
        if runner.wiki.config.override_section:
            config_file_arg = config_file_arg + ":" + runner.wiki.config.override_section
        command = ["/usr/bin/python3", "xmllogs.py", "--config",
                   config_file_arg, "--wiki", runner.db_name,
                   "--outfile", DumpFilename.get_inprogress_name(logging_path)]

        if output_dfname.partnum:
            # set up start and end log item ids for this piece
            # note there is no log item id 0, so we start with 1
            start = sum([int(self._parts[i]) for i in range(0, output_dfname.partnum_int - 1)]) + 1
            startopt = "--start=%s" % start
            # if we are on the last file part, we should get up to the last log item id,
            # whatever that is.
            command.append(startopt)
            if output_dfname.partnum_int < len(self._parts):
                end = sum([int(self._parts[i]) for i in range(0, output_dfname.partnum_int)]) + 1
                endopt = "--end=%s" % end
                command.append(endopt)

        pipeline = [command]
        series = [pipeline]
        return series
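
The start/end arithmetic in the partnum branch derives an id range for part N from the per-part counts in self._parts: start is one past the total of all earlier parts, and end is set only when N is not the last part, so the final part runs to the last id. A small sketch of that computation with made-up part sizes (the function is illustrative, not part of the codebase):

def part_range(parts, partnum):
    """Return (start, end) ids for the 1-based part number partnum;
    end is None for the last part, which runs to the final id."""
    start = sum(int(parts[i]) for i in range(partnum - 1)) + 1
    end = None
    if partnum < len(parts):
        end = sum(int(parts[i]) for i in range(partnum)) + 1
    return start, end

# e.g. with parts = [100, 250, 400]:
#   part_range(parts, 1) -> (1, 101)
#   part_range(parts, 2) -> (101, 351)
#   part_range(parts, 3) -> (351, None)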
Example #6
    def build_filters(self, runner, input_dfname):
        """
        Construct the output filter options for dumpTextPass.php
        args:
            Runner, DumpFilename
        """
        # do we need checkpoints? ummm
        if runner.wiki.is_private():
            xmlbz2_path = runner.dump_dir.filename_private_path(input_dfname)
        else:
            xmlbz2_path = runner.dump_dir.filename_public_path(input_dfname)

        if 'history' in self.jobinfo['subset'] and runner.wiki.config.lbzip2forhistory:
            # we will use lbzip2 for compression of pages-meta-history for this wiki
            # if configured
            bz2mode = "lbzip2"
            if not exists(self.wiki.config.lbzip2):
                raise BackupError("lbzip2 command %s not found" % self.wiki.config.lbzip2)
        elif self.wiki.config.bzip2[-6:] == "dbzip2":
            bz2mode = "dbzip2"
        else:
            bz2mode = "bzip2"
            if not exists(self.wiki.config.bzip2):
                raise BackupError("bzip2 command %s not found" % self.wiki.config.bzip2)
        return "--output=%s:%s" % (bz2mode, DumpFilename.get_inprogress_name(xmlbz2_path))
Example #7
    def build_command(self, runner, novariant_dfname, output_dfnames):
        """
        args:
            Runner, DumpFilename for output without any language variant
        """
        config_file_arg = runner.wiki.config.files[0]
        if runner.wiki.config.override_section:
            config_file_arg = config_file_arg + ":" + runner.wiki.config.override_section
        command = ["/usr/bin/python3", "xmlabstracts.py", "--config",
                   config_file_arg, "--wiki", self.db_name]

        output_paths = []
        variants = []
        for dfname in output_dfnames:
            variant = self.get_variant_from_dumpname(dfname.dumpname)
            variant_option = self._variant_option(variant)
            if runner.wiki.is_private():
                output_paths.append(DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_private_path(dfname)))
            else:
                output_paths.append(DumpFilename.get_inprogress_name(
                    runner.dump_dir.filename_public_path(dfname)))
            variants.append(variant_option)

        command.extend(["--outfiles=%s" % ",".join(output_paths),
                        "--variants=%s" % ",".join(variants)])

        if novariant_dfname.partnum:
            # set up start and end page ids for this piece
            # note there is no page id 0, so we start with 1
            start = sum([int(self._parts[i])
                         for i in range(0, novariant_dfname.partnum_int - 1)]) + 1
            startopt = "--start=%s" % start
            # if we are on the last file part, we should get up to the last pageid,
            # whatever that is.
            command.append(startopt)
            if novariant_dfname.partnum_int < len(self._parts):
                end = sum([int(self._parts[i]) for i in range(0, novariant_dfname.partnum_int)]) + 1
                endopt = "--end=%s" % end
                command.append(endopt)
        pipeline = [command]
        series = [pipeline]
        return series
Example #8
    def build_command(self, runner, output_dfname, history_dfname, current_dfname):
        if not os.path.exists(runner.wiki.config.php):
            raise BackupError("php command %s not found" % runner.wiki.config.php)

        config_file_arg = runner.wiki.config.files[0]
        if runner.wiki.config.override_section:
            config_file_arg = config_file_arg + ":" + runner.wiki.config.override_section
        command = ["/usr/bin/python3", "xmlstubs.py", "--config", config_file_arg,
                   "--wiki", runner.db_name]
        output_dir = self.get_output_dir(runner)
        if output_dfname is not None:
            command.extend(["--articles", DumpFilename.get_inprogress_name(
                os.path.join(output_dir, output_dfname.filename))])
        if history_dfname is not None:
            command.extend(["--history", DumpFilename.get_inprogress_name(
                os.path.join(output_dir, history_dfname.filename))])
        if current_dfname is not None:
            command.extend(["--current", DumpFilename.get_inprogress_name(
                os.path.join(output_dir, current_dfname.filename))])

        partnum = None
        if output_dfname is not None:
            partnum = output_dfname.partnum
        elif history_dfname is not None:
            partnum = history_dfname.partnum
        elif current_dfname is not None:
            partnum = current_dfname.partnum
        if partnum is not None:
            # set up start and end page ids for this piece
            # note there is no page id 0, so we start with 1
            start = sum([int(self._parts[i]) for i in range(0, int(partnum) - 1)]) + 1
            startopt = "--start=%s" % start
            # if we are on the last file part, we should get up to the last pageid,
            # whatever that is.
            command.append(startopt)
            if int(partnum) < len(self._parts):
                end = sum([int(self._parts[i]) for i in range(0, int(partnum))]) + 1
                endopt = "--end=%s" % end
                command.append(endopt)

        pipeline = [command]
        series = [pipeline]
        return series
Example #9
    def cleanup_inprog_files(self, dump_dir, runner):
        if self.checkpoint_file is not None:
            # we only rerun this one, so just remove this one
            pub_path = DumpFilename.get_inprogress_name(
                dump_dir.filename_public_path(self.checkpoint_file))
            priv_path = DumpFilename.get_inprogress_name(
                dump_dir.filename_private_path(self.checkpoint_file))
            if os.path.exists(pub_path):
                if runner.dryrun:
                    print("would remove", pub_path)
                else:
                    os.remove(pub_path)
            elif os.path.exists(priv_path):
                if runner.dryrun:
                    print("would remove", priv_path)
                else:
                    os.remove(priv_path)

        dfnames = self.list_inprog_files_for_cleanup(dump_dir)
        if runner.dryrun:
            print("would remove ", [dfname.filename for dfname in dfnames])
        else:
            for dfname in dfnames:
                self.remove_output_file(dump_dir, dfname)
Example #10
    def build_command(self, runner, output_dfnames):
        '''
        arguments:
        runner: Runner object
        output_dfnames: if checkpointing of files is enabled, this should be a
                        list of checkpoint files (DumpFilename), otherwise it
                        should be a list of the one file that will be produced
                        by the dump
        Note that checkpoint files get done one at a time, not in parallel
        '''
        # FIXME need shell escape
        if self.wiki.config.lbzip2threads:
            if not exists(self.wiki.config.lbzip2):
                raise BackupError("lbzip2 command %s not found" % self.wiki.config.lbzip2)
        elif not exists(self.wiki.config.bzip2):
            raise BackupError("bzip2 command %s not found" % self.wiki.config.bzip2)
        if not exists(self.wiki.config.sevenzip):
            raise BackupError("7zip command %s not found" % self.wiki.config.sevenzip)

        command_series = []
        for out_dfname in output_dfnames:
            input_dfname = DumpFilename(self.wiki, None, out_dfname.dumpname, out_dfname.file_type,
                                        self.item_for_recompression.file_ext, out_dfname.partnum,
                                        out_dfname.checkpoint)
            if runner.wiki.is_private():
                outfilepath = runner.dump_dir.filename_private_path(out_dfname)
                infilepath = runner.dump_dir.filename_private_path(input_dfname)
            else:
                outfilepath = runner.dump_dir.filename_public_path(out_dfname)
                infilepath = runner.dump_dir.filename_public_path(input_dfname)

            if self.wiki.config.lbzip2threads:
                # one thread only, as these already run in parallel
                decompr_command = "{lbzip2} -dc -n 1 {infile}".format(
                    lbzip2=self.wiki.config.lbzip2, infile=infilepath)
            else:
                decompr_command = "{bzip2} -dc {infile}".format(bzip2=self.wiki.config.bzip2,
                                                                infile=infilepath)
            command_pipe = [["{decompr} | {sevenzip} a -mx=4 -si {ofile}".format(
                decompr=decompr_command, sevenzip=self.wiki.config.sevenzip,
                ofile=DumpFilename.get_inprogress_name(outfilepath))]]
            command_series.append(command_pipe)
        return command_series
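
For a concrete sense of what each command_pipe holds here, this is the shell string the format above assembles, using made-up tool paths and file names; the ".inprog" suffix is an assumption about what DumpFilename.get_inprogress_name appends:

# Hypothetical values, for illustration only.
decompr_command = "/usr/bin/bzip2 -dc /dumps/example-pages-meta-history1.xml.bz2"
command_pipe = [["{decompr} | {sevenzip} a -mx=4 -si {ofile}".format(
    decompr=decompr_command, sevenzip="/usr/bin/7za",
    ofile="/dumps/example-pages-meta-history1.xml.7z.inprog")]]
# -> [['/usr/bin/bzip2 -dc ... | /usr/bin/7za a -mx=4 -si ....7z.inprog']]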
Example #11
    def build_command(self, runner, output_dfname):
        if not os.path.exists(runner.wiki.config.php):
            raise BackupError("php command %s not found" % runner.wiki.config.php)

        if runner.wiki.is_private():
            flow_output_fpath = runner.dump_dir.filename_private_path(output_dfname)
        else:
            flow_output_fpath = runner.dump_dir.filename_public_path(output_dfname)
        script_command = MultiVersion.mw_script_as_array(
            runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")

        command = [runner.wiki.config.php]
        command.extend(script_command)
        command.extend(["--wiki=%s" % runner.db_name,
                        "--current", "--report=1000",
                        "--output=bzip2:%s" % DumpFilename.get_inprogress_name(flow_output_fpath)])
        if self.history:
            command.append("--full")
        pipeline = [command]
        series = [pipeline]
        return series