Python MiscUtils示例，dumps.utils.MiscUtils Python示例

示例#1

0

显示文件

文件： wikidump.py 项目： wikimedia/operations-dumps

    def parse_conffile_overrideables(self):
        """
        globals like entries in 'wiki' or 'output' that can
        be overriden by a specific named section
        """
        self.db_list_unsorted = MiscUtils.db_list(self.get_opt_in_overrides_or_default(
            "wiki", "dblist", 0), nosort=True)
        # permit comma-separated list of files so that eg some script
        # can skip all private and/or closed wikis in addition to some
        # other exclusion list
        to_skip = self.get_opt_in_overrides_or_default("wiki", "skipdblist", 0)
        self.skip_db_list = self.get_skipdbs(to_skip)

        try:
            self.private_list = MiscUtils.db_list(self.get_opt_in_overrides_or_default(
                "wiki", "privatelist", 0))
        except FileNotFoundError:
            self.private_list = []
        try:
            self.closed_list = MiscUtils.db_list(self.get_opt_in_overrides_or_default(
                "wiki", "closedlist", 0))
        except FileNotFoundError:
            self.closed_list = []
        try:
            self.flow_list = MiscUtils.db_list(self.get_opt_in_overrides_or_default(
                "wiki", "flowlist", 0))
        except FileNotFoundError:
            self.flow_list = []
        self.tablejobs = self.get_opt_in_overrides_or_default(
            "wiki", "tablejobs", 0)
        self.apijobs = self.get_opt_in_overrides_or_default(
            "wiki", "apijobs", 0)

        self.db_list_unsorted = [dbname for dbname in self.db_list_unsorted
                                 if dbname not in self.skip_db_list]
        self.db_list = sorted(self.db_list_unsorted)

        if not self.conf.has_section('output'):
            self.conf.add_section('output')
        self.public_dir = self.get_opt_in_overrides_or_default("output", "public", 0)
        self.private_dir = self.get_opt_in_overrides_or_default("output", "private", 0)
        self.temp_dir = self.get_opt_in_overrides_or_default("output", "temp", 0)
        self.web_root = self.get_opt_in_overrides_or_default("output", "webroot", 0)
        self.index = self.get_opt_in_overrides_or_default("output", "index", 0)
        self.template_dir = self.get_opt_in_overrides_or_default("output", "templatedir", 0)
        self.perdump_index = self.get_opt_in_overrides_or_default("output", "perdumpindex", 0)
        self.log_file = self.get_opt_in_overrides_or_default("output", "logfile", 0)
        self.fileperms = self.get_opt_in_overrides_or_default("output", "fileperms", 0)
        self.fileperms = int(self.fileperms, 0)

        if not self.conf.has_section('misc'):
            self.conf.add_section('misc')
        self.fixed_dump_order = self.get_opt_in_overrides_or_default("misc", "fixeddumporder", 0)
        self.fixed_dump_order = int(self.fixed_dump_order, 0)

示例#2

0

显示文件

文件： wikidump.py 项目： wikimedia/operations-dumps

    def get_db_user_and_password(self):
        # get these by running a MediaWiki maintenance script;
        # yes, this means you need a full installation of MediaWiki
        # (but not web service) in order to use these methods

        command_list = MultiVersion.mw_script_as_array(self.config, "getConfiguration.php")
        pull_vars = ["wgDBuser", "wgDBpassword"]
        command = "{php} {command} --wiki={dbname} --format=json --regex='{vars}'"
        command = command.format(
            php=MiscUtils.shell_escape(self.config.php),
            command=" ".join(command_list),
            dbname=MiscUtils.shell_escape(self.db_name),
            vars="|".join(pull_vars))
        results = RunSimpleCommand.run_with_output(command, shell=True).strip()
        settings = json.loads(results.decode('utf-8'))
        db_user = settings['wgDBuser']
        db_password = settings['wgDBpassword']

        return db_user, db_password

示例#3

0

显示文件

文件： wikidump.py 项目： wikimedia/operations-dumps

 def get_skipdbs(self, filenames):
     """
     permit comma-separated list of files so that eg some script
     can skip all private and/or closed wikis in addition to some
     other exclusion list
     """
     if ',' in filenames:
         skipfiles = filenames.split(',')
     else:
         skipfiles = [filenames]
     skip_db_list = []
     for skipfile in skipfiles:
         skip_db_list.extend(MiscUtils.db_list(skipfile))
     return list(set(skip_db_list))

示例#4

0

显示文件

    def get_last_lines_from_n(self, fileobj, runner, count):
        if not fileobj.filename or not exists(runner.dump_dir.filename_public_path(fileobj)):
            return None

        dumpfile = DumpFile(self.wiki,
                            runner.dump_dir.filename_public_path(fileobj, self.wiki.date),
                            fileobj, self.verbose)
        pipeline = dumpfile.setup_uncompression_command()

        tail = self.wiki.config.tail
        if not exists(tail):
            raise BackupError("tail command %s not found" % tail)
        tail_esc = MiscUtils.shell_escape(tail)
        pipeline.append([tail, "-n", "+%s" % count])
        # without shell
        proc = CommandPipeline(pipeline, quiet=True)
        proc.run_pipeline_get_output()
        if (proc.exited_successfully() or
                (proc.get_failed_cmds_with_retcode() ==
                 [[-signal.SIGPIPE, pipeline[0]]]) or
                (proc.get_failed_cmds_with_retcode() ==
                 [[signal.SIGPIPE + 128, pipeline[0]]])):
            last_lines = proc.output()
        return last_lines

示例#5

0

显示文件

    def get_first_500_lines(self):
        if self.first_lines:
            return self.first_lines

        if not self.filename or not exists(self.filename):
            return None

        pipeline = self.setup_uncompression_command()

        if not exists(self._wiki.config.head):
            raise BackupError("head command %s not found" % self._wiki.config.head)
        head = self._wiki.config.head
        head_esc = MiscUtils.shell_escape(head)
        pipeline.append([head, "-500"])
        # without shell
        proc = CommandPipeline(pipeline, quiet=True)
        proc.run_pipeline_get_output()
        if (proc.exited_successfully() or
                (proc.get_failed_cmds_with_retcode() ==
                 [[-signal.SIGPIPE, pipeline[0]]]) or
                (proc.get_failed_cmds_with_retcode() ==
                 [[signal.SIGPIPE + 128, pipeline[0]]])):
            self.first_lines = proc.output()
        return self.first_lines

示例#6

0

显示文件

    def build_recombine_command_string(self, runner, files, output_file, compression_command,
                                       uncompression_command, end_header_marker="</siteinfo>"):
        output_filename = runner.dump_dir.filename_public_path(output_file)
        partnum = 0
        recombines = []
        if not exists(runner.wiki.config.head):
            raise BackupError("head command %s not found" % runner.wiki.config.head)
        head = runner.wiki.config.head
        if not exists(runner.wiki.config.tail):
            raise BackupError("tail command %s not found" % runner.wiki.config.tail)
        tail = runner.wiki.config.tail
        if not exists(runner.wiki.config.grep):
            raise BackupError("grep command %s not found" % runner.wiki.config.grep)
        grep = runner.wiki.config.grep

        # we assume the result is always going to be run in a subshell.
        # much quicker than this script trying to read output
        # and pass it to a subprocess
        output_filename_esc = MiscUtils.shell_escape(output_filename)
        head_esc = MiscUtils.shell_escape(head)
        tail_esc = MiscUtils.shell_escape(tail)
        grep_esc = MiscUtils.shell_escape(grep)

        uncompression_command_esc = uncompression_command[:]
        for command in uncompression_command_esc:
            command = MiscUtils.shell_escape(command)
        for command in compression_command:
            command = MiscUtils.shell_escape(command)

        if not files:
            raise BackupError("No files for the recombine step found in %s." % self.name())

        for file_obj in files:
            # uh oh FIXME
            # f = MiscUtils.shell_escape(file_obj.filename)
            fpath = runner.dump_dir.filename_public_path(file_obj)
            partnum = partnum + 1
            pipeline = []
            uncompress_this_file = uncompression_command[:]
            uncompress_this_file.append(fpath)
            pipeline.append(uncompress_this_file)
            # warning: we figure any header (<siteinfo>...</siteinfo>)
            # is going to be less than 2000 lines!
            pipeline.append([head, "-2000"])
            pipeline.append([grep, "-n", end_header_marker])
            # without shell
            proc = CommandPipeline(pipeline, quiet=True)
            proc.run_pipeline_get_output()
            if ((proc.output()) and
                    (proc.exited_successfully() or
                     proc.get_failed_cmds_with_retcode() ==
                     [[-signal.SIGPIPE, uncompress_this_file]] or
                     proc.get_failed_cmds_with_retcode() ==
                     [[signal.SIGPIPE + 128, uncompress_this_file]])):
                (header_end_num, junk_unused) = proc.output().split(":", 1)
                # get header_end_num
            else:
                raise BackupError("Could not find 'end of header' marker for %s" % fpath)
            recombine = " ".join(uncompress_this_file)
            header_end_num = int(header_end_num) + 1
            if partnum == 1:
                # first file, put header and contents
                recombine = recombine + " | %s -n -1 " % head
            elif partnum == len(files):
                # last file, put footer
                recombine = recombine + (" | %s -n +%s" % (tail, header_end_num))
            else:
                # put contents only
                recombine = recombine + (" | %s -n +%s" % (tail, header_end_num))
                recombine = recombine + " | %s -n -1 " % head
            recombines.append(recombine)
        recombine_command_string = ("(" + ";".join(recombines) + ")" + "|" +
                                    "%s %s" % (compression_command, output_filename))
        return recombine_command_string

示例#7

0

显示文件

    def parse_conffile_globally(self):
        self.db_list = MiscUtils.db_list(self.conf.get("wiki", "dblist"))

        # permit comma-separated list of files so that eg some script
        # can skip all private and/or closed wikis in addition to some
        # other exclusion list
        to_skip = self.conf.get("wiki", "skipdblist")
        if ',' in to_skip:
            skipfiles = to_skip.split(',')
        else:
            skipfiles = [to_skip]
        self.skip_db_list = []
        for skipfile in skipfiles:
            self.skip_db_list.extend(MiscUtils.db_list(skipfile))
        self.skip_db_list = list(set(self.skip_db_list))

        self.private_list = MiscUtils.db_list(self.conf.get("wiki", "privatelist"))
        self.closed_list = MiscUtils.db_list(self.conf.get("wiki", "closedlist"))
        self.flow_list = MiscUtils.db_list(self.conf.get("wiki", "flowlist"))
        self.tablejobs = self.conf.get("wiki", "tablejobs")

        self.db_list = list(set(self.db_list) - set(self.skip_db_list))

        if not self.conf.has_section('database'):
            self.conf.add_section('database')
        self.max_allowed_packet = self.conf.get("database", "max_allowed_packet")

        if not self.conf.has_section('output'):
            self.conf.add_section('output')
        self.public_dir = self.conf.get("output", "public")
        self.private_dir = self.conf.get("output", "private")
        self.temp_dir = self.conf.get("output", "temp")
        self.web_root = self.conf.get("output", "webroot")
        self.index = self.conf.get("output", "index")
        self.template_dir = self.conf.get("output", "templatedir")
        self.perdump_index = self.conf.get("output", "perdumpindex")
        self.log_file = self.conf.get("output", "logfile")
        self.fileperms = self.conf.get("output", "fileperms")
        self.fileperms = int(self.fileperms, 0)
        if not self.conf.has_section('reporting'):
            self.conf.add_section('reporting')
        self.admin_mail = self.conf.get("reporting", "adminmail")
        self.mail_from = self.conf.get("reporting", "mailfrom")
        self.smtp_server = self.conf.get("reporting", "smtpserver")
        self.stale_age = self.conf.getint("reporting", "staleage")
        self.skip_privatetables = self.conf.getint("reporting", "skipprivatetables")

        if not self.conf.has_section('tools'):
            self.conf.add_section('tools')
        self.php = self.conf.get("tools", "php")
        self.gzip = self.conf.get("tools", "gzip")
        self.bzip2 = self.conf.get("tools", "bzip2")
        self.sevenzip = self.conf.get("tools", "sevenzip")
        self.mysql = self.conf.get("tools", "mysql")
        self.mysqldump = self.conf.get("tools", "mysqldump")
        self.head = self.conf.get("tools", "head")
        self.tail = self.conf.get("tools", "tail")
        self.cat = self.conf.get("tools", "cat")
        self.grep = self.conf.get("tools", "grep")
        self.checkforbz2footer = self.conf.get("tools", "checkforbz2footer")
        self.writeuptopageid = self.conf.get("tools", "writeuptopageid")
        self.recompressxml = self.conf.get("tools", "recompressxml")

        if not self.conf.has_section('cleanup'):
            self.conf.add_section('cleanup')
        self.keep = self.conf.getint("cleanup", "keep")

        if not self.conf.has_section('query'):
            self.conf.add_section('query')
        self.queryfile = self.conf.get("query", "queryfile")