Example #1
 def _write_dump_runinfo(self, content):
     for fmt in RunInfo.FORMATS:
         dump_runinfo_filename = self._get_dump_runinfo_filename(fmt=fmt)
         #  FileUtils.write_file(directory, dumpRunInfoFilename, text,
         #    self.wiki.config.fileperms)
         FileUtils.write_file_in_place(dump_runinfo_filename, content[fmt],
                                       self.wiki.config.fileperms)
Example #2
 def write_status(wiki, message):
     """
     get the status information for the wiki dump run for all dump jobs
     and write it into the status html file
     """
     index = StatusHtml.get_statusfile_path(wiki, wiki.date)
     FileUtils.write_file_in_place(index, message, wiki.config.fileperms)
Example #3
 def cp_chksum_tmpfiles_to_permfile(self):
     if Checksummer.NAME in self._enabled:
         for htype in Checksummer.HASHTYPES:
             tmp_filename = self._get_checksum_filename_tmp(htype)
             real_filename = self._get_checksum_filename(htype)
             text = FileUtils.read_file(tmp_filename)
             FileUtils.write_file(self.wiki.config.temp_dir, real_filename, text,
                                  self.wiki.config.fileperms)
Example #4
    def db_info_by_age(self, use_status_time=False):
        """
        Sort wikis in reverse order of last successful dump and return
        tuples of information for each wiki:
          * whether the dump failed,
          * the date of the run as found in dump dir string OR
            as determined by time of status file, if use_status_time is True,
          * age of status file if any,
          * wiki name

        Order is (DumpFailed, Age), and False < True:
        First, wikis whose latest dump was successful, most recent dump first
        Then, wikis whose latest dump failed, most recent dump first.
        Finally, wikis which have never been dumped.

        According to that sort, the last item of this list is, when applicable,
        the oldest failed dump attempt.

        If some error occurs while checking a dump status, that dump is put last
        in the list (sort value is (True, maxsize)).

        Note that we now sort this list by the date of the dump directory, not the
        last date that a dump file in that directory may have been touched. This
        allows us to rerun jobs to completion from older runs, for example
        an enwiki history run that failed in the middle, without breaking the
        index page links.
        """
        available = []
        today = int(TimeUtils.today())
        for dbname in self.db_list:
            wiki = Wiki(self, dbname)

            age = sys.maxsize
            date = sys.maxsize
            last = wiki.latest_dump()
            status = ''
            if last:
                dump_status = StatusHtml.get_statusfile_path(wiki, last)
                try:
                    if use_status_time:
                        # only use the status file time, not the dir date
                        date = today
                    else:
                        date = today - int(last)
                    # tack on the file mtime so that if we have multiple wikis
                    # dumped on the same day, they get ordered properly
                    age = FileUtils.file_age(dump_status)
                    status = FileUtils.read_file(dump_status)
                except Exception:
                    print("dump dir missing status file %s?" % dump_status)
            dump_failed = (status == '') or ('dump aborted' in status)
            available.append((dump_failed, date, age, dbname))
        available = sorted(available)
        return available
Example #5
    def update_index_html_and_json(self, dump_status=""):
        '''
        generate the index.html file for the wiki's dump run, which contains
        information on each dump step as well as links to completed files
        for download, hash files, etc.;
        generate the json file with the same information as well'''
        if Report.NAME in self._enabled:

            self.dumpjobdata.notice.refresh_notice()
            status_items = [Report.report_dump_step_status(self.dump_dir, item)
                            for item in self.items]
            status_items_html = [item['html'] for item in status_items]
            status_items_html.reverse()
            html = "\n".join(status_items_html)
            checksums = [self.get_checksum_html(htype)
                         for htype in Checksummer.HASHTYPES]
            checksums_html = ", ".join(checksums)
            failed_jobs = sum(1 for item in self.items if item.status() == "failed")
            txt = self.wiki.config.read_template("report.html") % {
                "db": self.wiki.db_name,
                "date": self.wiki.date,
                "notice": self.dumpjobdata.notice.notice,
                "status": StatusHtml.report_dump_status(failed_jobs, dump_status),
                "previous": self.report_previous_dump_link(dump_status),
                "items": html,
                "checksum": checksums_html,
                "index": self.wiki.config.index}

            json_out = {'jobs': {}}
            for item in status_items:
                for jobname in item['json']:
                    json_out['jobs'][jobname] = item['json'][jobname]
            try:
                indexpath = os.path.join(self.wiki.public_dir(), self.wiki.date,
                                         self.wiki.config.perdump_index)
                FileUtils.write_file_in_place(indexpath, txt, self.wiki.config.fileperms)
                json_filepath = os.path.join(self.wiki.public_dir(), self.wiki.date,
                                             Report.JSONFILE)
                FileUtils.write_file_in_place(json_filepath, json.dumps(json_out),
                                              self.wiki.config.fileperms)
            except Exception:
                if self.verbose:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    sys.stderr.write(repr(traceback.format_exception(exc_type, exc_value,
                                                                     exc_traceback)))
                message = "Couldn't update status files. Continuing anyways"
                if self.error_callback:
                    self.error_callback(message)
                else:
                    sys.stderr.write("%s\n" % message)
Example #6
 def db_latest_status(self):
     '''
     return list of tuples for each wiki:
         status of latest wiki dump or None if wiki never dumped,
         wiki name
     '''
     dbinfo = []
     for dbname in self.db_list:
         wiki = Wiki(self, dbname)
         last = wiki.latest_dump()
         status = ''
         if last:
             dump_status = StatusHtml.get_statusfile_path(wiki, last)
             try:
                 status = FileUtils.read_file(dump_status)
             except Exception:
                 status = 'failed'
             for value in ['missing', 'not yet', 'failed', 'aborted',
                           'progress', 'partial', 'complete']:
                 if value in status:
                     status = value
                     break
         else:
             status = None
         dbinfo.append((dbname, status, last))
     return dbinfo
Example #7
    def move_if_truncated(self, runner, dfname, emptycheck=0, tmpdir=False):
        """
        check if the given file (DumpFile) is truncated or empty
        if so, move it out of the way and return True
        return False otherwise

        if emptycheck is set to a nonzero number, the file will only be
        checked to see if it is empty when it covers a page range with
        more pages than the specified number. E.g. a file named
        elwikivoyage-20180618-pages-meta-history2.xml-p140p150.bz2
        would be checked for emptycheck = 8 but not for 12; files that
        don't have page start and end numbers in the filename would not
        be checked at all.

        if emptycheck is left as 0, the file will always be checked to
        see if it is empty.

        if the file is located in the temp dir, set tmpdir=True for it to
        be found there; otherwise the public xml/sql dump output dir
        (or the private one, if the wiki is private) will be checked for the file.
        """
        if "check_trunc_files" not in runner.enabled or not self.check_truncation():
            return False

        if tmpdir:
            path = os.path.join(
                FileUtils.wiki_tempdir(runner.wiki.db_name, runner.wiki.config.temp_dir),
                dfname.filename)
        elif runner.wiki.is_private():
            path = runner.dump_dir.filename_private_path(dfname)
        else:
            path = runner.dump_dir.filename_public_path(dfname)
        dcontents = DumpContents(runner.wiki, path)

        file_truncated = True
        if os.path.exists(dcontents.filename):
            # for some file types we will check that the file has the right closing tag
            last_tag = None
            if ('.xml' in dcontents.filename and
                    ('.bz2' in dcontents.filename or '.gz' in dcontents.filename)):
                last_tag = b'</mediawiki>'

            # fixme hardcoded at 200? mmmm. but otoh configurable is kinda dumb
            if (not emptycheck or self.is_larger(dfname, 200)) and dcontents.check_if_empty():
                # file exists and is empty, move it out of the way
                dcontents.rename(dcontents.filename + ".empty")
            elif dcontents.check_if_truncated(last_tag):
                # The file exists and is truncated, move it out of the way
                dcontents.rename(dcontents.filename + ".truncated")
            elif dcontents.check_if_binary_crap():
                # The file exists and has binary junk in it, move it out of the way
                dcontents.rename(dcontents.filename + ".truncated")
            else:
                # The file exists and is not truncated and doesn't have random crap.
                # Heck, it's a good file!
                file_truncated = False
        else:
            # file doesn't exist, move on
            file_truncated = False
        return file_truncated
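The closing-tag test works because a complete XML dump must end with </mediawiki>; if decompression stops early or the tail lacks the tag, the file was truncated. A standalone sketch of that check (not the DumpContents implementation):

import bz2
import gzip

def ends_with_tag(path, last_tag=b'</mediawiki>'):
    # stream-decompress and keep only the last few KB; a truncated
    # bz2/gz file typically raises partway through
    opener = bz2.open if path.endswith('.bz2') else gzip.open
    tail = b''
    try:
        with opener(path, 'rb') as stream:
            for chunk in iter(lambda: stream.read(65536), b''):
                tail = (tail + chunk)[-4096:]
    except (OSError, EOFError):
        return False
    return tail.rstrip().endswith(last_tag)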
Example #8
 def refresh_notice(self):
     # if the notice file has changed or gone away, we comply.
     notice_file = self._get_notice_filename()
     if exists(notice_file):
         self.notice = FileUtils.read_file(notice_file)
     else:
         self.notice = ""
Example #9
 def save_symlink(self, dumpfile):
     if SymLinks.NAME in self._enabled:
         self.make_dir(self.dump_dir.latest_dir())
         realfile = self.dump_dir.filename_public_path(dumpfile)
         latest_filename = dumpfile.new_filename(dumpfile.dumpname, dumpfile.file_type,
                                                 dumpfile.file_ext, 'latest',
                                                 dumpfile.partnum, dumpfile.checkpoint,
                                                 dumpfile.temp)
         link = os.path.join(self.dump_dir.latest_dir(), latest_filename)
         if exists(link) or os.path.islink(link):
             if os.path.islink(link):
                 oldrealfile = os.readlink(link)
                 # format of these links should be...
                 # ../20110228/elwikidb-20110228-templatelinks.sql.gz
                 rellinkpattern = re.compile(r'^\.\./(20[0-9]+)/')
                 dateinlink = rellinkpattern.search(oldrealfile)
                 if dateinlink:
                     dateoflinkedfile = dateinlink.group(1)
                     dateinterval = int(self.wiki.date) - int(dateoflinkedfile)
                 else:
                     dateinterval = 0
                 # no file or it's older than ours... *then* remove the link
                 if not exists(os.path.realpath(link)) or dateinterval > 0:
                     self.debugfn("Removing old symlink %s" % link)
                     os.remove(link)
             else:
                 self.logfn("What the hell dude, %s is not a symlink" % link)
                 raise BackupError("What the hell dude, %s is not a symlink" % link)
         relative = FileUtils.relative_path(realfile, os.path.dirname(link))
         # if we removed the link cause it's obsolete, make the new one
         if exists(realfile) and not exists(link):
             self.debugfn("Adding symlink %s -> %s" % (link, relative))
             os.symlink(relative, link)
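FileUtils.relative_path evidently computes the ../20110228/... form the comment shows; the stdlib equivalent is os.path.relpath. A minimal sketch of the final link-refresh step under that assumption:

import os

def point_latest_at(realfile, latest_dir, latest_name):
    # build a relative symlink like ../20110228/elwikidb-...sql.gz
    link = os.path.join(latest_dir, latest_name)
    relative = os.path.relpath(realfile, latest_dir)
    if os.path.islink(link):
        os.remove(link)
    if os.path.exists(realfile):
        os.symlink(relative, link)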
Example #10
 def cp_chksum_tmpfiles_to_permfile(self):
     """
     during a dump run, checksum files are written to a temporary
     location and updated there; we copy the content from these
     files into the permanent location after each dump job
     completes
     """
     if Checksummer.NAME in self._enabled:
         for htype in Checksummer.HASHTYPES:
             for fmt in Checksummer.FORMATS:
                 tmp_filename = self._get_checksum_filename_tmp(htype, fmt)
                 real_filename = self._get_checksum_path(htype, fmt)
                 content = FileUtils.read_file(tmp_filename)
                 FileUtils.write_file(
                     FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                     real_filename, content,
                     self.wiki.config.fileperms)
Example #11
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun, verbose):
    '''
    do a stubs xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {}
    if history_file is not None:
        outfiles['history'] = {'name': history_file}
    if current_file is not None:
        outfiles['current'] = {'name': current_file}
    if articles_file is not None:
        outfiles['articles'] = {'name': articles_file}

    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_command

    command.extend(["--wiki=%s" % wikidb,
                    "--full", "--stub", "--report=1000"])
    if history_file is not None:
        command.append("--output=file:%s" % outfiles['history']['temp'])
    if current_file is not None:
        command.extend(["--output=file:%s" % outfiles['current']['temp'],
                        "--filter=latest"])
    if articles_file is not None:
        command.extend(["--output=file:%s" % outfiles['articles']['temp'],
                        "--filter=latest", "--filter=notalk",
                        "--filter=namespace:!NS_USER"])

    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval
    else:
        callback = None

    # the xml header, the body, and the xml footer should be separate gzipped
    # streams all concatenated together
    # note that do_xml_stream exits on failure after cleaning up all output files,
    # so the parent process must simply retry later
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose, callback=callback, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose, callback=callback)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 20000, '</page>\n', verbose=verbose, callback=callback, footer=True)
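The header/body/footer scheme relies on a property of the gzip format: independently compressed members concatenated into one file decompress as a single stream. A quick demonstration:

import gzip

# write header, body and footer as three separate gzip members
with open('demo.xml.gz', 'wb') as out:
    for piece in (b'<mediawiki>\n', b'  <page/>\n', b'</mediawiki>\n'):
        out.write(gzip.compress(piece))

# readers see one continuous document
with gzip.open('demo.xml.gz', 'rb') as inp:
    print(inp.read().decode(), end='')
# <mediawiki>
#   <page/>
# </mediawiki>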
Example #12
 def refresh_notice(self):
     '''
     if the notice file has changed or gone away, we comply.
     '''
     notice_filepath = self._get_notice_filename()
     if os.path.exists(notice_filepath):
         self.notice = FileUtils.read_file(notice_filepath)
     else:
         self.notice = ""
Example #13
 def write_notice_file(self):
     if NoticeFile.NAME in self._enabled:
         notice_file = self._get_notice_filename()
         # delnotice.  toss any existing file
         if self.notice is False:
             if exists(notice_file):
                 os.remove(notice_file)
             self.notice = ""
         # addnotice, stuff notice in a file for other jobs etc
         elif self.notice != "":
             # notice_dir = self._get_notice_dir()
             FileUtils.write_file(self.wiki.config.temp_dir, notice_file, self.notice,
                                  self.wiki.config.fileperms)
         # default case. if there is a file get the contents, otherwise
         # we have empty contents, all good
         else:
             if exists(notice_file):
                 self.notice = FileUtils.read_file(notice_file)
Example #14
def do_abstractsbackup(wikidb, output_files, variants,
                       wikiconf, start, end, dryrun, verbose):
    '''
    do an abstracts xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {}
    index = 0
    for variant in variants:
        outfiles[variant] = {'name': output_files[index]}
        index += 1

    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        if dryrun:
            outfiles[filetype]['compr'] = [None, outfiles[filetype]['name']]
        else:
            outfiles[filetype]['compr'] = [gzippit_append, outfiles[filetype]['name']]

    script_command = MultiVersion.mw_script_as_array(wikiconf,
                                                     "dumpBackup.php")
    command = [wikiconf.php] + script_command
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version
    filter_path = os.path.join(abstract_cmd_dir, "extensions/ActiveAbstract/AbstractFilter.php")
    if not os.path.exists(filter_path):
        filter_path = os.path.join(abstract_cmd_dir,
                                   "extensions/ActiveAbstract/includes/AbstractFilter.php")
    abstract_filter = ("--plugin=AbstractFilter:" + filter_path)

    command.extend(["--wiki=%s" % wikidb, abstract_cmd_dir,
                    abstract_filter,
                    "--current", "--report=1000", "--namespaces=0"])

    for filetype in outfiles:
        command.extend(["--output=file:%s" % outfiles[filetype]['temp'],
                        "--filter=namespace:NS_MAIN",
                        "--filter=noredirect",
                        "--filter=abstract%s" % filetype])

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose, header=True)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose)
    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 10000, '</doc>\n', verbose=verbose, footer=True)
Example #15
    def update_index_html(self, dump_status=""):
        '''
        generate the index.html file for the wiki's dump run, which contains
        information on each dump step as well as links to completed files
        for download, hash files, etc.'''
        if IndexHtml.NAME in self._enabled:

            self.dumpjobdata.noticefile.refresh_notice()
            status_items = [IndexHtml.report_dump_step_status(self.dump_dir, item)
                            for item in self.items]
            status_items.reverse()
            html = "\n".join(status_items)
            checksums = [self.get_checksum_html(htype)
                         for htype in Checksummer.HASHTYPES]
            checksums_html = ", ".join(checksums)
            text = self.wiki.config.read_template("report.html") % {
                "db": self.wiki.db_name,
                "date": self.wiki.date,
                "notice": self.dumpjobdata.noticefile.notice,
                "status": StatusHtml.report_dump_status(
                    self.failhandler.failure_count, dump_status),
                "previous": self.report_previous_dump_link(dump_status),
                "items": html,
                "checksum": checksums_html,
                "index": self.wiki.config.index}

            try:
                index = os.path.join(self.wiki.public_dir(), self.wiki.date,
                                     self.wiki.config.perdump_index)
                FileUtils.write_file_in_place(index, text, self.wiki.config.fileperms)
            except Exception:
                if self.verbose:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    sys.stderr.write(repr(traceback.format_exception(exc_type, exc_value,
                                                                     exc_traceback)))
                message = "Couldn't update status files. Continuing anyways"
                if self.error_callback:
                    self.error_callback(message)
                else:
                    sys.stderr.write("%s\n" % message)
Example #16
 def save_feed(self, file_obj):
     if Feeds.NAME in self._enabled:
         self.make_dir(self.dump_dir.latest_dir())
         filename_and_path = self.dump_dir.web_path(file_obj)
         web_path = os.path.dirname(filename_and_path)
         rss_text = self.wiki.config.read_template("feed.xml") % {
             "chantitle": file_obj.basename,
             "chanlink": web_path,
             "chandesc": "Wikimedia dump updates for %s" % self.db_name,
             "title": web_path,
             "link": web_path,
             "description": xml_escape("<a href=\"%s\">%s</a>" % (
                 filename_and_path, file_obj.filename)),
             "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
         }
         rss_path = os.path.join(self.dump_dir.latest_dir(),
                                 self.db_name + "-latest-" + file_obj.basename +
                                 "-rss.xml")
         self.debugfn("adding rss feed file %s " % rss_path)
         FileUtils.write_file(self.wiki.config.temp_dir, rss_path,
                              rss_text, self.wiki.config.fileperms)
Example #17
 def _get_checksum_filename_tmp(self, htype, fmt):
     """
     args:
         hashtype ('md5', 'sha1',...)
         format of output ('json', 'txt', ...)
     returns:
         full path of a unique-enough temporary output file for wiki and date
     """
     dfname = DumpFilename(self.wiki, None,
                           Checksummer.get_checksum_filename_basename(htype, fmt) +
                           "." + self.timestamp + ".tmp")
     return os.path.join(FileUtils.wiki_tempdir(
         self.wiki.db_name, self.wiki.config.temp_dir), dfname.filename)
Example #18
 def get_checksum_from_file(path):
     '''
     get the checksum recorded in a file which should have
     one line, consisting of the checksum, two spaces, and
     the filename that was checksummed
     return None on any error
     '''
     try:
         content = FileUtils.read_file(path)
         checksum, _filename = content.split('  ', 1)
         return checksum
     except Exception:
         return None
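The expected layout is the conventional md5sum/sha1sum output line: the hex digest, two spaces, the filename. A sketch producing such a line with hashlib and reparsing it the same way:

import hashlib

line = '%s  %s\n' % (hashlib.md5(b'dump contents').hexdigest(),
                     'elwiki-20180618-pages-articles.xml.bz2')

# the same parse step as get_checksum_from_file, in isolation
checksum, _filename = line.split('  ', 1)
print(checksum)   # 32 hex digits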
Example #19
    def save_feed(self, dfname):
        """
        produce an rss feed file for the specified dump output file
        (dfname)

        If there is already such a feed, update it only if
        the date of the dump output file in the feed is not newer
        than the date of dfname, as indicated in the dump dirs/filenames
        themselves, NOT via stat

        args:
            DumpFilename
        """
        if Feeds.NAME in self._enabled:
            rss_path = os.path.join(self.dump_dir.latest_dir(),
                                    self.db_name + "-latest-" + dfname.basename +
                                    "-rss.xml")

            self.make_dir(self.dump_dir.latest_dir())
            filename_and_path = self.dump_dir.web_path(dfname)
            web_path = os.path.dirname(filename_and_path)
            if self.feed_newer_than_file(rss_path, dfname):
                return
            rss_text = self.wiki.config.read_template("feed.xml") % {
                "chantitle": dfname.basename,
                "chanlink": web_path,
                "chandesc": "Wikimedia dump updates for %s" % self.db_name,
                "title": web_path,
                "link": web_path,
                "description": xml_escape("<a href=\"%s\">%s</a>" % (
                    filename_and_path, dfname.filename)),
                "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
            }
            self.debugfn("adding rss feed file %s " % rss_path)
            FileUtils.write_file(
                FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                rss_path,
                rss_text, self.wiki.config.fileperms)
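One detail worth noting in save_feed: the description is XML-escaped before substitution, so the anchor markup survives as literal text inside the feed element. Assuming xml_escape wraps the stdlib escape, a tiny illustration:

from xml.sax.saxutils import escape as xml_escape

template = "<description>%(description)s</description>"
print(template % {
    "description": xml_escape('<a href="/elwiki/latest/x.bz2">x.bz2</a>'),
})
# <description>&lt;a href="/elwiki/latest/x.bz2"&gt;x.bz2&lt;/a&gt;</description>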
Example #20
    def status_line(wiki, aborted=False):
        date = wiki.latest_dump()
        if date:
            if aborted:
                return StatusHtml.report_statusline(
                    wiki, "<span class=\"failed\">dump aborted</span>")

            status = StatusHtml.get_statusfile_path(wiki, date)
            try:
                return FileUtils.read_file(status)
            except Exception:
                return StatusHtml.report_statusline(wiki, "missing status record")
        else:
            return StatusHtml.report_statusline(wiki, "has not yet been dumped")
Example #21
    def get_stub_gen_cmd_for_input(self, input_dfname, output_dfnames, runner):
        """
        for the given input dumpfile (stub), write the requested output file (stub)
        """
        if not exists(self.wiki.config.writeuptopageid):
            raise BackupError("writeuptopageid command %s not found" %
                              self.wiki.config.writeuptopageid)

        if runner.wiki.is_private():
            inputfile_path = runner.dump_dir.filename_private_path(input_dfname)
        else:
            inputfile_path = runner.dump_dir.filename_public_path(input_dfname)

        output_dir = FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir)
        argstrings = []

        for output_dfname in output_dfnames:
            output_fname = output_dfname.filename
            # don't generate the file if we already have it (i.e. this is a retry)
            if not os.path.exists(os.path.join(output_dir, output_fname)):
                first_page_id = output_dfname.first_page_id
                if (output_dfname.last_page_id is not None and
                        output_dfname.last_page_id != "00000"):
                    last_page_id = str(int(output_dfname.last_page_id) + 1)
                else:
                    last_page_id = ""
                argstrings.append("{outfile}:{firstpage}:{lastpage}".format(
                    outfile=output_fname, firstpage=first_page_id, lastpage=last_page_id))

        # don't generate an output file if there are no filespecs
        if not argstrings:
            return None

        if input_dfname.file_ext == "gz":
            # command1 = "%s -dc %s" % (self.wiki.config.gzip, inputfile_path)
            command1 = [self.wiki.config.gzip, "-dc", inputfile_path]
        elif input_dfname.file_ext == '7z':
            # command1 = "%s e -si %s" % (self.wiki.config.sevenzip, inputfile_path)
            command1 = [self.wiki.config.sevenzip, "e", "-si", inputfile_path]
        elif input_dfname.file_ext == 'bz':
            # command1 = "%s -dc %s" % (self.wiki.config.bzip2, inputfile_path)
            command1 = [self.wiki.config.bzip2, "-dc", inputfile_path]
        else:
            raise BackupError("unknown stub file extension %s" % input_dfname.file_ext)

        command2 = [self.wiki.config.writeuptopageid, "--odir", output_dir,
                    "--fspecs", ";".join(argstrings)]
        pipeline = [command1]
        pipeline.append(command2)
        return pipeline
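The return value is a pipeline: a decompressor's stdout feeding writeuptopageid's stdin. The repo has its own command framework for running these, but a plain-subprocess sketch of executing such a pipeline looks like this (illustrative only):

import subprocess

def run_pipeline(pipeline):
    # chain argv arrays: pipeline[0] | pipeline[1] | ...
    procs = []
    prev_stdout = None
    for argv in pipeline:
        proc = subprocess.Popen(argv, stdin=prev_stdout,
                                stdout=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # so upstream gets SIGPIPE if downstream dies
        prev_stdout = proc.stdout
        procs.append(proc)
    output = procs[-1].communicate()[0]
    return output, [proc.wait() for proc in procs]

# e.g. run_pipeline([["gzip", "-dc", "stub.gz"], ["wc", "-l"]])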
Example #22
 def write_notice(self):
     '''
     write the notice file if self.notice has contents,
     remove it if self.notice is False,
     or read the existing file and stash its contents if self.notice
     is the empty string
     '''
     if Notice.NAME in self._enabled:
         notice_filepath = self._get_notice_filename()
         # delnotice.  toss any existing file
         if self.notice is False:
             if os.path.exists(notice_filepath):
                 os.remove(notice_filepath)
             self.notice = ""
         # addnotice, stuff notice in a file for other jobs etc
         elif self.notice != "":
             FileUtils.write_file(
                 FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
                 notice_filepath, self.notice,
                 self.wiki.config.fileperms)
         # default case. if there is a file get the contents, otherwise
         # we have empty contents, all good
         else:
             if os.path.exists(notice_filepath):
                 self.notice = FileUtils.read_file(notice_filepath)
Example #23
 def has_no_pages(self, xmlfile, runner, tempdir=False):
     '''
     check whether the file contains at least one page id;
     if it does not, return True
     '''
     if xmlfile.is_temp_file or tempdir:
         path = os.path.join(
             FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
             xmlfile.filename)
     else:
         if runner.wiki.is_private():
             path = runner.dump_dir.filename_private_path(xmlfile, self.wiki.date)
         else:
             path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
     dcontents = DumpContents(self.wiki, path, xmlfile, self.verbose)
     return dcontents.find_first_page_id_in_file() is None
Example #24
def do_main():
    '''
    main entry point, do all the work
    '''

    (configfile, date, dryrun, filenameformat,
     output_dir, overwrite, wikiname, script,
     basename, query, retries, verbose, remainder) = get_args()

    validate_args(date, output_dir, retries, script, query)

    if retries is None:
        retries = "3"
    retries = int(retries)

    if configfile:
        config = Config(configfile)
    else:
        config = Config()

    if date is None:
        date = TimeUtils.today()

    if script is not None:
        runner = ScriptRunner(script, remainder, dryrun, verbose)
    else:
        if query is None:
            query = FileUtils.read_file(config.queryfile)
        runner = QueryRunner(query, dryrun, verbose)

    if basename is not None:
        base = Wiki(config, basename)
        base.set_date(date)
        if base is not None:
            base.config.parse_conffile_per_project(base.db_name)
    else:
        base = None

    if wikiname is not None:
        wiki = Wiki(config, wikiname)
        wiki.set_date(date)
        wikirunner = WikiRunner(runner, wiki, filenameformat,
                                output_dir, base)
        wikirunner.do_one_wiki(overwrite)
    else:
        wikirunner = WikiRunnerLoop(config, runner, filenameformat,
                                    output_dir, base)
        wikirunner.do_all_wikis_til_done(retries, overwrite, date)
Example #25
 def report_file_size_status(dump_dir, file_obj, item_status):
     filename = dump_dir.filename_public_path(file_obj)
     if exists(filename):
         size = os.path.getsize(filename)
     else:
         item_status = "missing"
         size = 0
     size = FileUtils.pretty_size(size)
     if item_status == "in-progress":
         return "<li class='file'>%s %s (written) </li>" % (file_obj.filename, size)
     elif item_status == "done":
         webpath_relative = dump_dir.web_path_relative(file_obj)
         return ("<li class='file'><a href=\"%s\">%s</a> %s</li>"
                 % (webpath_relative, file_obj.filename, size))
     else:
         return "<li class='missing'>%s</li>" % file_obj.filename
Example #26
def generate_index(config, other_indexhtml=None, sorted_by_db=False):
    running = False
    states = []

    if sorted_by_db:
        dbs = sorted(config.db_list)
    else:
        dbs = config.db_list_by_age()

    for db_name in dbs:
        try:
            wiki = Wiki(config, db_name)
            locker = Locker(wiki)
            lockfiles = locker.is_stale(all_locks=True)
            if lockfiles:
                locker.cleanup_stale_locks(lockfiles)
            running = running or locker.is_locked(all_locks=True)
            states.append(StatusHtml.status_line(wiki))
        except Exception:
            # if there's a problem with one wiki at least
            # let's show the rest
            if VERBOSE:
                traceback.print_exc(file=sys.stdout)
    if running:
        status = "Dumps are in progress..."
    elif exists("maintenance.txt"):
        status = FileUtils.read_file("maintenance.txt")
    else:
        status = "Dump process is idle."

    if other_indexhtml is None:
        other_index_link = ""
    else:
        if sorted_by_db:
            other_sortedby = "dump date"
        else:
            other_sortedby = "wiki name"

        other_index_link = ('Also view sorted by <a href="%s">%s</a>'
                            % (os.path.basename(other_indexhtml), other_sortedby))

    return config.read_template("download-index.html") % {
        "otherIndexLink": other_index_link,
        "status": status,
        "items": "\n".join(states)}
Example #27
    def build_command(self, runner, stub_dfname, prefetch, output_dfname):
        """
        Build the command line for the dump, minus output and filter options
        args:
            Runner, stub DumpFilename, ....
        """
        stub_path = os.path.join(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            stub_dfname.filename)
        if os.path.exists(stub_path):
            # if this is a pagerange stub file in temp dir, use that
            stub_option = "--stub=gzip:%s" % stub_path
        else:
            # use regular stub file
            if runner.wiki.is_private():
                stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_private_path(stub_dfname)
            else:
                stub_option = "--stub=gzip:%s" % runner.dump_dir.filename_public_path(stub_dfname)
        if self.jobinfo['spawn']:
            spawn = "--spawn=%s" % (self.wiki.config.php)
        else:
            spawn = ""

        if not exists(self.wiki.config.php):
            raise BackupError("php command %s not found" % self.wiki.config.php)

        script_command = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
        dump_command = [self.wiki.config.php]
        dump_command.extend(script_command)
        dump_command.extend(["--wiki=%s" % runner.db_name,
                             "%s" % stub_option,
                             "%s" % prefetch,
                             "--report=1000",
                             "%s" % spawn])

        dump_command = [entry for entry in dump_command if entry is not None]
        dump_command.extend([self.build_filters(runner, output_dfname), self.build_eta()])
        pipeline = [dump_command]
        # return a command series of one pipeline
        series = [pipeline]
        return series
Example #28
 def report_file_size_status(dump_dir, dfname, item_status):
     """
     args:
         DumpDir
         DumpFilename
         status ("in-progress", "missing", ...)
     """
     filename = dump_dir.filename_public_path(dfname)
     size = None
     if exists(filename):
         size = os.path.getsize(filename)
     elif item_status == "in-progress":
         # note that because multiple files may be produced for a single dump
         # job, some may be complete while others are still in progress.
         # therefore we check the normal name first, falling back to the
         # inprogress name.
         filename = filename + DumpFilename.INPROG
         if exists(filename):
             try:
                 size = os.path.getsize(filename)
             except Exception:
                 # yes, it might be removed in that short interval of time.
                 pass
     if size is None:
         item_status = "missing"
         size = 0
     pretty_size = FileUtils.pretty_size(size)
     if item_status == "in-progress":
         txt = "<li class='file'>%s %s (written) </li>" % (dfname.filename, pretty_size)
         json_out = {'name': dfname.filename, 'size': size}
     elif item_status == "done":
         webpath_relative = dump_dir.web_path_relative(dfname)
         txt = ("<li class='file'><a href=\"%s\">%s</a> %s</li>"
                % (webpath_relative, dfname.filename, pretty_size))
         json_out = {'name': dfname.filename, 'size': size,
                     'url': webpath_relative}
     else:
         txt = "<li class='missing'>%s</li>" % dfname.filename
         json_out = {'name': dfname.filename}
     content = {'txt': txt, 'json': json_out}
     return content
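FileUtils.pretty_size renders the raw byte count for the HTML report while the JSON keeps the integer. A stand-in showing the usual unit-stepping approach (assumed behavior, not the repo's exact formatting):

def pretty_size(size):
    # step through units until the value fits below 1024
    for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024 or unit == 'TB':
            break
        size /= 1024.0
    return ('%d %s' if unit == 'bytes' else '%.1f %s') % (size, unit)

print(pretty_size(3145728))  # 3.0 MB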
Example #29
    def status_line(wiki, aborted=False):
        '''
        read the status information from the status html
        file and attempt to return it
        on failure, makes a reasonable guess about the dump status
        and returns that
        if 'aborted' is True, don't read in anything but return
        a line of html that dump was aborted
        '''
        date = wiki.latest_dump()
        if date:
            if aborted:
                return StatusHtml.report_statusline(
                    wiki, "<span class=\"failed\">dump aborted</span>")

            status = StatusHtml.get_statusfile_path(wiki, date)
            try:
                return FileUtils.read_file(status)
            except Exception:
                return StatusHtml.report_statusline(wiki, "missing status record")
        else:
            return StatusHtml.report_statusline(wiki, "has not yet been dumped")
Example #30
    def lock(self):
        '''
        create lock file for the given wiki and date, also
        set up a watchdog that will update its timestamp
        every minute.
        '''
        if not os.path.isdir(self.wiki.private_dir()):
            try:
                os.makedirs(self.wiki.private_dir())
            except Exception:
                # Maybe it was just created (race condition)?
                if not os.path.isdir(self.wiki.private_dir()):
                    raise
        lockf = FileUtils.atomic_create(self.get_lock_file_path(), "w")
        lockf.write("%s %d" % (socket.getfqdn(), os.getpid()))
        lockf.close()

        self.watchdog = LockWatchdog(self.get_lock_file_path())
        # when the main script dies this thread must die too, horribly if needed.
        self.watchdog.daemon = True
        self.watchdog.start()
        return True
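FileUtils.atomic_create presumably opens the lock file only if it does not already exist, so two runners cannot both take the lock. The standard way to get that guarantee is O_CREAT | O_EXCL; a sketch:

import os

def atomic_create(path, mode="w"):
    # O_EXCL makes creation fail with FileExistsError if the
    # path already exists, instead of silently clobbering it
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o644)
    return os.fdopen(fd, mode)

# usage mirroring lock():
#   lockf = atomic_create(lock_file_path)
#   lockf.write("%s %d" % (socket.getfqdn(), os.getpid()))
#   lockf.close()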