Пример #1
0
def get_max_id(wikiconf, wikidb, id_field, table):
    '''
    retrieve the largest id for this wiki from the db for specific table
    pass in name of id field, name of table

    the value is read from the second line of the query output.
    returns it as an int, or 0 if it is not made up of digits
    (probably NULL or missing table).

    if the query gives back None, it is retried up to 5 more times,
    sleeping 5 seconds before each retry. if no value can be
    obtained, a message is written to stderr and the process exits
    with status 1.
    '''
    wiki = Wiki(wikiconf, wikidb)

    db_info = DbServerInfo(wiki, wikidb)
    query = "select MAX(%s) from %s%s;" % (
        id_field, db_info.db_table_prefix, table)
    maxretries = 5

    # attempt 0 is the initial try; the rest are retries. every attempt
    # parses the output the same way, so a retry can no longer blow up
    # on NULL or treat a legitimate 0 as a failure
    for attempt in range(maxretries + 1):
        if attempt:
            time.sleep(5)
        results = db_info.run_sql_and_get_output(query)
        if results is None:
            # nothing came back at all, try again
            continue

        lines = results.splitlines()
        # check the length before indexing: output with fewer than two
        # lines must not raise IndexError
        if len(lines) > 1 and lines[1]:
            if not lines[1].isdigit():
                return 0   # probably NULL or missing table
            return int(lines[1])

        # we got output but no usable value in it; only a None result
        # is worth retrying, so give up here
        break

    sys.stderr.write("failed to get max page id from db, exiting\n")
    sys.exit(1)
Пример #2
0
    def dump_max_revid(self):
        '''
        dump maximum rev id from wiki that's older than
        the configured number of seconds (cutoff)

        we have this cutoff so that content really new
        is not dumped; we want to give curators the chance to
        remove problematic entries first.

        a cutoff of some hours is reasonable.

        if the max rev id file already exists, the db is not queried
        and the value is read from that file instead. otherwise the
        value from the db is written to the file (or, for a dry run,
        only a message about it is printed).

        returns the rev id plus one, as a string, since the end rev id
        is not included in the dump; returns None if no rev id could
        be obtained.
        '''
        max_revid = None
        revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        revid_path = revidfile.get_path()
        if exists(revid_path):
            self.log.info("Wiki %s, max rev id file %s already exists",
                          self.wiki.db_name, revid_path)
        else:
            self.log.info("Wiki %s retrieving max revid from db.",
                          self.wiki.db_name)
            query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                     "order by rev_timestamp desc limit 1" % self.cutoff)
            db_info = DbServerInfo(self.wiki, self.wiki.db_name)
            results = db_info.run_sql_and_get_output(query)
            if results:
                lines = results.splitlines()
                # the value is on the second line of output; check the
                # length first so that short output cannot raise IndexError
                if len(lines) > 1 and lines[1] and lines[1].isdigit():
                    max_revid = lines[1]
                    # the query output may be bytes or str; settle on str
                    # once so that writing and the dry run message agree
                    if isinstance(max_revid, bytes):
                        max_revid = max_revid.decode('utf-8')
                    if self.dryrun:
                        print("would write file {path} with contents {revid}".format(
                            path=revid_path, revid=max_revid))
                    else:
                        FileUtils.write_file_in_place(
                            revid_path, max_revid,
                            self.wiki.config.fileperms)
        if not max_revid:
            # either the file was already there or the db gave us nothing
            # usable; fall back to whatever the file has.
            # the path is computed outside of the try block so that the
            # error handler can always refer to it
            try:
                # NOTE(review): rstrip is applied to the path here, not to
                # the file contents; int() below copes with surrounding
                # whitespace in the contents anyway
                max_revid = FileUtils.read_file(revid_path.rstrip())
            except Exception as ex:
                self.log.info("Error encountered reading maxrevid from %s ", revid_path,
                              exc_info=ex)
                max_revid = None

        # end rev id is not included in dump
        # NOTE(review): int() raises ValueError here if the file contents
        # are not a number; that is not caught
        if max_revid is not None:
            max_revid = str(int(max_revid) + 1)

        self.log.info("max_revid is %s", safe(max_revid))
        return max_revid
Пример #3
0
def get_revs_per_page_interval(page_id_start, interval, wiki, db_info):
    '''
    given page id start and the number of pages, get
    and return total number of revisions these pages have

    wiki is a Wiki object for the specific wiki
    db_info is a DbServerInfo object for the specific wiki

    the count is read from the second line of the query output.
    returns it as an int, or 0 if it is not made up of digits
    (probably NULL or missing table).

    if the query gives back None, it is retried up to 5 more times,
    sleeping 5 seconds and setting up a new DbServerInfo before each
    retry. if no value can be obtained, a message is written to
    stderr and the process exits with status 1.
    '''

    query = ("select COUNT(rev_id) from revision where "
             "rev_page >= %s and rev_page < %s;" % (
                 page_id_start, page_id_start + interval))
    maxretries = 5

    # attempt 0 is the initial try; the rest are retries. every attempt
    # parses the output the same way, so a retry can no longer blow up
    # on NULL or treat a legitimate count of 0 as a failure
    for attempt in range(maxretries + 1):
        if attempt:
            time.sleep(5)
            # maybe the server was depooled. if so we will get another one
            db_info = DbServerInfo(wiki, wiki.db_name)
        results = db_info.run_sql_and_get_output(query)
        if results is None:
            # nothing came back at all, try again
            continue

        lines = results.splitlines()
        # check the length before indexing: output with fewer than two
        # lines must not raise IndexError
        if len(lines) > 1 and lines[1]:
            if not lines[1].isdigit():
                return 0   # probably NULL or missing table
            return int(lines[1])

        # we got output but no usable value in it; only a None result
        # is worth retrying, so give up here
        break

    sys.stderr.write("failed to get revision count for page range from db, exiting\n")
    sys.exit(1)