def record_max_revid(self):
    '''
    call get_max_revid() (by the original description, this fetches the
    max rev id for the wiki from the db), then save self.max_id to the
    wiki's max revid file

    nothing is written on a dry run
    '''
    self.get_max_revid()
    if self.dryrun:
        return
    revid_file = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    FileUtils.write_file_in_place(
        revid_file.get_path(), self.max_id, self.wiki.config.fileperms)
def do_all_wikis(self):
    """
    build the index.html file covering all wikis for the given date.
    each wiki that yields a non-empty result from do_one_wiki()
    contributes one <li> entry; the entries are substituted for
    "items" in the configured index template and the result is
    written to the index file.
    FIXME maybe this should be for the latest run date? Hrm.
    """
    config = self.args["config"]
    items = []
    for wikiname in config.all_wikis_list:
        result = self.do_one_wiki(wikiname)
        if not result:
            continue
        log.info("result for wiki %s is %s", wikiname, result)
        items.append("<li>" + result + "</li>\n")
    template = config.read_template(config.indextmpl)
    index_text = template % {"items": "".join(items)}
    FileUtils.write_file_in_place(self.indexfile.get_path(), index_text, config.fileperms)
def get_prev_incrdate(self, date, dumpok=False, revidok=False):
    '''
    find the most recent incr dump before the specified date

    if "dumpok" is True, find most recent dump that completed successfully
    if "revidok" is True, find most recent dump that has a populated
    maxrevid.txt file (one whose contents parse as an integer > 0)
    if both are True, only the "dumpok" check is applied
    if neither is True, any earlier dump qualifies

    returns the qualifying dump, or None if there is none; if "date"
    is not among the known dump dirs, the last qualifying dump in
    the list is returned
    '''
    previous = None
    # NOTE(review): relies on get_misc_dumpdirs() listing dumps oldest
    # first -- confirm against its implementation
    for dump in self.dirs.get_misc_dumpdirs() or []:
        if dump == date:
            return previous
        if dumpok:
            status_info = StatusInfo(self.wiki.config, dump, self.wiki.db_name)
            if status_info.get_status(dump) == "done":
                previous = dump
        elif revidok:
            max_revid_file = MaxRevIDFile(self.wiki.config, dump, self.wiki.db_name)
            revid_path = max_revid_file.get_path()
            if exists(revid_path):
                # strip whitespace from the file contents, not from the path
                revid = FileUtils.read_file(revid_path).rstrip()
                try:
                    populated = int(revid) > 0
                except ValueError:
                    # empty or garbled file: treat as not populated
                    # and keep looking rather than crashing
                    populated = False
                if populated:
                    previous = dump
        else:
            previous = dump
    return previous
def get_outputfile_indextxt(self, filenames_tocheck, expected, wikiname, dump_date):
    """
    build and return a list of text strings, one per existing output
    file, each holding a link to the file plus its date and size.
    the entries are ordered by filename.

    files without a date (i.e. files that do not exist) are silently
    left out of the list.

    "expected" is the list of filenames the dump is expected to
    produce; currently a missing expected file raises no error,
    though this may change in the future.
    """
    dump_path = MiscDumpDir(self.args["config"], dump_date).get_dumpdir(wikiname)
    fileinfo_by_name = {
        name: FileUtils.file_info(os.path.join(dump_path, name))
        for name in filenames_tocheck
    }

    links = []
    for name in sorted(fileinfo_by_name):
        file_date, file_size = fileinfo_by_name[name]
        log.info("output file %s for %s %s %s", name, wikiname,
                 safe(file_date), safe(file_size))
        # may do more with missing expected files in the future;
        # for now, just get stats on the other files
        missing_expected = name in expected and file_date is None
        if missing_expected or not file_date:
            continue
        # FIXME check that this link is correct
        link = make_link(os.path.join(wikiname, dump_date, name), os.path.basename(name))
        links.append("%s: %s (size %s)<br />" % (link, file_date, file_size))
    return links
def dump_aliases(self):
    '''
    write the aliases file for the wiki

    returns True on success, and also right away when the 'aliases'
    step is not set to run
    False or exception on error are fine; here any error is logged
    and then re-raised
    '''
    if not self.steps['aliases']['run']:
        return True
    try:
        contents = "for wiki %s: alias meow=more\n" % self.wiki.db_name
        aliasesfile = AliasesFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        FileUtils.write_file_in_place(aliasesfile.get_path(), contents,
                                      self.wiki.config.fileperms)
        return True
    except Exception as ex:
        # this step dumps aliases; the message used to say "namespaces"
        log.info("Error encountered dumping aliases for %s ", self.wiki.db_name,
                 exc_info=ex)
        raise
def get_status(self, date=None):
    '''
    look up the status of the dump run for the given wiki and date

    returns the contents of the status file with trailing whitespace
    removed, or the empty string if the status file does not exist
    '''
    if not exists(self.status_file.get_path(date)):
        return ""
    return FileUtils.read_file(self.status_file.get_path(date)).rstrip()
def md5sums(wiki, fileperms, files, mandatory):
    '''
    compute md5sums for the specified files for the dump of the
    given wiki and date, saving them to the md5 output file

    the output file is rewritten after each file is processed, so
    it always holds the sums gathered so far. a failure on any
    file is logged and processing continues with the next one.

    returns False if a file listed in "mandatory" failed,
    True otherwise
    '''
    md5file = MD5File(wiki.config, wiki.date, wiki.db_name)
    collected = ""
    mandatory_ok = True
    for fname in files:
        try:
            collected += "%s\n" % md5sum_one_file(fname)
            FileUtils.write_file_in_place(md5file.get_path(), collected, fileperms)
        except Exception as ex:
            log.warning("Error encountered in md5sum for %s", fname, exc_info=ex)
            if fname in mandatory:
                mandatory_ok = False
    return mandatory_ok
def dump_max_revid(self):
    '''
    dump maximum rev id from wiki that's older than
    the configured number of seconds (cutoff)

    we have this cutoff so that content really new
    is not dumped; we want to give curators the chance to
    remove problematic entries first.

    a cutoff of some hours is reasonable.

    if the max revid file does not exist yet, the db is queried
    and the file is written, provided the query yields a numeric
    rev id. the file is then read back.

    returns the stored rev id plus one, as a string, or None if
    the file could not be read
    '''
    revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    revid_path = revidfile.get_path()
    if not exists(revid_path):
        log.info("Wiki %s retrieving max revid from db.", self.wiki.db_name)
        query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                 "order by rev_timestamp desc limit 1" % self.cutoff)
        db_info = DbServerInfo(self.wiki, self.wiki.db_name)
        results = db_info.run_sql_and_get_output(query)
        if results:
            lines = results.splitlines()
            # the rev id is taken from the second line of output
            # (presumably the first is a column header -- confirm);
            # with fewer than two lines there is nothing to record
            if len(lines) > 1 and lines[1].isdigit():
                FileUtils.write_file_in_place(revid_path, lines[1],
                                              self.wiki.config.fileperms)
    try:
        # strip whitespace from the file contents, not from the path
        max_revid = FileUtils.read_file(revid_path).rstrip()
    except Exception as ex:
        log.info("Error encountered reading maxrevid from %s ", revid_path,
                 exc_info=ex)
        max_revid = None

    # end rev id is not included in dump
    if max_revid is not None:
        max_revid = str(int(max_revid) + 1)

    log.info("max_revid is %s", safe(max_revid))
    return max_revid
def read_max_revid_from_file(self, date=None):
    '''
    read and return max rev id for wiki from file

    "date" selects which run's max revid file to read; it
    defaults to the date of the current wiki run

    returns the file contents with trailing whitespace removed,
    or None if the file could not be read (the error is logged)
    '''
    if date is None:
        date = self.wiki.date
    # kept outside the try block's first statement so that the error
    # handler never touches an unbound name if the lookup itself fails
    revid_path = None
    try:
        revid_path = MaxRevIDFile(self.wiki.config, date, self.wiki.db_name).get_path()
        # strip whitespace from the file contents, not from the path
        return FileUtils.read_file(revid_path).rstrip()
    except Exception as ex:
        log.info("Error encountered reading maxrevid from %s ", revid_path,
                 exc_info=ex)
        return None
def get_stat_text(self, dump_date, wikiname):
    """
    read the status file for the dump of the wiki on the given
    date and return its contents wrapped in parentheses, or
    None if the contents are empty
    """
    status_file = StatusFile(self.args["config"], dump_date, wikiname)
    contents = FileUtils.read_file(status_file.get_path())
    log.info("status for %s %s", wikiname, safe(contents))
    return "(%s)" % (contents) if contents else None
def get_lock(self):
    '''
    acquire lock for wiki and return True.
    the dump directory is created first if it does not exist;
    the lock file is then created, locked without blocking, and
    stamped with this host's fqdn and the pid of this process.

    return False if lock could not be acquired; the error is logged
    '''
    try:
        if not exists(self._config.dump_dir):
            # exist_ok: another process may create the directory
            # between the check and this call
            os.makedirs(self._config.dump_dir, exist_ok=True)
        fhandle = FileUtils.atomic_create(self.lockfile.get_path(), "w")
        try:
            fcntl.lockf(fhandle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            fhandle.write("%s %d" % (socket.getfqdn(), os.getpid()))
        finally:
            # close the handle even if locking or writing failed
            fhandle.close()
        return True
    except Exception as ex:
        log.info("Error encountered getting lock", exc_info=ex)
        return False
def get_fileinfo(self):
    '''
    look up the file information for this object's path and
    return whatever FileUtils.file_info() reports for it
    '''
    path = self.get_path()
    return FileUtils.file_info(path)
def read_template(self, name):
    '''
    return the contents of the named file from the
    configured template dir
    '''
    template_path = os.path.join(self.template_dir, name)
    contents = FileUtils.read_file(template_path)
    return contents
def _get_lockfile_contents(self):
    '''
    read the lock file for this object's date and return its
    contents split on whitespace (a list of strings)

    if anything goes wrong, return the pair (None, None) instead
    '''
    try:
        lock_path = self.lockfile.get_path(self.date)
        return FileUtils.read_file(lock_path).split()
    except Exception:
        return None, None
def set_status(self, status):
    '''
    store the supplied status information for the dump run
    in the status file, using the configured file permissions
    '''
    status_path = self.status_file.get_path()
    FileUtils.write_file_in_place(status_path, status, self._config.fileperms)