def dump_revs(self): ''' dump revision content corresponding to previously-dumped stubs (revision metadata) ''' if not self.steps['revs']['run']: return True dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date) outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date) revsfile = RevsFile(self.wiki.config, self.wiki.date, self.wiki.db_name) outputfile = revsfile.get_filename() script_command = MultiVersion.mw_script_as_array(self.wiki.config, "dumpTextPass.php") command = [self.wiki.config.php] command.extend(script_command) stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name) stuboutputfile = stubfile.get_filename() command.extend(["--wiki=%s" % self.wiki.db_name, "--stub=gzip:%s" % os.path.join(outputdir, stuboutputfile), "--quiet", "--spawn=%s" % self.wiki.config.php, "--output=bzip2:%s" % os.path.join(outputdir, outputfile)]) if self.dryrun: print "would run command for revs dump:", command else: log.info("running with no output: " + " ".join(command)) success = RunSimpleCommand.run_with_no_output( command, shell=False, timeout=self.get_lock_timeout_interval(), timeout_callback=self.periodic_callback) if not success: log.warning("error producing revision text files" " for wiki %s", self.wiki.db_name) return False return True
def run(self):
    '''
    dump maxrevid, stubs for revs from previous maxrevid to
    current one, revision content for these stubs, for given
    wiki and date

    the steps run in order and the first one that reports failure
    stops the run

    returns True if every step succeeded, False if any step
    failed or raised an exception (the exception is logged)
    '''
    try:
        log.info("retrieving max rev id for wiki %s", self.wiki.db_name)
        newest_revid = self.dump_max_revid()
        if not newest_revid:
            return False

        log.info("retrieving prev max rev id for wiki %s", self.wiki.db_name)
        previous_revid = self.get_prev_revid(newest_revid)
        if not previous_revid:
            return False

        log.info("producing stub file for wiki %s", self.wiki.db_name)
        stubs_ok = self.dump_stub(previous_revid, newest_revid)
        if not stubs_ok:
            return False

        log.info("producing content file for wiki %s", self.wiki.db_name)
        return bool(self.dump_revs())
    except Exception as ex:
        log.warning("Error encountered runing dump for %s ", self.wiki.db_name,
                    exc_info=ex)
        return False
def dump_stub(self, start_revid, end_revid): ''' dump stubs (metadata) for revs from start_revid up to but not including end_revid ''' if not self.steps['stubs']['run']: return True dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date) outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date) stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name) outputfile = stubfile.get_filename() script_command = MultiVersion.mw_script_as_array(self.wiki.config, "dumpBackup.php") command = [self.wiki.config.php] command.extend(script_command) command.extend(["--wiki=%s" % self.wiki.db_name, "--stub", "--quiet", "--output=gzip:%s" % os.path.join(outputdir, outputfile), "--revrange", "--revstart=%s" % start_revid, "--revend=%s" % end_revid]) if self.dryrun: print "would run command for stubs dump:", command else: log.info("running with no output: " + " ".join(command)) success = RunSimpleCommand.run_with_no_output( command, shell=False, timeout=self.get_lock_timeout_interval(), timeout_callback=self.periodic_callback) if not success: log.warning("error producing stub files for wiki %s", self.wiki.db_name) return False return True
def dump_html(self): ''' dump HTML-formated revision content from RESTBase for the given wiki and date ''' dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date) outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date) htmlfile = HTMLFile(self.wiki.config, self.wiki.date, self.wiki.db_name) outputfile = htmlfile.get_filename(self.args['ns']) # /usr/bin/nodejs ./bin/dump_wiki --domain en.wikipedia.org --ns 0 \ # --apiURL http://en.wikipedia.org/w/api.php \ # --dataBase /srv/www/htmldumps/dumps/20160826/en.wikipedia.org.articles.ns0.sqlite3 domain = self.get_domain_from_wikidbname() # FIXME: the nodejs wrapper which will do the compress etc stuff for one wiki is # not yet written command = [self.wiki.config.nodejs] command.append(self.wiki.config.scriptpath) command.extend(["--domain", domain, "--ns", self.args['ns'], "--apiURL", "http://%s/w/api.php" % domain, "--dataBase", os.path.join(outputdir, outputfile), "--wiki=%s" % self.wiki.db_name, "--output=gzip:%s" % os.path.join(outputdir, outputfile)]) if self.dryrun: print "would run command for html dump:", command else: success = RunSimpleCommand.run_with_no_output( command, shell=False, timeout=self.get_lock_timeout_interval(), timeout_callback=self.periodic_callback) if not success: log.warning("error producing html files for wiki %s", self.wiki.db_name) return False return True
def do_one_wiki(self):
    """
    run dump of specified type for one wiki, for given date
    unless it is among the wikis we skip, has already been run
    for the date, or some other process has the lock and is
    therefore presumably already dumping it

    returns:
        STATUS_GOOD   if the dump completed, or was already complete
                      ("done:all") and forcerun is not set
        STATUS_TODO   if another process holds the lock for this wiki
        STATUS_FAILED if the dump run or the md5sums step failed, or
                      an exception was raised during the run or the
                      index update

    when not a dry run, the lock acquired here is released on every
    exit from the dump run: success, reported failure or exception

    NOTE(review): the layout of the original was ambiguous; this assumes
    old-dump cleanup happens only when not a dry run, and that a wiki
    which is skipped yields STATUS_GOOD without the success message
    being logged -- confirm
    """
    if not skip_wiki(self.wiki.db_name, self.wiki.config):
        dumpdir = MiscDumpDir(self.args["config"], self.args["date"])
        wikidir = dumpdir.get_dumpdir(self.wiki.db_name)
        if not exists(wikidir):
            os.makedirs(wikidir)

        status_info = StatusInfo(self.args["config"], self.wiki.date, self.wiki.db_name)
        status = status_info.get_status()
        if status == "done:all" and not self.flags["forcerun"]:
            log.info("wiki %s skipped, adds/changes dump already"
                     " complete", self.wiki.db_name)
            return STATUS_GOOD

        lock = None
        if not self.flags["dryrun"]:
            lock = MiscDumpLock(self.args["config"], self.wiki.date,
                                self.wiki.db_name)

            # if lock is stale, remove it
            lock.remove_if_stale(self.wiki.config.lock_stale)

            # try to get the lock ourselves
            if not lock.get_lock():
                log.info(
                    "wiki %s skipped, wiki is locked,"
                    " another process should be doing the job",
                    self.wiki.db_name
                )
                return STATUS_TODO

            self.dumper.set_lockinfo(lock)
            dumps_dirs = MiscDumpDirs(self.wiki.config, self.wiki.db_name)
            dumps_dirs.cleanup_old_dumps(self.wiki.date)

        log.info("Doing run for wiki: %s", self.wiki.db_name)

        # True for as long as we hold a lock that still has to be released
        lock_held = lock is not None
        try:
            result = self.dumper.run()
            if not result:
                return STATUS_FAILED

            if not self.flags["dryrun"]:
                output_files, expected = self.dumper.get_output_files()
                if not md5sums(self.wiki, self.wiki.config.fileperms,
                               output_files, expected):
                    return STATUS_FAILED
                status_info.set_status("done:" + self.dumper.get_steps_done())
                # release before the index update, as the original did
                lock.unlock_if_owner()
                lock_held = False

            if self.flags["do_index"]:
                index = Index(self.args)
                index.do_all_wikis()
        except Exception as ex:
            log.warning("error from dump run"
                        " for wiki %s", self.wiki.db_name, exc_info=ex)
            return STATUS_FAILED
        finally:
            # runs for the failure returns above as well as for exceptions,
            # so a failed run no longer leaves the wiki locked
            if lock_held:
                lock.unlock_if_owner()

        log.info("Success!  Wiki %s %s dump complete.",
                 self.wiki.db_name, self.args["dumptype"])
    return STATUS_GOOD
def run(self):
    '''
    dump html from RESTBase of revision content, for given wiki and date

    returns True if dump_html reported success, False if it reported
    failure or raised an exception (the exception is logged)
    '''
    try:
        log.info("dumping html for wiki %s", self.wiki.db_name)
        html_ok = bool(self.dump_html())
    except Exception as ex:
        log.warning("Error encountered runing dump for %s ", self.wiki.db_name,
                    exc_info=ex)
        return False
    return html_ok
def do_one_wiki(self, wikiname, date=None):
    """
    collect the text strings for one wiki to be inserted into
    the index.html file

    wikiname is the name of the wiki; date, if given, is the dump
    date to report on, otherwise the latest dump date is looked up

    returns None if the wiki is skipped or has no dump; otherwise
    returns the HTML snippet for the wiki, or a short error string
    if gathering or formatting the information failed
    """
    if skip_wiki(wikiname, self.args["config"]):
        return None

    dumps_dirs = MiscDumpDirs(self.args["config"], wikiname)
    if not exists(self.dumpdir.get_dumpdir_no_date(wikiname)):
        log.info("No dump for wiki %s", wikiname)
        return None

    dump_date = date if date is not None else dumps_dirs.get_latest_dump_date(True)
    if not dump_date:
        log.info("No dump for wiki %s", wikiname)
        return None

    other_runs_text = "other runs: %s<br />" % make_link(wikiname, wikiname)

    # gather the per-wiki information
    try:
        wiki = Wiki(self.args["config"], wikiname)
        wiki.set_date(dump_date)
        files_text = self.get_files_text(wiki)
        stat_text = self.get_stat_text(dump_date, wikiname)
    except Exception as ex:
        log.warning("Error encountered, no information available for wiki %s",
                    wikiname, exc_info=ex)
        return "<strong>%s</strong> Error encountered, no information available | %s" % (
            wikiname, other_runs_text)

    # assemble the snippet: name and status, then files, then other runs
    try:
        heading = ["<strong>%s</strong>" % wikiname]
        if stat_text is not None:
            heading.append(stat_text)
        wiki_info = (" ".join(heading) + "<br />" +
                     " " + "\n ".join(files_text) +
                     "\n " + other_runs_text)
    except Exception as ex:
        log.warning("Error encountered formatting information for wiki %s",
                    wikiname, exc_info=ex)
        return "Error encountered formatting information for wiki %s" % wikiname

    return wiki_info
def get_domain_from_wikidbname(self):
    '''
    given the name of the wiki db, turn this into the
    fqdn of the wiki project (i.e. enwiki -> en.wikipedia.org)

    this pipes 'echo $wgCanonicalServer;' into eval.php for the wiki
    via the shell and takes the text following the first '//' in
    the output

    returns the domain, or None (after logging a warning) if the
    command produced no output or output without a domain in it
    '''
    script_command = MultiVersion.mw_script_as_array(self.wiki.config, "eval.php")
    # echo $wgCanonicalServer | php "$multiversionscript" eval.php $wiki
    command = ["echo", "'echo $wgCanonicalServer;'", "|", self.wiki.config.php]
    command.extend(script_command)
    command.append(self.wiki.db_name)
    # the pipe requires a shell, so the command is passed as one string
    command_text = " ".join(command)
    log.info("running with no output: " + command_text)
    output = RunSimpleCommand.run_with_output(command_text, shell=True)
    if not output:
        log.warning("error retrieving domain for wiki %s", self.wiki.db_name)
        return None

    pieces = output.split('//')
    # rstrip gets rid of any trailing newlines from eval.php
    domain = pieces[1].rstrip() if len(pieces) > 1 else ''
    if not domain:
        # output without '//' used to raise IndexError here; report it
        # the same way as any other failure to retrieve the domain
        log.warning("error retrieving domain for wiki %s,"
                    " unexpected output: %s", self.wiki.db_name, output)
        return None
    return domain