def save(self):
    """
    Queue the diff between the current revision text and the previous one.

    The queue is stored using self.flush().

    A missing text (None) is replaced by an empty string and, when
    self.clean is set, the text is run through self.textcleaner.clean_all().
    A revision that has at least 1000 matches of self.rwords and more than
    twice as many matches as the previous revision is reported as a revert
    and is not queued.  In both cases the current text becomes the reference
    (self._prev_text) for the next call.
    """
    if self._text is None:
        # difflib doesn't like NoneType
        self._text = ""
    if self.clean:
        self._text = self.textcleaner.clean_all(self._text)

    text_words = len(self.rwords.findall(self._text))
    prev_words = len(self.rwords.findall(self._prev_text))

    if text_words < 1000 or text_words <= 2 * prev_words:
        # Only the first element of _diff_text's result is stored.
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        page = {
            'title': smart_str(self._title),
            'lang': self.lang,
            'timestamp': self._date,
            'text': smart_str(diff),
            'type': self._type,
        }
        self.queue.append(page)
    else:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and logs the same record.
        logging.warning("Revert detected: skipping... (%s)", self._date)

    self._prev_text = self._text
def get_revisions(title, csv_writer, lang, textcleaner, startid=None,
                  prev_text=""):
    """
    Download the revisions of a page and write one CSV row per revision.

    Revisions are requested from http://<lang>.wikipedia.org/w/api.php in
    batches of up to 500, oldest first.  Each revision text is cleaned with
    textcleaner.clean_all() and diffed against the previous cleaned text;
    the row written is [timestamp, lang, title, "", diff].

    title       -- page title to query
    csv_writer  -- object exposing writerow()
    lang        -- language code used both in the host name and in the row
    textcleaner -- object exposing clean_all(text)
    startid     -- revision id to start from (None: start from the first)
    prev_text   -- cleaned text the first fetched revision is diffed against

    A KeyError is raised if a response lacks the expected "query"/"pages"/
    "revisions" structure.
    """
    api_base = 'http://%s.wikipedia.org/w/api.php' % lang

    # Iterate over result pages instead of recursing: the former recursive
    # call dropped the textcleaner argument (shifting cont/prev_text into the
    # wrong parameters) and its enclosing try also swallowed KeyErrors raised
    # while processing later batches.
    while True:
        options = {
            'action': 'query',
            'prop': 'revisions',
            'rvlimit': 500,
            'titles': title,
            'rvprop': 'ids|timestamp|content',
            'rvdir': 'newer',
            'format': 'json',
        }
        if startid is not None:
            options['rvstartid'] = startid

        url = api_base + '?' + urllib.urlencode(options)
        logging.info(url)

        response = urllib.urlopen(url)
        try:
            result = simplejson.load(response)
        finally:
            response.close()

        pages = result["query"]["pages"]
        for page in pages:
            for r in pages[page]["revisions"]:
                text_cleaned = textcleaner.clean_all(r["*"])
                text = smart_str(_diff_text(prev_text, text_cleaned)[0])
                csv_writer.writerow(
                    [r["timestamp"], lang, smart_str(title), "", text])
                prev_text = text_cleaned

        # A missing continuation marker means the last batch was fetched.
        try:
            startid = result['query-continue']['revisions']['rvstartid']
        except KeyError:
            logging.info("Finished!")
            return
        logging.info("Continue to %d", startid)
def save(self):
    """
    Queue the diff between the current revision text and the previous one.

    The queue is stored using self.flush().

    A missing text (None) is replaced by an empty string and, when
    self.clean is set, the text is run through self.textcleaner.clean_all().
    The queued entry also records the revision's sender under 'user'.
    A revision that has at least 1000 matches of self.rwords and more than
    twice as many matches as the previous revision is reported as a revert
    and is not queued.  In both cases the current text becomes the reference
    (self._prev_text) for the next call.
    """
    if self._text is None:
        # difflib doesn't like NoneType
        self._text = ""
    if self.clean:
        self._text = self.textcleaner.clean_all(self._text)

    text_words = len(self.rwords.findall(self._text))
    prev_words = len(self.rwords.findall(self._prev_text))

    if text_words < 1000 or text_words <= 2 * prev_words:
        # Only the first element of _diff_text's result is stored.
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        page = {
            'title': smart_str(self._title),
            'lang': self.lang,
            'timestamp': self._date,
            'text': smart_str(diff),
            'user': smart_str(self._sender),
            'type': self._type,
        }
        self.queue.append(page)
    else:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and logs the same record.
        logging.warning("Revert detected: skipping... (%s)", self._date)

    self._prev_text = self._text
def get_revisions(title, csv_writer, lang, startid=None, prev_text=""):
    """
    Download the revisions of a page and write one CSV row per revision.

    Revisions are requested from http://<lang>.wikipedia.org/w/api.php in
    batches of up to 500, oldest first.  Each revision text is diffed
    against the previous revision text; the row written is
    [timestamp, lang, title, "", diff].

    title      -- page title to query
    csv_writer -- object exposing writerow()
    lang       -- language code used both in the host name and in the row
    startid    -- revision id to start from (None: start from the first)
    prev_text  -- text the first fetched revision is diffed against

    A KeyError is raised if a response lacks the expected "query"/"pages"/
    "revisions" structure.
    """
    api_base = "http://%s.wikipedia.org/w/api.php" % lang

    # Iterate over result pages instead of recursing: the try block that used
    # to wrap the recursive call also swallowed KeyErrors raised while
    # processing later batches and reported them as a normal end of data.
    while True:
        options = {
            "action": "query",
            "prop": "revisions",
            "rvlimit": 500,
            "titles": title,
            "rvprop": "ids|timestamp|content",
            "rvdir": "newer",
            "format": "json",
        }
        if startid is not None:
            options["rvstartid"] = startid

        url = api_base + "?" + urllib.urlencode(options)
        logging.info(url)

        response = urllib.urlopen(url)
        try:
            result = simplejson.load(response)
        finally:
            response.close()

        pages = result["query"]["pages"]
        for page in pages:
            for r in pages[page]["revisions"]:
                text = smart_str(_diff_text(prev_text, r["*"])[0])
                csv_writer.writerow(
                    [r["timestamp"], lang, smart_str(title), "", text])
                prev_text = r["*"]

        # A missing continuation marker means the last batch was fetched.
        try:
            startid = result["query-continue"]["revisions"]["rvstartid"]
        except KeyError:
            logging.info("Finished!")
            return
        logging.info("Continue to %d", startid)
def save(self):
    """
    Accumulate per-day word-count statistics for the current revision.

    Nothing happens when self._skip_revision is set.  Otherwise a missing
    text (None) is replaced by an empty string and, when self.clean is set,
    the text is run through self.textcleaner.clean_all().

    Unless the revision looks like a revert (at least 1000 matches of
    self.rwords and more than twice as many as the previous revision), the
    diff against the previous text is fed to self.pywc.parse_col() and the
    resulting counters are added to self.data[self._type][<YYYY/MM/DD>],
    together with an "edits" counter.  When self.pywc.detailed is set and
    the revision type equals self.detailed_ns, per-keyword totals and
    per-page / per-user counters are also added to self.detailed_data.

    Whenever the revision is not skipped, the current text becomes the
    reference (self._prev_text) for the next call.
    """
    if self._skip_revision:
        return
    if self._text is None:
        # difflib doesn't like NoneType
        self._text = ""
    if self.clean:
        self._text = self.textcleaner.clean_all(self._text)

    text_words = len(self.rwords.findall(self._text))
    prev_words = len(self.rwords.findall(self._prev_text))

    if text_words < 1000 or text_words <= 2 * prev_words:
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        self.pywc.parse_col(diff)

        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if self._type not in self.data:
            self.data[self._type] = {}
        current = self.data[self._type]

        date_str = self._date.strftime("%Y/%m/%d")
        stats = {"date": date_str,
                 "qmarks": self.pywc._qmarks,
                 "unique": len(self.pywc._unique),
                 "dic": self.pywc._dic,
                 "sixltr": self.pywc._sixltr,
                 "total": self.pywc._total}
        for x in self.pywc.categories:
            stats[x] = self.pywc._results[x]

        if date_str not in current:
            # First edit seen on this day: the fresh stats become the entry.
            current[date_str] = stats
            current[date_str]["edits"] = 1
        else:
            # Add every counter except the date label to the day's entry.
            for elem in stats:
                if elem != "date":
                    current[date_str][elem] += stats[elem]
            current[date_str]["edits"] += 1

        if self.pywc.detailed and self._type == self.detailed_ns:
            if date_str not in self.detailed_data:
                self.detailed_data[date_str] = defaultdict(dict)
            for keyword in self.pywc._detailed_data:
                occ = self.pywc._detailed_data[keyword]
                entry = self.detailed_data[date_str][keyword]
                if not entry:
                    # The defaultdict produced an empty dict: initialise it.
                    entry = {}
                    entry["total"] = 0
                    entry["pages"] = Counter()
                    entry["users"] = Counter()
                entry["total"] += occ
                entry["pages"][self._title] += occ
                entry["users"][self._sender] += occ
                self.detailed_data[date_str][keyword] = entry
    else:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and logs the same record.
        logging.warning("Revert detected: skipping... (%s)", self._date)

    self._prev_text = self._text
def save(self):
    """
    Push the current revision onto the queue as a diff record.

    The queue itself is persisted by self.flush().  The record holds the
    title, language, timestamp, the diff against the previously saved text
    and the revision type.  Afterwards the current text becomes the
    reference for the next diff.
    """
    # difflib doesn't like NoneType
    if self._text is None:
        self._text = ""

    if self.clean:
        cleaner = self.textcleaner
        self._text = cleaner.clean_all(self._text)

    # Fill the record key by key, in the same order as the dict literal
    # it replaces.
    record = {}
    record['title'] = smart_str(self._title)
    record['lang'] = self.lang
    record['timestamp'] = self._date
    changes = _diff_text(self._prev_text, self._text,
                         timeout=self.diff_timeout)
    record['text'] = smart_str(changes[0])
    record['type'] = self._type

    self.queue.append(record)
    self._prev_text = self._text
def save(self):
    """
    Accumulate per-day word-count statistics for the current revision.

    A missing text (None) is replaced by an empty string and, when
    self.clean is set, the text is run through self.textcleaner.clean_all().

    Unless the revision looks like a revert (at least 1000 matches of
    self.rwords and more than twice as many as the previous revision), the
    diff against the previous text is fed to self.pywc.parse_col() and the
    resulting counters are added to self.data[self._type][<YYYY/MM/DD>],
    together with an "edits" counter.  The day is derived from
    mwlib.ts2dt(self._date).

    In both cases the current text becomes the reference (self._prev_text)
    for the next call.
    """
    if self._text is None:
        # difflib doesn't like NoneType
        self._text = ""
    if self.clean:
        self._text = self.textcleaner.clean_all(self._text)

    text_words = len(self.rwords.findall(self._text))
    prev_words = len(self.rwords.findall(self._prev_text))

    if text_words < 1000 or text_words <= 2 * prev_words:
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        self.pywc.parse_col(diff)

        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if self._type not in self.data:
            self.data[self._type] = {}
        current = self.data[self._type]

        date = mwlib.ts2dt(self._date)
        date_str = date.strftime("%Y/%m/%d")
        stats = {"date": date_str,
                 "qmarks": self.pywc._qmarks,
                 "unique": len(self.pywc._unique),
                 "dic": self.pywc._dic,
                 "sixltr": self.pywc._sixltr,
                 "total": self.pywc._total}
        for x in self.pywc.categories:
            stats[x] = self.pywc._results[x]

        if date_str not in current:
            # First edit seen on this day: the fresh stats become the entry.
            current[date_str] = stats
            current[date_str]["edits"] = 1
        else:
            # Add every counter except the date label to the day's entry.
            for elem in stats:
                if elem != "date":
                    current[date_str][elem] += stats[elem]
            current[date_str]["edits"] += 1
    else:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and logs the same record.
        logging.warning("Revert detected: skipping... (%s)", self._date)

    self._prev_text = self._text