Exemplo n.º 1
0
    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()

        The current text is normalised first: ``None`` becomes ``""`` and,
        when ``self.clean`` is set, the text is passed through
        ``self.textcleaner.clean_all``.  The diff against ``self._prev_text``
        is then appended to ``self.queue`` as a dict with the keys
        ``title``, ``lang``, ``timestamp``, ``text`` and ``type``.

        A revision whose word count (matches of ``self.rwords``) is at least
        1000 and more than twice the word count of the previous text is
        treated as a revert: it is logged and not queued.  In both cases the
        current text becomes ``self._prev_text`` for the next call.
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            # _diff_text returns a sequence; only its first item is stored
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {
                'title': smart_str(self._title),
                'lang': self.lang,
                'timestamp': self._date,
                'text': smart_str(diff),
                'type': self._type
            }
            self.queue.append(page)
        else:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and behaves identically.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        # Updated even for skipped revisions, so the next diff is computed
        # against the most recent text seen.
        self._prev_text = self._text
Exemplo n.º 2
0
def get_revisions(title, csv_writer, lang, textcleaner,
                  startid=None, prev_text=""):
    """
    Fetch the revisions of a page and write one CSV row per revision.

    Revisions are requested from the ``lang`` Wikipedia API in ascending
    order (``rvdir=newer``), at most 500 per request (``rvlimit``).  Each
    revision's text is cleaned with ``textcleaner.clean_all`` and diffed
    against the previous cleaned text.  While the response carries a
    ``query-continue`` entry, the next batch is requested starting at the
    revision id it names.

    :param title: page title, sent as the ``titles`` query parameter
    :param csv_writer: object whose ``writerow`` is called with
        ``[timestamp, lang, title, "", diff]`` for every revision
    :param lang: language code used as the Wikipedia sub-domain
    :param textcleaner: object providing ``clean_all(text)``
    :param startid: revision id to start from (``rvstartid``); ``None``
        sends no ``rvstartid``
    :param prev_text: text the first fetched revision is diffed against
    :raises KeyError: if the response lacks ``query``/``pages``, or a page
        or revision lacks ``revisions``/``*``/``timestamp``
    """
    api_base = 'http://%s.wikipedia.org/w/api.php' % lang
    # Iterating (instead of recursing once per batch) keeps textcleaner and
    # prev_text in scope for every batch and avoids growing the call stack.
    while True:
        options = {
            'action': 'query',
            'prop': 'revisions',
            'rvlimit': 500,
            'titles': title,
            'rvprop': 'ids|timestamp|content',
            'rvdir': 'newer',
            'format': 'json'
        }
        if startid is not None:
            options['rvstartid'] = startid
        url = api_base + '?' + urllib.urlencode(options)
        logging.info(url)
        response = urllib.urlopen(url)
        try:
            result = simplejson.load(response)
        finally:
            # always release the connection, even if decoding fails
            response.close()
        pages = result["query"]["pages"]
        for page in pages:
            revs = pages[page]["revisions"]
            for r in revs:
                # the "*" key holds the revision content
                text_cleaned = textcleaner.clean_all(r["*"])
                text = smart_str(_diff_text(prev_text, text_cleaned)[0])
                csv_writer.writerow([r["timestamp"], lang, smart_str(title),
                                    "", text])
                prev_text = text_cleaned
        # Only the continuation lookup is guarded, so a KeyError raised
        # while processing revisions is never mistaken for "no more data".
        try:
            startid = result['query-continue']['revisions']['rvstartid']
        except KeyError:
            logging.info("Finished!")
            return
        logging.info("Continue to %d", startid)
Exemplo n.º 3
0
    def save(self):
        """
        Saves data to the queue.
        The queue is stored using self.flush()

        The current text is normalised first: ``None`` becomes ``""`` and,
        when ``self.clean`` is set, the text is passed through
        ``self.textcleaner.clean_all``.  The diff against ``self._prev_text``
        is then appended to ``self.queue`` as a dict with the keys
        ``title``, ``lang``, ``timestamp``, ``text``, ``user`` and ``type``.

        A revision whose word count (matches of ``self.rwords``) is at least
        1000 and more than twice the word count of the previous text is
        treated as a revert: it is logged and not queued.  In both cases the
        current text becomes ``self._prev_text`` for the next call.
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)

        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            # _diff_text returns a sequence; only its first item is stored
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            page = {'title': smart_str(self._title),
                    'lang': self.lang,
                    'timestamp': self._date,
                    'text': smart_str(diff),
                    'user': smart_str(self._sender),
                    'type': self._type}
            self.queue.append(page)
        else:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and behaves identically.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        # Updated even for skipped revisions, so the next diff is computed
        # against the most recent text seen.
        self._prev_text = self._text
Exemplo n.º 4
0
def get_revisions(title, csv_writer, lang, startid=None, prev_text=""):
    """
    Fetch the revisions of a page and write one CSV row per revision.

    Revisions are requested from the ``lang`` Wikipedia API in ascending
    order (``rvdir=newer``), at most 500 per request (``rvlimit``).  Each
    revision's raw text is diffed against the previous revision's text.
    While the response carries a ``query-continue`` entry, the next batch
    is requested starting at the revision id it names.

    :param title: page title, sent as the ``titles`` query parameter
    :param csv_writer: object whose ``writerow`` is called with
        ``[timestamp, lang, title, "", diff]`` for every revision
    :param lang: language code used as the Wikipedia sub-domain
    :param startid: revision id to start from (``rvstartid``); ``None``
        sends no ``rvstartid``
    :param prev_text: text the first fetched revision is diffed against
    :raises KeyError: if the response lacks ``query``/``pages``, or a page
        or revision lacks ``revisions``/``*``/``timestamp``
    """
    api_base = "http://%s.wikipedia.org/w/api.php" % lang
    # Iterating (instead of recursing once per batch) avoids growing the
    # call stack on pages with many revisions.
    while True:
        options = {
            "action": "query",
            "prop": "revisions",
            "rvlimit": 500,
            "titles": title,
            "rvprop": "ids|timestamp|content",
            "rvdir": "newer",
            "format": "json",
        }
        if startid is not None:
            options["rvstartid"] = startid
        url = api_base + "?" + urllib.urlencode(options)
        logging.info(url)
        response = urllib.urlopen(url)
        try:
            result = simplejson.load(response)
        finally:
            # always release the connection, even if decoding fails
            response.close()
        pages = result["query"]["pages"]
        for page in pages:
            revs = pages[page]["revisions"]
            for r in revs:
                # the "*" key holds the revision content
                text = smart_str(_diff_text(prev_text, r["*"])[0])
                csv_writer.writerow(
                    [r["timestamp"], lang, smart_str(title), "", text]
                )
                prev_text = r["*"]
        # Only the continuation lookup is guarded, so a KeyError raised
        # while processing revisions is never mistaken for "no more data".
        try:
            startid = result["query-continue"]["revisions"]["rvstartid"]
        except KeyError:
            logging.info("Finished!")
            return
        logging.info("Continue to %d", startid)
Exemplo n.º 5
0
    def save(self):
        """
        Accumulate per-day statistics for the current revision's diff.

        Does nothing when ``self._skip_revision`` is set.  Otherwise the
        current text is normalised (``None`` becomes ``""``; cleaned with
        ``self.textcleaner.clean_all`` when ``self.clean`` is set), diffed
        against ``self._prev_text`` and the diff is handed to
        ``self.pywc.parse_col``.  The counters then read from ``self.pywc``
        are stored in ``self.data[self._type][date]``: the first revision of
        a day creates the entry with ``edits == 1``; later ones are added to
        it.  When ``self.pywc.detailed`` is set and the revision type equals
        ``self.detailed_ns``, per-keyword totals and per-page / per-user
        counters are also accumulated in ``self.detailed_data[date]``.

        A revision whose word count (matches of ``self.rwords``) is at least
        1000 and more than twice the word count of the previous text is
        treated as a revert: it is logged and not counted.  Unless skipped,
        the current text becomes ``self._prev_text`` for the next call.
        """
        if self._skip_revision:
            return
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            self.pywc.parse_col(diff)
            # "in" replaces the deprecated dict.has_key()
            if self._type not in self.data:
                self.data[self._type] = {}
            current = self.data[self._type]
            date_str = self._date.strftime("%Y/%m/%d")
            tmp = {"date": date_str,
                   "qmarks": self.pywc._qmarks,
                   "unique": len(self.pywc._unique),
                   "dic": self.pywc._dic,
                   "sixltr": self.pywc._sixltr,
                   "total": self.pywc._total}
            for x in self.pywc.categories:
                tmp[x] = self.pywc._results[x]

            if date_str not in current:
                current[date_str] = tmp
                current[date_str]["edits"] = 1
            else:
                # sum every counter; "date" is the label, not a counter
                for elem in tmp:
                    if elem != "date":
                        current[date_str][elem] += tmp[elem]
                current[date_str]["edits"] += 1

            if self.pywc.detailed and self._type == self.detailed_ns:
                if date_str not in self.detailed_data:
                    self.detailed_data[date_str] = defaultdict(dict)
                for keyword in self.pywc._detailed_data:
                    occ = self.pywc._detailed_data[keyword]
                    stats = self.detailed_data[date_str][keyword]
                    if not stats:
                        # first occurrence of this keyword on this day
                        stats = {}
                        stats["total"] = 0
                        stats["pages"] = Counter()
                        stats["users"] = Counter()
                    stats["total"] += occ
                    stats["pages"][self._title] += occ
                    stats["users"][self._sender] += occ
                    self.detailed_data[date_str][keyword] = stats
        else:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and behaves identically.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text
Exemplo n.º 6
0
 def save(self):
     """
     Append the diff of the current revision to the queue.

     ``None`` text is replaced by an empty string and, when ``self.clean``
     is set, the text is run through ``self.textcleaner.clean_all``.  The
     queued entry carries the title, language, timestamp, diff text and
     type; afterwards the current text becomes the previous text.
     The queue is stored using self.flush()
     """
     # difflib doesn't like NoneType
     if self._text is None:
         self._text = ""
     if self.clean:
         self._text = self.textcleaner.clean_all(self._text)

     # Fill the entry key by key, in the same order as the fields are read.
     entry = {}
     entry['title'] = smart_str(self._title)
     entry['lang'] = self.lang
     entry['timestamp'] = self._date
     delta = _diff_text(self._prev_text,
                        self._text,
                        timeout=self.diff_timeout)[0]
     entry['text'] = smart_str(delta)
     entry['type'] = self._type

     self.queue.append(entry)
     self._prev_text = self._text
Exemplo n.º 7
0
    def save(self):
        """
        Accumulate per-day statistics for the current revision's diff.

        The current text is normalised (``None`` becomes ``""``; cleaned
        with ``self.textcleaner.clean_all`` when ``self.clean`` is set),
        diffed against ``self._prev_text`` and the diff is handed to
        ``self.pywc.parse_col``.  The counters then read from ``self.pywc``
        are stored in ``self.data[self._type][date]``, where ``date`` is
        ``mwlib.ts2dt(self._date)`` formatted as ``%Y/%m/%d``: the first
        revision of a day creates the entry with ``edits == 1``; later ones
        are added to it.

        A revision whose word count (matches of ``self.rwords``) is at least
        1000 and more than twice the word count of the previous text is
        treated as a revert: it is logged and not counted.  In both cases
        the current text becomes ``self._prev_text`` for the next call.
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            self.pywc.parse_col(diff)
            # "in" replaces the deprecated dict.has_key()
            if self._type not in self.data:
                self.data[self._type] = {}
            current = self.data[self._type]
            date = mwlib.ts2dt(self._date)
            date_str = date.strftime("%Y/%m/%d")
            tmp = {"date": date_str,
                   "qmarks": self.pywc._qmarks,
                   "unique": len(self.pywc._unique),
                   "dic": self.pywc._dic,
                   "sixltr": self.pywc._sixltr,
                   "total": self.pywc._total}
            for x in self.pywc.categories:
                tmp[x] = self.pywc._results[x]

            if date_str not in current:
                current[date_str] = tmp
                current[date_str]["edits"] = 1
            else:
                # sum every counter; "date" is the label, not a counter
                for elem in tmp:
                    if elem != "date":
                        current[date_str][elem] += tmp[elem]
                current[date_str]["edits"] += 1
        else:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and behaves identically.
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text