def process_timestamp(self, elem):
    if self._skip_revision:
        return
    revision_time = mwlib.ts2dt(elem.text)
    if ((self.detailed_end and revision_time > self.detailed_end) or
            (self.detailed_start and revision_time < self.detailed_start)):
        self._skip_revision = True
    else:
        self._date = revision_time
    del revision_time
def process_timestamp(self, elem):
    if self._skip_revision:
        return
    self._time = elem.text
    # Needed only because there are two id tags; we're interested in the
    # id child of contributor. Since timestamp comes before contributor,
    # this is a good place to reset self._id and self._username.
    self._id = None
    self._username = None
    revision_time = mwlib.ts2dt(elem.text)
    if self.time_end and revision_time > self.time_end:
        self._skip_revision = True
def process_timestamp(self, elem):
    if self._skip_revision:
        return
    revision_time = mwlib.ts2dt(elem.text)
    if ((self.time_end and revision_time > self.time_end) or
            (self.time_start and revision_time < self.time_start)):
        self._skip_revision = True
    else:
        self._time = revision_time
    del revision_time
    # Needed only because there are two id tags; we're interested in the
    # id child of contributor. Since timestamp comes before contributor,
    # this is a good place to reset self._id, self._username and self._ip.
    self.delattr(("_id", "_username", "_ip"))
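# All of the process_timestamp() variants above delegate parsing to
# mwlib.ts2dt, which is defined elsewhere in the project. The following is
# only a minimal sketch of that helper, under the assumption that dump
# timestamps use MediaWiki's ISO 8601 form, e.g. "2005-07-09T17:33:45Z".
from datetime import datetime

def ts2dt(timestamp):
    # Convert a MediaWiki XML-dump timestamp string into a naive datetime
    return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")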
def first_time(self):
    if self.data[7] == 0:
        api_base = 'http://%s.wikipedia.org/w/api.php' % self.lang
        options = {
            'action': 'query',
            'list': 'usercontribs',
            'ucuser': smart_str(self.user),
            'ucdir': 'newer',
            'uclimit': 1,
            'format': 'json'
        }
        url = api_base + '?' + urllib.urlencode(options)
        result = simplejson.load(urllib.urlopen(url))
        try:
            dtime = mwlib.ts2dt(
                result["query"]["usercontribs"][0]["timestamp"])
            self.data[7] = int(time.mktime(dtime.timetuple()))
        except (IndexError, KeyError):
            logging.warn("Error while fetching firstedit %s", url)
            self.data[7] = self.current_time
            self.deleted = True
    return datetime.fromtimestamp(self.data[7])
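# Illustrative standalone version of the lookup first_time() performs; the
# function name is hypothetical and only urllib/simplejson are assumed, as
# above. For lang="en" and user="Example" the generated URL looks like
#   http://en.wikipedia.org/w/api.php?action=query&list=usercontribs&
#   ucuser=Example&ucdir=newer&uclimit=1&format=json
# and the API answers with JSON shaped roughly like
#   {"query": {"usercontribs": [{"timestamp": "2004-03-22T18:05:11Z", ...}]}}
# so an empty contribution list triggers the IndexError branch above.
import urllib
import simplejson

def first_edit_timestamp(lang, user):
    # Ask the MediaWiki API for the user's oldest contribution
    url = 'http://%s.wikipedia.org/w/api.php?%s' % (
        lang, urllib.urlencode({'action': 'query', 'list': 'usercontribs',
                                'ucuser': user, 'ucdir': 'newer',
                                'uclimit': 1, 'format': 'json'}))
    result = simplejson.load(urllib.urlopen(url))
    return result["query"]["usercontribs"][0]["timestamp"]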
def save(self):
    if self._text is None:  # difflib doesn't like NoneType
        self._text = ""
    if self.clean:
        self._text = self.textcleaner.clean_all(self._text)
    text_words = len(self.rwords.findall(self._text))
    prev_words = len(self.rwords.findall(self._prev_text))
    if text_words < 1000 or text_words <= 2 * prev_words:
        diff = _diff_text(self._prev_text, self._text,
                          timeout=self.diff_timeout)[0]
        self.pywc.parse_col(diff)
        if self._type not in self.data:
            self.data[self._type] = {}
        current = self.data[self._type]
        date = mwlib.ts2dt(self._date)
        date_str = date.strftime("%Y/%m/%d")
        tmp = {"date": date_str,
               "qmarks": self.pywc._qmarks,
               "unique": len(self.pywc._unique),
               "dic": self.pywc._dic,
               "sixltr": self.pywc._sixltr,
               "total": self.pywc._total}
        for x in self.pywc.categories:
            tmp[x] = self.pywc._results[x]
        if date_str not in current:
            current[date_str] = tmp
            current[date_str]["edits"] = 1
        else:
            for elem in tmp:
                if elem != "date":
                    current[date_str][elem] += tmp[elem]
            current[date_str]["edits"] += 1
        del tmp
    else:
        logging.warn("Revert detected: skipping... (%s)", self._date)
    self._prev_text = self._text
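# _diff_text is defined elsewhere in this project (the "difflib doesn't
# like NoneType" comment above suggests it is difflib-based). This is only
# a sketch of the contract save() relies on -- a tuple whose first element
# is the text added by the new revision -- not the project's actual
# implementation; the real one also honours timeout, ignored here.
import difflib

def _diff_text(prev_text, text, timeout=None):
    # Collect the words present in the new revision but not in the old one
    diff = difflib.ndiff(prev_text.split(), text.split())
    added = " ".join(tok[2:] for tok in diff if tok.startswith("+ "))
    return (added,)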
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file output_file",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output")
    p.add_option('-i', '--ignorecols', action="store", dest="ignorecols",
                 help="Column numbers of the source file to ignore "
                      "(comma separated and starting from 0)")
    p.add_option('-I', '--id', action="store", dest="id_col", type="int",
                 help="Id column number (starting from 0)", default=0)
    p.add_option('-o', '--onlycols', action="store", dest="onlycols",
                 help="Select only this set of columns "
                      "(comma separated and starting from 0)")
    p.add_option('-p', '--percentages', action="store_true", dest="perc",
                 help="Use percentages instead of absolute values")
    p.add_option('-w', '--window', action="store", dest="window", type=int,
                 help="Collapse days")
    p.add_option('-S', '--sliding', action="store", dest="smooth", type=int,
                 help="Sliding window")
    p.add_option('--exclude-less-than', action="store",
                 dest="excludelessthan", type=int,
                 help="Exclude lines with totals (or dic if the -d option "
                      "is used) smaller than this parameter")
    p.add_option('--exclude-more-than', action="store",
                 dest="excludemorethan", type=int,
                 help="Exclude lines with totals (or dic if the -d option "
                      "is used) greater than this parameter")
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")
    p.add_option('-d', '--dic', action="store_true", dest="dic",
                 default=False,
                 help="Calculate percentage over dic column instead of total")
    opts, files = p.parse_args()

    if len(files) != 2:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    csv_reader = csv.reader(open(files[0]), delimiter="\t")

    onlycols = None
    ignorecols = None
    if opts.onlycols:
        onlycols = [int(x) for x in opts.onlycols.split(",")]
    if opts.ignorecols:
        ignorecols = [int(x) for x in opts.ignorecols.split(",")]

    # content holds the whole csv file
    content = [row for row in csv_reader]

    # CSV header, restricted to the interesting columns
    header = [x for x in _gen_data(content[0], opts.id_col,
                                   ignorecols, onlycols)]

    # Build a matrix (list) with the occurrences of every category.
    # Don't count the id, total and text columns or the ignored ones;
    # if onlycols is set, consider only those columns.
    mat = []
    timestamps = []
    totals = []
    tot_index = -2
    if opts.dic:
        tot_index = -4
    for line in content[1:]:
        # Keep only lines whose total (or dic if -d is used) is within
        # the --exclude-more-than / --exclude-less-than bounds
        if opts.excludemorethan and \
           float(line[tot_index]) > opts.excludemorethan:
            continue
        if opts.excludelessthan and \
           float(line[tot_index]) < opts.excludelessthan:
            continue
        mat.append([x for x in _gen_data(line, opts.id_col,
                                         ignorecols, onlycols)])
        totals.append(float(line[tot_index]))
        timestamps.append(ts2dt(line[opts.id_col]))
    del content

    mat = np.array(mat, dtype=np.float).transpose()
    logging.info("Input file read. Ready to plot")
    pdf_pag = PdfPages(files[1])

    with Timr("Plotting"):
        for i, series in enumerate(mat):
            logging.info("Plotting page %d", i + 1)
            # Old approach -- don't plot zeros and skip zero revisions:
            #ser = [x for x in series if x != 0]
            #time = [x for k, x in enumerate(timestamps) if series[k] != 0]
            #tot = [x for k, x in enumerate(totals) if series[k] != 0]
            # Keep only values in the [opts.start, opts.end] interval
            ser = [x for k, x in enumerate(series)
                   if (not opts.start or timestamps[k] >= opts.start) and
                      (not opts.end or timestamps[k] <= opts.end)]
            time = [x for x in timestamps
                    if (not opts.start or x >= opts.start) and
                       (not opts.end or x <= opts.end)]
            tot = [x for k, x in enumerate(totals)
                   if (not opts.start or timestamps[k] >= opts.start) and
                      (not opts.end or timestamps[k] <= opts.end)]
            if opts.smooth and len(time) and len(ser) and len(tot):
                time, ser, tot = smooth_values(time, ser, tot, opts.smooth)
            if opts.window and len(time) and len(ser) and len(tot):
                time, ser, tot = collapse_values(time, ser, tot, opts.window)
            mean = float(sum(series)) / len(series)
            # rel_mean is the mean over the period [opts.start, opts.end]
            rel_mean = float(sum(ser)) / len(ser)
            if opts.perc:
                try:
                    mean = float(sum(series)) / sum(totals)
                    rel_mean = float(sum(ser)) / sum(tot)
                except ZeroDivisionError:
                    mean = 0
                    rel_mean = 0
                # Calculate percentages
                ser = [calc_perc(x, tot[k]) for k, x in enumerate(ser)]
            first_time = time[0].date()
            last_time = time[-1].date()

            plt.clf()
            plt.subplots_adjust(bottom=0.25)
            plt.xticks(rotation=90)
            fig = plt.gcf()
            fig.set_size_inches(11.7, 8.3)
            axis = plt.gca()
            axis.xaxis.set_major_formatter(md.DateFormatter('%Y-%m-%d'))
            axis.set_xlim(matplotlib.dates.date2num(first_time),
                          matplotlib.dates.date2num(last_time))
            # Set y-axis limit to 0-1? Left disabled for now.
            #axis.set_ylim(0, 1)
            if last_time - first_time < timedelta(days=30):
                axis.xaxis.set_major_locator(md.DayLocator(interval=1))
                axis.xaxis.set_minor_locator(md.DayLocator(interval=1))
            else:
                axis.xaxis.set_minor_locator(md.MonthLocator(interval=1))
                #auto_loc = md.AutoDateLocator(minticks=8, maxticks=12,
                #                              interval_multiples=True)
                #auto_loc.intervald[md.MONTHLY] = [6]
                rule = md.rrulewrapper(md.MONTHLY, interval=4)
                auto_loc = md.RRuleLocator(rule)
                axis.xaxis.set_major_locator(auto_loc)
            axis.tick_params(labelsize='x-small')
            plt.xlabel("Revisions Timestamp")
            if opts.perc:
                plt.ylabel("%")  # set after clf() so the label survives
            if len(time) and len(ser):
                if opts.window:
                    time = [t.date() for t in time]
                logging.info("Mean: %f", mean)
                logging.info("Relative Mean: %f", rel_mean)
                if header[i] in ("negemo", "posemo"):
                    print ser  # ONLY FOR TESTING, FIXME WHEN FINISHED
                plt.plot(matplotlib.dates.date2num(time), ser, "b.-")
                plt.axhline(y=mean, color="r")
                plt.title("%s - Mean: %.5f - Relative mean: %.5f" %
                          (header[i], round(mean, 5), round(rel_mean, 5)))
                pdf_pag.savefig()
    pdf_pag.close()
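# Example invocation (the script and file names are hypothetical): plot
# sliding-window percentages of each category for revisions in a date range.
#
#   python pywc_plot.py -v -p -S 30 -s 20050101 -e 20101231 input.tsv out.pdf
#
# input.tsv is the tab-separated matrix produced upstream (id/timestamp
# column first, totals near the end); out.pdf gets one page per category.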
def process_timestamp(self, elem):
    if self._skip:
        return
    self._date = ts2dt(elem.text)