示例#1
0
 def process_timestamp(self, elem):
     """Record the revision date, or flag the revision as out of range.

     Nothing happens when the revision is already marked to be skipped.
     Otherwise ``elem.text`` is converted with ``mwlib.ts2dt``. A value
     later than ``detailed_end`` or earlier than ``detailed_start`` (a
     falsy bound is ignored) sets ``_skip_revision``; any other value is
     stored in ``_date``.
     """
     if self._skip_revision:
         return
     stamp = mwlib.ts2dt(elem.text)
     if self.detailed_end and stamp > self.detailed_end:
         # Newer than the requested window.
         self._skip_revision = True
     elif self.detailed_start and stamp < self.detailed_start:
         # Older than the requested window.
         self._skip_revision = True
     else:
         self._date = stamp
示例#2
0
 def process_timestamp(self, elem):
     """Keep the revision timestamp if it lies inside the detailed window.

     Revisions already flagged through ``_skip_revision`` are left alone.
     For the others the text of ``elem`` is parsed with ``mwlib.ts2dt``
     and compared with ``detailed_end`` / ``detailed_start`` (either may
     be falsy, meaning "no bound"). Out-of-window revisions get
     ``_skip_revision`` set; in-window ones have ``_date`` updated.
     """
     if not self._skip_revision:
         parsed = mwlib.ts2dt(elem.text)
         after_end = self.detailed_end and parsed > self.detailed_end
         if after_end or (self.detailed_start
                          and parsed < self.detailed_start):
             self._skip_revision = True
         else:
             self._date = parsed
    def process_timestamp(self, elem):
        """Store the raw revision timestamp and reset contributor fields.

        Does nothing for a revision already marked to be skipped.
        Otherwise ``_time`` receives the unparsed text of ``elem``, ``_id``
        and ``_username`` are reset to ``None``, and the revision is
        flagged through ``_skip_revision`` when its parsed timestamp is
        later than a truthy ``time_end``.
        """
        if self._skip_revision:
            return

        raw_timestamp = elem.text
        self._time = raw_timestamp

        # There are two id tags and only the one nested in contributor is
        # of interest. The timestamp comes before contributor, so this is
        # a good place to clear the values left by the previous revision.
        self._id = self._username = None

        revision_dt = mwlib.ts2dt(raw_timestamp)
        upper_bound = self.time_end
        if upper_bound and revision_dt > upper_bound:
            self._skip_revision = True
示例#4
0
    def process_timestamp(self, elem):
        """Remember the revision timestamp text and apply the end bound.

        Skipped revisions (``_skip_revision`` truthy) are ignored. For any
        other revision the text of ``elem`` is saved in ``_time``, the
        contributor fields ``_id`` and ``_username`` are cleared, and
        ``_skip_revision`` is set when the timestamp parsed by
        ``mwlib.ts2dt`` is later than a truthy ``time_end``.
        """
        if not self._skip_revision:
            self._time = elem.text

            # Only the id tag that is a child of contributor matters (the
            # document holds two id tags). Since timestamp precedes
            # contributor, clear the previous contributor data here.
            self._id = None
            self._username = None

            parsed = mwlib.ts2dt(elem.text)
            if self.time_end and parsed > self.time_end:
                self._skip_revision = True
示例#5
0
    def process_timestamp(self, elem):
        """Store the parsed revision time or flag the revision as skipped.

        Revisions already marked through ``_skip_revision`` are ignored.
        Otherwise the text of ``elem`` is parsed with ``mwlib.ts2dt``. A
        value later than ``time_end`` or earlier than ``time_start`` (falsy
        bounds are ignored) sets ``_skip_revision``; any other value is
        stored in ``_time``. In both cases ``self.delattr`` is then called
        for the contributor attributes.
        """
        if self._skip_revision:
            return

        stamp = mwlib.ts2dt(elem.text)
        if self.time_end and stamp > self.time_end:
            self._skip_revision = True
        elif self.time_start and stamp < self.time_start:
            self._skip_revision = True
        else:
            self._time = stamp

        # There are two id tags and only the id child of contributor is of
        # interest. The timestamp comes before contributor, so this is a
        # good place to clear self._id, self._username and self._ip.
        self.delattr(("_id", "_username", "_ip"))
示例#6
0
    def process_timestamp(self, elem):
        """Apply the time window to a revision and reset contributor data.

        Returns at once for a revision already flagged in
        ``_skip_revision``. Otherwise the timestamp parsed from
        ``elem.text`` is checked against ``time_end`` and ``time_start``
        (a falsy bound means no limit). The revision is either flagged as
        skipped or has its parsed time saved in ``_time``. Finally
        ``self.delattr`` is invoked for ``_id``, ``_username`` and ``_ip``.
        """
        if self._skip_revision:
            return

        parsed = mwlib.ts2dt(elem.text)
        after_end = self.time_end and parsed > self.time_end
        if after_end or (self.time_start and parsed < self.time_start):
            self._skip_revision = True
        else:
            self._time = parsed

        # Needed because of the two id tags: the one of interest is the id
        # child of contributor. The timestamp tag precedes contributor, so
        # self._id, self._username and self._ip are cleared at this point.
        stale_attributes = ("_id", "_username", "_ip")
        self.delattr(stale_attributes)
 def first_time(self):
     """Return the time of the user's first contribution as a datetime.

     ``self.data[7]`` holds the value as a Unix timestamp, with ``0``
     meaning "not fetched yet". In that case the oldest contribution of
     ``self.user`` is requested from the ``self.lang`` Wikipedia API and
     cached in ``self.data[7]``. When the reply carries no contribution,
     ``self.current_time`` is stored instead and ``self.deleted`` is set.
     """
     from contextlib import closing

     if self.data[7] == 0:
         api_base = 'http://%s.wikipedia.org/w/api.php' % self.lang
         options = {
             'action': 'query',
             'list': 'usercontribs',
             'ucuser': smart_str(self.user),
             'ucdir': 'newer',  # oldest contribution first
             'uclimit': 1,
             'format': 'json'
         }
         url = api_base + '?' + urllib.urlencode(options)
         # closing() releases the HTTP response even when decoding the
         # reply raises; it used to be left open.
         with closing(urllib.urlopen(url)) as response:
             result = simplejson.load(response)
         try:
             dtime = mwlib.ts2dt(
                 result["query"]["usercontribs"][0]["timestamp"])
             self.data[7] = int(time.mktime(dtime.timetuple()))
         except (IndexError, KeyError):
             # Reply without the expected keys or with no contribution.
             logging.warning("Error while fetching firstedit %s", url)
             self.data[7] = self.current_time
             self.deleted = True
     return datetime.fromtimestamp(self.data[7])
示例#8
0
 def first_time(self):
     """Return the datetime of the first edit made by ``self.user``.

     The Unix timestamp is cached in ``self.data[7]``; a cached ``0``
     triggers one ``usercontribs`` query against the ``self.lang``
     Wikipedia API. If the reply lacks the expected keys or lists no
     contribution, ``self.current_time`` is cached and ``self.deleted``
     is set to ``True``.
     """
     from contextlib import closing

     if self.data[7] == 0:
         api_base = 'http://%s.wikipedia.org/w/api.php' % self.lang
         options = {
             'action': 'query',
             'list': 'usercontribs',
             'ucuser': smart_str(self.user),
             'ucdir': 'newer',  # oldest contribution first
             'uclimit': 1,
             'format': 'json'
         }
         url = api_base + '?' + urllib.urlencode(options)
         # The response used to be left open; closing() releases it even
         # when the JSON decoding fails.
         with closing(urllib.urlopen(url)) as response:
             result = simplejson.load(response)
         try:
             dtime = mwlib.ts2dt(
                 result["query"]["usercontribs"][0]["timestamp"])
             self.data[7] = int(time.mktime(dtime.timetuple()))
         except (IndexError, KeyError):
             # logging.warn is a deprecated alias of logging.warning.
             logging.warning("Error while fetching firstedit %s", url)
             self.data[7] = self.current_time
             self.deleted = True
     return datetime.fromtimestamp(self.data[7])
    def save(self):
        """Add the word statistics of the current revision to ``self.data``.

        The text added since the previous revision (first item returned
        by ``_diff_text``) is analysed with ``self.pywc`` and the counters
        are accumulated in ``self.data[self._type][date]``, where ``date``
        is the revision date formatted as ``YYYY/MM/DD``. Each entry also
        counts its revisions under ``"edits"``.

        A revision with at least 1000 words and more than twice the words
        of the previous one is treated as a revert and is not counted.
        In every case the current text becomes ``self._prev_text``.
        """
        if self._text is None:  # difflib doesn't like NoneType
            self._text = ""
        if self.clean:
            self._text = self.textcleaner.clean_all(self._text)
        text_words = len(self.rwords.findall(self._text))
        prev_words = len(self.rwords.findall(self._prev_text))
        if text_words < 1000 or text_words <= 2 * prev_words:
            diff = _diff_text(self._prev_text,
                              self._text,
                              timeout=self.diff_timeout)[0]
            self.pywc.parse_col(diff)
            # "in" replaces dict.has_key(), which is deprecated in
            # Python 2 and gone in Python 3.
            if self._type not in self.data:
                self.data[self._type] = {}
            current = self.data[self._type]
            date_str = mwlib.ts2dt(self._date).strftime("%Y/%m/%d")
            tmp = {"date": date_str,
                   "qmarks": self.pywc._qmarks,
                   "unique": len(self.pywc._unique),
                   "dic": self.pywc._dic,
                   "sixltr": self.pywc._sixltr,
                   "total": self.pywc._total}
            for category in self.pywc.categories:
                tmp[category] = self.pywc._results[category]

            if date_str not in current:
                # First revision seen for this day.
                tmp["edits"] = 1
                current[date_str] = tmp
            else:
                day = current[date_str]
                for key, value in tmp.items():
                    if key != "date":
                        day[key] += value
                day["edits"] += 1
        else:
            logging.warning("Revert detected: skipping... (%s)", self._date)
        self._prev_text = self._text
示例#10
0
def main():
    """Plot the selected columns of a tab-separated file into a PDF.

    Command line: ``input_file output_file`` plus the options declared
    below. The first row of the input is the header. Every other row
    supplies a timestamp (column ``--id``), the values selected through
    ``_gen_data`` and a total, read from the second-to-last column or from
    the fourth-to-last one with ``--dic``. One page per selected column is
    saved to ``output_file``; a column is skipped, with a warning, when the
    requested date window holds no row.
    """
    import optparse
    from sonet.lib import SonetOption
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file output_file",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output")
    p.add_option('-i', '--ignorecols', action="store", dest="ignorecols",
                 help="Columns numbers of the source file to ignore"
                      "(comma separated and starting from 0)")
    p.add_option('-I', '--id', action="store", dest="id_col", type="int",
                 help="Id column number (starting from 0)", default=0)
    p.add_option('-o', '--onlycols', action="store", dest="onlycols",
                 help="Select only this set of columns"
                      "(comma separated and starting from 0)")
    p.add_option('-p', '--percentages', action="store_true", dest="perc",
                 help="Use percentages instead of absolute value")
    p.add_option('-w', '--window', action="store", dest="window", type=int,
                 help="Collapse days")
    p.add_option('-S', '--sliding', action="store", dest="smooth", type=int,
                 help="Sliding window")
    p.add_option('--exclude-less-than', action="store",
                 dest="excludelessthan", type=int,
                 help="Exclude lines with totals (or dic if -d option is "
                      "used) smaller than this parameter")
    p.add_option('--exclude-more-than', action="store",
                 dest="excludemorethan", type=int,
                 help="Exclude lines with totals (or dic if -d option is "
                      "used) greater than this parameter")
    p.add_option('-s', '--start', action="store",
                 dest='start', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store",
                 dest='end', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None,
                 help="Look for revisions until this date")
    p.add_option('-d', '--dic', action="store_true", dest="dic", default=False,
                 help="Calculate percentage over dic column instead of total")
    opts, files = p.parse_args()

    if len(files) != 2:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    onlycols = None
    ignorecols = None
    if opts.onlycols:
        onlycols = [int(x) for x in opts.onlycols.split(",")]
    if opts.ignorecols:
        ignorecols = [int(x) for x in opts.ignorecols.split(",")]

    # content contains all the csv file; the with-block closes the input
    # as soon as it has been read (it used to stay open).
    with open(files[0]) as csv_file:
        content = list(csv.reader(csv_file, delimiter="\t"))

    # CSV header, only of interesting columns
    header = list(_gen_data(content[0], opts.id_col, ignorecols, onlycols))

    # Creates a matrix (list) with the values of every category. Don't
    # count id, total, text, ignore columns. If onlycols is set consider
    # only them.
    mat = []
    timestamps = []
    totals = []
    tot_index = -4 if opts.dic else -2

    for line in content[1:]:
        total = float(line[tot_index])
        # Keep only rows whose total (or dic with -d) lies within the
        # requested limits. "is not None" lets a limit of 0 take effect.
        if (opts.excludemorethan is not None
                and total > opts.excludemorethan):
            continue
        if (opts.excludelessthan is not None
                and total < opts.excludelessthan):
            continue

        mat.append(list(_gen_data(line, opts.id_col, ignorecols, onlycols)))
        totals.append(total)
        timestamps.append(ts2dt(line[opts.id_col]))
    del content

    # Builtin float is what the removed np.float alias used to stand for.
    mat = np.array(mat, dtype=float).transpose()
    logging.info("Input file read. Ready to plot")

    # The [start, end] window is the same for every series: pick the rows
    # once instead of filtering three lists on every page.
    in_range = [k for k, stamp in enumerate(timestamps)
                if (not opts.start or stamp >= opts.start)
                and (not opts.end or stamp <= opts.end)]

    pdf_pag = PdfPages(files[1])
    try:
        with Timr("Plotting"):
            for i, series in enumerate(mat):
                logging.info("Plotting page %d", i + 1)

                ser = [series[k] for k in in_range]
                times = [timestamps[k] for k in in_range]
                tot = [totals[k] for k in in_range]

                if opts.smooth and len(times) and len(ser) and len(tot):
                    times, ser, tot = smooth_values(times, ser, tot,
                                                    opts.smooth)

                if opts.window and len(times) and len(ser) and len(tot):
                    times, ser, tot = collapse_values(times, ser, tot,
                                                      opts.window)

                # An empty window used to crash below (division by zero,
                # then times[0]) before the emptiness check was reached.
                if not len(times) or not len(ser):
                    logging.warning("No data to plot for %s: page skipped",
                                    header[i])
                    continue

                mean = float(sum(series)) / len(series)
                # rel_mean is the mean for the period [opts.start, opts.end]
                rel_mean = float(sum(ser)) / len(ser)

                if opts.perc:
                    try:
                        mean = float(sum(series)) / sum(totals)
                        rel_mean = float(sum(ser)) / sum(tot)
                    except ZeroDivisionError:
                        mean = 0
                        rel_mean = 0
                    # Calculate percentages
                    ser = [calc_perc(x, tot[k]) for k, x in enumerate(ser)]

                first_time = times[0].date()
                last_time = times[-1].date()
                plt.clf()
                plt.subplots_adjust(bottom=0.25)
                plt.xticks(rotation=90)
                fig = plt.gcf()
                fig.set_size_inches(11.7, 8.3)
                axis = plt.gca()
                axis.xaxis.set_major_formatter(md.DateFormatter('%Y-%m-%d'))
                axis.set_xlim(matplotlib.dates.date2num(first_time),
                              matplotlib.dates.date2num(last_time))
                if last_time - first_time < timedelta(days=30):
                    axis.xaxis.set_major_locator(md.DayLocator(interval=1))
                    axis.xaxis.set_minor_locator(md.DayLocator(interval=1))
                else:
                    axis.xaxis.set_minor_locator(md.MonthLocator(interval=1))
                    rule = md.rrulewrapper(md.MONTHLY, interval=4)
                    auto_loc = md.RRuleLocator(rule)
                    axis.xaxis.set_major_locator(auto_loc)
                axis.tick_params(labelsize='x-small')
                plt.xlabel("Revisions Timestamp")
                if opts.perc:
                    # Must come after clf(): a label set before the figure
                    # is cleared never reaches the saved page.
                    plt.ylabel("%")

                if opts.window:
                    times = [t.date() for t in times]
                logging.info("Mean: %f", mean)
                logging.info("Relative Mean: %f", rel_mean)
                if header[i] == "negemo" or header[i] == "posemo":
                    # Former testing-only print to stdout, now shown only
                    # with -v.
                    logging.debug("%s", ser)
                plt.plot(matplotlib.dates.date2num(times), ser, "b.-")
                plt.axhline(y=mean, color="r")
                plt.title("%s - Mean: %.5f - Relative mean: %.5f" %
                          (header[i], round(mean, 5), round(rel_mean, 5)))
                pdf_pag.savefig()
    finally:
        # Finalize the PDF even when a page fails to plot.
        pdf_pag.close()
示例#11
0
def main():
    """Render one PDF page per selected column of a tab-separated file.

    Expects ``input_file output_file`` on the command line, plus the
    options declared below. Row 0 of the input is the header. Each later
    row provides a timestamp (column ``--id``), the values returned by
    ``_gen_data`` and a total taken from the second-to-last column
    (fourth-to-last with ``--dic``). Columns with no row inside the
    requested date window are skipped with a warning.
    """
    import optparse
    from sonet.lib import SonetOption
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file output_file",
        option_class=SonetOption)
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output")
    p.add_option('-i',
                 '--ignorecols',
                 action="store",
                 dest="ignorecols",
                 help="Columns numbers of the source file to ignore"
                 "(comma separated and starting from 0)")
    p.add_option('-I',
                 '--id',
                 action="store",
                 dest="id_col",
                 type="int",
                 help="Id column number (starting from 0)",
                 default=0)
    p.add_option('-o',
                 '--onlycols',
                 action="store",
                 dest="onlycols",
                 help="Select only this set of columns"
                 "(comma separated and starting from 0)")
    p.add_option('-p',
                 '--percentages',
                 action="store_true",
                 dest="perc",
                 help="Use percentages instead of absolute value")
    p.add_option('-w',
                 '--window',
                 action="store",
                 dest="window",
                 type=int,
                 help="Collapse days")
    p.add_option('-S',
                 '--sliding',
                 action="store",
                 dest="smooth",
                 type=int,
                 help="Sliding window")
    p.add_option('--exclude-less-than',
                 action="store",
                 dest="excludelessthan",
                 type=int,
                 help="Exclude lines with totals (or dic if -d option is "
                 "used) smaller than this parameter")
    p.add_option('--exclude-more-than',
                 action="store",
                 dest="excludemorethan",
                 type=int,
                 help="Exclude lines with totals (or dic if -d option is "
                 "used) greater than this parameter")
    p.add_option('-s',
                 '--start',
                 action="store",
                 dest='start',
                 type="yyyymmdd",
                 metavar="YYYYMMDD",
                 default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e',
                 '--end',
                 action="store",
                 dest='end',
                 type="yyyymmdd",
                 metavar="YYYYMMDD",
                 default=None,
                 help="Look for revisions until this date")
    p.add_option('-d',
                 '--dic',
                 action="store_true",
                 dest="dic",
                 default=False,
                 help="Calculate percentage over dic column instead of total")
    opts, files = p.parse_args()

    if len(files) != 2:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    onlycols = None
    ignorecols = None
    if opts.onlycols:
        onlycols = [int(x) for x in opts.onlycols.split(",")]
    if opts.ignorecols:
        ignorecols = [int(x) for x in opts.ignorecols.split(",")]

    # content contains all the csv file. Reading inside a with-block
    # closes the input file, which was previously never closed.
    with open(files[0]) as csv_file:
        content = list(csv.reader(csv_file, delimiter="\t"))

    # CSV header, only of interesting columns
    header = list(_gen_data(content[0], opts.id_col, ignorecols, onlycols))

    # Creates a matrix (list) with the values of every category. Don't
    # count id, total, text, ignore columns. If onlycols is set consider
    # only them.
    mat = []
    timestamps = []
    totals = []
    tot_index = -4 if opts.dic else -2

    for line in content[1:]:
        total = float(line[tot_index])
        # Filter rows on their total (or dic with -d). Comparing with None
        # keeps an explicit limit of 0 from being ignored.
        if (opts.excludemorethan is not None
                and total > opts.excludemorethan):
            continue
        if (opts.excludelessthan is not None
                and total < opts.excludelessthan):
            continue

        mat.append(list(_gen_data(line, opts.id_col, ignorecols, onlycols)))
        totals.append(total)
        timestamps.append(ts2dt(line[opts.id_col]))
    del content

    # np.float was only an alias of the builtin float and has been
    # removed from NumPy.
    mat = np.array(mat, dtype=float).transpose()
    logging.info("Input file read. Ready to plot")

    # Row selection for the [start, end] window does not depend on the
    # series, so it is computed once for all pages.
    in_range = [k for k, stamp in enumerate(timestamps)
                if (not opts.start or stamp >= opts.start)
                and (not opts.end or stamp <= opts.end)]

    pdf_pag = PdfPages(files[1])
    try:
        with Timr("Plotting"):
            for i, series in enumerate(mat):
                logging.info("Plotting page %d", i + 1)

                ser = [series[k] for k in in_range]
                times = [timestamps[k] for k in in_range]
                tot = [totals[k] for k in in_range]

                if opts.smooth and len(times) and len(ser) and len(tot):
                    times, ser, tot = smooth_values(times, ser, tot,
                                                    opts.smooth)

                if opts.window and len(times) and len(ser) and len(tot):
                    times, ser, tot = collapse_values(times, ser, tot,
                                                      opts.window)

                # Without this guard an empty window raised
                # ZeroDivisionError / IndexError further down.
                if not len(times) or not len(ser):
                    logging.warning("No data to plot for %s: page skipped",
                                    header[i])
                    continue

                mean = float(sum(series)) / len(series)
                # rel_mean is the mean for the period [opts.start, opts.end]
                rel_mean = float(sum(ser)) / len(ser)

                if opts.perc:
                    try:
                        mean = float(sum(series)) / sum(totals)
                        rel_mean = float(sum(ser)) / sum(tot)
                    except ZeroDivisionError:
                        mean = 0
                        rel_mean = 0
                    # Calculate percentages
                    ser = [calc_perc(x, tot[k]) for k, x in enumerate(ser)]

                first_time = times[0].date()
                last_time = times[-1].date()
                plt.clf()
                plt.subplots_adjust(bottom=0.25)
                plt.xticks(rotation=90)
                fig = plt.gcf()
                fig.set_size_inches(11.7, 8.3)
                axis = plt.gca()
                axis.xaxis.set_major_formatter(md.DateFormatter('%Y-%m-%d'))
                axis.set_xlim(matplotlib.dates.date2num(first_time),
                              matplotlib.dates.date2num(last_time))
                if last_time - first_time < timedelta(days=30):
                    axis.xaxis.set_major_locator(md.DayLocator(interval=1))
                    axis.xaxis.set_minor_locator(md.DayLocator(interval=1))
                else:
                    axis.xaxis.set_minor_locator(md.MonthLocator(interval=1))
                    rule = md.rrulewrapper(md.MONTHLY, interval=4)
                    auto_loc = md.RRuleLocator(rule)
                    axis.xaxis.set_major_locator(auto_loc)
                axis.tick_params(labelsize='x-small')
                plt.xlabel("Revisions Timestamp")
                if opts.perc:
                    # Set after clf(); a label set earlier is discarded
                    # when the figure is cleared.
                    plt.ylabel("%")

                if opts.window:
                    times = [t.date() for t in times]
                logging.info("Mean: %f", mean)
                logging.info("Relative Mean: %f", rel_mean)
                if header[i] == "negemo" or header[i] == "posemo":
                    # Replaces the testing-only print; visible with -v.
                    logging.debug("%s", ser)
                plt.plot(matplotlib.dates.date2num(times), ser, "b.-")
                plt.axhline(y=mean, color="r")
                plt.title("%s - Mean: %.5f - Relative mean: %.5f" %
                          (header[i], round(mean, 5), round(rel_mean, 5)))
                pdf_pag.savefig()
    finally:
        # Close the PDF on every exit path, including plotting errors.
        pdf_pag.close()
示例#12
0
 def process_timestamp(self, elem):
     """Store the parsed timestamp in ``_date`` unless ``_skip`` is set.

     The text of ``elem`` is converted with ``ts2dt``.
     """
     if not self._skip:
         self._date = ts2dt(elem.text)
 def process_timestamp(self, elem):
     """Convert ``elem.text`` with ``ts2dt`` and keep it in ``_date``.

     Nothing is stored while ``_skip`` is truthy.
     """
     skipping = self._skip
     if skipping:
         return
     parsed = ts2dt(elem.text)
     self._date = parsed