Example #1
File: update.py Project: hatnote/pacetrack
    def update(self, force=False, _act=None):
        "does it all"
        # NOTE: _act is expected to be a log action injected by a tlog
        # wrapper decorator (likely stripped when this snippet was
        # extracted); calling update() with _act=None would fail on the
        # item assignments below.
        final_update_log_path = STATIC_PATH + 'campaigns/%s/update.log' % self.id
        _act['name'] = self.name
        _act['id'] = self.id
        _act['log_path'] = final_update_log_path
        now = datetime.datetime.utcnow()
        with atomic_save(final_update_log_path) as f:
            cur_update_sink = build_stream_sink(f)
            old_sinks = tlog.sinks
            tlog.set_sinks(old_sinks + [cur_update_sink])
            try:
                self.load_article_list()
                self.load_latest_state()

                next_fetch = now if not self.latest_state else self.latest_state.timestamp + self.fetch_frequency
                if not force and next_fetch > now:
                    tlog.critical('skip_fetch').success(
                        '{cid} not out of date, skipping until next fetch at {next_fetch}. ',
                        cid=self.id,
                        next_fetch=next_fetch)
                    return

                self.record_state()  # defaults to now
                self.load_latest_state()
                self.prune_by_frequency()
                self.render_report()
                self.render_article_list()
            finally:
                tlog.set_sinks(old_sinks)
        return
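All of these examples share the same action pattern: a tlog logger opens a named action, and the action is closed with success() or failure() before the block exits. A minimal sketch of that lifecycle, assuming tlog is a lithoxyl-style Logger (the import and logger name below are assumptions, not project code):

from lithoxyl import Logger  # assumed; the hatnote projects use a lithoxyl-style tlog

tlog = Logger('demo')

def do_work():
    # 'critical' is the action's level; the action records begin/end times
    with tlog.critical('do_work') as act:
        result = 2 + 2
        # format fields are filled from the keyword arguments
        act.success('computed {result}', result=result)
        return result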
Example #2
File: update.py Project: hatnote/pacetrack
    def prune_by_frequency(self, dry_run=False):
        # TODO: make this work for all campaign YYYYMM directories
        # under data dir, not just the most recent one.
        if not self.save_frequency:
            return
        if not self.latest_state:
            self.load_latest_state()
        state_path = self.get_latest_state_path()
        if state_path is None:
            return
        target_dir = os.path.dirname(state_path)

        for full in (True, False):
            state_paths = get_state_filepaths(target_dir, full=full)
            if not state_paths:
                return
            tmpl = os.path.basename(
                STATE_FULL_PATH_TMPL if full else STATE_PATH_TMPL)
            last_kept_dt = datetime.datetime.strptime(
                os.path.basename(state_paths[0]), tmpl)
            to_prune = []
            for fsp in state_paths[1:-1]:  # ignore the latest and first
                cur_dt = datetime.datetime.strptime(os.path.basename(fsp),
                                                    tmpl)
                if last_kept_dt < (cur_dt - self.save_frequency):
                    last_kept_dt = cur_dt
                else:
                    to_prune.append(fsp)

            for p in to_prune:
                with tlog.critical('prune data file', path=p):
                    if dry_run:
                        continue
                    os.remove(p)
        return
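The thinning rule above keeps a state file only when it is more than save_frequency newer than the last file kept, and the slice state_paths[1:-1] additionally protects the first and newest files. A standalone sketch of the rule over plain datetimes (hypothetical data, not project code):

import datetime

def find_prunable(timestamps, save_frequency):
    # keep a timestamp only when it is more than save_frequency newer
    # than the last one kept; the first is always kept
    timestamps = sorted(timestamps)
    last_kept, to_prune = timestamps[0], []
    for cur in timestamps[1:]:
        if last_kept < cur - save_frequency:
            last_kept = cur
        else:
            to_prune.append(cur)
    return to_prune

# 48 hourly snapshots thinned with a one-day save_frequency
hourly = [datetime.datetime(2019, 1, 1) + datetime.timedelta(hours=h)
          for h in range(48)]
assert len(find_prunable(hourly, datetime.timedelta(days=1))) == 46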
Example #3
 def get_hashtag_stats(self,
                       tag,
                       lang=None,
                       startdate=None,
                       enddate=None):
     if not tag:
         return self.get_all_hashtag_stats(lang=lang, startdate=startdate, enddate=enddate)
     if tag and tag[0] == '#':
         tag = tag[1:]
     if not lang:
         lang = '%'
     query = '''
     SELECT COUNT(*) as revisions,
     COUNT(DISTINCT rc_user) as users,
     COUNT(DISTINCT rc_title) as pages,
     COUNT(DISTINCT htrc_lang) as langs,
     MIN(rc_timestamp) as oldest,
     MAX(rc_timestamp) as newest,
     SUM(ABS(rc_new_len - rc_old_len)) as bytes
     FROM recentchanges AS rc
     JOIN hashtag_recentchanges AS htrc
     ON htrc.htrc_id = rc.htrc_id
     JOIN hashtags AS ht
     ON ht.ht_id = htrc.ht_id
     WHERE ht.ht_text = ?
     AND rc.htrc_lang LIKE ?
     AND rc.rc_timestamp BETWEEN ? AND ?
     ORDER BY rc.rc_id DESC'''
     params = (tag, lang, startdate, enddate)
     with tlog.critical('get_hashtag_stats') as rec:
         ret = self.execute(query, params)
         rec.success('Fetched stats for {tag}',
                     tag=tag)
         return ret
Example #4
 def get_all_hashtag_stats(self, lang=None, startdate=None, enddate=None):
     # TODO: Add conditions here
     if not lang:
         lang = '%'
     query = '''
     SELECT COUNT(*) as revisions,
     COUNT(DISTINCT rc_user) as users,
     COUNT(DISTINCT rc_title) as pages,
     MIN(rc_timestamp) as oldest,
     MAX(rc_timestamp) as newest,
     SUM(ABS(rc_new_len - rc_old_len)) as bytes
     FROM recentchanges AS rc
     JOIN hashtag_recentchanges AS htrc
     ON htrc.htrc_id = rc.htrc_id
     JOIN hashtags AS ht
     ON ht.ht_id = htrc.ht_id
     WHERE rc.rc_type = 0
     AND rc.htrc_lang LIKE ?
     AND rc.rc_timestamp BETWEEN ? AND ?
     AND ht.ht_text NOT IN(%s)
     AND ht.ht_text REGEXP '[[:alpha:]]+' ''' % ', '.join(['?' for i in range(len(EXCLUDED))])
     with tlog.critical('get_all_hashtag_stats') as rec:
         ret = self.execute(query, (lang, startdate, enddate,) + EXCLUDED)
         rec.success('Fetched all hashtag stats')
         return ret
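Because the driver binds one scalar per ? placeholder (oursql, per Example #9), the NOT IN (%s) clause is widened to one placeholder per excluded term before the parameters are bound. Illustration with made-up values (EXCLUDED here is hypothetical):

EXCLUDED = ('redirect', 'ifexist', 'expr')            # hypothetical values
placeholders = ', '.join('?' for _ in EXCLUDED)       # -> '?, ?, ?'
clause = "AND ht.ht_text NOT IN (%s)" % placeholders
params = ('%', '20190101000000', '20191231235959') + EXCLUDED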
Example #5
 def get_all_hashtags(self, lang=None, start=0, end=PAGINATION):
     """Rules for hashtags:
     1. Does not include MediaWiki magic words
     (like #REDIRECT) or parser functions
     2. Must be longer than one character
     3. Must contain at least one non-numeric
     character.
     """
     if not lang:
         lang = '%'
     query = '''
     SELECT *
     FROM recentchanges AS rc
     JOIN hashtag_recentchanges AS htrc
     ON htrc.htrc_id = rc.htrc_id
     JOIN hashtags AS ht
     ON ht.ht_id = htrc.ht_id
     WHERE rc.rc_type = 0
     AND rc.htrc_lang LIKE ?
     AND ht.ht_text NOT IN(%s)
     AND ht.ht_text REGEXP '[[:alpha:]]+'
     AND CHAR_LENGTH(ht.ht_text) > 1
     ORDER BY rc.rc_id DESC
     LIMIT ?, ?''' % ', '.join(['?' for i in range(len(EXCLUDED))])
     params = (lang, ) + EXCLUDED + (start, end)
     with tlog.critical('get_all_hashtags') as rec:
         ret = self.execute(query, params)
         rec.success('Fetched all hashtags starting at {start}',
                     start=start)
         return ret
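Note that MySQL's LIMIT ?, ? binds (offset, row_count), so despite its name the end argument acts as a page size (hence the PAGINATION default), not an end index. Given db, a hypothetical instance of this class, page n would be fetched with:

page_size = 20
rows = db.get_all_hashtags(lang='en',
                           start=n * page_size,  # offset into the results
                           end=page_size)        # rows per page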
Example #6
 def get_hashtags(self,
                  tag=None,
                  lang=None,
                  start=0,
                  end=PAGINATION,
                  startdate=None,
                  enddate=None):
     if not tag:
         return self.get_all_hashtags(lang=lang,
                                      start=start,
                                      end=end,
                                      startdate=startdate,
                                      enddate=enddate)
     if tag and tag[0] == '#':
         tag = tag[1:]
     if not lang:
         lang = '%'
     query = '''
     SELECT *
     FROM recentchanges AS rc
     JOIN hashtag_recentchanges AS htrc
     ON htrc.htrc_id = rc.htrc_id
     JOIN hashtags AS ht
     ON ht.ht_id = htrc.ht_id
     WHERE ht.ht_text = ?
     AND rc.htrc_lang LIKE ?
     AND rc.rc_timestamp BETWEEN ? AND ?
     ORDER BY rc.rc_timestamp DESC
     LIMIT ?, ?'''
     params = (tag, lang, startdate, enddate, start, end)
     with tlog.critical('get_hashtags') as rec:
         ret = self.execute(query, params)
         rec.success('Fetched revisions tagged with {tag}',
                     tag=tag)
         return ret
Example #7
 def get_top_hashtags(self, limit=10, recent_count=100000, nobots=True):
     """Gets the top hashtags from an arbitrarily "recent" group of edits
     (not all time).
     """
     excluded_p = ', '.join(['?' for i in range(len(EXCLUDED))])
     if nobots:
         bot_condition = 'AND rc_bot = 0'
     else:
         bot_condition = ''
     query_tmpl = '''
     SELECT ht.ht_text,
            COUNT(ht.ht_text) AS count
     FROM   recentchanges AS rc
            JOIN hashtag_recentchanges AS htrc
              ON htrc.htrc_id = rc.htrc_id
                 AND rc.htrc_id > (SELECT MAX(htrc_id)
                                   FROM   recentchanges) - ?
            JOIN hashtags AS ht
              ON ht.ht_id = htrc.ht_id
     WHERE  ht.ht_text REGEXP '[[:alpha:]]{1}[[:alnum:]]+'
     AND    ht.ht_text NOT IN (%s)
     %s
     GROUP  BY ht.ht_text
     ORDER  BY count DESC
     LIMIT  ?;'''
     query = query_tmpl % (excluded_p, bot_condition)
     params = (recent_count,) + EXCLUDED + (limit,)
     # This query is cached because it's loaded for each visit to
     # the index page
     with tlog.critical('get_top_hashtags') as rec:
         ret = self.execute(query, params, cache_name='top-tags-%s-%s' % (nobots, limit))
         rec.success('Fetched top tags with limit of {limit}',
                     limit=limit)
         return ret
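Two details worth noting: the join condition rc.htrc_id > (SELECT MAX(htrc_id) FROM recentchanges) - ? windows the scan to roughly the most recent recent_count rows by id (assuming htrc_id is an auto-incrementing key), and the cache key encodes only nobots and limit, so calls that differ only in recent_count would share a cache entry, assuming the cache is keyed solely by cache_name.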
Example #8
def main():
    with tlog.critical('start') as act:
        parser = get_argparser()
        args = parser.parse_args()
        wait = random.randint(0, args.jitter)
        act.success(
            'started pid {process_id}, fetch for {lang} beginning in {wait} seconds',
            lang=args.lang,
            wait=wait)
        time.sleep(wait)

    run_logger = RunLogDAL()
    run_logger.add_start_record(lang=args.lang)
    output = '{}'
    try:
        if args.debug:
            import log
            log.set_debug(True)

        rcu = RecentChangeUpdater(lang=args.lang, debug=args.debug)
        rcu.connect()
        rcu.update_recentchanges(hours=args.hours)
        output = json.dumps(rcu.stats)
    except Exception:
        output = json.dumps({'error': traceback.format_exc()})
        raise
    finally:
        run_logger.add_complete_record(lang=args.lang, output=output)
Example #9
 def connect(self, read_default_file=DB_CONFIG_PATH):
     with tlog.critical('connect') as rec:
         self.connection = oursql.connect(db=HT_DB_NAME,
                                          host=HT_DB_HOST,
                                          read_default_file=read_default_file,
                                          charset=None,
                                          use_unicode=False,
                                          autoping=True)
Example #10
 def get_langs(self):
     query = '''
     SELECT htrc_lang
     FROM recentchanges
     GROUP BY htrc_lang'''
     params = ()
     with tlog.critical('get_langs') as rec:
         ret = self.execute(query, params, cache_name='langs')
         rec.success('Fetched available languages')
         return ret
Example #11
File: get_data.py Project: hatnote/top
def save_traffic_stats(lang, project, query_date, limit=DEFAULT_LIMIT):
    '''\
    1. Get articles
    2. Add images and summaries
    3. Prepare and save results
    '''
    articles = make_article_list(query_date,
                                 lang=lang,
                                 project=project)
    total_traffic = get_project_traffic(query_date, lang, project)
    articles = articles[:limit]
    articles = add_extras(articles, lang=lang, project=project)
    ret = {'articles': articles,
           'formatted_date': format_date(query_date,
                                         format='d MMMM yyyy',
                                         locale=lang),
           'date': {'day': query_date.day,
                    'month': query_date.month,
                    'year': query_date.year},
           'lang': lang,
           'full_lang': LOCAL_LANG_MAP[lang],
           'total_traffic': total_traffic,
           'total_traffic_short': shorten_number(total_traffic),
           'examples': [articles[0],
                        articles[1],
                        articles[2],
                        articles[query_date.day * 2]],  # haha ok..
           'project': project.capitalize(),
           'permalink': DATE_PERMALINK_TMPL.format(lang=lang,
                                                   project=project,
                                                   year=query_date.year,
                                                   month=query_date.month,
                                                   day=query_date.day),
           'meta': {'fetched': datetime.utcnow().isoformat()}}
    outfile_name = DATA_PATH_TMPL.format(lang=lang,
                                         project=project,
                                         year=query_date.year,
                                         month=query_date.month,
                                         day=query_date.day)

    with tlog.critical('saving_single_day_stats') as rec:
        rec['out_file'] = os.path.abspath(outfile_name)
        try:
            out_file = codecs.open(outfile_name, 'w')
        except IOError:
            mkdir_p(os.path.dirname(outfile_name))
            out_file = codecs.open(outfile_name, 'w')
        with out_file:
            data_bytes = json.dumps(ret, indent=2, sort_keys=True)
            rec['len_bytes'] = len(data_bytes)
            out_file.write(data_bytes)

        rec.success('wrote {len_bytes} bytes to {out_file}')

    return
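The try/except around codecs.open is the usual "create parent directories on demand" fallback; mkdir_p is assumed to behave like the classic recipe (or boltons.fileutils.mkdir_p), sketched here:

import errno
import os

def mkdir_p(path):
    # create path and any missing parents; succeed quietly if it exists
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise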
Example #12
 def get_run_log(self, limit=50000):
     query = '''
     SELECT *
     FROM start_log AS sl 
     JOIN complete_log AS cl 
     ON sl.run_uuid = cl.run_uuid 
     WHERE cl.complete_timestamp > DATE_SUB(NOW(), INTERVAL 3 DAY)
     ORDER BY cl.complete_timestamp DESC
     LIMIT ?'''
     with tlog.critical('get_run_log') as rec:
         return self.execute(query, (limit,), show_tables=True)
Example #13
def home():
    with tlog.critical('home') as rec:
        top_tags = Database.get_top_hashtags()
        for tag in top_tags:
            # TODO: cleaner data input
            tag['ht_text'] = tag['ht_text'].decode('utf8', errors='replace')

        langs = Database.get_langs()
        rec.success('Homepage ready')

    return {'top_tags': top_tags, 'langs': [l['htrc_lang'] for l in langs]}
Example #14
File: get_data.py Project: hatnote/top
def get_traffic(query_date, lang, project):
    '''\
    Get the traffic report for the top 1000 articles for a given day.
    TODO: Get from local file, if available
    '''
    url = TOP_API_URL.format(lang=lang,
                             project=project,
                             year=query_date.year,
                             month='%02d' % query_date.month,
                             day='%02d' % query_date.day)
    if DEBUG:
        print 'Getting %s' % url
    with tlog.critical('fetch_traffic') as rec:
        resp = urllib2.urlopen(url)
        resp_bytes = resp.read()
        rec.success('Fetched {len_bytes} bytes from {url}',
                    len_bytes=len(resp_bytes), url=url)

    with tlog.critical('deserialize_traffic'):
        data = json.loads(resp_bytes)
        articles = data['items'][0]['articles']
    return articles
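This snippet is Python 2 (print statement, urllib2). For reference, a rough Python 3 equivalent of the fetch, assuming the same tlog and url as above (an adaptation, not project code):

import json
from urllib.request import urlopen  # Python 3 replacement for urllib2

with tlog.critical('fetch_traffic') as rec:
    resp_bytes = urlopen(url).read()
    rec.success('Fetched {len_bytes} bytes from {url}',
                len_bytes=len(resp_bytes), url=url)

with tlog.critical('deserialize_traffic'):
    articles = json.loads(resp_bytes)['items'][0]['articles']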
Example #15
File: update.py Project: intracer/hashtags
def main():
    tlog.critical('start').success('started {0}', os.getpid())
    parser = get_argparser()
    args = parser.parse_args()

    run_logger = RunLogDAL()
    run_logger.add_start_record(lang=args.lang)
    output = '{}'
    try:
        if args.debug:
            import log
            log.set_debug(True)

        rcu = RecentChangeUpdater(lang=args.lang, debug=args.debug)
        rcu.connect()
        rcu.update_recentchanges(hours=args.hours)
        output = json.dumps(rcu.stats)
    except Exception:
        output = json.dumps({'error': traceback.format_exc()})
        raise
    finally:
        run_logger.add_complete_record(lang=args.lang, output=output)
Example #17
File: update.py Project: hatnote/pacetrack
def load_and_update_campaign(campaign_dir, force=False):
    with tlog.critical('load_campaign_dir', path=campaign_dir) as _act:
        ptc = PTCampaign.from_path(campaign_dir)
        _act['name'] = ptc.name
        if ptc.disabled:
            _act.failure("campaign {name!r} disabled, skipping.")
            return ptc
    ptc.update(force=force)
    print()
    print('Goal results:')
    for key, results in ptc.latest_state.goal_results.items():
        print(' - {name}  ({done_count}/{total_count})  Done: {done}'.format(
            **results))
    print()
    return ptc
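Note that _act.failure("campaign {name!r} disabled, skipping.") passes no keyword arguments: the {name!r} field is filled from the action's own data map (_act['name'] was set two lines earlier), the same mechanism Example #11 uses with rec['len_bytes'] and rec['out_file']. A minimal sketch, assuming the same tlog logger:

with tlog.critical('demo_action', path='/tmp/example') as act:  # data set at creation
    act['name'] = 'my campaign'                                 # data set afterwards
    act.failure('campaign {name!r} disabled, skipping.')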
Example #18
File: update.py Project: hatnote/pacetrack
    @classmethod
    def from_path(cls, path, auto_start_state=True):
        config_data = yaml.safe_load(open(path + '/config.yaml', 'rb'))

        kwargs = dict(config_data)
        kwargs['article_list_config'] = dict(kwargs.pop('article_list'))
        kwargs['base_path'] = path

        if kwargs.get('save_frequency'):
            kwargs['save_frequency'] = parse_timedelta(
                kwargs['save_frequency'])
        if kwargs.get('fetch_frequency'):
            kwargs['fetch_frequency'] = parse_timedelta(
                kwargs['fetch_frequency'])

        ret = cls(**kwargs)

        needs_backfill = False
        with tlog.info('load_start_state') as _act:
            try:
                start_state = PTCampaignState.from_timestamp(
                    ret, ret.campaign_start_date)
            except StateNotFound as snf:
                if not auto_start_state:
                    raise
                needs_backfill = True
                _act.failure(
                    'start state not found (got {0!r}), backfilling...', snf)

        if needs_backfill:
            with tlog.critical('backfill_start_state', verbose=True):
                ret.load_article_list()
                start_state = PTCampaignState.from_api(ret,
                                                       ret.campaign_start_date)
                start_state.save()

        ret.start_state = start_state

        return ret