def update(self, force=False, _act=None):
    """Load the article list, fetch and record a new state if one is due,
    prune old state files, and re-render the report and article list."""
    final_update_log_path = STATIC_PATH + 'campaigns/%s/update.log' % self.id
    _act['name'] = self.name
    _act['id'] = self.id
    _act['log_path'] = final_update_log_path
    now = datetime.datetime.utcnow()
    with atomic_save(final_update_log_path) as f:
        cur_update_sink = build_stream_sink(f)
        old_sinks = tlog.sinks
        tlog.set_sinks(old_sinks + [cur_update_sink])
        try:
            self.load_article_list()
            self.load_latest_state()
            next_fetch = (now if not self.latest_state
                          else self.latest_state.timestamp + self.fetch_frequency)
            if not force and next_fetch > now:
                tlog.critical('skip_fetch').success(
                    '{cid} not out of date, skipping until next fetch at {next_fetch}. ',
                    cid=self.id, next_fetch=next_fetch)
                return
            self.record_state()  # defaults to now
            self.load_latest_state()
            self.prune_by_frequency()
            self.render_report()
            self.render_article_list()
        finally:
            tlog.set_sinks(old_sinks)
    return
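# A note on the atomic_save call above: it comes from boltons.fileutils and
# stages output in a temporary file, only moving it over the destination when
# the with-block exits without an exception, so a crashed update never leaves
# a truncated update.log behind.  Minimal standalone sketch (hypothetical
# path; the file is opened in binary mode by default):
from boltons.fileutils import atomic_save

with atomic_save('/tmp/update.log') as f:
    f.write(b'partial output...\n')
    # an exception raised in this block would leave any existing
    # /tmp/update.log untouched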
def prune_by_frequency(self, dry_run=False):
    # TODO: make this work for all campaign YYYYMM directories
    # under data dir, not just the most recent one.
    if not self.save_frequency:
        return
    if not self.latest_state:
        self.load_latest_state()
    state_path = self.get_latest_state_path()
    if state_path is None:
        return
    target_dir = os.path.dirname(state_path)
    for full in (True, False):
        state_paths = get_state_filepaths(target_dir, full=full)
        if not state_paths:
            return
        tmpl = os.path.basename(
            STATE_FULL_PATH_TMPL if full else STATE_PATH_TMPL)
        last_kept_dt = datetime.datetime.strptime(
            os.path.basename(state_paths[0]), tmpl)
        to_prune = []
        for fsp in state_paths[1:-1]:  # ignore the latest and first
            cur_dt = datetime.datetime.strptime(os.path.basename(fsp), tmpl)
            if last_kept_dt < (cur_dt - self.save_frequency):
                last_kept_dt = cur_dt
            else:
                to_prune.append(fsp)
        for p in to_prune:
            with tlog.critical('prune data file', path=p):
                if dry_run:
                    continue
                os.remove(p)
    return
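# Standalone sketch of the pruning rule above (hypothetical timestamps and
# frequency): walking snapshots oldest to newest, a snapshot is kept only if
# at least save_frequency has elapsed since the last kept one; the first and
# newest snapshots are never pruned.
import datetime

save_frequency = datetime.timedelta(hours=6)
snapshots = [datetime.datetime(2019, 1, 1, h) for h in (0, 2, 5, 7, 14, 15)]

kept, pruned = [snapshots[0]], []
for dt in snapshots[1:-1]:
    if kept[-1] < dt - save_frequency:
        kept.append(dt)
    else:
        pruned.append(dt)
kept.append(snapshots[-1])

print([dt.hour for dt in pruned])  # [2, 5]; 00:00, 07:00, 14:00, and 15:00 survive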
def get_hashtag_stats(self, tag, lang=None, startdate=None, enddate=None):
    if not tag:
        return self.get_all_hashtag_stats(lang=lang,
                                          startdate=startdate,
                                          enddate=enddate)
    if tag and tag[0] == '#':
        tag = tag[1:]
    if not lang:
        lang = '%'
    query = '''
        SELECT COUNT(*) as revisions,
               COUNT(DISTINCT rc_user) as users,
               COUNT(DISTINCT rc_title) as pages,
               COUNT(DISTINCT htrc_lang) as langs,
               MIN(rc_timestamp) as oldest,
               MAX(rc_timestamp) as newest,
               SUM(ABS(rc_new_len - rc_old_len)) as bytes
        FROM recentchanges AS rc
        JOIN hashtag_recentchanges AS htrc
          ON htrc.htrc_id = rc.htrc_id
        JOIN hashtags AS ht
          ON ht.ht_id = htrc.ht_id
        WHERE ht.ht_text = ?
        AND rc.htrc_lang LIKE ?
        AND rc.rc_timestamp BETWEEN ? AND ?
        ORDER BY rc.rc_id DESC'''
    params = (tag, lang, startdate, enddate)
    with tlog.critical('get_hashtag_stats') as rec:
        ret = self.execute(query, params)
        rec.success('Fetched stats for {tag}', tag=tag)
    return ret
def get_all_hashtag_stats(self, lang=None, startdate=None, enddate=None):
    # TODO: Add conditions here
    if not lang:
        lang = '%'
    query = '''
        SELECT COUNT(*) as revisions,
               COUNT(DISTINCT rc_user) as users,
               COUNT(DISTINCT rc_title) as pages,
               MIN(rc_timestamp) as oldest,
               MAX(rc_timestamp) as newest,
               SUM(ABS(rc_new_len - rc_old_len)) as bytes
        FROM recentchanges AS rc
        JOIN hashtag_recentchanges AS htrc
          ON htrc.htrc_id = rc.htrc_id
        JOIN hashtags AS ht
          ON ht.ht_id = htrc.ht_id
        WHERE rc.rc_type = 0
        AND rc.htrc_lang LIKE ?
        AND rc.rc_timestamp BETWEEN ? AND ?
        AND ht.ht_text NOT IN (%s)
        AND ht.ht_text REGEXP '[[:alpha:]]+'
        ''' % ', '.join(['?' for i in range(len(EXCLUDED))])
    with tlog.critical('get_all_hashtag_stats') as rec:
        ret = self.execute(query, (lang, startdate, enddate,) + EXCLUDED)
        rec.success('Fetched all hashtag stats')
    return ret
def get_all_hashtags(self, lang=None, start=0, end=PAGINATION):
    """Rules for hashtags:

    1. Does not include MediaWiki magic words (like #REDIRECT)
       or parser functions
    2. Must be longer than one character
    3. Must contain at least one alphabetic character
    """
    if not lang:
        lang = '%'
    query = '''
        SELECT *
        FROM recentchanges AS rc
        JOIN hashtag_recentchanges AS htrc
          ON htrc.htrc_id = rc.htrc_id
        JOIN hashtags AS ht
          ON ht.ht_id = htrc.ht_id
        WHERE rc.rc_type = 0
        AND rc.htrc_lang LIKE ?
        AND ht.ht_text NOT IN (%s)
        AND ht.ht_text REGEXP '[[:alpha:]]+'
        AND CHAR_LENGTH(ht.ht_text) > 1
        ORDER BY rc.rc_id DESC
        LIMIT ?, ?''' % ', '.join(['?' for i in range(len(EXCLUDED))])
    params = (lang, ) + EXCLUDED + (start, end)
    with tlog.critical('get_all_hashtags') as rec:
        ret = self.execute(query, params)
        rec.success('Fetched all hashtags starting at {start}', start=start)
    return ret
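# Standalone sketch of the three rules above as a Python-side check; the
# names and values here (example_excluded, is_countable_hashtag) are
# illustrative only -- the real filtering happens in the SQL query.
import re

example_excluded = ('redirect', 'ifexist', 'ifexpr', 'switch')

def is_countable_hashtag(text, excluded=example_excluded):
    return (len(text) > 1                        # rule 2: longer than one char
            and text.lower() not in excluded     # rule 1: not a magic word
            and re.search('[^\\W\\d_]', text) is not None)  # rule 3: has a letter

print(is_countable_hashtag('1lib1ref'))   # True
print(is_countable_hashtag('123'))        # False
print(is_countable_hashtag('redirect'))   # False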
def get_hashtags(self, tag=None, lang=None, start=0, end=PAGINATION,
                 startdate=None, enddate=None):
    if not tag:
        # NB: get_all_hashtags() takes no date arguments, so startdate and
        # enddate are not passed through here
        return self.get_all_hashtags(lang=lang, start=start, end=end)
    if tag and tag[0] == '#':
        tag = tag[1:]
    if not lang:
        lang = '%'
    query = '''
        SELECT *
        FROM recentchanges AS rc
        JOIN hashtag_recentchanges AS htrc
          ON htrc.htrc_id = rc.htrc_id
        JOIN hashtags AS ht
          ON ht.ht_id = htrc.ht_id
        WHERE ht.ht_text = ?
        AND rc.htrc_lang LIKE ?
        AND rc.rc_timestamp BETWEEN ? AND ?
        ORDER BY rc.rc_timestamp DESC
        LIMIT ?, ?'''
    params = (tag, lang, startdate, enddate, start, end)
    with tlog.critical('get_hashtags') as rec:
        ret = self.execute(query, params)
        rec.success('Fetched revisions tagged with {tag}', tag=tag)
    return ret
def get_top_hashtags(self, limit=10, recent_count=100000, nobots=True):
    """Gets the top hashtags from an arbitrarily "recent" group of
    edits (not all time).
    """
    excluded_p = ', '.join(['?' for i in range(len(EXCLUDED))])
    if nobots:
        bot_condition = 'AND rc_bot = 0'
    else:
        bot_condition = ''
    query_tmpl = '''
        SELECT ht.ht_text, COUNT(ht.ht_text) AS count
        FROM recentchanges AS rc
        JOIN hashtag_recentchanges AS htrc
          ON htrc.htrc_id = rc.htrc_id
          AND rc.htrc_id > (SELECT MAX(htrc_id) FROM recentchanges) - ?
        JOIN hashtags AS ht
          ON ht.ht_id = htrc.ht_id
        WHERE ht.ht_text REGEXP '[[:alpha:]]{1}[[:alnum:]]+'
        AND ht.ht_text NOT IN (%s)
        %s
        GROUP BY ht.ht_text
        ORDER BY count DESC
        LIMIT ?;'''
    query = query_tmpl % (excluded_p, bot_condition)
    params = (recent_count,) + EXCLUDED + (limit,)
    # This query is cached because it's loaded for each visit to
    # the index page
    with tlog.critical('get_top_hashtags') as rec:
        ret = self.execute(query, params,
                           cache_name='top-tags-%s-%s' % (nobots, limit))
        rec.success('Fetched top tags with limit of {limit}', limit=limit)
    return ret
def main():
    with tlog.critical('start') as act:
        parser = get_argparser()
        args = parser.parse_args()
        wait = random.randint(0, args.jitter)
        act.success('started pid {process_id}, fetch for {lang}'
                    ' beginning in {wait} seconds',
                    lang=args.lang, wait=wait)
    time.sleep(wait)
    run_logger = RunLogDAL()
    run_logger.add_start_record(lang=args.lang)
    output = '{}'
    try:
        if args.debug:
            import log
            log.set_debug(True)
        rcu = RecentChangeUpdater(lang=args.lang, debug=args.debug)
        rcu.connect()
        rcu.update_recentchanges(hours=args.hours)
        output = json.dumps(rcu.stats)
    except Exception:
        output = json.dumps({'error': traceback.format_exc()})
        raise
    finally:
        run_logger.add_complete_record(lang=args.lang, output=output)
def connect(self, read_default_file=DB_CONFIG_PATH):
    with tlog.critical('connect') as rec:
        self.connection = oursql.connect(db=HT_DB_NAME,
                                         host=HT_DB_HOST,
                                         read_default_file=read_default_file,
                                         charset=None,
                                         use_unicode=False,
                                         autoping=True)
def get_langs(self):
    query = '''
        SELECT htrc_lang
        FROM recentchanges
        GROUP BY htrc_lang'''
    params = ()
    with tlog.critical('get_langs') as rec:
        ret = self.execute(query, params, cache_name='langs')
        rec.success('Fetched available languages')
    return ret
def save_traffic_stats(lang, project, query_date, limit=DEFAULT_LIMIT):
    '''\
    1. Get articles
    2. Add images and summaries
    3. Prepare and save results
    '''
    articles = make_article_list(query_date, lang=lang, project=project)
    total_traffic = get_project_traffic(query_date, lang, project)
    articles = articles[:limit]
    articles = add_extras(articles, lang=lang, project=project)
    ret = {'articles': articles,
           'formatted_date': format_date(query_date,
                                         format='d MMMM yyyy',
                                         locale=lang),
           'date': {'day': query_date.day,
                    'month': query_date.month,
                    'year': query_date.year},
           'lang': lang,
           'full_lang': LOCAL_LANG_MAP[lang],
           'total_traffic': total_traffic,
           'total_traffic_short': shorten_number(total_traffic),
           'examples': [articles[0],
                        articles[1],
                        articles[2],
                        articles[query_date.day * 2]],  # haha ok..
           'project': project.capitalize(),
           'permalink': DATE_PERMALINK_TMPL.format(lang=lang,
                                                   project=project,
                                                   year=query_date.year,
                                                   month=query_date.month,
                                                   day=query_date.day),
           'meta': {'fetched': datetime.utcnow().isoformat()}}
    outfile_name = DATA_PATH_TMPL.format(lang=lang,
                                         project=project,
                                         year=query_date.year,
                                         month=query_date.month,
                                         day=query_date.day)
    with tlog.critical('saving_single_day_stats') as rec:
        rec['out_file'] = os.path.abspath(outfile_name)
        try:
            out_file = codecs.open(outfile_name, 'w')
        except IOError:
            mkdir_p(os.path.dirname(outfile_name))
            out_file = codecs.open(outfile_name, 'w')
        with out_file:
            data_bytes = json.dumps(ret, indent=2, sort_keys=True)
            rec['len_bytes'] = len(data_bytes)
            out_file.write(data_bytes)
            rec.success('wrote {len_bytes} bytes to {out_file}')
    return
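# mkdir_p above is assumed to be the usual "create the directory tree, ignore
# if it already exists" helper; a minimal sketch of that assumption:
import errno
import os

def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            return  # already there, nothing to do
        raise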
def get_run_log(self, limit=50000):
    query = '''
        SELECT *
        FROM start_log AS sl
        JOIN complete_log AS cl
          ON sl.run_uuid = cl.run_uuid
        WHERE cl.complete_timestamp > DATE_SUB(NOW(), INTERVAL 3 DAY)
        ORDER BY cl.complete_timestamp DESC
        LIMIT ?'''
    with tlog.critical('get_run_log') as rec:
        return self.execute(query, (limit,), show_tables=True)
def home():
    with tlog.critical('home') as rec:
        top_tags = Database.get_top_hashtags()
        for tag in top_tags:
            # TODO: cleaner data input
            tag['ht_text'] = tag['ht_text'].decode('utf8', errors='replace')
        langs = Database.get_langs()
        rec.success('Homepage ready')
    return {'top_tags': top_tags,
            'langs': [l['htrc_lang'] for l in langs]}
def get_traffic(query_date, lang, project):
    '''\
    Get the traffic report for the top 1000 articles for a given day.

    TODO: Get from local file, if available
    '''
    url = TOP_API_URL.format(lang=lang,
                             project=project,
                             year=query_date.year,
                             month='%02d' % query_date.month,
                             day='%02d' % query_date.day)
    if DEBUG:
        print 'Getting %s' % url
    with tlog.critical('fetch_traffic') as rec:
        resp = urllib2.urlopen(url)
        resp_bytes = resp.read()
        rec.success('Fetched {len_bytes} bytes from {url}',
                    len_bytes=len(resp_bytes), url=url)
    with tlog.critical('deserialize_traffic'):
        data = json.loads(resp_bytes)
        articles = data['items'][0]['articles']
    return articles
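# TOP_API_URL is defined elsewhere in the module; the sketch below shows the
# shape it is assumed to have, based on the public Wikimedia Pageviews "top"
# REST endpoint and the .format() fields used above.
example_top_api_url = ('https://wikimedia.org/api/rest_v1/metrics/pageviews/top/'
                       '{lang}.{project}/all-access/{year}/{month}/{day}')

print(example_top_api_url.format(lang='en', project='wikipedia',
                                 year=2019, month='01', day='02'))
# https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/2019/01/02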
def main():
    tlog.critical('start').success('started {0}', os.getpid())
    parser = get_argparser()
    args = parser.parse_args()
    run_logger = RunLogDAL()
    run_logger.add_start_record(lang=args.lang)
    output = '{}'
    try:
        if args.debug:
            import log
            log.set_debug(True)
        rcu = RecentChangeUpdater(lang=args.lang, debug=args.debug)
        rcu.connect()
        rcu.update_recentchanges(hours=args.hours)
        output = json.dumps(rcu.stats)
    except Exception:
        output = json.dumps({'error': traceback.format_exc()})
        raise
    finally:
        run_logger.add_complete_record(lang=args.lang, output=output)
def load_and_update_campaign(campaign_dir, force=False):
    with tlog.critical('load_campaign_dir', path=campaign_dir) as _act:
        ptc = PTCampaign.from_path(campaign_dir)
        _act['name'] = ptc.name
        if ptc.disabled:
            _act.failure("campaign {name!r} disabled, skipping.")
            return ptc
    ptc.update(force=force)
    print()
    print('Goal results:')
    for key, results in ptc.latest_state.goal_results.items():
        print(' - {name} ({done_count}/{total_count}) Done: {done}'.format(
            **results))
    print()
    return ptc
def from_path(cls, path, auto_start_state=True):
    config_data = yaml.safe_load(open(path + '/config.yaml', 'rb'))

    kwargs = dict(config_data)
    kwargs['article_list_config'] = dict(kwargs.pop('article_list'))
    kwargs['base_path'] = path
    if kwargs.get('save_frequency'):
        kwargs['save_frequency'] = parse_timedelta(kwargs['save_frequency'])
    if kwargs.get('fetch_frequency'):
        kwargs['fetch_frequency'] = parse_timedelta(kwargs['fetch_frequency'])
    ret = cls(**kwargs)

    needs_backfill = False
    with tlog.info('load_start_state') as _act:
        try:
            start_state = PTCampaignState.from_timestamp(
                ret, ret.campaign_start_date)
        except StateNotFound as snf:
            if not auto_start_state:
                raise
            needs_backfill = True
            _act.failure(
                'start state not found (got {0!r}), backfilling...', snf)
    if needs_backfill:
        with tlog.critical('backfill_start_state', verbose=True):
            ret.load_article_list()
            start_state = PTCampaignState.from_api(ret, ret.campaign_start_date)
            start_state.save()
    ret.start_state = start_state
    return ret