def get_db(lang_code):
    # Cache one database connection per language code on flask.g, creating
    # the connection on first use.
    localized_dbs = getattr(flask.g, '_localized_dbs', {})
    db = localized_dbs.get(lang_code, None)
    if db is None:
        db = localized_dbs[lang_code] = chdb.init_db(lang_code)
    flask.g._localized_dbs = localized_dbs
    return db
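# A minimal usage sketch, not from the codebase: get_db() assumes it runs
# inside a Flask app/request context, so a teardown hook like the hypothetical
# one below could close the cached connections when the context ends. It
# assumes the Flask application object is in scope as `app` and that chdb
# connections expose close().
@app.teardown_appcontext
def close_localized_dbs(exception):
    for db in getattr(flask.g, '_localized_dbs', {}).values():
        db.close()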
def get_db_names_to_archive(lang_code):
    database_names = []
    for db in [chdb.init_db(lang_code), chdb.init_stats_db()]:
        with db.cursor() as cursor:
            cursor.execute('SELECT DATABASE()')
            database_names.append(cursor.fetchone()[0])
    return database_names
def get_db_names_to_archive(lang_code):
    database_names = []
    for db in [chdb.init_db(lang_code), chdb.init_stats_db()]:
        # Note: this variant relies on the connection's context manager
        # yielding a cursor directly.
        with db as cursor:
            cursor.execute('SELECT DATABASE()')
            database_names.append(cursor.fetchone()[0])
    return database_names
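# A hypothetical driver for the function above, not from the codebase: feed
# the database names to mysqldump for archival. Assumes MySQL credentials
# come from a defaults file such as ~/.my.cnf.
import subprocess

def archive_databases(lang_code, output_path):
    names = get_db_names_to_archive(lang_code)
    with open(output_path, 'wb') as output:
        subprocess.check_call(
            ['mysqldump', '--databases'] + names, stdout=output)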
def setUp(self):
    self.app = app.app.test_client()
    db = chdb.init_db('en')
    cursor = db.cursor()
    # FIXME should really mock chdb instead.
    cursor.execute(
        'SELECT snippets.id, category_id FROM '
        'snippets, articles_categories WHERE '
        'snippets.article_id = articles_categories.article_id '
        'LIMIT 1;')
    self.sid, self.cat = cursor.fetchone()
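# Addressing the FIXME above, a sketch of how chdb could be mocked with the
# standard library's unittest.mock instead of hitting a real database. The
# patch target and return values are hypothetical placeholders.
import unittest.mock

def setUp(self):
    mock_db = unittest.mock.MagicMock()
    mock_db.cursor.return_value.fetchone.return_value = (42, 'unsourced')
    self.init_db_patcher = unittest.mock.patch(
        'chdb.init_db', return_value=mock_db)
    self.init_db_patcher.start()
    self.addCleanup(self.init_db_patcher.stop)
    self.app = app.app.test_client()
    self.sid, self.cat = 42, 'unsourced'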
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    clicked_snippets.remove(clicked_snippet)
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    live_db.close()
    stats_db.close()
    return 0
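# The inner loop above boils down to set subtraction: snippet ids clicked
# earlier, minus the ids still present in a revision. A self-contained toy
# illustration with made-up ids and timestamps:
clicked = {'1b0b54a8': '2020-01-01T00:00:00Z', '9f2c11de': '2020-01-01T01:00:00Z'}
ids_in_revision = {'1b0b54a8'}
gone = {sid: ts for sid, ts in clicked.items() if sid not in ids_in_revision}
# gone == {'9f2c11de': '2020-01-01T01:00:00Z'}; each such snippet is recorded
# in the `fixed` table, provided it was clicked before the revision timestamp.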
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        start_ts = min(snippet_to_ts.values())
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = dict(snippet_to_ts)
            # FIXME Duplicated logic with parse_live.py :(
            for sec, snips in snippets:
                for sni in snips:
                    id = mkid(d(page_title) + sni)
                    gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_ts in gone_in_this_revision.items():
                if clicked_ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    del snippet_to_ts[snippet_id]
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)',
                        clicked_ts, snippet_id, cfg.lang_code, rev['rev_id'])

    live_db.close()
    stats_db.close()
    return 0
def compute_fixed_snippets(cfg):
    log.info('computing fixed snippets for %s' % cfg.lang_code)
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        log.info('No pages to process!')
        return
    log.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)
        # FIXME Duplicated logic with parse_live.py :(
        for sec, snips in snippets:
            for sni in snips:
                id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(id, None)
        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)',
                    clicked_ts, snippet_id, cfg.lang_code)

    live_db.close()
    stats_db.close()
    return 0
def compute_fixed_snippets():
    start = time.time()

    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Find the set of snippets that were "clicked" (redirected to article)
    # between the dates of the previous/live and next/scratch database
    from_ts = live_db.execute_with_retry(load_table_creation_date, 'snippets')
    to_ts = scratch_db.execute_with_retry(load_table_creation_date, 'snippets')
    clicked = stats_db.execute_with_retry(
        load_snippet_clicks_between, cfg.lang_code, from_ts, to_ts)

    # Load the snippets from both databases
    scratch_snippets = scratch_db.execute_with_retry(load_snippets)
    live_snippets = live_db.execute_with_retry(load_snippets)

    # And for each snippet that disappeared across databases AND had been
    # clicked in the meantime, store its information in the stats database.
    gone = live_snippets.difference(scratch_snippets)
    for id, clicked_ts in clicked.items():
        if id in gone:
            log.info(id)
            stats_db.execute_with_retry_s(
                'INSERT INTO fixed VALUES (%s, %s, %s)',
                clicked_ts, id, cfg.lang_code)

    log.info('all done in %d seconds.' % (time.time() - start))
    scratch_db.close()
    live_db.close()
    stats_db.close()
    return 0
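# All versions above lean on chdb's execute_with_retry helpers. A hypothetical
# sketch of that pattern as a method on a connection-wrapper class (not the
# actual chdb implementation; `self.conn` and `self.connect_kwargs` are
# assumed attributes): run a callback against a cursor, reconnecting and
# retrying once on transient MySQL errors.
import pymysql

def execute_with_retry(self, operations, *args):
    for attempt in range(2):
        try:
            with self.conn.cursor() as cursor:
                result = operations(cursor, *args)
            self.conn.commit()
            return result
        except pymysql.OperationalError:
            if attempt == 1:
                raise
            # Reconnect and retry once.
            self.conn = pymysql.connect(**self.connect_kwargs)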