Exemplo n.º 1
def get_db(lang_code):
    """Return the database handle for `lang_code`, reusing one kept on flask.g.

    Handles live in the `_localized_dbs` dict attribute of `flask.g`. When
    that dict has no usable entry for `lang_code`, a new handle is created
    with `chdb.init_db` and stored under that key.
    """
    handles = getattr(flask.g, '_localized_dbs', {})
    handle = handles.get(lang_code)
    if handle is None:
        handle = chdb.init_db(lang_code)
        handles[lang_code] = handle
    # Write the dict back: it may be the fresh default built by getattr above.
    flask.g._localized_dbs = handles
    return handle
Exemplo n.º 2
def get_db_names_to_archive(lang_code):
    """Return the names of the databases to archive for `lang_code`.

    Runs `SELECT DATABASE()` on the localized database connection and on the
    stats database connection, in that order, and collects the results.

    Returns:
        A list with one database name per connection.
    """
    database_names = []
    for db in [chdb.init_db(lang_code), chdb.init_stats_db()]:
        with db.cursor() as cursor:
            cursor.execute('SELECT DATABASE()')
            # The query yields a single row with a single column. Indexing
            # with [0] assumes tuple-style rows (not a dict cursor) — TODO
            # confirm against chdb's connection setup.
            database_names.append(cursor.fetchone()[0])
    return database_names
Exemplo n.º 3
def get_db_names_to_archive(lang_code):
    """Return the names of the databases to archive for `lang_code`.

    Runs `SELECT DATABASE()` on the localized database connection and on the
    stats database connection, in that order, and collects the results.

    Returns:
        A list with one database name per connection.
    """
    database_names = []
    for db in [chdb.init_db(lang_code), chdb.init_stats_db()]:
        # The connection object is used as a context manager that yields a
        # cursor, as in the original code.
        with db as cursor:
            cursor.execute('SELECT DATABASE()')
            # The query yields a single row with a single column. Indexing
            # with [0] assumes tuple-style rows (not a dict cursor) — TODO
            # confirm against chdb's connection setup.
            database_names.append(cursor.fetchone()[0])
    return database_names
Exemplo n.º 4
def get_db(lang_code):
    """Fetch the per-language database handle stored on `flask.g`.

    A handle is created with `chdb.init_db(lang_code)` when the
    `_localized_dbs` dict on `flask.g` holds nothing for `lang_code`; the
    dict is then stored back on `flask.g` so later calls can reuse it.
    """
    per_lang = getattr(flask.g, '_localized_dbs', {})
    connection = per_lang.get(lang_code, None)
    if connection is None:
        connection = chdb.init_db(lang_code)
        per_lang[lang_code] = connection
    flask.g._localized_dbs = per_lang
    return connection
Exemplo n.º 5
    def setUp(self):
        """Create the test client and load one (snippet id, category id) pair.

        The pair is read from the 'en' database and stored on `self.sid` and
        `self.cat`.
        """
        self.app = app.app.test_client()

        # FIXME should really mock chdb instead.
        query = (
            'SELECT snippets.id, category_id FROM '
            'snippets, articles_categories WHERE '
            'snippets.article_id = articles_categories.article_id '
            'LIMIT 1;')
        # Keep the connection bound to a local name while the cursor is used.
        db = chdb.init_db('en')
        cursor = db.cursor()
        cursor.execute(query)
        self.sid, self.cat = cursor.fetchone()
def compute_fixed_snippets(cfg):
    """Record recently clicked snippets that no longer appear in their page.

    Loads the snippets clicked during the last three hours from the stats
    database, fetches revisions of each affected page (passing the earliest
    click timestamp to `get_page_revisions`), and re-parses every revision.
    A clicked snippet that is absent from a revision whose timestamp is later
    than the click is inserted into the `fixed` table.

    Args:
        cfg: localized configuration; `lang_code`, `wikipedia_domain` and
            `user_agent` are read from it.

    Returns:
        Always 0.
    """
    logger.info('computing fixed snippets for %s', cfg.lang_code)

    # NOTE(review): live_db is not used below; the call is kept because any
    # side effects of chdb.init_db are not visible from this function.
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        # Nothing to reparse: stop here instead of logging an empty page
        # list (or failing on a None result) below.
        return 0
    logger.info('Will reparse pages: %r', list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            # Start from every clicked snippet and drop those still present
            # in this revision; whatever remains is gone.
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                present_id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(present_id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info(
                        '%s fixed at revision %s', snippet_id, rev['rev_id'])
                    # NOTE(review): the call these arguments belong to was
                    # missing, which left the function unparseable.
                    # execute_with_retry_s is assumed to be the stats_db
                    # method that runs one SQL statement — confirm.
                    # NOTE(review): a fixed snippet stays in clicked_snippets,
                    # so later revisions will report it again — confirm
                    # whether it should be removed here.
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    return 0
Exemplo n.º 7
def compute_fixed_snippets(cfg):
    """Record recently clicked snippets that no longer appear in their page.

    Loads, per page title, a mapping of clicked snippet id to click timestamp
    covering the last three hours, fetches revisions of each page (passing
    the earliest click timestamp to `get_page_revisions`), and re-parses
    every revision. A clicked snippet that is absent from a revision whose
    timestamp is later than the click is inserted into the `fixed` table and
    dropped from the page's mapping so it is recorded only once.

    Args:
        cfg: localized configuration; `lang_code`, `wikipedia_domain` and
            `user_agent` are read from it.

    Returns:
        Always 0.
    """
    logger.info('computing fixed snippets for %s', cfg.lang_code)

    # NOTE(review): live_db is not used below; the call is kept because any
    # side effects of chdb.init_db are not visible from this function.
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        # Nothing to reparse: stop here instead of logging an empty page
        # list (or failing on a None result) below.
        return 0
    logger.info('Will reparse pages: %r', page_title_to_snippets.keys())

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        start_ts = min(snippet_to_ts.values())
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            # Work on a copy so snippet_to_ts can be modified while the
            # candidates are being iterated.
            gone_in_this_revision = dict(snippet_to_ts)
            # FIXME Duplicated logic with parse_live.py :(
            for sec, snips in snippets:
                for sni in snips:
                    present_id = mkid(d(page_title) + sni)
                    gone_in_this_revision.pop(present_id, None)
            for snippet_id, clicked_ts in gone_in_this_revision.items():
                if clicked_ts < rev['timestamp']:
                    logger.info(
                        '%s fixed at revision %s', snippet_id, rev['rev_id'])
                    del snippet_to_ts[snippet_id]
                    # NOTE(review): the call these arguments belong to was
                    # missing, which left the function unparseable.
                    # execute_with_retry_s is assumed to be the stats_db
                    # method that runs one SQL statement — confirm.
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)',
                        clicked_ts, snippet_id, cfg.lang_code, rev['rev_id'])

    return 0
def compute_fixed_snippets(cfg):
    """Record recently clicked snippets that no longer appear in their page.

    Loads, per page title, a mapping of clicked snippet id to click timestamp
    covering the last three hours, fetches the current contents and timestamp
    of each page, and re-parses it. A clicked snippet that is absent from the
    page and was clicked before the page timestamp is inserted into the
    `fixed` table.

    Args:
        cfg: localized configuration; `lang_code`, `wikipedia_domain` and
            `user_agent` are read from it.

    Returns:
        Always 0.
    """
    log.info('computing fixed snippets for %s', cfg.lang_code)

    # NOTE(review): live_db is not used below; the call is kept because any
    # side effects of chdb.init_db are not visible from this function.
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        log.info('No pages to process!')
        # Nothing to reparse: stop here instead of logging an empty page
        # list (or failing on a None result) below.
        return 0
    log.info('Will reparse pages: %r', page_title_to_snippets.keys())

    # Now fetch and parse the pages and check which snippets are gone
    # NOTE(review): the second argument was missing (the call was left
    # unclosed); cfg.user_agent matches the other MediaWikiAPI call sites.
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)
        # FIXME Duplicated logic with parse_live.py :(
        # Drop every snippet still present in the page; what remains in
        # snippet_to_ts afterwards is gone from the page.
        for sec, snips in snippets:
            for sni in snips:
                present_id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(present_id, None)

        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                # NOTE(review): the call these arguments belong to was
                # missing, which left the function unparseable.
                # execute_with_retry_s is assumed to be the stats_db method
                # that runs one SQL statement — confirm.
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)', clicked_ts,
                    snippet_id, cfg.lang_code)

    return 0
Exemplo n.º 9
def compute_fixed_snippets():
    """Record snippets that were clicked and then vanished between databases.

    Compares the snippets loaded from the live database with those loaded
    from the scratch database. Every snippet that is in the live set but not
    in the scratch set, and that was clicked between the creation dates of
    the two `snippets` tables, is inserted into the `fixed` table of the
    stats database.

    Returns:
        Always 0.
    """
    start = time.time()
    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Find the set of snippets that were "clicked" (redirected to article)
    # between the dates of the previous/live and next/scratch database
    from_ts = live_db.execute_with_retry(load_table_creation_date, 'snippets')
    to_ts = scratch_db.execute_with_retry(load_table_creation_date, 'snippets')
    clicked = stats_db.execute_with_retry(load_snippet_clicks_between,
                                          cfg.lang_code, from_ts, to_ts)

    # Load the snippets from both databases
    scratch_snippets = scratch_db.execute_with_retry(load_snippets)
    live_snippets = live_db.execute_with_retry(load_snippets)

    # And for each snippet that disappeared across databases AND had been
    # clicked in the meantime, store its information in the stats database.
    gone = live_snippets.difference(scratch_snippets)
    # items() rather than iteritems() so this runs on Python 2 and Python 3.
    for snippet_id, clicked_ts in clicked.items():
        if snippet_id in gone:
            # NOTE(review): the call these arguments belong to and its last
            # argument were missing, which left the function unparseable.
            # execute_with_retry_s is assumed to be the stats_db method that
            # runs one SQL statement, and the third value is assumed to be
            # cfg.lang_code — confirm against the `fixed` table schema.
            stats_db.execute_with_retry_s(
                'INSERT INTO fixed VALUES (%s, %s, %s)', clicked_ts,
                snippet_id, cfg.lang_code)

    log.info('all done in %d seconds.', time.time() - start)
    return 0