Example #1
def intersect_with_page_titles(cfg, page_titles):
    wiki = mwapi.MediaWikiAPI('https://' + cfg.wikipedia_domain + '/w/api.php',
                              cfg.user_agent)
    page_ids = []
    for chunk in ichunk(page_titles, PAGE_TITLES_PER_API_REQUEST):
        params = {'titles': '|'.join(chunk)}
        for response in wiki.query(params):
            if 'query' in response and 'pages' in response['query']:
                page_ids.extend(list(response['query']['pages'].keys()))
    if not page_ids:
        return '', []
    return intersect_with_page_ids(cfg, page_ids)
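
Example #1 depends on an ichunk helper and a PAGE_TITLES_PER_API_REQUEST constant that are defined elsewhere in the project. A minimal sketch of a compatible helper, assuming it only needs to split an iterable of titles into batches of at most size items (the batch size below is a placeholder, not the project's value):

import itertools

PAGE_TITLES_PER_API_REQUEST = 50  # assumed batch size

def ichunk(iterable, size):
    # Yield successive tuples of at most `size` items from `iterable`.
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk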
Example #2
def initializer(backdir):
    # Note: 'self' is not a method receiver here; in the original module it
    # appears to be a module-level object that holds per-worker-process state.
    self.backdir = backdir

    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
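
The initializer above is meant to run once in each worker of a multiprocessing pool, and the Finalize call registers a matching cleanup hook for when the worker exits. A hedged sketch of how such a pool might be wired up (the pool size and backdir value are placeholders):

import multiprocessing

# Each worker process calls initializer('/tmp/backdir') once at startup; the
# Finalize hook registered inside it runs when the worker shuts down.
pool = multiprocessing.Pool(processes=4,
                            initializer=initializer,
                            initargs=('/tmp/backdir',))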
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    clicked_snippets.remove(clicked_snippet)
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    live_db.close()
    stats_db.close()
    return 0
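
This version relies on a get_page_revisions helper that is not shown. One plausible shape for it, reusing the wiki.query generator seen in Example #1 together with the standard MediaWiki revisions API; the query parameters are real API parameters, but the helper itself and its return format are assumptions:

def get_page_revisions(wiki, page_title, start_ts):
    # Hypothetical helper: fetch revisions of page_title made at or after
    # start_ts, oldest first, together with their wikitext contents.
    revisions = []
    params = {
        'prop': 'revisions',
        'titles': page_title,
        'rvprop': 'ids|timestamp|content',
        'rvdir': 'newer',
        'rvstart': start_ts.strftime('%Y-%m-%dT%H:%M:%SZ'),
    }
    for response in wiki.query(params):
        for page in response['query']['pages'].values():
            for rev in page.get('revisions', []):
                revisions.append({
                    'rev_id': rev['revid'],
                    'contents': rev['*'],
                    'timestamp': datetime.datetime.strptime(
                        rev['timestamp'], '%Y-%m-%dT%H:%M:%SZ'),
                })
    return revisions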
def compute_fixed_snippets(cfg):
    log.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        log.info('No pages to process!')
        return
    log.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI('https://' + cfg.wikipedia_domain + '/w/api.php',
                              cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)
        # FIXME Duplicated logic with parse_live.py :(
        for sec, snips in snippets:
            for sni in snips:
                id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(id, None)

        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)', clicked_ts,
                    snippet_id, cfg.lang_code)

    live_db.close()
    stats_db.close()
    return 0
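
Both versions of compute_fixed_snippets also assume mkid() and d() helpers from the project. A rough sketch of compatible implementations, assuming mkid only needs to derive a short, stable hash of the snippet text and d coerces bytes to unicode:

import hashlib

def d(s):
    # Hypothetical: ensure unicode text (no-op for str input).
    return s.decode('utf-8') if isinstance(s, bytes) else s

def mkid(s):
    # Hypothetical: short, stable identifier derived from the snippet text.
    return hashlib.sha1(s.encode('utf-8')).hexdigest()[:8]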
Example #5
def get_localized_config(lang_code=None, api=True):
    if lang_code is None:
        lang_code = os.getenv('CH_LANG')
    lang_config = _LANG_CODE_TO_CONFIG[lang_code]
    cfg = Config(lang_code=lang_code,
                 **reduce(_inherit,
                          [_GLOBAL_CONFIG, _BASE_LANG_CONFIG, lang_config]))
    cfg.lang_codes_to_lang_names = LANG_CODES_TO_LANG_NAMES

    cfg.wikipedia = None
    if api:
        # This module is imported pretty often during some manual operations
        # (e.g. creating cronjobs), and yamwapi is the only third-party
        # dependency that would require us to enter the virtualenv so... as
        # a convenience hack, we avoid importing it at module level.
        import yamwapi
        cfg.wikipedia = yamwapi.MediaWikiAPI(
            'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
        cfg.citation_needed_templates = _resolve_redirects_to_templates(
            cfg.wikipedia, cfg.citation_needed_templates)
    return cfg
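
A short usage sketch for the function above, assuming 'en' is one of the configured language codes; passing api=False skips the yamwapi import and the template-redirect resolution:

cfg = get_localized_config('en', api=False)
print(cfg.lang_code)   # 'en'
print(cfg.wikipedia)   # None, because api=False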
Example #6
def format_html(html):
    # Reconstructed function head (the example is cut off above); the exact
    # lynx flags are an assumption. The HTML is piped through lynx to obtain
    # a plain-text rendering.
    lynx = subprocess.Popen(
        ['lynx', '-dump', '-stdin'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print('Failed to render HTML! Do you have lynx?', file=sys.stderr)
        return html
    return stdout.decode('utf-8').strip('\n')

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    try:
        # A purely numeric argument is treated as a page id, anything else as
        # a page title.
        page_id = int(arguments['<title_or_pageid>'])
        wikitext = wikipedia.get_page_contents(pageid=page_id)
    except ValueError:
        wikitext = wikipedia.get_page_contents(
            title=arguments['<title_or_pageid>'])

    for snippet in parser.extract(wikitext):
        print('Section: %s' % snippet.section)
        if arguments['--output'] != 'raw':
            output = format_html(snippet.snippet)
        else:
            # Assumed completion of the truncated example: keep the raw
            # snippet text as-is, then print whichever form was produced.
            output = snippet.snippet
        print(output)
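
The docopt call at the top of this example parses a module docstring that is not shown. A hypothetical usage string consistent with the arguments accessed above ('<title_or_pageid>' and '--output'):

"""Print the snippets extracted from a Wikipedia page.

Usage:
    snippet_parser.py <title_or_pageid> [--output=<format>]

Options:
    --output=<format>  'html' (rendered through lynx) or 'raw' [default: html].
"""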
Example #7
    def setUp(self):
        self._api = yamwapi.MediaWikiAPI(self.TEST_API_URL,
                                         self.TEST_USER_AGENT)
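
Example #7 is a fragment of a unittest.TestCase. A minimal sketch of the surrounding class, with hypothetical values for TEST_API_URL and TEST_USER_AGENT and a hypothetical test method:

import unittest
import yamwapi

class MediaWikiAPITest(unittest.TestCase):
    # Hypothetical constants; the original test defines its own values.
    TEST_API_URL = 'https://en.wikipedia.org/w/api.php'
    TEST_USER_AGENT = 'example-tests/0.1'

    def setUp(self):
        self._api = yamwapi.MediaWikiAPI(self.TEST_API_URL,
                                         self.TEST_USER_AGENT)

    def test_api_object_is_created(self):
        self.assertIsNotNone(self._api)

if __name__ == '__main__':
    unittest.main()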