def intersect_with_page_titles(cfg, page_titles):
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    page_ids = []
    for chunk in ichunk(page_titles, PAGE_TITLES_PER_API_REQUEST):
        params = {'titles': '|'.join(chunk)}
        for response in wiki.query(params):
            if 'query' in response and 'pages' in response['query']:
                page_ids.extend(list(response['query']['pages'].keys()))
    if not page_ids:
        return '', []
    return intersect_with_page_ids(cfg, page_ids)
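# NOTE: intersect_with_page_titles depends on an ichunk helper and a
# PAGE_TITLES_PER_API_REQUEST constant that are not shown in this file.
# Below is a minimal sketch of what they might look like, assuming ichunk
# lazily yields fixed-size chunks of an iterable; the constant's value is an
# assumption (the MediaWiki API caps 'titles' at 50 per request for non-bot
# clients).
import itertools

PAGE_TITLES_PER_API_REQUEST = 50

def ichunk(iterable, chunk_size):
    # Yield successive tuples of at most chunk_size items from iterable.
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, chunk_size))
        if not chunk:
            return
        yield chunk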
def initializer(backdir):
    self.backdir = backdir
    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.exception_count = 0
    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
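# NOTE: a sketch of how a per-worker initializer like the one above is
# typically wired into a multiprocessing.Pool, so that the API client and
# snippet parser are created once per worker process rather than per task.
# The pool size and the start_worker_pool wrapper are illustrative
# assumptions; only initializer and backdir come from the code above.
import multiprocessing

def start_worker_pool(backdir):
    # Each worker process runs initializer(backdir) exactly once at startup.
    return multiprocessing.Pool(
        processes=multiprocessing.cpu_count(),
        initializer=initializer, initargs=(backdir,))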
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    clicked_snippets.remove(clicked_snippet)
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    live_db.close()
    stats_db.close()
    return 0
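# NOTE: compute_fixed_snippets identifies a snippet by
# mkid(d(page_title) + snippet_text), but neither helper is defined in this
# file. The following is a sketch of plausible implementations, assuming d()
# merely normalizes bytes to unicode and mkid() is a short, stable content
# hash; the hash algorithm and id length are assumptions.
import hashlib

def d(s):
    # Decode bytes to unicode so titles and snippet text can be concatenated.
    return s.decode('utf-8') if isinstance(s, bytes) else s

def mkid(s):
    # Derive a short, deterministic id from page title + snippet text, so the
    # same snippet maps to the same id across reparses.
    return hashlib.sha1(s.encode('utf-8')).hexdigest()[:8]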
def compute_fixed_snippets(cfg):
    log.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        log.info('No pages to process!')
        return
    log.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)

        # FIXME Duplicated logic with parse_live.py :(
        for sec, snips in snippets:
            for sni in snips:
                id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(id, None)

        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)',
                    clicked_ts, snippet_id, cfg.lang_code)

    live_db.close()
    stats_db.close()
    return 0
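# NOTE: a hypothetical entry point for running compute_fixed_snippets as a
# periodic, per-language job. Only compute_fixed_snippets and
# config.get_localized_config (shown further below) come from this code; the
# command-line handling here is an illustrative assumption.
if __name__ == '__main__':
    import sys
    import config
    lang_code = sys.argv[1] if len(sys.argv) > 1 else None
    sys.exit(compute_fixed_snippets(config.get_localized_config(lang_code)))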
def get_localized_config(lang_code=None, api=True):
    if lang_code is None:
        lang_code = os.getenv('CH_LANG')
    lang_config = _LANG_CODE_TO_CONFIG[lang_code]
    cfg = Config(lang_code=lang_code, **reduce(
        _inherit, [_GLOBAL_CONFIG, _BASE_LANG_CONFIG, lang_config]))
    cfg.lang_codes_to_lang_names = LANG_CODES_TO_LANG_NAMES

    cfg.wikipedia = None
    if api:
        # This module is imported pretty often during some manual operations
        # (e.g. creating cronjobs), and yamwapi is the only third-party
        # dependency that would require us to enter the virtualenv so... as
        # a convenience hack, we avoid importing it at module level.
        import yamwapi
        cfg.wikipedia = yamwapi.MediaWikiAPI(
            'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
        cfg.citation_needed_templates = _resolve_redirects_to_templates(
            cfg.wikipedia, cfg.citation_needed_templates)
    return cfg
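# NOTE: get_localized_config folds _GLOBAL_CONFIG, _BASE_LANG_CONFIG and the
# per-language config together with reduce(_inherit, ...), but the helper is
# not shown here. This is a sketch of one plausible implementation, assuming
# each config is a plain dict and later dicts override earlier ones; the real
# helper may do more (e.g. merging list-valued keys).
def _inherit(base, child):
    # Child keys win over base keys; unspecified keys fall through.
    merged = dict(base)
    merged.update(child)
    return merged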
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print('Failed to render HTML! Do you have lynx?', file=sys.stderr)
        return html
    return stdout.decode('utf-8').strip('\n')

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    try:
        int(arguments['<title_or_pageid>'])
        wikitext = wikipedia.get_page_contents(
            pageid=int(arguments['<title_or_pageid>']))
    except:
        wikitext = wikipedia.get_page_contents(
            title=arguments['<title_or_pageid>'])

    for snippet in parser.extract(wikitext):
        print('Section: %s' % snippet.section)
        if arguments['--output'] != 'raw':
            output = format_html(snippet.snippet)
        else:
def setUp(self):
    self._api = yamwapi.MediaWikiAPI(self.TEST_API_URL, self.TEST_USER_AGENT)
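# NOTE: a minimal sketch of the unittest.TestCase this setUp would live in,
# with setUp repeated from above for context. The class name, the URL and
# user-agent values, and the smoke test are illustrative assumptions; only
# TEST_API_URL, TEST_USER_AGENT and self._api come from the code above.
import unittest

class MediaWikiAPITest(unittest.TestCase):
    TEST_API_URL = 'https://en.wikipedia.org/w/api.php'
    TEST_USER_AGENT = 'citationhunt-tests'

    def setUp(self):
        self._api = yamwapi.MediaWikiAPI(
            self.TEST_API_URL, self.TEST_USER_AGENT)

    def test_creates_api_client(self):
        # Smoke check that setUp produced a client object.
        self.assertIsNotNone(self._api)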