def main(*args):
    """Configure the Commons bot from command line arguments and start it.

    Options handled here (everything else is offered to
    ``pywikibot.handle_args`` and ``ArticleIteratorArgumentParser`` first):

    * ``-category:NAME``   -- category to process instead of the default.
    * ``-start-at:DATE``   -- start date; ``T0:00:00Z`` is appended before it
      is parsed with ``pywikibot.Timestamp.fromISOformat``.
    * ``-sleep-seconds:N`` -- stored on the bot when N is a positive integer.
    * ``-once``            -- call ``run_once`` instead of ``run_continuously``.
    * ``-local-media``     -- look up the category on the configured wiki
      instead of Commons.
    """
    # Use the site configured in params/user-config
    wikipedia_site = pywikibot.Site()
    commons_site = pywikibot.Site("commons", "commons")

    checker = TemplateChecker()
    checker.load_config("config/templates.json")
    commons_bot = CommonsBot(wikipedia_site, checker)

    callbacks = ArticleIteratorCallbacks(
        logging_callback=pywikibot.log,
        article_callback=commons_bot.cb_check_article
    )
    article_iterator = ArticleIterator(callbacks)
    article_iterator.log_every_n = 1
    parser = ArticleIteratorArgumentParser(article_iterator, None)

    # Defaults that the command line may override below.
    run_cmd = commons_bot.run_continuously
    category_name = u"Images from Wiki Loves Monuments 2015 in Germany"
    start_time = first_day_of_month()

    category_opt = "-category:"
    start_opt = "-start-at:"
    sleep_opt = "-sleep-seconds:"

    for argument in pywikibot.handle_args(args):
        if argument.startswith(category_opt):
            category_name = argument[len(category_opt):]
            continue
        # The iterator's own options are only consulted after -category.
        if parser.check_argument(argument):
            continue
        if argument.startswith(start_opt):
            start_time_iso = argument[len(start_opt):] + "T0:00:00Z"
            start_time = pywikibot.Timestamp.fromISOformat(start_time_iso)
            continue
        if argument.startswith(sleep_opt):
            seconds = int(argument[len(sleep_opt):])
            # Zero and negative values are ignored; the bot keeps its default.
            if seconds > 0:
                commons_bot.sleep_seconds = seconds
            continue
        if argument == "-once":
            run_cmd = commons_bot.run_once
        elif argument == "-local-media":
            commons_site = wikipedia_site

    category = pywikibot.Category(commons_site, category_name)
    run_cmd(article_iterator, start_time, category)
def main(*args):
    """Check the articles of all fetched categories and report the result.

    Options handled here (after ``pywikibot.handle_args`` and
    ``ArticleIteratorArgumentParser`` had their turn):

    * ``-outputpage:TITLE``      -- stored as ``checker_bot.outputpage``; the
      summary is then saved below ``TITLE/Zusammenfassung``.
    * ``-exclude-articles:PAGE`` -- wiki page from which the list of excluded
      articles is loaded.

    Without an output page the summary is written via ``pywikibot.output``.
    """
    site = pywikibot.Site()
    fetcher = CategoryFetcher(site)

    checker = TemplateChecker()
    checker.load_config("config/templates.json")
    checker_bot = CheckerBot(checker, site)

    all_categories = fetcher.get_categories()
    callbacks = ArticleIteratorCallbacks(
        category_callback=checker_bot.cb_store_category_result,
        article_callback=checker_bot.cb_check_article,
        logging_callback=pywikibot.log,
    )
    article_iterator = ArticleIterator(callbacks, categories=all_categories)
    parser = ArticleIteratorArgumentParser(article_iterator, fetcher)

    output_opt = "-outputpage:"
    exclude_opt = "-exclude-articles:"

    for argument in pywikibot.handle_args(list(args)):
        if parser.check_argument(argument):
            continue
        if argument.startswith(output_opt):
            checker_bot.outputpage = argument[len(output_opt):]
        elif argument.startswith(exclude_opt):
            exclusion_page = pywikibot.Page(site, argument[len(exclude_opt):])
            article_iterator.excluded_articles = load_excluded_articles_from_wiki(exclusion_page)

    article_iterator.iterate_categories()

    # Don't update summary page if only single categories were crawled
    if article_iterator.categories != all_categories:
        return

    summary = checker_bot.generate_summary_page()
    if checker_bot.outputpage:
        checker_bot.save_wikipage(summary, checker_bot.outputpage + u"/Zusammenfassung")
    else:
        pywikibot.output(u"Zusammenfassung")
        pywikibot.output(u"===============")
        pywikibot.output(summary)
        # NOTE(review): the original layout was lost; the config table is
        # assumed to belong to the console branch -- confirm against history.
        pywikibot.output(checker_bot.generate_config_table())
def main():
    """Print JSON information about the monument data found in wiki text.

    The wiki text is read from the optional positional ``infile`` argument
    (standard input when omitted). ``--monument_id`` / ``-i`` is passed on to
    ``get_template_info`` together with the text. The result is serialized
    with ``json.dump`` to standard output as UTF-8.
    """
    parser = argparse.ArgumentParser(description='Generate JSON info about monument data in wiki text.')
    parser.add_argument('--monument_id', '-i', help='Unique ID of the monument. Validity will be checked.',
                        default='', metavar='ID')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    args = parser.parse_args()

    checker = TemplateChecker()
    checker.load_config("config/templates.json")
    mapper = CommonscatMapper()
    info = get_template_info(checker, mapper, args.infile.read(), args.monument_id)

    # A codecs stream writer hands *bytes* to the wrapped stream. On Python 3
    # sys.stdout is a text stream and rejects bytes with a TypeError, so wrap
    # its underlying binary buffer when there is one. On Python 2 there is no
    # ``buffer`` attribute and sys.stdout itself accepts byte strings.
    byte_stream = getattr(sys.stdout, "buffer", sys.stdout)
    utf8_writer = codecs.getwriter('utf8')
    json.dump(info, utf8_writer(byte_stream))
def setup_instances():
    """Create the shared service objects and attach them to ``g``.

    Sets up the mwclient sites for Commons and German Wikipedia, the campaign
    validator, the page information collector (template checker plus
    Commonscat mapping, both loaded from ``config/``) and the campaign cache.
    The cache is a ``RedisCache`` when ``REDIS_CACHE_PREFIX`` is configured
    with a truthy value, otherwise a ``SimpleCache``.
    """
    commons = mwclient.Site("commons.wikimedia.org")
    g.site_commons = commons
    g.site_wikipedia = mwclient.Site("de.wikipedia.org")
    g.campaign_validator = CampaignValidator(commons)

    template_checker = TemplateChecker()
    template_checker.load_config("config/templates.json")
    commonscat_mapper = CommonscatMapper()
    commonscat_mapper.load_mapping("config/commonscat_mapping.json")
    g.page_information_collector = PageInformationCollector(template_checker, commonscat_mapper)

    cache_prefix = app.config["REDIS_CACHE_PREFIX"]
    if not cache_prefix:
        # No prefix configured: fall back to the SimpleCache.
        g.campaign_cache = SimpleCache()
        return
    g.campaign_cache = RedisCache(host=app.config["REDIS_HOST"], key_prefix=cache_prefix)
def test_is_allowed_template_normalizes_underscores(self):
    """Underscores and spaces are interchangeable in template names.

    Checked in both directions: an underscored template name against a
    config key written with spaces, and a spaced template name against a
    config key written with underscores.
    """
    candidate = Mock()

    # Template name with underscores, config key with spaces.
    candidate.name = u"Denkmalliste_Sachsen_Tabellenzeile"
    found = self.checker.is_allowed_template(candidate)
    self.assertTrue(found, "Template not found")

    # Check if config is normalized
    brandenburg_rules = {
        "id": "ID",
        "id_check": "\\d{4,}",
        "id_check_description": u"Nummer, mindestens vierstellig"
    }
    self.config[u"Denkmalliste_Brandenburg_Tabellenzeile"] = brandenburg_rules
    self.checker = TemplateChecker(self.config)

    candidate.name = u"Denkmalliste Brandenburg Tabellenzeile"
    found = self.checker.is_allowed_template(candidate)
    self.assertTrue(found, "Template not found")
def setUp(self):
    """Create a TemplateChecker configured with two sample row templates."""
    sachsen_rules = {
        "id": "ID",
        "id_check": "\\d{4,}",
        "id_check_description": u"Nummer, mindestens vierstellig"
    }
    bayern_rules = {
        "id": "Nummer",
        "id_check": "D-\\d-\\d{3}",
        "id_check_description": u"Nummer im Format D-n-nnn"
    }
    self.config = {
        u"Denkmalliste Sachsen Tabellenzeile": sachsen_rules,
        u"Denkmalliste Bayern Tabellenzeile": bayern_rules
    }
    self.checker = TemplateChecker(self.config)
class TestTemplateChecker(unittest.TestCase):
    """Tests for TemplateChecker: ID extraction, ID validation, template
    name matching and the per-article error report."""

    def setUp(self):
        """Create a checker configured with two sample row templates."""
        sachsen_rules = {
            "id": "ID",
            "id_check": "\\d{4,}",
            "id_check_description": u"Nummer, mindestens vierstellig"
        }
        bayern_rules = {
            "id": "Nummer",
            "id_check": "D-\\d-\\d{3}",
            "id_check_description": u"Nummer im Format D-n-nnn"
        }
        self.config = {
            u"Denkmalliste Sachsen Tabellenzeile": sachsen_rules,
            u"Denkmalliste Bayern Tabellenzeile": bayern_rules
        }
        self.checker = TemplateChecker(self.config)

    def create_article_with_text(self, text):
        """ Build an Article fixture that returns `text` and is no redirect """
        fixture = Mock()
        fixture.get.return_value = text
        fixture.isRedirectPage.return_value = False
        return fixture

    def _sachsen_row(self, parameter):
        """ Build a template fixture for the Sachsen row template whose
        `get` returns `parameter` """
        row = Mock()
        row.get.return_value = parameter
        row.name = u"Denkmalliste Sachsen Tabellenzeile"
        return row

    def test_text_contains_templates_finds_template_name(self):
        wikitext = "{{Denkmalliste Sachsen Tabellenzeile|}}"
        self.assertTrue(self.checker.text_contains_templates(wikitext))

    def test_get_id_returns_id(self):
        row = self._sachsen_row(u"ID=12345")
        self.assertEqual(self.checker.get_id(row), u"12345")

    def test_get_id_returns_empty_string_if_is_empty(self):
        row = self._sachsen_row(u"ID=")
        self.assertEqual(self.checker.get_id(row), u"")

    def test_has_valid_id_true_for_valid_ids(self):
        row = self._sachsen_row(u"ID=12345")
        self.assertTrue(self.checker.has_valid_id(row))

    def test_has_valid_id_true_for_invalid_ids(self):
        """ Despite the method name, this asserts that a three-digit ID is
        reported as invalid """
        row = self._sachsen_row(u"ID=123")
        self.assertFalse(self.checker.has_valid_id(row))

    def test_setting_configuration_compiles_regex_patterns(self):
        self.checker.config = {
            "Denkmalliste Bayern Tabellenzeile": {
                "id": "ID",
                "id_check": "D-d{3}"
            }
        }
        pattern_type = type(re.compile("test"))
        stored_check = self.checker.config["Denkmalliste Bayern Tabellenzeile"]["id_check"]
        self.assertIsInstance(stored_check, pattern_type)

    def test_is_allowed_template_checks_if_template_name_is_configured(self):
        candidate = Mock()

        candidate.name = u"Denkmalliste Sachsen Tabellenzeile"
        self.assertTrue(self.checker.is_allowed_template(candidate))

        candidate.name = u"Denkmalliste Kleinkleckersdorf Tabellenzeile"
        self.assertFalse(self.checker.is_allowed_template(candidate))

    def test_is_allowed_template_normalizes_underscores(self):
        """ Underscores and spaces are interchangeable, both in the template
        name and in the configuration keys """
        candidate = Mock()

        # Template name with underscores, config key with spaces.
        candidate.name = u"Denkmalliste_Sachsen_Tabellenzeile"
        found = self.checker.is_allowed_template(candidate)
        self.assertTrue(found, "Template not found")

        # Check if config is normalized
        brandenburg_rules = {
            "id": "ID",
            "id_check": "\\d{4,}",
            "id_check_description": u"Nummer, mindestens vierstellig"
        }
        self.config[u"Denkmalliste_Brandenburg_Tabellenzeile"] = brandenburg_rules
        self.checker = TemplateChecker(self.config)

        candidate.name = u"Denkmalliste Brandenburg Tabellenzeile"
        found = self.checker.is_allowed_template(candidate)
        self.assertTrue(found, "Template not found")

    def test_check_for_errors_skips_redirect_pages(self):
        redirect = Mock()
        redirect.isRedirectPage.return_value = True
        self.assertEqual(None, self.checker.check_article_for_errors(redirect))

    def test_check_for_errors_reports_pages_without_templates(self):
        page = self.create_article_with_text(u"Just some test text")
        report = self.checker.check_article_for_errors(page)
        self.assertEqual({TemplateChecker.ERROR_MISSING_TEMPLATE: True}, report)

    def test_check_for_errors_reports_invalid_ids(self):
        page = self.create_article_with_text(u"{{Denkmalliste Sachsen Tabellenzeile|ID=1}}")
        report = self.checker.check_article_for_errors(page)
        self.assertEqual({TemplateChecker.ERROR_INVALID_IDS: 1}, report)

    def test_check_for_errors_returns_empty_dict_for_valid_text(self):
        page = self.create_article_with_text(u"{{Denkmalliste Sachsen Tabellenzeile|ID=1234}}")
        report = self.checker.check_article_for_errors(page)
        self.assertEqual({}, report)

    def test_check_for_errors_reports_duplicate_ids(self):
        wikitext = u"{{Denkmalliste Sachsen Tabellenzeile|ID=1234}}{{Denkmalliste Sachsen Tabellenzeile|ID=1234}}{{Denkmalliste Sachsen Tabellenzeile|ID=1223}}"
        page = self.create_article_with_text(wikitext)
        report = self.checker.check_article_for_errors(page)
        self.assertEqual({TemplateChecker.ERROR_DUPLICATE_IDS: {u"1234": 2}}, report)

    def test_check_for_errors_reports_too_many_templates(self):
        wikitext = u"{{Denkmalliste Sachsen Tabellenzeile|ID=1234}}" * 600
        page = self.create_article_with_text(wikitext)
        report = self.checker.check_article_for_errors(page)
        expected_report = {
            TemplateChecker.ERROR_DUPLICATE_IDS: {u"1234": 600},
            TemplateChecker.ERROR_TOO_MANY_TEMPLATES: 600
        }
        self.assertEqual(expected_report, report)

    def test_check_for_errors_can_report_multiple_errors(self):
        wikitext = u"{{Denkmalliste Sachsen Tabellenzeile|ID=1}}{{Denkmalliste Sachsen Tabellenzeile|ID=1}}{{Denkmalliste Sachsen Tabellenzeile|}}"
        page = self.create_article_with_text(wikitext)
        report = self.checker.check_article_for_errors(page)
        expected_errors = {
            TemplateChecker.ERROR_INVALID_IDS: 2,
            TemplateChecker.ERROR_MISSING_IDS: 1,
            TemplateChecker.ERROR_DUPLICATE_IDS: {u"1": 2}
        }
        self.assertEqual(expected_errors, report)