def petscan(self, categories, not_categories=None, article=False, year=None):
    searcher = PetScan()
    searcher.set_timeout(120)
    if article:  # Article
        searcher.add_namespace(0)
    else:  # Seite
        searcher.add_namespace(102)
    searcher.set_search_depth(5)
    if year:
        searcher.add_positive_category(f"Die Gartenlaube ({year})")
    else:
        searcher.add_positive_category("Die Gartenlaube")
    for category in categories:
        searcher.add_positive_category(category)
    if not_categories:
        for category in not_categories:
            searcher.add_negative_category(category)
    self.logger.debug(str(searcher))
    return len(searcher.run())
def search_pages(self):  # pragma: no cover
    searcher = PetScan()
    for template in self._templates:
        searcher.add_any_template(template)
    searcher.add_namespace(0)
    self.logger.info(str(searcher))
    lemmas = searcher.run()
    self.logger.info(f"{len(lemmas)} to process.")
    return lemmas
def _prepare_searcher(self) -> PetScan:
    searcher = PetScan()
    searcher.add_yes_template("REDaten")
    if self.debug:
        searcher.add_namespace(2)
    else:
        searcher.add_namespace(0)
    searcher.add_positive_category("RE:Fertig")
    searcher.add_positive_category("RE:Korrigiert")
    searcher.add_positive_category("RE:Platzhalter")
    searcher.set_logic_union()
    searcher.set_sort_criteria("date")
    searcher.set_sortorder_decending()
    searcher.set_timeout(120)
    return searcher
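A minimal consumption sketch for a prepared searcher like the one above (not from the original sources): it assumes only what the surrounding snippets and tests show, namely that str(searcher) yields the PetScan request URL and run() returns a list of dicts carrying a "title" key.

# Hedged usage sketch; all method names are taken from the snippets in this file.
searcher = PetScan()
searcher.add_yes_template("REDaten")
searcher.add_positive_category("RE:Fertig")
searcher.set_logic_union()
searcher.set_timeout(120)
print(str(searcher))  # the generated petscan.wmflabs.org query URL
titles = [lemma["title"] for lemma in searcher.run()]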
def petscan(self, categories, not_categories=None, article=False, year=None):
    searcher = PetScan()
    if article:
        searcher.add_namespace("Article")
    else:
        searcher.add_namespace("Seite")
    searcher.set_search_depth(5)
    if year:
        searcher.add_positive_category(f"Die Gartenlaube ({year})")
    else:
        searcher.add_positive_category("Die Gartenlaube")
    for category in categories:
        searcher.add_positive_category(category)
    if not_categories:
        for category in not_categories:
            searcher.add_negative_category(category)
    self.logger.debug(str(searcher))
    return len(searcher.run())
def get_sites_in_cat(self, list_of_cat, namespace=None, depth=None,
                     any_template: list = None, union=False):
    # pylint: disable=too-many-arguments
    searcher = PetScan()
    for cat in list_of_cat:
        searcher.add_positive_category(cat)
    if any_template:
        for template in any_template:
            searcher.add_any_template(template)
    if union:
        searcher.set_logic_union()
    if namespace:
        searcher.add_namespace(namespace)
    if depth:
        searcher.set_search_depth(depth)
    self.logger.info(searcher)
    list_of_lemmas = searcher.run()
    del searcher
    # format the count with a German thousands separator, e.g. 1.234
    return '{0:,}'.format(len(list_of_lemmas)).replace(',', '.')
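A standalone sketch of the formatting trick in the return statement above: Python's "," format specifier inserts English thousands separators, which are then swapped for the German ".".

# German-style thousands separator, as used by get_sites_in_cat().
count = 1234567
assert '{0:,}'.format(count).replace(',', '.') == "1.234.567"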
def crawler_cat_index_site():
    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
    searcher_werke.add_negative_category('Zeitschrift')
    searcher_werke.set_search_depth(4)
    searcher_werke.set_logic(log_or=True)
    list_werke = [item['a']['title'] for item in searcher_werke.run()]
    all_sites = set()
    counter = 1
    for werk in list_werke:
        searcher_sites = PetScan()
        searcher_sites.add_namespace(102)
        searcher_sites.add_positive_category('Fertig')
        searcher_sites.add_positive_category('Korrigiert')
        searcher_sites.add_positive_category('Unkorrigiert')
        searcher_sites.set_logic(log_or=True)
        searcher_sites.add_any_outlink(werk)
        # this kind of link has a bug on catscan2:
        # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
        sites = searcher_sites.run()
        if sites:
            all_sites |= {site['a']['title'] for site in sites}
        else:
            searcher_index = PetScan()
            searcher_index.add_any_outlink(werk)
            searcher_index.add_namespace(104)
            searcher_index.add_positive_category('Index')
            index = searcher_index.run()
            if index:
                searcher_sites = PetScan()
                searcher_sites.add_namespace(102)
                searcher_sites.add_positive_category('Fertig')
                searcher_sites.add_positive_category('Korrigiert')
                searcher_sites.add_positive_category('Unkorrigiert')
                searcher_sites.set_logic(log_or=True)
                searcher_sites.add_any_outlink(index[0]['a']['nstext'] + ':'
                                               + index[0]['a']['title'])
                # same catscan2 bug as above
                sites = searcher_sites.run()
                # merge the titles found via the index page as well
                all_sites |= {site['a']['title'] for site in sites}
            else:
                print(werk)
        print(counter, '/', len(list_werke), ' result:', len(all_sites))
        counter += 1
    with open('output.txt', 'w', encoding='utf-8') as f:
        f.writelines(["Seite:%s\n" % item for item in all_sites])
def crawler_cat_index_site():
    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
    searcher_werke.add_negative_category('Zeitschrift')
    searcher_werke.set_search_depth(4)
    searcher_werke.set_logic(log_or=True)
    list_werke = [item['a']['title'] for item in searcher_werke.run()]
    all_sites = set()
    counter = 1
    for werk in list_werke:
        searcher_sites = PetScan()
        searcher_sites.add_namespace('Seite')
        searcher_sites.add_positive_category('Fertig')
        searcher_sites.add_positive_category('Korrigiert')
        searcher_sites.add_positive_category('Unkorrigiert')
        searcher_sites.set_logic(log_or=True)
        searcher_sites.add_any_outlink(werk)
        # this kind of link has a bug on catscan2:
        # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
        sites = searcher_sites.run()
        if sites:
            all_sites |= {site['a']['title'] for site in sites}
        else:
            searcher_index = PetScan()
            searcher_index.add_any_outlink(werk)
            searcher_index.add_namespace('Index')
            searcher_index.add_positive_category('Index')
            index = searcher_index.run()
            if index:
                searcher_sites = PetScan()
                searcher_sites.add_namespace('Seite')
                searcher_sites.add_positive_category('Fertig')
                searcher_sites.add_positive_category('Korrigiert')
                searcher_sites.add_positive_category('Unkorrigiert')
                searcher_sites.set_logic(log_or=True)
                searcher_sites.add_any_outlink(index[0]['a']['nstext'] + ':'
                                               + index[0]['a']['title'])
                # same catscan2 bug as above
                sites = searcher_sites.run()
                # merge the titles found via the index page as well
                all_sites |= {site['a']['title'] for site in sites}
            else:
                print(werk)
        print(counter, '/', len(list_werke), ' result:', len(all_sites))
        counter += 1
    with open('output.txt', 'w', encoding='utf-8') as f:
        f.writelines(["Seite:%s\n" % item for item in all_sites])
class TestCatScan(TestCase):
    def setUp(self):
        self.petscan = PetScan()

    def test_add_options(self):
        self.petscan.add_options({"max_age": "45"})
        self.petscan.add_options({"smaller": "300"})
        self.assertDictEqual({"smaller": "300", "max_age": "45"}, self.petscan.options)

    def test_add_category(self):
        self.petscan.add_positive_category("pos1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_positive_category("pos3", 2)
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg2")
        self.petscan.add_negative_category("neg3", 3)
        self.assertEqual(["pos1", "pos2", "pos3|2"], self.petscan.categories["positive"])
        self.assertEqual(["neg1", "neg2", "neg3|3"], self.petscan.categories["negative"])

    def test_add_namespace(self):
        self.petscan.add_namespace(0)
        self.petscan.add_namespace("Datei")
        self.petscan.add_namespace([2, "Vorlage"])
        self.assertDictEqual({"ns[0]": "1", "ns[2]": "1", "ns[6]": "1", "ns[10]": "1"},
                             self.petscan.options)

    def test_activate_redirects(self):
        self.petscan.activate_redirects()
        self.assertDictEqual({"show_redirects": "yes"}, self.petscan.options)

    def test_deactivate_redirects(self):
        self.petscan.deactivate_redirects()
        self.assertDictEqual({"show_redirects": "no"}, self.petscan.options)

    def test_last_change_before(self):
        self.petscan.last_change_before(
            datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"before": "12340101020242"}, self.petscan.options)

    def test_last_change_after(self):
        self.petscan.last_change_after(
            datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"after": "12340101020242"}, self.petscan.options)

    def test_max_age(self):
        self.petscan.max_age(1234)
        self.assertDictEqual({"max_age": "1234"}, self.petscan.options)

    def test_only_new(self):
        self.petscan.only_new()
        self.assertDictEqual({"only_new": "1"}, self.petscan.options)

    def test_smaller_then(self):
        self.petscan.smaller_then(42)
        self.assertDictEqual({"smaller": "42"}, self.petscan.options)

    def test_larger_then(self):
        self.petscan.larger_then(42)
        self.assertDictEqual({"larger": "42"}, self.petscan.options)

    def test_get_wikidata(self):
        self.petscan.get_wikidata_items()
        self.assertDictEqual({"wikidata_item": "any"}, self.petscan.options)

    def test_get_pages_with_wikidata(self):
        self.petscan.get_pages_with_wd_items()
        self.assertDictEqual({"wikidata_item": "with"}, self.petscan.options)

    def test_get_pages_without_wikidata(self):
        self.petscan.get_pages_without_wd_items()
        self.assertDictEqual({"wikidata_item": "without"}, self.petscan.options)

    def test_set_or(self):
        self.petscan.set_logic_union()
        self.assertDictEqual({"combination": "union"}, self.petscan.options)

    def test_set_regex(self):
        self.petscan.set_regex_filter("abc")
        self.assertDictEqual({"regexp_filter": "abc"}, self.petscan.options)

    def test_set_last_edits(self):
        self.petscan.set_last_edit_bots(True)
        self.petscan.set_last_edit_anons(False)
        self.petscan.set_last_edit_flagged()
        self.assertDictEqual({"edits[bots]": "yes",
                              "edits[anons]": "no",
                              "edits[flagged]": "yes"},
                             self.petscan.options)

    def test_construct_cat_string(self):
        self.petscan.add_positive_category("pos 1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg 2")
        self.petscan.add_negative_category("neg3")
        self.assertEqual("pos+1\r\npos2",
                         self.petscan._construct_list_argument(
                             self.petscan.categories["positive"]))
        self.assertEqual("neg1\r\nneg+2\r\nneg3",
                         self.petscan._construct_list_argument(
                             self.petscan.categories["negative"]))

    def test_construct_templates(self):
        self.petscan.add_yes_template("yes1")
        self.petscan.add_yes_template("yes2")
        self.petscan.add_any_template("any1")
        self.petscan.add_any_template("any2")
        self.petscan.add_any_template("any3")
        self.petscan.add_no_template("no1")
        self.petscan.add_no_template("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource"
                         "&templates_yes=yes1%0D%0Ayes2"
                         "&templates_any=any1%0D%0Aany2%0D%0Aany3"
                         "&templates_no=no1%0D%0Ano2")

    def test_construct_outlinks(self):
        self.petscan.add_yes_outlink("yes1")
        self.petscan.add_yes_outlink("yes2")
        self.petscan.add_any_outlink("any1")
        self.petscan.add_any_outlink("any2")
        self.petscan.add_any_outlink("any3")
        self.petscan.add_no_outlink("no1")
        self.petscan.add_no_outlink("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource"
                         "&outlinks_yes=yes1%0D%0Ayes2"
                         "&outlinks_any=any1%0D%0Aany2%0D%0Aany3"
                         "&outlinks_no=no1%0D%0Ano2")

    def test_construct_links_to(self):
        self.petscan.add_yes_links_to("yes1")
        self.petscan.add_yes_links_to("yes2")
        self.petscan.add_any_links_to("any1")
        self.petscan.add_any_links_to("any2")
        self.petscan.add_any_links_to("any3")
        self.petscan.add_no_links_to("no1")
        self.petscan.add_no_links_to("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource"
                         "&links_to_all=yes1%0D%0Ayes2"
                         "&links_to_any=any1%0D%0Aany2%0D%0Aany3"
                         "&links_to_no=no1%0D%0Ano2")

    def test_construct_options(self):
        self.petscan.options = {"max_age": "1234", "get_q": "1", "show_redirects": "yes"}
        self.assertEqual("&max_age=1234" in str(self.petscan), True)
        self.assertEqual("&get_q=1" in str(self.petscan), True)
        self.assertEqual("&show_redirects=yes" in str(self.petscan), True)

    def test_construct_string(self):
        self.petscan.set_language("en")
        self.petscan.set_project("wikipedia")
        # only a positive category
        self.petscan.add_positive_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&categories=test")
        # only a negative category
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_negative_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&negcats=test")
        # only an option
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_options({"max_age": "10"})
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10")

    def test_do_positive(self):
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     text='{"n": "result","a": {"querytime_sec": 1.572163,'
                          '"query": "https://petscan.wmflabs.org/?language=de'
                          '&project=wikisource&categories=Autoren&get_q=1'
                          '&show_redirects=no&ns[0]=1&max_age=48'
                          '&format=json&doit=1"},'
                          '"*": [{"n": "combination",'
                          '"a": {"type": "subset",'
                          '"*": [{"id": 3279,'
                          '"len": 10197,'
                          '"n": "page",'
                          '"namespace": 0,'
                          '"nstext": "",'
                          '"q": "Q60644",'
                          '"title": "Friedrich_Rückert",'
                          '"touched": "20161024211701"}]}}]}')
            self.assertEqual(self.petscan.run(),
                             [{"id": 3279,
                               "len": 10197,
                               "n": "page",
                               "namespace": 0,
                               "nstext": "",
                               "q": "Q60644",
                               "title": "Friedrich_Rückert",
                               "touched": "20161024211701"}])

    def test_do_negative(self):
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     status_code=404)
            with self.assertRaises(ConnectionError):
                self.petscan.run()
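The mocked response in test_do_positive also documents the JSON shape PetScan returns. Below is a minimal sketch of the extraction the wrapper's run() presumably performs; this is an assumption derived from the asserted result, not the wrapper's actual source.

import json

# Shape taken from the mocked response in test_do_positive above.
response = json.loads('{"n": "result", "a": {}, "*": [{"n": "combination", '
                      '"a": {"type": "subset", '
                      '"*": [{"id": 3279, "title": "Friedrich_Rückert"}]}}]}')
pages = response["*"][0]["a"]["*"]  # the page list the test asserts on
assert pages[0]["title"] == "Friedrich_Rückert"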
class AuthorList(CanonicalBot):
    # pylint: disable=bare-except,too-many-branches,broad-except
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher = PetScan()
        self.repo = self.wiki.data_repository()  # this is a DataSite object
        self.string_list = []
        self.match_property = re.compile(r"\{\{#property:P(\d{1,4})\}\}")
        self.number_to_month = {
            1: "Januar", 2: "Februar", 3: "März",
            4: "April", 5: "Mai", 6: "Juni",
            7: "Juli", 8: "August", 9: "September",
            10: "Oktober", 11: "November", 12: "Dezember"}

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if self.timestamp.start_of_run.day == 1:
            self.data.assign_dict({})
            self.logger.warning("The data is thrown away. It is the first of the month")
        return self

    def task(self):
        lemma_list = self._run_searcher()
        self._build_database(lemma_list)
        if self.debug:
            dump = Page(self.wiki, f"Benutzer:THEbotIT/{self.bot_name}")
        else:
            dump = Page(self.wiki, "Liste der Autoren")
        old_text = dump.text
        new_text = self._convert_to_table()
        if new_text[150:] != old_text[150:]:  # compare all but the date
            dump.text = new_text
            dump.save("Die Liste wurde auf den aktuellen Stand gebracht.", botflag=True)
        else:
            self.logger.info("Heute gab es keine Änderungen, "
                             "daher wird die Seite nicht überschrieben.")
        return True

    def _run_searcher(self):
        # was the last run successful
        if self.debug:  # if False
            yesterday = datetime.now() - timedelta(days=5)
            self.searcher.last_change_after(
                datetime(year=int(yesterday.strftime("%Y")),
                         month=int(yesterday.strftime("%m")),
                         day=int(yesterday.strftime("%d"))))
        elif self.last_run_successful and self.data:
            start_of_search = self.create_timestamp_for_search()
            self.searcher.last_change_after(start_of_search)
            self.logger.info(f"The date {start_of_search.strftime('%d.%m.%Y')} "
                             f"is set to the argument \"after\".")
        else:
            self.logger.warning("There was no timestamp found for the last run, "
                                "so the argument \"after\" is not set.")
        self.searcher.add_namespace(0)  # search in main namespace
        self.searcher.add_positive_category("Autoren")
        self.searcher.add_yes_template("Personendaten")
        self.searcher.get_wikidata_items()
        self.logger.debug(self.searcher)
        entries_to_search = self.searcher.run()
        return entries_to_search

    _space_regex = re.compile(r"\s+")

    def _strip_spaces(self, raw_string: str):
        # collapse every whitespace run to a single space
        # (the arguments of subn were swapped in the original)
        return self._space_regex.sub(" ", raw_string.strip())

    def _build_database(self, lemma_list):  # pylint: disable=too-many-statements
        for idx, author in enumerate(lemma_list):
            self.logger.debug(f"{idx + 1}/{len(lemma_list)} {author['title']}")
            # delete preexisting data of this author
            try:
                del self.data[str(author["id"])]
            except KeyError:
                if self.last_run_successful:
                    self.logger.info(f"Can't delete old entry of [[{author['title']}]]")
            dict_author = {"title": author["title"]}
            # extract the Personendaten block from the wikisource page
            page = Page(self.wiki, author["title"])
            try:
                try:
                    personendaten = re.search(r"\{\{Personendaten(?:.|\n)*?\n\}\}\n",
                                              page.text).group()
                except AttributeError:
                    self.logger.error(f"No valid block \"Personendaten\" was found for "
                                      f"[[{author['title']}]].")
                    personendaten = None
                if personendaten:
                    # personendaten = re.sub('<ref.*?>.*?<\/ref>|<ref.*?\/>', '', personendaten)
                    # personendaten = re.sub('\{\{CRef|.*?(?:\{\{.*?\}\})?}}', '', personendaten)
                    template_extractor = TemplateHandler(personendaten)
                    dict_author.update({"name": self._strip_spaces(
                        template_extractor.get_parameter("NACHNAME")["value"])})
                    dict_author.update({"first_name": self._strip_spaces(
                        template_extractor.get_parameter("VORNAMEN")["value"])})
                    try:
                        dict_author.update({"birth": self._strip_spaces(
                            template_extractor.get_parameter("GEBURTSDATUM")["value"])})
                    except Exception:
                        dict_author.update({"birth": ""})
                        self.logger.warning(f"Templatehandler couldn't find a birthdate for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({"death": self._strip_spaces(
                            template_extractor.get_parameter("STERBEDATUM")["value"])})
                    except Exception:
                        dict_author.update({"death": ""})
                        self.logger.warning(f"Templatehandler couldn't find a deathdate for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({"description": template_extractor
                                            .get_parameter("KURZBESCHREIBUNG")["value"]})
                    except Exception:
                        dict_author.update({"description": ""})
                        self.logger.warning(f"Templatehandler couldn't find a description for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({"synonyms": template_extractor
                                            .get_parameter("ALTERNATIVNAMEN")["value"]})
                    except Exception:
                        dict_author.update({"synonyms": ""})
                        self.logger.warning(f"Templatehandler couldn't find synonyms for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({"sortkey": template_extractor
                                            .get_parameter("SORTIERUNG")["value"]})
                        if dict_author["sortkey"] == "":
                            raise ValueError
                    except Exception:
                        self.logger.debug(f"there is no sortkey for [[{author['title']}]].")
                        # make a dummy key
                        if not dict_author["name"]:
                            dict_author["sortkey"] = dict_author["first_name"]
                            self.logger.warning("Author has no last name.")
                        elif not dict_author["first_name"]:
                            dict_author["sortkey"] = dict_author["name"]
                            self.logger.warning("Author has no first name.")
                        else:
                            dict_author["sortkey"] = \
                                dict_author["name"] + ", " + dict_author["first_name"]
                    try:
                        dict_author.update({"wikidata": author["q"]})
                    except KeyError:
                        self.logger.warning(f"The author [[{author['title']}]] "
                                            f"has no wikidata_item")
                    self.data.update({author["id"]: dict_author})
            except Exception as exception:
                self.logger.exception("Exception not caught: ", exc_info=exception)
                self.logger.error(f"author {author['title']} has a problem")

    @staticmethod
    def _sort_author_list(list_authors):
        list_authors.sort(key=lambda x: x[0])
        for i in range(len(list_authors) - 1):
            if list_authors[i][0] == list_authors[i + 1][0]:
                # count how many consecutive entries share the same sort key
                # (the upper bound is guarded, the original could over-index)
                equal_count = 2
                while i + equal_count < len(list_authors) \
                        and list_authors[i][0] == list_authors[i + equal_count][0]:
                    equal_count += 1
                temp_list = list_authors[i:i + equal_count]
                temp_list.sort(key=lambda x: x[5])  # sort by birth date
                list_authors[i:i + equal_count] = temp_list

    def _convert_to_table(self):  # pylint: disable=too-many-locals
        # make a list of lists
        self.logger.info("Start compiling.")
        list_authors = []
        for key in self.data:
            author_dict = self.data[key]
            list_author = []
            list_author.append(author_dict["sortkey"])  # 0
            list_author.append(author_dict["title"].replace("_", " "))  # 1
            list_author.append(author_dict["name"])  # 2
            list_author.append(author_dict["first_name"])  # 3
            for event in ["birth", "death"]:
                list_author.append(self._handle_birth_and_death(event, author_dict))  # 4,6
                try:
                    list_author.append(str(DateConversion(list_author[-1])))  # 5,7
                except ValueError:
                    self.logger.error(f"Can't compile sort key for {author_dict['title']}: "
                                      f"{event}/{author_dict[event]}")
                    list_author.append("!-00-00")  # 5,7
            list_author.append(author_dict["description"])  # 8
            list_authors.append(list_author)
        # sorting the list
        self.logger.info("Start sorting.")
        self._sort_author_list(list_authors)
        self.logger.info("Start printing.")
        start_of_run = self.timestamp.start_of_run
        self.string_list.append(
            f"Diese Liste der Autoren enthält alle {len(self.data)}<ref>Stand: "
            f"{start_of_run.day}.{start_of_run.month}.{start_of_run.year}, "
            f"{self.timestamp.start_of_run.strftime('%H:%M')} (UTC)</ref> Autoren, "
            f"zu denen in Wikisource eine Autorenseite existiert.")
        self.string_list.append("Die Liste kann mit den Buttons neben den Spaltenüberschriften"
                                " nach der jeweiligen Spalte sortiert werden.")
        self.string_list.append("<!--")
        self.string_list.append("Diese Liste wurde durch ein Computerprogramm erstellt, "
                                "das die Daten verwendet, "
                                "die aus den Infoboxen auf den Autorenseiten stammen.")
        self.string_list.append("Sollten daher Fehler vorhanden sein, "
                                "sollten diese jeweils dort korrigiert werden.")
        self.string_list.append("-->")
        self.string_list.append("{|class=\"wikitable sortable\"")
        self.string_list.append("!style=\"width:20%\"| Name")
        self.string_list.append("!data-sort-type=\"text\" style=\"width:15%\"| Geb.-datum")
        self.string_list.append("!data-sort-type=\"text\" style=\"width:15%\"| Tod.-datum")
        self.string_list.append("!class=\"unsortable\" style=\"width:50%\"| Beschreibung")
        for list_author in list_authors:
            aut_sort, aut_page, aut_sur, aut_pre, birth_str, \
                birth_sort, death_str, death_sort, description = list_author
            self.string_list.append("|-")
            if aut_sur and aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_sur}, {aut_pre}]]")
            elif aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_pre}]]")
            else:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_sur}]]")
            self.string_list.append(f"|data-sort-value=\"{birth_sort}\"|{birth_str}")
            self.string_list.append(f"|data-sort-value=\"{death_sort}\"|{death_str}")
            self.string_list.append(f"|{description}")
        self.string_list.append("|}")
        self.string_list.append('')
        self.string_list.append("== Anmerkungen ==")
        self.string_list.append("<references/>")
        self.string_list.append('')
        self.string_list.append("{{SORTIERUNG:Autoren #Liste der}}")
        self.string_list.append("[[Kategorie:Listen]]")
        self.string_list.append("[[Kategorie:Autoren|!]]")
        return "\n".join(self.string_list)

    def _handle_birth_and_death(self, event, author_dict):
        if author_dict[event] == '' or self.match_property.search(author_dict[event]):
            self.logger.debug(f"No valid entry in {event} for "
                              f"[[{author_dict['title']}]] ... Fallback to wikidata")
            try:
                item = ItemPage(self.repo, author_dict["wikidata"])
                if event == "birth":
                    property_label = "P569"
                else:
                    property_label = "P570"
                claim = item.text["claims"][property_label][0]
                date_from_data = claim.getTarget()
                if date_from_data.precision < 7:
                    self.logger.error(f"Precision is too low for [[{author_dict['title']}]]")
                elif date_from_data.precision < 8:
                    date_from_data = int(ceil(float(date_from_data.year) / 100.0) * 100)
                    if date_from_data < 1000:
                        date_from_data = str(date_from_data)[0:1] + ". Jh."
                    else:
                        date_from_data = str(date_from_data)[0:2] + ". Jh."
                elif date_from_data.precision < 10:
                    date_from_data = str(date_from_data.year)
                elif date_from_data.precision < 11:
                    date_from_data = self.number_to_month[date_from_data.month] + " " + \
                                     str(date_from_data.year)
                else:
                    date_from_data = f"{date_from_data.day}. " \
                                     f"{self.number_to_month[date_from_data.month]} " \
                                     f"{date_from_data.year}"
                if re.search("-", date_from_data):
                    date_from_data = date_from_data.replace("-", "") + " v. Chr."
                self.logger.debug(f"Found {date_from_data} @ wikidata for {event}")
                return date_from_data  # 4,6
            except Exception:
                self.logger.debug("Wasn't able to get any data from wikidata")
                return ''  # 4,6
        else:
            return author_dict[event]  # 4,6
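A standalone sketch of the Wikidata precision mapping that _handle_birth_and_death implements above. FakeWbTime is a hypothetical stand-in for pywikibot's WbTime, which carries the same year/month/day/precision fields; the thresholds mirror the method's elif chain.

from collections import namedtuple
from math import ceil

# Hypothetical stand-in for pywikibot.WbTime, for illustration only.
FakeWbTime = namedtuple("FakeWbTime", "year month day precision")

NUMBER_TO_MONTH = {1: "Januar", 2: "Februar", 3: "März", 4: "April",
                   5: "Mai", 6: "Juni", 7: "Juli", 8: "August",
                   9: "September", 10: "Oktober", 11: "November", 12: "Dezember"}

def format_wb_date(date):
    # mirrors the precision thresholds used in _handle_birth_and_death
    if date.precision < 8:    # century precision
        century = int(ceil(float(date.year) / 100.0) * 100)
        return str(century)[:-2] + ". Jh."
    if date.precision < 10:   # year precision
        return str(date.year)
    if date.precision < 11:   # month precision
        return f"{NUMBER_TO_MONTH[date.month]} {date.year}"
    return f"{date.day}. {NUMBER_TO_MONTH[date.month]} {date.year}"

assert format_wb_date(FakeWbTime(1850, 0, 0, 7)) == "19. Jh."
assert format_wb_date(FakeWbTime(1850, 0, 0, 9)) == "1850"
assert format_wb_date(FakeWbTime(1850, 3, 0, 10)) == "März 1850"
assert format_wb_date(FakeWbTime(1850, 3, 14, 11)) == "14. März 1850"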
class GlCreateMagazine(CanonicalBot):
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher_pages = PetScan()
        self.searcher_indexes = PetScan()
        self.regex_page = re.compile(r"Die_Gartenlaube_\((\d{4})\)_([^\.]*?)\.(?:jpg|JPG)")
        self.regex_index = re.compile(r"Die_Gartenlaube_\((\d{4})\)")
        self.regex_magazine_in_index = \
            re.compile(r"((?:Heft|Halbheft) (?:\{\{0\}\})?\d{1,2}:.*?(?:\n\n|\Z))", re.DOTALL)
        self.regex_page_in_magazine = re.compile(r"_([_\w]{1,9}).(?:jpg|JPG)")
        self.regex_number_in_index = re.compile(r"(?:Heft|Halbheft) (?:\{\{0\}\})?(\d{1,2}):?")
        self.new_data_model = datetime(year=2018, month=7, day=1, hour=14)
        self.lemmas = None

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if not self.data:
            self.data.assign_dict({"pages": {}, "indexes": {}})
        return self

    def task(self):
        self.lemmas = self.search_pages()
        temp_data_pages = {}
        self.process_indexes()
        self.process_pages(temp_data_pages)
        temp_data_magazines = self.process_actual_pages(temp_data_pages)
        self.make_magazines(temp_data_magazines)
        return True

    def process_pages(self, temp_data):
        for idx, lemma in enumerate(self.lemmas):
            try:
                hit = self.regex_page.search(lemma["title"])
                year = hit.group(1)
                page = hit.group(2)
                if year not in self.data["pages"].keys():
                    self.data["pages"][year] = {}
                proofread_lemma = ProofreadPage(self.wiki, f"Seite:{lemma['title']}")
                if self.debug:
                    self.logger.debug(f"{idx + 1}/{len(self.lemmas)} Page {page}({year}) "
                                      f"has quality level {proofread_lemma.quality_level} "
                                      f"_ Seite:{lemma['title']}")
                ref = search_for_refs(proofread_lemma.text)
                page_dict = {"q": int(proofread_lemma.quality_level)}
                if ref:
                    self.logger.debug(f"There are refs ({ref}) @ {year}, {page}")
                    page_dict.update({"r": ref})
                self.data["pages"][year][page] = page_dict
                if year not in temp_data.keys():
                    temp_data[year] = []
                temp_data[year].append(page)
            except Exception as error:  # pylint: disable=broad-except
                self.logger.error(f"wasn't able to process {lemma['title']}, error: {error}")

    def process_indexes(self):
        for index_lemma, index_page in self._get_indexes():
            self.logger.debug(f"[[Index:{index_lemma}]]")
            magazines = self.regex_magazine_in_index.findall(index_page.text)
            hit_year = self.regex_index.search(index_lemma)
            year = hit_year.group(1)
            if year not in self.data["indexes"].keys():
                self.data["indexes"][year] = {}
            for magazine in magazines:
                pages = self.regex_page_in_magazine.findall(magazine)
                hit_number = self.regex_number_in_index.findall(magazine)
                number = int(hit_number[0])
                self.data["indexes"][year][number] = pages

    def process_actual_pages(self, dictionary_of_new_pages):
        tempdata_magazines = {}
        for year in dictionary_of_new_pages:
            set_of_pages = set(dictionary_of_new_pages[year])
            tempdata_magazines[year] = set()
            try:
                dictionary_of_magazines = self.data["indexes"][year]
            except KeyError as error:
                raise BotException(f"The list of indexes is incorrect, "
                                   f"{year} is missing.") from error
            for magazine in dictionary_of_magazines:
                set_of_potential_pages = set(dictionary_of_magazines[magazine])
                if set_of_potential_pages.intersection(set_of_pages):
                    tempdata_magazines[year].add(magazine)
        return tempdata_magazines

    def make_magazines(self, dictionary_of_magazines_by_year):
        for idx_year, year in enumerate(dictionary_of_magazines_by_year):
            magazines = dictionary_of_magazines_by_year[year]
            self.logger.debug(f"make_mag_year {idx_year + 1}/"
                              f"{len(dictionary_of_magazines_by_year)}")
            for idx_mag, magazine in enumerate(magazines):
                self.logger.debug(f"make_mag_mag {idx_mag + 1}/{len(magazines)} "
                                  f"... issue:{year}/{magazine}")
                if year == "1986" and magazine == "31":
                    self.logger.warning("There is magazine 1986, 31, "
                                        "this is special, no creating here")
                    continue
                if self.debug:
                    lemma = Page(self.wiki, "Benutzer:THEbotIT/Test")
                else:
                    lemma = Page(self.wiki,
                                 f"Die Gartenlaube ({year})/Heft {int(magazine):d}")
                new_text = self.make_magazine(year, magazine)
                if new_text:
                    if hash(new_text.strip()) != hash(lemma.text.strip()):
                        self.logger.debug(
                            f"Print [[Die Gartenlaube ({year})/Heft {magazine}]].")
                        if lemma.text != '':
                            lemma.text = new_text
                            lemma.save("Automatische Aktualisierung des Heftes",
                                       botflag=True)
                        else:
                            lemma.text = new_text
                            lemma.save("automatische Hefterstellung", botflag=True)
                    else:
                        self.logger.debug(f"Keine Änderung im Text ({year}/{magazine}).")

    def make_magazine(self, year, magazine):
        last_magazine = True
        try:
            for key in self.data["indexes"][year].keys():
                if int(key) > int(magazine):
                    last_magazine = False
                    break
        except KeyError as error:
            raise BotException(f"The list of indexes is incorrect, "
                               f"{year} is missing.") from error
        try:
            list_of_pages = self.data["indexes"][year][magazine]
        except KeyError as error:
            raise BotException(f"The list of indexes is incorrect, "
                               f"year:{year} or mag:{magazine} is missing.") from error
        quality = 4
        for page in list_of_pages:
            try:
                if self.data["pages"][year][page]["q"] == 0:
                    page_quality = 4
                else:
                    page_quality = self.data["pages"][year][page]["q"]
                if page_quality < quality:
                    quality = page_quality
                if quality < 3:
                    self.logger.debug(f"The quality of {year}/{magazine} is too poor.")
                    return None
            except KeyError:
                self.logger.warning(f"The list of pages is incorrect, "
                                    f"year:{year} or page:{page} is missing.")
                return None
        return self.make_magazine_text(year, magazine, quality, list_of_pages, last_magazine)

    @staticmethod
    def convert_page_no(page: str):
        # drop leading zeros, then replace underscores with spaces
        return page.lstrip("0").replace("_", " ")

    def make_magazine_text(self, year, magazine, quality, list_of_pages, last):
        # pylint: disable=too-many-arguments,too-many-branches
        magazine = int(magazine)
        year = int(year)
        string_list = []
        string_list.append("<!--Diese Seite wurde automatisch durch einen Bot erstellt. "
                           "Wenn du einen Fehler findest oder eine Änderung wünscht, "
                           "benachrichtige bitte den Betreiber, THE IT, des Bots.-->\n"
                           "{{Textdaten\n")
        if magazine > 1:
            string_list.append(f"|VORIGER=Die Gartenlaube ({year:d})/Heft {magazine - 1:d}\n")
        else:
            string_list.append("|VORIGER=\n")
        if last:
            string_list.append("|NÄCHSTER=\n")
        else:
            string_list.append(f"|NÄCHSTER=Die Gartenlaube ({year:d})/Heft {magazine + 1:d}\n")
        string_list.append(f"|AUTOR=Verschiedene\n"
                           f"|TITEL=[[Die Gartenlaube ({year})|Die Gartenlaube]]\n"
                           f"|SUBTITEL=''Illustrirtes Familienblatt''\n"
                           f"|HERKUNFT=off\n")
        if year < 1863:
            string_list.append("|HERAUSGEBER=[[Ferdinand Stolle]]\n")
        elif (year < 1878) or (year == 1878 and magazine < 14):
            string_list.append("|HERAUSGEBER=[[Ernst Keil]]\n")
        elif year < 1885:
            string_list.append("|HERAUSGEBER=Ernst Ziel\n")
        else:
            string_list.append("|HERAUSGEBER=Adolf Kröner\n")
        string_list.append(f"|ENTSTEHUNGSJAHR={year:d}\n"
                           f"|ERSCHEINUNGSJAHR={year:d}\n"
                           f"|ERSCHEINUNGSORT=Leipzig\n"
                           f"|VERLAG=Ernst Keil\n"
                           f"|WIKIPEDIA=Die Gartenlaube\n")
        if year == 1873:
            extension = "JPG"
        else:
            extension = "jpg"
        string_list.append(f"|BILD=Die Gartenlaube ({year:d}) {list_of_pages[0]}.{extension}\n")
        string_list.append(f"|QUELLE=[[commons:category:Gartenlaube ({year})|commons]]\n")
        if quality == 4:
            string_list.append("|BEARBEITUNGSSTAND=fertig\n")
        else:
            string_list.append("|BEARBEITUNGSSTAND=korrigiert\n")
        string_list.append(f"|INDEXSEITE=Die Gartenlaube ({year})\n}}}}\n"
                           f"{{{{BlockSatzStart}}}}\n__TOC__\n")
        ref = []
        for page in list_of_pages:
            page_format = self.convert_page_no(page)
            string_list.append(
                f"{{{{SeitePR|{page_format}|Die Gartenlaube ({year}) {page}.{extension}}}}}\n")
            try:
                page_dict = self.data["pages"][str(year)][page]
                if "r" in page_dict.keys():
                    if "ref" in page_dict["r"]:
                        if "ref" not in ref:
                            ref.append("ref")
                    for ref_type in page_dict["r"]:
                        if (ref_type != "ref") and (ref_type not in ref):
                            ref.append(ref_type)
            except KeyError:
                self.logger.error(f"The list of pages is incorrect, "
                                  f"year:{year} or page:{page} is missing.")
                return None
        if "ref" in ref:
            string_list.append("{{references|x}}\n")
        for ref_type in ref:
            if ref_type != "ref":
                string_list.append(f"{{{{references|TIT|{ref_type}}}}}\n")
        string_list.append(f"{{{{BlockSatzEnd}}}}\n\n[[Kategorie:Deutschland]]\n"
                           f"[[Kategorie:Neuhochdeutsch]]\n[[Kategorie:Illustrierte Werke]]\n"
                           f"[[Kategorie:Die Gartenlaube ({year:d}) Hefte| {magazine:02d}]]\n")
        string_list.append(f"[[Kategorie:{str(year)[0:3]}0er Jahre]]\n\n")
        return ''.join(string_list)

    def _get_indexes(self) -> Iterator[Tuple[str, IndexPage]]:
        # yields (title, IndexPage) pairs; assumes "from typing import Tuple"
        # alongside Iterator (the original annotation omitted the tuple)
        self.searcher_indexes.add_positive_category("Die Gartenlaube")
        self.searcher_indexes.add_positive_category("Index")
        self.searcher_indexes.set_regex_filter(r".*Die Gartenlaube \(\d{4}\)")
        self.searcher_indexes.set_timeout(60)
        for index in self.searcher_indexes.run():
            yield index["title"], IndexPage(self.wiki, f"Index:{index['title']}")

    def search_pages(self):
        self.searcher_pages.add_positive_category("Die Gartenlaube")
        self.searcher_pages.add_namespace(102)  # namespace Seite
        self.searcher_pages.set_search_depth(1)
        self.searcher_pages.set_timeout(60)
        if self.last_run_successful or self.debug:
            delta = (self.timestamp.start_of_run - self.timestamp.last_run).days
            if self.debug:
                delta = 10
            start_of_search = self.create_timestamp_for_search(delta)
            self.searcher_pages.last_change_after(start_of_search)
            self.logger.info(f"The date {start_of_search.strftime('%d.%m.%Y')} "
                             f"is set to the argument \"after\".")
        return self.searcher_pages.run()
def get_list():
    searcher = PetScan()
    searcher.add_positive_category("Aachener Stadtrechnungen")
    searcher.add_namespace(0)
    return searcher.run()
def get_count():
    searcher = PetScan()
    searcher.add_positive_category("David Hilbert Gesammelte Abhandlungen Erster Band")
    searcher.add_positive_category("Unkorrigiert")
    searcher.add_namespace("Seite")
    return len(searcher.run())
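A hedged consumption sketch for the two helpers above; it assumes only the result shape the tests in this file demonstrate for PetScan.run(), a list of dicts with a "title" key.

# Illustrative only; the printed wording is not from the original sources.
remaining = get_count()
print(f"{remaining} pages of the Hilbert volume are still uncorrected")
for lemma in get_list():
    print(lemma["title"])  # one Aachener Stadtrechnungen lemma per line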
class TestCatScan(TestCase):
    def setUp(self):
        self.petscan = PetScan()

    def test_add_options(self):
        self.petscan.add_options({"max_age": "45"})
        self.petscan.add_options({"smaller": "300"})
        self.assertDictEqual({"smaller": "300", "max_age": "45"}, self.petscan.options)

    def test_add_category(self):
        self.petscan.add_positive_category("pos1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_positive_category("pos3", 2)
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg2")
        self.petscan.add_negative_category("neg3", 3)
        self.assertEqual(["pos1", "pos2", "pos3|2"], self.petscan.categories["positive"])
        self.assertEqual(["neg1", "neg2", "neg3|3"], self.petscan.categories["negative"])

    def test_add_namespace(self):
        self.petscan.add_namespace(0)
        self.petscan.add_namespace([2, 10])
        self.assertDictEqual({"ns[0]": "1", "ns[2]": "1", "ns[10]": "1"},
                             self.petscan.options)

    def test_activate_redirects(self):
        self.petscan.activate_redirects()
        self.assertDictEqual({"show_redirects": "yes"}, self.petscan.options)

    def test_deactivate_redirects(self):
        self.petscan.deactivate_redirects()
        self.assertDictEqual({"show_redirects": "no"}, self.petscan.options)

    def test_last_change_before(self):
        self.petscan.last_change_before(
            datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"before": "12340101020242"}, self.petscan.options)

    def test_last_change_after(self):
        self.petscan.last_change_after(
            datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"after": "12340101020242"}, self.petscan.options)

    def test_max_age(self):
        self.petscan.max_age(1234)
        self.assertDictEqual({"max_age": "1234"}, self.petscan.options)

    def test_only_new(self):
        self.petscan.only_new()
        self.assertDictEqual({"only_new": "1"}, self.petscan.options)

    def test_smaller_then(self):
        self.petscan.smaller_then(42)
        self.assertDictEqual({"smaller": "42"}, self.petscan.options)

    def test_larger_then(self):
        self.petscan.larger_then(42)
        self.assertDictEqual({"larger": "42"}, self.petscan.options)

    def test_get_wikidata(self):
        self.petscan.get_wikidata_items()
        self.assertDictEqual({"wikidata_item": "any"}, self.petscan.options)

    def test_get_pages_with_wikidata(self):
        self.petscan.get_pages_with_wd_items()
        self.assertDictEqual({"wikidata_item": "with"}, self.petscan.options)

    def test_get_pages_without_wikidata(self):
        self.petscan.get_pages_without_wd_items()
        self.assertDictEqual({"wikidata_item": "without"}, self.petscan.options)

    def test_set_or(self):
        self.petscan.set_logic_union()
        self.assertDictEqual({"combination": "union"}, self.petscan.options)

    def test_set_regex(self):
        self.petscan.set_regex_filter("abc")
        self.assertDictEqual({"regexp_filter": "abc"}, self.petscan.options)

    def test_set_last_edits(self):
        self.petscan.set_last_edit_bots(True)
        self.petscan.set_last_edit_anons(False)
        self.petscan.set_last_edit_flagged()
        self.assertDictEqual({"edits[bots]": "yes",
                              "edits[anons]": "no",
                              "edits[flagged]": "yes"},
                             self.petscan.options)

    def test_construct_cat_string(self):
        self.petscan.add_positive_category("pos 1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg 2")
        self.petscan.add_negative_category("neg3")
        self.assertEqual("pos+1\r\npos2",
                         self.petscan._construct_list_argument(
                             self.petscan.categories["positive"]))
        self.assertEqual("neg1\r\nneg+2\r\nneg3",
                         self.petscan._construct_list_argument(
                             self.petscan.categories["negative"]))

    def test_construct_templates(self):
        self.petscan.add_yes_template("yes1")
        self.petscan.add_yes_template("yes2")
        self.petscan.add_any_template("any1")
        self.petscan.add_any_template("any2")
        self.petscan.add_any_template("any3")
        self.petscan.add_no_template("no1")
        self.petscan.add_no_template("no2")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=de"
            "&project=wikisource"
            "&templates_yes=yes1%0D%0Ayes2"
            "&templates_any=any1%0D%0Aany2%0D%0Aany3"
            "&templates_no=no1%0D%0Ano2")

    def test_construct_outlinks(self):
        self.petscan.add_yes_outlink("yes1")
        self.petscan.add_yes_outlink("yes2")
        self.petscan.add_any_outlink("any1")
        self.petscan.add_any_outlink("any2")
        self.petscan.add_any_outlink("any3")
        self.petscan.add_no_outlink("no1")
        self.petscan.add_no_outlink("no2")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=de"
            "&project=wikisource"
            "&outlinks_yes=yes1%0D%0Ayes2"
            "&outlinks_any=any1%0D%0Aany2%0D%0Aany3"
            "&outlinks_no=no1%0D%0Ano2")

    def test_construct_links_to(self):
        self.petscan.add_yes_links_to("yes1")
        self.petscan.add_yes_links_to("yes2")
        self.petscan.add_any_links_to("any1")
        self.petscan.add_any_links_to("any2")
        self.petscan.add_any_links_to("any3")
        self.petscan.add_no_links_to("no1")
        self.petscan.add_no_links_to("no2")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=de"
            "&project=wikisource"
            "&links_to_all=yes1%0D%0Ayes2"
            "&links_to_any=any1%0D%0Aany2%0D%0Aany3"
            "&links_to_no=no1%0D%0Ano2")

    def test_construct_options(self):
        self.petscan.options = {"max_age": "1234",
                                "get_q": "1",
                                "show_redirects": "yes"}
        self.assertEqual("&max_age=1234" in str(self.petscan), True)
        self.assertEqual("&get_q=1" in str(self.petscan), True)
        self.assertEqual("&show_redirects=yes" in str(self.petscan), True)

    def test_construct_string(self):
        self.petscan.set_language("en")
        self.petscan.set_project("wikipedia")
        # only a positive category
        self.petscan.add_positive_category("test")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=en&project=wikipedia&categories=test")
        # only a negative category
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_negative_category("test")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=en&project=wikipedia&negcats=test")
        # only an option
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_options({"max_age": "10"})
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10")

    def test_do_positive(self):
        with requests_mock.mock() as mock:
            mock.get(
                "https://petscan.wmflabs.org/"
                "?language=de&project=wikisource&format=json&doit=1",
                text='{"n": "result","a": {"querytime_sec": 1.572163,'
                     '"query": "https://petscan.wmflabs.org/?language=de'
                     '&project=wikisource&categories=Autoren&get_q=1'
                     '&show_redirects=no&ns[0]=1&max_age=48'
                     '&format=json&doit=1"},'
                     '"*": [{"n": "combination",'
                     '"a": {"type": "subset",'
                     '"*": [{"id": 3279,'
                     '"len": 10197,'
                     '"n": "page",'
                     '"namespace": 0,'
                     '"nstext": "",'
                     '"q": "Q60644",'
                     '"title": "Friedrich_Rückert",'
                     '"touched": "20161024211701"}]}}]}')
            self.assertEqual(self.petscan.run(),
                             [{"id": 3279,
                               "len": 10197,
                               "n": "page",
                               "namespace": 0,
                               "nstext": "",
                               "q": "Q60644",
                               "title": "Friedrich_Rückert",
                               "touched": "20161024211701"}])

    def test_do_negative(self):
        with requests_mock.mock() as mock:
            mock.get(
                "https://petscan.wmflabs.org/"
                "?language=de&project=wikisource&format=json&doit=1",
                status_code=404)
            with self.assertRaises(PetScanException):
                self.petscan.run()
class AuthorList(CanonicalBot):
    # pylint: disable=bare-except,too-many-branches,broad-except
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher = PetScan()
        self.repo = self.wiki.data_repository()  # this is a DataSite object
        self.string_list = []
        self.match_property = re.compile(r"\{\{#property:P(\d{1,4})\}\}")
        self.number_to_month = {1: "Januar", 2: "Februar", 3: "März",
                                4: "April", 5: "Mai", 6: "Juni",
                                7: "Juli", 8: "August", 9: "September",
                                10: "Oktober", 11: "November", 12: "Dezember"}

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if self.timestamp.start_of_run.day == 1:
            self.data.assign_dict(dict())
            self.logger.warning("The data is thrown away. It is the first of the month")
        return self

    def task(self):
        lemma_list = self._run_searcher()
        self._build_database(lemma_list)
        if self.debug:
            dump = Page(self.wiki, f"Benutzer:THEbotIT/{self.bot_name}")
        else:
            dump = Page(self.wiki, "Liste der Autoren")
        old_text = dump.text
        new_text = self._convert_to_table()
        if new_text[150:] != old_text[150:]:  # compare all but the date
            dump.text = new_text
            dump.save("Die Liste wurde auf den aktuellen Stand gebracht.", botflag=True)
        else:
            self.logger.info("Heute gab es keine Änderungen, "
                             "daher wird die Seite nicht überschrieben.")
        return True

    def _run_searcher(self):
        # was the last run successful
        if self.debug:  # if False
            yesterday = datetime.now() - timedelta(days=5)
            self.searcher.last_change_after(datetime(year=int(yesterday.strftime("%Y")),
                                                     month=int(yesterday.strftime("%m")),
                                                     day=int(yesterday.strftime("%d"))))
        elif self.last_run_successful and self.data:
            start_of_search = self.create_timestamp_for_search()
            self.searcher.last_change_after(start_of_search)
            self.logger.info(f"The date {start_of_search.strftime('%d.%m.%Y')} "
                             f"is set to the argument \"after\".")
        else:
            self.logger.warning("There was no timestamp found for the last run, "
                                "so the argument \"after\" is not set.")
        self.searcher.add_namespace(0)  # search in main namespace
        self.searcher.add_positive_category("Autoren")
        self.searcher.add_yes_template("Personendaten")
        self.searcher.get_wikidata_items()
        self.logger.debug(self.searcher)
        entries_to_search = self.searcher.run()
        return entries_to_search

    _space_regex = re.compile(r"\s+")

    def _strip_spaces(self, raw_string: str):
        # collapse every whitespace run to a single space
        # (the arguments of subn were swapped in the original)
        return self._space_regex.sub(" ", raw_string.strip())

    def _build_database(self, lemma_list):  # pylint: disable=too-many-statements
        for idx, author in enumerate(lemma_list):
            self.logger.debug(f"{idx + 1}/{len(lemma_list)} {author['title']}")
            # delete preexisting data of this author
            try:
                del self.data[str(author["id"])]
            except KeyError:
                if self.last_run_successful:
                    self.logger.info(f"Can't delete old entry of [[{author['title']}]]")
            dict_author = {"title": author["title"]}
            # extract the Personendaten block from the wikisource page
            page = Page(self.wiki, author["title"])
            try:
                try:
                    personendaten = re.search(r"\{\{Personendaten(?:.|\n)*?\n\}\}\n",
                                              page.text).group()
                except AttributeError:
                    self.logger.error(f"No valid block \"Personendaten\" was found for "
                                      f"[[{author['title']}]].")
                    personendaten = None
                if personendaten:
                    # personendaten = re.sub('<ref.*?>.*?<\/ref>|<ref.*?\/>', '', personendaten)
                    # personendaten = re.sub('\{\{CRef|.*?(?:\{\{.*?\}\})?}}', '', personendaten)
                    template_extractor = TemplateHandler(personendaten)
                    dict_author.update({"name": self._strip_spaces(
                        template_extractor.get_parameter("NACHNAME")["value"])})
                    dict_author.update({"first_name": self._strip_spaces(
                        template_extractor.get_parameter("VORNAMEN")["value"])})
                    try:
                        dict_author.update({"birth": self._strip_spaces(
                            template_extractor.get_parameter("GEBURTSDATUM")["value"])})
                    except Exception:
                        dict_author.update({"birth": ""})
                        self.logger.warning(f"Templatehandler couldn't find a birthdate for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({"death": self._strip_spaces(
                            template_extractor.get_parameter("STERBEDATUM")["value"])})
                    except Exception:
                        dict_author.update({"death": ""})
                        self.logger.warning(f"Templatehandler couldn't find a deathdate for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update(
                            {"description":
                             template_extractor.get_parameter("KURZBESCHREIBUNG")["value"]})
                    except Exception:
                        dict_author.update({"description": ""})
                        self.logger.warning(
                            f"Templatehandler couldn't find a description for: "
                            f"[[{author['title']}]]")
                    try:
                        dict_author.update(
                            {"synonyms":
                             template_extractor.get_parameter("ALTERNATIVNAMEN")["value"]})
                    except Exception:
                        dict_author.update({"synonyms": ""})
                        self.logger.warning(f"Templatehandler couldn't find synonyms for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update(
                            {"sortkey":
                             template_extractor.get_parameter("SORTIERUNG")["value"]})
                        if dict_author["sortkey"] == "":
                            raise ValueError
                    except Exception:
                        self.logger.debug(f"there is no sortkey for [[{author['title']}]].")
                        # make a dummy key
                        if not dict_author["name"]:
                            dict_author["sortkey"] = dict_author["first_name"]
                            self.logger.warning("Author has no last name.")
                        elif not dict_author["first_name"]:
                            dict_author["sortkey"] = dict_author["name"]
                            self.logger.warning("Author has no first name.")
                        else:
                            dict_author["sortkey"] = \
                                dict_author["name"] + ", " + dict_author["first_name"]
                    try:
                        dict_author.update({"wikidata": author["q"]})
                    except KeyError:
                        self.logger.warning(
                            f"The author [[{author['title']}]] has no wikidata_item")
                    self.data.update({author["id"]: dict_author})
            except Exception as exception:
                self.logger.exception("Exception not caught: ", exc_info=exception)
                self.logger.error(f"author {author['title']} has a problem")

    @staticmethod
    def _sort_author_list(list_authors):
        list_authors.sort(key=lambda x: x[0])
        for i in range(len(list_authors) - 1):
            if list_authors[i][0] == list_authors[i + 1][0]:
                # count how many consecutive entries share the same sort key
                # (the upper bound is guarded, the original could over-index)
                equal_count = 2
                while i + equal_count < len(list_authors) \
                        and list_authors[i][0] == list_authors[i + equal_count][0]:
                    equal_count += 1
                temp_list = list_authors[i:i + equal_count]
                temp_list.sort(key=lambda x: x[5])  # sort by birth date
                list_authors[i:i + equal_count] = temp_list

    def _convert_to_table(self):  # pylint: disable=too-many-locals
        # make a list of lists
        self.logger.info("Start compiling.")
        list_authors = []
        for key in self.data:
            author_dict = self.data[key]
            list_author = list()
            list_author.append(author_dict["sortkey"])  # 0
            list_author.append(author_dict["title"].replace("_", " "))  # 1
            list_author.append(author_dict["name"])  # 2
            list_author.append(author_dict["first_name"])  # 3
            for event in ["birth", "death"]:
                list_author.append(self._handle_birth_and_death(event, author_dict))  # 4,6
                try:
                    list_author.append(str(DateConversion(list_author[-1])))  # 5,7
                except ValueError:
                    self.logger.error(f"Can't compile sort key for {author_dict['title']}: "
                                      f"{event}/{author_dict[event]}")
                    list_author.append("!-00-00")  # 5,7
            list_author.append(author_dict["description"])  # 8
            list_authors.append(list_author)
        # sorting the list
        self.logger.info("Start sorting.")
        self._sort_author_list(list_authors)
        self.logger.info("Start printing.")
        start_of_run = self.timestamp.start_of_run
        self.string_list.append(
            f"Diese Liste der Autoren enthält alle {len(self.data)}<ref>Stand: "
            f"{start_of_run.day}.{start_of_run.month}.{start_of_run.year}, "
            f"{self.timestamp.start_of_run.strftime('%H:%M')} (UTC)</ref> Autoren, "
            f"zu denen in Wikisource eine Autorenseite existiert.")
        self.string_list.append("Die Liste kann mit den Buttons neben den Spaltenüberschriften"
                                " nach der jeweiligen Spalte sortiert werden.")
        self.string_list.append("<!--")
        self.string_list.append("Diese Liste wurde durch ein Computerprogramm erstellt, "
                                "das die Daten verwendet, "
                                "die aus den Infoboxen auf den Autorenseiten stammen.")
        self.string_list.append("Sollten daher Fehler vorhanden sein, "
                                "sollten diese jeweils dort korrigiert werden.")
        self.string_list.append("-->")
        self.string_list.append("{|class=\"wikitable sortable\"")
        self.string_list.append("!style=\"width:20%\"| Name")
        self.string_list.append("!data-sort-type=\"text\" style=\"width:15%\"| Geb.-datum")
        self.string_list.append("!data-sort-type=\"text\" style=\"width:15%\"| Tod.-datum")
        self.string_list.append("!class=\"unsortable\" style=\"width:50%\"| Beschreibung")
        for list_author in list_authors:
            aut_sort, aut_page, aut_sur, aut_pre, birth_str, \
                birth_sort, death_str, death_sort, description = \
                list_author
            self.string_list.append("|-")
            if aut_sur and aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_sur}, {aut_pre}]]")
            elif aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_pre}]]")
            else:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_sur}]]")
            self.string_list.append(f"|data-sort-value=\"{birth_sort}\"|{birth_str}")
            self.string_list.append(f"|data-sort-value=\"{death_sort}\"|{death_str}")
            self.string_list.append(f"|{description}")
        self.string_list.append("|}")
        self.string_list.append('')
        self.string_list.append("== Anmerkungen ==")
        self.string_list.append("<references/>")
        self.string_list.append('')
        self.string_list.append("{{SORTIERUNG:Autoren #Liste der}}")
        self.string_list.append("[[Kategorie:Listen]]")
        self.string_list.append("[[Kategorie:Autoren|!]]")
        return "\n".join(self.string_list)

    def _handle_birth_and_death(self, event, author_dict):
        if author_dict[event] == '' or self.match_property.search(author_dict[event]):
            self.logger.debug(f"No valid entry in {event} for "
                              f"[[{author_dict['title']}]] ... Fallback to wikidata")
            try:
                item = ItemPage(self.repo, author_dict["wikidata"])
                if event == "birth":
                    property_label = "P569"
                else:
                    property_label = "P570"
                claim = item.text["claims"][property_label][0]
                date_from_data = claim.getTarget()
                if date_from_data.precision < 7:
                    self.logger.error(f"Precision is too low for [[{author_dict['title']}]]")
                elif date_from_data.precision < 8:
                    date_from_data = int(ceil(float(date_from_data.year) / 100.0) * 100)
                    if date_from_data < 1000:
                        date_from_data = str(date_from_data)[0:1] + ". Jh."
                    else:
                        date_from_data = str(date_from_data)[0:2] + ". Jh."
                elif date_from_data.precision < 10:
                    date_from_data = str(date_from_data.year)
                elif date_from_data.precision < 11:
                    date_from_data = self.number_to_month[date_from_data.month] + " " + \
                                     str(date_from_data.year)
                else:
                    date_from_data = f"{date_from_data.day}. " \
                                     f"{self.number_to_month[date_from_data.month]} " \
                                     f"{date_from_data.year}"
                if re.search("-", date_from_data):
                    date_from_data = date_from_data.replace("-", "") + " v. Chr."
                self.logger.debug(f"Found {date_from_data} @ wikidata for {event}")
                return date_from_data  # 4,6
            except Exception:
                self.logger.debug("Wasn't able to get any data from wikidata")
                return ''  # 4,6
        else:
            return author_dict[event]  # 4,6
# Imports reconstructed for this excerpt. The pywikibot paths are standard;
# the tools.* paths follow the other scripts in this repo but are assumptions,
# and search_for_refs is a project helper whose home module is not shown here.
import re
from datetime import datetime
from typing import Iterator, Tuple

from pywikibot import Page
from pywikibot.proofreadpage import IndexPage, ProofreadPage

from tools.bots import BotException, CanonicalBot  # assumed path
from tools.petscan import PetScan


class GlCreateMagazine(CanonicalBot):
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher_pages = PetScan()
        self.searcher_indexes = PetScan()
        # scan file names like "Die_Gartenlaube_(1855)_123.jpg"
        self.regex_page = re.compile(r"Die_Gartenlaube_\((\d{4})\)_([^\.]*?)\.(?:jpg|JPG)")
        self.regex_index = re.compile(r"Die_Gartenlaube_\((\d{4})\)")
        # one issue ("Heft"/"Halbheft") block inside an index page
        self.regex_magazine_in_index = \
            re.compile(r"((?:Heft|Halbheft) (?:\{\{0\}\})?\d{1,2}:.*?(?:\n\n|\Z))", re.DOTALL)
        self.regex_page_in_magazine = re.compile(r"_([_\w]{1,9}).(?:jpg|JPG)")
        self.regex_number_in_index = re.compile(r"(?:Heft|Halbheft) (?:\{\{0\}\})?(\d{1,2}):?")
        self.new_data_model = datetime(year=2018, month=7, day=1, hour=14)
        self.lemmas = None

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if not self.data:
            self.data.assign_dict({"pages": {}, "indexes": {}})
        return self

    def task(self):
        self.lemmas = self.search_pages()
        temp_data_pages = {}
        self.process_indexes()
        self.process_pages(temp_data_pages)
        temp_data_magazines = self.process_actual_pages(temp_data_pages)
        self.make_magazines(temp_data_magazines)
        return True

    def process_pages(self, temp_data):
        # store the quality level and any ref types for every changed scan page
        for idx, lemma in enumerate(self.lemmas):
            try:
                hit = self.regex_page.search(lemma["title"])
                year = hit.group(1)
                page = hit.group(2)
                if year not in self.data["pages"]:
                    self.data["pages"][year] = {}
                proofread_lemma = ProofreadPage(self.wiki, f"Seite:{lemma['title']}")
                if self.debug:
                    self.logger.debug(f"{idx + 1}/{len(self.lemmas)} Page {page}({year}) "
                                      f"has quality level {proofread_lemma.quality_level} "
                                      f"_ Seite:{lemma['title']}")
                ref = search_for_refs(proofread_lemma.text)
                page_dict = {"q": int(proofread_lemma.quality_level)}
                if ref:
                    self.logger.debug(f"There are refs ({ref}) @ {year}, {page}")
                    page_dict.update({"r": ref})
                self.data["pages"][year][page] = page_dict
                if year not in temp_data:
                    temp_data[year] = []
                temp_data[year].append(page)
            except Exception as error:  # pylint: disable=broad-except
                self.logger.error(f"wasn't able to process {lemma['title']}, error: {error}")

    def process_indexes(self):
        # map every index page to the issue numbers and scan pages it lists
        for index_lemma, index_page in self._get_indexes():
            self.logger.debug(f"[[Index:{index_lemma}]]")
            magazines = self.regex_magazine_in_index.findall(index_page.text)
            hit_year = self.regex_index.search(index_lemma)
            year = hit_year.group(1)
            if year not in self.data["indexes"]:
                self.data["indexes"][year] = {}
            for magazine in magazines:
                pages = self.regex_page_in_magazine.findall(magazine)
                hit_number = self.regex_number_in_index.findall(magazine)
                number = int(hit_number[0])
                self.data["indexes"][year][number] = pages

    def process_actual_pages(self, dictionary_of_new_pages):
        # collect the issues that contain at least one freshly changed page
        tempdata_magazines = {}
        for year in dictionary_of_new_pages:
            set_of_pages = set(dictionary_of_new_pages[year])
            tempdata_magazines[year] = set()
            try:
                dictionary_of_magazines = self.data["indexes"][year]
            except KeyError as error:
                raise BotException(f"The list of indexes is incorrect, "
                                   f"{year} is missing.") from error
            for magazine in dictionary_of_magazines:
                set_of_potential_pages = set(dictionary_of_magazines[magazine])
                if set_of_potential_pages.intersection(set_of_pages):
                    tempdata_magazines[year].add(magazine)
        return tempdata_magazines

    def make_magazines(self, dictionary_of_magazines_by_year):
        for idx_year, year in enumerate(dictionary_of_magazines_by_year):
            magazines = dictionary_of_magazines_by_year[year]
            self.logger.debug(f"make_mag_year {idx_year + 1}/"
                              f"{len(dictionary_of_magazines_by_year)}")
            for idx_mag, magazine in enumerate(magazines):
                self.logger.debug(f"make_mag_mag {idx_mag + 1}/{len(magazines)} "
                                  f"... issue:{year}/{magazine}")
                if year == "1896" and magazine == "31":
                    # issue 31 of 1896 is a special case and is not created here
                    self.logger.warning("Issue 1896/31 is a special case, "
                                        "skipping creation")
                    continue
                if self.debug:
                    lemma = Page(self.wiki, "Benutzer:THEbotIT/Test")
                else:
                    lemma = Page(self.wiki, f"Die Gartenlaube ({year})/Heft {int(magazine):d}")
                new_text = self.make_magazine(year, magazine)
                if new_text:
                    if new_text.strip() != lemma.text.strip():
                        self.logger.debug(f"Print [[Die Gartenlaube ({year})"
                                          f"/Heft {magazine}]].")
                        if lemma.text != '':
                            summary = "Automatische Aktualisierung des Heftes"
                        else:
                            summary = "automatische Hefterstellung"
                        lemma.text = new_text
                        lemma.save(summary, botflag=True)
                    else:
                        self.logger.debug(f"No change in text ({year}/{magazine}).")

    def make_magazine(self, year, magazine):
        # determine whether this is the last issue of its year
        last_magazine = True
        try:
            for key in self.data["indexes"][year]:
                if int(key) > int(magazine):
                    last_magazine = False
                    break
        except KeyError as error:
            raise BotException(f"The list of indexes is incorrect, "
                               f"{year} is missing.") from error
        try:
            list_of_pages = self.data["indexes"][year][magazine]
        except KeyError as error:
            raise BotException(f"The list of indexes is incorrect, "
                               f"year:{year} or mag:{magazine} is missing.") from error
        # the issue's quality is the worst quality of its pages;
        # quality 0 ("without text") counts as finished
        quality = 4
        for page in list_of_pages:
            try:
                if self.data["pages"][year][page]["q"] == 0:
                    page_quality = 4
                else:
                    page_quality = self.data["pages"][year][page]["q"]
                if page_quality < quality:
                    quality = page_quality
                if quality < 3:
                    self.logger.debug(f"The quality of {year}/{magazine} is too poor.")
                    return None
            except KeyError:
                self.logger.warning(f"The list of pages is incorrect, "
                                    f"year:{year} or page:{page} is missing.")
                return None
        return self.make_magazine_text(year, magazine, quality, list_of_pages, last_magazine)

    @staticmethod
    def convert_page_no(page: str):
        # strip leading zeros and turn underscores into spaces: "0123_b" -> "123 b"
        while page[0] == "0":
            page = page[1:]
        return page.replace("_", " ")

    def make_magazine_text(self, year, magazine, quality, list_of_pages, last):
        # pylint: disable=too-many-arguments,too-many-branches
        magazine = int(magazine)
        year = int(year)
        string_list = []
        string_list.append("<!--Diese Seite wurde automatisch durch einen Bot erstellt. "
                           "Wenn du einen Fehler findest oder eine Änderung wünscht, "
                           "benachrichtige bitte den Betreiber, THE IT, des Bots.-->\n"
                           "{{Textdaten\n")
        if magazine > 1:
            string_list.append(f"|VORIGER=Die Gartenlaube ({year:d})/Heft {magazine - 1:d}\n")
        else:
            string_list.append("|VORIGER=\n")
        if last:
            string_list.append("|NÄCHSTER=\n")
        else:
            string_list.append(f"|NÄCHSTER=Die Gartenlaube ({year:d})/Heft {magazine + 1:d}\n")
        string_list.append("|AUTOR=Verschiedene\n|TITEL=[[Die Gartenlaube]]\n"
                           "|SUBTITEL=''Illustrirtes Familienblatt''\n|HERKUNFT=off\n")
        # the editor changed over the years
        if year < 1863:
            string_list.append("|HERAUSGEBER=[[Ferdinand Stolle]]\n")
        elif (year < 1878) or (year == 1878 and magazine < 14):
            string_list.append("|HERAUSGEBER=[[Ernst Keil]]\n")
        elif year < 1885:
            string_list.append("|HERAUSGEBER=Ernst Ziel\n")
        else:
            string_list.append("|HERAUSGEBER=Adolf Kröner\n")
        string_list.append(f"|ENTSTEHUNGSJAHR={year:d}\n|ERSCHEINUNGSJAHR={year:d}\n"
                           f"|ERSCHEINUNGSORT=Leipzig\n|VERLAG=Ernst Keil\n"
                           f"|WIKIPEDIA=Die Gartenlaube\n")
        # the 1873 scans carry an upper-case file extension
        extension = "JPG" if year == 1873 else "jpg"
        string_list.append(f"|BILD=Die Gartenlaube ({year:d}) {list_of_pages[0]}.{extension}\n")
        string_list.append(f"|QUELLE=[[commons:category:Gartenlaube ({year})|commons]]\n")
        if quality == 4:
            string_list.append("|BEARBEITUNGSSTAND=fertig\n")
        else:
            string_list.append("|BEARBEITUNGSSTAND=korrigiert\n")
        string_list.append(f"|INDEXSEITE=Die Gartenlaube ({year})\n}}}}\n\n"
                           f"{{{{BlockSatzStart}}}}\n__TOC__\n")
        ref = []
        for page in list_of_pages:
            page_format = self.convert_page_no(page)
            string_list.append(f"{{{{SeitePR|{page_format}|"
                               f"Die Gartenlaube ({year}) {page}.{extension}}}}}\n")
            try:
                page_dict = self.data["pages"][str(year)][page]
                if "r" in page_dict:
                    if "ref" in page_dict["r"] and "ref" not in ref:
                        ref.append("ref")
                    for ref_type in page_dict["r"]:
                        if (ref_type != "ref") and (ref_type not in ref):
                            ref.append(ref_type)
            except KeyError:
                self.logger.error(f"The list of pages is incorrect, "
                                  f"year:{year} or page:{page} is missing.")
                return None
        if "ref" in ref:
            string_list.append("{{references|x}}\n")
        for ref_type in ref:
            if ref_type != "ref":
                string_list.append(f"{{{{references|TIT|{ref_type}}}}}\n")
        string_list.append(f"{{{{BlockSatzEnd}}}}\n\n[[Kategorie:Deutschland]]\n"
                           f"[[Kategorie:Neuhochdeutsch]]\n[[Kategorie:Illustrierte Werke]]\n"
                           f"[[Kategorie:Die Gartenlaube ({year:d}) Hefte| {magazine:02d}]]\n")
        string_list.append(f"[[Kategorie:{str(year)[0:3]}0er Jahre]]\n\n")
        return ''.join(string_list)

    def _get_indexes(self) -> Iterator[Tuple[str, IndexPage]]:
        self.searcher_indexes.add_positive_category("Die Gartenlaube")
        self.searcher_indexes.add_positive_category("Index")
        self.searcher_indexes.set_regex_filter(r".*Die Gartenlaube \(\d{4}\)")
        self.searcher_indexes.set_timeout(60)
        for index in self.searcher_indexes.run():
            yield index["title"], IndexPage(self.wiki, f"Index:{index['title']}")

    def search_pages(self):
        self.searcher_pages.add_positive_category("Die Gartenlaube")
        self.searcher_pages.add_namespace("Seite")
        self.searcher_pages.set_search_depth(1)
        self.searcher_pages.set_timeout(60)
        if self.last_run_successful or self.debug:
            delta = (self.timestamp.start_of_run - self.timestamp.last_run).days
            if self.debug:
                delta = 10
            start_of_search = self.create_timestamp_for_search(delta)
            self.searcher_pages.last_change_after(start_of_search)
            self.logger.info(f"The date {start_of_search.strftime('%d.%m.%Y')} "
                             f"is set to the argument \"after\".")
        return self.searcher_pages.run()
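# Illustrative sketch (not part of the bot above): shows what the filename
# regex of GlCreateMagazine extracts from a Commons scan name. The sample
# filename is hypothetical; the snippet is self-contained and runnable.
import re

REGEX_PAGE = re.compile(r"Die_Gartenlaube_\((\d{4})\)_([^\.]*?)\.(?:jpg|JPG)")

if __name__ == "__main__":
    hit = REGEX_PAGE.search("Die_Gartenlaube_(1855)_123.jpg")
    print(hit.group(1), hit.group(2))  # -> 1855 123
    # convert_page_no would then turn a zero-padded token like "0123_b"
    # into the display form "123 b".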
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')

import re

import pywikibot

from tools.petscan import PetScan

# Find every page in the article and "Seite" namespaces that uses the
# template "Sperrschrift" and unify its name to "SperrSchrift".
searcher_catscan = PetScan()
searcher_catscan.add_namespace('Seite')
searcher_catscan.add_namespace(0)
searcher_catscan.add_yes_template('Sperrschrift')
sites = searcher_catscan.run()

site = pywikibot.Site()
for lemma in sites:
    if lemma['a']['nstext'] == '(Article)':
        page = pywikibot.Page(site, lemma['a']['title'])
    else:
        page = pywikibot.Page(site, lemma['a']['nstext'] + ':' + lemma['a']['title'])
    if re.search('Sperrschrift', page.text):
        page.text = re.sub('Sperrschrift', 'SperrSchrift', page.text)
        # edit summary: "unification of the template Sperrschrift to SperrSchrift"
        page.save(summary='bot edit: Vereinheitlichung der Vorlage '
                          'Sperrschrift zu SperrSchrift',
                  botflag=True)
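# Hedged alternative sketch: the re.sub above rewrites every occurrence of
# the word "Sperrschrift", including in running prose. If only the template
# invocation should change, a pattern anchored to the opening braces could
# be used instead (assumption: the template is always opened as
# "{{Sperrschrift").
import re

def rename_template(text: str) -> str:
    # only touch the template opening; parameters and prose stay untouched
    return re.sub(r"\{\{Sperrschrift", "{{SperrSchrift", text)

# rename_template("{{Sperrschrift|gesperrt}} und Sperrschrift im Text")
# -> "{{SperrSchrift|gesperrt}} und Sperrschrift im Text"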
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')

import pywikibot

from tools.petscan import PetScan

# Touch every proofread page ("Seite" namespace) that links to an index
# page, so the wiki re-renders it.
searcher_index = PetScan()
searcher_index.add_namespace('Index')
searcher_index.add_positive_category('Index')
list_of_indexes = searcher_index.run()

wiki = pywikibot.Site()
for idx, index in enumerate(list_of_indexes):
    print('{}/{} {}'.format(idx + 1, len(list_of_indexes), index['a']['title']))
    searcher_sites_of_index = PetScan()
    searcher_sites_of_index.add_namespace('Seite')
    searcher_sites_of_index.add_yes_outlink(index['a']['nstext'] + ':' + index['a']['title'])
    searcher_sites_of_index.add_positive_category('Fertig')
    searcher_sites_of_index.add_positive_category('Korrigiert')
    searcher_sites_of_index.add_positive_category('Unkorrigiert')
    searcher_sites_of_index.set_logic(log_or=True)
    list_of_sites = searcher_sites_of_index.run()
    for idx_site, site in enumerate(list_of_sites):
        title = site['a']['nstext'] + ':' + site['a']['title']
        print('\t{}/{} {}'.format(idx_site + 1, len(list_of_sites), title))
        touchpage = pywikibot.Page(wiki, title=title)
        touchpage.touch()
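# Hedged helper sketch: several scripts here rebuild a full page title from
# PetScan's 'nstext'/'title' fields. A small shared helper (hypothetical,
# not part of tools.petscan) would remove that duplication:

def full_title(lemma: dict) -> str:
    """Join namespace text and title; the article namespace carries no prefix."""
    if lemma['a']['nstext'] == '(Article)':
        return lemma['a']['title']
    return lemma['a']['nstext'] + ':' + lemma['a']['title']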
from tools.petscan import PetScan


def get_list():
    """Fetch all article-namespace lemmas of the category RE:Autor:Arthur Stein."""
    searcher = PetScan()
    searcher.add_positive_category("RE:Autor:Arthur Stein")
    searcher.add_namespace(0)
    return searcher.run()
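# Hedged usage sketch: assuming, as in the GlCreateMagazine bot above, that
# run() returns a list of dicts carrying a "title" key.
if __name__ == "__main__":
    for lemma in get_list():
        print(lemma["title"])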