Example #1
class FixIndex(OneTimeBot):
    bot_name = '20170922_FixIndex'

    def __init__(self, wiki, debug):
        OneTimeBot.__init__(self, wiki, debug)
        self.searcher = PetScan()

    def _search(self):
        self.searcher.add_positive_category('Index')
        return self.searcher.run()

    def task(self):
        lemma_list = self._search()
        for idx, lemma in enumerate(lemma_list):
            page = Page(self.wiki, title='Index:{}'.format(lemma['title']))
            self.logger.info('{}/{}:{}'.format(idx, len(lemma_list), page))
            match = regex_picture.search(page.text)
            if match:
                self.logger.info(match.group(1))
                temp = re.sub(r'\|\d{2,3}px', '', match.group(1))
                if not re.search('thumb', match.group(1)):
                    temp = temp + '|thumb'
                self.logger.info(temp)
                if temp == match.group(1):
                    self.logger.info('nothing to do here.')
                    continue
                temp = '|BILD=[[{}]]'.format(temp)
                temp_text = regex_picture.sub(temp, page.text)
                page.text = temp_text
                page.save(botflag=True, summary='set thumb as parameter')
        return True
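
Note: the excerpt omits its imports and the module-level regex_picture pattern it relies on. Judging from the replacement string '|BILD=[[{}]]', a plausible (hypothetical) reconstruction of that pattern would be:

import re
regex_picture = re.compile(r"\|BILD=\[\[(.*?)\]\]")  # hypothetical; the original definition is not part of the excerpt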
Example #3
 def petscan(self, categories, negative_categories):
     searcher = PetScan()
     for category in categories:
         searcher.add_positive_category(category)
     for neg_category in negative_categories:
         searcher.add_negative_category(neg_category)
     searcher.set_logic_union()
     self.logger.debug(searcher)
     return searcher.run()
Example #4
 def petscan(self, categories: List[str],
             negative_categories: List[str]) -> List[PetscanLemma]:
     searcher = PetScan()
     for category in categories:
         searcher.add_positive_category(category)
     for neg_category in negative_categories:
         searcher.add_negative_category(neg_category)
     searcher.set_logic_union()
     self.logger.debug(str(searcher))
     return searcher.run()
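
For comparison, the same kind of union query can be issued with PetScan directly; a minimal standalone sketch (the category names are illustrative placeholders, not from the original):

from tools.petscan import PetScan

searcher = PetScan()
searcher.add_positive_category("RE:Fertig")
searcher.add_negative_category("RE:Korrigiert")
searcher.set_logic_union()
lemmas = searcher.run()  # list of lemma dicts, each carrying at least a "title" key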
Example #5
def cat_crawler():
    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
        searcher_werke.set_logic(log_or=True)
    searcher_werke.set_search_depth(4)
    list_werke = searcher_werke.run()
    for row in range(len(list_werke)):
        list_werke[row] = list_werke[row]['a']['title']
    with open('output_cat.txt', 'w', encoding='utf-8') as f:
        f.writelines(["%s\n" % item for item in list_werke])
Example #7
 def task(self):  # pragma: no cover
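     # PND (the old German personal-name authority file) was merged into the GND,
     # so the deprecated PND template parameter is renamed to GND below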
     regex = re.compile(r"\n\|PND=")
     searcher = PetScan()
     searcher.add_yes_template("ADBDaten")
     searcher.add_positive_category("ADB:Ohne GND-Link")
     lemma_list = searcher.run()
     for lemma in lemma_list:
         page = Page(self.wiki, lemma["title"])
         temp_text = page.text
         if regex.search(temp_text):
             self.logger.info(f"change {lemma['title']}")
             temp_text = regex.sub("\n|GND=", temp_text)
         page.text = temp_text
         page.save("PND -> GND", botflag=True)
     return True
Example #8
 def get_sites_in_cat(self, list_of_cat, namespace=None, depth=None, any_template: list = None,
                      union=False):
     # pylint: disable=too-many-arguments
     searcher = PetScan()
     for cat in list_of_cat:
         searcher.add_positive_category(cat)
     if any_template:
         for cat in any_template:
             searcher.add_any_template(cat)
     if union:
         searcher.set_logic_union()
     if namespace:
         searcher.add_namespace(namespace)
     if depth:
         searcher.set_search_depth(depth)
     self.logger.info(searcher)
     list_of_lemmas = searcher.run()
     del searcher
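     # '{0:,}' inserts comma thousands separators; swapping them for dots
     # yields German-style grouping, e.g. 1.234.567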
     return '{0:,}'.format(len(list_of_lemmas)).replace(',', '.')
Example #10
 def petscan(self,
             categories,
             not_categories=None,
             article=False,
             year=None):
     searcher = PetScan()
     searcher.set_timeout(120)
     if article:
         # Article
         searcher.add_namespace(0)
     else:
         # Seite
         searcher.add_namespace(102)
     searcher.set_search_depth(5)
     if year:
         searcher.add_positive_category("Die Gartenlaube (" + str(year) +
                                        ")")
     else:
         searcher.add_positive_category("Die Gartenlaube")
     for category in categories:
         searcher.add_positive_category(category)
     if not_categories:
         for category in not_categories:
             searcher.add_negative_category(category)
     self.logger.debug(str(searcher))
     return len(searcher.run())
Example #11
class FixReStructure(OneTimeBot):
    bot_name = '20180125_FixReStructure'

    def __init__(self, wiki, debug):
        OneTimeBot.__init__(self, wiki, debug)
        self.searcher = PetScan()
        self.timeout = timedelta(hours=5)

    def get_lemmas(self):
        self.searcher.add_positive_category("RE:Verweisung")
        self.searcher.add_no_template("REAutor")
        self.searcher.add_yes_template("REDaten")
        self.searcher.set_sort_criteria("size")
        self.searcher.set_sortorder_decending()
        for lemma in self.searcher.run():
            yield Page(self.wiki, lemma['title'])

    @staticmethod
    def process_text(text):
        regex_anmerkungen = re.compile(r"\s*== Anmerkungen")
        if regex_anmerkungen.search(text):
            return regex_anmerkungen.sub("\n{{REAutor|OFF}}\n== Anmerkungen",
                                         text).rstrip()
        else:
            return text.rstrip() + "\n{{REAutor|OFF}}"

    def task(self):
        for idx, page in enumerate(self.get_lemmas()):
            self.logger.info(str(idx) + "/" + str(page))
            pre_text = page.text
            page.text = self.process_text(pre_text)
            if pre_text != page.text:
                page.save(
                    "Inserted a REAutor statement for a correct structure")
            if self._watchdog():
                self.logger.warning("Enough for the day, don't run to long.")
                return False
        return True
Example #13
    def _prepare_searcher(self) -> PetScan:
        searcher = PetScan()
        searcher.add_yes_template("REDaten")

        if self.debug:
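            # namespace 2 is the user namespace, so debug runs only touch user pages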
            searcher.add_namespace(2)
        else:
            searcher.add_namespace(0)
            searcher.add_positive_category("RE:Fertig")
            searcher.add_positive_category("RE:Korrigiert")
            searcher.add_positive_category("RE:Platzhalter")
            searcher.set_logic_union()
            searcher.set_sort_criteria("date")
            searcher.set_sortorder_decending()
            searcher.set_timeout(120)
        return searcher
Example #14
 def petscan(self, categories, not_categories=None, article=False, year=None):
     searcher = PetScan()
     if article:
         searcher.add_namespace("Article")
     else:
         searcher.add_namespace("Seite")
     searcher.set_search_depth(5)
     if year:
         searcher.add_positive_category("Die Gartenlaube (" + str(year) + ")")
     else:
         searcher.add_positive_category("Die Gartenlaube")
     for category in categories:
         searcher.add_positive_category(category)
     if not_categories:
         for category in not_categories:
             searcher.add_negative_category(category)
     self.logger.debug(str(searcher))
     return len(searcher.run())
Example #15
class TestCatScan(TestCase):
    def setUp(self):
        self.petscan = PetScan()

    def test_add_options(self):
        self.petscan.add_options({"max_age": "45"})
        self.petscan.add_options({"smaller": "300"})
        self.assertDictEqual({"smaller": "300", "max_age": "45"}, self.petscan.options)

    def test_add_categoy(self):
        self.petscan.add_positive_category("pos1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_positive_category("pos3", 2)
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg2")
        self.petscan.add_negative_category("neg3", 3)
        self.assertEqual(["pos1", "pos2", "pos3|2"], self.petscan.categories["positive"])
        self.assertEqual(["neg1", "neg2", "neg3|3"], self.petscan.categories["negative"])

    def test_add_namespace(self):
        self.petscan.add_namespace(0)
        self.petscan.add_namespace("Datei")
        self.petscan.add_namespace([2, "Vorlage"])
        self.assertDictEqual({"ns[0]": "1", "ns[2]": "1", "ns[6]": "1", "ns[10]": "1"}, self.petscan.options)

    def test_activate_redirects(self):
        self.petscan.activate_redirects()
        self.assertDictEqual({"show_redirects": "yes"}, self.petscan.options)

    def test_deactivate_redirects(self):
        self.petscan.deactivate_redirects()
        self.assertDictEqual({"show_redirects": "no"}, self.petscan.options)

    def test_last_change_before(self):
        self.petscan.last_change_before(datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"before": "12340101020242"}, self.petscan.options)

    def test_last_change_after(self):
        self.petscan.last_change_after(datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"after": "12340101020242"}, self.petscan.options)

    def test_max_age(self):
        self.petscan.max_age(1234)
        self.assertDictEqual({"max_age": "1234"}, self.petscan.options)

    def test_only_new(self):
        self.petscan.only_new()
        self.assertDictEqual({"only_new": "1"}, self.petscan.options)

    def test_smaller_then(self):
        self.petscan.smaller_then(42)
        self.assertDictEqual({"smaller": "42"}, self.petscan.options)

    def test_larger_then(self):
        self.petscan.larger_then(42)
        self.assertDictEqual({"larger": "42"}, self.petscan.options)

    def test_get_wikidata(self):
        self.petscan.get_wikidata_items()
        self.assertDictEqual({"wikidata_item": "any"}, self.petscan.options)

    def test_get_Pages_with_wikidata(self):
        self.petscan.get_pages_with_wd_items()
        self.assertDictEqual({"wikidata_item": "with"}, self.petscan.options)

    def test_get_Pages_without_wikidata(self):
        self.petscan.get_pages_without_wd_items()
        self.assertDictEqual({"wikidata_item": "without"}, self.petscan.options)

    def test_set_or(self):
        self.petscan.set_logic_union()
        self.assertDictEqual({"combination": "union"}, self.petscan.options)

    def test_set_regex(self):
        self.petscan.set_regex_filter("abc")
        self.assertDictEqual({"regexp_filter": "abc"}, self.petscan.options)

    def test_set_last_edits(self):
        self.petscan.set_last_edit_bots(True)
        self.petscan.set_last_edit_anons(False)
        self.petscan.set_last_edit_flagged()
        self.assertDictEqual({"edits[bots]": "yes", "edits[anons]": "no", "edits[flagged]": "yes"}, self.petscan.options)

    def test_construct_cat_string(self):
        self.petscan.add_positive_category("pos 1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg 2")
        self.petscan.add_negative_category("neg3")
        self.assertEqual("pos+1\r\npos2", self.petscan._construct_list_argument(self.petscan.categories["positive"]))
        self.assertEqual("neg1\r\nneg+2\r\nneg3",
                         self.petscan._construct_list_argument(self.petscan.categories["negative"]))

    def test_construct_templates(self):
        self.petscan.add_yes_template("yes1")
        self.petscan.add_yes_template("yes2")
        self.petscan.add_any_template("any1")
        self.petscan.add_any_template("any2")
        self.petscan.add_any_template("any3")
        self.petscan.add_no_template("no1")
        self.petscan.add_no_template("no2")
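        # %0D%0A is the URL-encoded "\r\n" separator that PetScan expects between list entries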
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource&templates_yes=yes1%0D%0Ayes2&templates_any=any1%0D%0Aany2%0D%0Aany3&templates_no=no1%0D%0Ano2")

    def test_construct_outlinks(self):
        self.petscan.add_yes_outlink("yes1")
        self.petscan.add_yes_outlink("yes2")
        self.petscan.add_any_outlink("any1")
        self.petscan.add_any_outlink("any2")
        self.petscan.add_any_outlink("any3")
        self.petscan.add_no_outlink("no1")
        self.petscan.add_no_outlink("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource&outlinks_yes=yes1%0D%0Ayes2&outlinks_any=any1%0D%0Aany2%0D%0Aany3&outlinks_no=no1%0D%0Ano2")

    def test_construct_links_to(self):
        self.petscan.add_yes_links_to("yes1")
        self.petscan.add_yes_links_to("yes2")
        self.petscan.add_any_links_to("any1")
        self.petscan.add_any_links_to("any2")
        self.petscan.add_any_links_to("any3")
        self.petscan.add_no_links_to("no1")
        self.petscan.add_no_links_to("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource&links_to_all=yes1%0D%0Ayes2&links_to_any=any1%0D%0Aany2%0D%0Aany3&links_to_no=no1%0D%0Ano2")


    def test_construct_options(self):
        self.petscan.options = {"max_age": "1234",
                                "get_q": "1",
                                "show_redirects": "yes"}
        self.assertEqual("&max_age=1234" in str(self.petscan), True)
        self.assertEqual("&get_q=1" in str(self.petscan), True)
        self.assertEqual("&show_redirects=yes" in str(self.petscan), True)

    def test_construct_string(self):
        self.petscan.set_language("en")
        self.petscan.set_project("wikipedia")
        # only a positive category
        self.petscan.add_positive_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&categories=test")
        # only a negative category
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_negative_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&negcats=test")
        # only a option
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_options({"max_age": "10"})
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10")

    def test_do_positive(self):
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     text='{"n": "result","a": {"querytime_sec": 1.572163,'
                          '"query": "https://petscan.wmflabs.org/?language=de'
                          '&project=wikisource&categories=Autoren&get_q=1'
                          '&show_redirects=no&ns[0]=1&max_age=48'
                          '&format=json&doit=1"},'
                          '"*": [{"n": "combination",'
                          '"a": {"type": "subset",'
                          '"*": [{"id": 3279,'
                          '"len": 10197,'
                          '"n": "page",'
                          '"namespace": 0,'
                          '"nstext": "",'
                          '"q": "Q60644",'
                          '"title": "Friedrich_Rückert",'
                          '"touched": "20161024211701"}]}}]}')
            self.assertEqual(self.petscan.run(), [{"id": 3279,
                                                   "len": 10197,
                                                   "n": "page",
                                                   "namespace": 0,
                                                   "nstext": "",
                                                   "q": "Q60644",
                                                   "title": "Friedrich_Rückert",
                                                   "touched": "20161024211701"}])

    def test_do_negative(self):
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     status_code=404)
            with self.assertRaises(ConnectionError):
                self.petscan.run()
Example #16
class TestCatScan(TestCase):
    def setUp(self):
        self.petscan = PetScan()

    def test_add_options(self):
        self.petscan.add_options({"max_age": "45"})
        self.petscan.add_options({"smaller": "300"})
        self.assertDictEqual({
            "smaller": "300",
            "max_age": "45"
        }, self.petscan.options)

    def test_add_categoy(self):
        self.petscan.add_positive_category("pos1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_positive_category("pos3", 2)
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg2")
        self.petscan.add_negative_category("neg3", 3)
        self.assertEqual(["pos1", "pos2", "pos3|2"],
                         self.petscan.categories["positive"])
        self.assertEqual(["neg1", "neg2", "neg3|3"],
                         self.petscan.categories["negative"])

    def test_add_namespace(self):
        self.petscan.add_namespace(0)
        self.petscan.add_namespace([2, 10])
        self.assertDictEqual({
            "ns[0]": "1",
            "ns[2]": "1",
            "ns[10]": "1"
        }, self.petscan.options)

    def test_activate_redirects(self):
        self.petscan.activate_redirects()
        self.assertDictEqual({"show_redirects": "yes"}, self.petscan.options)

    def test_deactivate_redirects(self):
        self.petscan.deactivate_redirects()
        self.assertDictEqual({"show_redirects": "no"}, self.petscan.options)

    def test_last_change_before(self):
        self.petscan.last_change_before(
            datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"before": "12340101020242"},
                             self.petscan.options)

    def test_last_change_after(self):
        self.petscan.last_change_after(
            datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42))
        self.assertDictEqual({"after": "12340101020242"}, self.petscan.options)

    def test_max_age(self):
        self.petscan.max_age(1234)
        self.assertDictEqual({"max_age": "1234"}, self.petscan.options)

    def test_only_new(self):
        self.petscan.only_new()
        self.assertDictEqual({"only_new": "1"}, self.petscan.options)

    def test_smaller_then(self):
        self.petscan.smaller_then(42)
        self.assertDictEqual({"smaller": "42"}, self.petscan.options)

    def test_larger_then(self):
        self.petscan.larger_then(42)
        self.assertDictEqual({"larger": "42"}, self.petscan.options)

    def test_get_wikidata(self):
        self.petscan.get_wikidata_items()
        self.assertDictEqual({"wikidata_item": "any"}, self.petscan.options)

    def test_get_Pages_with_wikidata(self):
        self.petscan.get_pages_with_wd_items()
        self.assertDictEqual({"wikidata_item": "with"}, self.petscan.options)

    def test_get_Pages_without_wikidata(self):
        self.petscan.get_pages_without_wd_items()
        self.assertDictEqual({"wikidata_item": "without"},
                             self.petscan.options)

    def test_set_or(self):
        self.petscan.set_logic_union()
        self.assertDictEqual({"combination": "union"}, self.petscan.options)

    def test_set_regex(self):
        self.petscan.set_regex_filter("abc")
        self.assertDictEqual({"regexp_filter": "abc"}, self.petscan.options)

    def test_set_last_edits(self):
        self.petscan.set_last_edit_bots(True)
        self.petscan.set_last_edit_anons(False)
        self.petscan.set_last_edit_flagged()
        self.assertDictEqual(
            {
                "edits[bots]": "yes",
                "edits[anons]": "no",
                "edits[flagged]": "yes"
            }, self.petscan.options)

    def test_construct_cat_string(self):
        self.petscan.add_positive_category("pos 1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg 2")
        self.petscan.add_negative_category("neg3")
        self.assertEqual(
            "pos+1\r\npos2",
            self.petscan._construct_list_argument(
                self.petscan.categories["positive"]))
        self.assertEqual(
            "neg1\r\nneg+2\r\nneg3",
            self.petscan._construct_list_argument(
                self.petscan.categories["negative"]))

    def test_construct_templates(self):
        self.petscan.add_yes_template("yes1")
        self.petscan.add_yes_template("yes2")
        self.petscan.add_any_template("any1")
        self.petscan.add_any_template("any2")
        self.petscan.add_any_template("any3")
        self.petscan.add_no_template("no1")
        self.petscan.add_no_template("no2")
        self.assertEqual(
            str(self.petscan), "https://petscan.wmflabs.org/?language=de"
            "&project=wikisource"
            "&templates_yes=yes1%0D%0Ayes2"
            "&templates_any=any1%0D%0Aany2%0D%0Aany3"
            "&templates_no=no1%0D%0Ano2")

    def test_construct_outlinks(self):
        self.petscan.add_yes_outlink("yes1")
        self.petscan.add_yes_outlink("yes2")
        self.petscan.add_any_outlink("any1")
        self.petscan.add_any_outlink("any2")
        self.petscan.add_any_outlink("any3")
        self.petscan.add_no_outlink("no1")
        self.petscan.add_no_outlink("no2")
        self.assertEqual(
            str(self.petscan), "https://petscan.wmflabs.org/?language=de"
            "&project=wikisource"
            "&outlinks_yes=yes1%0D%0Ayes2"
            "&outlinks_any=any1%0D%0Aany2%0D%0Aany3"
            "&outlinks_no=no1%0D%0Ano2")

    def test_construct_links_to(self):
        self.petscan.add_yes_links_to("yes1")
        self.petscan.add_yes_links_to("yes2")
        self.petscan.add_any_links_to("any1")
        self.petscan.add_any_links_to("any2")
        self.petscan.add_any_links_to("any3")
        self.petscan.add_no_links_to("no1")
        self.petscan.add_no_links_to("no2")
        self.assertEqual(
            str(self.petscan), "https://petscan.wmflabs.org/?language=de"
            "&project=wikisource"
            "&links_to_all=yes1%0D%0Ayes2"
            "&links_to_any=any1%0D%0Aany2%0D%0Aany3"
            "&links_to_no=no1%0D%0Ano2")

    def test_construct_options(self):
        self.petscan.options = {
            "max_age": "1234",
            "get_q": "1",
            "show_redirects": "yes"
        }
        self.assertEqual("&max_age=1234" in str(self.petscan), True)
        self.assertEqual("&get_q=1" in str(self.petscan), True)
        self.assertEqual("&show_redirects=yes" in str(self.petscan), True)

    def test_construct_string(self):
        self.petscan.set_language("en")
        self.petscan.set_project("wikipedia")
        # only a positive category
        self.petscan.add_positive_category("test")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=en&project=wikipedia&categories=test"
        )
        # only a negative category
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_negative_category("test")
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=en&project=wikipedia&negcats=test"
        )
        # only a option
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_options({"max_age": "10"})
        self.assertEqual(
            str(self.petscan),
            "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10"
        )

    def test_do_positive(self):
        with requests_mock.mock() as mock:
            mock.get(
                "https://petscan.wmflabs.org/"
                "?language=de&project=wikisource&format=json&doit=1",
                text='{"n": "result","a": {"querytime_sec": 1.572163,'
                '"query": "https://petscan.wmflabs.org/?language=de'
                '&project=wikisource&categories=Autoren&get_q=1'
                '&show_redirects=no&ns[0]=1&max_age=48'
                '&format=json&doit=1"},'
                '"*": [{"n": "combination",'
                '"a": {"type": "subset",'
                '"*": [{"id": 3279,'
                '"len": 10197,'
                '"n": "page",'
                '"namespace": 0,'
                '"nstext": "",'
                '"q": "Q60644",'
                '"title": "Friedrich_Rückert",'
                '"touched": "20161024211701"}]}}]}')
            self.assertEqual(self.petscan.run(), [{
                "id": 3279,
                "len": 10197,
                "n": "page",
                "namespace": 0,
                "nstext": "",
                "q": "Q60644",
                "title": "Friedrich_Rückert",
                "touched": "20161024211701"
            }])

    def test_do_negative(self):
        with requests_mock.mock() as mock:
            mock.get(
                "https://petscan.wmflabs.org/"
                "?language=de&project=wikisource&format=json&doit=1",
                status_code=404)
            with self.assertRaises(PetScanException):
                self.petscan.run()
Example #17
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')
import re
from pywikibot import Page, Site
from tools.petscan import PetScan

wiki = Site()

searcher = PetScan()
searcher.add_positive_category("RE:Korrigiert")
lemma_list = searcher.run()

list_for_pfaerrich = []
for idx_lem, lemma in enumerate(lemma_list):
    print(idx_lem)
    page = Page(wiki, lemma['title'])
    version_history = page.fullVersionHistory()[::-1]
    size_all_changes = 0
    for idx_rev, revision in enumerate(version_history):
        user = revision.user
        if user == 'Pfaerrich':
            if idx_rev > 0:
                size_prev = len(version_history[idx_rev - 1].text)
            else:
                size_prev = 0
            size_all_changes += abs(
                len(version_history[idx_rev].text) - size_prev)
    korrigiert_flag = False
    if size_all_changes > 0:
Example #18
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')
import re
import pywikibot
import datetime
from tools.petscan import PetScan

searcher = PetScan()
searcher.add_positive_category('Werke')
searcher.add_positive_category('Unkorrigiert')
lemmas = searcher.run()

wiki = pywikibot.Site()
Example #19
class AuthorList(CanonicalBot):
    # pylint: disable=bare-except,too-many-branches,broad-except
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher = PetScan()
        self.repo = self.wiki.data_repository()  # this is a DataSite object
        self.string_list = []
        self.match_property = re.compile(r"\{\{#property:P(\d{1,4})\}\}")
        self.number_to_month = {
            1: "Januar",
            2: "Februar",
            3: "März",
            4: "April",
            5: "Mai",
            6: "Juni",
            7: "Juli",
            8: "August",
            9: "September",
            10: "Oktober",
            11: "November",
            12: "Dezember"
        }

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if self.timestamp.start_of_run.day == 1:
            self.data.assign_dict({})
            self.logger.warning(
                "The data is thrown away. It is the first of the month")
        return self

    def task(self):
        lemma_list = self._run_searcher()
        self._build_database(lemma_list)
        if self.debug:
            dump = Page(self.wiki, f"Benutzer:THEbotIT/{self.bot_name}")
        else:
            dump = Page(self.wiki, "Liste der Autoren")
        old_text = dump.text
        new_text = self._convert_to_table()
        if new_text[150:] != old_text[150:]:  # compare all but the date
            dump.text = new_text
            dump.save("Die Liste wurde auf den aktuellen Stand gebracht.",
                      botflag=True)
        else:
            self.logger.info("Heute gab es keine Änderungen, "
                             "daher wird die Seite nicht überschrieben.")
        return True

    def _run_searcher(self):
        # was the last run successful
        if self.debug:
            # if False
            yesterday = datetime.now() - timedelta(days=5)
            self.searcher.last_change_after(
                datetime(year=int(yesterday.strftime("%Y")),
                         month=int(yesterday.strftime("%m")),
                         day=int(yesterday.strftime("%d"))))
        elif self.last_run_successful and self.data:
            start_of_search = self.create_timestamp_for_search()
            self.searcher.last_change_after(start_of_search)
            self.logger.info(
                f"The date {start_of_search.strftime('%d.%m.%Y')} "
                f"is set to the argument \"after\".")
        else:
            self.logger.warning(
                "There was no timestamp found of the last run, "
                "so the argument \"after\" is not set.")
        self.searcher.add_namespace(0)  # search in main namespace
        self.searcher.add_positive_category("Autoren")
        self.searcher.add_yes_template("Personendaten")
        self.searcher.get_wikidata_items()

        self.logger.debug(self.searcher)

        entries_to_search = self.searcher.run()
        return entries_to_search

    _space_regex = re.compile(r"\s+")

    def _strip_spaces(self, raw_string: str):
        return self._space_regex.sub(" ", raw_string).strip()  # collapse whitespace runs, trim the ends

    def _build_database(self, lemma_list):
        # pylint: disable=too-many-statements
        for idx, author in enumerate(lemma_list):
            self.logger.debug(f"{idx + 1}/{len(lemma_list)} {author['title']}")
            # delete preexisting data of this author
            try:
                del self.data[str(author["id"])]
            except KeyError:
                if self.last_run_successful:
                    self.logger.info(
                        f"Can't delete old entry of [[{author['title']}]]")

            dict_author = {"title": author["title"]}
            # extract the Personendaten-block form the wikisource page
            page = Page(self.wiki, author["title"])
            try:
                try:
                    personendaten = re.search(
                        r"\{\{Personendaten(?:.|\n)*?\n\}\}\n",
                        page.text).group()
                except AttributeError:
                    self.logger.error(
                        f"No valid block \"Personendaten\" was found for "
                        f"[[{author['title']}]].")
                    personendaten = None
                if personendaten:
                    # personendaten = re.sub('<ref.*?>.*?<\/ref>|<ref.*?\/>', '', personendaten)
                    # personendaten = re.sub('\{\{CRef|.*?(?:\{\{.*?\}\})?}}', '', personendaten)
                    template_extractor = TemplateHandler(personendaten)
                    dict_author.update({
                        "name":
                        self._strip_spaces(
                            template_extractor.get_parameter("NACHNAME")
                            ["value"])
                    })
                    dict_author.update({
                        "first_name":
                        self._strip_spaces(
                            template_extractor.get_parameter("VORNAMEN")
                            ["value"])
                    })
                    try:
                        dict_author.update({
                            "birth":
                            self._strip_spaces(
                                template_extractor.get_parameter(
                                    "GEBURTSDATUM")["value"])
                        })
                    except Exception:
                        dict_author.update({"birth": ""})
                        self.logger.warning(
                            f"Templatehandler couldn't find a birthdate for: "
                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({
                            "death":
                            self._strip_spaces(
                                template_extractor.get_parameter("STERBEDATUM")
                                ["value"])
                        })
                    except Exception:
                        dict_author.update({"death": ""})
                        self.logger.warning(
                            f"Templatehandler couldn't find a deathdate for: "
                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({
                            "description":
                            template_extractor.get_parameter(
                                "KURZBESCHREIBUNG")["value"]
                        })
                    except Exception:
                        dict_author.update({"description": ""})
                        self.logger.warning(
                            f"Templatehandler couldn't find a description for: "
                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({
                            "synonyms":
                            template_extractor.get_parameter("ALTERNATIVNAMEN")
                            ["value"]
                        })
                    except Exception:
                        dict_author.update({"synonyms": ""})
                        self.logger.warning(
                            f"Templatehandler couldn't find synonyms for: "
                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({
                            "sortkey":
                            template_extractor.get_parameter("SORTIERUNG")
                            ["value"]
                        })
                        if dict_author["sortkey"] == "":
                            raise ValueError
                    except Exception:
                        self.logger.debug(
                            f"there is no sortkey for [[{author['title']}]].")
                        # make a dummy key
                        if not dict_author["name"]:
                            dict_author["sortkey"] = dict_author["first_name"]
                            self.logger.warning("Author has no last name.")
                        elif not dict_author["first_name"]:
                            dict_author["sortkey"] = dict_author["name"]
                            self.logger.warning(
                                "Author has no last first_name.")
                        else:
                            dict_author["sortkey"] = \
                                dict_author["name"] + ", " + dict_author["first_name"]
                    try:
                        dict_author.update({"wikidata": author["q"]})
                    except KeyError:
                        self.logger.warning(
                            f"The autor [[{author['title']}]] has no wikidata_item"
                        )
                    self.data.update({author["id"]: dict_author})
            except Exception as exception:
                self.logger.exception("Exception not catched: ",
                                      exc_info=exception)
                self.logger.error(f"author {author['title']} have a problem")

    @staticmethod
    def _sort_author_list(list_authors):
        list_authors.sort(key=lambda x: x[0])
        for i in range(len(list_authors) - 1):
            if list_authors[i][0] == list_authors[i + 1][0]:
                equal_count = 2
                while i + equal_count < len(list_authors):
                    if list_authors[i][0] != list_authors[i + equal_count][0]:
                        break
                    equal_count += 1
                temp_list = list_authors[i:i + equal_count]
                temp_list.sort(key=lambda x: x[5])  # sort by birth date
                list_authors[i:i + equal_count] = temp_list

    def _convert_to_table(self):
        # pylint: disable=too-many-locals
        # make a list of lists
        self.logger.info("Start compiling.")
        list_authors = []
        for key in self.data:
            author_dict = self.data[key]
            list_author = []
            list_author.append(author_dict["sortkey"])  # 0
            list_author.append(author_dict["title"].replace("_", " "))  # 1
            list_author.append(author_dict["name"])  # 2
            list_author.append(author_dict["first_name"])  # 3

            for event in ["birth", "death"]:
                list_author.append(
                    self._handle_birth_and_death(event, author_dict))  # 4,6
                try:
                    list_author.append(str(DateConversion(
                        list_author[-1])))  # 5,7
                except ValueError:
                    self.logger.error(
                        f"Can´t compile sort key for {author_dict['title']}: "
                        f"{event}/{author_dict[event]}")
                    list_author.append("!-00-00")  # 5,7
            list_author.append(author_dict["description"])  # 8
            list_authors.append(list_author)

        # sorting the list
        self.logger.info("Start sorting.")
        self._sort_author_list(list_authors)

        self.logger.info("Start printing.")
        start_of_run = self.timestamp.start_of_run
        self.string_list.append(
            f"Diese Liste der Autoren enthält alle {len(self.data)}<ref>Stand: "
            f"{start_of_run.day}.{start_of_run.month}.{start_of_run.year}, "
            f"{self.timestamp.start_of_run.strftime('%H:%M')} (UTC)</ref> Autoren, "
            f"zu denen in Wikisource eine Autorenseite existiert.")
        self.string_list.append(
            "Die Liste kann mit den Buttons neben den Spaltenüberschriften"
            " nach der jeweiligen Spalte sortiert werden.")
        self.string_list.append("<!--")
        self.string_list.append(
            "Diese Liste wurde durch ein Computerprogramm erstellt, "
            "das die Daten verwendet, "
            "die aus den Infoboxen auf den Autorenseiten stammen.")
        self.string_list.append(
            "Sollten daher Fehler vorhanden sein, "
            "sollten diese jeweils dort korrigiert werden.")
        self.string_list.append("-->")
        self.string_list.append("{|class=\"wikitable sortable\"")
        self.string_list.append("!style=\"width:20%\"| Name")
        self.string_list.append(
            "!data-sort-type=\"text\" style=\"width:15%\"| Geb.-datum")
        self.string_list.append(
            "!data-sort-type=\"text\" style=\"width:15%\"| Tod.-datum")
        self.string_list.append(
            "!class=\"unsortable\" style=\"width:50%\"| Beschreibung")
        for list_author in list_authors:
            aut_sort, aut_page, aut_sur, aut_pre, birth_str, \
                birth_sort, death_str, death_sort, description = \
                list_author
            self.string_list.append("|-")
            if aut_sur and aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_sur}, {aut_pre}]]")
            elif aut_pre:
                self.string_list.append(
                    f"|data-sort-value=\"{aut_sort}\"|[[{aut_page}|{aut_pre}]]"
                )
            else:
                self.string_list.append(
                    f"|data-sort-value=\"{aut_sort}\"|[[{aut_page}|{aut_sur}]]"
                )
            self.string_list.append(
                f"|data-sort-value=\"{birth_sort}\"|{birth_str}")
            self.string_list.append(
                f"|data-sort-value=\"{death_sort}\"|{death_str}")
            self.string_list.append(f"|{description}")
        self.string_list.append("|}")
        self.string_list.append('')
        self.string_list.append("== Anmerkungen ==")
        self.string_list.append("<references/>")
        self.string_list.append('')
        self.string_list.append("{{SORTIERUNG:Autoren #Liste der}}")
        self.string_list.append("[[Kategorie:Listen]]")
        self.string_list.append("[[Kategorie:Autoren|!]]")

        return "\n".join(self.string_list)

    def _handle_birth_and_death(self, event, author_dict):
        if author_dict[event] == '' or self.match_property.search(
                author_dict[event]):
            self.logger.debug(
                f"No valid entry in {event} for "
                f"[[{author_dict['title']}]] ... Fallback to wikidata")
            try:
                item = ItemPage(self.repo, author_dict["wikidata"])
                if event == "birth":
                    property_label = "P569"
                else:
                    property_label = "P570"
                claim = item.text["claims"][property_label][0]
                date_from_data = claim.getTarget()
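                # Wikidata time precision codes: 7 = century, 8 = decade,
                # 9 = year, 10 = month, 11 = day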
                if date_from_data.precision < 7:
                    self.logger.error(
                        f"Precison is to low for [[{author_dict['title']}]]")
                elif date_from_data.precision < 8:
                    date_from_data = int(
                        ceil(float(date_from_data.year) / 100.0) * 100)
                    if date_from_data < 1000:
                        date_from_data = str(date_from_data)[0:1] + ". Jh."
                    else:
                        date_from_data = str(date_from_data)[0:2] + ". Jh."
                elif date_from_data.precision < 10:
                    date_from_data = str(date_from_data.year)
                elif date_from_data.precision < 11:
                    date_from_data = self.number_to_month[date_from_data.month] + " " + \
                        str(date_from_data.year)
                else:
                    date_from_data = f"{date_from_data.day}. " \
                        f"{self.number_to_month[date_from_data.month]} " \
                        f"{date_from_data.year}"
                if re.search("-", date_from_data):
                    date_from_data = date_from_data.replace("-",
                                                            "") + " v. Chr."
                self.logger.debug(
                    f"Found {date_from_data} @ wikidata for {event}")
                return date_from_data  # 4,6
            except Exception:
                self.logger.debug("Wasn't able to ge any data from wikidata")
                return ''  # 4,6
        else:
            return author_dict[event]  # 4,6
Example #21
class GlCreateMagazine(CanonicalBot):
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher_pages = PetScan()
        self.searcher_indexes = PetScan()
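        # page files are named like "Die_Gartenlaube_(1855)_123.jpg":
        # group 1 captures the year, group 2 the page id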
        self.regex_page = re.compile(
            r"Die_Gartenlaube_\((\d{4})\)_([^\.]*?)\.(?:jpg|JPG)")
        self.regex_index = re.compile(r"Die_Gartenlaube_\((\d{4})\)")
        self.regex_magazine_in_index = \
            re.compile(r"((?:Heft|Halbheft) (?:\{\{0\}\})?\d{1,2}:.*?(?:\n\n|\Z))", re.DOTALL)
        self.regex_page_in_magazine = re.compile(r"_([_\w]{1,9}).(?:jpg|JPG)")
        self.regex_number_in_index = re.compile(
            r"(?:Heft|Halbheft) (?:\{\{0\}\})?(\d{1,2}):?")
        self.new_data_model = datetime(year=2018, month=7, day=1, hour=14)
        self.lemmas = None

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if not self.data:
            self.data.assign_dict({"pages": {}, "indexes": {}})
        return self

    def task(self):
        self.lemmas = self.search_pages()
        temp_data_pages = {}
        self.process_indexes()
        self.process_pages(temp_data_pages)
        temp_data_magazines = self.process_actual_pages(temp_data_pages)
        self.make_magazines(temp_data_magazines)
        return True

    def process_pages(self, temp_data):
        for idx, lemma in enumerate(self.lemmas):
            try:
                hit = self.regex_page.search(lemma["title"])
                year = hit.group(1)
                page = hit.group(2)
                if year not in self.data["pages"].keys():
                    self.data["pages"][year] = {}
                proofread_lemma = ProofreadPage(self.wiki,
                                                f"Seite:{lemma['title']}")
                if self.debug:
                    self.logger.debug(
                        f"{idx + 1}/{len(self.lemmas)} Page {page}({year}) "
                        f"has quality level {proofread_lemma.quality_level} "
                        f"_ Seite:{lemma['title']}")
                ref = search_for_refs(proofread_lemma.text)
                page_dict = {"q": int(proofread_lemma.quality_level)}
                if ref:
                    self.logger.debug(
                        f"There are refs ({ref}) @ {year}, {page}")
                    page_dict.update({"r": ref})
                self.data["pages"][year][page] = page_dict
                if year not in temp_data.keys():
                    temp_data[year] = []
                temp_data[year].append(page)
            except Exception as error:  # pylint: disable=broad-except
                self.logger.error(
                    f"wasn't able to process {lemma['title']}, error: {error}")

    def process_indexes(self):
        for index_lemma, index_page in self._get_indexes():
            self.logger.debug(f"[[Index:{index_lemma}]]")
            magazines = self.regex_magazine_in_index.findall(index_page.text)
            hit_year = self.regex_index.search(index_lemma)
            year = hit_year.group(1)
            if year not in self.data["indexes"].keys():
                self.data["indexes"][year] = {}
            for magazine in magazines:
                pages = self.regex_page_in_magazine.findall(magazine)
                hit_number = self.regex_number_in_index.findall(magazine)
                number = int(hit_number[0])
                self.data["indexes"][year][number] = pages

    def process_actual_pages(self, dictionary_of_new_pages):
        tempdata_magzines = {}
        for year in dictionary_of_new_pages:
            set_of_pages = set(dictionary_of_new_pages[year])
            tempdata_magzines[year] = set()
            try:
                dictionary_of_magazines = self.data["indexes"][year]
            except KeyError as error:
                raise BotException(
                    f"The list of indexes is incorrect, {year} is missing."
                ) from error
            for magazine in dictionary_of_magazines:
                set_of_potential_pages = set(dictionary_of_magazines[magazine])
                if set_of_potential_pages.intersection(set_of_pages):
                    tempdata_magzines[year].add(magazine)
        return tempdata_magzines

    def make_magazines(self, dictionary_of_magazines_by_year):
        for idx_year, year in enumerate(dictionary_of_magazines_by_year):
            magazines = dictionary_of_magazines_by_year[year]
            self.logger.debug(f"make_mag_year {idx_year + 1}/"
                              f"{len(dictionary_of_magazines_by_year)}")
            for idx_mag, magazine in enumerate(magazines):
                self.logger.debug(
                    f"make_mag_mag {idx_mag + 1}/{len(magazines)} ... issue:{year}/{magazine}"
                )
                if year == "1986" and magazine == "31":
                    self.logger.warning(
                        "There is magazine 1986, 31, this is special, no creating here"
                    )
                    continue
                if self.debug:
                    lemma = Page(self.wiki, "Benutzer:THEbotIT/Test")
                else:
                    lemma = Page(
                        self.wiki,
                        f"Die Gartenlaube ({year})/Heft {int(magazine):d}")
                new_text = self.make_magazine(year, magazine)
                if new_text:
                    if hash(new_text.strip()) != hash(lemma.text.strip()):
                        self.logger.debug(
                            f"Print [[Die Gartenlaube ({year})/Heft {magazine}]]."
                        )
                        if lemma.text != '':
                            lemma.text = new_text
                            lemma.save(
                                "Automatische Aktualisierung des Heftes",
                                botflag=True)
                        else:
                            lemma.text = new_text
                            lemma.save("automatische Hefterstellung",
                                       botflag=True)
                    else:
                        self.logger.debug(
                            f"Keine Änderung im Text ({year}/{magazine}).")

    def make_magazine(self, year, magazine):
        last_magazine = True
        try:
            for key in self.data["indexes"][year].keys():
                if int(key) > int(magazine):
                    last_magazine = False
                    break
        except KeyError as error:
            raise BotException(f"The list of indexes is incorrect, {year} is missing.") \
                from error
        try:
            list_of_pages = self.data["indexes"][year][magazine]
        except KeyError as error:
            raise BotException(f"The list of indexes is incorrect, year:{year} or mag:{magazine} is missing.") \
                from error
        quality = 4
        for page in list_of_pages:
            try:
                if self.data["pages"][year][page]["q"] == 0:
                    page_quality = 4
                else:
                    page_quality = self.data["pages"][year][page]["q"]
                if page_quality < quality:
                    quality = page_quality
                if quality < 3:
                    self.logger.debug(
                        f"The quality of {year}/{magazine} is too poor.")
                    return None
            except KeyError:
                self.logger.warning(f"The list of pages is incorrect, "
                                    f"year:{year} or page:{page} is missing.")
                return None
        return self.make_magazine_text(year, magazine, quality, list_of_pages,
                                       last_magazine)

    @staticmethod
    def convert_page_no(page: str):
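        # strip leading zeros from the page id and restore spaces, e.g. "012_b" -> "12 b"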
        while True:
            if page[0] == "0":
                page = page[1:]
            else:
                break
        return page.replace("_", " ")

    def make_magazine_text(self, year, magazine, quality, list_of_pages, last):
        # pylint: disable=too-many-arguments,too-many-branches
        magazine = int(magazine)
        year = int(year)
        string_list = []
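        # doubled braces in the f-strings below render as literal {{ }} wiki template markup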
        string_list.append(
            "<!--Diese Seite wurde automatisch durch einen Bot erstellt. "
            "Wenn du einen Fehler findest oder eine Änderung wünscht, "
            "benachrichtige bitte den Betreiber, THE IT, des Bots.-->\n"
            "{{Textdaten\n")
        if magazine > 1:
            string_list.append(
                f"|VORIGER=Die Gartenlaube ({year:d})/Heft {magazine - 1:d}\n")
        else:
            string_list.append("|VORIGER=\n")
        if last:
            string_list.append("|NÄCHSTER=\n")
        else:
            string_list.append(
                f"|NÄCHSTER=Die Gartenlaube ({year:d})/Heft {magazine + 1:d}\n"
            )
        string_list.append(
            f"|AUTOR=Verschiedene\n"
            f"|TITEL=[[Die Gartenlaube ({year})|Die Gartenlaube]]\n"
            f"|SUBTITEL=''Illustrirtes Familienblatt''\n"
            f"|HERKUNFT=off\n")
        if year < 1863:
            string_list.append("|HERAUSGEBER=[[Ferdinand Stolle]]\n")
        elif (year < 1878) or (year == 1878 and magazine < 14):
            string_list.append("|HERAUSGEBER=[[Ernst Keil]]\n")
        elif year < 1885:
            string_list.append("|HERAUSGEBER=Ernst Ziel\n")
        else:
            string_list.append("|HERAUSGEBER=Adolf Kröner\n")
        string_list.append(f"|ENTSTEHUNGSJAHR={year:d}\n"
                           f"|ERSCHEINUNGSJAHR={year:d}\n"
                           f"|ERSCHEINUNGSORT=Leipzig\n"
                           f"|VERLAG=Ernst Keil\n"
                           f"|WIKIPEDIA=Die Gartenlaube\n")
        if year == 1873:
            extension = "JPG"
        else:
            extension = "jpg"
        string_list.append(
            f"|BILD=Die Gartenlaube ({year:d}) {list_of_pages[0]}.{extension}\n"
        )
        string_list.append(
            f"|QUELLE=[[commons:category:Gartenlaube ({year})|commons]]\n")
        if quality == 4:
            string_list.append("|BEARBEITUNGSSTAND=fertig\n")
        else:
            string_list.append("|BEARBEITUNGSSTAND=korrigiert\n")
        string_list.append(f"|INDEXSEITE=Die Gartenlaube ({year})\n}}}}\n"
                           f"{{{{BlockSatzStart}}}}\n__TOC__\n")
        ref = []
        for page in list_of_pages:
            page_format = self.convert_page_no(page)
            string_list.append(
                f"{{{{SeitePR|{page_format}|Die Gartenlaube ({year}) {page}.{extension}}}}}\n"
            )
            try:
                page_dict = self.data["pages"][str(year)][page]
                if "r" in page_dict.keys():
                    if "ref" in page_dict["r"]:
                        if "ref" not in ref:
                            ref.append("ref")
                    for ref_type in page_dict["r"]:
                        if (ref_type != "ref") and (ref_type not in ref):
                            ref.append(ref_type)
            except KeyError:
                self.logger.error(f"The list of pages is incorrect, "
                                  f"year:{year} or page:{page} is missing.")
                return None
        if "ref" in ref:
            string_list.append("{{references|x}}\n")
        for ref_type in ref:
            if ref_type != "ref":
                string_list.append(f"{{{{references|TIT|{ref_type}}}}}\n")
        string_list.append(
            f"{{{{BlockSatzEnd}}}}\n\n[[Kategorie:Deutschland]]\n"
            f"[[Kategorie:Neuhochdeutsch]]\n[[Kategorie:Illustrierte Werke]]\n"
            f"[[Kategorie:Die Gartenlaube ({year:d}) Hefte| {magazine:02d}]]\n"
        )
        string_list.append(f"[[Kategorie:{str(year)[0:3]}0er Jahre]]\n\n")
        return ''.join(string_list)

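    # Yield (lemma, IndexPage) pairs for every Gartenlaube index page.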
    def _get_indexes(self) -> Iterator[Tuple[str, IndexPage]]:
        self.searcher_indexes.add_positive_category("Die Gartenlaube")
        self.searcher_indexes.add_positive_category("Index")
        self.searcher_indexes.set_regex_filter(r".*Die Gartenlaube \(\d{4}\)")
        self.searcher_indexes.set_timeout(60)
        for index in self.searcher_indexes.run():
            yield index["title"], IndexPage(self.wiki,
                                            f"Index:{index['title']}")

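    # Collect all proofread pages of the magazine; after a successful run the
    # search is limited to pages changed since the last run.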
    def search_pages(self):
        self.searcher_pages.add_positive_category("Die Gartenlaube")
        self.searcher_pages.add_namespace(102)  # namespace Seite
        self.searcher_pages.set_search_depth(1)
        self.searcher_pages.set_timeout(60)
        if self.last_run_successful or self.debug:
            delta = (self.timestamp.start_of_run -
                     self.timestamp.last_run).days
            if self.debug:
                delta = 10
            start_of_search = self.create_timestamp_for_search(delta)
            self.searcher_pages.last_change_after(start_of_search)
            self.logger.info(
                f"The date {start_of_search.strftime('%d.%m.%Y')} is set to the argument \"after\"."
            )
        return self.searcher_pages.run()
Example #22
from tools.petscan import PetScan

def get_count():
    searcher = PetScan()
    searcher.add_positive_category("David Hilbert Gesammelte Abhandlungen Erster Band")
    searcher.add_positive_category("Unkorrigiert")
    searcher.add_namespace("Seite")
    return len(searcher.run())
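
A minimal sketch of how this counter could be driven (the __main__ guard is illustrative, not part of the original snippet):

if __name__ == "__main__":
    # Print how many pages of the volume are still unproofread.
    print("unproofread pages:", get_count())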
Example #23
from tools.petscan import PetScan

def get_list():
    searcher = PetScan()
    searcher.add_positive_category("Aachener Stadtrechnungen")
    searcher.add_namespace(0)
    return searcher.run()
Example #24
def crawler_cat_index_site():
    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
    searcher_werke.add_negative_category('Zeitschrift')
    searcher_werke.set_search_depth(4)
    searcher_werke.set_logic(log_or=True)
    list_werke = searcher_werke.run()
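    # PetScan returns result rows as nested dicts; keep only the page titles.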
    list_werke = [row['a']['title'] for row in list_werke]
    all_sites = set()
    counter = 1
    for werk in list_werke:
        searcher_sites = PetScan()
        searcher_sites.add_namespace(102)
        searcher_sites.add_positive_category('Fertig')
        searcher_sites.add_positive_category('Korrigiert')
        searcher_sites.add_positive_category('Unkorrigiert')
        searcher_sites.set_logic(log_or=True)
        searcher_sites.add_any_outlink(werk)
        # this link has a bug in catscan2
        # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
        sites = searcher_sites.run()
        if len(sites) > 0:
            sites = [row['a']['title'] for row in sites]
            all_sites |= set(sites)
        else:
            searcher_index = PetScan()
            searcher_index.add_any_outlink(werk)
            searcher_index.add_namespace(104)
            searcher_index.add_positive_category('Index')
            index = searcher_index.run()
            if index:
                searcher_sites = PetScan()
                searcher_sites.add_namespace(102)
                searcher_sites.add_positive_category('Fertig')
                searcher_sites.add_positive_category('Korrigiert')
                searcher_sites.add_positive_category('Unkorrigiert')
                searcher_sites.set_logic(log_or=True)
                searcher_sites.add_any_outlink(index[0]['a']['nstext'] + ':' +
                                               index[0]['a']['title'])
                # this link has a bug in catscan2
                # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
                sites = searcher_sites.run()
            else:
                print(werk)
        print(counter, '/', len(list_werke), ' result:', len(all_sites))
        counter += 1
    with open('output.txt', 'w', encoding='utf-8') as f:
        f.writelines(["Seite:%s\n" % item for item in all_sites])
Example #25
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')
import re
from pywikibot import Page, Site
from tools.petscan import PetScan

wiki = Site()

searcher = PetScan()
searcher.add_positive_category("RE:Korrigiert")
lemma_list = searcher.run()

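# Walk each lemma's revision history (oldest revision first) and total the
# size of all changes made by user Pfaerrich.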
list_for_pfaerrich = []
for idx_lem, lemma in enumerate(lemma_list):
    print(idx_lem)
    page = Page(wiki, lemma['title'])
    version_history = page.fullVersionHistory()[::-1]
    size_all_changes = 0
    for idx_rev, revision in enumerate(version_history):
        user = revision.user
        if user == 'Pfaerrich':
            if idx_rev > 0:
                size_prev = len(version_history[idx_rev-1].text)
            else:
                size_prev = 0
            size_all_changes += abs(len(version_history[idx_rev].text) - size_prev)
    korrigiert_flag = False
    if size_all_changes > 0:
        for version in page.getVersionHistory():
Example #26
def crawler_cat_index_site():
    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
    searcher_werke.add_negative_category('Zeitschrift')
    searcher_werke.set_search_depth(4)
    searcher_werke.set_logic(log_or=True)
    list_werke = searcher_werke.run()
    list_werke = [row['a']['title'] for row in list_werke]
    all_sites = set()
    counter = 1
    for werk in list_werke:
        searcher_sites = PetScan()
        searcher_sites.add_namespace('Seite')
        searcher_sites.add_positive_category('Fertig')
        searcher_sites.add_positive_category('Korrigiert')
        searcher_sites.add_positive_category('Unkorrigiert')
        searcher_sites.set_logic(log_or=True)
        searcher_sites.add_any_outlink(werk)
        # this link has a bug in catscan2
        # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
        sites = searcher_sites.run()
        if len(sites) > 0:
            sites = [row['a']['title'] for row in sites]
            all_sites |= set(sites)
        else:
            searcher_index = PetScan()
            searcher_index.add_any_outlink(werk)
            searcher_index.add_namespace('Index')
            searcher_index.add_positive_category('Index')
            index = searcher_index.run()
            if index:
                searcher_sites = PetScan()
                searcher_sites.add_namespace('Seite')
                searcher_sites.add_positive_category('Fertig')
                searcher_sites.add_positive_category('Korrigiert')
                searcher_sites.add_positive_category('Unkorrigiert')
                searcher_sites.set_logic(log_or=True)
                searcher_sites.add_any_outlink(index[0]['a']['nstext'] + ':' + index[0]['a']['title'])
                # this link has a bug in catscan2
                # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
                sites = searcher_sites.run()
            else:
                print(werk)
        print(counter, '/', len(list_werke), ' result:', len(all_sites))
        counter += 1
    with open('output.txt', 'w', encoding='utf-8') as f:
        f.writelines(["Seite:%s\n" % item  for item in all_sites])
Example #27
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')
import re
import pywikibot
from tools.petscan import PetScan

wiki = pywikibot.Site()

regex_suez = re.compile(r"\[\[Der Canal von Suez\]\]")

searcher = PetScan()
searcher.add_positive_category('Der Canal von Suez')
lemmas = searcher.run()

for lemma in lemmas:
    print(lemma)
    page = pywikibot.Page(wiki, '{}:{}'.format(lemma['nstext'],
                                               lemma['title']))
    temp_text = page.text
    page.text = regex_suez.sub(
        '[[Der Canal von Suez (Nordische Revue 1864)|Der Canal von Suez]]',
        temp_text)
    print(page.text)
    page.save('Link korrigiert', botflag=True)
Example #28
class AuthorList(CanonicalBot):
    # pylint: disable=bare-except,too-many-branches,broad-except
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher = PetScan()
        self.repo = self.wiki.data_repository()  # this is a DataSite object
        self.string_list = []
        self.match_property = re.compile(r"\{\{#property:P(\d{1,4})\}\}")
        self.number_to_month = {1: "Januar",
                                2: "Februar",
                                3: "März",
                                4: "April",
                                5: "Mai",
                                6: "Juni",
                                7: "Juli",
                                8: "August",
                                9: "September",
                                10: "Oktober",
                                11: "November",
                                12: "Dezember"}

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if self.timestamp.start_of_run.day == 1:
            self.data.assign_dict(dict())
            self.logger.warning("The data is thrown away. It is the first of the month")
        return self

    def task(self):
        lemma_list = self._run_searcher()
        self._build_database(lemma_list)
        if self.debug:
            dump = Page(self.wiki, f"Benutzer:THEbotIT/{self.bot_name}")
        else:
            dump = Page(self.wiki, "Liste der Autoren")
        old_text = dump.text
        new_text = self._convert_to_table()
        if new_text[150:] != old_text[150:]:  # compare all but the date
            dump.text = new_text
            dump.save("Die Liste wurde auf den aktuellen Stand gebracht.", botflag=True)
        else:
            self.logger.info("Heute gab es keine Änderungen, "
                             "daher wird die Seite nicht überschrieben.")
        return True

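    # Build the PetScan query; when the last run succeeded, limit it to author
    # pages changed since then.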
    def _run_searcher(self):
        # was the last run successful
        if self.debug:
            yesterday = datetime.now() - timedelta(days=5)
            self.searcher.last_change_after(datetime(yesterday.year,
                                                     yesterday.month,
                                                     yesterday.day))
        elif self.last_run_successful and self.data:
            start_of_search = self.create_timestamp_for_search()
            self.searcher.last_change_after(start_of_search)
            self.logger.info(f"The date {start_of_search.strftime('%d.%m.%Y')} "
                             f"is set to the argument \"after\".")
        else:
            self.logger.warning("There was no timestamp found of the last run, "
                                "so the argument \"after\" is not set.")
        self.searcher.add_namespace(0)  # search in main namespace
        self.searcher.add_positive_category("Autoren")
        self.searcher.add_yes_template("Personendaten")
        self.searcher.get_wikidata_items()

        self.logger.debug(self.searcher)

        entries_to_search = self.searcher.run()
        return entries_to_search

    _space_regex = re.compile(r"\s+")

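    # Collapse runs of whitespace into a single space and trim the ends.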
    def _strip_spaces(self, raw_string: str):
        return self._space_regex.sub(" ", raw_string).strip()

    def _build_database(self, lemma_list):
        # pylint: disable=too-many-statements
        for idx, author in enumerate(lemma_list):
            self.logger.debug(f"{idx + 1}/{len(lemma_list)} {author['title']}")
            # delete preexisting data of this author
            try:
                del self.data[str(author["id"])]
            except KeyError:
                if self.last_run_successful:
                    self.logger.info(f"Can't delete old entry of [[{author['title']}]]")

            dict_author = {"title": author["title"]}
            # extract the Personendaten block from the wikisource page
            page = Page(self.wiki, author["title"])
            try:
                try:
                    personendaten = re.search(r"\{\{Personendaten(?:.|\n)*?\n\}\}\n",
                                              page.text).group()
                except AttributeError:
                    self.logger.error(f"No valid block \"Personendaten\" was found for "
                                      f"[[{author['title']}]].")
                    personendaten = None
                if personendaten:
                    # personendaten = re.sub('<ref.*?>.*?<\/ref>|<ref.*?\/>', '', personendaten)
                    # personendaten = re.sub('\{\{CRef|.*?(?:\{\{.*?\}\})?}}', '', personendaten)
                    template_extractor = TemplateHandler(personendaten)
                    dict_author.update({"name": self._strip_spaces(
                        template_extractor.get_parameter("NACHNAME")["value"])})
                    dict_author.update({"first_name": self._strip_spaces(
                        template_extractor.get_parameter("VORNAMEN")["value"])})
                    try:
                        dict_author.update({"birth": self._strip_spaces(
                            template_extractor.get_parameter("GEBURTSDATUM")["value"])})
                    except Exception:
                        dict_author.update({"birth": ""})
                        self.logger.warning(f"Templatehandler couldn't find a birthdate for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update({"death": self._strip_spaces(
                            template_extractor.get_parameter("STERBEDATUM")["value"])})
                    except Exception:
                        dict_author.update({"death": ""})
                        self.logger.warning(f"Templatehandler couldn't find a deathdate for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update(
                            {"description":
                             template_extractor.get_parameter("KURZBESCHREIBUNG")["value"]})
                    except Exception:
                        dict_author.update({"description": ""})
                        self.logger.warning(
                            f"Templatehandler couldn't find a description for: "
                            f"[[{author['title']}]]")
                    try:
                        dict_author.update(
                            {"synonyms":
                             template_extractor.get_parameter("ALTERNATIVNAMEN")["value"]})
                    except Exception:
                        dict_author.update({"synonyms": ""})
                        self.logger.warning(f"Templatehandler couldn't find synonyms for: "
                                            f"[[{author['title']}]]")
                    try:
                        dict_author.update(
                            {"sortkey": template_extractor.get_parameter("SORTIERUNG")["value"]})
                        if dict_author["sortkey"] == "":
                            raise ValueError
                    except Exception:
                        self.logger.debug(f"there is no sortkey for [[{author['title']}]].")
                        # make a dummy key
                        if not dict_author["name"]:
                            dict_author["sortkey"] = dict_author["first_name"]
                            self.logger.warning("Author has no last name.")
                        elif not dict_author["first_name"]:
                            dict_author["sortkey"] = dict_author["name"]
                            self.logger.warning("Author has no last first_name.")
                        else:
                            dict_author["sortkey"] = \
                                dict_author["name"] + ", " + dict_author["first_name"]
                    try:
                        dict_author.update({"wikidata": author["q"]})
                    except KeyError:
                        self.logger.warning(f"The autor [[{author['title']}]] has no wikidata_item")
                    self.data.update({author["id"]: dict_author})
            except Exception as exception:
                self.logger.exception("Exception not catched: ", exc_info=exception)
                self.logger.error(f"author {author['title']} have a problem")

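    # Sort by sortkey; runs of identical sortkeys are ordered by birth date.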
    @staticmethod
    def _sort_author_list(list_authors):
        list_authors.sort(key=lambda x: x[0])
        for i in range(len(list_authors) - 1):
            if list_authors[i][0] == list_authors[i + 1][0]:
                equal_count = 2
                while (i + equal_count < len(list_authors)
                       and list_authors[i][0] == list_authors[i + equal_count][0]):
                    equal_count += 1
                temp_list = list_authors[i:i + equal_count]
                temp_list.sort(key=lambda x: x[5])  # sort by birth date
                list_authors[i:i + equal_count] = temp_list

    def _convert_to_table(self):
        # pylint: disable=too-many-locals
        # make a list of lists
        self.logger.info("Start compiling.")
        list_authors = []
        for key in self.data:
            author_dict = self.data[key]
            list_author = list()
            list_author.append(author_dict["sortkey"])  # 0
            list_author.append(author_dict["title"].replace("_", " "))  # 1
            list_author.append(author_dict["name"])  # 2
            list_author.append(author_dict["first_name"])  # 3

            for event in ["birth", "death"]:
                list_author.append(self._handle_birth_and_death(event, author_dict))  # 4,6
                try:
                    list_author.append(str(DateConversion(list_author[-1])))  # 5,7
                except ValueError:
                    self.logger.error(f"Can´t compile sort key for {author_dict['title']}: "
                                      f"{event}/{author_dict[event]}")
                    list_author.append("!-00-00")  # 5,7
            list_author.append(author_dict["description"])  # 8
            list_authors.append(list_author)

        # sorting the list
        self.logger.info("Start sorting.")
        self._sort_author_list(list_authors)

        self.logger.info("Start printing.")
        start_of_run = self.timestamp.start_of_run
        self.string_list.append(f"Diese Liste der Autoren enthält alle {len(self.data)}<ref>Stand: "
                                f"{start_of_run.day}.{start_of_run.month}.{start_of_run.year}, "
                                f"{self.timestamp.start_of_run.strftime('%H:%M')} (UTC)</ref> Autoren, "
                                f"zu denen in Wikisource eine Autorenseite existiert.")
        self.string_list.append("Die Liste kann mit den Buttons neben den Spaltenüberschriften"
                                " nach der jeweiligen Spalte sortiert werden.")
        self.string_list.append("<!--")
        self.string_list.append("Diese Liste wurde durch ein Computerprogramm erstellt, "
                                "das die Daten verwendet, "
                                "die aus den Infoboxen auf den Autorenseiten stammen.")
        self.string_list.append("Sollten daher Fehler vorhanden sein, "
                                "sollten diese jeweils dort korrigiert werden.")
        self.string_list.append("-->")
        self.string_list.append("{|class=\"wikitable sortable\"")
        self.string_list.append("!style=\"width:20%\"| Name")
        self.string_list.append("!data-sort-type=\"text\" style=\"width:15%\"| Geb.-datum")
        self.string_list.append("!data-sort-type=\"text\" style=\"width:15%\"| Tod.-datum")
        self.string_list.append("!class=\"unsortable\" style=\"width:50%\"| Beschreibung")
        for list_author in list_authors:
            aut_sort, aut_page, aut_sur, aut_pre, birth_str, \
                birth_sort, death_str, death_sort, description = \
                list_author
            self.string_list.append("|-")
            if aut_sur and aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|"
                                        f"[[{aut_page}|{aut_sur}, {aut_pre}]]")
            elif aut_pre:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|[[{aut_page}|{aut_pre}]]")
            else:
                self.string_list.append(f"|data-sort-value=\"{aut_sort}\"|[[{aut_page}|{aut_sur}]]")
            self.string_list.append(f"|data-sort-value=\"{birth_sort}\"|{birth_str}")
            self.string_list.append(f"|data-sort-value=\"{death_sort}\"|{death_str}")
            self.string_list.append(f"|{description}")
        self.string_list.append("|}")
        self.string_list.append('')
        self.string_list.append("== Anmerkungen ==")
        self.string_list.append("<references/>")
        self.string_list.append('')
        self.string_list.append("{{SORTIERUNG:Autoren #Liste der}}")
        self.string_list.append("[[Kategorie:Listen]]")
        self.string_list.append("[[Kategorie:Autoren|!]]")

        return "\n".join(self.string_list)

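    # Prefer the date from the Personendaten template; otherwise fall back to
    # Wikidata (P569 birth, P570 death) and format it according to precision.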
    def _handle_birth_and_death(self, event, author_dict):
        if author_dict[event] == '' or self.match_property.search(author_dict[event]):
            self.logger.debug(f"No valid entry in {event} for "
                              f"[[{author_dict['title']}]] ... Fallback to wikidata")
            try:
                item = ItemPage(self.repo, author_dict["wikidata"])
                if event == "birth":
                    property_label = "P569"
                else:
                    property_label = "P570"
                claim = item.text["claims"][property_label][0]
                date_from_data = claim.getTarget()
                if date_from_data.precision < 7:
                    self.logger.error(f"Precison is to low for [[{author_dict['title']}]]")
                elif date_from_data.precision < 8:
                    date_from_data = int(ceil(float(date_from_data.year) / 100.0) * 100)
                    if date_from_data < 1000:
                        date_from_data = str(date_from_data)[0:1] + ". Jh."
                    else:
                        date_from_data = str(date_from_data)[0:2] + ". Jh."
                elif date_from_data.precision < 10:
                    date_from_data = str(date_from_data.year)
                elif date_from_data.precision < 11:
                    date_from_data = self.number_to_month[date_from_data.month] + " " + \
                        str(date_from_data.year)
                else:
                    date_from_data = f"{date_from_data.day}. " \
                        f"{self.number_to_month[date_from_data.month]} " \
                        f"{date_from_data.year}"
                if re.search("-", date_from_data):
                    date_from_data = date_from_data.replace("-", "") + " v. Chr."
                self.logger.debug(f"Found {date_from_data} @ wikidata for {event}")
                return date_from_data  # 4,6
            except Exception:
                self.logger.debug("Wasn't able to ge any data from wikidata")
                return ''  # 4,6
        else:
            return author_dict[event]  # 4,6
Example #29
from tools.petscan import PetScan

def get_list():
    searcher = PetScan()
    searcher.add_positive_category("RE:Autor:Arthur Stein")
    searcher.add_namespace(0)
    return searcher.run()
Example #30
from tools.petscan import PetScan

def get_count():
    searcher = PetScan()
    searcher.add_positive_category("David Hilbert Gesammelte Abhandlungen Erster Band")
    searcher.add_positive_category("Unkorrigiert")
    searcher.add_namespace("Seite")
    return len(searcher.run())
Example #31
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')
from tools.petscan import PetScan
import pywikibot

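# Touch every proofread page linked from an index page so MediaWiki refreshes
# its cached state.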
searcher_index = PetScan()
searcher_index.add_namespace('Index')
searcher_index.add_positive_category('Index')
list_of_indexes = searcher_index.run()
wiki = pywikibot.Site()

for idx, index in enumerate(list_of_indexes):
    print('{}/{} {}'.format(idx + 1, len(list_of_indexes), index['a']['title']))
    searcher_sites_of_index = PetScan()
    searcher_sites_of_index.add_namespace('Seite')
    searcher_sites_of_index.add_yes_outlink(index['a']['nstext'] + ':' + index['a']['title'])
    searcher_sites_of_index.add_positive_category('Fertig')
    searcher_sites_of_index.add_positive_category('Korrigiert')
    searcher_sites_of_index.add_positive_category('Unkorrigiert')
    searcher_sites_of_index.set_logic(log_or=True)
    list_of_sites = searcher_sites_of_index.run()
    for idx_, site in enumerate(list_of_sites):
        print('\t{}/{} {}'.format(idx_ + 1, len(list_of_sites), site['a']['nstext'] + ':' + site['a']['title']))
        touchpage = pywikibot.Page(wiki, title=site['a']['nstext'] + ':' + site['a']['title'])
        touchpage.touch()
    del searcher_sites_of_index
Example #32
class GlCreateMagazine(CanonicalBot):
    def __init__(self, wiki, debug):
        CanonicalBot.__init__(self, wiki, debug)
        self.searcher_pages = PetScan()
        self.searcher_indexes = PetScan()
        self.regex_page = re.compile(r"Die_Gartenlaube_\((\d{4})\)_([^\.]*?)\.(?:jpg|JPG)")
        self.regex_index = re.compile(r"Die_Gartenlaube_\((\d{4})\)")
        self.regex_magazine_in_index = \
            re.compile(r"((?:Heft|Halbheft) (?:\{\{0\}\})?\d{1,2}:.*?(?:\n\n|\Z))", re.DOTALL)
        self.regex_page_in_magazine = re.compile(r"_([_\w]{1,9}).(?:jpg|JPG)")
        self.regex_number_in_index = re.compile(r"(?:Heft|Halbheft) (?:\{\{0\}\})?(\d{1,2}):?")
        self.new_data_model = datetime(year=2018, month=7, day=1, hour=14)
        self.lemmas = None

    def __enter__(self):
        CanonicalBot.__enter__(self)
        if not self.data:
            self.data.assign_dict({"pages": {}, "indexes": {}})
        return self

    def task(self):
        self.lemmas = self.search_pages()
        temp_data_pages = {}
        self.process_indexes()
        self.process_pages(temp_data_pages)
        temp_data_magazines = self.process_actual_pages(temp_data_pages)
        self.make_magazines(temp_data_magazines)
        return True

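    # Record the proofreading quality level and any ref tags for every changed
    # page, grouped by year.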
    def process_pages(self, temp_data):
        for idx, lemma in enumerate(self.lemmas):
            try:
                hit = self.regex_page.search(lemma["title"])
                year = hit.group(1)
                page = hit.group(2)
                if year not in self.data["pages"].keys():
                    self.data["pages"][year] = {}
                proofread_lemma = ProofreadPage(self.wiki, f"Seite:{lemma['title']}")
                if self.debug:
                    self.logger.debug(f"{idx + 1}/{len(self.lemmas)} Page {page}({year}) "
                                      f"has quality level {proofread_lemma.quality_level} "
                                      f"_ Seite:{lemma['title']}")
                ref = search_for_refs(proofread_lemma.text)
                page_dict = {"q": int(proofread_lemma.quality_level)}
                if ref:
                    self.logger.debug(f"There are refs ({ref}) @ {year}, {page}")
                    page_dict.update({"r": ref})
                self.data["pages"][year][page] = page_dict
                if year not in temp_data.keys():
                    temp_data[year] = []
                temp_data[year].append(page)
            except Exception as error:  # pylint: disable=broad-except
                self.logger.error(f"wasn't able to process {lemma['title']}, error: {error}")

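    # Parse every index page into issue numbers and the pages they contain.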
    def process_indexes(self):
        for index_lemma, index_page in self._get_indexes():
            self.logger.debug("[[Index:{}]]".format(index_lemma))
            magazines = self.regex_magazine_in_index.findall(index_page.text)
            hit_year = self.regex_index.search(index_lemma)
            year = hit_year.group(1)
            if year not in self.data["indexes"].keys():
                self.data["indexes"][year] = {}
            for magazine in magazines:
                pages = self.regex_page_in_magazine.findall(magazine)
                hit_number = self.regex_number_in_index.findall(magazine)
                number = int(hit_number[0])
                self.data["indexes"][year][number] = pages

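    # Work out which issues contain at least one of the newly changed pages.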
    def process_actual_pages(self, dictionary_of_new_pages):
        tempdata_magazines = {}
        for year in dictionary_of_new_pages:
            set_of_pages = set(dictionary_of_new_pages[year])
            tempdata_magazines[year] = set()
            try:
                dictionary_of_magazines = self.data["indexes"][year]
            except KeyError as error:
                raise BotException(f"The list of indexes is incorrect, {year} is missing.") \
                    from error
            for magazine in dictionary_of_magazines:
                set_of_potential_pages = set(dictionary_of_magazines[magazine])
                if set_of_potential_pages.intersection(set_of_pages):
                    tempdata_magazines[year].add(magazine)
        return tempdata_magazines

    def make_magazines(self, dictionary_of_magazines_by_year):
        for idx_year, year in enumerate(dictionary_of_magazines_by_year):
            magazines = dictionary_of_magazines_by_year[year]
            self.logger.debug(f"make_mag_year {idx_year + 1}/"
                              f"{len(dictionary_of_magazines_by_year)}")
            for idx_mag, magazine in enumerate(magazines):
                self.logger.debug("make_mag_mag {idx}/{len} ... issue:{year}/{mag}"
                                  .format(idx=idx_mag + 1, len=len(magazines),
                                          year=year, mag=magazine))
                if year == "1986" and magazine == "31":
                    self.logger.warning("There is magazine 1986, 31, "
                                        "this is special, no creating here")
                    continue
                if self.debug:
                    lemma = Page(self.wiki, "Benutzer:THEbotIT/Test")
                else:
                    lemma = Page(self.wiki, f"Die Gartenlaube ({year})/Heft {int(magazine):d}")
                new_text = self.make_magazine(year, magazine)
                if new_text:
                    if hash(new_text.strip()) != hash(lemma.text.strip()):
                        self.logger.debug("Print [[Die Gartenlaube ({year})/Heft {magazine}]]."
                                          .format(year=year, magazine=magazine))
                        if lemma.text != '':
                            lemma.text = new_text
                            lemma.save("Automatische Aktualisierung des Heftes", botflag=True)
                        else:
                            lemma.text = new_text
                            lemma.save("automatische Hefterstellung", botflag=True)
                    else:
                        self.logger.debug("Keine Änderung im Text ({year}/{magazine})."
                                          .format(year=year, magazine=magazine))

    def make_magazine(self, year, magazine):
        last_magazine = True
        try:
            for key in self.data["indexes"][year].keys():
                if int(key) > int(magazine):
                    last_magazine = False
                    break
        except KeyError as error:
            raise BotException("The list of indexes is incorrect, {year} is missing."
                               .format(year=year)) from error
        try:
            list_of_pages = self.data["indexes"][year][magazine]
        except KeyError as error:
            raise BotException("The list of indexes is incorrect, "
                               "year:{year} or mag:{mag} is missing."
                               .format(year=year, mag=magazine)) from error
        quality = 4
        for page in list_of_pages:
            try:
                if self.data["pages"][year][page]["q"] == 0:
                    page_quality = 4
                else:
                    page_quality = self.data["pages"][year][page]["q"]
                if page_quality < quality:
                    quality = page_quality
                if quality < 3:
                    self.logger.debug("The quality of {year}/{magazine} is too poor."
                                      .format(year=year, magazine=magazine))
                    return None
            except KeyError:
                self.logger.warning("The list of pages is incorrect, "
                                    "year:{year} or page:{page} is missing."
                                    .format(year=year, page=page))
                return None
        return self.make_magazine_text(year, magazine, quality, list_of_pages, last_magazine)

    @staticmethod
    def convert_page_no(page: str):
        return page.lstrip("0").replace("_", " ")

    def make_magazine_text(self, year, magazine, quality, list_of_pages, last):
        # pylint: disable=too-many-arguments,too-many-branches
        magazine = int(magazine)
        year = int(year)
        string_list = list()
        string_list.append("<!--Diese Seite wurde automatisch durch einen Bot erstellt. "
                           "Wenn du einen Fehler findest oder eine Änderung wünscht, "
                           "benachrichtige bitte den Betreiber, THE IT, des Bots.-->\n"
                           "{{Textdaten\n")
        if magazine > 1:
            string_list.append("|VORIGER=Die Gartenlaube ({year:d})/Heft {magazine:d}\n"
                               .format(year=year, magazine=magazine - 1))
        else:
            string_list.append("|VORIGER=\n")
        if last:
            string_list.append("|NÄCHSTER=\n")
        else:
            string_list.append("|NÄCHSTER=Die Gartenlaube ({year:d})/Heft {magazine:d}\n"
                               .format(year=year, magazine=magazine + 1))
        string_list.append("|AUTOR=Verschiedene\n|TITEL=[[Die Gartenlaube]]\n"
                           "|SUBTITEL=''Illustrirtes Familienblatt''\n|HERKUNFT=off\n")
        if year < 1863:
            string_list.append("|HERAUSGEBER=[[Ferdinand Stolle]]\n")
        elif (year < 1878) or (year == 1878 and magazine < 14):
            string_list.append("|HERAUSGEBER=[[Ernst Keil]]\n")
        elif year < 1885:
            string_list.append("|HERAUSGEBER=Ernst Ziel\n")
        else:
            string_list.append("|HERAUSGEBER=Adolf Kröner\n")
        string_list.append("|ENTSTEHUNGSJAHR={year:d}\n|ERSCHEINUNGSJAHR={year:d}\n"
                           "|ERSCHEINUNGSORT=Leipzig\n|VERLAG=Ernst Keil\n"
                           "|WIKIPEDIA=Die Gartenlaube\n".format(year=year))
        if year == 1873:
            extension = "JPG"
        else:
            extension = "jpg"
        string_list.append("|BILD=Die Gartenlaube ({year:d}) {page1}.{extension}\n"
                           .format(year=year, page1=list_of_pages[0], extension=extension))
        string_list.append("|QUELLE=[[commons:category:Gartenlaube ({year})|commons]]\n"
                           .format(year=year))
        if quality == 4:
            string_list.append("|BEARBEITUNGSSTAND=fertig\n")
        else:
            string_list.append("|BEARBEITUNGSSTAND=korrigiert\n")
        string_list.append("|INDEXSEITE=Die Gartenlaube ({year})\n}}}}\n\n"
                           "{{{{BlockSatzStart}}}}\n__TOC__\n".format(year=year))
        ref = []
        for page in list_of_pages:
            page_format = self.convert_page_no(page)
            string_list.append(
                "{{{{SeitePR|{page_format}|Die Gartenlaube ({year}) {page}.{extension}}}}}\n"
                .format(year=year, page_format=page_format, page=page, extension=extension))
            try:
                page_dict = self.data["pages"][str(year)][page]
                if "r" in page_dict.keys():
                    if "ref" in page_dict["r"]:
                        if "ref" not in ref:
                            ref.append("ref")
                    for ref_type in page_dict["r"]:
                        if (ref_type != "ref") and (ref_type not in ref):
                            ref.append(ref_type)
            except KeyError:
                self.logger.error("The list of pages is incorrect, "
                                  "year:{year} or page:{page} is missing."
                                  .format(year=year, page=page))
                return None
        if "ref" in ref:
            string_list.append("{{references|x}}\n")
        for ref_type in ref:
            if ref_type != "ref":
                string_list.append("{{{{references|TIT|{ref}}}}}\n".format(ref=ref_type))
        string_list.append("{{{{BlockSatzEnd}}}}\n\n[[Kategorie:Deutschland]]\n"
                           "[[Kategorie:Neuhochdeutsch]]\n[[Kategorie:Illustrierte Werke]]\n"
                           "[[Kategorie:Die Gartenlaube ({year:d}) Hefte| {magazine:02d}]]\n"
                           .format(year=year, magazine=magazine))
        string_list.append("[[Kategorie:{year}0er Jahre]]\n\n".format(year=str(year)[0:3]))
        return ''.join(string_list)

    def _get_indexes(self) -> Iterator[Tuple[str, IndexPage]]:
        self.searcher_indexes.add_positive_category("Die Gartenlaube")
        self.searcher_indexes.add_positive_category("Index")
        self.searcher_indexes.set_regex_filter(r".*Die Gartenlaube \(\d{4}\)")
        self.searcher_indexes.set_timeout(60)
        for index in self.searcher_indexes.run():
            yield index["title"], IndexPage(self.wiki, "Index:{}".format(index["title"]))

    def search_pages(self):
        self.searcher_pages.add_positive_category("Die Gartenlaube")
        self.searcher_pages.add_namespace("Seite")
        self.searcher_pages.set_search_depth(1)
        self.searcher_pages.set_timeout(60)
        if self.last_run_successful or self.debug:
            delta = (self.timestamp.start_of_run - self.timestamp.last_run).days
            if self.debug:
                delta = 10
            start_of_search = self.create_timestamp_for_search(delta)
            self.searcher_pages.last_change_after(start_of_search)
            self.logger.info("The date {} is set to the argument \"after\"."
                             .format(start_of_search.strftime("%d.%m.%Y")))
        return self.searcher_pages.run()
Example #33
# -*- coding: utf-8 -*-
__author__ = 'eso'
import sys
sys.path.append('../../')
from tools.petscan import PetScan
import re
import requests
import pywikibot

searcher_catscan = PetScan()
searcher_catscan.add_positive_category('Nicolaus Coppernicus aus Thorn über die Kreisbewegungen der Weltkörper')
searcher_catscan.add_namespace('Seite')
sites = searcher_catscan.run()
site = pywikibot.Site()

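# Widen the image thumbnails from 150px to 300px on every page of the work.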
for lemma in sites:
    page = pywikibot.Page(site, lemma['a']['nstext'] + ':' + lemma['a']['title'])
    test_for_fit = re.search('150px', page.text)
    print(lemma['a']['title'])
    if test_for_fit:
        print('do replacement')
        page.text = re.sub('150px', '300px', page.text)
        page.save(summary='bot edit: replace 150px with 300px', botflag=True)
Example #34
            .*?                 # some characters in between   UNGREEDY (an error occurred here
                                # during the run; next time, limit the number of characters)
            (wikilivres|archive)# hit(1) for deciding whether it is a wikilivres or an archive address
            .*?                 # some characters in between   UNGREEDY
            \/Pauly-Wissowa     #
            [ _]?(S)?[ _]?      # hit(2) for a possible supplement letter
            ([IVX]{1,5}),?      # hit(3) for Roman numerals, one to five characters
            [ _]?(A)?           # hit(4) for a possible appended letter
            ([12])?,?           # hit(5) for a possible 1 or 2
            [ _]0{0,3}(\d{1,4}) # hit(6) for the page number
            \.(jpg|png)         # hit(7) for the picture type'''

FIT = re.compile(REGEX, re.VERBOSE)

LEMMA_SEARCHER = PetScan()
LEMMA_SEARCHER.add_positive_category('Paulys Realencyclopädie der classischen Altertumswissenschaft')
#LEMMA_SEARCHER.add_no_template('REIA')  # sadly I have to look at all 18,000 RE pages
LEMMA_SEARCHER.set_timeout(90)
lemmas = LEMMA_SEARCHER.run()

lemmas = [lemma['a']['title'] for lemma in lemmas]

for idx, lemma in enumerate(lemmas):
    if lemma[0:3] == 're:':
        print(idx, '/', len(lemmas), lemma)
        page = pywikibot.Page(WIKI, lemma)
        searcher = FIT.search(page.text)
        if searcher:
            print('#######', lemma)
            temp = FIT.sub(decide_REIA_or_REWL, page.text)