def petscan(self, categories, not_categories=None, article=False, year=None):
    """
    Count the PetScan hits below "Die Gartenlaube" for a category filter.

    :param categories: iterable of category names added as positive categories
    :param not_categories: optional iterable of category names added as
        negative categories
    :param article: when true the query uses namespace 0 (articles),
        otherwise namespace 102 (Seite)
    :param year: when set, the root category is "Die Gartenlaube (<year>)"
        instead of "Die Gartenlaube"
    :return: number of entries returned by the PetScan run
    """
    query = PetScan()
    query.set_timeout(120)
    # 0 -> article namespace, 102 -> "Seite" namespace
    query.add_namespace(0 if article else 102)
    query.set_search_depth(5)

    root_category = ("Die Gartenlaube (" + str(year) + ")") if year else "Die Gartenlaube"
    query.add_positive_category(root_category)
    for wanted in categories:
        query.add_positive_category(wanted)
    for unwanted in not_categories or []:
        query.add_negative_category(unwanted)

    # log the query before it is executed
    self.logger.debug(str(query))
    return len(query.run())
def petscan(self, categories, negative_categories):
    """
    Build a PetScan query from the given categories and execute it.

    :param categories: iterable of category names added as positive categories
    :param negative_categories: iterable of category names added as negative
        categories
    :return: whatever ``PetScan.run()`` returns for the query
    """
    query = PetScan()
    for wanted in categories:
        query.add_positive_category(wanted)
    for unwanted in negative_categories:
        query.add_negative_category(unwanted)
    # the query is switched to union logic before it is logged and run
    query.set_logic_union()
    self.logger.debug(query)
    result = query.run()
    return result
def petscan(self, categories: List[str], negative_categories: List[str]) -> List[PetscanLemma]:
    """
    Build a PetScan query from the given categories and execute it.

    :param categories: category names added as positive categories
    :param negative_categories: category names added as negative categories
    :return: the lemmas returned by ``PetScan.run()``
    """
    query = PetScan()
    for wanted in categories:
        query.add_positive_category(wanted)
    for unwanted in negative_categories:
        query.add_negative_category(unwanted)
    # the query is switched to union logic before it is logged and run
    query.set_logic_union()
    self.logger.debug(str(query))
    lemmas = query.run()
    return lemmas
def crawler_cat_index_site():
    """
    Collect the page titles for all watched works and write them to ``output.txt``.

    Steps:

    1. Query PetScan for all works in one of the ``watch_themes`` categories,
       excluding the category 'Zeitschrift' (search depth 4, OR logic).
    2. For every work, query the pages in namespace 102 that are in one of the
       categories 'Fertig', 'Korrigiert' or 'Unkorrigiert', using the work as
       the "any outlink" filter.
    3. If step 2 yields nothing, look up an index page for the work
       (namespace 104, category 'Index') and repeat the page query with
       ``<nstext>:<title>`` of the first index hit as the outlink filter.
       Works without an index page are printed.

    Progress is printed after every work. At the end every collected title is
    written to ``output.txt`` (UTF-8), one per line, prefixed with ``Seite:``.
    """

    def _page_titles_for_outlink(target):
        """Run the page query with *target* as outlink filter and return the titles."""
        searcher_sites = PetScan()
        searcher_sites.add_namespace(102)
        for category in ('Fertig', 'Korrigiert', 'Unkorrigiert'):
            searcher_sites.add_positive_category(category)
        searcher_sites.set_logic(log_or=True)
        searcher_sites.add_any_outlink(target)
        # NOTE: this kind of link has a bug on catscan2, example:
        # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
        return [row['a']['title'] for row in searcher_sites.run()]

    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
    searcher_werke.add_negative_category('Zeitschrift')
    searcher_werke.set_search_depth(4)
    searcher_werke.set_logic(log_or=True)
    list_werke = [row['a']['title'] for row in searcher_werke.run()]

    all_sites = set()
    for counter, werk in enumerate(list_werke, start=1):
        sites = _page_titles_for_outlink(werk)
        if not sites:
            # nothing found for the work itself: retry through its index page
            searcher_index = PetScan()
            searcher_index.add_any_outlink(werk)
            searcher_index.add_namespace(104)
            searcher_index.add_positive_category('Index')
            index = searcher_index.run()
            if index:
                # Fix: the result of this second query used to be assigned but
                # never merged into all_sites, so these pages were lost.
                sites = _page_titles_for_outlink(
                    index[0]['a']['nstext'] + ':' + index[0]['a']['title'])
            else:
                print(werk)
        all_sites.update(sites)
        print(counter, '/', len(list_werke), ' result:', len(all_sites))

    with open('output.txt', 'w', encoding='utf-8') as f:
        f.writelines(["Seite:%s\n" % item for item in all_sites])
def crawler_cat_index_site():
    """
    Collect the page titles for all watched works and write them to ``output.txt``.

    Steps:

    1. Query PetScan for all works in one of the ``watch_themes`` categories,
       excluding the category 'Zeitschrift' (search depth 4, OR logic).
    2. For every work, query the pages in the namespace 'Seite' that are in
       one of the categories 'Fertig', 'Korrigiert' or 'Unkorrigiert', using
       the work as the "any outlink" filter.
    3. If step 2 yields nothing, look up an index page for the work
       (namespace 'Index', category 'Index') and repeat the page query with
       ``<nstext>:<title>`` of the first index hit as the outlink filter.
       Works without an index page are printed.

    Progress is printed after every work. At the end every collected title is
    written to ``output.txt`` (UTF-8), one per line, prefixed with ``Seite:``.
    """

    def _page_titles_for_outlink(target):
        """Run the page query with *target* as outlink filter and return the titles."""
        searcher_sites = PetScan()
        searcher_sites.add_namespace('Seite')
        for category in ('Fertig', 'Korrigiert', 'Unkorrigiert'):
            searcher_sites.add_positive_category(category)
        searcher_sites.set_logic(log_or=True)
        searcher_sites.add_any_outlink(target)
        # NOTE: this kind of link has a bug on catscan2, example:
        # http://tools.wmflabs.org/catscan2/catscan2.php?project=wikisource&categories=Fertig%0D%0AKorrigiert%0D%0AUnkorrigiert&comb[union]=1&ns[102]=1&outlinks_any=Einige+Bemerkungen+%C3%BCber+die+von+Dr.+Liskovius+ver%C3%B6ffentlichten+Resultate+seiner+%E2%80%9EUntersuchungen+%C3%BCber+den+Einflu%C3%9F+der+verschiedenen+Weite+der+Labialpfeifen+auf+ihre+Tonh%C3%B6he%E2%80%9C&interface_language=de
        return [row['a']['title'] for row in searcher_sites.run()]

    searcher_werke = PetScan()
    for item in watch_themes:
        searcher_werke.add_positive_category(item)
    searcher_werke.add_negative_category('Zeitschrift')
    searcher_werke.set_search_depth(4)
    searcher_werke.set_logic(log_or=True)
    list_werke = [row['a']['title'] for row in searcher_werke.run()]

    all_sites = set()
    for counter, werk in enumerate(list_werke, start=1):
        sites = _page_titles_for_outlink(werk)
        if not sites:
            # nothing found for the work itself: retry through its index page
            searcher_index = PetScan()
            searcher_index.add_any_outlink(werk)
            searcher_index.add_namespace('Index')
            searcher_index.add_positive_category('Index')
            index = searcher_index.run()
            if index:
                # Fix: the result of this second query used to be assigned but
                # never merged into all_sites, so these pages were lost.
                sites = _page_titles_for_outlink(
                    index[0]['a']['nstext'] + ':' + index[0]['a']['title'])
            else:
                print(werk)
        all_sites.update(sites)
        print(counter, '/', len(list_werke), ' result:', len(all_sites))

    with open('output.txt', 'w', encoding='utf-8') as f:
        f.writelines(["Seite:%s\n" % item for item in all_sites])
def petscan(self, categories, not_categories=None, article=False, year=None):
    """
    Count the PetScan hits below "Die Gartenlaube" for a category filter.

    :param categories: iterable of category names added as positive categories
    :param not_categories: optional iterable of category names added as
        negative categories
    :param article: when true the query uses the namespace "Article",
        otherwise the namespace "Seite"
    :param year: when set, the root category is "Die Gartenlaube (<year>)"
        instead of "Die Gartenlaube"
    :return: number of entries returned by the PetScan run
    """
    query = PetScan()
    query.add_namespace("Article" if article else "Seite")
    query.set_search_depth(5)

    root_category = ("Die Gartenlaube (" + str(year) + ")") if year else "Die Gartenlaube"
    query.add_positive_category(root_category)
    for wanted in categories:
        query.add_positive_category(wanted)
    for unwanted in not_categories or []:
        query.add_negative_category(unwanted)

    # log the query before it is executed
    self.logger.debug(str(query))
    return len(query.run())
class TestCatScan(TestCase):
    """
    Unit tests for the ``PetScan`` query builder.

    Most tests call one builder method and compare ``options``, ``categories``
    or the URL produced by ``str()``. The two ``test_do_*`` tests execute
    ``run()`` against a mocked HTTP endpoint.
    """

    def setUp(self):
        # every test starts with a fresh, unconfigured query
        self.petscan = PetScan()

    def test_add_options(self):
        self.petscan.add_options({"max_age": "45"})
        self.petscan.add_options({"smaller": "300"})
        self.assertDictEqual({"smaller": "300", "max_age": "45"}, self.petscan.options)

    def test_add_categoy(self):
        self.petscan.add_positive_category("pos1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_positive_category("pos3", 2)
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg2")
        self.petscan.add_negative_category("neg3", 3)
        # the optional second argument is appended to the name after a "|"
        self.assertEqual(["pos1", "pos2", "pos3|2"], self.petscan.categories["positive"])
        self.assertEqual(["neg1", "neg2", "neg3|3"], self.petscan.categories["negative"])

    def test_add_namespace(self):
        # namespaces may be given as number, as name or as a mixed list
        self.petscan.add_namespace(0)
        self.petscan.add_namespace("Datei")
        self.petscan.add_namespace([2, "Vorlage"])
        self.assertDictEqual({"ns[0]": "1", "ns[2]": "1", "ns[6]": "1", "ns[10]": "1"},
                             self.petscan.options)

    def test_activate_redirects(self):
        self.petscan.activate_redirects()
        self.assertDictEqual({"show_redirects": "yes"}, self.petscan.options)

    def test_deactivate_redirects(self):
        self.petscan.deactivate_redirects()
        self.assertDictEqual({"show_redirects": "no"}, self.petscan.options)

    def test_last_change_before(self):
        timestamp = datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42)
        self.petscan.last_change_before(timestamp)
        self.assertDictEqual({"before": "12340101020242"}, self.petscan.options)

    def test_last_change_after(self):
        timestamp = datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42)
        self.petscan.last_change_after(timestamp)
        self.assertDictEqual({"after": "12340101020242"}, self.petscan.options)

    def test_max_age(self):
        self.petscan.max_age(1234)
        self.assertDictEqual({"max_age": "1234"}, self.petscan.options)

    def test_only_new(self):
        self.petscan.only_new()
        self.assertDictEqual({"only_new": "1"}, self.petscan.options)

    def test_smaller_then(self):
        self.petscan.smaller_then(42)
        self.assertDictEqual({"smaller": "42"}, self.petscan.options)

    def test_larger_then(self):
        self.petscan.larger_then(42)
        self.assertDictEqual({"larger": "42"}, self.petscan.options)

    def test_get_wikidata(self):
        self.petscan.get_wikidata_items()
        self.assertDictEqual({"wikidata_item": "any"}, self.petscan.options)

    def test_get_Pages_with_wikidata(self):
        self.petscan.get_pages_with_wd_items()
        self.assertDictEqual({"wikidata_item": "with"}, self.petscan.options)

    def test_get_Pages_without_wikidata(self):
        self.petscan.get_pages_without_wd_items()
        self.assertDictEqual({"wikidata_item": "without"}, self.petscan.options)

    def test_set_or(self):
        self.petscan.set_logic_union()
        self.assertDictEqual({"combination": "union"}, self.petscan.options)

    def test_set_regex(self):
        self.petscan.set_regex_filter("abc")
        self.assertDictEqual({"regexp_filter": "abc"}, self.petscan.options)

    def test_set_last_edits(self):
        self.petscan.set_last_edit_bots(True)
        self.petscan.set_last_edit_anons(False)
        self.petscan.set_last_edit_flagged()
        self.assertDictEqual({"edits[bots]": "yes",
                              "edits[anons]": "no",
                              "edits[flagged]": "yes"},
                             self.petscan.options)

    def test_construct_cat_string(self):
        self.petscan.add_positive_category("pos 1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg 2")
        self.petscan.add_negative_category("neg3")
        # blanks become "+", entries are joined with CRLF
        self.assertEqual("pos+1\r\npos2",
                         self.petscan._construct_list_argument(self.petscan.categories["positive"]))
        self.assertEqual("neg1\r\nneg+2\r\nneg3",
                         self.petscan._construct_list_argument(self.petscan.categories["negative"]))

    def test_construct_templates(self):
        self.petscan.add_yes_template("yes1")
        self.petscan.add_yes_template("yes2")
        self.petscan.add_any_template("any1")
        self.petscan.add_any_template("any2")
        self.petscan.add_any_template("any3")
        self.petscan.add_no_template("no1")
        self.petscan.add_no_template("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource"
                         "&templates_yes=yes1%0D%0Ayes2"
                         "&templates_any=any1%0D%0Aany2%0D%0Aany3"
                         "&templates_no=no1%0D%0Ano2")

    def test_construct_outlinks(self):
        self.petscan.add_yes_outlink("yes1")
        self.petscan.add_yes_outlink("yes2")
        self.petscan.add_any_outlink("any1")
        self.petscan.add_any_outlink("any2")
        self.petscan.add_any_outlink("any3")
        self.petscan.add_no_outlink("no1")
        self.petscan.add_no_outlink("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource"
                         "&outlinks_yes=yes1%0D%0Ayes2"
                         "&outlinks_any=any1%0D%0Aany2%0D%0Aany3"
                         "&outlinks_no=no1%0D%0Ano2")

    def test_construct_links_to(self):
        self.petscan.add_yes_links_to("yes1")
        self.petscan.add_yes_links_to("yes2")
        self.petscan.add_any_links_to("any1")
        self.petscan.add_any_links_to("any2")
        self.petscan.add_any_links_to("any3")
        self.petscan.add_no_links_to("no1")
        self.petscan.add_no_links_to("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de&project=wikisource"
                         "&links_to_all=yes1%0D%0Ayes2"
                         "&links_to_any=any1%0D%0Aany2%0D%0Aany3"
                         "&links_to_no=no1%0D%0Ano2")

    def test_construct_options(self):
        self.petscan.options = {"max_age": "1234", "get_q": "1", "show_redirects": "yes"}
        # assertIn reports the missing fragment and the URL on failure,
        # unlike the former assertEqual(<fragment> in <url>, True)
        self.assertIn("&max_age=1234", str(self.petscan))
        self.assertIn("&get_q=1", str(self.petscan))
        self.assertIn("&show_redirects=yes", str(self.petscan))

    def test_construct_string(self):
        self.petscan.set_language("en")
        self.petscan.set_project("wikipedia")

        # only a positive category
        self.petscan.add_positive_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&categories=test")

        # only a negative category
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_negative_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&negcats=test")

        # only an option
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_options({"max_age": "10"})
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10")

    def test_do_positive(self):
        expected_lemma = {"id": 3279,
                          "len": 10197,
                          "n": "page",
                          "namespace": 0,
                          "nstext": "",
                          "q": "Q60644",
                          "title": "Friedrich_Rückert",
                          "touched": "20161024211701"}
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     text='{"n": "result","a": {"querytime_sec": 1.572163,'
                          '"query": "https://petscan.wmflabs.org/?language=de'
                          '&project=wikisource&categories=Autoren&get_q=1'
                          '&show_redirects=no&ns[0]=1&max_age=48'
                          '&format=json&doit=1"},'
                          '"*": [{"n": "combination",'
                          '"a": {"type": "subset",'
                          '"*": [{"id": 3279,'
                          '"len": 10197,'
                          '"n": "page",'
                          '"namespace": 0,'
                          '"nstext": "",'
                          '"q": "Q60644",'
                          '"title": "Friedrich_Rückert",'
                          '"touched": "20161024211701"}]}}]}')
            # run() has to happen while the mock is active
            self.assertEqual(self.petscan.run(), [expected_lemma])

    def test_do_negative(self):
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     status_code=404)
            # a 404 answer must surface as ConnectionError
            with self.assertRaises(ConnectionError):
                self.petscan.run()
class TestCatScan(TestCase):
    """
    Unit tests for the ``PetScan`` query builder.

    Most tests call one builder method and compare ``options``, ``categories``
    or the URL produced by ``str()``. The two ``test_do_*`` tests execute
    ``run()`` against a mocked HTTP endpoint.
    """

    def setUp(self):
        # every test starts with a fresh, unconfigured query
        self.petscan = PetScan()

    def test_add_options(self):
        self.petscan.add_options({"max_age": "45"})
        self.petscan.add_options({"smaller": "300"})
        self.assertDictEqual({"smaller": "300", "max_age": "45"}, self.petscan.options)

    def test_add_categoy(self):
        self.petscan.add_positive_category("pos1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_positive_category("pos3", 2)
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg2")
        self.petscan.add_negative_category("neg3", 3)
        # the optional second argument is appended to the name after a "|"
        self.assertEqual(["pos1", "pos2", "pos3|2"], self.petscan.categories["positive"])
        self.assertEqual(["neg1", "neg2", "neg3|3"], self.petscan.categories["negative"])

    def test_add_namespace(self):
        # namespaces may be given as a single number or as a list of numbers
        self.petscan.add_namespace(0)
        self.petscan.add_namespace([2, 10])
        self.assertDictEqual({"ns[0]": "1", "ns[2]": "1", "ns[10]": "1"},
                             self.petscan.options)

    def test_activate_redirects(self):
        self.petscan.activate_redirects()
        self.assertDictEqual({"show_redirects": "yes"}, self.petscan.options)

    def test_deactivate_redirects(self):
        self.petscan.deactivate_redirects()
        self.assertDictEqual({"show_redirects": "no"}, self.petscan.options)

    def test_last_change_before(self):
        timestamp = datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42)
        self.petscan.last_change_before(timestamp)
        self.assertDictEqual({"before": "12340101020242"}, self.petscan.options)

    def test_last_change_after(self):
        timestamp = datetime(year=1234, month=1, day=1, hour=2, minute=2, second=42)
        self.petscan.last_change_after(timestamp)
        self.assertDictEqual({"after": "12340101020242"}, self.petscan.options)

    def test_max_age(self):
        self.petscan.max_age(1234)
        self.assertDictEqual({"max_age": "1234"}, self.petscan.options)

    def test_only_new(self):
        self.petscan.only_new()
        self.assertDictEqual({"only_new": "1"}, self.petscan.options)

    def test_smaller_then(self):
        self.petscan.smaller_then(42)
        self.assertDictEqual({"smaller": "42"}, self.petscan.options)

    def test_larger_then(self):
        self.petscan.larger_then(42)
        self.assertDictEqual({"larger": "42"}, self.petscan.options)

    def test_get_wikidata(self):
        self.petscan.get_wikidata_items()
        self.assertDictEqual({"wikidata_item": "any"}, self.petscan.options)

    def test_get_Pages_with_wikidata(self):
        self.petscan.get_pages_with_wd_items()
        self.assertDictEqual({"wikidata_item": "with"}, self.petscan.options)

    def test_get_Pages_without_wikidata(self):
        self.petscan.get_pages_without_wd_items()
        self.assertDictEqual({"wikidata_item": "without"}, self.petscan.options)

    def test_set_or(self):
        self.petscan.set_logic_union()
        self.assertDictEqual({"combination": "union"}, self.petscan.options)

    def test_set_regex(self):
        self.petscan.set_regex_filter("abc")
        self.assertDictEqual({"regexp_filter": "abc"}, self.petscan.options)

    def test_set_last_edits(self):
        self.petscan.set_last_edit_bots(True)
        self.petscan.set_last_edit_anons(False)
        self.petscan.set_last_edit_flagged()
        self.assertDictEqual({"edits[bots]": "yes",
                              "edits[anons]": "no",
                              "edits[flagged]": "yes"},
                             self.petscan.options)

    def test_construct_cat_string(self):
        self.petscan.add_positive_category("pos 1")
        self.petscan.add_positive_category("pos2")
        self.petscan.add_negative_category("neg1")
        self.petscan.add_negative_category("neg 2")
        self.petscan.add_negative_category("neg3")
        # blanks become "+", entries are joined with CRLF
        self.assertEqual("pos+1\r\npos2",
                         self.petscan._construct_list_argument(self.petscan.categories["positive"]))
        self.assertEqual("neg1\r\nneg+2\r\nneg3",
                         self.petscan._construct_list_argument(self.petscan.categories["negative"]))

    def test_construct_templates(self):
        self.petscan.add_yes_template("yes1")
        self.petscan.add_yes_template("yes2")
        self.petscan.add_any_template("any1")
        self.petscan.add_any_template("any2")
        self.petscan.add_any_template("any3")
        self.petscan.add_no_template("no1")
        self.petscan.add_no_template("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de"
                         "&project=wikisource"
                         "&templates_yes=yes1%0D%0Ayes2"
                         "&templates_any=any1%0D%0Aany2%0D%0Aany3"
                         "&templates_no=no1%0D%0Ano2")

    def test_construct_outlinks(self):
        self.petscan.add_yes_outlink("yes1")
        self.petscan.add_yes_outlink("yes2")
        self.petscan.add_any_outlink("any1")
        self.petscan.add_any_outlink("any2")
        self.petscan.add_any_outlink("any3")
        self.petscan.add_no_outlink("no1")
        self.petscan.add_no_outlink("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de"
                         "&project=wikisource"
                         "&outlinks_yes=yes1%0D%0Ayes2"
                         "&outlinks_any=any1%0D%0Aany2%0D%0Aany3"
                         "&outlinks_no=no1%0D%0Ano2")

    def test_construct_links_to(self):
        self.petscan.add_yes_links_to("yes1")
        self.petscan.add_yes_links_to("yes2")
        self.petscan.add_any_links_to("any1")
        self.petscan.add_any_links_to("any2")
        self.petscan.add_any_links_to("any3")
        self.petscan.add_no_links_to("no1")
        self.petscan.add_no_links_to("no2")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=de"
                         "&project=wikisource"
                         "&links_to_all=yes1%0D%0Ayes2"
                         "&links_to_any=any1%0D%0Aany2%0D%0Aany3"
                         "&links_to_no=no1%0D%0Ano2")

    def test_construct_options(self):
        self.petscan.options = {"max_age": "1234", "get_q": "1", "show_redirects": "yes"}
        # assertIn reports the missing fragment and the URL on failure,
        # unlike the former assertEqual(<fragment> in <url>, True)
        self.assertIn("&max_age=1234", str(self.petscan))
        self.assertIn("&get_q=1", str(self.petscan))
        self.assertIn("&show_redirects=yes", str(self.petscan))

    def test_construct_string(self):
        self.petscan.set_language("en")
        self.petscan.set_project("wikipedia")

        # only a positive category
        self.petscan.add_positive_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&categories=test")

        # only a negative category
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_negative_category("test")
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&negcats=test")

        # only an option
        self.petscan.categories = {"positive": [], "negative": []}
        self.petscan.add_options({"max_age": "10"})
        self.assertEqual(str(self.petscan),
                         "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10")

    def test_do_positive(self):
        expected_lemma = {"id": 3279,
                          "len": 10197,
                          "n": "page",
                          "namespace": 0,
                          "nstext": "",
                          "q": "Q60644",
                          "title": "Friedrich_Rückert",
                          "touched": "20161024211701"}
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     text='{"n": "result","a": {"querytime_sec": 1.572163,'
                          '"query": "https://petscan.wmflabs.org/?language=de'
                          '&project=wikisource&categories=Autoren&get_q=1'
                          '&show_redirects=no&ns[0]=1&max_age=48'
                          '&format=json&doit=1"},'
                          '"*": [{"n": "combination",'
                          '"a": {"type": "subset",'
                          '"*": [{"id": 3279,'
                          '"len": 10197,'
                          '"n": "page",'
                          '"namespace": 0,'
                          '"nstext": "",'
                          '"q": "Q60644",'
                          '"title": "Friedrich_Rückert",'
                          '"touched": "20161024211701"}]}}]}')
            # run() has to happen while the mock is active
            self.assertEqual(self.petscan.run(), [expected_lemma])

    def test_do_negative(self):
        with requests_mock.mock() as mock:
            mock.get("https://petscan.wmflabs.org/"
                     "?language=de&project=wikisource&format=json&doit=1",
                     status_code=404)
            # a 404 answer must surface as PetScanException
            with self.assertRaises(PetScanException):
                self.petscan.run()