def test_verify_xpath_will_not_run_splash_settings_if_not_javascript(mocker):
    """Splash-specific settings must not be passed to the crawler when
    JavaScript rendering is disabled."""
    mocker.patch("os.path.isdir")
    worker_mock = mocker.patch("webcomix.comic.CrawlerWorker")
    comic = Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, False)
    comic.verify_xpath()
    # First positional argument of the first CrawlerWorker call is the settings dict.
    passed_settings = worker_mock.call_args_list[0][0][0]
    for splash_entry in SPLASH_SETTINGS.items():
        assert splash_entry not in passed_settings.items()
def test_verify_xpath_with_alt_text(three_webpages_alt_text_uri):
    """verify_xpath should report page URLs, image URLs and alt text for
    each of the three fixture pages."""
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    # str.strip("1.html") removes any of the characters {1,.,h,t,m,l} from
    # BOTH ends (it would mangle e.g. an "http://" prefix); slice the known
    # suffix off instead.
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[: -len("1.html")]
    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_alt_text_uri,
            "image_urls": [three_webpages_alt_text_folder + "1.jpeg"],
            "alt_text": "First page",
        },
        {
            "page": 2,
            "url": three_webpages_alt_text_folder + "2.html",
            "image_urls": [three_webpages_alt_text_folder + "2.jpeg"],
            "alt_text": "Second page",
        },
        {
            "page": 3,
            "url": three_webpages_alt_text_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
def test_verify_xpath(three_webpages_uri):
    """verify_xpath should walk the three fixture pages and report their
    URLs and image URLs (alt_text is None when no alt-text xpath is given)."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    # str.strip("1.html") removes any of the characters {1,.,h,t,m,l} from
    # BOTH ends (it would mangle e.g. an "http://" prefix); slice the known
    # suffix off instead.
    three_webpages_folder = three_webpages_uri[: -len("1.html")]
    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_uri,
            "image_urls": [three_webpages_folder + "1.jpeg"],
            "alt_text": None,
        },
        {
            "page": 2,
            "url": three_webpages_folder + "2.html",
            "image_urls": [three_webpages_folder + "2.jpeg"],
            "alt_text": None,
        },
        {
            "page": 3,
            "url": three_webpages_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
def test_print_verification(capfd, three_webpages_uri):
    """print_verification should render one 'Page N:' section per verified
    page, listing the page URL and its image URLs."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, err = capfd.readouterr()
    # str.strip("1.html") removes any of the characters {1,.,h,t,m,l} from
    # BOTH ends (it would mangle e.g. an "http://" prefix); slice the known
    # suffix off instead.
    three_webpages_folder = three_webpages_uri[: -len("1.html")]
    assert out == (
        "Page 1:\n"
        "Page URL: " + three_webpages_uri + "\n"
        "Image URLs:\n"
        "" + three_webpages_folder + "1.jpeg"
        "\n"
        "\n"
        "Page 2:\n"
        "Page URL: " + three_webpages_folder + "2.html"
        "\n"
        "Image URLs:\n"
        "" + three_webpages_folder + "2.jpeg"
        "\n"
        "\n"
        "Page 3:\n"
        "Page URL: " + three_webpages_folder + "3.html"
        "\n"
        "Image URLs:\n"
        "\n"
        "\n"
    )
def test_verify_xpath_only_verifies_one_page_with_single_page(one_webpage_uri):
    """With single_page=True, verification must stop after the first page,
    collecting both images found on it."""
    comic = Comic(
        "test", one_webpage_uri, "//img/@src", "//a/@href", single_page=True
    )
    results = comic.verify_xpath()
    assert len(results) == 1
    only_page = results[0]
    assert len(only_page["image_urls"]) == 2
def test_print_verification_with_alt_text(capfd, three_webpages_alt_text_uri):
    """print_verification should include an 'Alt text:' line for pages
    where alt text was extracted."""
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, err = capfd.readouterr()
    # str.strip("1.html") removes any of the characters {1,.,h,t,m,l} from
    # BOTH ends (it would mangle e.g. an "http://" prefix); slice the known
    # suffix off instead.
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[: -len("1.html")]
    assert out == (
        "Page 1:\n"
        "Page URL: " + three_webpages_alt_text_uri + "\n"
        "Image URLs:\n"
        "" + three_webpages_alt_text_folder + "1.jpeg"
        "\n"
        "Alt text: First page\n"
        "\n"
        "Page 2:\n"
        "Page URL: " + three_webpages_alt_text_folder + "2.html"
        "\n"
        "Image URLs:\n"
        "" + three_webpages_alt_text_folder + "2.jpeg"
        "\n"
        "Alt text: Second page\n"
        "\n"
        "Page 3:\n"
        "Page URL: " + three_webpages_alt_text_folder + "3.html"
        "\n"
        "Image URLs:\n"
        "\n"
        "\n"
    )
def test_supported_comics(comic_name):
    """Each bundled supported-comic configuration should still validate
    against the live site's first pages."""
    comic_settings = supported_comics[comic_name]
    comic = Comic(comic_name, *comic_settings)
    check_first_pages(comic.verify_xpath())
def discovery(
    name: str,
    url: str,
    start_page: int = 1,
    alt_text: Optional[str] = None,
    single_page: bool = False,
    javascript: bool = False,
    title: bool = False,
    debug: bool = False,
) -> Tuple[Optional[Comic], Optional[List[Mapping]]]:
    """Brute-force search for a working pair of image/next-page XPaths.

    Iterates over every combination of known tag/attribute/keyword
    candidates, builds a Comic for each, and returns the first
    (comic, first_pages) pair whose first pages validate.

    Args:
        name: Name used for the comic's download folder.
        url: URL of the first page to start the search from.
        start_page: Page number the comic starts at.
        alt_text: Optional XPath for extracting image alt text.
        single_page: Whether the whole comic lives on a single page.
        javascript: Whether pages need JavaScript rendering.
        title: Whether to prepend the page title to image names.
        debug: Whether to enable debug output on the Comic.

    Returns:
        (comic, first_pages) on success, (None, None) if no candidate
        combination validated.
    """

    def to_lower_case(attribute):
        # XPath 1.0 has no lower-case(); emulate it with translate().
        return (
            "translate({}, "
            "'ABCDEFGHIJKLMNOPQRSTUVWXYZ',"
            "'abcdefghijklmnopqrstuvwxyz')"
        ).format(attribute)

    click.echo("Looking for a path to the whole comic... (Ctrl-C to exit)")
    combinations = product(
        possible_next_page_xpath,
        possible_image_xpath,
        possible_tags_image,
        possible_tags_next,
        possible_attributes_image,
        possible_attributes_next,
    )
    # product() is lazy, so compute the total separately for tqdm's bar.
    total = (
        len(possible_next_page_xpath)
        * len(possible_image_xpath)
        * len(possible_tags_image)
        * len(possible_tags_next)
        * len(possible_attributes_image)
        * len(possible_attributes_next)
    )
    for next_page, image, tag_image, tag_next, attribute_image, attribute_next in tqdm(
        combinations, total=total
    ):
        next_page_xpath = "//{}[contains({}, '{}')]//@href".format(
            tag_next, to_lower_case(attribute_next), next_page
        )
        image_xpath = "//{}[contains({}, '{}')]//@src".format(
            tag_image, to_lower_case(attribute_image), image
        )
        try:
            comic = Comic(
                name,
                url,
                image_xpath,
                next_page_xpath,
                start_page=start_page,
                alt_text=alt_text,
                single_page=single_page,
                javascript=javascript,
                title=title,
                debug=debug,
            )
            first_pages = comic.verify_xpath()
            check_first_pages(first_pages)
            return comic, first_pages
        except KeyboardInterrupt:
            sys.exit(0)
        except Exception:
            # Any validation failure just means this candidate combination
            # doesn't match the site; move on to the next one. Narrowed from
            # a bare `except:` so SystemExit/GeneratorExit still propagate.
            continue
    click.echo("Search has failed.")
    return None, None