def test_verify_xpath_with_alt_text(three_webpages_alt_text_uri):
    """verify_xpath reports page URL, image URLs and alt text for each page.

    Uses the three-page alt-text fixture site; the last page has neither an
    image nor a title attribute, so it verifies as empty/None.
    """
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    # BUG FIX: str.strip("1.html") strips any of the characters {1,.,h,t,m,l}
    # from both ends, not the literal suffix — it only worked by accident.
    # Slice the known suffix off instead (fixture URI ends with "1.html").
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[:-len("1.html")]
    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_alt_text_uri,
            "image_urls": [three_webpages_alt_text_folder + "1.jpeg"],
            "alt_text": "First page",
        },
        {
            "page": 2,
            "url": three_webpages_alt_text_folder + "2.html",
            "image_urls": [three_webpages_alt_text_folder + "2.jpeg"],
            "alt_text": "Second page",
        },
        {
            "page": 3,
            "url": three_webpages_alt_text_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
def test_verify_xpath(three_webpages_uri):
    """verify_xpath walks the three fixture pages and reports their images.

    No alt_text selector is configured, so every page reports alt_text=None.
    """
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    # BUG FIX: str.strip("1.html") strips a character set, not the suffix;
    # slice the literal suffix off instead (fixture URI ends with "1.html").
    three_webpages_folder = three_webpages_uri[:-len("1.html")]
    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_uri,
            "image_urls": [three_webpages_folder + "1.jpeg"],
            "alt_text": None,
        },
        {
            "page": 2,
            "url": three_webpages_folder + "2.html",
            "image_urls": [three_webpages_folder + "2.jpeg"],
            "alt_text": None,
        },
        {
            "page": 3,
            "url": three_webpages_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
def test_download_does_not_add_crawlers_in_main_process(
        mocker, cleanup_test_directories, three_webpages_uri):
    """download() delegates crawling to the worker, never the main process."""
    mocker.patch("webcomix.scrapy.crawler_worker.CrawlerWorker.start")
    crawl_spy = mocker.patch("scrapy.crawler.Crawler.crawl")
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    comic.download()
    # No crawler may be scheduled from the main process.
    assert crawl_spy.call_count == 0
def test_print_verification(capfd, three_webpages_uri):
    """print_verification renders page numbers, URLs and image URLs to stdout."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, err = capfd.readouterr()
    # BUG FIX: str.strip("1.html") strips a character set, not the suffix;
    # slice the literal suffix off instead (fixture URI ends with "1.html").
    three_webpages_folder = three_webpages_uri[:-len("1.html")]
    assert out == ("Page 1:\n"
                   "Page URL: " + three_webpages_uri + "\n"
                   "Image URLs:\n" + three_webpages_folder + "1.jpeg"
                   "\n"
                   "\n"
                   "Page 2:\n"
                   "Page URL: " + three_webpages_folder + "2.html"
                   "\n"
                   "Image URLs:\n" + three_webpages_folder + "2.jpeg"
                   "\n"
                   "\n"
                   "Page 3:\n"
                   "Page URL: " + three_webpages_folder + "3.html"
                   "\n"
                   "Image URLs:\n"
                   "\n"
                   "\n")
def test_download_will_not_run_splash_settings_if_not_javascript(mocker):
    """Without javascript, no Splash setting is handed to the crawler worker."""
    mocker.patch("os.path.isdir")
    worker_mock = mocker.patch("webcomix.comic.CrawlerWorker")
    comic = Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY)
    comic.download()
    # First positional argument of the first worker construction: settings.
    passed_settings = worker_mock.call_args_list[0][0][0]
    for splash_setting in SPLASH_SETTINGS.items():
        assert splash_setting not in passed_settings.items()
def test_save_image_location():
    """save_image_location keeps the URL's extension and ignores query strings."""
    url = "http://imgs.xkcd.com/comics/barrel_cropped_(1).jpg"
    assert Comic.save_image_location(url, 1, "foo") == "foo/1.jpg"
    # A URL without an extension produces a file name without one.
    assert Comic.save_image_location("", 1, "bar") == "bar/1"
    # A query string after the extension is dropped.
    assert Comic.save_image_location(url + "?q=123", 1, "foo") == "foo/1.jpg"
def test_verify_xpath_only_verifies_one_page_with_single_page(one_webpage_uri):
    """With single_page=True, verification stops after the first page."""
    comic = Comic(
        "test",
        one_webpage_uri,
        "//img/@src",
        "//a/@href",
        single_page=True,
    )
    results = comic.verify_xpath()
    # Exactly one page verified, containing both of the fixture's images.
    assert len(results) == 1
    assert len(results[0]["image_urls"]) == 2
def test_download_saves_the_files_with_correct_first_item(
        cleanup_test_directories, three_webpages_uri):
    """start_page offsets the numbering of the downloaded files."""
    comic = Comic(
        "test",
        three_webpages_uri,
        "//img/@src",
        "//a/@href",
        start_page=10)
    comic.download()
    # Two images downloaded, numbered from the configured start page.
    downloaded = next(os.walk("test"))[2]
    assert sorted(downloaded) == ["10", "11"]
def test_download_with_alt_text_saves_the_text(cleanup_test_directories,
                                               three_webpages_alt_text_uri):
    """With an alt_text selector, a text file is saved alongside each image."""
    comic = Comic(
        "test",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    comic.download()
    _, _, saved_files = next(os.walk("test"))
    # Two images + two alt-text files.
    assert len(saved_files) == 4
def test_download_runs_the_worker(mocker, cleanup_test_directories):
    """Comic.download starts the crawler worker exactly once."""
    worker_start = mocker.patch(
        "webcomix.scrapy.crawler_worker.CrawlerWorker.start")
    comic = Comic(
        "xkcd",
        "http://xkcd.com/1/",
        "//div[@id='comic']//img/@src",
        "//a[@rel='next']/@href",
        False,
    )
    comic.download()
    assert worker_start.call_count == 1
def image_in_zipfile(item, directory):
    """Return True if the image described by *item* is already in the .cbz.

    NOTE(review): this calls save_image_location without a directory/title,
    unlike other call sites — presumably because archive member paths are
    relative; confirm against Comic.save_image_location's defaults.
    """
    archive_path = "{}.cbz".format(directory)
    if not os.path.isfile(archive_path):
        # No archive yet, so nothing can have been stored in it.
        return False
    expected_member = Comic.save_image_location(item.get("url"),
                                                item.get("page"))
    with ZipFile(archive_path, "r") as archive:
        return expected_member in archive.namelist()
def test_verify_xpath_will_run_splash_settings_if_javascript(mocker):
    """With javascript=True, every Splash setting reaches the crawler worker."""
    mocker.patch("os.path.isdir")
    worker_mock = mocker.patch("webcomix.comic.CrawlerWorker")
    comic = Comic(
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        True,
    )
    comic.verify_xpath()
    # First positional argument of the first worker construction: settings.
    passed_settings = worker_mock.call_args_list[0][0][0]
    for splash_setting in SPLASH_SETTINGS.items():
        assert splash_setting in passed_settings.items()
def test_convert_to_cbz_adds_all_files_to_cbz(cleanup_test_directories,
                                              three_webpages_uri):
    """convert_to_cbz packs every downloaded file into the archive."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    comic.download()
    comic.convert_to_cbz()
    archive_name = "{}.cbz".format(comic.name)
    with ZipFile(archive_name, mode="r") as cbz_file:
        # Both downloaded images must be present in the archive.
        assert len(cbz_file.infolist()) == 2
def get_media_requests(self, item, info):
    """Queue the image download for *item*, skipping already-saved images.

    Drops the item when the image exists on disk or inside the .cbz archive;
    otherwise writes the alt text (if any) and yields the scrapy request.
    """
    click.echo("Saving image {}".format(item.get("url")))
    url, page, title, alt_text = itemgetter("url", "page", "title",
                                            "alt_text")(item)
    image_path = Comic.save_image_location(url, page, info.spider.directory,
                                           title)
    already_saved = os.path.isfile(image_path) or self.image_in_zipfile(
        item, info.spider.directory)
    if already_saved:
        click.echo("The image was already downloaded. Skipping...")
        raise DropItem("The image was already downloaded. Skipping...")
    if alt_text is not None:
        alt_text_path = Comic.save_alt_text_location(page,
                                                     info.spider.directory)
        with open(alt_text_path, "w") as alt_text_file:
            alt_text_file.write(alt_text)
    yield scrapy.Request(
        item.get("url"),
        meta={
            "image_file_name":
            Comic.save_image_filename(item.get("url"), item.get("page"),
                                      title, info.spider.directory)
        },
    )
def test_print_verification_with_alt_text(capfd, three_webpages_alt_text_uri):
    """print_verification includes an 'Alt text:' line for pages that have one."""
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, err = capfd.readouterr()
    # BUG FIX: str.strip("1.html") strips a character set, not the suffix;
    # slice the literal suffix off instead (fixture URI ends with "1.html").
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[:-len("1.html")]
    assert out == ("Page 1:\n"
                   "Page URL: " + three_webpages_alt_text_uri + "\n"
                   "Image URLs:\n" + three_webpages_alt_text_folder + "1.jpeg"
                   "\n"
                   "Alt text: First page\n"
                   "\n"
                   "Page 2:\n"
                   "Page URL: " + three_webpages_alt_text_folder + "2.html"
                   "\n"
                   "Image URLs:\n" + three_webpages_alt_text_folder + "2.jpeg"
                   "\n"
                   "Alt text: Second page\n"
                   "\n"
                   "Page 3:\n"
                   "Page URL: " + three_webpages_alt_text_folder + "3.html"
                   "\n"
                   "Image URLs:\n"
                   "\n"
                   "\n")
def test_discovered_comic_searches_for_a_comic(mocker):
    """The search command invokes discovery exactly once."""
    runner = CliRunner()
    discovered = (
        Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY),
        mocker.ANY,
    )
    discovery_mock = mocker.patch(
        "webcomix.cli.discovery", return_value=discovered)
    # Stub out everything downstream of discovery so the command can run.
    mock_download = mocker.patch("webcomix.comic.Comic.download")
    mock_verify_xpath = mocker.patch("webcomix.comic.Comic.verify_xpath")
    mock_print_verification = mocker.patch("webcomix.cli.print_verification")
    result = runner.invoke(cli.search, ["foo", "--start_url=good"], "y")
    assert result.exit_code == 0
    assert discovery_mock.call_count == 1
def test_search_searchable_website(mocker, three_webpages_classes_uri):
    """discovery finds working XPaths on a site that uses class attributes.

    The search space is narrowed to a single combination via patched module
    globals, so discovery must succeed on its first (only) candidate.
    """
    expected = Comic(
        "Blindsprings",
        three_webpages_classes_uri,
        "//*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'comic')]//@src",
        "//*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'next')]//@href",
    )
    mocker.patch("webcomix.search.possible_image_xpath", ["comic"])
    mocker.patch("webcomix.search.possible_next_page_xpath", ["next"])
    mocker.patch("webcomix.search.possible_tags_image", ["*"])
    mocker.patch("webcomix.search.possible_tags_next", ["*"])
    mocker.patch("webcomix.search.possible_attributes_image", ["@class"])
    mocker.patch("webcomix.search.possible_attributes_next", ["@class"])
    mocker.patch("webcomix.util.check_first_pages")
    comic, result = discovery("Blindsprings", three_webpages_classes_uri)
    # BUG FIX: str.strip("1.html") strips a character set, not the suffix;
    # slice the literal suffix off instead (fixture URI ends with "1.html").
    three_webpages_classes_folder = three_webpages_classes_uri[:-len("1.html")]
    assert result == [
        {
            "page": 1,
            "url": three_webpages_classes_uri,
            "image_urls": [three_webpages_classes_folder + "1.jpeg"],
            "alt_text": None,
        },
        {
            "page": 2,
            "url": three_webpages_classes_folder + "2.html",
            "image_urls": [three_webpages_classes_folder + "2.jpeg"],
            "alt_text": None,
        },
        {
            "page": 3,
            "url": three_webpages_classes_folder + "3.html",
            "image_urls": [three_webpages_classes_folder + "3.jpeg"],
            "alt_text": None,
        },
    ]
    assert comic.start_url == expected.start_url
    assert comic.next_page_selector == expected.next_page_selector
    assert comic.comic_image_selector == expected.comic_image_selector
def fake_downloaded_xkcd_comic():
    """Yield an xkcd Comic backed by five fake downloaded pages.

    Removes any leftover archive/directory both before the test runs and
    after it finishes.
    """

    def remove_artifacts():
        # Drop any archive or download directory left by a previous run.
        if os.path.isfile("xkcd.cbz"):
            os.remove("xkcd.cbz")
        if os.path.isdir("xkcd"):
            shutil.rmtree("xkcd")

    remove_artifacts()
    comic = Comic(
        "xkcd",
        "http://xkcd.com/1/",
        "//div[@id='comic']/img/@src",
        "//a[@rel='next']/@href",
        False,
    )
    os.mkdir("xkcd")
    for page_number in range(1, 6):
        with open("xkcd/{}.txt".format(page_number), "w") as image_file:
            image_file.write("testing {}".format(page_number))
    yield comic
    remove_artifacts()
def test_discovered_comic_asks_for_verification_before_downloading(mocker):
    """search prints the verification before it starts the download."""
    runner = CliRunner()
    call_recorder = mocker.Mock()
    mocker.patch(
        "webcomix.cli.discovery",
        return_value=(
            Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY),
            mocker.ANY,
        ),
    )
    download_mock = mocker.patch("webcomix.comic.Comic.download")
    verify_mock = mocker.patch("webcomix.comic.Comic.verify_xpath")
    print_mock = mocker.patch("webcomix.cli.print_verification")
    # Attach all three mocks to one recorder so call ordering is observable.
    call_recorder.attach_mock(download_mock, "download")
    call_recorder.attach_mock(verify_mock, "verify_xpath")
    call_recorder.attach_mock(print_mock, "print_verification")
    result = runner.invoke(cli.search, ["foo", "--start_url=good"], "y")
    assert result.exit_code == 0
    call_recorder.assert_has_calls(
        [mocker.call.print_verification(mocker.ANY), mocker.call.download()])
def test_supported_comics(comic_name):
    """Each predefined comic's XPaths still verify against its first pages."""
    comic = Comic(comic_name, *supported_comics[comic_name])
    check_first_pages(comic.verify_xpath())
def test_save_image_filename_with_title_present():
    """With the title flag set, the file name is prefixed with the directory name."""
    result = Comic.save_image_filename(
        "http://imgs.xkcd.com/comics/barrel_cropped_(1).jpg", 1, True, "foo")
    assert result == "foo-1.jpg"
def discovery(
    name: str,
    url: str,
    start_page: int = 1,
    alt_text: Optional[str] = None,
    single_page: bool = False,
    javascript: bool = False,
    title: bool = False,
    debug: bool = False,
) -> Tuple[Optional[Comic], Optional[List[Mapping]]]:
    """Brute-force XPath combinations until one verifies against the comic.

    Tries every (next-page keyword, image keyword, tag, attribute) product,
    building candidate XPaths and verifying each candidate's first pages.

    Returns:
        (comic, first_pages) for the first combination that verifies, or
        (None, None) if every combination fails.
    """

    def to_lower_case(attribute):
        # XPath 1.0 has no lower-case(); emulate it with translate().
        return (
            "translate({}, "
            "'ABCDEFGHIJKLMNOPQRSTUVWXYZ',"
            "'abcdefghijklmnopqrstuvwxyz')"
        ).format(attribute)

    click.echo("Looking for a path to the whole comic... (Ctrl-C to exit)")
    combinations = product(
        possible_next_page_xpath,
        possible_image_xpath,
        possible_tags_image,
        possible_tags_next,
        possible_attributes_image,
        possible_attributes_next,
    )
    # product() is lazy, so compute the total separately for the progress bar.
    total = (
        len(possible_next_page_xpath)
        * len(possible_image_xpath)
        * len(possible_tags_image)
        * len(possible_tags_next)
        * len(possible_attributes_image)
        * len(possible_attributes_next)
    )
    for next_page, image, tag_image, tag_next, attribute_image, attribute_next in tqdm(
        combinations, total=total
    ):
        next_page_xpath = "//{}[contains({}, '{}')]//@href".format(
            tag_next, to_lower_case(attribute_next), next_page
        )
        image_xpath = "//{}[contains({}, '{}')]//@src".format(
            tag_image, to_lower_case(attribute_image), image
        )
        try:
            comic = Comic(
                name,
                url,
                image_xpath,
                next_page_xpath,
                start_page=start_page,
                alt_text=alt_text,
                single_page=single_page,
                javascript=javascript,
                title=title,
                debug=debug,
            )
            first_pages = comic.verify_xpath()
            check_first_pages(first_pages)
            return comic, first_pages
        except KeyboardInterrupt:
            sys.exit(0)
        # BUG FIX: was a bare `except:`; narrow to Exception so SystemExit
        # and other non-error exceptions are never silently swallowed.
        # A failed combination simply moves on to the next candidate.
        except Exception:
            continue
    click.echo("Search has failed.")
    return None, None
def test_download_saves_the_files(cleanup_test_directories, three_webpages_uri):
    """Downloading the three-page fixture stores one file per found image."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    comic.download()
    _, _, saved_files = next(os.walk("test"))
    # The third fixture page has no image, so only two files are saved.
    assert len(saved_files) == 2