예제 #1
0
def test_verify_xpath_with_alt_text(three_webpages_alt_text_uri):
    """verify_xpath reports page number, URL, image URLs and alt text per page."""
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )

    # str.strip() removes a *character set* ('1', '.', 'h', 't', 'm', 'l')
    # from both ends, not the literal suffix — slice the suffix off instead.
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[:-len("1.html")]

    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_alt_text_uri,
            "image_urls": [three_webpages_alt_text_folder + "1.jpeg"],
            "alt_text": "First page",
        },
        {
            "page": 2,
            "url": three_webpages_alt_text_folder + "2.html",
            "image_urls": [three_webpages_alt_text_folder + "2.jpeg"],
            "alt_text": "Second page",
        },
        {
            "page": 3,
            "url": three_webpages_alt_text_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
예제 #2
0
def test_verify_xpath(three_webpages_uri):
    """verify_xpath walks the three test pages and reports their image URLs."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")

    # str.strip() removes a *character set*, not a suffix; slice off the
    # literal "1.html" to get the folder prefix reliably.
    three_webpages_folder = three_webpages_uri[:-len("1.html")]

    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_uri,
            "image_urls": [three_webpages_folder + "1.jpeg"],
            "alt_text": None,
        },
        {
            "page": 2,
            "url": three_webpages_folder + "2.html",
            "image_urls": [three_webpages_folder + "2.jpeg"],
            "alt_text": None,
        },
        {
            "page": 3,
            "url": three_webpages_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
예제 #3
0
def test_download_does_not_add_crawlers_in_main_process(
        mocker, cleanup_test_directories, three_webpages_uri):
    """Downloading must never schedule crawlers from the main process."""
    mocker.patch("webcomix.scrapy.crawler_worker.CrawlerWorker.start")
    crawl_spy = mocker.patch("scrapy.crawler.Crawler.crawl")

    Comic("test", three_webpages_uri, "//img/@src", "//a/@href").download()

    assert crawl_spy.call_count == 0
예제 #4
0
def test_print_verification(capfd, three_webpages_uri):
    """print_verification writes one 'Page N' paragraph per verified page."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, err = capfd.readouterr()

    # str.strip() removes a *character set*, not a suffix; slice off the
    # literal "1.html" to get the folder prefix reliably.
    three_webpages_folder = three_webpages_uri[:-len("1.html")]

    assert out == ("Page 1:\n"
                   "Page URL: " + three_webpages_uri + "\n"
                   "Image URLs:\n"
                   "" + three_webpages_folder + "1.jpeg"
                   "\n"
                   "\n"
                   "Page 2:\n"
                   "Page URL: " + three_webpages_folder + "2.html"
                   "\n"
                   "Image URLs:\n"
                   "" + three_webpages_folder + "2.jpeg"
                   "\n"
                   "\n"
                   "Page 3:\n"
                   "Page URL: " + three_webpages_folder + "3.html"
                   "\n"
                   "Image URLs:\n"
                   "\n"
                   "\n")
예제 #5
0
def test_download_will_not_run_splash_settings_if_not_javascript(mocker):
    """Without JavaScript requested, no Splash setting reaches the worker."""
    mocker.patch("os.path.isdir")
    worker_mock = mocker.patch("webcomix.comic.CrawlerWorker")

    comic = Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY)
    comic.download()

    passed_settings = worker_mock.call_args_list[0][0][0]
    for splash_setting in SPLASH_SETTINGS.items():
        assert splash_setting not in passed_settings.items()
예제 #6
0
def test_save_image_location():
    """save_image_location builds '<directory>/<page><extension>' paths."""
    image_url = "http://imgs.xkcd.com/comics/barrel_cropped_(1).jpg"
    assert Comic.save_image_location(image_url, 1, "foo") == "foo/1.jpg"
    # An empty URL yields a file name with no extension.
    assert Comic.save_image_location("", 1, "bar") == "bar/1"
    # A query string must not leak into the extension.
    assert Comic.save_image_location(image_url + "?q=123", 1,
                                     "foo") == "foo/1.jpg"
예제 #7
0
def test_verify_xpath_only_verifies_one_page_with_single_page(one_webpage_uri):
    """With single_page=True only the first page is verified."""
    comic = Comic(
        "test",
        one_webpage_uri,
        "//img/@src",
        "//a/@href",
        single_page=True,
    )

    verification = comic.verify_xpath()

    assert len(verification) == 1
    # The single test page contains two images.
    assert len(verification[0]["image_urls"]) == 2
예제 #8
0
def test_download_saves_the_files_with_correct_first_item(
        cleanup_test_directories, three_webpages_uri):
    """With start_page=10, saved files are numbered from 10 upward."""
    comic = Comic(
        "test",
        three_webpages_uri,
        "//img/@src",
        "//a/@href",
        start_page=10,
    )
    comic.download()

    _, _, downloaded_files = next(os.walk("test"))
    assert sorted(downloaded_files) == ["10", "11"]
예제 #9
0
def test_download_with_alt_text_saves_the_text(cleanup_test_directories,
                                               three_webpages_alt_text_uri):
    """Alt text downloads produce extra text files next to the images."""
    comic = Comic(
        "test",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    comic.download()

    # Two images + two alt-text files.
    _, _, saved_files = next(os.walk("test"))
    assert len(saved_files) == 4
예제 #10
0
def test_download_runs_the_worker(mocker, cleanup_test_directories):
    """download() starts the crawler worker exactly once."""
    worker_start_spy = mocker.patch(
        "webcomix.scrapy.crawler_worker.CrawlerWorker.start")
    comic = Comic(
        "xkcd",
        "http://xkcd.com/1/",
        "//div[@id='comic']//img/@src",
        "//a[@rel='next']/@href",
        False,
    )

    comic.download()

    assert worker_start_spy.call_count == 1
예제 #11
0
 def image_in_zipfile(item, directory):
     # Return True when the image described by *item* is already stored in
     # the "<directory>.cbz" archive; False when the archive does not exist
     # or does not contain the image.
     zipfile_path = "{}.cbz".format(directory)
     if not os.path.isfile(zipfile_path):
         return False
     # NOTE(review): other call sites pass a directory (and title) to
     # save_image_location; here only (url, page) are given — confirm the
     # defaults produce the archive-internal path used by namelist().
     image_path_cbz = Comic.save_image_location(item.get("url"),
                                                item.get("page"))
     with ZipFile(zipfile_path, "r") as zipfile:
         return image_path_cbz in zipfile.namelist()
예제 #12
0
def test_verify_xpath_will_run_splash_settings_if_javascript(mocker):
    """With JavaScript requested, every Splash setting reaches the worker."""
    mocker.patch("os.path.isdir")
    worker_mock = mocker.patch("webcomix.comic.CrawlerWorker")
    comic = Comic(
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        mocker.ANY,
        True,
    )

    comic.verify_xpath()

    passed_settings = worker_mock.call_args_list[0][0][0]
    for splash_setting in SPLASH_SETTINGS.items():
        assert splash_setting in passed_settings.items()
예제 #13
0
def test_convert_to_cbz_adds_all_files_to_cbz(cleanup_test_directories,
                                              three_webpages_uri):
    """Every downloaded image ends up in the generated .cbz archive."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    comic.download()
    comic.convert_to_cbz()

    archive_path = "{}.cbz".format(comic.name)
    with ZipFile(archive_path, mode="r") as archive:
        assert len(archive.infolist()) == 2
예제 #14
0
 def get_media_requests(self, item, info):
     # Scrapy media-pipeline hook: yield a download request for the comic
     # image described by *item*, unless it was already saved to disk or
     # into the comic's .cbz archive (then the item is dropped).
     click.echo("Saving image {}".format(item.get("url")))
     url, page, title, alt_text = itemgetter("url", "page", "title",
                                             "alt_text")(item)
     image_path_directory = Comic.save_image_location(
         url, page, info.spider.directory, title)
     # Skip work that was already done in a previous run: either the file
     # exists on disk or the image is inside the existing .cbz archive.
     if os.path.isfile(image_path_directory) or self.image_in_zipfile(
             item, info.spider.directory):
         click.echo("The image was already downloaded. Skipping...")
         raise DropItem("The image was already downloaded. Skipping...")
     # Persist the alt text next to the image before requesting the image
     # itself; alt_text is None when the comic has no alt-text selector.
     if alt_text is not None:
         with open(
                 Comic.save_alt_text_location(page, info.spider.directory),
                 "w") as alt_text_file:
             alt_text_file.write(alt_text)
     # The target file name travels in request meta so the pipeline's
     # file_path step can use it when the response arrives.
     yield scrapy.Request(
         item.get("url"),
         meta={
             "image_file_name":
             Comic.save_image_filename(item.get("url"), item.get("page"),
                                       title, info.spider.directory)
         },
     )
예제 #15
0
def test_print_verification_with_alt_text(capfd, three_webpages_alt_text_uri):
    """print_verification includes an 'Alt text' line for pages that have one."""
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, err = capfd.readouterr()

    # str.strip() removes a *character set*, not a suffix; slice off the
    # literal "1.html" to get the folder prefix reliably.
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[:-len("1.html")]

    assert out == ("Page 1:\n"
                   "Page URL: " + three_webpages_alt_text_uri + "\n"
                   "Image URLs:\n"
                   "" + three_webpages_alt_text_folder + "1.jpeg"
                   "\n"
                   "Alt text: First page\n"
                   "\n"
                   "Page 2:\n"
                   "Page URL: " + three_webpages_alt_text_folder + "2.html"
                   "\n"
                   "Image URLs:\n"
                   "" + three_webpages_alt_text_folder + "2.jpeg"
                   "\n"
                   "Alt text: Second page\n"
                   "\n"
                   "Page 3:\n"
                   "Page URL: " + three_webpages_alt_text_folder + "3.html"
                   "\n"
                   "Image URLs:\n"
                   "\n"
                   "\n")
예제 #16
0
def test_discovered_comic_searches_for_a_comic(mocker):
    """`webcomix search` invokes the discovery routine exactly once."""
    runner = CliRunner()
    discovery_mock = mocker.patch(
        "webcomix.cli.discovery",
        return_value=(
            Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY),
            mocker.ANY,
        ),
    )
    # Stub out the download/verification side effects of the command.
    mocker.patch("webcomix.comic.Comic.download")
    mocker.patch("webcomix.comic.Comic.verify_xpath")
    mocker.patch("webcomix.cli.print_verification")

    outcome = runner.invoke(cli.search, ["foo", "--start_url=good"], "y")

    assert outcome.exit_code == 0
    assert discovery_mock.call_count == 1
예제 #17
0
def test_search_searchable_website(mocker, three_webpages_classes_uri):
    """discovery finds working XPaths for a site tagged with class names."""
    expected = Comic(
        "Blindsprings",
        three_webpages_classes_uri,
        "//*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'comic')]//@src",
        "//*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'next')]//@href",
    )
    # Shrink the search space to the single combination that must match.
    mocker.patch("webcomix.search.possible_image_xpath", ["comic"])
    mocker.patch("webcomix.search.possible_next_page_xpath", ["next"])
    mocker.patch("webcomix.search.possible_tags_image", ["*"])
    mocker.patch("webcomix.search.possible_tags_next", ["*"])
    mocker.patch("webcomix.search.possible_attributes_image", ["@class"])
    mocker.patch("webcomix.search.possible_attributes_next", ["@class"])
    mocker.patch("webcomix.util.check_first_pages")
    comic, result = discovery("Blindsprings", three_webpages_classes_uri)

    # str.strip() removes a *character set*, not a suffix; slice off the
    # literal "1.html" to get the folder prefix reliably.
    three_webpages_classes_folder = three_webpages_classes_uri[:-len("1.html")]

    assert result == [
        {
            "page": 1,
            "url": three_webpages_classes_uri,
            "image_urls": [three_webpages_classes_folder + "1.jpeg"],
            "alt_text": None,
        },
        {
            "page": 2,
            "url": three_webpages_classes_folder + "2.html",
            "image_urls": [three_webpages_classes_folder + "2.jpeg"],
            "alt_text": None,
        },
        {
            "page": 3,
            "url": three_webpages_classes_folder + "3.html",
            "image_urls": [three_webpages_classes_folder + "3.jpeg"],
            "alt_text": None,
        },
    ]

    assert comic.start_url == expected.start_url
    assert comic.next_page_selector == expected.next_page_selector
    assert comic.comic_image_selector == expected.comic_image_selector
예제 #18
0
def fake_downloaded_xkcd_comic():
    """Yield an xkcd Comic with five fake pre-downloaded files on disk.

    Removes any leftover "xkcd" directory / "xkcd.cbz" archive before the
    test and cleans the same artifacts up again afterwards.
    """
    def _remove_artifacts():
        # Shared pre/post cleanup: drop both the archive and the folder.
        if os.path.isfile("xkcd.cbz"):
            os.remove("xkcd.cbz")
        if os.path.isdir("xkcd"):
            shutil.rmtree("xkcd")

    _remove_artifacts()
    comic = Comic(
        "xkcd",
        "http://xkcd.com/1/",
        "//div[@id='comic']/img/@src",
        "//a[@rel='next']/@href",
        False,
    )
    os.mkdir("xkcd")
    for page_number in range(1, 6):
        with open("xkcd/{}.txt".format(page_number), "w") as image_file:
            image_file.write("testing {}".format(page_number))
    yield comic
    _remove_artifacts()
예제 #19
0
def test_discovered_comic_asks_for_verification_before_downloading(mocker):
    """The search command prints verification before starting the download."""
    runner = CliRunner()
    call_order_manager = mocker.Mock()
    mock_discovery = mocker.patch(
        "webcomix.cli.discovery",
        return_value=(
            Comic(mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY, mocker.ANY),
            mocker.ANY,
        ),
    )
    # Attach the stubs to one manager so their relative order is recorded.
    mock_download = mocker.patch("webcomix.comic.Comic.download")
    mock_verify_xpath = mocker.patch("webcomix.comic.Comic.verify_xpath")
    mock_print_verification = mocker.patch("webcomix.cli.print_verification")
    call_order_manager.attach_mock(mock_download, "download")
    call_order_manager.attach_mock(mock_verify_xpath, "verify_xpath")
    call_order_manager.attach_mock(mock_print_verification,
                                   "print_verification")

    outcome = runner.invoke(cli.search, ["foo", "--start_url=good"], "y")

    assert outcome.exit_code == 0
    call_order_manager.assert_has_calls(
        [mocker.call.print_verification(mocker.ANY),
         mocker.call.download()])
예제 #20
0
def test_supported_comics(comic_name):
    """Each supported comic's stored XPaths still match its first pages."""
    comic_definition = supported_comics[comic_name]
    first_pages = Comic(comic_name, *comic_definition).verify_xpath()
    check_first_pages(first_pages)
예제 #21
0
def test_save_image_filename_with_title_present():
    """With title=True the file name is '<title>-<page><extension>'."""
    image_url = "http://imgs.xkcd.com/comics/barrel_cropped_(1).jpg"
    filename = Comic.save_image_filename(image_url, 1, True, "foo")
    assert filename == "foo-1.jpg"
예제 #22
0
def discovery(
    name: str,
    url: str,
    start_page: int = 1,
    alt_text: Optional[str] = None,  # was mis-annotated as plain `str`
    single_page: bool = False,
    javascript: bool = False,
    title: bool = False,
    debug: bool = False,
) -> Tuple[Optional[Comic], Optional[List[Mapping]]]:
    """Brute-force XPath combinations until one verifies against the comic.

    Tries every (next-page keyword, image keyword, tag, attribute)
    combination, building case-insensitive XPath selectors, and returns the
    first Comic whose first pages verify, together with the verification
    result. Returns (None, None) when no combination works.
    """
    def to_lower_case(attribute):
        # XPath 1.0 has no lower-case(); emulate it with translate().
        return (
            "translate({}, "
            "'ABCDEFGHIJKLMNOPQRSTUVWXYZ',"
            "'abcdefghijklmnopqrstuvwxyz')"
        ).format(attribute)

    click.echo("Looking for a path to the whole comic... (Ctrl-C to exit)")
    combinations = product(
        possible_next_page_xpath,
        possible_image_xpath,
        possible_tags_image,
        possible_tags_next,
        possible_attributes_image,
        possible_attributes_next,
    )
    # product() is lazy, so compute the total separately for the progress bar.
    total = (
        len(possible_next_page_xpath)
        * len(possible_image_xpath)
        * len(possible_tags_image)
        * len(possible_tags_next)
        * len(possible_attributes_image)
        * len(possible_attributes_next)
    )

    for next_page, image, tag_image, tag_next, attribute_image, attribute_next in tqdm(
        combinations, total=total
    ):
        next_page_xpath = "//{}[contains({}, '{}')]//@href".format(
            tag_next, to_lower_case(attribute_next), next_page
        )
        image_xpath = "//{}[contains({}, '{}')]//@src".format(
            tag_image, to_lower_case(attribute_image), image
        )
        try:
            comic = Comic(
                name,
                url,
                image_xpath,
                next_page_xpath,
                start_page=start_page,
                alt_text=alt_text,
                single_page=single_page,
                javascript=javascript,
                title=title,
                debug=debug,
            )
            first_pages = comic.verify_xpath()
            check_first_pages(first_pages)
            return comic, first_pages
        except KeyboardInterrupt:
            sys.exit(0)
        except Exception:
            # A failing combination is expected: try the next one. A bare
            # `except:` would also swallow SystemExit/GeneratorExit.
            continue
    click.echo("Search has failed.")
    return None, None
예제 #23
0
def test_download_saves_the_files(cleanup_test_directories,
                                  three_webpages_uri):
    """Downloading the three test pages saves their two images to disk."""
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    comic.download()

    _, _, saved_files = next(os.walk("test"))
    assert len(saved_files) == 2