Example #1
    def test_download_image(self):
        reset_directory(IMAGE_DIRECTORY)
        url = URLS[0]

        download_image(url, IMAGE_DIRECTORY)

        file_name = escape_image_name(url)
        assert file_name in os.listdir(IMAGE_DIRECTORY)
        assert os.path.getsize(os.path.join(IMAGE_DIRECTORY, file_name)) > 0
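Each test begins by calling a reset_directory helper that is not shown in these snippets; a minimal sketch of what such a helper might look like (its name and behaviour are inferred from the tests, not taken from the library):

import os
import shutil


def reset_directory(directory: str):
    # Hypothetical helper: wipe the directory if it exists, then recreate it
    # empty so every test starts from a clean state.
    if os.path.isdir(directory):
        shutil.rmtree(directory)
    os.mkdir(directory)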
Example #2
    def test_download_image_with_timeout_exceed_time(self):
        reset_directory(IMAGE_DIRECTORY)
        timeout = 0
        url = URLS[0]

        successful = download_image_with_timeout(url, IMAGE_DIRECTORY, timeout)

        assert not successful
        file_name = escape_image_name(url)
        assert file_name not in os.listdir(IMAGE_DIRECTORY)
Example #3
    def test_download_image_with_timeout_within_time(self):
        reset_directory(IMAGE_DIRECTORY)
        timeout = 5
        url = URLS[0]

        successful = download_image_with_timeout(url, IMAGE_DIRECTORY, timeout)

        assert successful
        file_name = escape_image_name(url)
        assert file_name in os.listdir(IMAGE_DIRECTORY)
        assert os.path.getsize(os.path.join(IMAGE_DIRECTORY, file_name)) > 0
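Examples #2 and #3 exercise download_image_with_timeout, whose body is not listed here. From the assertions it returns a boolean and leaves no file behind when the download does not finish in time. One plausible sketch consistent with that behaviour (an assumption, not the library's actual implementation):

import os
from urllib.request import urlopen


def download_image_with_timeout(url: str, output_directory: str, timeout: float) -> bool:
    # Sketch: fetch the whole response within `timeout` seconds and only then
    # write the file, so a timed-out or failed download leaves nothing on disk.
    try:
        with urlopen(url, timeout=timeout) as response:
            data = response.read()
    except Exception:
        return False
    file_name = escape_image_name(url)  # module helper exercised in Examples #7 and #8
    with open(os.path.join(output_directory, file_name), "wb") as output_file:
        output_file.write(data)
    return True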
Example #4
def download_image(url: str, output_directory: str):
    """
    Downloads an image from the given URL.

    Parameters:
    url (str): URL to download the image from
    output_directory (str): directory to save the image to
    """
    image_name = escape_image_name(url)
    data = download_url(url)
    with open(os.path.join(output_directory, image_name), "wb") as output_file:
        output_file.write(data)
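download_image delegates the actual network fetch to a download_url helper that is not included in this listing; assuming it simply performs an HTTP GET and returns the response body as bytes, a minimal stand-in could be:

from urllib.request import urlopen


def download_url(url: str) -> bytes:
    # Hypothetical stand-in: read the full response body and return it as bytes.
    with urlopen(url) as response:
        return response.read()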
Example #5
    def test_multi_thread_image_download(self):
        reset_directory(IMAGE_DIRECTORY)

        total_images = multi_thread_image_download(
            URL_FILE,
            IMAGE_DIRECTORY,
            max_fetching_threads=2,
            download_timeout=5,
            verbose=False,
        )
        assert total_images == len(URLS)
        for url in URLS:
            file_name = escape_image_name(url)
            assert file_name in os.listdir(IMAGE_DIRECTORY)
            assert os.path.getsize(os.path.join(IMAGE_DIRECTORY,
                                                file_name)) > 0
Example #6
    def test_multi_thread_image_download_list(self):
        reset_directory(IMAGE_DIRECTORY)

        total_images = multi_thread_image_download(
            [
                "https://benaandrew.github.io/images/sentiment.jpg",
                "https://benaandrew.github.io/images/dog.jpg"
            ],
            IMAGE_DIRECTORY,
            max_fetching_threads=2,
            download_timeout=5,
            verbose=False,
        )
        assert total_images == len(URLS)
        for url in URLS:
            file_name = escape_image_name(url)
            assert file_name in os.listdir(IMAGE_DIRECTORY)
            assert os.path.getsize(os.path.join(IMAGE_DIRECTORY,
                                                file_name)) > 0
Example #7
    def test_escape_image_name_https(self):
        url = "https://www.fakewebsite.com/sample.jpg"
        result = escape_image_name(url)
        assert result == "wwwfakewebsitecomsample.jpg"
Example #8
    def test_escape_image_name_non_alphanumeric_characters(self):
        url = "https://www.fakewebsite!.com/sample$_.jpg"
        result = escape_image_name(url)
        assert result == "wwwfakewebsitecomsample.jpg"
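Examples #7 and #8 together pin down escape_image_name fairly tightly: the scheme is dropped, every non-alphanumeric character in the name is removed, and the final file extension is preserved. A sketch that satisfies both tests (not necessarily the library's real implementation):

import re


def escape_image_name(url: str) -> str:
    # Drop the scheme ("http://", "https://", ...) if present.
    without_scheme = re.sub(r"^[a-z]+://", "", url)
    # Split off the extension at the last dot so it survives the filtering.
    stem, _, extension = without_scheme.rpartition(".")
    # Keep only alphanumeric characters in the remaining name.
    stem = re.sub(r"[^0-9a-zA-Z]", "", stem)
    return f"{stem}.{extension}"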
def multi_thread_image_download(
    urls,
    output_directory: str,
    max_fetching_threads=None,
    download_timeout=5,
    verbose=True,
):
    """
    Downloads list of images using multiple threads.

    Parameters:
    urls (list/str): list of URLs or path to a text file containing a list of URLs
    output_directory (str): destination directory path
    max_fetching_threads (int): maximum number of concurrent image download threads (default is cores * 5)
    download_timeout (int): maximum wait time in seconds for each image download (default is 5)
    verbose (bool): show a tqdm progress bar (default is True)

    Returns:
    int: total number of files in the output directory
    """
    # If urls is not a list, it must be a path to a text file containing a list of URLs
    if isinstance(urls, str):
        with open(urls, "r") as url_file:
            urls = set(url_file.read().splitlines())

    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)
    else:
        # Exclude existing images
        urls = [
            url for url in urls
            if escape_image_name(url) not in os.listdir(output_directory)
        ]

    # Build concurrent thread pool with max_image_fetching_threads
    with ThreadPoolExecutor(max_fetching_threads) as pool:
        if download_timeout is not None:
            futures = [
                pool.submit(
                    download_image_with_timeout,
                    url,
                    output_directory,
                    download_timeout,
                ) for url in urls
            ]
        else:
            futures = [
                pool.submit(
                    download_image,
                    url,
                    output_directory,
                ) for url in urls
            ]

        if verbose:
            for _ in tqdm(as_completed(futures)):
                pass
        else:
            wait(futures)

    return len(os.listdir(output_directory))
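Passing download_timeout=None routes every URL through plain download_image instead of the timeout wrapper. A short usage sketch (the file name and output directory below are placeholders, not paths from the examples above):

total = multi_thread_image_download(
    "urls.txt",              # placeholder: text file with one URL per line
    "images",                # placeholder output directory
    max_fetching_threads=4,
    download_timeout=None,   # skip the per-image timeout wrapper
    verbose=True,            # show the tqdm progress bar
)
print(f"{total} files now in the output directory")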