Exemplo n.º 1
0
def test_google_images_query_chrome() -> None:
    """Query Google Images through Chrome and check the downloaded results.

    Checks three things: the number of returned metadata entries is close
    to ``max_items``, every entry carries the ``"test-key"`` value, and
    the output directory holds a comparable number of entries.
    """
    # presumably test-metadata.json defines "test-key": "test-value" --
    # verify against the fixture file
    metadata_path = Path(__file__).parent.joinpath("test-metadata.json")
    max_items = 30

    # Keep the TemporaryDirectory object alive for the whole test so that the
    # downloads are removed when the test finishes instead of being leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Hand qloader a child path that does not exist yet; the parent is
        # cleaned up (including everything below it) on exit.
        output_path = Path(tmp_dir) / "images"

        images_metadata = qloader.run(
            endpoint="google-images",
            query_terms="cute dog",
            output_path=output_path,
            metadata=metadata_path,
            max_items=max_items,
            language="en",
            browser="Chrome",
        )

        # assert 95% fill rate
        assert (len(images_metadata) / max_items) > 0.95

        # assert metadata is propagated
        for image_metadata in images_metadata:
            assert image_metadata["test-key"] == "test-value"

        # assert images are on disk
        images_on_disk = len(list(output_path.iterdir()))
        assert (images_on_disk / max_items) > 0.95
Exemplo n.º 2
0
    def run_image_capture(
        self,
        max_items: int = 100,
        extra_query_params: Optional[Dict[str, str]] = None,
        include_related: bool = False,
        overwrite: bool = False,
    ) -> None:
        """Gather raw images for `self.label` from Google Images.

        Sets `self.raw_images_metadata` to the value returned by
        `qloader.run`. The download is skipped when the raw images directory
        already holds at least 90% of `self.number_of_images` entries.

        Args:
            max_items: forwarded to `qloader.run` as `max_items`.
            extra_query_params: forwarded to `qloader.run` unchanged.
            include_related: forwarded to `qloader.run` as `track_related`;
                when true, the contents of the `related` subdirectory are
                moved up into the raw images directory afterwards.
            overwrite: when true, everything already in the raw images
                directory is deleted before the availability check.
        """
        raw_images_dir = self._local_raw_images_path

        # check whether raw images are already available on disk
        try:
            raw_images_available = len(list(raw_images_dir.iterdir()))
        except FileNotFoundError:
            raw_images_available = 0

        if raw_images_available > 0 and overwrite:
            for p in raw_images_dir.iterdir():
                if p.is_file():
                    p.unlink()
                else:
                    shutil.rmtree(p)
            raw_images_available = 0

        # allow a small failure rate, as a small percentage of downloads will fail
        # NOTE(review): the threshold uses self.number_of_images while the
        # download below is capped by max_items -- confirm they are meant to differ.
        if raw_images_available >= 0.90 * self.number_of_images:
            self.log.info(
                f"{raw_images_available} raw images already downloaded")
            if self.raw_image_urls is None:
                self.log.debug(
                    "raw images are present on disk, but no URLs are known. Perhaps there is a saved object to load urls from"
                )
            return

        self.raw_images_metadata = qloader.run(
            endpoint="google-images",
            query_terms=self.label,
            output_path=raw_images_dir,
            max_items=max_items,
            metadata=self.metadata,
            language=self.metadata["language"],
            browser=self.metadata["browser"],
            driver_path=self.metadata["driver_path"],
            extra_query_params=extra_query_params,
            track_related=include_related,
        )

        if include_related:
            # move related images to the same folder as primary results
            self.log.info("flattening related images to main image directory")
            related_img_dir = raw_images_dir.joinpath("related")
            if not related_img_dir.is_dir():
                # nothing was written under "related"; avoid failing on iterdir()
                self.log.debug(
                    "no related images directory found, nothing to flatten")
                return
            for related_img_path in related_img_dir.iterdir():
                related_img_path.rename(
                    raw_images_dir.joinpath(related_img_path.name))
            related_img_dir.rmdir()
Exemplo n.º 3
0
def main(args: argparse.Namespace) -> None:
    """Run a Google Images query from parsed arguments and write a manifest.

    Reads `query`, `output_path`, `max_items`, `language`, `browser` and
    `track_related` from `args`, passes them to `qloader.run`, and then
    writes the `data` attribute of every returned item as a JSON list to
    `manifest.json` inside `args.output_path`.
    """
    results = qloader.run(
        endpoint="google-images",
        query_terms=args.query,
        output_path=args.output_path,
        metadata=None,
        max_items=args.max_items,
        language=args.language,
        browser=args.browser,
        track_related=args.track_related,
    )

    manifest_output = args.output_path / "manifest.json"

    # one manifest entry per returned item
    entries = []
    for item in results:
        entries.append(item.data)

    serialized = json.dumps(entries, indent=2)
    manifest_output.write_text(serialized)
    print(f"wrote {manifest_output}")
Exemplo n.º 4
0
def test_google_images_track_related() -> None:
    """Smoke-test a Google Images query with `track_related` enabled.

    The test makes no assertions on the result; it passes as long as
    `qloader.run` completes without raising.
    """
    metadata_path = Path(__file__).parent.joinpath("test-metadata.json")
    max_items = 5

    # Keep the TemporaryDirectory object alive for the whole test so that the
    # downloads are removed when the test finishes instead of being leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Hand qloader a child path that does not exist yet; the parent is
        # cleaned up (including everything below it) on exit.
        output_path = Path(tmp_dir) / "images"

        qloader.run(
            endpoint="google-images",
            query_terms="poodle",
            output_path=output_path,
            metadata=metadata_path,
            max_items=max_items,
            language="en",
            browser="Chrome",
            extra_query_params={"cr": "countryCA"},
            track_related=True,
        )
Exemplo n.º 5
0
def test_google_images_region_specific_query() -> None:
    """Run a French-language query restricted with `cr=countryFR`.

    Checks that the output directory ends up holding enough entries
    relative to `max_items`.
    """
    metadata_path = Path(__file__).parent.joinpath("test-metadata.json")
    max_items = 20

    # Keep the TemporaryDirectory object alive for the whole test so that the
    # downloads are removed when the test finishes instead of being leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Hand qloader a child path that does not exist yet; the parent is
        # cleaned up (including everything below it) on exit.
        output_path = Path(tmp_dir) / "images"

        qloader.run(
            endpoint="google-images",
            query_terms="vieux chien",
            output_path=output_path,
            metadata=metadata_path,
            max_items=max_items,
            language="fr",
            browser="Chrome",
            extra_query_params={"cr": "countryFR"},
        )

        # assert images are on disk
        # NOTE(review): with max_items = 20 the strict "> 0.95" only passes
        # when all 20 entries are present (19 / 20 is exactly 0.95).
        images_on_disk = len(list(output_path.iterdir()))
        assert (images_on_disk / max_items) > 0.95