def test_google_images_query_chrome() -> None:
    """Run a Google Images query with the Chrome browser and check the output.

    Asserts that more than 95% of the requested items are returned, that every
    returned metadata record has ``"test-key" == "test-value"`` (presumably
    supplied by ``test-metadata.json`` -- confirm against that file), and that
    more than 95% of the requested items exist in the output directory.
    """
    metadata_path = Path(__file__).parent.joinpath("test-metadata.json")
    max_items = 30

    # Hold on to the TemporaryDirectory for the whole test. An unreferenced
    # instance is removed whenever its finalizer happens to run, and anything
    # written to that path afterwards is never cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Hand qloader a child that does not exist yet, so it still has to
        # create its own output directory.
        output_path = Path(tmp_dir).joinpath("images")

        images_metadata = qloader.run(
            endpoint="google-images",
            query_terms="cute dog",
            output_path=output_path,
            metadata=metadata_path,
            max_items=max_items,
            language="en",
            browser="Chrome",
        )

        # assert 95% fill rate
        assert (len(images_metadata) / max_items) > 0.95

        # assert metadata is propagated
        for image_metadata in images_metadata:
            assert image_metadata["test-key"] == "test-value"

        # assert images are on disk
        assert (len(list(output_path.iterdir())) / max_items) > 0.95
def run_image_capture(
    self,
    max_items: int = 100,
    extra_query_params: Optional[Dict[str, str]] = None,
    include_related: bool = False,
    overwrite: bool = False,
) -> None:
    """Gather images from Google Images into ``self._local_raw_images_path``.

    Sets the attribute ``self.raw_images_metadata`` to the value returned by
    ``qloader.run``. The download is skipped when the raw image directory
    already holds at least 90% of ``self.number_of_images`` entries.

    Args:
        max_items: Forwarded to ``qloader.run`` as ``max_items``.
        extra_query_params: Forwarded to ``qloader.run`` unchanged.
        include_related: Forwarded to ``qloader.run`` as ``track_related``.
            When true, entries found in the ``related`` subdirectory are moved
            up into the raw image directory after the download.
        overwrite: When true, delete everything already in the raw image
            directory before deciding whether to download.
    """
    raw_dir = self._local_raw_images_path

    # check if there are raw images available already
    try:
        raw_images_available = sum(1 for _ in raw_dir.iterdir())
    except FileNotFoundError:
        raw_images_available = 0

    if raw_images_available > 0 and overwrite:
        for entry in raw_dir.iterdir():
            if entry.is_file():
                entry.unlink()
            else:
                shutil.rmtree(entry)
        raw_images_available = 0

    # allow a small failure rate, as a small percentage of downloads will fail
    # NOTE(review): the threshold uses self.number_of_images while the download
    # below is sized by max_items -- confirm the two are meant to differ.
    if raw_images_available >= 0.90 * self.number_of_images:
        self.log.info(f"{raw_images_available} raw images already downloaded")
        if self.raw_image_urls is None:
            self.log.debug(
                "raw images are present on disk, but no URLs are known. "
                "Perhaps there is a saved object to load urls from"
            )
        return

    self.raw_images_metadata = qloader.run(
        endpoint="google-images",
        query_terms=self.label,
        output_path=raw_dir,
        max_items=max_items,
        metadata=self.metadata,
        language=self.metadata["language"],
        browser=self.metadata["browser"],
        driver_path=self.metadata["driver_path"],
        extra_query_params=extra_query_params,
        track_related=include_related,
    )

    if not include_related:
        return

    # move related images to the same folder as primary results
    related_img_dir = raw_dir.joinpath("related")
    if not related_img_dir.is_dir():
        # Nothing to flatten; iterating a missing directory would raise.
        self.log.debug("no related image directory found, nothing to flatten")
        return

    self.log.info("flattening related images to main image directory")
    # Snapshot the listing first because entries are moved out while looping.
    for related_img_path in list(related_img_dir.iterdir()):
        related_img_path.rename(raw_dir.joinpath(related_img_path.name))
    related_img_dir.rmdir()
def main(args: argparse.Namespace) -> None:
    """Run a Google Images query from parsed arguments and write a manifest.

    Reads ``query``, ``output_path``, ``max_items``, ``language``, ``browser``
    and ``track_related`` from ``args``, passes them to ``qloader.run``, then
    writes the ``data`` attribute of every returned item as indented JSON to
    ``manifest.json`` inside ``args.output_path`` and prints that path.
    """
    query_options = {
        "endpoint": "google-images",
        "query_terms": args.query,
        "output_path": args.output_path,
        "metadata": None,
        "max_items": args.max_items,
        "language": args.language,
        "browser": args.browser,
        "track_related": args.track_related,
    }
    results = qloader.run(**query_options)

    manifest_output = args.output_path.joinpath("manifest.json")
    manifest_entries = [item.data for item in results]
    manifest_json = json.dumps(manifest_entries, indent=2)
    manifest_output.write_text(manifest_json)

    print(f"wrote {manifest_output}")
def test_google_images_track_related() -> None:
    """Smoke-test a Google Images query with ``track_related=True``.

    The test makes no assertions on the result; it passes when
    ``qloader.run`` completes without raising.
    """
    metadata_path = Path(__file__).parent.joinpath("test-metadata.json")
    max_items = 5

    # Hold on to the TemporaryDirectory for the whole test. An unreferenced
    # instance is removed whenever its finalizer happens to run, and anything
    # written to that path afterwards is never cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Hand qloader a child that does not exist yet, so it still has to
        # create its own output directory.
        output_path = Path(tmp_dir).joinpath("images")

        qloader.run(
            endpoint="google-images",
            query_terms="poodle",
            output_path=output_path,
            metadata=metadata_path,
            max_items=max_items,
            language="en",
            browser="Chrome",
            extra_query_params={"cr": "countryCA"},
            track_related=True,
        )
def test_google_images_region_specific_query() -> None:
    """Run a Google Images query with ``language="fr"`` and ``cr=countryFR``.

    Asserts that more than 95% of the requested items exist in the output
    directory once ``qloader.run`` returns.
    """
    metadata_path = Path(__file__).parent.joinpath("test-metadata.json")
    max_items = 20

    # Hold on to the TemporaryDirectory for the whole test. An unreferenced
    # instance is removed whenever its finalizer happens to run, and anything
    # written to that path afterwards is never cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Hand qloader a child that does not exist yet, so it still has to
        # create its own output directory.
        output_path = Path(tmp_dir).joinpath("images")

        qloader.run(
            endpoint="google-images",
            query_terms="vieux chien",
            output_path=output_path,
            metadata=metadata_path,
            max_items=max_items,
            language="fr",
            browser="Chrome",
            extra_query_params={"cr": "countryFR"},
        )

        # assert images are on disk
        assert (len(list(output_path.iterdir())) / max_items) > 0.95