def is_image_file_or_url(path_or_url: str) -> bool:
    """Checks (via file extension) whether a file path or URL is an image.

    If path_or_url is a URL, strip away any query strings '?...'. This should
    have no adverse effect on local paths.
    """
    stripped_path_or_url = urllib.parse.urlparse(path_or_url).path
    return path_utils.is_image_file(stripped_path_or_url)
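# Illustrative sketch (not part of the original module): shows how the SAS
# query string of a blob URL is ignored by is_image_file_or_url(), because
# urlparse() keeps only the path component. The URL and local path below are
# made up.
def _example_is_image_file_or_url() -> None:
    url = ('https://myaccount.blob.core.windows.net/photos/'
           'IMG_0001.JPG?sv=2020-08-04&sig=abc')
    assert is_image_file_or_url(url)  # query string is stripped before the check
    assert not is_image_file_or_url('/datadrive/camera_traps/metadata.csv')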
def remove_non_images(js: MutableMapping[str, Dict[str, Any]],
                      log: MutableMapping[str, Any]) -> None:
    """Remove images with non-image file extensions.

    Modifies [js] and [log] in-place.

    Args:
        js: dict, img_path => info dict
        log: dict, maps str description to log info
    """
    print('Removing images with invalid image file extensions...')
    nonimg_paths = [k for k in js.keys() if not path_utils.is_image_file(k)]
    for img_path in nonimg_paths:
        del js[img_path]
    print(f'Removed {len(nonimg_paths)} files with non-image extensions.')
    if len(nonimg_paths) > 0:
        log['nonimage_files'] = sorted(nonimg_paths)
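# Illustrative sketch (not part of the original module): remove_non_images()
# mutates both dicts in place, dropping keys whose extensions are not image
# extensions and recording them in the log. The keys below are hypothetical.
def _example_remove_non_images() -> None:
    js: Dict[str, Dict[str, Any]] = {
        'cam01/IMG_0001.JPG': {},
        'cam01/readme.txt': {}
    }
    log: Dict[str, Any] = {}
    remove_non_images(js, log)
    assert list(js.keys()) == ['cam01/IMG_0001.JPG']
    assert log['nonimage_files'] == ['cam01/readme.txt']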
    json_filename = f'{base_task_name}_{clean_folder_name}_all.json'
    list_file = os.path.join(filename_base, json_filename)

    # If this is intended to be a folder, it needs to end in '/', otherwise
    # files that start with the same string will match too
    folder_name = folder_name.replace('\\', '/')
    if len(folder_name) > 0 and (not folder_name.endswith('/')):
        folder_name = folder_name + '/'
    prefix = container_prefix + folder_name

    file_list = ai4e_azure_utils.enumerate_blobs_to_file(
        output_file=list_file,
        account_name=storage_account_name,
        container_name=container_name,
        sas_token=read_only_sas_token,
        blob_prefix=prefix)
    assert all(path_utils.is_image_file(s) for s in file_list)
    file_lists_by_folder.append(list_file)

assert len(file_lists_by_folder) == len(folder_names)


#%% Divide images into chunks for each folder

# The JSON file at folder_chunks[i][j] corresponds to task j of taskgroup i
folder_chunks = []

# list_file = file_lists_by_folder[0]
for list_file in file_lists_by_folder:

    chunked_files, chunks = prepare_api_submission.divide_files_into_tasks(
        list_file)

    print('Divided images into files:')
    for i_fn, fn in enumerate(chunked_files):
def submit_batch_detection_api(images_to_detect: Iterable[str],
                               task_lists_dir: str,
                               detector_version: str,
                               account: str,
                               container: str,
                               sas_token: str,
                               caller: str,
                               batch_detection_api_url: str,
                               resume_file_path: str
                               ) -> Dict[str, List[Task]]:
    """
    Args:
        images_to_detect: list of str, image paths with the format
            <dataset-name>/<image-filename>
        task_lists_dir: str, path to local directory for saving JSON files,
            each containing a list of image URLs corresponding to an API task
        detector_version: str, MegaDetector version string, e.g., '4.1',
            see {batch_detection_api_url}/supported_model_versions
        account: str, Azure Storage account name
        container: str, Azure Blob Storage container name, where the task
            lists will be uploaded
        sas_token: str, SAS token with write permissions for the container
        caller: str, allow-listed caller
        batch_detection_api_url: str, URL to batch detection API
        resume_file_path: str, path to save resume file

    Returns: dict, maps str dataset name to list of Task objects
    """
    filtered_images_to_detect = [
        x for x in images_to_detect if path_utils.is_image_file(x)]
    not_images = set(images_to_detect) - set(filtered_images_to_detect)
    if len(not_images) == 0:
        print('Good! All image files have valid file extensions.')
    else:
        print(f'Skipping {len(not_images)} files with non-image extensions:')
        pprint.pprint(sorted(not_images))
    images_to_detect = filtered_images_to_detect

    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
    images_by_dataset = split_images_list_by_dataset(images_to_detect)

    tasks_by_dataset = {}
    for dataset, image_paths in images_by_dataset.items():

        # get SAS URL for images container
        images_sas_token = datasets_table[dataset]['container_sas_key']
        if images_sas_token[0] == '?':
            images_sas_token = images_sas_token[1:]
        images_container_url = sas_blob_utils.build_azure_storage_uri(
            account=datasets_table[dataset]['storage_account'],
            container=datasets_table[dataset]['container'],
            sas_token=images_sas_token)

        # strip image paths of dataset name
        image_blob_names = [path[path.find('/') + 1:] for path in image_paths]

        tasks_by_dataset[dataset] = submit_batch_detection_api_by_dataset(
            dataset=dataset,
            image_blob_names=image_blob_names,
            images_container_url=images_container_url,
            task_lists_dir=task_lists_dir,
            detector_version=detector_version,
            account=account,
            container=container,
            sas_token=sas_token,
            caller=caller,
            batch_detection_api_url=batch_detection_api_url)

    # save list of dataset names and task IDs for resuming
    resume_json = [
        {
            'dataset': dataset,
            'task_name': task.name,
            'task_id': task.id,
            'local_images_list_path': task.local_images_list_path
        }
        for dataset in tasks_by_dataset
        for task in tasks_by_dataset[dataset]
    ]
    with open(resume_file_path, 'w') as f:
        json.dump(resume_json, f, indent=1)

    return tasks_by_dataset
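# Illustrative sketch (not part of the original module): a hedged example of
# how submit_batch_detection_api() might be invoked. Every account, container,
# token, caller, and URL value below is a placeholder; a real call contacts
# MegaDB and the batch detection API, so this function is defined but not run.
def _example_submit_batch_detection_api() -> None:
    tasks_by_dataset = submit_batch_detection_api(
        images_to_detect=['dataset_a/cam01/IMG_0001.JPG',
                          'dataset_a/cam01/IMG_0002.JPG'],
        task_lists_dir='run1/task_lists',
        detector_version='4.1',
        account='myaccount',                 # placeholder storage account
        container='batch-api-task-lists',    # placeholder container
        sas_token='sv=2020-08-04&sig=abc',   # placeholder, no leading '?'
        caller='example_caller',             # placeholder allow-listed caller
        batch_detection_api_url='http://example.com/v3/camera-trap/detection-batch',
        resume_file_path='run1/resume.json')

    # one Task list per dataset name
    for dataset, tasks in tasks_by_dataset.items():
        print(dataset, [task.id for task in tasks])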
def check_image_condition(img_path: str,
                          truncated_images_lock: threading.Lock,
                          account: Optional[str] = None,
                          container: Optional[str] = None,
                          sas_token: Optional[str] = None,
                          datasets_table: Optional[Mapping[str, Any]] = None
                          ) -> Tuple[str, str]:
    """
    Args:
        img_path: str, either <blob_name> if datasets_table is None, or
            <dataset>/<blob_name> if datasets_table is given
        truncated_images_lock: threading.Lock, lock shared across threads to
            guard the global ImageFile.LOAD_TRUNCATED_IMAGES flag
        account: str, name of Azure Blob Storage account
        container: str, name of Azure Blob Storage container
        sas_token: str, optional SAS token (without leading '?') if the
            container is not publicly accessible
        datasets_table: dict, maps dataset name to dict of information

    Returns: (img_file, status) tuple, where status is one of
        'nonexistant': blob does not exist in the container
        'non_image': img_file does not have valid file extension
        'good': image exists and is able to be opened without setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'truncated': image exists but can only be opened by setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'bad': image exists, but cannot be opened even when setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
    """
    if (account is None) or (container is None) or (datasets_table is not None):
        assert account is None
        assert container is None
        assert sas_token is None
        assert datasets_table is not None

        dataset, img_file = img_path.split('/', maxsplit=1)
        account = datasets_table[dataset]['storage_account']
        container = datasets_table[dataset]['container']
        sas_token = datasets_table[dataset]['container_sas_key']
        if sas_token[0] == '?':  # strip leading '?' from SAS token
            sas_token = sas_token[1:]
    else:
        img_file = img_path

    if not path_utils.is_image_file(img_file):
        return img_file, 'non_image'

    blob_url = sas_blob_utils.build_azure_storage_uri(
        account=account, container=container, sas_token=sas_token,
        blob=img_file)
    blob_exists = sas_blob_utils.check_blob_exists(blob_url)
    if not blob_exists:
        return img_file, 'nonexistant'

    stream, _ = sas_blob_utils.download_blob_to_stream(blob_url)
    stream.seek(0)
    try:
        with truncated_images_lock:
            ImageFile.LOAD_TRUNCATED_IMAGES = False
            with Image.open(stream) as img:
                img.load()
        return img_file, 'good'
    except OSError as e:  # PIL.UnidentifiedImageError is a subclass of OSError
        try:
            stream.seek(0)
            with truncated_images_lock:
                ImageFile.LOAD_TRUNCATED_IMAGES = True
                with Image.open(stream) as img:
                    img.load()
            return img_file, 'truncated'
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'Unable to load {img_file}. {exception_type}: {e}.')
            return img_file, 'bad'
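# Illustrative sketch (not part of the original module): check_image_condition()
# toggles the process-wide ImageFile.LOAD_TRUNCATED_IMAGES flag, so callers
# running it from multiple threads should share one lock. The dataset/path
# values are placeholders, and the datasets_table lookup assumes the same
# megadb_utils helper used by submit_batch_detection_api() above; a real call
# downloads blobs, so this function is defined but not run.
def _example_check_image_condition() -> None:
    from concurrent.futures import ThreadPoolExecutor

    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
    img_paths = ['dataset_a/cam01/IMG_0001.JPG',
                 'dataset_a/cam01/IMG_0002.JPG']
    truncated_images_lock = threading.Lock()  # shared by all worker threads

    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [
            pool.submit(check_image_condition, img_path, truncated_images_lock,
                        datasets_table=datasets_table)
            for img_path in img_paths]
        statuses = dict(f.result() for f in futures)

    print(statuses)  # e.g., {'cam01/IMG_0001.JPG': 'good', ...}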