#%% Example 1

import urllib.parse

# path_utils is assumed to be a repo-local helper module providing
# is_image_file(); its import is not shown here.

def is_image_file_or_url(path_or_url: str) -> bool:
    """
    Checks (via file extension) whether a file path or URL is an image.

    If path_or_url is a URL, strip away any query strings '?...'. This should
    have no adverse effect on local paths.
    """
    stripped_path_or_url = urllib.parse.urlparse(path_or_url).path
    return path_utils.is_image_file(stripped_path_or_url)
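
# A quick sanity-check sketch (assumes path_utils.is_image_file recognizes
# standard extensions such as '.jpg'):
#
#   is_image_file_or_url('https://example.com/cat.jpg?sig=abc')  # True --
#       urlparse().path keeps '/cat.jpg' and drops the query string
#   is_image_file_or_url('local/dir/cat.jpg')     # True
#   is_image_file_or_url('local/dir/notes.txt')   # False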
#%% Example 2

import json
import os
import pprint
import threading
from typing import (Any, Dict, Iterable, List, Mapping, MutableMapping,
                    Optional, Tuple)

from PIL import Image, ImageFile
from tqdm import tqdm

# Repo-local helpers (ai4e_azure_utils, megadb_utils, path_utils,
# prepare_api_submission, sas_blob_utils, and the Task class) are assumed
# to be imported from the surrounding project.

def remove_non_images(js: MutableMapping[str, Dict[str, Any]],
                      log: MutableMapping[str, Any]) -> None:
    """Remove images with non-image file extensions. Modifies [js] and [log]
    in-place.

    Args:
        js: dict, img_path => info dict
        log: dict, maps str description to log info
    """
    print('Removing images with invalid image file extensions...')
    nonimg_paths = [k for k in js.keys() if not path_utils.is_image_file(k)]
    for img_path in nonimg_paths:
        del js[img_path]
    print(f'Removed {len(nonimg_paths)} files with non-image extensions.')
    if len(nonimg_paths) > 0:
        log['nonimage_files'] = sorted(nonimg_paths)
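
# A minimal usage sketch with hypothetical data:
#
#   js = {'cam1/001.jpg': {'size': 123}, 'cam1/notes.txt': {'size': 4}}
#   log = {}
#   remove_non_images(js, log)
#   # js now holds only 'cam1/001.jpg', and
#   # log['nonimage_files'] == ['cam1/notes.txt']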
#%% Enumerate blobs to a file list for each folder

file_lists_by_folder = []

for folder_name in folder_names:

    # Filesystem-safe version of folder_name for the output filename
    # (assumption: the original computed this from folder_name)
    clean_folder_name = folder_name.replace('\\', '/').strip('/').replace(
        '/', '_')
    json_filename = f'{base_task_name}_{clean_folder_name}_all.json'
    list_file = os.path.join(filename_base, json_filename)

    # If this is intended to be a folder, it needs to end in '/', otherwise
    # files that start with the same string will match too
    folder_name = folder_name.replace('\\', '/')
    if len(folder_name) > 0 and (not folder_name.endswith('/')):
        folder_name = folder_name + '/'
    prefix = container_prefix + folder_name
    file_list = ai4e_azure_utils.enumerate_blobs_to_file(
        output_file=list_file,
        account_name=storage_account_name,
        container_name=container_name,
        sas_token=read_only_sas_token,
        blob_prefix=prefix)
    assert all(path_utils.is_image_file(s) for s in file_list)
    file_lists_by_folder.append(list_file)

assert len(file_lists_by_folder) == len(folder_names)
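
# Why the trailing '/' matters (hypothetical blob names): with prefix
# 'images/cam1', blobs under 'images/cam10/' would also be enumerated;
# with 'images/cam1/', only blobs actually inside that folder match.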

#%% Divide images into chunks for each folder

# The JSON file at folder_chunks[i][j] corresponds to task j of taskgroup i
folder_chunks = []

# list_file = file_lists_by_folder[0]
for list_file in file_lists_by_folder:
    chunked_files, chunks = prepare_api_submission.divide_files_into_tasks(
        list_file)
    print('Divided images into files:')
    for i_fn, fn in enumerate(chunked_files):
        print(f'{fn}: {len(chunks[i_fn])} images')
    folder_chunks.append(chunked_files)
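
# divide_files_into_tasks is assumed to split the image list in list_file
# into fixed-size chunks, writing one JSON file per chunk and returning
# (chunk_file_paths, chunks), where chunks[i] lists the images in chunk i;
# folder_chunks[i][j] is then the local JSON path for task j of task group i.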
def submit_batch_detection_api(images_to_detect: Iterable[str],
                               task_lists_dir: str,
                               detector_version: str,
                               account: str,
                               container: str,
                               sas_token: str,
                               caller: str,
                               batch_detection_api_url: str,
                               resume_file_path: str
                               ) -> Dict[str, List[Task]]:
    """
    Args:
        images_to_detect: list of str, image paths with the format
            <dataset-name>/<image-filename>
        task_lists_dir: str, path to local directory for saving JSON files
            each containing a list of image URLs corresponding to an API task
        detector_version: str, MegaDetector version string, e.g., '4.1',
            see {batch_detection_api_url}/supported_model_versions
        account: str, Azure Storage account name
        container: str, Azure Blob Storage container name, where the task lists
            will be uploaded
        sas_token: str, SAS token with write permissions for the container
        caller: str, allow-listed caller
        batch_detection_api_url: str, URL to batch detection API
        resume_file_path: str, path to save resume file

    Returns: dict, maps str dataset name to list of Task objects
    """
    filtered_images_to_detect = [
        x for x in images_to_detect if path_utils.is_image_file(x)]
    not_images = set(images_to_detect) - set(filtered_images_to_detect)
    if len(not_images) == 0:
        print('Good! All image files have valid file extensions.')
    else:
        print(f'Skipping {len(not_images)} files with non-image extensions:')
        pprint.pprint(sorted(not_images))
    images_to_detect = filtered_images_to_detect

    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()

    images_by_dataset = split_images_list_by_dataset(images_to_detect)
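    # e.g. (hypothetical paths): ['caltech/a.jpg', 'idaho/b.jpg'] maps to
    # {'caltech': ['caltech/a.jpg'], 'idaho': ['idaho/b.jpg']}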
    tasks_by_dataset = {}
    for dataset, image_paths in images_by_dataset.items():
        # get SAS URL for images container
        images_sas_token = datasets_table[dataset]['container_sas_key']
        if images_sas_token.startswith('?'):  # strip leading '?' if present
            images_sas_token = images_sas_token[1:]
        images_container_url = sas_blob_utils.build_azure_storage_uri(
            account=datasets_table[dataset]['storage_account'],
            container=datasets_table[dataset]['container'],
            sas_token=images_sas_token)

        # strip image paths of dataset name
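        # e.g. 'caltech/2020/img001.jpg' -> '2020/img001.jpg' (hypothetical path)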
        image_blob_names = [path[path.find('/') + 1:] for path in image_paths]

        tasks_by_dataset[dataset] = submit_batch_detection_api_by_dataset(
            dataset=dataset,
            image_blob_names=image_blob_names,
            images_container_url=images_container_url,
            task_lists_dir=task_lists_dir,
            detector_version=detector_version,
            account=account, container=container, sas_token=sas_token,
            caller=caller, batch_detection_api_url=batch_detection_api_url)

    # save list of dataset names and task IDs for resuming
    resume_json = [
        {
            'dataset': dataset,
            'task_name': task.name,
            'task_id': task.id,
            'local_images_list_path': task.local_images_list_path
        }
        for dataset in tasks_by_dataset
        for task in tasks_by_dataset[dataset]
    ]
    with open(resume_file_path, 'w') as f:
        json.dump(resume_json, f, indent=1)
    return tasks_by_dataset
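
# The resume file written above is a flat JSON list, one record per task,
# e.g. (hypothetical values):
#   [{"dataset": "caltech", "task_name": "caltech_chunk000", "task_id": "1234",
#     "local_images_list_path": "/tmp/caltech_chunk000.json"}]
# so an interrupted run can rebuild tasks_by_dataset without resubmitting.
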
def check_image_condition(img_path: str,
                          truncated_images_lock: threading.Lock,
                          account: Optional[str] = None,
                          container: Optional[str] = None,
                          sas_token: Optional[str] = None,
                          datasets_table: Optional[Mapping[str, Any]] = None
                          ) -> Tuple[str, str]:
    """
    Args:
        img_path: str, either <blob_name> if datasets_table is None, or
            <dataset>/<blob_name> if datasets_table is given
        truncated_images_lock: threading.Lock, serializes changes to the
            process-global ImageFile.LOAD_TRUNCATED_IMAGES flag
        account: str, name of Azure Blob Storage account
        container: str, name of Azure Blob Storage container
        sas_token: str, optional SAS token (without leading '?') if the
            container is not publicly accessible
        datasets_table: dict, maps dataset name to dict of information

    Returns: (img_file, status) tuple, where status is one of
        'nonexistant': blob does not exist in the container
        'non_image': img_file does not have valid file extension
        'good': image exists and is able to be opened without setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'truncated': image exists but can only be opened by setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'bad': image exists, but cannot be opened even when setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
    """
    if (account is None) or (container is None) or (datasets_table is not None):
        assert account is None
        assert container is None
        assert sas_token is None
        assert datasets_table is not None

        dataset, img_file = img_path.split('/', maxsplit=1)
        account = datasets_table[dataset]['storage_account']
        container = datasets_table[dataset]['container']
        sas_token = datasets_table[dataset]['container_sas_key']
        if sas_token.startswith('?'):  # strip leading '?' from SAS token
            sas_token = sas_token[1:]
    else:
        img_file = img_path

    if not path_utils.is_image_file(img_file):
        return img_file, 'non_image'

    blob_url = sas_blob_utils.build_azure_storage_uri(
        account=account, container=container, sas_token=sas_token,
        blob=img_file)
    blob_exists = sas_blob_utils.check_blob_exists(blob_url)
    if not blob_exists:
        return img_file, 'nonexistant'

    stream, _ = sas_blob_utils.download_blob_to_stream(blob_url)
    stream.seek(0)
    try:
        with truncated_images_lock:
            ImageFile.LOAD_TRUNCATED_IMAGES = False
            with Image.open(stream) as img:
                img.load()
        return img_file, 'good'
    except OSError:  # PIL.UnidentifiedImageError is a subclass of OSError
        try:
            stream.seek(0)
            with truncated_images_lock:
                ImageFile.LOAD_TRUNCATED_IMAGES = True
                with Image.open(stream) as img:
                    img.load()
            return img_file, 'truncated'
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'Unable to load {img_file}. {exception_type}: {e}.')
            return img_file, 'bad'
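
# A minimal sketch of running check_image_condition across a thread pool.
# The shared lock serializes writes to the process-global
# ImageFile.LOAD_TRUNCATED_IMAGES flag; check_all() and max_workers=8 are
# assumptions, not part of the original code.
from concurrent.futures import ThreadPoolExecutor

def check_all(img_paths: Iterable[str],
              datasets_table: Mapping[str, Any]) -> Dict[str, str]:
    lock = threading.Lock()
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [
            pool.submit(check_image_condition, p, lock,
                        datasets_table=datasets_table)
            for p in img_paths]
        return dict(f.result() for f in futures)  # img_file -> status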