def construct_url(img_path: str, datasets_table: Mapping[str, Any],
                  dataset_name: Optional[str] = None) -> str:
    """Builds Azure SAS storage URL.

    Args:
        img_path: str, either <dataset_name>/<blob> (set dataset_name=None)
            or <img_file> without the 'path_prefix' from datasets_table
        datasets_table: dict, from MegaDB
        dataset_name: optional str

    Returns: str, URL with SAS token
    """
    if dataset_name is None:
        dataset_name, blob = img_path.split('/', maxsplit=1)
    else:
        blob = img_path

    path_prefix = datasets_table[dataset_name].get('path_prefix', '')
    if len(path_prefix) > 0:
        blob = path_prefix + '/' + blob

    sas_token = datasets_table[dataset_name]['container_sas_key']
    if sas_token[0] == '?':  # strip leading '?' from SAS token
        sas_token = sas_token[1:]

    url = build_azure_storage_uri(
        account=datasets_table[dataset_name]['storage_account'],
        container=datasets_table[dataset_name]['container'],
        blob=blob,
        sas_token=sas_token)
    return url

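# Usage sketch for construct_url. The dataset entry below is hypothetical; its
# keys mirror the fields the function reads ('storage_account', 'container',
# 'container_sas_key', optional 'path_prefix'), and all values are placeholders.
def _example_construct_url() -> None:
    datasets_table = {
        'example_dataset': {
            'storage_account': 'exampleaccount',
            'container': 'examplecontainer',
            'path_prefix': 'images',
            'container_sas_key': '?sv=2020-02-10&sig=FAKE',
        }
    }
    # dataset name parsed from the path: <dataset_name>/<blob>
    url1 = construct_url('example_dataset/cam01/0001.jpg', datasets_table)
    # dataset name given explicitly; img_path is the blob name without the prefix
    url2 = construct_url('cam01/0001.jpg', datasets_table,
                         dataset_name='example_dataset')
    print(url1, url2, sep='\n')
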
def get_image_sas_uris(img_paths: Iterable[str]) -> List[str]:
    """Converts image paths to Azure Blob Storage blob URIs with SAS tokens.

    Args:
        img_paths: list of str, <dataset-name>/<image-filename>

    Returns:
        image_sas_uris: list of str, image blob URIs with SAS tokens,
            ready to pass to the batch detection API
    """
    # we need the datasets table for getting SAS keys
    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()

    image_sas_uris = []
    for img_path in img_paths:
        dataset, img_file = img_path.split('/', maxsplit=1)

        # strip leading '?' from SAS token
        sas_token = datasets_table[dataset]['container_sas_key']
        if sas_token[0] == '?':
            sas_token = sas_token[1:]

        image_sas_uri = sas_blob_utils.build_azure_storage_uri(
            account=datasets_table[dataset]['storage_account'],
            container=datasets_table[dataset]['container'],
            blob=img_file,
            sas_token=sas_token)
        image_sas_uris.append(image_sas_uri)
    return image_sas_uris

def upload_file_to_blob(account_name: str,
                        container_name: str,
                        local_path: str,
                        blob_name: str,
                        sas_token: str) -> str:
    """Uploads a local file to Azure Blob Storage and returns the uploaded
    blob URI with SAS token."""
    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)
    with open(local_path, 'rb') as data:
        return sas_blob_utils.upload_blob(
            container_uri=container_uri, blob_name=blob_name, data=data)

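# Usage sketch for upload_file_to_blob; the account, container, SAS token, and
# paths are placeholders. The SAS token needs write permission on the container.
def _example_upload_file_to_blob() -> None:
    uploaded_uri = upload_file_to_blob(
        account_name='exampleaccount',
        container_name='examplecontainer',
        local_path='/tmp/task_list_000.json',
        blob_name='task_lists/task_list_000.json',
        sas_token='sv=2020-02-10&sig=FAKE')
    print(uploaded_uri)
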
def enumerate_blobs_to_file(
        output_file: Optional[str],
        account_name: str,
        container_name: str,
        sas_token: Optional[str] = None,
        blob_prefix: Optional[str] = None,
        blob_suffix: Optional[Union[str, Tuple[str]]] = None,
        rsearch: Optional[str] = None,
        limit: Optional[int] = None
        ) -> List[str]:
    """Enumerates blobs in a container, and writes the blob names to an output
    file.

    Args:
        output_file: optional str, path to save the list of files in the
            container. If it ends in '.json', writes a JSON string; otherwise
            writes a newline-delimited list. Can be None, in which case this is
            just a convenient wrapper for blob enumeration.
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token; a leading '?' will be
            removed if present
        blob_prefix: optional str, returned results will only contain blob
            names with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names will
            be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this regex. Can also be a list of regexes, in which case
            blobs matching *any* of the regexes will be returned.
        limit: optional int, maximum # of blob names to list;
            if None, then returns all blob names

    Returns:
        list of str, sorted blob names, of length limit or shorter
    """
    if sas_token is not None and len(sas_token) > 9 and sas_token[0] == '?':
        sas_token = sas_token[1:]

    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)

    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri, blob_prefix=blob_prefix,
        blob_suffix=blob_suffix, rsearch=rsearch, limit=limit)

    if output_file is not None:
        write_list_to_file(output_file, matched_blobs)

    return matched_blobs

def enumerate_blobs_to_file(output_file: str,
                            account_name: str,
                            container_name: str,
                            sas_token: Optional[str] = None,
                            blob_prefix: Optional[str] = None,
                            blob_suffix: Optional[Union[str, Tuple[str]]] = None,
                            rsearch: Optional[str] = None,
                            limit: Optional[int] = None) -> List[str]:
    """Enumerates blobs in a container, and writes the blob names to an output
    file.

    Args:
        output_file: str, path to save the list of files in the container.
            If it ends in '.json', writes a JSON string; otherwise writes a
            newline-delimited list.
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token, does not start with '?'
        blob_prefix: optional str, returned results will only contain blob
            names with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names will
            be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this Python regex pattern at any point in the blob name.
            Use the '^' character to only match from the beginning of the blob
            name.
        limit: optional int, maximum # of blob names to list;
            if None, then returns all blob names

    Returns:
        list of str, sorted blob names, of length limit or shorter
    """
    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)

    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri, blob_prefix=blob_prefix,
        blob_suffix=blob_suffix, rsearch=rsearch, limit=limit)

    write_list_to_file(output_file, matched_blobs)
    return matched_blobs

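# Usage sketch for enumerate_blobs_to_file: list all .jpg blobs under a prefix
# and write the names to a JSON file. Account, container, prefix, and SAS token
# below are placeholders.
def _example_enumerate_blobs_to_file() -> None:
    jpg_blobs = enumerate_blobs_to_file(
        output_file='blob_names.json',   # '.json' extension -> JSON output
        account_name='exampleaccount',
        container_name='examplecontainer',
        sas_token='sv=2020-02-10&sig=FAKE',
        blob_prefix='2020/site_a/',
        blob_suffix='.jpg')
    print(f'{len(jpg_blobs)} blobs found')
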
def construct_url(img_path: str, datasets_table: Mapping[str, Any],
                  dataset_name: Optional[str] = None) -> str:
    """Builds Azure SAS storage URL.

    Args:
        img_path: str, either <dataset_name>/<blob> (set dataset_name=None)
            or <img_file> without the 'path_prefix' from datasets_table
        datasets_table: dict, from MegaDB
        dataset_name: optional str

    Returns: str, URL with SAS token
    """
    if dataset_name is None:
        dataset_name, blob = img_path.split('/', maxsplit=1)
    else:
        blob = img_path

    path_prefix = datasets_table[dataset_name].get('path_prefix', '')
    if len(path_prefix) > 0:
        blob = path_prefix + '/' + blob

    sas_token = datasets_table[dataset_name]['container_sas_key']
    if sas_token[0] == '?':  # strip leading '?' from SAS token
        sas_token = sas_token[1:]

    url = build_azure_storage_uri(
        account=datasets_table[dataset_name]['storage_account'],
        container=datasets_table[dataset_name]['container'],
        blob=blob,
        sas_token=sas_token)

    # wiitigers Unicode issue - no good mapping from DB file names to file
    # names in blob URL: U+F028 percent-encodes to %EF%80%A8, but the URL may
    # carry the mojibake form %C3%AF%E2%82%AC%C2%A8
    if dataset_name == 'wiitigers' and '\uf028' in img_path:
        url = url.replace('%C3%AF%E2%82%AC%C2%A8', '%EF%80%A8')
    return url

def create_batch_job(job_id: str, body: dict):
    """
    This is the target to be run in a thread to submit a batch processing job
    and monitor progress.
    """
    job_status_table = JobStatusTable()
    try:
        log.info(f'server_job, create_batch_job, job_id {job_id}, {body}')

        input_container_sas = body.get('input_container_sas', None)

        use_url = body.get('use_url', False)

        images_requested_json_sas = body.get('images_requested_json_sas', None)

        image_path_prefix = body.get('image_path_prefix', None)

        first_n = body.get('first_n', None)
        first_n = int(first_n) if first_n else None

        sample_n = body.get('sample_n', None)
        sample_n = int(sample_n) if sample_n else None

        model_version = body.get('model_version', '')
        if model_version == '':
            model_version = api_config.DEFAULT_MD_VERSION

        # request_name and request_submission_timestamp are for appending to
        # output file names
        job_name = body.get('request_name', '')  # in earlier versions we used "request" to mean a "job"
        job_submission_timestamp = get_utc_time()

        # image_paths can be a list of strings (Azure blob names or public URLs)
        # or a list of length-2 lists where each is a [image_id, metadata] pair

        # Case 1: listing all images in the container
        # - not possible to have attached metadata if listing images in a blob
        if images_requested_json_sas is None:
            log.info('server_job, create_batch_job, listing all images to process.')

            # list all images to process
            image_paths = sas_blob_utils.list_blobs_in_container(
                container_uri=input_container_sas,
                blob_prefix=image_path_prefix,  # check will be case-sensitive
                blob_suffix=api_config.IMAGE_SUFFIXES_ACCEPTED,  # check will be case-insensitive
                limit=api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB + 1
                # + 1 so if the number of images listed > MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB
                # we will know and not proceed
            )

        # Case 2: user supplied a list of images to process; can include metadata
        else:
            log.info('server_job, create_batch_job, using provided list of images.')

            output_stream, blob_properties = sas_blob_utils.download_blob_to_stream(
                images_requested_json_sas)
            image_paths = json.load(output_stream)
            log.info('server_job, create_batch_job, length of image_paths provided by the user: {}'.format(
                len(image_paths)))
            if len(image_paths) == 0:
                job_status = get_job_status(
                    'completed', '0 images found in provided list of images.')
                job_status_table.update_job_status(job_id, job_status)
                return

            error, metadata_available = validate_provided_image_paths(image_paths)
            if error is not None:
                msg = 'image paths provided in the json are not valid: {}'.format(error)
                raise ValueError(msg)

            # filter down to those conforming to the provided prefix and
            # accepted suffixes (image file types)
            valid_image_paths = []
            for p in image_paths:
                locator = p[0] if metadata_available else p

                # prefix is case-sensitive; suffix is not
                if image_path_prefix is not None and not locator.startswith(image_path_prefix):
                    continue

                # Although urlparse(p).path preserves the extension on local paths,
                # it will not work for blob file names that contain '#', which will
                # be treated as the start of a query string. If the URL is generated
                # via Azure Blob Storage, the '#' char will be properly encoded.
                path = urllib.parse.urlparse(locator).path if use_url else locator

                if path.lower().endswith(api_config.IMAGE_SUFFIXES_ACCEPTED):
                    valid_image_paths.append(p)
            image_paths = valid_image_paths
            log.info(('server_job, create_batch_job, length of image_paths provided by user, '
                      f'after filtering to jpg: {len(image_paths)}'))

        # apply the first_n and sample_n filters
        if first_n:
            assert first_n > 0, 'parameter first_n is 0.'
            # OK if first_n > total number of images
            image_paths = image_paths[:first_n]

        if sample_n:
            assert sample_n > 0, 'parameter sample_n is 0.'
            if sample_n > len(image_paths):
                msg = ('parameter sample_n specifies more images than '
                       'available (after filtering by other provided params).')
                raise ValueError(msg)

            # sample by shuffling the image paths and taking the first sample_n images
            log.info(f'First path before shuffling: {image_paths[0]}')
            shuffle(image_paths)
            log.info(f'First path after shuffling: {image_paths[0]}')
            image_paths = image_paths[:sample_n]

        num_images = len(image_paths)
        log.info(f'server_job, create_batch_job, num_images after applying all filters: {num_images}')

        if num_images < 1:
            job_status = get_job_status('completed', (
                'Zero images found in container or in provided list of images '
                'after filtering with the provided parameters.'))
            job_status_table.update_job_status(job_id, job_status)
            return
        if num_images > api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB:
            job_status = get_job_status(
                'failed',
                (f'The number of images ({num_images}) requested for processing exceeds the maximum '
                 f'accepted {api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB} in one call'))
            job_status_table.update_job_status(job_id, job_status)
            return

        # upload the image list to the container, which is also mounted on all nodes
        # all sharding and scoring use the uploaded list
        images_list_str_as_bytes = bytes(
            json.dumps(image_paths, ensure_ascii=False), encoding='utf-8')

        container_url = sas_blob_utils.build_azure_storage_uri(
            account=api_config.STORAGE_ACCOUNT_NAME,
            container=api_config.STORAGE_CONTAINER_API)
        with ContainerClient.from_container_url(
                container_url,
                credential=api_config.STORAGE_ACCOUNT_KEY) as api_container_client:
            _ = api_container_client.upload_blob(
                name=f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_images.json',
                data=images_list_str_as_bytes)

        job_status = get_job_status('created', f'{num_images} images listed; submitting the job...')
        job_status_table.update_job_status(job_id, job_status)

    except Exception as e:
        job_status = get_job_status('failed', f'Error occurred while preparing the Batch job: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while preparing the Batch job: {e}')
        return  # do not start monitoring

    try:
        batch_job_manager = BatchJobManager()

        model_rel_path = api_config.MD_VERSIONS_TO_REL_PATH[model_version]
        batch_job_manager.create_job(job_id, model_rel_path, input_container_sas, use_url)

        num_tasks, task_ids_failed_to_submit = batch_job_manager.submit_tasks(job_id, num_images)

        # now request_status moves from created to running
        job_status = get_job_status(
            'running',
            (f'Submitted {num_images} images to cluster in {num_tasks} shards. '
             f'Number of shards failed to be submitted: {len(task_ids_failed_to_submit)}'))

        # an extra field to allow the monitoring thread to restart after an
        # API restart: total number of tasks
        job_status['num_tasks'] = num_tasks

        # also record the number of images to process for reporting
        job_status['num_images'] = num_images

        job_status_table.update_job_status(job_id, job_status)
    except Exception as e:
        job_status = get_job_status(
            'problem', f'Please contact us. Error occurred while submitting the Batch job: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while submitting the Batch job: {e}')
        return

    # start the monitor thread with the same name
    try:
        thread = threading.Thread(
            target=monitor_batch_job,
            name=f'job_{job_id}',
            kwargs={
                'job_id': job_id,
                'num_tasks': num_tasks,
                'model_version': model_version,
                'job_name': job_name,
                'job_submission_timestamp': job_submission_timestamp
            }
        )
        thread.start()
    except Exception as e:
        job_status = get_job_status(
            'problem', f'Error occurred while starting the monitoring thread: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while starting the monitoring thread: {e}')
        return

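# Hypothetical request body for create_batch_job; the keys match the fields read
# above via body.get(...), and all values are placeholders.
EXAMPLE_JOB_BODY = {
    'input_container_sas': 'https://exampleaccount.blob.core.windows.net/images?sv=...&sig=FAKE',
    'use_url': False,                   # True if the image list contains public URLs
    'images_requested_json_sas': None,  # None -> list every image in the container
    'image_path_prefix': 'site_a/',     # optional, case-sensitive prefix filter
    'first_n': None,
    'sample_n': 1000,
    'model_version': '',                # '' falls back to api_config.DEFAULT_MD_VERSION
    'request_name': 'site_a_2020',
}
# create_batch_job(job_id='abc123', body=EXAMPLE_JOB_BODY)  # normally run in a worker thread
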
def aggregate_results(job_id: str, model_version: str, job_name: str,
                      job_submission_timestamp: str) -> str:
    log.info(f'server_job, aggregate_results starting, job_id: {job_id}')

    container_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API)

    # when people download this, the timestamp will have ':' replaced by '_'
    output_file_path = (f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/'
                        f'{job_id}_detections_{job_name}_{job_submission_timestamp}.json')

    with ContainerClient.from_container_url(
            container_url,
            credential=api_config.STORAGE_ACCOUNT_KEY) as container_client:
        # check if the result blob has already been written (could be another
        # instance of the API / worker thread), and if so, skip aggregating and
        # uploading the results, and just generate the SAS URL, which could
        # still be needed if the previous request_status was 'problem'
        blob_client = container_client.get_blob_client(output_file_path)
        if blob_client.exists():
            log.warning('The output file already exists, likely because another '
                        'monitoring thread already wrote it.')
        else:
            task_outputs_dir = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_outputs/'
            generator = container_client.list_blobs(name_starts_with=task_outputs_dir)

            blobs = [i for i in generator if i.name.endswith('.json')]

            all_results = []
            for blob_props in tqdm(blobs):
                with container_client.get_blob_client(blob_props) as blob_client:
                    stream = io.BytesIO()
                    blob_client.download_blob().readinto(stream)
                    stream.seek(0)
                    task_results = json.load(stream)
                    all_results.extend(task_results)

            api_output = {
                'info': {
                    'detector': f'megadetector_v{model_version}',
                    'detection_completion_time': get_utc_time(),
                    'format_version': api_config.OUTPUT_FORMAT_VERSION
                },
                'detection_categories': api_config.DETECTOR_LABEL_MAP,
                'images': all_results
            }

            # upload the output JSON to the Job folder
            api_output_as_bytes = bytes(
                json.dumps(api_output, ensure_ascii=False, indent=1), encoding='utf-8')
            _ = container_client.upload_blob(
                name=output_file_path, data=api_output_as_bytes)

    output_sas = generate_blob_sas(
        account_name=api_config.STORAGE_ACCOUNT_NAME,
        container_name=api_config.STORAGE_CONTAINER_API,
        blob_name=output_file_path,
        account_key=api_config.STORAGE_ACCOUNT_KEY,
        permission=BlobSasPermissions(read=True, write=False),
        expiry=datetime.utcnow() + timedelta(days=api_config.OUTPUT_SAS_EXPIRATION_DAYS)
    )
    output_sas_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API,
        blob=output_file_path,
        sas_token=output_sas
    )
    log.info(f'server_job, aggregate_results done, job_id: {job_id}')
    log.info(f'output_sas_url: {output_sas_url}')
    return output_sas_url

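# Sketch of consuming the returned URL: it carries a read-only SAS token, so the
# aggregated detections JSON can be fetched with a plain HTTP GET. The job
# identifiers below are placeholders.
def _example_download_aggregated_results() -> None:
    import requests  # assumed available in the environment

    output_sas_url = aggregate_results(
        job_id='abc123', model_version='4.1',
        job_name='site_a_2020', job_submission_timestamp='2020-01-01T00_00_00Z')
    response = requests.get(output_sas_url)
    response.raise_for_status()
    detections = response.json()
    print(detections['info']['detector'], len(detections['images']))
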
#
# Also available at the /supported_model_versions and /default_model_version
# endpoints
#
# Unless you have any specific reason to set this to a non-default value, leave
# it at the default, which as of 2020.04.28 is MegaDetector 4.1
#
# additional_task_args = {"model_version":"4_prelim"}
#


#%% Derived variables, path setup

assert len(folder_names) != 0

read_only_sas_url = sas_blob_utils.build_azure_storage_uri(
    account=storage_account_name, container=container_name,
    sas_token=read_only_sas_token)
write_sas_url = sas_blob_utils.build_azure_storage_uri(
    account=storage_account_name, container=container_name,
    sas_token=read_write_sas_token)

# local folders
filename_base = os.path.join(base_output_folder_name, base_task_name)
raw_api_output_folder = os.path.join(filename_base, 'raw_api_outputs')
combined_api_output_folder = os.path.join(filename_base, 'combined_api_outputs')
postprocessing_output_folder = os.path.join(filename_base, 'postprocessing')

os.makedirs(filename_base, exist_ok=True)
os.makedirs(raw_api_output_folder, exist_ok=True)

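#%% Optional: check which model versions the API supports
#
# The comments above mention the /supported_model_versions and
# /default_model_version endpoints; a sketch of querying them before overriding
# additional_task_args. 'batch_detection_api_url' is assumed to be defined in
# the configuration cell of this script.

import requests

print(requests.get(f'{batch_detection_api_url}/supported_model_versions').text)
print(requests.get(f'{batch_detection_api_url}/default_model_version').text)
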
output_files = {}

pbar = tqdm(js.items())
for img_path, img_info in pbar:
    save_path = os.path.join(images_dir, img_path)
    if os.path.exists(save_path):
        continue

    ds, img_file = img_path.split('/', maxsplit=1)
    if ds not in output_files:
        output_path = os.path.join(output_dir, f'{ds}_images.txt')
        output_files[ds] = open(output_path, 'w')

        dataset_info = datasets_table[ds]
        account = dataset_info['storage_account']
        container = dataset_info['container']
        if 'public' in datasets_table[ds]['access']:
            url = sas_blob_utils.build_azure_storage_uri(account, container)
        else:
            url = sas_blob_utils.build_azure_storage_uri(
                account, container,
                sas_token=dataset_info['container_sas_key'][1:])
        pbar.write(f'"{url}"')

    output_files[ds].write(img_file + '\n')

for f in output_files.values():
    f.close()

def aggregate_results(job_id, model_version, job_name, job_submission_timestamp):
    log.info(f'server_job, aggregate_results starting, job_id: {job_id}')

    task_outputs_dir = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_outputs/'

    container_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API)
    all_results = []
    with ContainerClient.from_container_url(
            container_url,
            credential=api_config.STORAGE_ACCOUNT_KEY) as container_client:
        generator = container_client.list_blobs(
            name_starts_with=task_outputs_dir)
        blobs = [i for i in generator if i.name.endswith('.json')]

        for blob_props in tqdm(blobs):
            with container_client.get_blob_client(blob_props) as blob_client:
                stream = io.BytesIO()
                blob_client.download_blob().readinto(stream)
                stream.seek(0)
                task_results = json.load(stream)
                all_results.extend(task_results)

        api_output = {
            'info': {
                'detector': f'megadetector_v{model_version}',
                'detection_completion_time': get_utc_time(),
                'format_version': api_config.OUTPUT_FORMAT_VERSION
            },
            'detection_categories': api_config.DETECTOR_LABEL_MAP,
            'images': all_results
        }

        # upload the output JSON to the Job folder
        api_output_as_bytes = bytes(
            json.dumps(api_output, ensure_ascii=False, indent=1), encoding='utf-8')
        output_file_path = (f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/'
                            f'{job_id}_detections_{job_name}_{job_submission_timestamp}.json')
        _ = container_client.upload_blob(
            name=output_file_path, data=api_output_as_bytes)

    output_sas = generate_blob_sas(
        account_name=api_config.STORAGE_ACCOUNT_NAME,
        container_name=api_config.STORAGE_CONTAINER_API,
        blob_name=output_file_path,
        account_key=api_config.STORAGE_ACCOUNT_KEY,
        permission=BlobSasPermissions(read=True, write=False),
        expiry=datetime.utcnow() + timedelta(days=api_config.OUTPUT_SAS_EXPIRATION_DAYS))
    output_sas_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API,
        blob=output_file_path,
        sas_token=output_sas)
    log.info(f'server_job, aggregate_results done, job_id: {job_id}')
    log.info(f'output_sas_url: {output_sas_url}')
    return output_sas_url

def download_and_crop(
        queried_images_json: Mapping[str, Mapping[str, Any]],
        detection_cache: Mapping[str, Mapping[str, Mapping[str, Any]]],
        detection_categories: Mapping[str, str],
        detector_version: str,
        cropped_images_dir: str,
        confidence_threshold: float,
        save_full_images: bool,
        square_crops: bool,
        check_crops_valid: bool,
        images_dir: Optional[str] = None,
        threads: int = 1,
        images_missing_detections: Optional[Iterable[str]] = None
        ) -> Tuple[List[str], int, int]:
    """
    Saves crops to a file with the same name as the original image with an
    additional suffix appended, starting with 3 underscores:
    - if the image has ground truth bboxes: "___cropXX.jpg", where "XX"
      indicates the bounding box index
    - if the image has bboxes from MegaDetector: "___cropXX_mdvY.Y.jpg", where
      "Y.Y" indicates the MegaDetector version

    See module docstring for more info and examples.

    Note: this function is very similar to the download_and_crop() function in
    crop_detections.py. The main difference is that this function uses MegaDB
    to look up Azure Storage container information for images based on the
    dataset, whereas the crop_detections.py version has no concept of a
    "dataset" or of "ground-truth" bounding boxes from MegaDB.

    Args:
        queried_images_json: dict, represents JSON output of json_validator.py;
            all images in queried_images_json are assumed to have either ground
            truth or cached detected bounding boxes unless
            images_missing_detections is given
        detection_cache: dict, dataset_name => {img_path => detection_dict}
        detection_categories: dict, maps detection category ID to category name
        detector_version: str, detector version string, e.g., '4.1'
        cropped_images_dir: str, path to folder where cropped images are saved
        confidence_threshold: float, only crop bounding boxes above this value
        save_full_images: bool, whether to save downloaded images to images_dir;
            images_dir must be given and must exist if save_full_images=True
        square_crops: bool, whether to crop bounding boxes as squares
        check_crops_valid: bool, whether to load each crop to ensure the file is
            valid (i.e., not truncated)
        images_dir: optional str, path to folder where full images are saved
        threads: int, number of threads to use for downloading images
        images_missing_detections: optional list of str, image files to skip
            because they have no ground truth or cached detected bounding boxes

    Returns:
        images_failed_download: list of str, images with bounding boxes that
            failed to download or crop properly
        total_downloads: int, number of images downloaded
        total_new_crops: int, number of new crops saved
    """
    # error checking before we download and crop any images
    valid_img_paths = set(queried_images_json.keys())
    if images_missing_detections is not None:
        valid_img_paths -= set(images_missing_detections)

    for img_path in valid_img_paths:
        info_dict = queried_images_json[img_path]
        ds, img_file = img_path.split('/', maxsplit=1)
        assert ds == info_dict['dataset']

        if 'bbox' in info_dict:  # ground-truth bounding boxes
            pass
        elif img_file in detection_cache[ds]:  # detected bounding boxes
            bbox_dicts = detection_cache[ds][img_file]['detections']
            assert all('conf' in bbox_dict for bbox_dict in bbox_dicts)
            # convert from category ID to category name
            for d in bbox_dicts:
                d['category'] = detection_categories[d['category']]
        else:
            raise ValueError(f'{img_path} has no ground truth bounding boxes '
                             'and was not found in the detection cache. Please '
                             'include it in images_missing_detections.')

    # we need the datasets table for getting SAS keys
    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
    container_clients = {}  # dataset name => ContainerClient

    pool = futures.ThreadPoolExecutor(max_workers=threads)
    future_to_img_path = {}
    images_failed_download = []

    print(f'Getting bbox info for {len(valid_img_paths)} images...')
    for img_path in tqdm(sorted(valid_img_paths)):
        # we already did all error checking above, so we don't do any here
        info_dict = queried_images_json[img_path]
        ds, img_file = img_path.split('/', maxsplit=1)

        # get ContainerClient
        if ds not in container_clients:
            sas_token = datasets_table[ds]['container_sas_key']
            if sas_token[0] == '?':
                sas_token = sas_token[1:]
            url = sas_blob_utils.build_azure_storage_uri(
                account=datasets_table[ds]['storage_account'],
                container=datasets_table[ds]['container'],
                sas_token=sas_token)
            container_clients[ds] = ContainerClient.from_container_url(url)
        container_client = container_clients[ds]

        # get bounding boxes
        # we must include the dataset <ds> in <crop_path_template> because
        # '{img_path}' actually gets populated with <img_file> in
        # load_and_crop()
        is_ground_truth = ('bbox' in info_dict)
        if is_ground_truth:  # ground-truth bounding boxes
            bbox_dicts = info_dict['bbox']
            crop_path_template = os.path.join(
                cropped_images_dir, ds, '{img_path}___crop{n:>02d}.jpg')
        else:  # detected bounding boxes
            bbox_dicts = detection_cache[ds][img_file]['detections']
            crop_path_template = os.path.join(
                cropped_images_dir, ds,
                '{img_path}___crop{n:>02d}_' + f'mdv{detector_version}.jpg')

        ds_dir = None if images_dir is None else os.path.join(images_dir, ds)

        # get the image, either from disk or from Blob Storage
        future = pool.submit(
            load_and_crop, img_file, ds_dir, container_client, bbox_dicts,
            confidence_threshold, crop_path_template, save_full_images,
            square_crops, check_crops_valid)
        future_to_img_path[future] = img_path

    total = len(future_to_img_path)
    total_downloads = 0
    total_new_crops = 0
    print(f'Reading/downloading {total} images and cropping...')
    for future in tqdm(futures.as_completed(future_to_img_path), total=total):
        img_path = future_to_img_path[future]
        try:
            did_download, num_new_crops = future.result()
            total_downloads += did_download
            total_new_crops += num_new_crops
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'{img_path} - generated {exception_type}: {e}')
            images_failed_download.append(img_path)

    pool.shutdown()
    for container_client in container_clients.values():
        # inelegant way to close the container_clients
        with container_client:
            pass

    print(f'Downloaded {total_downloads} images.')
    print(f'Made {total_new_crops} new crops.')
    return images_failed_download, total_downloads, total_new_crops

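# For reference: how the crop path templates above expand. load_and_crop() (not
# shown in this excerpt) is assumed to fill '{img_path}' with the blob name
# within the dataset and '{n}' with the bounding-box index.
def _example_crop_path_template() -> None:
    import os
    template = os.path.join('crops', 'example_dataset',
                            '{img_path}___crop{n:>02d}_mdv4.1.jpg')
    print(template.format(img_path='cam01/0001.jpg', n=3))
    # -> crops/example_dataset/cam01/0001.jpg___crop03_mdv4.1.jpg
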
def submit_batch_detection_api(images_to_detect: Iterable[str],
                               task_lists_dir: str,
                               detector_version: str,
                               account: str,
                               container: str,
                               sas_token: str,
                               caller: str,
                               batch_detection_api_url: str,
                               resume_file_path: str
                               ) -> Dict[str, List[Task]]:
    """
    Args:
        images_to_detect: list of str, image paths with the format
            <dataset-name>/<image-filename>
        task_lists_dir: str, path to local directory for saving JSON files,
            each containing a list of image URLs corresponding to an API task
        detector_version: str, MegaDetector version string, e.g., '4.1',
            see {batch_detection_api_url}/supported_model_versions
        account: str, Azure Storage account name
        container: str, Azure Blob Storage container name, where the task lists
            will be uploaded
        sas_token: str, SAS token with write permissions for the container
        caller: str, allow-listed caller
        batch_detection_api_url: str, URL to batch detection API
        resume_file_path: str, path to save resume file

    Returns: dict, maps str dataset name to list of Task objects
    """
    filtered_images_to_detect = [
        x for x in images_to_detect if path_utils.is_image_file(x)]
    not_images = set(images_to_detect) - set(filtered_images_to_detect)
    if len(not_images) == 0:
        print('Good! All image files have valid file extensions.')
    else:
        print(f'Skipping {len(not_images)} files with non-image extensions:')
        pprint.pprint(sorted(not_images))
    images_to_detect = filtered_images_to_detect

    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
    images_by_dataset = split_images_list_by_dataset(images_to_detect)
    tasks_by_dataset = {}
    for dataset, image_paths in images_by_dataset.items():
        # get SAS URL for images container
        images_sas_token = datasets_table[dataset]['container_sas_key']
        if images_sas_token[0] == '?':
            images_sas_token = images_sas_token[1:]
        images_container_url = sas_blob_utils.build_azure_storage_uri(
            account=datasets_table[dataset]['storage_account'],
            container=datasets_table[dataset]['container'],
            sas_token=images_sas_token)

        # strip image paths of dataset name
        image_blob_names = [path[path.find('/') + 1:] for path in image_paths]

        tasks_by_dataset[dataset] = submit_batch_detection_api_by_dataset(
            dataset=dataset,
            image_blob_names=image_blob_names,
            images_container_url=images_container_url,
            task_lists_dir=task_lists_dir,
            detector_version=detector_version,
            account=account, container=container, sas_token=sas_token,
            caller=caller, batch_detection_api_url=batch_detection_api_url)

    # save list of dataset names and task IDs for resuming
    resume_json = [
        {
            'dataset': dataset,
            'task_name': task.name,
            'task_id': task.id,
            'local_images_list_path': task.local_images_list_path
        }
        for dataset in tasks_by_dataset
        for task in tasks_by_dataset[dataset]
    ]
    with open(resume_file_path, 'w') as f:
        json.dump(resume_json, f, indent=1)
    return tasks_by_dataset

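# split_images_list_by_dataset() is used above but not shown in this excerpt;
# a minimal sketch under the assumption that it simply groups
# <dataset-name>/<filename> paths by the dataset-name prefix.
from collections import defaultdict
from typing import Dict, Iterable, List

def split_images_list_by_dataset_sketch(
        images_to_detect: Iterable[str]) -> Dict[str, List[str]]:
    images_by_dataset: Dict[str, List[str]] = defaultdict(list)
    for img_path in images_to_detect:
        dataset = img_path[:img_path.find('/')]
        images_by_dataset[dataset].append(img_path)
    return dict(images_by_dataset)
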
def submit_tasks(self, job_id: str, num_images: int) -> Tuple[int, list]:
    """
    Shard the images and submit each shard as a Task under the Job pointed to
    by this job_id.

    Args:
        job_id: ID of the Batch Job to submit the tasks to
        num_images: total number of images to be processed in this Job

    Returns:
        num_tasks: total number of Tasks that should be in this Job
        task_ids_failed_to_submit: which of the above Tasks failed to be
            submitted
    """
    log.info('BatchJobManager, submit_tasks')

    # cannot execute the scoring script that is in the mounted directory;
    # it has to be copied to the cwd
    # no luck passing the command-line arguments via a formatted string -
    # set them as env vars instead
    score_command = '/bin/bash -c \"cp $AZ_BATCH_NODE_MOUNTS_DIR/batch-api/scripts/score.py . && python score.py\" '

    num_images_per_task = api_config.NUM_IMAGES_PER_TASK

    # form shards of images and assign each shard to a Task
    num_tasks = math.ceil(num_images / num_images_per_task)

    # for persisting stdout and stderr
    permissions = ContainerSasPermissions(read=True, write=True, list=True)
    access_duration_hrs = (
        api_config.MONITOR_PERIOD_MINUTES * api_config.MAX_MONITOR_CYCLES / 60)
    container_sas_token = generate_container_sas(
        account_name=api_config.STORAGE_ACCOUNT_NAME,
        container_name=api_config.STORAGE_CONTAINER_API,
        account_key=api_config.STORAGE_ACCOUNT_KEY,
        permission=permissions,
        expiry=datetime.utcnow() + timedelta(hours=access_duration_hrs))
    container_sas_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API,
        sas_token=container_sas_token)

    tasks = []
    for task_id in range(num_tasks):
        begin_index = task_id * num_images_per_task
        end_index = begin_index + num_images_per_task

        # persist stdout and stderr (they will be removed when the node is removed)
        # paths are relative to the Task working directory
        stderr_destination = OutputFileDestination(
            container=OutputFileBlobContainerDestination(
                container_url=container_sas_url,
                path=(f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_logs/'
                      f'job_{job_id}_task_{task_id}_stderr.txt')))
        stdout_destination = OutputFileDestination(
            container=OutputFileBlobContainerDestination(
                container_url=container_sas_url,
                path=(f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_logs/'
                      f'job_{job_id}_task_{task_id}_stdout.txt')))
        std_err_and_out = [
            OutputFile(
                file_pattern='../stderr.txt',  # stderr.txt is at the same level as wd
                destination=stderr_destination,
                upload_options=OutputFileUploadOptions(
                    upload_condition=OutputFileUploadCondition.task_completion)
                # can also just upload on failure
            ),
            OutputFile(
                file_pattern='../stdout.txt',
                destination=stdout_destination,
                upload_options=OutputFileUploadOptions(
                    upload_condition=OutputFileUploadCondition.task_completion))
        ]
        task = TaskAddParameter(
            id=str(task_id),
            command_line=score_command,
            container_settings=TaskContainerSettings(
                image_name=api_config.CONTAINER_IMAGE_NAME,
                working_directory='taskWorkingDirectory'),
            environment_settings=[
                EnvironmentSetting(name='TASK_BEGIN_INDEX', value=begin_index),
                EnvironmentSetting(name='TASK_END_INDEX', value=end_index),
            ],
            output_files=std_err_and_out)
        tasks.append(task)

    # first try submitting the Tasks
    task_ids_failed_to_submit = self._create_tasks(
        job_id, tasks, api_config.NUM_TASKS_PER_SUBMISSION, 1)

    # retry submitting Tasks
    if len(task_ids_failed_to_submit) > 0:
        task_ids_failed_to_submit_set = set(task_ids_failed_to_submit)
        tasks_to_retry = [t for t in tasks if t.id in task_ids_failed_to_submit_set]
        task_ids_failed_to_submit = self._create_tasks(
            job_id, tasks_to_retry, api_config.NUM_TASKS_PER_RESUBMISSION, 2)

        if len(task_ids_failed_to_submit) > 0:
            log.info('BatchJobManager, submit_tasks, after retry, '
                     f'len of task_ids_failed_to_submit: {len(task_ids_failed_to_submit)}')
        else:
            log.info('BatchJobManager, submit_tasks, after retry, all Tasks submitted')
    else:
        log.info('BatchJobManager, submit_tasks, all Tasks submitted after first try')

    # Change the Job's on_all_tasks_complete option to 'terminateJob' so the
    # Job's status changes automatically after all submitted tasks are done.
    # This is so that we do not take up the quota for active Jobs in the Batch
    # account.
    job_patch_params = JobPatchParameter(
        on_all_tasks_complete=OnAllTasksComplete.terminate_job)
    self.batch_client.job.patch(job_id, job_patch_params)

    return num_tasks, task_ids_failed_to_submit

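# The Tasks above pass their shard boundaries to score.py via the
# TASK_BEGIN_INDEX / TASK_END_INDEX environment variables. A hypothetical sketch
# of how a scoring script might select its shard from the uploaded image list;
# the list path and helper name are assumptions, not the actual score.py.
import json
import os

def _example_read_shard(images_list_path: str) -> list:
    begin_index = int(os.environ['TASK_BEGIN_INDEX'])
    end_index = int(os.environ['TASK_END_INDEX'])
    with open(images_list_path) as f:
        image_paths = json.load(f)
    return image_paths[begin_index:end_index]
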
def check_image_condition(img_path: str,
                          truncated_images_lock: threading.Lock,
                          account: Optional[str] = None,
                          container: Optional[str] = None,
                          sas_token: Optional[str] = None,
                          datasets_table: Optional[Mapping[str, Any]] = None
                          ) -> Tuple[str, str]:
    """
    Args:
        img_path: str, either <blob_name> if datasets_table is None, or
            <dataset>/<blob_name> if datasets_table is given
        truncated_images_lock: threading.Lock, serializes changes to
            ImageFile.LOAD_TRUNCATED_IMAGES across threads
        account: str, name of Azure Blob Storage account
        container: str, name of Azure Blob Storage container
        sas_token: str, optional SAS token (without leading '?') if the
            container is not publicly accessible
        datasets_table: dict, maps dataset name to dict of information

    Returns: (img_file, status) tuple, where status is one of
        'nonexistant': blob does not exist in the container
        'non_image': img_file does not have a valid image file extension
        'good': image exists and can be opened without setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'truncated': image exists but can only be opened by setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'bad': image exists, but cannot be opened even when setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
    """
    if (account is None) or (container is None) or (datasets_table is not None):
        assert account is None
        assert container is None
        assert sas_token is None
        assert datasets_table is not None

        dataset, img_file = img_path.split('/', maxsplit=1)
        account = datasets_table[dataset]['storage_account']
        container = datasets_table[dataset]['container']
        sas_token = datasets_table[dataset]['container_sas_key']
        if sas_token[0] == '?':  # strip leading '?' from SAS token
            sas_token = sas_token[1:]
    else:
        img_file = img_path

    if not path_utils.is_image_file(img_file):
        return img_file, 'non_image'

    blob_url = sas_blob_utils.build_azure_storage_uri(
        account=account, container=container, sas_token=sas_token,
        blob=img_file)
    blob_exists = sas_blob_utils.check_blob_exists(blob_url)
    if not blob_exists:
        return img_file, 'nonexistant'

    stream, _ = sas_blob_utils.download_blob_to_stream(blob_url)
    stream.seek(0)
    try:
        with truncated_images_lock:
            ImageFile.LOAD_TRUNCATED_IMAGES = False
            with Image.open(stream) as img:
                img.load()
        return img_file, 'good'
    except OSError:  # PIL.UnidentifiedImageError is a subclass of OSError
        try:
            stream.seek(0)
            with truncated_images_lock:
                ImageFile.LOAD_TRUNCATED_IMAGES = True
                with Image.open(stream) as img:
                    img.load()
            return img_file, 'truncated'
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'Unable to load {img_file}. {exception_type}: {e}.')
            return img_file, 'bad'

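# Usage sketch for check_image_condition: the shared lock serializes changes to
# ImageFile.LOAD_TRUNCATED_IMAGES across worker threads. img_paths and
# datasets_table are placeholders; datasets_table is assumed to come from
# MegaDB as in the functions above.
def _example_check_images(img_paths, datasets_table) -> None:
    import threading
    from concurrent import futures

    truncated_images_lock = threading.Lock()
    with futures.ThreadPoolExecutor(max_workers=8) as pool:
        results = pool.map(
            lambda p: check_image_condition(
                p, truncated_images_lock, datasets_table=datasets_table),
            img_paths)
        for img_file, status in results:
            print(img_file, status)
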