def _finalize_preprocessing_parameters(
        preprocessing_parameters: dict,
        first_img_entry: Union[str, 'numpy.array'],
        src_path: str,
        input_feature_col: np.array
):
    """
    Determine the height, width and number of channels used to preprocess
    the image data.

    Values set explicitly in ``preprocessing_parameters`` take precedence.
    Anything missing is inferred from a sample of the dataset when
    ``INFER_IMAGE_DIMENSIONS`` is enabled, and is otherwise taken from the
    first image of the dataset.

    :param preprocessing_parameters: preprocessing configuration; must
        contain ``INFER_IMAGE_DIMENSIONS`` and, when inference is used, the
        sample size and max height / width entries
    :param first_img_entry: first entry of the image column, handed to
        ``read_image``
    :param src_path: handed to ``get_image_from_path`` together with every
        sampled entry
    :param input_feature_col: the image column; must support ``len()`` and
        ``.head(n)``
    :return: tuple ``(should_resize, width, height, num_channels,
        user_specified_num_channels, first_image)``; ``first_image`` may be
        ``None`` when the first image could not be read
    :raises ValueError: if explicit height / width are missing, not
        convertible to int or not positive, if no sampled image is
        readable, if a value can neither be inferred nor read from the
        first image, or if the number of channels is not an integer
    """
    # read_image may return None for an unreadable entry; every use of
    # first_image below is guarded accordingly.
    first_image = read_image(first_img_entry)

    explicit_height_width = (
        HEIGHT in preprocessing_parameters
        or WIDTH in preprocessing_parameters
    )
    explicit_num_channels = NUM_CHANNELS in preprocessing_parameters
    infer_dimensions = preprocessing_parameters[INFER_IMAGE_DIMENSIONS]

    # Only read a sample when at least one value actually has to be inferred.
    inferred_sample = None
    if infer_dimensions and not (
            explicit_height_width and explicit_num_channels):
        sample_size = min(
            len(input_feature_col),
            preprocessing_parameters[INFER_IMAGE_SAMPLE_SIZE])
        sample = [
            read_image(get_image_from_path(src_path, img))
            for img in input_feature_col.head(sample_size)
        ]
        inferred_sample = [img for img in sample if img is not None]
        if not inferred_sample:
            raise ValueError(
                "No readable images in sample, image dimensions cannot be inferred"
            )

    should_resize = False
    if explicit_height_width:
        should_resize = True
        try:
            height = int(preprocessing_parameters[HEIGHT])
            width = int(preprocessing_parameters[WIDTH])
        except (KeyError, TypeError, ValueError) as e:
            # KeyError: only one of height / width was configured.
            # TypeError: the value is e.g. None.
            # Both are reported as the ValueError the message promises.
            raise ValueError('Image height and width must be set and have '
                             'positive integer values: ' + str(e)) from e
        if height <= 0 or width <= 0:
            raise ValueError(
                'Image height and width must be positive integers')
    else:
        # User hasn't specified height and width.
        # Default to inferring from sample or first image.
        if infer_dimensions:
            should_resize = True
            # Average over the readable sample, capped at the configured max.
            height_avg = min(
                sum(x.shape[0] for x in inferred_sample) / len(inferred_sample),
                preprocessing_parameters[INFER_IMAGE_MAX_HEIGHT])
            width_avg = min(
                sum(x.shape[1] for x in inferred_sample) / len(inferred_sample),
                preprocessing_parameters[INFER_IMAGE_MAX_WIDTH])
            height, width = round(height_avg), round(width_avg)
            logger.debug("Inferring height: {0} and width: {1}".format(
                height, width))
        elif first_image is not None:
            height, width = first_image.shape[0], first_image.shape[1]
        else:
            raise ValueError(
                "Explicit image width/height are not set, infer_image_dimensions is false, "
                "and first image cannot be read, so image dimensions are unknown"
            )

    if explicit_num_channels:
        # User specified num_channels in the model/feature config
        user_specified_num_channels = True
        num_channels = preprocessing_parameters[NUM_CHANNELS]
    else:
        user_specified_num_channels = False
        if infer_dimensions:
            # An inferred channel count is flagged like a user-specified one.
            # NOTE(review): presumably so that images are coerced to this
            # channel count when read - confirm against the reader.
            user_specified_num_channels = True
            num_channels = round(
                sum(num_channels_in_image(x) for x in inferred_sample)
                / len(inferred_sample))
        elif first_image is not None:
            num_channels = num_channels_in_image(first_image)
        else:
            raise ValueError(
                "Explicit image num channels is not set, infer_image_dimensions is false, "
                "and first image cannot be read, so image num channels is unknown"
            )

    # Explicit raise instead of assert: asserts are stripped under -O.
    if not isinstance(num_channels, int):
        raise ValueError('Number of image channels needs to be an integer')

    return (should_resize, width, height, num_channels,
            user_specified_num_channels, first_image)
def add_feature_data(feature, input_df, proc_df, metadata,
                     preprocessing_parameters, backend,
                     skip_save_processed_input):
    """
    Preprocess the image column of ``feature`` and store the result in
    ``proc_df``.

    The final height, width and number of channels are resolved first and
    recorded in ``metadata``. Then, depending on the ``in_memory`` setting
    and ``skip_save_processed_input``, the images are either loaded and
    written to ``proc_df`` directly, or written to an HDF5 dataset while
    ``proc_df`` only receives the row indices. Entries that cannot be read
    are replaced by a gray default image.

    :return: ``proc_df`` with the feature's processed column filled in
    :raises ValueError: if the column is empty or its first entry is
        neither a string nor a numpy array
    """

    def _effective_setting(key):
        # Global preprocessing value, overridden by the feature's own
        # preprocessing section when that section provides the key.
        value = preprocessing_parameters[key]
        if PREPROCESSING in feature and key in feature[PREPROCESSING]:
            value = feature[PREPROCESSING][key]
        return value

    in_memory = _effective_setting('in_memory')
    num_processes = _effective_setting('num_processes')

    src_path = (
        os.path.dirname(os.path.abspath(metadata.get(SRC)))
        if SRC in metadata else None
    )

    def _resolve_entry(entry):
        # String entries are resolved against src_path; anything else
        # is passed through untouched.
        return get_abs_path(src_path, entry) if isinstance(entry, str) else entry

    image_column = input_df[feature[COLUMN]]
    num_images = len(image_column)
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    first_img_entry = next(iter(image_column))
    logger.debug('Detected image feature type is {}'.format(
        type(first_img_entry)))
    if not isinstance(first_img_entry, (str, np.ndarray)):
        raise ValueError(
            'Invalid image feature data type. Detected type is {}, '
            'expect either string for file path or numpy array.'.format(
                type(first_img_entry)))

    first_img_entry = get_image_from_path(src_path, first_img_entry)
    (
        should_resize,
        width,
        height,
        num_channels,
        user_specified_num_channels,
        _first_image,
    ) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters, first_img_entry, src_path, image_column)

    # Record the resolved image geometry in the feature's metadata.
    feature_meta = metadata[feature[NAME]]
    feature_meta[PREPROCESSING]['height'] = height
    feature_meta[PREPROCESSING]['width'] = width
    feature_meta[PREPROCESSING]['num_channels'] = num_channels

    read_image_and_resize = partial(
        ImageFeatureMixin._read_image_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters['resize_method'],
        user_specified_num_channels=user_specified_num_channels)

    # TODO: alternatively use get_average_image() for unreachable images
    default_image = get_gray_default_image(height, width, num_channels)

    # check to see if the active backend can support lazy loading of
    # image features from the hdf5 cache.
    backend.check_lazy_load_supported(feature)

    if not (in_memory or skip_save_processed_input):
        # Write every image to the HDF5 cache; proc_df keeps only the
        # row indices.
        resolved_entries = [_resolve_entry(entry) for entry in image_column]
        data_fp = backend.cache.get_cache_path(
            metadata.get(SRC), metadata.get(CHECKSUM), TRAINING)
        with upload_h5(data_fp) as h5_file:
            # todo future add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature[PROC_COLUMN] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8)
            for row, entry in enumerate(resolved_entries):
                loaded = read_image_and_resize(entry)
                if loaded is None:
                    loaded = default_image
                image_dataset[row, :height, :width, :] = loaded
            h5_file.flush()

        proc_df[feature[PROC_COLUMN]] = np.arange(num_images)
        return proc_df

    # Number of processes to run in parallel for preprocessing
    feature_meta[PREPROCESSING]['num_processes'] = num_processes
    feature_meta['reshape'] = (height, width, num_channels)

    # Split the dataset into pools only if we have an explicit request to use
    # multiple processes. In case we have multiple input images use the
    # standard code anyway.
    use_pool = backend.supports_multiprocessing and (
        num_processes > 1 or num_images > 1)
    if use_pool:
        resolved_entries = [_resolve_entry(entry) for entry in image_column]
        with Pool(num_processes) as pool:
            logger.debug(
                'Using {} processes for preprocessing images'.format(
                    num_processes))
            loaded_images = pool.map(read_image_and_resize, resolved_entries)
            proc_df[feature[PROC_COLUMN]] = [
                default_image if image is None else image
                for image in loaded_images
            ]
    else:
        # If we're not running multiple processes and we are only processing one
        # image just use this faster shortcut, bypassing multiprocessing.Pool.map
        logger.debug(
            'No process pool initialized. Using internal process for preprocessing images'
        )

        # helper function for handling single image
        def _get_processed_image(img_store):
            loaded = read_image_and_resize(_resolve_entry(img_store))
            return default_image if loaded is None else loaded

        proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
            image_column, _get_processed_image)

    return proc_df
def _finalize_preprocessing_parameters(
        preprocessing_parameters: dict,
        first_img_entry: Union[str, 'numpy.array'],
        src_path: str,
        input_feature_col: np.array
):
    """
    Determine the height, width and number of channels used to preprocess
    the image data.

    Values set explicitly in ``preprocessing_parameters`` take precedence.
    Missing height / width default to the first image, or to the capped
    average over a sample of the dataset when ``INFER_IMAGE_DIMENSIONS`` is
    enabled. A missing number of channels is taken from the first image.
    The first image is only required when a value actually has to be read
    from it.

    :param preprocessing_parameters: preprocessing configuration; must
        contain ``INFER_IMAGE_DIMENSIONS`` when height / width are not
        explicit, plus the sample size and max height / width entries when
        inference is enabled
    :param first_img_entry: first entry of the image column, handed to
        ``read_image``
    :param src_path: handed to ``get_image_from_path`` together with every
        sampled entry
    :param input_feature_col: the image column; must support ``len()`` and
        slicing with ``[:n]``
    :return: tuple ``(should_resize, width, height, num_channels,
        user_specified_num_channels, first_image)``; ``first_image`` may be
        ``None`` when the first image could not be read
    :raises ValueError: if explicit height / width are missing, not
        convertible to int or not positive, if a required value cannot be
        determined because no image is readable, or if the number of
        channels is not an integer
    """
    # read_image may return None for an unreadable entry, so nothing is
    # dereferenced here; each use below checks for None first.
    first_image = read_image(first_img_entry)

    should_resize = False
    if (HEIGHT in preprocessing_parameters
            or WIDTH in preprocessing_parameters):
        should_resize = True
        try:
            height = int(preprocessing_parameters[HEIGHT])
            width = int(preprocessing_parameters[WIDTH])
        except (KeyError, TypeError, ValueError) as e:
            # KeyError: only one of height / width was configured.
            # TypeError: the value is e.g. None.
            # Both are reported as the ValueError the message promises.
            raise ValueError('Image height and width must be set and have '
                             'positive integer values: ' + str(e)) from e
        if height <= 0 or width <= 0:
            raise ValueError(
                'Image height and width must be positive integers')
    else:
        # User hasn't specified height and width.
        # Default to first image, or infer from sample.
        height = width = None
        if first_image is not None:
            height, width = first_image.shape[0], first_image.shape[1]

        if preprocessing_parameters[INFER_IMAGE_DIMENSIONS]:
            should_resize = True
            sample_size = min(
                len(input_feature_col),
                preprocessing_parameters[INFER_IMAGE_SAMPLE_SIZE])
            loaded_sample = (
                read_image(get_image_from_path(src_path, img))
                for img in input_feature_col[:sample_size]
            )
            # Unreadable images carry no size information; skip them.
            sample_images = [img for img in loaded_sample if img is not None]

            if sample_images:
                # Average over the readable sample, capped at the
                # configured maximum.
                height_avg = min(
                    sum(x.shape[0] for x in sample_images) / len(sample_images),
                    preprocessing_parameters[INFER_IMAGE_MAX_HEIGHT])
                width_avg = min(
                    sum(x.shape[1] for x in sample_images) / len(sample_images),
                    preprocessing_parameters[INFER_IMAGE_MAX_WIDTH])
                height, width = round(height_avg), round(width_avg)
                logger.debug("Inferring height: {0} and width: {1}".format(
                    height, width))
            else:
                logger.warning(
                    "Sample set for inference is empty, default to height and width of first image"
                )

        if height is None or width is None:
            raise ValueError(
                "Explicit image width/height are not set and no image could "
                "be read, so image dimensions are unknown"
            )

    if NUM_CHANNELS in preprocessing_parameters:
        # User specified num_channels in the model/feature config
        user_specified_num_channels = True
        num_channels = preprocessing_parameters[NUM_CHANNELS]
    else:
        user_specified_num_channels = False
        if first_image is None:
            raise ValueError(
                "Explicit image num channels is not set and first image "
                "cannot be read, so image num channels is unknown"
            )
        num_channels = num_channels_in_image(first_image)

    # Explicit raise instead of assert: asserts are stripped under -O.
    if not isinstance(num_channels, int):
        raise ValueError('Number of image channels needs to be an integer')

    return (should_resize, width, height, num_channels,
            user_specified_num_channels, first_image)