def _get_processed_image(img_store):
    if isinstance(img_store, str):
        res_single = read_image_and_resize(
            get_abs_path(src_path, img_store))
    else:
        res_single = read_image_and_resize(img_store)
    return res_single if res_single is not None else default_image
def _get_processed_image(img_store):
    if isinstance(img_store, str):
        return read_image_and_resize(
            get_abs_path(src_path, img_store)
        )
    else:
        return read_image_and_resize(img_store)
def is_image(src_path: str, img_entry: Union[bytes, str], column: str) -> bool:
    if not isinstance(img_entry, str):
        return False
    try:
        import imghdr

        path = get_abs_path(src_path, img_entry)
        bytes_obj = get_bytes_obj_from_path(path)
        if isinstance(bytes_obj, bytes):
            return imghdr.what(None, bytes_obj) is not None
        return imghdr.what(bytes_obj) is not None
    except Exception as e:
        logger.warning(
            f"While assessing potential image in is_image() for column {column}, "
            f"encountered exception: {e}"
        )
        return False
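# A hedged, standalone sketch (not part of the snippet above) of the two
# imghdr.what() call forms that is_image() relies on: passing raw bytes as the
# second argument, or passing a path/file object as the first. The PNG
# signature bytes are illustrative test data, not taken from the codebase.
# Note that the stdlib imghdr module is deprecated in recent Python versions.
import imghdr

png_signature = b"\x89PNG\r\n\x1a\n"
assert imghdr.what(None, png_signature) == "png"    # bytes form
assert imghdr.what(None, b"not an image") is None   # unrecognized bytes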
def get_image_from_path(src_path, img_entry, ret_bytes=False):
    """
    skimage.io.imread() can read filenames or urls
    imghdr.what() can read filenames or bytes
    """
    if not isinstance(img_entry, str):
        return img_entry
    if is_http(img_entry):
        if ret_bytes:
            return get_image_from_http_bytes(img_entry)
        return img_entry
    if src_path or os.path.isabs(img_entry):
        return get_abs_path(src_path, img_entry)
    with open_file(img_entry, 'rb') as f:
        if ret_bytes:
            return f.read()
        return f
def get_image_from_path(
        src_path: Union[str, torch.Tensor],
        img_entry: Union[str, bytes],
        ret_bytes: bool = False
) -> Union[BytesIO, BinaryIO, TextIO, bytes, str]:
    if not isinstance(img_entry, str):
        return img_entry
    if is_http(img_entry):
        if ret_bytes:
            # Returns BytesIO.
            return get_image_from_http_bytes(img_entry)
        return img_entry
    if src_path or os.path.isabs(img_entry):
        return get_abs_path(src_path, img_entry)
    with open_file(img_entry, "rb") as f:
        if ret_bytes:
            return f.read()
        return f
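# Hedged usage sketch for the typed get_image_from_path() above. Only the
# pass-through branch (non-string entries are returned unchanged) is actually
# executed, since it needs no filesystem or network access; the other branches
# are described in comments. Assumes the function defined above is in scope.
raw_entry = b"\x00\x01\x02"  # illustrative bytes, not real image data
assert get_image_from_path(None, raw_entry) is raw_entry
# get_image_from_path(None, "http://example.com/cat.png", ret_bytes=True)
#     would download the image and return a BytesIO (network access required).
# get_image_from_path("/data/images", "cat.png")
#     would resolve the relative path against src_path via get_abs_path().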
def add_feature_data(feature, dataset_df, dataset, metadata,
                     preprocessing_parameters):
    set_default_value(feature['preprocessing'], 'in_memory',
                      preprocessing_parameters['in_memory'])

    if 'audio_feature' not in preprocessing_parameters:
        raise ValueError(
            'audio_feature dictionary has to be present in preprocessing '
            'for audio.')
    if TYPE not in preprocessing_parameters['audio_feature']:
        raise ValueError(
            'type has to be present in audio_feature dictionary '
            'for audio.')

    src_path = None
    # This is not super nice, but works with both DFs and lists.
    first_path = '.'
    for first_path in dataset_df[feature[NAME]]:
        break
    if hasattr(dataset_df, 'src'):
        src_path = os.path.dirname(os.path.abspath(dataset_df.src))
    if src_path is None and not os.path.isabs(first_path):
        raise ValueError('Audio file paths must be absolute')

    num_audio_utterances = len(dataset_df)
    padding_value = preprocessing_parameters['padding_value']
    normalization_type = preprocessing_parameters['norm']
    feature_name = feature[NAME]
    feature_dim = metadata[feature_name]['feature_dim']
    max_length = metadata[feature_name]['max_length']
    audio_feature_dict = preprocessing_parameters['audio_feature']
    audio_file_length_limit_in_s = preprocessing_parameters[
        'audio_file_length_limit_in_s']

    if num_audio_utterances == 0:
        raise ValueError(
            'There are no audio files in the dataset provided.')

    audio_stats = {
        'count': 0,
        'mean': 0,
        'var': 0,
        'std': 0,
        'max': 0,
        'min': float('inf'),
        'cropped': 0,
        'max_length_in_s': audio_file_length_limit_in_s
    }

    if feature['preprocessing']['in_memory']:
        dataset[feature[NAME]] = np.empty(
            (num_audio_utterances, max_length, feature_dim),
            dtype=np.float32)
        for i, path in enumerate(dataset_df[feature[NAME]]):
            filepath = get_abs_path(src_path, path)
            audio_feature = AudioFeatureMixin._read_audio_and_transform_to_feature(
                filepath, audio_feature_dict, feature_dim, max_length,
                padding_value, normalization_type, audio_stats)
            dataset[feature[NAME]][i, :, :] = audio_feature

        audio_stats['std'] = np.sqrt(
            audio_stats['var'] / float(audio_stats['count']))
        print_statistics = (
            "{} audio files loaded.\n"
            "Statistics of audio file lengths:\n"
            "- mean: {:.4f}\n"
            "- std: {:.4f}\n"
            "- max: {:.4f}\n"
            "- min: {:.4f}\n"
            "- cropped audio_files: {}\n"
            "Max length was given as {}s").format(
                audio_stats['count'], audio_stats['mean'],
                audio_stats['std'], audio_stats['max'],
                audio_stats['min'], audio_stats['cropped'],
                audio_stats['max_length_in_s'])
        logger.debug(print_statistics)
def add_feature_data(feature, input_df, proc_df, metadata,
                     preprocessing_parameters, backend,
                     skip_save_processed_input):
    in_memory = preprocessing_parameters['in_memory']
    if PREPROCESSING in feature and 'in_memory' in feature[PREPROCESSING]:
        in_memory = feature[PREPROCESSING]['in_memory']

    num_processes = preprocessing_parameters['num_processes']
    if PREPROCESSING in feature and 'num_processes' in feature[PREPROCESSING]:
        num_processes = feature[PREPROCESSING]['num_processes']

    src_path = None
    if SRC in metadata:
        src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

    num_images = len(input_df[feature[COLUMN]])
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    first_img_entry = next(iter(input_df[feature[COLUMN]]))
    logger.debug('Detected image feature type is {}'.format(
        type(first_img_entry)))

    if not isinstance(first_img_entry, str) \
            and not isinstance(first_img_entry, np.ndarray):
        raise ValueError(
            'Invalid image feature data type. Detected type is {}, '
            'expect either string for file path or numpy array.'.format(
                type(first_img_entry)))

    first_img_entry = get_image_from_path(src_path, first_img_entry)

    (should_resize,
     width,
     height,
     num_channels,
     user_specified_num_channels,
     first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters, first_img_entry, src_path,
        input_df[feature[COLUMN]])

    metadata[feature[NAME]][PREPROCESSING]['height'] = height
    metadata[feature[NAME]][PREPROCESSING]['width'] = width
    metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

    read_image_and_resize = partial(
        ImageFeatureMixin._read_image_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters['resize_method'],
        user_specified_num_channels=user_specified_num_channels)

    # TODO: alternatively use get_average_image() for unreachable images
    default_image = get_gray_default_image(height, width, num_channels)

    # check to see if the active backend can support lazy loading of
    # image features from the hdf5 cache.
    backend.check_lazy_load_supported(feature)

    if in_memory or skip_save_processed_input:
        # Number of processes to run in parallel for preprocessing
        metadata[feature[NAME]][PREPROCESSING]['num_processes'] = num_processes
        metadata[feature[NAME]]['reshape'] = (height, width, num_channels)

        # Split the dataset into pools only if we have an explicit request to
        # use multiple processes. In case we have multiple input images use
        # the standard code anyway.
        if backend.supports_multiprocessing and (
                num_processes > 1 or num_images > 1):
            all_img_entries = [
                get_abs_path(src_path, img_entry)
                if isinstance(img_entry, str) else img_entry
                for img_entry in input_df[feature[COLUMN]]
            ]

            with Pool(num_processes) as pool:
                logger.debug(
                    'Using {} processes for preprocessing images'.format(
                        num_processes))
                res = pool.map(read_image_and_resize, all_img_entries)
                proc_df[feature[PROC_COLUMN]] = [
                    x if x is not None else default_image for x in res
                ]
        else:
            # If we're not running multiple processes and we are only
            # processing one image just use this faster shortcut, bypassing
            # multiprocessing.Pool.map
            logger.debug(
                'No process pool initialized. Using internal process for '
                'preprocessing images'
            )

            # helper function for handling single image
            def _get_processed_image(img_store):
                if isinstance(img_store, str):
                    res_single = read_image_and_resize(
                        get_abs_path(src_path, img_store))
                else:
                    res_single = read_image_and_resize(img_store)
                return res_single if res_single is not None else default_image

            proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                input_df[feature[COLUMN]], _get_processed_image)
    else:
        all_img_entries = [
            get_abs_path(src_path, img_entry)
            if isinstance(img_entry, str) else img_entry
            for img_entry in input_df[feature[COLUMN]]
        ]

        data_fp = backend.cache.get_cache_path(
            metadata.get(SRC), metadata.get(CHECKSUM), TRAINING)
        with upload_h5(data_fp) as h5_file:
            # todo future: add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature[PROC_COLUMN] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8)
            for i, img_entry in enumerate(all_img_entries):
                res = read_image_and_resize(img_entry)
                image_dataset[i, :height, :width, :] = \
                    res if res is not None else default_image
            h5_file.flush()

        proc_df[feature[PROC_COLUMN]] = np.arange(num_images)

    return proc_df
def read_audio(path):
    filepath = get_abs_path(src_path, path)
    return soundfile.read(filepath)
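# Hedged usage note for the read_audio() closure above: soundfile.read()
# returns a (data, sample_rate) tuple, so callers unpack both values. The
# file name below is purely illustrative.
# audio_data, sample_rate = read_audio('utterance_0.wav')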
def add_feature_data(feature, dataset_df, data, metadata,
                     preprocessing_parameters):
    set_default_value(feature['preprocessing'], 'in_memory',
                      preprocessing_parameters['in_memory'])
    set_default_value(feature['preprocessing'], 'num_processes',
                      preprocessing_parameters['num_processes'])

    csv_path = None
    if hasattr(dataset_df, 'csv'):
        csv_path = os.path.dirname(os.path.abspath(dataset_df.csv))

    num_images = len(dataset_df)
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    first_image_path = dataset_df[feature['name']][0]
    if csv_path is None and not os.path.isabs(first_image_path):
        raise ValueError('Image file paths must be absolute')
    first_image_path = get_abs_path(csv_path, first_image_path)

    (should_resize,
     width,
     height,
     num_channels,
     user_specified_num_channels,
     first_image) = ImageBaseFeature._finalize_preprocessing_parameters(
        preprocessing_parameters, first_image_path)

    metadata[feature['name']]['preprocessing']['height'] = height
    metadata[feature['name']]['preprocessing']['width'] = width
    metadata[feature['name']]['preprocessing']['num_channels'] = num_channels

    read_image_and_resize = partial(
        ImageBaseFeature._read_image_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters['resize_method'],
        user_specified_num_channels=user_specified_num_channels)

    all_file_paths = [
        get_abs_path(csv_path, file_path)
        for file_path in dataset_df[feature['name']]
    ]

    if feature['preprocessing']['in_memory']:
        # Number of processes to run in parallel for preprocessing
        num_processes = feature['preprocessing']['num_processes']
        metadata[feature['name']]['preprocessing'][
            'num_processes'] = num_processes

        data[feature['name']] = np.empty(
            (num_images, height, width, num_channels),
            dtype=np.uint8)
        # Split the dataset into pools only if we have an explicit request
        # to use multiple processes. In case we have multiple input images
        # use the standard code anyway.
        if num_processes > 1 or num_images > 1:
            with Pool(num_processes) as pool:
                logger.warning(
                    'Using {} processes for preprocessing images'.format(
                        num_processes))
                data[feature['name']] = np.array(
                    pool.map(read_image_and_resize, all_file_paths))
        # If we're not running multiple processes and we are only processing
        # one image just use this faster shortcut, bypassing
        # multiprocessing.Pool.map
        else:
            logger.warning(
                'No process pool initialized. Using one process for '
                'preprocessing images'
            )
            img = read_image_and_resize(all_file_paths[0])
            data[feature['name']] = np.array([img])
    else:
        data_fp = os.path.splitext(dataset_df.csv)[0] + '.hdf5'
        mode = 'w'
        if os.path.isfile(data_fp):
            mode = 'r+'
        with h5py.File(data_fp, mode) as h5_file:
            # TODO add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature['name'] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8)
            for i, filepath in enumerate(all_file_paths):
                image_dataset[i, :height, :width, :] = (
                    read_image_and_resize(filepath))

        data[feature['name']] = np.arange(num_images)
def test_get_abs_path(): assert get_abs_path("a", "b.jpg") == "a/b.jpg" assert get_abs_path(None, "b.jpg") == "b.jpg"
def get_image_from_path(
        src_path: Union[str, torch.Tensor],
        img_entry: Union[str, bytes],
        ret_bytes: bool = False
) -> Union[BytesIO, BinaryIO, TextIO, bytes, str]:
    if not isinstance(img_entry, str):
        return img_entry
    if is_http(img_entry):
        if ret_bytes:
            # Returns BytesIO.
            return get_image_from_http_bytes(img_entry)
        return img_entry
    if src_path or os.path.isabs(img_entry):
        return get_abs_path(src_path, img_entry)
    with open_file(img_entry, "rb") as f:
        if ret_bytes:
            return f.read()
        return f


def is_image(src_path: str, img_entry: Union[bytes, str]) -> bool:
    if not isinstance(img_entry, str):
        return False
    try:
        import imghdr

        img = get_image_from_path(src_path, img_entry, True)
        if isinstance(img, bytes):
            return imghdr.what(None, img) is not None
        return imghdr.what(img) is not None
    except Exception:
        return False
def add_feature_data(feature_config, input_df, proc_df, metadata,
                     preprocessing_parameters, backend,
                     skip_save_processed_input):
    set_default_value(feature_config["preprocessing"], "in_memory",
                      preprocessing_parameters["in_memory"])

    name = feature_config[NAME]
    column = input_df[feature_config[COLUMN]]

    num_audio_files = len(column)
    if num_audio_files == 0:
        raise ValueError(
            "There are no audio files in the dataset provided.")

    first_audio_entry = next(iter(column))
    logging.debug(
        f"Detected audio feature type is {type(first_audio_entry)}")

    if not isinstance(first_audio_entry, str) and not isinstance(
            first_audio_entry, torch.Tensor):
        raise ValueError(
            "Invalid audio feature data type. Detected type is {}, "
            "expected either string for local/remote file path or "
            "Torch Tensor.".format(type(first_audio_entry)))

    src_path = None
    if SRC in metadata:
        if isinstance(first_audio_entry, str) and not has_remote_protocol(
                first_audio_entry):
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

    abs_path_column = backend.df_engine.map_objects(
        column,
        lambda row: get_abs_path(src_path, row)
        if isinstance(row, str) else row)

    num_audio_utterances = len(input_df[feature_config[COLUMN]])
    padding_value = preprocessing_parameters["padding_value"]
    normalization_type = preprocessing_parameters["norm"]

    feature_dim = metadata[name]["feature_dim"]
    max_length = metadata[name]["max_length"]
    audio_feature_dict = {
        key: value
        for key, value in preprocessing_parameters.items()
        if key in AUDIO_FEATURE_KEYS and value is not None
    }
    audio_file_length_limit_in_s = preprocessing_parameters[
        "audio_file_length_limit_in_s"]

    if num_audio_utterances == 0:
        raise ValueError(
            "There are no audio files in the dataset provided.")

    if feature_config[PREPROCESSING]["in_memory"]:
        audio_features = AudioFeatureMixin._process_in_memory(
            abs_path_column,
            audio_feature_dict,
            feature_dim,
            max_length,
            padding_value,
            normalization_type,
            audio_file_length_limit_in_s,
            backend,
        )
        proc_df[feature_config[PROC_COLUMN]] = audio_features
    else:
        backend.check_lazy_load_supported(feature_config)

    return proc_df
def test_get_abs_path():
    assert get_abs_path('a', 'b.jpg') == 'a/b.jpg'
    assert get_abs_path(None, 'b.jpg') == 'b.jpg'
def add_feature_data(feature, input_df, proc_df, metadata,
                     preprocessing_parameters, backend):
    set_default_value(feature[PREPROCESSING], 'in_memory',
                      preprocessing_parameters['in_memory'])
    set_default_value(feature[PREPROCESSING], 'num_processes',
                      preprocessing_parameters['num_processes'])

    src_path = None
    if hasattr(input_df, 'src'):
        src_path = os.path.dirname(os.path.abspath(input_df.src))

    num_images = len(input_df)
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    first_path = next(iter(input_df[feature[COLUMN]]))

    if src_path is None and not os.path.isabs(first_path):
        raise ValueError('Image file paths must be absolute')

    first_path = get_abs_path(src_path, first_path)

    (should_resize,
     width,
     height,
     num_channels,
     user_specified_num_channels,
     first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters, first_path)

    metadata[feature[NAME]][PREPROCESSING]['height'] = height
    metadata[feature[NAME]][PREPROCESSING]['width'] = width
    metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

    read_image_and_resize = partial(
        ImageFeatureMixin._read_image_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters['resize_method'],
        user_specified_num_channels=user_specified_num_channels)

    if feature[PREPROCESSING]['in_memory']:
        # Number of processes to run in parallel for preprocessing
        num_processes = feature[PREPROCESSING]['num_processes']
        metadata[feature[NAME]][PREPROCESSING]['num_processes'] = num_processes

        # Split the dataset into pools only if we have an explicit request
        # to use multiple processes. In case we have multiple input images
        # use the standard code anyway.
        if backend.supports_multiprocessing and (
                num_processes > 1 or num_images > 1):
            all_file_paths = [
                get_abs_path(src_path, file_path)
                for file_path in input_df[feature[NAME]]
            ]

            with Pool(num_processes) as pool:
                logger.debug(
                    'Using {} processes for preprocessing images'.format(
                        num_processes))
                proc_df[feature[PROC_COLUMN]] = pool.map(
                    read_image_and_resize, all_file_paths)
        else:
            # If we're not running multiple processes and we are only
            # processing one image just use this faster shortcut, bypassing
            # multiprocessing.Pool.map
            logger.debug(
                'No process pool initialized. Using internal process for '
                'preprocessing images'
            )
            proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                input_df[feature[COLUMN]],
                lambda file_path: read_image_and_resize(
                    get_abs_path(src_path, file_path)))
    else:
        backend.check_lazy_load_supported(feature)

        all_file_paths = [
            get_abs_path(src_path, file_path)
            for file_path in input_df[feature[NAME]]
        ]

        data_fp = os.path.splitext(input_df.src)[0] + '.hdf5'
        mode = 'w'
        if os.path.isfile(data_fp):
            mode = 'r+'

        with h5py.File(data_fp, mode) as h5_file:
            # todo future: add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature[PROC_COLUMN] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8)
            for i, filepath in enumerate(all_file_paths):
                image_dataset[i, :height, :width, :] = (
                    read_image_and_resize(filepath))
            h5_file.flush()

        proc_df[feature[PROC_COLUMN]] = np.arange(num_images)

    return proc_df
def add_feature_data(feature, dataset_df, data, metadata,
                     preprocessing_parameters):
    set_default_value(feature['preprocessing'], 'in_memory',
                      preprocessing_parameters['in_memory'])

    csv_path = None
    if hasattr(dataset_df, 'csv'):
        csv_path = os.path.dirname(os.path.abspath(dataset_df.csv))

    num_images = len(dataset_df)
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    height = 0
    width = 0
    should_resize = False
    if ('height' in preprocessing_parameters or
            'width' in preprocessing_parameters):
        should_resize = True
        try:
            height = int(preprocessing_parameters[HEIGHT])
            width = int(preprocessing_parameters[WIDTH])
        except ValueError as e:
            raise ValueError('Image height and width must be set and have '
                             'positive integer values: ' + str(e))
        if height <= 0 or width <= 0:
            raise ValueError(
                'Image height and width must be positive integers')

    # If a width and height have not been specified, we assume that all
    # images have the same width and height, thus the width and height of
    # the first one are the same as those of all the other ones.
    if (csv_path is None and
            not os.path.isabs(dataset_df[feature['name']][0])):
        raise ValueError('Image file paths must be absolute')

    first_image = imread(
        get_abs_path(csv_path, dataset_df[feature['name']][0]))

    first_img_height = first_image.shape[0]
    first_img_width = first_image.shape[1]
    first_img_num_channels = num_channels_in_image(first_image)

    if height == 0 or width == 0:
        # User hasn't specified height and width
        height = first_img_height
        width = first_img_width

    # User specified num_channels in the model/feature definition
    user_specified_num_channels = False
    num_channels = first_img_num_channels
    if NUM_CHANNELS in preprocessing_parameters:
        user_specified_num_channels = True
        num_channels = preprocessing_parameters[NUM_CHANNELS]

    assert isinstance(num_channels, int), ValueError(
        'Number of image channels needs to be an integer')

    metadata[feature['name']]['preprocessing']['height'] = height
    metadata[feature['name']]['preprocessing']['width'] = width
    metadata[feature['name']]['preprocessing']['num_channels'] = num_channels

    if feature['preprocessing']['in_memory']:
        data[feature['name']] = np.empty(
            (num_images, height, width, num_channels),
            dtype=np.uint8)
        for i in range(len(dataset_df)):
            filepath = get_abs_path(csv_path,
                                    dataset_df[feature['name']][i])
            img = ImageBaseFeature._read_image_and_resize(
                filepath, width, height, should_resize, num_channels,
                preprocessing_parameters['resize_method'],
                user_specified_num_channels)
            try:
                data[feature['name']][i, :, :, :] = img
            except Exception:
                logger.error(
                    'Images are not of the same size. '
                    'Expected size is {}, '
                    'current image size is {}. '
                    'Images are expected to be all of the same size '
                    'or explicit image width and height are expected '
                    'to be provided. '
                    'Additional information: '
                    'https://uber.github.io/ludwig/user_guide/#image-features-preprocessing'
                    .format(first_image.shape, img.shape))
                raise
    else:
        data_fp = os.path.splitext(dataset_df.csv)[0] + '.hdf5'
        mode = 'w'
        if os.path.isfile(data_fp):
            mode = 'r+'
        with h5py.File(data_fp, mode) as h5_file:
            image_dataset = h5_file.create_dataset(
                feature['name'] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8)
            for i in range(len(dataset_df)):
                filepath = get_abs_path(csv_path,
                                        dataset_df[feature['name']][i])
                img = ImageBaseFeature._read_image_and_resize(
                    filepath, width, height, should_resize, num_channels,
                    preprocessing_parameters['resize_method'],
                    user_specified_num_channels)
                image_dataset[i, :height, :width, :] = img

        data[feature['name']] = np.arange(num_images)
def add_feature_data(
        feature,
        dataset_df,
        data,
        metadata,
        preprocessing_parameters
):
    set_default_value(
        feature['preprocessing'],
        'in_memory',
        preprocessing_parameters['in_memory']
    )

    if 'audio_feature' not in preprocessing_parameters:
        raise ValueError(
            'audio_feature dictionary has to be present in preprocessing '
            'for audio.')
    if 'type' not in preprocessing_parameters['audio_feature']:
        raise ValueError(
            'type has to be present in audio_feature dictionary '
            'for audio.')

    csv_path = None
    if hasattr(dataset_df, 'csv'):
        csv_path = os.path.dirname(os.path.abspath(dataset_df.csv))
    if (csv_path is None and
            not os.path.isabs(dataset_df[feature['name']][0])):
        raise ValueError(
            'Audio file paths must be absolute'
        )

    num_audio_utterances = len(dataset_df)
    padding_value = preprocessing_parameters['padding_value']
    normalization_type = preprocessing_parameters['norm']
    feature_name = feature['name']

    feature_dim = metadata[feature_name]['feature_dim']
    max_length = metadata[feature_name]['max_length']
    audio_feature_dict = preprocessing_parameters['audio_feature']
    audio_file_length_limit_in_s = preprocessing_parameters[
        'audio_file_length_limit_in_s']

    if num_audio_utterances == 0:
        raise ValueError(
            'There are no audio files in the dataset provided.')

    audio_stats = {
        'count': 0,
        'mean': 0,
        'var': 0,
        'std': 0,
        'max': 0,
        'min': float('inf'),
        'cropped': 0,
        'max_length_in_s': audio_file_length_limit_in_s
    }

    if feature['preprocessing']['in_memory']:
        data[feature['name']] = np.empty(
            (num_audio_utterances, max_length, feature_dim),
            dtype=np.float32
        )
        for i in range(len(dataset_df)):
            filepath = get_abs_path(
                csv_path,
                dataset_df[feature['name']][i]
            )
            audio_feature = AudioBaseFeature._read_audio_and_transform_to_feature(
                filepath, audio_feature_dict, feature_dim, max_length,
                padding_value, normalization_type, audio_stats
            )
            data[feature['name']][i, :, :] = audio_feature

        audio_stats['std'] = np.sqrt(
            audio_stats['var'] / float(audio_stats['count']))
        print_statistics = """
        {} audio files loaded.
        Statistics of audio file lengths:
        - mean: {:.4f}
        - std: {:.4f}
        - max: {:.4f}
        - min: {:.4f}
        - cropped audio_files: {}
        Max length was given as {}.
        """.format(audio_stats['count'], audio_stats['mean'],
                   audio_stats['std'], audio_stats['max'],
                   audio_stats['min'], audio_stats['cropped'],
                   audio_stats['max_length_in_s'])
        print(print_statistics)
def add_feature_data(feature_config, input_df, proc_df, metadata,
                     preprocessing_parameters, backend,
                     skip_save_processed_input):
    set_default_value(feature_config[PREPROCESSING], "in_memory",
                      preprocessing_parameters["in_memory"])

    name = feature_config[NAME]
    column = input_df[feature_config[COLUMN]]

    src_path = None
    if SRC in metadata:
        src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

    abs_path_column = backend.df_engine.map_objects(
        column,
        lambda row: get_abs_path(src_path, row)
        if isinstance(row, str) and not has_remote_protocol(row)
        else row,
    )

    (
        should_resize,
        width,
        height,
        num_channels,
        user_specified_num_channels,
    ) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters, abs_path_column)

    metadata[name][PREPROCESSING]["height"] = height
    metadata[name][PREPROCESSING]["width"] = width
    metadata[name][PREPROCESSING]["num_channels"] = num_channels

    read_image_if_bytes_obj_and_resize = partial(
        ImageFeatureMixin._read_image_if_bytes_obj_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters["resize_method"],
        user_specified_num_channels=user_specified_num_channels,
    )

    # TODO: alternatively use get_average_image() for unreachable images
    default_image = get_gray_default_image(num_channels, height, width)

    # check to see if the active backend can support lazy loading of
    # image features from the hdf5 cache.
    backend.check_lazy_load_supported(feature_config)

    in_memory = feature_config[PREPROCESSING]["in_memory"]
    if in_memory or skip_save_processed_input:
        metadata[name]["reshape"] = (num_channels, height, width)

        proc_col = backend.read_binary_files(
            abs_path_column, map_fn=read_image_if_bytes_obj_and_resize)
        proc_col = backend.df_engine.map_objects(
            proc_col, lambda row: row if row is not None else default_image)
        proc_df[feature_config[PROC_COLUMN]] = proc_col
    else:
        num_images = len(abs_path_column)

        data_fp = backend.cache.get_cache_path(
            wrap(metadata.get(SRC)), metadata.get(CHECKSUM), TRAINING)
        with upload_h5(data_fp) as h5_file:
            # todo future: add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature_config[PROC_COLUMN] + "_data",
                (num_images, num_channels, height, width),
                dtype=np.uint8)
            for i, img_entry in enumerate(abs_path_column):
                res = read_image_if_bytes_obj_and_resize(img_entry)
                image_dataset[i, :height, :width, :] = \
                    res if res is not None else default_image
            h5_file.flush()

        proc_df[feature_config[PROC_COLUMN]] = np.arange(num_images)

    return proc_df