def test_is_tif(self):
    """Check that is_tif recognizes tif and vrt files, with or without an extension"""
    fixtures = op.join('test', 'fixtures')
    sources = [op.join(fixtures, name) for name in ('drone.tif', 'drone.vrt')]

    # files that carry their .tif / .vrt extension are identified as tif
    for source in sources:
        self.assertTrue(is_tif(source))

    # extension-less copies of the same files must still be identified as tif
    for source in sources:
        with tempfile.TemporaryDirectory() as scratch_dir:
            bare_copy = op.join(scratch_dir, 'drone')
            shutil.copy(source, bare_copy)
            self.assertTrue(is_tif(bare_copy))
def preview(dest_folder, number, classes, imagery, ml_type, imagery_offset, **kwargs):
    """Produce imagery examples for specified classes

    Reads `labels.npz` from `dest_folder`, and for every class fetches up to
    `number` matching tiles into `<dest_folder>/examples/<class name>/`. For
    object-detection the labelled boxes are drawn onto each saved image; for
    segmentation the image is masked by the label array.

    Parameters
    ------------
    dest_folder: str
        Folder to save labels and example tiles into
    number: int
        Number of preview images to download per class
    classes: list
        A list of classes for machine learning training. Each class is defined as a dict
        with two required properties:
          - name: class name
          - filter: A Mapbox GL Filter.
        See the README for more details
    imagery: str
        Imagery template to download satellite images from.
        Ex: http://a.tiles.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN
    ml_type: str
        Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
    imagery_offset: list
        An optional list of integers representing the number of pixels to offset imagery. Ex. [15, -5] will
        move the images 15 pixels right and 5 pixels up relative to the requested tile bounds
    **kwargs: dict
        Other properties from CLI config passed as keywords to other utility functions
    """
    # open labels file
    labels_file = op.join(dest_folder, 'labels.npz')
    tiles = np.load(labels_file)

    # create example tiles directory
    examples_dir = op.join(dest_folder, 'examples')
    if not op.isdir(examples_dir):
        makedirs(examples_dir)

    # find examples tiles for each class and download
    print('Writing example images to {}'.format(examples_dir))

    # get image acquisition function based on imagery string
    image_function = download_tile_tms
    if is_tif(imagery):
        image_function = get_tile_tif

    for i, cl in enumerate(classes):
        # create class directory
        class_dir = op.join(examples_dir, cl.get('name'))
        if not op.isdir(class_dir):
            makedirs(class_dir)

        # lazily select tiles matching this class; class indices passed to
        # class_match are 1-based (i + 1)
        class_tiles = (t for t in tiles.files
                       if class_match(ml_type, tiles[t], i + 1))
        print('Downloading at most {} tiles for class {}'.format(
            number, cl.get('name')))
        for n, tile in enumerate(class_tiles):
            # n is zero-based, so stop once `number` tiles have been fetched
            # (the previous `n > number` check fetched one tile too many)
            if n >= number:
                break

            tile_img = image_function(tile, imagery, class_dir,
                                      imagery_offset)

            if ml_type == 'object-detection':
                # outline every labelled box in red on top of the tile
                img = Image.open(tile_img)
                draw = ImageDraw.Draw(img)
                for box in tiles[tile]:
                    draw.rectangle(((box[0], box[1]), (box[2], box[3])),
                                   outline='red')
                img.save(tile_img)
            elif ml_type == 'segmentation':
                # paste the tile through the label mask onto a blank canvas
                final = Image.new('RGB', (256, 256))
                img = Image.open(tile_img)
                mask = Image.fromarray(tiles[tile] * 255)
                final.paste(img, mask)
                final.save(tile_img)
def package_directory(dest_folder, classes, imagery, ml_type, seed=False,
                      split_names=('train', 'test'), split_vals=(0.8, .2),
                      **kwargs):
    """Generate an .npz file containing arrays for training machine learning algorithms

    Loads `labels.npz` and the images under `<dest_folder>/tiles`, shuffles
    them, splits them into the requested partitions and writes the result to
    `<dest_folder>/data.npz` as `x_<name>` / `y_<name>` arrays.

    Parameters
    ------------
    dest_folder: str
        Folder to save labels, tiles, and final numpy arrays into
    classes: list
        A list of classes for machine learning training. Each class is defined as a dict
        with two required properties:
          - name: class name
          - filter: A Mapbox GL Filter.
        See the README for more details
    imagery: str
        Imagery template to download satellite images from.
        Ex: http://a.tiles.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN
    ml_type: str
        Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
    seed: int
        Random generator seed. Optional, use to make results reproducible.
    split_vals: tuple
        Percentage of data to put in each category listed in split_names.
        Must be floats and must sum to one. Default: (0.8, 0.2)
    split_names: tuple
        Default: ('train', 'test')
        List of names for each subset of the data.
    **kwargs: dict
        Other properties from CLI config passed as keywords to other utility functions.

    Raises
    ------
    ValueError
        If `split_names` and `split_vals` differ in length, if `split_vals`
        does not sum to one, or if any resulting partition would be empty.
    """
    # if a seed is given, use it
    if seed:
        np.random.seed(seed)

    if len(split_names) != len(split_vals):
        raise ValueError('`split_names` and `split_vals` must be the same '
                         'length. Please update your config.')
    if not np.isclose(sum(split_vals), 1):
        raise ValueError(
            '`split_vals` must sum to one. Please update your config.')

    # open labels file, create tile array
    labels_file = op.join(dest_folder, 'labels.npz')
    labels = np.load(labels_file)
    # sort before shuffling so a fixed seed always yields the same order
    tile_names = sorted(labels.files)
    tiles = np.array(tile_names)
    np.random.shuffle(tiles)

    # find maximum number of features in advance so numpy shapes match
    if ml_type == 'object-detection':
        max_features = 0
        for tile in labels.files:
            features = len(labels[tile])
            if features > max_features:
                max_features = features

    x_vals = []
    y_vals = []

    # open the images and load those plus the labels into the final arrays
    if is_tif(imagery):  # if a TIF is provided, use jpg as tile format
        image_format = '.jpg'
    else:
        image_format = get_image_format(imagery, kwargs)

    for tile in tiles:
        image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
        try:
            img = Image.open(image_file)
        except FileNotFoundError:
            # we often don't download images for each label (e.g. background tiles)
            continue
        except OSError:
            print('Couldn\'t open {}, skipping'.format(image_file))
            continue

        np_image = np.array(img)
        img.close()

        x_vals.append(np_image)
        if ml_type == 'classification':
            y_vals.append(labels[tile])
        elif ml_type == 'object-detection':
            # zero pad object-detection arrays
            cl = labels[tile]
            y_vals.append(
                np.concatenate((cl, np.zeros((max_features - len(cl), 5)))))
        elif ml_type == 'segmentation':
            y_vals.append(labels[tile][..., np.newaxis])  # Add grayscale channel

    # Convert lists to numpy arrays
    x_vals = np.array(x_vals, dtype=np.uint8)
    y_vals = np.array(y_vals, dtype=np.uint8)

    # Get number of data samples per split from the float proportions
    n_samples = len(x_vals)
    split_n_samps = np.array([n_samples * val for val in split_vals])

    # Convert into a cumulative sum to get indices (fractions are truncated)
    split_inds = np.cumsum(split_n_samps).astype(int)

    # Validate the sizes `np.split` will actually produce. The last partition
    # runs to the end of the data, so its boundary is `n_samples`, not the
    # last cumulative index. (Comparing the plain list of float counts with
    # `== 0` always evaluated to False, so the guard could never fire.)
    boundaries = np.concatenate(([0], split_inds[:-1], [n_samples]))
    if np.any(np.diff(boundaries) <= 0):
        raise ValueError('Split must not generate zero samples per partition. '
                         'Change ratio of values in config file.')

    # Exclude last index as `np.split` handles splitting without that value
    split_arrs_x = np.split(x_vals, split_inds[:-1])
    split_arrs_y = np.split(y_vals, split_inds[:-1])

    save_dict = {}
    for si, split_name in enumerate(split_names):
        save_dict['x_{}'.format(split_name)] = split_arrs_x[si]
        save_dict['y_{}'.format(split_name)] = split_arrs_y[si]

    np.savez(op.join(dest_folder, 'data.npz'), **save_dict)
    print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
def download_images(dest_folder, classes, imagery, ml_type, background_ratio, imagery_offset=False, **kwargs):
    """Download satellite images specified by a URL and a label.npz file

    Reads `labels.npz` from `dest_folder`, selects every tile whose label
    contains at least one class, optionally adds a random sample of background
    tiles, and fetches each selected tile into `<dest_folder>/tiles`.

    Parameters
    ------------
    dest_folder: str
        Folder to save labels, tiles, and final numpy arrays into
    classes: list
        A list of classes for machine learning training. Each class is defined as a dict
        with two required properties:
          - name: class name
          - filter: A Mapbox GL Filter.
        See the README for more details
    imagery: str
        Imagery template to download satellite images from.
        Ex: http://a.tiles.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN
    ml_type: str
        Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
    background_ratio: float
        Determines the number of background images to download in single class problems. Ex. A value
        of 1 will download an equal number of background images to class images.
    imagery_offset: list
        An optional list of integers representing the number of pixels to offset imagery. Ex. [15, -5] will
        move the images 15 pixels right and 5 pixels up relative to the requested tile bounds
    **kwargs: dict
        Other properties from CLI config passed as keywords to other utility functions
    """
    # open labels file
    labels_file = op.join(dest_folder, 'labels.npz')
    tiles = np.load(labels_file)

    # create tiles directory
    tiles_dir = op.join(dest_folder, 'tiles')
    if not op.isdir(tiles_dir):
        makedirs(tiles_dir)

    # find tiles which have any matching class
    def class_test(value):
        """Return a truthy value when a tile's label contains any class"""
        if ml_type == 'object-detection':
            # at least one labelled box
            return len(value)
        elif ml_type == 'segmentation':
            # at least one non-zero mask pixel
            return np.sum(value) > 0
        elif ml_type == 'classification':
            # first label entry equal to 0 is treated as "has a class"
            return value[0] == 0
        return None

    class_tiles = [tile for tile in tiles.files if class_test(tiles[tile])]

    # for classification problems with a single class, we also get background
    # tiles up to len(class_tiles) * config.get('background_ratio')
    background_tiles = []
    if ml_type == 'classification' and len(classes) == 1:
        # background_ratio may be a float; a slice bound must be an integer
        limit = int(len(class_tiles) * background_ratio)
        # set membership keeps the background filter linear in tile count
        class_tile_set = set(class_tiles)
        background_tiles_full = [
            tile for tile in tiles.files if tile not in class_tile_set
        ]
        shuffle(background_tiles_full)
        background_tiles = background_tiles_full[:limit]

    # download tiles
    tiles = class_tiles + background_tiles
    print('Downloading {} tiles to {}'.format(len(tiles), tiles_dir))

    # get image acquisition function based on imagery string
    image_function = download_tile_tms
    if is_tif(imagery):
        image_function = get_tile_tif

    for tile in tiles:
        image_function(tile, imagery, tiles_dir, imagery_offset)
def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
    """Bundle downloaded tiles and their labels into a train/test .npz file

    Reads `labels.npz` and the images under `<dest_folder>/tiles`, shuffles
    the tiles, and writes `x_train`, `y_train`, `x_test` and `y_test` arrays
    to `<dest_folder>/data.npz`.

    Parameters
    ------------
    dest_folder: str
        Folder to save labels, tiles, and final numpy arrays into
    classes: list
        A list of classes for machine learning training. Each class is defined as a dict
        with two required properties:
          - name: class name
          - filter: A Mapbox GL Filter.
        See the README for more details
    imagery: str
        Imagery template to download satellite images from.
        Ex: http://a.tiles.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN
    ml_type: str
        Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
    seed: int
        Random generator seed. Optional, use to make results reproducible.
    train_size: float
        Portion of the data to use in training, the remainder is used as test data (default 0.8)
    **kwargs: dict
        Other properties from CLI config passed as keywords to other utility functions
    """
    # seed numpy's global generator only when a truthy seed was supplied
    if seed:
        np.random.seed(seed)

    # load the labels and shuffle a sorted copy of the tile names, so that a
    # fixed seed always produces the same ordering
    labels = np.load(op.join(dest_folder, 'labels.npz'))
    tiles = np.array(sorted(labels.files))
    np.random.shuffle(tiles)

    # object-detection labels are padded to the largest feature count so
    # every label array ends up with the same shape
    if ml_type == 'object-detection':
        max_features = max((len(labels[name]) for name in labels.files), default=0)

    # tile extension comes from the imagery URL path, unless the imagery is a
    # TIF, in which case tiles are stored as jpg
    url_extension = op.splitext(urlparse(imagery).path)[1]
    image_format = '.jpg' if is_tif(imagery) else url_extension

    images = []
    targets = []
    tiles_dir = op.join(dest_folder, 'tiles')
    for tile in tiles:
        image_file = op.join(tiles_dir, '{}{}'.format(tile, image_format))
        try:
            img = Image.open(image_file)
        except FileNotFoundError:
            # not every label has a downloaded image (e.g. background tiles)
            continue
        except OSError:
            print('Couldn\'t open {}, skipping'.format(image_file))
            continue

        pixels = np.array(img)
        img.close()
        images.append(pixels)

        if ml_type == 'classification':
            targets.append(labels[tile])
        elif ml_type == 'object-detection':
            # zero pad up to max_features rows of 5 values each
            boxes = labels[tile]
            padding = np.zeros((max_features - len(boxes), 5))
            targets.append(np.concatenate((boxes, padding)))
        elif ml_type == 'segmentation':
            # add a trailing grayscale channel axis
            targets.append(labels[tile][..., np.newaxis])

    # index separating the training samples from the test samples
    split_index = int(len(images) * train_size)

    # convert lists to numpy arrays
    x_vals = np.array(images, dtype=np.uint8)
    y_vals = np.array(targets, dtype=np.uint8)

    out_path = op.join(dest_folder, 'data.npz')
    print('Saving packaged file to {}'.format(out_path))
    np.savez(
        out_path,
        x_train=x_vals[:split_index, ...],
        y_train=y_vals[:split_index, ...],
        x_test=x_vals[split_index:, ...],
        y_test=y_vals[split_index:, ...],
    )