def main(split_filepath, model_filepath, output_dir, dst_filepath, method,
         img_cluster, refine, refine_beta, refine_int_rescale, tree_val,
         nontree_val):
    logger = logging.getLogger(__name__)

    logger.info("classifying tiles for cluster %d with classifier from %s",
                img_cluster, model_filepath)
    split_df = pd.read_csv(split_filepath)
    clf = jl.load(model_filepath)

    pred_imgs = dtr.Classifier(
        tree_val=tree_val, nontree_val=nontree_val, refine=refine,
        refine_beta=refine_beta,
        refine_int_rescale=refine_int_rescale).classify_imgs(
            split_df, output_dir, clf=clf, method=method,
            img_cluster=img_cluster)
    logger.info("dumped %d classified tiles to %s", len(pred_imgs),
                output_dir)

    pd.Series(pred_imgs).to_csv(dst_filepath, index=False, header=False)
    logger.info("dumped list of classified tiles to %s", dst_filepath)
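
# Usage sketch (assumptions, not part of the original script): the paths
# below are hypothetical placeholders, `main` is assumed to be invoked
# directly rather than through a CLI wrapper, and the refine/value settings
# mirror detectree's documented defaults.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main(
        "data/interim/split.csv",      # hypothetical train/test split CSV
        "models/0.joblib",             # hypothetical trained classifier
        "data/processed/tiles",        # directory for classified tiles
        "data/processed/tiles-0.csv",  # list of classified tile filepaths
        "cluster-II",                  # detectree classification method
        0,                             # img_cluster
        True,                          # refine
        50,                            # refine_beta
        10000,                         # refine_int_rescale
        255,                           # tree_val
        0,                             # nontree_val
    )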
def make_confusion_df(
        lidar_gdf,
        lidar_raw_dir,
        split_df=None,
        img_filepaths=None,
        n=None,
        frac=0.05,
        clf=None,
        clf_dict=None,
):
    c = dtr.Classifier()
    truth_pred_lazy = []
    if clf is not None:
        if split_df is None:
            num_validation_tiles = int(frac * len(img_filepaths))
            # sample without replacement so that no validation tile is
            # counted twice (`random.choices` samples with replacement)
            test_filepaths = random.sample(img_filepaths,
                                           num_validation_tiles)
        else:
            test_filepaths = _get_validation_df(split_df, n,
                                                frac)["img_filepath"]
        for img_filepath in test_filepaths:
            truth_pred_lazy.append(
                dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                          lidar_raw_dir, c, clf))
    else:
        validation_df = _get_validation_df(split_df, n, frac)
        for img_cluster, cluster_df in validation_df.groupby("img_cluster"):
            clf = clf_dict[img_cluster]
            for img_filepath in cluster_df["img_filepath"]:
                truth_pred_lazy.append(
                    dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                              lidar_raw_dir, c, clf))

    with diagnostics.ProgressBar():
        truth_pred = np.hstack(dask.compute(*truth_pred_lazy))
    truth_ser = pd.Series(truth_pred[0], name="actual")
    pred_ser = pd.Series(truth_pred[1], name="predicted")
    return pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
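
# Usage sketch (hypothetical inputs; `lidar_gdf`, the LIDAR directory,
# `split_df` and `clf_dict` are assumed to come from earlier pipeline steps
# not shown here). Since the crosstab is divided by the total pixel count,
# its cells are joint frequencies, so the trace of the returned frame is
# the estimated overall accuracy.
confusion_df = make_confusion_df(lidar_gdf, "data/raw/lidar",
                                 split_df=split_df, clf_dict=clf_dict)
print("estimated accuracy:", np.trace(confusion_df))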
def classify_imgs(ctx, split_filepath, clf_filepath, clf_dir, method,
                  img_cluster, tree_val, nontree_val, refine, refine_beta,
                  refine_int_rescale, pixel_features_builder_kws, output_dir):
    logger = ctx.obj['LOGGER']

    split_df = pd.read_csv(split_filepath)

    if clf_filepath is not None:
        clf_dict = None
        clf = joblib.load(clf_filepath)
        logger.info("Classifying images from %s with the classifier from %s",
                    split_filepath, clf_filepath)
    if clf_dir is not None:
        # note that `clf_dir` takes precedence if both options are provided
        clf = None
        clf_dict = {}
        # use a distinct loop variable so that the `img_cluster` argument
        # (forwarded to `classify_imgs` below) is not shadowed
        for cluster_label in split_df['img_cluster'].unique():
            clf_dict[cluster_label] = joblib.load(
                path.join(clf_dir, f"{cluster_label}.joblib"))

    pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)
    c = dtr.Classifier(tree_val=tree_val, nontree_val=nontree_val,
                       refine=refine, refine_beta=refine_beta,
                       refine_int_rescale=refine_int_rescale,
                       **pixel_features_builder_kws)
    if output_dir is None:
        output_dir = ''
    pred_imgs = c.classify_imgs(split_df, output_dir, clf=clf,
                                clf_dict=clf_dict, method=method,
                                img_cluster=img_cluster)
    logger.info("Dumped %d predicted images to %s", len(pred_imgs),
                output_dir)
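
# Smoke-test sketch using click's `CliRunner` (assumptions: the command is
# registered as `classify-imgs` on a click group named `cli`, and the option
# names and file paths below are hypothetical, not confirmed by the source).
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(cli, [
    "classify-imgs", "data/interim/split.csv", "--clf-dir", "models",
    "--output-dir", "data/processed/tiles"
])
assert result.exit_code == 0, result.output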
def classify_img(ctx, img_filepath, clf_filepath, tree_val, nontree_val,
                 refine, refine_beta, refine_int_rescale,
                 pixel_features_builder_kws, output_filepath):
    logger = ctx.obj['LOGGER']

    logger.info("Classifying %s with the classifier from %s", img_filepath,
                clf_filepath)

    pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)
    c = dtr.Classifier(tree_val=tree_val, nontree_val=nontree_val,
                       refine=refine, refine_beta=refine_beta,
                       refine_int_rescale=refine_int_rescale,
                       **pixel_features_builder_kws)
    if output_filepath is None:
        filename, ext = path.splitext(path.basename(img_filepath))
        output_filepath = f"{filename}-pred{ext}"
    c.classify_img(img_filepath, joblib.load(clf_filepath), output_filepath)
    logger.info("Dumped predicted image to %s", output_filepath)
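
# Direct library usage sketch (the paths are hypothetical): `classify_img`
# returns the predicted mask as a numpy array and, when `output_filepath`
# is provided, also dumps it as a raster file.
clf = joblib.load("models/clf.joblib")
y_pred = dtr.Classifier().classify_img("data/tiles/tile_0-0.tif", clf,
                                       "tile_0-0-pred.tif")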
def main(validation_img_dir, split_filepath, models_dir, dst_filepath):
    logger = logging.getLogger(__name__)

    validation_img_filepaths = glob.glob(
        path.join(validation_img_dir, settings.IMG_DEFAULT_FILENAME_PATTERN))
    logger.info("computing confusion data frame with the tiles in %s",
                validation_img_dir)

    split_df = pd.read_csv(split_filepath, index_col=0)
    c = dtr.Classifier()
    observations = []
    predictions = []
    for validation_img_filepath in validation_img_filepaths:
        validation_img_filename = path.basename(validation_img_filepath)
        try:
            img_filepath, img_cluster = split_df[
                split_df['img_filepath'].str.endswith(
                    validation_img_filename)][[
                        'img_filepath', 'img_cluster'
                    ]].iloc[0]
        except IndexError:
            raise ValueError(
                f"Could not find an image named {validation_img_filename} "
                f"in {split_filepath}")
        with rio.open(validation_img_filepath) as src:
            observations.append(src.read(1))
        predictions.append(
            c.classify_img(
                img_filepath,
                jl.load(path.join(models_dir, f'{img_cluster}.joblib'))))

    truth_ser = pd.Series(np.hstack(observations).flatten(), name='obs')
    pred_ser = pd.Series(np.hstack(predictions).flatten(), name='pred')
    df = pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
    logger.info("estimated accuracy score is %f", np.trace(df))

    df.to_csv(dst_filepath)
    logger.info("dumped confusion data frame to %s", dst_filepath)
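
# Why `np.trace(df)` is the estimated accuracy (illustrative toy example,
# not pipeline code): dividing the crosstab by the total pixel count turns
# each cell into a joint frequency, so the diagonal sums to the share of
# pixels whose predicted label equals the observed one.
obs = pd.Series([0, 0, 255, 255], name="obs")
pred = pd.Series([0, 255, 255, 255], name="pred")
toy_df = pd.crosstab(obs, pred) / len(obs)
assert np.trace(toy_df) == 0.75  # 3 of 4 pixels match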
def test_classifier(self):
    # TODO: test init arguments of `Classifier`
    c = dtr.Classifier()
    img_filepath = self.split_i_df.iloc[0]['img_filepath']

    # test that `classify_img` returns a ndarray
    self.assertIsInstance(c.classify_img(img_filepath, self.clf),
                          np.ndarray)
    # test that `classify_img` with `output_filepath` returns a ndarray
    # and dumps it
    output_filepath = path.join(self.tmp_output_dir, 'foo.tif')
    y_pred = c.classify_img(img_filepath, self.clf, output_filepath)
    self.assertIsInstance(y_pred, np.ndarray)
    self.assertTrue(os.path.exists(output_filepath))
    # remove it so that the output dir is clean in the tests below
    os.remove(output_filepath)

    # test that `classify_imgs` with implicit `cluster-I` method returns a
    # list and that the images have been dumped
    pred_imgs = c.classify_imgs(self.split_i_df, self.tmp_output_dir,
                                self.clf)
    self.assertIsInstance(pred_imgs, list)
    self._test_imgs_exist_and_rm(pred_imgs)

    # test that `classify_imgs` with implicit `cluster-II` method, `clf`
    # and `img_cluster` returns a list and that the images have been dumped
    pred_imgs = c.classify_imgs(self.split_ii_df, self.tmp_output_dir,
                                self.clf, img_cluster=self.img_cluster)
    self.assertIsInstance(pred_imgs, list)
    self._test_imgs_exist_and_rm(pred_imgs)
    # test that this works equally when providing `clf_dict`
    pred_imgs = c.classify_imgs(self.split_ii_df, self.tmp_output_dir,
                                clf_dict=self.clf_dict,
                                img_cluster=self.img_cluster)
    self.assertIsInstance(pred_imgs, list)
    self._test_imgs_exist_and_rm(pred_imgs)

    # test that `classify_imgs` with implicit `cluster-II` method and
    # `clf_dict` returns a dict and that the images have been dumped
    pred_imgs = c.classify_imgs(self.split_ii_df, self.tmp_output_dir,
                                clf_dict=self.clf_dict)
    self.assertIsInstance(pred_imgs, dict)
    for img_cluster in pred_imgs:
        self._test_imgs_exist_and_rm(pred_imgs[img_cluster])

    # test that `clf=None` with 'cluster-I' raises a `ValueError`
    self.assertRaises(ValueError, c.classify_imgs, self.split_i_df,
                      self.tmp_output_dir)
    # test that `clf=None` and `clf_dict=None` with 'cluster-II' raises a
    # `ValueError`
    self.assertRaises(ValueError, c.classify_imgs, self.split_ii_df,
                      self.tmp_output_dir)
    # test that `clf_dict=None` with 'cluster-II' and `img_cluster=None`
    # raises a `ValueError`, even when providing a non-None `clf`
    self.assertRaises(ValueError, c.classify_imgs, self.split_ii_df,
                      self.tmp_output_dir, clf=c)

    # TODO: test with explicit `method` keyword argument

    # test that `Classifier` with `refine=False` also returns an ndarray
    # from `classify_img`
    c = dtr.Classifier(refine=False)
    img_filepath = self.split_i_df.iloc[0]['img_filepath']
    self.assertIsInstance(c.classify_img(img_filepath, self.clf),
                          np.ndarray)
def main(
        tile_filepath,
        split_filepath,
        models_dir,
        lidar_dir,
        validation_tiles_dir,
        dst_filepath,
        high_veg_val,
        num_opening_iterations,
        num_dilation_iterations,
        output_dtype,
        output_tree_val,
        output_nodata,
):
    logger = logging.getLogger(__name__)

    # predict the tile using the trained classifier
    split_df = pd.read_csv(split_filepath, index_col=0)
    tile_cluster = split_df[split_df["img_filepath"] ==
                            tile_filepath]["img_cluster"].iloc[0]
    pred_arr = dtr.Classifier().classify_img(
        tile_filepath,
        jl.load(path.join(models_dir, f"{tile_cluster}.joblib")))

    # load lidar default settings (test against `None` explicitly so that
    # legitimate falsy values, e.g., a nodata of 0, are not overridden)
    if high_veg_val is None:
        high_veg_val = lidar_utils.HIGH_VEG_VAL
    if num_opening_iterations is None:
        num_opening_iterations = lidar_utils.NUM_OPENING_ITERATIONS
    if num_dilation_iterations is None:
        num_dilation_iterations = lidar_utils.NUM_DILATION_ITERATIONS
    if output_dtype is None:
        output_dtype = lidar_utils.OUTPUT_DTYPE
    if output_tree_val is None:
        output_tree_val = lidar_utils.OUTPUT_TREE_VAL
    if output_nodata is None:
        output_nodata = lidar_utils.OUTPUT_NODATA

    # estimate the "ground-truth" mask with LIDAR data
    lidar_filepath = path.join(lidar_dir,
                               lidar_utils.get_lidar_filename(tile_filepath))
    validation_tile_filepath = path.join(validation_tiles_dir,
                                         path.basename(tile_filepath))
    _ = dtr.LidarToCanopy(
        tree_threshold=high_veg_val,
        output_dtype=output_dtype,
        output_tree_val=output_tree_val,
        output_nodata=output_nodata,
    ).to_canopy_mask(
        lidar_filepath,
        lidar_utils.LIDAR_TREE_VALUES,
        tile_filepath,
        output_filepath=validation_tile_filepath,
        postprocess_func=lidar_utils.postprocess_canopy_mask,
        postprocess_func_args=[
            high_veg_val,
            num_opening_iterations,
            num_dilation_iterations,
            output_dtype,
            output_tree_val,
        ],
    )
    with rio.open(validation_tile_filepath) as src:
        obs_arr = src.read(1)

    # compute the confusion matrix and dump it to a file
    obs_ser = pd.Series(obs_arr.flatten(), name="obs")
    pred_ser = pd.Series(pred_arr.flatten(), name="pred")
    df = pd.crosstab(obs_ser, pred_ser) / len(obs_ser)
    logger.info("estimated accuracy score is %f", np.trace(df))
    df.to_csv(dst_filepath)
    logger.info("dumped confusion data frame to %s", dst_filepath)
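
# Usage sketch (hypothetical paths, direct invocation assumed): passing
# `None` for the six optional arguments falls back to the `lidar_utils`
# defaults loaded above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main("data/tiles/tile_0-0.tif", "data/interim/split.csv", "models",
         "data/raw/lidar", "data/interim/validation_tiles",
         "reports/confusion_0-0.csv", None, None, None, None, None, None)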