def extract_statistics(image_file, boxes_gpd, n_retain, spectra_ml_csv):
    """
    Reads an image, multiplies it by the OSM mask, collects the spectra of randomly drawn label boxes plus
    background spectra and writes them back to the spectra csv. The OSM mask is additionally written to
    dir_osm as "osm<basename of image_file>".
    :param image_file: str path of the image handed to rio_read_all_bands
    :param boxes_gpd: object with a geometry column (e.g. a GeoDataFrame) holding the label boxes
    :param n_retain: int number of boxes drawn without replacement, must not exceed len(boxes_gpd)
    :param spectra_ml_csv: str path of the csv holding the spectra, it is read first and overwritten at the end
    :raises ValueError: from np.random.choice if n_retain is larger than the number of boxes
    """
    spectra_ml = pd.read_csv(spectra_ml_csv, index_col=0)
    arr, meta = rio_read_all_bands(image_file)
    arr = rescale_s2(arr)
    osm_file = os.path.join(dir_osm, "osm%s" % os.path.basename(image_file))
    lat, lon = lat_from_meta(meta), lon_from_meta(meta)
    bbox_epsg4326 = list(np.flip(metadata_to_bbox_epsg4326(meta)))
    osm_mask = get_osm_mask(bbox_epsg4326, meta["crs"], arr[0], {"lat": lat, "lon": lon}, dir_osm)
    # the mask is written as a single band raster, hence the adapted metadata
    meta["count"] = 1
    meta["dtype"] = osm_mask.dtype
    with rio.open(osm_file, "w", **meta) as tgt:
        tgt.write(osm_mask, 1)
    arr *= osm_mask
    n_bands = 3
    ratios = np.zeros((n_bands + 1, arr.shape[1], arr.shape[2]))
    ratio_counterparts = [2, 0, 0]
    for band_idx in range(n_bands):
        ratios[band_idx] = normalized_ratio(arr[band_idx], arr[ratio_counterparts[band_idx]])
    ratios[3] = normalized_ratio(arr[1], arr[2])  # add green vs. blue
    # NOTE(review): lat and lon are derived a second time after meta["count"] and meta["dtype"] were changed,
    # presumably the result is identical to the first call - confirm
    lat, lon = lat_from_meta(meta), lon_from_meta(meta)
    # shift lat lon to pixel center
    lat_shifted, lon_shifted = shift_lat(lat, 0.5), shift_lon(lon, 0.5)
    boxes_training = boxes_gpd
    means_arr = [np.nanmean(arr[band_idx]) for band_idx in [0, 1, 2, 3]]
    np.random.seed(99)  # fixed seed in order to draw the same boxes on each run
    for i in np.random.choice(np.arange(len(boxes_training)), n_retain, replace=False):
        # i is a position, not an index label, hence iloc; label based access fails or picks the wrong
        # row as soon as the index of the boxes is not 0..n-1
        box = boxes_training.geometry.iloc[i].bounds
        x0, x1 = get_smallest_deviation(lon_shifted, box[0]), get_smallest_deviation(lon_shifted, box[2])
        y1, y0 = get_smallest_deviation(lat_shifted, box[1]), get_smallest_deviation(lat_shifted, box[3])
        sub_arr = arr[0:4, y0:y1 + 1, x0:x1 + 1].copy()
        sub_ratios = ratios[:, y0:y1 + 1, x0:x1 + 1].copy()
        spectra_ml = extract_rgb_spectra(spectra_ml, sub_arr, sub_ratios, means_arr)
        # mask out box reflectances in order to avoid using them as background
        arr[:, y0:y1 + 1, x0:x1 + 1] = np.nan
        ratios[:, y0:y1 + 1, x0:x1 + 1] = np.nan
    print("Number of training boxes: %s" % n_retain)
    # ensure equal number of blueish, greenish and reddish spectra
    spectra_ml = add_background(spectra_ml, arr, ratios, means_arr, int(n_retain))
    spectra_ml.to_csv(spectra_ml_csv)
def preprocess_bands(self, band_stack, subset_box=None):
    """
    Rescales the first four bands, optionally crops them to an index window, multiplies them by the OSM mask
    and hands the result to self._build_variables. Updates self.lat and self.lon, and in case of subsetting
    also height, width and transform of self.meta.
    :param band_stack: array of which the first four entries along the first axis are used
    :param subset_box: dict with keys ymin, ymax, xmin, xmax used as slice bounds, None for no subsetting
    :raises ValueError: if the OSM mask does not contain any non-zero value
    """
    scaled = band_stack[0:4].copy()
    # NaN is replaced by 0 before rescaling, afterwards 0 is turned into NaN again
    scaled[np.isnan(scaled)] = 0
    scaled = rescale_s2(scaled)
    scaled[scaled == 0] = np.nan
    band_stack = None  # drop the local reference to the input stack
    lat, lon = lat_from_meta(self.meta), lon_from_meta(self.meta)
    self.lat, self.lon = lat, lon
    if subset_box is not None:
        row_start, row_stop = subset_box["ymin"], subset_box["ymax"]
        col_start, col_stop = subset_box["xmin"], subset_box["xmax"]
        scaled = scaled[:, row_start:row_stop, col_start:col_stop]
        self.lat, self.lon = self.lat[row_start:row_stop], self.lon[col_start:col_stop]
        self.meta["height"], self.meta["width"] = scaled.shape[1], scaled.shape[2]
        # move the origin of the transform to the first coordinates of the subset
        coefficients = list(self.meta["transform"])
        coefficients[2], coefficients[5] = self.lon[0], self.lat[0]
        self.meta["transform"] = Affine(*coefficients[:6])
    bbox_epsg4326 = list(np.flip(metadata_to_bbox_epsg4326(self.meta)))
    crs_string = "EPSG:" + str(self.meta["crs"].to_epsg())
    coordinates = {"lat": self.lat, "lon": self.lon}
    osm_mask = self._get_osm_mask(bbox_epsg4326, crs_string, scaled[0], coordinates, dirs["osm"])
    if not np.count_nonzero(osm_mask):
        raise ValueError("No OSM roads of requested road types in aoi")
    scaled *= osm_mask
    scaled[scaled == 0] = np.nan
    osm_mask = None  # the mask is not needed any longer
    self._build_variables(scaled)
def pre_process(self, band_dict, metadata, subset_box=None):
    """
    rescales data to 0-1 and calculates lat, lon coordinates, masks to OSM roads
    :param band_dict: dict holding 4 arrays with shape (height, width), keys are B02, B03, B04, B08
    :param metadata: dict metadata from rasterio IO
    :param subset_box: dict with int ymin, ymax, xmin, xmax, None in order to keep the full extent
    :return: array holding the bands in the order B04, B03, B02, B08, masked and rescaled, zeros set to NaN
    :raises TypeError: if band_dict or metadata is not a dict
    :raises KeyError: if band_dict lacks one of the four bands or metadata lacks "crs"
    """
    self.metadata = metadata
    if not isinstance(band_dict, dict):
        raise TypeError("'band_dict' must be a dictionary")
    try:
        for band_name in ("B02", "B03", "B04", "B08"):
            band_dict[band_name]
    except KeyError:
        raise KeyError("'band_dict' must contain 'B02', 'B03', 'B04', 'B08'") from None
    if not isinstance(metadata, dict):
        raise TypeError("'metadata' must be a dictionary")
    self.crs = metadata["crs"]
    try:
        self.lat, self.lon = metadata["lat"], metadata["lon"]
    except KeyError:
        # coordinates are not part of the metadata, derive them
        self.lat, self.lon = lat_from_meta(metadata), lon_from_meta(metadata)
    box_epsg4326 = metadata_to_bbox_epsg4326(metadata)
    # exist_ok avoids failing if the directory is created in the meantime, parents are created as well
    os.makedirs(self.dir_ancil, exist_ok=True)
    self.box_epsg4326 = list(np.flip(box_epsg4326))
    osm_mask = self.get_osm_mask(self.box_epsg4326, metadata["crs"], band_dict["B02"],
                                 {"lat": self.lat, "lon": self.lon}, self.dir_ancil)
    # binary mask: 1 where the mask is set, NaN elsewhere
    # NOTE(review): assigning NaN assumes that get_osm_mask returns a float array - confirm
    osm_mask[osm_mask != 0] = 1
    osm_mask[osm_mask == 0] = np.nan
    band_stack_np = np.array([band_dict["B04"], band_dict["B03"], band_dict["B02"], band_dict["B08"]])
    band_stack_np *= osm_mask
    # explicit check instead of catching TypeError, which also hid genuine type errors such as
    # non-integer bounds and silently returned the full extent
    if subset_box is not None:
        ymin, ymax = subset_box["ymin"], subset_box["ymax"]
        xmin, xmax = subset_box["xmin"], subset_box["xmax"]
        band_stack_np = band_stack_np[:, ymin:ymax, xmin:xmax]
        # NOTE(review): lat and lon keep one element more than the array has rows and columns, whereas
        # preprocess_bands slices the coordinates without + 1 - confirm which one is intended
        self.lat = self.lat[ymin:ymax + 1]
        self.lon = self.lon[xmin:xmax + 1]
    band_stack_np_rescaled = band_stack_np.copy()
    band_stack_np = None
    # NaN is replaced by 0 before rescaling, afterwards 0 is turned into NaN again
    band_stack_np_rescaled[np.isnan(band_stack_np_rescaled)] = 0
    band_stack_np_rescaled = rescale(band_stack_np_rescaled, 0, 1)
    band_stack_np_rescaled[band_stack_np_rescaled == 0] = np.nan
    return band_stack_np_rescaled
def validate_boxes(self):
    """
    Compares predicted boxes with label boxes for each validation tile listed in tiles.csv and writes accuracy
    metrics for score thresholds from 0 to 1.9 (step 0.1) to boxes_validation_file. If no detections file
    exists for a tile, the detection is run on the subset image with the longest file name, restricted to the
    extent of the label boxes. A prediction counts as true positive if its IoU with a not yet matched label
    box exceeds 0.25.
    """
    tiles_pd = pd.read_csv(os.path.join(dir_training, "tiles.csv"), sep=",")
    try:
        os.remove(boxes_validation_file)
    except FileNotFoundError:
        pass
    boxes_validation_pd = pd.DataFrame()
    for tile in list(tiles_pd["validation_tiles"]):
        print(tile)
        if not isinstance(tile, str):  # nan
            continue
        imgs = glob(dir_s2_subsets + os.sep + "*" + tile + "*.tif")
        validation_boxes = gpd.read_file(glob(os.path.join(dir_labels, "*%s*.gpkg" % tile))[0])
        detection_files = glob(os.path.join(self.dirs["detections"], "*%s*.gpkg" % tile))
        if detection_files:
            prediction_boxes_file = detection_files[0]
        else:
            img_file = max(imgs, key=len)  # first file among those with the longest name
            name = os.path.basename(img_file).split(".tif")[0]
            print(img_file)
            prediction_boxes_file = os.path.join(self.dirs["detections"], name + "_boxes.gpkg")
            rf_td = RFTruckDetector()
            band_data = rf_td.read_bands(img_file)
            # subset to label extent
            lat, lon = lat_from_meta(rf_td.meta), lon_from_meta(rf_td.meta)
            extent = validation_boxes.total_bounds  # process only subset where boxes given
            ymin, ymax = np.argmin(np.abs(lat - extent[3])), np.argmin(np.abs(lat - extent[1]))
            xmin, xmax = np.argmin(np.abs(lon - extent[0])), np.argmin(np.abs(lon - extent[2]))
            rf_td.preprocess_bands(band_data, {"ymin": ymin, "xmin": xmin, "ymax": ymax + 1, "xmax": xmax + 1})
            # do detection
            prediction_array = rf_td.predict()
            prediction_boxes = rf_td.extract_objects(prediction_array)
            rf_td.prediction_raster_to_gtiff(prediction_array,
                                             os.path.join(self.dirs["detections"], name + "_raster"))
            rf_td.prediction_boxes_to_gpkg(prediction_boxes, prediction_boxes_file)
            prediction_array, band_data, rf_td = None, None, None  # release the large objects
        prediction_boxes = gpd.read_file(prediction_boxes_file)
        validation_geometries = list(validation_boxes.geometry)
        n_validation_boxes = len(validation_boxes)
        # iterate over score thresholds, thresholds increase hence the boxes may be filtered cumulatively
        for score_threshold in np.arange(0, 2, 0.1):
            prediction_boxes = prediction_boxes[prediction_boxes["score"] > score_threshold]
            n_prediction_boxes = len(prediction_boxes)
            intersection_over_union = []
            # positions of the label boxes matched so far; tracking positions instead of geometries avoids
            # the linear geometry comparison and keeps duplicate label geometries apart
            matched_validation_boxes = set()
            for prediction_box in prediction_boxes.geometry:
                for validation_idx, validation_box in enumerate(validation_geometries):
                    if validation_idx in matched_validation_boxes:
                        continue
                    if not prediction_box.intersects(validation_box):
                        continue
                    union = prediction_box.union(validation_box)
                    intersection = prediction_box.intersection(validation_box)
                    iou = intersection.area / union.area
                    if iou > 0.25:
                        matched_validation_boxes.add(validation_idx)
                        intersection_over_union.append(iou)
                        break  # each prediction is matched to one label box at most
            tp = len(intersection_over_union)
            fn = n_validation_boxes - tp
            fp = n_prediction_boxes - tp
            # metrics are set to 0 where undefined (no predictions and/or no label boxes)
            precision = tp / n_prediction_boxes if n_prediction_boxes > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = tp / (tp + fp + fn) if (tp + fp + fn) > 0 else 0
            row_idx = len(boxes_validation_pd)
            boxes_validation_pd.loc[row_idx, "detection_file"] = prediction_boxes_file
            boxes_validation_pd.loc[row_idx, "accuracy"] = accuracy
            boxes_validation_pd.loc[row_idx, "precision"] = precision
            boxes_validation_pd.loc[row_idx, "recall"] = recall
            boxes_validation_pd.loc[row_idx, "score_threshold"] = score_threshold
            boxes_validation_pd.loc[row_idx, "n_prediction_boxes"] = n_prediction_boxes
            boxes_validation_pd.loc[row_idx, "n_validation_boxes"] = n_validation_boxes
            # NaN if nothing was matched, without the warning np.mean emits for an empty list
            boxes_validation_pd.loc[row_idx, "IoU"] = np.mean(intersection_over_union) \
                if intersection_over_union else np.nan
    boxes_validation_pd.to_csv(boxes_validation_file)
def validate_acquisition_wise(self, period):
    """
    Iterates over the sub-periods returned by self.generate_process_periods and calls self.validate_with_bast
    for each sub-period for which a detections file or a merged band file exists. If only the merged band file
    exists, the detection is run first and written to the detections file. Finally calls
    self.plot_bast_comparison with self.validation_file.
    :param period: handed to self.generate_process_periods
    """
    dates_between = self.generate_process_periods(period)
    station_folder = "zst" + self.station_name.split("(")[1].split(")")[0]
    if len(station_folder) == 4:
        # the first bracket pair yielded a single character only, use the bracket after ") " instead
        station_folder = "zst" + self.station_name.split(") ")[1].split("(")[1][0:-1]
    station_file = os.path.join(self.dirs["station_counts"], station_folder,
                                station_folder + "_%s.csv" % str(year))
    for sub_period in dates_between:
        print("At date: %s" % sub_period)
        self.date = sub_period[0]
        band_names = ["B04", "B03", "B02", "B08", "CLM"]
        resolution = 10
        folder = ""
        dir_save_archive = os.path.join(self.dirs["s2"], "archive")
        dir_save_archive = "G:\\archive"  # overrides the directory derived above
        if self.station_name_clear in additional_stations.keys():
            area_id = additional_stations[self.station_name_clear]
        else:
            area_id = self.station_name_clear
        sh_bbox = (self.bbox_epsg4326[1], self.bbox_epsg4326[0], self.bbox_epsg4326[3], self.bbox_epsg4326[2])
        detections_file = os.path.join(self.dirs["detections"],
                                       "s2_detections_%s_%s.gpkg" % (self.date, area_id))
        merged_file = os.path.join(dir_save_archive,
                                   "s2_bands_%s_%s_%s_merged.tiff" % (area_id, sub_period[0], sub_period[1]))
        # detections are there yet, compare right away
        if os.path.exists(detections_file):
            self.validate_with_bast(sub_period[0], detections_file, station_file, merged_file)
            continue
        # neither detections nor band data
        if not os.path.exists(merged_file):
            continue
        # band data is there, detect first and compare afterwards
        detector = RFTruckDetector()
        band_stack = detector.read_bands(merged_file)
        detector.preprocess_bands(band_stack[0:4])
        prediction = detector.predict()
        prediction_boxes = detector.extract_objects(prediction)
        try:
            detector.prediction_boxes_to_gpkg(prediction_boxes, detections_file)
        except ValueError:
            print("Number of detections: %s, cannot write" % len(prediction_boxes))
            continue
        self.detections_file = detections_file
        with rio.open(merged_file, "r") as src:
            meta = src.meta
        self.lat = lat_from_meta(meta)
        self.lon = lon_from_meta(meta)
        detector, band_stack_np = None, None
        self.validate_with_bast(sub_period[0], detections_file, station_file, merged_file)
        continue
        # NOTE(review): each path above ends with continue, hence the remainder of the loop body (data
        # request and processing) is not executed as written - confirm that this is intended
        kwargs = dict(bbox=sh_bbox,
                      period=sub_period,
                      dataset=DataCollection.SENTINEL2_L2A,
                      bands=["B04", "B03", "B02", "B08"],
                      resolution=resolution,
                      dir_save=self.dirs["s2"],
                      merged_file=merged_file,
                      mosaicking_order="leastCC")
        data_yet_there = os.path.exists(merged_file)
        sh = SentinelHub()
        obs_file = os.path.join(dir_save_archive, "obs.csv")
        # check if acquisition has been checked
        yet_checked = False
        try:
            obs_pd = pd.read_csv(obs_file, index_col=0)
        except FileNotFoundError:
            obs_pd = pd.DataFrame()
            try:
                obs_pd.to_csv(obs_file)
            except FileNotFoundError:
                os.mkdir(os.path.dirname(obs_file))
                obs_pd.to_csv(obs_file)
        try:
            yet_checked = sub_period[0] in np.array(obs_pd[merged_file])
        except KeyError:
            pass
        finally:
            if yet_checked:
                continue
        if data_yet_there:
            data_available = True
        else:
            sh.set_credentials(SH_CREDENTIALS_FILE)
            data_available = sh.data_available(kwargs)
        if not data_available:
            self.register_non_available_date(sub_period[0], obs_pd, obs_file, merged_file)
            continue
        if data_yet_there:
            has_obs = data_yet_there
        else:
            # check if data has enough non-cloudy observations
            kwargs_copy = kwargs.copy()
            kwargs_copy["bands"] = ["CLM"]  # get cloud mask in order to check if low cloud coverage
            kwargs_copy["merged_file"] = os.path.join(dir_save_archive, "clm.tiff")
            clm, data_folder = sh.get_data(**kwargs_copy)  # get only cloud mask
            has_obs = self.has_observations(kwargs_copy["merged_file"])
            try:
                os.remove(kwargs_copy["merged_file"])  # cloud mask
            except FileNotFoundError:
                pass
        if not has_obs:
            # add date for file in order to avoid duplicate check
            self.register_non_available_date(sub_period[0], obs_pd, obs_file, merged_file)
            continue
        print("Processing: %s" % sub_period[0])
        band_stack, folder = sh.get_data(**kwargs)  # get full data
        detector = RFTruckDetector()
        band_stack = detector.read_bands(merged_file)
        detector.preprocess_bands(band_stack[0:4])
        prediction = detector.predict()
        prediction_boxes = detector.extract_objects(prediction)
        try:
            detector.prediction_boxes_to_gpkg(prediction_boxes, detections_file)
        except ValueError:
            print("Number of detections: %s, cannot write" % len(prediction_boxes))
            continue
        self.detections_file = detections_file
        with rio.open(merged_file, "r") as src:
            meta = src.meta
        self.lat = lat_from_meta(meta)
        self.lon = lon_from_meta(meta)
        detector, band_stack_np = None, None
        self.validate_with_bast(sub_period[0], detections_file, station_file, merged_file)  # run comparison
    self.plot_bast_comparison(self.validation_file)