def main(params, test_txt=None, region_nodata=0, region_shp=None):
    """Compute accuracy statistics for each modeling region.

    Settings are read from ``params`` with ``read_params`` and turned into
    local variables.  A test sample of pixels is either loaded from
    ``test_txt`` or drawn at random, and for every region a confusion table
    is written and accuracy, kappa and RMSE are recorded.

    Outputs written to ``out_dir`` (a parameter-file entry):
      * a copy of the parameter file
      * ``region_confusion_tbls/confusion_<region>.txt`` for each region
      * ``test_sample_<n_samples>.txt`` with reference/predicted values
      * ``region_stats_<n_samples>.txt`` with one row per region
      * ``modeling_region_stats.shp`` when ``region_shp`` is given

    Parameters
    ----------
    params : str
        Path of the parameter file.  The code below uses the entries
        ``out_dir``, ``n_samples``, ``p_nodata``, ``t_nodata``, ``bins``,
        ``region_raster``, ``t_raster``, ``p_raster`` and ``train_txt``;
        ``region_ids`` (comma-separated) and ``region_nodata`` are optional.
    test_txt : str, optional
        Path of an existing test sample (comma- or tab-delimited) that has
        at least ``row``, ``col`` and ``region`` columns.  When omitted,
        ``n_samples`` pixels are drawn at random from the pixels that are
        neither nodata nor listed in ``train_txt``.
    region_nodata : int, optional
        Replaced by the parameter-file value when one is present.  It is
        parsed but not used in any computation below.
    region_shp : str, optional
        Shapefile handed to ``df_to_shp`` (joined on ``Zone_ID``) to write
        the region statistics as a shapefile.
    """
    t0 = time.time()

    # Read params and make a local variable from each entry.  This relies on
    # the Python 2 ``exec`` statement, which can create function locals.
    # NOTE(review): every value is evaluated as a Python expression, so the
    # parameter file must come from a trusted source.
    inputs = read_params(params)
    for var in inputs:
        exec("{0} = str({1})".format(var, inputs[var]))

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    shutil.copy2(params, out_dir)

    n_samples = int(n_samples)
    p_nodata = int(p_nodata)
    t_nodata = int(t_nodata)
    bins = parse_bins(bins)
    if 'region_nodata' in inputs:
        region_nodata = int(region_nodata)

    ds = gdal.Open(region_raster)
    regions = ds.ReadAsArray()
    ds = None

    # The full-size arrays are only needed to build the nodata mask; the
    # per-region windows are re-read from the open datasets further down.
    ds_t = gdal.Open(t_raster)
    ar_t = ds_t.ReadAsArray()
    ds_p = gdal.Open(p_raster)
    ar_p = ds_p.ReadAsArray()
    nodata_mask = (ar_p == p_nodata) | (ar_t == t_nodata)
    del ar_t, ar_p

    if 'region_ids' in inputs:
        region_ids = [int(r) for r in region_ids.split(',')]
    else:
        region_ids = np.unique(regions[~nodata_mask])

    if test_txt:
        # Try comma-delimited first, then fall back to tab-delimited
        test_sample = pd.read_csv(test_txt)
        if 'row' not in test_sample.columns:
            test_sample = pd.read_csv(test_txt, sep='\t')
        # No training sample is loaded on this path, so only nodata pixels
        # are excluded when locating each region's extent.
        ind_mask = nodata_mask
    else:
        train_sample = pd.read_csv(train_txt, sep='\t', index_col='obs_id')

        # Give every pixel a unique id, then overwrite training and nodata
        # pixels with n_pixels (one more than the largest real id) so they
        # can be avoided when drawing the test sample.
        n_rows, n_cols = regions.shape
        n_pixels = regions.size
        pixel_ids = np.arange(n_pixels, dtype=np.uint32).reshape(n_rows, n_cols)
        pixel_ids[train_sample.row, train_sample.col] = n_pixels
        pixel_ids[nodata_mask] = n_pixels

        test_ids = np.array(
            random.sample(pixel_ids[pixel_ids != n_pixels], n_samples),
            dtype=np.uint32)
        # Floor division keeps the row indices integral on any interpreter
        test_rows = test_ids // n_cols
        test_cols = test_ids % n_cols
        test_sample = pd.DataFrame({'row': test_rows, 'col': test_cols})
        test_sample['region'] = regions[test_rows, test_cols]
        ind_mask = pixel_ids == n_pixels
        del pixel_ids

    confusion_dir = os.path.join(out_dir, 'region_confusion_tbls')
    if not os.path.isdir(confusion_dir):
        os.mkdir(confusion_dir)

    test_sample['reference'] = -1
    test_sample['predicted'] = -1
    stats = []
    n_regions = len(region_ids)
    for i, r_id in enumerate(region_ids):
        print('\nCalculating stats for region %s (%s of %s)...' % (r_id, i + 1, n_regions))
        t1 = time.time()

        # Bounding box of the usable pixels in this region, found from the
        # rows and columns that contain at least one of them.
        usable = (regions == r_id) & ~ind_mask
        rows_hit = np.flatnonzero(usable.any(axis=1))
        cols_hit = np.flatnonzero(usable.any(axis=0))
        del usable
        if rows_hit.size == 0:
            # Nothing to evaluate; skip rather than fail on an empty min()
            print('No usable pixels found for region %s. Skipping...' % r_id)
            continue
        min_row, max_row = int(rows_hit[0]), int(rows_hit[-1])
        min_col, max_col = int(cols_hit[0]), int(cols_hit[-1])
        # max_row/max_col are inclusive, so the window is one larger than
        # the difference of the indices.
        nrows = max_row - min_row + 1
        ncols = max_col - min_col + 1

        ar_t_region = ds_t.ReadAsArray(min_col, min_row, ncols, nrows)
        ar_p_region = ds_p.ReadAsArray(min_col, min_row, ncols, nrows)
        # NOTE(review): this mask only flags nodata; pixels of neighbouring
        # regions inside the bounding box are not flagged. Confirm that this
        # is intended.
        clipped_mask = (ar_t_region == t_nodata) | (ar_p_region == p_nodata)

        # Shift this region's samples into window coordinates, keeping the
        # raster-wide coordinates in global_row/global_col.
        region_sample = test_sample[test_sample.region == r_id].copy()
        region_sample['global_row'] = region_sample.row
        region_sample['global_col'] = region_sample.col
        region_sample['row'] = region_sample.row - min_row
        region_sample['col'] = region_sample.col - min_col
        local_rows = region_sample.row.values
        local_cols = region_sample.col.values
        region_sample['reference'] = ar_t_region[local_rows, local_cols]
        region_sample['predicted'] = ar_p_region[local_rows, local_cols]
        test_sample.loc[region_sample.index, 'reference'] = region_sample['reference']
        test_sample.loc[region_sample.index, 'predicted'] = region_sample['predicted']

        df = evaluation.confusion_matrix_by_area(
            ar_p_region, ar_t_region, region_sample, p_nodata, t_nodata,
            mask=clipped_mask, bins=bins, match='best')
        this_txt = os.path.join(confusion_dir, 'confusion_%s.txt' % r_id)
        df.to_csv(this_txt, sep='\t')

        accuracy = df.loc['producer', 'user']
        kappa = df.loc['producer', 'kappa']
        # Leave nodata samples out of the RMSE
        sample_mask = (region_sample.reference == t_nodata) | (region_sample.predicted == p_nodata)
        rmse = sf.rmse(region_sample.reference[~sample_mask],
                       region_sample.predicted[~sample_mask])
        print(len(sample_mask[sample_mask]))
        stats.append({
            'region': r_id,
            'accuracy': accuracy,
            'kappa': kappa,
            'rmse': rmse
        })
        print('Time for this region: %.1f minutes' % ((time.time() - t1) / 60))

    # Release the GDAL datasets now that every window has been read
    ds_t = None
    ds_p = None

    # The sample is always (re)written, now with reference/predicted filled
    test_basename = 'test_sample_%s.txt' % n_samples
    test_txt = os.path.join(out_dir, test_basename)
    test_sample.to_csv(test_txt)
    desc = 'Random test sample of %s not used in training samples %s' % (
        n_samples, train_txt)
    createMetadata(sys.argv, os.path.join(out_dir, test_basename), description=desc)
    print('\nTest sample text file written to: %s' % test_txt)

    df = pd.DataFrame(stats)
    out_txt = os.path.join(out_dir, 'region_stats_%s.txt' % n_samples)
    df.to_csv(out_txt, sep='\t', index=False)
    test_basename = os.path.basename(test_txt)
    desc = 'Stats from modeling regions from %s. Samples drawn from %s' % (
        region_raster, os.path.join(out_dir, test_basename))
    createMetadata(sys.argv, out_txt, description=desc)

    if region_shp:
        out_shp = os.path.join(out_dir, 'modeling_region_stats.shp')
        df_to_shp(df, region_shp, out_shp, copy_fields=False,
                  df_id_field='region', shp_id_field='Zone_ID')

    print('\nFinished in %.1f minutes' % ((time.time() - t0) / 60))
def main(params, ar_p=None, out_txt=None, inventory_txt=None, target_col=None, match=False, file_stamp=None):
    """Build an area-based confusion matrix for a prediction raster.

    Settings are read from ``params`` with ``read_params`` and turned into
    local variables.  The prediction is compared with the reference raster
    ``t_path`` through ``confusion_matrix_by_area``, and the resulting
    accuracy and kappa are optionally written to an inventory file.

    Parameters
    ----------
    params : str
        Path of the parameter file.  ``bins``, ``p_nodata``, ``t_nodata``,
        ``sample_txt`` and ``t_path`` are required; ``p_path`` is optional
        and ``out_dir`` is needed when ``out_txt`` is not given.
    ar_p : numpy.ndarray, optional
        Prediction array, used when the parameter file has no ``p_path``.
        It has no georeferencing of its own, so its shape must match the
        reference raster.
    out_txt : str, optional
        Passed on to ``confusion_matrix_by_area``; its directory replaces
        ``out_dir``.
    inventory_txt : str, optional
        Tab-delimited file indexed by a ``stamp`` column.  Updated only when
        ``file_stamp`` is also given and present in that index.
    target_col : optional
        Passed on to ``confusion_matrix_by_area``.
    match : optional
        Passed on to ``confusion_matrix_by_area``.
    file_stamp : optional
        Row label in ``inventory_txt`` that receives the scores.

    Returns
    -------
    The second item returned by ``confusion_matrix_by_area`` (``df_smp``).

    Raises
    ------
    NameError
        If a required entry is missing from the parameter file.
    ValueError
        If neither ``p_path`` nor ``ar_p`` is supplied, or if ``ar_p`` was
        passed in memory and its shape differs from the reference raster.
    """
    # Read params and make a local variable from each entry.  This relies on
    # the Python 2 ``exec`` statement, which can create function locals.
    # NOTE(review): every value is evaluated as a Python expression, so the
    # parameter file must come from a trusted source.
    inputs = read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that the required variables were specified in params
    try:
        bins = parse_bins(bins)
        p_nodata = int(p_nodata)
        t_nodata = int(t_nodata)
        # Bare references: they only exist to raise NameError when missing
        str_check = sample_txt
        str_check = t_path
    except NameError as e:
        print('')
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    if out_txt:
        out_dir = os.path.dirname(out_txt)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    shutil.copy2(params, out_dir)

    # If p_path was specified, this call of the function is coming from
    # outside predict_stem.py. Otherwise, ar_p should be given.
    ds_p = None
    if 'p_path' in inputs:
        print('Reading in the prediction raster:%s\n' % p_path)
        ds_p = gdal.Open(p_path)
        ar_p = ds_p.ReadAsArray()
    elif ar_p is None:
        raise ValueError(
            "No prediction given: specify 'p_path' in the param file or "
            "pass ar_p. Param file:\n%s" % params)

    ds_t = gdal.Open(t_path)
    band = ds_t.GetRasterBand(1)
    ar_t = band.ReadAsArray()
    t_xsize = ds_t.RasterXSize
    t_ysize = ds_t.RasterYSize
    tx_t = ds_t.GetGeoTransform()

    if ds_p is not None:
        p_xsize = ds_p.RasterXSize
        p_ysize = ds_p.RasterYSize
        tx_p = ds_p.GetGeoTransform()
    else:
        # An in-memory array has no geotransform, so it can only be used
        # when it already lines up with the reference raster.
        p_ysize, p_xsize = ar_p.shape[-2:]
        if (p_ysize, p_xsize) != (t_ysize, t_xsize):
            raise ValueError(
                'ar_p has shape %s but the reference raster is %s, and '
                'without a prediction raster path there is no geotransform '
                'to align them.' % ((p_ysize, p_xsize), (t_ysize, t_xsize)))
        tx_p = tx_t

    # If the two rasters differ in size or georeferencing, make the
    # prediction array match the reference
    if t_xsize != p_xsize or t_ysize != p_ysize or tx_t != tx_p:
        print('entered if statement')
        warnings.warn(
            'Prediction and reference rasters do not share the same extent. Snapping prediction raster to reference....'
        )
        offset = mosaic.calc_offset((tx_t[0], tx_t[3]), tx_p)
        t_inds, p_inds = mosaic.get_offset_array_indices(
            (t_ysize, t_xsize), (p_ysize, p_xsize), offset)
        print(t_inds, p_inds)
        # Start from an all-nodata array the size of the reference and copy
        # in the overlapping part of the prediction
        ar_buf = np.full(ar_t.shape, p_nodata, dtype=ar_p.dtype)
        print(ar_buf.shape)
        ar_buf[t_inds[0]:t_inds[1], t_inds[2]:t_inds[3]] = \
            ar_p[p_inds[0]:p_inds[1], p_inds[2]:p_inds[3]]
        ar_p = ar_buf
        del ar_buf

    mask = (ar_p == p_nodata) | (ar_t == t_nodata)

    samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    print(samples)
    df_adj, df_smp = confusion_matrix_by_area(
        ar_p, ar_t, samples, p_nodata, t_nodata, mask=mask, bins=bins,
        out_txt=out_txt, target_col=target_col, match=match)

    # Drop the large arrays and the GDAL datasets as soon as they are done
    ar_p = None
    ar_t = None
    mask = None
    ds_p = None
    ds_t = None

    accuracy = df_adj.loc['producer', 'user']
    kappa = df_adj.loc['producer', 'kappa']

    if inventory_txt and file_stamp:
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if file_stamp in df_inv.index:
            # The output directory name decides which score columns to fill
            dir_name = os.path.basename(out_dir)
            if 'vote' in dir_name:
                cols = ['vote_accuracy', 'vote_kappa']
                df_inv.loc[file_stamp, cols] = accuracy, kappa
                df_inv.to_csv(inventory_txt, sep='\t')
                print('Vote scores written to inventory_txt:  %s' % inventory_txt)
            if 'mean' in dir_name:
                cols = ['mean_accuracy', 'mean_kappa']
                df_inv.loc[file_stamp, cols] = accuracy, kappa
                df_inv.to_csv(inventory_txt, sep='\t')

    return df_smp