def main():
    """Evaluate one hard-coded STEM impervious-surface model with a 2D histogram.

    Reads the model's "vote" prediction raster and the 2001 CA/OR/WA truth
    raster, samples both at the test-sample pixel locations, and writes a
    hexbin 2D histogram PNG of predicted vs. observed values to the model's
    evaluation directory. All paths are hard-coded; see the parameterized
    overload of ``main`` elsewhere in this file for the reusable version.
    """
    p_path = '/vol/v2/stem/imperv/models/imperv_20161012_0958/imperv_20161012_0958_vote.bsq'
    t_path = '/vol/v2/stem/imperv/truth_map/imperv2001_CAORWA.bsq'
    nodata_p = 255
    nodata_t = 255
    out_dir = '/vol/v2/stem/imperv/models/imperv_20161012_0958/evaluation_vote/'

    # Read both rasters fully into memory, then drop the dataset handles so
    # GDAL releases the files.
    ds_p = gdal.Open(p_path)
    ar_p = ds_p.ReadAsArray()
    ds_p = None
    ds_t = gdal.Open(t_path)
    ar_t = ds_t.ReadAsArray()
    ds_t = None

    sample_txt = '/vol/v2/stem/imperv/samples/imperv_sample1454542_20161007_0843/imperv_sample1454542_20161007_0843_test.txt'
    df = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    # match=False: sample prediction and truth at the same pixel locations.
    # NOTE(review): the argument order here (arrays, df, nodata values) differs
    # from other get_samples() call sites in this file — confirm against the
    # helper's signature.
    p_samples, t_samples = get_samples(ar_p, ar_t, df, nodata_p, nodata_t,
                                       match=False)

    out_png = os.path.join(
        out_dir, 'imperv_20161012_0958_2dhistogram_average_hex_gray.png')
    sns.set_context(context='paper', font_scale=1.4)
    histogram_2d(t_samples, p_samples, out_png, bins=50, hexplot=True,
                 vmax=4000)
    # print() with a single argument behaves the same under Python 2 and 3.
    print(out_png)

    # Drop references to the large arrays so memory can be reclaimed promptly.
    ar_p = None
    ar_t = None
    p_samples = None
    t_samples = None
def main(p_path, t_path, nodata_p, nodata_t, sample_txt, out_png):
    """Write a hexbin 2D histogram comparing a prediction raster to a truth raster.

    Parameters
    ----------
    p_path : str
        Path to the prediction raster (GDAL-readable).
    t_path : str
        Path to the truth/reference raster (GDAL-readable).
    nodata_p, nodata_t : int
        Nodata values for the prediction and truth rasters, respectively.
    sample_txt : str
        Tab-delimited sample file with an ``obs_id`` index column.
    out_png : str
        Destination path for the histogram PNG; its directory is created if
        needed.
    """
    # Read both rasters fully into memory, then drop the dataset handles so
    # GDAL releases the files.
    ds_p = gdal.Open(p_path)
    ar_p = ds_p.ReadAsArray()
    ds_p = None
    ds_t = gdal.Open(t_path)
    ar_t = ds_t.ReadAsArray()
    ds_t = None

    df = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    # match=False: sample prediction and truth at the same pixel locations.
    p_samples, t_samples = get_samples(ar_p, ar_t, df, nodata_p, nodata_t,
                                       match=False)

    # Guard against out_png having no directory component: os.path.dirname()
    # would return '' and os.makedirs('') raises OSError.
    out_dir = os.path.dirname(out_png)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    sns.set_context(context='paper', font_scale=1.4)
    histogram_2d(t_samples, p_samples, out_png, bins=50, hexplot=True,
                 vmax=4000)
    print(out_png)

    # Drop references to the large arrays so memory can be reclaimed promptly.
    ar_p = None
    ar_t = None
    p_samples = None
    t_samples = None
def par_get_match(args):
    """Worker for parallel best-match sampling of one tile.

    All parameters are packed into a single tuple so the function can be used
    directly with multiprocessing map-style APIs.

    Parameters
    ----------
    args : tuple
        ``(tile_ind, this_in, this_match, in_nodata, match_nodata, count,
        total_tiles)`` where ``count``/``total_tiles`` are only used for the
        progress message.

    Returns
    -------
    tuple
        ``(tile_ind, matched_vals)`` — the tile index and the matched values
        returned by ``get_samples``.
    """
    t0 = time.time()
    tile_ind, this_in, this_match, in_nodata, match_nodata, count, total_tiles = args
    matched_vals, _ = get_samples(this_in, this_match, in_nodata, match_nodata,
                                  match=True)
    # Use the print() form (consistent with the rest of this file) so the line
    # is valid under both Python 2 and 3.
    print('Time for getting array %s of %s: %.1f seconds' % (
        count, total_tiles, time.time() - t0))
    return tile_ind, matched_vals
def get_samples(self, model_name, zero_inflated=True, num_samples=1000):
    """Draw posterior samples for *model_name*.

    Looks up the stored model and guide for *model_name* and delegates to the
    module-level ``get_samples`` with this object's training data splits.

    Parameters
    ----------
    model_name : str
        Key into ``self.model_dict`` / ``self.guide_dict``.
    zero_inflated : bool
        Forwarded to the module-level ``get_samples``.
    num_samples : int
        Number of samples to draw.
    """
    model = self.model_dict[model_name]
    guide = self.guide_dict[model_name]
    return get_samples(
        model,
        guide,
        self.p_data_train,
        self.t_data,
        self.s_data,
        self.r_data,
        None,
        self.p_types_train,
        self.p_stories_train,
        self.p_subreddits_train,
        zero_inflated,
        num_samples=num_samples,
    )
def main(sample_txt, ref_raster, pred_raster, p_nodata, t_nodata, target_col,
         bins, out_txt, match=None, predict_col=None):
    """Compute an area-weighted RMSE between a prediction raster and a reference.

    Parameters
    ----------
    sample_txt : str
        Tab-delimited sample file containing at least ``row``/``col`` columns
        and *target_col*.
    ref_raster, pred_raster : str
        Paths to the reference and prediction rasters (GDAL-readable).
    p_nodata, t_nodata : int or str
        Nodata values for the prediction and reference rasters (coerced to int).
    target_col : str
        Column in the sample holding the reference/target values.
    bins : str
        Bin specification, parsed by ``parse_bins``.
    out_txt : str
        Output path forwarded to ``area_weighted_rmse``.
    match : optional
        If truthy, sample via ``get_samples(..., match=match)``.
    predict_col : str, optional
        If given (and *match* is falsy), take predictions from this sample
        column instead of the raster.

    Returns
    -------
    float
        The area-weighted RMSE from ``area_weighted_rmse``.

    Raises
    ------
    IndexError
        If *target_col* is not a column of the sample.
    """
    p_nodata = int(p_nodata)
    t_nodata = int(t_nodata)

    ds_p = gdal.Open(pred_raster)
    ar_p = ds_p.ReadAsArray()
    ds_r = gdal.Open(ref_raster)
    ar_r = ds_r.ReadAsArray()

    r_xsize = ds_r.RasterXSize
    r_ysize = ds_r.RasterYSize
    p_xsize = ds_p.RasterXSize
    p_ysize = ds_p.RasterYSize
    tx_r = ds_r.GetGeoTransform()
    tx_p = ds_p.GetGeoTransform()

    # If the two rasters differ in size or geotransform, snap the prediction
    # array onto the reference grid.
    # BUG FIX: the original condition was
    #   not r_xsize == p_xsize or r_ysize == p_ysize or tx_r != tx_p
    # which, because `not` binds tighter than `or`, triggered the resample
    # whenever the y-sizes MATCHED. Compare all three dimensions properly.
    if r_xsize != p_xsize or r_ysize != p_ysize or tx_r != tx_p:
        warnings.warn('Prediction and reference rasters do not share the same extent. Snapping prediction raster to reference....')
        offset = mosaic.calc_offset((tx_r[0], tx_r[3]), tx_p)
        t_inds, p_inds = mosaic.get_offset_array_indices(
            (r_ysize, r_xsize), (p_ysize, p_xsize), offset)
        # Buffer filled with nodata everywhere the prediction doesn't cover.
        ar_buf = np.full(ar_r.shape, p_nodata, dtype=ar_p.dtype)
        ar_buf[t_inds[0]:t_inds[1], t_inds[2]:t_inds[3]] = \
            ar_p[p_inds[0]:p_inds[1], p_inds[2]:p_inds[3]]
        # No .copy() needed — ar_buf is a fresh array and not used again.
        ar_p = ar_buf

    bins = parse_bins(bins)

    sample = pd.read_csv(sample_txt, sep='\t')
    if target_col in sample.columns:
        t_sample = sample[target_col]
    else:
        raise IndexError('target_col "%s" not in sample' % target_col)

    if match:
        t_sample, p_sample = get_samples(ar_p, ar_r, p_nodata, t_nodata,
                                         sample, match=match)
    elif predict_col:
        p_sample = sample[predict_col]
    else:
        # Default: sample both rasters at the row/col locations in the file.
        p_sample = ar_p[sample.row, sample.col]
        t_sample = ar_r[sample.row, sample.col]

    rmse = area_weighted_rmse(ar_p, ar_r, p_sample, t_sample, bins, p_nodata,
                              out_txt=out_txt)
    return rmse
def main(search_dir, models, t_path, inventory_txt, t_nodata=255):
    """Add random-sample RMSE columns for a set of models to a model inventory.

    For each model stamp in *models* (a directory under *search_dir*), reads
    the model's parameter files to locate its training sample, draws a fresh
    random test sample of pixels that were neither used for training nor
    nodata in the truth raster, computes RMSE for both the 'vote' and 'mean'
    aggregation rasters, and writes the results to
    ``<inventory_txt base>_randomRMSE.txt``. The random sample for each model
    is also saved to the model's directory.

    NOTE(review): this function uses Python-2-only constructs (`print`
    statements, `dict.iteritems`, pandas `.ix`) and relies on Python 2
    integer-division semantics — see inline notes.
    """
    df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
    columns = df_inv.columns
    # Ensure the RMSE result columns exist before assignment below.
    if 'vote_rmse' not in columns:
        df_inv['vote_rmse'] = None
    if 'mean_rmse' not in columns:
        df_inv['mean_rmse'] = None
    # Restrict the inventory to just the requested model stamps.
    # (.ix is deprecated pandas indexing — works only on old pandas.)
    df_inv = df_inv.ix[models]

    # Read the truth raster once; all models are compared against it.
    ds = gdal.Open(t_path)
    ar_t = ds.ReadAsArray()
    nodata_mask = ar_t == t_nodata
    ds = None

    for model in models:
        print '\nCalculating RMSE for ', model
        model_dir = os.path.join(search_dir, model)
        if not os.path.exists(model_dir):
            print 'Model dir does not exist: %s. Skipping...\n' % model_dir
            continue

        confusion_params = os.path.join(model_dir, 'confusion_matrix_params.txt')
        if not os.path.exists(confusion_params):
            # Fall back to the predict/train param files to recover the
            # nodata value and the training-sample path.
            print 'Could not find confusion params: ', confusion_params
            predict_params = os.path.join(model_dir, 'predict_stem_params.txt')
            inputs, _ = stem.read_params(predict_params)
            p_nodata = int(inputs['nodata'].replace('"', ''))
            this_srch_str = os.path.join(model_dir, 'train_stem*_params.txt')
            train_params = glob.glob(this_srch_str)
            if len(train_params) == 0:
                print 'Can not find test data for ', model, '\n'
                continue
            train_params = train_params[0]
            inputs, _ = stem.read_params(train_params)
            test_txt = inputs['sample_txt'].replace('predictors', 'test').replace('"', '')
            train_txt = inputs['sample_txt'].replace('"', '')
        else:
            # Preferred source: the confusion-matrix params file.
            inputs = read_params(confusion_params)
            # Strip quoting from every value (iteritems is Python 2 only).
            for k, v in inputs.iteritems():
                inputs[k] = v.replace('"', '')
            test_txt = inputs['sample_txt']
            p_nodata = int(inputs['p_nodata'])
            train_txt = inputs['sample_txt'].replace('_test', '').replace('"', '')

        #df = pd.read_csv(test_txt, sep='\t', index_col='obs_id')
        train_sample = pd.read_csv(train_txt, sep='\t', index_col='obs_id')

        # Build an array of flat pixel ids, then mark every pixel that was
        # used for training or is nodata with the sentinel n_pixels (one more
        # than the largest valid id) so it can be excluded from test sampling.
        n_rows, n_cols = ar_t.shape
        n_pixels = ar_t.size
        pixel_ids = np.arange(n_pixels, dtype=np.uint32).reshape(n_rows, n_cols)
        pixel_ids[train_sample.row, train_sample.col] = n_pixels  # sentinel: 1 past last id
        pixel_ids[nodata_mask] = n_pixels

        # Test-sample size = 20% of the training sample size, which is parsed
        # out of the training file name (e.g. '..._sample1454542_...').
        n_samples = int(int(os.path.basename(train_txt).split('_')[1].replace('sample', '')) * 0.2)
        # NOTE(review): random.sample over a numpy array works on Python 2 /
        # older Python 3 only — confirm before porting.
        test_ids = np.array(random.sample(pixel_ids[pixel_ids != n_pixels], n_samples), dtype=np.uint32)
        # Convert flat ids back to row/col. NOTE(review): relies on Python 2
        # `/` floor division for integer arrays; under true division this
        # would produce float rows.
        test_rows = test_ids / n_cols
        test_cols = test_ids % n_cols
        #test_cols = random.sample(ar_col[ar_col != -1], n_samples)
        df = pd.DataFrame({'row': test_rows, 'col': test_cols})

        # Score both aggregation rasters against the truth at the sampled pixels.
        for agg_method in ['vote', 'mean']:
            p_path = os.path.join(model_dir, '%s_%s.bsq' % (model, agg_method))
            ds = gdal.Open(p_path)
            ar_p = ds.ReadAsArray()
            t_samples, p_samples = get_samples(ar_p, ar_t, p_nodata, 255, samples=df, match='best')
            rmse = np.round(calc_rmse(t_samples, p_samples), 1)
            print agg_method, ': ', rmse
            df_inv.ix[model, '%s_rmse' % agg_method] = rmse

        # Persist the random test sample used for this model.
        out_txt = os.path.join(model_dir, '%s_random_test_sample%s.txt' % (model, n_samples))
        df.to_csv(out_txt, sep='\t', index=False)

    # Write the augmented inventory alongside the original.
    out_txt = inventory_txt.replace('.txt', '_randomRMSE.txt')
    df_inv.to_csv(out_txt, sep='\t')