def main(set_txt, train_params, plot_dims, out_png):
    sns.set_context(context='paper', font_scale=.3)
    sns.set_style('white', rc={'axes.linewidth': .5})

    inputs, df_var = stem.read_params(train_params)
    # Promote each param-file entry to a local variable (relies on Python 2
    # exec semantics to bind function locals, e.g. sample_txt, target_col)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    predict_cols = sorted(df_var.index)

    df = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    df = df[df.oob_rate < 5]  # keep only sets with low OOB error

    fig, axes = plt.subplots(*plot_dims)
    n_plots = axes.size
    set_ids = random.sample(df.index, n_plots)

    sample = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    for si, ax in zip(set_ids, axes.ravel()):
        support_set = df.ix[si]
        oob_ind_txt = support_set.dt_file.replace('.pkl', '_oob_inds.txt')
        with open(oob_ind_txt) as txt:
            oob_inds = [int(l) for l in txt]
        with open(support_set.dt_file, 'rb') as f:
            dt_model = pickle.load(f)
        oob_sample = sample.ix[oob_inds]
        oob_predictions = dt_model.predict(oob_sample[predict_cols])
        oob_reference = oob_sample[target_col]
        rmse = sf.rmse(oob_reference, oob_predictions)
        ac, ac_s, ac_u, ssd, spod = sf.agree_coef(oob_reference, oob_predictions)
        ax.plot(oob_reference, oob_predictions, 'o', alpha=0.05,
                markeredgecolor='none', markersize=2.5)
        #ax.xticks([0, 50, 100])
        #ax.yticks([0, 50, 100])
        title = 'Set ID: %s, RMSE: %.1f, ac: %.3f' % (si, rmse, ac)
        ax.set_title(title)

    sns.despine()
    fig.subplots_adjust(hspace=0.1)
    plt.savefig(out_png, dpi=300)
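
# --- Hedged example (not part of the pipeline): sf.rmse above is a stats
# --- helper from this codebase; for reference, a minimal numpy equivalent.
# --- The arrays in the doctest are made up for illustration.
def _example_rmse(reference, predictions):
    """Root mean squared error between observed and predicted values.

    >>> _example_rmse([10, 20, 30, 40], [10, 20, 30, 50])
    5.0
    """
    reference = np.asarray(reference, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    return float(np.sqrt(np.mean((reference - predictions) ** 2)))
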
def main(params, inventory_txt=None, constant_vars=None):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    # If constants were given, make a dict and make sure they match the
    # training constants
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        pred_constants = sorted(constant_vars.keys())
        train_constants = [i.replace(' ', '') for i in
                           train_inputs['constant_vars'].strip('"').split(',')]
        train_constants = sorted(train_constants)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in pred_constants if v not in train_constants]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)

    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in train_constants if v not in pred_constants]
        pred_vars += pred_constants  # Add here because it would screw with stuff upstream
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    set_txt = glob.glob(os.path.join(model_dir,
                                     'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)

    t0 = time.time()
    if 'n_jobs' in inputs:
        # Predict in parallel
        n_jobs = int(n_jobs)
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of TSA arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            # Save rasters of tsa arrays ahead of time to avoid needing to
            # pickle or fork mosaic_ds
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                                                    xsize, ysize, nodata=nodata)
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            dtype_code = mosaic_ds.GetRasterBand(1).DataType
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster,
                                   stem.get_gdal_dtype(dtype_code), silent=True)

            # Build list of args to pass to the Pool
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            ds = gdal.Open(tsa_raster)
            tsa_tx = ds.GetGeoTransform()
            ds = None
            tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]),
                                       (tsa_tx[0], tsa_tx[3]), tsa_tx)
            args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off,
                         coords, mosaic_tx, xsize, ysize, row.dt_file, nodata,
                         np.uint8, constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem.par_predict, args, 1)

    else:
        # Loop through each set and generate predictions
        # NOTE: hard-coded restart index (1043) left over from a resumed run
        for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords,
                                          mosaic_tx, xsize, ysize, dt_model,
                                          nodata, np.int16, constant_vars)
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path,
                                   gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    #mosaic_ds = None
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''

    # Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs:
        file_stamp = os.path.basename(model_dir)
    ar_vote, pct_importance, df_sets = stem.aggregate_predictions(
        ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir,
        df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')'''
    mosaic_ds = None

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first',
                                                         ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''

    '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp))
    ar_vote = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp))
    ar_mean = ds.ReadAsArray()
    ds = None#'''

    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')

        print '\nComputing confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        print confusion_params
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e

        '''print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e#'''

        vote_acc = df_v.ix['producer', 'user']
        vote_kap = df_v.ix['producer', 'kappa']
        #mean_acc = df_m.ix['user','producer']
        #mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'mean_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' + \
                  ' Model evaluation scores will not be recorded...'
        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        #print 'Mean accuracy .............. ', mean_acc
        #print 'Mean kappa ................. ', mean_kap
    else:
        print '\n"confusion_params" was not specified.' + \
              ' This model will not be evaluated...'#'''

    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0)/60)
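
# --- Hedged note (not part of the pipeline): the scripts above promote each
# --- param-file entry to a local variable with exec, which only binds function
# --- locals under Python 2. A dict-based equivalent that also works in
# --- Python 3; the example entries in the doctest are hypothetical.
def _example_params_to_vars(inputs):
    """Return param values without exec.

    >>> _example_params_to_vars({'nodata': '255', 'out_dir': '"/tmp/run"'})
    ('255', '/tmp/run')
    """
    nodata = str(inputs['nodata'])
    out_dir = str(inputs['out_dir']).strip('"')
    return nodata, out_dir
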
def main():
    srch_dir = '/vol/v2/stem/canopy/models'
    stamps = fnmatch.filter(os.listdir(srch_dir), 'canopy*')
    info = []
    for stamp in stamps:
        print stamp
        this_dir = os.path.join(srch_dir, stamp)
        this_srch_str = os.path.join(this_dir, 'train_stem*_params.txt')
        matched = glob.glob(this_srch_str)
        if len(matched) == 0:
            print 'No param file for ', stamp
            info.append([stamp, '', 0, 0, 0, 0, False, 0, 0, False, 0, 0, '', 0, ''])
            continue
        this_param_text = matched[0]
        if 'regressor' in this_param_text:
            model_type = 'Regressor'
        else:
            model_type = 'Classifier'
        inputs, df_var = stem.read_params(this_param_text)
        for var in inputs:
            exec("{0} = str({1})".format(var, inputs[var]))

        vote_mask = False
        vote_accuracy = None
        vote_kappa = None
        vote_dir = os.path.join(this_dir, 'evaluation_vote')
        if os.path.exists(vote_dir):
            for root, dirs, files in os.walk(vote_dir):
                for f in files:
                    if f.endswith('txt'):
                        vote_txt = os.path.join(root, f)
                        df_vote = pd.read_csv(vote_txt, sep='\t', index_col='bin')
                        try:
                            vote_accuracy = int(df_vote.ix['producer', 'user'])
                            vote_kappa = round(df_vote.ix['kappa', 'kappa'], 2)
                        except:
                            vote_accuracy = int(df_vote.ix['user', 'producer'])
                            vote_kappa = round(df_vote.ix['user', 'kappa'], 2)
                        if 'mask' in vote_txt:
                            vote_mask = True

        mean_mask = False
        mean_accuracy = None
        mean_kappa = None
        mean_dir = os.path.join(this_dir, 'evaluation_mean')
        if os.path.exists(mean_dir):
            for root, dirs, files in os.walk(mean_dir):
                for f in files:
                    if f.endswith('txt'):
                        mean_txt = os.path.join(root, f)
                        df_mean = pd.read_csv(mean_txt, sep='\t', index_col='bin')
                        try:
                            mean_accuracy = int(df_mean.ix['producer', 'user'])
                            mean_kappa = round(df_mean.ix['kappa', 'kappa'], 2)
                        except:
                            mean_accuracy = int(df_mean.ix['user', 'producer'])
                            mean_kappa = round(df_mean.ix['user', 'kappa'], 2)
                        if 'mask' in mean_txt:
                            mean_mask = True

        dt_dir = os.path.join(this_dir, 'decisiontree_models')
        try:
            n_sets = len(os.listdir(dt_dir)) - 1
        except:
            n_sets = None
        n_samples = int(sample_txt.split('_')[1].replace('sample', ''))
        if not 'max_features' in inputs:
            max_features = None

        avg_count = None
        cnt_path = os.path.join(this_dir, '%s_count.bsq' % stamp)
        if os.path.exists(cnt_path):
            ds = gdal.Open(cnt_path)
            ar = ds.ReadAsArray()
            cnt_nodata = ds.GetRasterBand(1).GetNoDataValue()
            ds = None
            # If no nodata value is stored, guess it: min if <= 0, else max
            if len(ar[ar == cnt_nodata]) == 0:
                cnt_min = ar.min()
                cnt_max = ar.max()
                if cnt_min <= 0:
                    cnt_nodata = cnt_min
                else:
                    cnt_nodata = cnt_max
            avg_count = int(round(np.mean(ar[ar != cnt_nodata]), 0))

        avg_oob = None
        oob_path = os.path.join(this_dir, '%s_oob.bsq' % stamp)
        if os.path.exists(oob_path):
            ds = gdal.Open(oob_path)
            ar = ds.ReadAsArray()
            ds = None
            oob_min = ar.min()
            oob_max = ar.max()
            if oob_min <= 0:
                oob_nodata = oob_min
            else:
                oob_nodata = oob_max
            avg_oob = round(np.mean(ar[ar != oob_nodata]), 1)

        info.append([stamp, model_type, avg_oob, avg_count, vote_accuracy,
                     vote_kappa, vote_mask, mean_accuracy, mean_kappa,
                     mean_mask, n_sets, n_samples, '[%s]' % support_size,
                     sets_per_cell, max_features])

    df = pd.DataFrame(info, columns=['stamp', 'model_type', 'avg_oob',
                                     'avg_count', 'vote_accuracy', 'vote_kappa',
                                     'vote_mask', 'mean_accuracy', 'mean_kappa',
                                     'mean_mask', 'n_sets', 'n_samples',
                                     'support_size', 'sets_per_cell',
                                     'max_features'])
    out_txt = os.path.join(srch_dir, 'model_info.txt')
    df.to_csv(out_txt, sep='\t', index=False)
    print 'Text written to ', out_txt
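
# --- Hedged sketch: the count/OOB rasters above infer a nodata value when none
# --- is stored in the file (minimum if <= 0, otherwise maximum). Standalone
# --- version of that heuristic; the array in the doctest is made up.
def _example_guess_nodata(ar):
    """Guess the background fill value of a raster array.

    >>> _example_guess_nodata(np.array([[0, 0, 12], [14, 13, 0]]))
    0
    """
    return int(ar.min()) if ar.min() <= 0 else int(ar.max())
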
def main(params, pct_train=None, min_oob=0, gsrd_shp=None, resolution=30,
         make_oob_map=False, snap_coord=None, oob_map_metric='oob_rate',
         n_jobs=1, oob_drop=None):
    t0 = time.time()

    inputs = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    try:
        if 'max_features' not in locals():
            max_features = None
        if 'min_oob' in inputs:
            min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    print(var_info)
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')

    # Read in training samples and check that df_train has exactly the same
    # columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [v for v in df_var.index if v not in [c for c in df_train]]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb; pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' % (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  # Copy the params for reference '''

    predict_cols = sorted(np.unique([c for c in df_train.columns
                                     for v in df_var.index if v in c]))
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    # If there are variables that should remain constant across the modeling
    # region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path, cell_size, support_size, sets_per_cell,
                            df_train, min_obs, target_col, predict_cols,
                            out_txt, gsrd_shp, pct_train, snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    '''print 'Dumping train_txt to database...'
    t1 = time.time()#'''
    db_path = os.path.join(out_dir, stamp + '.db')
    '''engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    #df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)#'''

    # Split x and y train
    t1 = time.time()
    print "'{0}'".format(model_type.lower())
    # NOTE (Peter Clary): .strip() used to come before .lower()
    if model_type.lower().strip() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    elif model_type.lower().strip() == 'zeroinflated':
        print 'Training STEM with zeroinflated regression algorithm...'
        model_func = stem.fit_tree_zeroinflated
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]

    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    #oob_rates = [0]
    n_jobs = int(n_jobs)
    sets = _par_train_stem(n_jobs, n_sets, df_train, predict_cols, target_col,
                           min_obs, df_sets, model_func, model_type,
                           max_features, dt_path_template, db_path,
                           max_target_val)
    support_sets, samples = zip(*sets)
    df_sets = pd.DataFrame(list(support_sets))\
        .dropna(subset=['dt_file'])\
        .rename_axis('set_id')
    #print('the cols in the df at this point are: ', df_sets.columns)
    df_sets.to_csv(os.path.join(out_dir, 'support_sets.txt'), sep='\t')

    # Consider moving this back to train function by switching to DBMS with
    # multithread support
    '''print '\n\nMaking relationship table for samples and sets...'
    t1 = time.time()
    set_samples = pd.concat(list(samples), ignore_index=True)
    set_samples.to_sql('set_samples', engine, chunksize=100000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)'''

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates and dropping sets with high OOB error...'
    t1 = time.time()
    try:
        df_sets, low_oob, oob_metric = stem.get_oob_rates(
            df_sets, df_train, db_path, target_col, predict_cols, min_oob,
            model_type, drop_expression=oob_drop)
    except Exception as e:
        import pdb; pdb.set_trace()
    if oob_drop and len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets.drop('dt_model', axis=1), gsrd_shp, set_shp)
    except Exception as e:
        import pdb; pdb.set_trace()
        print e.message
    print 'Min OOB rate after dropping: ', df_sets[oob_metric].min()
    print 'Estimated average OOB score: ', int(df_sets[oob_metric].mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving support set info...'
    #set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')
    df_sets['set_id'] = df_sets.index
    df_sets = df_sets.drop('dt_model', axis=1)  #.to_csv(set_txt, sep='\t', index=False)
    #df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    t1 = time.time()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)
    #"""

    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness','delta_nbr','delta_wetness',
                    'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
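
# --- Hedged sketch (assumes scikit-learn trees, which the stem.fit_tree_*
# --- helpers above presumably wrap): train one tree on ~63% of a support
# --- set's samples and score it on the out-of-bag remainder. Data are random
# --- and purely illustrative, e.g. _example_set_oob_error() returns a small
# --- RMSE in target units.
def _example_set_oob_error(seed=0):
    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.RandomState(seed)
    X = rng.rand(200, 4)                   # hypothetical predictor columns
    y = X[:, 0] * 50 + rng.rand(200) * 5   # hypothetical target

    # ~63% in-bag (matches the .63 factor used in training), rest out-of-bag
    in_bag = rng.choice(len(y), int(len(y) * .63), replace=False)
    oob = np.setdiff1d(np.arange(len(y)), in_bag)

    dt = DecisionTreeRegressor().fit(X[in_bag], y[in_bag])
    return float(np.sqrt(np.mean((dt.predict(X[oob]) - y[oob]) ** 2)))
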
def main(params, inventory_txt=None):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        _, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        #shutil.copy2(confusion_params, out_dir)
        conf_bn = os.path.basename(confusion_params)
        confusion_params = os.path.join(out_dir, conf_bn)

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    # NOTE: hard-coded model dir left over from a one-off run
    set_txt = glob.glob(os.path.join('/vol/v2/stem/imperv/imperv_bdt',
                                     'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)

    '''# Loop through each set and generate predictions
    t0 = time.time()
    for c, (set_id, row) in enumerate(df_sets.iterrows()):
        t1 = time.time()
        with open(row.dt_file, 'rb') as f:
            dt_model = pickle.load(f)
        print '\nPredicting for set %s of %s' % (c + 1, total_sets)
        ar_coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
        ar_predict = stem.predict_set_in_pieces(set_id, df_var, mosaic_ds,
                                                ar_coords, mosaic_tx, xsize,
                                                ysize, dt_model, nodata)
        tx = ar_coords.ul_x, x_res, x_rot, ar_coords.ul_y, y_rot, y_res
        out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
        array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte,
                        nodata=nodata)
        print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    #mosaic_ds = None
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''

    # Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs:
        file_stamp = os.path.basename(model_dir)
    ar_mean, ar_vote, pct_importance, df_sets = stem.aggregate_predictions(
        ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir,
        df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')'''
    mosaic_ds = None

    ds = gdal.Open('/vol/v2/stem/canopy/canopy_bdt/canopy_bdt_vote.bsq')
    ar_vote = ds.ReadAsArray()
    ds = None

    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')

        print '\nGetting confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e

        print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e

        vote_acc = df_v.ix['user', 'producer']
        vote_kap = df_v.ix['user', 'kappa']
        mean_acc = df_m.ix['user', 'producer']
        mean_kap = df_m.ix['user', 'kappa']

        if inventory_txt:  # record scores only when an inventory file is given
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa', 'vote_mask',
                    'mean_accuracy', 'mean_kappa', 'mean_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap, False, \
                mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' + \
                  ' Model evaluation scores will not be recorded...'
        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        print 'Mean accuracy .............. ', mean_acc
        print 'Mean kappa ................. ', mean_kap
    else:
        print '\n"confusion_params" was not specified.' + \
              ' This model will not be evaluated...'
    #'''
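
# --- Hedged sketch: stem.aggregate_predictions reduces the stack of
# --- overlapping per-set predictions to vote (modal) and mean surfaces. A toy
# --- per-pixel vote is shown below, not the codebase's implementation; the
# --- stack values in the doctest are made up.
def _example_pixel_vote(stack):
    """Modal value along the first (set) axis of a (n_sets, rows, cols) stack.

    >>> preds = np.array([[[1, 2], [3, 0]],
    ...                   [[1, 2], [4, 0]],
    ...                   [[2, 2], [4, 1]]])
    >>> _example_pixel_vote(preds)
    array([[1, 2],
           [4, 0]])
    """
    flat = stack.reshape(stack.shape[0], -1)
    votes = [np.bincount(flat[:, i]).argmax() for i in range(flat.shape[1])]
    return np.array(votes).reshape(stack.shape[1:])
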
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None,
         resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0,
         snap_coord=None, overwrite_tiles=False, tile_id_field='name'):
    inputs = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = pd.read_csv(train_inputs['var_info'].replace('"', ''),
                             sep='\t', index_col='var_name')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if overwrite_tiles.lower() == 'false':
        overwrite_tiles = False

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    if not 'file_stamp' in inputs:
        file_stamp = os.path.basename(model_dir)

    db_path = os.path.join(model_dir, os.path.basename(model_dir) + '.db')
    if os.path.exists(db_path):
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#'''
    else:
        set_txt = stem.find_file(model_dir, '*support_sets.txt')
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # If subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(
                clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)#'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path, 1)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')

        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            i = 0
            for feature in mosaic_ds:
                g = feature.GetGeometryRef()
                # Check that the feature is valid. Clipping can produce a
                # feature w/ an area of 0
                if g.GetArea() > 1:
                    mosaic_geom.AddGeometry(g)
                else:
                    fid = feature.GetFID()
                    feature.Destroy()
                    mosaic_ds.DeleteFeature(fid)
            #import pdb; pdb.set_trace()
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom.UnionCascaded())
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]#'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path)  # Change to accept arbitrary geometry
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(out_dir, '_temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)

    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            pattern = re.compile('tile_\d+_%s.tif' % stat)
            stat_match = [f.split('_')[1] for f in files if pattern.match(f)]
            try:
                tile_files[stat] = pd.Series(np.ones(len(stat_match)),
                                             index=stat_match)
            except:
                pass
        #import pdb; pdb.set_trace()
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)
        tiles.set_index(index_field, inplace=True)#'''

    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res)
                     for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res)
                     for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin
                     for i, (ulx, xmin, xmax)
                     in tiles[['ul_x', 'xmin', 'xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin
                     for i, (uly, ymin, ymax)
                     in tiles[['ul_y', 'ymin', 'ymax']].iterrows()]

    support_nrows = int(support_size[0] / abs(y_res))
    support_ncols = int(support_size[1] / abs(x_res))

    t1 = time.time()

    # Patch for unknown Landcover screwup
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(
                ['1931', '2810', '0705', '0954', '2814', '1986', '2552',
                 '2019', '2355', '3354', '2278', '2559'])].iterrows())]
    # (overrides the patch above)
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[
                tiles['name'].isin(['0705'])].iterrows())]
    # Patch for the GEE subset 2 outside-of-buffer 'slice'
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(['0639','0718','0797','0876','0955','1034'])].iterrows())]
    # Original line
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tile_files.isnull().any(axis=1).values].iterrows())]

    limits = []
    for arg in args:
        print arg[3][tile_id_field]  # tile_info is position 3 of each arg tuple
        limits.append(stem.par_predict_tile(arg))#'''
    ### return
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % (
        (time.time() - t1) / 3600)
    try:
        limits = pd.concat(limits)
    except:
        # They're all None
        pass

    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        #dtype = mosaic.get_min_numpy_dtype(limits[stat])
        dtype = np.int16
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)  #dtype)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=dtype)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' %
                                     (tile_coords[tile_id_field], stat))
            try:
                ds = gdal.Open(tile_file)
            except:
                print 'Tile not found'
                continue
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off:row_off + tile_rows, col_off:col_off + tile_cols] = ar_tile
            except Exception as e:
                pass
                #import pdb; pdb.set_trace()

        out_path = os.path.join(out_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    #shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first',
                                                         ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''

    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0) / 3600)
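
# --- Hedged sketch of the offset arithmetic stem.calc_offset presumably does:
# --- convert a tile's upper-left map coordinate to a (row, col) offset within
# --- the mosaic. The geotransform in the doctest is made up (30 m pixels).
def _example_calc_offset(mosaic_ul, tile_ul, tx):
    """(row, col) of tile_ul relative to mosaic_ul for GDAL geotransform tx.

    >>> tx = (-2000000.0, 30.0, 0.0, 3000000.0, 0.0, -30.0)
    >>> _example_calc_offset((tx[0], tx[3]), (-1999700.0, 2999400.0), tx)
    (20, 10)
    """
    col_off = int(round((tile_ul[0] - mosaic_ul[0]) / tx[1]))
    row_off = int(round((tile_ul[1] - mosaic_ul[1]) / tx[5]))
    return row_off, col_off
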
def main(params, min_oob=0, err_threshold=10):
    t0 = time.time()

    #read_params(params)
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    try:
        if 'err_threshold' in inputs:
            err_threshold = int(err_threshold)
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, out_dir
        n_sets = int(n_sets)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    '''now = datetime.now()
    date_str = str(now.date()).replace('-','')
    time_str = str(now.time()).replace(':','')[:4]
    stamp = '{0}_{1}_{2}'.format(target_col, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir) # With a timestamp in dir, no need to check if it already exists'''
    #out_dir = '/vol/v2/stem/imperv/imperv_bdt'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    stamp = os.path.basename(out_dir)
    shutil.copy2(params, out_dir)  # Copy the params for reference

    df_train = pd.read_csv(sample_txt, sep='\t')

    # Check that df_train has exactly the same columns as variables specified
    # in df_vars
    unmatched_vars = [v for v in df_var.index if v not in [c for c in df_train]]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    predict_cols = sorted(np.unique([c for c in df_train.columns
                                     for v in df_var.index if v in c]))
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    # Make dataframe of set coords
    min_x, min_y, max_x, max_y, x_res, y_res, tx = stem.get_raster_bounds(mosaic_path)
    if x_res < 0:
        ul_x = max_x
        lr_x = min_x
    else:
        ul_x = min_x
        lr_x = max_x
    if y_res < 0:
        ul_y = max_y
        lr_y = min_y
    else:
        ul_y = min_y
        lr_y = max_y
    ar_sets = np.tile([ul_x, ul_y, lr_x, lr_y], n_sets).reshape(n_sets, 4)
    df_sets = pd.DataFrame(ar_sets, columns=['ul_x', 'ul_y', 'lr_x', 'lr_y'])

    # Train a tree for each support set
    print 'Training models...'
    t1 = time.time()
    #set_txt = os.path.join(out_dir, 'decisiontree_models/%s_support_sets.txt' % stamp)
    #df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    df_sets['dt_model'] = ''
    df_sets['oob_rate'] = 0
    df_sets[['dt_model', 'oob_rate']] = [stem.fit_bdt_tree_regressor(x_train, y_train)
                                         for s in df_sets.index]
    del df_train
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving models...'
    t1 = time.time()
    df_sets, set_txt = stem.write_model(out_dir, df_sets)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)#'''

    '''out_dir = '/vol/v2/stem/canopy/models/canopy_20161016_0910'
    stamp = os.path.basename(out_dir)
    set_txt = '/vol/v2/stem/{0}/models/{1}/decisiontree_models/{1}_support_sets.txt'.format(target_col, stamp)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    oob_txt = os.path.join(out_dir, '%s_oob.txt' % stamp)
    df_oob = pd.read_csv(oob_txt, sep='\t')
    ds = gdal.Open(os.path.join(out_dir, '%s_oob.bsq' % stamp))
    ar_oob = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(out_dir, '%s_count.bsq' % stamp))
    ar_cnt = ds.ReadAsArray()
    ds = None
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_bright','delta_green','delta_nbr','delta_wet',
                    'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    # Record params in inventory text file
    if 'inventory_txt' in locals():
        t1 = time.time()
        print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if 'regressor' in params:
            model_type = 'Regressor'
        else:
            model_type = 'Classifier'
        n_sets = len(df_sets)
        n_samples = int(sample_txt.split('_')[1].replace('sample', ''))
        info = [model_type, None, None, None, None, None, None, None, None,
                n_sets, n_samples, str(support_size), sets_per_cell, max_features]
        df_inv.ix[stamp] = info
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(info_dir)),
                                         '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]

        # Check if oob_map params were specified. If not, set to defaults
        if 'err_threshold' not in locals():
            print 'err_threshold not specified. Using default: 10 ...\n'
            err_threshold = 10
        else:
            err_threshold = int(err_threshold)
        if 'n_tiles' not in locals():
            print 'n_tiles not specified. Using default: 25 x 15 ...\n'
            n_tiles = 25, 15
        else:
            n_tiles = int(n_tiles[0]), int(n_tiles[1])
        #t1 = time.time()

        print 'Calculating OOB score and making OOB score map...'
        ds = gdal.Open(mosaic_path)
        ar = ds.ReadAsArray()
        mask = ar != 0
        del ar
        xsize = ds.RasterXSize
        ysize = ds.RasterYSize
        tx = ds.GetGeoTransform()
        prj = ds.GetProjection()
        driver = ds.GetDriver()
        ds = None

        #import get_oob_map as oob
        # NOTE: df_oob is only defined in the commented-out reload block above
        ar_oob, ar_cnt, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles,
                                               tx, support_size, df_oob,
                                               df_sets, target_col,
                                               predict_cols, err_threshold,
                                               out_dir, stamp, prj, driver)
        df_sets.to_csv(set_txt, sep='\t')#'''

        #if 'inventory_txt' in locals():
        avg_oob = round(np.mean(ar_oob[mask]), 1)
        avg_cnt = int(round(np.mean(ar_cnt[mask]), 0))
        df_inv.ix[stamp, 'avg_oob'] = avg_oob
        df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n'

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt
        print 'Time to make OOB score map: %.1f hours\n' % ((time.time() - t1) / 3600)
        #except Exception as e:
        #    print 'Problem getting oob map: ', e

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
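
# --- Hedged sketch: the upper-left/lower-right assignment above depends only
# --- on the sign of each pixel resolution. Compact standalone version; the
# --- bounds in the doctest are made up.
def _example_orient_bounds(min_x, min_y, max_x, max_y, x_res, y_res):
    """Return (ul_x, ul_y, lr_x, lr_y) for the given resolution signs.

    Negative resolution means coordinates decrease along that axis, so the
    upper-left corner takes the max rather than the min.

    >>> _example_orient_bounds(0, 0, 3000, 3000, 30, -30)
    (0, 3000, 3000, 0)
    """
    ul_x, lr_x = (max_x, min_x) if x_res < 0 else (min_x, max_x)
    ul_y, lr_y = (max_y, min_y) if y_res < 0 else (min_y, max_y)
    return ul_x, ul_y, lr_x, lr_y
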
def main(params, pct_train=None, aggregate_presence=False):
    t0 = time.time()

    # Read params. Make variables from each line of the 1-line variables
    inputs, df_vars = stem.read_params(params)
    for var in inputs:
        exec("{0} = str({1})".format(var, inputs[var]))
    try:
        if 'years' in inputs:
            years = np.array([int(yr) for yr in years.split(',')])
        else:
            year_start = int(year_start)
            year_end = int(year_end)
            years = np.arange(year_start, year_end + 1)
        '''tsa_mosaic = inputs['tsa_mosaic']
        search_dir = inputs['search_dir']
        search_str = inputs['search_str']
        obs_txt = inputs['obs_txt']
        index_col = inputs['index_col']
        year_col = inputs['year_col']
        target_col = inputs['target_col']
        out_txt = inputs['out_txt']'''
        add_file_tag = int(add_file_tag)
        #count_type = inputs['count_type']
    except KeyError as e:
        missing_var = str(e).split("'")[1]
        if missing_var in ['year_start', 'year_end', 'years']:
            msg = ('No list of years or year_start/year_end specified in'
                   ' param file:\n%s\n. Re-run script with either of these'
                   ' parameters given.') % params
        else:
            msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    out_dir, original_bn = os.path.split(out_txt)

    # Add informative tags to output dir and basename
    if add_file_tag:
        res = years[1] - years[0]
        #out_dir = os.path.basename(out_dir)
        now = datetime.datetime.now()
        date_str = str(now.date()).replace('-', '')
        time_str = str(now.time()).replace(':', '')[:4]
        out_dirname = '{0}_res{1}yr_{2}_{3}'.format(target_col, res, date_str, time_str)
        out_dir = os.path.join(out_dir, out_dirname)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        out_bn = '{0}_{1}'.format(os.path.basename(obs_txt).replace('.txt', ''),
                                  original_bn)
        out_txt = os.path.join(out_dir, out_bn)

    # Copy the params once for reference
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        print 'Copying params to output dir: %s\n' % out_dir
        shutil.copy2(params, out_dir)

    print 'Getting predictors... '
    t1 = time.time()
    df_obs = pd.read_csv(obs_txt, sep='\t', index_col=index_col)
    original_columns = df_obs.columns
    df = get_predictors(years, search_dir, search_str, df_obs, index_col,
                        year_col, df_vars)
    print '%.1f seconds\n' % (time.time() - t1)

    # Select count type and date range
    if 'count_type' in inputs:
        count_type = [t.strip() for t in count_type.split(',')]
        df = df[df.COUNT_TYPE.isin(count_type)]
        #df.drop(['COUNT_TYPE'], axis=1, inplace=True)
        if 'P21' in count_type:
            df = df[df.EFFORT_DISTANCE_KM < .1]
    if 'day_minmax' in inputs:
        day_min, day_max = [int(d) for d in day_minmax.split(',')]
        df = df[(df.DAY >= day_min) & (df.DAY <= day_max)]
    if 'time_minmax' in inputs:
        time_min, time_max = [int(t) for t in time_minmax.split(',')]
        df = df[(df.TIME >= time_min) & (df.TIME <= time_max)]
    if 'max_effort_time' in inputs:
        max_effort_time = int(max_effort_time)
        df = df[df.EFFORT_HRS < max_effort_time]
    if 'max_effort_dist' in inputs:
        max_effort_dist = int(max_effort_dist)
        df = df[df.EFFORT_DISTANCE_KM < max_effort_dist]
    #df = df[(df.YEAR >= min(years)) & (df.YEAR <= max(years))]
    #df[target_col] *= 100 # To be able to keep stuff as 8 bit ints

    # Calc row and col from x, y
    ds = gdal.Open(tsa_mosaic)
    tx = ds.GetGeoTransform()
    ds = None
    ul_xy = tx[0], tx[3]
    df['row'], df['col'] = zip(*[stem.calc_offset(ul_xy, xy, tx)
                                 for i, xy in df[['x', 'y']].iterrows()])

    if 'kernel_dist' in inputs:
        t1 = time.time()
        print 'Calculating kernel density...'
        kernel_dist = int(kernel_dist)
        for yr in df.YEAR.unique():
            yr_mask = df.YEAR == yr
            df_w = gaussain_weights(df[yr_mask], target_col, kernel_dist)
            df.ix[yr_mask, target_col] = df_w.weighted
        '''df_w = gaussain_weights(df, target_col, kernel_dist)
        df[target_col] = df_w.weighted
        #df = df.drop_duplicates(subset=[target_col, 'row', 'col'])'''
        print '%.1f seconds\n' % (time.time() - t1)
    #"""

    if aggregate_presence:
        t1 = time.time()
        print 'Aggregating presence records...'
        df.ix[df[target_col] > 0, target_col] = 1
        for yr in df.YEAR.unique():
            yr_mask = df.YEAR == yr
            df_yr = df[yr_mask]
            # Get unique locations for this year
            unique = df_yr[['row', 'col']].drop_duplicates().values
            for row, col in unique:
                this_loc = df_yr[(df_yr.row == row) & (df_yr.col == col)]
                # If there are 1s and 0s, drop the 0s
                if this_loc[target_col].min() == 0 and this_loc[target_col].max() == 1:
                    df.drop(this_loc[this_loc[target_col] == 0].index, inplace=True)
        print '%.1f seconds\n' % (time.time() - t1)

    if pct_train:
        print 'Splitting training and test sets...'
        pct_train = float(pct_train)
        #n_test = int(len(df) * (1 - pct_train))
        unique = df[['row', 'col']].drop_duplicates().values
        n_test = int(len(unique) * (1 - pct_train))
        random_idx = random.sample(xrange(len(unique)), n_test)
        random_row, random_col = zip(*unique[random_idx])
        df_test = df[df.row.isin(random_row) & df.col.isin(random_col)]
        test_idx = df_test.index
        test_txt = out_txt.replace('.txt', '_test.txt')
        df = df[~df.index.isin(test_idx)]
        df_test.to_csv(test_txt, sep='\t')

    df.to_csv(out_txt, sep='\t')
    obs_out_txt = out_txt.replace('_' + original_bn[:-4], '')
    df[original_columns].to_csv(obs_out_txt, sep='\t')

    print '\nLength of output df:', len(df)
    print 'Text file written to: ', out_txt
    print '\nTotal time: %.1f minutes' % ((time.time() - t0) / 60)
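
# --- Hedged sketch: gaussain_weights (sic, a helper in this codebase) smooths
# --- each year's target over nearby observations. Its internals aren't shown
# --- here; this is a generic Gaussian distance weighting, with made-up points.
# --- e.g. _example_gaussian_weighted([0, 1, 0], [0, 0, 50], [0, 1, 0], 2)
# --- pulls the two adjacent observations toward each other and leaves the
# --- distant third nearly unchanged.
def _example_gaussian_weighted(values, rows, cols, sigma):
    # For each observation, average all values weighted by a Gaussian kernel
    # of the pixel distance between observations
    values = np.asarray(values, dtype=float)
    rr = np.asarray(rows, dtype=float)
    cc = np.asarray(cols, dtype=float)
    out = np.empty_like(values)
    for i in range(len(values)):
        d2 = (rr - rr[i]) ** 2 + (cc - cc[i]) ** 2
        w = np.exp(-d2 / (2 * sigma ** 2))
        out[i] = np.sum(w * values) / np.sum(w)
    return out
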
def main(predict_params, start_year, end_year, out_dir, txt_out_dir,
         n_jobs_pred=0, n_jobs_agg=0, confusion=False, subset_shp=None):
    param_dict, df_var_orig = read_params(predict_params)
    for k, v in param_dict.iteritems():
        param_dict[k] = v.replace('"', '')
    param_basename = os.path.basename(predict_params)
    out_txt = os.path.join(txt_out_dir, param_basename)
    if not os.path.isdir(txt_out_dir):
        os.mkdir(txt_out_dir)
    sub_tag = os.path.basename(subset_shp.replace('.shp', ''))

    for year in range(int(start_year), int(end_year) + 1):
        print 'Making params for year %s...' % year

        #######################################################################
        # jdb added 6/2/2017 - make a fresh copy of the original df_var,
        # otherwise the data_band assignment below won't work right
        #######################################################################
        df_var = df_var_orig.copy()

        # Write the variable param table first
        band = year - 1983  # 1983 because gdal bands start at 1
        df_var.ix[df_var.data_band != 1, 'data_band'] = band
        df_var.data_band = df_var.data_band.astype(int)
        this_txt = out_txt.replace('.txt', '_%s_%s.txt' % (sub_tag, year))
        df_var.to_csv(this_txt, sep='\t')

        # Adjust a couple of variable values
        file_stamp = os.path.basename(param_dict['model_dir'].split('_')[0]) + \
            '_' + sub_tag + '_' + str(year)
        param_dict['file_stamp'] = file_stamp
        param_dict['out_dir'] = os.path.join(out_dir, str(year))
        #if 'confusion_params' in param_dict and not confusion:
        #    del param_dict['confusion_params']

        with open(this_txt, 'a') as txt:
            txt.write('\n')
            txt.write('model_dir; %s\n' % param_dict['model_dir'])
            txt.write('train_params; %s\n' % param_dict['train_params'])
            txt.write('mosaic_path; %s\n' % param_dict['mosaic_path'])
            txt.write('support_size; %s\n' % param_dict['support_size'])
            txt.write('n_tiles; %s\n' % '3,3')
            txt.write('nodata; %s\n' % param_dict['nodata'])
            txt.write('out_dir; %s\n' % param_dict['out_dir'])
            txt.write('agg_stats; vote, mean\n')
            txt.write('\nOptional Parameters\n')
            txt.write('file_stamp; %s\n' % param_dict['file_stamp'])
            if int(n_jobs_pred) != 0:
                n_jobs_pred = int(n_jobs_pred)
                txt.write('n_jobs_pred; %s\n' % n_jobs_pred)
            if int(n_jobs_agg) != 0:
                n_jobs_agg = int(n_jobs_agg)
                txt.write('n_jobs_agg; %s\n' % n_jobs_agg)
            if subset_shp:
                txt.write('subset_shp; %s' % subset_shp)
        print 'Params written to %s\n' % this_txt
def main(predict_params, start_year, end_year, out_dir, txt_out_dir, n_jobs=0,
         agg_stats='mean, vote, median, stdv', confusion=False,
         subset_shp=None, n_tiles=None):
    param_dict, df_var_orig = read_params(predict_params)
    for k, v in param_dict.iteritems():
        param_dict[k] = v.replace('"', '')
    param_basename = os.path.basename(predict_params)
    out_txt = os.path.join(txt_out_dir, param_basename)
    if not os.path.isdir(txt_out_dir):
        os.mkdir(txt_out_dir)

    for year in range(int(start_year), int(end_year) + 1):
        print 'Making params for year %s...' % year
        df_var = df_var_orig.copy()  # fresh copy so data_band edits don't accumulate

        # Write the variable param table first
        band = year - 1983  # 1983 because gdal bands start at 1
        df_var.ix[df_var.data_band != 1, 'data_band'] = band
        df_var.data_band = df_var.data_band.astype(int)
        this_txt = out_txt.replace('.txt', '_%s.txt' % year)
        df_var.to_csv(this_txt, sep='\t')

        # Adjust a couple of variable values
        file_stamp = os.path.basename(param_dict['model_dir'].split('_')[0]) + \
            '_' + str(year)
        param_dict['file_stamp'] = file_stamp
        param_dict['out_dir'] = os.path.abspath(os.path.join(out_dir, str(year)))
        param_dict['agg_stats'] = agg_stats
        #if 'confusion_params' in param_dict and not confusion:
        #    del param_dict['confusion_params']

        with open(this_txt, 'a') as txt:
            txt.write('\n')
            txt.write('model_dir; %s\n' % param_dict['model_dir'])
            txt.write('train_params; %s\n' % param_dict['train_params'])
            txt.write('mosaic_path; %s\n' % param_dict['mosaic_path'])
            txt.write('support_size; %s\n' % param_dict['support_size'])
            txt.write('nodata; %s\n' % param_dict['nodata'])
            txt.write('out_dir; %s\n' % param_dict['out_dir'])
            txt.write('agg_stats; %s\n' % param_dict['agg_stats'])
            txt.write('\nOptional Parameters\n')
            txt.write('file_stamp; %s\n' % param_dict['file_stamp'])
            if int(n_jobs) != 0:
                n_jobs = int(n_jobs)
                txt.write('n_jobs; %s\n' % n_jobs)
            if subset_shp:
                txt.write('subset_shp; %s\n' % subset_shp)
            if n_tiles:
                txt.write('n_tiles; %s\n' % n_tiles)
        print 'Params written to %s\n' % this_txt
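
# --- Hedged note: both param writers above map a calendar year to a GDAL band
# --- with band = year - 1983; GDAL bands are 1-based, so 1984 is band 1. The
# --- same mapping written explicitly:
def _example_year_to_band(year, first_year=1984):
    """1-based GDAL band index for a year in an annual stack.

    >>> _example_year_to_band(1984)
    1
    >>> _example_year_to_band(2000)
    17
    """
    return year - first_year + 1
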
def main(params, pct_train=None, min_oob=0, gsrd_shp=None, resolution=30,
         make_oob_map=False, snap_coord=None, oob_map_metric='oob_rate'):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    try:
        if 'max_features' not in locals():
            max_features = None
        if 'min_oob' in inputs:
            min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' % (target_col, sample_txt))

    # Make a timestamped output directory if out_dirname not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if 'out_dirname' not in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  # Copy the params for reference

    predict_cols = sorted(np.unique([c for c in df_train.columns
                                     for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    # If there are variables that should remain constant across the modeling
    #   region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path, cell_size, support_size, sets_per_cell,
                            df_train, min_obs, target_col, predict_cols, out_txt,
                            gsrd_shp, pct_train, snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    print 'Dumping train_txt to database...'
    t1 = time.time()
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Train a tree for each support set
    t1 = time.time()
    if model_type.lower() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    # Establish DB connection and create empty relationship table for sample inds
    cmd = ('CREATE TABLE set_samples (set_id INTEGER, sample_id INTEGER, in_bag INTEGER);')
    with sqlite3.connect(db_path) as connection:
        connection.executescript(cmd)
        connection.commit()
    insert_cmd = 'INSERT INTO set_samples (set_id, sample_id, in_bag) VALUES (?,?,?);'

    oob_rates = [0]
    for i, (set_id, ss) in enumerate(df_sets.iterrows()):
        format_tuple = (i + 1, n_sets, float(i) / n_sets * 100,
                        (time.time() - t1) / 60, np.mean(oob_rates))
        sys.stdout.write('\rTraining %s/%s DTs (%.1f%%) || %.1f minutes || Avg OOB: %d' % format_tuple)
        sys.stdout.flush()

        # Get all samples within this support set's bounding coords
        sample_inds = df_train.index[(df_train['x'] > ss[['ul_x', 'lr_x']].min()) &
                                     (df_train['x'] < ss[['ul_x', 'lr_x']].max()) &
                                     (df_train['y'] > ss[['ul_y', 'lr_y']].min()) &
                                     (df_train['y'] < ss[['ul_y', 'lr_y']].max())]
        n_samples = int(len(sample_inds) * .63)
        if n_samples < min_obs:
            df_sets.drop(set_id, inplace=True)
            continue

        this_x = x_train.ix[sample_inds]
        this_y = y_train.ix[sample_inds]
        support_set = df_sets.ix[set_id]
        dt_path = dt_path_template % set_id
        dt_model, train_inds, oob_inds, importance, oob_metrics = \
            stem.train_estimator(support_set, n_samples, this_x, this_y,
                                 model_func, model_type, max_features, dt_path)
        oob_rates.append(oob_metrics['oob_rate'])
        df_sets.ix[set_id, importance_cols] = importance
        df_sets.ix[set_id, 'dt_model'] = dt_model
        df_sets.ix[set_id, 'dt_file'] = dt_path
        df_sets.ix[set_id, 'n_samples'] = n_samples
        for metric in oob_metrics:
            df_sets.ix[set_id, metric] = oob_metrics[metric]

        # Save oob and train inds
        n_train = len(train_inds)
        n_oob = len(oob_inds)
        train_records = zip(np.full(n_train, set_id, dtype=int), train_inds,
                            np.ones(n_train, dtype=int))
        oob_records = zip(np.full(n_oob, set_id, dtype=int), oob_inds,
                          np.zeros(n_oob, dtype=int))
        with sqlite3.connect(db_path) as connection:
            connection.executemany(insert_cmd, train_records + oob_records)
            connection.commit()
    print '\n%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_train, db_path, target_col,
                                          predict_cols, min_oob)
    if len(low_oob) > 0:
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets, gsrd_shp, set_shp)
    except Exception as e:
        print e.message
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving support set info...'
    t1 = time.time()
    set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')  # also used by the OOB map below
    df_sets['set_id'] = df_sets.index
    df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # To remake an OOB map for an already-trained model, comment out the code
    #   above and read the support sets back in from the model database:
    #stamp = os.path.basename(out_dir)
    #db_path = os.path.join(out_dir, stamp + '.db')
    #engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    #with engine.connect() as con, con.begin():
    #    df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    #predict_cols = ['aspectNESW', 'aspectNWSE', 'brightness', 'delta_brightness',
    #                'delta_greenness', 'delta_nbr', 'delta_wetness', 'elevation',
    #                'greenness', 'mse', 'nbr', 'slope', 'time_since', 'wetness']

    if make_oob_map or 'oob_map_metric' in inputs:
        # Check if oob_map params were specified. If not, set to defaults
        if 'n_tiles' not in inputs:
            n_tiles = 40, 90
            print 'n_tiles not specified. Using default: %s x %s ...\n' % n_tiles
        else:
            n_tiles = [int(i) for i in n_tiles.split(',')]

        print 'Calculating OOB score and making OOB score map...'
        try:
            ds = gdal.Open(mosaic_path)
            ar = ds.ReadAsArray()
            mask = ar != 0
            del ar
            xsize = ds.RasterXSize
            ysize = ds.RasterYSize
            tx = ds.GetGeoTransform()
            prj = ds.GetProjection()
            driver = ds.GetDriver()
            ds = None
        except:
            # mosaic_path is a vector, so build the georeference from its extent
            mosaic_ds = ogr.Open(mosaic_path)
            if 'resolution' not in inputs:
                warnings.warn('Resolution not specified. Assuming default of 30...\n')
            mask = mosaic_ds.GetLayer()
            min_x, max_x, min_y, max_y = mask.GetExtent()
            ul_x = min_x - ((min_x - snap_coord[0]) % resolution)
            ul_y = max_y - ((max_y - snap_coord[1]) % resolution)
            xsize = int((max_x - ul_x) / resolution)
            ysize = int((ul_y - min_y) / resolution)
            prj = mask.GetSpatialRef().ExportToWkt()
            driver = gdal.GetDriverByName('gtiff')
            x_res = resolution
            y_res = -resolution
            tx = ul_x, x_res, 0, ul_y, 0, y_res

        avg_dict, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles, tx,
                                         support_size, db_path, df_sets, df_train,
                                         target_col, predict_cols, out_dir, stamp,
                                         prj, driver, oob_map_metric)
        df_sets.to_csv(set_txt, sep='\t')

        avg_oob = round(avg_dict[oob_map_metric], 1)
        avg_cnt = int(round(avg_dict['count'], 0))

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt
        print 'Time to make OOB score map: %.1f hours\n' % ((time.time() - t1) / 3600)

    # Record params in inventory text file
    if 'inventory_txt' in inputs:
        t1 = time.time()
        print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        n_sets = len(df_sets)
        inv_columns = df_inv.columns
        if 'avg_oob' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_oob'] = avg_oob
        if 'avg_count' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n'

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
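# A minimal usage sketch (hypothetical paths; the remaining settings, e.g.
#   sample_txt, target_col, mosaic_path, out_dir, model_type, cell_size,
#   support_size, sets_per_cell, and min_obs, come from the param file):
#
#   main('/vol/v2/stem/param_files/train_stem_params.txt',
#        make_oob_map=True, oob_map_metric='oob_rate')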
def main(model_dir, n_tiles, **kwargs):
    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]

    if not os.path.isdir(model_dir):
        message = 'model directory given does not exist or is not a directory: %s' % model_dir
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir, 'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir, 'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        # Default to every var that is the max importance of at least one set
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(int(var_id), var_name) for var_name, var_id in kwargs.iteritems()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' % \
        (len(empty_tiles), total_tiles, (time.time() - t1) / 60)
    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    support_size = [int(s) for s in
                    train_params['support_size'].replace('"', '').split(',')]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):
        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1, len(variables))
        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge where the
            #   tile size will be slightly different
            this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x - t_row.ul_x)
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row, this_size, support_size)

            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]

            print n_sets, ' Overlapping sets'
            importance_bands = []
            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():
                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y), mask_tx)
                tile_inds, a_inds = mosaic.get_offset_array_indices(tile_size, set_size, offset)

                # Get feature with maximum importance and fill tile with that val
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1], tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue
            print 'Average importance for this tile: %.1f' % np.mean(importance_values)

            # Aggregate: mean importance of all overlapping sets per pixel
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
            tile_mask = mask[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] | np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % ((time.time() - t2) / 60)

        out_path = os.path.join(out_dir, '%s_importance_%s.tif' % (model, v_name))
        try:
            array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'),
                            out_path, gdal.GDT_Byte, nodata)
        except Exception as e:
            print e
        print 'Time for this variable: %.1f minutes\n' % ((time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % (len(variables),
                                                           (time.time() - t0) / 3600)
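# A minimal usage sketch, assuming a model directory produced by training.
#   With no kwargs, a map is made for every variable that is the max importance
#   of at least one support set; kwargs are assumed to map a variable name to
#   its (hypothetical) feature index:
#
#   main('/vol/v2/stem/canopy/models/canopy_20170101_1200', '3,5')
#   main('/vol/v2/stem/canopy/models/canopy_20170101_1200', '3,5', elevation=7)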
def main(params, pct_train=None, min_oob=0, err_threshold=10):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    try:
        if 'max_features' not in locals():
            max_features = None
        if 'err_threshold' in inputs:
            err_threshold = float(err_threshold)
        if 'min_oob' in inputs:
            min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Make a timestamped output directory if out_dirname not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if 'out_dirname' not in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  # Copy the params for reference

    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)

    # Check that df_train has exactly the same columns as variables specified in df_vars
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)

    predict_cols = sorted(np.unique([c for c in df_train.columns
                                     for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_train, df_sets, df_oob = stem.get_gsrd(mosaic_path, cell_size, support_size,
                                              sets_per_cell, df_train, min_obs,
                                              target_col, predict_cols, out_txt,
                                              gsrd_shp, pct_train)

    # Train a tree for each support set
    print 'Training models...'
    t1 = time.time()
    x_train = df_train.reindex(columns=predict_cols + ['set_id'])
    y_train = df_train[[target_col, 'set_id']]
    df_sets['dt_model'] = [stem.fit_tree_regressor(x_train.ix[x_train.set_id == s, predict_cols],
                                                   y_train.ix[y_train.set_id == s, target_col],
                                                   max_features)
                           for s in df_sets.index]
    del df_train
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_oob, err_threshold,
                                          target_col, predict_cols, min_oob)
    if len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'gsrd_low_oob.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving models...'
    t1 = time.time()
    df_sets, set_txt = stem.write_model(out_dir, df_sets)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Record params in inventory text file and make the OOB map
    if 'inventory_txt' in locals():
        t1 = time.time()

        # Check if oob_map params were specified. If not, set to defaults
        if 'err_threshold' not in locals():
            print 'err_threshold not specified. Using default: 10 ...\n'
            err_threshold = 10
        else:
            err_threshold = int(err_threshold)
        if 'n_tiles' not in locals():
            print 'n_tiles not specified. Using default: 25 x 15 ...\n'
            n_tiles = 25, 15
        else:
            n_tiles = int(n_tiles[0]), int(n_tiles[1])

        print 'Calculating OOB score and making OOB score map...'
        ds = gdal.Open(mosaic_path)
        ar = ds.ReadAsArray()
        mask = ar != 0
        del ar
        xsize = ds.RasterXSize
        ysize = ds.RasterYSize
        tx = ds.GetGeoTransform()
        prj = ds.GetProjection()
        driver = ds.GetDriver()
        ds = None

        ar_oob, ar_cnt, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles, tx,
                                               support_size, df_oob, df_sets,
                                               target_col, predict_cols,
                                               err_threshold, out_dir, stamp,
                                               prj, driver)
        df_sets.to_csv(set_txt, sep='\t')

        avg_oob = round(np.mean(ar_oob[mask]), 1)
        avg_cnt = int(round(np.mean(ar_cnt[mask]), 0))

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt
        print 'Time to make OOB score map: %.1f hours\n' % ((time.time() - t1) / 3600)

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
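# A minimal usage sketch (hypothetical path; cell_size, support_size,
#   sets_per_cell, min_obs, sample_txt, target_col, mosaic_path, tsa_txt, and
#   out_dir are all read from the param file):
#
#   main('/vol/v2/stem/param_files/train_stem_gsrd_params.txt', err_threshold=10)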
def main(search_dir, models, t_path, inventory_txt, t_nodata=255):
    df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
    columns = df_inv.columns
    if 'vote_rmse' not in columns:
        df_inv['vote_rmse'] = None
    if 'mean_rmse' not in columns:
        df_inv['mean_rmse'] = None
    df_inv = df_inv.ix[models]

    ds = gdal.Open(t_path)
    ar_t = ds.ReadAsArray()
    nodata_mask = ar_t == t_nodata
    ds = None

    for model in models:
        print '\nCalculating RMSE for ', model
        model_dir = os.path.join(search_dir, model)
        if not os.path.exists(model_dir):
            print 'Model dir does not exist: %s. Skipping...\n' % model_dir
            continue

        confusion_params = os.path.join(model_dir, 'confusion_matrix_params.txt')
        if not os.path.exists(confusion_params):
            # Fall back on the predict and train param files for nodata and sample paths
            print 'Could not find confusion params: ', confusion_params
            predict_params = os.path.join(model_dir, 'predict_stem_params.txt')
            inputs, _ = stem.read_params(predict_params)
            p_nodata = int(inputs['nodata'].replace('"', ''))
            this_srch_str = os.path.join(model_dir, 'train_stem*_params.txt')
            train_params = glob.glob(this_srch_str)
            if len(train_params) == 0:
                print 'Can not find test data for ', model, '\n'
                continue
            train_params = train_params[0]
            inputs, _ = stem.read_params(train_params)
            test_txt = inputs['sample_txt'].replace('predictors', 'test').replace('"', '')
            train_txt = inputs['sample_txt'].replace('"', '')
        else:
            inputs = read_params(confusion_params)
            for k, v in inputs.iteritems():
                inputs[k] = v.replace('"', '')
            test_txt = inputs['sample_txt']
            p_nodata = int(inputs['p_nodata'])
            train_txt = inputs['sample_txt'].replace('_test', '').replace('"', '')

        train_sample = pd.read_csv(train_txt, sep='\t', index_col='obs_id')

        # Set any pixels used for training to an out-of-range ID so they can be
        #   avoided for testing
        n_rows, n_cols = ar_t.shape
        n_pixels = ar_t.size
        pixel_ids = np.arange(n_pixels, dtype=np.uint32).reshape(n_rows, n_cols)
        pixel_ids[train_sample.row, train_sample.col] = n_pixels  # will always be 1 more than last id
        pixel_ids[nodata_mask] = n_pixels

        n_samples = int(int(os.path.basename(train_txt).split('_')[1].replace('sample', '')) * 0.2)
        test_ids = np.array(random.sample(pixel_ids[pixel_ids != n_pixels], n_samples),
                            dtype=np.uint32)
        test_rows = test_ids / n_cols
        test_cols = test_ids % n_cols
        df = pd.DataFrame({'row': test_rows, 'col': test_cols})

        for agg_method in ['vote', 'mean']:
            p_path = os.path.join(model_dir, '%s_%s.bsq' % (model, agg_method))
            ds = gdal.Open(p_path)
            ar_p = ds.ReadAsArray()
            t_samples, p_samples = get_samples(ar_p, ar_t, p_nodata, t_nodata,
                                               samples=df, match='best')
            rmse = np.round(calc_rmse(t_samples, p_samples), 1)
            print agg_method, ': ', rmse
            df_inv.ix[model, '%s_rmse' % agg_method] = rmse

        out_txt = os.path.join(model_dir, '%s_random_test_sample%s.txt' % (model, n_samples))
        df.to_csv(out_txt, sep='\t', index=False)

    out_txt = inventory_txt.replace('.txt', '_randomRMSE.txt')
    df_inv.to_csv(out_txt, sep='\t')
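# A minimal usage sketch (hypothetical paths). models is a list of model
#   stamps that are also subdirectories of search_dir, and t_path is a raster
#   of reference (target) values:
#
#   main('/vol/v2/stem/canopy/models',
#        ['canopy_20170101_1200', 'canopy_20170201_0900'],
#        '/vol/v2/stem/canopy/truth/canopy_2001.bsq',
#        '/vol/v2/stem/canopy/model_info.txt')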
def main(txt, n_sample, out_txt, bins, train_params, by_psu=True, extract_predictors=True):
    n_sample = int(n_sample)
    bins = parse_bins(bins)

    df = pd.read_csv(txt, sep='\t', dtype={'tile_id': object})
    sample = pd.DataFrame(columns=df.columns)
    n_bins = len(bins)
    psu_ids = df.tile_id.unique()

    train_params, _ = stem.read_params(train_params)
    for var in train_params:
        exec("{0} = str({1})".format(var, train_params[var]))

    tiles = attributes_to_df(MOSAIC_SHP)

    if extract_predictors:
        var_info = pd.read_csv(var_info, sep='\t', index_col='var_name')
        for i, tile in enumerate(psu_ids):
            print("extracting %s of %s" % (i + 1, len(psu_ids)))
            sample_mask = df.tile_id == tile
            this_sample = df.loc[sample_mask]
            tile_ul = tiles.loc[tiles['name'] == tile, ['xmin', 'ymax']].values[0]
            mosaic_tx, extent = stem.tx_from_shp(MOSAIC_SHP, 30, -30)
            row_off, col_off = stem.calc_offset([mosaic_tx[0], mosaic_tx[3]],
                                                tile_ul, mosaic_tx)
            this_sample['local_row'] = this_sample.row - row_off
            this_sample['local_col'] = this_sample.col - col_off

            for var_name, var_row in var_info.iterrows():
                file_path = stem.find_file(var_row.basepath, var_row.search_str, tile)
                ds = gdal.Open(file_path)
                ar = ds.GetRasterBand(var_row.data_band).ReadAsArray()
                try:
                    if len(this_sample) == ar.size:
                        df.loc[sample_mask, var_name] = ar.ravel()
                    else:
                        df.loc[sample_mask, var_name] = ar[this_sample.local_row,
                                                           this_sample.local_col]
                except Exception as e:
                    print(e)
                ds = None
        df.to_csv(txt.replace('.txt', '_predictors.txt'))

    if by_psu:
        # Spread the sample evenly across PSUs, then across bins within each PSU
        n_per_psu = n_sample / len(psu_ids)
        n_per_bin = n_per_psu / n_bins
        for i, pid in enumerate(psu_ids):
            psu_pixels = df.loc[df.tile_id == pid]
            print("Sampling for %s of %s PSUs" % (i + 1, len(psu_ids)))
            for l, u in bins:
                this_bin = psu_pixels.loc[(l < psu_pixels.value) & (psu_pixels.value <= u)]
                if len(this_bin) > 0:
                    bin_sample_size = min(n_per_bin, len(this_bin))
                    sample = pd.concat([sample, this_bin.sample(bin_sample_size)])
                    print("Sampled %s for bin %s-%s" % (bin_sample_size, l, u))
                else:
                    print("No pixels between %s and %s found" % (l, u))
            print("")
    else:
        # Sample each bin from the full dataset
        n_per_bin = n_sample / n_bins
        for l, u in bins:
            this_bin = df.loc[(l < df.value) & (df.value <= u)]
            sample = pd.concat([sample, this_bin.sample(min(n_per_bin, len(this_bin)))])

    sample.to_csv(out_txt, index=False)
    print 'Sample written to ', out_txt
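# A minimal usage sketch (hypothetical paths). bins is parsed by parse_bins,
#   so a string of inclusive upper bounds like '0-25,25-50,50-75,75-100' is
#   assumed to work:
#
#   main('/vol/v2/stem/canopy/samples/canopy_pixels.txt', 10000,
#        '/vol/v2/stem/canopy/samples/canopy_sample.txt',
#        '0-25,25-50,50-75,75-100',
#        '/vol/v2/stem/param_files/train_stem_params.txt')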
def main(params, snap_coord=None, resolution=30, n_sizes=5, max_features=None, n_jobs=1):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    try:
        sets_per_cell = int(sets_per_cell)
        cell_size = [int(s) for s in cell_size.split(',')]
        min_size = int(min_size)
        max_size = int(max_size)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' % (target_col, sample_txt))

    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()
    if 'n_jobs' in inputs:
        n_jobs = int(n_jobs)

    predict_cols = sorted(np.unique([c for c in df_train.columns
                                     for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]

    t1 = time.time()
    if model_type.lower() == 'classifier':
        model_func = stem.fit_tree_classifier
    else:
        model_func = stem.fit_tree_regressor

    # Make grid
    x_res = resolution
    y_res = -resolution
    tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord)
    min_x, max_x, min_y, max_y = [int(i) for i in extent]
    cells = stem.generate_gsrd_grid(cell_size, min_x, min_y, max_x, max_y, x_res, y_res)
    grid = pd.DataFrame(cells, columns=['ul_x', 'ul_y', 'lr_x', 'lr_y'])
    grid.to_csv(out_txt.replace('.txt', '_grid.txt'))
    grid = intersecting_cells(grid, mosaic_path)
    stem.coords_to_shp(grid, '/vol/v2/stem/extent_shp/CAORWA.shp',
                       out_txt.replace('.txt', '_grid.shp'))

    if 'set_sizes' in inputs:
        set_sizes = np.sort([int(s) for s in set_sizes.split(',')])
    else:
        if 'n_sizes' in inputs:
            n_sizes = int(n_sizes)
        set_sizes = np.arange(min_size, max_size + 1, (max_size - min_size) / n_sizes)

    # Sample grid
    dfs = []
    for i, cell in grid.iterrows():
        ul_x, ul_y, lr_x, lr_y = cell
        min_x, max_x = min(ul_x, lr_x), max(ul_x, lr_x)
        min_y, max_y = min(ul_y, lr_y), max(ul_y, lr_y)

        # Calculate support set centers, snapped to the reference coordinate
        x_centers = [int(stem.snap_coordinate(x, snap_coord[0], x_res))
                     for x in random.sample(xrange(min_x, max_x + 1), sets_per_cell)]
        y_centers = [int(stem.snap_coordinate(y, snap_coord[1], y_res))
                     for y in random.sample(xrange(min_y, max_y + 1), sets_per_cell)]

        for size in set_sizes:
            df = stem.sample_gsrd_cell(sets_per_cell, cell, size, size,
                                       x_res, y_res, tx, snap_coord,
                                       center_coords=zip(x_centers, y_centers))
            df['set_size'] = size
            df['cell_id'] = i
            dfs.append(df)
    support_sets = pd.concat(dfs, ignore_index=True)
    n_sets = len(support_sets)

    print 'Testing set sizes with %s jobs...\n' % n_jobs
    oob_metrics = _par_train_estimator(n_jobs, n_sets, df_train, predict_cols,
                                       target_col, support_sets, model_func,
                                       model_type, max_features, max_target_val)

    oob_metrics = pd.DataFrame(oob_metrics)
    oob_metrics.set_index('set_id', inplace=True)
    support_sets = pd.merge(support_sets, oob_metrics, left_index=True, right_index=True)
    support_sets.to_csv(out_txt)
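# A minimal usage sketch (hypothetical path; sets_per_cell, cell_size,
#   min_size, max_size, sample_txt, target_col, model_type, mosaic_path, and
#   out_txt come from the param file):
#
#   main('/vol/v2/stem/param_files/set_size_test_params.txt',
#        snap_coord='15,15', n_sizes=5, n_jobs=4)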
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None,
         resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0,
         snap_coord=None, overwrite_tiles=False, tile_id_field='name'):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files ' +
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    if 'file_stamp' not in inputs:
        file_stamp = os.path.basename(model_dir)

    # Read support sets from the model DB, falling back on the txt file
    db_path = os.path.join(model_dir, file_stamp + '.db')
    try:
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    except:
        set_paths = glob.glob(os.path.join(model_dir,
                                           'decisiontree_models/*support_sets.txt'))
        if len(set_paths) == 0:
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_paths[0], sep='\t', index_col='set_id')

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # If subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(
                clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                mosaic_geom.AddGeometry(feature.GetGeometryRef())
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom)
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path)  # Change to accept arbitrary geometry
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)

    tile_dir = os.path.join(model_dir, 'temp_tiles')
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)

    # If not overwriting, skip tiles that already have a file for every agg stat
    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            stat_match = [f.split('_')[1] for f in
                          fnmatch.filter(files, 'tile*%s.tif' % stat)]
            tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)]
        tiles.set_index(index_field, inplace=True)

    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res)
                     for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res)
                     for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin
                     for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin
                     for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymax']].iterrows()]

    support_nrows = int(support_size[0] / abs(y_res))
    support_ncols = int(support_size[1] / abs(x_res))
    t1 = time.time()
    args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var,
             (support_nrows, support_ncols), agg_stats, tile_path_template,
             prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.iterrows())]

    if n_jobs > 1:
        print 'Predicting with %s jobs...\n' % n_jobs
        pool = Pool(n_jobs)
        pool.map(stem.predict_tile, args, 1)
        pool.close()
        pool.join()
    else:
        print 'Predicting with 1 job...\n'
        for arg in args:
            stem.predict_tile(*arg)
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1) / 3600)

    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' %
                                     (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            ar[row_off:row_off + tile_rows, col_off:col_off + tile_cols] = ar_tile

        out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance from each pickled tree
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')

    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in and evaluate
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']

        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user', 'producer']
            mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
                ' Model evaluation scores will not be recorded...'

        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...'

    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0) / 3600)
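# A minimal usage sketch (hypothetical path; model_dir, mosaic_path, out_dir,
#   train_params, support_size, nodata, and agg_stats are read from the param
#   file):
#
#   main('/vol/v2/stem/param_files/predict_stem_params.txt', n_jobs=8)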