def main():
    """Build a confusion matrix per input file and aggregate their last lines.

    The directory to scan is taken from the first command-line argument.
    ``confusion_matrix.main`` is run on every ``.txt`` file in it, and the
    last line of each resulting ``<name>_confusion_matrix.txt`` (opened
    relative to the current working directory) is collected together with
    the ``first_<n>`` and ``starting_<n>`` numbers found in that file name
    (0 when absent). The records are sorted by ``first`` and written, one
    per line, to ``aggregated_confusion_matrix.txt``, which is then handed
    to ``generate_gnuplot``.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s <directory containing .txt files>' % sys.argv[0])
    path = sys.argv[1]
    txt_paths = [os.path.join(path, f) for f in os.listdir(path)
                 if f.endswith('.txt')]

    res = []
    for txt_path in txt_paths:
        confusion_matrix.main(txt_path)
        # basename() rather than split('/') so that the separator used by
        # os.path.join is always the one that gets stripped.
        file_name = os.path.basename(txt_path).split('.')[0]
        input_file = '{}_confusion_matrix.txt'.format(file_name)

        m = re.search(r'starting_(\d+)', input_file)
        starting = int(m.group(1)) if m is not None else 0
        m = re.search(r'first_(\d+)', input_file)
        first = int(m.group(1)) if m is not None else 0

        with open(input_file) as f:
            lines = f.readlines()
        if not lines:
            # An empty matrix file has no summary line to aggregate.
            sys.stderr.write('WARNING: %s is empty, skipping\n' % input_file)
            continue
        # Strip the line ending here and add exactly one when writing, so a
        # final line without a trailing newline cannot glue two records.
        res.append((first, starting, lines[-1].rstrip('\r\n')))

    res.sort(key=lambda rec: rec[0])

    output_file = 'aggregated_confusion_matrix.txt'
    # Mode 'w' truncates any previous output, replacing the former
    # remove-then-append sequence and its bare ``except: pass``.
    with open(output_file, 'w') as f:
        for first, starting, data in res:
            f.write('{} {} {}\n'.format(first, starting, data))
    generate_gnuplot(output_file)
def main(params, inventory_txt=None, constant_vars=None): inputs, df_var = stem.read_params(params) for i in inputs: exec ("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float try: n_tiles = [int(i) for i in n_tiles.split(',')] support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs, train_vars = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) # If constants were given, make a dict and make sure they match the training # constants if 'constant_vars' in inputs: constant_vars = parse_constant_vars(constant_vars) pred_constants = sorted(constant_vars.keys()) train_constants = [i.replace(' ', '') for i in train_inputs['constant_vars'].strip('"').split(',')] train_constants = sorted(train_constants) unmatched_vars = [v for v in pred_vars if v not in train_vars] if 'constant_vars' in inputs: unmatched_vars += [v for v in pred_constants if v not in train_constants] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str raise NameError(msg) unmatched_vars = [v for v in train_vars if v not in pred_vars] if 'constant_vars' in inputs: unmatched_vars += [v for v in train_constants if v not in pred_constants] pred_vars += pred_constants # Add here because it would screw with stuff upstream if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) 
msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not exist:\n%s' % mosaic_path) mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0] df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') total_sets = len(df_sets) t0 = time.time() if 'n_jobs' in inputs: # Predict in parallel n_jobs = int(n_jobs) args = [] t1 = time.time() print 'Predicting in parallel with %s jobs...' % n_jobs print 'Building args and making rasters of TSA arrays...' 
for c, (set_id, row) in enumerate(df_sets.iterrows()): # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic_ds coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx, xsize, ysize, nodata=nodata) tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id) tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5] dtype_code = mosaic_ds.GetRasterBand(1).DataType mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster, stem.get_gdal_dtype(dtype_code), silent=True) # Build list of args to pass to the Pool tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id) ds = gdal.Open(tsa_raster) tsa_tx = ds.GetGeoTransform() ds = None tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tsa_tx[0], tsa_tx[3]), tsa_tx) args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords, mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, constant_vars, predict_dir]) print '%.1f minutes\n' % ((time.time() - t1)/60) p = Pool(n_jobs) p.map(stem.par_predict, args, 1) else: # Loop through each set and generate predictions for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()): t1 = time.time() with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) print '\nPredicting for set %s of %s' % (c + 1, total_sets) coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, mosaic_tx, xsize, ysize, dt_model, nodata, np.int16, constant_vars) tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id) mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata) print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60) #mosaic_ds = None print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#''' #Aggregate predictions by tile and stitch them back 
together if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) ar_vote, pct_importance, df_sets = stem.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0) #df_sets.to_csv(set_txt, sep='\t')''' mosaic_ds = None # Save the importance values importance = pd.DataFrame({'variable': pred_vars, 'pct_importance': pct_importance, 'index': range(len(pred_vars)) }) importance.set_index('index', inplace=True) importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)] out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp) importance.to_csv(out_txt, sep='\t')#''' '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp)) ar_vote = ds.ReadAsArray() ds = None ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp)) ar_mean = ds.ReadAsArray() ds = None#''' if 'confusion_params' in locals(): import confusion_matrix as confusion vote_dir = os.path.join(model_dir, 'evaluation_vote') mean_dir = os.path.join(model_dir, 'evaluation_mean') print '\nComputing confusion matrix for vote...' out_txt = os.path.join(vote_dir, 'confusion.txt') print confusion_params df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True) try: out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt') df_v_off = confusion.main(confusion_params, ar_vote, out_txt) except Exception as e: print e '''print '\nGetting confusion matrix for mean...' 
out_txt = os.path.join(mean_dir, 'confusion.txt') df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True) try: out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt') df_m_off = confusion.main(confusion_params, ar_mean, out_txt) except Exception as e: print e#''' vote_acc = df_v.ix['producer', 'user'] vote_kap = df_v.ix['producer', 'kappa'] #mean_acc = df_m.ix['user','producer'] #mean_kap = df_m.ix['user', 'kappa'] if 'inventory_txt' in inputs: df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask'] df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False df_inv.to_csv(inventory_txt, sep='\t') else: print '\n"inventory_txt" was not specified.' +\ ' Model evaluation scores will not be recorded...' print '' print 'Vote accuracy .............. ', vote_acc print 'Vote kappa ................. ', vote_kap #print 'Mean accuracy .............. ', mean_acc #print 'Mean kappa ................. ', mean_kap else: print '\n"confusion_params" was not specified.' +\ ' This model will not be evaluated...' #''' print '\nTotal prediction runtime: %.1f\n' % ((time.time() - t0)/60)
def main(params, inventory_txt=None): inputs, df_var = stem.read_params(params) for i in inputs: exec("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band] #sometimes read as float try: n_tiles = [int(i) for i in n_tiles.split(',')] support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: _, train_vars = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) unmatched_vars = [v for v in pred_vars if v not in train_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str raise NameError(msg) unmatched_vars = [v for v in train_vars if v not in pred_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: #shutil.copy2(confusion_params, out_dir) conf_bn = os.path.basename(confusion_params) confusion_params = os.path.join(out_dir, conf_bn) if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not exist:\n%s' % 
mosaic_path) mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) set_txt = glob.glob( os.path.join('/vol/v2/stem/imperv/imperv_bdt', 'decisiontree_models/*support_sets.txt'))[0] df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') total_sets = len(df_sets) '''# Loop through each set and generate predictions t0 = time.time() for c, (set_id, row) in enumerate(df_sets.iterrows()): t1 = time.time() with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) print '\nPredicting for set %s of %s' % (c + 1, total_sets) ar_coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] ar_predict = stem.predict_set_in_pieces(set_id, df_var, mosaic_ds, ar_coords, mosaic_tx, xsize, ysize, dt_model, nodata) tx = ar_coords.ul_x, x_res, x_rot, ar_coords.ul_y, y_rot, y_res out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id) array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata) print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60) #mosaic_ds = None print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#''' #Aggregate predictions by tile and stitch them back together if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) ar_mean, ar_vote, pct_importance, df_sets = stem.aggregate_predictions( ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0) #df_sets.to_csv(set_txt, sep='\t')''' mosaic_ds = None ds = gdal.Open('/vol/v2/stem/canopy/canopy_bdt/canopy_bdt_vote.bsq') ar_vote = ds.ReadAsArray() ds = None if 'confusion_params' in locals(): import confusion_matrix as confusion vote_dir = os.path.join(model_dir, 
'evaluation_vote') mean_dir = os.path.join(model_dir, 'evaluation_mean') print '\nGetting confusion matrix for vote...' out_txt = os.path.join(vote_dir, 'confusion.txt') df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True) try: out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt') df_v_off = confusion.main(confusion_params, ar_vote, out_txt) except Exception as e: print e print '\nGetting confusion matrix for mean...' out_txt = os.path.join(mean_dir, 'confusion.txt') df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True) try: out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt') df_m_off = confusion.main(confusion_params, ar_mean, out_txt) except Exception as e: print e vote_acc = df_v.ix['user', 'producer'] vote_kap = df_v.ix['user', 'kappa'] mean_acc = df_m.ix['user', 'producer'] mean_kap = df_m.ix['user', 'kappa'] if 'inventory_txt': df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') cols = [ 'vote_accuracy', 'vote_kappa', 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask' ] df_inv.ix[file_stamp, cols] = vote_acc, vote_kap, False, \ mean_acc, mean_kap, False df_inv.to_csv(inventory_txt, sep='\t') else: print '\n"inventory_txt" was not specified.' +\ ' Model evaluation scores will not be recorded...' print '' print 'Vote accuracy .............. ', vote_acc print 'Vote kappa ................. ', vote_kap print 'Mean accuracy .............. ', mean_acc print 'Mean kappa ................. ', mean_kap else: print '\n"confusion_params" was not specified.' +\ ' This model will not be evaluated...' #'''
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs_pred=0, n_jobs_agg=0, mosaic_nodata=0): inputs, df_var = stem_conus.read_params(params) for i in inputs: exec("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band] #sometimes read as float try: support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs, train_vars = stem_conus.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) unmatched_vars = [v for v in pred_vars if v not in train_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not exist:\n%s' % mosaic_path) if 
mosaic_path.endswith('.shp'): mosaic_type = 'vector' if 'resolution' not in inputs: warnings.warn('Resolution not specified. Using default of 30...\n') mosaic_dataset = ogr.Open(mosaic_path) mosaic_ds = mosaic_dataset.GetLayer() min_x, max_x, min_y, max_y = mosaic_ds.GetExtent() xsize = int((max_x - min_x) / resolution) ysize = int((max_y - min_y) / resolution) prj = mosaic_ds.GetSpatialRef().ExportToWkt() x_res = resolution y_res = -resolution x_rot = 0 y_rot = 0 mosaic_tx, extent = stem_conus.tx_from_shp(mosaic_path, x_res, y_res) #df_tiles = attributes_to_df(mosaic_path) else: mosaic_type = 'raster' mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx driver = gdal.GetDriverByName('gtiff') # If number of tiles not given, need to set it if 'n_tiles' not in inputs: print 'n_tiles not specified. Using default: 25 x 15 ...\n' n_tiles = 25, 15 else: n_tiles = [int(i) for i in n_tiles.split(',')] df_tiles, df_tiles_rc, tile_size = stem_conus.get_tiles( n_tiles, xsize, ysize, mosaic_tx) predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) set_txt = glob.glob( os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0] df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') total_sets = len(df_sets) t0 = time.time() if 'n_jobs_pred' in inputs: n_jobs = int(n_jobs_pred) # Predict in parallel args = [] t1 = time.time() print 'Predicting in parallel with %s jobs...' % n_jobs print 'Building args and making rasters of tile arrays...' 
for c, (set_id, row) in enumerate(df_sets.iterrows()): # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] '''if mosaic_type == 'vector': tsa_ar, tsa_off = mosaic.kernel_from_shp(mosaic_ds, coords, mosaic_tx, nodata=0) else: tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx, xsize, ysize, nodata=nodata) set_mosaic_path = os.path.join(predict_dir, 'tsa_%s.tif' % set_id) tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5] np_dtype = get_min_numpy_dtype(tsa_ar) gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype) mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, set_mosaic_path, gdal_dtype, silent=True) pct_progress = float(c + 1)/total_sets * 100 sys.stdout.write('\rRetreived points for feature %s of %s (%%%.1f)' % (c + 1, total_sets, pct_progress)) sys.stdout.flush()''' # Build list of args to pass to the Pool #tsa_off = stem_conus.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tx_out[0], tx_out[3]), tx_out) args.append([ coords, mosaic_type, mosaic_path, mosaic_tx, prj, nodata, c, total_sets, set_id, df_var, xsize, ysize, row.dt_file, nodata, np.uint8, constant_vars, predict_dir ]) #args.append([c, total_sets, set_id, df_var, set_mosaic_path, tsa_off, coords, #mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, #constant_vars, predict_dir]) print '%.1f minutes\n' % ((time.time() - t1) / 60) p = Pool(n_jobs) p.map(stem_conus.par_predict, args, 1) else: # Loop through each set and generate predictions for c, (set_id, row) in enumerate(df_sets.iterrows()): t1 = time.time() with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) print '\nPredicting for set %s of %s' % (c + 1, total_sets) coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] ar_predict = stem_conus.predict_set(set_id, df_var, mosaic_ds, coords, mosaic_tx, xsize, ysize, dt_model, nodata, np.int16, constant_vars) tx = coords.ul_x, x_res, x_rot, coords.ul_y, 
y_rot, y_res out_path = os.path.join(predict_dir, 'prediction_%s.tif' % set_id) np_dtype = get_min_numpy_dtype(ar_predict) gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype) mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata) print 'Total time for this set: %.1f minutes' % ( (time.time() - t1) / 60) #mosaic = None print '\nTotal time for predicting: %.1f hours\n' % ( (time.time() - t0) / 3600) #''' """ #Aggregate predictions by tile and stitch them back together if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) t1 = time.time() agg_stats = [s.strip().lower() for s in agg_stats.split(',')] if 'n_jobs_agg' in inputs: n_jobs_agg = int(n_jobs_agg) if mosaic_type == 'vector': nodata_mask = mosaic_ds else: if 'mosaic_nodata' in inputs: mosaic_nodata = int(mosaic_nodata) nodata_mask = mosaic_ds.ReadAsArray() != mosaic_nodata ######################################################################################################################################## # jdb 6/22/17 check for sets that errored - if there are any, remove them from the df_sets DF so that the aggregation step doesn't expect them setErrorLog = os.path.dirname(predict_dir) + '/predication_errors.txt' if os.path.isfile(setErrorLog): with open(setErrorLog) as f: lines = f.readlines() badSets = [ int(line.split(':')[1].rstrip().strip()) for line in lines if 'set_id' in line ] for thisSet in badSets: df_sets = df_sets[df_sets.index != thisSet] ######################################################################################################################################## pct_importance, df_sets = stem_conus.aggregate_predictions( n_tiles, ysize, xsize, nodata, nodata_mask, mosaic_tx, support_size, agg_stats, predict_dir, df_sets, out_dir, file_stamp, prj, driver, n_jobs_agg) #print 'Total aggregation time: %.1f hours\n' % ((time.time() - t0)/3600) mosaic_ds = None mosaic_dataset = None # Save the importance values importance = 
pd.DataFrame({ 'variable': pred_vars, 'pct_importance': pct_importance, 'index': range(len(pred_vars)) }) importance.set_index('index', inplace=True) importance['rank'] = [ int(r) for r in importance.pct_importance.rank(method='first', ascending=False) ] out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp) importance.to_csv(out_txt, sep='\t') #''' if 'confusion_params' in locals(): import confusion_matrix as confusion ''' Read the mean or vote back in ''' if 'vote' in agg_stats: vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp) ar_vote = gdal.Open(vote_path) print '\nComputing confusion matrix for vote...' vote_dir = os.path.join(model_dir, 'evaluation_vote') out_txt = os.path.join(vote_dir, 'confusion.txt') df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True) vote_acc = df_v.ix['producer', 'user'] vote_kap = df_v.ix['producer', 'kappa'] '''try: out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt') df_v_off = confusion.main(confusion_params, ar_vote, out_txt) except Exception as e: print e''' if 'mean' in agg_stats: mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp) ar_mean = gdal.Open(mean_path) print '\nGetting confusion matrix for mean...' 
mean_dir = os.path.join(model_dir, 'evaluation_mean') out_txt = os.path.join(mean_dir, 'confusion.txt') df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True) mean_acc = df_m.ix['user', 'producer'] mean_kap = df_m.ix['user', 'kappa'] '''try: out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt') df_m_off = confusion.main(confusion_params, ar_mean, out_txt) except Exception as e: print e#''' if 'inventory_txt' in inputs: df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') cols = [ 'vote_accuracy', 'vote_kappa' ] #, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask'] df_inv.ix[ file_stamp, cols] = vote_acc, vote_kap #, False, mean_acc, mean_kap, False df_inv.to_csv(inventory_txt, sep='\t') else: print '\n"inventory_txt" was not specified.' +\ ' Model evaluation scores will not be recorded...' print '' if 'vote' in agg_stats: print 'Vote accuracy .............. ', vote_acc print 'Vote kappa ................. ', vote_kap if 'mean' in agg_stats: print 'Mean accuracy .............. ', mean_acc print 'Mean kappa ................. ', mean_kap else: print '\n"confusion_params" was not specified.' +\ ' This model will not be evaluated...' #''' print '\nTotal prediction runtime: %.1f\n' % ((time.time() - t0) / 60)
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'): inputs, df_var = stem.read_params(params) for i in inputs: exec ("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float try: support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs, train_vars = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) unmatched_vars = [v for v in pred_vars if v not in train_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not 
exist:\n%s' % mosaic_path) predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) db_path = os.path.join(model_dir, file_stamp + '.db') try: engine = sqlalchemy.create_engine('sqlite:///%s' % db_path) with engine.connect() as con, con.begin(): df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#''' except: set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0] if not os.path.isfile(set_txt): raise IOError('No database or support set txt file found') df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') if mosaic_path.endswith('.shp'): mosaic_type = 'vector' # if subset specified, clip the mosaic and set mosaic path to clipped shp if 'subset_shp' in inputs: out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp') out_shp = os.path.join(out_dir, out_shp_bn) cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path) subprocess.call(cmd, shell=True)#''' mosaic_path = out_shp mosaic_dataset = ogr.Open(mosaic_path) mosaic_ds = mosaic_dataset.GetLayer() min_x, max_x, min_y, max_y = mosaic_ds.GetExtent() if 'resolution' not in inputs: warnings.warn('Resolution not specified. 
Using default of 30...\n') # If subset specified, just get sets that overlap the subset if 'subset_shp' in inputs: mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon) for feature in mosaic_ds: mosaic_geom.AddGeometry(feature.GetGeometryRef()) df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom) xsize = int((max_x - min_x)/resolution) ysize = int((max_y - min_y)/resolution) prj = mosaic_ds.GetSpatialRef().ExportToWkt() x_res = resolution y_res = -resolution x_rot = 0 y_rot = 0 if 'snap_coord' in train_inputs: snap_coord = train_inputs['snap_coord'].replace('"','') snap_coord = [float(c) for c in snap_coord.split(',')]#''' mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord) tiles = stem.attributes_to_df(mosaic_path) # Change to accept arbittary geometry else: mosaic_type = 'raster' mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx #driver = gdal.GetDriverByName('gtiff') # If number of tiles not given, need to set it if 'n_tiles' not in inputs: print 'n_tiles not specified. 
Using default: 25 x 15 ...\n' n_tiles = 90, 40 else: n_tiles = [int(i) for i in n_tiles.split(',')] #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx) total_sets = len(df_sets) t0 = time.time() last_dts = pd.Series() agg_stats = [s.strip().lower() for s in agg_stats.split(',')] n_jobs = int(n_jobs) tile_dir = os.path.join(model_dir, 'temp_tiles') #tile_dir = '/home/server/pi/homes/shooper/delete_test' if not os.path.isdir(tile_dir): os.mkdir(tile_dir) tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif') n_tiles = len(tiles) if not overwrite_tiles: files = os.listdir(tile_dir) tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field]) for stat in agg_stats: stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)] tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match) index_field = tiles.index.name tiles[index_field] = tiles.index tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)] tiles.set_index(index_field, inplace=True) tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res) for i, (xmin, xmax) in tiles[['xmin','xmax']].iterrows()] tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res) for i, (ymin, ymax) in tiles[['ymin','ymax']].iterrows()] tiles['lr_x'] = [xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin','xmin']].iterrows()] tiles['lr_y'] = [ymax if uly == ymin else ymin for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin','ymin']].iterrows()] support_nrows = int(support_size[0]/abs(y_res)) support_ncols = int(support_size[1]/abs(x_res)) t1 = time.time() args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())] #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, 
(support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())] if n_jobs > 1: print 'Predicting with %s jobs...\n' % n_jobs pool = Pool(n_jobs) pool.map(stem.predict_tile, args, 1) pool.close() pool.join() else: for arg in args: print 'Predicting with 1 job ...\n' stem.predict_tile(*arg)#''' print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1)/3600) t1 = time.time() mosaic_ul = mosaic_tx[0], mosaic_tx[3] driver = gdal.GetDriverByName('gtiff') for stat in agg_stats: if stat == 'stdv': this_nodata = -9999 ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) else: this_nodata = nodata ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8) for tile_id, tile_coords in tiles.iterrows(): tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat)) ds = gdal.Open(tile_file) tile_tx = ds.GetGeoTransform() tile_ul = tile_tx[0], tile_tx[3] row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx) # Make sure the tile doesn't exceed the size of ar tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows) try: ar[row_off : row_off + tile_rows, col_off : col_off + tile_cols] = ar_tile except Exception as e: import pdb; pdb.set_trace() out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat)) #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat)) gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype) mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata) # Clean up the tiles shutil.rmtree(tile_dir) print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60) # Get feature importances and max importance per set t1 = time.time() print 'Getting importance values...' 
importance_cols = sorted([c for c in df_sets.columns if 'importance' in c]) df_sets['max_importance'] = nodata if len(importance_cols) == 0: # Loop through and get importance importance_per_var = [] for s, row in df_sets.iterrows(): with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) max_importance, this_importance = stem.get_max_importance(dt_model) df_sets.ix[s, 'max_importance'] = max_importance importance_per_var.append(this_importance) importance = np.array(importance_per_var).mean(axis=0) else: df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1) importance = df_sets[importance_cols].mean(axis=0).values pct_importance = importance / importance.sum() print '%.1f minutes\n' % ((time.time() - t1)/60) # Save the importance values importance = pd.DataFrame({'variable': pred_vars, 'pct_importance': pct_importance, 'index': range(len(pred_vars)) }) importance.set_index('index', inplace=True) importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)] out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp) importance.to_csv(out_txt, sep='\t')#''' if 'confusion_params' in locals(): import confusion_matrix as confusion ''' Read the mean or vote back in ''' if 'vote' in agg_stats: vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp) ar_vote = gdal.Open(vote_path) print '\nComputing confusion matrix for vote...' 
vote_dir = os.path.join(model_dir, 'evaluation_vote') out_txt = os.path.join(vote_dir, 'confusion.txt') df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True) vote_acc = df_v.ix['producer', 'user'] vote_kap = df_v.ix['producer', 'kappa'] '''try: out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt') df_v_off = confusion.main(confusion_params, ar_vote, out_txt) except Exception as e: print e''' if 'mean' in agg_stats: mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp) ar_mean = gdal.Open(mean_path) print '\nGetting confusion matrix for mean...' mean_dir = os.path.join(model_dir, 'evaluation_mean') out_txt = os.path.join(mean_dir, 'confusion.txt') df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True) mean_acc = df_m.ix['user','producer'] mean_kap = df_m.ix['user', 'kappa'] '''try: out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt') df_m_off = confusion.main(confusion_params, ar_mean, out_txt) except Exception as e: print e#''' if 'inventory_txt' in inputs: df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask'] df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False df_inv.to_csv(inventory_txt, sep='\t') else: print '\n"inventory_txt" was not specified.' +\ ' Model evaluation scores will not be recorded...' print '' if 'vote' in agg_stats: print 'Vote accuracy .............. ', vote_acc print 'Vote kappa ................. ', vote_kap if 'mean' in agg_stats: print 'Mean accuracy .............. ', mean_acc print 'Mean kappa ................. ', mean_kap else: print '\n"confusion_params" was not specified.' +\ ' This model will not be evaluated...' #''' print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)