Example #1
def par_filter(args):
    
    t0 = time.time()
    #ind, ar, func, kernel, extra_args, i, n_tiles = args
    ind, path, databand, nodata, r, tile_coords, out_dir, func, kernel, extra_args, i, n_tiles = args
    ds = gdal.Open(path)
    nrows = r.lr_r - r.ul_r
    ncols = r.lr_c - r.ul_c
    ar = ds.GetRasterBand(databand).ReadAsArray(r.ul_c, r.ul_r, ncols, nrows)
    mask = ar == nodata
    if np.all(mask):
        return ind, None
    ar = ndi.generic_filter(ar, func, footprint=kernel, extra_arguments=extra_args)
    ar[mask] = nodata
    
    _, x_res, _, _, _, y_res = ds.GetGeoTransform()
    driver = gdal.GetDriverByName('gtiff')
    prj = ds.GetProjection()
    tx = tile_coords.ul_x, x_res, 0, tile_coords.ul_y, 0, y_res
    out_path = os.path.join(out_dir, 'tile_%s.tif' % ind)
    array_to_raster(ar, tx, prj, driver, out_path, nodata=nodata)
    
    print 'Time for tile %s of %s: %.1f minutes' % (i, n_tiles, ((time.time() - t0)/60))
    ds = None
    
    return ind, out_path
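The array_to_raster helper called above is not part of this snippet. A minimal sketch of what it plausibly does with the GDAL Python bindings is shown below; the project's actual signature and defaults may differ.

from osgeo import gdal_array

def array_to_raster(ar, tx, prj, driver, out_path, dtype=None, nodata=None):
    # Guess a GDAL data type from the numpy array if none is given
    if dtype is None:
        dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype.type)
    rows, cols = ar.shape
    out_ds = driver.Create(out_path, cols, rows, 1, dtype)
    out_ds.SetGeoTransform(tx)
    out_ds.SetProjection(prj)
    band = out_ds.GetRasterBand(1)
    band.WriteArray(ar)
    if nodata is not None:
        band.SetNoDataValue(nodata)
    out_ds = None  # close the dataset to flush it to disk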
Example #2
def main(pred_path,
         targ_path,
         lc_path,
         mask_path,
         nodata_p,
         nodata_t,
         nodata_lc,
         search_dir,
         search_str,
         eval_scales,
         out_dir,
         clip_shp=None):

    pxl_scale_dir = os.path.join(out_dir, 'pixel_scale')
    if not os.path.exists(pxl_scale_dir):
        os.makedirs(pxl_scale_dir)

    ds_m = gdal.Open(mask_path)
    tx_m = ds_m.GetGeoTransform()
    ar_m = ds_m.ReadAsArray().astype(np.int32)
    nonforest = ar_m == 1
    ar_m = None

    print('\nReading in raster data...\n')
    ds_p = gdal.Open(pred_path)
    ar_p = ds_p.ReadAsArray()
    tx = ds_p.GetGeoTransform()
    prj = ds_p.GetProjection()
    driver = ds_p.GetDriver()

    ds_t = gdal.Open(targ_path)
    ar_t = ds_t.ReadAsArray()
    ar_t[ar_t == 0] = nodata_t

    ar_t[nonforest] = nodata_t
    ar_p[nonforest] = nodata_p

    stdv_path = pred_path.replace('vote', 'stdv')
    ds_stdv = gdal.Open(stdv_path)
    ar_stdv = ds_stdv.ReadAsArray()

    print('Getting difference map...')
    t0 = time.time()
    ar_diff, nans = get_dif_map(ar_p, ar_t, nodata_p, nodata_t)
    ras_ext = pred_path.split('.')[-1]
    dif_path = os.path.join(pxl_scale_dir,
                            'prediction_minus_target.' + ras_ext)
    mosaic.array_to_raster(ar_diff, tx, prj, driver, dif_path, GDT_Int32,
                           nodata_p)
    print('%.1f seconds\n' % (time.time() - t0))

    shps = find_files(search_dir, search_str, eval_scales)
    print('Calculating stats and plotting for all evaluation scales...')
    for eval_scale, zone_shp in shps:

        #If clip_shp is specified, assume that zone shape is unclipped and clip it
        if clip_shp:
            print('clip_shp given so... getting only features from %s that '
                  'overlap %s' % (zone_shp, clip_shp))
            out_shp = zone_shp.replace(
                '.shp', '_%s.shp' % os.path.basename(clip_shp)[:-4])
            get_overlapping_polys(zone_shp, clip_shp, out_shp)
            zone_shp = out_shp

        scale_dir = os.path.join(out_dir, 'scale_%s_m' % eval_scale)
        if not os.path.exists(scale_dir):
            os.mkdir(scale_dir)

        print('Getting zonal stats for %s scale...' % eval_scale)
        t0 = time.time()
        df_stats = zonal_stats(ar_p, ar_t, ar_diff, ar_stdv, zone_shp, tx,
                               nodata_p, nodata_t)
        out_txt = os.path.join(scale_dir, 'zonal_stats_%s.txt' % eval_scale)
        df_stats.to_csv(out_txt, sep='\t', index=False)
        print('%.1f seconds\n' % (time.time() - t0))

        print('Writing stats to shp...')
        t0 = time.time()
        out_shp = os.path.join(scale_dir, 'zonal_stats_%s.shp' % eval_scale)
        df_to_shp(df_stats, zone_shp, out_shp, copy_fields=False)
        print('%.1f seconds\n' % (time.time() - t0))

        print('Making scatter plot for %s scale...' % eval_scale)
        t0 = time.time()
        plt.scatter(df_stats.targ_mean, df_stats.pred_mean, alpha=.05)
        plt.xlabel('Target')
        plt.ylabel('Prediction')
        scatter_path = os.path.join(scale_dir, 'scatter_%s.png' % eval_scale)
        plt.savefig(scatter_path)
        print('%.1f seconds\n' % (time.time() - t0))

    ar_stdv = None
    ds_stdv = None
    ar_diff = None

    ar_t_data = ar_t[~nans]
    ar_p_data = ar_p[~nans]
    print('Plotting scatter of the 2 maps...')
    t0 = time.time()

    inds = random.sample(xrange(len(ar_t_data)), 100000)
    x = ar_t_data[inds]
    y = ar_p_data[inds]
    plt.scatter(x, y, alpha=.01)
    plt.xlabel(os.path.basename(targ_path))
    plt.ylabel(os.path.basename(pred_path))
    fig_path = os.path.join(pxl_scale_dir,
                            'prediction_vs_target_scatter_no0.png')
    plt.savefig(fig_path)
    plt.clf()
    print('%.1f seconds\n' % (time.time() - t0))

    # Create 2D histograms
    print('Plotting 2D histogram...')
    t0 = time.time()
    plt.hist2d(ar_t_data, ar_p_data, bins=50, norm=LogNorm())
    plt.xlabel(os.path.basename(targ_path))
    plt.ylabel(os.path.basename(pred_path))
    plt.colorbar()
    fig_path = os.path.join(pxl_scale_dir,
                            'prediction_vs_target_2Dhistogram_no0.png')
    plt.savefig(fig_path)
    plt.clf()
    print('%.1f seconds\n' % (time.time() - t0))

    print('Evaluating by land cover class...')
    t0 = time.time()
    ds_lc = gdal.Open(lc_path)
    ar_lc = ds_lc.ReadAsArray()
    df_lc = evaluate_by_lc(ar_p, ar_t, ar_lc, ~nans, nodata_lc, pxl_scale_dir)
    print('%.1f seconds\n' % (time.time() - t0))

    print('Plotting bin stats...')
    t0 = time.time()
    plot_bin_agreement(ar_p_data, ar_t_data, nodata_t, pxl_scale_dir)
    print('%.1f seconds\n' % (time.time() - t0))  #'''

    print('Calculating confusion matrix...')
    t0 = time.time()
    out_txt = os.path.join(pxl_scale_dir, 'confusion_matrix.txt')
    ar_t_samples = ar_t_data[inds]
    ar_p_samples = ar_p_data[inds]
    confusion_matrix(ar_p_samples, ar_t_samples, out_txt=out_txt)
    print('%.1f seconds\n' % (time.time() - t0))

    ds_p = None
    ds_t = None
    ds_lc = None
    ar_p = None
    ar_t = None
    ar_lc = None

    print('Outputs written to %s' % out_dir)
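get_dif_map is defined elsewhere in this project. A rough sketch consistent with how its outputs are used above (a signed difference plus a boolean mask of pixels that are nodata in either input) could look like this; treat it as an assumption rather than the original implementation.

import numpy as np

def get_dif_map(ar_p, ar_t, nodata_p, nodata_t):
    # Pixels that are nodata in either the prediction or the target
    nans = (ar_p == nodata_p) | (ar_t == nodata_t)
    ar_diff = ar_p.astype(np.int32) - ar_t.astype(np.int32)
    ar_diff[nans] = nodata_p
    return ar_diff, nans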
Example #3
File: predict_rf.py, Project: eMapR/pystem
def main(params,
         n_pieces=False,
         ydims=None,
         constant_vars=None,
         year='',
         agg_method=None):

    t0 = time.time()
    print 'Predicting Random Forest... %s\n' % time.ctime(t0)

    # Set optional params to default:
    split_predictors = False

    # Read params and make variables from text
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Raise an error if the var_txt path doesn't exist. Otherwise, just read it in
    train_dict = forest.read_params(train_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        print ''
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make sure vars are sorted alphabetically since they were for training
    pred_vars = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)
    '''if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        #year = constant_vars['YEAR']
        year = 2012
        pred_constants = sorted(constant_vars.keys())
    else:
        df_var.search_str = [s.format(2007) for s in df_var.search_str]'''

    #out_dir = os.path.dirname(out_raster)
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print 'Loading the RandomForest model from \n%s... \n%s\n' % (
        rf_path, time.ctime(time.time()))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    with open(rf_path, 'rb') as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs:
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print df_var.index.tolist() + (pred_constants if 'constant_vars' in inputs else [])
        sys.exit(('\nKeyError: Number of features of the random forest model does not match the number of variables in df_var.' +\
            '\nNumber of features of the model: {0} \nNumber of variables in var_txt: {1}' + \
            '\nCheck that all predictors for used in var_txt to train the model are in this var_txt ' +\
            '\nPath of Random Forest model: {2}\nPath of var_txt: {3}').format(n_features, n_vars, rf_path, var_txt))
        #"""
    if 'agg_method' in inputs:
        agg_method = inputs['agg_method']

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    ar = ds.ReadAsArray()
    nodata_mask = ar != 0
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx

    # Predict
    #print 'Predicting with %s processors... %s' % (rf_model.n_jobs, time.ctime(time.time()))
    t1 = time.time()
    predict_pieces = []

    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    if 'n_tiles' in inputs:
        df_tiles, df_tiles_rc, tile_size = stem.get_tiles(
            n_tiles, xsize, ysize, tx)
        empty_tiles = []
        ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        tile_dir = os.path.join(out_dir, 'predict_tiles')
        if not os.path.isdir(tile_dir):
            os.mkdir(tile_dir)
        for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
            print 'Predicting for tile %s of %s...' % (i + 1, len(df_tiles))
            t1 = time.time()
            coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
            tsa_ar, tsa_off = mosaic.extract_kernel(ds,
                                                    1,
                                                    coords,
                                                    tx,
                                                    xsize,
                                                    ysize,
                                                    nodata=nodata)
            tsa_mask = tsa_ar == 0
            if tsa_mask.all():
                print 'Tile %s empty. Skipping...' % ind
                continue
            tsa_ar[tsa_mask] = nodata
            # Get the ids of TSAs this kernel covers
            tsa_ids = np.unique(tsa_ar)
            #tsa_strs = ['0' + str(tsa) for tsa in tsa_ids if tsa!=nodata]
            tsa_strs = [str(tsa) for tsa in tsa_ids if tsa != nodata]
            array_shape = tsa_ar.shape

            # Get an array of predictors where each column is a flattened 2D array of a
            #   single predictor variable
            temp_nodata = -9999
            ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar,
                                                coords, tsa_mask, temp_nodata,
                                                1)
            nodata_mask = ~np.any(ar_predictors == temp_nodata, axis=1)
            predictors = ar_predictors[nodata_mask]
            t2 = time.time()
            if agg_method == 'mode':
                args = []
                for dt in rf_model.estimators_:
                    args.append([dt, predictors])
                pool = Pool(rf_model.n_jobs)
                t3 = time.time()
                dt_predictions = np.vstack(
                    pool.map(forest.par_predict_from_dt, args, 1))
                print 'Prediction time: %.1f minutes' % (
                    (time.time() - t3) / 60)
                t3 = time.time()
                predictions = stem.mode(dt_predictions, axis=0)
                print 'Aggregation time:  %.1f minutes' % (
                    (time.time() - t3) / 60)
                del dt_predictions
                t3 = time.time()
                pool.close()
                pool.join()
                print 'Closing time:  %.1f minutes' % ((time.time() - t3) / 60)
            else:
                predictions = rf_model.predict(ar_predictors[nodata_mask])
            print 'Prediction time: %.1f minutes' % ((time.time() - t2) / 60)

            ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
            ar_tile[nodata_mask] = predictions.astype(np.uint8)
            ul_r, lr_r, ul_c, lr_c = df_tiles_rc.ix[ind]
            ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile.reshape(array_shape)
            tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
            mosaic.array_to_raster(ar_tile.reshape(array_shape),
                                   tx_tile,
                                   prj,
                                   driver,
                                   os.path.join(tile_dir, 'tile_%s.tif' % ind),
                                   dtype=gdal.GDT_Byte,
                                   nodata=nodata)
            print 'Total time for this piece: %.1f minutes\n' % (
                (time.time() - t1) / 60)
            #del ar_predictors, nodata_mask, ar_prediction'''
        #ar_prediction = np.concatenate(predict_pieces)
        #del predict_pieces
        '''ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for ind, tile_coords in df_tiles_rc.iterrows():
            if ind in empty_tiles:
                continue
            ul_r, lr_r, ul_c, lr_c = tile_coords
            tile_file = os.path.join(tile_dir, 'tile_%s.tif' % ind)
            if not os.path.exists(tile_file):
                continue
            ds_t = gdal.Open(tile_file)
            ar_tile = ds_t.ReadAsArray()
            t_ulx = df_tiles.ix[ind, ['ul_x', 'ul_y']]
            ar_out[ul_r : lr_r, ul_c : lr_c] = ar_tile'''

    else:
        ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata)
        # If the predictions are too large (i.e. cause memory errors), split the predictor array into pieces and predict
        #   separately, then stack them back together
        if split_predictors:
            split_predictors = int(split_predictors)
            predictions = []
            for i, p in enumerate(
                    np.array_split(ar_predictors, split_predictors)):
                t1 = time.time()
                print '\nPredicting for %s of %s pieces of the final array...' % (
                    i + 1, split_predictors)
                predictions.append(rf_model.predict(p))
                print '%.1f minutes' % ((time.time() - t1) / 60)
            predictions = np.concatenate(predictions)
            print ''
        else:
            print 'Predicting in one chunk...'
            predictions = rf_model.predict(ar_predictors)
        ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.float32)
        ar_prediction[nodata_mask] = predictions
        del ar_predictors, predictions

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_vote.tif' % stamp)
    #ar_prediction = ar_prediction.reshape(ysize, xsize)
    if constant_vars:
        out_path = out_path.replace('.tif', '_yr%s.tif' % year)
    forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte,
                           nodata)  #"""
    # Delete the tiles
    shutil.rmtree(tile_dir)
    ds = None
    '''stamp = os.path.basename(out_dir)
    path = os.path.join(out_dir, 'final_%s_yr2011.tif' % stamp) 
    stamp = os.path.basename(os.path.dirname(path))
    ds = gdal.Open(path)
    ar_prediction = ds.ReadAsArray()
    ds = None#'''

    if 'test_params' in inputs:
        #df_test = pd.read_csv(test_samples, sep='\t', index_col='obs_id')
        print '\nEvaluating the model...'
        t1 = time.time()
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec("{0} = str({1})").format(i, test_dict[i])

        if 'n_trials' in test_dict:
            n_trials = int(n_trials)
        else:
            print 'n_trials not specified. Setting default to 50...\n'
            n_trials = 50
        if 'year' in test_dict:
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(
            test_params,
            os.path.join(out_dir, param_bn.replace('.txt', '_%s.txt' % year)))

        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx,
                                                 cell_size, target_col,
                                                 n_per_cell, n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir,
                                   '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)

        if 'lc_path' in test_dict:
            '''df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join('/vol/v2/stem/ebird/results/performance_by_lc', '{0}_eval_{1}_land_cover.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')'''

        #df_samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
        df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
        out_txt = os.path.join(
            out_dir,
            '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
        df_lc.to_csv(out_txt, sep='\t')
        if 'inventory_txt' in test_dict:
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.ix[stamp, col] = score_mean
                print 'Average %s: %2.3f' % (col.upper(), score_mean)
            df_inv.to_csv(inventory_txt, sep='\t')
        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print '\nTotal eval time: %.1f minutes\n' % ((time.time() - t1) / 60)
    else:
        print '\nEither "test_samples" or "inventory_txt" was not specified.' +\
            ' This model will not be evaluated...'

    print '\nTotal runtime: %.1f minutes' % ((time.time() - t0) / 60)

    return out_path
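forest.par_predict_from_dt and stem.mode (used in the 'mode' aggregation branch above) live in the project's library modules. A plausible sketch, assuming per-tree prediction via Pool.map followed by a majority vote across trees, is:

import numpy as np
from scipy import stats

def par_predict_from_dt(args):
    # Predict with a single decision tree; meant to be called through Pool.map
    dt, predictors = args
    return dt.predict(predictors)

def mode(ar, axis=0):
    # Most common value along an axis, e.g. the majority vote across trees
    modes, counts = stats.mode(ar, axis=axis)
    return np.squeeze(modes)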
Example #4
def main(params, n_tiles=(25, 15), n_jobs=20, kernel_type='circle', filter_value=None):
    
    t0 = time.time()
    
    # Read params and make variables from text
    inputs = read_params(params)
        
    # Check params
    try:
        path = inputs['path']
        function = inputs['function']
        out_path = inputs['out_path']
        kernel_size = int(inputs['kernel_size'])
        databand = int(inputs['databand'])
    except KeyError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    if 'n_jobs' in inputs: n_jobs = int(inputs['n_jobs'])
    if 'n_tiles' in inputs: n_tiles = [int(n) for n in inputs['n_tiles'].split(',')]
    if 'nodata' in inputs: nodata = int(inputs['nodata'])
    
    extra_args = () # The default for ndi.generic_filter 'extra_args' is an empty tuple
    if 'average' in function.lower():
        func = np.nanmean
    elif 'mode' in function.lower():
        func = mode
    elif 'area' in function.lower():
        func = pct_nonzero
        if not filter_value and 'filter_value' not in inputs:
            sys.exit('Cannot calculate percent area without filter_value. '
                     'Try specifying filter_value in parameters file.')
        elif 'filter_value' in inputs:
            filter_value = int(inputs['filter_value'])
    elif 'equal' in function.lower():
        func = is_equal_to
        center_idx = kernel_size**2/2
        extra_args = tuple([center_idx])
        
    else:
        sys.exit('Could not find filtering function for alias: %s' % function)
    
    out_dir = os.path.dirname(out_path)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    shutil.copy2(params, out_dir)
        
    print '\nReading input raster...\n'
    t1 = time.time()
    ds = gdal.Open(path)
    band = ds.GetRasterBand(databand)
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = ds.GetDriver()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    
    # Get an array and mask out nodata values with nans
    if 'nodata' not in inputs:
        print 'nodata not specified in params. Getting nodata value from input dataset...\n'
        nodata = band.GetNoDataValue()
    '''ar = band.ReadAsArray()
    ds = None
    array_dtype = ar.dtype
    ar = ar.astype(np.float16)
    mask = (ar != nodata) #& (ar != 255)
    ar[~mask] = np.nan'''
    # NOTE: ar and mask are no longer read in here (the full-array read above is
    #   commented out), so this masking would need to happen per tile in par_filter
    #if 'area' in function.lower():
    #    ar[(ar != filter_value) & mask] = 0
    #import pdb; pdb.set_trace()
    #ysize, xsize = ar.shape
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    if kernel_type.lower() == 'circle':
        #kernel_size /= 2
        kernel = circle_mask(kernel_size)
    else:
        kernel = np.ones((kernel_size, kernel_size))
    
    tile_buffer = kernel.shape[0]/2
    # Tile up the array to filter in parallel
    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)

    total_tiles = len(df_tiles)
    '''empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)'''
    
    # Add buffer around each tile
    df_buf = df_tiles_rc.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)
    
    # Get arrays
    print 'Getting buffered arrays...'
    t1 = time.time()
    n_full_tiles = len(df_tiles)
    args = []
    temp_dir = os.path.join(out_dir, 'tiles')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    for i, (ind, r) in enumerate(df_buf.iterrows()):
        #this_ar = ar[r.ul_r : r.lr_r, r.ul_c : r.lr_c]
        #args.append([ind, this_ar, func, kernel, extra_args, i + 1, n_full_tiles])
        args.append([ind, path, databand, nodata, r, df_tiles.ix[ind], temp_dir, func, kernel, extra_args, i + 1, n_full_tiles])
        #arrays.append([i, this_ar])
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    print 'Filtering chunks in parallel with %s jobs...' % n_jobs
    p = Pool(n_jobs)
    tiles = p.map(par_filter, args, 1)

    print '\nTotal time for filtering: %.1f minutes\n' % ((time.time() - t1)/60)#'''

    
    print 'Tiling pieces back together...'
    t1 = time.time()
    gdal_dtype = band.DataType
    array_dtype = gdalnumeric.GDALTypeCodeToNumericTypeCode(gdal_dtype)
    filtered = np.full((ysize, xsize), nodata, dtype=array_dtype)
    for i, tile_path in tiles:
        if not tile_path:
            continue
        ds_t = gdal.Open(tile_path)
        buffered_tile = ds_t.ReadAsArray()
        b_inds = df_buf.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        t_inds = df_tiles_rc.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        d_ulr, d_lrr, d_ulc, d_lrc = t_inds - b_inds
        tile = buffered_tile[d_ulr : d_lrr, d_ulc : d_lrc]
        tile[np.isnan(tile)] = nodata
        tile = tile.astype(array_dtype)
        t_ulr, t_lrr, t_ulc, t_lrc = t_inds
        filtered[t_ulr : t_lrr, t_ulc : t_lrc] = tile
    print '%.1f minutes\n' % ((time.time() - t1)/60)   
    
    #filtered = filtered.astype(array_dtype)
    if 'out_nodata' in inputs: 
        #filtered[np.isnan(filtered) | ~mask] = nodata
        filtered[filtered == nodata] = int(inputs['out_nodata'])
        nodata = int(inputs['out_nodata'])

    try:
        array_to_raster(filtered, tx, prj, driver, out_path, dtype=gdal_dtype, nodata=nodata)
    except:
        array_to_raster(filtered, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
    desc = ('Raster filtered by kernel of shape {kernel_type} and size ' +\
            '{kernel_size} and function {func}').format(kernel_type=kernel_type,
                                                        kernel_size=kernel_size, 
                                                        func=function)
    meta_path = createMetadata(sys.argv, out_path, description=desc)
    write_params_to_meta(meta_path, params)
    del filtered, tiles, args, p
    ds = None
    shutil.rmtree(temp_dir)
    
    print 'Total time: %.1f minutes' % ((time.time() - t0)/60)
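circle_mask and pct_nonzero are helpers from the same module that are not shown here. Plausible sketches, assuming a circular boolean footprint for ndi.generic_filter and a percent-of-nonzero-pixels statistic for the 'area' function, are:

import numpy as np

def circle_mask(diameter):
    # Boolean footprint of a filled circle inscribed in a diameter x diameter window
    center = (diameter - 1) / 2.0
    radius = diameter / 2.0
    y, x = np.ogrid[:diameter, :diameter]
    return (x - center) ** 2 + (y - center) ** 2 <= radius ** 2

def pct_nonzero(ar):
    # Percent of kernel pixels that are nonzero
    return np.count_nonzero(ar) / float(ar.size) * 100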
Example #5
def main(params):
    
    '''### copy params to out_dir #### '''
    
    #read_params(params)
    inputs, df_var = read_params(params)

    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])
    try:
        num_vars = vars_to_numbers(cell_size, support_size, sets_per_cell,
                                   min_obs, pct_train, n_tiles, nodata)
        cell_size, support_size, sets_per_cell, min_obs, pct_train, n_tiles, nodata = num_vars
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, dep_var_name, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    now = datetime.now()
    date_str = str(now.date()).replace('-','')
    time_str = str(now.time()).replace(':','')[:4]
    stamp = '{0}_{1}_{2}'.format(dep_var_name, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir) # With a timestamp in dir, no need to check if it exists
    shutil.copy2(params, out_dir) #Copy the params for reference
    
    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    out_txt = os.path.join(out_dir, stamp + '.txt')
    dfs = gsrd.get_gsrd(mosaic_path, cell_size, support_size, sets_per_cell,
                        sample_txt, min_obs, pct_train, dep_var_name, out_txt,
                        gsrd_shp)
    df_train, df_test, df_sets = dfs
    support_sets = df_train.set_id.unique()

    # Check that df_train has exactly the same columns as variables specified in df_vars
    #   Last four characters in each column of df_train should be year
    unmatched_vars = [v for v in df_var.index if v not in [c for c  in df_train]]
    
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    
    predict_cols = sorted(np.unique([c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())# Make sure predict_cols and df_var are in the same order

    # Train a tree for each support set
    x_train = df_train.reindex(columns=predict_cols + ['set_id'])
    y_train = df_train[[target_col, 'set_id']]    
    df_sets['dt_model'] = [fit_tree(x_train.ix[x_train.set_id==s, predict_cols],\
    y_train.ix[y_train.set_id==s, target_col]) for s in support_sets]
    
    # Write df_sets and each decison tree to disk
    write_model(out_dir, df_sets)
    
    mosaic_ds = gdal.Open(mosaic_path, GA_ReadOnly)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    
    t0 = time.time()
    
    predict_dir = os.path.join(out_dir, 'predictions')
    os.mkdir(predict_dir)
    # Loop through each set and generate predictions
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    c = 1
    total_sets = len(support_sets)
    predictions = {}
    for set_id, row in df_sets.iterrows():
        print 'Predicting for set %s of %s' % (c, total_sets)
        ar_coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
        ar_predict = predict_set(set_id, df_var, mosaic_ds, ar_coords, 
                                 mosaic_tx, xsize, ysize, row.dt_model, nodata)
        #predictions[set_id] = ar_predict
        
        tx = ar_coords['ul_x'], x_res, x_rot, ar_coords['ul_y'], y_rot, y_res
        out_path = predict_dir + '/prediction_%s.bsq' % set_id
        mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, GDT_Int32, nodata=nodata)
        c += 1
    mosaic_ds = None                  
    print '\nTotal time for predictions: %.1f minutes' % ((time.time() - t0)/60)#'''
    
    #Aggregate predictions by tile and stitch them back together
    aggr.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_tx, support_size, predict_dir, df_sets, out_dir, stamp, prj, driver)
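fit_tree (called once per support set above) is defined elsewhere. A minimal sketch with scikit-learn, assuming a regression tree with default settings (the original may use a classifier or tuned hyperparameters), is:

from sklearn.tree import DecisionTreeRegressor

def fit_tree(x, y):
    # Train one decision tree on the samples that fall in a single support set
    dt = DecisionTreeRegressor()
    dt.fit(x, y)
    return dt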
Example #6
def main(params, inventory_txt=None, constant_vars=None):
    
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])    
    df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    # If constants were given, make a dict and make sure they match the training
    #  constants
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        pred_constants = sorted(constant_vars.keys())
        train_constants = [i.replace(' ', '') for i in train_inputs['constant_vars'].strip('"').split(',')]
        train_constants = sorted(train_constants)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if 'constant_vars' in inputs: 
        unmatched_vars += [v for v in pred_constants if v not in train_constants]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in train_constants if v not in pred_constants]
        pred_vars += pred_constants # Add here because it would screw with stuff upstream
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)
    
    t0 = time.time()
    if 'n_jobs' in inputs:

        # Predict in parallel
        n_jobs = int(n_jobs)
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of TSA arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            
            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic_ds
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                            xsize, ysize, nodata=nodata)
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            dtype_code = mosaic_ds.GetRasterBand(1).DataType
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster, stem.get_gdal_dtype(dtype_code), silent=True)
            
            # Build list of args to pass to the Pool
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            ds = gdal.Open(tsa_raster)
            tsa_tx = ds.GetGeoTransform()
            ds = None
            tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tsa_tx[0], tsa_tx[3]), tsa_tx)
            args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords, 
                         mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, 
                         constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem.par_predict, args, 1)
            
    
    else:
        # Loop through each set and generate predictions
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, 
                                     mosaic_tx, xsize, ysize, dt_model, nodata,
                                     np.int16, constant_vars)        
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    
        #mosaic_ds = None                  
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''
    
    #Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    ar_vote, pct_importance, df_sets = stem.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')'''
    mosaic_ds = None
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''
    
    '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp))
    ar_vote = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp))
    ar_mean = ds.ReadAsArray()
    ds = None#'''
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')
        
        print '\nComputing confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        print confusion_params
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e
        
        '''print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e#'''
        
        vote_acc = df_v.ix['producer', 'user']
        vote_kap = df_v.ix['producer', 'kappa']
        #mean_acc = df_m.ix['user','producer']
        #mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        #print 'Mean accuracy .............. ', mean_acc
        #print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''
    
    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0)/60)
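stem.calc_offset (used above when building the args for par_predict) converts the difference between two upper-left coordinates into a row/column offset. One plausible implementation, consistent with how its return value is unpacked in these examples, is:

def calc_offset(master_ul, this_ul, tx):
    # Row/column offset of this_ul relative to master_ul, in pixels,
    # using the pixel sizes stored in the geotransform tx
    col_off = int(round((this_ul[0] - master_ul[0]) / tx[1]))
    row_off = int(round((this_ul[1] - master_ul[1]) / tx[5]))
    return row_off, col_off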
Example #7
 for c, (set_id, row) in enumerate(df_sets.iterrows()):
     
     coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
     
     # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic
     if mosaic_predictors:
         if mosaic_path.endswith('.shp'):
             tsa_ar, tsa_off = mosaic.kernel_from_shp(mosaic_ds, coords, mosaic_tx, nodata)
         else:
             tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords,
                                                     mosaic_tx, xsize, ysize,
                                                     nodata=nodata)
         set_mosaic_path = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
         tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
         np_dtype = get_min_numpy_dtype(tsa_ar)
         gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
         mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, set_mosaic_path, gdal_dtype, silent=True)
         tsa_off = stem_conus.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tx_out[0], tx_out[3]), tx_out)
     
     else:
         set_mosaic_path = None
         tsa_ar = None
         tsa_off = None
     
     # Build list of args to pass to the Pool
     args.append([c, total_sets, set_id, df_var, set_mosaic_path, tsa_off, coords, 
                  mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, 
                  constant_vars, predict_dir])
 print '%.1f minutes\n' % ((time.time() - t1)/60)
 p = Pool(n_jobs)
 p.map(stem_conus.par_predict, args, 1)
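get_min_numpy_dtype is not shown in this fragment. One way it might be implemented (an assumption; the project's version may differ) is to pick the smallest dtype that can hold the array's value range:

import numpy as np

def get_min_numpy_dtype(ar):
    # Smallest numpy dtype that can represent both the minimum and maximum of ar
    return np.promote_types(np.min_scalar_type(ar.min()),
                            np.min_scalar_type(ar.max()))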
Example #8
def main(params,
         inventory_txt=None,
         constant_vars=None,
         mosaic_shp=None,
         resolution=30,
         n_jobs=0,
         n_jobs_agg=0,
         mosaic_nodata=0,
         snap_coord=None,
         overwrite_tiles=False,
         tile_id_field='name'):
    inputs = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  #sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = pd.read_csv(train_inputs['var_info'].replace('"', ''),
                             sep='\t',
                             index_col='var_name')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  #Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if str(overwrite_tiles).lower() == 'false':
        overwrite_tiles = False

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, os.path.basename(model_dir) + '.db')
    if os.path.exists(db_path):
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets',
                                        con,
                                        index_col='set_id')  #'''
    else:
        set_txt = stem.find_file(model_dir, '*support_sets.txt')
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace(
                '.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(
                clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)  #'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path, 1)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            i = 0
            for feature in mosaic_ds:
                g = feature.GetGeometryRef()
                # Check that the feature is valid. Clipping can produce a feautre
                #  w/ an area of 0
                if g.GetArea() > 1:
                    mosaic_geom.AddGeometry(g)
                else:
                    fid = feature.GetFID()
                    feature.Destroy()
                    mosaic_ds.DeleteFeature(fid)
            #import pdb; pdb.set_trace()
            df_sets = stem.get_overlapping_sets(df_sets,
                                                mosaic_geom.UnionCascaded())
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]  #'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path,
                                             x_res,
                                             y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(
            mosaic_path)  # Change to accept arbitrary geometry

    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(out_dir, '_temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)

    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats,
                                  index=tiles[tile_id_field])
        for stat in agg_stats:
            pattern = re.compile(r'tile_\d+_%s\.tif' % stat)
            stat_match = [f.split('_')[1] for f in files if pattern.match(f)]
            try:
                tile_files[stat] = pd.Series(np.ones(len(stat_match)),
                                             index=stat_match)
            except:
                pass  #import pdb; pdb.set_trace()
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)
        tiles.set_index(index_field, inplace=True)  #'''
    tiles['ul_x'] = [
        stem.get_ul_coord(xmin, xmax, x_res)
        for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()
    ]
    tiles['ul_y'] = [
        stem.get_ul_coord(ymin, ymax, y_res)
        for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()
    ]
    tiles['lr_x'] = [
        xmax if ulx == xmin else xmin
        for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmax']].iterrows()
    ]
    tiles['lr_y'] = [
        ymax if uly == ymin else ymin
        for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymax']].iterrows()
    ]

    support_nrows = int(support_size[0] / abs(y_res))
    support_ncols = int(support_size[1] / abs(x_res))
    t1 = time.time()

    # Patch for unknown Landcover screwup
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind,
                    tile_info) in enumerate(tiles.loc[tiles['name'].isin([
                        '1931', '2810', '0705', '0954', '2814', '1986', '2552',
                        '2019', '2355', '3354', '2278', '2559'
                    ])].iterrows())]

    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets,
             df_var, (support_nrows, support_ncols), agg_stats,
             tile_path_template, prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[
                tiles['name'].isin(['0705'])].iterrows())]

    # Patch for the GEE subset 2 outside-of-buffer 'slice'
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(['0639','0718','0797','0876','0955','1034'])].iterrows())]

    # Original line
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tile_files.isnull().any(axis=1).values].iterrows())]

    limits = []

    for arg in args:
        print arg[3][tile_id_field]  # tile_info is the 4th element of each args tuple
        limits.append(stem.par_predict_tile(arg))  #'''

    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % (
        (time.time() - t1) / 3600)

    try:
        limits = pd.concat(limits)
    except:
        # They're all None
        pass

    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        #dtype = mosaic.get_min_numpy_dtype(limits[stat])
        dtype = np.int16
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)  #dtype)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=dtype)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(
                tile_dir,
                'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            if ds is None:
                print 'Tile not found: %s' % tile_file
                continue
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off:row_off + tile_rows,
                   col_off:col_off + tile_cols] = ar_tile
            except Exception as e:
                pass  #import pdb; pdb.set_trace()

        out_path = os.path.join(out_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar,
                               mosaic_tx,
                               prj,
                               driver,
                               out_path,
                               gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    #shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values,
                                              axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Save the importance values
    importance = pd.DataFrame({
        'variable': pred_vars,
        'pct_importance': pct_importance,
        'index': range(len(pred_vars))
    })
    importance.set_index('index', inplace=True)
    importance['rank'] = [
        int(r) for r in importance.pct_importance.rank(method='first',
                                                       ascending=False)
    ]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')  #'''

    print '\nTotal prediction runtime: %.1f hours\n' % (
        (time.time() - t0) / 3600)
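stem.get_ul_coord, used above to derive tile upper-left coordinates from min/max extents, is not shown. A one-line sketch consistent with that usage (an assumption about the project's actual helper) is:

def get_ul_coord(coord_min, coord_max, res):
    # Upper-left coordinate along one axis: the minimum when the pixel size is
    # positive (the x axis) and the maximum when it is negative (the y axis)
    return coord_min if res > 0 else coord_max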
Example #9
def main(params,
         inventory_txt=None,
         constant_vars=None,
         mosaic_shp=None,
         resolution=30,
         n_jobs_pred=0,
         n_jobs_agg=0,
         mosaic_nodata=0):
    inputs, df_var = stem_conus.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  #sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem_conus.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files ' +
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  #Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        mosaic_tx, extent = stem_conus.tx_from_shp(mosaic_path, x_res, y_res)
        #df_tiles = attributes_to_df(mosaic_path)

    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    df_tiles, df_tiles_rc, tile_size = stem_conus.get_tiles(
        n_tiles, xsize, ysize, mosaic_tx)

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    set_txt = glob.glob(
        os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)

    t0 = time.time()
    if 'n_jobs_pred' in inputs:
        n_jobs = int(n_jobs_pred)
        # Predict in parallel
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of tile arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):

            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            '''if mosaic_type == 'vector':
                tsa_ar, tsa_off = mosaic.kernel_from_shp(mosaic_ds, coords, mosaic_tx, nodata=0)
            else:
                tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords,
                                                        mosaic_tx, xsize, ysize,
                                                        nodata=nodata)
            set_mosaic_path = os.path.join(predict_dir, 'tsa_%s.tif' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            np_dtype = get_min_numpy_dtype(tsa_ar)
            gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, set_mosaic_path, gdal_dtype, silent=True)
            pct_progress = float(c + 1)/total_sets * 100
            sys.stdout.write('\rRetreived points for feature %s of %s (%%%.1f)' % (c + 1, total_sets, pct_progress))
            sys.stdout.flush()'''

            # Build list of args to pass to the Pool
            #tsa_off = stem_conus.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tx_out[0], tx_out[3]), tx_out)
            args.append([
                coords, mosaic_type, mosaic_path, mosaic_tx, prj, nodata, c,
                total_sets, set_id, df_var, xsize, ysize, row.dt_file, nodata,
                np.uint8, constant_vars, predict_dir
            ])

            #args.append([c, total_sets, set_id, df_var, set_mosaic_path, tsa_off, coords,
            #mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8,
            #constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1) / 60)
        p = Pool(n_jobs)
        p.map(stem_conus.par_predict, args, 1)

    else:
        # Loop through each set and generate predictions
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem_conus.predict_set(set_id, df_var, mosaic_ds,
                                                coords, mosaic_tx, xsize,
                                                ysize, dt_model, nodata,
                                                np.int16, constant_vars)
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.tif' % set_id)
            np_dtype = get_min_numpy_dtype(ar_predict)
            gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
            mosaic.array_to_raster(ar_predict,
                                   tx,
                                   prj,
                                   driver,
                                   out_path,
                                   gdal.GDT_Byte,
                                   nodata=nodata)
            print 'Total time for this set: %.1f minutes' % (
                (time.time() - t1) / 60)

        #mosaic = None
    print '\nTotal time for predicting: %.1f hours\n' % (
        (time.time() - t0) / 3600)  #''' """

    #Aggregate predictions by tile and stitch them back together
    if 'file_stamp' not in inputs: file_stamp = os.path.basename(model_dir)

    t1 = time.time()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    if 'n_jobs_agg' in inputs:
        n_jobs_agg = int(n_jobs_agg)

    if mosaic_type == 'vector':
        nodata_mask = mosaic_ds
    else:
        if 'mosaic_nodata' in inputs: mosaic_nodata = int(mosaic_nodata)
        nodata_mask = mosaic_ds.ReadAsArray() != mosaic_nodata

    ########################################################################################################################################
    # jdb 6/22/17 check for sets that errored - if there are any, remove them from the df_sets DF so that the aggregation step doesn't expect them
    setErrorLog = os.path.dirname(predict_dir) + '/predication_errors.txt'
    if os.path.isfile(setErrorLog):
        with open(setErrorLog) as f:
            lines = f.readlines()

        badSets = [
            int(line.split(':')[1].rstrip().strip()) for line in lines
            if 'set_id' in line
        ]
        for thisSet in badSets:
            df_sets = df_sets[df_sets.index != thisSet]
    ########################################################################################################################################

    pct_importance, df_sets = stem_conus.aggregate_predictions(
        n_tiles, ysize, xsize, nodata, nodata_mask, mosaic_tx, support_size,
        agg_stats, predict_dir, df_sets, out_dir, file_stamp, prj, driver,
        n_jobs_agg)
    #print 'Total aggregation time: %.1f hours\n' % ((time.time() - t0)/3600)
    mosaic_ds = None
    mosaic_dataset = None

    # Save the importance values
    importance = pd.DataFrame({
        'variable': pred_vars,
        'pct_importance': pct_importance,
        'index': range(len(pred_vars))
    })
    importance.set_index('index', inplace=True)
    importance['rank'] = [
        int(r) for r in importance.pct_importance.rank(method='first',
                                                       ascending=False)
    ]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')  #'''

    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params,
                                  ar_vote,
                                  out_txt,
                                  match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params,
                                  ar_mean,
                                  out_txt,
                                  match=True)
            mean_acc = df_m.ix['user', 'producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = [
                'vote_accuracy', 'vote_kappa'
            ]  #, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[
                file_stamp,
                cols] = vote_acc, vote_kap  #, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'

        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap

    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''

    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0) / 60)
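
The vector branch above (and the next example) relies on stem_conus.tx_from_shp to turn a mosaic shapefile into a GDAL-style geotransform. A minimal sketch of that idea, an assumption about the helper rather than its actual source (rotation terms fixed at 0, and the real helper also accepts a snap_coord this sketch ignores):

from osgeo import ogr


def tx_from_shp_sketch(shp_path, x_res, y_res):
    # Geotransforms are anchored at the upper-left corner, so the x origin
    # is the minimum x and the y origin is the maximum y (y_res is negative)
    ds = ogr.Open(shp_path)
    lyr = ds.GetLayer()
    min_x, max_x, min_y, max_y = lyr.GetExtent()
    tx = (min_x, x_res, 0, max_y, 0, y_res)
    extent = (min_x, min_y, max_x, max_y)
    return tx, extent
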
Example #10
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])    
    df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files ' +
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    if 'file_stamp' not in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, file_stamp + '.db')
    try:
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#'''
    except:
        set_txts = glob.glob(os.path.join(model_dir,
                                          'decisiontree_models/*support_sets.txt'))
        if len(set_txts) == 0:
            raise IOError('No database or support set txt file found')
        set_txt = set_txts[0]
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    
    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)#'''
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                mosaic_geom.AddGeometry(feature.GetGeometryRef())
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom)
        xsize = int((max_x - min_x)/resolution)
        ysize = int((max_y - min_y)/resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"','')
            snap_coord = [float(c) for c in snap_coord.split(',')]#'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path) # Change to accept arbitrary geometry
        
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')
        
    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)
    
    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(model_dir, 'temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)
    
    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)]
            tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)]
        tiles.set_index(index_field, inplace=True)
    
    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res) 
                    for i, (xmin, xmax) in tiles[['xmin','xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res) 
                    for i, (ymin, ymax) in tiles[['ymin','ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax)
                    in tiles[['ul_x', 'xmin', 'xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin for i, (uly, ymin, ymax)
                    in tiles[['ul_y', 'ymin', 'ymax']].iterrows()]
    
    support_nrows = int(support_size[0]/abs(y_res))
    support_ncols = int(support_size[1]/abs(x_res))
    t1 = time.time()
    # Note: prediction is restricted here to a hard-coded subset of tile names
    args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var,
             (support_nrows, support_ncols), agg_stats, tile_path_template,
             prj, nodata, snap_coord)
            for i, (t_ind, tile_info)
            in enumerate(tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())]
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())]
    
    if n_jobs > 1:
        print 'Predicting with %s jobs...\n' % n_jobs
        pool = Pool(n_jobs)
        pool.map(stem.predict_tile, args, 1)
        pool.close()
        pool.join()
    else:
        for arg in args:
            print 'Predicting with 1 job ...\n'
            stem.predict_tile(*arg)#'''
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1)/3600)
    
    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) 
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8)
        
        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off : row_off + tile_rows, col_off : col_off + tile_cols] = ar_tile
            except Exception as e:
                import pdb; pdb.set_trace()
        
        out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata)
    
    # Clean up the tiles
    shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60)
    
    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

                
        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user','producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''


        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...' #'''
    
    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)
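
The stitching loops in this and the earlier example locate each tile inside the full mosaic array with stem.calc_offset. A hedged sketch of the three-argument form used above (an assumption about the helper's behavior, not its actual source): the difference between two upper-left coordinates is converted to row/column offsets using the geotransform's pixel sizes.

def calc_offset_sketch(mosaic_ul, tile_ul, tx):
    # Return (row_off, col_off) of tile_ul relative to mosaic_ul
    ul_x, ul_y = mosaic_ul
    tile_x, tile_y = tile_ul
    x_res, y_res = tx[1], tx[5]  # y_res is negative for north-up rasters
    col_off = int(round((tile_x - ul_x) / x_res))
    row_off = int(round((tile_y - ul_y) / y_res))
    return row_off, col_off


# e.g. a tile whose upper-left corner is 3000 m east and 1500 m south of the
# mosaic origin, at 30 m resolution, lands at row 50, column 100:
# calc_offset_sketch((0, 0), (3000, -1500), (0, 30, 0, 0, 0, -30)) -> (50, 100)
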
Example #11
def main(in_raster,
         snap_raster,
         in_nodata,
         out_nodata,
         out_path=None,
         mask_val=None,
         overwrite=False):

    t0 = time.time()
    in_nodata = int(in_nodata)
    out_nodata = int(out_nodata)

    print '\nOpening datasets... '
    t1 = time.time()
    ds_in = gdal.Open(in_raster)
    ar_in = ds_in.ReadAsArray()
    tx_in = ds_in.GetGeoTransform()
    driver = ds_in.GetDriver()
    ds_in = None

    ds_snap = gdal.Open(snap_raster)
    ar_snap = ds_snap.ReadAsArray()
    tx_snap = ds_snap.GetGeoTransform()
    prj = ds_snap.GetProjection()
    ds_snap = None
    print '%.1f seconds\n' % (time.time() - t1)

    print 'Snapping input raster...'
    t1 = time.time()
    offset = calc_offset((tx_snap[0], tx_snap[3]), tx_in)
    snap_inds, in_inds = get_offset_array_indices(ar_snap.shape, ar_in.shape,
                                                  offset)
    np_dtype = ar_in.dtype
    ar = np.full(ar_snap.shape, out_nodata, dtype=np_dtype)
    ar_in[ar_in == in_nodata] = out_nodata
    ar[snap_inds[0]:snap_inds[1],
       snap_inds[2]:snap_inds[3]] = ar_in[in_inds[0]:in_inds[1],
                                          in_inds[2]:in_inds[3]]

    if mask_val:
        mask_val = int(mask_val)
        ar[ar_snap == mask_val] = out_nodata

    print '%.1f seconds\n' % (time.time() - t1)

    if out_path:
        if ar.max() <= 255 and ar.min() >= 0:
            gdal_dtype = gdal.GDT_Byte
        else:
            gdal_dtype = gdal.GDT_Int16

        if os.path.exists(out_path) and not overwrite:
            sys.exit('out_path already exists')
        array_to_raster(ar, tx_snap, prj, driver, out_path, gdal_dtype,
                        out_nodata)

        # Write metadata
        desc = ('Input raster %s snapped to the extent of %s.') % (in_raster,
                                                                   snap_raster)
        if mask_val:
            desc += ' Data were masked from snap raster with value %s.' % mask_val
        createMetadata(sys.argv, out_path, description=desc)
    else:
        return ar

    print '\nTotal time to snap raster: %.1f seconds\n' % (time.time() - t0)
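
This last example pastes the input raster into an array shaped like the snap raster using get_offset_array_indices. As a rough sketch of what such a helper could return (an assumption, not the module's actual code), the overlapping windows can be clipped like this:

def get_offset_array_indices_sketch(snap_shape, in_shape, offset):
    # offset is the (row, col) position of the input array's upper-left
    # corner within the snap array; negative values mean the input starts
    # above/left of the snap extent
    row_off, col_off = offset
    snap_rows, snap_cols = snap_shape
    in_rows, in_cols = in_shape

    # Window within the snap array
    snap_r0 = max(row_off, 0)
    snap_c0 = max(col_off, 0)
    snap_r1 = min(row_off + in_rows, snap_rows)
    snap_c1 = min(col_off + in_cols, snap_cols)

    # Corresponding window within the input array
    in_r0 = snap_r0 - row_off
    in_c0 = snap_c0 - col_off
    in_r1 = in_r0 + (snap_r1 - snap_r0)
    in_c1 = in_c0 + (snap_c1 - snap_c0)

    return (snap_r0, snap_r1, snap_c0, snap_c1), (in_r0, in_r1, in_c0, in_c1)


# Usage mirroring the example above:
# snap_inds, in_inds = get_offset_array_indices_sketch(ar_snap.shape, ar_in.shape, offset)
# ar[snap_inds[0]:snap_inds[1], snap_inds[2]:snap_inds[3]] = \
#     ar_in[in_inds[0]:in_inds[1], in_inds[2]:in_inds[3]]
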