Example No. 1
def make_tiles(n_tiles, ds_snap):

    try:
        n_tiles = int(n_tiles)
    except ValueError:
        if ',' in n_tiles:
            try:
                n_tiles = [int(i) for i in n_tiles.split(',')]
            except ValueError:
                pass
        else:
            try:
                n_tiles = [int(i) for i in n_tiles.split()]
            except ValueError:
                raise ValueError('format of n_tiles not understood: %s' %
                                 n_tiles)

    ysize = ds_snap.RasterYSize
    xsize = ds_snap.RasterXSize
    tx = ds_snap.GetGeoTransform()

    # Figure out how many tiles belong in each row if necessary
    if isinstance(n_tiles, int):
        # if n_tiles = nx * ny and nx = ny * ratio -> ny = (n_tiles / ratio) ** .5
        ratio = xsize / float(ysize)
        ny = int((n_tiles / ratio)**.5)
        nx = int(n_tiles / ny)
        n_tiles = ny, nx
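        # e.g. n_tiles=12, xsize=3000, ysize=1500 -> ratio=2.0, ny=int((12/2.0)**.5)=2, nx=6 -> (2, 6)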

    _, tiles, __ = stem.get_tiles(n_tiles, xsize, ysize, tx)
    #stem.coords_to_shp(_, '/vol/v2/stem/extent_shp/CAORWA.shp', '/home/server/pi/homes/shooper/delete/tiles.shp')

    return tiles
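
A minimal usage sketch (hedged: the snap raster path below is hypothetical, and the stem module must be importable just as in the function above):

from osgeo import gdal

ds_snap = gdal.Open('snap_raster.tif')  # hypothetical GDAL-readable snap raster
# n_tiles may be a single count ('12'), comma-separated ('2,6'), or space-separated ('2 6')
tiles = make_tiles('2,6', ds_snap)
ds_snap = None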
Example No. 2
def buffered_tile_inds(n_tiles, xsize, ysize, tx, tile_buffer, mask):

    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)

    total_tiles = len(df_tiles)
    empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)

    # Add buffer around each tile
    df_buf = df_tiles.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)
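    # Clipping keeps the buffered row/col indices inside the raster so windowed reads stay in bounds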

    return df_tiles, df_buf
Example No. 3
def get_stratified_sample_by_tile(raster_path,
                                  col_name,
                                  data_band,
                                  n_samples,
                                  bins,
                                  min_sample=None,
                                  max_sample=None,
                                  pct_train=1,
                                  nodata=None,
                                  sampling_scheme='equal',
                                  zero_inflation=None,
                                  data_type='continuous',
                                  kernel=False,
                                  n_tiles=(1, 1),
                                  boundary_shp=None,
                                  bin_scale=1,
                                  n_per_tile=None):
    '''
    Return a dataframe of stratified randomly sampled pixels from raster_path
    '''
    print 'Reading the raster_path... %s\n' % datetime.now()
    ds = gdal.Open(raster_path)
    tx = ds.GetGeoTransform()
    band = ds.GetRasterBand(data_band)
    #ar_full = band.ReadAsArray()
    #ar_data = band.ReadAsArray()
    if nodata is None:
        nodata = band.GetNoDataValue()
        if nodata is None:
            sys.exit('Could not obtain nodata value from dataset and none '
                     'specified in parameters file. Try re-running with '
                     'nodata specified.')

    # Split up the raster into tiles and figure out how many samples to draw from each tile
    print 'Calculating sample size per tile...'
    t1 = time.time()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      tx)
    total_tiles = len(df_tiles)
    # If a boundary shapefile is given, calculate the proportional area within
    #   each tile
    if boundary_shp:
        boundary_ds = ogr.Open(boundary_shp)
        boundary_lyr = boundary_ds.GetLayer()
        empty_tiles = stem.find_empty_tiles(df_tiles,
                                            boundary_lyr,
                                            tx,
                                            nodata=0)
        df_tiles.drop(empty_tiles, inplace=True)
        df_tiles_rc.drop(empty_tiles, inplace=True)
        total_tiles = len(df_tiles)
        calc_proportional_area(df_tiles, boundary_shp)  # Calcs area in place
        if n_per_tile:
            pct_area = df_tiles.pct_area
            df_tiles['pct_max_sample'] = pct_area / (pct_area.max() -
                                                     pct_area.min())
            df_tiles_rc['n_samples'] = (n_per_tile *
                                        df_tiles.pct_max_sample).astype(int)
        else:
            df_tiles_rc['n_samples'] = n_samples * df_tiles.pct_area
    else:
        if n_per_tile:
            df_tiles_rc['n_samples'] = n_per_tile
        else:
            df_tiles_rc['n_samples'] = float(n_samples) / total_tiles
    df_tiles['n_samples'] = df_tiles_rc.n_samples
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # For each tile, get random sample for each bin
    train_rows = []
    train_cols = []
    test_rows = []
    test_cols = []
    empty_tiles = []
    classes = ['_%s' % b[1] for b in bins]
    df_tiles = df_tiles.reindex(columns=df_tiles.columns.tolist() + classes,
                                fill_value=0)
    for c, (i, tile_coords) in enumerate(df_tiles_rc.iterrows()):
        t1 = time.time()
        print 'Sampling for %d pixels for tile %s of %s...' % (
            tile_coords.n_samples, c + 1, total_tiles)
        if tile_coords.n_samples == 0:
            print '\tSkipping this tile because all pixels == nodata...\n'
            empty_tiles.append(i)
            continue
        ul_r, lr_r, ul_c, lr_c = tile_coords[['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        tile_ysize = lr_r - ul_r
        tile_xsize = lr_c - ul_c

        ar = band.ReadAsArray(ul_c, ul_r, tile_xsize, tile_ysize)
        if not isinstance(ar, np.ndarray):
            raise RuntimeError('Could not read array for tile %s' % i)
        nodata_mask = ar != nodata
        if not nodata_mask.any():
            print '\tSkipping this tile because all pixels == nodata...\n'
            empty_tiles.append(i)
            continue
        ar_rows, ar_cols = np.indices(ar.shape)
        ar_rows = ar_rows + ul_r
        ar_cols = ar_cols + ul_c

        n_per_bin, scaled_pcts = calc_strata_sizes(ar, nodata_mask,
                                                   tile_coords.n_samples, bins,
                                                   sampling_scheme, bin_scale,
                                                   zero_inflation)
        df_tiles.ix[i, classes] = n_per_bin  #record sample size per bin

        for i, (this_min, this_max) in enumerate(bins):
            #t2 = time.time()

            this_sample_size = n_per_bin[i]
            if min_sample:
                this_sample_size = int(max(n_per_bin[i], min_sample))
            if max_sample and max_sample < this_sample_size:
                this_sample_size = max_sample
            print 'Sampling between %s and %s: %s pixels (%.1f%% of sample for this tile) ' % (
                this_min, this_max, this_sample_size, scaled_pcts[i] * 100)
            mask = (ar > this_min) & (ar <= this_max) & nodata_mask
            these_rows = ar_rows[mask]
            these_cols = ar_cols[mask]
            '''if this_max == 0 and zero_inflation:
                this_sample_size *= zero_inflation'''

            # If there aren't enough pixels to generate samples for this bin
            if these_rows.size < this_sample_size:
                #import pdb; pdb.set_trace()
                print ('Not enough pixels between {0} and {1} to generate {2} '
                       'random samples. Returning all {3} pixels for this '
                       'bin.').format(this_min, this_max, this_sample_size,
                                      these_rows.size)
                tr_rows = these_rows
                tr_cols = these_cols
            else:
                samples = random.sample(xrange(len(these_rows)),
                                        this_sample_size)
                tr_rows = these_rows[samples]
                tr_cols = these_cols[samples]

            # If pct_train is specified, split the sample indices into train/test sets
            te_rows = []
            te_cols = []
            if pct_train:
                split_ind = int(len(tr_rows) * pct_train)
                te_rows = tr_rows[split_ind:]
                te_cols = tr_cols[split_ind:]
                tr_rows = tr_rows[:split_ind]
                tr_cols = tr_cols[:split_ind]

            train_rows.extend(tr_rows)
            train_cols.extend(tr_cols)
            test_rows.extend(te_rows)
            test_cols.extend(te_cols)
            #print '%.1f seconds\n' % (time.time() - t2)
        print 'Time for this tile: %.1f minutes\n' % ((time.time() - t1) / 60)
    del tr_rows, tr_cols, te_rows, te_cols, ar

    # Read the whole raster in to extract stuff
    ar = band.ReadAsArray()

    # If True, extract with 3x3 kernel. Otherwise, just get the vals (row,col)
    if kernel:
        train_vals = extract_by_kernel(ar, train_rows, train_cols, data_type,
                                       col_name, nodata)
    else:
        train_vals = ar[train_rows, train_cols]

    # Calculate x and y for later extractions
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx
    train_x = [int(ul_x + c * x_res) for c in train_cols]
    train_y = [int(ul_y + r * y_res) for r in train_rows]
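    # x/y are the map coordinates of each sampled pixel's upper-left corner, derived from the geotransform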
    df_train = pd.DataFrame({
        'x': train_x,
        'y': train_y,
        'row': train_rows,
        'col': train_cols,
        col_name: train_vals
    })

    # If training and testing samples were split, get test vals
    df_test = None
    if pct_train < 1:
        if kernel:
            test_vals = extract_by_kernel(ar, test_rows, test_cols, data_type,
                                          col_name, nodata)
        else:
            test_vals = ar[test_rows, test_cols]
        test_x = [int(ul_x + c * x_res) for c in test_cols]
        test_y = [int(ul_y + r * y_res) for r in test_rows]
        df_test = pd.DataFrame({
            'x': test_x,
            'y': test_y,
            'row': test_rows,
            'col': test_cols,
            col_name: test_vals
        })
    ds = None

    # In case empty tiles weren't filtered out already
    df_tiles = df_tiles.ix[~df_tiles.index.isin(empty_tiles)]

    return df_train, df_test, df_tiles
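
A minimal, hedged usage sketch; the raster path, column name, bin edges, and sample sizes below are hypothetical, and the function's dependencies (gdal, stem, pandas, numpy) are assumed to be importable as above:

bins = [(-1, 0), (0, 50), (50, 100)]  # (lower, upper] edges; the upper edge labels each class column
df_train, df_test, df_tiles = get_stratified_sample_by_tile(
    'biomass.tif', 'biomass', 1, 10000, bins,
    pct_train=0.8, nodata=-9999, n_tiles=(5, 3))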
Example No. 4
def main(params,
         n_pieces=False,
         ydims=None,
         constant_vars=None,
         year='',
         agg_method=None):

    t0 = time.time()
    print 'Predicting Random Forest... %s\n' % time.ctime(t0)

    # Set optional params to default:
    split_predictors = False

    # Read params and make variables from text
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Raise an error if the var_txt path doesn't exist. Otherwise, just read it in
    train_dict = forest.read_params(train_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        print ''
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make sure vars are sorted alphabetically since they were for training
    pred_vars = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)
    '''if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        #year = constant_vars['YEAR']
        year = 2012
        pred_constants = sorted(constant_vars.keys())
    else:
        df_var.search_str = [s.format(2007) for s in df_var.search_str]'''

    #out_dir = os.path.dirname(out_raster)
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print 'Loading the RandomForest model from \n%s... \n%s\n' % (
        rf_path, time.ctime(time.time()))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    with open(rf_path, 'rb') as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs:
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print df_var.index.tolist() + pred_constants
        sys.exit(('\nKeyError: Number of features of the random forest model does not match the number of variables in df_var.' +\
            '\nNumber of features of the model: {0} \nNumber of variables in var_txt: {1}' + \
            '\nCheck that all predictors used in the var_txt to train the model are in this var_txt ' +\
            '\nPath of Random Forest model: {2}\nPath of var_txt: {3}').format(n_features, n_vars, rf_path, var_txt))
        #"""
    if 'agg_method' in inputs:
        agg_method = inputs['agg_method']

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    ar = ds.ReadAsArray()
    nodata_mask = ar != 0
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx

    # Predict
    #print 'Predicting with %s processors... %s' % (rf_model.n_jobs, time.ctime(time.time()))
    t1 = time.time()
    predict_pieces = []

    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    if 'n_tiles' in inputs:
        df_tiles, df_tiles_rc, tile_size = stem.get_tiles(
            n_tiles, xsize, ysize, tx)
        empty_tiles = []
        ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        tile_dir = os.path.join(out_dir, 'predict_tiles')
        if not os.path.isdir(tile_dir):
            os.mkdir(tile_dir)
        for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
            print 'Predicting for tile %s of %s...' % (i + 1, len(df_tiles))
            t1 = time.time()
            coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
            tsa_ar, tsa_off = mosaic.extract_kernel(ds,
                                                    1,
                                                    coords,
                                                    tx,
                                                    xsize,
                                                    ysize,
                                                    nodata=nodata)
            tsa_mask = tsa_ar == 0
            if tsa_mask.all():
                print 'Tile %s empty. Skipping...' % ind
                continue
            tsa_ar[tsa_mask] = nodata
            # Get the ids of TSAs this kernel covers
            tsa_ids = np.unique(tsa_ar)
            #tsa_strs = ['0' + str(tsa) for tsa in tsa_ids if tsa!=nodata]
            tsa_strs = [str(tsa) for tsa in tsa_ids if tsa != nodata]
            array_shape = tsa_ar.shape

            # Get an array of predictors where each column is a flattened 2D array of a
            #   single predictor variable
            temp_nodata = -9999
            ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar,
                                                coords, tsa_mask, temp_nodata,
                                                1)
            nodata_mask = ~np.any(ar_predictors == temp_nodata, axis=1)
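            # nodata_mask is True only where every predictor column is valid (no temp_nodata values)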
            predictors = ar_predictors[nodata_mask]
            t2 = time.time()
            if agg_method == 'mode':
                args = []
                for dt in rf_model.estimators_:
                    args.append([dt, predictors])
                pool = Pool(rf_model.n_jobs)
                t3 = time.time()
                dt_predictions = np.vstack(
                    pool.map(forest.par_predict_from_dt, args, 1))
                print 'Prediction time: %.1f minutes' % (
                    (time.time() - t3) / 60)
                t3 = time.time()
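                # Aggregate the per-tree predictions by majority vote: the mode across trees is the class for each pixel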
                predictions = stem.mode(dt_predictions, axis=0)
                print 'Aggregation time:  %.1f minutes' % (
                    (time.time() - t3) / 60)
                del dt_predictions
                t3 = time.time()
                pool.close()
                pool.join()
                print 'Closing time:  %.1f minutes' % ((time.time() - t3) / 60)
            else:
                predictions = rf_model.predict(ar_predictors[nodata_mask])
            print 'Prediction time: %.1f minutes' % ((time.time() - t2) / 60)

            ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
            ar_tile[nodata_mask] = predictions.astype(np.uint8)
            ul_r, lr_r, ul_c, lr_c = df_tiles_rc.ix[ind]
            ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile.reshape(array_shape)
            tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
            mosaic.array_to_raster(ar_tile.reshape(array_shape),
                                   tx_tile,
                                   prj,
                                   driver,
                                   os.path.join(tile_dir, 'tile_%s.tif' % ind),
                                   dtype=gdal.GDT_Byte,
                                   nodata=nodata)
            print 'Total time for this piece: %.1f minutes\n' % (
                (time.time() - t1) / 60)
            #del ar_predictors, nodata_mask, ar_prediction'''
        #ar_prediction = np.concatenate(predict_pieces)
        #del predict_pieces
        '''ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for ind, tile_coords in df_tiles_rc.iterrows():
            if ind in empty_tiles:
                continue
            ul_r, lr_r, ul_c, lr_c = tile_coords
            tile_file = os.path.join(tile_dir, 'tile_%s.tif' % ind)
            if not os.path.exists(tile_file):
                continue
            ds_t = gdal.Open(tile_file)
            ar_tile = ds_t.ReadAsArray()
            t_ulx = df_tiles.ix[ind, ['ul_x', 'ul_y']]
            ar_out[ul_r : lr_r, ul_c : lr_c] = ar_tile'''

    else:
        ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata)
        # If the predictions are too large (i.e. cause memory errors), split the predictor array into pieces and predict
        #   separately, then stack them back together
        if split_predictors:
            split_predictors = int(split_predictors)
            predictions = []
            for i, p in enumerate(
                    np.array_split(ar_predictors, split_predictors)):
                t1 = time.time()
                print '\nPredicting for %s of %s pieces of the final array...' % (
                    i + 1, split_predictors)
                predictions.append(rf_model.predict(p))
                print '%.1f minutes' % ((time.time() - t1) / 60)
            predictions = np.concatenate(predictions)
            print ''
        else:
            print 'Predicting in one chunk...'
            predictions = rf_model.predict(ar_predictors)
        ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.float32)
        ar_prediction[nodata_mask] = predictions
        del ar_predictors, predictions

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_vote.tif' % stamp)
    #ar_prediction = ar_prediction.reshape(ysize, xsize)
    if constant_vars:
        out_path = out_path.replace('.tif', '_yr%s.tif' % year)
    forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte,
                           nodata)  #"""
    # Delete the tiles
    shutil.rmtree(tile_dir)
    ds = None
    '''stamp = os.path.basename(out_dir)
    path = os.path.join(out_dir, 'final_%s_yr2011.tif' % stamp) 
    stamp = os.path.basename(os.path.dirname(path))
    ds = gdal.Open(path)
    ar_prediction = ds.ReadAsArray()
    ds = None#'''

    if 'test_params' in inputs:
        #df_test = pd.read_csv(test_samples, sep='\t', index_col='obs_id')
        print '\nEvaluating the model...'
        t1 = time.time()
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec("{0} = str({1})").format(i, test_dict[i])

        if 'n_trials' in test_dict:
            n_trials = int(n_trials)
        else:
            print 'n_trials not specified. Setting default to 50...\n'
            n_trials = 50
        if 'year' in test_dict:
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(
            test_params,
            os.path.join(out_dir, param_bn.replace('.txt', '_%s.txt' % year)))

        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx,
                                                 cell_size, target_col,
                                                 n_per_cell, n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir,
                                   '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)

        if 'lc_path' in test_dict:
            '''df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join('/vol/v2/stem/ebird/results/performance_by_lc', '{0}_eval_{1}_land_cover.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')'''

        #df_samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
        df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
        out_txt = os.path.join(
            out_dir,
            '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
        df_lc.to_csv(out_txt, sep='\t')
        if 'inventory_txt' in test_dict:
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.ix[stamp, col] = score_mean
                print 'Average %s: %2.3f' % (col.upper(), score_mean)
            df_inv.to_csv(inventory_txt, sep='\t')
        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print '\nTotal eval time: %.1f minutes\n' % ((time.time() - t1) / 60)
    else:
        print '\nEither "test_samples" or "inventory_txt" was not specified.' +\
            ' This model will not be evaluated...'

    print '\nTotal runtime: %.1f minutes' % ((time.time() - t0) / 60)

    return out_path
Example No. 5
def main(params, n_tiles=(25, 15), n_jobs=20, kernel_type='circle', filter_value=None):
    
    t0 = time.time()
    
    # Read params and make variables from text
    inputs = read_params(params)
        
    # Check params
    try:
        path = inputs['path']
        function = inputs['function']
        out_path = inputs['out_path']
        kernel_size = int(inputs['kernel_size'])
        databand = int(inputs['databand'])
    except KeyError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    if 'n_jobs' in inputs: n_jobs = int(inputs['n_jobs'])
    if 'n_tiles' in inputs: n_tiles = [int(n) for n in inputs['n_tiles'].split(',')]
    if 'nodata' in inputs: nodata = int(inputs['nodata'])
    
    extra_args = () # The default for ndi.generic_filter 'extra_args' is an empty tuple
    if 'average' in function.lower():
        func = np.nanmean
    elif 'mode' in function.lower():
        func = mode
    elif 'area' in function.lower():
        func = pct_nonzero
        if not filter_value and 'filter_value' not in inputs:
            sys.exit('Cannot calculate percent area without filter_value. ' +\
            'Try specifying filter_value in parameters file.')
        elif 'filter_value' in inputs:
            filter_value = int(inputs['filter_value'])
    elif 'equal' in function.lower():
        func = is_equal_to
        center_idx = kernel_size**2/2
        extra_args = tuple([center_idx])
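        # kernel_size**2 / 2 is the index of the center pixel in the flattened kernel_size x kernel_size window (e.g. 12 for 5x5)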
        
    else:
        sys.exit('Could not find filtering function for alias: %s' % function)
    
    out_dir = os.path.dirname(out_path)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    shutil.copy2(params, out_dir)
        
    print '\nReading input raster...\n'
    t1 = time.time()
    ds = gdal.Open(path)
    band = ds.GetRasterBand(databand)
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = ds.GetDriver()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    
    # Get an array and mask out nodata values with nans
    if 'nodata' not in inputs:
        print 'nodata not specified in params. Getting nodata value from input dataset...\n'
        nodata = band.GetNoDataValue()
    '''ar = band.ReadAsArray()
    ds = None
    array_dtype = ar.dtype
    ar = ar.astype(np.float16)
    mask = (ar != nodata) #& (ar != 255)
    ar[~mask] = np.nan'''
    if 'area' in function.lower():
        ar[(ar != filter_value) & mask] = 0
    #import pdb; pdb.set_trace()
    #ysize, xsize = ar.shape
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    if kernel_type.lower() == 'circle':
        #kernel_size /= 2
        kernel = circle_mask(kernel_size)
    else:
        kernel = np.ones((kernel_size, kernel_size))
    
    tile_buffer = kernel.shape[0]/2
    # Tile up the array to filter in parallel
    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)

    total_tiles = len(df_tiles)
    '''empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)'''
    
    # Add buffer around each tile
    df_buf = df_tiles_rc.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)
    
    # Get arrays
    print 'Getting buffered arrays...'
    t1 = time.time()
    n_full_tiles = len(df_tiles)
    args = []
    temp_dir = os.path.join(out_dir, 'tiles')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    for i, (ind, r) in enumerate(df_buf.iterrows()):
        #this_ar = ar[r.ul_r : r.lr_r, r.ul_c : r.lr_c]
        #args.append([ind, this_ar, func, kernel, extra_args, i + 1, n_full_tiles])
        args.append([ind, path, databand, nodata, r, df_tiles.ix[ind], temp_dir, func, kernel, extra_args, i + 1, n_full_tiles])
        #arrays.append([i, this_ar])
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    print 'Filtering chunks in parallel with %s jobs...' % n_jobs
    p = Pool(n_jobs)
    tiles = p.map(par_filter, args, 1)

    print '\nTotal time for filtering: %.1f minutes\n' % ((time.time() - t1)/60)#'''

    
    print 'Tiling pieces back together...'
    t1 = time.time()
    gdal_dtype = band.DataType
    array_dtype = gdalnumeric.GDALTypeCodeToNumericTypeCode(gdal_dtype)
    filtered = np.full((ysize, xsize), nodata, dtype=array_dtype)
    for i, tile_path in tiles:
        if not tile_path:
            continue
        ds_t = gdal.Open(tile_path)
        buffered_tile = ds_t.ReadAsArray()
        b_inds = df_buf.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        t_inds = df_tiles_rc.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        d_ulr, d_lrr, d_ulc, d_lrc = t_inds - b_inds
        tile = buffered_tile[d_ulr : d_lrr, d_ulc : d_lrc]
        tile[np.isnan(tile)] = nodata
        tile = tile.astype(array_dtype)
        t_ulr, t_lrr, t_ulc, t_lrc = t_inds
        filtered[t_ulr : t_lrr, t_ulc : t_lrc] = tile
    print '%.1f minutes\n' % ((time.time() - t1)/60)   
    
    #filtered = filtered.astype(array_dtype)
    if 'out_nodata' in inputs: 
        #filtered[np.isnan(filtered) | ~mask] = nodata
        filtered[filtered == nodata] = int(inputs['out_nodata'])
        nodata = int(inputs['out_nodata'])

    try:
        array_to_raster(filtered, tx, prj, driver, out_path, dtype=gdal_dtype, nodata=nodata)
    except:
        array_to_raster(filtered, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
    desc = ('Raster filtered by kernel of shape {kernel_type} and size ' +\
            '{kernel_size} and function {func}').format(kernel_type=kernel_type,
                                                        kernel_size=kernel_size, 
                                                        func=function)
    meta_path = createMetadata(sys.argv, out_path, description=desc)
    write_params_to_meta(meta_path, params)
    del filtered, tiles, args, p
    ds = None
    shutil.rmtree(temp_dir)
    
    print 'Total time: %.1f minutes' % ((time.time() - t0)/60)
Example No. 6
def main(model_dir, n_tiles, **kwargs):

    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]
    if not os.path.isdir(model_dir):
        message = 'model directory given does not exist or is not a directory: %s' % model_dir
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir,
                                        'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir,
                                         'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(variable_id, variable_name)
                     for variable_name, variable_id in kwargs.items()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)
    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    #some_set = df_sets.iloc[0]
    support_size = [
        int(s)
        for s in train_params['support_size'].replace('"', '').split(',')
    ]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]
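    # support_size is in map units; dividing by the pixel size (mask_tx[1]) converts it to pixels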

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):

        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1,
                                                           len(variables))

        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)

        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge where the
            #   tile size will be slightly different
            this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x -
                                                          t_row.ul_x)
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row,
                                                      this_size, support_size)

            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]

            print n_sets, ' Overlapping sets'
            importance_bands = []

            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():

                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y),
                                          mask_tx)
                #if abs(offset[0]) > this_size[0] or abs(offset[1] > this_size[1]):

                tile_inds, a_inds = mosaic.get_offset_array_indices(
                    tile_size, set_size, offset)

                # Get feature with maximum importance and fill tile with that val
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(
                        dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    #filled = np.full((nrows, ncols), importance_value, dtype=np.uint8)
                    #import_band = stem.fill_tile_band(this_size, filled, tile_inds, nodata)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1],
                                tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue  #'''

            print 'Average importance for this tile: %.1f' % np.mean(
                importance_values)
            #Aggregate
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
            tile_mask = mask[rc.ul_r:rc.lr_r,
                             rc.ul_c:rc.lr_c] | np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r,
               rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % (
                (time.time() - t2) / 60)
            '''temp_dir = os.path.join(out_dir, 'delete')
            if not os.path.isdir(temp_dir):
                os.mkdir(temp_dir)
            t_tx = tile_ul[0], 30, 0, tile_ul[1], 0, -30
            array_to_raster(np.round(importance_tile).astype(np.uint8), t_tx, prj, gdal.GetDriverByName('gtiff'), os.path.join(temp_dir, 'delete_%s.tif' % t_ind), gdal.GDT_Byte, 255, True)'''
        out_path = os.path.join(out_dir,
                                '%s_importance_%s.tif' % (model, v_name))
        try:
            array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'),
                            out_path, gdal.GDT_Byte, nodata)
        except Exception as e:
            print 'Could not write %s: %s' % (out_path, e)
        print 'Time for this variable: %.1f minutes\n' % (
            (time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % (len(variables), (
        (time.time() - t0) / 3600))
Example No. 7
def main(n_tiles,
         tile_path=None,
         add_field=True,
         out_path=None,
         snap=True,
         clip=True):

    try:
        if add_field.lower() == 'false':
            add_field = False
    except AttributeError:
        pass
    try:
        if snap.lower() == 'false':
            snap = False
    except AttributeError:
        pass

    if tile_path is None:
        tile_path = TILE_PATH

    if not os.path.exists(tile_path):
        raise RuntimeError('tile_path does not exist: %s' % tile_path)

    try:
        n_tiles = tuple([int(i) for i in n_tiles.split(',')])
    except (ValueError, AttributeError):
        raise ValueError(
            'Could not parse n_tiles %s. It must be given as "n_y_tiles, n_x_tiles"'
            % n_tiles)

    # Get processing tiles
    tx, (xmin, xmax, ymin, ymax) = tx_from_shp(tile_path, XRES, YRES)
    xsize = abs(int(xmax - xmin) / XRES)
    ysize = abs(int(ymax - ymin) / YRES)
    tiles, _, _ = get_tiles(n_tiles, xsize, ysize, tx=tx)
    tile_id_field = 'eetile%sx%s' % n_tiles
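    # The field name encodes the tile grid, e.g. n_tiles=(25, 15) -> 'eetile25x15'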
    tiles[tile_id_field] = tiles.index

    if snap:
        coords, _ = get_coords(tile_path, multipart='split')
        coords = np.array(coords)  #shape is (nfeatures, ncoords, 2)
        xcoords = np.unique(coords[:, :, 0])
        ycoords = np.unique(coords[:, :, 1])
        for i, processing_coords in tiles.iterrows():
            tiles.loc[i, 'ul_x'] = xcoords[np.argmin(
                np.abs(xcoords - processing_coords.ul_x))]
            tiles.loc[i, 'lr_x'] = xcoords[np.argmin(
                np.abs(xcoords - processing_coords.lr_x))]
            tiles.loc[i, 'ul_y'] = ycoords[np.argmin(
                np.abs(ycoords - processing_coords.ul_y))]
            tiles.loc[i, 'lr_y'] = ycoords[np.argmin(
                np.abs(ycoords - processing_coords.lr_y))]

    if not out_path:
        out_path = os.path.join(OUT_DIR,
                                'ee_processing_tiles_%sx%s.shp' % n_tiles)
    coords_to_shp(tiles, tile_path, out_path)
    descr = ('Tiles for processing data on Google Earth Engine. The tiles ' +
            'have %s row(s) and %s col(s) and are bounded by the extent of %s') %\
            (n_tiles[0], n_tiles[1], tile_path)
    '''if clip:
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        geoms = ogr.Geometry(ogr.wkbMultiPolygon)
        for feature in lyr:
            g = feature.GetGeometryRef()
            geoms.AddGeometry(g)
        union = geoms.UnionCascaded()
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_uniontemp' + ext)
        feature'''

    createMetadata(sys.argv, out_path, description=descr)
    print '\nNew processing tiles written to', out_path

    # Find which processing tile touches each CONUS storage tile
    #   (get_overlapping_sets() could be used to find them)
    # Read in the CONUS storage tiles
    if add_field:
        conus_tiles = attributes_to_df(tile_path)

        # Make a temporary copy of it
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_temp' + ext)
        df_to_shp(conus_tiles, tile_path, temp_file, copy_fields=False)

        # Loop through each processing tile and find all overlapping
        conus_tiles[tile_id_field] = -1
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        for p_fid, processing_coords in tiles.iterrows():
            wkt = 'POLYGON (({0} {1}, {2} {1}, {2} {3}, {0} {3}, {0} {1}))'.format(
                processing_coords.ul_x, processing_coords.ul_y,
                processing_coords.lr_x, processing_coords.lr_y)
            p_geom = ogr.CreateGeometryFromWkt(wkt)
            p_geom.CloseRings()
            for c_fid in conus_tiles.index:
                feature = lyr.GetFeature(c_fid)
                geom = feature.GetGeometryRef()
                if geom.Intersection(p_geom).GetArea() > 0:
                    conus_tiles.loc[c_fid, tile_id_field] = p_fid
        lyr, feature = None, None

        # re-write the CONUS tiles shapefile with the new field
        df_to_shp(conus_tiles, tile_path, tile_path, copy_fields=False)

        # delete temporary file
        driver = ds.GetDriver()
        driver.DeleteDataSource(temp_file)
        ds = None
        print '\nField with processing tile ID added to', tile_path

        # if the metadata text file exists, add a line about appending the field.
        #   otherwise, make a new metadata file.
        meta_file = tile_path.replace(ext, '_meta.txt')
        if os.path.exists(meta_file):
            with open(meta_file, 'a') as f:
                f.write(
                    '\n\nAppended field %s with IDs from the overlapping feature of %s'
                    % (tile_id_field, out_path))
        else:
            descr = 'Tile system with appended field %s with IDs from the overlapping feature of %s' % (
                tile_id_field, out_path)
            createMetadata(sys.argv, tile_path, description=descr)