Example #1
0
def main(params, test_txt=None, region_nodata=0, region_shp=None):

    t0 = time.time()
    # Read params and make variables from each line
    inputs = read_params(params)
    for var in inputs:
        exec("{0} = str({1})").format(var, inputs[var])

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    shutil.copy2(params, out_dir)

    n_samples = int(n_samples)
    p_nodata = int(p_nodata)
    t_nodata = int(t_nodata)
    bins = parse_bins(bins)
    if 'region_nodata' in inputs:
        region_nodata = int(region_nodata)

    ds = gdal.Open(region_raster)
    regions = ds.ReadAsArray()
    #nodata_mask = regions == region_nodata
    ds = None

    ds_t = gdal.Open(t_raster)
    ar_t = ds_t.ReadAsArray()

    ds_p = gdal.Open(p_raster)
    ar_p = ds_p.ReadAsArray()
    nodata_mask = (ar_p == p_nodata) | (ar_t == t_nodata)

    d = {}
    if 'region_ids' in inputs:
        region_ids = [int(r) for r in region_ids.split(',')]
    else:
        region_ids = np.unique(regions[~nodata_mask])

    if test_txt:
        test_sample = pd.read_csv(test_txt)
        if 'row' not in test_sample.columns:
            test_sample = pd.read_csv(test_txt, sep='\t')
    else:
        train_sample = pd.read_csv(train_txt, sep='\t', index_col='obs_id')

        # Set any pixels used for training to -1 so they can be avoided for testing
        n_rows, n_cols = regions.shape
        n_pixels = regions.size
        pixel_ids = np.arange(n_pixels,
                              dtype=np.uint32).reshape(n_rows, n_cols)
        pixel_ids[
            train_sample.row,
            train_sample.col] = n_pixels  #will always be 1 more than last col
        pixel_ids[nodata_mask] = n_pixels
        #ar_col[train_sample.row, train_sample.col] = -1

        #import pdb; pdb.set_trace()

        test_ids = np.array(random.sample(pixel_ids[pixel_ids != n_pixels],
                                          n_samples),
                            dtype=np.uint32)
        test_rows = test_ids / n_cols
        test_cols = test_ids % n_cols
        #test_cols = random.sample(ar_col[ar_col != -1], n_samples)
        test_sample = pd.DataFrame({'row': test_rows, 'col': test_cols})
        test_sample['region'] = regions[test_rows, test_cols]

    ind_mask = (pixel_ids == n_pixels).reshape(n_rows, n_cols)
    del pixel_ids
    ar_row, ar_col = np.indices(regions.shape, dtype=np.int32)

    confusion_dir = os.path.join(out_dir, 'region_confusion_tbls')
    if not os.path.isdir(os.path.join(confusion_dir)):
        os.mkdir(confusion_dir)

    test_sample['reference'] = -1
    test_sample['predicted'] = -1
    stats = []
    n_regions = len(region_ids)
    for i, r_id in enumerate(region_ids):
        print '\nCalculating stats for region %s (%s of %s)...' % (r_id, i + 1,
                                                                   n_regions)
        t1 = time.time()
        ''' figure out what's up with nodata values '''
        region_mask = regions == r_id
        min_row = int(ar_row[region_mask & ~ind_mask].min())
        max_row = int(ar_row[region_mask & ~ind_mask].max())
        min_col = int(ar_col[region_mask & ~ind_mask].min())
        max_col = int(ar_col[region_mask & ~ind_mask].max())
        nrows = max_row - min_row
        ncols = max_col - min_col

        ar_t_region = ds_t.ReadAsArray(min_col, min_row, ncols, nrows)
        ar_p_region = ds_p.ReadAsArray(min_col, min_row, ncols, nrows)
        #clipped_mask = region_mask[min_row:max_row, min_col:max_col]
        clipped_mask = (ar_t_region == t_nodata) | (ar_p_region == p_nodata)
        del region_mask

        region_sample = test_sample[test_sample.region == r_id].copy()
        region_sample['global_row'] = region_sample.row
        region_sample['global_col'] = region_sample.col
        region_sample['row'] = region_sample.row - min_row
        region_sample['col'] = region_sample.col - min_col
        region_sample['reference'] = ar_t_region[region_sample.row,
                                                 region_sample.col]
        region_sample['predicted'] = ar_p_region[region_sample.row,
                                                 region_sample.col]

        region_sample = region_sample
        test_sample.ix[region_sample.index,
                       'reference'] = region_sample['reference']
        test_sample.ix[region_sample.index,
                       'predicted'] = region_sample['predicted']
        #import pdb; pdb.set_trace()
        #try:
        df = evaluation.confusion_matrix_by_area(ar_p_region,
                                                 ar_t_region,
                                                 region_sample,
                                                 p_nodata,
                                                 t_nodata,
                                                 mask=clipped_mask,
                                                 bins=bins,
                                                 match='best')
        this_txt = os.path.join(confusion_dir, 'confusion_%s.txt' % r_id)
        df.to_csv(this_txt, sep='\t')
        accuracy = df.ix['producer', 'user']
        kappa = df.ix['producer', 'kappa']
        sample_mask = (region_sample.reference
                       == t_nodata) | (region_sample.predicted == p_nodata)
        rmse = sf.rmse(region_sample.reference[~sample_mask],
                       region_sample.predicted[~sample_mask])
        print len(sample_mask[sample_mask])

        stats.append({
            'region': r_id,
            'accuracy': accuracy,
            'kappa': kappa,
            'rmse': rmse
        })
        print 'Time for this region: %.1f minutes' % ((time.time() - t1) / 60)

    #if not test_txt:
    test_basename = 'test_sample_%s.txt' % n_samples
    test_txt = os.path.join(out_dir, test_basename)
    test_sample.to_csv(test_txt)
    desc = 'Random test sample of %s not used in training samples %s' % (
        n_samples, train_txt)
    createMetadata(sys.argv,
                   os.path.join(out_dir, test_basename),
                   description=desc)
    print '\nTest sample text file written to:', test_txt

    df = pd.DataFrame(stats)
    out_txt = os.path.join(out_dir, 'region_stats_%s.txt' % n_samples)
    df.to_csv(out_txt, sep='\t', index=False)
    test_basename = os.path.basename(test_txt)
    desc = 'Stats from modeling regions from %s. Samples drawn from %s' % (
        region_raster, os.path.join(out_dir, test_basename))
    createMetadata(sys.argv, out_txt, description=desc)

    if region_shp:
        out_shp = os.path.join(out_dir, 'modeling_region_stats.shp')
        df_to_shp(df,
                  region_shp,
                  out_shp,
                  copy_fields=False,
                  df_id_field='region',
                  shp_id_field='Zone_ID')
    print '\nFinished in %.1f minutes' % ((time.time() - t0) / 60)
Example #2
0
def main(params,
         ar_p=None,
         out_txt=None,
         inventory_txt=None,
         target_col=None,
         match=False,
         file_stamp=None):
    #p_path, t_path, bins, sample_txt, p_nodata, t_nodata, out_dir, inventory_txt=None

    # Read params and make variables from text
    inputs = read_params(params)
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])

    # Check that variables were specified in params
    try:
        bins = parse_bins(bins)
        p_nodata = int(p_nodata)
        t_nodata = int(t_nodata)
        str_check = sample_txt  #, target_col
    except NameError as e:
        print ''
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    #if out_dir_: # then out_dir came from predict_stem call
    #    out_dir = out_dir_
    #out_txt = os.path.join(out_dir, 'confusion.txt')
    if out_txt:
        out_dir = os.path.dirname(out_txt)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        shutil.copy2(params, out_dir)

    # If p_path was specified, this call of the function is coming from outside
    #   predict_stem.py. Otherwise, ar_p should be given.
    if 'p_path' in locals():
        print 'Reading in the prediction raster:%s\n' % p_path
        ds_p = gdal.Open(p_path)
        ar_p = ds_p.ReadAsArray()

    ds_t = gdal.Open(t_path)
    band = ds_t.GetRasterBand(1)
    ar_t = band.ReadAsArray()
    #ar_t=ar_t.GetRasterBand(1)
    #print('read in the truth raster')
    t_xsize = ds_t.RasterXSize
    #print('t_xsize is: ', t_xsize)
    t_ysize = ds_t.RasterYSize
    #print('tYsize is: ', t_ysize)
    p_xsize = ds_p.RasterXSize
    #print('p_xsize is: ', p_xsize)
    p_ysize = ds_p.RasterYSize
    #print('p_ysize is: ', p_ysize)
    tx_t = ds_t.GetGeoTransform()
    tx_p = ds_p.GetGeoTransform()
    # If two arrays are different sizes, make prediction array match reference
    if not t_xsize == p_xsize or t_ysize == p_ysize or tx_t != tx_p:
        print('entered if statement')
        warnings.warn(
            'Prediction and reference rasters do not share the same extent. Snapping prediction raster to reference....'
        )
        offset = mosaic.calc_offset((tx_t[0], tx_t[3]), tx_p)
        #print(offset)
        t_inds, p_inds = mosaic.get_offset_array_indices(
            (t_ysize, t_xsize), (p_ysize, p_xsize), offset)
        print(t_inds, p_inds)
        ar_buf = np.full(ar_t.shape, p_nodata, dtype=ar_p.dtype)
        print ar_buf.shape
        ar_buf[t_inds[0]:t_inds[1],
               t_inds[2]:t_inds[3]] = ar_p[p_inds[0]:p_inds[1],
                                           p_inds[2]:p_inds[3]]
        ar_p = ar_buf.copy()
        del ar_buf
    mask = (ar_p == p_nodata) | (ar_t == t_nodata)  #'''

    samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    print samples
    df_adj, df_smp = confusion_matrix_by_area(ar_p,
                                              ar_t,
                                              samples,
                                              p_nodata,
                                              t_nodata,
                                              mask=mask,
                                              bins=bins,
                                              out_txt=out_txt,
                                              target_col=target_col,
                                              match=match)

    ar_p = None
    ar_t = None
    mask = None

    accuracy = df_adj.ix['producer', 'user']
    kappa = df_adj.ix['producer', 'kappa']
    if inventory_txt and file_stamp:
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if file_stamp in df_inv.index and 'vote' in os.path.basename(out_dir):
            cols = ['vote_accuracy', 'vote_kappa']
            df_inv.ix[file_stamp, cols] = accuracy, kappa
            df_inv.to_csv(inventory_txt, sep='\t')
            print 'Vote scores written to inventory_txt: ', inventory_txt

        if file_stamp in df_inv.index and 'mean' in os.path.basename(out_dir):
            cols = ['mean_accuracy', 'mean_kappa']
            df_inv.ix[file_stamp, cols] = accuracy, kappa
            df_inv.to_csv(inventory_txt, sep='\t')

    return df_smp