Example #1
import os
import re
import sys

import confusion_matrix  # local module assumed by this script


def main():
    path = sys.argv[1]
    files = os.listdir(path)
    files = [os.path.join(path, f) for f in files if f.endswith('.txt')]

    res = []
    for txt_file in files:
        # confusion_matrix.main() writes '<basename>_confusion_matrix.txt'
        # into the working directory
        confusion_matrix.main(txt_file)
        file_name = os.path.basename(txt_file).split('.')[0]
        input_file = '{}_confusion_matrix.txt'.format(file_name)
        first = 0
        starting = 0
        m = re.search(r'starting_(\d+)', input_file)
        if m is not None:
            starting = int(m.group(1))
        m = re.search(r'first_(\d+)', input_file)
        if m is not None:
            first = int(m.group(1))
        with open(input_file) as f:
            data = f.readlines()[-1]  # keep only the last line of the matrix file
        res.append((first, starting, data))

    res.sort(key=lambda r: r[0])  # order output by the 'first_' index
    output_file = 'aggregated_confusion_matrix.txt'
    try:
        os.remove(output_file)
    except OSError:  # file may not exist yet
        pass
    with open(output_file, 'a') as f:
        for first, starting, data in res:
            f.write('{} {} {}'.format(first, starting, data))
    generate_gnuplot(output_file)  # assumed to be defined elsewhere in this script
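
A minimal sketch (not part of the original script) of the filename convention the regex parsing above assumes: each result file embeds 'first_<n>' and 'starting_<n>' tokens, which become the sort keys for the aggregated output. The helper name and the example filename are hypothetical.

import re

def parse_run_keys(file_name):
    """Return the (first, starting) integers embedded in a result file name."""
    first = starting = 0
    m = re.search(r'first_(\d+)', file_name)
    if m is not None:
        first = int(m.group(1))
    m = re.search(r'starting_(\d+)', file_name)
    if m is not None:
        starting = int(m.group(1))
    return first, starting

# parse_run_keys('run_first_5_starting_100_confusion_matrix.txt') -> (5, 100)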
Example #2
import glob
import os
import pickle
import shutil
import sys
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
from osgeo import gdal

import mosaic  # project-local modules assumed by this script
import stem
# parse_constant_vars() is assumed to be defined elsewhere in this script


def main(params, inventory_txt=None, constant_vars=None):

    inputs, df_var = stem.read_params(params)
    # Dynamically bind each parameter as a local variable; missing required
    # parameters surface later as NameError
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params  # NameError here if any is missing
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except Exception:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    # If constants were given, make a dict and make sure they match the training
    #  constants
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        pred_constants = sorted(constant_vars.keys())
        train_constants = [i.replace(' ', '') for i in train_inputs['constant_vars'].strip('"').split(',')]
        train_constants = sorted(train_constants)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if 'constant_vars' in inputs: 
        unmatched_vars += [v for v in pred_constants if v not in train_constants]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in train_constants if v not in pred_constants]
        pred_vars += pred_constants # Add constants only now so the checks above see the non-constant vars alone
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print 'WARNING: out_dir already exists:\n%s\nAny existing files will be overwritten...\n' % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
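    # mosaic_tx fields: upper-left x, pixel width, row rotation,
    # upper-left y, column rotation, pixel height (negative for north-up)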
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)
    
    t0 = time.time()
    if 'n_jobs' in inputs:

        # Predict in parallel
        n_jobs = int(n_jobs)
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of TSA arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            
            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic_ds
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                            xsize, ysize, nodata=nodata)
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            dtype_code = mosaic_ds.GetRasterBand(1).DataType
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster, stem.get_gdal_dtype(dtype_code), silent=True)
            
            # Build list of args to pass to the Pool
            ds = gdal.Open(tsa_raster)
            tsa_tx = ds.GetGeoTransform()
            ds = None
            tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tsa_tx[0], tsa_tx[3]), tsa_tx)
            args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords, 
                         mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, 
                         constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem.par_predict, args, 1)
            
    
    else:
        # Loop through each set and generate predictions
        # NOTE: .ix[1043:] starts from set 1043, apparently to resume an
        # interrupted run
        for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, 
                                     mosaic_tx, xsize, ysize, dt_model, nodata,
                                     np.int16, constant_vars)        
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    
        #mosaic_ds = None
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)
    
    # Aggregate predictions by tile and stitch them back together
    if 'file_stamp' not in inputs: file_stamp = os.path.basename(model_dir)
    ar_vote, pct_importance, df_sets = stem.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')
    mosaic_ds = None
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')
    
    # (Optional) Re-read saved vote/mean rasters instead of recomputing:
    #ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp))
    #ar_vote = ds.ReadAsArray()
    #ds = None
    #ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp))
    #ar_mean = ds.ReadAsArray()
    #ds = None
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')
        
        print '\nComputing confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        print confusion_params
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e
        
        #print '\nGetting confusion matrix for mean...'
        #out_txt = os.path.join(mean_dir, 'confusion.txt')
        #df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        #try:
        #    out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
        #    df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        #except Exception as e:
        #    print e
        
        vote_acc = df_v.ix['producer', 'user']
        vote_kap = df_v.ix['producer', 'kappa']
        #mean_acc = df_m.ix['user','producer']
        #mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        #print 'Mean accuracy .............. ', mean_acc
        #print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...'
    
    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0)/60)
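
A minimal sketch (not the author's code) of a dict-based alternative to the exec-based parameter binding used in these scripts: required names are validated up front instead of surfacing later as NameError. The helper name and the required list are illustrative.

def unpack_params(inputs, required):
    """Validate required parameter names and return all values as strings."""
    missing = [name for name in required if name not in inputs]
    if missing:
        raise NameError('Variables not specified in param file: %s'
                        % ', '.join(missing))
    return dict((name, str(val)) for name, val in inputs.items())

# params_dict = unpack_params(inputs, ['model_dir', 'mosaic_path', 'out_dir', 'train_params'])
# model_dir = params_dict['model_dir']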
Example #3
import glob
import os
import shutil
import sys

import pandas as pd
from osgeo import gdal

import stem  # project-local module assumed by this script


def main(params, inventory_txt=None):

    inputs, df_var = stem.read_params(params)
    # Dynamically bind each parameter as a local variable; missing required
    # parameters surface later as NameError
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  # sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        _, train_vars = stem.read_params(train_params)
    except Exception:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print 'WARNING: out_dir already exists:\n%s\nAny existing files will be overwritten...\n' % out_dir
    shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        #shutil.copy2(confusion_params, out_dir)
        conf_bn = os.path.basename(confusion_params)
        # NOTE: the copy above is commented out, so this path is only valid if
        # the file was already placed in out_dir
        confusion_params = os.path.join(out_dir, conf_bn)

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    # NOTE: hardcoded model directory here, apparently a debugging leftover;
    # the other versions of this script glob under model_dir instead
    set_txt = glob.glob(
        os.path.join('/vol/v2/stem/imperv/imperv_bdt',
                     'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)
    '''# Loop through each set and generate predictions
    t0 = time.time()
    for c, (set_id, row) in enumerate(df_sets.iterrows()):
        t1 = time.time()
        with open(row.dt_file, 'rb') as f: 
            dt_model = pickle.load(f)
        print '\nPredicting for set %s of %s' % (c + 1, total_sets)
        ar_coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
        ar_predict = stem.predict_set_in_pieces(set_id, df_var, mosaic_ds, ar_coords, 
                                 mosaic_tx, xsize, ysize, dt_model, nodata)        
        tx = ar_coords.ul_x, x_res, x_rot, ar_coords.ul_y, y_rot, y_res
        out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
        array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)
        print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)

    #mosaic_ds = None                  
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#'''

    # Aggregate predictions by tile and stitch them back together
    if 'file_stamp' not in inputs: file_stamp = os.path.basename(model_dir)
    ar_mean, ar_vote, pct_importance, df_sets = stem.aggregate_predictions(
        ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir,
        df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')
    mosaic_ds = None
    # NOTE: overrides the vote array just computed with a saved raster,
    # apparently a debugging leftover
    ds = gdal.Open('/vol/v2/stem/canopy/canopy_bdt/canopy_bdt_vote.bsq')
    ar_vote = ds.ReadAsArray()
    ds = None

    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')

        print '\nGetting confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')

        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e

        print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e

        vote_acc = df_v.ix['user', 'producer']
        vote_kap = df_v.ix['user', 'kappa']
        mean_acc = df_m.ix['user', 'producer']
        mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = [
                'vote_accuracy', 'vote_kappa', 'vote_mask', 'mean_accuracy',
                'mean_kappa', 'vote_mask'
            ]
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap, False, \
            mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'

        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        print 'Mean accuracy .............. ', mean_acc
        print 'Mean kappa ................. ', mean_kap

    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...'
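
Both versions above lean on stem.calc_offset() to locate one raster within another. Its actual implementation lives in the project's stem module; this sketch (the helper name is hypothetical) shows the standard GDAL arithmetic such a helper presumably wraps: pixel offsets are coordinate differences divided by the pixel sizes in the geotransform.

def calc_offset_sketch(mosaic_ul, tile_ul, tx):
    """Row/col offset of a tile's upper-left corner within the mosaic grid.

    mosaic_ul, tile_ul: (x, y) upper-left coordinates
    tx: GDAL geotransform (ul_x, x_res, x_rot, ul_y, y_rot, y_res),
        with y_res negative for north-up rasters
    """
    col_off = int(round((tile_ul[0] - mosaic_ul[0]) / tx[1]))
    row_off = int(round((tile_ul[1] - mosaic_ul[1]) / tx[5]))
    return row_off, col_off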
Example #4
import glob
import os
import pickle
import shutil
import sys
import time
import warnings
from multiprocessing import Pool

import numpy as np
import pandas as pd
from osgeo import gdal, gdal_array, ogr

import mosaic  # project-local modules assumed by this script
import stem_conus
# get_min_numpy_dtype() is assumed to be defined elsewhere in this script


def main(params,
         inventory_txt=None,
         constant_vars=None,
         mosaic_shp=None,
         resolution=30,
         n_jobs_pred=0,
         n_jobs_agg=0,
         mosaic_nodata=0):
    inputs, df_var = stem_conus.read_params(params)
    # Dynamically bind each parameter as a local variable; missing required
    # parameters surface later as NameError
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    df_var.data_band = [int(b)
                        for b in df_var.data_band]  # sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem_conus.read_params(train_params)
    except Exception:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print 'WARNING: out_dir already exists:\n%s\nAny existing files will be overwritten...\n' % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  #Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        mosaic_tx, extent = stem_conus.tx_from_shp(mosaic_path, x_res, y_res)
        #df_tiles = attributes_to_df(mosaic_path)

    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    df_tiles, df_tiles_rc, tile_size = stem_conus.get_tiles(
        n_tiles, xsize, ysize, mosaic_tx)

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    set_txt = glob.glob(
        os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)

    t0 = time.time()
    if 'n_jobs_pred' in inputs:
        n_jobs = int(n_jobs_pred)
        # Predict in parallel
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of tile arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):

            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            '''if mosaic_type == 'vector':
                tsa_ar, tsa_off = mosaic.kernel_from_shp(mosaic_ds, coords, mosaic_tx, nodata=0)
            else:
                tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords,
                                                        mosaic_tx, xsize, ysize,
                                                        nodata=nodata)
            set_mosaic_path = os.path.join(predict_dir, 'tsa_%s.tif' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            np_dtype = get_min_numpy_dtype(tsa_ar)
            gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, set_mosaic_path, gdal_dtype, silent=True)
            pct_progress = float(c + 1)/total_sets * 100
            sys.stdout.write('\rRetreived points for feature %s of %s (%%%.1f)' % (c + 1, total_sets, pct_progress))
            sys.stdout.flush()'''

            # Build list of args to pass to the Pool
            #tsa_off = stem_conus.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tx_out[0], tx_out[3]), tx_out)
            args.append([
                coords, mosaic_type, mosaic_path, mosaic_tx, prj, nodata, c,
                total_sets, set_id, df_var, xsize, ysize, row.dt_file, nodata,
                np.uint8, constant_vars, predict_dir
            ])

            #args.append([c, total_sets, set_id, df_var, set_mosaic_path, tsa_off, coords,
            #mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8,
            #constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1) / 60)
        p = Pool(n_jobs)
        p.map(stem_conus.par_predict, args, 1)

    else:
        # Loop through each set and generate predictions
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem_conus.predict_set(set_id, df_var, mosaic_ds,
                                                coords, mosaic_tx, xsize,
                                                ysize, dt_model, nodata,
                                                np.int16, constant_vars)
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.tif' % set_id)
            np_dtype = get_min_numpy_dtype(ar_predict)
            gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
            # NOTE: gdal_dtype is computed but GDT_Byte is hardcoded below,
            # apparently a leftover
            mosaic.array_to_raster(ar_predict,
                                   tx,
                                   prj,
                                   driver,
                                   out_path,
                                   gdal.GDT_Byte,
                                   nodata=nodata)
            print 'Total time for this set: %.1f minutes' % (
                (time.time() - t1) / 60)

        #mosaic = None
    print '\nTotal time for predicting: %.1f hours\n' % (
        (time.time() - t0) / 3600)

    # Aggregate predictions by tile and stitch them back together
    if 'file_stamp' not in inputs: file_stamp = os.path.basename(model_dir)

    t1 = time.time()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    if 'n_jobs_agg' in inputs:
        n_jobs_agg = int(n_jobs_agg)

    if mosaic_type == 'vector':
        nodata_mask = mosaic_ds
    else:
        if 'mosaic_nodata' in inputs: mosaic_nodata = int(mosaic_nodata)
        nodata_mask = mosaic_ds.ReadAsArray() != mosaic_nodata

    # jdb 6/22/17: check for sets that errored; if there are any, remove them
    # from df_sets so that the aggregation step doesn't expect them
    setErrorLog = os.path.join(os.path.dirname(predict_dir), 'predication_errors.txt')
    if os.path.isfile(setErrorLog):
        with open(setErrorLog) as f:
            lines = f.readlines()

        badSets = [
            int(line.split(':')[1].strip()) for line in lines
            if 'set_id' in line
        ]
        for thisSet in badSets:
            df_sets = df_sets[df_sets.index != thisSet]

    pct_importance, df_sets = stem_conus.aggregate_predictions(
        n_tiles, ysize, xsize, nodata, nodata_mask, mosaic_tx, support_size,
        agg_stats, predict_dir, df_sets, out_dir, file_stamp, prj, driver,
        n_jobs_agg)
    #print 'Total aggregation time: %.1f hours\n' % ((time.time() - t0)/3600)
    mosaic_ds = None
    mosaic_dataset = None

    # Save the importance values
    importance = pd.DataFrame({
        'variable': pred_vars,
        'pct_importance': pct_importance,
        'index': range(len(pred_vars))
    })
    importance.set_index('index', inplace=True)
    importance['rank'] = [
        int(r) for r in importance.pct_importance.rank(method='first',
                                                       ascending=False)
    ]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')

    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params,
                                  ar_vote,
                                  out_txt,
                                  match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params,
                                  ar_mean,
                                  out_txt,
                                  match=True)
            mean_acc = df_m.ix['user', 'producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']  #, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask'
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap  #, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'

        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap

    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...'

    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0) / 60)
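
The vector branch above derives the output grid from a shapefile extent rather than a reference raster. A minimal sketch of that computation (assuming square pixels, a north-up grid, and no snap coordinate), matching the xsize/ysize arithmetic in the code; the helper name is hypothetical and stem_conus.tx_from_shp may differ in detail.

def tx_from_extent_sketch(min_x, max_x, min_y, max_y, resolution):
    """North-up GDAL geotransform and raster size from a layer extent."""
    tx = (min_x, resolution, 0.0, max_y, 0.0, -resolution)
    xsize = int((max_x - min_x) / resolution)
    ysize = int((max_y - min_y) / resolution)
    return tx, xsize, ysize

# ogr_layer.GetExtent() returns (min_x, max_x, min_y, max_y)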
Example #5
import fnmatch
import glob
import os
import pickle
import shutil
import subprocess
import sys
import time
import warnings
from multiprocessing import Pool

import numpy as np
import pandas as pd
import sqlalchemy
from osgeo import gdal, gdal_array, ogr

import mosaic  # project-local modules assumed by this script
import stem


def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None,
         resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0,
         snap_coord=None, overwrite_tiles=False, tile_id_field='name'):
    inputs, df_var = stem.read_params(params)
    # Dynamically bind each parameter as a local variable; missing required
    # parameters surface later as NameError
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)
    
    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except Exception:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars  = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)
    
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)
    
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print 'WARNING: out_dir already exists:\n%s\nAny existing files will be overwritten...\n' % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir) #Copy the params for reference
    
    if 'confusion_params' in inputs: 
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path
    
    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    
    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)
    
    if 'file_stamp' not in inputs: file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, file_stamp + '.db')
    try:
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    except Exception:
        # Fall back to the support-set text file if the database is unavailable
        set_txts = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))
        if not set_txts:
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txts[0], sep='\t', index_col='set_id')
    
    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # if subset specified, clip the mosaic and set mosaic path to clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                mosaic_geom.AddGeometry(feature.GetGeometryRef())
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom)
        xsize = int((max_x - min_x)/resolution)
        ysize = int((max_y - min_y)/resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"','')
            snap_coord = [float(c) for c in snap_coord.split(',')]
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path) # Change to accept arbitrary geometry
        
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')
        
    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)
    
    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(model_dir, 'temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
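    # Two-stage template: {tile_id} is filled with str.format() per tile;
    # %(stat)s is presumably filled per statistic inside stem.predict_tile()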
    n_tiles = len(tiles)  # n_tiles now counts the actual tile features
    
    if not overwrite_tiles:
        # Keep only tiles that are missing at least one of the per-stat rasters
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)]
            tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)]
        tiles.set_index(index_field, inplace=True)
    
    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res)
                    for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res)
                    for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax)
                    in tiles[['ul_x', 'xmin', 'xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin for i, (uly, ymin, ymax)
                    in tiles[['ul_y', 'ymin', 'ymax']].iterrows()]
    
    support_nrows = int(support_size[0]/abs(y_res))
    support_ncols = int(support_size[1]/abs(x_res))
    t1 = time.time()
    # NOTE: hardcoded to four specific tiles, apparently a debugging leftover;
    # the commented-out line below builds args for all tiles
    args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var,
             (support_nrows, support_ncols), agg_stats, tile_path_template,
             prj, nodata, snap_coord)
            for i, (t_ind, tile_info)
            in enumerate(tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())]
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())]
    
    if n_jobs > 1:
        print 'Predicting with %s jobs...\n' % n_jobs
        pool = Pool(n_jobs)
        pool.map(stem.predict_tile, args, 1)
        pool.close()
        pool.join()
    else:
        print 'Predicting with 1 job...\n'
        for arg in args:
            stem.predict_tile(*arg)
    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1)/3600)
    
    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) 
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8)
        
        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off : row_off + tile_rows, col_off : col_off + tile_cols] = ar_tile
            except Exception as e:
                # Debugging aid: drop into the debugger if a tile doesn't fit
                import pdb; pdb.set_trace()
        
        out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata)
    
    # Clean up the tiles
    shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60)
    
    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f: 
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1)/60)
    
    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))
                               })
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')
    
    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

                
        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user','producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''


        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print '\n"inventory_txt" was not specified.' +\
            ' Model evaluation scores will not be recorded...'
            
        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
        
    else:
        print '\n"confusion_params" was not specified.' +\
            ' This model will not be evaluated...'
    
    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)
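
The stitching loop in this last version pastes each tile raster into a full-extent array at its computed offset, clipping tiles that run past the right or bottom edge. A standalone sketch of that paste-with-clipping step (the helper name is hypothetical):

import numpy as np

def paste_tile(ar, ar_tile, row_off, col_off):
    """Paste ar_tile into ar at (row_off, col_off), clipping at the edges."""
    ysize, xsize = ar.shape
    rows = min(ar_tile.shape[0], ysize - row_off)
    cols = min(ar_tile.shape[1], xsize - col_off)
    ar[row_off:row_off + rows, col_off:col_off + cols] = ar_tile[:rows, :cols]
    return ar

# mosaic_ar = np.full((100, 100), 255, dtype=np.uint8)
# paste_tile(mosaic_ar, np.zeros((40, 40), dtype=np.uint8), 80, 80)  # clipped to 20x20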