Example #1
import os

import numpy as np
import matplotlib.pyplot as plt
import fiona
from scipy import interpolate
from sklearn.metrics import roc_curve, roc_auc_score, auc
# pointsFromShapes is assumed to be provided by the same package's sampling module as this function


def stats(modelgrid, inventory, dx=100., Nsamp=None, method='nearest', extent='inventory', bins=None, runtests=True, showplots=True, saveplots=False, filepath=None):
    """
    Run through a suite of tests for models that output a probability or index that varies between 0 and 1
    :param modelgrid: Grid2D object of model results
    :param inventory: full file path to shapefile of inventory, must be in geographic coordinates, WGS84
    :type inventory: string
    :param dx: Approximate sample spacing in meters, overwritten if Nsamp is defined
    :type dx: float
    :param Nsamp: Total number of samples desired; will choose an optimal dx to get slightly more than this number of samples, delete samples outside the bounds, and randomly delete others until the sample number is exactly Nsamp
    :type Nsamp: integer
    :param method: interpolation method used to sample model values at the point locations - 'nearest', 'linear', or 'cubic'
    :type method: string
    :param extent: extent to include in sampling - 'inventory' or 'model' or custom bounds as tuple (xmin, ymin, xmax, ymax) - in lats and lons
    :param bins: bin edges to use for the various binning and threshold statistical calculations; if None, defaults to [0, 0.2, 0.4, 0.6, 0.8, 1.]
    :param runtests: if True, will run the various statistical tests; if False, will just output the sampled values
    :param showplots: if True, will display the plots
    :param saveplots: if True, will save the plots
    :param filepath: Filepath for saved plots, if None, will save in current directory. Files are named with test name and time stamp
    :returns yespoints: Nx2 array of geographic coordinates of positive sample locations
    :returns nopoints: Nx2 array of geographic coordinates of negative sample locations
    :returns modelvalyes: N model output values corresponding to yespoints
    :returns modelvalno: N model output values corresponding to nopoints
    :returns results: dictionary of results of statistical tests. Will be empty if runtests=False
                      {'Occ_nonocc': dict,
                         'SRC': dict,
                         'ROC': dict,
                         'AUC_ROC': float,
                         'Log_loss': float,
                         'GFC': dict,
                         'Pred_vs_Obs': dict,
                         'Brier': float,
                         'Brier_no': float,
                         'Brier_yes': float}
    :rtype results: dictionary
    """
    plt.close('all')
    f = fiona.open(inventory, 'r')
    shapes = list(f)
    bxmin, bymin, bxmax, bymax = f.bounds
    gdict = modelgrid.getGeoDict()

    if extent == 'model':
        extent = gdict.xmin, gdict.ymin, gdict.xmax, gdict.ymax
    elif extent == 'inventory':
        extent = bxmin, bymin, bxmax, bymax

    yespoints, nopoints, xvar, yvar, pshapes, proj = pointsFromShapes(shapes, extent, dx=dx, Nsamp=Nsamp)
    yesptx = [pt[0] for pt in yespoints]
    yespty = [pt[1] for pt in yespoints]
    noptx = [pt[0] for pt in nopoints]
    nopty = [pt[1] for pt in nopoints]
    # Get values of model at those points
    lons = np.linspace(gdict.xmin, gdict.xmax, gdict.nx)
    lats = np.linspace(gdict.ymax, gdict.ymin, gdict.ny)
    if method.lower() == 'nearest':
        modelvalyes = []
        modelvalno = []
        for XX, YY in zip(yesptx, yespty):
            row = (np.abs(lats - YY)).argmin()
            col = (np.abs(lons - XX)).argmin()
            modelvalyes.append(modelgrid.getData()[row, col])
        for XX, YY in zip(noptx, nopty):
            row = (np.abs(lats - YY)).argmin()
            col = (np.abs(lons - XX)).argmin()
            modelvalno.append(modelgrid.getData()[row, col])
    else:
        # scipy.interpolate.interp2d was removed in SciPy 1.14; use
        # RegularGridInterpolator (needs ascending coordinates, so flip the latitude axis)
        func = interpolate.RegularGridInterpolator(
            (lats[::-1], lons), modelgrid.getData()[::-1, :],
            method=method.lower(), bounds_error=False, fill_value=np.nan)
        modelvalyes = func(np.column_stack((yespty, yesptx)))
        modelvalno = func(np.column_stack((nopty, noptx)))

    modelvalyes = np.nan_to_num(np.array(modelvalyes))  # replace nan with zeros
    modelvalno = np.nan_to_num(np.array(modelvalno))  # replace nan with zeros

    # Now run the desired tests and make the desired plots
    results = {}

    if runtests is True:
        # Brier score
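        # Brier score = (1/N) * sum((p_i - o_i)^2), where p_i is the model value
        # at sample i and o_i is 1 for occurrence points and 0 for nonoccurrence
        # points; 0 is a perfect forecast and lower is better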
        N = len(yespoints) + len(nopoints)
        yessum = np.sum([(val-1)**2 for val in modelvalyes])
        nosum = np.sum([(val)**2 for val in modelvalno])
        results['Brier_yes'] = yessum/len(modelvalyes)
        results['Brier_no'] = nosum/len(modelvalno)
        results['Brier'] = (yessum + nosum)/N
        print('Brier scores: overall %0.3f\nBrier_yes score: %0.3f\nBrier_no score: %0.3f' % (results['Brier'], results['Brier_yes'], results['Brier_no']))

        # Logarithmic score
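        # Log loss = -(1/N) * sum(o_i*log(p_i) + (1 - o_i)*log(1 - p_i)), where
        # o_i is 1 for occurrence points and 0 otherwise; exact 0 and 1 model
        # values are nudged below so the logarithms stay finite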
        tempno = np.array(modelvalno).copy()
        tempyes = np.array(modelvalyes).copy()
        tempno[tempno == 0] = 1.e-15
        tempyes[tempyes == 0] = 1.e-15
        tempno[tempno == 1] = 1. - 1.e-15  # avoid log(0) when a nonoccurrence point has a model value of exactly 1
        results['Log_loss'] = -(np.sum(np.log(tempyes)) + np.sum(np.log(1.-tempno)))/N
        print('Log loss score: %0.3f' % (results['Log_loss'],))

        if bins is None:
            bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]
        binvec = []
        observed = []
        percyes = []
        percno = []
        overall_tot = len(modelvalyes) + len(modelvalno)
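        # For each bin of predicted values: 'observed' is the fraction of samples
        # in that bin that are occurrence points, while 'percyes'/'percno' are the
        # occurrence/nonoccurrence counts in that bin as a percentage of all samples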
        for i in range(len(bins[:-1])):
            binvec.append(bins[i]+(bins[i+1]-bins[i])/2)
            yestot = np.sum([(modelvalyes > bins[i]) & (modelvalyes < bins[i+1])])
            notot = np.sum([(modelvalno > bins[i]) & (modelvalno < bins[i+1])])
            if notot+yestot != 0:
                observed.append(float(yestot)/(yestot+notot))
            else:
                observed.append(float('nan'))
            percyes.append((yestot/float(overall_tot))*100.)
            percno.append((notot/float(overall_tot))*100.)

        plt.ioff()

        # Predicted vs. Observed ratios
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(binvec, observed, '-o')
        ax.plot([0]+binvec, [0]+binvec, '--', color='gray')
        ax.set_xlabel('Expected ratio')
        ax.set_ylabel('Observed ratio')
        ax.set_xlim([bins[0], bins[-1]])
        ax.set_title('Predicted vs. Observed')
        results['Pred_vs_Obs'] = {'binvec': binvec, 'observed': observed}

        # Ground failure occurrence/nonoccurrence
        fig1 = plt.figure()
        ax1 = fig1.add_subplot(111)
        wid = (bins[1]-bins[0])/2.5
        rects1 = ax1.bar(np.array(bins[:-1]), percyes, width=wid)
        rects2 = ax1.bar(np.array(bins[:-1])+wid, percno, width=wid, color='r')
        ax1.set_xlabel('Predicted susceptibility range')
        ax1.set_ylabel('% of samples')
        ax1.legend((rects1[0], rects2[0]), ('Occurrence', 'Nonoccurrence'))
        ax1.set_title('Occurrence vs. Nonoccurrence')
        results['Occ_nonocc'] = {'bins': bins, 'percyes': percyes, 'percno': percno}

        # Ground failure capture for various thresholds
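        # GFC at a given threshold = fraction of occurrence points whose model
        # value exceeds that threshold, i.e. how much of the inventory a map
        # thresholded at that value would capture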
        gfc = []
        for val in bins:
            gfc.append(np.sum([modelvalyes > val])/float(len(yespoints)))
        fig2 = plt.figure()
        ax2 = fig2.add_subplot(111)
        ax2.plot(bins, gfc, 'o-')
        ax2.set_xlabel('Threshold')
        ax2.set_ylabel(r'%GFC')
        ax2.set_title('Ground Failure Capture')
        results['GFC'] = {'thresholds': bins, 'gfc': gfc}

        # ROC curves
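        # Pool the samples (occurrence = 1, nonoccurrence = 0) and trace the true
        # positive rate against the false positive rate as the decision threshold
        # on the model value varies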
        fpr, tpr, thresholds = roc_curve(np.concatenate((np.ones(len(yespoints)), np.zeros(len(nopoints)))), np.concatenate((modelvalyes, modelvalno)))
        fig3 = plt.figure()
        ax3 = fig3.add_subplot(111)
        ax3.plot(fpr, tpr)
        ax3.set_xlabel('False positive rate')
        ax3.set_ylabel('True positive rate')
        ax3.set_xlim([0, 1.])
        ax3.set_ylim([0, 1.])
        ax3.plot(fpr, fpr, '--', color='gray')
        ax3.set_title('ROC curve')
        results['ROC'] = {'thresholds': thresholds, 'fpr': fpr, 'tpr': tpr}

        results['AUC_ROC'] = roc_auc_score(np.concatenate((np.ones(len(yespoints)), np.zeros(len(nopoints)))), np.concatenate((modelvalyes, modelvalno)))
        print('AUC_ROC: %0.3f' % (results['AUC_ROC'],))
        ax3.text(0.8, 0.2, 'AUC: %0.3f' % results['AUC_ROC'])

        # Success rate curves
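        # For each threshold in sucbin, compute the proportion of occurrence
        # points whose model value falls below that threshold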
        sucbin = np.linspace(0, 1., 100)
        prop = []
        realvals = np.concatenate((np.ones(len(yespoints)), np.zeros(len(nopoints))))
        predvals = np.concatenate((modelvalyes, modelvalno))
        indx = np.argsort(predvals)
        predvals = predvals[indx]
        realvals = realvals[indx]
        for val in sucbin:
            prop.append(np.sum(realvals[predvals < val])/len(yespoints))
        fig4 = plt.figure()
        ax4 = fig4.add_subplot(111)
        ax4.plot(sucbin, prop)
        ax4.set_xlabel('Threshold')
        ax4.set_ylabel('Proportion of actual occurrences')
        ax4.set_title('Success Rate Curve')
        AUC = auc(sucbin, prop)
        print('AUC_SRC: %0.3f' % AUC)
        ax4.text(0.8, 0.2, 'AUC: %0.3f' % AUC)
        ax4.set_xlim([0, 1.])
        ax4.set_ylim([0, 1.])
        results['SRC'] = {'xvals': sucbin, 'proportion': prop, 'auc': AUC}

        if showplots is True:
            plt.show()
        if saveplots is True:
            if filepath is None:
                filepath = os.getcwd()
            import datetime
            time1 = datetime.datetime.utcnow().strftime('%d%b%Y_%H%M')
            fig.savefig(os.path.join(filepath, 'Pred_vs_obs_%s.pdf' % (time1,)))
            fig1.savefig(os.path.join(filepath, 'Occ_nonocc_%s.pdf' % (time1,)))
            fig2.savefig(os.path.join(filepath, 'GFC_%s.pdf' % (time1,)))
            fig3.savefig(os.path.join(filepath, 'ROC_%s.pdf' % (time1,)))
            fig4.savefig(os.path.join(filepath, 'SRC_%s.pdf' % (time1,)))

    return yespoints, nopoints, modelvalyes, modelvalno, results
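

# A minimal usage sketch (not from the original source). It assumes the model
# raster can be loaded with mapio's GDALGrid, and 'model_probability.tif' and
# 'inventory_polygons.shp' are placeholder paths for a model grid and a WGS84
# landslide inventory shapefile.
if __name__ == '__main__':
    from mapio.gdal import GDALGrid
    modelgrid = GDALGrid.load('model_probability.tif')  # hypothetical model raster
    yespoints, nopoints, modelvalyes, modelvalno, results = stats(
        modelgrid, 'inventory_polygons.shp', dx=200., method='nearest',
        extent='inventory', showplots=False, saveplots=True, filepath='.')
    print(results['AUC_ROC'], results['Brier'], results['Log_loss'])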
Example #2
import fiona
import numpy as np
import matplotlib.path as mplPath
from mapio.gdal import GDALGrid
# pointsFromShapes is assumed to be provided by the same package's sampling module as this function


def computeCoverage_accurate(gdict, inventory, numdiv=10.):
    """
    VERY SLOW!!
    Slow but more accurate method to produce a grid of the area actually affected by landsliding in each cell defined by the geodict
    :param gdict: geodict, likely taken from the model that the inventory will be compared against
    :param inventory: full file path to shapefile of inventory, must be in geographic coordinates, WGS84
    :type inventory: string
    :param numdiv: Approximate amount to subdivide each cell of the geodict by to compute areas (a higher number is slower but more accurate)

    :return inventorygrid: Grid2D object reporting areal coverage of landsliding inside each cell defined by geodict
    """

    f = fiona.open(inventory, 'r')
    shapes = list(f)
    bxmin, bymin, bxmax, bymax = f.bounds

    lons = np.linspace(gdict.xmin, gdict.xmax, gdict.nx)
    lats = np.linspace(gdict.ymax, gdict.ymin, gdict.ny)
    llons, llats = np.meshgrid(lons, lats)

    spacing = np.round(np.abs(((lats[1]-lats[0])*111.12*1000.)/numdiv))  # in meters
    yespoints, nopoints, xvar, yvar, pshapes, proj = pointsFromShapes(shapes, bounds=(gdict.xmin, gdict.ymin, gdict.xmax, gdict.ymax), dx=spacing)

    # Loop over lat lon pairs that are within boundaries of yes and no points
    ptlonmax = np.max((yespoints[:, 0].max(), nopoints[:, 0].max()))
    ptlonmin = np.min((yespoints[:, 0].min(), nopoints[:, 0].min()))
    ptlatmax = np.max((yespoints[:, 1].max(), nopoints[:, 1].max()))
    ptlatmin = np.min((yespoints[:, 1].min(), nopoints[:, 1].min()))

    subllons = llons[(llons >= ptlonmin) & (llons <= ptlonmax) & (llats >= ptlatmin) & (llats <= ptlatmax)]
    subllats = llats[(llons >= ptlonmin) & (llons <= ptlonmax) & (llats >= ptlatmin) & (llats <= ptlatmax)]

    import time
    # Contains points method
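    # For each grid cell centered on (lon1, lat1), count how many sampled points
    # fall inside the cell and how many of those are occurrence points; their
    # ratio (computed after the loop) approximates the landslide coverage of the cell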
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    dx = gdict.dx
    area = np.zeros(np.shape(llons))
    numpts = area.copy()
    numyes = area.copy()
    for lat1, lon1 in zip(subllats, subllons):
        # Find ratio of yes points to no points
        bbPath = mplPath.Path(np.array([[lon1-0.5*dx, lat1-0.5*dx], [lon1-0.5*dx, lat1+0.5*dx], [lon1+0.5*dx, lat1+0.5*dx], [lon1+0.5*dx, lat1-0.5*dx]]))
        yesin = sum(bbPath.contains_points(yespoints))
        noin = sum(bbPath.contains_points(nopoints))
        total = yesin + noin
        if total == 0.:
            continue
        # get indices
        row = np.where(lats == lat1)
        col = np.where(lons == lon1)
        # Store total number of points in matrix
        numpts[row, col] = total
        # Store number of occurrence (yes) points
        numyes[row, col] = yesin
    t2 = time.perf_counter()
    print('Time elapsed %0.2f seconds' % (t2-t1))

    # Correct for incompletely sampled squares (all unsampled points would be no points)
    numpts[numpts < (numpts[numpts != 0].mean() - numpts[numpts != 0].std())] = np.median(numpts[numpts != 0])  # Will change zeros to nonzeros, but yeses will be 0 in those cells so it doesn't matter
    area = numyes/numpts

    inventorygrid = GDALGrid(area, gdict)

    return inventorygrid
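

# A minimal usage sketch (not from the original source). It assumes the geodict
# comes from a model grid loaded with mapio's GDALGrid, and the file paths are
# placeholders for a model raster and a WGS84 landslide inventory shapefile.
if __name__ == '__main__':
    modelgrid = GDALGrid.load('model_probability.tif')  # hypothetical model raster
    inventorygrid = computeCoverage_accurate(modelgrid.getGeoDict(),
                                             'inventory_polygons.shp', numdiv=10.)
    coverage = inventorygrid.getData()  # fractional landslide coverage per cell
    print(coverage.max(), coverage.mean())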