Example #1
File: WRTDS.py Project: fkwai/geolearn
def testWRTDS(dataName, trainSet, testSet, codeLst):
    DF = dbBasin.DataFrameBasin(dataName)
    # Calculate WRTDS from train and test set
    varX = ['00060']
    varY = codeLst
    d1 = dbBasin.DataModelBasin(DF, subset=trainSet, varX=varX, varY=varY)
    d2 = dbBasin.DataModelBasin(DF, subset=testSet, varX=varX, varY=varY)
    tt1 = pd.to_datetime(d1.t)
    yr1 = tt1.year.values
    t1 = yr1 + tt1.dayofyear.values / 365
    sinT1 = np.sin(2 * np.pi * t1)
    cosT1 = np.cos(2 * np.pi * t1)
    tt2 = pd.to_datetime(d2.t)
    yr2 = tt2.year.values
    t2 = yr2 + tt2.dayofyear.values / 365
    sinT2 = np.sin(2 * np.pi * t2)
    cosT2 = np.cos(2 * np.pi * t2)
    ###
    yOut = np.full([len(d2.t), len(d2.siteNoLst), len(varY)], np.nan)
    t0 = time.time()
    for indS, siteNo in enumerate(d2.siteNoLst):
        for indC, code in enumerate(varY):
            print('{} {} {} {}'.format(indS, siteNo, code, time.time() - t0))
            y1 = d1.Y[:, indS, indC].copy()
            q1 = d1.X[:, indS, 0].copy()
            q1[q1 < 0] = 0
            logq1 = np.log(q1 + sn)
            x1 = np.stack([logq1, yr1, sinT1, cosT1]).T
            y2 = d2.Y[:, indS, indC].copy()
            q2 = d2.X[:, indS, 0].copy()
            q2[q2 < 0] = 0
            logq2 = np.log(q2 + sn)
            x2 = np.stack([logq2, yr2, sinT2, cosT2]).T
            [xx1, yy1], ind1 = utils.rmNan([x1, y1])
            if testSet == 'all':
                [xx2], ind2 = utils.rmNan([x2])
            else:
                [xx2, yy2], ind2 = utils.rmNan([x2, y2])
            if len(ind1) < 40:
                continue
            for k in ind2:
                dY = np.abs(t2[k] - t1[ind1])
                dQ = np.abs(logq2[k] - logq1[ind1])
                dS = np.min(np.stack(
                    [abs(np.ceil(dY) - dY),
                     abs(dY - np.floor(dY))]),
                            axis=0)
                d = np.stack([dY, dQ, dS])
                ww, ind = calWeight(d)
                model = sm.WLS(yy1[ind], xx1[ind], weights=ww).fit()
                yp = model.predict(x2[k, :])[0]
                yOut[k, indS, indC] = yp
    return yOut
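Note: every example in this listing relies on utils.rmNan, which is not shown. Judging from the call patterns ([xx1, yy1], ind1 = utils.rmNan([x1, y1]) above, and utils.rmNan([...], returnInd=False) in later examples), it drops the positions where any input array contains NaN and returns the surviving values together with their indices. A minimal sketch under that assumption (not the project's actual implementation):

import numpy as np


def rmNan(arrLst, returnInd=True):
    # Keep only the positions where every array in arrLst is free of NaN.
    maskLst = []
    for arr in arrLst:
        a = np.asarray(arr, dtype=float)
        # 1-D input: drop NaN entries; 2-D input: drop rows containing any NaN
        mask = ~np.isnan(a) if a.ndim == 1 else ~np.isnan(a).any(axis=1)
        maskLst.append(mask)
    ind = np.where(np.all(maskLst, axis=0))[0]
    out = [np.asarray(arr)[ind] for arr in arrLst]
    return (out, ind) if returnInd else out

calWeight is not shown either. In standard WRTDS, each training sample's weight is a product of tricube kernels over the time, log-discharge, and seasonal distances stacked in d, with half-window widths of roughly 7 years, 2 natural-log units, and 0.5 year (window widening for sparse sites is omitted here). A hypothetical stand-in consistent with how ww and ind feed sm.WLS above:

def calWeight(d, h=(7.0, 2.0, 0.5)):
    # d: 3 x n array of |dY|, |dQ|, |dS| distances; h: assumed half-window widths
    w = np.ones(d.shape[1])
    for k in range(3):
        dd = d[k, :] / h[k]
        w = w * np.where(dd < 1, (1 - dd ** 3) ** 3, 0)
    ind = np.where(w > 0)[0]  # only samples with nonzero weight enter the regression
    return w[ind], ind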
Example #2
 def errBySiteC(self, ycP, varC, subset=None, rmExt=False):
     if type(varC) is not list:
         varC = [varC]
     obsLst = self.extractSubset(subset=subset)
     ycT = obsLst[3]
     indC = [self.varC.index(var) for var in varC]
     info = self.info.loc[self.subset[subset].tolist()].reset_index()
     siteNoLst = self.info.siteNo.unique()
     statMat = np.full([len(siteNoLst), len(indC), 3], np.nan)
     for i, siteNo in enumerate(siteNoLst):
         indS = info[info['siteNo'] == siteNo].index.values
         for k, iC in enumerate(indC):
             a = ycT[indS, iC]
             b = ycP[indS, k]
             if rmExt is True and len(a) != 0:
                 aV = a[a < np.nanpercentile(a, 95)]
                 aV = aV[aV > np.nanpercentile(a, 5)]
                 ul = np.mean(aV) + np.std(aV) * 5
                 a[a > ul] = np.nan
             # indV = np.where(~np.isnan(a))
             if len(indS) > 0:
                 _, indV = utils.rmNan([a, b])
                 rmse = np.sqrt(np.nanmean((a[indV] - b[indV])**2))
                 corr = np.corrcoef(a[indV], b[indV])[0, 1]
                 # nse = 1-np.nansum((b-a)**2)/np.nansum((a-np.nanmean(a))**2)
                 # nse = np.nanmean(b)/np.nanmean(a)-1
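                 # note: despite the name, this is mean absolute relative error, not Nash-Sutcliffe efficiency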
                 nse = np.nanmean(np.abs((b - a) / a))
                 statMat[i, k, 0] = rmse
                 statMat[i, k, 1] = corr
                 statMat[i, k, 2] = nse
     return statMat
Example #3
def funcPoint(iP, axP):
    siteNo = siteNoLst[iP]
    dfPred, dfObs = basins.loadSeq(outName, siteNo)
    t = dfPred['date'].values.astype(np.datetime64)
    tBar = np.datetime64('2000-01-01')
    # linear model
    ind1 = infoTrain[infoTrain['siteNo'] == siteNo].index
    [x1, y1, yc1], _ = utils.rmNan([xL1[ind1, :], yL1[ind1, :], ycL1[ind1, :]])
    modelY = LinearRegression().fit(x1, y1)
    modelYC = LinearRegression().fit(x1, yc1)
    sd = np.datetime64('1979-01-01')
    ed = np.datetime64('2020-01-01')
    dfX = waterQuality.readSiteX(siteNo, sd, ed, varX)
    x2 = transform.transInAll(dfX.values, mtdX, statLst=statX)
    y2 = modelY.predict(x2)
    yc2 = modelYC.predict(x2)
    yp = wqData.transOut(y2, statY, varY)
    ycp = wqData.transOut(yc2, statYC, varYC)
    code = codeLst[0]
    axplot.plotTS(axP[0],
                  t, [dfPred['00060'], yp, dfObs['00060']],
                  tBar=tBar,
                  legLst=['lstm', 'lr', 'obs'],
                  styLst='---',
                  cLst='bgr')
    axplot.plotTS(axP[1],
                  t, [dfPred[code], ycp, dfObs[code]],
                  tBar=tBar,
                  legLst=['lstm', 'lr', 'obs'],
                  styLst='--*',
                  cLst='bgr')
Example #4
def dictErr(dictLSTM, dictWRTDS, dictObs, codeLst):
    # calculate correlation
    tt = np.datetime64('2010-01-01')
    t0 = np.datetime64('1980-01-01')
    siteNoLst = list(dictObs.keys())
    # codeLst = dictObs[siteNoLst[0]].columns.tolist()
    t = dictObs[siteNoLst[0]].index.values
    ind1 = np.where((t < tt) & (t >= t0))[0]
    ind2 = np.where(t >= tt)[0]
    corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
    rmseMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
    for ic, code in enumerate(codeLst):
        for siteNo in siteNoLst:
            indS = siteNoLst.index(siteNo)
            v1 = dictLSTM[siteNo][code].iloc[ind2].values
            v2 = dictWRTDS[siteNo][code].iloc[ind2].values
            v3 = dictObs[siteNo][code].iloc[ind2].values
            dfQ1 = dictObs[siteNo][['00060', code]].iloc[ind1].dropna()
            (vv1, vv2, vv3), indV = utils.rmNan([v1, v2, v3])
            if (len(indV) < 50) or (len(dfQ1) < 50):
                # print(code, siteNo)
                pass
            else:
                rmse1, corr1 = utils.stat.calErr(vv1, vv2)
                rmse2, corr2 = utils.stat.calErr(vv1, vv3)
                rmse3, corr3 = utils.stat.calErr(vv2, vv3)
                corrMat[indS, ic, 0] = corr1
                corrMat[indS, ic, 1] = corr2
                corrMat[indS, ic, 2] = corr3
                rmseMat[indS, ic, 0] = rmse1
                rmseMat[indS, ic, 1] = rmse2
                rmseMat[indS, ic, 2] = rmse3
    return corrMat, rmseMat
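A hypothetical call with toy inputs, to show the expected shapes: each dictionary maps a site number to a DataFrame indexed by date, and dictObs additionally needs a '00060' (streamflow) column. The site numbers, parameter code, and noise levels below are made up, and utils.stat.calErr is assumed to be importable from the project.

import numpy as np
import pandas as pd

t = pd.date_range('1982-01-01', '2018-12-31', freq='W')
codeLst = ['00915']
siteNoLst = ['01000000', '02000000']
rng = np.random.default_rng(0)
dictObs, dictLSTM, dictWRTDS = dict(), dict(), dict()
for siteNo in siteNoLst:
    obs = pd.DataFrame({'00060': rng.random(len(t)),
                        '00915': rng.random(len(t))}, index=t)
    dictObs[siteNo] = obs
    dictLSTM[siteNo] = obs[['00915']] + rng.normal(0, 0.1, (len(t), 1))
    dictWRTDS[siteNo] = obs[['00915']] + rng.normal(0, 0.2, (len(t), 1))
corrMat, rmseMat = dictErr(dictLSTM, dictWRTDS, dictObs, codeLst)  # [site, code, pair]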
Example #5
def kateModel(q, c, x=None):
    (q, c), ind = utils.rmNan([q, c])
    popt, pcov = curve_fit(func, q, c, bounds=[(0, 0), (np.inf, 100)])
    ceq = popt[0]
    dw = popt[1]
    if x is None:
        out = None
    else:
        out = ceq / (1 + x / dw)
    return ceq, dw, out
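func is defined elsewhere in the project; from the way the fitted parameters are applied here (out = ceq / (1 + x / dw)) and in Example #11 (y0 = ceq0 / (x / dw0 + 1)), it presumably has the hyperbolic dilution form below. A sketch under that assumption:

import numpy as np
from scipy.optimize import curve_fit


def func(q, ceq, dw):
    # assumed form: equilibrium concentration ceq diluted as discharge q grows past dw
    return ceq / (1 + q / dw)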
Example #6
def trainLR(dfXT, dfYT, dfXN, dfYN):
    [xx, yy], iv = utils.rmNan([dfXT.values, dfYT.values])
    if len(iv) > 0:
        modelYC = LinearRegression().fit(xx, yy)
        yp = modelYC.predict(dfXN.values)
        dfPN = pd.DataFrame(data=yp, index=dfYN.index, columns=dfYN.columns)
    else:
        dfPN = pd.DataFrame(index=dfYN.index,
                            columns=dfYN.columns,
                            data=np.nan)
    return dfPN
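A hypothetical call with toy weekly DataFrames (column names and values are made up); trainLR fits on the first pair and returns predictions aligned to the index and columns of dfYN:

import numpy as np
import pandas as pd

t = pd.date_range('2000-01-01', periods=52, freq='W')
dfXT = pd.DataFrame({'logQ': np.random.rand(52),
                     'sinT': np.sin(2 * np.pi * np.arange(52) / 52)}, index=t)
dfYT = pd.DataFrame({'00915': 2 * dfXT['logQ'] + 0.1 * np.random.rand(52)}, index=t)
dfPN = trainLR(dfXT, dfYT, dfXT, dfYT)  # predict back over the training period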
Example #7
def funcPoint(iP, axes):
    kA = 0
    siteNo = siteNoLst[iP]
    startDate = pd.Timestamp(1979, 1, 1)  # pd.datetime is deprecated/removed in recent pandas
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)

    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'
              ] + [usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)

    # plot C-Q
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')

    # plot fractual
    for k in range(nc):
        code = codeLst[k]
        dfV = dfData[dfData[code].notna()]
        nt = len(dfData)
        x = dfV.index.values.astype('datetime64[D]').astype(float)  # lombscargle expects numeric sample times (days)
        y = dfV[code].values
        freq = 2 * np.pi / np.linspace(2, nt, nt)
        power = signal.lombscargle(x, y, freq)
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(freq / (2 * np.pi)), np.log(power), '-*')  # frequency in cycles per day
        fyr = 2 * np.pi / 365
        pyr = signal.lombscargle(x, y, [fyr])
        ax.plot(np.log(fyr / (2 * np.pi)), np.log(pyr), 'r*')
Example #8
def funcPoint(iP, axP):
    siteNo = siteNoLst[iP]
    dfO = waterQuality.readSiteTS(siteNo, [code], freq='W')[code]
    t = dfO.index
    file1 = os.path.join(dirRoot1, 'output', siteNo)
    file2 = os.path.join(dirRoot2, 'output', siteNo)
    dfP1 = pd.read_csv(file1, index_col='date')[code]
    dfP2 = pd.read_csv(file2, index_col='date')[code]
    v = [dfP1.values, dfP2.values, dfO.values]
    [v1, v2, o], iv = utils.rmNan([dfP1.values, dfP2.values, dfO.values])
    tt = t[iv]
    styLst = [['-*'] for x in range(3)]
    axplot.plotTS(axP, tt.values, [v1, v2, o], cLst='rbk')
    # print corr
    rmse1, corr1 = utils.stat.calErr(v[0], v[-1])
    rmse2, corr2 = utils.stat.calErr(v[1], v[-1])
    axP.set_title('site {} WRTDS {:.2f} only T {:.2f}'.format(
        siteNo, corr1, corr2))
Example #9
File: basins.py Project: sadeghst/geolearn
def modelLinear(outName, testset, trainset=None, wqData=None):
    master = loadMaster(outName)
    dataName = master['dataName']
    if wqData is None:
        wqData = waterQuality.DataModelWQ(dataName)
    if trainset is None:
        trainset = master['trainName']
    infoTrain = wqData.info.iloc[wqData.subset[trainset]].reset_index()
    infoTest = wqData.info.iloc[wqData.subset[testset]].reset_index()

    # linear reg data
    statTup = loadStat(outName)
    varTup = (master['varX'], master['varXC'], master['varY'], master['varYC'])
    dataTup1 = wqData.transIn(subset=trainset, varTup=varTup, statTup=statTup)
    dataTup2 = wqData.transIn(subset=testset, varTup=varTup, statTup=statTup)
    dataTup1 = trainTS.dealNaN(dataTup1, master['optNaN'])
    dataTup2 = trainTS.dealNaN(dataTup2, master['optNaN'])
    varYC = varTup[3]
    statYC = statTup[3]
    x1 = dataTup1[0][-1, :, :]
    yc1 = dataTup1[3]
    x2 = dataTup2[0][-1, :, :]

    # point test l2 - linear
    nc = len(varYC)
    matP1 = np.full([len(infoTrain), nc], np.nan)
    matP2 = np.full([len(infoTest), nc], np.nan)
    siteNoLst = infoTest['siteNo'].unique().tolist()
    for siteNo in siteNoLst:
        ind1 = infoTrain[infoTrain['siteNo'] == siteNo].index
        ind2 = infoTest[infoTest['siteNo'] == siteNo].index
        xT1 = x1[ind1, :]
        ycT1 = yc1[ind1, :]
        for ic in range(nc):
            [xx, yy], iv = utils.rmNan([xT1, ycT1[:, ic]])
            if len(iv) > 0:
                modelYC = LinearRegression().fit(xx, yy)
                matP1[ind1, ic] = modelYC.predict(xT1)
                if len(ind2) > 0:
                    xT2 = x2[ind2, :]
                    matP2[ind2, ic] = modelYC.predict(xT2)  # test-set predictions belong in matP2
    matO1 = wqData.transOut(matP1, statYC, varYC)
    matO2 = wqData.transOut(matP2, statYC, varYC)
    return matO1, matO2
Example #10
def tsYr(t, y, cLst='rbkgcmy', figsize=(12, 4), showCorr=False):
    y = y if type(y) is list else [y]
    yrAll = pd.to_datetime(t).year
    yrLst = yrAll.unique().tolist()
    ny = len(yrLst)
    fig, axes = plt.subplots(ncols=ny, sharey=True, figsize=figsize)
    fig.subplots_adjust(wspace=0)
    for iYr, yr in enumerate(yrLst):
        ind = np.where(yrAll == yr)[0]
        _ = axplot.plotTS(axes[iYr], t[ind], [v[ind] for v in y], cLst=cLst)
        _ = axes[iYr].set_xlim(np.datetime64(str(yr)),
                               np.datetime64(str(yr + 1)))
        _ = axes[iYr].set_xticks([])
        corr = np.corrcoef(utils.rmNan([v[ind] for v in y],
                                       returnInd=False))[0, 1]
        if showCorr is True:
            _ = axes[iYr].set_xlabel('{}\n{:.2f}'.format(yr, corr))
        else:
            _ = axes[iYr].set_xlabel('{}'.format(yr))
    return fig
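A minimal call with synthetic data (assumes the plotting helpers used above are importable); tsYr draws one panel per calendar year and, with showCorr=True, labels each panel with the correlation between the first two series:

import numpy as np
import pandas as pd

t = pd.date_range('2015-01-01', '2017-12-31', freq='D').values
y1 = np.sin(2 * np.pi * np.arange(len(t)) / 365)
y2 = y1 + 0.2 * np.random.randn(len(t))
fig = tsYr(t, [y1, y2], cLst='rb', showCorr=True)
fig.show()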
Example #11
def funcPoint(iP, axP):
    siteNo = siteNoHBN[iP]
    dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    dfQ = usgs.readStreamflow(siteNo)
    df = dfC.join(dfQ)
    t = df.index.values
    q = df['00060_00003'].values / area * unitConv
    c = df[code].values
    [q, c], ind = utils.rmNan([q, c])
    t = t[ind]
    qAll = dfQ['00060_00003'].values
    qT = dfQ.index.values
    axplot.plotTS(axP[0], qT, qAll, cLst='b', styLst='--')
    axplot.plotTS(axP[1], t, c)
    axP[2].plot(np.log(q), c, 'k*')
    x = 10**np.linspace(np.log10(np.min(q[q > 0])),
                        np.log10(np.max(q[~np.isnan(q)])), 20)
    ceq0 = pMat2[iP, 0]
    dw0 = pMat2[iP, 1]
    y0 = ceq0 * 1 / (x / dw0 + 1)
    axP[2].plot(np.log(x), y0, 'r-')
    axP[2].set_title('ceq={:.3f},dw={:.3f}'.format(ceq0, dw0))
Example #12
def plotTS(ax,
           t,
           y,
           *,
           styLst=None,
           tBar=None,
           cLst='krbgcmy',
           legLst=None,
           sd=None,
           **kw):
    y = y if type(y) is list else [y]
    if sd is not None:
        ind = np.where(t >= sd)[0]
        t = t[ind]
        for k in range(len(y)):
            y[k] = y[k][ind]
    for k in range(len(y)):
        yy = y[k]
        # find out continuous / distinct
        if styLst is None:
            [_, _], ind = utils.rmNan([t, yy])
            r = len(ind) / (ind[-1] - ind[0]) if len(ind) > 0 else 0
            sty = '-' if r > 0.9 else '*'
        else:
            sty = styLst[k]
        legStr = None if legLst is None else legLst[k]
        ax.plot(t, yy, sty, color=cLst[k], label=legStr, **kw)
    if tBar is not None:
        ylim = ax.get_ylim()
        tBar = [tBar] if type(tBar) is not list else tBar
        for tt in tBar:
            ax.plot([tt, tt], ylim, '-k')
    if legLst is not None:
        # ax.legend(loc='upper right', frameon=False)
        ax.legend(loc='upper right')
    ax.xaxis_date()
    return ax
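A minimal call with synthetic data; styLst and cLst are indexed per series, so a two-character string covers two curves, and tBar draws a vertical marker (e.g. a train/test split date):

import numpy as np
import matplotlib.pyplot as plt

t = np.arange('2000-01-01', '2001-01-01', dtype='datetime64[D]')
obs = np.sin(2 * np.pi * np.arange(len(t)) / 365)
pred = obs + 0.1 * np.random.randn(len(t))
fig, ax = plt.subplots(1, 1, figsize=(8, 3))
plotTS(ax, t, [obs, pred], styLst='-*', cLst='kr',
       legLst=['obs', 'pred'], tBar=np.datetime64('2000-07-01'))
fig.show()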
Example #13
def funcPoint(iP, axes):
    kA = 0
    siteNo = siteNoLst[iP]
    startDate = pd.Timestamp(1979, 1, 1)  # pd.datetime is deprecated/removed in recent pandas
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)

    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'
              ] + [usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    ax.set_title(siteNo)

    # plot C-Q
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ceq, dw, y = wqRela.kateModel(q, c, q)
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
        ax.plot(np.log(q), np.log(y), 'b*')
Example #14
# training / testing
yr = df.index.year.values
ind1 = np.where(yr <= 2016)[0]
ind2 = np.where(yr > 2016)[0]
dfYP = pd.DataFrame(index=df.index, columns=['WRTDS', 'LSTM'])

# WRTDS
dfX = pd.DataFrame({'date': df.index}).set_index('date')
dfX = dfX.join(np.log(df['00060']+sn)).rename(
    columns={'00060': 'logQ'})
t = yr+dfX.index.dayofyear.values/365
dfX['sinT'] = np.sin(2*np.pi*t)
dfX['cosT'] = np.cos(2*np.pi*t)
x = dfX.iloc[ind1].values
y = df.iloc[ind1][code].values
[xx, yy], iv = utils.rmNan([x, y])
lrModel = LinearRegression()
lrModel = lrModel.fit(xx, yy)
b = dfX.isna().any(axis=1)
yp = lrModel.predict(dfX[~b].values)
dfYP.at[dfYP[~b].index, 'WRTDS'] = yp

# LSTM
varC = [code]
rho = 52
dfX = pd.DataFrame({'date': df.index}).set_index('date')
dfX = dfX.join(np.log(df['00060']+sn)).rename(
    columns={'00060': 'logQ'})
t = yr+dfX.index.dayofyear.values/365
dfX['sinT'] = np.sin(2*np.pi*t)
dfX['cosT'] = np.cos(2*np.pi*t)
Example #15
                intMatC[k, j, 0] = len(tC)
                intMatC[k, j, 1] = np.percentile(dd, 25)
                intMatC[k, j, 2] = np.percentile(dd, 50)
                intMatC[k, j, 3] = np.percentile(dd, 75)


# calculate LombScargle
if False:
    pMat = np.full([len(siteNoLst), len(codeLst)], np.nan)
    for ic, code in enumerate(codeLst):
        for siteNo in dictSite[code]:
            indS = siteNoLst.index(siteNo)
            df = dictObs[siteNo]
            t = np.arange(len(df))*7
            y = df[code]
            tt, yy = utils.rmNan([t, y], returnInd=False)
            p = LombScargle(tt, yy).power(1/365)
            pMat[indS, ic] = p


# plot 121
# plt.close('all')
# codeLst2 = ['00095', '00400', '00405', '00600', '00605',
#             '00618', '00660', '00665', '00681', '00915',
#             '00925', '00930', '00935', '00940', '00945',
#             '00950', '00955', '70303', '71846', '80154']
# nfy, nfx = [5, 4]


nfy, nfx = [3, 2]
# codeLst2 = ['00010', '00300']
Example #16
                dd = dt.astype('timedelta64[D]').astype(int)
                intMatC[k, j, 0] = len(tC)
                intMatC[k, j, 1] = np.percentile(dd, 25)
                intMatC[k, j, 2] = np.percentile(dd, 50)
                intMatC[k, j, 3] = np.percentile(dd, 75)

# calculate LombScargle
if True:
    pMat = np.full([len(siteNoLst), len(codeLst)], np.nan)
    for ic, code in enumerate(codeLst):
        for siteNo in dictSite[code]:
            indS = siteNoLst.index(siteNo)
            df = dictObs[siteNo]
            t = np.arange(len(df)) * 7
            y = df[code]
            tt, yy = utils.rmNan([t, y], returnInd=False)
            p = LombScargle(tt, yy).power(1 / 365)
            pMat[indS, ic] = p

# calculate linear CQ relationship
if True:
    rMat = np.full([len(siteNoLst), len(codeLst)], np.nan)
    for ic, code in enumerate(codeLst):
        for siteNo in dictSite[code]:
            indS = siteNoLst.index(siteNo)
            q = dictObs[siteNo]['00060'].values
            c = dictObs[siteNo][code].values
            qq, cc = utils.rmNan([q, c], returnInd=False)
            corr = np.corrcoef(np.log(qq + 1), cc)[1, 0]
            rMat[indS, ic] = corr**2
Example #17
    # calculate correlation
    tt = np.datetime64('2010-01-01')
    t0 = np.datetime64('1980-01-01')
    indT1 = np.where((df.index.values < tt) & (df.index.values >= t0))[0]
    indT2 = np.where(df.index.values >= tt)[0]
    dictLSTM = dictLSTMLst[0]
    corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
    rmseMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
    for ic, code in enumerate(codeLst):
        for siteNo in dictSite[code]:
            indS = siteNoLst.index(siteNo)
            v1 = dictLSTM[siteNo][code].iloc[indT2].values
            v2 = dictWRTDS[siteNo][code].iloc[indT2].values
            v3 = dictObs[siteNo][code].iloc[indT2].values
            vv1, vv2, vv3 = utils.rmNan([v1, v2, v3], returnInd=False)
            rmse1, corr1 = utils.stat.calErr(vv1, vv2)
            rmse2, corr2 = utils.stat.calErr(vv1, vv3)
            rmse3, corr3 = utils.stat.calErr(vv2, vv3)
            corrMat[indS, ic, 0] = corr1
            corrMat[indS, ic, 1] = corr2
            corrMat[indS, ic, 2] = corr3
            rmseMat[indS, ic, 0] = rmse1
            rmseMat[indS, ic, 1] = rmse2
            rmseMat[indS, ic, 2] = rmse3

    # load basin attributes
    regionLst = ['ECO2_BAS_DOM', 'NUTR_BAS_DOM',
                 'HLR_BAS_DOM_100M', 'PNV_BAS_DOM']
    dfG = gageII.readData(siteNoLst=siteNoLst)
    fileT = os.path.join(gageII.dirTab, 'lookupPNV.csv')
Example #18
    # calculate correlation
    tt = np.datetime64('2010-01-01')
    t0 = np.datetime64('1980-01-01')
    ind1 = np.where((df.index.values < tt) & (df.index.values >= t0))[0]
    ind2 = np.where(df.index.values >= tt)[0]
    corrMat = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
    rmseMat = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
    for ic, code in enumerate(codeLst):
        for k, ind in enumerate([ind1, ind2]):
            for siteNo in dictSite[code]:
                indS = siteNoLst.index(siteNo)
                v1 = dictComb[siteNo][code].iloc[ind].values
                v2 = dictSolo[code][siteNo][code].iloc[ind].values
                v3 = dictObs[siteNo][code].iloc[ind].values
                vv1, vv2, vv3 = utils.rmNan([v1, v2, v3], returnInd=False)
                rmse1, corr1 = utils.stat.calErr(vv1, vv3)
                rmse2, corr2 = utils.stat.calErr(vv2, vv3)
                corrMat[indS, ic, k*2] = corr1
                corrMat[indS, ic, k*2+1] = corr2


# significance test
dfS = pd.DataFrame(index=codeLst, columns=['rmse', 'corr'])
for k, code in enumerate(codeLst):
    a = corrMat[:, k, 2]
    b = corrMat[:, k, 3]
    aa, bb = utils.rmNan([a, b], returnInd=False)
    s, p = scipy.stats.ttest_ind(aa, bb)
    # s, p = scipy.stats.wilcoxon(aa, bb)
    dfS.at[code, 'corr'] = p
Example #19
ns = len(siteNoLst)

# cal dw
rMat = np.ndarray([ns, nc])
pdfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
for k, siteNo in enumerate(siteNoLst):
    for i, code in enumerate(codeLst):
        area = pdfArea.loc[siteNo]['DRAIN_SQKM']
        dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        df = dfC.join(dfQ)
        t = df.index.values
        q = df['00060_00003'].values / area * unitConv
        c = df[code].values
        (q, c), ind = utils.rmNan([q, c])
        x = 10**np.linspace(np.log10(np.min(q[q > 0])),
                            np.log10(np.max(q[~np.isnan(q)])), 20)
        ceq, dw, y = wqRela.kateModel(q, c, q)
        corr = np.corrcoef(c, y)[0, 1]
        rMat[k, i] = corr

dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values


def funcMap():
    figM, axM = plt.subplots(nc, 1, figsize=(8, 6))
    for k in range(nc):
        axplot.mapPoint(axM[k], lat, lon, rMat[:, k], s=12)
Example #20
File: box2.py Project: fkwai/geolearn
# select sites
dictSiteName = 'dict{}.json'.format(dataName[:4])
dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
with open(os.path.join(dirSel, dictSiteName)) as f:
    dictSite = json.load(f)

statStrLst = ['Bias', 'RMSE', 'NSE', 'Corr']
dataPlot = list()
labelLst = [
    usgs.codePdf.loc[code]['shortName'] + '\n' + code for code in codeLst
]
for k, statStr in enumerate(statStrLst):
    temp = list()
    for ic, code in enumerate(codeLst):
        [a, b,
         c], _ = utils.rmNan([mat1[:, ic, k], mat2[:, ic, k], mat3[:, ic, k]])
        temp.append([a, b, c])
    sharey = False if statStr in ['Bias', 'RMSE'] else True
    fig, axes = figplot.boxPlot(temp,
                                widths=0.5,
                                figsize=(12, 4),
                                label2=['LSTM w/ Q', 'LSTM w/o Q', 'WRTDS'],
                                label1=labelLst,
                                sharey=sharey)
    if statStr == 'Bias':
        for ax in axes:
            _ = ax.axhline(0)
    fig.show()

#
# DF2 = dbBasin.DataFrameBasin('G400')
Example #21
dictObs = dict()
for k, siteNo in enumerate(siteNoLst):
    print('\t USGS site {}/{}'.format(k, len(siteNoLst)), end='\r')
    df = waterQuality.readSiteTS(
        siteNo, varLst=['00060']+codeLst, freq='W', rmFlag=True)
    dictObs[siteNo] = df

# calculate correlation
corrMatTemp = np.full([len(siteNoLst), len(codeLst), 2], np.nan)
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v1 = dictL[siteNo][code].values
        v2 = dictS[siteNo][code].values
        v0 = dictObs[siteNo][code].values
        (vv0, vv1, vv2), indV = utils.rmNan([v0, v1, v2])
        rmse1, corr1 = utils.stat.calErr(vv1, vv0)
        rmse2, corr2 = utils.stat.calErr(vv2, vv0)
        corrMatTemp[indS, ic, 0] = corr1
        corrMatTemp[indS, ic, 1] = corr2

rMat = corrMatTemp**2
codeLst2 = ['00915', '00925', '00930', '00935', '00940', '00945',
            '00955', '70303', '80154']
[nfy, nfx] = [3, 3]

codeLst2 = ['00010', '00300', '00405', '00600', '00605',
            '00618', '00660', '00665', '00681', '00915',
            '00925', '00930', '00935', '00940', '00945',
            '00950', '00955', '70303', '71846', '80154']
nfy, nfx = [4, 5]
Example #22
np.nanmean(errMatC1[:, 0, 1])
np.nanmean(errMatC2[:, 0, 1])

# transfer - validate if training error is correct
mtd = wqData.extractVarMtd(master['varYC'])
xcP = transform.transInAll(ycP2, mtd, statLst=statTup[3])
xcT = transform.transInAll(ycT2, mtd, statLst=statTup[3])
mtd = wqData.extractVarMtd(master['varY'])
xP = transform.transInAll(yP2, mtd, statLst=statTup[2])
xT = transform.transInAll(yT2, mtd, statLst=statTup[2])

np.sqrt(np.nanmean((xT - xP)**2))
np.sqrt(np.nanmean((xcT - xcP)**2))
(np.sqrt(np.nanmean((xT - xP)**2)) + np.sqrt(np.nanmean((xcT - xcP)**2))) / 2

# see correlation
info = wqData.subsetInfo(testSet)
siteNoLst = info.siteNo.unique()
corrMat = np.full([len(siteNoLst), 2], np.nan)
for i, siteNo in enumerate(siteNoLst):
    indS = info[info['siteNo'] == siteNo].index.values
    a = xcT[indS, 0]
    b = xcP[indS, 0]
    _, indV = utils.rmNan([a, b])
    corrMat[i, 1] = np.corrcoef(a[indV], b[indV])[0, 1]
    a = xT[-1, indS, 0]
    b = xP[-1, indS, 0]
    _, indV = utils.rmNan([a, b])
    corrMat[i, 0] = np.corrcoef(a[indV], b[indV])[0, 1]
np.mean(corrMat[:, 1])
Example #23
File: dQvsErr.py Project: fkwai/geolearn
codeLst2 = [
    '00010', '00095', '00300', '00400', '00405', '00600', '00605', '00618',
    '00660', '00665', '00681', '00915', '00925', '00930', '00935', '00940',
    '00945', '00955', '71846', '80154'
]
nfy, nfx = [5, 4]

code = '00915'
xLst = list()
yLst = list()
for siteNo in siteNoLst:
    dfErr = dictErr[siteNo]
    x = dfErr['dQ']
    y = dfErr[code].values
    [xx, yy] = utils.rmNan([x[ind2], y[ind2]], returnInd=False)
    xLst.append(xx)
    yLst.append(yy)
xMat = np.concatenate(xLst)
yMat = np.concatenate(yLst)
fig, ax = plt.subplots(1, 1)
ax.plot(xMat, yMat, '*')
ax.set_xlabel('dQ/dt')
ax.set_ylabel('error')
fig.show()

siteNoCode = dictSite[code]
siteNo = random.choice(siteNoCode)
dfErr = dictErr[siteNo]
x = dfErr['dQ']
y = dfErr[code].values
Example #24
# # calculate correlation
tt = np.datetime64('2010-01-01')
ind1 = np.where(df.index.values < tt)[0]
ind2 = np.where(df.index.values >= tt)[0]
corrLSTM = np.full([len(siteNoLst), 2], np.nan)
rmseLSTM = np.full([len(siteNoLst),  2], np.nan)
corrWRTDS = np.full([len(siteNoLst), 2], np.nan)
rmseWRTDS = np.full([len(siteNoLst), 2], np.nan)
for k, indT in enumerate([ind1, ind2]):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v0 = dictObs[siteNo][code].iloc[indT].values
        v1 = dictLSTM[siteNo][code].iloc[indT].values
        v2 = dictWRTDS[siteNo][code].iloc[indT].values
        v3 = dictObs[siteNo][code].iloc[indT].values
        [v0, v1, v2], ind = utils.rmNan([v0, v1, v2])
        rmse1, corr1 = utils.stat.calErr(v1, v0, rmExt=True)
        rmse2, corr2 = utils.stat.calErr(v2, v0, rmExt=True)
        corrLSTM[indS, k] = corr1
        corrWRTDS[indS, k] = corr2
        rmseLSTM[indS, k] = rmse1
        rmseWRTDS[indS, k] = rmse2

# box

matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams.update({'lines.linewidth': 2})
matplotlib.rcParams.update({'lines.markersize': 12})
# # plot box
# labLst1 = [usgs.codePdf.loc[code]['shortName'] +
#            '\n'+code for code in codeLst]
Example #25
 saveFile = os.path.join(dirOut, siteNo)
 if os.path.exists(saveFile):
     continue
 varC = codeLst
 # varQ = ['00060']
 df = waterQuality.readSiteTS(siteNo, varLst=varC, freq='W')
 dfX = pd.DataFrame({'date': df.index}).set_index('date')
 yr = dfX.index.year.values
 t = dfX.index.dayofyear.values/365
 dfX['sinT'] = np.sin(2*np.pi*t)
 dfX['cosT'] = np.cos(2*np.pi*t)
 dfYP = pd.DataFrame(index=df.index, columns=varC)
 dfYP.index.name = 'date'
 for code in varC:
     # print(code)
     [xx, yy], _ = utils.rmNan([dfX.values, df[code].values])
     [xp], iv = utils.rmNan([dfX.values])
     if len(yy) <= 2:
         dictRes[code].loc[siteNo] = [len(yy)]+[np.nan for x in range(5)]
     else:
         lrModel = LinearRegression()
         lrModel = lrModel.fit(xx, yy)
         yp = lrModel.predict(xp)
         yt = lrModel.predict(xx)
         dfYP.at[dfX.index[iv], code] = yp
         coef = lrModel.coef_
         inte = lrModel.intercept_
         rmse = np.sqrt(np.nanmean((yt-yy)**2))
         if len(np.unique(yy)) == 1:
             corr = -9999
         else:
Example #26
labLst2 = ['WRTDS test', 'LSTM test']
dataBox = list()
for k in range(len(codeLst)):
    code = codeLst[k]
    temp = list()
    # for i in [2, 3, 0 ,1]:
    for i in [3, 1]:
        temp.append(corrMat[:, k, i])
    dataBox.append(temp)
fig = figplot.boxPlot(dataBox, label1=labLst1, widths=0.5, cLst='br',
                      label2=labLst2, figsize=(12, 4), yRange=[0, 1])
# fig = figplot.boxPlot(dataBox, label1=labLst1, widths=0.5,
#                       label2=labLst2, figsize=(12, 4), sharey=False)
fig.show()

# p-values
testLst = ['p-value']
indLst = [[1, 3]]
codeStrLst = ['{} {}'.format(
    code, usgs.codePdf.loc[code]['shortName']) for code in codeLst]
dfS = pd.DataFrame(index=codeStrLst, columns=testLst)
for (test, ind) in zip(testLst, indLst):
    for k, code in enumerate(codeLst):
        data = [corrMat[:, k, x] for x in ind]
        [a, b], _ = utils.rmNan(data)
        # s, p = scipy.stats.ttest_ind(a, b, equal_var=False)
        s, p = scipy.stats.ttest_rel(a, b)
        dfS.loc[codeStrLst[k], test] = p  # single .loc avoids chained-indexing assignment
pd.options.display.float_format = '{:,.2f}'.format
print(dfS)
Example #27
    dictObs[siteNo] = df

# calculate rsq
rMat = np.full([len(siteNoLst), len(codeLst), 2], np.nan)
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
t = dictObs[siteNoLst[0]].index.values
ind1 = np.where((t < tt) & (t >= t0))[0]
ind2 = np.where(t >= tt)[0]
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v1 = dictL[siteNo][code].values
        v2 = dictS[siteNo][code].values
        v0 = dictObs[siteNo][code].values
        (vv0, vv1, vv2), indV = utils.rmNan([v0, v1, v2])
        rmse1, corr1 = utils.stat.calErr(vv1, vv0)
        rmse2, corr2 = utils.stat.calErr(vv2, vv0)
        rMat[indS, ic, 0] = corr1**2  # linearity
        rMat[indS, ic, 1] = corr2**2  # seasonality

# a cdf for rsq of seasonality and linearity
code = '00915'
indS = [siteNoLst.index(siteNo) for siteNo in dictSite[code]]
ic = codeLst.index(code)
fig, ax = plt.subplots(1, 1)
axplot.plotCDF(ax, [rMat[indS, ic, 0], rMat[indS, ic, 1]],
               legLst=['linearity', 'seasonality'])
fig.show()

fig, ax = plt.subplots(1, 1)
Example #28
    dfX['cosT'] = np.cos(2 * np.pi * t)

    ctR = pd.date_range(startDate, endDate)
    dfXP = pd.DataFrame({'date': ctR}).set_index('date')
    dfXP = dfXP.join(np.log(dfQ['00060_00003'] +
                            0.01)).rename(columns={'00060_00003': 'logQ'})
    dfXP = dfXP.join(dfF)
    yr = dfXP.index.year.values
    t = yr + dfXP.index.dayofyear.values / 365
    # dfXP['t'] = t-1979
    dfXP['sinT'] = np.sin(2 * np.pi * t)
    dfXP['cosT'] = np.cos(2 * np.pi * t)

    for k in range(2):
        ind = indLst[k]
        saveFile = saveLst[k]
        dfYP = pd.DataFrame(index=ctR, columns=usgs.varC)
        dfYP.index.name = 'date'
        if len(ind) > 0:
            for code in usgs.varC:
                [xx, yy], iv = utils.rmNan(
                    [dfX.iloc[ind].values, dfY.iloc[ind][code].values])
                if len(xx) > 0:
                    lrModel = LinearRegression()
                    lrModel = lrModel.fit(xx, yy)
                    b = dfXP.isna().any(axis=1)
                    yp = lrModel.predict(dfXP[~b].values)
                    yp = np.exp(yp) - sn
                    dfYP.at[dfYP[~b].index, code] = yp
        dfYP.to_csv(saveFile)
Example #29
dfGeo = gageII.updateCode(dfGeo)

# select sites
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]

varGLst = dfG.columns.tolist()
dfRsq = pd.DataFrame(index=varGLst, columns=['Rsq1', 'Rsq2'])
for varG in varGLst:
    x = dfG[varG].values
    y1 = dfR1['corr'].values
    y2 = dfR2['corr'].values  # corr from the second result set (dfRes2)
    (xx, yy1, yy2), _ = utils.rmNan([x, y1, y2])
    r1 = np.corrcoef(xx, yy1)[0, 1]
    dfRsq.at[varG, 'Rsq1'] = r1**2
    r2 = np.corrcoef(xx, yy2)[0, 1]
    dfRsq.at[varG, 'Rsq2'] = r2**2

dfRsq.to_csv('temp')
dfRsq.sort_values('Rsq1', ascending=False)

# varG = 'SLOPE_PCT'
varG = 'HLR_BAS_PCT_100M'
x = dfG[varG].values
y = dfR1['corr'].values
x[x < -900] = np.nan
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, '*')
Example #30
ind = wqData.subset[testSet].tolist()
# ind=wqData.subset[trainSet].tolist()
axes[0].plot(wqData.c[ind, indC1], ypLst2[1][:, indC2], '*')
axes[1].plot(wqData.c[ind, indC1], ypLst2[0][:, indC2], '*')
fig.show()

fig, axes = plt.subplots(1, 2)
indF = wqData.varF.index('ph')
indC = wqData.varC.index('00945')
axes[0].plot(wqData.c[:, indC], wqData.f[-1, :, indF], '*')
indF = wqData.varF.index('SO4')
axes[1].plot(wqData.c[:, indC], wqData.f[-1, :, indF], '*')
fig.show()

indF = wqData.varF.index('ph')
_, ind = utils.rmNan([wqData.c[:, indC], wqData.f[-1, :, indF]])
np.corrcoef(wqData.c[ind, indC], wqData.f[-1, ind, indF])

# time series
siteNoLst = wqData.siteNoLst
fig, axes = plt.subplots(3, 1)
for k in range(3):
    siteNo = siteNoLst[k]
    info1 = wqData.info.iloc[wqData.subset[trainSet]].reset_index()
    info2 = wqData.info.iloc[wqData.subset[testSet]].reset_index()
    t1 = info1['date'].values
    t2 = info2['date'].values
    indS1 = info1[info1['siteNo'] == siteNo].index.values
    indS2 = info2[info2['siteNo'] == siteNo].index.values
    t = wqData.info[wqData.info['siteNo'] == siteNo]['date'].values
    v = wqData.c[wqData.info['siteNo'] == siteNo][:, 0]