def testWRTDS(dataName, trainSet, testSet, codeLst):
    """Predict test-set values with one weighted least-squares fit per sample.

    For every site and every code in ``codeLst`` the training samples are
    selected and weighted by ``calWeight`` from their distance to the target
    sample in decimal time, log-flow and season. An ``sm.WLS`` model on
    [log-flow, year, sin(2*pi*t), cos(2*pi*t)] is fitted and evaluated at the
    target sample.

    Args:
        dataName: name handed to ``dbBasin.DataFrameBasin``.
        trainSet: subset used to build the training data model.
        testSet: subset used to build the test data model. With ``'all'``
            every step with valid predictors is predicted; otherwise only
            steps that also have a valid observation.
        codeLst: list of target variable codes (used as ``varY``).

    Returns:
        Array of shape ``[len(d2.t), len(d2.siteNoLst), len(codeLst)]``.
        Entries that were not predicted stay NaN; site/code pairs with fewer
        than 40 valid training samples are skipped entirely.
    """
    DF = dbBasin.DataFrameBasin(dataName)
    varX = ['00060']
    varY = codeLst
    d1 = dbBasin.DataModelBasin(DF, subset=trainSet, varX=varX, varY=varY)
    d2 = dbBasin.DataModelBasin(DF, subset=testSet, varX=varX, varY=varY)

    def _timeFeatures(t):
        # year, decimal year and the two seasonal harmonics of a time axis
        tt = pd.to_datetime(t)
        yr = tt.year.values
        tDec = yr + tt.dayofyear.values / 365
        return yr, tDec, np.sin(2 * np.pi * tDec), np.cos(2 * np.pi * tDec)

    def _logFlow(q):
        # clip negative discharge to zero, then log with the small offset sn
        q = q.copy()
        q[q < 0] = 0
        return np.log(q + sn)

    yr1, t1, sinT1, cosT1 = _timeFeatures(d1.t)
    yr2, t2, sinT2, cosT2 = _timeFeatures(d2.t)

    yOut = np.full([len(d2.t), len(d2.siteNoLst), len(varY)], np.nan)
    t0 = time.time()
    for indS, siteNo in enumerate(d2.siteNoLst):
        # discharge depends on the site only, so transform it once per site
        logq1 = _logFlow(d1.X[:, indS, 0])
        logq2 = _logFlow(d2.X[:, indS, 0])
        for indC, code in enumerate(varY):
            print('{} {} {} {}'.format(indS, siteNo, code, time.time() - t0))
            y1 = d1.Y[:, indS, indC].copy()
            x1 = np.stack([logq1, yr1, sinT1, cosT1]).T
            y2 = d2.Y[:, indS, indC].copy()
            x2 = np.stack([logq2, yr2, sinT2, cosT2]).T
            [xx1, yy1], ind1 = utils.rmNan([x1, y1])
            if testSet == 'all':
                _, ind2 = utils.rmNan([x2])
            else:
                _, ind2 = utils.rmNan([x2, y2])
            if len(ind1) < 40:
                continue
            for k in ind2:
                dY = np.abs(t2[k] - t1[ind1])
                dQ = np.abs(logq2[k] - logq1[ind1])
                # seasonal distance: fractional-year gap to the nearest year
                dS = np.min(np.stack(
                    [abs(np.ceil(dY) - dY), abs(dY - np.floor(dY))]), axis=0)
                d = np.stack([dY, dQ, dS])
                ww, ind = calWeight(d)
                model = sm.WLS(yy1[ind], xx1[ind], weights=ww).fit()
                yOut[k, indS, indC] = model.predict(x2[k, :])[0]
    return yOut
def errBySiteC(self, ycP, varC, subset=None, rmExt=False):
    """Score predictions against observations separately for every site.

    Args:
        ycP: predictions indexed as ``ycP[sample, k]`` where ``k`` follows
            the order of ``varC``.
        varC: one variable name or a list of names found in ``self.varC``.
        subset: key into ``self.subset`` selecting the evaluated samples.
        rmExt: when True, observations larger than mean + 5 * std of the
            values strictly between the 5th and 95th percentile are set to
            NaN before scoring.

    Returns:
        Array ``[nSite, len(varC), 3]`` with RMSE, correlation and the mean
        absolute relative error ``mean(|(pred - obs) / obs|)``. Sites
        without samples in the subset stay NaN.
    """
    nameLst = varC if type(varC) is list else [varC]
    ycT = self.extractSubset(subset=subset)[3]
    colLst = [self.varC.index(name) for name in nameLst]
    info = self.info.loc[self.subset[subset].tolist()].reset_index()
    siteNoLst = self.info.siteNo.unique()
    statMat = np.full([len(siteNoLst), len(colLst), 3], np.nan)
    for iSite, siteNo in enumerate(siteNoLst):
        rows = info[info['siteNo'] == siteNo].index.values
        if len(rows) == 0:
            # nothing to score for this site
            continue
        for k, col in enumerate(colLst):
            obs = ycT[rows, col]
            pred = ycP[rows, k]
            if rmExt is True:
                core = obs[obs < np.nanpercentile(obs, 95)]
                core = core[core > np.nanpercentile(obs, 5)]
                upper = np.mean(core) + np.std(core) * 5
                obs[obs > upper] = np.nan
            _, valid = utils.rmNan([obs, pred])
            diff = obs[valid] - pred[valid]
            rmse = np.sqrt(np.nanmean(diff**2))
            corr = np.corrcoef(obs[valid], pred[valid])[0, 1]
            relErr = np.nanmean(np.abs((pred - obs) / obs))
            statMat[iSite, k, :] = [rmse, corr, relErr]
    return statMat
def funcPoint(iP, axP):
    """Plot LSTM, per-site linear regression and observations for one site.

    Args:
        iP: index of the site in the module-level ``siteNoLst``.
        axP: sequence of two axes; the first shows '00060', the second the
            first entry of ``codeLst``.
    """
    siteNo = siteNoLst[iP]
    dfPred, dfObs = basins.loadSeq(outName, siteNo)
    t = dfPred['date'].values.astype(np.datetime64)
    tBar = np.datetime64('2000-01-01')

    # fit the linear models on this site's NaN-free training rows
    rows = infoTrain[infoTrain['siteNo'] == siteNo].index
    [xTrain, yTrain, ycTrain], _ = utils.rmNan(
        [xL1[rows, :], yL1[rows, :], ycL1[rows, :]])
    modelY = LinearRegression().fit(xTrain, yTrain)
    modelYC = LinearRegression().fit(xTrain, ycTrain)

    # predict over the full record and map back to physical values
    sd = np.datetime64('1979-01-01')
    ed = np.datetime64('2020-01-01')
    dfX = waterQuality.readSiteX(siteNo, sd, ed, varX)
    xAll = transform.transInAll(dfX.values, mtdX, statLst=statX)
    yp = wqData.transOut(modelY.predict(xAll), statY, varY)
    ycp = wqData.transOut(modelYC.predict(xAll), statYC, varYC)

    code = codeLst[0]
    panels = [('00060', yp, '---'), (code, ycp, '--*')]
    for ax, (var, lr, sty) in zip(axP, panels):
        axplot.plotTS(ax, t, [dfPred[var], lr, dfObs[var]], tBar=tBar,
                      legLst=['lstm', 'lr', 'obs'], styLst=sty, cLst='bgr')
def dictErr(dictLSTM, dictWRTDS, dictObs, codeLst):
    """Compare LSTM, WRTDS and observed series per site and code.

    Statistics are computed on the samples from 2010-01-01 onwards. A
    site/code pair is scored only if it has at least 50 jointly valid
    samples in that period and at least 50 rows between 1980-01-01 and
    2010-01-01 where both '00060' and the code are present.

    Args:
        dictLSTM, dictWRTDS, dictObs: dicts keyed by site number whose
            values are DataFrames with one column per code; all frames are
            assumed to share the time index of the first site in
            ``dictObs``.
        codeLst: list of codes to evaluate.

    Returns:
        ``(corrMat, rmseMat)``, each ``[nSite, nCode, 3]``. The last axis
        is LSTM vs WRTDS, LSTM vs obs, WRTDS vs obs. Unscored entries are
        NaN.
    """
    tt = np.datetime64('2010-01-01')
    t0 = np.datetime64('1980-01-01')
    siteNoLst = list(dictObs.keys())
    t = dictObs[siteNoLst[0]].index.values
    ind1 = np.where((t < tt) & (t >= t0))[0]
    ind2 = np.where(t >= tt)[0]
    corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
    rmseMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
    for ic, code in enumerate(codeLst):
        # enumerate gives the row directly; no list.index scan per site
        for indS, siteNo in enumerate(siteNoLst):
            v1 = dictLSTM[siteNo][code].iloc[ind2].values
            v2 = dictWRTDS[siteNo][code].iloc[ind2].values
            v3 = dictObs[siteNo][code].iloc[ind2].values
            dfQ1 = dictObs[siteNo][['00060', code]].iloc[ind1].dropna()
            (vv1, vv2, vv3), indV = utils.rmNan([v1, v2, v3])
            if len(indV) < 50 or len(dfQ1) < 50:
                continue
            pairs = [(vv1, vv2), (vv1, vv3), (vv2, vv3)]
            for j, (a, b) in enumerate(pairs):
                rmse, corr = utils.stat.calErr(a, b)
                corrMat[indS, ic, j] = corr
                rmseMat[indS, ic, j] = rmse
    return corrMat, rmseMat
def kateModel(q, c, x=None):
    """Fit ``func`` to the NaN-free (q, c) pairs and optionally evaluate it.

    The first fitted parameter (ceq) is bounded to [0, inf) and the second
    (dw) to [0, 100].

    Args:
        q: predictor samples.
        c: response samples.
        x: optional points at which ``ceq / (1 + x / dw)`` is evaluated.

    Returns:
        ``(ceq, dw, out)``; ``out`` is None when ``x`` is None.
    """
    (qFit, cFit), _ = utils.rmNan([q, c])
    lower, upper = (0, 0), (np.inf, 100)
    popt, _ = curve_fit(func, qFit, cFit, bounds=[lower, upper])
    ceq, dw = popt[0], popt[1]
    out = None if x is None else ceq / (1 + x / dw)
    return ceq, dw, out
def trainLR(dfXT, dfYT, dfXN, dfYN):
    """Fit a linear regression on training frames and predict for new inputs.

    Args:
        dfXT: training predictors.
        dfYT: training targets.
        dfXN: predictors to predict for.
        dfYN: frame supplying the index and columns of the result.

    Returns:
        DataFrame shaped like ``dfYN`` holding the predictions, or all NaN
        when no training row survives NaN removal.
    """
    [xTrain, yTrain], valid = utils.rmNan([dfXT.values, dfYT.values])
    if len(valid) == 0:
        return pd.DataFrame(index=dfYN.index, columns=dfYN.columns,
                            data=np.nan)
    model = LinearRegression().fit(xTrain, yTrain)
    return pd.DataFrame(data=model.predict(dfXN.values),
                        index=dfYN.index, columns=dfYN.columns)
def funcPoint(iP, axes):
    """Draw time series, C-Q scatter and Lomb-Scargle spectrum for one site.

    Uses ``1 + 2 * len(codeLst)`` axes in order: one panel of standardised
    time series, one log-log C-Q scatter per code, then one log-log
    periodogram per code with the one-year period marked in red.

    Args:
        iP: index of the site in the module-level ``siteNoLst``.
        axes: indexable collection of matplotlib axes.
    """
    kA = 0
    siteNo = siteNoLst[iP]
    # pd.datetime was removed from pandas; Timestamp is a datetime subclass
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)

    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'] + [
        usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)

    # plot C-Q
    for code in codeLst:
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')

    # plot spectrum
    twoPi = 2 * np.pi
    nt = len(dfData)
    for code in codeLst:
        dfV = dfData[dfData[code].notna()]
        # lombscargle works on floats: use days since the epoch
        x = dfV.index.values.astype('datetime64[D]').astype(float)
        y = dfV[code].values
        # angular frequencies for periods of 2 .. nt days
        freq = twoPi / np.linspace(2, nt, nt)
        power = signal.lombscargle(x, y, freq)
        ax = axes[kA]
        kA = kA + 1
        # the original `freq / 2 * np.pi` multiplied by pi instead of
        # dividing by 2*pi; plot cycles per day as intended
        ax.plot(np.log(freq / twoPi), np.log(power), '-*')
        fyr = twoPi / 365
        pyr = signal.lombscargle(x, y, [fyr])
        ax.plot(np.log(fyr / twoPi), np.log(pyr), 'r*')
def funcPoint(iP, axP):
    """Plot two saved model outputs against observations for one site.

    Reads the per-site CSV outputs under ``dirRoot1`` and ``dirRoot2``,
    plots the jointly valid samples and writes both correlations with the
    observations into the title.

    Args:
        iP: index of the site in the module-level ``siteNoLst``.
        axP: a single matplotlib axes.
    """
    siteNo = siteNoLst[iP]
    dfO = waterQuality.readSiteTS(siteNo, [code], freq='W')[code]
    t = dfO.index
    predLst = [
        pd.read_csv(os.path.join(root, 'output', siteNo),
                    index_col='date')[code]
        for root in (dirRoot1, dirRoot2)
    ]
    v = [predLst[0].values, predLst[1].values, dfO.values]
    [v1, v2, o], iv = utils.rmNan(list(v))
    axplot.plotTS(axP, t[iv].values, [v1, v2, o], cLst='rbk')
    # correlations are taken on the unfiltered arrays, as before
    _, corr1 = utils.stat.calErr(v[0], v[-1])
    _, corr2 = utils.stat.calErr(v[1], v[-1])
    title = 'site {} WRTDS {:.2f} only T {:.2f}'.format(siteNo, corr1, corr2)
    axP.set_title(title)
def modelLinear(outName, testset, trainset=None, wqData=None):
    """Fit per-site, per-variable linear regressions and predict both subsets.

    For every site in the test subset a ``LinearRegression`` is fitted on
    that site's training samples (the last time step of the input
    sequence) and evaluated on its training and testing samples.

    Args:
        outName: model output name used to load the master file and stats.
        testset: name of the testing subset.
        trainset: name of the training subset; defaults to
            ``master['trainName']``.
        wqData: data model; built from ``master['dataName']`` when None.

    Returns:
        ``(matO1, matO2)``: back-transformed predictions for the training
        and testing samples, each ``[nSample, len(varYC)]``; NaN where no
        model could be fitted.
    """
    master = loadMaster(outName)
    dataName = master['dataName']
    if wqData is None:
        wqData = waterQuality.DataModelWQ(dataName)
    if trainset is None:
        trainset = master['trainName']
    infoTrain = wqData.info.iloc[wqData.subset[trainset]].reset_index()
    infoTest = wqData.info.iloc[wqData.subset[testset]].reset_index()

    # linear reg data
    statTup = loadStat(outName)
    varTup = (master['varX'], master['varXC'], master['varY'], master['varYC'])
    dataTup1 = wqData.transIn(subset=trainset, varTup=varTup, statTup=statTup)
    dataTup2 = wqData.transIn(subset=testset, varTup=varTup, statTup=statTup)
    dataTup1 = trainTS.dealNaN(dataTup1, master['optNaN'])
    dataTup2 = trainTS.dealNaN(dataTup2, master['optNaN'])
    varYC = varTup[3]
    statYC = statTup[3]
    x1 = dataTup1[0][-1, :, :]
    yc1 = dataTup1[3]
    x2 = dataTup2[0][-1, :, :]

    nc = len(varYC)
    matP1 = np.full([len(infoTrain), nc], np.nan)
    matP2 = np.full([len(infoTest), nc], np.nan)
    siteNoLst = infoTest['siteNo'].unique().tolist()
    for siteNo in siteNoLst:
        ind1 = infoTrain[infoTrain['siteNo'] == siteNo].index
        ind2 = infoTest[infoTest['siteNo'] == siteNo].index
        xT1 = x1[ind1, :]
        ycT1 = yc1[ind1, :]
        for ic in range(nc):
            [xx, yy], iv = utils.rmNan([xT1, ycT1[:, ic]])
            if len(iv) == 0:
                continue
            modelYC = LinearRegression().fit(xx, yy)
            matP1[ind1, ic] = modelYC.predict(xT1)
            if len(ind2) > 0:
                # test rows belong in matP2; the original wrote them into
                # matP1, leaving matP2 empty and clobbering training rows
                matP2[ind2, ic] = modelYC.predict(x2[ind2, :])
    matO1 = wqData.transOut(matP1, statYC, varYC)
    matO2 = wqData.transOut(matP2, statYC, varYC)
    return matO1, matO2
def tsYr(t, y, cLst='rbkgcmy', figsize=(12, 4), showCorr=False):
    """Plot one or more series in side-by-side panels, one panel per year.

    Args:
        t: array of times, indexable by an integer array.
        y: a single array or a list of arrays aligned with ``t``.
        cLst: colours handed to ``axplot.plotTS``.
        figsize: figure size.
        showCorr: when True, the correlation between the first two series
            within each year is appended to the panel label (NaN if fewer
            than two series are given).

    Returns:
        The created figure.
    """
    y = y if type(y) is list else [y]
    yrAll = pd.to_datetime(t).year
    yrLst = yrAll.unique().tolist()
    # squeeze=False keeps an array of axes even when there is one year
    fig, axGrid = plt.subplots(ncols=len(yrLst), sharey=True,
                               figsize=figsize, squeeze=False)
    axes = axGrid[0]
    fig.subplots_adjust(wspace=0)
    for iYr, yr in enumerate(yrLst):
        ind = np.where(yrAll == yr)[0]
        ax = axes[iYr]
        axplot.plotTS(ax, t[ind], [v[ind] for v in y], cLst=cLst)
        ax.set_xlim(np.datetime64(str(yr)), np.datetime64(str(yr + 1)))
        ax.set_xticks([])
        if showCorr is True:
            # only computed on request: it needs at least two series
            if len(y) >= 2:
                vv = utils.rmNan([v[ind] for v in y], returnInd=False)
                corr = np.corrcoef(vv)[0, 1]
            else:
                corr = np.nan
            ax.set_xlabel('{}\n{:.2f}'.format(yr, corr))
        else:
            ax.set_xlabel('{}'.format(yr))
    return fig
def funcPoint(iP, axP):
    """Plot discharge, concentration and the fitted C-Q curve for one site.

    Args:
        iP: index into the module-level ``siteNoHBN`` and ``pMat2``.
        axP: three axes: full discharge record, concentration samples, and
            concentration against log discharge with the fitted curve.
    """
    siteNo = siteNoHBN[iP]
    dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    dfQ = usgs.readStreamflow(siteNo)
    df = dfC.join(dfQ)

    # paired, NaN-free discharge (scaled by area and unitConv) and samples
    qRaw = df['00060_00003'].values / area * unitConv
    [q, c], ind = utils.rmNan([qRaw, df[code].values])
    t = df.index.values[ind]

    axplot.plotTS(axP[0], dfQ.index.values, dfQ['00060_00003'].values,
                  cLst='b', styLst='--')
    axplot.plotTS(axP[1], t, c)

    axQC = axP[2]
    axQC.plot(np.log(q), c, 'k*')
    # 20 log-spaced discharges between the smallest positive and largest q
    lo = np.log10(np.min(q[q > 0]))
    hi = np.log10(np.max(q[~np.isnan(q)]))
    x = 10**np.linspace(lo, hi, 20)
    ceq0, dw0 = pMat2[iP, 0], pMat2[iP, 1]
    y0 = ceq0 * 1 / (x / dw0 + 1)
    axQC.plot(np.log(x), y0, 'r-')
    axQC.set_title('ceq={:.3f},dw={:.3f}'.format(ceq0, dw0))
def plotTS(ax, t, y, *, styLst=None, tBar=None, cLst='krbgcmy',
           legLst=None, sd=None, **kw):
    """Plot one or several time series on a single axes.

    Args:
        ax: matplotlib axes to draw on.
        t: array of times shared by all series.
        y: a single array or a list of arrays aligned with ``t``. The
            caller's list is never modified.
        styLst: per-series line styles, indexed by series position (a
            string is indexed character by character). When None the style
            is chosen per series: '-' if the valid samples cover more than
            90% of the span between the first and last valid index,
            otherwise '*'.
        tBar: a time or list of times marked with vertical black lines.
        cLst: per-series colours, indexed by series position.
        legLst: per-series legend labels; a legend is drawn when given.
        sd: when given, only samples with ``t >= sd`` are plotted.
        **kw: forwarded to ``ax.plot``.

    Returns:
        ``ax``.
    """
    # work on a private list so the sd filter cannot alter the caller's list
    yLst = list(y) if type(y) is list else [y]
    if sd is not None:
        ind = np.where(t >= sd)[0]
        t = t[ind]
        yLst = [yy[ind] for yy in yLst]
    for k, yy in enumerate(yLst):
        if styLst is None:
            # find out continuous / distinct
            [_, _], ind = utils.rmNan([t, yy])
            span = ind[-1] - ind[0] if len(ind) > 1 else 0
            # a lone valid sample has no span; draw it as a marker
            r = len(ind) / span if span > 0 else 0
            sty = '-' if r > 0.9 else '*'
        else:
            sty = styLst[k]
        legStr = None if legLst is None else legLst[k]
        ax.plot(t, yy, sty, color=cLst[k], label=legStr, **kw)
    if tBar is not None:
        ylim = ax.get_ylim()
        tBar = [tBar] if type(tBar) is not list else tBar
        for tt in tBar:
            ax.plot([tt, tt], ylim, '-k')
    if legLst is not None:
        ax.legend(loc='upper right')
    ax.xaxis_date()
    return ax
def funcPoint(iP, axes):
    """Draw standardised time series and fitted C-Q scatter for one site.

    Uses ``1 + len(codeLst)`` axes: one panel of standardised series, then
    one log-log C-Q panel per code showing the samples (red) and the
    ``wqRela.kateModel`` fit evaluated at the sampled discharges (blue).

    Args:
        iP: index of the site in the module-level ``siteNoLst``.
        axes: indexable collection of matplotlib axes.
    """
    kA = 0
    siteNo = siteNoLst[iP]
    # pd.datetime was removed from pandas; Timestamp is a datetime subclass
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)

    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'] + [
        usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    ax.set_title(siteNo)

    # plot C-Q
    for code in codeLst:
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ceq, dw, y = wqRela.kateModel(q, c, q)
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
        ax.plot(np.log(q), np.log(y), 'b*')
# training / testing: split by calendar year
yr = df.index.year.values
ind1 = np.where(yr <= 2016)[0]
ind2 = np.where(yr > 2016)[0]
dfYP = pd.DataFrame(index=df.index, columns=['WRTDS', 'LSTM'])

# WRTDS-style linear model on log-flow and the two seasonal harmonics
dfX = pd.DataFrame({'date': df.index}).set_index('date')
dfX = dfX.join(np.log(df['00060'] + sn)).rename(
    columns={'00060': 'logQ'})
t = yr + dfX.index.dayofyear.values / 365
dfX['sinT'] = np.sin(2 * np.pi * t)
dfX['cosT'] = np.cos(2 * np.pi * t)
x = dfX.iloc[ind1].values
y = df.iloc[ind1][code].values
[xx, yy], iv = utils.rmNan([x, y])
lrModel = LinearRegression()
lrModel = lrModel.fit(xx, yy)
# predict wherever all predictors are available
b = dfX.isna().any(axis=1)
yp = lrModel.predict(dfX[~b].values)
# .at is scalar-only; .loc with a boolean mask assigns the whole block
dfYP.loc[~b.values, 'WRTDS'] = yp

# LSTM: start again from a fresh predictor frame
varC = [code]
rho = 52
dfX = pd.DataFrame({'date': df.index}).set_index('date')
dfX = dfX.join(np.log(df['00060'] + sn)).rename(
    columns={'00060': 'logQ'})
t = yr + dfX.index.dayofyear.values / 365
dfX['sinT'] = np.sin(2 * np.pi * t)
dfX['cosT'] = np.cos(2 * np.pi * t)
intMatC[k, j, 0] = len(tC) intMatC[k, j, 1] = np.percentile(dd, 25) intMatC[k, j, 2] = np.percentile(dd, 50) intMatC[k, j, 3] = np.percentile(dd, 75) # calculate LombScargle if False: pMat = np.full([len(siteNoLst), len(codeLst)], np.nan) for ic, code in enumerate(codeLst): for siteNo in dictSite[code]: indS = siteNoLst.index(siteNo) df = dictObs[siteNo] t = np.arange(len(df))*7 y = df[code] tt, yy = utils.rmNan([t, y], returnInd=False) p = LombScargle(tt, yy).power(1/365) pMat[indS, ic] = p # plot 121 # plt.close('all') # codeLst2 = ['00095', '00400', '00405', '00600', '00605', # '00618', '00660', '00665', '00681', '00915', # '00925', '00930', '00935', '00940', '00945', # '00950', '00955', '70303', '71846', '80154'] # nfy, nfx = [5, 4] nfy, nfx = [3, 2] # codeLst2 = ['00010', '00300']
dd = dt.astype('timedelta64[D]').astype(int) intMatC[k, j, 0] = len(tC) intMatC[k, j, 1] = np.percentile(dd, 25) intMatC[k, j, 2] = np.percentile(dd, 50) intMatC[k, j, 3] = np.percentile(dd, 75) # calculate LombScargle if True: pMat = np.full([len(siteNoLst), len(codeLst)], np.nan) for ic, code in enumerate(codeLst): for siteNo in dictSite[code]: indS = siteNoLst.index(siteNo) df = dictObs[siteNo] t = np.arange(len(df)) * 7 y = df[code] tt, yy = utils.rmNan([t, y], returnInd=False) p = LombScargle(tt, yy).power(1 / 365) pMat[indS, ic] = p # calculate linear CQ relationship if True: rMat = np.full([len(siteNoLst), len(codeLst)], np.nan) for ic, code in enumerate(codeLst): for siteNo in dictSite[code]: indS = siteNoLst.index(siteNo) q = dictObs[siteNo]['00060'].values c = dictObs[siteNo][code].values qq, cc = utils.rmNan([q, c], returnInd=False) corr = np.corrcoef(np.log(qq + 1), cc)[1, 0] rMat[indS, ic] = corr**2
# calculate correlation on the samples from 2010-01-01 onwards
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
tAll = df.index.values
indT1 = np.where((tAll < tt) & (tAll >= t0))[0]
indT2 = np.where(tAll >= tt)[0]
dictLSTM = dictLSTMLst[0]
corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
rmseMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
# last axis: 0 = LSTM vs WRTDS, 1 = LSTM vs obs, 2 = WRTDS vs obs
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        rawLst = [
            src[siteNo][code].iloc[indT2].values
            for src in (dictLSTM, dictWRTDS, dictObs)
        ]
        pLstm, pWrtds, obs = utils.rmNan(rawLst, returnInd=False)
        pairLst = [(pLstm, pWrtds), (pLstm, obs), (pWrtds, obs)]
        for j, (a, b) in enumerate(pairLst):
            rmse, corr = utils.stat.calErr(a, b)
            corrMat[indS, ic, j] = corr
            rmseMat[indS, ic, j] = rmse

# load basin attributes
regionLst = ['ECO2_BAS_DOM', 'NUTR_BAS_DOM',
             'HLR_BAS_DOM_100M', 'PNV_BAS_DOM']
dfG = gageII.readData(siteNoLst=siteNoLst)
fileT = os.path.join(gageII.dirTab, 'lookupPNV.csv')
# calculate correlation and rmse for the training (1980-2009) and
# testing (2010 onwards) periods
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
ind1 = np.where((df.index.values < tt) & (df.index.values >= t0))[0]
ind2 = np.where(df.index.values >= tt)[0]
# last axis: [train comb, train solo, test comb, test solo]
corrMat = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
rmseMat = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
for ic, code in enumerate(codeLst):
    for k, ind in enumerate([ind1, ind2]):
        for siteNo in dictSite[code]:
            indS = siteNoLst.index(siteNo)
            v1 = dictComb[siteNo][code].iloc[ind].values
            v2 = dictSolo[code][siteNo][code].iloc[ind].values
            v3 = dictObs[siteNo][code].iloc[ind].values
            vv1, vv2, vv3 = utils.rmNan([v1, v2, v3], returnInd=False)
            rmse1, corr1 = utils.stat.calErr(vv1, vv3)
            rmse2, corr2 = utils.stat.calErr(vv2, vv3)
            corrMat[indS, ic, k * 2] = corr1
            corrMat[indS, ic, k * 2 + 1] = corr2
            # rmse was computed but never stored, leaving rmseMat all NaN
            rmseMat[indS, ic, k * 2] = rmse1
            rmseMat[indS, ic, k * 2 + 1] = rmse2

# significance test on the testing-period correlations
dfS = pd.DataFrame(index=codeLst, columns=['rmse', 'corr'])
for k, code in enumerate(codeLst):
    a = corrMat[:, k, 2]
    b = corrMat[:, k, 3]
    aa, bb = utils.rmNan([a, b], returnInd=False)
    s, p = scipy.stats.ttest_ind(aa, bb)
    dfS.at[code, 'corr'] = p
ns = len(siteNoLst)
# correlation between observed concentration and the fitted C-Q model;
# NaN-filled so that unfilled cells are never uninitialised memory
rMat = np.full([ns, nc], np.nan)
pdfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
# presumably converts ft^3/s per km^2 into m/yr -- confirm against callers
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
for k, siteNo in enumerate(siteNoLst):
    # the site's data do not depend on the code: read them once per site
    area = pdfArea.loc[siteNo]['DRAIN_SQKM']
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    df = dfC.join(dfQ)
    qAll = df['00060_00003'].values / area * unitConv
    for i, code in enumerate(codeLst):
        (q, c), ind = utils.rmNan([qAll, df[code].values])
        ceq, dw, y = wqRela.kateModel(q, c, q)
        rMat[k, i] = np.corrcoef(c, y)[0, 1]

dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'],
                        siteNoLst=siteNoLst)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values


def funcMap():
    # one map panel per code, coloured by the correlation in rMat
    figM, axM = plt.subplots(nc, 1, figsize=(8, 6))
    for k in range(nc):
        axplot.mapPoint(axM[k], lat, lon, rMat[:, k], s=12)
# select sites: load the per-code site dictionary for this data set
dictSiteName = f'dict{dataName[:4]}.json'
dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
with open(os.path.join(dirSel, dictSiteName)) as f:
    dictSite = json.load(f)

statStrLst = ['Bias', 'RMSE', 'NSE', 'Corr']
dataPlot = []
labelLst = []
for code in codeLst:
    labelLst.append(usgs.codePdf.loc[code]['shortName'] + '\n' + code)

# one box-plot figure per statistic, comparing the three models per code
for k, statStr in enumerate(statStrLst):
    temp = []
    for ic, code in enumerate(codeLst):
        cols = [m[:, ic, k] for m in (mat1, mat2, mat3)]
        [a, b, c], _ = utils.rmNan(cols)
        temp.append([a, b, c])
    # Bias and RMSE get an independent y-axis per panel
    sharey = statStr not in ['Bias', 'RMSE']
    fig, axes = figplot.boxPlot(temp, widths=0.5, figsize=(12, 4),
                                label2=['LSTM w/ Q', 'LSTM w/o Q', 'WRTDS'],
                                label1=labelLst, sharey=sharey)
    if statStr == 'Bias':
        for ax in axes:
            _ = ax.axhline(0)
    fig.show()

# # DF2 = dbBasin.DataFrameBasin('G400')
# read the weekly observations of every site
dictObs = {}
nSite = len(siteNoLst)
for k, siteNo in enumerate(siteNoLst):
    print('\t USGS site {}/{}'.format(k, nSite), end='\r')
    df = waterQuality.readSiteTS(
        siteNo, varLst=['00060'] + codeLst, freq='W', rmFlag=True)
    dictObs[siteNo] = df

# calculate correlation of each model (dictL, dictS) with the observations
corrMatTemp = np.full([len(siteNoLst), len(codeLst), 2], np.nan)
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        rawLst = [src[siteNo][code].values
                  for src in (dictObs, dictL, dictS)]
        (vv0, vv1, vv2), indV = utils.rmNan(rawLst)
        for j, pred in enumerate((vv1, vv2)):
            _, corr = utils.stat.calErr(pred, vv0)
            corrMatTemp[indS, ic, j] = corr
rMat = corrMatTemp**2

# alternative selection, overwritten by the assignment below
codeLst2 = ['00915', '00925', '00930', '00935', '00940', '00945',
            '00955', '70303', '80154']
[nfy, nfx] = [3, 3]
codeLst2 = ['00010', '00300', '00405', '00600', '00605',
            '00618', '00660', '00665', '00681', '00915',
            '00925', '00930', '00935', '00940', '00945',
            '00950', '00955', '70303', '71846', '80154']
nfy, nfx = [4, 5]
np.nanmean(errMatC1[:, 0, 1])
np.nanmean(errMatC2[:, 0, 1])

# transfer - validate if training error is correct:
# map predictions and targets back into the normalised space
mtd = wqData.extractVarMtd(master['varYC'])
xcP = transform.transInAll(ycP2, mtd, statLst=statTup[3])
xcT = transform.transInAll(ycT2, mtd, statLst=statTup[3])
mtd = wqData.extractVarMtd(master['varY'])
xP = transform.transInAll(yP2, mtd, statLst=statTup[2])
xT = transform.transInAll(yT2, mtd, statLst=statTup[2])
np.sqrt(np.nanmean((xT - xP)**2))
np.sqrt(np.nanmean((xcT - xcP)**2))
(np.sqrt(np.nanmean((xT - xP)**2)) +
 np.sqrt(np.nanmean((xcT - xcP)**2))) / 2

# see correlation per site: column 0 from xT/xP, column 1 from xcT/xcP
info = wqData.subsetInfo(testSet)
siteNoLst = info.siteNo.unique()
corrMat = np.full([len(siteNoLst), 2], np.nan)
for i, siteNo in enumerate(siteNoLst):
    indS = info[info['siteNo'] == siteNo].index.values
    pairLst = [
        (1, xcT[indS, 0], xcP[indS, 0]),
        (0, xT[-1, indS, 0], xP[-1, indS, 0]),
    ]
    for col, a, b in pairLst:
        _, indV = utils.rmNan([a, b])
        corrMat[i, col] = np.corrcoef(a[indV], b[indV])[0, 1]
np.mean(corrMat[:, 1])
codeLst2 = [
    '00010', '00095', '00300', '00400', '00405',
    '00600', '00605', '00618', '00660', '00665',
    '00681', '00915', '00925', '00930', '00935',
    '00940', '00945', '00955', '71846', '80154',
]
nfy, nfx = [5, 4]

# pool the (dQ, error) pairs of every site for one code
code = '00915'
xLst = []
yLst = []
for siteNo in siteNoLst:
    dfErr = dictErr[siteNo]
    x = dfErr['dQ']
    y = dfErr[code].values
    [xx, yy] = utils.rmNan([x[ind2], y[ind2]], returnInd=False)
    xLst.append(xx)
    yLst.append(yy)
xMat = np.concatenate(xLst)
yMat = np.concatenate(yLst)

fig, ax = plt.subplots(1, 1)
ax.plot(xMat, yMat, '*')
ax.set_xlabel('dQ/dt')
ax.set_ylabel('error')
fig.show()

# pick one random site that has this code
siteNoCode = dictSite[code]
siteNo = random.choice(siteNoCode)
dfErr = dictErr[siteNo]
x = dfErr['dQ']
y = dfErr[code].values
# calculate correlation and rmse before / from 2010-01-01
tt = np.datetime64('2010-01-01')
ind1 = np.where(df.index.values < tt)[0]
ind2 = np.where(df.index.values >= tt)[0]
nSite = len(siteNoLst)
corrLSTM = np.full([nSite, 2], np.nan)
rmseLSTM = np.full([nSite, 2], np.nan)
corrWRTDS = np.full([nSite, 2], np.nan)
rmseWRTDS = np.full([nSite, 2], np.nan)
# column 0 = period before tt, column 1 = period from tt on
for k, indT in enumerate([ind1, ind2]):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v0 = dictObs[siteNo][code].iloc[indT].values
        v1 = dictLSTM[siteNo][code].iloc[indT].values
        v2 = dictWRTDS[siteNo][code].iloc[indT].values
        [v0, v1, v2], ind = utils.rmNan([v0, v1, v2])
        rmseLSTM[indS, k], corrLSTM[indS, k] = utils.stat.calErr(
            v1, v0, rmExt=True)
        rmseWRTDS[indS, k], corrWRTDS[indS, k] = utils.stat.calErr(
            v2, v0, rmExt=True)

# box
matplotlib.rcParams.update({
    'font.size': 18,
    'lines.linewidth': 2,
    'lines.markersize': 12,
})

# # plot box
# labLst1 = [usgs.codePdf.loc[code]['shortName'] +
#            '\n'+code for code in codeLst]
saveFile = os.path.join(dirOut, siteNo) if os.path.exists(saveFile): continue varC = codeLst # varQ = ['00060'] df = waterQuality.readSiteTS(siteNo, varLst=varC, freq='W') dfX = pd.DataFrame({'date': df.index}).set_index('date') yr = dfX.index.year.values t = dfX.index.dayofyear.values/365 dfX['sinT'] = np.sin(2*np.pi*t) dfX['cosT'] = np.cos(2*np.pi*t) dfYP = pd.DataFrame(index=df.index, columns=varC) dfYP.index.name = 'date' for code in varC: # print(code) [xx, yy], _ = utils.rmNan([dfX.values, df[code].values]) [xp], iv = utils.rmNan([dfX.values]) if len(yy) <= 2: dictRes[code].loc[siteNo] = [len(yy)]+[np.nan for x in range(5)] else: lrModel = LinearRegression() lrModel = lrModel.fit(xx, yy) yp = lrModel.predict(xp) yt = lrModel.predict(xx) dfYP.at[dfX.index[iv], code] = yp coef = lrModel.coef_ inte = lrModel.intercept_ rmse = np.sqrt(np.nanmean((yt-yy)**2)) if len(np.unique(yy)) == 1: corr = -9999 else:
labLst2 = ['WRTDS test', 'LSTM test']
# per code: the two correlation columns (index 3, then index 1) to compare
dataBox = [[corrMat[:, k, i] for i in [3, 1]]
           for k in range(len(codeLst))]
fig = figplot.boxPlot(dataBox, label1=labLst1, widths=0.5, cLst='br',
                      label2=labLst2, figsize=(12, 4), yRange=[0, 1])
fig.show()

# p-values of a paired t-test between the two compared columns
testLst = ['p-value']
indLst = [[1, 3]]
codeStrLst = ['{} {}'.format(
    code, usgs.codePdf.loc[code]['shortName']) for code in codeLst]
dfS = pd.DataFrame(index=codeStrLst, columns=testLst)
for (test, ind) in zip(testLst, indLst):
    for k, code in enumerate(codeLst):
        data = [corrMat[:, k, x] for x in ind]
        [a, b], _ = utils.rmNan(data)
        s, p = scipy.stats.ttest_rel(a, b)
        # single .loc call: chained dfS.loc[row][col] = p may write to a
        # temporary and leave dfS unchanged
        dfS.loc[codeStrLst[k], test] = p
pd.options.display.float_format = '{:,.2f}'.format
print(dfS)
dictObs[siteNo] = df # calculate rsq rMat = np.full([len(siteNoLst), len(codeLst), 2], np.nan) tt = np.datetime64('2010-01-01') t0 = np.datetime64('1980-01-01') t = dictObs[siteNoLst[0]].index.values ind1 = np.where((t < tt) & (t >= t0))[0] ind2 = np.where(t >= tt)[0] for ic, code in enumerate(codeLst): for siteNo in dictSite[code]: indS = siteNoLst.index(siteNo) v1 = dictL[siteNo][code].values v2 = dictS[siteNo][code].values v0 = dictObs[siteNo][code].values (vv0, vv1, vv2), indV = utils.rmNan([v0, v1, v2]) rmse1, corr1 = utils.stat.calErr(vv1, vv0) rmse2, corr2 = utils.stat.calErr(vv2, vv0) rMat[indS, ic, 0] = corr1**2 # linearity rMat[indS, ic, 1] = corr2**2 # seasonality # a cdf for rsq of seasonality and linearity code = '00915' indS = [siteNoLst.index(siteNo) for siteNo in dictSite[code]] ic = codeLst.index(code) fig, ax = plt.subplots(1, 1) axplot.plotCDF(ax, [rMat[indS, ic, 0], rMat[indS, ic, 1]], legLst=['linearity', 'seasonality']) fig.show() fig, ax = plt.subplots(1, 1)
dfX['cosT'] = np.cos(2 * np.pi * t) ctR = pd.date_range(startDate, endDate) dfXP = pd.DataFrame({'date': ctR}).set_index('date') dfXP = dfXP.join(np.log(dfQ['00060_00003'] + 0.01)).rename(columns={'00060_00003': 'logQ'}) dfXP = dfXP.join(dfF) yr = dfXP.index.year.values t = yr + dfXP.index.dayofyear.values / 365 # dfXP['t'] = t-1979 dfXP['sinT'] = np.sin(2 * np.pi * t) dfXP['cosT'] = np.cos(2 * np.pi * t) for k in range(2): ind = indLst[k] saveFile = saveLst[k] dfYP = pd.DataFrame(index=ctR, columns=usgs.varC) dfYP.index.name = 'date' if len(ind) > 0: for code in usgs.varC: [xx, yy], iv = utils.rmNan( [dfX.iloc[ind].values, dfY.iloc[ind][code].values]) if len(xx) > 0: lrModel = LinearRegression() lrModel = lrModel.fit(xx, yy) b = dfXP.isna().any(axis=1) yp = lrModel.predict(dfXP[~b].values) yp = np.exp(yp) - sn dfYP.at[dfYP[~b].index, code] = yp dfYP.to_csv(saveFile)
dfGeo = gageII.updateCode(dfGeo)

# select sites with more than nS samples in the first result table
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]

# squared correlation of every attribute with the 'corr' column of each
# result table
varGLst = dfG.columns.tolist()
dfRsq = pd.DataFrame(index=varGLst, columns=['Rsq1', 'Rsq2'])
y1 = dfR1['corr'].values
# Rsq2 must come from the second table; the original read dfR1 twice
y2 = dfR2['corr'].values
for varG in varGLst:
    x = dfG[varG].values
    (xx, yy1, yy2), _ = utils.rmNan([x, y1, y2])
    r1 = np.corrcoef(xx, yy1)[0, 1]
    dfRsq.at[varG, 'Rsq1'] = r1**2
    r2 = np.corrcoef(xx, yy2)[0, 1]
    dfRsq.at[varG, 'Rsq2'] = r2**2
dfRsq.to_csv('temp')
# not in place: only displays the ranking when run interactively
dfRsq.sort_values('Rsq1', ascending=False)

# varG = 'SLOPE_PCT'
varG = 'HLR_BAS_PCT_100M'
# float copy: masking must not write into dfG and needs a NaN-capable dtype
x = dfG[varG].values.astype(float)
y = dfR1['corr'].values
x[x < -900] = np.nan
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, '*')
ind = wqData.subset[testSet].tolist() # ind=wqData.subset[trainSet].tolist() axes[0].plot(wqData.c[ind, indC1], ypLst2[1][:, indC2], '*') axes[1].plot(wqData.c[ind, indC1], ypLst2[0][:, indC2], '*') fig.show() fig, axes = plt.subplots(1, 2) indF = wqData.varF.index('ph') indC = wqData.varC.index('00945') axes[0].plot(wqData.c[:, indC], wqData.f[-1, :, indF], '*') indF = wqData.varF.index('SO4') axes[1].plot(wqData.c[:, indC], wqData.f[-1, :, indF], '*') fig.show() indF = wqData.varF.index('ph') _, ind = utils.rmNan([wqData.c[:, indC], wqData.f[-1, :, indF]]) np.corrcoef(wqData.c[ind, indC], wqData.f[-1, ind, indF]) # time series siteNoLst = wqData.siteNoLst fig, axes = plt.subplots(3, 1) for k in range(3): siteNo = siteNoLst[k] info1 = wqData.info.iloc[wqData.subset[trainSet]].reset_index() info2 = wqData.info.iloc[wqData.subset[testSet]].reset_index() t1 = info1['date'].values t2 = info2['date'].values indS1 = info1[info1['siteNo'] == siteNo].index.values indS2 = info2[info2['siteNo'] == siteNo].index.values t = wqData.info[wqData.info['siteNo'] == siteNo]['date'].values v = wqData.c[wqData.info['siteNo'] == siteNo][:, 0]