def readSiteTS(siteNo, varLst, freq='D', area=None,
               sd=np.datetime64('1979-01-01'),
               ed=np.datetime64('2019-12-31'), rmFlag=True):
    """Read daily (or weekly-averaged) time series for one USGS site.

    Joins water-quality samples, streamflow (and derived runoff), gridMET
    forcings, NTN deposition, GLASS variables and computed time variables
    into one DataFrame indexed by date, restricted to the columns in varLst.

    Arguments:
        siteNo {str} -- USGS site number
        varLst {list} -- variable codes; partitioned against the known
            variable list of each data source
    Keyword Arguments:
        freq {str} -- 'D' for daily, 'W' for weekly (W-TUE) means
        area {float} -- basin area [sq km]; looked up from gageII when None
            and 'runoff' is requested
        sd, ed {np.datetime64} -- start / end date of the output index
        rmFlag {bool} -- if True, remove flagged water-quality samples
    Raises:
        ValueError -- when freq is neither 'D' nor 'W' (the original
            silently returned None in that case)
    """
    td = pd.date_range(sd, ed)
    # partition the requested variables by data source
    varC = list(set(varLst).intersection(usgs.varC))       # water quality
    varQ = list(set(varLst).intersection(usgs.varQ))       # streamflow
    varF = list(set(varLst).intersection(gridMET.varLst))  # forcing
    varP = list(set(varLst).intersection(ntn.varLst))      # deposition
    varR = list(set(varLst).intersection(GLASS.varLst))    # remote sensing
    varT = list(set(varLst).intersection(varTLst))         # time variables
    dfD = pd.DataFrame({'date': td}).set_index('date')
    if len(varC) > 0:
        if rmFlag:
            dfC, dfCF = usgs.readSample(
                siteNo, codeLst=varC, startDate=sd, flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)
    if len(varQ) > 0:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                tabArea = gageII.readData(
                    varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)
    if len(varF) > 0:
        dfF = gridMET.readBasin(siteNo, varLst=varF)
        dfD = dfD.join(dfF)
    if len(varP) > 0:
        dfP = ntn.readBasin(siteNo, varLst=varP, freq='D')
        dfD = dfD.join(dfP)
    if len(varR) > 0:
        dfR = GLASS.readBasin(siteNo, varLst=varR, freq='D')
        dfD = dfD.join(dfR)
    if len(varT) > 0:
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])
    dfD = dfD[varLst]  # restore the caller's requested column order
    if freq == 'D':
        return dfD
    elif freq == 'W':
        # weekly mean, weeks ending on Tuesday
        return dfD.resample('W-TUE').mean()
    else:
        # FIX: previously fell off the end and returned None for any other freq
        raise ValueError("freq must be 'D' or 'W', got {!r}".format(freq))
def readSiteY(siteNo, varY, area=None, sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Read target variables (water-quality samples, their flags,
    streamflow and derived runoff) for one site, as a daily DataFrame
    restricted to the columns in varY."""
    dates = pd.date_range(sd, ed)
    out = pd.DataFrame({'date': dates}).set_index('date')
    # water-quality codes requested among varY
    codeLst = [c for c in varY if c in usgs.codeLst]
    dfC, dfCF = usgs.readSample(
        siteNo, codeLst=codeLst, startDate=sd, flag=True)
    if '00060' in varY or 'runoff' in varY:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varY:
            if area is None:
                # look up the drainage area when the caller did not supply it
                tab = gageII.readData(
                    varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
                area = tab['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        out = out.join(dfQ)
    out = out.join(dfC).join(dfCF)
    return out[varY]
def funcPoint(iP, axes):
    """Plot normalized time series, C-Q scatter and Lomb-Scargle spectra
    for site iP of the module-level siteNoLst (uses module-level codeLst)."""
    kA = 0
    siteNo = siteNoLst[iP]
    # FIX: pd.datetime was removed in modern pandas; pd.Timestamp is the
    # drop-in equivalent
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)
    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'] + [
        usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    # plot C-Q
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
    # plot fractal (Lomb-Scargle periodogram of the irregular samples)
    for k in range(nc):
        code = codeLst[k]
        dfV = dfData[dfData[code].notna()]
        nt = len(dfData)
        # NOTE(review): lombscargle expects numeric sample times; passing a
        # datetime64 array may need an explicit .astype(float) — confirm
        # against the scipy version in use
        x = dfV.index.values.astype('datetime64[D]')
        y = dfV[code].values
        freq = 2 * np.pi / np.linspace(2, nt, nt)
        power = signal.lombscargle(x, y, freq)
        ax = axes[kA]
        kA = kA + 1
        # FIX: `freq / 2 * np.pi` evaluates as (freq/2)*pi; converting the
        # angular frequency back to ordinary frequency needs freq / (2*pi),
        # mirroring freq = 2*pi/period above (constant shift on the log axis)
        ax.plot(np.log(freq / (2 * np.pi)), np.log(power), '-*')
        fyr = 2 * np.pi / 365
        pyr = signal.lombscargle(x, y, [fyr])
        ax.plot(np.log(fyr / (2 * np.pi)), np.log(pyr), 'r*')
def funcPoint(iP, axP):
    """Plot streamflow, concentration time series and the fitted dilution
    curve for HBN site iP (uses module-level area, code, pMat2, unitConv)."""
    siteNo = siteNoHBN[iP]
    dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    dfQ = usgs.readStreamflow(siteNo)
    df = dfC.join(dfQ)
    t = df.index.values
    q = df['00060_00003'].values / area * unitConv
    c = df[code].values
    [q, c], ind = utils.rmNan([q, c])
    t = t[ind]
    # full streamflow record on the top panel
    axplot.plotTS(axP[0], dfQ.index.values, dfQ['00060_00003'].values,
                  cLst='b', styLst='--')
    axplot.plotTS(axP[1], t, c)
    axP[2].plot(np.log(q), c, 'k*')
    # 20 log-spaced discharges spanning the observed positive range
    qPos = q[q > 0]
    qValid = q[~np.isnan(q)]
    x = 10**np.linspace(
        np.log10(np.min(qPos)), np.log10(np.max(qValid)), 20)
    ceq0, dw0 = pMat2[iP, 0], pMat2[iP, 1]
    y0 = ceq0 / (x / dw0 + 1)
    axP[2].plot(np.log(x), y0, 'r-')
    axP[2].set_title('ceq={:.3f},dw={:.3f}'.format(ceq0, dw0))
def funcPoint(iP, axes):
    """Plot normalized time series and C-Q scatter with the fitted
    kateModel prediction for site iP of the module-level siteNoLst."""
    kA = 0
    siteNo = siteNoLst[iP]
    # FIX: pd.datetime was removed in modern pandas; pd.Timestamp is the
    # drop-in equivalent
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)
    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'] + [
        usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    ax.set_title(siteNo)
    # plot C-Q: observations in red, model prediction in blue
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ceq, dw, y = wqRela.kateModel(q, c, q)
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
        ax.plot(np.log(q), np.log(y), 'b*')
"""Plot flagged vs. unflagged water-quality samples for two codes."""
from hydroDL.app import waterQuality
from hydroDL.data import usgs
import numpy as np
import pandas as pd
from hydroDL.post import axplot, figplot
import matplotlib.pyplot as plt

siteNo = '09163500'
varC = ['00660', '00618']
sd = np.datetime64('1979-01-01')
ed = np.datetime64('2019-12-31')
df = waterQuality.readSiteTS(siteNo, varLst=['00060'] + varC)
dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=sd, flag=2)
#
fig, axes = plt.subplots(2, 1)
for ax, code in zip(axes, varC):
    val = dfC[code].values
    flg = dfCF[code + '_cd'].values
    tAx = dfC.index.values
    # indices of flagged samples
    idxFlag = np.where(flg == 1)[0]
    axplot.plotTS(ax, tAx, val, cLst='r', styLst=['-*'])
    axplot.plotTS(ax, tAx[idxFlag], val[idxFlag], cLst='b', styLst='*')
fig.show()
# NOTE(review): this chunk begins mid-call — the leading line is the tail of a
# read whose opening (likely gageII.readData(...) assigned to dfCrd) is
# outside this view.
                    varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values


def funcMap():
    # site map plus an empty panel for the per-site time series
    figM, axM = plt.subplots(1, 1, figsize=(8, 4))
    axplot.mapPoint(axM, lat, lon, nSite, s=12)
    figP, axP = plt.subplots(1, 1, figsize=(12, 6))
    return figM, axM, figP, axP, lon, lat


def funcPoint(iP, axP):
    # time series of one water-quality code at the clicked site
    siteNo = siteNoLst[iP]
    dfC = waterQuality.readSiteY(siteNo, [code])
    t = dfC.index.values.astype(np.datetime64)
    axplot.plotTS(axP, t, dfC[code], styLst='*')
    axP.set_title('{} #samples = {}'.format(siteNo, dfC.count().values))


figplot.clickMap(funcMap, funcPoint)

siteNo = '401733105392404'
# NOTE(review): pd.datetime was removed from modern pandas — pd.Timestamp
# would be the drop-in replacement; left unchanged here
sd = pd.datetime(1980, 1, 1)
dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=sd)
# NOTE(review): a positional '*' after the figsize keyword is a SyntaxError —
# the '*' was probably meant as a plot style for ax.plot, not a subplots
# argument; confirm intent before fixing
fig, ax = plt.subplots(1, 1, figsize=(12, 6), '*')
ax.plot(dfC)
fig.show()
dfC.plot()
plt.show()
# Build (or load) a dict of per-site C-Q DataFrames joined on sample dates.
dirUSGS = os.path.join(kPath.dirData, 'USGS')
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
dirCQ = os.path.join(kPath.dirWQ, 'C-Q')
fileSiteNoLst = os.path.join(dirInv, 'siteNoLst')
siteNoLst = pd.read_csv(fileSiteNoLst, header=None, dtype=str)[0].tolist()

t0 = time.time()
fileName = os.path.join(dirCQ, 'CQall')
if not os.path.exists(fileName):
    dictData = dict()
    errLst = list()  # sites that produced no sample data
    for i, siteNo in enumerate(siteNoLst):
        csvC = os.path.join(kPath.dirData, 'USGS', 'sample', 'csv', siteNo)
        csvQ = os.path.join(
            kPath.dirData, 'USGS', 'streamflow', 'csv', siteNo)
        dfC = usgs.readSample(siteNo, codeLst=waterQuality.codeLst)
        dfQ = usgs.readStreamflow(siteNo)
        if len(dfC.index) == 0:
            # NOTE(review): the site is recorded as an error but the concat
            # below still runs on the empty frame — confirm this is intended
            errLst.append(siteNo)
        # inner join keeps only dates present in both samples and streamflow
        pdf = pd.concat(
            [dfC.set_index('date').dropna(how='all'),
             dfQ.set_index('date')],
            axis=1, join='inner')
        dictData[siteNo] = pdf
        print('\t {}/{} {:.2f}'.format(
            i, len(siteNoLst), time.time() - t0), end='\r')
    # NOTE(review): fileName is re-pointed to 'tempData' here, so the pickle
    # is written somewhere other than the dirCQ path tested above — verify
    fileName = os.path.join(kPath.dirWQ, 'tempData', 'CQall')
    pickle.dump(dictData, open(fileName, 'wb'))
else:
def wrapData(caseName, siteNoLst, rho=365, nFill=5,
             varC=usgs.varC, varG=gageII.lstWaterQuality):
    """Wrap up input and target data for the model, as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series

    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site
    Keyword Arguments:
        rho {int} -- length of the input window in days (default: {365})
        nFill {int} -- max number of continuous nan to interpolate in input
            data (default: {5})
        varC {list} -- list of water quality code to learn
            (default: {usgs.varC})
        varG {list} -- list of constant variables in gageII
            (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency.
    # FIX: pd.datetime was removed in modern pandas; pd.Timestamp is the
    # drop-in equivalent
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    # gageII constants
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()   # forcing ts
    gLst = list()   # geo-const
    qLst = list()   # streamflow
    cLst = list()   # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(
            siteNo, codeLst=varC, startDate=startDate, flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        for k in range(len(dfC)):
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            # skip samples whose rho-day window falls outside [start, end]
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({'date': ctR}).set_index('date').join(
                dfQ).interpolate(limit=nFill, limit_direction='both')
            tempF = pd.DataFrame({'date': ctR}).set_index('date').join(
                dfF).interpolate(limit=nFill, limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        # NOTE(review): the original literal was mangled by a raw line break
        # (illegal inside a single-quoted string); reconstructed on one line
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    # stack samples to [nT, nP, nX] (ts) and [nP, nY] (per-sample)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff as a second streamflow channel
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    # save arrays, per-sample info, and case metadata
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName, rho=rho, nFill=nFill, varG=varG,
                    varC=varC, varQ=['00060', 'runoff'],
                    varF=gridMET.varLst, siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
from hydroDL import kPath, utils
from hydroDL.app import waterQuality
from hydroDL.master import basins
from hydroDL.data import usgs, gageII, gridMET, ntn, transform
from hydroDL.master import slurm
from hydroDL.post import axplot, figplot
import numpy as np
import matplotlib.pyplot as plt

# weekly-averaged concentration, colored by whether any flagged sample
# fell in the week
code = '00660'
siteNo = '01111500'
df = waterQuality.readSiteTS(siteNo, [code], freq='D').dropna()
dfC, dfCF = usgs.readSample(siteNo, codeLst=[code], flag=2)
dfC = dfC.resample('W-TUE').mean()
dfCF = dfCF.fillna(0)
dfCFW = dfCF.resample('W-TUE').mean().fillna(0)
# mark a week as flagged if any sample inside it carried a flag
dfCFW[dfCFW != 0] = 1
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
t = dfC.index
v = dfC[code].values
flag = dfCFW[code + '_cd'].values
clean = flag == 0
ax.plot(t[clean], v[clean], 'r*')
ax.plot(t[~clean], v[~clean], 'k*')
fig.show()
# scatter of model error vs. basin area, HBN sites highlighted
# code = '00955'
err = errMat[:, wqData.varC.index(code), 1]
fig, ax = plt.subplots(1, 1)
ax.plot(area, err, 'b*')
ax.plot(area[indHBN], err[indHBN], 'r*')
# np.nanmedian(err)
# np.nanmedian(err[indHBN, :])
fig.show()

# dw vs error
code = '00955'
# code = '00600'
pMat = np.full([len(siteNoLst), 2], np.nan)
for k, siteNo in enumerate(siteNoLst):
    # NOTE: rebinds `area` (the array used for the plot above) to a scalar
    area = dfX.loc[siteNo]['DRAIN_SQKM']
    dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    dfQ = usgs.readStreamflow(siteNo)
    df = dfC.join(dfQ)
    t = df.index.values
    q = df['00060_00003'].values / area * unitConv
    c = df[code].values
    try:
        ceq, dw, y = relaCQ.kateModel2(q, c)
        pMat[k, 0] = ceq
        pMat[k, 1] = dw
    except Exception:
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt /
        # SystemExit; the fit is best-effort, so a failed site stays NaN
        pass
fig, ax = plt.subplots(1, 1)
ax.plot(pMat[:, 1], err, 'b*')
ax.plot(pMat[indHBN, 1], err[indHBN], 'r*')
fig.show()
# WRTDS-F evaluation: per-site, per-code corr / error on even vs. odd years
dfRmseLst = [
    pd.DataFrame(index=siteNoLst, columns=usgs.varC) for x in range(2)
]
for siteNo in siteNoLst:
    outFolder = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-F')
    saveFile = os.path.join(outFolder, trainSet, siteNo)
    dfP = pd.read_csv(saveFile, index_col=None)
    # a bug - did not save dates; rebuild the daily index
    # FIX: pd.datetime was removed in modern pandas; pd.Timestamp is the
    # drop-in equivalent
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2020, 1, 1)
    ctR = pd.date_range(startDate, endDate)
    dfP.index = ctR
    dfP.index.name = 'date'
    dfY = pd.DataFrame({'date': ctR}).set_index('date')
    dfC, dfCF = usgs.readSample(siteNo, usgs.varC, flag=2)
    dfC[dfCF != 0] = np.nan  # drop flagged samples
    dfY = dfY.join(dfC)
    yr = dfY.index.year.values
    # indices of even (k=0) and odd (k=1) years
    indLst = [np.where(yr % 2 == x)[0] for x in [0, 1]]
    for code in usgs.varC:
        for k in range(2):
            ind = indLst[k]
            corr = dfY.iloc[ind][code].corr(dfP.iloc[ind][code])
            # NOTE(review): this is sqrt of the *summed* squared error, not
            # a mean — confirm whether true RMSE was intended
            rmse = np.sqrt(
                np.sum((dfY.iloc[ind][code] - dfP.iloc[ind][code])**2))
            # FIX: use .loc[row, col] — chained .loc[siteNo][code] assignment
            # writes to a temporary copy; and the original stored `corr` into
            # the RMSE table (copy-paste bug)
            dfCorrLst[k].loc[siteNo, code] = corr
            dfRmseLst[k].loc[siteNo, code] = rmse
for k in range(2):
    if k == 0:
        testSet = 'Yeven'
# upgrade code to read flags and save CSV
import os

import pandas as pd

from hydroDL import kPath
from hydroDL.app import waterQuality
from hydroDL.data import usgs

siteNo = '07060710'
# force a fresh read (csv=False) with remark flags attached
dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst, flag=True, csv=False)
# inspect remark flags ('x', 'X', '<', 'E') for a couple of codes at one site
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
dfAll = pd.read_csv(
    os.path.join(dirInv, 'codeCount.csv'),
    dtype={'siteNo': str}).set_index('siteNo')

# pick some sites
# codeLst = ['00915', '00940', '00955','00300']
codeLst = ['00660', '00600']
# FIX: pd.datetime was removed in modern pandas; pd.Timestamp is the
# drop-in equivalent
startDate = pd.Timestamp(1979, 1, 1)
endDate = pd.Timestamp(2019, 12, 31)
siteNo = '07060710'
dfC, dfCF = usgs.readSample(
    siteNo, codeLst=codeLst, startDate=startDate, flag=True)
dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
fig, axes = plt.subplots(len(codeLst), 1)
for k, code in enumerate(codeLst):
    flagLst = ['x', 'X', '<', 'E']
    axes[k].plot(dfC[code], '*', label='others')
    for flag in flagLst:
        # overlay only the samples carrying this particular remark flag
        axes[k].plot(dfC[code][dfCF[code + '_cd'] == flag], '*', label=flag)
    shortName = usgs.codePdf.loc[code]['shortName']
    title = '{} {} {}'.format(siteNo, shortName, code)
    axes[k].set_title(title)
    axes[k].legend()
fig.show()
# NOTE(review): this chunk starts inside a fill loop whose opening (the
# while statement, `dist`, `data`, `t`, `varNtn` setup) is outside this view
    idOut[indRow] = ntnId
    distOut[indRow] = dist[ntnId]
    dist = dist.drop(ntnId)  # exclude the used station from the next pass
    # rows still containing NaN after this pass
    indRow = np.unique(np.where(np.isnan(data))[0])
    if len(indRow) == 0:
        break
# end of while
# rows never filled get NaN distance / id
distOut[indRow] = np.nan
idOut[indRow] = np.nan
dfP = pd.DataFrame(index=t, columns=varNtn, data=data)
dfP['distNTN'] = distOut
dfP['idNTN'] = idOut
dfP.index.name = 'date'
# read C, Q, F
dfC = usgs.readSample(siteNo, codeLst=varC)
dfQ = usgs.readStreamflow(siteNo)
dfF = gridMET.readBasin(siteNo)
# convert to weekly
td = pd.date_range(start='1979-01-01', end='2019-12-30', freq='D')
df = pd.DataFrame({'date': td}).set_index('date')
df = df.join(dfC)
df = df.join(dfQ)
df = df.join(dfF)
df = df.rename(columns={'00060_00003': '00060'})
dfW = df.resample('W-TUE').mean()
dfW = dfW.join(dfP)
dfW = dfW.loc[t]
# weekly load
# weekly volume from mean daily cfs: 0.3048^3 m^3 per ft^3, 60*60*24*7 s/week
dfW['Q'] = dfW['00060']*60*60*24*7*(0.3048**3)  # m^3/week
# build X (forcing + optional Q) and Y (target) frames for one test site
tabG = gageII.updateCode(tabG)
siteNo = siteNoLst[0]
# testset - only get sd ed
tTest = infoTest[infoTest['siteNo'] == siteNo]['date'].values
# pad rho-1 days of history before the first test date for the input window
sdX = tTest[0] - np.timedelta64(rho - 1, 'D')
sdY = tTest[0]
ed = tTest[-1]
trX = pd.date_range(sdX, ed)
trY = pd.date_range(sdY, ed)
dfX = pd.DataFrame({'date': trX}).set_index('date')
dfY = pd.DataFrame({'date': trY}).set_index('date')
# extract data
dfC = usgs.readSample(siteNo, codeLst=varYC, startDate=sdX)
dfF = gridMET.readBasin(siteNo)
dfQ = usgs.readStreamflow(siteNo, startDate=sdX)
dfQ = dfQ.rename(columns={'00060_00003': '00060'})
area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values
# presumably converts cfs over a sq-km basin to a depth rate
# (0.3048^3 m^3/ft^3, seconds per year, 1000^2 m^2 per km^2) — TODO confirm units
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
dfQ['runoff'] = dfQ['00060'] / area * unitConv
# NOTE(review): the elif means Q joins X *or* Y, never both — confirm that
# requesting streamflow in both varX and varY is not expected here
if '00060' in varX or 'runoff' in varX:
    dfX = dfX.join(dfQ)
elif '00060' in varY or 'runoff' in varY:
    dfY = dfY.join(dfQ)
dfX = dfX.join(dfF)
dfY = dfY.join(dfC)
dfX = dfX[varX]
dfY = dfY[varY + varYC]