def readSiteY(siteNo, varY, area=None, sd=np.datetime64('1979-01-01'), ed=np.datetime64('2020-01-01')):
    """Assemble a daily table of target (Y) variables for one USGS site.

    Water-quality codes in varY are read from the sample record along with
    their QC flags; '00060' (streamflow) and derived 'runoff' come from the
    streamflow record. Returns a DataFrame indexed by date over [sd, ed],
    restricted to the columns in varY.
    """
    days = pd.date_range(sd, ed)
    out = pd.DataFrame({'date': days}).set_index('date')
    # water-quality samples (values + flags)
    sampleCodes = [c for c in varY if c in usgs.codeLst]
    dfC, dfCF = usgs.readSample(siteNo, codeLst=sampleCodes, startDate=sd, flag=True)
    # streamflow / runoff, only when requested
    if '00060' in varY or 'runoff' in varY:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd).rename(
            columns={'00060_00003': '00060'})
        if 'runoff' in varY:
            if area is None:
                # look up drainage area from gageII when not supplied
                tabArea = gageII.readData(
                    varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        out = out.join(dfQ)
    out = out.join(dfC).join(dfCF)
    return out[varY]
def readSiteTS(siteNo, varLst, freq='D', area=None, sd=np.datetime64('1979-01-01'), ed=np.datetime64('2019-12-31'), rmFlag=True):
    """Read merged daily (or weekly-averaged) time series for one USGS site.

    varLst may mix water-quality codes (usgs.varC), streamflow variables
    (usgs.varQ, including derived 'runoff'), gridMET forcings, NTN deposition,
    GLASS remote-sensing variables and computed time variables (varTLst).

    Keyword Arguments:
        freq {str} -- 'D' for daily or 'W' for weekly mean (W-TUE) (default: {'D'})
        area {float} -- basin drainage area [km2]; looked up in gageII if None
        rmFlag {bool} -- remove flagged water-quality samples (default: {True})

    Raises:
        ValueError -- if freq is not 'D' or 'W' (previously fell through and
            returned None silently, which hid caller typos)
    """
    # read data
    td = pd.date_range(sd, ed)
    # split requested variables by data source
    varC = list(set(varLst).intersection(usgs.varC))
    varQ = list(set(varLst).intersection(usgs.varQ))
    varF = list(set(varLst).intersection(gridMET.varLst))
    varP = list(set(varLst).intersection(ntn.varLst))
    varR = list(set(varLst).intersection(GLASS.varLst))
    varT = list(set(varLst).intersection(varTLst))
    dfD = pd.DataFrame({'date': td}).set_index('date')
    if len(varC) > 0:
        if rmFlag:
            dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=sd, flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)
    if len(varQ) > 0:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                # look up drainage area from gageII when not supplied
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)
    if len(varF) > 0:
        dfF = gridMET.readBasin(siteNo, varLst=varF)
        dfD = dfD.join(dfF)
    if len(varP) > 0:
        dfP = ntn.readBasin(siteNo, varLst=varP, freq='D')
        dfD = dfD.join(dfP)
    if len(varR) > 0:
        dfR = GLASS.readBasin(siteNo, varLst=varR, freq='D')
        dfD = dfD.join(dfR)
    if len(varT) > 0:
        # computed time features from the date axis (see calT)
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])
    dfD = dfD[varLst]
    if freq == 'D':
        return dfD
    elif freq == 'W':
        # weekly means anchored on Tuesday
        return dfD.resample('W-TUE').mean()
    else:
        raise ValueError('freq must be "D" or "W", got {!r}'.format(freq))
def funcPoint(iP, axes):
    """Plot normalized time series, log C-Q scatter and Lomb-Scargle spectra
    for the iP-th site in siteNoLst (globals: siteNoLst, codeLst)."""
    kA = 0
    siteNo = siteNoLst[iP]
    # FIX: pd.datetime was deprecated in pandas 0.25 and removed in 2.0;
    # pd.Timestamp is the drop-in replacement
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)
    # plot normalized (z-scored) time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'] + [usgs.codePdf.loc[code]['shortName']
                               for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    # plot C-Q in log-log space, one panel per code
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
    # plot fractal behavior: Lomb-Scargle power spectrum per code
    for k in range(nc):
        code = codeLst[k]
        dfV = dfData[dfData[code].notna()]
        nt = len(dfData)
        # NOTE(review): lombscargle expects float sample times; this passes
        # datetime64[D] values directly -- verify the conversion to day
        # counts happens as intended.
        x = dfV.index.values.astype('datetime64[D]')
        y = dfV[code].values
        # angular frequencies for periods from 2 days up to record length
        freq = 2 * np.pi / np.linspace(2, nt, nt)
        power = signal.lombscargle(x, y, freq)
        ax = axes[kA]
        kA = kA + 1
        # FIX: `freq / 2 * np.pi` evaluates to freq*pi/2 by precedence;
        # the intended ordinary frequency is angular frequency / (2*pi)
        ax.plot(np.log(freq / (2 * np.pi)), np.log(power), '-*')
        fyr = 2 * np.pi / 365  # angular frequency of the annual cycle
        pyr = signal.lombscargle(x, y, [fyr])
        ax.plot(np.log(fyr / (2 * np.pi)), np.log(pyr), 'r*')
def funcPoint(iP, axP):
    """Plot streamflow, concentration series and the fitted C-Q curve for
    the iP-th HBN site (globals: siteNoHBN, area, unitConv, code, pMat2)."""
    siteNo = siteNoHBN[iP]
    dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    dfQ = usgs.readStreamflow(siteNo)
    df = dfC.join(dfQ)
    t = df.index.values
    # convert streamflow to specific discharge using basin area
    q = df['00060_00003'].values / area * unitConv
    c = df[code].values
    [q, c], ind = utils.rmNan([q, c])
    t = t[ind]
    # full streamflow record for the top panel
    qAll = dfQ['00060_00003'].values
    qT = dfQ.index.values
    axplot.plotTS(axP[0], qT, qAll, cLst='b', styLst='--')
    axplot.plotTS(axP[1], t, c)
    axP[2].plot(np.log(q), c, 'k*')
    # evaluate the fitted model on 20 log-spaced discharge points
    logLo = np.log10(np.min(q[q > 0]))
    logHi = np.log10(np.max(q[~np.isnan(q)]))
    x = 10**np.linspace(logLo, logHi, 20)
    ceq0 = pMat2[iP, 0]
    dw0 = pMat2[iP, 1]
    y0 = ceq0 * 1 / (x / dw0 + 1)
    axP[2].plot(np.log(x), y0, 'r-')
    axP[2].set_title('ceq={:.3f},dw={:.3f}'.format(ceq0, dw0))
def funcPoint(iP, axes):
    """Plot normalized time series and log C-Q scatter with the fitted Kate
    model for the iP-th site in siteNoLst (globals: siteNoLst, codeLst)."""
    kA = 0
    siteNo = siteNoLst[iP]
    # FIX: pd.datetime was deprecated in pandas 0.25 and removed in 2.0;
    # pd.Timestamp is the drop-in replacement
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)
    # plot normalized (z-scored) time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'] + [usgs.codePdf.loc[code]['shortName']
                               for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    ax.set_title(siteNo)
    # plot C-Q: observations (red) vs Kate model prediction (blue)
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ceq, dw, y = wqRela.kateModel(q, c, q)
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
        ax.plot(np.log(q), np.log(y), 'b*')
def readSiteX(siteNo, varX, area=None, nFill=5, sd=np.datetime64('1979-01-01'), ed=np.datetime64('2020-01-01')):
    """Assemble a daily table of input (X) variables for one USGS site.

    gridMET forcings are always read; '00060' (streamflow) and derived
    'runoff' come from the streamflow record when requested. Gaps of up to
    nFill consecutive days are interpolated in both directions.
    """
    days = pd.date_range(sd, ed)
    out = pd.DataFrame({'date': days}).set_index('date')
    # basin-averaged gridMET forcings
    dfF = gridMET.readBasin(siteNo)
    # streamflow / runoff, only when requested
    if '00060' in varX or 'runoff' in varX:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd).rename(
            columns={'00060_00003': '00060'})
        if 'runoff' in varX:
            if area is None:
                # look up drainage area from gageII when not supplied
                tabArea = gageII.readData(
                    varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        out = out.join(dfQ)
    out = out.join(dfF)
    out = out[varX]
    # fill short gaps (up to nFill days) in both directions
    return out.interpolate(limit=nFill, limit_direction='both')
def funcPoint(iP, axP):
    """Plot observed vs predicted time series for one site, split at the
    train/test boundary (globals: siteNoLst, wqData, trainSet, testSet,
    pLst1, pLst2, o1, o2, codeSel, codePdf)."""
    siteNo = siteNoLst[iP]
    # indices of this site's samples within the train and test subsets
    info1 = wqData.subsetInfo(trainSet)
    info2 = wqData.subsetInfo(testSet)
    ind1 = info1[info1['siteNo'] == siteNo].index
    ind2 = info2[info2['siteNo'] == siteNo].index
    t1 = info1['date'][ind1].values.astype(np.datetime64)
    t2 = info2['date'][ind2].values.astype(np.datetime64)
    # vertical marker halfway between last training and first testing date
    tBar = t1[-1] + (t2[0] - t1[-1]) / 2
    t = np.concatenate([t1, t2])
    # plot Q
    tq = pd.date_range(t[0], t[-1])
    tempQ = usgs.readStreamflow(siteNo)
    dfQ = pd.DataFrame({'date': tq}).set_index('date').join(tempQ)
    axplot.plotTS(axP[0], dfQ.index.values, [dfQ['00060_00003'].values],
                  tBar=tBar, styLst=['--b'])
    # plot C: one panel per selected code, train+test concatenated
    k = 1
    for code in codeSel:
        ic = wqData.varC.index(code)
        shortName = codePdf.loc[code]['shortName']
        title = '{} {} {}'.format(siteNo, shortName, code)
        xTS = list()
        # NOTE(review): pLst*[1] and pLst*[3] are presumably predictions
        # from the models with / without streamflow input (see legLst
        # below) -- confirm against where pLst1/pLst2 are built.
        xTS.append(np.concatenate([pLst1[1][ind1, ic], pLst2[1][ind2, ic]]))
        xTS.append(np.concatenate([pLst1[3][ind1, ic], pLst2[3][ind2, ic]]))
        # observations
        xTS.append(np.concatenate([o1[ind1, ic], o2[ind2, ic]]))
        axplot.plotTS(axP[k], t, xTS, styLst=['--b', '--r', '*k'],
                      legLst=['w/ Q', 'w/o Q', 'obs'], tBar=tBar)
        axP[k].set_title(title)
        k = k + 1
# Build (or load) a cached dict of per-site C-Q tables:
# {siteNo: DataFrame of water-quality samples inner-joined with streamflow}.
dirUSGS = os.path.join(kPath.dirData, 'USGS')
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
dirCQ = os.path.join(kPath.dirWQ, 'C-Q')
fileSiteNoLst = os.path.join(dirInv, 'siteNoLst')
siteNoLst = pd.read_csv(fileSiteNoLst, header=None, dtype=str)[0].tolist()
t0 = time.time()
fileName = os.path.join(dirCQ, 'CQall')
if not os.path.exists(fileName):
    dictData = dict()
    errLst = list()  # sites that yielded no water-quality samples
    for i, siteNo in enumerate(siteNoLst):
        dfC = usgs.readSample(siteNo, codeLst=waterQuality.codeLst)
        dfQ = usgs.readStreamflow(siteNo)
        if len(dfC.index) == 0:
            errLst.append(siteNo)
        # inner join keeps only dates with both a sample and a flow value
        pdf = pd.concat(
            [dfC.set_index('date').dropna(how='all'),
             dfQ.set_index('date')],
            axis=1, join='inner')
        dictData[siteNo] = pdf
        print('\t {}/{} {:.2f}'.format(
            i, len(siteNoLst), time.time() - t0), end='\r')
    # FIX: previously the dict was dumped to kPath.dirWQ/tempData/CQall
    # while the existence check above looks at dirCQ/CQall, so the cache
    # written here was never found on the next run. Save to the checked
    # path, and close the file handle via a context manager.
    with open(fileName, 'wb') as f:
        pickle.dump(dictData, f)
else:
    with open(fileName, 'rb') as f:
        dictData = pickle.load(f)
# read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY] fLst = list() # forcing ts gLst = list() # geo-const qLst = list() # streamflow cLst = list() # water quality cfLst = list() # water quality flags infoLst = list() t0 = time.time() for i, siteNo in enumerate(siteNoLst): t1 = time.time() dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=startDate, flag=2) dfQ = usgs.readStreamflow(siteNo, startDate=startDate) dfF = gridMET.readBasin(siteNo) for k in range(len(dfC)): ct = dfC.index[k] ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct) if (ctR[0] < startDate) or (ctR[-1] > endDate): continue tempQ = pd.DataFrame({ 'date': ctR }).set_index('date').join(dfQ).interpolate(limit=nFill, limit_direction='both') tempF = pd.DataFrame({ 'date': ctR }).set_index('date').join(dfF).join(dfP).interpolate( limit=nFill, limit_direction='both') qLst.append(tempQ.values)
def wrapData(caseName, siteNoLst, rho=365, nFill=5, varC=usgs.varC, varG=gageII.lstWaterQuality):
    """Wrap up input and target data for the model, as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series

    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS sites

    Keyword Arguments:
        rho {int} -- length of the input window in days (default: {365})
        nFill {int} -- max number of continuous nan to interpolate in input
            data (default: {5})
        varC {list} -- list of water quality codes to learn
            (default: {usgs.varC})
        varG {list} -- list of constant variables in gageII
            (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency.
    # FIX: pd.datetime was deprecated in pandas 0.25 and removed in 2.0;
    # pd.Timestamp is the drop-in replacement
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    # gageII constant attributes
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(siteNo, codeLst=varC,
                                    startDate=startDate, flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        for k in range(len(dfC)):
            ct = dfC.index[k]
            # rho-day window ending on the sample date
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            # skip samples whose window falls outside the study period
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfQ).interpolate(
                limit=nFill, limit_direction='both')
            tempF = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfF).interpolate(
                limit=nFill, limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        # FIX: the format string was broken across a raw newline in the
        # source; rejoined onto one literal
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    # stack sample lists into [nT, nP, nX] / [nP, nX] arrays
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff as a second streamflow channel
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    # save arrays (.npz), sample info (.csv) and case metadata (.json)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName, rho=rho, nFill=nFill, varG=varG,
                    varC=varC, varQ=['00060', 'runoff'],
                    varF=gridMET.varLst, siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)