def transIn(self, statTup=None, subset=None, varTup=None):
    """Extract data for the given subset/variables and normalize it.

    When statTup is None, normalization statistics are computed from the
    data and returned alongside it; otherwise the provided statistics are
    reused and only the normalized arrays are returned. Entries whose data
    is None pass through as None.
    """
    dataTup = self.extractData(varTup=varTup, subset=subset)
    t0 = time.time()
    if statTup is None:
        # first pass: derive stats while normalizing
        normLst = list()
        statLst = list()
        for data, var in zip(dataTup, varTup):
            if data is None:
                norm, stat = None, None
            else:
                norm, stat = transform.transInAll(data, self.extractVarMtd(var))
            normLst.append(norm)
            statLst.append(stat)
        print('transform time {:.3f}'.format(time.time() - t0))
        return normLst, statLst
    # second pass: apply the previously computed stats
    normLst = list()
    for data, var, stat in zip(dataTup, varTup, statTup):
        if data is None:
            normLst.append(None)
        else:
            mtd = self.extractVarMtd(var)
            normLst.append(transform.transInAll(data, mtd, statLst=stat))
    print('transform time {:.3f}'.format(time.time() - t0))
    return normLst
def funcPoint(iP, axP):
    """Plot LSTM vs. linear-regression vs. observed series for site iP.

    axP[0] shows streamflow ('00060'); axP[1] shows the first code in
    codeLst. Relies on script-level globals (siteNoLst, outName, infoTrain,
    xL1/yL1/ycL1, mtdX/statX, stat/var lists, wqData).
    """
    siteNo = siteNoLst[iP]
    dfPred, dfObs = basins.loadSeq(outName, siteNo)
    t = dfPred['date'].values.astype(np.datetime64)
    tBar = np.datetime64('2000-01-01')
    # fit per-site linear models on the training rows only
    ind1 = infoTrain[infoTrain['siteNo'] == siteNo].index
    [x1, y1, yc1], _ = utils.rmNan([xL1[ind1, :], yL1[ind1, :], ycL1[ind1, :]])
    modelY = LinearRegression().fit(x1, y1)
    modelYC = LinearRegression().fit(x1, yc1)
    # predict over the full record and de-normalize
    sd = np.datetime64('1979-01-01')
    ed = np.datetime64('2020-01-01')
    dfX = waterQuality.readSiteX(siteNo, sd, ed, varX)
    x2 = transform.transInAll(dfX.values, mtdX, statLst=statX)
    yp = wqData.transOut(modelY.predict(x2), statY, varY)
    ycp = wqData.transOut(modelYC.predict(x2), statYC, varYC)
    code = codeLst[0]
    legLst = ['lstm', 'lr', 'obs']
    axplot.plotTS(axP[0], t, [dfPred['00060'], yp, dfObs['00060']],
                  tBar=tBar, legLst=legLst, styLst='---', cLst='bgr')
    axplot.plotTS(axP[1], t, [dfPred[code], ycp, dfObs[code]],
                  tBar=tBar, legLst=legLst, styLst='--*', cLst='bgr')
def transIn(self, dataTup, varTup, statTup=None):
    """Normalize each array in dataTup using its variables' methods.

    With statTup=None, statistics are computed and returned with the data;
    otherwise the given statistics are applied. None entries pass through.
    """
    if statTup is None:
        # compute stats alongside the normalized arrays
        normLst, statLst = list(), list()
        for data, var in zip(dataTup, varTup):
            if data is None:
                norm, stat = None, None
            else:
                norm, stat = transform.transInAll(data, io.extractVarMtd(var))
            normLst.append(norm)
            statLst.append(stat)
        return normLst, statLst
    # apply precomputed stats
    normLst = list()
    for data, var, stat in zip(dataTup, varTup, statTup):
        if data is None:
            normLst.append(None)
        else:
            mtd = io.extractVarMtd(var)
            normLst.append(transform.transInAll(data, mtd, statLst=stat))
    return normLst
def runModel(dfX, dfG):
    """Run the trained model for one site and return predictions as a frame.

    dfX: time-indexed forcing data; dfG: the site's static attributes.
    Returns a DataFrame with a 'date' column plus one column per output
    variable (varY + varYC), de-normalized to physical units.
    Relies on script-level globals (wqData, model, var*/stat* lists).
    """
    # test model
    xA = np.expand_dims(dfX.values, axis=1)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # float is the documented drop-in replacement (float64 behavior).
    xcA = np.expand_dims(dfG.values.astype(float), axis=0)
    mtdX = wqData.extractVarMtd(varX)
    x = transform.transInAll(xA, mtdX, statLst=statX)
    mtdXC = wqData.extractVarMtd(varXC)
    xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
    yOut = trainTS.testModel(model, x, xc)
    # transfer out of normalized space
    nt = len(dfX)
    ny = len(varY) if varY is not None else 0
    nyc = len(varYC) if varYC is not None else 0
    yP = np.full([nt, ny + nyc], np.nan)
    yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
    yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)
    # assemble output frame with a leading 'date' column
    t = dfX.index.values.astype('datetime64[D]')
    colY = [] if varY is None else varY
    colYC = [] if varYC is None else varYC
    dfOut = pd.DataFrame(data=yP, columns=colY + colYC, index=t)
    dfOut.index.name = 'date'
    dfOut = dfOut.reset_index()
    return dfOut
def testModelSeq(outName, siteNoLst, wqData=None, ep=None, returnOut=False,
                 retest=False, sd=np.datetime64('1979-01-01'),
                 ed=np.datetime64('2019-12-31')):
    """Run sequence test for all sites, default from first date to last date.

    Writes one CSV per site under seq-<sd>-<ed>-ep<ep> inside the output
    folder (plus a '<site>_sigma' CSV when the model was trained with
    SigmaLoss). Already-saved sites are skipped unless retest=True.
    Returns a dict of {siteNo: DataFrame} when returnOut=True, else None.
    """
    if type(siteNoLst) is not list:
        siteNoLst = [siteNoLst]
    master = loadMaster(outName)
    # SigmaLoss models emit interleaved (mean, log-variance) outputs
    doSigma = master['crit'] == 'SigmaLoss'
    if ep is None:
        ep = master['nEpoch']
    outDir = nameFolder(outName)
    sdS = pd.to_datetime(sd).strftime('%Y%m%d')
    edS = pd.to_datetime(ed).strftime('%Y%m%d')
    saveDir = os.path.join(outDir, 'seq-{}-{}-ep{}'.format(sdS, edS, ep))
    # exist_ok avoids the check/mkdir race and tolerates missing parents
    os.makedirs(saveDir, exist_ok=True)
    siteSaveLst = os.listdir(saveDir)
    if retest is True:
        sitePredLst = siteNoLst
    else:
        sitePredLst = [
            siteNo for siteNo in siteNoLst if siteNo not in siteSaveLst
        ]
    if len(sitePredLst) != 0:
        if wqData is None:
            wqData = waterQuality.DataModelWQ(master['dataName'])
        (varX, varXC, varY, varYC) = (master['varX'], master['varXC'],
                                      master['varY'], master['varYC'])
        (statX, statXC, statY, statYC) = loadStat(outName)
        model = loadModel(outName, ep=ep)
        tabG = gageII.readData(varLst=varXC, siteNoLst=siteNoLst)
        tabG = gageII.updateCode(tabG)
        for siteNo in sitePredLst:
            if 'DRAIN_SQKM' in varXC:
                area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values[0]
            else:
                area = None
            # test model
            print('testing {} from {} to {}'.format(siteNo, sdS, edS))
            freq = wqData.freq
            dfX = waterQuality.readSiteTS(
                siteNo, varX, freq=freq, area=area, sd=sd, ed=ed)
            xA = np.expand_dims(dfX.values, axis=1)
            # np.float was removed in NumPy 1.24; builtin float is equivalent
            xcA = np.expand_dims(
                tabG.loc[siteNo].values.astype(float), axis=0)
            mtdX = waterQuality.extractVarMtd(varX)
            x = transform.transInAll(xA, mtdX, statLst=statX)
            mtdXC = waterQuality.extractVarMtd(varXC)
            xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
            [x, xc] = trainTS.dealNaN([x, xc], master['optNaN'][:2])
            yOut = trainTS.testModel(model, x, xc)
            # transfer out of normalized space
            nt = len(dfX)
            ny = len(varY) if varY is not None else 0
            nyc = len(varYC) if varYC is not None else 0
            yP = np.full([nt, ny + nyc], np.nan)
            if doSigma:
                # even slots are means, odd slots are log-variances
                sP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny * 2:2], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny * 2::2], statYC, varYC)
                sP[:, :ny] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, 1:ny * 2:2])), statY, varY)
                sP[:, ny:] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, ny * 2 + 1::2])), statYC, varYC)
            else:
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)
            # save output; flat column list (was [colY + colYC], which builds
            # a one-level MultiIndex — made consistent with runModel)
            t = dfX.index.values.astype('datetime64[D]')
            colY = [] if varY is None else varY
            colYC = [] if varYC is None else varYC
            dfOut = pd.DataFrame(data=yP, columns=colY + colYC, index=t)
            dfOut.index.name = 'date'
            dfOut = dfOut.reset_index()
            dfOut.to_csv(os.path.join(saveDir, siteNo), index=False)
            if doSigma:
                dfOutS = pd.DataFrame(data=sP, columns=colY + colYC, index=t)
                dfOutS.index.name = 'date'
                # BUG FIX: originally reset dfOut instead of dfOutS, so the
                # sigma CSV silently contained the mean predictions
                dfOutS = dfOutS.reset_index()
                dfOutS.to_csv(
                    os.path.join(saveDir, siteNo + '_sigma'), index=False)
    # load all csv
    if returnOut:
        dictOut = dict()
        for siteNo in siteNoLst:
            dfOut = pd.read_csv(os.path.join(saveDir, siteNo))
            dictOut[siteNo] = dfOut
            if doSigma:
                dfOut = pd.read_csv(os.path.join(saveDir, siteNo + '_sigma'))
                dictOut[siteNo + '_sigma'] = dfOut
        return dictOut
import matplotlib.pyplot as plt
import torch.nn as nn
from hydroDL.model import rnn, crit
import os

# Script chunk: load one site's forcing (X) and streamflow (Y), normalize
# both, and split into pre/post-2000 matrices for train/test.
# NOTE(review): waterQuality, transform, pd, np come from imports outside
# this chunk; the trailing variables feed code past this view.
siteNo = '01434025'
# siteNo = '01364959'
codeLst = ['00915', '00940', '00955']
varX = gridMET.varLst
varY = ['00060']
dfX = waterQuality.readSiteX(siteNo, varX)
dfY = waterQuality.readSiteY(siteNo, varY)
# normalize inputs and targets; stats are kept for later de-normalization
mtdX = waterQuality.extractVarMtd(varX)
normX, statX = transform.transInAll(dfX.values, mtdX)
dfXN = pd.DataFrame(data=normX, index=dfX.index, columns=dfX.columns)
mtdY = waterQuality.extractVarMtd(varY)
normY, statY = transform.transInAll(dfY.values, mtdY)
dfYN = pd.DataFrame(data=normY, index=dfY.index, columns=dfY.columns)
# temporal split at 2000-01-01: *1 = before (train), *2 = after (test)
matX1 = dfXN[dfXN.index < np.datetime64('2000-01-01')].values
matY1 = dfYN[dfYN.index < np.datetime64('2000-01-01')].values
matX2 = dfXN[dfXN.index >= np.datetime64('2000-01-01')].values
matY2 = dfYN[dfYN.index >= np.datetime64('2000-01-01')].values
matX = dfXN.values
matY = dfYN.values
nx = len(varX)
ny = len(varY)
# indices of non-NaN training targets
ind1 = np.where(~np.isnan(matY1))[0]
# NOTE(review): this chunk begins mid-script — the model/optimizer setup,
# the training loop header, and the try/except this print belongs to are
# outside the visible source.
print(
    'first iteration failed again for CUDNN_STATUS_EXECUTION_FAILED '
)
# one training step: forward, loss, backprop, update
yP, ycP = model(xT)
loss = lossFun(yP, ycP, yT[:, :, :ny], yT[-1, :, ny:])
loss.backward()
optim.step()
model.zero_grad()
print('{} {:.3f} {:.3f}'.format(k, loss, time.time() - t0))
# test
statX, statXC, statY, statYC = statTup
xA = np.expand_dims(dfX.values, axis=1)
# NOTE(review): np.float was removed in NumPy>=1.24; builtin float is the
# drop-in replacement (left unchanged here — comment-only edit)
xcA = np.expand_dims(tabG.loc[siteNo].values.astype(np.float), axis=0)
mtdX = wqData.extractVarMtd(varX)
x = transform.transInAll(xA, mtdX, statLst=statX)
mtdXC = wqData.extractVarMtd(varXC)
xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
yA = np.expand_dims(dfY.values, axis=1)
ycA = np.expand_dims(dfYC.values, axis=1)
mtdY = wqData.extractVarMtd(varY)
y = transform.transInAll(yA, mtdY, statLst=statY)
mtdYC = wqData.extractVarMtd(varYC)
yc = transform.transInAll(ycA, mtdYC, statLst=statYC)
# fill NaNs, then tile static attributes along the time axis
(x, xc) = trainTS.dealNaN((x, xc), [1, 1])
nt = x.shape[0]
xT = torch.from_numpy(np.concatenate([x, np.tile(xc, [nt, 1, 1])],
                                     axis=-1)).float()
# NOTE(review): fragment is truncated here — the body of this `if` lies
# past the end of the visible source.
if torch.cuda.is_available():
from hydroDL.master import slurm
from hydroDL.post import axplot, figplot
import numpy as np
import matplotlib.pyplot as plt

# Script chunk: for one water-quality code, overlay per-site CDF curves of
# the raw (top axis) and normalized (bottom axis) concentrations.
codeLst = sorted(usgs.newC)
# dataName = 'nbWT'
dataName = 'nbW'
wqData = waterQuality.DataModelWQ(dataName)
siteNoLst = wqData.info.siteNo.unique()
# NOTE(review): this overwrites the sorted codeLst above — presumably
# intentional so indices align with usgs.newC order; verify
codeLst = usgs.newC
icLst = [wqData.varC.index(code) for code in codeLst]
data = wqData.c[:, np.array(icLst)]
mtdLst = waterQuality.extractVarMtd(codeLst)
dataNorm, stat = transform.transInAll(data, mtdLst)
info = wqData.info
code = '00660'
ic = codeLst.index(code)
fig, axes = plt.subplots(2, 1, figsize=(6, 8))
for siteNo in siteNoLst:
    # sort this site's samples to draw its empirical CDF
    indS = info[info['siteNo'] == siteNo].index.values
    yr = utils.sortData(data[indS, ic])
    yn = utils.sortData(dataNorm[indS, ic])
    x = np.arange(len(yr)) / len(yr)
    _ = axes[0].plot(x, yr, 'k-', alpha=0.2)
    _ = axes[1].plot(x, yn, 'k-', alpha=0.2)
shortName = usgs.codePdf.loc[code]['shortName']
axes[1].set_ylim([-0.2, 1.2])
axes[0].set_title('{} {} CDFs '.format(code, shortName))
# NOTE(review): chunk begins mid-script — obsLst2, ycP1/ycP2, yP1/yP2, ycT2,
# statTup, master, trainSet/testSet, wqData are defined above this view.
yT2 = obsLst2[2][:, :, 0:1]
errMatC1 = wqData.errBySiteC(ycP1, subset=trainSet, varC=master['varYC'])
errMatC2 = wqData.errBySiteC(ycP2, subset=testSet, varC=master['varYC'])
# errMatQ1 = wqData.errBySiteQ(
#     yP1, subset=trainSet, varQ=master['varY'])
# errMatQ2 = wqData.errBySiteQ(
#     yP2, subset=testSet, varQ=master['varY'])
# np.nanmean(errMatQ2[:, 0, 1])
# NOTE(review): bare expressions — useful interactively (REPL/notebook),
# no effect when run as a script
np.nanmean(errMatC1[:, 0, 1])
np.nanmean(errMatC2[:, 0, 1])
# transfer - validate if training error is correct
# re-normalize predictions and targets to compare RMSE in transformed space
mtd = wqData.extractVarMtd(master['varYC'])
xcP = transform.transInAll(ycP2, mtd, statLst=statTup[3])
xcT = transform.transInAll(ycT2, mtd, statLst=statTup[3])
mtd = wqData.extractVarMtd(master['varY'])
xP = transform.transInAll(yP2, mtd, statLst=statTup[2])
xT = transform.transInAll(yT2, mtd, statLst=statTup[2])
np.sqrt(np.nanmean((xT - xP)**2))
np.sqrt(np.nanmean((xcT - xcP)**2))
(np.sqrt(np.nanmean((xT - xP)**2)) +
 np.sqrt(np.nanmean((xcT - xcP)**2))) / 2
# see correlation
info = wqData.subsetInfo(testSet)
siteNoLst = info.siteNo.unique()
corrMat = np.full([len(siteNoLst), 2], np.nan)
# NOTE(review): loop body is truncated at the end of the visible source
for i, siteNo in enumerate(siteNoLst):
    indS = info[info['siteNo'] == siteNo].index.values
def loadSeq(siteNo, varY, model, optX='F', optT='Y8090', order=(5, 0, 5)):
    """Load (or train-and-cache) an ARMA/LR prediction series for one site.

    model: 'ARMA' or 'LR'; optX selects inputs ('F' forcing only, 'QF'
    streamflow + forcing); optT selects the training period ('Y8090'
    before 2000, 'Y0010' from 2000 on); order is the ARMA order tuple.
    Returns dfP, the de-normalized prediction DataFrame. Predictions and
    normalization stats are cached under modelStat/<model>/<name>/.
    Raises Exception for invalid model/optX/optT values.
    """
    if model == 'ARMA':
        dirAR = os.path.join(kPath.dirWQ, 'modelStat', 'ARMA')
        strOrder = '-'.join([str(k) for k in order])
        saveFolderName = '{}-{}-{}-{}'.format(optX, optT, varY, strOrder)
        saveFolder = os.path.join(dirAR, saveFolderName)
    elif model == 'LR':
        dirLR = os.path.join(kPath.dirWQ, 'modelStat', 'LR')
        saveFolderName = '{}-{}-{}'.format(optX, optT, varY)
        saveFolder = os.path.join(dirLR, saveFolderName)
    else:
        raise Exception('model {} invalid!'.format(model))
    predFile = os.path.join(saveFolder, siteNo)
    # FIX: os.mkdir fails when modelStat/<model> does not exist yet and
    # races with the exists() check; makedirs(exist_ok=True) handles both
    os.makedirs(saveFolder, exist_ok=True)
    if os.path.exists(predFile):
        # cached prediction: load and restore the date column type
        dfP = pd.read_csv(predFile, index_col=None)
        dfP = utils.time.datePdf(dfP)
    else:
        if optX == 'F':
            varX = gridMET.varLst
        elif optX == 'QF':
            varX = ['00060'] + gridMET.varLst
        else:
            raise Exception('optX {} invalid!'.format(optX))
        dfX = waterQuality.readSiteX(siteNo, varX)
        dfY = waterQuality.readSiteY(siteNo, [varY])
        # normalize
        mtdX = waterQuality.extractVarMtd(varX)
        normX, statX = transform.transInAll(dfX.values, mtdX)
        dfXN = pd.DataFrame(data=normX, index=dfX.index, columns=dfX.columns)
        mtdY = waterQuality.extractVarMtd([varY])
        normY, statY = transform.transInAll(dfY.values, mtdY)
        dfYN = pd.DataFrame(data=normY, index=dfY.index, columns=dfY.columns)
        if optT == 'Y8090':
            dfXT = dfXN[dfXN.index < np.datetime64('2000-01-01')]
            dfYT = dfYN[dfYN.index < np.datetime64('2000-01-01')]
        elif optT == 'Y0010':
            dfXT = dfXN[dfXN.index >= np.datetime64('2000-01-01')]
            dfYT = dfYN[dfYN.index >= np.datetime64('2000-01-01')]
        else:
            raise Exception('optT {} invalid!'.format(optT))
        # train and test (elif: model is already validated to be one of two)
        if model == 'ARMA':
            dfPN, resT = trainARMA(dfXT, dfYT, dfXN, dfYN, order)
        elif model == 'LR':
            dfPN = trainLR(dfXT, dfYT, dfXN, dfYN)
        yP = transform.transOut(dfPN.values, mtdY[0], statY[0])
        dfP = pd.DataFrame(data=yP, index=dfYN.index, columns=dfYN.columns)
        # save result, model, stat
        dfP.reset_index().to_csv(predFile, index=False)
        statFile = os.path.join(saveFolder, siteNo + '_stat.json')
        with open(statFile, 'w') as fp:
            json.dump(dict(statX=statX, statY=statY), fp, indent=4)
        # save model
        # if model == 'ARMA':
        #     modelFile = os.path.join(saveFolder, siteNo+'_model.p')
        #     resT.save(modelFile)
    return dfP
# Script chunk: build train/test arrays for a precipitation->runoff
# experiment. NOTE(review): df, np, waterQuality, transform come from code
# above this view; the conv parameters at the bottom feed code past it.
yrTrain = [2000, 2005]
yr = df.index.year.values
indTrain = np.where((yr >= yrTrain[0]) & (yr < yrTrain[1]))[0]
# data
sn = 1  # smoothing constant added before the log transform below
# varX = varF
varX = ['pr']
varY = ['runoff']
nx = len(varX)
ny = len(varY)
X = df[varX].values
Y = df[varY].values
mtdX = waterQuality.extractVarMtd(varX)
# mtdY = waterQuality.extractVarMtd(varY)
x, statX = transform.transInAll(X, mtdX)
# y, statY = transform.transInAll(Y, mtdY)
# y = np.log(Y+sn)
# x[np.isnan(x)] = -1
# target is log-transformed runoff rather than the normalized version
y = np.log(Y+sn)
xx = x[indTrain, :]
yy = y[indTrain, :]
# conv — hyperparameters, presumably for a convolution model defined below
# this view; verify against the continuation
nt = len(indTrain)
nbatch = 100
rho = 1000
aLst = np.exp(np.arange(0, 2, 0.1))
m = 30
nq = len(aLst)
nd = 365
# NOTE(review): chunk begins mid-loop — matC, kk, dfC, matR, dictSite,
# siteNoLst, importlib, plt, scipy, utils come from code above this view.
matC[kk, :, :] = dfC.values
codeLst2 = [
    '00095', '00400', '00405', '00600', '00605', '00618', '00660', '00665',
    '00681', '00915', '00925', '00930', '00935', '00940', '00945', '00950',
    '00955', '70303', '71846', '80154'
]
# plot hist
importlib.reload(axplot)
importlib.reload(transform)
importlib.reload(usgs)
# normalize residual variables then invert, to inspect round-trip values
varRLst = [code + '-R' for code in usgs.newC]
mtdLst = waterQuality.extractVarMtd(varRLst)
matRN, stat = transform.transInAll(matR, mtdLst)
matRN2 = transform.transOutAll(matRN, mtdLst, stat)
fig, axes = plt.subplots(5, 4)
ticks = [-0.5, 0, 0.5, 1]
for k, code in enumerate(codeLst2):
    j, i = utils.index2d(k, 5, 4)
    ax = axes[j, i]
    siteNoCode = dictSite[code]
    indS = [siteNoLst.index(siteNo) for siteNo in siteNoCode]
    ic = usgs.newC.index(code)
    data = matRN2[indS, :, ic]
    x1 = utils.flatData(data)
    x2 = utils.rmExt(x1, p=5)
    # NOTE(review): standardization order looks suspicious — this computes
    # x2/std(x2) - mean(x2), not (x2 - mean)/std; confirm intent
    s, p = scipy.stats.kstest(x2 / np.std(x2) - np.mean(x2), 'laplace')
# NOTE(review): chunk begins mid-function/script — dfQ, dfF, dfC, dfX, dfY,
# area, unitConv, varX/varY/varYC/varXC, nFill, tabG, siteNo, wqData,
# model, statX/statXC come from code above this view.
dfQ['runoff'] = dfQ['00060'] / area * unitConv
# attach streamflow to inputs or targets depending on variable lists
if '00060' in varX or 'runoff' in varX:
    dfX = dfX.join(dfQ)
elif '00060' in varY or 'runoff' in varY:
    dfY = dfY.join(dfQ)
dfX = dfX.join(dfF)
dfY = dfY.join(dfC)
dfX = dfX[varX]
dfY = dfY[varY + varYC]
# normalize concat input data
dfX = dfX.interpolate(limit=nFill, limit_direction='both')
xA = np.expand_dims(dfX.values, axis=1)
# NOTE(review): np.float was removed in NumPy>=1.24; builtin float is the
# drop-in replacement (left unchanged here — comment-only edit)
xcA = np.expand_dims(tabG.loc[siteNo].values.astype(np.float), axis=0)
mtdX = wqData.extractVarMtd(varX)
x = transform.transInAll(xA, mtdX, statLst=statX)
mtdXC = wqData.extractVarMtd(varXC)
xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
yP = trainTS.testModel(model, x, xc)
# # test
# nt = len(dfX)
# x, xc = trainTS.dealNaN((x, xc), dictP['optNaN'][:2])
# xx = np.concatenate([x, np.tile(xc[0, :], [1, nt, 1])], axis=-1).swapaxes(0, 1)
# xT = torch.from_numpy(xx).float()
# if torch.cuda.is_available():
#     xT = xT.cuda()
# # if i == 0 and ind1 == 0:
# #     try:
# #         yT = model(xT)
# training / testing yrTrain = [2000, 2005] yr = df.index.year.values indTrain = np.where((yr >= yrTrain[0]) & (yr < yrTrain[1]))[0] # data # varX = varF varX = ['pr'] varY = ['00060'] nx = len(varX) ny = len(varY) X = df[varX].values Y = df[varY].values mtdX = waterQuality.extractVarMtd(varX) mtdY = waterQuality.extractVarMtd(varY) x, statX = transform.transInAll(X, mtdX) y, statY = transform.transInAll(Y, mtdY) x[np.isnan(x)] = -1 xx = x[indTrain, :] yy = y[indTrain, :] model = rnn.LstmModel(nx=nx, ny=ny, hiddenSize=256).cuda() lossFun = crit.RmseLoss().cuda() optim = torch.optim.Adadelta(model.parameters()) nt = len(indTrain) nbatch = 100 rho = 1000 # train nEp = 500