def modelLinear(outName, testset, trainset=None, wqData=None):
    """Fit per-site linear regressions and predict train and test subsets.

    For every site found in the test subset and every variable in
    master['varYC'], a LinearRegression is fitted on that site's training
    samples and then used to predict both that site's training samples and
    its testing samples.

    Args:
        outName: name of the saved run; used to load its master dict and
            its normalization statistics.
        testset: key of wqData.subset selecting the test samples.
        trainset: key of wqData.subset selecting the training samples;
            defaults to master['trainName'].
        wqData: data object; built from master['dataName'] when None.

    Returns:
        Tuple (matO1, matO2): wqData.transOut applied to the train and test
        prediction matrices, one column per entry of varYC. Cells for which
        no model could be fitted are NaN before transOut.
    """
    master = loadMaster(outName)
    dataName = master['dataName']
    if wqData is None:
        wqData = waterQuality.DataModelWQ(dataName)
    if trainset is None:
        trainset = master['trainName']
    # reset_index so the frame index is the row position in the data arrays
    infoTrain = wqData.info.iloc[wqData.subset[trainset]].reset_index()
    infoTest = wqData.info.iloc[wqData.subset[testset]].reset_index()

    # linear reg data
    statTup = loadStat(outName)
    varTup = (master['varX'], master['varXC'], master['varY'], master['varYC'])
    dataTup1 = wqData.transIn(subset=trainset, varTup=varTup, statTup=statTup)
    dataTup2 = wqData.transIn(subset=testset, varTup=varTup, statTup=statTup)
    dataTup1 = trainTS.dealNaN(dataTup1, master['optNaN'])
    dataTup2 = trainTS.dealNaN(dataTup2, master['optNaN'])
    varYC = varTup[3]
    statYC = statTup[3]
    # last slice along the first axis of x (presumably the final time step)
    x1 = dataTup1[0][-1, :, :]
    yc1 = dataTup1[3]
    x2 = dataTup2[0][-1, :, :]

    # point test l2 - linear
    nc = len(varYC)
    matP1 = np.full([len(infoTrain), nc], np.nan)
    matP2 = np.full([len(infoTest), nc], np.nan)
    siteNoLst = infoTest['siteNo'].unique().tolist()
    for siteNo in siteNoLst:
        ind1 = infoTrain[infoTrain['siteNo'] == siteNo].index
        ind2 = infoTest[infoTest['siteNo'] == siteNo].index
        xT1 = x1[ind1, :]
        ycT1 = yc1[ind1, :]
        # test inputs do not depend on the target variable; slice once
        xT2 = x2[ind2, :] if len(ind2) > 0 else None
        for ic in range(nc):
            [xx, yy], iv = utils.rmNan([xT1, ycT1[:, ic]])
            if len(iv) > 0:
                modelYC = LinearRegression().fit(xx, yy)
                matP1[ind1, ic] = modelYC.predict(xT1)
                if xT2 is not None:
                    # test predictions belong in the test matrix (the
                    # original wrote them into matP1 by mistake)
                    matP2[ind2, ic] = modelYC.predict(xT2)
    matO1 = wqData.transOut(matP1, statYC, varYC)
    matO2 = wqData.transOut(matP2, statYC, varYC)
    return matO1, matO2
def trainModelTS(outName):
    """Train the time-series model described by the master file of outName.

    Loads the training data named in the master dict, saves the
    normalization statistics, trains in chunks of master['saveEpoch']
    epochs (saving a checkpoint after each chunk) and finally writes the
    collected per-epoch losses to 'loss.csv' in the output folder.

    Args:
        outName: name of the run; resolves the output folder and master.

    Raises:
        RuntimeError: if master['modelName'] is not a supported model.
    """
    outFolder = nameFolder(outName)
    dictP = loadMaster(outName)

    # load data
    wqData = waterQuality.DataModelWQ(dictP['dataName'])
    varTup = (dictP['varX'], dictP['varXC'], dictP['varY'], dictP['varYC'])
    dataTup, statTup = wqData.transIn(subset=dictP['trainName'],
                                      varTup=varTup)
    dataTup = trainTS.dealNaN(dataTup, dictP['optNaN'])
    wrapStat(outName, statTup)

    # train model
    [nx, nxc, ny, nyc, nt, ns] = trainTS.getSize(dataTup)
    if dictP['modelName'] == 'CudnnLSTM':
        model = rnn.CudnnLstmModel(nx=nx + nxc,
                                   ny=ny + nyc,
                                   hiddenSize=dictP['hiddenSize'])
    else:
        # fail explicitly instead of hitting a NameError on `model` below
        raise RuntimeError('Model not specified')
    lossFun = crit.RmseLoss()
    if torch.cuda.is_available():
        lossFun = lossFun.cuda()
        model = model.cuda()
    optim = torch.optim.Adadelta(model.parameters())

    lossLst = list()
    nEp = dictP['nEpoch']
    sEp = dictP['saveEpoch']
    logFile = os.path.join(outFolder, 'log')
    # start from a fresh log for this training run
    if os.path.exists(logFile):
        os.remove(logFile)
    for k in range(0, nEp, sEp):
        model, optim, lossEp = trainTS.trainModel(dataTup,
                                                  model,
                                                  lossFun,
                                                  optim,
                                                  batchSize=dictP['batchSize'],
                                                  nEp=sEp,
                                                  cEp=k,
                                                  logFile=logFile)
        # save model
        saveModel(outName, k + sEp, model, optim=optim)
        lossLst = lossLst + lossEp

    lossFile = os.path.join(outFolder, 'loss.csv')
    pd.DataFrame(lossLst).to_csv(lossFile, index=False, header=False)
def testModel(outName, testset, wqData=None, ep=None, reTest=False):
    """Return predictions of a saved model on a test subset, with caching.

    Results are stored in 'testP-<testset>-Ep<ep>.npz' inside the output
    folder of outName. When that file exists and reTest is False, the saved
    arrays are returned without running the model.

    Args:
        outName: name of the saved run.
        testset: subset name passed to wqData.transIn.
        wqData: data object; built from master['dataName'] when None.
        ep: epoch of the checkpoint to load; defaults to master['nEpoch'].
        reTest: when not False, recompute and overwrite the cached file.

    Returns:
        Tuple (yP, ycP) of transformed-back predictions.
    """
    # load master
    master = loadMaster(outName)
    if ep is None:
        ep = master['nEpoch']
    testFile = os.path.join(nameFolder(outName),
                            'testP-{}-Ep{}.npz'.format(testset, ep))

    # cache hit: hand back what was stored by a previous call
    if os.path.exists(testFile) and reTest is False:
        print('load saved test result')
        saved = np.load(testFile, allow_pickle=True)
        yP = saved['yP']
        ycP = saved['ycP']
        return yP, ycP

    statTup = loadStat(outName)
    model = loadModel(outName, ep=ep)

    # load test data
    if wqData is None:
        wqData = waterQuality.DataModelWQ(master['dataName'])
    varTup = (master['varX'], master['varXC'],
              master['varY'], master['varYC'])
    testDataLst = wqData.transIn(subset=testset,
                                 statTup=statTup,
                                 varTup=varTup)
    # sizes are taken before NaN handling, as in the training code path
    ny = trainTS.getSize(testDataLst)[2]
    testDataLst = trainTS.dealNaN(testDataLst, master['optNaN'])
    x, xc = testDataLst[0], testDataLst[1]

    # test model - point by point
    yOut, ycOut = trainTS.testModel(model, x, xc, ny)
    yP = wqData.transOut(yOut, statTup[2], master['varY'])
    ycP = wqData.transOut(ycOut, statTup[3], master['varYC'])
    np.savez(testFile, yP=yP, ycP=ycP)
    return yP, ycP
def testModelSeq(outName,
                 siteNoLst,
                 wqData=None,
                 ep=None,
                 returnOut=False,
                 retest=False,
                 sd=np.datetime64('1979-01-01'),
                 ed=np.datetime64('2019-12-31')):
    """Run the saved model over the full time series of each site.

    Predictions are written as one CSV per site (file name = siteNo) into
    '<outFolder>/seq-<sd>-<ed>-ep<ep>'. For models trained with
    'SigmaLoss' a second CSV '<siteNo>_sigma' holds the predicted sigma.
    Sites that already have a CSV in that folder are skipped unless retest
    is True.

    Args:
        outName: name of the saved run.
        siteNoLst: site number or list of site numbers to test.
        wqData: data object; built from master['dataName'] when needed.
        ep: epoch of the checkpoint to load; defaults to master['nEpoch'].
        returnOut: when True, read the CSVs back and return them.
        retest: when True, recompute every site in siteNoLst.
        sd: first date of the sequence.
        ed: last date of the sequence.

    Returns:
        dict mapping siteNo (and siteNo + '_sigma' for sigma models) to the
        DataFrame read from the saved CSV when returnOut is True,
        otherwise None.
    """
    # run sequence test for all sites, default to be from first date to last date
    if not isinstance(siteNoLst, list):
        siteNoLst = [siteNoLst]
    master = loadMaster(outName)
    doSigma = master['crit'] == 'SigmaLoss'
    if ep is None:
        ep = master['nEpoch']
    outDir = nameFolder(outName)
    sdS = pd.to_datetime(sd).strftime('%Y%m%d')
    edS = pd.to_datetime(ed).strftime('%Y%m%d')
    saveDir = os.path.join(outDir, 'seq-{}-{}-ep{}'.format(sdS, edS, ep))
    if not os.path.exists(saveDir):
        os.mkdir(saveDir)
    # file names in saveDir are site numbers of finished predictions
    siteSaveLst = set(os.listdir(saveDir))
    if retest is True:
        sitePredLst = siteNoLst
    else:
        sitePredLst = [
            siteNo for siteNo in siteNoLst if siteNo not in siteSaveLst
        ]
    if len(sitePredLst) != 0:
        if wqData is None:
            wqData = waterQuality.DataModelWQ(master['dataName'])
        (varX, varXC, varY, varYC) = (master['varX'], master['varXC'],
                                      master['varY'], master['varYC'])
        (statX, statXC, statY, statYC) = loadStat(outName)
        model = loadModel(outName, ep=ep)
        tabG = gageII.readData(varLst=varXC, siteNoLst=siteNoLst)
        tabG = gageII.updateCode(tabG)
        ny = len(varY) if varY is not None else 0
        nyc = len(varYC) if varYC is not None else 0
        colY = [] if varY is None else varY
        colYC = [] if varYC is None else varYC
        for siteNo in sitePredLst:
            if 'DRAIN_SQKM' in varXC:
                area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values[0]
            else:
                area = None
            # test model
            print('testing {} from {} to {}'.format(siteNo, sdS, edS))
            freq = wqData.freq
            dfX = waterQuality.readSiteTS(siteNo,
                                          varX,
                                          freq=freq,
                                          area=area,
                                          sd=sd,
                                          ed=ed)
            # dfX = waterQuality.readSiteX(
            #     siteNo, varX, sd=sd, ed=ed, area=area, nFill=5)
            xA = np.expand_dims(dfX.values, axis=1)
            # builtin float: the np.float alias was removed in NumPy 1.24
            xcA = np.expand_dims(tabG.loc[siteNo].values.astype(float),
                                 axis=0)
            mtdX = waterQuality.extractVarMtd(varX)
            x = transform.transInAll(xA, mtdX, statLst=statX)
            mtdXC = waterQuality.extractVarMtd(varXC)
            xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
            [x, xc] = trainTS.dealNaN([x, xc], master['optNaN'][:2])
            yOut = trainTS.testModel(model, x, xc)

            # transfer out
            nt = len(dfX)
            yP = np.full([nt, ny + nyc], np.nan)
            if doSigma:
                # sigma models interleave outputs: even columns hold the
                # prediction, odd columns hold the log-variance term
                sP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny * 2:2], statY,
                                             varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny * 2::2], statYC,
                                             varYC)
                sP[:, :ny] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, 1:ny * 2:2])), statY, varY)
                sP[:, ny:] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, ny * 2 + 1::2])), statYC,
                    varYC)
            else:
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)

            # save output
            t = dfX.index.values.astype('datetime64[D]')
            dfOut = pd.DataFrame(data=yP, columns=[colY + colYC], index=t)
            dfOut.index.name = 'date'
            dfOut = dfOut.reset_index()
            dfOut.to_csv(os.path.join(saveDir, siteNo), index=False)
            if doSigma:
                dfOutS = pd.DataFrame(data=sP,
                                      columns=[colY + colYC],
                                      index=t)
                dfOutS.index.name = 'date'
                # reset the sigma frame itself (the original reset dfOut
                # and therefore saved the predictions a second time)
                dfOutS = dfOutS.reset_index()
                dfOutS.to_csv(os.path.join(saveDir, siteNo + '_sigma'),
                              index=False)

    # load all csv
    if returnOut:
        dictOut = dict()
        for siteNo in siteNoLst:
            # print('loading {} from {} to {}'.format(siteNo, sdS, edS))
            dfOut = pd.read_csv(os.path.join(saveDir, siteNo))
            dictOut[siteNo] = dfOut
            if doSigma:
                dfOut = pd.read_csv(os.path.join(saveDir, siteNo + '_sigma'))
                dictOut[siteNo + '_sigma'] = dfOut
        return dictOut
def testModel(outName, testset, wqData=None, ep=None, reTest=False):
    """Return predictions of a saved model on a test subset, with caching.

    Results are stored in 'testP-<testset>-Ep<ep>.npz' inside the output
    folder of outName and reused when the file exists and reTest is False.
    Models whose master['crit'] is 'SigmaLoss' additionally produce sigma
    arrays.

    Args:
        outName: name of the saved run.
        testset: subset name passed to wqData.transIn.
        wqData: data object; built from master['dataName'] when None.
        ep: epoch of the checkpoint to load; defaults to master['nEpoch'].
        reTest: when not False, recompute and overwrite the cached file.

    Returns:
        (yP, ycP) for regular models, (yP, ycP, sP, scP) for sigma models.
    """
    # load master
    master = loadMaster(outName)
    doSigma = master['crit'] == 'SigmaLoss'
    if ep is None:
        ep = master['nEpoch']
    testFile = os.path.join(nameFolder(outName),
                            'testP-{}-Ep{}.npz'.format(testset, ep))

    # cache hit: hand back what was stored by a previous call
    if os.path.exists(testFile) and reTest is False:
        print('load saved test result')
        saved = np.load(testFile, allow_pickle=True)
        yP = saved['yP']
        ycP = saved['ycP']
        if doSigma:
            sP = saved['sP']
            scP = saved['scP']
            return yP, ycP, sP, scP
        return yP, ycP

    statTup = loadStat(outName)
    model = loadModel(outName, ep=ep)

    # load test data
    if wqData is None:
        wqData = waterQuality.DataModelWQ(master['dataName'])
    varTup = (master['varX'], master['varXC'],
              master['varY'], master['varYC'])
    testDataLst = wqData.transIn(subset=testset,
                                 statTup=statTup,
                                 varTup=varTup)
    ny = trainTS.getSize(testDataLst)[2]
    # the [2, 2, 0, 0] NaN option is replaced by [0, 0, 0, 0] for testing
    if master['optNaN'] == [2, 2, 0, 0]:
        master['optNaN'] = [0, 0, 0, 0]
    testDataLst = trainTS.dealNaN(testDataLst, master['optNaN'])
    x, xc = testDataLst[0], testDataLst[1]

    if not doSigma:
        # test model - point by point
        yOut, ycOut = trainTS.testModel(model, x, xc, ny)
        yP = wqData.transOut(yOut, statTup[2], master['varY'])
        ycP = wqData.transOut(ycOut, statTup[3], master['varYC'])
        np.savez(testFile, yP=yP, ycP=ycP)
        return yP, ycP

    print('sigma model')
    # two outputs per variable: even index = value, odd index = log-variance
    yOut, ycOut = trainTS.testModel(model, x, xc, ny * 2)
    yP = wqData.transOut(yOut[:, :, ::2], statTup[2], master['varY'])
    sP = wqData.transOut(np.sqrt(np.exp(yOut[:, :, 1::2])), statTup[2],
                         master['varY'])
    ycP = wqData.transOut(ycOut[:, ::2], statTup[3], master['varYC'])
    scP = wqData.transOut(np.sqrt(np.exp(ycOut[:, 1::2])), statTup[3],
                          master['varYC'])
    np.savez(testFile, yP=yP, ycP=ycP, sP=sP, scP=scP)
    return yP, ycP, sP, scP
def trainModelTS(outName):
    """Train the time-series model described by the master file of outName.

    The loss ('crit'), network ('modelName') and optimizer ('optim') are
    selected from the master dict. Training runs in chunks of
    master['saveEpoch'] epochs with a checkpoint saved after each chunk;
    the collected losses are written to 'loss.csv' in the output folder.

    Args:
        outName: name of the run; resolves the output folder and master.

    Raises:
        RuntimeError: if the loss, model or optimizer name is not one of
            the supported values.
    """
    outFolder = nameFolder(outName)
    dictP = loadMaster(outName)

    # load data
    rmFlag = dictP.get('rmFlag', False)
    wqData = waterQuality.DataModelWQ(dictP['dataName'], rmFlag)
    varTup = (dictP['varX'], dictP['varXC'], dictP['varY'], dictP['varYC'])
    dataTup, statTup = wqData.transIn(subset=dictP['trainName'],
                                      varTup=varTup)
    dataTup = trainTS.dealNaN(dataTup, dictP['optNaN'])
    wrapStat(outName, statTup)

    # train model
    nx, nxc, ny, nyc, nt, ns = trainTS.getSize(dataTup)

    # define loss
    critName = dictP['crit']
    if critName not in ('RmseLoss', 'RmseLoss2D', 'SigmaLoss'):
        raise RuntimeError('loss function not specified')
    lossFun = getattr(crit, critName)()
    if critName == 'SigmaLoss':
        # sigma loss needs two network outputs per target variable
        ny, nyc = ny * 2, nyc * 2

    # define model
    modelName = dictP['modelName']
    if modelName == 'CudnnLSTM':
        model = rnn.CudnnLstmModel(nx=nx + nxc,
                                   ny=ny + nyc,
                                   hiddenSize=dictP['hiddenSize'])
    elif modelName == 'LstmModel':
        model = rnn.LstmModel(nx=nx + nxc,
                              ny=ny + nyc,
                              hiddenSize=dictP['hiddenSize'])
    elif modelName == 'AgeLSTM':
        model = rnn.AgeLSTM2(nx=nx + nxc,
                             ny=ny,
                             nyc=nyc,
                             rho=365,
                             nh=dictP['hiddenSize'])
    else:
        raise RuntimeError('Model not specified')

    if torch.cuda.is_available():
        lossFun = lossFun.cuda()
        model = model.cuda()

    if dictP['optim'] != 'AdaDelta':
        raise RuntimeError('optimizor function not specified')
    optim = torch.optim.Adadelta(model.parameters())

    lossLst = []
    nEp = dictP['nEpoch']
    sEp = dictP['saveEpoch']
    logFile = os.path.join(outFolder, 'log')
    # start from a fresh log for this training run
    if os.path.exists(logFile):
        os.remove(logFile)
    for k in range(0, nEp, sEp):
        model, optim, lossEp = trainTS.trainModel(
            dataTup, model, lossFun, optim,
            batchSize=dictP['batchSize'],
            nEp=sEp,
            cEp=k,
            logFile=logFile)
        # save model
        saveModel(outName, k + sEp, model, optim=optim)
        lossLst = lossLst + lossEp

    lossFile = os.path.join(outFolder, 'loss.csv')
    pd.DataFrame(lossLst).to_csv(lossFile, index=False, header=False)
# Identify the saved run: constituent code, label and train/test subsets.
# NOTE: dataName is expected to be defined earlier in the script.
code = '00945'
label = 'plain'
trainSet = '{}-Y1'.format(code)
testSet = '{}-Y2'.format(code)
outName = '{}-{}-{}-{}'.format(dataName, code, label, trainSet)

outFolder = basins.nameFolder(outName)
dictP = basins.loadMaster(outName)

# load data
rmFlag = dictP.get('rmFlag', False)
wqData = waterQuality.DataModelWQ(dictP['dataName'], rmFlag)
varTup = (dictP['varX'], dictP['varXC'], dictP['varY'], dictP['varYC'])
dataTup, statTup = wqData.transIn(subset=dictP['trainName'], varTup=varTup)
dataTup = trainTS.dealNaN(dataTup, dictP['optNaN'])
# wrapStat(outName, statTup)
nx, nxc, ny, nyc, nt, ns = trainTS.getSize(dataTup)

# reload the checkpoint saved at epoch 500 and move everything to the GPU
model = basins.loadModel(outName, ep=500)
lossFun = crit.RmseLoss().cuda()
model = model.cuda()

# training parts
dataLst = dataTup
sizeLst = trainTS.getSize(dataLst)
nx, nxc, ny, nyc, nt, ns = sizeLst
# keep the configured number of batches but use the full length nt as rho
rho, nbatch = dictP['batchSize']
rho = nt
batchSize = [rho, nbatch]
# Single-site experiment settings.
siteNo = '07060710'
codeLst = ['00660', '00600']
# codeLst = ['00915', '00955']
nh = 256
batchSize = [365, 50]

# if not waterQuality.exist(siteNo):
#     wqData = waterQuality.DataModelWQ.new(siteNo, [siteNo])
wqData = waterQuality.DataModelWQ(siteNo, rmFlag=False)

# inputs: forcing (varF) and attributes (varG); targets: first streamflow
# variable and the water-quality codes above
varX = wqData.varF
varXC = wqData.varG
varY = [wqData.varQ[0]]
varYC = codeLst
varTup = (varX, varXC, varY, varYC)
dataTup, statTup = wqData.transIn(varTup=varTup)
dataTup = trainTS.dealNaN(dataTup, [1, 1, 0, 0])
sizeLst = trainTS.getSize(dataTup)
nx, nxc, ny, nyc, nt, ns = sizeLst

# raw site tables read alongside the transformed training tuples
tabG = gageII.readData(varLst=varXC, siteNoLst=[siteNo])
tabG = gageII.updateCode(tabG)
dfX = waterQuality.readSiteX(siteNo, varX, nFill=5)
dfY = waterQuality.readSiteY(siteNo, varY)
dfYC = waterQuality.readSiteY(siteNo, varYC)

# reload rnn so edits to the module are picked up in this session
importlib.reload(rnn)
model = rnn.AgeLSTM(nx=nx + nxc, ny=ny, nyc=nyc, nh=nh)
optim = torch.optim.Adadelta(model.parameters())
lossFun = crit.RmseMix()
if torch.cuda.is_available():
    lossFun = lossFun.cuda()
# Variable selection: forcing and attribute inputs, streamflow target only.
varX = gridMET.varLst
varXC = gageII.lstWaterQuality
varY = usgs.varQ
varYC = None
varTup = (varX, varXC, varY, varYC)

# dataTup = wqData.extractData(varTup=varTup)
# xR, xcR, yR, ycR = dataTup
# mtdX = ['log-norm', 'norm', 'norm', 'norm', 'norm', 'norm', 'norm']
# x, statX = transform.transInAll(xR, mtdX)
dataTup, statTup = wqData.transIn(varTup=varTup)
(x, xc, y, yc) = dataTup
dataTup = trainTS.dealNaN(dataTup, [1, 1, 0, 0])
(statX, statXC, statY, statYC) = statTup

# concatenate all data
[nx, nxc, ny, nyc, nt, ns] = trainTS.getSize(dataTup)
xx = np.zeros([ns, nt, nx + nxc])
for k in range(ns):
    xTemp = dataTup[0][:, k, :]
    xcTemp = dataTup[1][k, :]
    # repeat the constant inputs once per row of xTemp; nt replaces the
    # hard-coded 365, which only worked when nt == 365
    temp = np.concatenate([xTemp, np.tile(xcTemp, [nt, 1])], axis=-1)
    xx[k, :, :] = temp
xT = torch.from_numpy(xx).float().cuda()
# swap the first two axes of y so it is indexed like xx
yy = np.swapaxes(dataTup[2], 0, 1)
yT = torch.from_numpy(yy).float().cuda()
# xT = xT[0:1, :, :]
# Script form of the per-site linear baseline set-up.
# NOTE: outName and testset are expected to be defined earlier in the script.
wqData = waterQuality.DataModelWQ('Silica64')
master = basins.loadMaster(outName)
dataName = master['dataName']
if wqData is None:
    wqData = waterQuality.DataModelWQ(dataName)
trainset = master['trainName']
# reset_index so the frame index is the row position in the data arrays
infoTrain = wqData.info.iloc[wqData.subset[trainset]].reset_index()
infoTest = wqData.info.iloc[wqData.subset[testset]].reset_index()

# linear reg data
statTup = basins.loadStat(outName)
varTup = (master['varX'], master['varXC'], master['varY'], master['varYC'])
dataTup1 = wqData.transIn(subset=trainset, varTup=varTup, statTup=statTup)
dataTup2 = wqData.transIn(subset=testset, varTup=varTup, statTup=statTup)
dataTup1 = trainTS.dealNaN(dataTup1, master['optNaN'])
dataTup2 = trainTS.dealNaN(dataTup2, master['optNaN'])
varYC = varTup[3]
statYC = statTup[3]
# last slice along the first axis of x (presumably the final time step)
x1 = dataTup1[0][-1, :, :]
yc1 = dataTup1[3]
x2 = dataTup2[0][-1, :, :]

# point test l2 - linear
nc = len(varYC)
matP1 = np.full([len(infoTrain), nc], np.nan)
matP2 = np.full([len(infoTest), nc], np.nan)
siteNoLst = infoTest['siteNo'].unique().tolist()
for siteNo in siteNoLst:
    # row positions of this site in the train and test subsets
    ind1 = infoTrain[infoTrain['siteNo'] == siteNo].index
    ind2 = infoTest[infoTest['siteNo'] == siteNo].index