# NOTE(review): this fragment is cut mid-statement -- it begins with the tail of a
# DataframeCsv(...) call whose opening is outside this view. Left byte-identical.
# It appears to train a CnnCondLstm on CONUSv4f1 forcing+const data against
# SMAP_AM, then denormalizes predictions/targets and computes error statistics;
# presumably `c`, `ty2`, `rootDB`, `nEpoch` are defined in the missing context -- TODO confirm.
subset='CONUSv4f1', tRange=ty2) x = df.getData(varT=dbCsv.varForcing, varC=dbCsv.varConst, doNorm=True, rmNan=True) y = df.getData(varT='SMAP_AM', doNorm=True, rmNan=False) nx = x.shape[-1] ny = 1 model = rnn.CnnCondLstm(nx=nx, ny=ny, ct=365, hiddenSize=64, cnnSize=32, opt=3) lossFun = crit.RmseLoss() model = train.trainModel(model, x, y, lossFun, xc=c, nEpoch=nEpoch, miniBatch=[100, 30]) yOut = train.testModelCnnCond(model, x, y) # yOut = train.testModel(model, x) yP = dbCsv.transNorm(yOut[:, :, 0], rootDB=rootDB, fieldName='SMAP_AM', fromRaw=False) yT = dbCsv.transNorm(y[:, model.ct:, 0], rootDB=rootDB, fieldName='SMAP_AM', fromRaw=False) statDict = post.statError(yP, yT)
# --- load training data for the CONUSv4f1 subset ---
df = hydroDL.data.dbCsv.DataframeCsv(rootDB=rootDB, subset='CONUSv4f1', tRange=ty1)
x = df.getDataTs(dbCsv.varForcing, doNorm=True, rmNan=True)
c = df.getDataConst(dbCsv.varConst, doNorm=True, rmNan=True)
y = df.getDataTs('SMAP_AM', doNorm=True, rmNan=False)

# input size = time-series forcings plus static constants; single target
nx = x.shape[-1] + c.shape[-1]
ny = 1

# --- baseline LSTM: build, train and save ---
model = rnn.CudnnLstmModel(nx=nx, ny=ny, hiddenSize=64)
lossFun = crit.RmseLoss()
model = train.trainModel(model, x, y, c, lossFun, nEpoch=nEpoch, miniBatch=[100, 30])
modelName = 'test-LSTM'
train.saveModel(outFolder, model, nEpoch, modelName=modelName)

# --- data-integration variants: shift the observation window back k days ---
for k in dLst:
    sd = utils.time.t2dt(ty1[0]) - dt.timedelta(days=k)
    ed = utils.time.t2dt(ty1[1]) - dt.timedelta(days=k)
    df2 = hydroDL.data.dbCsv.DataframeCsv(rootDB=rootDB, subset='CONUSv4f1', tRange=[sd, ed])
    obs = df2.getDataTs('SMAP_AM', doNorm=True, rmNan=False)
    model = rnn.LstmCloseModel(nx=nx, ny=ny, hiddenSize=64)
# Lagged observation stream for 7-day data assimilation.
dfz2 = camels.DataframeCamels(subset='all', tRange=[20041225, 20091225])
z2 = dfz2.getDataObs(doNorm=True, rmNan=True)
# z2 = interp.interpNan1d(z2, mode='pre')

# Append the lagged obs as one extra input channel.
xz2 = np.concatenate([x1, z2], axis=2)

ny = 1
nx = x1.shape[-1] + c1.shape[-1]
lossFun = crit.RmseLoss()

# model1 = rnn.CudnnLstmModel(nx=nx, ny=ny, hiddenSize=64)
# model1 = train.trainModel(
#     model1, x1, y1, c1, lossFun, nEpoch=nEpoch, miniBatch=(50, 365))
# train.saveModel(outFolder, model1, nEpoch, modelName='LSTM')

# DA-1: integrate the 1-day-lagged observations (extra input -> nx + 1).
model2 = rnn.CudnnLstmModel(nx=nx + 1, ny=ny, hiddenSize=64)
model2 = train.trainModel(
    model2, xz1, y1, c1, lossFun, nEpoch=nEpoch, miniBatch=(50, 365))
train.saveModel(outFolder, model2, nEpoch, modelName='DA-1')

# DA-7: same architecture, 7-day-lagged observations.
model3 = rnn.CudnnLstmModel(nx=nx + 1, ny=ny, hiddenSize=64)
model3 = train.trainModel(
    model3, xz2, y1, c1, lossFun, nEpoch=nEpoch, miniBatch=(50, 365))
train.saveModel(outFolder, model3, nEpoch, modelName='DA-7')

if 'test' in doLst:
    # Test-period forcings, attributes and raw (unnormalized) observations.
    df2 = camels.DataframeCamels(subset='all', tRange=[20050101, 20150101])
    x2 = df2.getDataTS(varLst=camels.forcingLst, doNorm=True, rmNan=True)
    c2 = df2.getDataConst(varLst=camels.attrLstSel, doNorm=True, rmNan=True)
    yt2 = df2.getDataObs(doNorm=False, rmNan=False).squeeze()
    dfz1 = camels.DataframeCamels(subset='all', tRange=[20041231, 20141231])
    z1 = dfz1.getDataObs(doNorm=True, rmNan=True)
def train(mDict):
    """Train a hydroDL model described by a master configuration.

    Parameters
    ----------
    mDict : dict or str
        Master configuration with keys ``"out"``, ``"data"``, ``"model"``,
        ``"loss"`` and ``"train"``.  A ``str`` is treated as a path to a
        master file and loaded with ``readMasterFile``.

    Side effects
    ------------
    Seeds ``random``/``numpy``/``torch`` RNGs (and forces deterministic
    cuDNN), loads the data via ``loadData``, instantiates the configured
    loss and model, writes the master file to the output folder, and runs
    ``trainModel`` which checkpoints into ``mDict["out"]``.
    """
    # BUG FIX: the original check was `if mDict is str:` which tests identity
    # against the `str` type object and is never True, so a path argument was
    # silently used as a dict and crashed on key access.
    if isinstance(mDict, str):
        mDict = readMasterFile(mDict)
    out = mDict["out"]
    optData = mDict["data"]
    optModel = mDict["model"]
    optLoss = mDict["loss"]
    optTrain = mDict["train"]

    # Fix the random seed; generate one when the config does not supply it so
    # the run is still reproducible from the recorded master file.
    if optTrain["seed"] is None:
        randomseed = int(np.random.uniform(low=0, high=1e6))
        optTrain["seed"] = randomseed
        print("random seed updated!")
    else:
        randomseed = optTrain["seed"]
    random.seed(randomseed)
    torch.manual_seed(randomseed)
    np.random.seed(randomseed)
    torch.cuda.manual_seed(randomseed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # data
    df, x, y, c = loadData(optData)
    # x: ngage*nday*nvar
    # y: ngage*nday*nvar
    # c: ngage*nvar
    # temporal test, fill obs nan using LSTM forecast
    # temp = x[:,:,-1, None]
    # y[np.isnan(y)] = temp[np.isnan(y)]

    # Input width: first element of a (forcing, obs) tuple, plus constants.
    if c is None:
        if type(x) is tuple:
            nx = x[0].shape[-1]
        else:
            nx = x.shape[-1]
    else:
        if type(x) is tuple:
            nx = x[0].shape[-1] + c.shape[-1]
        else:
            nx = x.shape[-1] + c.shape[-1]
    ny = y.shape[-1]

    # loss
    # NOTE(review): eval() of the configured class name executes arbitrary
    # code if the master file is untrusted -- confirm config sources.
    if eval(optLoss["name"]) is hydroDL.model.crit.SigmaLoss:
        lossFun = crit.SigmaLoss(prior=optLoss["prior"])
        optModel["ny"] = ny * 2  # predicts mean and variance per target
    elif eval(optLoss["name"]) is hydroDL.model.crit.RmseLoss:
        lossFun = crit.RmseLoss()
        optModel["ny"] = ny
    elif eval(optLoss["name"]) is hydroDL.model.crit.NSELoss:
        lossFun = crit.NSELoss()
        optModel["ny"] = ny
    elif eval(optLoss["name"]) is hydroDL.model.crit.NSELosstest:
        lossFun = crit.NSELosstest()
        optModel["ny"] = ny
    elif eval(optLoss["name"]) is hydroDL.model.crit.MSELoss:
        lossFun = crit.MSELoss()
        optModel["ny"] = ny
    elif eval(optLoss["name"]) is hydroDL.model.crit.RmseLossCNN:
        lossFun = crit.RmseLossCNN()
        optModel["ny"] = ny
    elif eval(optLoss["name"]) is hydroDL.model.crit.ModifyTrend1:
        lossFun = crit.ModifyTrend1()
        optModel["ny"] = ny

    # model
    if optModel["nx"] != nx:
        print("updated nx by input data")
        optModel["nx"] = nx
    if eval(optModel["name"]) is hydroDL.model.rnn.CudnnLstmModel:
        if type(x) is tuple:
            x = np.concatenate([x[0], x[1]], axis=2)
            if c is None:
                nx = x.shape[-1]
            else:
                nx = x.shape[-1] + c.shape[-1]
            optModel["nx"] = nx
            print("Concatenate input and obs, update nx by obs")
        model = rnn.CudnnLstmModel(
            nx=optModel["nx"], ny=optModel["ny"], hiddenSize=optModel["hiddenSize"]
        )
    elif eval(optModel["name"]) is hydroDL.model.rnn.CpuLstmModel:
        model = rnn.CpuLstmModel(
            nx=optModel["nx"], ny=optModel["ny"], hiddenSize=optModel["hiddenSize"]
        )
    elif eval(optModel["name"]) is hydroDL.model.rnn.LstmCloseModel:
        model = rnn.LstmCloseModel(
            nx=optModel["nx"],
            ny=optModel["ny"],
            hiddenSize=optModel["hiddenSize"],
            fillObs=True,
        )
    elif eval(optModel["name"]) is hydroDL.model.rnn.AnnModel:
        # NOTE(review): AnnModel config intentionally builds AnnCloseModel
        # (without fillObs) in the original code -- confirm this is wanted.
        model = rnn.AnnCloseModel(
            nx=optModel["nx"], ny=optModel["ny"], hiddenSize=optModel["hiddenSize"]
        )
    elif eval(optModel["name"]) is hydroDL.model.rnn.AnnCloseModel:
        model = rnn.AnnCloseModel(
            nx=optModel["nx"],
            ny=optModel["ny"],
            hiddenSize=optModel["hiddenSize"],
            fillObs=True,
        )
    elif eval(optModel["name"]) is hydroDL.model.cnn.LstmCnn1d:
        convpara = optModel["convNKSP"]  # (nkernel, kernelSize, stride, padding)
        model = hydroDL.model.cnn.LstmCnn1d(
            nx=optModel["nx"],
            ny=optModel["ny"],
            rho=optModel["rho"],
            nkernel=convpara[0],
            kernelSize=convpara[1],
            stride=convpara[2],
            padding=convpara[3],
        )
    elif eval(optModel["name"]) is hydroDL.model.rnn.CNN1dLSTMmodel:
        daobsOption = optData["daObs"]
        if type(daobsOption) is list:
            if len(daobsOption) - 3 >= 7:
                # using 1dcnn only when number of obs larger than 7
                optModel["nobs"] = len(daobsOption)
                convpara = optModel["convNKS"]
                model = rnn.CNN1dLSTMmodel(
                    nx=optModel["nx"],
                    ny=optModel["ny"],
                    nobs=optModel["nobs"] - 3,
                    hiddenSize=optModel["hiddenSize"],
                    nkernel=convpara[0],
                    kernelSize=convpara[1],
                    stride=convpara[2],
                    poolOpt=optModel["poolOpt"],
                )
                print("CNN1d Kernel is used!")
            else:
                # Too few obs: fall back to a plain CudnnLstmModel with the
                # obs concatenated as extra input channels.
                if type(x) is tuple:
                    x = np.concatenate([x[0], x[1]], axis=2)
                    # NOTE(review): this fallback assumes c is not None;
                    # it would raise AttributeError otherwise -- TODO confirm.
                    nx = x.shape[-1] + c.shape[-1]
                    optModel["nx"] = nx
                    print("Concatenate input and obs, update nx by obs")
                model = rnn.CudnnLstmModel(
                    nx=optModel["nx"],
                    ny=optModel["ny"],
                    hiddenSize=optModel["hiddenSize"],
                )
                optModel["name"] = "hydroDL.model.rnn.CudnnLstmModel"
                print("Too few observations, not using cnn kernel")
        else:
            raise Exception("CNN kernel used but daobs option is not obs list")
    elif eval(optModel["name"]) is hydroDL.model.rnn.CNN1dLSTMInmodel:
        # daobsOption = optData['daObs']
        daobsOption = list(range(24))
        if type(daobsOption) is list:
            if len(daobsOption) - 3 >= 7:
                # using 1dcnn only when number of obs larger than 7
                optModel["nobs"] = len(daobsOption)
                convpara = optModel["convNKS"]
                model = rnn.CNN1dLSTMInmodel(
                    nx=optModel["nx"],
                    ny=optModel["ny"],
                    # nobs=optModel['nobs']-3,
                    nobs=24,  # temporary test
                    hiddenSize=optModel["hiddenSize"],
                    nkernel=convpara[0],
                    kernelSize=convpara[1],
                    stride=convpara[2],
                    poolOpt=optModel["poolOpt"],
                )
                print("CNN1d Kernel is used!")
            else:
                if type(x) is tuple:
                    x = np.concatenate([x[0], x[1]], axis=2)
                    nx = x.shape[-1] + c.shape[-1]
                    optModel["nx"] = nx
                    print("Concatenate input and obs, update nx by obs")
                model = rnn.CudnnLstmModel(
                    nx=optModel["nx"],
                    ny=optModel["ny"],
                    hiddenSize=optModel["hiddenSize"],
                )
                optModel["name"] = "hydroDL.model.rnn.CudnnLstmModel"
                print("Too few observations, not using cnn kernel")
        else:
            raise Exception("CNN kernel used but daobs option is not obs list")
    elif eval(optModel["name"]) is hydroDL.model.rnn.CNN1dLCmodel:
        # LCrange = optData['lckernel']
        # tLCLst = utils.time.tRange2Array(LCrange)
        if len(x[1].shape) == 2:
            # for LC-FDC
            optModel["nobs"] = x[1].shape[-1]
        elif len(x[1].shape) == 3:
            # for LC-SMAP--get time step
            optModel["nobs"] = x[1].shape[1]
        convpara = optModel["convNKS"]
        model = rnn.CNN1dLCmodel(
            nx=optModel["nx"],
            ny=optModel["ny"],
            nobs=optModel["nobs"],
            hiddenSize=optModel["hiddenSize"],
            nkernel=convpara[0],
            kernelSize=convpara[1],
            stride=convpara[2],
            poolOpt=optModel["poolOpt"],
        )
        print("CNN1d Local calibration Kernel is used!")
    elif eval(optModel["name"]) is hydroDL.model.rnn.CNN1dLCInmodel:
        LCrange = optData["lckernel"]
        tLCLst = utils.time.tRange2Array(LCrange)
        optModel["nobs"] = x[1].shape[-1]
        convpara = optModel["convNKS"]
        model = rnn.CNN1dLCInmodel(
            nx=optModel["nx"],
            ny=optModel["ny"],
            nobs=optModel["nobs"],
            hiddenSize=optModel["hiddenSize"],
            nkernel=convpara[0],
            kernelSize=convpara[1],
            stride=convpara[2],
            poolOpt=optModel["poolOpt"],
        )
        print("CNN1d Local calibration Kernel is used!")
    elif eval(optModel["name"]) is hydroDL.model.rnn.CudnnInvLstmModel:
        # optModel['ninv'] = x[1].shape[-1]
        optModel["ninv"] = x[1].shape[-1] + c.shape[-1]  # Test the inv using attributes
        model = rnn.CudnnInvLstmModel(
            nx=optModel["nx"],
            ny=optModel["ny"],
            hiddenSize=optModel["hiddenSize"],
            ninv=optModel["ninv"],
            nfea=optModel["nfea"],
            hiddeninv=optModel["hiddeninv"],
        )
        print("LSTMInv model is used!")

    # train: never checkpoint less often than the run is long
    if optTrain["saveEpoch"] > optTrain["nEpoch"]:
        optTrain["saveEpoch"] = optTrain["nEpoch"]

    # record the (possibly updated) configuration, then train
    writeMasterFile(mDict)
    model = trainModel(
        model,
        x,
        y,
        c,
        lossFun,
        nEpoch=optTrain["nEpoch"],
        miniBatch=optTrain["miniBatch"],
        saveEpoch=optTrain["saveEpoch"],
        saveFolder=out,
    )
# NOTE(review): fragment cut at both ends -- it starts inside an unseen
# `if interfaceOpt == 1:` branch (the `elif interfaceOpt == 0:` below has no
# visible matching `if`) and ends inside the `for nDay in nDayLst:` body.
# Left byte-identical. It appears to wrap/write the master config, dump
# statistics to statDict.json, train the model, and then set up DI (data
# integration) runs for 1- and 3-day lagged observations -- TODO confirm
# against the original source file.
lossFun = RmseLoss() # the loaded loss should be consistent with the 'name' in optLoss Dict above for logging purpose # update and write the dictionary variable to out folder for logging and future testing masterDict = wrapMaster(out, optData, optModel, optLoss, optTrain) writeMasterFile(masterDict) # log statistics statFile = os.path.join(out, "statDict.json") with open(statFile, "w") as fp: json.dump(statDict, fp, indent=4) # train model model = trainModel( model, xTrain, yTrain, attrs, lossFun, nEpoch=EPOCH, miniBatch=[BATCH_SIZE, RHO], saveEpoch=saveEPOCH, saveFolder=out, ) elif interfaceOpt == 0: # directly train the model using dictionary variable master.train(masterDict) # Train DI model if 1 in Action: nDayLst = [1, 3] for nDay in nDayLst: # nDay: previous Nth day observation to integrate # update parameter "daObs" for data dictionary variable optData = default.update(default.optDataCamels, daObs=nDay)
# Pick the model class: fall back to the CPU implementation when CUDA is
# unavailable (the GPU branch keeps the default LSTM class unchanged).
if not torch.cuda.is_available():
    LSTM = LSTM_CPU

model = LSTM(nx=len(var_time_series) + len(var_constant), ny=len(target), hiddenSize=HIDDEN_SIZE)

# Train, checkpointing every epoch into output_s.
last_model = trainModel(
    model,
    x_train,
    y_train,
    c_train,
    loss_fn,
    nEpoch=EPOCH,
    miniBatch=[BATCH_SIZE, RHO],
    saveEpoch=1,
    saveFolder=output_s,
)

# Load the validation datasets (same layout as the training data).
val_date_list = ["2016-04-01", "2017-03-31"]  # validation period
val_csv = LoadCSV(csv_path_s, val_date_list, all_date_list)
x_val = val_csv.load_time_series(var_time_series)
c_val = val_csv.load_constant(var_constant, convert_time_series=False)
y_val = val_csv.load_time_series(target, remove_nan=False)
# NOTE(review): fragment cut at both ends -- it starts inside an unseen
# `if interfaceOpt == 1:` branch (`elif interfaceOpt == 0:` below has no
# visible matching `if`) and the `if 2 in Action:` test block runs past this
# view. Left byte-identical. It appears to write the master config, train the
# model, and begin setting up model testing at epoch 2000 over the listed
# output-folder cases -- TODO confirm against the original source file.
lossFun = crit.RmseLoss() # the loaded loss should be consistent with the 'name' in optLoss Dict above for logging purpose # update and write the dictionary variable to out folder for logging and future testing masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain) master.writeMasterFile(masterDict) # train model out1 = out ############ model = train.trainModel(model, x, y, c, lossFun, nEpoch=EPOCH, miniBatch=[BATCH_SIZE, RHO], saveEpoch=saveEPOCH, saveFolder=out) elif interfaceOpt == 0: # directly train the model using dictionary variable master.train(masterDict) # Test models if 2 in Action: TestEPOCH = 2000 # it was 200 # choose the model to test after trained "TestEPOCH" epoches # generate a folder name list containing all the tested model output folders caseLst = [ 'All-2010-2016' ] #, '494-B247-H100','460-B230-H100' ,'327-B163-H100','258-B129-H100' ,'169-B169-H100', '29-B29-H100']
# Output folder and the three yearly train/test windows (yyyymmdd ranges).
outFolder = os.path.join(hydroDL.pathSMAP['outTest'], 'closeLoop')
ty1 = [20150401, 20160401]
ty2 = [20160401, 20170401]
ty3 = [20170401, 20180401]

# Stages to run; append 'test' / 'post' to enable those stages.
doLst = list()
doLst.append('train')
# doLst.append('test')
# doLst.append('post')

# Training inputs: forcings + constants stacked into x, SMAP AM as target.
df = hydroDL.data.dbCsv.DataframeCsv(rootDB=rootDB, subset='CONUSv4f1', tRange=ty1)
x = df.getData(varT=dbCsv.varForcing, varC=dbCsv.varConst, doNorm=True, rmNan=True)
y = df.getData(varT='SMAP_AM', doNorm=True, rmNan=False)
nx = x.shape[-1]
ny = 1

# Closed-loop DA model: one extra input channel for the fed-back observation.
model3 = rnn.LstmCloseModel(nx=nx + 1, ny=ny, hiddenSize=64, opt=1)
lossFun = crit.RmseLoss()
model3 = train.trainModel(model3, x, y, lossFun, nEpoch=nEpoch, miniBatch=[100, 30])
modelName = 'LSTM-DA'
train.saveModel(outFolder, model3, nEpoch, modelName=modelName)
# NOTE(review): fragment cut mid-statement -- it begins with the keyword-argument
# tail of a model constructor whose opening is outside this view. Left
# byte-identical. It appears to wrap/write the master config, dump statistics
# to statDict.json, train the model, and (for interfaceOpt == 0) delegate the
# whole run to master.train(masterDict); the trailing '##' notes describe an
# unused batch-job workflow -- TODO confirm against the original source file.
nx=optModel["nx"], ny=optModel["ny"], hiddenSize=optModel["hiddenSize"] ) # Wrap up all the training configurations to one dictionary in order to save into "out" folder masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain) master.writeMasterFile(masterDict) # log statistics statFile = os.path.join(out, "statDict.json") with open(statFile, "w") as fp: json.dump(statDict, fp, indent=4) # Train the model trainedModel = train.trainModel( model, xTrain, yTrain, attrs, lossFun, nEpoch=EPOCH, miniBatch=[BATCH_SIZE, RHO], saveEpoch=saveEPOCH, saveFolder=out, ) if interfaceOpt == 0: # Only need to pass the wrapped configuration dict 'masterDict' for training # nx, ny will be automatically updated later masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain) master.train(masterDict) ## Not used here. ## A potential way to run batch jobs simultaneously in background through multiple GPUs and Linux screens. ## To use this, must manually set the "pathCamels['DB']" in hydroDL/__init__.py as your own root path of CAMELS data.