# Save silica ('00955') subsets of the 'Silica64Seq' dataset, then wrap a
# training configuration and submit it as a GPU job through slurm.
#
# Earlier workflow, kept for reference (it built 'Silica64Seq' when missing):
# wqData = waterQuality.DataModelWQ('Silica64')
# siteNoLst = wqData.siteNoLst
# if not waterQuality.exist('Silica64Seq'):
#     wqData = waterQuality2.DataModelWQ.new('Silica64Seq', siteNoLst)
# importlib.reload(waterQuality2)
# wqData = waterQuality2.DataModelWQ('Silica64Seq')

# The site list is read from the existing 'Silica64' dataset; it is only
# needed by the commented-out `new(...)` call below.
temp = waterQuality.DataModelWQ('Silica64')
siteNoLst = temp.siteNoLst
# wqData = waterQuality2.DataModelWQ.new('Silica64Seq', siteNoLst)
wqData = waterQuality2.DataModelWQ('Silica64Seq')

# subset only have silica
code = '00955'
ic = wqData.varQ.index(code)
# Keep entries whose value for `code` is not NaN at the last position along
# the first axis of wqData.q.
# NOTE(review): presumably the first axis is the time step within a
# sequence -- confirm against DataModelWQ.
hasCode = ~np.isnan(wqData.q[-1, :, ic])
indC = np.where(hasCode)[0]
wqData.saveSubset(code, indC)

# Year-restricted subsets: only the first element returned by indYr is kept
# for each year range.
indYr1 = waterQuality.indYr(
    wqData.info.iloc[indC], yrLst=[1979, 2000])[0]
wqData.saveSubset(code + '-Y8090', indYr1)
indYr2 = waterQuality.indYr(
    wqData.info.iloc[indC], yrLst=[2000, 2020])[0]
wqData.saveSubset(code + '-Y0010', indYr2)

# Wrap the training configuration for the Y8090 subset and submit the job.
saveName = 'Silica64Seq-Y8090'
caseName = basins.wrapMaster(
    dataName='Silica64Seq',
    trainName='00955-Y8090',
    batchSize=[None, 200],
    varY=['00060', '00955'],
    varYC=None,
    outName=saveName)
# NOTE(review): absolute, user-specific path -- only valid on the original
# author's cluster account.
cmdP = 'python /home/users/kuaifang/GitHUB/geolearn/app/waterQual/model/cmdTrain.py -M {}'
slurm.submitJobGPU(caseName, cmdP.format(caseName), nH=6)
# Build "purified" subsets (prefix 'pQ-'): entries whose slice
# wqData.q[:, :, 0] contains no NaN, split by site rank and by year range.

# rmnan in q
q = wqData.q[:, :, 0]
info = wqData.info
qNan = np.isnan(q)
qNanAny = qNan.any(axis=0)

# Leftover interactive checks: the results are discarded when this runs as a
# script. They count entries that are all-NaN, entries with any NaN, and the
# total number of entries.
len(np.where(qNan.all(axis=0))[0])
len(np.where(qNanAny)[0])
len(wqData.info)
# nan in Q - 3% all nan, 7% any nan
indR = np.where(qNanAny)[0]   # entries with at least one NaN
indK = np.where(~qNanAny)[0]  # entries with no NaN at all

# purify q: split the kept entries by the 'pRank' column of countSite's
# output (<= 0.5 versus > 0.5).
infoK = info.iloc[indK]
dfSite = waterQuality.countSite(infoK)
pRank = dfSite['pRank']
ind1 = dfSite[pRank <= 0.5].index.values
ind2 = dfSite[pRank > 0.5].index.values
wqData.saveSubset(['pQ-F50', 'pQ-L50'], [ind1, ind2])

# yr after purify q: one subset per element returned by indYr, plus the
# complement of each within the purified entries.
indYr = waterQuality.indYr(infoK)
yrLst = ['Y80', 'Y90', 'Y00', 'Y10']
wqData.saveSubset(['pQ-' + x for x in yrLst], indYr)
indAll = infoK.index.values
indYrCmp = [np.setdiff1d(indAll, indOne) for indOne in indYr]
wqData.saveSubset(['pQ-rm' + x for x in yrLst], indYrCmp)

# validate
# d=wqData.info.iloc[wqData.subset['pQ-Y00']]['date']
# np.sort(pd.DatetimeIndex(d).year.unique())

# regional / area subsets: read the listed gageII attributes for every site
# in the dataset.
varG = ['DRAIN_SQKM', 'ECO2_BAS_DOM', 'NUTR_BAS_DOM', 'HLR_BAS_DOM_100M']
tabG = gageII.readData(varLst=varG, siteNoLst=wqData.siteNoLst)
info = wqData.info
# Task switch: only the steps named in doLst are executed.
doLst = ['subset']

if 'subset' in doLst:
    # find ind have SiO4, NO3
    codeLst = ['00618', '00955']
    codeSorted = sorted(codeLst)
    icLst = [wqData.varC.index(code) for code in codeLst]

    # NOTE(review): a method call binds tighter than unary `~`, so
    # `~np.isnan(x).all(axis=1)` is `~(np.isnan(x).all(axis=1))`.
    # As written, indAll selects rows where at least one code is non-NaN and
    # indAny selects rows where every code is non-NaN. If the labels are
    # meant as "all codes present" / "any code present" they are swapped.
    # Behaviour is kept as-is because the saved subset names depend on it;
    # confirm the intent before changing.
    cNan = np.isnan(wqData.c[:, icLst])
    indAll = np.where(~cNan.all(axis=1))[0]
    indAny = np.where(~cNan.any(axis=1))[0]

    # A loop that recomputed ic/indC per code without using or printing them
    # (under a "print number of samples" comment) was removed; the per-code
    # loop below computes the same values.

    # separate index by years: for each combined selection save the entries
    # in the first element returned by indYr, and the remaining ones under
    # an 'rm' name.
    for ind, lab in zip([indAll, indAny], ['all', 'any']):
        indYr = waterQuality.indYr(
            wqData.info.iloc[ind], yrLst=[1979, 2000])[0]
        indYrCmp = np.setdiff1d(ind, indYr)
        wqData.saveSubset('-'.join(codeSorted + [lab, 'Y8090']), indYr)
        wqData.saveSubset('-'.join(codeSorted + [lab, 'rmY8090']), indYrCmp)

    # Same split, one code at a time.
    for code in codeLst:
        ic = wqData.varC.index(code)
        indC = np.where(~np.isnan(wqData.c[:, ic]))[0]
        indYr = waterQuality.indYr(
            wqData.info.iloc[indC], yrLst=[1979, 2000])[0]
        indYrCmp = np.setdiff1d(indC, indYr)
        wqData.saveSubset(code + '-Y8090', indYr)
        wqData.saveSubset(code + '-rmY8090', indYrCmp)

    # validate (the subset name below is not created by this block)
    # d=wqData.info.iloc[wqData.subset['00618-00955-any-Y10']]['date']
    # np.sort(pd.DatetimeIndex(d).year.unique())
    # ind=wqData.info.iloc[wqData.subset['00618-00955-any-Y10']].index.values
    # wqData.c[ind, wqData.varC.index('00618')]