Exemplo n.º 1
0
def funcP(axP, iP, iM):
    """Populate the detail panels for one (site, variable) pair.

    axP -- sequence of 5 axes: [0]/[1] maps, [2]/[3] time series, [4] C-Q scatter.
    iP  -- index of the highlighted site in the global siteNoLst.
    iM  -- column index into the global metric matrices / codeLst2.

    NOTE(review): relies on module-level globals (xMat, yMat, cMat, cR,
    siteNoLst, codeLst2, dictObs, dictLSTM, dictWRTDS, ind1, ind2) that
    must be defined by the calling script -- confirm against the full file.
    """
    # difference of squared metrics per site (presumably Rsq LSTM - Rsq WRTDS)
    rr = xMat[:, iM]**2-yMat[:, iM]**2
    # cMat may carry one column per variable, or a single shared vector
    cc = cMat[:, iM] if cMat.ndim == 2 else cMat
    dfCrd = gageII.readData(
        varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    # maps; a black circle marks the selected site on each
    axplot.mapPoint(axP[0], lat, lon, rr, vRange=[-0.3, 0.3], s=16, cb=False)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[0].add_patch(circle)
    axplot.mapPoint(axP[1], lat, lon, cc, vRange=cR, s=16, cb=False)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[1].add_patch(circle)
    siteNo = siteNoLst[iP]
    # ts
    code = codeLst2[iM]
    print(code, siteNo)
    print(iP, iM)
    v0 = dictObs[siteNo][code].values
    v1 = dictLSTM[siteNo][code].values
    v2 = dictWRTDS[siteNo][code].values
    t = dictObs[siteNo].index.values
    legLst = ['LSTM', 'WRTDS', 'Obs']
    # ind1/ind2 split the record into two periods (defined by the script)
    axplot.plotTS(axP[2], t[ind1], [v1[ind1], v2[ind1], v0[ind1]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    axplot.plotTS(axP[3], t[ind2], [v1[ind2], v2[ind2], v0[ind2]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    # cq: concentration vs log-discharge, colored by day of year
    q = dictObs[siteNo]['00060'].values
    c = dictObs[siteNo][code].values
    td = dictObs[siteNo].index.dayofyear
    sc = axP[4].scatter(np.log(q), c, c=td, cmap='hsv', vmin=0, vmax=365)
Exemplo n.º 2
0
def readSiteY(siteNo,
              varY,
              area=None,
              sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Read target variables for one USGS site on a daily index [sd, ed].

    Water-quality codes in varY are read from the sample record (with
    flag columns); '00060' and/or 'runoff' come from the streamflow
    record.  Returns a DataFrame indexed by date, columns in varY order.
    """
    out = pd.DataFrame({'date': pd.date_range(sd, ed)}).set_index('date')
    # water-quality samples and their flag columns
    sampleCodes = [c for c in varY if c in usgs.codeLst]
    dfC, dfCF = usgs.readSample(siteNo,
                                codeLst=sampleCodes,
                                startDate=sd,
                                flag=True)
    if '00060' in varY or 'runoff' in varY:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varY:
            if area is None:
                # look up the drainage area from gageII when not supplied
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        out = out.join(dfQ)
    out = out.join(dfC).join(dfCF)
    return out[varY]
Exemplo n.º 3
0
 def getGeo(self, subsetName=None):
     """Return (lat, lon) value arrays for the sites of *subsetName*."""
     sites = self.getSite(subsetName=subsetName)
     crd = gageII.readData(
         varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=sites)
     return crd['LAT_GAGE'].values, crd['LNG_GAGE'].values
Exemplo n.º 4
0
def plotP(xx, yy, cc, iP, code):
    """Fill the detail figure (globals figP/axP) for the selected site.

    xx, yy -- per-site metrics mapped in panels 0/1 (LSTM / WRTDS corr,
              judging by the suptitle -- TODO confirm)
    cc     -- per-site values of attribute cVar, mapped in panel 2
    iP     -- index of the highlighted site in the global siteNoLst
    code   -- USGS water-quality code to plot

    NOTE(review): relies on globals (siteNoLst, dictObs, dictLSTM,
    dictWRTDS, ind1, ind2, cR, cVar, figP, axP) defined by the script.
    """
    dfCrd = gageII.readData(
        varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    # maps; a black circle marks the selected site on each
    axplot.mapPoint(axP[0], lat, lon, xx, vRange=[-0.5, 1], s=16)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[0].add_patch(circle)
    axplot.mapPoint(axP[1], lat, lon, yy, vRange=[-0.5, 1], s=16)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[1].add_patch(circle)
    axplot.mapPoint(axP[2], lat, lon, cc, vRange=cR, s=16)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[2].add_patch(circle)
    siteNo = siteNoLst[iP]
    # ts: observed vs the two model predictions, split by ind1/ind2
    v0 = dictObs[siteNo][code].values
    v1 = dictLSTM[siteNo][code].values
    v2 = dictWRTDS[siteNo][code].values
    t = dictObs[siteNo].index.values
    legLst = ['LSTM', 'WRTDS', 'Obs']
    axplot.plotTS(axP[3], t[ind1], [v1[ind1], v2[ind1], v0[ind1]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    axplot.plotTS(axP[4], t[ind2], [v1[ind2], v2[ind2], v0[ind2]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    # cq: concentration vs log-discharge, colored by day of year
    q = dictObs[siteNo]['00060'].values
    c = dictObs[siteNo][code].values
    td = dictObs[siteNo].index.dayofyear
    sc = axP[5].scatter(np.log(q), c, c=td, cmap='hsv', vmin=0, vmax=365)
    # figP.colorbar(sc, ax=axP[5])
    figP.suptitle('code {} {}; siteNo {} \n corrLSTM {:.2f}; corrWRTDS {:.2f}; {} {}'.format(
        code, usgs.codePdf.loc[code]['shortName'], siteNo, xx[iP], yy[iP], cVar, cc[iP]))
    figP.show()
Exemplo n.º 5
0
def calRunoff(q, info):
    """Convert streamflow q to runoff using each row's basin drainage area.

    q    -- streamflow values aligned row-wise with *info*
    info -- DataFrame with a 'siteNo' column identifying each row's basin
    Returns the runoff array produced by calRunoffArea.
    """
    siteNoLst = info.siteNo.unique().tolist()
    dfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
    # The original called dfArea.rename({'STAID': 'siteNo'}) and discarded
    # the returned frame -- a no-op.  join(on='siteNo') matches the
    # info['siteNo'] column against dfArea's index regardless of the
    # index name, so no rename is needed at all; the dead statement is
    # removed here.
    area = info.join(dfArea, on='siteNo')['DRAIN_SQKM'].values
    runoff = calRunoffArea(q, area)
    return runoff
Exemplo n.º 6
0
def wrapData(caseName,
             siteNoLst,
             nFill=5,
             freq='D',
             sdStr='1979-01-01',
             edStr='2019-12-31'):
    """Read, align and save model input data for a list of USGS sites.

    Produces f (forcing), q (streamflow), c (water quality) arrays of
    shape [nT, nSite, nVar] and g (gageII constants) of [nSite, nVar],
    saved compressed into the case folder together with an info json.

    caseName -- name of the data case (determines the save folder)
    nFill    -- max consecutive NaNs to interpolate in forcings
    freq     -- sampling frequency forwarded to readSiteTS
    """
    varF = gridMET.varLst
    varQ = usgs.varQ
    varG = gageII.lstWaterQuality
    varC = usgs.newC

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]

    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings: interpolate short interior gaps only (up to nFill)
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(limit=nFill,
                                  limit_direction='both',
                                  limit_area='inside')
        fLst.append(tempF.values)
        # water quality: kept sparse on purpose -- no interpolation
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geo-constants
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    # stack sites into axis 1 -> [nT, nSite, nVar]
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)

    # save; makedirs avoids the exists()/mkdir() race and creates parents
    saveFolder = caseFolder(caseName)
    os.makedirs(saveFolder, exist_ok=True)
    np.savez_compressed(os.path.join(saveFolder, 'data'), c=c, q=q, f=f, g=g)
    dictData = dict(name=caseName,
                    varG=varG,
                    varQ=varQ,
                    varF=varF,
                    varC=varC,
                    sd=sdStr,
                    ed=edStr,
                    freq=freq,
                    siteNoLst=siteNoLst)
    with open(os.path.join(saveFolder, 'info') + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
Exemplo n.º 7
0
def wrapData(caseName,
             siteNoLst,
             nFill=5,
             freq='D',
             sdStr='1979-01-01',
             edStr='2019-12-31',
             varF=gridMET.varLst + ntn.varLst + GLASS.varLst,
             varQ=usgs.varQ,
             varG=gageII.varLst,
             varC=usgs.newC):
    """Read, align and save model input data for a list of USGS sites.

    Builds f (forcing), q (streamflow), c (water quality) arrays of
    shape [nT, nSite, nVar] and g (gageII constants) of [nSite, nVar],
    then delegates persistence to saveDataFrame.

    NOTE(review): the varF/varQ/varG/varC defaults are mutable lists
    evaluated once at import time; they are not mutated here, but
    callers should avoid mutating them.
    """
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]

    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings: interpolate short interior gaps only (up to nFill)
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(limit=nFill,
                                  limit_direction='both',
                                  limit_area='inside')
        fLst.append(tempF.values)
        # # water quality
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    # stack sites into axis 1 -> [nT, nSite, nVar]
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)

    # save
    saveDataFrame(caseName,
                  c=c,
                  q=q,
                  f=f,
                  g=g,
                  varC=varC,
                  varQ=varQ,
                  varF=varF,
                  varG=varG,
                  sdStr=sdStr,
                  edStr=edStr,
                  freq=freq,
                  siteNoLst=siteNoLst)
Exemplo n.º 8
0
def readSiteTS(siteNo,
               varLst,
               freq='D',
               area=None,
               sd=np.datetime64('1979-01-01'),
               ed=np.datetime64('2019-12-31'),
               rmFlag=True):
    """Read a merged daily (or weekly) time-series table for one site.

    varLst is partitioned by data source: USGS samples (varC),
    streamflow (varQ), gridMET forcing (varF), NTN (varP), GLASS (varR)
    and generated time features (varT).  Columns are returned in varLst
    order.

    freq   -- 'D' for daily; 'W' for weekly means (week ending Tuesday)
    area   -- drainage area for runoff; looked up from gageII when None
    rmFlag -- remove flagged water-quality samples
    Raises ValueError for an unsupported freq (previously returned None
    silently).
    """
    # read data
    td = pd.date_range(sd, ed)
    varC = list(set(varLst).intersection(usgs.varC))
    varQ = list(set(varLst).intersection(usgs.varQ))
    varF = list(set(varLst).intersection(gridMET.varLst))
    varP = list(set(varLst).intersection(ntn.varLst))
    varR = list(set(varLst).intersection(GLASS.varLst))
    varT = list(set(varLst).intersection(varTLst))

    dfD = pd.DataFrame({'date': td}).set_index('date')
    if len(varC) > 0:
        if rmFlag:
            dfC, dfCF = usgs.readSample(siteNo,
                                        codeLst=varC,
                                        startDate=sd,
                                        flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)
    if len(varQ) > 0:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                # drainage area from gageII when not supplied by caller
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)
    if len(varF) > 0:
        dfF = gridMET.readBasin(siteNo, varLst=varF)
        dfD = dfD.join(dfF)
    if len(varP) > 0:
        dfP = ntn.readBasin(siteNo, varLst=varP, freq='D')
        dfD = dfD.join(dfP)
    if len(varR) > 0:
        dfR = GLASS.readBasin(siteNo, varLst=varR, freq='D')
        dfD = dfD.join(dfR)
    if len(varT) > 0:
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])
    dfD = dfD[varLst]
    if freq == 'D':
        return dfD
    elif freq == 'W':
        dfW = dfD.resample('W-TUE').mean()
        return dfW
    else:
        # fail loudly instead of falling through and returning None
        raise ValueError("freq must be 'D' or 'W', got {!r}".format(freq))
Exemplo n.º 9
0
def funcM():
    """Build the overview map figure and an empty detail figure.

    Returns (figM, axM, figP, axP, lon, lat) for an interactive
    map-click workflow.

    NOTE(review): uses globals xMat, yMat, siteNoLst defined by the
    calling script.
    """
    dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'],
                            siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    # push sites with a missing metric off the visible map (assumes
    # xMat is 1-D per-site here -- TODO confirm)
    lat[np.isnan(xMat)] = 9999
    lon[np.isnan(xMat)] = 9999
    figM, axM = plt.subplots(1, 1, figsize=(12, 4))
    axplot.mapPoint(axM, lat, lon, xMat**2 - yMat**2, vRange=[-0.3, 0.3], s=16)
    axM.set_title('testing Rsq LSTM - Rsq WRTDS')
    # detail figure: two wide TS panels on the left, one tall panel right
    figP = plt.figure(figsize=[16, 6])
    axP = list()
    gsP = gridspec.GridSpec(2, 3)
    axP.append(figP.add_subplot(gsP[0, :2]))
    axP.append(figP.add_subplot(gsP[1, :2]))
    axP.append(figP.add_subplot(gsP[0:, 2]))
    axP = np.array(axP)
    return figM, axM, figP, axP, lon, lat
Exemplo n.º 10
0
def readSiteX(siteNo,
              varX,
              area=None,
              nFill=5,
              sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Read model input variables for one site on a daily index [sd, ed].

    Forcings come from gridMET; '00060' and/or 'runoff' from the
    streamflow record.  Gaps of up to nFill days are interpolated.
    """
    out = pd.DataFrame({'date': pd.date_range(sd, ed)}).set_index('date')
    dfF = gridMET.readBasin(siteNo)
    if '00060' in varX or 'runoff' in varX:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varX:
            if area is None:
                # drainage area from gageII when not supplied by caller
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        out = out.join(dfQ)
    out = out.join(dfF)[varX]
    return out.interpolate(limit=nFill, limit_direction='both')
Exemplo n.º 11
0
    fig = figplot.boxPlot(dataBox, label1=labLst1, label2=labLst2)
    fig.suptitle(title)
    fig.show()
    fig.savefig(os.path.join(figFolder, figName))

# NOTE(review): script fragment; wqData, errMatLst2, kPath and the
# plotting setup are defined earlier in the original file.
siteNoLst = wqData.info['siteNo'].unique().tolist()
# Hydrologic Benchmark Network sites that are also in this data set
dfHBN = pd.read_csv(os.path.join(kPath.dirData, 'USGS', 'inventory',
                                 'HBN.csv'),
                    dtype={
                        'siteNo': str
                    }).set_index('siteNo')
siteNoHBN = [siteNo for siteNo in dfHBN.index.tolist() if siteNo in siteNoLst]
# free-text columns are not usable as model attributes
dropColLst = [
    'STANAME', 'WR_REPORT_REMARKS', 'ADR_CITATION', 'SCREENING_COMMENTS'
]
dfX = gageII.readData(siteNoLst=siteNoLst).drop(columns=dropColLst)
dfX = gageII.updateCode(dfX)
# presumably converts ft^3/s per km^2 drainage to a yearly runoff depth
# -- TODO confirm units
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2

# area vs error
indHBN = [siteNoLst.index(siteNo) for siteNo in siteNoHBN]
area = dfX['DRAIN_SQKM'].values
errMat = errMatLst2[0]
code = '00605'
# code = '00955'
err = errMat[:, wqData.varC.index(code), 1]
fig, ax = plt.subplots(1, 1)
# all sites in blue, HBN (benchmark) sites highlighted in red
ax.plot(area, err, 'b*')
ax.plot(area[indHBN], err[indHBN], 'r*')
# np.nanmedian(err)
# np.nanmedian(err[indHBN, :])
Exemplo n.º 12
0
def wrapData(caseName,
             siteNoLst,
             rho=365,
             nFill=5,
             varC=usgs.varC,
             varG=gageII.lstWaterQuality):
    """ wrap up input and target data for the model,as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series
    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site
    Keyword Arguments:
        rho {int} -- length (days) of the input window before each sample (default: {365})
        nFill {int} -- max number of continuous nan to interpolate in input data (default: {5})
        varC {list} -- list of water quality code to learn (default: {usgs.lstCodeSample})
        varG {list} -- list of constant variables in gageII (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency.
    # pd.datetime was removed in pandas 2.0; Timestamp behaves identically here
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)

    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(siteNo,
                                    codeLst=varC,
                                    startDate=startDate,
                                    flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        # one training sample per water-quality observation date
        for k in range(len(dfC)):
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            # skip samples whose rho-day window leaves the study period
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfQ).interpolate(limit=nFill,
                                                       limit_direction='both')
            tempF = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfF).interpolate(limit=nFill,
                                                       limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff as a second streamflow channel
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName,
                    rho=rho,
                    nFill=nFill,
                    varG=varG,
                    varC=varC,
                    varQ=['00060', 'runoff'],
                    varF=gridMET.varLst,
                    siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
Exemplo n.º 13
0
from hydroDL.master import basins
from hydroDL.data import usgs, gageII
from hydroDL.master import slurm
from hydroDL.post import axplot, figplot
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import json
import sklearn.tree
import matplotlib.gridspec as gridspec

from sklearn import decomposition

# load gageII
# NOTE(review): kPath is used below but is not in the visible import
# block -- confirm it is imported in the full file.
dfGeo = gageII.readData()
dfGeo = gageII.updateCode(dfGeo)
dfGeo = gageII.removeField(dfGeo)
dirTree = r'C:\Users\geofk\work\waterQuality\C-Q\tree'

# count
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeCount = sorted(usgs.codeLst)
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
countMatAll = np.load(os.path.join(dirInv, 'matCountWeekly.npy'))
# total count per (site, code), summed over the weekly axis
countMat = np.ndarray([len(siteNoLstAll), len(codeCount)])
for ic, code in enumerate(codeCount):
    countMat[:, ic] = np.sum(countMatAll[:, :, ic], axis=1)

# select site
Exemplo n.º 14
0
# NOTE(review): script fragment; dictLSTMLst, dictSite, dictObs,
# dictWRTDS, siteNoLst, codeLst, ind2 and utils are defined earlier.
dictLSTM = dictLSTMLst[0]
# slice 0: LSTM vs obs; 1: WRTDS vs obs; 2: LSTM vs WRTDS
corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v0 = dictObs[siteNo][code].iloc[ind2].values
        v1 = dictLSTM[siteNo][code].iloc[ind2].values
        v2 = dictWRTDS[siteNo][code].iloc[ind2].values
        rmse1, corr1 = utils.stat.calErr(v1, v0)
        rmse2, corr2 = utils.stat.calErr(v2, v0)
        rmse3, corr3 = utils.stat.calErr(v1, v2)
        corrMat[indS, ic, 0] = corr1
        corrMat[indS, ic, 1] = corr2
        corrMat[indS, ic, 2] = corr3

# basin attributes for the attribute-vs-performance comparison below
dfG = gageII.readData(varLst=None, siteNoLst=siteNoLst)
varG = 'DDENS_2009'

# plot 121
importlib.reload(axplot)
codeLst2 = [
    '00095', '00400', '00405', '00600', '00605', '00618', '00660', '00665',
    '00681', '00915', '00925', '00930', '00935', '00940', '00945', '00950',
    '00955', '70303', '71846', '80154'
]
fig, axes = plt.subplots(5, 4)
for k, code in enumerate(codeLst2):
    j, i = utils.index2d(k, 5, 4)
    ax = axes[j, i]
    ic = codeLst.index(code)
    # x = corrMat[:, ic, 1]
Exemplo n.º 15
0
from hydroDL.data import dbCsv
from hydroDL.utils import gis, grid
from hydroDL.data import usgs, gageII, gridMET, ntn, transform
from hydroDL import kPath
import time
import csv
import os
import pandas as pd
import numpy as np

# load sites
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLst = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
varLst = ['ECO3_BAS_DOM', 'LAT_GAGE', 'LNG_GAGE', 'CLASS']
dfR = gageII.readData(varLst=varLst, siteNoLst=siteNoLst)
dfR = gageII.updateCode(dfR)
fileT = os.path.join(gageII.dirTab, 'lookupEco.csv')
tabT = pd.read_csv(fileT).set_index('Eco3code')

# split each site's dotted level-3 ecoregion code (e.g. '8.1.3')
# into its three hierarchy levels
mat = np.full([len(siteNoLst), 3], np.nan)
for code in range(1, 85):
    siteNoTemp = dfR[dfR['ECO3_BAS_DOM'] == code].index
    ind = [siteNoLst.index(siteNo) for siteNo in siteNoTemp]
    eco3 = tabT.loc[code]['Eco3']
    EcoB1, EcoB2, EcoB3 = eco3.split('.')
    mat[ind, 0] = EcoB1
    mat[ind, 1] = EcoB2
    mat[ind, 2] = EcoB3
dfEcoB = pd.DataFrame(index=siteNoLst,
                      columns=['EcoB1', 'EcoB2', 'EcoB3'],
Exemplo n.º 16
0
def testModelSeq(outName,
                 siteNoLst,
                 wqData=None,
                 ep=None,
                 returnOut=False,
                 retest=False,
                 sd=np.datetime64('1979-01-01'),
                 ed=np.datetime64('2019-12-31')):
    """Run (or reload) the sequence test of a trained model for each site.

    Per-site predictions are cached as CSV under the model's output
    folder; a site is re-predicted only when its file is missing or
    retest=True.  With a SigmaLoss model, a parallel '<site>_sigma' CSV
    holds the predicted uncertainty.

    Returns a {siteNo: DataFrame} dict when returnOut is True, else None.
    """
    # run sequence test for all sites, default to be from first date to last date
    if type(siteNoLst) is not list:
        siteNoLst = [siteNoLst]
    master = loadMaster(outName)
    # SigmaLoss models interleave mean and log-variance output channels
    doSigma = master['crit'] == 'SigmaLoss'
    if ep is None:
        ep = master['nEpoch']
    outDir = nameFolder(outName)
    sdS = pd.to_datetime(sd).strftime('%Y%m%d')
    edS = pd.to_datetime(ed).strftime('%Y%m%d')
    saveDir = os.path.join(outDir, 'seq-{}-{}-ep{}'.format(sdS, edS, ep))
    # makedirs avoids the exists()/mkdir() race
    os.makedirs(saveDir, exist_ok=True)
    siteSaveLst = os.listdir(saveDir)
    if retest is True:
        sitePredLst = siteNoLst
    else:
        sitePredLst = [
            siteNo for siteNo in siteNoLst if siteNo not in siteSaveLst
        ]
    if len(sitePredLst) != 0:
        if wqData is None:
            wqData = waterQuality.DataModelWQ(master['dataName'])
        (varX, varXC, varY, varYC) = (master['varX'], master['varXC'],
                                      master['varY'], master['varYC'])
        (statX, statXC, statY, statYC) = loadStat(outName)
        model = loadModel(outName, ep=ep)
        tabG = gageII.readData(varLst=varXC, siteNoLst=siteNoLst)
        tabG = gageII.updateCode(tabG)
        for siteNo in sitePredLst:
            if 'DRAIN_SQKM' in varXC:
                area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values[0]
            else:
                area = None
            # test model
            print('testing {} from {} to {}'.format(siteNo, sdS, edS))
            freq = wqData.freq
            dfX = waterQuality.readSiteTS(siteNo,
                                          varX,
                                          freq=freq,
                                          area=area,
                                          sd=sd,
                                          ed=ed)
            xA = np.expand_dims(dfX.values, axis=1)
            # np.float was removed in numpy 1.24; builtin float is equivalent
            xcA = np.expand_dims(tabG.loc[siteNo].values.astype(float),
                                 axis=0)
            mtdX = waterQuality.extractVarMtd(varX)
            x = transform.transInAll(xA, mtdX, statLst=statX)
            mtdXC = waterQuality.extractVarMtd(varXC)
            xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
            [x, xc] = trainTS.dealNaN([x, xc], master['optNaN'][:2])
            yOut = trainTS.testModel(model, x, xc)
            # transfer out
            nt = len(dfX)
            ny = len(varY) if varY is not None else 0
            nyc = len(varYC) if varYC is not None else 0
            if doSigma:
                # even channels hold means, odd channels log-variances
                yP = np.full([nt, ny + nyc], np.nan)
                sP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny * 2:2], statY,
                                             varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny * 2::2], statYC,
                                             varYC)
                sP[:, :ny] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, 1:ny * 2:2])), statY, varY)
                sP[:, ny:] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, ny * 2 + 1::2])), statYC, varYC)
            else:
                yP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)
            # save output
            t = dfX.index.values.astype('datetime64[D]')
            colY = [] if varY is None else varY
            colYC = [] if varYC is None else varYC
            dfOut = pd.DataFrame(data=yP, columns=[colY + colYC], index=t)
            dfOut.index.name = 'date'
            dfOut = dfOut.reset_index()
            dfOut.to_csv(os.path.join(saveDir, siteNo), index=False)
            if doSigma:
                dfOutS = pd.DataFrame(data=sP, columns=[colY + colYC], index=t)
                dfOutS.index.name = 'date'
                # BUG FIX: previously `dfOutS = dfOut.reset_index()`, so
                # the sigma CSV silently contained the mean predictions
                dfOutS = dfOutS.reset_index()
                dfOutS.to_csv(os.path.join(saveDir, siteNo + '_sigma'),
                              index=False)
    # load all csv
    if returnOut:
        dictOut = dict()
        for siteNo in siteNoLst:
            dfOut = pd.read_csv(os.path.join(saveDir, siteNo))
            dictOut[siteNo] = dfOut
            if doSigma:
                dfOut = pd.read_csv(os.path.join(saveDir, siteNo + '_sigma'))
                dictOut[siteNo + '_sigma'] = dfOut
        return dictOut
Exemplo n.º 17
0
from hydroDL import kPath
from hydroDL.app import waterQuality
from hydroDL.data import gageII
import pandas as pd
import numpy as np
import os
import time

# all gages
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()

# keep only reference-class (CLASS == 1) basins
tabSel = gageII.readData(
    varLst=['CLASS'], siteNoLst=siteNoLstAll)
tabSel = gageII.updateCode(tabSel)
siteNoLst = tabSel[tabSel['CLASS'] == 1].index.tolist()

# wqData = waterQuality.DataModelWQ.new('basinRef', siteNoLst)
wqData = waterQuality.DataModelWQ('basinRef')

# indYr1 = waterQuality.indYr(wqData.info, yrLst=[1979, 2000])[0]
# wqData.saveSubset('Y8090', indYr1)
# indYr2 = waterQuality.indYr(wqData.info, yrLst=[2000, 2020])[0]
# wqData.saveSubset('Y0010', indYr2)

# save an odd/even-year split as named subsets (cross-validation style)
indYrO, indYrE = waterQuality.indYrOddEven(wqData.info)
wqData.saveSubset('Yodd', indYrO)
wqData.saveSubset('Yeven', indYrE)
Exemplo n.º 18
0
            [dfC.set_index('date').dropna(how='all'),
             dfQ.set_index('date')],
            axis=1,
            join='inner')
        dictData[siteNo] = pdf
        print('\t {}/{} {:.2f}'.format(i, len(siteNoLst),
                                       time.time() - t0),
              end='\r')
    fileName = os.path.join(kPath.dirWQ, 'tempData', 'CQall')
    pickle.dump(dictData, open(fileName, 'wb'))
else:
    dictData = pickle.load(open(fileName, 'rb'))
print('read all C-Q data {:.2f}'.format(time.time() - t0))

# calculate slope
# NOTE(review): script fragment; siteNoLst and waterQuality are defined
# earlier in the original file.
pdfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
# presumably converts ft^3/s per km^2 drainage to a yearly runoff depth
# -- TODO confirm units
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
codeLst = waterQuality.codeLst
# codeLst = ['00955', '00940', '00915']

nSite = len(siteNoLst)
codeQ = '00060_00003'
# per (site, code): 4 fitted C-Q parameters plus a sample count
pMat = np.full([nSite, len(codeLst), 4], np.nan)
nMat = np.full([nSite, len(codeLst)], np.nan)
t0 = time.time()
for i, codeC in enumerate(codeLst):
    for j, siteNo in enumerate(siteNoLst):
        pdf = dictData[siteNo][[codeC, codeQ]].dropna()
        if len(pdf.index) > 10:
            area = pdfArea.loc[siteNo].values[0]
            q = pdf[codeQ].values / area * unitConv
Exemplo n.º 19
0
varPLst = ['ph', 'Conduc', 'Ca', 'Mg', 'K', 'Na', 'NH4', 'NO3', 'Cl', 'SO4']
# Expand each NTN sampling interval [dateon, dateoff) into daily rows
# carrying that interval's values.
frames = []
for k in range(len(tab)):
    t1 = pd.to_datetime(tab.iloc[k]['dateon']).date()
    t2 = pd.to_datetime(tab.iloc[k]['dateoff']).date()
    tt = pd.date_range(t1, t2)[:-1]
    data = np.tile(tab.iloc[k][varPLst].values, [len(tt), 1])
    frames.append(pd.DataFrame(index=tt, columns=varPLst, data=data))
# DataFrame.append was removed in pandas 2.0; concat the parts instead
dfP = pd.concat([pd.DataFrame(columns=varPLst)] + frames)
# BUG FIX: the original discarded the result of dropna (a no-op);
# assign it so all-NaN rows are actually removed
dfP = dfP.dropna(how='all')

# pd.datetime was removed in pandas 2.0; Timestamp behaves identically here
startDate = pd.Timestamp(1979, 1, 1)
endDate = pd.Timestamp(2019, 12, 31)

# gageII
# NOTE(review): script fragment; varG, varC and siteNoLst are defined
# earlier in the original file.
tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
tabG = gageII.updateCode(tabG)

# read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
fLst = list()  # forcing ts
gLst = list()  # geo-const
qLst = list()  # streamflow
cLst = list()  # water quality
cfLst = list()  # water quality flags
infoLst = list()
t0 = time.time()
for i, siteNo in enumerate(siteNoLst):
    t1 = time.time()
    dfC, dfCF = usgs.readSample(siteNo,
                                codeLst=varC,
                                startDate=startDate,
Exemplo n.º 20
0
from hydroDL import kPath
from hydroDL.app import waterQuality
from hydroDL.master import basins
from hydroDL.data import usgs, gageII, gridMET, ntn
import numpy as np
import pandas as pd
import json
import os

regionLst = ['ECO2_BAS_DOM', 'NUTR_BAS_DOM', 'HLR_BAS_DOM_100M', 'PNV_BAS_DOM']
dfG = gageII.readData(varLst=regionLst+['LAT_GAGE', 'LNG_GAGE', 'CLASS'])

# deal with PNV: collapse the 62 raw PNV codes to their class code
fileT = os.path.join(gageII.dirTab, 'lookupPNV.csv')
tabT = pd.read_csv(fileT).set_index('PNV_CODE')
for code in range(1, 63):
    siteNoTemp = dfG[dfG['PNV_BAS_DOM'] == code].index
    dfG.at[siteNoTemp, 'PNV_BAS_DOM2'] = tabT.loc[code]['PNV_CLASS_CODE']


# short region scheme name -> gageII column name
dictName = {
    'PNV': 'PNV_BAS_DOM2',
    'NUTR': 'NUTR_BAS_DOM',
    'HLR': 'HLR_BAS_DOM_100M',
    'ECO': 'ECO2_BAS_DOM'}
dictRegion = {
    'PNV': [2, 3, 4, 5, 9, 11],
    'NUTR': [2, 3, 4, 5, 6, 7, 8, 9, 11, 14],
    'HLR': [3, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 20],
    'ECO': [5.3, 6.2, 8.1, 8.2, 8.3, 8.4, 9.2, 9.3, 9.4, 10.1, 11.1]
Exemplo n.º 21
0
# NOTE(review): script fragment; siteNo, codeLst, waterQuality, trainTS,
# rnn, crit, torch and importlib are defined/imported earlier.
nh = 256  # LSTM hidden size
batchSize = [365, 50]  # presumably [window length, samples per batch] -- TODO confirm
# if not waterQuality.exist(siteNo):
#     wqData = waterQuality.DataModelWQ.new(siteNo, [siteNo])
wqData = waterQuality.DataModelWQ(siteNo, rmFlag=False)
varX = wqData.varF
varXC = wqData.varG
varY = [wqData.varQ[0]]
varYC = codeLst
varTup = (varX, varXC, varY, varYC)
dataTup, statTup = wqData.transIn(varTup=varTup)
dataTup = trainTS.dealNaN(dataTup, [1, 1, 0, 0])
sizeLst = trainTS.getSize(dataTup)
[nx, nxc, ny, nyc, nt, ns] = sizeLst

# raw site data for later sequence evaluation
tabG = gageII.readData(varLst=varXC, siteNoLst=[siteNo])
tabG = gageII.updateCode(tabG)
dfX = waterQuality.readSiteX(siteNo, varX, nFill=5)
dfY = waterQuality.readSiteY(siteNo, varY)
dfYC = waterQuality.readSiteY(siteNo, varYC)

importlib.reload(rnn)
model = rnn.AgeLSTM(nx=nx + nxc, ny=ny, nyc=nyc, nh=nh)
optim = torch.optim.Adadelta(model.parameters())
lossFun = crit.RmseMix()
if torch.cuda.is_available():
    lossFun = lossFun.cuda()
    model = model.cuda()

# train
model.train()
Exemplo n.º 22
0
# load WRTDS results (baseline run vs the '_rmq' variant)
dirRoot1 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly')
dirRoot2 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly_rmq')

code = '00955'
dfRes1 = pd.read_csv(os.path.join(dirRoot1, 'result', code),
                     dtype={
                         'siteNo': str
                     }).set_index('siteNo')
dfRes2 = pd.read_csv(os.path.join(dirRoot2, 'result', code),
                     dtype={
                         'siteNo': str
                     }).set_index('siteNo')

# dfRes1[dfRes1 == -9999] = np.nan
dfGeo = gageII.readData(siteNoLst=dfRes1.index.tolist())
dfGeo = gageII.updateCode(dfGeo)

# select sites with more than nS samples
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]

varGLst = dfG.columns.tolist()
dfRsq = pd.DataFrame(index=varGLst, columns=['Rsq1', 'Rsq2'])
for varG in varGLst:
    x = dfG[varG].values
    y1 = dfR1['corr'].values
    y2 = dfR1['corr'].values
Exemplo n.º 23
0
            v3 = dictObs[siteNo][code].iloc[indT2].values
            vv1, vv2, vv3 = utils.rmNan([v1, v2, v3], returnInd=False)
            rmse1, corr1 = utils.stat.calErr(vv1, vv2)
            rmse2, corr2 = utils.stat.calErr(vv1, vv3)
            rmse3, corr3 = utils.stat.calErr(vv2, vv3)
            corrMat[indS, ic, 0] = corr1
            corrMat[indS, ic, 1] = corr2
            corrMat[indS, ic, 2] = corr3
            rmseMat[indS, ic, 0] = rmse1
            rmseMat[indS, ic, 1] = rmse2
            rmseMat[indS, ic, 2] = rmse3

    # load basin attributes
    regionLst = ['ECO2_BAS_DOM', 'NUTR_BAS_DOM',
                 'HLR_BAS_DOM_100M', 'PNV_BAS_DOM']
    dfG = gageII.readData(siteNoLst=siteNoLst)
    fileT = os.path.join(gageII.dirTab, 'lookupPNV.csv')
    tabT = pd.read_csv(fileT).set_index('PNV_CODE')
    for code in range(1, 63):
        siteNoTemp = dfG[dfG['PNV_BAS_DOM'] == code].index
        dfG.at[siteNoTemp, 'PNV_BAS_DOM2'] = tabT.loc[code]['PNV_CLASS_CODE']
    dfG = gageII.updateCode(dfG)

    # calculate LombScargle
    pMat = np.full([len(siteNoLst), len(codeLst)], np.nan)
    for ic, code in enumerate(codeLst):
        for siteNo in dictSite[code]:
            indS = siteNoLst.index(siteNo)
            df = dictObs[siteNo]
            t = np.arange(len(df))*7
            y = df[code]
Exemplo n.º 24
0
from hydroDL.data import gageII, usgs, gridMET
from hydroDL import kPath, utils
import os
import pandas as pd
import numpy as np
from hydroDL import kPath

# site inventory: all USGS gauges active since 1979
# BUG FIX: the list was previously read twice from the same file
# (once via kPath.dirData directly, once via dirInv); keep one read.
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeLst = sorted(usgs.newC)
dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE', 'CLASS'],
                        siteNoLst=siteNoLstAll)
dfCrd = gageII.updateCode(dfCrd)
sd = np.datetime64('1979-01-01')

# load water-quality samples (with flags) for every site
dictC = dict()
dictCF = dict()
for k, siteNo in enumerate(siteNoLstAll):
    print(k, siteNo)
    dfC, dfCF = usgs.readSample(siteNo, codeLst=codeLst, startDate=sd, flag=2)
    dictC[siteNo] = dfC
    dictCF[siteNo] = dfCF
# load streamflow for every site
# NOTE(review): the fragment is truncated here — dfQ is read but the line
# storing it (presumably dictQ[siteNo] = dfQ) is not visible.
dictQ = dict()
for k, siteNo in enumerate(siteNoLstAll):
    print(k, siteNo)
    dfQ = usgs.readStreamflow(siteNo, startDate=sd)
Exemplo n.º 25
0
# configuration for the weekly (WN5) LSTM/WRTDS comparison
codeLst = sorted(usgs.newC)
ep = 500
reTest = False
dataName = 'rbWN5'
siteNoLst = dictSite['comb']  # NOTE(review): dictSite defined earlier in the file
nSite = len(siteNoLst)

# load all sequence
# NOTE(review): dead code unless the guard is toggled by hand — dictLSTM /
# dictWRTDS / dictObs used below must already exist in the session.
if False:
    importlib.reload(wq.wqLoad)
    outNameLSTM = '{}-{}-{}-{}'.format('rbWN5', 'comb', 'QTFP_C', 'comb-B10')
    dictLSTM, dictWRTDS, dictObs = wq.loadModel(
        siteNoLst, outNameLSTM, codeLst)
    corrMat, rmseMat = wq.dictErr(dictLSTM, dictWRTDS, dictObs, codeLst)
    # load basin attributes
    dfG = gageII.readData(siteNoLst=siteNoLst)
    dfG = gageII.updateRegion(dfG)
    dfG = gageII.updateCode(dfG)

# split time indices: training period [1980, 2010) vs testing [2010, ...)
t = dictObs[siteNoLst[0]].index.values
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
ind1 = np.where((t < tt) & (t >= t0))[0]
ind2 = np.where(t >= tt)[0]

# calculate sampling interval (dead code unless toggled; truncated at the
# end of this fragment)
if False:
    intMatC = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
    for k, siteNo in enumerate(siteNoLst):
        dfC = dictObs[siteNo]
        print('\t {}/{}'.format(k, len(siteNoLst)), end='\r')
Exemplo n.º 26
0
codeLst = ['00915', '00945', '00955']
# sites where each code has a value above 200 in dfAll
tempLst = list()
for code in codeLst:
    temp = dfAll[dfAll[code] > 200].index.tolist()
    tempLst.append(temp)
# keep only sites that satisfy the condition for every code
siteNoLst = tempLst[0]
for k in range(1, len(tempLst)):
    siteNoLst = list(set(siteNoLst).intersection(tempLst[k]))
# BUG FIX: pd.datetime was deprecated and removed (pandas >= 2.0);
# pd.Timestamp is a datetime subclass, so downstream use is unchanged.
startDate = pd.Timestamp(1979, 1, 1)
endDate = pd.Timestamp(2019, 12, 31)
nc = len(codeLst)
ns = len(siteNoLst)

# cal dw
rMat = np.ndarray([ns, nc])
pdfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
# ft^3/s over basin area (sq km) -> runoff depth conversion factor
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
for k, siteNo in enumerate(siteNoLst):
    # site-level reads are invariant w.r.t. the inner code loop —
    # hoist them so each site is read once instead of once per code
    area = pdfArea.loc[siteNo]['DRAIN_SQKM']
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    df = dfC.join(dfQ)
    t = df.index.values
    for i, code in enumerate(codeLst):
        # q/c must be re-extracted per code: rmNan rebinds them filtered
        q = df['00060_00003'].values / area * unitConv
        c = df[code].values
        (q, c), ind = utils.rmNan([q, c])
        x = 10**np.linspace(np.log10(np.min(q[q > 0])),
                            np.log10(np.max(q[~np.isnan(q)])), 20)
        ceq, dw, y = wqRela.kateModel(q, c, q)
        corr = np.corrcoef(c, y)[0, 1]
Exemplo n.º 27
0
# pick sites with at least 400 samples
pickMat = (count >= 400)
len(np.where(pickMat)[0])  # NOTE(review): value discarded — leftover interactive check
indS = np.where(pickMat)[0]
dictSite = dict()
siteNoSel = [siteNoLst[ind] for ind in indS]

# hand-picked subset of 12 sites; overrides the count-based selection above
siteNoSel = [
    '01184000', '01434025', '01435000', '01466500', '04063700', '06313500',
    '06317000', '06324500', '09163500', '09352900', '11264500',
    '401733105392404'
]
indS = [siteNoLst.index(siteNo) for siteNo in siteNoSel]
dictSite['k12'] = siteNoSel

# coordinates and drainage area for the selected sites
dfCrd = gageII.readData(siteNoLst=siteNoSel,
                        varLst=['DRAIN_SQKM', 'LNG_GAGE', 'LAT_GAGE'])
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values
area = dfCrd['DRAIN_SQKM'].values
nc = len(codeSel)  # NOTE(review): codeSel must be defined earlier in the file


def funcM():
    """Build the map figure (basin area + sample count panels) and the
    per-code time-series figure; return both with the site coordinates."""
    figM, axM = plt.subplots(2, 1, figsize=(6, 4))
    for ax, data in zip(axM, [area, count[indS]]):
        axplot.mapPoint(ax, lat, lon, data, s=16, cb=True)
    figP, axP = plt.subplots(nc, 1, figsize=(12, 8))
    return figM, axM, figP, axP, lon, lat


def funcP(iP, axP):
Exemplo n.º 28
0
from hydroDL.data import gageII
import numpy as np
import pandas as pd
import os

# build a lookup table mapping gageII ECO3 codes to Eco2/Eco3 labels
varLst = ['ECO2_BAS_DOM', 'ECO3_BAS_DOM']
dfR = gageII.readData(varLst=varLst)
dfR = gageII.updateCode(dfR)

fileEco3 = r'C:\Users\geofk\work\map\ecoRegion\tabEco3.csv'
tabEco3 = pd.read_csv(fileEco3)

fileLookup = os.path.join(gageII.dirTab, 'conterm_x_ecoregion3_names.csv')
tabLookup = pd.read_csv(fileLookup)

len(np.sort(dfR['ECO3_BAS_DOM'].unique()))  # NOTE(review): value discarded
codeLst = list(range(1, 85))
dfT = pd.DataFrame(index=codeLst, columns=['Eco2', 'Eco3', 'Eco3_Name'])
for code in codeLst:
    eco2 = dfR[dfR['ECO3_BAS_DOM'] == code]['ECO2_BAS_DOM'].unique()
    eco3Name = tabLookup[tabLookup['ECO3_CODE'] == code]['ECO3_NAME'].values
    if len(eco2) == 1:
        dfT.at[code, 'Eco2'] = eco2[0]
    # BUG FIX: eco3 was only assigned inside the eco3Name branch, so the
    # old standalone `if len(eco3) == 1` raised NameError for the first
    # unmatched code or silently reused a stale value from a previous
    # iteration; nest the Eco3 lookup under the name match instead.
    if len(eco3Name) == 1:
        dfT.at[code, 'Eco3_Name'] = eco3Name[0]
        eco3 = tabEco3[tabEco3['NA_L3NAME'] == eco3Name[0]]['NA_L3CODE'].values
        if len(eco3) == 1:
            dfT.at[code, 'Eco3'] = eco3[0]

fileT = os.path.join(gageII.dirTab, 'EcoTab.csv')
dfT.to_csv(fileT)
Exemplo n.º 29
0
        fig.show()

if 'plotTsMap' in doLst:
    # plot map
    iCLst = [0, 11]
    tempLst = [npfLst[0]['matRmse2'][:, iC] for iC in iCLst]
    temp = np.sum(tempLst, axis=0)

    indG = np.where(~np.isnan(temp))[0].tolist()
    npf = npfLst[0]
    dataLst = [npf['matRmse2'][indG, iC] for iC in iCLst]
    dataNLst = [npf['matN2'][indG, iC] for iC in iCLst]
    mapTitleLst = ['RMSE of ' + codePdf['shortName'][varC[iC]]
                   for iC in iCLst]
    siteNoLstTemp = [siteNoLst[i] for i in indG]
    dfCrd = gageII.readData(
        varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLstTemp)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    nTs = len(iCLst)
    nMap = len(dataLst)
    gsR = nTs
    figsize = [12, 8]
    # setup axes
    fig = plt.figure(figsize=figsize)
    gs = gridspec.GridSpec(gsR + nTs, nMap)
    gs.update(wspace=0.025, hspace=0.5)
    axTsLst = list()
    for k in range(nTs):
        axTs = fig.add_subplot(gs[k + gsR, :])
        axTsLst.append(axTs)
    for k in range(nMap):
Exemplo n.º 30
0
# ts map of single dataset, label and code
freq = 'W'
# two WRTDS result folders to compare (presumably 'rmq' = run with
# streamflow removed — TODO confirm against the WRTDS scripts)
dirRoot1 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly')
dirRoot2 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly_rmq')

code = '00955'
# per-site result tables; site numbers kept as strings to preserve
# leading zeros in USGS site IDs
dfRes1 = pd.read_csv(os.path.join(dirRoot1, 'result', code),
                     dtype={
                         'siteNo': str
                     }).set_index('siteNo')
dfRes2 = pd.read_csv(os.path.join(dirRoot2, 'result', code),
                     dtype={
                         'siteNo': str
                     }).set_index('siteNo')
dfGeo = gageII.readData(siteNoLst=dfRes1.index.tolist())
dfGeo = gageII.updateCode(dfGeo)

# select number of sites: plot descending sample counts to eyeball a cutoff
countS = np.sort(dfRes1['count'].values)[::-1]
fig, ax = plt.subplots(1, 1)
ax.plot(np.arange(len(countS)), countS, '-*')
fig.show()

# plot map: keep only sites with more than nS samples
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]