Example #1
0
def readSiteTS(siteNo,
               varLst,
               freq='D',
               area=None,
               sd=np.datetime64('1979-01-01'),
               ed=np.datetime64('2019-12-31'),
               rmFlag=True):
    """Read a merged time-series table of mixed variables for one USGS site.

    Each code in varLst is routed to its data source by set intersection:
    USGS samples (varC), USGS streamflow (varQ), gridMET forcing (varF),
    NTN precipitation chemistry (varP), GLASS (varR) and derived time
    variables (varTLst).

    Arguments:
        siteNo {str} -- USGS site number
        varLst {list} -- variable codes to read
    Keyword Arguments:
        freq {str} -- 'D' daily or 'W' weekly mean anchored on Tuesday (default: 'D')
        area {float} -- basin area used for runoff; looked up from gageII
            DRAIN_SQKM when None (default: None)
        sd {np.datetime64} -- start of the output date index
        ed {np.datetime64} -- end of the output date index
        rmFlag {bool} -- remove flagged water-quality samples (default: True)
    Returns:
        pd.DataFrame -- date-indexed, columns ordered exactly as varLst
    Raises:
        ValueError -- if freq is neither 'D' nor 'W'
    """
    # read data
    td = pd.date_range(sd, ed)
    varC = list(set(varLst).intersection(usgs.varC))
    varQ = list(set(varLst).intersection(usgs.varQ))
    varF = list(set(varLst).intersection(gridMET.varLst))
    varP = list(set(varLst).intersection(ntn.varLst))
    varR = list(set(varLst).intersection(GLASS.varLst))
    varT = list(set(varLst).intersection(varTLst))

    dfD = pd.DataFrame({'date': td}).set_index('date')
    if len(varC) > 0:
        if rmFlag:
            dfC, dfCF = usgs.readSample(siteNo,
                                        codeLst=varC,
                                        startDate=sd,
                                        flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)
    if len(varQ) > 0:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)
    if len(varF) > 0:
        dfF = gridMET.readBasin(siteNo, varLst=varF)
        dfD = dfD.join(dfF)
    if len(varP) > 0:
        dfP = ntn.readBasin(siteNo, varLst=varP, freq='D')
        dfD = dfD.join(dfP)
    if len(varR) > 0:
        dfR = GLASS.readBasin(siteNo, varLst=varR, freq='D')
        dfD = dfD.join(dfR)
    if len(varT) > 0:
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])
    dfD = dfD[varLst]
    if freq == 'D':
        return dfD
    elif freq == 'W':
        dfW = dfD.resample('W-TUE').mean()
        return dfW
    else:
        # bug fix: an unknown freq used to silently return None
        raise ValueError("freq must be 'D' or 'W', got {!r}".format(freq))
Example #2
0
def readSiteY(siteNo,
              varY,
              area=None,
              sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Read target variables (samples with flags, optionally flow/runoff)
    for one USGS site onto a full daily date index.

    Returns a date-indexed DataFrame with columns ordered as varY.
    """
    dateRange = pd.date_range(sd, ed)
    dfY = pd.DataFrame({'date': dateRange}).set_index('date')
    # water-quality codes among the requested variables
    sampleCodes = [c for c in varY if c in usgs.codeLst]
    dfC, dfCF = usgs.readSample(siteNo,
                                codeLst=sampleCodes,
                                startDate=sd,
                                flag=True)
    if '00060' in varY or 'runoff' in varY:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varY:
            # derive runoff from flow; look up the drainage area if not given
            if area is None:
                areaTab = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = areaTab['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfY = dfY.join(dfQ)
    dfY = dfY.join(dfC).join(dfCF)
    return dfY[varY]
Example #3
0
def funcPoint(iP, axes):
    """Plot, for the iP-th site of siteNoLst: normalized time series, C-Q
    scatter per code, and Lomb-Scargle power spectra per code."""
    kA = 0
    siteNo = siteNoLst[iP]
    # bug fix: pd.datetime was deprecated and removed from pandas
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)

    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'
              ] + [usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)

    # plot C-Q scatter in log-log space
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')

    # plot fractal (Lomb-Scargle power spectrum)
    for k in range(nc):
        code = codeLst[k]
        dfV = dfData[dfData[code].notna()]
        nt = len(dfData)
        # NOTE(review): lombscargle expects float sample times; relying on
        # an implicit datetime64 -> float cast here — confirm on current scipy
        x = dfV.index.values.astype('datetime64[D]')
        y = dfV[code].values
        # angular frequencies spanning periods from 2 days up to the record
        freq = 2 * np.pi / np.linspace(2, nt, nt)
        power = signal.lombscargle(x, y, freq)
        ax = axes[kA]
        kA = kA + 1
        # bug fix: freq / 2 * np.pi evaluates as (freq/2)*pi; parenthesize
        # to convert angular frequency to ordinary frequency
        ax.plot(np.log(freq / (2 * np.pi)), np.log(power), '-*')
        fyr = 2 * np.pi / 365  # angular frequency of the annual cycle
        pyr = signal.lombscargle(x, y, [fyr])
        ax.plot(np.log(fyr / (2 * np.pi)), np.log(pyr), 'r*')
Example #4
0
def funcPoint(iP, axP):
    """Plot flow, concentration and the fitted C-Q curve for the iP-th
    site of siteNoHBN on the three axes of axP."""
    siteNo = siteNoHBN[iP]
    sampleDf = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    flowDf = usgs.readStreamflow(siteNo)
    joined = sampleDf.join(flowDf)
    t = joined.index.values
    # convert flow to unit-area runoff
    q = joined['00060_00003'].values / area * unitConv
    c = joined[code].values
    [q, c], ind = utils.rmNan([q, c])
    t = t[ind]
    qAll = flowDf['00060_00003'].values
    qT = flowDf.index.values
    axplot.plotTS(axP[0], qT, qAll, cLst='b', styLst='--')
    axplot.plotTS(axP[1], t, c)
    axP[2].plot(np.log(q), c, 'k*')
    # sample 20 log-spaced flows between the smallest positive and the
    # largest finite observed flow
    qLo = np.log10(np.min(q[q > 0]))
    qHi = np.log10(np.max(q[~np.isnan(q)]))
    x = 10**np.linspace(qLo, qHi, 20)
    ceq0 = pMat2[iP, 0]
    dw0 = pMat2[iP, 1]
    y0 = ceq0 * 1 / (x / dw0 + 1)
    axP[2].plot(np.log(x), y0, 'r-')
    axP[2].set_title('ceq={:.3f},dw={:.3f}'.format(ceq0, dw0))
Example #5
0
def funcPoint(iP, axes):
    """Plot normalized time series and, per code, the observed and
    kateModel-fitted C-Q relations for the iP-th site of siteNoLst."""
    kA = 0
    siteNo = siteNoLst[iP]
    # bug fix: pd.datetime was deprecated and removed from pandas
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    ctR = pd.date_range(startDate, endDate)
    dfData = pd.DataFrame({'date': ctR}).set_index('date')
    dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
    dfData = dfData.join(dfQ)
    dfData = dfData.join(dfC)

    # plot normalized time series
    ax = axes[kA]
    kA = kA + 1
    t = dfData.index.values
    dfDataN = (dfData - dfData.mean()) / dfData.std()
    varLst = dfData.columns.tolist()
    data = [dfDataN[var].values for var in varLst]
    legLst = ['streamflow'
              ] + [usgs.codePdf.loc[code]['shortName'] for code in codeLst]
    axplot.plotTS(ax, t, data, styLst='-***', cLst='krgb', legLst=legLst)
    ax.set_title(siteNo)

    # plot C-Q: observations in red, kateModel prediction in blue
    nc = len(codeLst)
    for k in range(nc):
        code = codeLst[k]
        q = dfData['00060']
        c = dfData[code]
        [q, c], ind = utils.rmNan([q, c])
        ceq, dw, y = wqRela.kateModel(q, c, q)
        ax = axes[kA]
        kA = kA + 1
        ax.plot(np.log(q), np.log(c), 'r*')
        ax.plot(np.log(q), np.log(y), 'b*')
Example #6
0
from hydroDL.app import waterQuality
from hydroDL.data import usgs
import numpy as np
import pandas as pd
from hydroDL.post import axplot, figplot
import matplotlib.pyplot as plt

siteNo = '09163500'
varC = ['00660', '00618']

sd = np.datetime64('1979-01-01')
ed = np.datetime64('2019-12-31')

df = waterQuality.readSiteTS(siteNo, varLst=['00060'] + varC)

dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=sd, flag=2)

# one panel per code; samples with flag == 1 are re-drawn in blue on top
fig, axes = plt.subplots(2, 1)
for ax, code in zip(axes, varC):
    v = dfC[code].values
    f = dfCF[code + '_cd'].values
    t = dfC.index.values
    flagged = np.where(f == 1)[0]
    axplot.plotTS(ax, t, v, cLst='r', styLst=['-*'])
    axplot.plotTS(ax, t[flagged], v[flagged], cLst='b', styLst='*')
fig.show()
Example #7
0
    varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values


def funcMap():
    """Build the clickable site map and an empty per-site detail figure
    for figplot.clickMap."""
    mapFig, mapAx = plt.subplots(1, 1, figsize=(8, 4))
    axplot.mapPoint(mapAx, lat, lon, nSite, s=12)
    pointFig, pointAx = plt.subplots(1, 1, figsize=(12, 6))
    return mapFig, mapAx, pointFig, pointAx, lon, lat


def funcPoint(iP, axP):
    """Plot the sample time series of `code` for the clicked site."""
    siteNo = siteNoLst[iP]
    siteDf = waterQuality.readSiteY(siteNo, [code])
    tAxis = siteDf.index.values.astype(np.datetime64)
    axplot.plotTS(axP, tAxis, siteDf[code], styLst='*')
    axP.set_title('{} #samples = {}'.format(siteNo, siteDf.count().values))


figplot.clickMap(funcMap, funcPoint)

siteNo = '401733105392404'
# bug fix: pd.datetime was deprecated and removed from pandas
sd = pd.Timestamp(1980, 1, 1)
dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=sd)
# bug fix: a positional '*' after keyword arguments in plt.subplots was a
# SyntaxError; the marker style belongs to ax.plot instead
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.plot(dfC, '*')
fig.show()

dfC.plot()
plt.show()
Example #8
0
# paths into the USGS data tree and the C-Q working folder
dirUSGS = os.path.join(kPath.dirData, 'USGS')
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
dirCQ = os.path.join(kPath.dirWQ, 'C-Q')
fileSiteNoLst = os.path.join(dirInv, 'siteNoLst')
# one site number per line; read as str to preserve leading zeros
siteNoLst = pd.read_csv(fileSiteNoLst, header=None, dtype=str)[0].tolist()

t0 = time.time()  # timing reference for the progress printout below
fileName = os.path.join(dirCQ, 'CQall')
if not os.path.exists(fileName):
    dictData = dict()
    errLst = list()
    for i, siteNo in enumerate(siteNoLst):
        csvC = os.path.join(kPath.dirData, 'USGS', 'sample', 'csv', siteNo)
        csvQ = os.path.join(kPath.dirData, 'USGS', 'streamflow', 'csv', siteNo)
        dfC = usgs.readSample(siteNo, codeLst=waterQuality.codeLst)
        dfQ = usgs.readStreamflow(siteNo)
        if len(dfC.index) == 0:
            errLst.append(siteNo)
        pdf = pd.concat(
            [dfC.set_index('date').dropna(how='all'),
             dfQ.set_index('date')],
            axis=1,
            join='inner')
        dictData[siteNo] = pdf
        print('\t {}/{} {:.2f}'.format(i, len(siteNoLst),
                                       time.time() - t0),
              end='\r')
    fileName = os.path.join(kPath.dirWQ, 'tempData', 'CQall')
    pickle.dump(dictData, open(fileName, 'wb'))
else:
Example #9
0
def wrapData(caseName,
             siteNoLst,
             rho=365,
             nFill=5,
             varC=usgs.varC,
             varG=gageII.lstWaterQuality):
    """ wrap up input and target data for the model,as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series
    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site
    Keyword Arguments:
        rho {int} -- length of the look-back window in days (default: {365})
        nFill {int} -- max number of continous nan to interpolate in input data (default: {5})
        varC {list} -- list of water quality code to learn (default: {usgs.varC})
        varG {list} -- list of constant variables in gageII (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    Side effects:
        writes <caseName>.npz, <caseName>.csv and <caseName>.json under
        kPath.dirWQ/trainData
    """
    # add a start/end date to improve efficiency.
    # bug fix: pd.datetime was deprecated and removed from pandas
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)

    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(siteNo,
                                    codeLst=varC,
                                    startDate=startDate,
                                    flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        # one training sample per water-quality observation: a rho-day
        # window of forcing/flow ending on the sample date
        for k in range(len(dfC)):
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfQ).interpolate(limit=nFill,
                                                       limit_direction='both')
            tempF = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfF).interpolate(limit=nFill,
                                                       limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    # stack to [nT, nP, nX] (time series) and [nP, nY] (per-sample)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff as a second streamflow channel
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName,
                    rho=rho,
                    nFill=nFill,
                    varG=varG,
                    varC=varC,
                    varQ=['00060', 'runoff'],
                    varF=gridMET.varLst,
                    siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
Example #10
0
from hydroDL import kPath, utils
from hydroDL.app import waterQuality
from hydroDL.master import basins
from hydroDL.data import usgs, gageII, gridMET, ntn, transform
from hydroDL.master import slurm
from hydroDL.post import axplot, figplot
import numpy as np
import matplotlib.pyplot as plt
code = '00660'

siteNo = '01111500'
df = waterQuality.readSiteTS(siteNo, [code], freq='D').dropna()
dfC, dfCF = usgs.readSample(siteNo, codeLst=[code], flag=2)
# weekly-mean concentrations
dfC = dfC.resample('W-TUE').mean()
# weekly flag: 1 if any sample in the week carried a non-zero flag
dfCF = dfCF.fillna(0)
dfCFW = dfCF.resample('W-TUE').mean()
dfCFW = dfCFW.fillna(0)
dfCFW[dfCFW != 0] = 1
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
t = dfC.index
v = dfC[code].values
flag = dfCFW[code + '_cd'].values
clean = flag == 0
# unflagged weeks in red, flagged weeks in black
ax.plot(t[clean], v[clean], 'r*')
ax.plot(t[~clean], v[~clean], 'k*')
fig.show()
Example #11
0
# code = '00955'
# error vs drainage area, HBN sites highlighted in red
err = errMat[:, wqData.varC.index(code), 1]
fig, ax = plt.subplots(1, 1)
ax.plot(area, err, 'b*')
ax.plot(area[indHBN], err[indHBN], 'r*')
# np.nanmedian(err)
# np.nanmedian(err[indHBN, :])
fig.show()

# dw vs error
code = '00955'
# code = '00600'
# fit the kateModel2 C-Q relation per site; (ceq, dw) per row
pMat = np.full([len(siteNoLst), 2], np.nan)
for k, siteNo in enumerate(siteNoLst):
    area = dfX.loc[siteNo]['DRAIN_SQKM']
    dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst)
    dfQ = usgs.readStreamflow(siteNo)
    df = dfC.join(dfQ)
    t = df.index.values
    q = df['00060_00003'].values / area * unitConv
    c = df[code].values
    try:
        ceq, dw, y = relaCQ.kateModel2(q, c)
        pMat[k, 0] = ceq
        pMat[k, 1] = dw
    # bug fix: bare except also swallowed KeyboardInterrupt/SystemExit;
    # only skip sites where the fit itself fails
    except Exception:
        pass
fig, ax = plt.subplots(1, 1)
ax.plot(pMat[:, 1], err, 'b*')
ax.plot(pMat[indHBN, 1], err[indHBN], 'r*')
fig.show()
Example #12
0
    # per-site correlation/RMSE tables, one per even/odd-year split
    dfRmseLst = [
        pd.DataFrame(index=siteNoLst, columns=usgs.varC) for x in range(2)
    ]

    for siteNo in siteNoLst:
        outFolder = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-F')
        saveFile = os.path.join(outFolder, trainSet, siteNo)
        dfP = pd.read_csv(saveFile, index_col=None)
        # a bug - did not save dates; rebuild the daily index here
        # bug fix: pd.datetime was deprecated and removed from pandas
        startDate = pd.Timestamp(1979, 1, 1)
        endDate = pd.Timestamp(2020, 1, 1)
        ctR = pd.date_range(startDate, endDate)
        dfP.index = ctR
        dfP.index.name = 'date'
        dfY = pd.DataFrame({'date': ctR}).set_index('date')
        # observations with flagged samples blanked out
        dfC, dfCF = usgs.readSample(siteNo, usgs.varC, flag=2)
        dfC[dfCF != 0] = np.nan
        dfY = dfY.join(dfC)
        yr = dfY.index.year.values
        # split indices by even (k=0) / odd (k=1) year
        indLst = [np.where(yr % 2 == x)[0] for x in [0, 1]]
        for code in usgs.varC:
            for k in range(2):
                ind = indLst[k]
                corr = dfY.iloc[ind][code].corr(dfP.iloc[ind][code])
                # NOTE(review): this is a root-sum-of-squares, not a mean —
                # confirm whether /n was intended
                rmse = np.sqrt(
                    np.sum((dfY.iloc[ind][code] - dfP.iloc[ind][code])**2))
                dfCorrLst[k].loc[siteNo][code] = corr
                # bug fix: the RMSE table was being filled with corr
                dfRmseLst[k].loc[siteNo][code] = rmse
    for k in range(2):
        if k == 0:
            testSet = 'Yeven'
Example #13
0
# upgrade code to read flags and save CSV
from hydroDL.data import usgs
from hydroDL import kPath
from hydroDL.app import waterQuality
import os
import pandas as pd

# demo site; flag=True also returns the flag table.
# csv=False presumably bypasses the cached CSV and reads the raw download —
# confirm against usgs.readSample
siteNo = '07060710'
dfC = usgs.readSample(siteNo, codeLst=usgs.codeLst, flag=True, csv=False)
Example #14
0
# site inventory and per-site sample counts
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
dfAll = pd.read_csv(os.path.join(dirInv, 'codeCount.csv'),
                    dtype={
                        'siteNo': str
                    }).set_index('siteNo')

# pick some sites
# codeLst = ['00915', '00940', '00955','00300']
codeLst = ['00660', '00600']
# bug fix: pd.datetime was deprecated and removed from pandas
startDate = pd.Timestamp(1979, 1, 1)
endDate = pd.Timestamp(2019, 12, 31)
siteNo = '07060710'
dfC, dfCF = usgs.readSample(siteNo,
                            codeLst=codeLst,
                            startDate=startDate,
                            flag=True)
dfQ = usgs.readStreamflow(siteNo, startDate=startDate)

# one panel per code; overlay samples per remark flag
fig, axes = plt.subplots(len(codeLst), 1)
for k, code in enumerate(codeLst):
    flagLst = ['x', 'X', '<', 'E']
    axes[k].plot(dfC[code], '*', label='others')
    for flag in flagLst:
        axes[k].plot(dfC[code][dfCF[code + '_cd'] == flag], '*', label=flag)
    shortName = usgs.codePdf.loc[code]['shortName']
    title = '{} {} {}'.format(siteNo, shortName, code)
    axes[k].set_title(title)
    axes[k].legend()
fig.show()
Example #15
0
    idOut[indRow] = ntnId
    distOut[indRow] = dist[ntnId]
    dist = dist.drop(ntnId)
    indRow = np.unique(np.where(np.isnan(data))[0])
    if len(indRow) == 0:
        break
    # end of while
# rows still unmatched after the search loop get NaN id/distance
distOut[indRow] = np.nan
idOut[indRow] = np.nan
# NTN precipitation-chemistry table plus the matched station id/distance
dfP = pd.DataFrame(index=t, columns=varNtn, data=data)
dfP['distNTN'] = distOut
dfP['idNTN'] = idOut
dfP.index.name = 'date'

# read C, Q, F
dfC = usgs.readSample(siteNo, codeLst=varC)
dfQ = usgs.readStreamflow(siteNo)
dfF = gridMET.readBasin(siteNo)
# convert to weekly
td = pd.date_range(start='1979-01-01', end='2019-12-30', freq='D')
df = pd.DataFrame({'date': td}).set_index('date')
df = df.join(dfC)
df = df.join(dfQ)
df = df.join(dfF)
df = df.rename(columns={'00060_00003': '00060'})
# weekly means anchored on Tuesday, restricted to the NTN index t
dfW = df.resample('W-TUE').mean()
dfW = dfW.join(dfP)
dfW = dfW.loc[t]

# weekly load
# cfs -> m^3/week (0.3048^3 converts ft^3 to m^3)
dfW['Q'] = dfW['00060']*60*60*24*7*(0.3048**3)  # m^3/week
Example #16
0
tabG = gageII.updateCode(tabG)

siteNo = siteNoLst[0]

# testset - only get sd ed
tTest = infoTest[infoTest['siteNo'] == siteNo]['date'].values
# X window starts rho-1 days before the first test date (look-back)
sdX = tTest[0] - np.timedelta64(rho - 1, 'D')
sdY = tTest[0]
ed = tTest[-1]
trX = pd.date_range(sdX, ed)
trY = pd.date_range(sdY, ed)
dfX = pd.DataFrame({'date': trX}).set_index('date')
dfY = pd.DataFrame({'date': trY}).set_index('date')

# extract data
dfC = usgs.readSample(siteNo, codeLst=varYC, startDate=sdX)
dfF = gridMET.readBasin(siteNo)
dfQ = usgs.readStreamflow(siteNo, startDate=sdX)
dfQ = dfQ.rename(columns={'00060_00003': '00060'})
area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values
# cfs -> mm/year over the basin area
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
dfQ['runoff'] = dfQ['00060'] / area * unitConv
# NOTE(review): the elif means flow joins only X when requested by both
# varX and varY — confirm this is intended rather than two independent ifs
if '00060' in varX or 'runoff' in varX:
    dfX = dfX.join(dfQ)
elif '00060' in varY or 'runoff' in varY:
    dfY = dfY.join(dfQ)
dfX = dfX.join(dfF)
dfY = dfY.join(dfC)
dfX = dfX[varX]
dfY = dfY[varY + varYC]