varC = usgs.newC

# selected sites with obs more than xxx
# NOTE(review): countMat and tempC are not read again in this snippet; they
# are kept because later code may use them.
countMat = np.sum(matC * ~matCF, axis=1)
tempC = [
    '00300', '00400', '00405', '00600', '00605', '00618',
    '00660', '00665', '00681', '00915', '00925', '00930',
    '00935', '00940', '00945', '00955', '71846', '80154',
]

# Pairwise co-occurrence of codes. With a = matC[:, :, code j] and
# b = matC[:, :, code i], entry [j, i] is sum(a * b) / sum(a):
#   mat1 - taken over every row of the two slices
#   mat2 - taken only over rows whose row-sum of `a` exceeds `the`
# NOTE(review): a code whose denominator is zero yields a non-finite entry.
nc = len(codeLst)
mat1 = np.zeros([nc, nc])
mat2 = np.zeros([nc, nc])
the = 200
for j, c1 in enumerate(codeLst):
    a = matC[:, :, codeLst.index(c1)]
    # These depend only on `a`, so they are computed once per outer
    # iteration instead of once per (j, i) pair.
    sumA = np.sum(a)
    ix = np.sum(a, axis=1) > the
    aSel = a[ix, :]
    sumASel = np.sum(aSel)
    for i, c2 in enumerate(codeLst):
        print(j, i)  # progress trace
        b = matC[:, :, codeLst.index(c2)]
        mat1[j, i] = np.sum(a * b) / sumA
        mat2[j, i] = np.sum(aSel * b[ix, :]) / sumASel

# One heat map per matrix, values scaled by 100.
fig, ax = plt.subplots(1, 1)
axplot.plotHeatMap(ax, mat1 * 100, labLst=codeLst)
fig.show()

fig, ax = plt.subplots(1, 1)
axplot.plotHeatMap(ax, mat2 * 100, labLst=codeLst)
fig.show()
# codeSel = ['00405','00600', '00605', '00618', '00660', '00665','71846']
codeSel = usgs.newC
indC = [codeLst.index(code) for code in codeSel]
mat = matB[:, :, indC]

# Keep the first-axis entries of matB for which more than `the` positions
# along the second axis have at least one of the selected codes set.
the = 0
count = np.sum(np.any(mat, axis=2), axis=1)
indS = np.where(count > the)[0]

# out[j, i] for i != j: sum(a & b) / sum(a), with a / b the matB slices of
#                       code j / code i restricted to indS.
# out[j, j]:            1 - sum(a & b) / sum(a), where b flags positions at
#                       which any other column of matB is set.
nc = len(codeSel)
out = np.empty([nc, nc])
for j, codej in enumerate(codeSel):
    cj = codeLst.index(codej)
    for i, codei in enumerate(codeSel):
        ci = codeLst.index(codei)
        a = matB[indS, :, cj]
        if i != j:
            b = matB[indS, :, ci]
            out[j, i] = np.sum(a & b) / np.sum(a)
        else:
            b1 = np.any(matB[indS, :, :cj], axis=2)
            b2 = np.any(matB[indS, :, cj + 1:], axis=2)
            b = b1 | b2  # at least one other is observed
            out[j, i] = 1 - np.sum(a & b) / np.sum(a)

# Axis labels of the form '<shortName> <code>'.
labelLst = [
    '{} {}'.format(usgs.codePdf.loc[code]['shortName'], code)
    for code in codeSel
]
fig, ax = plt.subplots(1, 1)
axplot.plotHeatMap(ax, out * 100, labLst=labelLst)
fig.show()
if 'calCount' in doLst:
    # find out two variables (hopefully one rock one bio) that are most related
    # Three count tables, each indexed by siteNo (read as string).
    df0 = pd.read_csv(os.path.join(dirInv, 'codeCount.csv'),
                      dtype={'siteNo': str}, index_col='siteNo')
    df1 = pd.read_csv(os.path.join(dirInv, 'codeCount_B2000.csv'),
                      dtype={'siteNo': str}, index_col='siteNo')
    df2 = pd.read_csv(os.path.join(dirInv, 'codeCount_A2000.csv'),
                      dtype={'siteNo': str}, index_col='siteNo')
    nc = len(codeLst)
    dfLst = [df0, df1, df2]
    # Titles mirror the file suffixes. The third table is codeCount_A2000,
    # so it is labelled 'A2000' (it was previously shown as 'C2000').
    titleLst = ['all', 'B2000', 'A2000']
    # Axis labels are the same for every table, so build them once.
    varNameLst = ['{} {}'.format(
        usgs.codePdf.loc[code]['shortName'], code) for code in codeLst]
    for df, title in zip(dfLst, titleLst):
        # Spearman correlation between the columns of every pair of codes.
        matCorr = np.full([nc, nc], np.nan)
        for j, c1 in enumerate(codeLst):
            v1 = df[c1].values  # depends only on the outer loop
            for i, c2 in enumerate(codeLst):
                v2 = df[c2].values
                # Alternatives tried: restrict to rows where both are
                # non-zero, or use Pearson instead of Spearman.
                # ind = np.where((v1 != 0) & (v2 != 0))[0]
                # corr, p = scipy.stats.spearmanr(v1[ind], v2[ind])
                corr, p = scipy.stats.spearmanr(v1, v2)
                # corr, p = scipy.stats.pearsonr(v1, v2)
                matCorr[j, i] = corr
        fig, ax = plt.subplots()
        axplot.plotHeatMap(ax, matCorr*100, varNameLst)
        ax.set_title('spearman correlation of {}'.format(title))
        fig.tight_layout()
        fig.show()
# yOut against d2.y at index 0 of the second axis, one figure for each of
# the first two entries of the last axis.
for iVar in (0, 1):
    fig, ax = plt.subplots(1, 1)
    axplot.plotTS(ax, d2.t, [yOut[:, 0, iVar], d2.y[:, 0, iVar]])
    fig.show()

# Multi-panel series for second-axis index k: yOut (red), d1.y (grey),
# d2.y (black).
k = 0
# dataPlot = [yP[:, k, :], d1.Y[:, k, :], d2.Y[:, k, :]]
dataPlot = [arr[:, k, :] for arr in (yOut, d1.y, d2.y)]
cLst = ['red', 'grey', 'black']
fig, axes = figplot.multiTS(DF.t, dataPlot, cLst=cLst)
fig.show()

# calCorr is invoked once per entry of varY; its return value is not stored.
for k in range(len(varY)):
    utils.stat.calCorr(yOut[:, 0, k], d2.y[:, 0, k])

# Weights and bias of the model's linearOut layer as numpy arrays.
outParams = model.linearOut._parameters
w = outParams['weight'].detach().cpu().numpy()
b = outParams['bias'].detach().cpu().numpy()

# Row 0 of w in ascending order (black), then rows 1..len(codeSel) drawn in
# that same column order.
fig, ax = plt.subplots(1, 1)
ind = np.argsort(w[0, :])
ax.plot(w[0, ind], 'k-')
for k, code in enumerate(codeSel):
    ax.plot(w[k + 1, ind], '-')
fig.show()

# Correlation between the rows of w, scaled by 100.
fig, ax = plt.subplots(1, 1)
axplot.plotHeatMap(ax, np.corrcoef(w) * 100, varY)
fig.show()
# Data models over the training and testing subsets of DF, both with
# varY=codeSel.
d1 = dbBasin.DataModelBasin(DF, subset=trainSet, varY=codeSel)
d2 = dbBasin.DataModelBasin(DF, subset=testSet, varY=codeSel)

# One multi-panel figure per entry of DF.siteNoLst:
# yW (blue), yP (red), d1.Y (grey), d2.Y (black).
for k in range(len(DF.siteNoLst)):
    dataPlot = [arr[:, k, :] for arr in (yW, yP, d1.Y, d2.Y)]
    cLst = ['blue', 'red', 'grey', 'black']
    fig, axes = figplot.multiTS(
        DF.t, dataPlot, labelLst=labelLst, cLst=cLst)
    fig.show()

# calCorr of yP (mat1) and of yW (mat2) against d2.Y for every
# (site, code) pair.
mat1 = np.empty([len(siteNoLst), len(codeSel)])
mat2 = np.empty([len(siteNoLst), len(codeSel)])
for indS, siteNo in enumerate(siteNoLst):
    for indC, code in enumerate(codeSel):
        corr1 = utils.stat.calCorr(yP[:, indS, indC], d2.Y[:, indS, indC])
        corr2 = utils.stat.calCorr(yW[:, indS, indC], d2.Y[:, indS, indC])
        mat1[indS, indC] = corr1
        mat2[indS, indC] = corr2

# Heat maps of both matrices (x100) on a shared colour range of 70-90.
for matPlot in (mat1, mat2):
    fig, ax = plt.subplots(1, 1)
    axplot.plotHeatMap(ax, matPlot * 100,
                       labLst=[siteNoLst, codeSel], vRange=[70, 90])
    fig.show()
c = wqData.c
varC = wqData.varC
varNameLst = usgs.codePdf.loc[varC]['shortName'].tolist()
nc = c.shape[1]

# calculate all at once
# Correlation between every pair of columns of c after utils.rmNan; pairs
# for which rmNan returns an empty kk stay NaN.
matCorr = np.full([nc, nc], np.nan)
for j in range(nc):
    for i in range(nc):
        (a, b), kk = utils.rmNan([c[:, j], c[:, i]])
        if len(kk) == 0:
            continue
        matCorr[j, i] = np.corrcoef(a, b)[0, 1]

importlib.reload(axplot)
fig, ax = plt.subplots()
axplot.plotHeatMap(ax, matCorr * 100, varNameLst)
fig.tight_layout()
fig.show()

# calculate site by site
# Same computation on the rows of wqData.c selected for each site; the third
# axis of matCorrAll follows the order of siteNoLst.
ns = len(siteNoLst)
matCorrAll = np.full([nc, nc, ns], np.nan)
for k, siteNo in enumerate(siteNoLst):
    ind = wqData.info.index[wqData.info['siteNo'] == siteNo]
    c = wqData.c[ind]
    for j in range(nc):
        for i in range(nc):
            (a, b), kk = utils.rmNan([c[:, j], c[:, i]])
            if len(kk) == 0:
                continue
            matCorrAll[j, i, k] = np.corrcoef(a, b)[0, 1]
# NOTE(review): `f` must be an open file object supplied by code outside
# this snippet.
dictSite = json.load(f)
codeLst = sorted(usgs.newC)
ep = 500
reTest = True
siteNoLst = dictSite['comb']
nSite = len(siteNoLst)

dataName = 'rbWN5'
wqData = waterQuality.DataModelWQ(dataName)
info = wqData.info

# Column of wqData.c for each code in codeLst, looked up once instead of on
# every loop iteration.
colLst = [wqData.varC.index(code) for code in codeLst]

# out[k, j]: among samples from the sites listed in dictSite[code k] where
# code k is not NaN, the number where code j is not NaN divided by the
# number where code k is not NaN.
out = np.ndarray([len(codeLst), len(codeLst)])
for k, (code, ic) in enumerate(zip(codeLst, colLst)):
    siteNoCode = dictSite[code]
    bs = info['siteNo'].isin(siteNoCode)
    bv = ~np.isnan(wqData.c[:, ic])
    # NOTE(review): index labels of `info` are used as row positions of
    # wqData.c - assumes a default 0..n-1 index; confirm.
    ind = info.index[bs & bv].values
    count = np.sum(~np.isnan(wqData.c[ind, :]), axis=0)
    out[k, :] = count[colLst] / count[ic]

fig, ax = plt.subplots(1, 1)
axplot.plotHeatMap(ax, out * 100, codeLst)
fig.show()
import pandas as pd
import numpy as np
import os
import time
import scipy
import json

# all gages
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()

codeLst = sorted(usgs.codeLst)
countMatD = np.load(os.path.join(dirInv, 'matCountDaily.npy'))
countMatW = np.load(os.path.join(dirInv, 'matCountWeekly.npy'))

# Thresholds: 5*ny, 6*ny, ..., 19*ny.
ny = 3
nsLst = np.arange(5, 20) * ny
# nsLst = [20, 24, 28, 32, 36, 40, 44, 45,
#          46, 47, 48, 52, 56, 60, 64, 68, 72, 76]

# outMat[i, j]: number of first-axis entries of countMatW whose total over
# the last `ny` second-axis entries, for code i, is at least nsLst[j].
outMat = np.empty([len(codeLst), len(nsLst)])
recentW = countMatW[:, -ny:, :]
for i, code in enumerate(codeLst):
    ic = codeLst.index(code)
    count = recentW[:, :, ic].sum(axis=1)
    for j, ns in enumerate(nsLst):
        outMat[i, j] = np.sum(count >= ns)

# plot
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
axplot.plotHeatMap(ax, outMat, labLst=[codeLst, nsLst])
fig.show()
# NOTE(review): this snippet is whitespace-collapsed and its first statements
# appear to sit inside loops whose headers are not visible here (iS, iT, corr,
# rmse and code are bound elsewhere), so the original nesting is not
# reconstructed. Reading the statements in order:
#   1. corr and rmse are stored at [iS, iT] of corrMat and rmseMat.
#   2. dfG is read through gageII.readData (varLst=gageII.varLst, for
#      siteNoLst) and passed through gageII.updateCode; pMat holds its values.
#   3. DGSA.DGSA_light is called with pMat, the second column of corrMat kept
#      two-dimensional (corrMat[:, 1:2]), the dfG column names and
#      n_clsters=3 (keyword spelled as the call has it - presumably matching
#      the callee's signature; verify). The result is stored in dfP[code].
#   4. axplot is reloaded, the columns of dfP are sorted by label, and x-axis
#      labels of the form '<shortName> <code>' are built from usgs.codePdf.
#   5. dfP.values is drawn as a heat map with '{:.2f}' cell format and value
#      range [0, 3]; `ax` is rebound to the return value of plotHeatMap.
corrMat[iS, iT] = corr rmseMat[iS, iT] = rmse dfG = gageII.readData(varLst=gageII.varLst, siteNoLst=siteNoLst) dfG = gageII.updateCode(dfG) pMat = dfG.values dfS = DGSA.DGSA_light(pMat, corrMat[:, 1:2], ParametersNames=dfG.columns.tolist(), n_clsters=3) dfP[code] = dfS importlib.reload(axplot) dfP = dfP.sort_index(axis=1) labX = list() for code in dfP.columns.tolist(): temp = usgs.codePdf.loc[code]['shortName'] labX.append('{} {}'.format(temp, code)) labLst = [dfP.index.tolist(), labX] fig, ax = plt.subplots() ax = axplot.plotHeatMap(ax, dfP.values, fmt='{:.2f}', labLst=labLst, vRange=[0, 3]) fig.tight_layout() fig.show()