def quicklook_dataset(datasetpath, altmax=4000):
    """Quick look at the original data.

    One figure is drawn per column of the ``X_raw`` matrix stored in the
    file, on the time-altitude grid stored alongside it.

    Parameters
    ----------
    datasetpath: str
        Path to the data file. Must follow the convention adopted in the
        BLUSC program.
        Example: "DATASET_2015_0210.PASSY2015_BT-T_linear_dz40_dt30_zmax2000.nc"

    altmax: {int, float, None}, default=4000
        Top altitude of the graph (meter above ground level). With None the
        automatic upper limit of the axis is kept.

    Returns
    -------
    None
        Displays (or saves, when the module-level flag `storeImages` is set)
        each variable of the given file against time and altitude.
        In X-axis is the time
        In Y-axis is the height (m agl)
        Variable values are in shades of colors.
    """
    X_raw, t, z = utils.load_dataset(
        datasetpath, variables_to_load=["X_raw", "time", "altitude"])

    # Scatter coordinates matching the rows of X_raw
    TZ = utils.grid_to_scatter(utils.dtlist2slist(t), z)

    for p in range(X_raw.shape[1]):
        # Only the gridded values are needed: axes are drawn from t and z
        _, _, V = utils.scatter_to_grid(TZ, X_raw[:, p])

        plt.figure()
        plt.pcolormesh(t, z, V.T, shading="auto")
        # Apply the documented top altitude (it was silently ignored before).
        # Done before the colorbar so that it targets the data axes.
        if altmax is not None:
            plt.ylim(top=altmax)
        plt.colorbar()
        plt.gcf().autofmt_xdate()
        plt.xlabel("Time (UTC)")
        plt.ylabel("Alt (m agl)")

        if storeImages:
            fileName = "QL_Xraw" + str(p)
            plt.savefig(figureDir + fileName + fmtImages)
            plt.close()
        else:
            plt.show(block=False)
def clusterZTview_manyclusters(t_values, z_values, zoneIDs, delete_mask=None,
                               titl=None, fileName=None):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).
    Repeat it for several numbers of clusters, one panel per entry of
    `zoneIDs`.

    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude (m agl). Converted to kilometers for display.

    zoneIDs: list of array-like of shape (N,)
        Cluster labels for each point and for each number of clusters

    delete_mask: array-like of shape (nt*nalt,), optional
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs). When given, label
        vectors shorter than the mask are completed with NaN at the masked
        positions; label vectors already of full size are used as they are.

    titl: str, optional
        Customised title for the figure

    fileName: str, optional
        Customised file name for saving the figure

    Returns
    -------
    None
        Draws a tile of clusters labels on time-altitude grids
        (3x2 for six numbers of clusters).
        In X-axis is the time
        In Y-axis is the height (km agl)
        Clusters are shown with differents colors.

    Raises
    ------
    Exception
        If the grid rebuilt from the scatter format differs from the
        input grid.
    """
    if titl is None:
        titl = ""

    z_values = np.asarray(z_values) / 1000  # convert meters to kilometers

    # 1. Conversion datetime -> seconds
    st_values = utils.dtlist2slist(t_values)

    # 2. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    # Near-square layout: nl rows, nc columns
    n_kvalues = len(zoneIDs)
    nl = int(np.sqrt(n_kvalues))
    nc = int(np.ceil(n_kvalues / nl))

    fig, _ = plt.subplots(nrows=nl, ncols=nc, figsize=(12, 8),
                          sharex=True, sharey=True)
    plt.suptitle(titl)

    for ink, zoneID in enumerate(zoneIDs):
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)

        clist = []
        cticks = []
        cticklabels = []
        for k in np.unique(zoneID):
            cticks.append(k + 0.5)
            cticklabels.append(clustersIDs[k])
            # Last character of the mark is dropped to build the colormap
            # entry (presumably the marker symbol -- confirm against the
            # definition of clusterMarks).
            clist.append(clusterMarks[clustersIDs[k]][:-1])
        colormap = ListedColormap(clist)

        # Deleted labels completion (when missing data), as in clusterZTview.
        # Only applied when the labels are shorter than the mask, so callers
        # passing full-size labels keep the previous behaviour.
        if delete_mask is not None and np.size(zoneID) != np.size(delete_mask):
            fullzoneID = np.full(np.size(delete_mask), np.nan)
            fullzoneID[~np.asarray(delete_mask, dtype=bool)] = zoneID
        else:
            fullzoneID = zoneID

        # 3. Set labels at grid(z,t) format
        t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
        if (np.max(np.abs(z_values - z_trash)) +
                np.max(np.abs(st_values - t_trash)) > 1e-13):
            raise Exception(
                "Error in z,t retrieval : max(|z_values-z_trash|)=",
                np.max(np.abs(z_values - z_trash)),
                "max(|t_values-t_trash|)=",
                np.max(np.abs(st_values - t_trash)),
            )
        labels = np.ma.array(labels, mask=np.isnan(labels))

        # 4. Graphic
        plt.subplot(nl, nc, ink + 1)
        plt.pcolormesh(t_values, z_values, labels.T, vmin=0, vmax=K,
                       cmap=colormap, shading="auto")
        # Panel tag 'a)', 'b)', ... generated so that more than six panels
        # do not run out of letters.
        plt.text(t_values[-7], z_values[-4], chr(ord("a") + ink) + ")",
                 fontweight='bold', fontsize=16)
        plt.gcf().autofmt_xdate()

        # Colorbar
        cbar = plt.colorbar()
        cbar.set_ticks(cticks)
        cbar.set_ticklabels(cticklabels)

        # Colorbar label on the last column only (the former test against
        # `nl` matched the last column only when nl == nc - 1).
        if np.mod(ink, nc) == nc - 1:
            cbar.set_label("Cluster labels")
        # Y label on the first column only
        if np.mod(ink, nc) == 0:
            plt.ylabel("Alt (km agl)")
        # X label on the last row only
        if ink >= (nl - 1) * nc:
            plt.xlabel("Time (UTC)")

    fig.subplots_adjust(wspace=0, hspace=0.1)
    plt.tight_layout()

    if storeImages:
        if fileName is None:
            fileName = "multi_clusterZTview"
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def clusterZTview(
    t_values,
    z_values,
    zoneID,
    delete_mask=None,
    fileName=None,
    clustersIDs=None,
    displayClustersIDs=False,
    titl=None,
):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).

    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude

    zoneID: array-like of shape (N,)
        Cluster labels of each point

    delete_mask: array-like of shape (nt*nalt,), optional
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs)

    fileName: str, optional
        Customised file name for saving the figure

    clustersIDs: dict, optional
        Connection between cluster numbers and boundary layer types
        Example: {0:"CL",1:"SBL",2:"FA",3:"ML"}. Default is {0:0,1:1,...}.

    displayClustersIDs: bool, default=False
        If True, displays the clusterIDs over the graph, at the center of
        the cluster.

    titl: str, optional
        Customised title for the figure (currently computed but not drawn).

    Returns
    -------
    None
        Draws clusters labels on a time-altitude grid
        In X-axis is the time
        In Y-axis is the height (m agl)
        Clusters are shown with differents colors.

    Raises
    ------
    Exception
        If the grid rebuilt from the scatter format differs from the
        input grid.
    """
    if clustersIDs is None:
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)
    else:
        K = len(clustersIDs)
        # NOTE: this registers the custom names in the module-level
        # `clusterMarks`, i.e. the mapping is modified for later calls too.
        for key, val in clustersIDs.items():
            clusterMarks[val] = clusterMarks[key]

    if titl is None:
        titl = "Cluster in time-altitude grid | " + str(K) + " clusters"

    clist = []
    cticks = []
    cticklabels = []
    for k in range(K):
        cticks.append(k + 0.5)
        cticklabels.append(clustersIDs[k])
        clist.append(clusterMarks[clustersIDs[k]][:-1])
    colormap = ListedColormap(clist)

    # 1. Deleted labels completion (when missing data)
    if delete_mask is not None:
        fullzoneID = np.full(np.size(delete_mask), np.nan)
        fullzoneID[~np.asarray(delete_mask, dtype=bool)] = zoneID
    else:
        fullzoneID = np.asarray(zoneID)

    # 2. Conversion datetime -> seconds
    t0 = t_values[0]
    st_values = utils.dtlist2slist(t_values)

    # 3. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    # 4. Set labels at grid(z,t) format
    t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
    if np.max(np.abs(z_values - z_trash)) + np.max(
            np.abs(st_values - t_trash)) > 1e-13:
        raise Exception(
            "Error in z,t retrieval : max(|z_values-z_trash|)=",
            np.max(np.abs(z_values - z_trash)),
            "max(|t_values-t_trash|)=",
            np.max(np.abs(st_values - t_trash)),
        )
    labels = np.ma.array(labels, mask=np.isnan(labels))

    # 5. Graphic
    plt.figure()
    plt.pcolormesh(t_values, z_values, labels.T, vmin=0, vmax=K,
                   cmap=colormap, shading="auto")

    if displayClustersIDs:
        for k in np.unique(zoneID):
            # Rows of TZ follow the full grid, so the members of the cluster
            # must be located in the completed labels. Indexing with the
            # reduced `zoneID` misplaced the text when a delete_mask was given.
            idxk = np.where(fullzoneID == k)[0]
            x0text = t0 + dt.timedelta(seconds=np.mean(TZ[idxk, 0], axis=0))
            x1text = np.mean(TZ[idxk, 1], axis=0)
            plt.text(x0text, x1text, clustersIDs[k],
                     fontweight="bold", fontsize=18)

    cbar = plt.colorbar(label="Cluster label")
    cbar.set_ticks(cticks)
    cbar.set_ticklabels(cticklabels)
    plt.gcf().autofmt_xdate()
    plt.xlabel("Time (UTC)")
    plt.ylabel("Alt (m agl)")

    if storeImages:
        if fileName is None:
            fileName = "clusterZTview_K" + str(K)
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def write_dataset(datasetpath, X_raw, t_common, z_common):
    """Write the data prepared for the classification in a netcdf file with
    the grid on which it has been estimated.

    Dataset name must of the form:
    'DATASET_CAMPAGNE_PREDICTEURS_INTERPOLATION_dz***_dt***_zmax***.nc'

    Parameters
    ----------
    datasetpath: str
        Path and name of the netcdf file to be created.

    X_raw: ndarray of shape (N,p)
        Data matrix (not normalised)

    t_common: array-like of shape (Nt,) with dtype=datetime.datetime
        Time vector of the grid

    z_common: array-like of shape (Nz,)
        Altitude vector of the grid

    Returns
    -------
    msg: str
        Message saying the netcdf file has been successfully written

    Raises
    ------
    ValueError
        If N differs from Nt*Nz, or if `X_raw` contains NaN or infinite
        values. In both cases no file is created.
    """
    # Validate the inputs first: nothing is opened or imported on bad data
    N, p = X_raw.shape
    if N != len(t_common) * len(z_common):
        raise ValueError(
            "Shapes of X_raw and grid do not match. Dataset NOT CREATED.")

    n_invalidValues = int(np.count_nonzero(~np.isfinite(X_raw)))
    if n_invalidValues > 0:
        # Single formatted message (the previous call passed a tuple of args)
        raise ValueError(
            f"{n_invalidValues} invalid values. Dataset NOT CREATED.")

    import netCDF4 as nc

    # The context manager closes the file even when a write fails
    with nc.Dataset(datasetpath, "w") as dataset:
        # General information
        dataset.description = (
            "Dataset cleaned and prepared in order to make unsupervised "
            "boundary layer classification. The file is named according to "
            "the variables present in the dataset, their vertical and time "
            "resolution (all avariable are on the same grid) and the upper "
            "limit of the grid."
        )
        dataset.source = "Meteo-France CNRM/GMEI/LISA"
        dataset.history = "Created " + time.ctime(time.time())
        dataset.contactperson = "Thomas Rieutord ([email protected])"

        # Coordinate declaration
        dataset.createDimension("individuals", N)
        dataset.createDimension("predictors", p)
        dataset.createDimension("time", len(t_common))
        dataset.createDimension("altitude", len(z_common))

        # Fill in altitude vector
        altitude = dataset.createVariable("altitude", np.float64,
                                          ("altitude", ))
        altitude[:] = z_common
        altitude.units = "Meter above ground level (m)"

        # Fill in time vector
        Time = dataset.createVariable("time", np.float64, ("time", ))
        Time[:] = utils.dtlist2slist(t_common)
        Time.units = "Second since midnight (s)"

        # Fill in the design matrix
        designMatrix = dataset.createVariable("X_raw", np.float64,
                                              ("individuals", "predictors"))
        designMatrix[:, :] = X_raw
        designMatrix.units = "Different for each column. Adimensionalisation is necessary before comparing columns."

    return "Dataset sucessfully written in the file " + datasetpath
def estimateInterpolationError(z_target, t_target, z_known, t_known, V_known,
                               n_randoms=10, plot_on=True):
    """Estimate the error and the computing time for several interpolation
    method.

    Errors are estimated by cross-validation. The function repeats the
    interpolation with all methods for severals train/test splits (80% of
    the known points for training, 20% for testing).

    The list of tested methods as well as their parameters must be changed
    inside the function.
    Default list: '4NearestNeighbors', '8NearestNeighbors',
    'Linear2DInterp', 'Cubic2DInterp'

    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl). Not used by the
        cross-validation; kept for interface compatibility.

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid. Not used by the cross-validation;
        kept for interface compatibility.

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    n_randoms: int, default=10
        Number of repeated random split between training and testing sets

    plot_on: bool, default=True
        If True, the graphics showing computing time versus accuracy is drawn

    Returns
    -------
    accuracies: ndarray of shape (n_regressors,n_randoms)
        R2 score of each regressor (one per line) for each random split
        (one per column).

    chronos: ndarray of shape (n_regressors,n_randoms)
        Computing time in seconds of each regressor (one per line) for each
        random split (one per column).

    reg_names: list of shape (n_regressors,)
        Names of regressions methods performed
    """
    from scipy.interpolate import griddata
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor

    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    # Tested methods: scikit-learn regressors first, then scipy griddata
    regressors = [
        KNeighborsRegressor(n_neighbors=4),
        KNeighborsRegressor(n_neighbors=8),
    ]
    reg_names = ["4NearestNeighbors", "8NearestNeighbors"]
    interpolations = [("Linear2DInterp", "linear"), ("Cubic2DInterp", "cubic")]

    n_methods = len(regressors) + len(interpolations)
    chronos = np.zeros((n_methods, n_randoms))
    accuracies = np.zeros((n_methods, n_randoms))

    #### ========= Estimation with K-nearest neighbors
    for icl, reg in enumerate(regressors):
        print("Testing ", str(reg).split("(")[0])
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            t0 = time.time()
            reg.fit(X_train, y_train)
            accuracies[icl, ird] = reg.score(X_test, y_test)
            chronos[icl, ird] = time.time() - t0

    #### ========= Estimation with 2D interpolation (linear, then cubic)
    for irow, (name, method) in enumerate(interpolations,
                                          start=len(regressors)):
        reg_names.append(name)
        print("Testing", name)
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            # The timer is restarted for every split; it was previously left
            # at its value from the last nearest-neighbours run.
            t0 = time.time()
            # Each entry uses its own method (the cubic entry was previously
            # computed with method="linear").
            y_pred = griddata(X_train, y_train, X_test, method=method)

            # Some data can still be missing even after the interpolation
            #   * Radiometer : resolution coarsens with altitude => last gates missing
            #   * Ceilometer : high lowest range => first gates missing
            predicted = ~np.isnan(y_pred)
            accuracies[irow, ird] = r2_score(y_test[predicted],
                                             y_pred[predicted])
            chronos[irow, ird] = time.time() - t0

    if plot_on:
        graphics.estimator_quality(accuracies, chronos, reg_names)

    return accuracies, chronos, reg_names
def estimateongrid(z_target, t_target, z_known, t_known, V_known,
                   method="linear"):
    """Interpolate the data on a target grid knowning it on another grid.
    Grids are time-altitude.

    Supported interpolation methods: 'linear','cubic','nearestneighbors'
    For nearest neighbors, the number of neighbors must be passed as a
    numeric prefix (one or several digits).
    For example: method='4nearestneighbors' or method='12nearestneighbors'
    Any other value is passed (lower-cased) to `scipy.interpolate.griddata`.
    For more insights about how to choose the good methods (error, computing
    time...) please refer to the notebook `tuto-0to1-prepdataset.ipynb`

    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl)

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    method: {'linear','cubic','<K>nearestneighbors'}, default='linear'
        Interpolation method. Case-insensitive.

    Returns
    -------
    V_target: ndarray of shape (n1_t,n1_z)
        Values on the target grid

    Raises
    ------
    ValueError
        If a nearest-neighbors method is requested without a numeric prefix.
    Exception
        If the output does not have the shape of the target grid, or if the
        time or altitude vectors rebuilt from the output differ from the
        target ones.
    """
    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    st_target = utils.dtlist2slist(t_target)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)
    X_target = utils.grid_to_scatter(st_target, z_target)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    method_name = method.lower()
    knn_suffix = "nearestneighbors"

    if method_name.endswith(knn_suffix):
        #### ========= Estimation with K-nearest neighbors
        # Everything before the suffix is the number of neighbors, so that
        # values of 10 and more are supported as well.
        n_neighbors = method_name[:-len(knn_suffix)]
        if not n_neighbors.isdecimal():
            raise ValueError(
                "Number of neighbors must prefix the method name, "
                "e.g. '4nearestneighbors'. Got: " + repr(method))

        from sklearn.neighbors import KNeighborsRegressor

        KNN = KNeighborsRegressor(n_neighbors=int(n_neighbors))
        KNN.fit(X_known, Y_known)
        Y_target = KNN.predict(X_target)
    else:
        #### ========= Estimation with 2D interpolation
        from scipy.interpolate import griddata

        Y_target = griddata(X_known, Y_known, X_target, method=method_name)

    # Shape the output
    t1, z1, V_target = utils.scatter_to_grid(X_target, Y_target)

    # Sanity checks
    if np.shape(V_target) != (np.size(st_target), np.size(z_target)):
        raise Exception(
            "Output has not expected shape : shape(st_target)",
            np.shape(st_target),
            "shape(z_target)",
            np.shape(z_target),
            "shape(V_target)",
            np.shape(V_target),
        )
    if (np.abs(t1 - st_target) > 10**(-10)).any():
        raise Exception(
            "Time vector has been altered : max(|t1-t_target|)=",
            np.max(np.abs(t1 - st_target)),
        )
    if (np.abs(z1 - z_target) > 10**(-10)).any():
        raise Exception(
            "Altitude vector has been altered : max(|z1-z_target|)=",
            np.max(np.abs(z1 - z_target)),
        )

    return V_target