def quicklook_dataset(datasetpath, altmax=4000):
    """Quick look at the original data.

    One figure is drawn per column of the ``X_raw`` matrix stored in the
    file. Each figure is saved or shown depending on the module-level
    ``storeImages`` flag.


    Parameters
    ----------
    datasetpath: str
        Path to the data file. Must follow the convention adopted in
        the BLUSC program.
        Example: "DATASET_2015_0210.PASSY2015_BT-T_linear_dz40_dt30_zmax2000.nc"

    altmax: {int, float, None}, default=4000
        Top altitude of the graph (meter above ground level). The Y-axis
        is cropped at this altitude when the data reach higher, and left
        untouched otherwise. Pass None to disable the cropping.


    Returns
    -------
    None
        The function only draws. For each variable of the file, it displays
        the values against time and altitude.
            In X-axis is the time
            In Y-axis is the height (m agl)
            Variable values are in shades of colors.
    """
    X_raw, t, z = utils.load_dataset(
        datasetpath, variables_to_load=["X_raw", "time", "altitude"])
    TZ = utils.grid_to_scatter(utils.dtlist2slist(t), z)

    for p in range(X_raw.shape[1]):
        # Only the gridded values are needed: the axes are drawn from the
        # vectors (t, z) read in the file.
        _, _, V = utils.scatter_to_grid(TZ, X_raw[:, p])

        plt.figure()
        plt.pcolormesh(t, z, V.T, shading="auto")
        if altmax is not None:
            # Crop the top of the graph, never extend it above the data.
            _, ytop = plt.ylim()
            plt.ylim(top=min(ytop, altmax))
        plt.colorbar()
        plt.gcf().autofmt_xdate()
        plt.xlabel("Time (UTC)")
        plt.ylabel("Alt (m agl)")
        if storeImages:
            fileName = "QL_Xraw" + str(p)
            plt.savefig(figureDir + fileName + fmtImages)
            plt.close()
        else:
            plt.show(block=False)
def clusterZTview_manyclusters(t_values,
                               z_values,
                               zoneIDs,
                               delete_mask=None,
                               titl=None,
                               fileName=None):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).
    Repeats it for several numbers of clusters, one panel per entry of
    `zoneIDs`.


    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude (m agl). Converted to kilometers for display.

    zoneIDs: list of array-like of shape (N,)
        Cluster labels for each point and for each number of clusters.
        Labels are expected to be non-negative integers.

    delete_mask: array-like of shape (nt*nalt,), optional
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs). When given, label
        vectors shorter than the mask are expanded to the full grid, with
        NaN at the removed observations.

    titl: str, optional
        Customised title for the figure

    fileName: str, optional
        Customised file name for saving the figure


    Returns
    -------
    None
        Draws a tile of clusters labels on time-altitude grids. The tile
        has int(sqrt(n)) rows and ceil(n/rows) columns, n being the number
        of label vectors (2 rows x 3 columns for 6 of them).
            In X-axis is the time
            In Y-axis is the height (km agl)
            Clusters are shown with differents colors.

    Raises
    ------
    ValueError
        If `zoneIDs` is empty.
    Exception
        If the grid rebuilt by `utils.scatter_to_grid` differs from the
        input grid.
    """

    if titl is None:
        titl = ""

    n_kvalues = len(zoneIDs)
    if n_kvalues == 0:
        raise ValueError("zoneIDs is empty: nothing to plot.")

    z_values = z_values / 1000  # convert meters to kilometers

    # 1. Conversion datetime -> seconds
    st_values = utils.dtlist2slist(t_values)

    # 2. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    nl = int(np.sqrt(n_kvalues))
    nc = int(np.ceil(n_kvalues / nl))

    if delete_mask is not None:
        delete_mask = np.asarray(delete_mask, dtype=bool)

    # Anchor of the panel letter, near the upper-right corner. Clamped so
    # that grids smaller than 7 times or 4 altitudes do not fail.
    t_letter = t_values[-min(7, len(t_values))]
    z_letter = z_values[-min(4, len(z_values))]

    # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    fig, _ = plt.subplots(nrows=nl,
                          ncols=nc,
                          figsize=(12, 8),
                          sharex=True,
                          sharey=True)
    plt.suptitle(titl)
    for ink in range(n_kvalues):
        zoneID = zoneIDs[ink]

        K = np.max(zoneID) + 1
        present = np.unique(zoneID)

        # One color per cluster id in [0, K), so that the value k always
        # falls on its own color with vmin=0 and vmax=K, even when some
        # ids are absent from the labels. Entries of clusterMarks are
        # stripped of their last character (presumably the marker symbol).
        colormap = ListedColormap([clusterMarks[k][:-1] for k in range(K)])
        # Ticks only for the clusters actually present
        cticks = [k + 0.5 for k in present]
        cticklabels = list(present)

        # Deleted labels completion (when missing data)
        if (delete_mask is not None
                and np.size(zoneID) != np.size(delete_mask)):
            fullzoneID = np.full(np.size(delete_mask), np.nan)
            fullzoneID[~delete_mask] = zoneID
        else:
            fullzoneID = zoneID

        # 3. Set labels at grid(z,t) format
        t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
        if (np.max(np.abs(z_values - z_trash)) +
                np.max(np.abs(st_values - t_trash)) > 1e-13):
            raise Exception(
                "Error in z,t retrieval : max(|z_values-z_trash|)=",
                np.max(np.abs(z_values - z_trash)),
                "max(|t_values-t_trash|)=",
                np.max(np.abs(st_values - t_trash)),
            )

        labels = np.ma.array(labels, mask=np.isnan(labels))

        # 4. Graphic
        plt.subplot(nl, nc, ink + 1)
        plt.pcolormesh(t_values,
                       z_values,
                       labels.T,
                       vmin=0,
                       vmax=K,
                       cmap=colormap,
                       shading="auto")
        # Panel letter: 'a)', 'b)', 'c)', ...
        plt.text(t_letter,
                 z_letter,
                 chr(ord('a') + ink) + ')',
                 fontweight='bold',
                 fontsize=16)
        plt.gcf().autofmt_xdate()

        # Colorbar
        cbar = plt.colorbar()
        cbar.set_ticks(cticks)
        cbar.set_ticklabels(cticklabels)

        column = ink % nc
        if column == nc - 1:  # last column
            cbar.set_label("Cluster labels")
        if column == 0:  # first column
            plt.ylabel("Alt (km agl)")
        if ink >= (nl - 1) * nc:  # last row
            plt.xlabel("Time (UTC)")

    fig.subplots_adjust(wspace=0, hspace=0.1)
    plt.tight_layout()
    if storeImages:
        if fileName is None:
            fileName = "multi_clusterZTview"
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def clusterZTview(
    t_values,
    z_values,
    zoneID,
    delete_mask=None,
    fileName=None,
    clustersIDs=None,
    displayClustersIDs=False,
    titl=None,
):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).


    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude

    zoneID: array-like of shape (N,)
        Cluster labels of each point

    delete_mask: array-like of shape (nt*nalt,)
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs)

    fileName: str, optional
        Customised file name for saving the figure

    clustersIDs:  dict, optional
        Connection between cluster numbers and boundary layer types
        Example: {0:"CL",1:"SBL",2:"FA",3:"ML"}. Default is {0:0,1:1,...}.
        When given, each value is also registered as a key of the
        module-level `clusterMarks` (side effect kept across calls).

    displayClustersIDs: bool
        If True, displays the clusterIDs over the graph, at the center
        of the cluster.

    titl: str, optional
        Customised title for the figure. Currently not drawn: the call to
        `plt.title` is disabled.


    Returns
    -------
    None
        Draws the clusters labels on a time-altitude grid
            In X-axis is the time
            In Y-axis is the height (m agl)
            Clusters are shown with differents colors.

    Raises
    ------
    Exception
        If the grid rebuilt by `utils.scatter_to_grid` differs from the
        input grid.
    """

    if clustersIDs is None:
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)
    else:
        K = len(clustersIDs)
        # Give to each named label the mark of its cluster number, so that
        # clusterMarks can be looked up by name below.
        for key, val in clustersIDs.items():
            clusterMarks[val] = clusterMarks[key]

    if titl is None:
        titl = "Cluster in time-altitude grid | " + str(K) + " clusters"

    # One tick (centered on the color band), one tick label and one color
    # per cluster. Entries of clusterMarks are stripped of their last
    # character (presumably the marker symbol).
    cticks = [k + 0.5 for k in range(K)]
    cticklabels = [clustersIDs[k] for k in range(K)]
    colormap = ListedColormap(
        [clusterMarks[clustersIDs[k]][:-1] for k in range(K)])

    # 1. Deleted labels completion (when missing data)
    if delete_mask is not None:
        fullzoneID = np.full(np.size(delete_mask), np.nan)
        fullzoneID[~delete_mask] = zoneID
    else:
        fullzoneID = zoneID

    # 2. Conversion datetime -> seconds
    t0 = t_values[0]
    st_values = utils.dtlist2slist(t_values)

    # 3. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    # 4. Set labels at grid(z,t) format
    t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
    if np.max(np.abs(z_values - z_trash)) + np.max(
            np.abs(st_values - t_trash)) > 1e-13:
        raise Exception(
            "Error in z,t retrieval : max(|z_values-z_trash|)=",
            np.max(np.abs(z_values - z_trash)),
            "max(|t_values-t_trash|)=",
            np.max(np.abs(st_values - t_trash)),
        )

    labels = np.ma.array(labels, mask=np.isnan(labels))

    # 5. Graphic
    # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    plt.figure()
    # plt.title(titl)
    plt.pcolormesh(t_values,
                   z_values,
                   labels.T,
                   vmin=0,
                   vmax=K,
                   cmap=colormap,
                   shading="auto")
    if displayClustersIDs:
        for k in np.unique(zoneID):
            # Rows of TZ describe the full grid: the points of the cluster
            # must be located in the completed labels, not in `zoneID`,
            # which is shorter when observations have been deleted.
            idxk = np.where(fullzoneID == k)[0]
            x0text = t0 + dt.timedelta(seconds=np.mean(TZ[idxk, 0], axis=0))
            x1text = np.mean(TZ[idxk, 1], axis=0)
            plt.text(x0text,
                     x1text,
                     clustersIDs[k],
                     fontweight="bold",
                     fontsize=18)
    cbar = plt.colorbar(label="Cluster label")
    cbar.set_ticks(cticks)
    cbar.set_ticklabels(cticklabels)
    plt.gcf().autofmt_xdate()
    plt.xlabel("Time (UTC)")
    plt.ylabel("Alt (m agl)")
    if storeImages:
        if fileName is None:
            fileName = "clusterZTview_K" + str(K)
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
# Example #4
# 0
def write_dataset(datasetpath, X_raw, t_common, z_common):
    """Write the data prepared for the classification in a netcdf file
    with the grid on which it has been estimated.
    Dataset name must be of the form:
        'DATASET_CAMPAGNE_PREDICTEURS_INTERPOLATION_dz***_dt***_zmax***.nc'


    Parameters
    ----------
    datasetpath: str
        Path and name of the netcdf file to be created.

    X_raw: ndarray of shape (N,p)
        Data matrix (not normalised). N must be equal to Nt*Nz and all
        values must be finite.

    t_common: array-like of shape (Nt,) with dtype=datetime.datetime
        Time vector of the grid

    z_common: array-like of shape (Nz,)
        Altitude vector of the grid


    Returns
    -------
    msg: str
        Message saying the netcdf file has been successfully written

    Raises
    ------
    ValueError
        If the number of rows of `X_raw` does not match the size of the
        grid, or if `X_raw` contains NaN or infinite values. The file is
        not created in both cases.
    """
    N, p = X_raw.shape
    if N != len(t_common) * len(z_common):
        raise ValueError(
            "Shapes of X_raw and grid do not match. Dataset NOT CREATED.")

    # NaN and +/-inf are both rejected
    n_invalidValues = np.count_nonzero(~np.isfinite(X_raw))
    if n_invalidValues > 0:
        raise ValueError(
            str(n_invalidValues) + " invalid values. Dataset NOT CREATED.")

    # Imported once the inputs are known to be valid, so that input errors
    # are reported before anything related to the file is attempted.
    import netCDF4 as nc

    # The context manager closes the file even if a write fails
    with nc.Dataset(datasetpath, "w") as dataset:

        # General information
        dataset.description = "Dataset cleaned and prepared in order to make unsupervised boundary layer classification. The file is named according to the variables present in the dataset, their vertical and time resolution (all avariable are on the same grid) and the upper limit of the grid."
        dataset.source = "Meteo-France CNRM/GMEI/LISA"
        dataset.history = "Created " + time.ctime(time.time())
        dataset.contactperson = "Thomas Rieutord ([email protected])"

        # Coordinate declaration
        dataset.createDimension("individuals", N)
        dataset.createDimension("predictors", p)
        dataset.createDimension("time", len(t_common))
        dataset.createDimension("altitude", len(z_common))

        # Fill in altitude vector
        altitude = dataset.createVariable("altitude", np.float64,
                                          ("altitude", ))
        altitude[:] = z_common
        altitude.units = "Meter above ground level (m)"

        # Fill in time vector
        Time = dataset.createVariable("time", np.float64, ("time", ))
        Time[:] = utils.dtlist2slist(t_common)
        Time.units = "Second since midnight (s)"

        # Fill in the design matrix
        designMatrix = dataset.createVariable("X_raw", np.float64,
                                              ("individuals", "predictors"))
        designMatrix[:, :] = X_raw
        designMatrix.units = "Different for each column. Adimensionalisation is necessary before comparing columns."

    return "Dataset sucessfully written in the file " + datasetpath
# Example #5
# 0
def estimateInterpolationError(z_target,
                               t_target,
                               z_known,
                               t_known,
                               V_known,
                               n_randoms=10,
                               plot_on=True):
    """Estimate the error and the computing time for several interpolation
    method.

    Errors are estimated by cross-validation. The function repeats the
    interpolation with all methods for severals train/test splits
    (80% train, 20% test) of the known points.
    The list of tested methods as well as their parameters must be
    changed inside the function.

    Default list: '4NearestNeighbors','8NearestNeighbors','linear','cubic'


    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl). Not used by the
        estimation, which only relies on the known grid; kept so that the
        signature matches the one of `estimateongrid`.

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid. Not used, same as `z_target`.

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    n_randoms: int, default=10
        Number of repeated random split between training and testing sets

    plot_on: bool, default=True
        If True, the graphics showing computing time versus accuracy is drawn


    Returns
    -------
    accuracies: ndarray of shape (n_regressors,n_randoms)
        R2 score of each regressor (one per line) for each random split (one per
        column).

    chronos: ndarray of shape (n_regressors,n_randoms)
        Computing time of each regressor (one per line) for each random split
        (one per column).

    reg_names: list of shape (n_regressors,)
        Names of regressions methods performed
    """

    from scipy.interpolate import griddata
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsRegressor

    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    # Tested methods: (name, regressor) then (name, griddata method)
    knn_regressors = [
        ("4NearestNeighbors", KNeighborsRegressor(n_neighbors=4)),
        ("8NearestNeighbors", KNeighborsRegressor(n_neighbors=8)),
    ]
    griddata_methods = [
        ("Linear2DInterp", "linear"),
        ("Cubic2DInterp", "cubic"),
    ]
    reg_names = [name for name, _ in knn_regressors]
    reg_names += [name for name, _ in griddata_methods]

    chronos = np.zeros((len(reg_names), n_randoms))
    accuracies = np.zeros((len(reg_names), n_randoms))

    #### ========= Estimation with nearest neighbors
    for icl, (_, reg) in enumerate(knn_regressors):
        print("Testing ", str(reg).split("(")[0])
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            t0 = time.time()  #::::::
            reg.fit(X_train, y_train)
            accuracies[icl, ird] = reg.score(X_test, y_test)
            t1 = time.time()  #::::::
            chronos[icl, ird] = t1 - t0

    #### ========= Estimation with 2D interpolation (linear, cubic)
    for offset, (name, method) in enumerate(griddata_methods):
        irow = len(knn_regressors) + offset
        print("Testing " + name)
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            # The chrono is restarted for each split, otherwise the time
            # would be counted from the last nearest neighbors run.
            t0 = time.time()  #::::::
            y_pred = griddata(X_train, y_train, X_test, method=method)
            # Some data can still be missing even after the interpolation
            #   * Radiometer : resolution coarsens with altitude => last gates missing
            #   * Ceilometer : high lowest range => first gates missing
            predicted = ~np.isnan(y_pred)
            accuracies[irow, ird] = r2_score(y_test[predicted],
                                             y_pred[predicted])
            t1 = time.time()  #::::::
            chronos[irow, ird] = t1 - t0

    if plot_on:
        graphics.estimator_quality(accuracies, chronos, reg_names)

    return accuracies, chronos, reg_names
# Example #6
# 0
def estimateongrid(z_target,
                   t_target,
                   z_known,
                   t_known,
                   V_known,
                   method="linear"):
    """Interpolate the data on a target grid knowning it on another grid.
    Grids are time-altitude.

    Supported interpolation methods: 'linear','cubic','nearestneighbors'

    For nearest neighbors, the number of neighbors must be passed as a
    prefix. For example: method='4nearestneighbors' or
    method='12nearestneighbors'.
    For more insights about how to choose the good methods (error, computing time...)
    please refer to the notebook `tuto-0to1-prepdataset.ipynb`


    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl)

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    method: {'linear','cubic','<k>nearestneighbors'}, default='linear'
        Interpolation method (case-insensitive). Any name which is not a
        nearest neighbors one is passed to `scipy.interpolate.griddata`.


    Returns
    -------
    V_target: ndarray of shape (n1_t,n1_z)
        Values on the target grid

    Raises
    ------
    ValueError
        If a nearest neighbors method is requested without a valid number
        of neighbors.
    Exception
        If the output has not the shape of the target grid, or if the
        grid rebuilt by `utils.scatter_to_grid` differs from the target
        grid.
    """
    import re

    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    st_target = utils.dtlist2slist(t_target)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)
    X_target = utils.grid_to_scatter(st_target, z_target)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    method = method.lower()
    knn_request = re.fullmatch(r"(\d+)nearestneighbors", method)

    if knn_request:
        #### ========= Estimation with K-nearest neighbors
        from sklearn.neighbors import KNeighborsRegressor

        # The whole numeric prefix is the number of neighbors
        KNN = KNeighborsRegressor(n_neighbors=int(knn_request.group(1)))

        KNN.fit(X_known, Y_known)
        Y_target = KNN.predict(X_target)

    elif method.endswith("nearestneighbors"):
        raise ValueError(
            "Number of neighbors is missing or invalid in method '" + method
            + "'. Expected for example '4nearestneighbors'.")

    else:
        #### ========= Estimation with 2D interpolation
        from scipy.interpolate import griddata

        Y_target = griddata(X_known, Y_known, X_target, method=method)

    # Shape the output
    t1, z1, V_target = utils.scatter_to_grid(X_target, Y_target)

    # Sanity checks
    if np.shape(V_target) != (np.size(st_target), np.size(z_target)):
        raise Exception(
            "Output has not expected shape : shape(st_target)",
            np.shape(st_target),
            "shape(z_target)",
            np.shape(z_target),
            "shape(V_target)",
            np.shape(V_target),
        )
    if (np.abs(t1 - st_target) > 1e-10).any():
        raise Exception(
            "Time vector has been altered : max(|t1-t_target|)=",
            np.max(np.abs(t1 - st_target)),
        )
    if (np.abs(z1 - z_target) > 1e-10).any():
        raise Exception(
            "Altitude vector has been altered : max(|z1-z_target|)=",
            np.max(np.abs(z1 - z_target)),
        )

    return V_target