Example #1
def test_prepare_data_multiprof():

    n_profiles = 3
    testFile = paths.file_defaultlidardata()
    t_values, z_values, rcss = utils.extract_data(
        testFile, to_extract=["rcs_1", "rcs_2"])
    rcs_1 = rcss["rcs_1"]
    rcs_2 = rcss["rcs_2"]

    params = utils.get_default_params()
    params["predictors"] = {
        "day": ["rcs_1", "rcs_2"],
        "night": ["rcs_1", "rcs_2"]
    }

    loc, dateofday, lat, lon = utils.where_and_when(testFile)
    t = 55
    coords = {
        "time": dt.datetime.utcfromtimestamp(t_values[t]),
        "lat": lat,
        "lon": lon
    }
    t_back = max(t - n_profiles + 1, 0)
    rcss = {"rcs_1": rcs_1[t_back:t + 1, :], "rcs_2": rcs_2[t_back:t + 1, :]}

    X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)

    assert X.shape == (438, 2) and Z.shape == (438, )
Example #2
def quicklook_output(nc_file):
    """Same as blhs_over_data, but directly from the output netcf file
    (and with less flexibility).
    
    Parameters
    ----------
    nc_file : str
        Path to the netcdf file containing the data
    
    Returns
    -------
    `matplotlib.pyplot figure`
        Same as kabl.graphics.blhs_over_data
    """

    location, day, lat, lon = utils.where_and_when(nc_file)
    t, z, dat = utils.extract_data(nc_file,
                                   to_extract=["rcs_0", "blh_kabl", "pbl"])
    rcs = dat["rcs_0"]
    blh_new = dat["blh_kabl"]
    blh_mnf = dat["pbl"]

    fig = blhs_over_data(
        t,
        z,
        rcs,
        [blh_new, blh_mnf[:, 0]],
        blhs_names=["BLH KABL", "BLH manufacturer"],
        titre="Lidar backscatter | " + location + " " +
        day.strftime("%Y/%m/%d"),
    )

    return fig
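
A minimal usage sketch (illustrative, not part of the original example); it reuses the default-path helper seen in other examples on this page:

from kabl import paths

# Quicklook on an output file produced beforehand by
# core.blh_estimation(..., storeInNetcdf=True)
fig = quicklook_output(paths.file_defaultoutput())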
Example #3
def quicklook_data(nc_file, max_height=4500, with_pbl=False, with_cbh=False):
    '''Give a quick look at the data, and only the data.
    
    [IN]
        - nc_file (str): path to the netcdf file containing the data
    
    [OUT]
        - None (the figure is displayed, nothing is returned)'''

    location, day, lat, lon = utils.where_and_when(nc_file)

    to_be_extracted = ['rcs_0']
    if with_pbl:
        to_be_extracted.append('pbl')
    if with_cbh:
        to_be_extracted.append('cloud_base_height')

    data = utils.extract_data(nc_file,
                              max_height=max_height,
                              to_extract=to_be_extracted)

    if with_pbl and with_cbh:
        t, z, rcs, pbl, cbh = data
    elif with_pbl:
        t, z, rcs, pbl = data
    elif with_cbh:
        t, z, rcs, cbh = data
    else:
        t, z, rcs = data

    plt.figure(figsize=(14, 7))
    plt.pcolormesh(t, z, rcs.T, alpha=0.8, cmap='rainbow', vmin=-0.1, vmax=0.8)
    if with_pbl:
        pbl[pbl == -999] = np.nan
        for layer in range(pbl.shape[1]):
            plt.plot(t, pbl[:, layer], 'k*')
    if with_cbh:
        cbh[cbh == -999] = np.nan
        for layer in range(cbh.shape[1]):
            plt.plot(t, cbh[:, layer], 'r.')
    axes = plt.gca()
    plt.title("Lidar backscatter | " + location + " " +
              day.strftime('%Y/%m/%d'))
    axes.set_xlabel('Hour')
    axes.set_ylabel('Height (m agl)')
    plt.tight_layout()
    plt.grid(color='white', ls='solid')
    plt.colorbar(label="Range corrected signal", alpha=0.8)

    locs, labels = plt.xticks()
    labels = [
        dt.datetime.utcfromtimestamp(loc).strftime('%H:%M') for loc in locs
    ]

    axes.set_xticks(locs)
    axes.set_xticklabels(labels)
    plt.gcf().autofmt_xdate()
    plt.show(block=False)
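
For reference, a hedged usage sketch; the file path is the sample used elsewhere on this page:

nc_file = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc'
quicklook_data(nc_file, max_height=4500, with_pbl=True, with_cbh=True)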
Example #4
def quicklook_benchmark(
    data_file,
    blh_file,
    rs_file=None,
):
    """Same as blhs_over_data, but directly from the output netcf file
    (and with less flexibility).
    
    Parameters
    ----------
    data_file : str
        Path to the netcdf file containing the data
    
    blh_file : str
        Path to the netcdf file containing the BLH estimation
    
    rs_file : str, default=None
        Path to the radiosounding estimations, if any
    
    Returns
    -------
    `matplotlib.pyplot figure`
        Same as kabl.graphics.blhs_over_data
    """

    location, day, lat, lon = utils.where_and_when(data_file)
    t, z, rcss = utils.extract_data(data_file, to_extract=["rcs_0"])
    rcs = rcss["rcs_0"]

    BLHS = []
    BLH_NAMES = []
    ncf = nc.Dataset(blh_file)
    for key in ncf.variables.keys():
        if "BLH" in key:
            BLHS.append(np.array(ncf.variables[key]))
            BLH_NAMES.append(key[4:])

    if rs_file is not None:
        blh_rs = utils.extract_rs(rs_file, t[0], t[-1])
    else:
        blh_rs = None

    fig = blhs_over_data(
        t,
        z,
        rcs,
        BLHS,
        blhs_names=BLH_NAMES,
        blh_rs=blh_rs,
        titre="Lidar backscatter | " + location + " " +
        day.strftime("%Y/%m/%d"),
    )

    return fig
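
A usage sketch with hypothetical paths, assuming a benchmark file holding BLH_* variables such as the one written by kabl_qualitymetrics:

fig = quicklook_benchmark(
    "../data_samples/lidar/DAILY_MPL_5025_20180802.nc",  # lidar data
    "DAILY_BENCHMARK_5025_20180802.nc",  # hypothetical benchmark output
)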
Example #5
def quicklook_benchmark(data_file,
                        blh_file,
                        rs_file=None,
                        showFigure=True,
                        storeImages=False,
                        fmtImages=".png"):
    '''Same as blhs_over_data, but directly from the output netCDF file
    (and with less flexibility).
    
    [IN]
        - data_file (str): path to the netcdf file containing the data
        - blh_file (str): path to the netcdf file containing the BLH estimation
        - rs_file (str): path to the radiosounding estimations, if any
    
    [OUT]
        - (matplotlib.pyplot figure): same as blhs_over_data'''

    location, day, lat, lon = utils.where_and_when(data_file)
    t, z, rcs = utils.extract_data(data_file, to_extract=['rcs_0'])

    BLHS = []
    BLH_NAMES = []
    ncf = nc.Dataset(blh_file)
    for key in ncf.variables.keys():
        if "BLH" in key:
            BLHS.append(np.array(ncf.variables[key]))
            BLH_NAMES.append(key[4:])

    if rs_file is not None:
        blh_rs = utils.extract_rs(rs_file, t[0], t[-1])
    else:
        blh_rs = None

    fig = blhs_over_data(t,
                         z,
                         rcs,
                         BLHS,
                         blhs_names=BLH_NAMES,
                         blh_rs=blh_rs,
                         titre="Lidar backscatter | " + location + " " +
                         day.strftime('%Y/%m/%d'),
                         showFigure=showFigure,
                         storeImages=storeImages,
                         fmtImages=fmtImages)

    return fig
Example #6
def quicklook_output(nc_file):
    '''Same as blhs_over_data, but directly from the output netCDF file
    (and with less flexibility).
    
    [IN]
        - nc_file (str): path to the netcdf file containing the data
    
    [OUT]
        - (matplotlib.pyplot figure): same as blhs_over_data'''

    location, day, lat, lon = utils.where_and_when(nc_file)
    t, z, rcs, blh_new, blh_mnf = utils.extract_data(
        nc_file, to_extract=['rcs_0', 'blh_kabl', 'pbl'])

    fig = blhs_over_data(t,
                         z,
                         rcs, [blh_new, blh_mnf[:, 0]],
                         blhs_names=['BLH KABL', 'BLH manufacturer'],
                         titre="Lidar backscatter | " + location + " " +
                         day.strftime('%Y/%m/%d'))

    return fig
Example #7
def test_prepare_data_cl31():

    n_profiles = 3
    testFile = paths.file_defaultcl31data()
    t_values, z_values, rcss = utils.extract_data(testFile,
                                                  to_extract=["rcs_0"])
    rcs_0 = rcss["rcs_0"]

    params = utils.get_default_params()
    params["predictors"] = {"day": ["rcs_0"], "night": ["rcs_0"]}

    loc, dateofday, lat, lon = utils.where_and_when(testFile)
    t = 55
    coords = {
        "time": dt.datetime.utcfromtimestamp(t_values[t]),
        "lat": lat,
        "lon": lon
    }
    t_back = max(t - n_profiles + 1, 0)
    rcs_0 = rcs_0[t_back:t + 1, :]

    X, Z = prepare_data(coords, z_values, rcss={"rcs_0": rcs_0}, params=params)

    assert X.shape == (1347, 1) and Z.shape == (1347, )
Example #8
    # Test of prepare_data
    #----------------------
    print("\n --------------- Test of prepare_data")
    testFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc'
    print(' ** Single profile **')
    z_values, rcs_1, rcs_2, coords = utils.extract_testprofile(
        testFile, profile_id=2, return_coords=True)
    print("z_values.shape", z_values.shape, "rcs_1.shape", rcs_1.shape,
          "rcs_2.shape", rcs_2.shape)
    X, Z = prepare_data(coords, z_values, rcs_1, rcs_2)
    print("X.shape=", X.shape)
    print("Z.shape=", Z.shape)

    n_profiles = 3
    print(' ** Concatenated profiles ** (', n_profiles, ')')
    t_values, z_values, rcs_1, rcs_2 = utils.extract_data(
        testFile, to_extract=['rcs_1', 'rcs_2'])
    loc, dateofday, lat, lon = utils.where_and_when(testFile)
    t = 55
    coords = {
        'time': dt.datetime.utcfromtimestamp(t_values[t]),
        'lat': lat,
        'lon': lon
    }
    t_back = max(t - n_profiles + 1, 0)
    rcs_1 = rcs_1[t_back:t + 1, :]
    rcs_2 = rcs_2[t_back:t + 1, :]
    print("z_values.shape", z_values.shape, "rcs_1.shape", rcs_1.shape,
          "rcs_2.shape", rcs_2.shape)
    X, Z = prepare_data(coords, z_values, rcs_1, rcs_2)
    print("X.shape=", X.shape)
    print("Z.shape=", Z.shape)
Example #9
    labels = core.apply_algo(X, 3)
    blh = core.blh_from_labels(labels, Z)

    blhs_over_profile(z_values, rcs_1, blh, labels=labels)

    plt.figure()
    plt.hist(rcs_1, 35)
    plt.title("Histogram of a single profile of RCS")
    plt.show(block=False)

    # Test of blhs_over_data
    #------------------------
    print("\n --------------- Test of blhs_over_data")
    testFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc'
    blh = core.blh_estimation(testFile)
    t_values, z_values, rcs_1, rcs_2 = utils.extract_data(testFile)

    blhs_over_data(t_values, z_values, rcs_1, blh)

    # Test of scatterplot_blhs
    #------------------------
    print("\n --------------- Test of scatterplot_blhs")
    outputFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.out.nc'
    t_values, z_values, blh_new, blh_mnf = utils.extract_data(
        outputFile, to_extract=['blh_kabl', 'pbl'])

    scatterplot_blhs(t_values, blh_mnf[:, 0], blh_new)

    # Test of quicklook_output
    #------------------------
    print("\n --------------- Test of quicklook_output")
Example #10
from kabl import core
from kabl import utils
from kabl import adabl
from kabl import paths

# Usual Python packages
import pickle
import numpy as np
import datetime as dt
import pytz
import sys
import time
import netCDF4 as nc

lidarFile = paths.file_defaultcl31data()

t_values, z_values, rcss = utils.extract_data(lidarFile,
                                              max_height=4620,
                                              to_extract=["rcs_0"])
rcs_0 = rcss["rcs_0"]

# Estimation with KABL
# ----------------------
params = utils.get_default_params()
params["n_clusters"] = 3
params["predictors"] = {"day": ["rcs_0"], "night": ["rcs_0"]}
params["n_profiles"] = 1
params["init"] = "advanced"

blh_kabl = core.blh_estimation(lidarFile, storeInNetcdf=False, params=params)

# Plot
# ------
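# The original example is truncated here. A plausible continuation, sketched
# from the graphics helpers shown elsewhere on this page (not the original code):
from kabl import graphics

graphics.blhs_over_data(t_values, z_values, rcs_0,
                        [blh_kabl], blhs_names=["BLH KABL"])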
Example #11
def quicklook_data(nc_file, max_height=4500, with_pbl=False, with_cbh=False):
    """Give a quick look of the data, only the data.
    
    Parameters
    ----------
    nc_file : str
        Path to the netcdf file containing the data
    
    max_height : {float, int}, default=4500
        Top height on the graphic
    
    with_pbl : bool, default=False
        If True, add onto the data the boundary layer height calculated
        by the manufacturer
    
    with_cbh : bool, default=False
        If True, add onto the data the first cloud base height
        calculated by the manufacturer
    
    Returns
    -------
    None
    """

    location, day, lat, lon = utils.where_and_when(nc_file)

    to_be_extracted = ["rcs_0"]
    if with_pbl:
        to_be_extracted.append("pbl")
    if with_cbh:
        to_be_extracted.append("cloud_base_height")

    t, z, dat = utils.extract_data(nc_file,
                                   max_height=max_height,
                                   to_extract=to_be_extracted)

    rcs = dat["rcs_0"]
    if "pbl" in to_be_extracted:
        pbl = dat["pbl"]
    if "cloud_base_height" in to_be_extracted:
        cbh = dat["cloud_base_height"]

    plt.figure(figsize=(14, 7))
    plt.pcolormesh(t, z, rcs.T, alpha=0.8, cmap="rainbow", vmin=-0.1, vmax=0.8)
    if with_pbl:
        pbl[pbl == -999] = np.nan
        for layer in range(pbl.shape[1]):
            plt.plot(t, pbl[:, layer], "k*")
    if with_cbh:
        cbh[cbh == -999] = np.nan
        for layer in range(cbh.shape[1]):
            plt.plot(t, cbh[:, layer], "r.")
    axes = plt.gca()
    plt.title("Lidar backscatter | " + location + " " +
              day.strftime("%Y/%m/%d"))
    axes.set_xlabel("Hour")
    axes.set_ylabel("Height (m agl)")
    plt.tight_layout()
    plt.grid(color="white", ls="solid")
    plt.colorbar(label="Range corrected signal", alpha=0.8)

    locs, labels = plt.xticks()
    labels = [
        dt.datetime.utcfromtimestamp(loc).strftime("%H:%M") for loc in locs
    ]

    axes.set_xticks(locs)
    axes.set_xticklabels(labels)
    plt.gcf().autofmt_xdate()
    plt.show(block=False)
Example #12
from kabl import utils
from kabl import graphics
from kabl import adabl
# Usual Python packages
import pickle
import numpy as np
import datetime as dt
import pytz
import sys
import time
import netCDF4 as nc

lidarFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc'
rsFile = '../data_samples/radiosoundings/BLH_RS_liss3_BRNliss10_BREST.nc'

t_values, z_values, rcs_1, rcs_2, blh_mnf = utils.extract_data(
    lidarFile, max_height=4620, to_extract=['rcs_1', 'rcs_2', 'pbl'])

# Estimation with KABL
#----------------------
params = dict()
params['algo'] = 'kmeans'
params['n_clusters'] = 3
params['predictors'] = {'day': ['rcs_1'], 'night': ['rcs_1']}
params['classif_score'] = 'db'
params['n_inits'] = 1
params['n_profiles'] = 1
params['max_k'] = 6
params['init'] = 'given'
params['cov_type'] = 'full'
params['max_height'] = 4500
params['sunrise_shift'] = 1
Example #13
def adabl_qualitymetrics(
    dataFile: str,
    modelFile: str,
    scalerFile: str,
    refFile: str = "indus",
    outputFile: str = "None",
    addResultsToNetcdf: bool = False,
):
    """Perform BLH estimation with ADABL on all profiles of the day and 
    write it into a copy of the netcdf file
    
    
    Parameters
    ----------
    dataFile : str
        Path to the input file, as generated by raw2l1
    
    modelFile : str
        Path to the model file (pickle object)
    
    scalerFile : str
        Path to the scaler file (pickle object)
    
    refFile : str
        Path to reference BLH estimation (handmade or manufacturer's).
        Default is the manufacturer's.
    
    outputFile : str
        Path to the output file. Must be specified if addResultsToNetcdf=True
    
    addResultsToNetcdf : bool, default=False
        If True, adds the quality metrics to the existing result file specified 
        in outputFile
    
    
    
    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from ADABL and the reference
        .. math:: \sqrt{1/N \sum_i^N (Z(i)-Zref(i))^2}
    
    errl1_blh : float
        Mean absolute gap between BLH from ADABL and the reference
        .. math:: 1/N \sum_i^N \vert Z(i)-Zref(i) \vert
    
    errl0_blh : float
        Maximum absolute gap between BLH from ADABL and the reference
        .. math:: \max_i \vert Z(i)-Zref(i) \vert
    
    corr_blh : float
        Correlation coefficient between BLH from ADABL and the reference
    
    chrono : float
        Computation time for the full day (seconds)
    
    n_invalid : int
        Number of BLH estimations at NaN or Inf
        """
    
    t0 = time.time()  #::::::::::::::::::::::

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(dataFile)
    t_values, z_values, dat = utils.extract_data(
        dataFile, to_extract=["rcs_1", "rcs_2", "pbl"]
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    sec_intheday = np.mod(t_values, 24 * 3600)

    Nt, Nz = rcs_1.shape

    # Load pre-trained model
    # ------------------------
    with open(modelFile, "rb") as fc:
        model = pickle.load(fc)
    with open(scalerFile, "rb") as fc:
        scaler = pickle.load(fc)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "ADABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(Nt):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        rcs1loc = rcs_1[t, :]
        rcs2loc = rcs_2[t, :]
        rcs1loc[rcs1loc <= 0] = 1e-5
        rcs2loc[rcs2loc <= 0] = 1e-5

        X_new = np.array(
            [
                np.repeat(sec_intheday[t], Nz),
                z_values,
                np.log10(rcs1loc),
                np.log10(rcs2loc),
            ]
        ).T
        X_new = scaler.transform(X_new)

        # 3. Apply the machine learning algorithm
        # ---------------------
        y_new = model.predict(X_new)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(y_new, z_values))

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if os.path.isfile(refFile):
        blh_ref = np.loadtxt(refFile)
    else:
        blh_ref = blh_mnf[:, 0]

    if addResultsToNetcdf:
        BLHS = [np.array(blh)]
        BLH_NAMES = ["BLH_ADABL"]

        msg = add_blhs_to_netcdf(outputFile, BLHS, BLH_NAMES)
        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return errl2_blh, errl1_blh, errl0_blh, corr_blh, chrono, n_invalid
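
A hedged invocation sketch; the pickled model and scaler paths are placeholders for files produced by ADABL training:

from kabl import paths

errl2, errl1, errl0, corr, chrono, n_invalid = adabl_qualitymetrics(
    paths.file_defaultlidardata(),  # lidar file with rcs_1/rcs_2, as in other examples
    "adabl_model.pkl",  # hypothetical pickled classifier
    "adabl_scaler.pkl",  # hypothetical pickled scaler
)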
Example #14
def prepare_supervised_dataset(
    dataFiles: list,
    refFiles: list,
    saveInCSV: bool = False,
    outputFile: str = None,
    plot_on: bool = False,
):
    """Create a dataframe with appropriate fields from original data format.
    
    Lidar data is expected to be provided in raw2l1 files and handmade BLH 
    estimation is expected in .csv file with 2 columns: time, BLH values.
    Paths are given in a list in order to easily had multiple days.
    
    
    Parameters
    ----------
    dataFiles : list of str
        Paths to the data input files, as generated by raw2l1
    
    refFiles : list of str
        Paths to the reference files (handmade BLH estimation) in CSV format
    
    saveInCSV : bool, default=False
        If True, the dataset is saved in a .csv file at the specified location
        
    outputFile : str, default=None
        Path to the file where the dataset is stored, if saveInCSV=True
    
    plot_on : bool, default=False
        If True, display the handmade BLH over the data.
    
    
    Returns
    -------
    df : `pandas.DataFrame`
        Ready-to-use dataframe for ADABL training. Contains 5 columns of input
        data and one column of output binary data
    """

    RCS0 = []
    RCS1 = []
    RCS2 = []
    SEC0 = []
    ALTI = []
    y = []
    for i in range(len(dataFiles)):
        dataFile = dataFiles[i]
        refFile = refFiles[i]
        print("Reading file ", dataFile, "with reference", refFile)
        t_values, z_values, dat = utils.extract_data(
            dataFile, max_height=4620, to_extract=["rcs_0", "rcs_1", "rcs_2", "pbl"]
        )
        rcs_0 = dat["rcs_0"]
        rcs_1 = dat["rcs_1"]
        rcs_2 = dat["rcs_2"]
        blh_mnf = dat["pbl"]

        blh_ref = pd.read_csv(refFile, delimiter=",", header=0)
        blh_ref = blh_ref["blh_ref"].values

        if plot_on:
            graphics.blhs_over_data(t_values, z_values, rcs_0, blh_ref)

        # Input data
        # ----------
        sec_intheday = np.mod(t_values, 24 * 3600)
        Nt, Nz = rcs_1.shape

        rcs0loc = rcs_0.ravel()
        rcs0loc[rcs0loc <= 0] = 1e-5
        RCS0.append(np.log10(rcs0loc))

        rcs1loc = rcs_1.ravel()
        rcs1loc[rcs1loc <= 0] = 1e-5
        RCS1.append(np.log10(rcs1loc))

        rcs2loc = rcs_2.ravel()
        rcs2loc[rcs2loc <= 0] = 2e-5
        RCS2.append(np.log10(rcs2loc))

        SEC0.append(np.repeat(sec_intheday, Nz))
        ALTI.append(np.tile(z_values, Nt))

        # Output data
        # -----------
        yday = []
        for t in range(Nt):
            yloc = np.zeros(Nz)
            yloc[z_values > blh_ref[t]] = 1
            yday.append(yloc)

        y.append(np.array(yday, dtype=int).ravel())

    # Create dataframe
    # ------------------
    df = pd.DataFrame(
        {
            "sec0": np.concatenate(SEC0),
            "alti": np.concatenate(ALTI),
            "rcs0": np.concatenate(RCS0),
            "rcs1": np.concatenate(RCS1),
            "rcs2": np.concatenate(RCS2),
            "isBL": np.concatenate(y),
        }
    )

    if saveInCSV:
        if outputFile is None:
            outputFile = paths.file_labelleddataset()
        df.to_csv(outputFile, index=False)
        print("Dataset for ADABL is saved in", outputFile)

    return df
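
A usage sketch with hypothetical paths, pairing one day of lidar data with its handmade reference (a CSV holding a 'blh_ref' column):

df = prepare_supervised_dataset(
    dataFiles=["../data_samples/lidar/DAILY_MPL_5025_20180802.nc"],
    refFiles=["blh_ref_20180802.csv"],  # hypothetical handmade BLH file
)
print(df.columns.tolist())  # ['sec0', 'alti', 'rcs0', 'rcs1', 'rcs2', 'isBL']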
Example #15
def adabl_blh_estimation(
    dataFile: str,
    modelFile: str,
    scalerFile: str,
    outputFile: str = None,
    storeInNetcdf: bool = False,
):
    """Perform BLH estimation with ADABL on all profiles of the day and 
    write it into a copy of the netcdf file
    
    
    Parameters
    ----------
    dataFile : str
        Path to the input file, as generated by raw2l1
    
    modelFile : str
        Path to the model file (pickle object)
    
    scalerFile : str
        Path to the scaler file (pickle object)
    
    outputFile : str
        Path to the output file. Default adds ".out" before ".nc"
    
    storeInNetcdf : bool
        If True, the field 'blh_adabl', containing the BLH estimation, is
        stored in the outputFile
    
    
    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the ADABL algorithm.
    """

    t0 = time.time()  #::::::::::::::::::::::

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(dataFile)
    t_values, z_values, dat = utils.extract_data(
        dataFile, to_extract=["rcs_1", "rcs_2", "pbl"]
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    sec_intheday = np.mod(t_values, 24 * 3600)

    Nt, Nz = rcs_1.shape

    # Load pre-trained model
    # ------------------------
    with open(modelFile, "rb") as fc:
        model = pickle.load(fc)
    with open(scalerFile, "rb") as fc:
        scaler = pickle.load(fc)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "ADABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(Nt):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        rcs1loc = rcs_1[t, :]
        rcs2loc = rcs_2[t, :]
        rcs1loc[rcs1loc <= 0] = 1e-5
        rcs2loc[rcs2loc <= 0] = 1e-5

        X_new = np.array(
            [
                np.repeat(sec_intheday[t], Nz),
                z_values,
                np.log10(rcs1loc),
                np.log10(rcs2loc),
            ]
        ).T
        X_new = scaler.transform(X_new)

        # 3. Apply the machine learning algorithm
        # ---------------------
        y_new = model.predict(X_new)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(y_new, z_values))

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        outputFile = dataFile[:-3] + ".out.nc"

    # 5. Store the new BLH estimation into a copy of the original netCDF
    if storeInNetcdf:
        utils.add_blh_to_netcdf(dataFile, outputFile, blh, origin="adabl")

    return np.array(blh)
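
A minimal call sketch, assuming pre-trained model and scaler pickles exist at the given (hypothetical) paths:

blh = adabl_blh_estimation(
    "../data_samples/lidar/DAILY_MPL_5025_20180802.nc",
    "adabl_model.pkl",  # hypothetical pickled model
    "adabl_scaler.pkl",  # hypothetical pickled scaler
    storeInNetcdf=False,
)
print(blh.shape)  # (Nt,)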
Example #16
def blh_estimation(inputFile,
                   outputFile=None,
                   storeInNetcdf=True,
                   params=None):
    '''Perform BLH estimation on all profiles of the day and write it into
    a copy of the netcdf file.
    
    [IN]
      - inputFile (str): path to the input file, as generated by raw2l1
      - outputFile (str): path to the output file. Default adds ".out" before ".nc"
      - storeInNetcdf (bool): if True, the field 'blh_kabl', containing the BLH estimation, is stored in the outputFile
      - params (dict): dict of parameters. Depends on 'n_clusters'
    
    [OUT]
      - blh (np.array[Nt]): time series of BLH as estimated by the KABL algorithm.
    '''

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2 = utils.extract_data(inputFile,
                                                          params=params)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords, z_values, rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :], params)

        # 3. Apply the machine learning algorithm
        #---------------------
        if isinstance(params['n_clusters'], int):
            labels = apply_algo(X, params['n_clusters'], params=params)

            # (3.1 OPTIONAL) Compute classification score
            classif_score = silhouette_score(X, labels)
            #ch_score=calinski_harabaz_score(X,labels)
            #db_score=davies_bouldin_score(X,labels)
        else:
            labels, n_clusters, classif_score = apply_algo_k_auto(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))

    if outputFile is None:
        outputFile = inputFile[:-3] + ".out.nc"

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh)
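
A usage sketch mirroring the defaults documented above; the input path is the sample file used elsewhere on this page:

from kabl import utils

params = utils.get_default_params()
params['n_clusters'] = 3
blh = blh_estimation('../data_samples/lidar/DAILY_MPL_5025_20180802.nc',
                     storeInNetcdf=False, params=params)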
Example #17
def kabl_qualitymetrics(inputFile,
                        outputFile=None,
                        reference='None',
                        rsFile='None',
                        storeResults=True,
                        params=None):
    '''Copy of blh_estimation, including the computation and storage of quality scores
    
    [IN]
      - inputFile (str): path to the input file, as generated by raw2l1
      - outputFile (str): path to the output file. Default adds ".out" before ".nc"
      - reference (str): path to the reference file, if any.
      - rsFile (str): path to the radiosounding estimations, if any (gives the possibility to store them in the same netcdf)
      - storeResults (bool): if True, the quality metrics are stored in the outputFile
      - params (dict): dict of parameters. Depends on 'n_clusters'
    
    [OUT]
      - errl2_blh (float): root mean squared gap between BLH from KABL and the reference
      - errl1_blh (float): mean absolute gap between BLH from KABL and the reference
      - errl0_blh (float): maximum absolute gap between BLH from KABL and the reference
      - ch_score (float): average over the whole day of the Calinski-Harabasz score (the higher, the better)
      - db_scores (float): average over the whole day of the Davies-Bouldin score (the lower, the better)
      - s_scores (float): average over the whole day of the silhouette score (the higher, the better)
      - chrono (float): computation time for the full day (seconds)
      - n_invalid (int): number of BLH estimations at NaN or Inf
    '''

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2, blh_mnf, rr, vv, cbh = utils.extract_data(
        inputFile,
        to_extract=['rcs_1', 'rcs_2', 'pbl', 'rr', 'vv', 'b1'],
        params=params)

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords,
                            z_values,
                            rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :],
                            params=params)

        # 3. Apply the machine learning algorithm
        #---------------------

        if isinstance(params['n_clusters'], int):
            n_clusters = params['n_clusters']
            labels = apply_algo(X, params['n_clusters'], params=params)

            # Compute classification score
            if len(np.unique(labels)) > 1:
                with np.errstate(
                        divide='ignore', invalid='ignore'
                ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = inputFile.split('/')[-1]
        outputFile = "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ['BLH_KABL', 'BLH_INDUS']
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append('BLH_REF')

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(outputFile, t_values, BLHS, BLH_NAMES,
                                        [s_scores, db_scores, ch_scores],
                                        ['SILH', 'DB', 'CH'], [rr, vv],
                                        ['MASK_RAIN', 'MASK_FOG'], K_values,
                                        chrono, params)

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        # graphics.blhs_over_data(t_values,z_values,rcs_1,BLHS,[s[4:] for s in BLH_NAMES],
        # blh_rs=blh_rs,storeImages=True,showFigure=False)
        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref)**2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return errl2_blh, errl1_blh, errl0_blh, corr_blh, np.mean(
        ch_scores), np.mean(db_scores), np.mean(s_scores), chrono, n_invalid
Example #18
def kabl_qualitymetrics(
    inputFile,
    outputFile=None,
    reference="None",
    rsFile="None",
    storeResults=True,
    params=None,
):
    """Estimate quality metrics of KABL for one day of measurement.
    
    This function perform the BLH estimation as in
    kabl.core.blh_estimation but its output are the quality metrics, not
    the BLH estimation. As the estimation of quality metrics is greedier
    this function is noticeably longer to execute.
    
    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    
    outputFile : str, default=None
        Path to the output file
    
    reference : str, default="None"
        Path to handmade BLH estimation, if any, which will serve
        as reference.
    
    rsFile : str, default="None"
        Path to the radiosounding estimations, if any. Gives the
        possibility to store them in the same netcdf
    
    storeResults : bool, default=True
        If True, quality metrics are stored in the `outputFile`
    
    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'
    
    
    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from KABL and the reference
        .. math:: \sqrt{1/N \sum_i^N (Z(i)-Zref(i))^2}
    
    errl1_blh : float
        Mean absolute gap between BLH from KABL and the reference
        .. math:: 1/N \sum_i^N \vert Z(i)-Zref(i) \vert
      
    errl0_blh : float
        Maximum absolute gap between BLH from KABL and the reference
        .. math:: \max_i \vert Z(i)-Zref(i) \vert
    
    ch_score : float
        Average Calinski-Harabasz score (the higher, the better) over
        the full day
        
    db_scores : float
        Average Davies-Bouldin score (the lower, the better) over
        the full day
    
    s_scores : float
        Average silhouette score (the higher, the better) over
        the full day
    
    chrono : float
        Computation time for the full day (seconds)
    
    n_invalid : int
        Number of BLH estimations at NaN or Inf
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, dat = utils.extract_data(
        inputFile, to_extract=["rcs_1", "rcs_2", "pbl", "rr", "vv", "b1"], params=params
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    rr = dat["rr"]
    vv = dat["vv"]
    cbh = dat["b1"]

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)
        X, Z = prepare_data(
            coords,
            z_values,
            rcss={"rcs_1": rcs_1[t_back : t + 1, :], "rcs_2": rcs_2[t_back : t + 1, :]},
            params=params,
        )

        # 3. Apply the machine learning algorithm
        # ---------------------

        if isinstance(params["n_clusters"], int):
            n_clusters = params["n_clusters"]
            labels = apply_algo(X, params["n_clusters"], params=params)

            # Compute classification score
            if len(np.unique(labels)) > 1:
                with np.errstate(
                    divide="ignore", invalid="ignore"
                ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params
            )

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = os.path.split(inputFile)[-1]
        outputFile = os.path.join(
            paths.resultrootdir, "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"
        )

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ["BLH_KABL", "BLH_INDUS"]
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append("BLH_REF")

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(
            outputFile,
            t_values,
            BLHS,
            BLH_NAMES,
            [s_scores, db_scores, ch_scores],
            ["SILH", "DB", "CH"],
            [rr, vv],
            ["MASK_RAIN", "MASK_FOG"],
            K_values,
            chrono,
            params,
        )

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return (
        errl2_blh,
        errl1_blh,
        errl0_blh,
        corr_blh,
        np.mean(ch_scores),
        np.mean(db_scores),
        np.mean(s_scores),
        chrono,
        n_invalid,
    )
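
A sketch of a benchmarking call, leaving the reference and radiosounding files at their "None" defaults; the path helper is the one used in other examples:

from kabl import paths

(errl2, errl1, errl0, corr, ch, db, sil,
 chrono, n_invalid) = kabl_qualitymetrics(
    paths.file_defaultlidardata(), storeResults=False
)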
Example #19
labels = core.apply_algo(X, 3)
blh = utils.blh_from_labels(labels, Z)

blhs_over_profile(z_values, rcs_1, blh, labels=labels)

plt.figure()
plt.hist(rcs_1, 35)
plt.title("Histogram of a single profile of RCS")
plt.show(block=False)

# Test of blhs_over_data
# ------------------------
print("\n --------------- Test of blhs_over_data")
testFile = paths.file_defaultlidardata()
blh = core.blh_estimation(testFile)
t_values, z_values, rcss = utils.extract_data(testFile)
rcs_1 = rcss["rcs_1"]
rcs_2 = rcss["rcs_2"]

blhs_over_data(t_values, z_values, rcs_1, blh)

# Test of scatterplot_blhs
# ------------------------
print("\n --------------- Test of scatterplot_blhs")
outputFile = paths.file_defaultoutput()
t_values, z_values, dat = utils.extract_data(
    outputFile, to_extract=["blh_kabl", "pbl"]
)
blh_new = dat["blh_kabl"]
blh_mnf  = dat["pbl"]
scatterplot_blhs(t_values, blh_mnf[:, 0], blh_new)
Example #20
def blh_estimation_returnlabels(
    inputFile, outputFile=None, storeInNetcdf=False, params=None
):
    """Perform BLH estimation on all profiles of the day and return the labels
    of the classification.
    
    
    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    
    outputFile : str, default=None
        Path to the output file. Default adds ".out" before ".nc"
    
    storeInNetcdf : bool, default=False
        If True, the field 'blh_kabl', containing the BLH estimation, is
        stored in the outputFile
    
    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'
    
    
    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the KABL algorithm
    
    zoneID : ndarray of shape (Nt,Nz)
        Cluster labels of every profile
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    needed_data = np.unique(np.concatenate(list(params["predictors"].values())))
    t_values, z_values, rcss = utils.extract_data(
        inputFile, to_extract=needed_data, params=params
    )

    if "rcs_0" in needed_data:
        rcs_0 = rcss["rcs_0"]
    if "rcs_1" in needed_data:
        rcs_1 = rcss["rcs_1"]
    if "rcs_2" in needed_data:
        rcs_2 = rcss["rcs_2"]

    blh = []
    zoneID = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)

        rcss = {}
        if "rcs_0" in needed_data:
            rcss["rcs_0"] = rcs_0[t_back : t + 1, :]
        if "rcs_1" in needed_data:
            rcss["rcs_1"] = rcs_1[t_back : t + 1, :]
        if "rcs_2" in needed_data:
            rcss["rcs_2"] = rcs_2[t_back : t + 1, :]

        X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)

        # 3. Apply the machine learning algorithm
        # ---------------------
        if isinstance(params["n_clusters"], int):
            labels = apply_algo(X, params["n_clusters"], params=params)
        else:
            labels, n_clusters, classif_score = apply_algo_k_auto(X, params=params)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        zoneID.append(labels)

    if outputFile is None:
        outputFile = paths.file_defaultoutput()

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    # ---------------------
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh), np.array(zoneID)
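
A usage sketch; zoneID holds the per-profile cluster labels, which can be displayed as a time-height classification map:

from kabl import paths

blh, zoneID = blh_estimation_returnlabels(paths.file_defaultlidardata())
print(blh.shape, zoneID.shape)  # (Nt,) and (Nt, Nz)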