Example #1
def test_prepare_data_multiprof():

    n_profiles = 3
    testFile = paths.file_defaultlidardata()
    t_values, z_values, rcss = utils.extract_data(
        testFile, to_extract=["rcs_1", "rcs_2"])
    rcs_1 = rcss["rcs_1"]
    rcs_2 = rcss["rcs_2"]

    params = utils.get_default_params()
    params["predictors"] = {
        "day": ["rcs_1", "rcs_2"],
        "night": ["rcs_1", "rcs_2"]
    }

    loc, dateofday, lat, lon = utils.where_and_when(testFile)
    t = 55
    coords = {
        "time": dt.datetime.utcfromtimestamp(t_values[t]),
        "lat": lat,
        "lon": lon
    }
    t_back = max(t - n_profiles + 1, 0)
    rcss = {"rcs_1": rcs_1[t_back:t + 1, :], "rcs_2": rcs_2[t_back:t + 1, :]}

    X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)

    assert X.shape == (438, 2) and Z.shape == (438, )
Example #2
def test_apply_algo_k_auto():

    X, y = make_blobs(n_samples=100, centers=3, random_state=418)

    params = utils.get_default_params()
    params["init"] = "advanced"

    labels, K, sc = apply_algo_k_auto(X, params=params)

    # cluster identification numbers are random. Only borders matter
    assert np.array_equal(np.diff(labels) == 0, np.diff(y) == 0) and K == 3
Example #3
def test_apply_algo_k_3scores():

    X, y = make_blobs(n_samples=100, centers=3, random_state=418)

    params = utils.get_default_params()
    params["init"] = "advanced"

    labels, K, sil, db, ch = apply_algo_k_3scores(X, params=params)

    # cluster identification numbers are random. Only borders matter
    assert K == 3 and sil == silhouette_score(
        X, y) and db == davies_bouldin_score(
            X, y) and ch == calinski_harabaz_score(X, y)
Example #4
def test_prepare_data_singleprof():

    testFile = paths.file_defaultlidardata()
    z_values, rcs_1, rcs_2, coords = utils.extract_testprofile(
        testFile, profile_id=2, return_coords=True)

    params = utils.get_default_params()
    params["predictors"] = {
        "day": ["rcs_1", "rcs_2"],
        "night": ["rcs_1", "rcs_2"]
    }

    X, Z = prepare_data(coords,
                        z_values,
                        rcss={
                            "rcs_1": rcs_1,
                            "rcs_2": rcs_2
                        },
                        params=params)

    assert X.shape == (146, 2) and Z.shape == (146, )
Example #5
def test_prepare_data_cl31():

    n_profiles = 3
    testFile = paths.file_defaultcl31data()
    t_values, z_values, rcss = utils.extract_data(testFile,
                                                  to_extract=["rcs_0"])
    rcs_0 = rcss["rcs_0"]

    params = utils.get_default_params()
    params["predictors"] = {"day": ["rcs_0"], "night": ["rcs_0"]}

    loc, dateofday, lat, lon = utils.where_and_when(testFile)
    t = 55
    coords = {
        "time": dt.datetime.utcfromtimestamp(t_values[t]),
        "lat": lat,
        "lon": lon
    }
    t_back = max(t - n_profiles + 1, 0)
    rcs_0 = rcs_0[t_back:t + 1, :]

    X, Z = prepare_data(coords, z_values, rcss={"rcs_0": rcs_0}, params=params)

    assert X.shape == (1347, 1) and Z.shape == (1347, )
Example #6
def kabl_qualitymetrics(inputFile,
                        outputFile=None,
                        reference='None',
                        rsFile='None',
                        storeResults=True,
                        params=None):
    '''Copy of blh_estimation that also computes and stores quality scores.
    
    [IN]
      - inputFile (str): path to the input file, as generated by raw2l1
      - outputFile (str): path to the output file. Default adds ".out" before ".nc"
      - reference (str): path to the reference file, if any.
      - rsFile (str): path to the radiosounding estimations, if any (gives the possibility to store them in the same netCDF)
      - storeResults (bool): if True, the field 'blh_ababl', containing the BLH estimation, is stored in the outputFile
      - params (dict): dict of parameters. Depends on 'n_clusters'
    
    [OUT]
      - errl2_blh (float): root mean squared gap between BLH from KABL and the reference
      - errl1_blh (float): mean absolute gap between BLH from KABL and the reference
      - errl0_blh (float): maximum absolute gap between BLH from KABL and the reference
      - ch_score (float): Calinski-Harabasz score averaged over the full day (the higher, the better)
      - db_scores (float): Davies-Bouldin score averaged over the full day (the lower, the better)
      - s_scores (float): silhouette score averaged over the full day (the higher, the better)
      - chrono (float): computation time for the full day (seconds)
      - n_invalid (int): number of BLH estimations that are NaN or Inf
    '''

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2, blh_mnf, rr, vv, cbh = utils.extract_data(
        inputFile,
        to_extract=['rcs_1', 'rcs_2', 'pbl', 'rr', 'vv', 'b1'],
        params=params)

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords,
                            z_values,
                            rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :],
                            params=params)

        # 3. Apply the machine learning algorithm
        #---------------------

        if isinstance(params['n_clusters'], int):
            n_clusters = params['n_clusters']
            labels = apply_algo(X, params['n_clusters'], params=params)

            # Compute classification score
            if len(np.unique(labels)) > 1:
                with np.errstate(
                        divide='ignore', invalid='ignore'
                ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = inputFile.split('/')[-1]
        outputFile = "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ['BLH_KABL', 'BLH_INDUS']
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append('BLH_REF')

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(outputFile, t_values, BLHS, BLH_NAMES,
                                        [s_scores, db_scores, ch_scores],
                                        ['SILH', 'DB', 'CH'], [rr, vv],
                                        ['MASK_RAIN', 'MASK_FOG'], K_values,
                                        chrono, params)

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        # graphics.blhs_over_data(t_values,z_values,rcs_1,BLHS,[s[4:] for s in BLH_NAMES],
        # blh_rs=blh_rs,storeImages=True,showFigure=False)
        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref)**2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return errl2_blh, errl1_blh, errl0_blh, corr_blh, np.mean(
        ch_scores), np.mean(db_scores), np.mean(s_scores), chrono, n_invalid
Example #7
def apply_algo_k_3scores(X, params=None, quiet=True):
    '''Adaptation of apply_algo_k_auto to the benchmark context.
    
    [IN]
        - X (np.array[N,p]): design matrix to put in input of the algorithm. Each line is an observation, each column is a predictor.
        - params (dict): dict with all settings. Depends on 'max_k', 'n_clusters'
        - quiet (bool): if True, all prints are skipped
        
    [OUT]
        - labels (np.array[N]): vector of cluster number attribution
            BEWARE: the cluster identification numbers are random. Only borders matter.
        - n_clusters_opt (int): optimal number of clusters found in the data
        - s_score, db_score, ch_score (float): silhouette, Davies-Bouldin and Calinski-Harabasz scores of the returned classification.
    '''

    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several number of clusters
    all_labels = []
    s_scores = []
    db_scores = []
    ch_scores = []
    for n_clusters in range(2, params['max_k'] + 1):

        labels = apply_algo(X, n_clusters, params=params)
        all_labels.append(labels)

        if len(np.unique(labels)) > 1:
            with np.errstate(
                    divide='ignore', invalid='ignore'
            ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                db_scores.append(davies_bouldin_score(X, labels))
            s_scores.append(silhouette_score(X, labels))
            ch_scores.append(calinski_harabaz_score(X, labels))
        else:
            db_scores.append(np.nan)
            s_scores.append(np.nan)
            ch_scores.append(np.nan)

    # Choose the best number of clusters
    valid = True
    if params['classif_score'] in ['silhouette', 'silh']:
        k_best = np.nanargmax(s_scores)
        if s_scores[k_best] < 0.6:
            if not quiet:
                print("Bad classification according to silhouette score (",
                      s_scores[k_best], "). BLH is thus NaN")
            valid = False
    elif params['classif_score'] in ['davies_bouldin', 'db']:
        k_best = np.nanargmin(db_scores)
        if db_scores[k_best] > 0.4:
            if not quiet:
                print("Bad classification according to Davies-Bouldin score (",
                      db_scores[k_best], "). BLH is thus NaN")
            valid = False
    else:
        k_best = np.nanargmax(ch_scores)
        if ch_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    ch_scores[k_best], "). BLH is thus NaN")
            valid = False

    if all(np.isnan(db_scores)):
        valid = False

    # Return the results
    if valid:
        result = all_labels[k_best], k_best + 2, s_scores[k_best], db_scores[
            k_best], ch_scores[k_best]
    else:
        result = None, np.nan, s_scores[k_best], db_scores[k_best], ch_scores[
            k_best]

    return result
Example #8
def blh_estimation(inputFile,
                   outputFile=None,
                   storeInNetcdf=True,
                   params=None):
    '''Perform BLH estimation on all profiles of the day and write it into
    a copy of the netcdf file.
    
    [IN]
      - inputFile (str): path to the input file, as generated by raw2l1
      - outputFile (str): path to the output file. Default adds ".out" before ".nc"
      - storeInNetcdf (bool): if True, the field 'blh_ababl', containing the BLH estimation, is stored in the outputFile
      - params (dict): dict of parameters. Depends on 'n_clusters'
    
    [OUT]
      - blh (np.array[Nt]): time series of BLH as estimated by the KABL algorithm.
    '''

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2 = utils.extract_data(inputFile,
                                                          params=params)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords, z_values, rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :], params)

        # 3. Apply the machine learning algorithm
        #---------------------
        if isinstance(params['n_clusters'], int):
            labels = apply_algo(X, params['n_clusters'], params=params)

            # (3.1 OPTIONAL) Compute classification score
            classif_score = silhouette_score(X, labels)
            #ch_score=calinski_harabaz_score(X,labels)
            #db_score=davies_bouldin_score(X,labels)
        else:
            labels, n_clusters, classif_score = apply_algo_k_auto(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))

    if outputFile is None:
        outputFile = inputFile[:-3] + ".out.nc"

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh)
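
A minimal usage sketch (not from the original source), assuming the module layout referenced in the docstrings (kabl.core, kabl.utils) and the sample file helper kabl.paths.file_defaultlidardata() used in the tests above:

# Hedged usage sketch: module names are assumptions taken from the docstrings above.
from kabl import core, paths, utils

params = utils.get_default_params()
params["n_clusters"] = 3  # fixed K: skips the automatic selection of the number of clusters

blh = core.blh_estimation(
    paths.file_defaultlidardata(), storeInNetcdf=False, params=params
)
print(blh.shape)  # (Nt,): one BLH value per profile of the day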
Example #9
def apply_algo(X, n_clusters, init_codification=None, params=None):
    '''Apply the machine learning algorithm on the prepared data.
    
    [IN]
        - X (np.array[N,p]): design matrix to put in input of the algorithm. Each line is an observation, each column is a predictor.
        - n_clusters (int): number of clusters to be found in the data
        - init_codification (dict): dict linking the initialisation strategy to the actual algorithm inputs.
            Keys are the three available strategies:
                'random': pick an observation at random as starting point (both KMeans and GMM)
                'advanced': more sophisticated way to initialize
                'given': start at explicitly passed point coordinates,
                + the special key 'token', which gives the explicit point coordinates to use when the strategy is 'given'
            Values are dictionaries with, as key, the algorithm name and, as value, the corresponding input in Scikit-learn.
            For 'token', the value is a list of np.arrays (explicit point coordinates)
        - params (dict): dict of parameters. Depends on 'algo', 'n_inits', 'init', 'cov_type'
        
    [OUT]
        - labels (np.array[N]): vector of cluster number attribution
            BEWARE: the cluster identification numbers are random. Only borders matter.'''

    if params is None:
        params = utils.get_default_params()

    if init_codification is None:
        init_codification={ 'random':
                                {'kmeans':'random','gmm':'random'},
                            'advanced':
                                {'kmeans':'k-means++','gmm':'kmeans'},
                            'given':# When initialization is 'given', the values are given in the 'token' field
                                {'kmeans':'token','gmm':'kmeans'},
                            'token':    # trick to specify centroids in one object
                                [np.array([-2.7,-0.7]), # 2 clusters
                                    np.array([-2.7,-0.7,1]), # 3 clusters
                                    np.array([-3.9,-2.7,-0.7,1]), # 4 clusters
                                    np.array([-3.9,-2.7,-1.9,-0.7,1]), # 5 clusters
                                    np.array([-3.9,-2.7,-1.9,-0.7,0,1])] # 6 clusters
                            }
    initialization = init_codification[params['init']][params['algo']]

    # When initialization is 'given', the values are given in the 'token' field
    # The values are accessed afterward to keep the dict init_codification not too hard to read...
    if initialization == 'token':
        # Given values are repeated in all predictors
        n_predictors = X.shape[1]
        initialization = np.repeat(init_codification['token'][n_clusters - 2],
                                   n_predictors).reshape(
                                       (n_clusters, n_predictors))

    if params['algo'] == 'kmeans':
        kmeans = KMeans(n_clusters=n_clusters,
                        n_init=params['n_inits'],
                        init=initialization)
        kmeans.fit(X)
        labels = kmeans.predict(X)
    elif params['algo'] == 'gmm':
        gmm = GaussianMixture(n_components=n_clusters,
                              covariance_type=params['cov_type'],
                              n_init=params['n_inits'],
                              init_params=initialization)
        gmm.fit(X)
        labels = gmm.predict(X)

    return labels
Example #10
def apply_algo_k_auto(X, init_codification=None, quiet=True, params=None):
    '''Apply the machine learning algorithm for various numbers of
    clusters and choose the best according to a certain score.
    
    [IN]
        - X (np.array[N,p]): design matrix to put in input of the algorithm. Each line is an observation, each column is a predictor.
        - init_codification (dict): dict to link initialisation strategy with actual algorithm inputs. See kabl.core.apply_algo
        - quiet (boolean): if True, cut down all prints
        - params (dict): dict with all settings. Depends on 'max_k', 'n_clusters'
        
    [OUT]
        - labels (np.array[N]): vector of cluster number attribution
            BEWARE: the cluster identification numbers are random. Only borders matter.
        - n_clusters_opt (int): optimal number of clusters found in the data
        - classif_scores (float): value of the classification score (chosen with params['classif_score']) for the returned classification.
    '''

    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several number of clusters
    all_labels = []
    classif_scores = []
    for n_clusters in range(2, params['max_k']):

        labels = apply_algo(X,
                            n_clusters,
                            init_codification=init_codification,
                            params=params)
        all_labels.append(labels)

        if params['classif_score'] in ['silhouette', 'silh']:
            classif_scores.append(silhouette_score(X, labels))
        elif params['classif_score'] in ['davies_bouldin', 'db']:
            with np.errstate(
                    divide='ignore', invalid='ignore'
            ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                classif_scores.append(davies_bouldin_score(X, labels))
        else:  # Default because fastest
            classif_scores.append(calinski_harabaz_score(X, labels))

    # Choose the best number of clusters
    if params['classif_score'] in ['silhouette', 'silh']:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 0.5:
            if not quiet:
                print("Bad classification according to silhouette score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None
    elif params['classif_score'] in ['davies_bouldin', 'db']:
        k_best = np.argmin(classif_scores)
        if classif_scores[k_best] > 0.36:
            if not quiet:
                print("Bad classification according to Davies-Bouldin score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None
    else:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    classif_scores[k_best], "). BLH is thus NaN")
            k_best = None

    # Return the results
    if k_best is not None:
        result = all_labels[k_best], k_best + 2, classif_scores[k_best]
    else:
        result = None, None, None

    return result
Example #11
def blh_estimation_returnlabels(
    inputFile, outputFile=None, storeInNetcdf=False, params=None
):
    """Perform BLH estimation on all profiles of the day and return the labels
    of the classification.
    
    
    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    
    outputFile : str, default=None
        Path to the output file. Default adds ".out" before ".nc"
    
    storeInNetcdf : bool, default=False
        If True, the field 'blh_kabl', containing the BLH estimation, is
        stored in the outputFile
    
    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'
    
    
    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the KABL algorithm
    
    zoneID : ndarray of shape (Nt,Nz)
        Cluster labels of every profile
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    needed_data = np.unique(np.concatenate(list(params["predictors"].values())))
    t_values, z_values, rcss = utils.extract_data(
        inputFile, to_extract=needed_data, params=params
    )

    if "rcs_0" in needed_data:
        rcs_0 = rcss["rcs_0"]
    if "rcs_1" in needed_data:
        rcs_1 = rcss["rcs_1"]
    if "rcs_2" in needed_data:
        rcs_2 = rcss["rcs_2"]

    blh = []
    zoneID = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)

        rcss = {}
        if "rcs_0" in needed_data:
            rcss["rcs_0"] = rcs_0[t_back : t + 1, :]
        if "rcs_1" in needed_data:
            rcss["rcs_1"] = rcs_1[t_back : t + 1, :]
        if "rcs_2" in needed_data:
            rcss["rcs_2"] = rcs_2[t_back : t + 1, :]

        X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)

        # 3. Apply the machine learning algorithm
        # ---------------------
        if isinstance(params["n_clusters"], int):
            labels = apply_algo(X, params["n_clusters"], params=params)
        else:
            labels, n_clusters, classif_score = apply_algo_k_auto(X, params=params)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        zoneID.append(labels)

    if outputFile is None:
        outputFile = paths.file_defaultoutput()

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    # ---------------------
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh), np.array(zoneID)
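
A minimal usage sketch (not from the original source), assuming the same kabl module layout as above; with n_profiles=1 every profile yields one label per altitude gate, so zoneID has the documented (Nt, Nz) shape:

# Hedged usage sketch: module names are assumptions taken from the docstrings above.
import numpy as np
from kabl import core, paths, utils

params = utils.get_default_params()
params["n_profiles"] = 1  # one profile at a time, so every row of zoneID has length Nz

blh, zoneID = core.blh_estimation_returnlabels(
    paths.file_defaultlidardata(), storeInNetcdf=False, params=params
)
print(blh.shape, zoneID.shape)  # (Nt,) and (Nt, Nz)
print(np.unique(zoneID[0]))     # cluster identifiers of the first profile (arbitrary numbering)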
Example #12
def apply_algo_k_3scores(X, quiet=True, params=None):
    """Adapation of kabl.core.apply_algo_k_auto in benchmark context.
    
    Parameters
    ----------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm. Each line is an
        observation, each column is a predictor.
    
    quiet : bool, default=True
        If True, cut down all prints
        
    params : dict, default=None
        Dict with all settings. This function depends on 'max_k', 'n_clusters'
        
    Returns
    -------
    labels : ndarray of shape (N,)
        Vector of cluster number attribution
        BEWARE: the cluster identification numbers are random. Only
        borders matter.
    
    n_clusters_opt : int
        Optimal number of clusters to be found in the data
    
    s_score, db_score, ch_score : float
        Silhouette, Davies-Bouldin and Calinski-Harabasz scores of the
        returned classification.
    """

    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several number of clusters
    all_labels = []
    s_scores = []
    db_scores = []
    ch_scores = []
    for n_clusters in range(2, params["max_k"] + 1):

        labels = apply_algo(X, n_clusters, params=params)
        all_labels.append(labels)

        if len(np.unique(labels)) > 1:
            with np.errstate(
                divide="ignore", invalid="ignore"
            ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                db_scores.append(davies_bouldin_score(X, labels))
            s_scores.append(silhouette_score(X, labels))
            ch_scores.append(calinski_harabaz_score(X, labels))
        else:
            db_scores.append(np.nan)
            s_scores.append(np.nan)
            ch_scores.append(np.nan)

    # Choose the best number of clusters
    valid = True
    if params["classif_score"] in ["silhouette", "silh"]:
        k_best = np.nanargmax(s_scores)
        if s_scores[k_best] < 0.6:
            if not quiet:
                print(
                    "Bad classification according to silhouette score (",
                    s_scores[k_best],
                    "). BLH is thus NaN",
                )
            valid = False
    elif params["classif_score"] in ["davies_bouldin", "db"]:
        k_best = np.nanargmin(db_scores)
        if db_scores[k_best] > 0.4:
            if not quiet:
                print(
                    "Bad classification according to Davies-Bouldin score (",
                    db_scores[k_best],
                    "). BLH is thus NaN",
                )
            valid = False
    else:
        k_best = np.nanargmax(ch_scores)
        if ch_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    ch_scores[k_best],
                    "). BLH is thus NaN",
                )
            valid = False

    if all(np.isnan(db_scores)):
        valid = False

    # Return the results
    if valid:
        result = (
            all_labels[k_best],
            k_best + 2,
            s_scores[k_best],
            db_scores[k_best],
            ch_scores[k_best],
        )
    else:
        result = None, np.nan, s_scores[k_best], db_scores[k_best], ch_scores[k_best]

    return result
Example #13
def prepare_data(coords, z_values, rcss, params=None):
    """Put the data in form to fulfil algorithm requirements.
    
    Five operations are carried out in this function:
      0. Check and reshape inputs
      1. Distinguish night and day for predictors
      2. Concatenate the profiles
      3. Take the logarithm of range-corrected signal
      4. Apply also a standard normalisation (remove mean and divide by
    standard deviation).
    
    
    Parameters
    ----------
    coords : dict
        Time and space coordinate. The dict must have 3 keys:
        'time' (datetime): time of the profile
        'lat' (float): latitude of the measurement site
        'lon' (float): longitude of the measurement site
    
    z_values : array-like of shape (nZ,)
        Vector of altitude values
    
    rcss : dict
        Input data, in the form of a dict of named matrices.
        Example: rcss={"rcs_0":rcs_0, "rcs_1":rcs_1} where rcs_0 and
        rcs_1 are ndarray of shape (nT,nZ)
    
    params : dict
        Dict with all settings. This function depends on 'n_profiles',
        'predictors', 'sunrise_shift', 'sunset_shift'.
      
    Returns
    -------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm.
        Each line is an observation, each column is a predictor.
    
    Z : ndarray of shape (N,)
        Vector of altitudes for each observation.
    """

    if params is None:
        params = utils.get_default_params()

    # 0. Check and reshape inputs
    # ---------------------------
    needed_data = np.unique(np.concatenate(list(params["predictors"].values())))

    if set(rcss.keys()) != set(needed_data):
        raise Exception("Wrong input data provided.")

    if "rcs_0" in needed_data:
        rcs_0 = rcss["rcs_0"]
        try:
            Nt, Nz = rcs_0.shape
        except ValueError:
            Nz = rcs_0.size
            Nt = 1
    if "rcs_1" in needed_data:
        rcs_1 = rcss["rcs_1"]
        try:
            Nt, Nz = rcs_1.shape
        except ValueError:
            Nz = rcs_1.size
            Nt = 1
    if "rcs_2" in needed_data:
        rcs_2 = rcss["rcs_2"]
        try:
            Nt, Nz = rcs_2.shape
        except ValueError:
            Nz = rcs_2.size
            Nt = 1

    # 1. Distinguish night and day for predictors
    # --------------------------------------------
    t = coords["time"]
    timeofday = t.strftime("%H:%M")
    dateofday = t.strftime("%Y%m%d")

    s = Sun(lat=coords["lat"], long=coords["lon"])
    sunrise = s.sunrise(t)
    sunset = s.sunset(t)

    sunrise = dt.datetime(
        t.year, t.month, t.day, sunrise.hour, sunrise.minute, sunrise.second
    ) + dt.timedelta(hours=params["sunrise_shift"])
    sunset = dt.datetime(
        t.year, t.month, t.day, sunset.hour, sunset.minute, sunset.second
    ) + dt.timedelta(hours=params["sunset_shift"])

    if t >= sunrise and t <= sunset:
        nightorday = "day"
    else:
        nightorday = "night"

    predictors = params["predictors"][nightorday]

    # 2. Concatenate the profiles
    # ----------------------------
    if Nt > 1:
        Z = np.tile(z_values, Nt)
    else:
        Z = z_values

    X = []
    if "rcs_0" in predictors:
        if rcs_0 is None:
            raise ValueError("Missing argument rcs_0 in kabl.core.prepare_data")
        X.append(rcs_0.ravel())
    if "rcs_1" in predictors:
        if rcs_1 is None:
            raise ValueError("Missing argument rcs_1 in kabl.core.prepare_data")
        X.append(rcs_1.ravel())
    if "rcs_2" in predictors:
        if rcs_2 is None:
            raise ValueError("Missing argument rcs_2 in kabl.core.prepare_data")
        X.append(rcs_2.ravel())

    # 3. Take the logarithm of range-corrected signal
    # ------------------------------------------------
    X = np.array(X).T
    X[X <= 0] = 1e-5
    X = np.log10(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # 4. Normalisation: remove mean and divide by standard deviation
    # ---------------------------------------------------------------
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    return X, Z
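
A minimal sketch of prepare_data on synthetic inputs (not real lidar data), assuming the import paths kabl.core and kabl.utils; only the shapes matter here:

# Hedged sketch: fake signals, assumed import paths.
import datetime as dt
import numpy as np
from kabl import utils
from kabl.core import prepare_data

params = utils.get_default_params()
params["predictors"] = {"day": ["rcs_1", "rcs_2"], "night": ["rcs_1", "rcs_2"]}

nT, nZ = 3, 50
z_values = np.linspace(15.0, 1500.0, nZ)
rng = np.random.default_rng(0)
rcss = {
    "rcs_1": rng.lognormal(size=(nT, nZ)),  # strictly positive fake signals
    "rcs_2": rng.lognormal(size=(nT, nZ)),
}
coords = {"time": dt.datetime(2020, 6, 1, 12, 0), "lat": 48.4, "lon": -4.4}

X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)
print(X.shape, Z.shape)  # (150, 2) and (150,): one row per gate and profile, one column per predictor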
Example #14
def apply_algo(X, n_clusters, init_codification=None, params=None):
    """Apply the machine learning algorithm on the prepared data.
    
    Parameters
    ----------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm. Each line is an
        observation, each column is a predictor.
    
    n_clusters : int
        Number of clusters to be found in the data
    
    init_codification : dict, default=None
        Links the initialisation strategy to the actual algorithm inputs.
        Keys are the three available strategies:
            'random': pick an observation at random as starting point (both KMeans and GMM)
            'advanced': more sophisticated way to initialize
            'given': start at explicitly passed point coordinates,
            + the special key 'token', which gives the explicit point coordinates to use when the strategy is 'given'
        Values are dictionaries with, as key, the algorithm name and, as value, the corresponding input in Scikit-learn.
        For 'token', the value is a list of np.arrays (explicit point coordinates)
    
    params : dict, default=None
        Dict with all settings. This function depends on 'algo',
        'n_inits', 'init', 'cov_type'
        
    Returns
    -------
    labels : ndarray of shape (N,)
        Vector of cluster number attribution
        BEWARE: the cluster identification numbers are random.
        Only borders matter.
    """

    if params is None:
        params = utils.get_default_params()

    if init_codification is None:
        init_codification = {
            "random": {"kmeans": "random", "gmm": "random"},
            "advanced": {"kmeans": "k-means++", "gmm": "kmeans"},
            "given": {  # When initialization is 'given', the values are given in the 'token' field
                "kmeans": "token",
                "gmm": "kmeans",
            },
            "token": [  # trick to specify centroids in one object
                np.array([-2.7, -0.7]),  # 2 clusters
                np.array([-2.7, -0.7, 1]),  # 3 clusters
                np.array([-3.9, -2.7, -0.7, 1]),  # 4 clusters
                np.array([-3.9, -2.7, -1.9, -0.7, 1]),  # 5 clusters
                np.array([-3.9, -2.7, -1.9, -0.7, 0, 1]),  # 6 clusters
            ],
        }
    initialization = init_codification[params["init"]][params["algo"]]

    # When initialization is 'given', the values are given in the 'token' field
    # The values are accessed afterward to keep the dict init_codification not too hard to read...
    if initialization == "token":
        # Given values are repeated in all predictors
        n_predictors = X.shape[1]
        initialization = np.repeat(
            init_codification["token"][n_clusters - 2], n_predictors
        ).reshape((n_clusters, n_predictors))

    if params["algo"] == "kmeans":
        kmeans = KMeans(
            n_clusters=n_clusters, n_init=params["n_inits"], init=initialization
        )
        kmeans.fit(X)
        labels = kmeans.predict(X)
    elif params["algo"] == "gmm":
        gmm = GaussianMixture(
            n_components=n_clusters,
            covariance_type=params["cov_type"],
            n_init=params["n_inits"],
            init_params=initialization,
        )
        gmm.fit(X)
        labels = gmm.predict(X)

    return labels
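
A minimal sketch of apply_algo on synthetic blobs (not from the original source), assuming the import paths kabl.core and kabl.utils; three well-separated clusters should be recovered, with arbitrary label numbering:

# Hedged sketch: assumed import paths.
import numpy as np
from sklearn.datasets import make_blobs
from kabl import utils
from kabl.core import apply_algo

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)

params = utils.get_default_params()
params["algo"] = "kmeans"
params["init"] = "advanced"  # maps to init='k-means++' for KMeans

labels = apply_algo(X, n_clusters=3, params=params)
print(np.unique(labels))  # three cluster identifiers, e.g. [0 1 2]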
Example #15
def apply_algo_k_auto(X, init_codification=None, quiet=True, params=None):
    """Apply the machine learning algorithm for various number of
    clusters and choose the best according the specified score.
    
    Parameters
    ----------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm. Each line is an
        observation, each column is a predictor.
    
    init_codification : dict, default=None
        Link initialisation strategy with actual algorithm inputs. 
        See kabl.core.apply_algo
    
    quiet : bool, default=True
        If True, cut down all prints
        
    params : dict, default=None
        Dict with all settings. This function depends on 'max_k', 'n_clusters'
    
    
    Returns
    -------
    labels : ndarray of shape (N,)
        Vector of cluster number attribution
        BEWARE: the cluster identification numbers are random. Only
        borders matter.
    
    n_clusters_opt : int
        Optimal number of clusters to be found in the data
    
    classif_scores : float
        Value of the classification score (chosen with
        params['classif_score']) for the returned classification.
    """

    if params is None:
        params = utils.get_default_params()

    # 1. Apply algorithm and compute scores for several number of clusters
    # --------------------------------------------------------------------
    all_labels = []
    classif_scores = []
    for n_clusters in range(2, params["max_k"]):

        labels = apply_algo(
            X, n_clusters, init_codification=init_codification, params=params
        )
        all_labels.append(labels)

        if params["classif_score"] in ["silhouette", "silh"]:
            classif_scores.append(silhouette_score(X, labels))
        elif params["classif_score"] in ["davies_bouldin", "db"]:
            with np.errstate(
                divide="ignore", invalid="ignore"
            ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                classif_scores.append(davies_bouldin_score(X, labels))
        else:  # Default because fastest
            classif_scores.append(calinski_harabaz_score(X, labels))

    # 2. Choose the best number of clusters
    # -------------------------------------
    if params["classif_score"] in ["silhouette", "silh"]:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 0.5:
            if not quiet:
                print(
                    "Bad classification according to silhouette score (",
                    classif_scores[k_best],
                    "). BLH is thus NaN",
                )
            k_best = None
    elif params["classif_score"] in ["davies_bouldin", "db"]:
        k_best = np.argmin(classif_scores)
        if classif_scores[k_best] > 0.36:
            if not quiet:
                print(
                    "Bad classification according to Davies-Bouldin score (",
                    classif_scores[k_best],
                    "). BLH is thus NaN",
                )
            k_best = None
    else:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    classif_scores[k_best],
                    "). BLH is thus NaN",
                )
            k_best = None

    # 3. Return the results
    # ---------------------
    if k_best is not None:
        result = all_labels[k_best], k_best + 2, classif_scores[k_best]
    else:
        result = None, None, None

    return result
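
A minimal sketch of the automatic choice of K, mirroring test_apply_algo_k_auto above (assumed import paths kabl.core and kabl.utils):

# Hedged sketch: assumed import paths; data and parameters follow the test above.
from sklearn.datasets import make_blobs
from kabl import utils
from kabl.core import apply_algo_k_auto

X, _ = make_blobs(n_samples=100, centers=3, random_state=418)

params = utils.get_default_params()
params["init"] = "advanced"

labels, K, score = apply_algo_k_auto(X, params=params)
print(K, score)  # K == 3 in the test above; the score used depends on params['classif_score']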
Example #16
def prepare_data(coords, z_values, rcs_1=None, rcs_2=None, params=None):
    '''Put the data into the form required by the algorithm.
    
    Four operations are carried out in this function:
     1. Distinguish night and day for predictors
     2. Concatenate the profiles
     3. Take the logarithm of range-corrected signal
     4. Apply also a standard normalisation (remove mean and divide by
    standard deviation).
    
    [IN]
        - coords (dict): dict with time and space coordinate
          'time' (datetime): time of the profile
          'lat' (float): latitude of the measurement site
          'lon' (float): longitude of the measurement site
        - z_values (np.array[nZ]): vector of altitude values
        - rcs_1 (np.array[nT,nZ]): matrix of co-polarized range-corrected backscatter values
        - rcs_2 (np.array[nT,nZ]): matrix of cross-polarized range-corrected backscatter values
        - params (dict): dict with all settings. Depends on 'n_profiles', 'predictors', 'sunrise_shift', 'sunset_shift'.
      
    [OUT]
        - X (np.array[N,p]): design matrix to put in input of the algorithm. Each line is an observation, each column is a predictor.
        - Z (np.array[N]): vector of altitudes for each observation.
    '''

    if params is None:
        params = utils.get_default_params()

    # 1. Distinguish night and day for predictors
    #--------------------------------------------
    t = coords['time']
    timeofday = t.strftime('%H:%M')
    dateofday = t.strftime('%Y%m%d')

    s = Sun(lat=coords['lat'], long=coords['lon'])
    sunrise = s.sunrise(t)
    sunset = s.sunset(t)

    sunrise = dt.datetime(
        t.year, t.month, t.day, sunrise.hour, sunrise.minute,
        sunrise.second) + dt.timedelta(hours=params['sunrise_shift'])
    sunset = dt.datetime(
        t.year, t.month, t.day, sunset.hour, sunset.minute,
        sunset.second) + dt.timedelta(hours=params['sunset_shift'])

    if t >= sunrise and t <= sunset:
        nightorday = 'day'
    else:
        nightorday = 'night'

    predictors = params['predictors'][nightorday]

    # 2. Concatenate the profiles
    #----------------------------
    try:
        Nt, Nz = rcs_1.shape
        Z = np.tile(z_values, Nt)
    except ValueError:
        Z = z_values

    X = []
    if 'rcs_1' in predictors:
        if rcs_1 is None:
            raise ValueError(
                "Missing argument rcs_1 in kabl.core.prepare_data")
        X.append(rcs_1.ravel())
    if 'rcs_2' in predictors:
        if rcs_2 is None:
            raise ValueError(
                "Missing argument rcs_2 in kabl.core.prepare_data")
        X.append(rcs_2.ravel())

    # 3. Take the logarithm of range-corrected signal
    #------------------------------------------------
    X = np.array(X).T
    X[X <= 0] = 1e-5
    X = np.log10(X)

    # 4. Normalisation: remove mean and divide by standard deviation
    #---------------------------------------------------------------
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    return X, Z
Example #17
import datetime as dt
import pytz
import sys
import time
import netCDF4 as nc

# The kabl modules used below (core, graphics, paths, utils) are assumed to be
# importable from the kabl package, as in the docstring references above.
from kabl import core, graphics, paths, utils

lidarFile = paths.file_defaultcl31data()

t_values, z_values, rcss = utils.extract_data(lidarFile,
                                              max_height=4620,
                                              to_extract=["rcs_0"])
rcs_0 = rcss["rcs_0"]

# Estimation with KABL
# ----------------------
params = utils.get_default_params()
params["n_clusters"] = 3
params["predictors"] = {"day": ["rcs_0"], "night": ["rcs_0"]}
params["n_profiles"] = 1
params["init"] = "advanced"

blh_kabl = core.blh_estimation(lidarFile, storeInNetcdf=False, params=params)

# Plot
# ------
graphics.storeImages = False

graphics.blhs_over_data(t_values, z_values, rcs_0, [blh_kabl], ['KABL'])

input("\n Press Enter to exit (close down all figures)\n")
Example #18
def kabl_qualitymetrics(
    inputFile,
    outputFile=None,
    reference="None",
    rsFile="None",
    storeResults=True,
    params=None,
):
    """Estimate quality metrics of KABL for one day of measurement.
    
    This function perform the BLH estimation as in
    kabl.core.blh_estimation but its output are the quality metrics, not
    the BLH estimation. As the estimation of quality metrics is greedier
    this function is noticeably longer to execute.
    
    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    
    outputFile : str, default=None
        Path to the output file
    
    reference : str, default="None"
        Path to handmade BLH estimation, if any, which will serve
        as reference.
    
    rsFile : str
        Path to the radiosounding estimations, if any. Gives the
        possibility to store them in the same netCDF
    
    storeResults : bool, default=True
        If True, quality metrics are stored in the `outputFile`
    
    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'
    
    
    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from KABL and the reference
        .. math:: \sqrt{1/N \sum_i^N (Z(i)-Zref(i))^2}
    
    errl1_blh : float
        Mean absolute gap between BLH from KABL and the reference
        .. math:: 1/N \sum_i^N \vert Z(i)-Zref(i) \vert
      
    errl0_blh : float
        Maximum absolute gap between BLH from KABL and the reference
        .. math:: \max_i \vert Z(i)-Zref(i) \vert
    
    ch_score : float
        Average Calinski-Harabasz score (the higher, the better) over
        the full day
        
    db_scores : float
        Average Davies-Bouldin score (the lower, the better) over
        the full day
    
    s_scores : float
        Average silhouette score (the higher, the better) over
        the full day
    
    chrono : float
        Computation time for the full day (seconds)
    
    n_invalid : int
        Number of BLH estimations that are NaN or Inf
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, dat = utils.extract_data(
        inputFile, to_extract=["rcs_1", "rcs_2", "pbl", "rr", "vv", "b1"], params=params
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    rr = dat["rr"]
    vv = dat["vv"]
    cbh = dat["b1"]

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)
        X, Z = prepare_data(
            coords,
            z_values,
            rcss={"rcs_1": rcs_1[t_back : t + 1, :], "rcs_2": rcs_2[t_back : t + 1, :]},
            params=params,
        )

        # 3. Apply the machine learning algorithm
        # ---------------------

        if isinstance(params["n_clusters"], int):
            n_clusters = params["n_clusters"]
            labels = apply_algo(X, params["n_clusters"], params=params)

            # Compute classification score
            if len(np.unique(labels)) > 1:
                with np.errstate(
                    divide="ignore", invalid="ignore"
                ):  # to avoid a spurious warning ("RuntimeWarning: divide by zero encountered in true_divide...")
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params
            )

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = os.path.split(inputFile)[-1]
        outputFile = os.path.join(
            paths.resultrootdir, "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"
        )

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ["BLH_KABL", "BLH_INDUS"]
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append("BLH_REF")

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(
            outputFile,
            t_values,
            BLHS,
            BLH_NAMES,
            [s_scores, db_scores, ch_scores],
            ["SILH", "DB", "CH"],
            [rr, vv],
            ["MASK_RAIN", "MASK_FOG"],
            K_values,
            chrono,
            params,
        )

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return (
        errl2_blh,
        errl1_blh,
        errl0_blh,
        corr_blh,
        np.mean(ch_scores),
        np.mean(db_scores),
        np.mean(s_scores),
        chrono,
        n_invalid,
    )
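
A usage sketch for kabl_qualitymetrics (not from the original source). The import path of the function is a guess, and the input file must contain the 'pbl', 'rr', 'vv' and 'b1' fields required above; the 9-element return tuple is unpacked into the metrics documented in the docstring.

# Hedged sketch: hypothetical import path, adjust to where the function lives in your copy.
from kabl import paths, utils
from kabl.benchmark import kabl_qualitymetrics  # hypothetical module path

params = utils.get_default_params()
(errl2, errl1, errl0, corr,
 ch_mean, db_mean, s_mean, chrono, n_invalid) = kabl_qualitymetrics(
    paths.file_defaultlidardata(), storeResults=False, params=params
)
print("RMSE = %.1f m, MAE = %.1f m, corr = %.2f, time = %.1f s, invalid = %d"
      % (errl2, errl1, corr, chrono, n_invalid))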