Пример #1
0
def test_train_adabl_save():
    """Train ADABL with result saving enabled and check the pickles exist.

    The test passes when both the classifier pickle and the scaler pickle
    are found in the directory returned by ``paths.dir_trainedadabl()``.
    """
    train_adabl(paths.file_labelleddataset(), saveResults=True)

    expected_names = (
        "Classifier_AdaBoost_M200_D5_tzRCS0.pkl",
        "Scaler_tzRCS0.pkl",
    )
    saved_paths = [
        os.path.join(paths.dir_trainedadabl(), fname) for fname in expected_names
    ]
    # Model first, scaler second; stops at the first missing file.
    assert all(os.path.isfile(saved) for saved in saved_paths)
Пример #2
0
def test_block_crossval():
    """Run a 3-fold block cross-validation and check the shape of its outputs.

    Both result arrays are expected to have shape (5, 3), and the returned
    classifier keys must be exactly the five names listed below.
    """
    results = block_crossval(
        paths.file_labelleddataset(), n_folds=3, plot_on=False
    )
    accuracies, chronos, classifiers_keys = results

    expected_shape = (5, 3)
    expected_classifiers = {
        "RandomForestClassifier",
        "KNeighborsClassifier",
        "DecisionTreeClassifier",
        "AdaBoostClassifier",
        "LabelSpreading",
    }

    assert accuracies.shape == expected_shape
    assert chronos.shape == expected_shape
    assert set(classifiers_keys) == expected_classifiers
Пример #3
0

# Test of bar_scores
# ---------------------
# Smoke test: call bar_scores on three hand-written scores.
# NOTE(review): the meaning of the "corr" argument is not visible here —
# presumably a label or score name; confirm against bar_scores.

scores = [0.22,0.34,0.76]
bar_scores(scores,"corr")

# Test of plot_cv_indices
# ---------------------
# Builds the inputs (features, target, splitters, group ids) from the
# labelled dataset before calling plot_cv_indices.

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

# Load the labelled dataset and split it into predictors (X) and target (y).
datasetfile = paths.file_labelleddataset()
df = pd.read_csv(datasetfile)
predictors = ["sec0", "alti", "rcs0"]
X = df.loc[:, predictors].values
y = df.loc[:, "isBL"].values
# Two splitters: a plain 5-fold split and a 3-fold split that respects groups.
kf = KFold(n_splits=5)
gkf = GroupKFold(n_splits=3)

# One group id per row: the index of the window of width 8*3600 that the
# row's sec0 falls into (8-hour windows if sec0 is in seconds — TODO confirm).
# `.indices` maps each group key to the positions of its rows.
group=np.zeros_like(y)
for itx in df.groupby(np.floor(df.sec0/(8*3600))).indices.items():
    grval,grdex = itx
    group[grdex]=int(grval)

# NOTE(review): presumably consumed by the plot_cv_indices call that follows;
# that call is not fully visible in this excerpt.
subsize=100
plot_cv_indices(
    gkf,
Пример #4
0
def test_train_adabl():
    """Train ADABL on the labelled dataset and sanity-check what it returns.

    The classifier must have learnt exactly the two classes 0 and 1, and the
    scaler must report having seen 86400 samples.
    """
    classifier, fitted_scaler = train_adabl(paths.file_labelleddataset())

    assert set(classifier.classes_) == {0, 1}
    assert fitted_scaler.n_samples_seen_ == 86400
Пример #5
0
def _floored_log10(values, floor):
    """Return log10 of the flattened `values`, non-positive entries set to `floor`.

    The replacement is written into the array returned by ``values.ravel()``.
    ``ravel`` may return a view, so callers must not rely on `values` being
    left untouched.
    """
    flat = values.ravel()
    # log10 is undefined for values <= 0: clamp them to a small positive floor.
    flat[flat <= 0] = floor
    return np.log10(flat)


def prepare_supervised_dataset(
    dataFiles: list,
    refFiles: list,
    saveInCSV: bool = False,
    outputFile: str = None,
    plot_on: bool = False,
):
    """Create a dataframe with appropriate fields from original data format.

    Lidar data is expected to be provided in raw2l1 files and handmade BLH
    estimation is expected in .csv file with 2 columns: time, BLH values.
    Paths are given in lists in order to easily add multiple days:
    ``refFiles[i]`` is the reference matching ``dataFiles[i]``.


    Parameters
    ----------
    dataFiles : list of str
        Paths to the data input files, as generated by raw2l1

    refFiles : list of str
        Paths to the reference files (handmade BLH estimation) in CSV format.
        Each file must have a ``blh_ref`` column. Entries beyond
        ``len(dataFiles)`` are ignored.

    saveInCSV : bool, default=False
        If True, the dataset is saved in a .csv file at the specified location

    outputFile : str, default=None
        Path to the file where the dataset is stored, if saveInCSV=True.
        When None, ``paths.file_labelleddataset()`` is used.

    plot_on : bool, default=False
        If True, display the handmade BLH over the data.


    Returns
    -------
    df : `pandas.DataFrame`
        Ready-to-use dataframe for ADABL training. Contains 5 columns of input
        data (``sec0``, ``alti``, ``rcs0``, ``rcs1``, ``rcs2``) and one column
        of output binary data (``isBL``).


    Raises
    ------
    ValueError
        If `dataFiles` is empty.

    IndexError
        If `refFiles` has fewer entries than `dataFiles`.
    """

    # len() rather than truthiness so that any sized sequence is accepted.
    if len(dataFiles) == 0:
        raise ValueError("dataFiles is empty: at least one data file is required")
    # Fail before reading anything rather than after processing the first
    # files. IndexError is what indexing refFiles past its end would raise.
    if len(refFiles) < len(dataFiles):
        raise IndexError(
            "refFiles has fewer entries ({}) than dataFiles ({})".format(
                len(refFiles), len(dataFiles)
            )
        )

    # One list of per-file arrays for each output column, in column order.
    columns = {"sec0": [], "alti": [], "rcs0": [], "rcs1": [], "rcs2": [], "isBL": []}

    for dataFile, refFile in zip(dataFiles, refFiles):
        print("Reading file ", dataFile, "with reference", refFile)
        # "pbl" is still requested from the extractor although it is not used
        # below, so the call to utils.extract_data is unchanged.
        t_values, z_values, dat = utils.extract_data(
            dataFile, max_height=4620, to_extract=["rcs_0", "rcs_1", "rcs_2", "pbl"]
        )
        rcs_0 = dat["rcs_0"]
        rcs_1 = dat["rcs_1"]
        rcs_2 = dat["rcs_2"]

        blh_ref = pd.read_csv(refFile, delimiter=",", header=0)["blh_ref"].values

        if plot_on:
            graphics.blhs_over_data(t_values, z_values, rcs_0, blh_ref)

        # Input data
        # ----------
        sec_intheday = np.mod(t_values, 24 * 3600)
        Nt, Nz = rcs_1.shape

        columns["rcs0"].append(_floored_log10(rcs_0, 1e-5))
        columns["rcs1"].append(_floored_log10(rcs_1, 1e-5))
        # NOTE(review): the floor is 2e-5 here but 1e-5 for the two other
        # channels. Kept as is; confirm whether this is intentional.
        columns["rcs2"].append(_floored_log10(rcs_2, 2e-5))

        # Flattened (time, altitude) grid, matching the ravel() order above:
        # each time is repeated Nz times, the altitude vector is tiled Nt times.
        columns["sec0"].append(np.repeat(sec_intheday, Nz))
        columns["alti"].append(np.tile(z_values, Nt))

        # Output data
        # -----------
        # Label is 1 where the altitude is strictly above the reference BLH
        # of that time step, 0 elsewhere.
        # NOTE(review): the column is named "isBL" yet 1 marks points above
        # the reference height; confirm the convention with downstream users.
        labels = np.array([z_values > blh_ref[t] for t in range(Nt)], dtype=int)
        columns["isBL"].append(labels.ravel())

    # Create dataframe
    # ------------------
    df = pd.DataFrame(
        {name: np.concatenate(chunks) for name, chunks in columns.items()}
    )

    if saveInCSV:
        if outputFile is None:
            outputFile = paths.file_labelleddataset()
        df.to_csv(outputFile, index=False)
        print("Dataset for ADABL is saved in", outputFile)

    return df