def test_train_adabl_save():
    """Train ADABL with result saving enabled and check the output files.

    The test passes when both the pickled classifier and the pickled scaler
    are found in the directory returned by `paths.dir_trainedadabl()`.
    """
    labelled_dataset = paths.file_labelleddataset()
    train_adabl(labelled_dataset, saveResults=True)

    # File names expected to be written by the training routine.
    expected_names = (
        "Classifier_AdaBoost_M200_D5_tzRCS0.pkl",
        "Scaler_tzRCS0.pkl",
    )
    expected_paths = [
        os.path.join(paths.dir_trainedadabl(), name) for name in expected_names
    ]

    assert all(os.path.isfile(path) for path in expected_paths)
def test_block_crossval():
    """Run a 3-fold block cross-validation and check the result layout.

    Both returned score arrays must have shape (5, 3) and the returned
    classifier keys must be exactly the five expected names.
    """
    expected_shape = (5, 3)
    expected_classifiers = {
        "RandomForestClassifier",
        "KNeighborsClassifier",
        "DecisionTreeClassifier",
        "AdaBoostClassifier",
        "LabelSpreading",
    }

    labelled_dataset = paths.file_labelleddataset()
    accuracies, chronos, classifiers_keys = block_crossval(
        labelled_dataset, n_folds=3, plot_on=False
    )

    assert accuracies.shape == expected_shape
    assert chronos.shape == expected_shape
    assert set(classifiers_keys) == expected_classifiers
# Test of bar_scores # --------------------- scores = [0.22,0.34,0.76] bar_scores(scores,"corr") # Test of plot_cv_indices # --------------------- import pandas as pd from sklearn.model_selection import KFold from sklearn.model_selection import GroupKFold datasetfile = paths.file_labelleddataset() df = pd.read_csv(datasetfile) predictors = ["sec0", "alti", "rcs0"] X = df.loc[:, predictors].values y = df.loc[:, "isBL"].values kf = KFold(n_splits=5) gkf = GroupKFold(n_splits=3) group=np.zeros_like(y) for itx in df.groupby(np.floor(df.sec0/(8*3600))).indices.items(): grval,grdex = itx group[grdex]=int(grval) subsize=100 plot_cv_indices( gkf,
def test_train_adabl():
    """Train ADABL on the labelled dataset and check the fitted objects.

    The classifier must expose exactly the two classes 0 and 1, and the
    scaler must report having seen 86400 samples.
    """
    labelled_dataset = paths.file_labelleddataset()
    model, scaler = train_adabl(labelled_dataset)

    assert set(model.classes_) == {0, 1}
    assert scaler.n_samples_seen_ == 86400
def _log10_floored(field, floor):
    """Return log10 of the flattened `field`, non-positive values set to `floor`."""
    # `flatten` always copies, so the array handed in by the caller is left
    # untouched (a `ravel` view would be modified in place).
    flat = field.flatten()
    flat[flat <= 0] = floor
    return np.log10(flat)


def prepare_supervised_dataset(
    dataFiles: list,
    refFiles: list,
    saveInCSV: bool = False,
    outputFile: str = None,
    plot_on: bool = False,
):
    """Create a dataframe with appropriate fields from original data format.

    Lidar data is expected to be provided in raw2l1 files and handmade BLH
    estimation is expected in a .csv file with a header row and a column
    named ``blh_ref``. Paths are given in lists in order to easily add
    multiple days: ``dataFiles[i]`` is paired with ``refFiles[i]``.

    Parameters
    ----------
    dataFiles : list of str
        Paths to the data input files, as generated by raw2l1
    refFiles : list of str
        Paths to the reference files (handmade BLH estimation) in CSV format.
        Must have the same length as `dataFiles`.
    saveInCSV : bool, default=False
        If True, the dataset is saved in a .csv file at the specified location
    outputFile : str, default=None
        Path to the file where the dataset is stored, if saveInCSV=True.
        When None, `paths.file_labelleddataset()` is used.
    plot_on : bool, default=False
        If True, display the handmade BLH over the data.

    Returns
    -------
    df : `pandas.DataFrame`
        Ready-to-use dataframe for ADABL training. Contains 5 columns of
        input data (``sec0``, ``alti``, ``rcs0``, ``rcs1``, ``rcs2``) and one
        column of output binary data (``isBL``), which is 1 where the
        altitude is strictly above the reference BLH and 0 elsewhere.

    Raises
    ------
    ValueError
        If `dataFiles` and `refFiles` differ in length, if they are empty, or
        if a reference file holds fewer BLH values than the data has profiles.
    """
    if len(dataFiles) != len(refFiles):
        raise ValueError(
            "dataFiles and refFiles must have the same length, got "
            f"{len(dataFiles)} data file(s) and {len(refFiles)} reference file(s)"
        )
    if len(dataFiles) == 0:
        raise ValueError("At least one data file is required to build the dataset")

    RCS0 = []
    RCS1 = []
    RCS2 = []
    SEC0 = []
    ALTI = []
    y = []

    for dataFile, refFile in zip(dataFiles, refFiles):
        print("Reading file ", dataFile, "with reference", refFile)

        t_values, z_values, dat = utils.extract_data(
            dataFile, max_height=4620, to_extract=["rcs_0", "rcs_1", "rcs_2", "pbl"]
        )
        rcs_0 = dat["rcs_0"]
        rcs_1 = dat["rcs_1"]
        rcs_2 = dat["rcs_2"]

        blh_ref = pd.read_csv(refFile, delimiter=",", header=0)
        blh_ref = blh_ref["blh_ref"].values

        if plot_on:
            graphics.blhs_over_data(t_values, z_values, rcs_0, blh_ref)

        # Input data
        # ----------
        sec_intheday = np.mod(t_values, 24 * 3600)
        Nt, Nz = rcs_1.shape

        if len(blh_ref) < Nt:
            raise ValueError(
                f"Reference file {refFile} holds {len(blh_ref)} BLH value(s) "
                f"but {dataFile} has {Nt} profile(s)"
            )

        RCS0.append(_log10_floored(rcs_0, 1e-5))
        RCS1.append(_log10_floored(rcs_1, 1e-5))
        # NOTE(review): the floor for rcs_2 is 2e-5 while the other channels
        # use 1e-5. Kept as is to leave the dataset unchanged; confirm whether
        # this is intentional.
        RCS2.append(_log10_floored(rcs_2, 2e-5))

        # Row-major flattening: the time index varies slowest, the altitude
        # index fastest.
        SEC0.append(np.repeat(sec_intheday, Nz))
        ALTI.append(np.tile(z_values, Nt))

        # Output data
        # -----------
        # One row per profile, one column per altitude: 1 where the altitude
        # is strictly above the reference BLH of that profile.
        above_blh = np.asarray(z_values)[np.newaxis, :] > blh_ref[:Nt, np.newaxis]
        y.append(above_blh.astype(int).ravel())

    # Create dataframe
    # ------------------
    df = pd.DataFrame(
        {
            "sec0": np.concatenate(SEC0),
            "alti": np.concatenate(ALTI),
            "rcs0": np.concatenate(RCS0),
            "rcs1": np.concatenate(RCS1),
            "rcs2": np.concatenate(RCS2),
            "isBL": np.concatenate(y),
        }
    )

    if saveInCSV:
        if outputFile is None:
            outputFile = paths.file_labelleddataset()
        df.to_csv(outputFile, index=False)
        print("Dataset for ADABL is saved in", outputFile)

    return df