예제 #1
0
def computing_precise_vs_imprecise(in_path=None,
                                   ell_optimal=0.1,
                                   cv_n_fold=10,
                                   seeds=None,
                                   lib_path_server=None,
                                   model_type_precise='lda',
                                   model_type_imprecise='ilda',
                                   scaling=True):
    """Compare a precise and an imprecise model on ambiguous test instances.

    Runs ``cv_n_fold`` repetitions of a shuffled ``cv_n_fold``-fold
    cross-validation. Only test instances for which the imprecise model
    returns more than one candidate label are counted; on those instances the
    hit rate of both models is accumulated and the averages are logged at
    debug level. Nothing is returned.

    Args:
        in_path: CSV file read with ``pd.read_csv``; when ``None`` the data
            comes from ``export_data_set('iris.data')``. The last column is
            the label, all other columns are features.
        ell_optimal: value passed as ``ell`` to the imprecise model's
            ``learn`` method.
        cv_n_fold: number of folds and also number of repetitions.
        seeds: one ``random_state`` per repetition; generated with
            ``generate_seeds(cv_n_fold)`` when ``None``.
        lib_path_server: forwarded as ``add_path_matlab`` to the imprecise
            model factory.
        model_type_precise: model identifier given to
            ``__factory_model_precise``.
        model_type_imprecise: model identifier given to ``__factory_model``.
        scaling: when true, features are passed through ``normalize_minmax``.
    """
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(
        in_path)
    logger = create_logger("computing_precise_vs_imprecise", True)
    logger.info('Training dataset and models (%s, %s, %s, %s)', in_path,
                model_type_precise, model_type_imprecise, ell_optimal)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    model_impr = __factory_model(model_type_imprecise,
                                 init_matlab=True,
                                 add_path_matlab=lib_path_server,
                                 DEBUG=False)
    model_prec = __factory_model_precise(model_type_precise,
                                         store_covariance=True)
    avg_imprecise, avg_precise, n_real_times = 0, 0, 0
    # The loop variable is not called ``time`` so it cannot shadow the module.
    for repetition in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold,
                   random_state=seeds[repetition],
                   shuffle=True)
        imprecise_mean, precise_mean, n_real_fold = 0, 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model_impr.learn(X=X_cv_train, y=y_cv_train, ell=ell_optimal)
            model_prec.fit(X_cv_train, y_cv_train)
            n_real_tests, time_precise, time_imprecise = 0, 0, 0
            for i, test in enumerate(X_cv_test):
                # First item of the returned pair is used as the collection
                # of candidate labels; the second item is ignored here.
                evaluate_imp, _ = model_impr.evaluate(test)
                evaluate = model_prec.predict([test])
                # Only ambiguous (set-valued) predictions enter the comparison.
                if len(evaluate_imp) > 1:
                    n_real_tests += 1
                    if y_cv_test[i] in evaluate_imp:
                        time_imprecise += 1
                    if y_cv_test[i] in evaluate:
                        time_precise += 1
                logger.debug(
                    "(time, iTest, ellOptimal, cautious, prediction, ground-truth)(%s, %s, %s, %s, %s, %s)",
                    repetition, i, ell_optimal, evaluate_imp, evaluate,
                    y_cv_test[i])
            logger.debug(
                "(time, ellOptimal, nRealTests, timeImprecise, timePrecise) (%s, %s, %s, %s, %s)",
                repetition, ell_optimal, n_real_tests, time_imprecise,
                time_precise)
            # Folds without any ambiguous prediction do not count.
            if n_real_tests > 0:
                n_real_fold += 1
                imprecise_mean += time_imprecise / n_real_tests
                precise_mean += time_precise / n_real_tests
        logger.debug("(time, nRealFold, imprecise, precise) (%s, %s, %s, %s)",
                     repetition, n_real_fold, imprecise_mean, precise_mean)
        if n_real_fold > 0:
            n_real_times += 1
            avg_imprecise += imprecise_mean / n_real_fold
            avg_precise += precise_mean / n_real_fold
    # When no repetition produced an ambiguous prediction the average is
    # undefined; report NaN instead of raising ZeroDivisionError.
    if n_real_times > 0:
        final_imprecise = avg_imprecise / n_real_times
        final_precise = avg_precise / n_real_times
    else:
        final_imprecise = final_precise = float('nan')
    logger.debug("(dataset, models, imprec, prec) (%s, %s, %s, %s, %s)",
                 in_path, model_type_imprecise, model_type_precise,
                 final_imprecise, final_precise)
예제 #2
0
def performance_hold_out(in_path=None,
                         out_path=None,
                         model_type='lda',
                         test_pct=0.4,
                         n_times=10,
                         seeds=None,
                         scaling=False):
    """Evaluate a precise model with repeated hold-out and store u65/u80.

    For each of ``n_times`` repetitions the data set is split with
    ``train_test_split``, the model is fitted on the training part and the
    ``u65``/``u80`` scores of the correct predictions are summed and divided
    by the number of test instances. One CSV row ``[-999, i, u65, u80]`` is
    written per repetition, and the overall means are logged.

    Args:
        in_path: existing CSV file without header; last column is the label.
        out_path: existing file that is overwritten with the results.
        model_type: identifier given to ``__factory_model_precise``.
        test_pct: ``test_size`` passed to ``train_test_split``.
        n_times: number of hold-out repetitions.
        seeds: one ``random_state`` per repetition; generated with
            ``generate_seeds(n_times)`` when ``None``.
        scaling: when true, features are passed through ``normalize_minmax``.

    Raises:
        AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"

    logger = create_logger("performance_hold_out", True)
    logger.info('Training data set %s, test percentage %s, model_type %s',
                in_path, test_pct, model_type)

    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = data.iloc[:, -1].tolist()

    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    mean_u65, mean_u80 = [], []
    # The context manager closes the file even if a repetition fails;
    # newline='' is what the csv module expects from its file objects.
    with open(out_path, 'w', newline='') as file_csv:
        writer = csv.writer(file_csv)
        model = __factory_model_precise(model_type, store_covariance=True)
        for i in range(n_times):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_pct, random_state=seeds[i])
            sum_u65, sum_u80 = 0, 0
            model.fit(X_train, y_train)
            n, _ = X_test.shape
            for j, test in enumerate(X_test):
                evaluate = model.predict([test])
                # Only correct predictions contribute to the utilities.
                if y_test[j] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            run_u65, run_u80 = sum_u65 / n, sum_u80 / n
            logger.info("time, u65, u80 (%s, %s, %s)", i, run_u65, run_u80)
            mean_u65.append(run_u65)
            mean_u80.append(run_u80)
            writer.writerow([-999, i, run_u65, run_u80])
            # Flush per repetition so partial results survive a crash.
            file_csv.flush()
    logger.info("[total:data-set:avgResults] (%s, %s)", np.mean(mean_u65),
                np.mean(mean_u80))
예제 #3
0
def performance_accuracy_hold_out(in_path=None,
                                  model_type="ilda",
                                  ell_optimal=0.1,
                                  lib_path_server=None,
                                  seeds=None,
                                  DEBUG=False,
                                  scaling=False,
                                  test_pct=0.4,
                                  n_times=10):
    """Evaluate an imprecise model with repeated hold-out and log u65/u80.

    One hold-out split is made per seed. The model is trained on the training
    part with ``ell_optimal`` and the ``u65``/``u80`` scores of predictions
    containing the true label are summed, divided by the number of test
    instances, and averaged over all repetitions. Results are only logged.

    Args:
        in_path: existing data file handed to ``dataset_to_Xy``.
        model_type: identifier given to ``__factory_model``.
        ell_optimal: value passed as ``ell`` to the model's ``learn`` method.
        lib_path_server: forwarded as ``add_path_matlab`` to the factory.
        seeds: one ``random_state`` per repetition; when ``None``,
            ``generate_seeds(n_times)`` is used.
        DEBUG: forwarded to the model factory.
        scaling: forwarded to ``dataset_to_Xy``.
        test_pct: ``test_size`` passed to ``train_test_split`` (the value
            0.4 was previously hard-coded).
        n_times: number of seeds to generate when ``seeds`` is ``None``.

    Raises:
        AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(
        in_path
    ), "Without training data, cannot performing cross hold-out accuracy"
    logger = create_logger("performance_accuracy_hold_out", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)

    # The original referenced an undefined ``cv_n_fold`` here.
    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    n_time = len(seeds)
    mean_u65, mean_u80 = 0, 0
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=DEBUG)
    for k in range(n_time):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_pct, random_state=seeds[k])
        # Train on this repetition's split (was the undefined X_cv_train).
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            # NOTE(review): the result is used directly as the collection of
            # predicted labels; confirm evaluate() does not return a tuple
            # for this model version.
            evaluate = model.evaluate(test)
            logger.debug(
                "(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                i, ell_optimal, evaluate, y_test[i])
            if y_test[i] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, k,
                     sum_u65 / n_test, sum_u80 / n_test)
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / n_time
    mean_u80 = mean_u80 / n_time
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal, mean_u65,
                 mean_u80)
예제 #4
0
def performance_cv_accuracy(in_path=None,
                            model_type='lda',
                            cv_n_fold=10,
                            seeds=None,
                            scaling=False):
    """Log the u65/u80 accuracy of a precise model under repeated k-fold CV.

    ``cv_n_fold`` repetitions of a shuffled ``cv_n_fold``-fold
    cross-validation are run, each with its own seed. Within a fold the
    ``u65``/``u80`` scores of the correct predictions are summed and divided
    by the fold's test size; fold scores are averaged per repetition and the
    repetition averages are averaged again for the final log line.

    Args:
        in_path: existing CSV file without header; last column is the label.
        model_type: identifier given to ``__factory_model_precise``.
        cv_n_fold: number of folds and number of repetitions.
        seeds: one ``random_state`` per repetition; generated with
            ``generate_seeds(cv_n_fold)`` when ``None``.
        scaling: when true, features are passed through ``normalize_minmax``.

    Raises:
        AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    frame = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_cv_accuracy", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, model_type)
    features = frame.iloc[:, :-1].values
    if scaling:
        features = normalize_minmax(features)
    labels = np.array(frame.iloc[:, -1].tolist())
    if seeds is None:
        seeds = generate_seeds(cv_n_fold)
    logger.info('Seeds generated %s', seeds)

    total_u65 = total_u80 = 0
    for repetition in range(cv_n_fold):
        # A fresh shuffled k-fold partition and a fresh model per repetition.
        splitter = KFold(n_splits=cv_n_fold,
                         random_state=seeds[repetition],
                         shuffle=True)
        model = __factory_model_precise(model_type, store_covariance=True)
        folds_u65 = folds_u80 = 0
        for train_idx, test_idx in splitter.split(labels):
            model.fit(features[train_idx], labels[train_idx])
            fold_samples, fold_truth = features[test_idx], labels[test_idx]
            hits_u65 = hits_u80 = 0
            for position, sample in enumerate(fold_samples):
                prediction = model.predict([sample])
                truth = fold_truth[position]
                logger.debug(
                    "(testing, prediction, ground-truth) (%s, %s, %s)",
                    position, prediction, truth)
                if truth in prediction:
                    hits_u65 += u65(prediction)
                    hits_u80 += u80(prediction)
            fold_size = len(test_idx)
            folds_u65 += hits_u65 / fold_size
            folds_u80 += hits_u80 / fold_size
        repetition_u65 = folds_u65 / cv_n_fold
        repetition_u80 = folds_u80 / cv_n_fold
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                    seeds[repetition], repetition_u65, repetition_u80)
        total_u65 += repetition_u65
        total_u80 += repetition_u80
    logger.info("[Total:data-set:avgResults] (%s, %s,  %s)", in_path,
                total_u65 / cv_n_fold, total_u80 / cv_n_fold)
def performance_cv_accuracy_imprecise(in_path=None,
                                      model_type="ilda",
                                      ell_optimal=0.1,
                                      nb_process=2,
                                      lib_path_server=None,
                                      cv_n_fold=10,
                                      seeds=None,
                                      criterion="maximality"):
    """Log the u65/u80 accuracy of an imprecise model under repeated k-fold CV.

    Training and testing of each fold are delegated to
    ``computing_training_testing_step``, which receives the running u65/u80
    totals and returns the updated pair. Fold totals are divided by
    ``cv_n_fold`` per repetition and averaged over ``cv_n_fold`` repetitions.
    Results are only logged.

    Args:
        in_path: existing CSV file; last column is the label.
        model_type: model identifier forwarded to the worker manager.
        ell_optimal: ``ell`` value used for every training step.
        nb_process: number of worker processes requested from
            ``ManagerWorkers``.
        lib_path_server: forwarded to ``manager.executeAsync``.
        cv_n_fold: number of folds and number of repetitions.
        seeds: one ``random_state`` per repetition; generated with
            ``generate_seeds(cv_n_fold)`` when ``None``.
        criterion: decision criterion forwarded to ``ManagerWorkers``.

    Raises:
        AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    # NOTE(review): read without header=None, so the first row is taken as
    # column names -- confirm this matches the data files used.
    data = pd.read_csv(in_path)
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s, %s)', in_path, model_type,
                ell_optimal, criterion)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    try:
        for repetition in range(cv_n_fold):
            kf = KFold(n_splits=cv_n_fold,
                       random_state=seeds[repetition],
                       shuffle=True)
            mean_u65, mean_u80 = 0, 0
            for idx_train, idx_test in kf.split(y):
                logger.info("Splits train %s", idx_train)
                logger.info("Splits test %s", idx_test)
                X_cv_train, y_cv_train = X[idx_train], y[idx_train]
                X_cv_test, y_cv_test = X[idx_test], y[idx_test]
                mean_u65, mean_u80 = computing_training_testing_step(
                    X_cv_train, y_cv_train, X_cv_test, y_cv_test, ell_optimal,
                    manager, mean_u65, mean_u80)
                logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal,
                             repetition, mean_u65, mean_u80)
            logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                        seeds[repetition], mean_u65 / cv_n_fold,
                        mean_u80 / cv_n_fold)
            avg_u65 += mean_u65 / cv_n_fold
            avg_u80 += mean_u80 / cv_n_fold
    finally:
        # Always signal the workers to stop, even when a fold fails, so no
        # worker process is left waiting.
        manager.poisonPillTraining()
    logger.debug("total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
예제 #6
0
def performance_cv_accuracy_imprecise(in_path=None,
                                      model_type="ilda",
                                      ell_optimal=0.1,
                                      scaling=False,
                                      lib_path_server=None,
                                      cv_n_fold=10,
                                      seeds=None,
                                      nb_process=10):
    """Log the u65/u80 accuracy of an imprecise model under repeated k-fold CV.

    Training and testing of each fold are delegated to
    ``computing_training_testing_step``, which receives the running u65/u80
    totals and returns the updated values (a third returned item is ignored).
    Fold totals are divided by ``cv_n_fold`` per repetition and averaged over
    ``cv_n_fold`` repetitions. Results are only logged.

    Args:
        in_path: existing data file handed to ``dataset_to_Xy``.
        model_type: model identifier forwarded to the worker manager.
        ell_optimal: ``ell`` value used for every training step.
        scaling: forwarded to ``dataset_to_Xy``.
        lib_path_server: forwarded to ``manager.executeAsync``.
        cv_n_fold: number of folds and number of repetitions.
        seeds: one ``random_state`` per repetition; generated with
            ``generate_seeds(cv_n_fold)`` when ``None``.
        nb_process: number of worker processes requested from
            ``ManagerWorkers``.

    Raises:
        AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(
        in_path
    ), "Without training data, cannot performing cross validation accuracy"
    logger = create_logger("performance_cv_accuracy_imprecise", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)

    avg_u65, avg_u80 = 0, 0
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)

    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type, lib_path_server)
    try:
        for repetition in range(cv_n_fold):
            kf = KFold(n_splits=cv_n_fold,
                       random_state=seeds[repetition],
                       shuffle=True)
            mean_u65, mean_u80 = 0, 0
            for idx_train, idx_test in kf.split(y):
                mean_u65, mean_u80, _ = computing_training_testing_step(
                    X[idx_train], y[idx_train], X[idx_test], y[idx_test],
                    ell_optimal, manager, mean_u65, mean_u80)
                logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal,
                             repetition, mean_u65, mean_u80)
            logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                        seeds[repetition], mean_u65 / cv_n_fold,
                        mean_u80 / cv_n_fold)
            avg_u65 += mean_u65 / cv_n_fold
            avg_u80 += mean_u80 / cv_n_fold
    finally:
        # Always signal the workers to stop, even when a fold fails, so no
        # worker process is left waiting.
        manager.poisonPillTraining()
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal,
                 avg_u65 / cv_n_fold, avg_u80 / cv_n_fold)
예제 #7
0
def computing_time_prediction(in_path=None,
                              ell_optimal=0.1,
                              lib_path_server=None,
                              model_type="ilda",
                              criterion="maximality",
                              k_repetition=10,
                              seeds=None):
    """Measure the average wall-clock time of a single imprecise prediction.

    For each of ``k_repetition`` random 80/20 splits the model is trained on
    the training part and every test instance is evaluated while timing the
    call with ``time.time()``. The mean time per instance of each repetition
    is collected and the mean and standard deviation over repetitions are
    logged. Nothing is returned.

    Args:
        in_path: existing CSV file without header; last column is the label.
        ell_optimal: value passed as ``ell`` to the model's ``learn`` method.
        lib_path_server: forwarded as ``add_path_matlab`` to the factory.
        model_type: identifier given to ``__factory_model``.
        criterion: forwarded to ``model.evaluate``.
        k_repetition: number of random splits.
        seeds: one ``random_state`` per repetition; generated with
            ``generate_seeds(k_repetition)`` when ``None``.

    Raises:
        AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    frame = pd.read_csv(in_path, header=None)
    logger = create_logger("computing_time_prediction", True)
    features = frame.iloc[:, :-1].values
    labels = frame.iloc[:, -1].tolist()
    if seeds is None:
        seeds = generate_seeds(k_repetition)
    logger.info(
        'Training dataset %s with maximality version (%s) and model (%s), ell_optimal (%s) and seeds %s',
        in_path, criterion, model_type, ell_optimal, seeds)
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=False)
    mean_times = []
    for repetition in range(k_repetition):
        seed = seeds[repetition]
        logger.info("%s-fold repetition randomly, seed %s", repetition, seed)
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=seed)
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        n, _ = X_test.shape
        elapsed_total = 0
        for truth, sample in zip(y_test, X_test):
            # Only the evaluate() call itself is inside the timed window.
            started = time.time()
            prediction = model.evaluate(sample, criterion=criterion)
            elapsed = time.time() - started
            logger.info("Evaluate %s, Ground-truth %s, Time %s ", prediction,
                        truth, elapsed)
            elapsed_total += elapsed
        mean_times.append(elapsed_total / n)
    avg = np.asarray(mean_times, dtype=float)
    # ``n`` is the test size of the last repetition.
    logger.info("Total time (%s, %s) and average %s and sd %s of %s testing",
                in_path, avg, np.mean(avg), np.std(avg), n)
예제 #8
0
def performance_qda_regularized(in_path=None,
                                out_path=None,
                                cv_n_fold=10,
                                seeds=None,
                                from_alpha=0,
                                to_alpha=2.0,
                                by_alpha=0.01,
                                scaling=False):
    """Select the QDA regularisation parameter by nested cross-validation.

    An outer shuffled ``cv_n_fold``-fold split separates learning and testing
    data. On every outer learning part an inner ``cv_n_fold``-fold CV scores
    one "qda" model per ``reg_param`` in
    ``np.arange(from_alpha, to_alpha, by_alpha)`` with the u80 measure. The
    best parameter is refitted on the whole learning part and scored on the
    outer test part. Inner scores are appended to ``out_path`` as CSV rows
    (preceded by one row listing the alphas); the summary is logged.

    Args:
        in_path: existing CSV file without header; last column is the label.
        out_path: existing file to which the CSV rows are appended.
        cv_n_fold: number of folds for both the outer and the inner CV.
        seeds: one ``random_state`` per outer fold for the inner CV;
            generated with ``generate_seeds(cv_n_fold)`` when ``None``.
        from_alpha: first ``reg_param`` value.
        to_alpha: exclusive upper bound of the ``reg_param`` grid.
        by_alpha: grid step.
        scaling: when true, features are passed through ``normalize_minmax``.

    Raises:
        AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_qda_regularized", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, "qda")
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    alphas = np.arange(from_alpha, to_alpha, by_alpha)
    # One model instance per candidate regularisation value; they are
    # refitted on every inner fold.
    qda_regularized = [
        __factory_model_precise("qda", store_covariance=True, reg_param=alpha)
        for alpha in alphas
    ]
    accuracy, best_alphas = [0] * cv_n_fold, [0] * cv_n_fold

    # The context manager closes the file even if a fold fails;
    # newline='' is what the csv module expects from its file objects.
    with open(out_path, 'a', newline='') as file_csv:
        writer = csv.writer(file_csv)
        writer.writerow(alphas)

        # Outer split is intentionally unseeded, as in the original code.
        kf_second = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
        for ikfold, (idx_learning,
                     idx_testing) in enumerate(kf_second.split(y)):
            X_training, y_training = X[idx_learning], y[idx_learning]
            X_testing, y_testing = X[idx_testing], y[idx_testing]
            kf = KFold(n_splits=cv_n_fold,
                       random_state=seeds[ikfold],
                       shuffle=True)
            acc_u80 = [0] * len(qda_regularized)
            for idx_train, idx_test in kf.split(y_training):
                X_cv_train = X_training[idx_train]
                y_cv_train = y_training[idx_train]
                X_cv_test = X_training[idx_test]
                y_cv_test = y_training[idx_test]
                for model in qda_regularized:
                    model.fit(X_cv_train, y_cv_train)
                n_test = len(idx_test)
                for i, test in enumerate(X_cv_test):
                    for im, model in enumerate(qda_regularized):
                        evaluate = model.predict([test])
                        if y_cv_test[i] in evaluate:
                            # Running mean over test instances and folds.
                            acc_u80[im] += (u80(evaluate) /
                                            n_test) / cv_n_fold
            idx_best = np.argmax(acc_u80)
            logger.info("[1kfold:best_model:seed:u80] (%s, %s, %s, %s)",
                        ikfold, alphas[idx_best], seeds[ikfold], acc_u80)
            writer.writerow(acc_u80)
            # Flush per outer fold so partial results survive a crash.
            file_csv.flush()

            # Refit the winning parameter on the full outer learning part.
            best_model = __factory_model_precise("qda",
                                                 store_covariance=True,
                                                 reg_param=alphas[idx_best])
            best_model.fit(X_training, y_training)
            best_alphas[ikfold] = alphas[idx_best]
            accuracy[ikfold] = 0
            bn_test = len(idx_testing)
            for i, test in enumerate(X_testing):
                evaluate = best_model.predict([test])
                if y_testing[i] in evaluate:
                    accuracy[ikfold] += u80(evaluate) / bn_test
            logger.info("[2kfold:best_model:seed:accuracy] (%s, %s, %s)",
                        ikfold, alphas[idx_best], accuracy[ikfold])
    logger.info("[total:data-set:avgResults] (%s, %s, %s, %s)", in_path,
                np.mean(accuracy), best_alphas, accuracy)
예제 #9
0
def computing_best_imprecise_mean(in_path=None, out_path=None, lib_path_server=None, model_type="ilda",
                                  from_ell=0.1, to_ell=1.0, by_ell=0.1, seed=None, cv_kfold_first=10,
                                  nb_process=2, skip_nfold=0, cv_kfold_second=10, seed_second=None, scaling=False):
    """Select the best ``ell`` by nested cross-validation and score it.

    The outer ``cv_kfold_first``-fold split separates learning and testing
    data. On every outer learning part, an inner ``cv_kfold_second``-fold CV
    scores each ``ell`` in ``np.arange(from_ell, to_ell, by_ell)`` through
    ``computing_training_testing_step``. The ells with the highest mean u65
    and u80 are then trained on the whole learning part and scored on the
    outer test part. Intermediate and final results are appended to
    ``out_path`` as CSV rows and logged. Nothing is returned.

    Args:
        in_path: existing CSV file without header; last column is the label.
        out_path: existing file to which result rows are appended.
        lib_path_server: forwarded to ``manager.executeAsync``.
        model_type: model identifier forwarded to the worker manager.
        from_ell: first ``ell`` value; reset to 0.01 for the outer folds
            after ``skip_nfold`` when ``skip_nfold`` is non-zero.
        to_ell: exclusive upper bound of the ``ell`` grid.
        by_ell: grid step.
        seed: ``random_state`` of the outer split; drawn at random if ``None``.
        cv_kfold_first: number of outer folds.
        nb_process: number of worker processes requested from
            ``ManagerWorkers``.
        skip_nfold: outer folds with a smaller index are skipped.
        cv_kfold_second: number of inner folds.
        seed_second: one ``random_state`` per outer fold for the inner split;
            generated when ``None``.
        scaling: when true, features are passed through ``normalize_minmax``.

    Raises:
        AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean_cv", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, out_path, model_type)
    logger.info('Parameters (ells, nbProcess, skip_nfold, cv_kfold_second) (%s, %s, %s, %s, %s, %s)', from_ell,
                to_ell, by_ell, nb_process, skip_nfold, cv_kfold_second)

    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    # Seeding a random value for k-fold top learning-testing data
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("[FIRST-STEP-SEED] MODEL: %s, SEED: %s", model_type, seed)

    kfFirst = KFold(n_splits=cv_kfold_first, random_state=seed, shuffle=True)
    acc_u80, acc_u65 = dict(), dict()
    # One inner-split seed is consumed per OUTER fold, so cv_kfold_first seeds
    # are needed (the original generated cv_kfold_second of them).
    seed_2step = generate_seeds(cv_kfold_first) if seed_second is None else seed_second
    logger.debug("[SECOND-STEP-SEEDS] MODEL: %s, SEED: %s, SECOND-SEED: %s", model_type, seed, seed_2step)

    manager = ManagerWorkers(nb_process=nb_process)
    manager.executeAsync(model_type, lib_path_server)
    try:
        # The context manager closes the CSV file even if a fold fails;
        # newline='' is what the csv module expects from its file objects.
        with open(out_path, 'a', newline='') as file_csv:
            writer = csv.writer(file_csv)
            for idx_kfold, (idx_learning, idx_testing) in enumerate(kfFirst.split(y)):
                ell_u65, ell_u80 = dict(), dict()
                # Generate sampling k-fold (learning, testing) for optimal ell parameters
                X_learning, y_learning = X[idx_learning], y[idx_learning]
                X_testing, y_testing = X[idx_testing], y[idx_testing]
                logger.info("Splits %s learning %s", idx_kfold, idx_learning)
                logger.info("Splits %s testing %s", idx_kfold, idx_testing)

                # After the first resumed fold, restart the ell grid at 0.01
                if skip_nfold != 0 and idx_kfold > skip_nfold:
                    from_ell = 0.01

                # n-Skipping fold cross-validation (purpose for parallel computing)
                if idx_kfold < skip_nfold:
                    continue

                # Same inner splits for every ell, so all ells are compared impartially
                logger.debug("[2-STEP-SEED] MODEL: %s, SEED: %s OF FIRST STEP %s", model_type,
                             seed_2step[idx_kfold], seed)
                kfSecond = KFold(n_splits=cv_kfold_second, random_state=seed_2step[idx_kfold], shuffle=True)
                splits_ell = []
                for idx_learn_train, idx_learn_test in kfSecond.split(y_learning):
                    splits_ell.append((idx_learn_train, idx_learn_test))
                    logger.info("Splits %s train %s", len(splits_ell), idx_learn_train)
                    logger.info("Splits %s test %s", len(splits_ell), idx_learn_test)

                for ell_current in np.arange(from_ell, to_ell, by_ell):
                    ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                    logger.info("ELL_CURRENT %s", ell_current)
                    for idx_learn_train, idx_learn_test in splits_ell:
                        logger.info("Splits step train %s", idx_learn_train)
                        logger.info("Splits step test %s", idx_learn_test)
                        X_cv_train, y_cv_train = X_learning[idx_learn_train], y_learning[idx_learn_train]
                        X_cv_test, y_cv_test = X_learning[idx_learn_test], y_learning[idx_learn_test]

                        ell_u65[ell_current], ell_u80[ell_current], _ = \
                            computing_training_testing_step(X_cv_train, y_cv_train, X_cv_test, y_cv_test,
                                                            ell_current, manager, ell_u65[ell_current],
                                                            ell_u80[ell_current])

                        logger.info("Partial-kfold (%s, %s, %s)", ell_current, ell_u65[ell_current],
                                    ell_u80[ell_current])
                    # The totals were accumulated over the inner folds, so the
                    # mean divides by cv_kfold_second (not cv_kfold_first).
                    ell_u65[ell_current] = ell_u65[ell_current] / cv_kfold_second
                    ell_u80[ell_current] = ell_u80[ell_current] / cv_kfold_second
                    writer.writerow([ell_current, idx_kfold, ell_u65[ell_current], ell_u80[ell_current]])
                    file_csv.flush()
                    logger.debug("Partial-ell-k-step (%s, %s, %s)", idx_kfold, ell_u65[ell_current],
                                 ell_u80[ell_current])
                logger.debug("Total-ell-k-step (%s, %s, %s, %s)", in_path, idx_kfold, ell_u65, ell_u80)

                # Computing optimal ells for using in testing step (ties are all kept)
                acc_ell_u80 = max(ell_u80.values())
                acc_ell_u65 = max(ell_u65.values())
                ell_u80_opts = [k for k, v in ell_u80.items() if v == acc_ell_u80]
                ell_u65_opts = [k for k, v in ell_u65.items() if v == acc_ell_u65]
                acc_u65[idx_kfold], acc_u80[idx_kfold] = 0, 0
                n_ell80_opts, n_ell65_opts = len(ell_u80_opts), len(ell_u65_opts)
                for ell_u80_opt in ell_u80_opts:
                    logger.info("ELL_OPTIMAL_CV_U80 %s", ell_u80_opt)
                    _, _acc_u80, _ = \
                        computing_training_testing_step(X_learning, y_learning, X_testing, y_testing,
                                                        ell_u80_opt, manager, 0, 0)
                    acc_u80[idx_kfold] += _acc_u80
                    writer.writerow([-999, -8, ell_u80_opt, _acc_u80])

                for ell_u65_opt in ell_u65_opts:
                    logger.info("ELL_OPTIMAL_CV_U65 %s", ell_u65_opt)
                    _acc_u65, _, _ = \
                        computing_training_testing_step(X_learning, y_learning, X_testing, y_testing,
                                                        ell_u65_opt, manager, 0, 0)
                    acc_u65[idx_kfold] += _acc_u65
                    writer.writerow([-999, -7, ell_u65_opt, _acc_u65])

                # Average over tied optimal ells
                acc_u65[idx_kfold] = acc_u65[idx_kfold] / n_ell65_opts
                acc_u80[idx_kfold] = acc_u80[idx_kfold] / n_ell80_opts
                writer.writerow([-999, idx_kfold, acc_u65[idx_kfold], acc_u80[idx_kfold]])
                file_csv.flush()
                logger.debug("Partial-ell-2step (u80, u65, accs) (%s, %s, %s, %s, %s)", -999, ell_u80_opts,
                             ell_u65_opts, acc_u65[idx_kfold], acc_u80[idx_kfold])
            writer.writerow([-9999, -9, np.mean(list(acc_u65.values())), np.mean(list(acc_u80.values()))])
    finally:
        # Always signal the workers to stop, even when a fold fails.
        manager.poisonPillTraining()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path, np.mean(list(acc_u65.values())),
                 np.mean(list(acc_u80.values())))
def computing_best_imprecise_mean(in_path=None,
                                  out_path=None,
                                  cv_nfold=10,
                                  model_type="ilda",
                                  test_size=0.4,
                                  from_ell=0.1,
                                  to_ell=1.0,
                                  by_ell=0.1,
                                  seeds=None,
                                  lib_path_server=None,
                                  nb_process=2,
                                  n_sampling=10,
                                  skip_n_sample=0,
                                  criterion="maximality",
                                  scaling=False):
    """Select the best ``ell`` on repeated hold-out samples and score it.

    For each sampling a learning/testing split is drawn with
    ``train_test_split``. On the learning part a ``cv_nfold``-fold CV scores
    each ``ell`` in ``np.arange(from_ell, to_ell, by_ell)`` through
    ``computing_training_testing_step``. The ells with the highest mean u65
    and u80 are then trained on the whole learning part and scored on the
    testing part. Intermediate and final results are appended to ``out_path``
    as CSV rows and logged. Nothing is returned.

    Args:
        in_path: existing CSV file without header; last column is the label.
        out_path: existing file to which result rows are appended.
        cv_nfold: number of folds of the inner cross-validation.
        model_type: model identifier forwarded to the worker manager.
        test_size: ``test_size`` passed to ``train_test_split``.
        from_ell: first ``ell`` value; reset to 0.01 for the samplings after
            ``skip_n_sample`` when ``skip_n_sample`` is non-zero.
        to_ell: exclusive upper bound of the ``ell`` grid.
        by_ell: grid step.
        seeds: one ``random_state`` per sampling; generated with
            ``generate_seeds(n_sampling)`` when ``None``.
        lib_path_server: forwarded to ``manager.executeAsync``.
        nb_process: number of worker processes requested from
            ``ManagerWorkers``.
        n_sampling: maximum number of samplings (capped by ``len(seeds)``).
        skip_n_sample: samplings with a smaller index are skipped.
        criterion: decision criterion forwarded to ``ManagerWorkers``.
        scaling: when true, features are passed through ``normalize_minmax``.

    Raises:
        AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean_sampling", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                criterion)
    logger.info(
        'Parameters (size, ells, nbProcess, sampling, nSkip) (%s, %s, %s, %s, %s, %s, %s)',
        test_size, from_ell, to_ell, by_ell, nb_process, n_sampling,
        skip_n_sample)
    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    # Seeds are logged so a killed run can be resumed with the same splits
    seeds = generate_seeds(n_sampling) if seeds is None else seeds
    logger.debug("MODEL: %s, SEED: %s", model_type, seeds)

    acc_u80, acc_u65 = dict(), dict()
    manager = ManagerWorkers(nb_process=nb_process, criterion=criterion)
    manager.executeAsync(model_type, lib_path_server)
    try:
        # The context manager closes the CSV file even if a sampling fails;
        # newline='' is what the csv module expects from its file objects.
        with open(out_path, 'a', newline='') as file_csv:
            writer = csv.writer(file_csv)
            for sampling in range(min(n_sampling, len(seeds))):
                X_learning, X_testing, y_learning, y_testing = \
                    train_test_split(X, y, test_size=test_size, random_state=seeds[sampling])
                logger.info("Splits %s learning %s", sampling, y_learning)
                logger.info("Splits %s testing %s", sampling, y_testing)

                # After the first resumed sampling, restart the ell grid at 0.01
                if skip_n_sample != 0 and sampling > skip_n_sample:
                    from_ell = 0.01
                # n-Skipping sampling testing (purpose for parallel computing)
                if sampling < skip_n_sample:
                    continue

                # Inner split is intentionally unseeded, as in the original;
                # the same folds are reused for every ell.
                kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
                ell_u65, ell_u80, splits = dict(), dict(), []
                for idx_train, idx_test in kf.split(y_learning):
                    splits.append((idx_train, idx_test))
                    logger.info("Sampling %s Splits %s train %s", sampling,
                                len(splits), idx_train)
                    logger.info("Sampling %s Splits %s test %s", sampling,
                                len(splits), idx_test)

                for ell_current in np.arange(from_ell, to_ell, by_ell):
                    ell_u65[ell_current], ell_u80[ell_current] = 0, 0
                    logger.info("ELL_CURRENT %s", ell_current)
                    for idx_train, idx_test in splits:
                        logger.info("Splits train %s", idx_train)
                        logger.info("Splits test %s", idx_test)
                        X_cv_train = X_learning[idx_train]
                        y_cv_train = y_learning[idx_train]
                        X_cv_test = X_learning[idx_test]
                        y_cv_test = y_learning[idx_test]
                        # Running totals go in; updated totals come back
                        ell_u65[ell_current], ell_u80[ell_current] = \
                            computing_training_testing_step(X_cv_train, y_cv_train, X_cv_test, y_cv_test,
                                                            ell_current, manager, ell_u65[ell_current],
                                                            ell_u80[ell_current])
                        logger.info("Partial-kfold (%s, %s, %s)", ell_current,
                                    ell_u65[ell_current],
                                    ell_u80[ell_current])

                    ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
                    ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
                    writer.writerow([
                        ell_current, sampling, ell_u65[ell_current],
                        ell_u80[ell_current]
                    ])
                    file_csv.flush()
                    logger.debug("Partial-ell-sampling (%s, %s, %s, %s)",
                                 ell_current, sampling, ell_u65, ell_u80)
                logger.debug("Total-ell-sampling (%s, %s, %s, %s)", in_path,
                             sampling, ell_u65, ell_u80)

                # Computing optimal ells for using in testing step (ties are all kept)
                acc_ellu80 = max(ell_u80.values())
                acc_ellu65 = max(ell_u65.values())
                ellu80_opts = [
                    k for k, v in ell_u80.items() if v == acc_ellu80
                ]
                ellu65_opts = [
                    k for k, v in ell_u65.items() if v == acc_ellu65
                ]
                acc_u65[sampling], acc_u80[sampling] = 0, 0
                n_ell80_opts, n_ell65_opts = len(ellu80_opts), len(
                    ellu65_opts)

                for ellu80_opt in ellu80_opts:
                    logger.info("ELL_OPTIMAL_SAMPLING_U80 %s", ellu80_opt)
                    _, acc_u80[sampling] = \
                        computing_training_testing_step(X_learning, y_learning, X_testing, y_testing,
                                                        ellu80_opt, manager, 0, acc_u80[sampling])

                for ellu65_opt in ellu65_opts:
                    logger.info("ELL_OPTIMAL_SAMPLING_U65 %s", ellu65_opt)
                    acc_u65[sampling], _ = \
                        computing_training_testing_step(X_learning, y_learning, X_testing, y_testing,
                                                        ellu65_opt, manager, acc_u65[sampling], 0)

                # Average over tied optimal ells
                acc_u65[sampling] = acc_u65[sampling] / n_ell65_opts
                acc_u80[sampling] = acc_u80[sampling] / n_ell80_opts
                writer.writerow(
                    [-999, sampling, acc_u65[sampling], acc_u80[sampling]])
                file_csv.flush()
                logger.debug("Partial-ell-2step (%s, %s, %s, %s)", -999,
                             ellu80_opts, acc_u65[sampling],
                             acc_u80[sampling])

            writer.writerow([
                -9999, -9,
                np.mean(list(acc_u65.values())),
                np.mean(list(acc_u80.values()))
            ])
    finally:
        # Always signal the workers to stop, even when a sampling fails.
        manager.poisonPillTraining()
    logger.debug("Total-accuracy (%s, %s, %s)", in_path, acc_u65, acc_u80)
    logger.debug("Total-avg-accuracy (%s, %s, %s)", in_path,
                 np.mean(list(acc_u65.values())),
                 np.mean(list(acc_u80.values())))