示例#1
0
def prediction(pid, tasks, queue, results, model_type, lib_path_server,
               criterion):
    """Worker loop: learn a model per training job, then score queued tasks.

    For every item pulled from ``queue`` (keyword arguments forwarded to
    ``model.learn``) the worker drains ``tasks`` until it reads a ``None``
    sentinel, accumulating the ``u65``/``u80`` helper values of each
    prediction that contains the task's ``y_test``. The totals are pushed to
    ``results`` as ``{'u65': ..., 'u80': ...}`` and the training job is marked
    done. A ``None`` read from ``queue`` ends the worker.

    :param pid: identifier of this worker, only used in printed messages.
    :param tasks: queue-like object yielding dicts with ``'X_test'`` and
        ``'y_test'`` entries, or ``None`` to end the current batch.
    :param queue: queue-like object yielding training kwargs, or ``None`` to
        stop the worker; ``task_done()`` is called once per training job.
    :param results: queue-like object receiving one summary dict per job.
    :param model_type: forwarded to ``__factory_model``.
    :param lib_path_server: forwarded as ``add_path_matlab``.
    :param criterion: forwarded to ``model.evaluate``.
    """
    model = __factory_model(model_type,
                            init_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=False)
    while True:
        learn_kwargs = queue.get()
        if learn_kwargs is None:
            # Sentinel: no more training jobs for this worker.
            break
        model.learn(**learn_kwargs)

        scores = {'u65': 0, 'u80': 0}
        while True:
            job = tasks.get()
            if job is None:
                # Sentinel: end of the test batch for the current model.
                break
            predicted, _ = model.evaluate(job['X_test'], criterion=criterion)
            print("(pid, prediction, ground-truth) (",
                  pid,
                  predicted,
                  job["y_test"],
                  ")",
                  flush=True)
            if job['y_test'] not in predicted:
                continue
            scores['u65'] += u65(predicted)
            scores['u80'] += u80(predicted)

        results.put(scores)
        queue.task_done()
    print("Worker PID finished", pid, flush=True)
示例#2
0
def computing_best_imprecise_mean(in_path=None, out_path=None, cv_nfold=10, model_type="ieda", test_size=0.4,
                                  from_ell=0.1, to_ell=1.0, by_ell=0.1, seed=None, lib_path_server=None, scaling=False):
    """Cross-validate a range of ``ell`` values and append the scores to a CSV.

    The data set at ``in_path`` (last column = label) is split once into a
    training and a held-out part; only the training part is used here. A single
    shuffled K-fold partition of the training part is reused for every ``ell``
    in ``np.arange(from_ell, to_ell, by_ell)``. For each ``ell`` the mean
    ``u65``/``u80`` helper scores over the folds are appended to ``out_path``
    as a row ``[ell, u65, u80]``.

    :param in_path: path of the CSV training data (must exist).
    :param out_path: path of an existing CSV file; rows are appended to it.
    :param cv_nfold: number of folds of the inner cross-validation.
    :param model_type: forwarded to ``__factory_model``.
    :param test_size: ``test_size`` given to ``train_test_split``.
    :param from_ell: first ``ell`` value tried.
    :param to_ell: exclusive upper bound of the ``ell`` range.
    :param by_ell: step of the ``ell`` range.
    :param seed: seed of the train/test split; drawn at random when ``None``.
    :param lib_path_server: forwarded as ``add_path_matlab``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"

    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset %s', in_path)
    data = pd.read_csv(in_path)  # , header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    ell_u65, ell_u80 = dict(), dict()
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("MODEL: %s, SEED: %s", model_type, seed)
    # The held-out part of the split is not used by this function.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=test_size, random_state=seed)
    kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
    # Materialize the folds once so that every ell is scored on the same partition.
    splits = []
    for idx_train, idx_test in kf.split(y_train):
        splits.append((idx_train, idx_test))
        logger.info("Splits %s train %s", len(splits), idx_train)
        logger.info("Splits %s test %s", len(splits), idx_test)

    # The context manager guarantees the CSV handle is released even when
    # learning/evaluation raises; newline='' is what the csv module expects.
    with open(out_path, 'a', newline='') as file_csv:
        writer = csv.writer(file_csv)

        model = __factory_model(model_type, solver_matlab=True, add_path_matlab=lib_path_server, DEBUG=True)
        for ell_current in np.arange(from_ell, to_ell, by_ell):
            ell_u65[ell_current], ell_u80[ell_current] = 0, 0
            logger.info("ELL_CURRENT %s", ell_current)
            for idx_train, idx_test in splits:
                logger.info("Splits train %s", idx_train)
                logger.info("Splits test %s", idx_test)
                X_cv_train, y_cv_train = X_train[idx_train], y_train[idx_train]
                X_cv_test, y_cv_test = X_train[idx_test], y_train[idx_test]
                model.learn(X=X_cv_train, y=y_cv_train, ell=ell_current)
                sum_u65, sum_u80 = 0, 0
                n_test = len(idx_test)
                for i, test in enumerate(X_cv_test):
                    evaluate = model.evaluate(test)
                    logger.debug("(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                                 i, ell_current, evaluate, y_cv_test[i])
                    # Only predictions containing the true label contribute.
                    if y_cv_test[i] in evaluate:
                        sum_u65 += u65(evaluate)
                        sum_u80 += u80(evaluate)
                ell_u65[ell_current] += sum_u65 / n_test
                ell_u80[ell_current] += sum_u80 / n_test
                logger.debug("Partial-kfold (%s, %s, %s)", ell_current, ell_u65[ell_current], ell_u80[ell_current])
            ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
            ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
            writer.writerow([ell_current, ell_u65[ell_current], ell_u80[ell_current]])
            # Flush per ell so partial results survive an interrupted run.
            file_csv.flush()
            logger.debug("Partial-ell (%s, %s, %s)", ell_current, ell_u65, ell_u80)
    logger.debug("Total-ell %s %s %s", in_path, ell_u65, ell_u80)
示例#3
0
def performance_hold_out(in_path=None,
                         out_path=None,
                         model_type='lda',
                         test_pct=0.4,
                         n_times=10,
                         seeds=None,
                         scaling=False):
    """Repeat a hold-out evaluation of a precise model and save the scores.

    The data set at ``in_path`` (no header, last column = label) is split
    ``n_times`` with ``train_test_split``; repetition ``i`` uses ``seeds[i]``.
    For each repetition the model is fitted on the training part and the
    ``u65``/``u80`` helper values of the correct predictions are averaged over
    the test part. One row ``[-999, i, u65, u80]`` per repetition is written
    to ``out_path``, which is overwritten.

    :param in_path: path of the CSV data set (must exist).
    :param out_path: path of an existing file that is truncated and rewritten.
    :param model_type: forwarded to ``__factory_model_precise``.
    :param test_pct: ``test_size`` given to ``train_test_split``.
    :param n_times: number of hold-out repetitions.
    :param seeds: one seed per repetition; generated with
        ``generate_seeds(n_times)`` when ``None``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"

    logger = create_logger("performance_hold_out", True)
    logger.info('Training data set %s, test percentage %s, model_type %s',
                in_path, test_pct, model_type)

    data = pd.read_csv(in_path, header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = data.iloc[:, -1].tolist()

    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    mean_u65, mean_u80 = np.array([]), np.array([])
    # The context manager closes the CSV even if fitting/prediction raises;
    # newline='' is the mode the csv module documents for writer targets.
    with open(out_path, 'w', newline='') as file_csv:
        writer = csv.writer(file_csv)

        model = __factory_model_precise(model_type, store_covariance=True)
        for i in range(0, n_times):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_pct, random_state=seeds[i])
            sum_u65, sum_u80 = 0, 0
            model.fit(X_train, y_train)
            n, _ = X_test.shape
            for j, test in enumerate(X_test):
                evaluate = model.predict([test])
                # Only correct predictions contribute to the scores.
                if y_test[j] in evaluate:
                    sum_u65 += u65(evaluate)
                    sum_u80 += u80(evaluate)
            logger.info("time, u65, u80 (%s, %s, %s)", i, sum_u65 / n,
                        sum_u80 / n)
            mean_u65 = np.append(mean_u65, sum_u65 / n)
            mean_u80 = np.append(mean_u80, sum_u80 / n)
            writer.writerow([-999, i, mean_u65[i], mean_u80[i]])
            # Flush per repetition so partial results survive an interruption.
            file_csv.flush()
    logger.info("[total:data-set:avgResults] (%s, %s)", np.mean(mean_u65),
                np.mean(mean_u80))
示例#4
0
def performance_accuracy_hold_out(in_path=None,
                                  model_type="ilda",
                                  ell_optimal=0.1,
                                  lib_path_server=None,
                                  seeds=None,
                                  DEBUG=False,
                                  scaling=False,
                                  n_times=10):
    """Average the hold-out scores of an imprecise model for a fixed ``ell``.

    For every seed the data set is split 60/40 with ``train_test_split``, the
    model is learned on the training part with ``ell=ell_optimal`` and each
    test instance is evaluated. The ``u65``/``u80`` helper values of the
    predictions containing the true label are averaged per split and then
    over all splits; results are only reported through the logger.

    :param in_path: path of the data set (must exist), read with
        ``dataset_to_Xy``.
    :param model_type: forwarded to ``__factory_model``.
    :param ell_optimal: ``ell`` value passed to ``model.learn``.
    :param lib_path_server: forwarded as ``add_path_matlab``.
    :param seeds: one seed per hold-out repetition; when ``None``,
        ``generate_seeds(n_times)`` is used.
    :param DEBUG: forwarded to ``__factory_model``.
    :param scaling: forwarded to ``dataset_to_Xy``.
    :param n_times: number of seeds generated when ``seeds`` is ``None``.
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(
        in_path
    ), "Without training data, cannot performing cross hold-out accuracy"
    logger = create_logger("performance_accuracy_hold_out", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type,
                ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)

    # The original referenced an undefined `cv_n_fold` here; the number of
    # default seeds is now an explicit, defaulted parameter.
    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    n_time = len(seeds)
    mean_u65, mean_u80 = 0, 0
    model = __factory_model(model_type,
                            solver_matlab=True,
                            add_path_matlab=lib_path_server,
                            DEBUG=DEBUG)
    for k in range(0, n_time):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seeds[k])
        # Learn on the training part of *this* split (was the undefined
        # X_cv_train / y_cv_train).
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            # Evaluate with the model that was just learned (was the
            # undefined name `lqa`).
            evaluate = model.evaluate(test)
            logger.debug(
                "(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                i, ell_optimal, evaluate, y_test[i])
            # Only predictions containing the true label contribute.
            if y_test[i] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        logger.debug("Partial-kfold (%s, %s, %s, %s)", ell_optimal, k,
                     sum_u65 / n_test, sum_u80 / n_test)
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / n_time
    mean_u80 = mean_u80 / n_time
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal, mean_u65,
                 mean_u80)
示例#5
0
def performance_cv_accuracy(in_path=None,
                            model_type='lda',
                            cv_n_fold=10,
                            seeds=None,
                            scaling=False):
    """Run ``cv_n_fold`` repetitions of a shuffled K-fold evaluation.

    Repetition ``r`` builds a ``KFold`` with ``random_state=seeds[r]``, fits a
    fresh precise model on each training fold and averages, over the folds,
    the per-fold mean of the ``u65``/``u80`` helper values of the correct
    predictions. Per-repetition and overall averages are only logged.

    :param in_path: path of the CSV data set (no header, last column = label).
    :param model_type: forwarded to ``__factory_model_precise``.
    :param cv_n_fold: number of folds and also number of repetitions.
    :param seeds: one seed per repetition; generated with
        ``generate_seeds(cv_n_fold)`` when ``None``.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_cv_accuracy", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, model_type)

    features = data.iloc[:, :-1].values
    if scaling:
        features = normalize_minmax(features)
    labels = np.array(data.iloc[:, -1].tolist())

    total_u65 = 0
    total_u80 = 0
    if seeds is None:
        seeds = generate_seeds(cv_n_fold)
    logger.info('Seeds generated %s', seeds)

    for repetition in range(cv_n_fold):
        # A new shuffled partition per repetition, driven by its own seed.
        folds = KFold(n_splits=cv_n_fold,
                      random_state=seeds[repetition],
                      shuffle=True)
        classifier = __factory_model_precise(model_type,
                                             store_covariance=True)
        folds_u65 = 0
        folds_u80 = 0
        for train_idx, test_idx in folds.split(labels):
            held_X = features[test_idx]
            held_y = labels[test_idx]
            classifier.fit(features[train_idx], labels[train_idx])
            held_size = len(test_idx)
            hit_u65 = 0
            hit_u80 = 0
            for position, (sample, truth) in enumerate(zip(held_X, held_y)):
                predicted = classifier.predict([sample])
                logger.debug(
                    "(testing, prediction, ground-truth) (%s, %s, %s)",
                    position, predicted, truth)
                if truth not in predicted:
                    continue
                hit_u65 += u65(predicted)
                hit_u80 += u80(predicted)
            folds_u65 += hit_u65 / held_size
            folds_u80 += hit_u80 / held_size

        repetition_u65 = folds_u65 / cv_n_fold
        repetition_u80 = folds_u80 / cv_n_fold
        logger.info("Time, seed, u65, u80 (%s, %s, %s, %s)", repetition,
                    seeds[repetition], repetition_u65, repetition_u80)
        total_u65 += repetition_u65
        total_u80 += repetition_u80

    logger.info("[Total:data-set:avgResults] (%s, %s,  %s)", in_path,
                total_u65 / cv_n_fold, total_u80 / cv_n_fold)
示例#6
0
def performance_qda_regularized(in_path=None,
                                out_path=None,
                                cv_n_fold=10,
                                seeds=None,
                                from_alpha=0,
                                to_alpha=2.0,
                                by_alpha=0.01,
                                scaling=False):
    """Nested cross-validation of the ``reg_param`` of a regularized QDA.

    An outer shuffled K-fold splits the data into learning/testing parts. On
    each learning part an inner K-fold (seeded with ``seeds[k]``) scores one
    QDA model per ``alpha`` in ``np.arange(from_alpha, to_alpha, by_alpha)``
    with the ``u80`` helper; the best ``alpha`` is refitted on the whole
    learning part and scored on the testing part. The ``alpha`` grid and, per
    outer fold, the inner scores are appended as rows to ``out_path``.

    :param in_path: path of the CSV data set (no header, last column = label).
    :param out_path: path of an existing CSV file; rows are appended to it.
    :param cv_n_fold: number of folds of both the outer and the inner K-fold.
    :param seeds: one seed per outer fold for the inner K-fold; generated with
        ``generate_seeds(cv_n_fold)`` when ``None``.
    :param from_alpha: first regularization value tried.
    :param to_alpha: exclusive upper bound of the regularization grid.
    :param by_alpha: step of the regularization grid.
    :param scaling: when true, features go through ``normalize_minmax``.
    :raises AssertionError: if ``in_path`` or ``out_path`` does not exist.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "Without output saving performance"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("performance_qda_regularized", True)
    logger.info('Training data set %s, cv_n_fold %s, model_type %s', in_path,
                cv_n_fold, "qda")
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())

    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    logger.info('Seeds generated %s', seeds)

    accuracy, best_alphas = [0] * cv_n_fold, [0] * cv_n_fold
    # The context manager closes the CSV even if fitting/prediction raises;
    # newline='' is the mode the csv module documents for writer targets.
    with open(out_path, 'a', newline='') as file_csv:
        writer = csv.writer(file_csv)

        alphas = np.arange(from_alpha, to_alpha, by_alpha)
        writer.writerow(alphas)

        # One candidate model per regularization value, refitted on each fold.
        qda_regularized = [
            __factory_model_precise("qda",
                                    store_covariance=True,
                                    reg_param=alpha) for alpha in alphas
        ]
        # Generation a random k-fold validation.
        kf_second = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
        for ikfold, (idx_learning,
                     idx_testing) in enumerate(kf_second.split(y)):
            X_training, y_training = X[idx_learning], y[idx_learning]
            X_testing, y_testing = X[idx_testing], y[idx_testing]
            kf = KFold(n_splits=cv_n_fold,
                       random_state=seeds[ikfold],
                       shuffle=True)
            acc_u80 = [0] * len(qda_regularized)
            for idx_train, idx_test in kf.split(y_training):
                X_cv_train = X_training[idx_train]
                y_cv_train = y_training[idx_train]
                X_cv_test = X_training[idx_test]
                y_cv_test = y_training[idx_test]
                for model in qda_regularized:
                    model.fit(X_cv_train, y_cv_train)
                n_test = len(idx_test)
                for i, test in enumerate(X_cv_test):
                    for im, model in enumerate(qda_regularized):
                        evaluate = model.predict([test])
                        if y_cv_test[i] in evaluate:
                            # Accumulate directly as a mean over instances
                            # and over inner folds.
                            acc_u80[im] += (u80(evaluate) /
                                            n_test) / cv_n_fold
            idx_best = np.argmax(acc_u80)
            logger.info("[1kfold:best_model:seed:u80] (%s, %s, %s, %s)",
                        ikfold, alphas[idx_best], seeds[ikfold], acc_u80)
            writer.writerow(acc_u80)
            # Flush per outer fold so partial results survive an interruption.
            file_csv.flush()

            # Refit the winning regularization on the whole learning part.
            best_model = __factory_model_precise("qda",
                                                 store_covariance=True,
                                                 reg_param=alphas[idx_best])
            best_model.fit(X_training, y_training)
            accuracy[ikfold] = 0
            bn_test = len(idx_testing)
            best_alphas[ikfold] = alphas[idx_best]
            for i, test in enumerate(X_testing):
                evaluate = best_model.predict([test])
                if y_testing[i] in evaluate:
                    accuracy[ikfold] += u80(evaluate) / bn_test
            logger.info("[2kfold:best_model:seed:accuracy] (%s, %s, %s)",
                        ikfold, alphas[idx_best], accuracy[ikfold])
    logger.info("[total:data-set:avgResults] (%s, %s, %s, %s)", in_path,
                np.mean(accuracy), best_alphas, accuracy)