Example #1
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True)
    performance_tr_knn = eval.accuracy(train_y, class_train_y)
    performance_te_knn = eval.accuracy(test_y, class_test_y)

    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True)
    performance_tr_dt = eval.accuracy(train_y, class_train_y)
    performance_te_dt = eval.accuracy(test_y, class_test_y)

    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(selected_train_X, train_y, selected_test_X)
    performance_tr_nb = eval.accuracy(train_y, class_train_y)
    performance_te_nb = eval.accuracy(test_y, class_test_y)

    scores_with_sd = util.print_table_row_performances(
        feature_names[i], len(selected_train_X.index),
        len(selected_test_X.index),
        [(overall_performance_tr_nn, overall_performance_te_nn),
         (overall_performance_tr_rf, overall_performance_te_rf),
         (overall_performance_tr_svm, overall_performance_te_svm),
         (performance_tr_knn, performance_te_knn),
         (performance_tr_dt, performance_te_dt),
         (performance_tr_nb, performance_te_nb)])
    scores_over_all_algs.append(scores_with_sd)

DataViz.plot_performances_classification(['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names, scores_over_all_algs)
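The eval calls above presumably wrap standard metrics; a minimal sketch of what such an evaluation object might look like, assuming a thin layer over scikit-learn (hypothetical, not the actual ClassificationEvaluation code):

from sklearn import metrics

class ClassificationEvaluationSketch:
    # Hypothetical stand-in for the `eval` object used in the snippets.
    def accuracy(self, y_true, y_pred):
        # fraction of correctly classified instances
        return metrics.accuracy_score(y_true, y_pred)

    def confusion_matrix(self, y_true, y_pred, labels):
        # rows are true classes, columns are predicted classes
        return metrics.confusion_matrix(y_true, y_pred, labels=labels)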

# And we study two promising ones in more detail. First, let us consider the
# decision tree, which works best with the selected features.
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features],
                                                                                           gridsearch=True,
                                                                                           print_model_details=True, export_tree_path=export_tree_path)

# The snippet is truncated mid-call here; the closing arguments below are an
# assumption, mirroring the decision tree call above.
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(train_X[selected_features], train_y, test_X[selected_features],
                                                                                            gridsearch=True,
                                                                                            print_model_details=True)
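The learner.* wrappers used throughout return train/test predictions plus class-probability DataFrames. A minimal sketch of what such a wrapper might look like internally, assuming a thin layer over scikit-learn (hypothetical, not the actual ClassificationAlgorithms implementation):

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def k_nearest_neighbor_sketch(train_X, train_y, test_X, gridsearch=True):
    # Hypothetical stand-in for learner.k_nearest_neighbor.
    if gridsearch:
        # assumed search space; the real wrapper may tune other parameters
        model = GridSearchCV(KNeighborsClassifier(),
                             {'n_neighbors': [1, 2, 5, 10]},
                             scoring='accuracy')
    else:
        model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_X, train_y.values.ravel())
    if gridsearch:
        model = model.best_estimator_
    # probability frames carry one column per class; this is what
    # class_train_prob_y.columns refers to in the snippets
    prob_train_y = pd.DataFrame(model.predict_proba(train_X), columns=model.classes_)
    prob_test_y = pd.DataFrame(model.predict_proba(test_X), columns=model.classes_)
    return model.predict(train_X), model.predict(test_X), prob_train_y, prob_test_y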
Example #2
    """
    overall_performance_tr_svm = performance_tr_svm/repeats
    overall_performance_te_svm = performance_te_svm/repeats
    """

    # And we run our deterministic classifiers (disabled in this run):
    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True)
    # performance_tr_knn = eval.accuracy(train_y, class_train_y)
    # performance_te_knn = eval.accuracy(test_y, class_test_y)
    #
    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True)
    # performance_tr_dt = eval.accuracy(train_y, class_train_y)
    # performance_te_dt = eval.accuracy(test_y, class_test_y)
    #
    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(selected_train_X, train_y, selected_test_X)
    # performance_tr_nb = eval.accuracy(train_y, class_train_y)
    # performance_te_nb = eval.accuracy(test_y, class_test_y)

    scores_with_sd = util.print_table_row_performances(
        feature_names[i], len(selected_train_X.index),
        len(selected_test_X.index),
        [(overall_performance_tr_rf, overall_performance_te_rf)])
    scores_over_all_algs.append(scores_with_sd)

print(scores_over_all_algs)
DataViz.plot_performances_classification(['RF'], feature_names,
                                         scores_over_all_algs)

exit(0)
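The pattern above averages a non-deterministic classifier over several repeats before reporting it. The same idea in isolation (hypothetical helper, not part of the original code):

def average_over_repeats(run_once, repeats):
    # run_once() fits the classifier and returns (train_acc, test_acc);
    # averaging smooths out the randomness of the individual fits
    tr_sum, te_sum = 0.0, 0.0
    for _ in range(repeats):
        tr_acc, te_acc = run_once()
        tr_sum += tr_acc
        te_sum += te_acc
    return tr_sum / repeats, te_sum / repeats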
Example #3
    # Disabled: the random forest averages are not reported in this example.
    # overall_performance_tr_rf = performance_tr_rf/repeats
    # overall_performance_te_rf = performance_te_rf/repeats
    overall_performance_tr_svm = performance_tr_svm / repeats
    overall_performance_te_svm = performance_te_svm / repeats

    # And we run our deterministic classifiers (disabled in this run):
    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True)
    # performance_tr_knn = eval.accuracy(train_y, class_train_y)
    # performance_te_knn = eval.accuracy(test_y, class_test_y)
    #
    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True)
    # performance_tr_dt = eval.accuracy(train_y, class_train_y)
    # performance_te_dt = eval.accuracy(test_y, class_test_y)
    #
    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(selected_train_X, train_y, selected_test_X)
    # performance_tr_nb = eval.accuracy(train_y, class_train_y)
    # performance_te_nb = eval.accuracy(test_y, class_test_y)

    scores_with_sd = util.print_table_row_performances(
        feature_names[i], len(selected_train_X.index),
        len(selected_test_X.index),
        [(overall_performance_tr_svm, overall_performance_te_svm)])
    scores_over_all_algs.append(scores_with_sd)

print(scores_over_all_algs)
DataViz.plot_performances_classification(['SVM'], feature_names,
                                         scores_over_all_algs)

exit(0)
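util.print_table_row_performances returns scores "with sd". One plausible way to attach a standard deviation to a single accuracy estimate (an assumption; the util implementation is not shown) is the binomial standard deviation over the number of evaluated instances:

import math

def accuracy_with_sd(acc, n):
    # standard deviation of a Bernoulli proportion estimated from n instances
    return acc, math.sqrt(acc * (1.0 - acc) / n)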
Example #4
def experiment(file):
    dataset = pd.read_csv(file, index_col=time_col)
    DataViz = VisualizeDataset(__file__.split('.')[0] +
                               file.split('.')[0].split('/')[1] + '.py',
                               show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'],
        'like',
        0.7,
        filter=False,
        temporal=False,
        drop_na=False,
        fill_na=True)

    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features,
                                  freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features,
                                  cluster_features))

    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
    #                                                                   train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120',
        'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq',
        'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40',
        'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40',
        'acc_y_freq_1.9_Hz_ws_40', 'gyr_x_temp_mean_ws_1200',
        'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40',
        'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40',
        'acc_z_freq_0.6_Hz_ws_40'
    ]
    # Plotting the forward-selection curve needs `ordered_scores` from the
    # selection step above (commented out); `selected_features` holds feature
    # names, not accuracies, so it cannot serve as the y-axis.
    # DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
    #                 y=[ordered_scores],
    #                 xlabel='number of features',
    #                 ylabel='accuracy')

    print('feature selection finished for %s' % file)
    learner = ClassificationAlgorithms()
    eval = ClassificationEvaluation()

    possible_feature_sets = [
        basic_features, features_2, features_3, features_4, selected_features
    ]
    feature_names = [
        'Basic features', 'Features with time', 'Features with frequency',
        'Features with cluster', 'Selected features'
    ]

    # with shelve.open('temp/shelve.out', 'n') as f:
    #     for key in dir():
    #         try:
    #             f[key] = globals()[key]
    #         except:
    #             print('ERROR shelving: {0}'.format(key))

    N_KCV_REPEATS = 1

    scores_over_all_algs = []

    for i in range(0, len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non-deterministic classifiers a number of times to average their score.

        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(0, N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X,
                train_y,
                selected_test_X,
                gridsearch=True,
                print_model_details=True)
            test_cm = eval.confusion_matrix(test_y, class_test_y,
                                            class_train_prob_y.columns)

            DataViz.plot_confusion_matrix(test_cm,
                                          class_train_prob_y.columns,
                                          normalize=False)

            performance_tr_rf += eval.accuracy(train_y, class_train_y)
            performance_te_rf += eval.accuracy(test_y, class_test_y)

            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        log([
            'RF' + ' train acc: %f' % overall_performance_tr_rf +
            ' test acc: %f' % overall_performance_te_rf
        ])

        # And we run our deterministic classifiers:

        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X,
            train_y,
            selected_test_X,
            gridsearch=True,
            print_model_details=True)
        performance_tr_dt = eval.accuracy(train_y, class_train_y)
        performance_te_dt = eval.accuracy(test_y, class_test_y)
        test_cm = eval.confusion_matrix(test_y, class_test_y,
                                        class_train_prob_y.columns)

        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)

        log([
            'DT' + ' train acc: %f' % performance_tr_dt +
            ' test acc: %f' % performance_te_dt
        ])
        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index),
            len(selected_test_X.index), [
                (overall_performance_tr_rf, overall_performance_te_rf),
                (performance_tr_dt, performance_te_dt),
            ])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())
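experiment takes a CSV path, but the snippet does not show how it is invoked. A hypothetical driver (the dataset paths are placeholders):

if __name__ == '__main__':
    # placeholder paths; substitute the actual per-dataset CSV files
    for csv_file in ['datasets/subject1.csv', 'datasets/subject2.csv']:
        experiment(csv_file)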