def backward_selection(self, max_features, X_train, y_train):
    """Greedy backward feature elimination scored with a decision tree.

    Starts from all columns of ``X_train`` and, one feature at a time, drops
    the feature whose removal gives the highest training accuracy of a
    decision tree, until ``max_features`` features remain. If ``X_train``
    already has ``max_features`` columns or fewer, nothing is removed.

    :param max_features: Number of features to keep.
    :param X_train: Training features; indexed with lists of column names.
    :param y_train: Labels corresponding to ``X_train``.
    :return: List with the names of the remaining features.
    """
    # First select all features.
    selected_features = X_train.columns.tolist()
    ca = ClassificationAlgorithms()
    ce = ClassificationEvaluation()

    for _ in range(0, len(X_train.columns) - max_features):
        # ``None`` (instead of 0 / '') guarantees that some feature is always
        # chosen, even when every candidate subset scores an accuracy of 0.
        best_perf = None
        worst_feature = None

        # Select from the features that are still in the selection.
        for f in selected_features:
            # A shallow copy is enough: the list only holds column names.
            temp_selected_features = list(selected_features)
            temp_selected_features.remove(f)

            # Determine the score without the feature. The training data is
            # passed as the "test" set too; only training predictions are used.
            pred_y_train, _, _, _ = ca.decision_tree(
                X_train[temp_selected_features], y_train,
                X_train[temp_selected_features])
            perf = ce.accuracy(y_train, pred_y_train)

            # If we score better without the feature than what we have seen
            # so far, this is the worst feature (first one wins on ties).
            if best_perf is None or perf > best_perf:
                best_perf = perf
                worst_feature = f

        # Remove the worst feature.
        selected_features.remove(worst_feature)
    return selected_features
def forward_selection(
        max_features: int, X_train: pd.DataFrame,
        y_train: pd.Series) -> Tuple[List[str], List[str], List[float]]:
    """
    Select the given number of features for classification, that show the best accuracy, using forward selection.
    The method uses the given features and labels to train a decision tree and determine the accuracy of the
    prediction. The method returns the selected features as well as the scores.

    Candidates are evaluated in the column order of ``X_train`` and the first candidate wins on ties, so the result
    does not depend on set iteration order. Selection stops early when every column has been selected, so fewer than
    ``max_features`` features are returned if ``X_train`` has fewer columns.

    :param max_features: Number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: Labels corresponding to given features.
    :return: Selected features, the same features in selection order, and the accuracy reached after each addition.
    """

    # Start with no features
    ordered_features = []
    ordered_scores = []
    selected_features = []
    ca = ClassificationAlgorithms()
    ce = ClassificationEvaluation()

    # Select the appropriate number of features
    for i in range(0, max_features):
        # Determine the features left to select, keeping the DataFrame's column order
        features_left = [c for c in X_train.columns if c not in selected_features]
        if not features_left:
            # Nothing left to add; do not append placeholder entries
            break

        # ``None`` rather than 0 / '' so that a feature is always chosen, even if every candidate scores 0
        best_perf = None
        best_feature = None
        print(f'Selecting feature {i+1}/{max_features}')

        # Iterate over all features left
        for f in features_left:
            temp_selected_features = selected_features + [f]

            # Determine the accuracy of a decision tree learner adding the feature. The training data doubles as
            # the "test" set because only the training predictions are scored.
            pred_y_train, _, _, _ = ca.decision_tree(
                X_train[temp_selected_features],
                y_train,
                X_train[temp_selected_features],
                gridsearch=False)
            perf = ce.accuracy(y_train, pred_y_train)

            # If the performance is better than the best so far (aiming for high accuracy), set the current feature
            # to the best feature and the same for the best performance
            if best_perf is None or perf > best_perf:
                best_perf = perf
                best_feature = f

        # Select the feature with the best performance
        selected_features.append(best_feature)
        ordered_features.append(best_feature)
        ordered_scores.append(best_perf)

    return selected_features, ordered_features, ordered_scores
def gridsearch_reservoir_computing(self,
                                   train_X,
                                   train_y,
                                   test_X,
                                   test_y,
                                   per_time_step=False,
                                   error='mse',
                                   gridsearch_training_frac=0.7):
    """Grid search over the reservoir-computing parameters ``a`` and ``reservoir_size``.

    The training data is split positionally: the first
    ``gridsearch_training_frac`` part is used to fit each candidate and the
    remainder to score it. ``test_X`` and ``test_y`` are not used by the
    search; they are kept for interface compatibility.

    :param train_X: Training features (pandas object supporting ``.iloc``).
    :param train_y: Training targets, aligned with ``train_X``.
    :param test_X: Unused.
    :param test_y: Unused.
    :param per_time_step: Passed through to ``self.reservoir_computing``.
    :param error: ``'mse'`` (lower is better) or ``'accuracy'`` (higher is better).
    :param gridsearch_training_frac: Fraction of the training rows used for fitting.
    :return: Tuple ``(reservoir_size, a)`` of the best combination.
    :raises ValueError: If ``error`` is not supported or no combination could be scored.
    """
    if error not in ('mse', 'accuracy'):
        raise ValueError("error must be 'mse' or 'accuracy', got %r" % (error,))

    tuned_parameters = {
        'a': [0.6, 0.8],
        'reservoir_size': [400, 700, 1000]
    }
    # Freeze the key order once as a list: it is used both to generate the
    # combinations and to look values up again, and a list supports
    # ``.index`` on Python 2 and Python 3 alike.
    keys = list(tuned_parameters.keys())
    # Hand the helper its own copy so it cannot disturb ``keys``.
    combinations = self.generate_parameter_combinations(
        tuned_parameters, list(keys))

    # Positional split; ``.ix`` is no longer available in current pandas.
    split_point = int(gridsearch_training_frac * len(train_X.index))
    train_params_X = train_X.iloc[0:split_point, ]
    test_params_X = train_X.iloc[split_point:len(train_X.index), ]
    train_params_y = train_y.iloc[0:split_point, ]
    test_params_y = train_y.iloc[split_point:len(train_X.index), ]

    minimise = error == 'mse'
    if minimise:
        evaluator = RegressionEvaluation()
        best_error = float('inf')
    else:
        evaluator = ClassificationEvaluation()
        # -inf (not 0) so that a combination is selected even if all accuracies are 0.
        best_error = float('-inf')

    best_combination = []
    for comb in combinations:
        print(comb)
        _, pred_test_y, _, pred_test_y_prob = self.reservoir_computing(
            train_params_X,
            train_params_y,
            test_params_X,
            test_params_y,
            reservoir_size=comb[keys.index('reservoir_size')],
            a=comb[keys.index('a')],
            per_time_step=per_time_step,
            gridsearch=False)

        if minimise:
            score = evaluator.mean_squared_error(test_params_y, pred_test_y_prob)
            improved = score < best_error
        else:
            score = evaluator.accuracy(test_params_y, pred_test_y)
            improved = score > best_error
        if improved:
            best_error = score
            best_combination = comb

    print('-------')
    print(best_combination)
    print('-------')

    if not best_combination:
        # Happens for an empty grid or when no score was comparable (e.g. NaN).
        raise ValueError('grid search did not select any parameter combination')
    return (best_combination[keys.index('reservoir_size')],
            best_combination[keys.index('a')])
def gridsearch_recurrent_neural_network(self,
                                        train_X,
                                        train_y,
                                        test_X,
                                        test_y,
                                        error='accuracy',
                                        gridsearch_training_frac=0.7):
    """Grid search over the recurrent-neural-network parameters.

    Tries every combination of ``n_hidden_neurons``, ``iterations`` and
    ``outputbias`` from the built-in grid. The training data is split
    positionally: the first ``gridsearch_training_frac`` part is used to fit
    each candidate and the remainder to score it. ``test_X`` and ``test_y``
    are not used by the search; they are kept for interface compatibility.

    :param train_X: Training features (pandas object supporting ``.iloc``).
    :param train_y: Training targets, aligned with ``train_X``.
    :param test_X: Unused.
    :param test_y: Unused.
    :param error: ``'mse'`` (lower is better) or ``'accuracy'`` (higher is better).
    :param gridsearch_training_frac: Fraction of the training rows used for fitting.
    :return: Tuple ``(n_hidden_neurons, iterations, outputbias)`` of the best combination.
    :raises ValueError: If ``error`` is not supported or no combination could be scored.
    """
    if error not in ('mse', 'accuracy'):
        raise ValueError("error must be 'mse' or 'accuracy', got %r" % (error,))

    tuned_parameters = {
        'n_hidden_neurons': [50, 100],
        'iterations': [250, 500],
        'outputbias': [True]
    }
    # One frozen key order, used for generating and for reading combinations.
    params = list(tuned_parameters.keys())
    # Hand the helper its own copy so it cannot disturb ``params``.
    combinations = self.generate_parameter_combinations(
        tuned_parameters, list(params))

    split_point = int(gridsearch_training_frac * len(train_X.index))
    train_params_X = train_X.iloc[0:split_point, ]
    test_params_X = train_X.iloc[split_point:len(train_X.index), ]
    train_params_y = train_y.iloc[0:split_point, ]
    test_params_y = train_y.iloc[split_point:len(train_X.index), ]

    minimise = error == 'mse'
    if minimise:
        evaluator = RegressionEvaluation()
        best_error = float('inf')
    else:
        evaluator = ClassificationEvaluation()
        # -inf (not 0) so that a combination is selected even if all accuracies are 0.
        best_error = float('-inf')

    best_combination = []
    for comb in combinations:
        print(comb)
        _, pred_test_y, _, pred_test_y_prob = self.recurrent_neural_network(
            train_params_X,
            train_params_y,
            test_params_X,
            test_params_y,
            n_hidden_neurons=comb[params.index('n_hidden_neurons')],
            iterations=comb[params.index('iterations')],
            outputbias=comb[params.index('outputbias')],
            gridsearch=False)

        if minimise:
            score = evaluator.mean_squared_error(test_params_y, pred_test_y_prob)
            improved = score < best_error
        else:
            score = evaluator.accuracy(test_params_y, pred_test_y)
            improved = score > best_error
        if improved:
            best_error = score
            best_combination = comb

    print('-------')
    print(best_combination)
    print('-------')

    if not best_combination:
        # Happens for an empty grid or when no score was comparable (e.g. NaN).
        raise ValueError('grid search did not select any parameter combination')
    return (best_combination[params.index('n_hidden_neurons')],
            best_combination[params.index('iterations')],
            best_combination[params.index('outputbias')])
def forward_selection(self, max_features, X_train, y_train):
    """Greedy forward feature selection scored with a decision tree.

    Starting from an empty set, repeatedly adds the feature that gives the
    highest training accuracy of a decision tree when combined with the
    features chosen so far. Candidates are evaluated in the column order of
    ``X_train`` and the first candidate wins on ties. Selection stops early
    once every column has been selected.

    :param max_features: Number of features to select.
    :param X_train: Training features; indexed with lists of column names.
    :param y_train: Labels corresponding to ``X_train``.
    :return: Tuple ``(selected_features, ordered_features, ordered_scores)``:
        the chosen feature names, the same names in selection order, and the
        accuracy reached after each addition.
    """
    # Start with no features.
    ordered_features = []
    ordered_scores = []
    selected_features = []
    ca = ClassificationAlgorithms()
    ce = ClassificationEvaluation()

    # Select the appropriate number of features.
    for i in range(0, max_features):
        # Single-argument call form prints the same on Python 2 and 3.
        print(i)

        # Determine the features left to select, in column order.
        features_left = [c for c in X_train.columns if c not in selected_features]
        if not features_left:
            # Nothing left to add.
            break

        # ``None`` sentinels: a feature is always chosen in this round, so a
        # value from an earlier round can never be appended a second time.
        best_perf = None
        best_feature = None

        # For all features we can still select...
        for f in features_left:
            temp_selected_features = selected_features + [f]

            # Determine the accuracy of a decision tree learner if we were to
            # add the feature. The training data doubles as the "test" set
            # because only the training predictions are scored.
            pred_y_train, _, _, _ = ca.decision_tree(
                X_train[temp_selected_features], y_train,
                X_train[temp_selected_features])
            perf = ce.accuracy(y_train, pred_y_train)

            # If the performance is better than what we have seen so far (we
            # aim for high accuracy) we set the current feature to the best
            # feature and the same for the best performance.
            if best_perf is None or perf > best_perf:
                best_perf = perf
                best_feature = f

        # We select the feature with the best performance.
        selected_features.append(best_feature)
        ordered_features.append(best_feature)
        ordered_scores.append(best_perf)
    return selected_features, ordered_features, ordered_scores
def backward_selection(max_features: int, X_train: pd.DataFrame,
                       y_train: pd.Series) -> List[str]:
    """
    Select the given number of features for classification, that show the best accuracy, using backward selection.
    The method uses the given features and labels to train a decision tree and determine the accuracy of the
    prediction.

    Starting from all columns, the feature whose removal gives the highest training accuracy is dropped, one at a
    time, until ``max_features`` features remain. Nothing is removed if ``X_train`` already has ``max_features``
    columns or fewer.

    :param max_features: Number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: Labels corresponding to given features.
    :return: Selected features.
    """

    # First select all features
    selected_features = X_train.columns.tolist()
    ca = ClassificationAlgorithms()
    ce = ClassificationEvaluation()

    n_to_remove = len(X_train.columns) - max_features
    for _ in range(0, n_to_remove):
        # ``None`` rather than 0 / '' so that a feature is always chosen, even if every candidate subset scores 0
        best_perf = None
        worst_feature = None

        # Select from the features that are still in the selection
        for f in selected_features:
            # Shallow copy suffices, the list only holds column names
            remaining = list(selected_features)
            remaining.remove(f)

            # Determine the score without the feature. The training data doubles as the "test" set because only
            # the training predictions are scored.
            pred_y_train, _, _, _ = ca.decision_tree(
                X_train[remaining], y_train, X_train[remaining])
            perf = ce.accuracy(y_train, pred_y_train)

            # If scoring better without the feature than seen so far, this is the worst feature
            if best_perf is None or perf > best_perf:
                best_perf = perf
                worst_feature = f

        # Remove the worst feature
        selected_features.remove(worst_feature)
    return selected_features
plot.plot(range(1, 26), ordered_scores) plot.xlabel('number of features') plot.ylabel('accuracy') plot.show() # Based on the plot we select the top 10 features. selected_features = features_after_chapter_5 # ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120', # 'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40', # 'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'] # Let us first study the impact of regularization and model complexity: does regularization prevent overfitting? learner = ClassificationAlgorithms() eval = ClassificationEvaluation() reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10] performance_training = [] performance_test = [] # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. repeats = 20 for reg_param in reg_parameters: performance_tr = 0 performance_te = 0 for i in range(0, repeats): class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(train_X, train_y, test_X, hidden_layer_sizes=(250, ), alpha=reg_param, max_iter=500,
'gyr_z_freq_2.0_Hz_ws_40', 'gyr_y_freq_0.0_Hz_ws_40', 'mag_z_freq_1.5_Hz_ws_40', 'acc_z_temp_MAD_ws_120', 'acc_y_temp_kurtosis_ws_120', 'mag_x_freq_1.2_Hz_ws_40', 'lin_acc_y_freq_1.8_Hz_ws_40' ] selected_features_with_DT = [ 'acc_z_freq_0.0_Hz_ws_40', 'loc_height_temp_mean_ws_120', 'pca_4_temp_kurtosis_ws_120', 'lin_acc_y_temp_kurtosis_ws_120', 'pca_1_temp_kurtosis_ws_120', 'acc_z_temp_MAD_ws_120', 'mag_x_freq_1.2_Hz_ws_40', 'gyr_z_freq_2.0_Hz_ws_40', 'acc_y_temp_kurtosis_ws_120', 'lin_acc_y_freq_0.6_Hz_ws_40' ] learner = ClassificationAlgorithms() eval = ClassificationEvaluation() possible_feature_sets = [ basic_features, features_after_outliers_and_imputation, features_after_domain_features, features_after_cluster_features, selected_features_with_DT, selected_features_with_NB ] feature_names = [ 'initial set', 'After imputation', 'With Domain features', 'With cluster features', 'Selected features DT', 'Selected features NB' ] repeats = 3 scores_over_all_algs = []
def main():
    """Run the classification experiments selected by ``FLAGS.mode``.

    Loads the dataset from ``DATA_PATH / DATASET_FILENAME``, splits it into a
    70% training and 30% test set and then runs the stages matching
    ``FLAGS.mode`` (``'all'`` runs every stage):

    * ``'selection'``: forward selection of ``FLAGS.nfeatures`` features.
    * ``'regularization'``: feedforward-network accuracy for several values
      of the regularization parameter, averaged over ``FLAGS.nnrepeat`` runs.
    * ``'tree'``: decision-tree accuracy for several minimum leaf sizes.
    * ``'overall'``: six classifiers compared over five feature sets; the
      non-deterministic ones are averaged over ``FLAGS.kcvrepeat`` runs.
    * ``'detail'``: grid-searched decision tree and random forest on the
      selected features, plus a confusion matrix on the test set.

    :raises IOError: If the dataset file cannot be read.
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the first task, namely the prediction of the label. Therefore create a single column with the categorical
    # attribute representing the class. Furthermore, use 70% of the data for training and the remaining 30% as an
    # independent test set. Select the sets based on stratified sampling and remove cases where the label is unknown.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter_data=True, temporal=False)

    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'hr_watch_rate', 'light_phone_lux', 'mag_phone_x', 'mag_phone_y',
        'mag_phone_z', 'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    time_features = [name for name in dataset.columns if '_temp_' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3,
                                                time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4,
                                                cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the performance over a selection of features
        N_FORWARD_SELECTION = FLAGS.nfeatures
        fs = FeatureSelectionClassification()
        print('\n- - - Running feature selection - - -')
        features, ordered_features, ordered_scores = fs.forward_selection(
            max_features=N_FORWARD_SELECTION,
            X_train=train_X[features_after_chapter_5],
            y_train=train_y)
        DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                        y=[ordered_scores],
                        xlabel='number of features',
                        ylabel='accuracy')

    # Select the most important features (based on python2 features).
    # Defined unconditionally: the 'tree', 'overall' and 'detail' stages below
    # read it even when the selection stage is not run.
    selected_features = [
        'acc_phone_y_freq_0.0_Hz_ws_40',
        'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
        'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted',
        'gyr_phone_y_freq_1.0_Hz_ws_40', 'acc_phone_x_freq_1.9_Hz_ws_40',
        'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'
    ]

    if FLAGS.mode == 'regularization' or FLAGS.mode == 'all':
        print('\n- - - Running regularization and model complexity test - - -')
        # Study the impact of regularization and model complexity: does regularization prevent overfitting?
        # Due to runtime constraints run the experiment 3 times, for even more robust data increase the repetitions
        N_REPEATS_NN = FLAGS.nnrepeat
        reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        performance_training = []
        performance_test = []

        for reg_param in reg_parameters:
            performance_tr = 0
            performance_te = 0
            for _ in range(0, N_REPEATS_NN):
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    train_X,
                    train_y,
                    test_X,
                    hidden_layer_sizes=(250, ),
                    alpha=reg_param,
                    max_iter=500,
                    gridsearch=False)

                performance_tr += evaluation.accuracy(train_y, class_train_y)
                performance_te += evaluation.accuracy(test_y, class_test_y)
            performance_training.append(performance_tr / N_REPEATS_NN)
            performance_test.append(performance_te / N_REPEATS_NN)

        DataViz.plot_xy(x=[reg_parameters, reg_parameters],
                        y=[performance_training, performance_test],
                        method='semilogx',
                        xlabel='regularization parameter value',
                        ylabel='accuracy',
                        ylim=[0.95, 1.01],
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode == 'tree' or FLAGS.mode == 'all':
        print('\n- - - Running leaf size test of decision tree - - -')
        # Consider the influence of certain parameter settings for the tree model. (very related to the
        # regularization) and study the impact on performance.
        leaf_settings = [1, 2, 5, 10]
        performance_training = []
        performance_test = []

        for no_points_leaf in leaf_settings:
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                train_X[selected_features],
                train_y,
                test_X[selected_features],
                min_samples_leaf=no_points_leaf,
                gridsearch=False,
                print_model_details=False)

            performance_training.append(
                evaluation.accuracy(train_y, class_train_y))
            performance_test.append(evaluation.accuracy(test_y, class_test_y))

        DataViz.plot_xy(x=[leaf_settings, leaf_settings],
                        y=[performance_training, performance_test],
                        xlabel='Minimum number of points per leaf',
                        ylabel='Accuracy',
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print(
            '\n- - - Running test of all different classification algorithms - - -'
        )
        # Perform grid searches over the most important parameters and do so by means of cross validation upon the
        # training set
        possible_feature_sets = [
            basic_features, features_after_chapter_3, features_after_chapter_4,
            features_after_chapter_5, selected_features
        ]
        feature_names = [
            'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
            'Selected features'
        ]
        N_KCV_REPEATS = FLAGS.kcvrepeat

        scores_over_all_algs = []

        for feature_name, feature_set in zip(feature_names,
                                             possible_feature_sets):
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # First run non deterministic classifiers a number of times to average their score
            performance_tr_nn, performance_te_nn = 0, 0
            performance_tr_rf, performance_te_rf = 0, 0
            performance_tr_svm, performance_te_svm = 0, 0

            for repeat in range(0, N_KCV_REPEATS):
                print(
                    f'Training NeuralNetwork run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_name} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_nn += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_nn += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training RandomForest run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_name} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_rf += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_rf += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training SVM run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_name} ...'
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.support_vector_machine_with_kernel(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_svm += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_svm += evaluation.accuracy(test_y, class_test_y)

            overall_performance_tr_nn = performance_tr_nn / N_KCV_REPEATS
            overall_performance_te_nn = performance_te_nn / N_KCV_REPEATS
            overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
            overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
            overall_performance_tr_svm = performance_tr_svm / N_KCV_REPEATS
            overall_performance_te_svm = performance_te_svm / N_KCV_REPEATS

            # Run deterministic classifiers:
            print("Deterministic Classifiers:")

            print(
                f'Training Nearest Neighbor run 1 / 1, featureset {feature_name}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_knn = evaluation.accuracy(train_y, class_train_y)
            performance_te_knn = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Decision Tree run 1 / 1  featureset {feature_name}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
            performance_te_dt = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Naive Bayes run 1/1 featureset {feature_name}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
                selected_train_X, train_y, selected_test_X)
            performance_tr_nb = evaluation.accuracy(train_y, class_train_y)
            performance_te_nb = evaluation.accuracy(test_y, class_test_y)

            # One (train, test) pair per algorithm, in the same order as the
            # labels passed to plot_performances_classification below.
            scores_with_sd = util.print_table_row_performances(
                feature_name, len(selected_train_X.index),
                len(selected_test_X.index), [
                    (overall_performance_tr_nn, overall_performance_te_nn),
                    (overall_performance_tr_rf, overall_performance_te_rf),
                    (overall_performance_tr_svm, overall_performance_te_svm),
                    (performance_tr_knn, performance_te_knn),
                    (performance_tr_dt, performance_te_dt),
                    (performance_tr_nb, performance_te_nb)
                ])
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_classification(
            ['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names,
            scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        print(
            '\n- - - Running detail test of promising classification algorithms - - -'
        )
        # Study two promising ones in more detail, namely decision tree and random forest algorithm
        learner.decision_tree(train_X[selected_features],
                              train_y,
                              test_X[selected_features],
                              gridsearch=True,
                              print_model_details=True,
                              export_tree_path=EXPORT_TREE_PATH)

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            train_X[selected_features],
            train_y,
            test_X[selected_features],
            gridsearch=True,
            print_model_details=True)

        # The confusion matrix is built from the random forest predictions.
        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)
plot.show() # Based on the plot we select the top 10 features. ''' selected_features = [ 'acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120', 'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40', 'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40' ] # Let us first study the impact of regularization and model complexity: does regularization prevent overfitting? learner = ClassificationAlgorithms() eval = ClassificationEvaluation() ''' reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10] performance_training = [] performance_test = [] # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. repeats = 20 for reg_param in reg_parameters: performance_tr = 0 performance_te = 0 for i in range(0, repeats): class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(train_X, train_y, test_X, hidden_layer_sizes=(250, ), alpha=reg_param, max_iter=500,
print '#basic features: ', len(basic_features) print '#PCA features: ', len(pca_features) print '#time features: ', len(time_features) print '#frequency features: ', len(freq_features) cluster_features = ['cluster'] print '#cluster features: ', len(cluster_features) features_after_chapter_3 = list(set().union(basic_features, pca_features)) features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features)) features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features)) # First, let us consider the performance over a selection of features: learner = ClassificationAlgorithms() eval = ClassificationEvaluation() # And we study two promising ones in more detail. First let us consider the decision tree which works best with the selected # features. # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5], gridsearch=True, print_model_details=True, export_tree_path=export_tree_path) #class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5], # gridsearch=True, print_model_details=True) test_cm = eval.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns) #DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns, normalize=False)
def experiment(file):
    """Compare random forest and decision tree over several feature sets of one dataset.

    Loads ``file``, splits it 70/30 into train and test data, and for each of
    five feature sets trains a grid-searched random forest (averaged over
    ``N_KCV_REPEATS`` runs) and a grid-searched decision tree. Accuracies are
    logged, confusion matrices are plotted, and a summary plot over all
    feature sets is drawn at the end.

    :param file: Path of the CSV file to load. The text before its first
        ``.`` is split on ``/`` and the second component is used to name the
        figure output, so the path must contain at least one ``/``.
    """
    dataset = pd.read_csv(file, index_col=time_col)
    DataViz = VisualizeDataset(__file__.split('.')[0] +
                               file.split('.')[0].split('/')[1] + '.py',
                               show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)
    prepare = PrepareDatasetForLearning()

    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'],
        'like',
        0.7,
        filter=False,
        temporal=False,
        drop_na=False,
        fill_na=True)

    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']

    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features,
                                  freq_features))
    features_4 = list(set().union(basic_features, time_features,
                                  freq_features, cluster_features))

    # The feature list below is hard-coded; the selection run that would
    # produce it is disabled:
    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
    #                                                                    train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120',
        'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq',
        'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40',
        'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40',
        'acc_y_freq_1.9_Hz_ws_40', 'gyr_x_temp_mean_ws_1200',
        'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40',
        'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40',
        'acc_z_freq_0.6_Hz_ws_40'
    ]

    # NOTE(review): y receives the feature *names* although the axis is
    # labelled 'accuracy'; the scores of the disabled selection run were
    # presumably intended here - confirm before relying on this plot.
    DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                    y=[selected_features],
                    xlabel='number of features',
                    ylabel='accuracy')

    print('feature selection finished for %s' % file)

    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()

    possible_feature_sets = [
        basic_features, features_2, features_3, features_4, selected_features
    ]
    feature_names = [
        'Basic features', 'Features with time', 'Features with frequency',
        'Features with cluster', 'Selected features'
    ]

    N_KCV_REPEATS = 1
    scores_over_all_algs = []

    for i in range(0, len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non deterministic classifiers a number of times to average their score.
        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(0, N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X,
                train_y,
                selected_test_X,
                gridsearch=True,
                print_model_details=True)
            test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                                  class_train_prob_y.columns)

            DataViz.plot_confusion_matrix(test_cm,
                                          class_train_prob_y.columns,
                                          normalize=False)

            performance_tr_rf += evaluation.accuracy(train_y, class_train_y)
            performance_te_rf += evaluation.accuracy(test_y, class_test_y)

            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        # Log the averaged scores, with the training accuracy under the
        # 'train acc' label.
        log([
            'RF' + ' train acc: %f' % overall_performance_tr_rf +
            ' test acc: %f' % overall_performance_te_rf
        ])

        # And we run our deterministic classifiers:
        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X,
            train_y,
            selected_test_X,
            gridsearch=True,
            print_model_details=True)
        performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
        performance_te_dt = evaluation.accuracy(test_y, class_test_y)
        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)

        log([
            'DT' + ' train acc: %f' % performance_tr_dt +
            ' test acc: %f' % performance_te_dt
        ])
        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index),
            len(selected_test_X.index), [
                (overall_performance_tr_rf, overall_performance_te_rf),
                (performance_tr_dt, performance_te_dt),
            ])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())
# Based on the plot we select the top 5 features (note: slightly different compared to Python 2, we use # those features here). selected_features = [ 'rotationRate.z_temp_std_ws_180', 'userAcceleration.z_temp_std_ws_180', 'gravity.x_freq_0.0_Hz_ws_100', 'userAcceleration.y_freq_0.0_Hz_ws_100', 'gravity.x_freq_2.7_Hz_ws_100', ] # Let us first study the impact of regularization and model complexity: does regularization prevent overfitting? learner = ClassificationAlgorithms() eval = ClassificationEvaluation() reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10] performance_training = [] performance_test = [] # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. # N_REPEATS_NN = 20 # # for reg_param in reg_parameters: # performance_tr = 0 # performance_te = 0 # for i in range(0, N_REPEATS_NN): # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network( # train_X, train_y, # test_X, hidden_layer_sizes=(250,), alpha=reg_param, max_iter=500,
# Based on the plot we select the top 10 features (note: slightly different compared to Python 2, we use # those features here). selected_features = ['gyr_phone_Z_freq_2.0_Hz_ws_16', 'gyr_phone_X_freq_1.0_Hz_ws_16', 'mag_phone_Z_freq_0.75_Hz_ws_16', 'mag_phone_Y_freq_0.75_Hz_ws_16', 'pca_1_temp_std_ws_16', 'acc_phone_Z_temp_mean_ws_16', 'acc_phone_X_freq_0.0_Hz_ws_16', 'pca_3_temp_std_ws_16', 'mag_phone_X_freq_1.25_Hz_ws_16', 'mag_phone_Z_freq_0.25_Hz_ws_16'] # selected_features = ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', # 'gyr_phone_x_temp_std_ws_120', # 'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', # 'gyr_phone_y_freq_1.0_Hz_ws_40', # 'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'] # # # Let us first study the impact of regularization and model complexity: does regularization prevent overfitting? # learner = ClassificationAlgorithms() eval = ClassificationEvaluation() reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10] performance_training = [] performance_test = [] # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. N_REPEATS_NN = 20 for reg_param in reg_parameters: performance_tr = 0 performance_te = 0 for i in range(0, N_REPEATS_NN): class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network( train_X, train_y, test_X, hidden_layer_sizes=(250,), alpha=reg_param, max_iter=500,