def test_get_combinations(self):
    x_train, _, _, _ = get_training_data()
    x_train = get_full_combinations(x_train)
    self.assertIsNotNone(x_train)
    self.assertEqual(5500, len(x_train))
    self.assertEqual(22, len(x_train.columns))
def get_data(job_name):
    # The data, shuffled and split between train and validation sets.
    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    x_train = get_full_combinations(x_train)
    x_val = get_full_combinations(x_val)
    y_val = y_val.reset_index(drop=True)
    input_shape = (len(x_train.columns),)
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_val.shape[0], 'validation samples')
    return SupervisedData(job_name, x_train, y_train, x_val, y_val, input_shape)
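
# A minimal sketch of how get_data might be consumed, assuming SupervisedData
# is a simple container whose attribute names match the constructor-argument
# order above; the job name 'example_job' is hypothetical.
if __name__ == '__main__':
    data = get_data('example_job')
    print(data.job_name)
    print(data.input_shape)  # e.g. (22,) when the full combinations yield 22 columns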
             color=cmaps(index))
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')


# A usage example.
if __name__ == '__main__':
    from data.data_reader import get_training_data
    from data.data_combinator import get_full_combinations
    from stats.regression_calculator import get_ridge_regression, get_logistic_regression, get_lasso_regression, \
        get_linear_discriminant_analysis, get_quadratic_discriminant_analysis, get_naive_bayes, get_random_forest, \
        get_select_more_follower_count

    alpha = 0.062
    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    original_x_val = x_val.copy()
    x_train = get_full_combinations(x_train)
    x_val = get_full_combinations(x_val)

    title = 'Select more follower count'
    print(title)
    y_prediction = get_select_more_follower_count(x_train, y_train, original_x_val)
    accuracy, f1_score, AUC = evaluate_predictions(
        y_val, y_prediction, title=title, confusion_matrix_plotting=True, roc_curve_plotting=True)
    print('accuracy: {}'.format(accuracy))
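
# A minimal sketch of one plausible implementation of the SMFC baseline used
# above, inferred from its name only; the real get_select_more_follower_count
# and the column names 'follower_count_a'/'follower_count_b' are assumptions.
def select_more_follower_count_sketch(x_val):
    # Predict 1 when user A has more followers than user B, else 0.
    return (x_val['follower_count_a'] > x_val['follower_count_b']).astype(float)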
            x_copy.iloc[:, i] / (x_copy.iloc[:, i + variable_number] + 1e-20))

    # Use only combined columns.
    combined_x = x_copy.iloc[:, column_number:]
    combined_columns = combined_x.columns

    # Transform features by scaling each feature to a given range.
    scaler = MinMaxScaler(feature_range=(0, 1))
    combined_x = scaler.fit_transform(combined_x)
    combined_x = pd.DataFrame(data=combined_x, columns=combined_columns)
    return combined_x


# A usage example.
if __name__ == '__main__':
    from data.data_reader import get_training_data

    x_train, y_train, _, _ = get_training_data()

    full_combined_x_train = get_full_combinations(x_train)
    print(full_combined_x_train.head())
    print('-' * 70)

    sub_combined_x_train = get_sub_combinations(x_train)
    print(sub_combined_x_train.head())
    print('-' * 70)

    div_combined_x_train = get_div_combinations(x_train)
    print(div_combined_x_train.head())
    print('-' * 70)
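    # A standalone illustration of the MinMaxScaler step used in the
    # combinators above: each column is rescaled independently so its minimum
    # maps to 0 and its maximum maps to 1. The toy values below are made up.
    toy = pd.DataFrame({'a_div_b': [1.0, 3.0, 5.0], 'c_div_d': [10.0, 20.0, 40.0]})
    toy_scaler = MinMaxScaler(feature_range=(0, 1))
    print(pd.DataFrame(toy_scaler.fit_transform(toy), columns=toy.columns))
    # Expected columns: a_div_b -> 0.0, 0.5, 1.0 ; c_div_d -> 0.0, 0.333..., 1.0
    print('-' * 70)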
def draw_regression_comparison_graph(from_alpha, to_alpha, step, combination_function=get_full_combinations):
    """
    Plot the AUC (Area Under the Curve) of all regression methods over the given alpha range.

    :param from_alpha: (float) from_alpha must be bigger than 0.
    :param to_alpha: (float) to_alpha must be bigger than from_alpha.
    :param step: (float) The size of one step.
    :param combination_function: (function) The function getting combined data.
    """
    assert 0 < from_alpha
    assert from_alpha < to_alpha

    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    x_train = combination_function(x_train)
    original_x_val = x_val.copy()
    x_val = combination_function(x_val)

    # Dictionary for saving results.
    evaluation_results_dict = {ALPHA: []}
    for auc in REGRESSION_COMPARISON_AUCS:
        evaluation_results_dict[auc] = []

    # The following models do not depend on alpha, so evaluate them once
    # outside the loop; their AUCs appear as horizontal lines in the plot.

    # Logistic regression
    logistic_y_prediction = get_logistic_regression(x_train, y_train, x_val)
    _, _, logistic_auc = evaluate_predictions(y_val, logistic_y_prediction)

    # LDA
    lda_y_prediction = get_linear_discriminant_analysis(x_train, y_train, x_val)
    _, _, lda_auc = evaluate_predictions(y_val, lda_y_prediction)

    # QDA
    qda_y_prediction = get_quadratic_discriminant_analysis(x_train, y_train, x_val)
    _, _, qda_auc = evaluate_predictions(y_val, qda_y_prediction)

    # GNB
    gnb_y_prediction = get_naive_bayes(x_train, y_train, x_val)
    _, _, gnb_auc = evaluate_predictions(y_val, gnb_y_prediction)

    # RF
    rf_y_prediction = get_random_forest(x_train, y_train, x_val)
    _, _, rf_auc = evaluate_predictions(y_val, rf_y_prediction)

    # SMFC
    smfc_y_prediction = get_select_more_follower_count(x_train, y_train, original_x_val)
    _, _, smfc_auc = evaluate_predictions(y_val, smfc_y_prediction)

    for alpha in np.arange(from_alpha, to_alpha, step):
        # Ridge regression
        ridge_y_prediction = get_ridge_regression(x_train, y_train, x_val, alpha)
        _, _, ridge_auc = evaluate_predictions(y_val, ridge_y_prediction)

        # Lasso regression
        lasso_y_prediction = get_lasso_regression(x_train, y_train, x_val, alpha)
        _, _, lasso_auc = evaluate_predictions(y_val, lasso_y_prediction)

        # Save index values.
        evaluation_results_dict[ALPHA].append(alpha)
        # Save the results of each model.
        evaluation_results_dict[LOGISTIC_AUC].append(logistic_auc)
        evaluation_results_dict[RIDGE_AUC].append(ridge_auc)
        evaluation_results_dict[LASSO_AUC].append(lasso_auc)
        evaluation_results_dict[LDA_AUC].append(lda_auc)
        evaluation_results_dict[QDA_AUC].append(qda_auc)
        evaluation_results_dict[GNB_AUC].append(gnb_auc)
        evaluation_results_dict[RF_AUC].append(rf_auc)
        evaluation_results_dict[SMFC_AUC].append(smfc_auc)

    evaluation_results_df = pd.DataFrame(data=evaluation_results_dict)

    # Print peak points.
    for auc in REGRESSION_COMPARISON_AUCS:
        highest_auc_row = evaluation_results_df.loc[evaluation_results_df[auc].idxmax(), [auc, ALPHA]]
        print('The highest {}={} when alpha={}'.format(auc, highest_auc_row[auc], highest_auc_row[ALPHA]))

    evaluation_results_df = evaluation_results_df.set_index([ALPHA])
    evaluation_results_df.plot(title='Logistic vs. Ridge vs. Lasso vs. GNB vs. LDA vs. QDA vs. SMFC',
                               grid=True, ylim=(0.0, 1))
    plt.show()
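
# For reference, a minimal sketch of what a regression function used above
# might look like, assuming it wraps sklearn's Ridge and returns continuous
# scores usable for AUC; the project's real get_ridge_regression in
# stats.regression_calculator may differ.
def _ridge_regression_sketch(x_train, y_train, x_val, alpha):
    from sklearn.linear_model import Ridge

    model = Ridge(alpha=alpha)
    model.fit(x_train, y_train)
    # Continuous predictions: evaluate_predictions can threshold them for
    # accuracy/F1 and use them directly for AUC.
    return pd.Series(model.predict(x_val))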
def draw_combination_comparison_graph(regression_function, function_name, from_alpha, to_alpha, step):
    """
    Plot the AUC of the given regression function for each feature-combination method over the alpha range.

    :param regression_function: The regression function whose graph you want to see.
    :param function_name: The name of the function.
    :param from_alpha: (float) from_alpha must be bigger than 0.
    :param to_alpha: (float) to_alpha must be bigger than from_alpha.
    :param step: (float) The size of one step.
    """
    assert 0 < from_alpha
    assert from_alpha < to_alpha

    x_train, y_train, x_val, y_val = get_training_data(validation=True)

    full_combined_x_train = get_full_combinations(x_train)
    full_combined_x_val = get_full_combinations(x_val)
    self_full_combined_x_train = get_self_combinations(x_train, get_full_combinations)
    self_full_combined_x_val = get_self_combinations(x_val, get_full_combinations)
    sub_combined_x_train = get_sub_combinations(x_train)
    sub_combined_x_val = get_sub_combinations(x_val)
    div_combined_x_train = get_div_combinations(x_train)
    div_combined_x_val = get_div_combinations(x_val)
    log_div_combined_x_train = get_log_div_combinations(x_train)
    log_div_combined_x_val = get_log_div_combinations(x_val)
    root_div_combined_x_train = get_root_div_combinations(x_train)
    root_div_combined_x_val = get_root_div_combinations(x_val)

    # Dictionary for saving results.
    evaluation_results_dict = {ALPHA: []}
    for auc in COMBINATION_COMPARISON_AUCS:
        evaluation_results_dict[auc] = []

    for alpha in np.arange(from_alpha, to_alpha, step):
        # Full combined regression
        full_combined_y_prediction = \
            regression_function(full_combined_x_train, y_train, full_combined_x_val, alpha)
        _, _, full_combined_auc = evaluate_predictions(y_val, full_combined_y_prediction)

        # Self full combined regression
        self_full_combined_y_prediction = \
            regression_function(self_full_combined_x_train, y_train, self_full_combined_x_val, alpha)
        _, _, self_full_combined_auc = evaluate_predictions(y_val, self_full_combined_y_prediction)

        # Sub combined regression
        sub_combined_y_prediction = \
            regression_function(sub_combined_x_train, y_train, sub_combined_x_val, alpha)
        _, _, sub_combined_auc = evaluate_predictions(y_val, sub_combined_y_prediction)

        # Div combined regression
        div_combined_y_prediction = \
            regression_function(div_combined_x_train, y_train, div_combined_x_val, alpha)
        _, _, div_combined_auc = evaluate_predictions(y_val, div_combined_y_prediction)

        # Log div combined regression
        log_div_combined_y_prediction = \
            regression_function(log_div_combined_x_train, y_train, log_div_combined_x_val, alpha)
        _, _, log_div_combined_auc = evaluate_predictions(y_val, log_div_combined_y_prediction)

        # Root div combined regression
        root_div_combined_y_prediction = \
            regression_function(root_div_combined_x_train, y_train, root_div_combined_x_val, alpha)
        _, _, root_div_combined_auc = evaluate_predictions(y_val, root_div_combined_y_prediction)

        # Save index values.
        evaluation_results_dict[ALPHA].append(alpha)
        # Save the results of each combination method.
        evaluation_results_dict[FULL_COMBINED_AUC].append(full_combined_auc)
        evaluation_results_dict[SELF_FULL_COMBINED_AUC].append(self_full_combined_auc)
        evaluation_results_dict[SUB_COMBINED_AUC].append(sub_combined_auc)
        evaluation_results_dict[DIV_COMBINED_AUC].append(div_combined_auc)
        evaluation_results_dict[LOG_DIV_COMBINED_AUC].append(log_div_combined_auc)
        evaluation_results_dict[ROOT_DIV_COMBINED_AUC].append(root_div_combined_auc)

    evaluation_results_df = pd.DataFrame(data=evaluation_results_dict)
    evaluation_results_df = evaluation_results_df.set_index([ALPHA])
    evaluation_results_df.plot(
        title='{}: Full vs. SelfFull vs. Sub vs. Div vs. LogDiv vs. RootDiv'.format(function_name),
        grid=True, ylim=(0.0, 1)
    )
    plt.show()
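
# A usage example, assuming get_ridge_regression is in scope as used above;
# the alpha range values here are illustrative only.
if __name__ == '__main__':
    draw_combination_comparison_graph(get_ridge_regression, 'Ridge regression', 0.01, 1.0, 0.01)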