Example #1
import unittest
from data.data_reader import get_training_data
from data.data_combinator import get_full_combinations


# Illustrative TestCase wrapper; the original snippet omits the surrounding class.
class DataCombinatorTest(unittest.TestCase):
    def test_get_combinations(self):
        x_train, _, _, _ = get_training_data()
        x_train = get_full_combinations(x_train)

        self.assertIsNotNone(x_train)
        self.assertEqual(5500, len(x_train))
        self.assertEqual(22, len(x_train.columns))
Example #2
def get_data(job_name):
    # The data, shuffled and split into train and validation sets.
    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    x_train = get_full_combinations(x_train)
    x_val = get_full_combinations(x_val)
    y_val = y_val.reset_index(drop=True)

    input_shape = (len(x_train.columns), )

    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_val.shape[0], 'validation samples')

    return SupervisedData(job_name, x_train, y_train, x_val, y_val,
                          input_shape)
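

# A usage example (hedged): the job name below is only an illustrative placeholder;
# SupervisedData is assumed to simply hold the fields it is constructed with.
if __name__ == '__main__':
    supervised_data = get_data('example_job')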
Example #3
                       color=cmaps(index))

    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')


# A usage example.
if __name__ == '__main__':
    from data.data_reader import get_training_data
    from data.data_combinator import get_full_combinations
    from stats.regression_calculator import get_ridge_regression, get_logistic_regression, get_lasso_regression, \
        get_linear_discriminant_analysis, get_quadratic_discriminant_analysis, get_naive_bayes, get_random_forest, \
        get_select_more_follower_count

    alpha = 0.062

    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    original_x_val = x_val.copy()
    x_train = get_full_combinations(x_train)
    x_val = get_full_combinations(x_val)

    title = 'Select more follower count'
    print(title)
    y_prediction = get_select_more_follower_count(x_train, y_train,
                                                  original_x_val)
    accuracy, f1_score, AUC = evaluate_predictions(
        y_val,
        y_prediction,
        title=title,
        confusion_matrix_plotting=True,
        roc_curve_plotting=True)
    print('accuracy:{}'.format(accuracy))
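
    # Hedged continuation: the example imports get_ridge_regression and defines
    # alpha above but stops before using them; the call below assumes the
    # (x_train, y_train, x_val, alpha) signature shown in Example #5.
    title = 'Ridge regression'
    print(title)
    y_prediction = get_ridge_regression(x_train, y_train, x_val, alpha)
    accuracy, f1_score, AUC = evaluate_predictions(
        y_val,
        y_prediction,
        title=title,
        confusion_matrix_plotting=True,
        roc_curve_plotting=True)
    print('accuracy:{}'.format(accuracy))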
Example #4
            x_copy.iloc[:, i] / (x_copy.iloc[:, i + variable_number] + 1e-20))

    # Use only combined columns.
    combined_x = x_copy.iloc[:, column_number:]
    combined_columns = combined_x.columns

    # Transform features by scaling each feature to a given range.
    scaler = MinMaxScaler(feature_range=(0, 1))
    combined_x = scaler.fit_transform(combined_x)
    combined_x = pd.DataFrame(data=combined_x, columns=combined_columns)

    return combined_x


# A usage example
if __name__ == '__main__':
    from data.data_reader import get_training_data

    x_train, y_train, _, _ = get_training_data()
    full_combined_x_train = get_full_combinations(x_train)
    print(full_combined_x_train.head())
    print('-' * 70)

    sub_combined_x_train = get_sub_combinations(x_train)
    print(sub_combined_x_train.head())
    print('-' * 70)

    div_combined_x_train = get_div_combinations(x_train)
    print(div_combined_x_train.head())
    print('-' * 70)
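
    # Hedged addition: get_log_div_combinations is used in Example #6 and is
    # assumed to be defined in this same module; shown under the same pattern.
    log_div_combined_x_train = get_log_div_combinations(x_train)
    print(log_div_combined_x_train.head())
    print('-' * 70)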
Example #5
def draw_regression_comparison_graph(from_alpha, to_alpha, step, combination_function=get_full_combinations):
    """
    Plot the AUC (Area Under the Curve) of all regression methods over a range of alpha values.

    :param from_alpha: (float) from_alpha must be greater than 0.
    :param to_alpha: (float) to_alpha must be greater than from_alpha.
    :param step: (float) The size of one step.
    :param combination_function: (function) The function getting combined data.
    """
    assert 0 < from_alpha
    assert from_alpha < to_alpha

    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    x_train = combination_function(x_train)
    original_x_val = x_val.copy()
    x_val = combination_function(x_val)

    # Dictionary for saving results.
    evaluation_results_dict = {ALPHA: []}
    for auc in REGRESSION_COMPARISON_AUCS:
        evaluation_results_dict[auc] = []

    # Logistic regression
    logistic_y_prediction = get_logistic_regression(x_train, y_train, x_val)
    _, _, logistic_auc = evaluate_predictions(y_val, logistic_y_prediction)

    # LDA
    lda_y_prediction = get_linear_discriminant_analysis(x_train, y_train, x_val)
    _, _, lda_auc = evaluate_predictions(y_val, lda_y_prediction)

    # QDA
    qda_y_prediction = get_quadratic_discriminant_analysis(x_train, y_train, x_val)
    _, _, qda_auc = evaluate_predictions(y_val, qda_y_prediction)

    # GNB
    gnb_y_prediction = get_naive_bayes(x_train, y_train, x_val)
    _, _, gnb_auc = evaluate_predictions(y_val, gnb_y_prediction)

    # RF
    rf_y_prediction = get_random_forest(x_train, y_train, x_val)
    _, _, rf_auc = evaluate_predictions(y_val, rf_y_prediction)

    # SMFC
    smfc_y_prediction = get_select_more_follower_count(x_train, y_train, original_x_val)
    _, _, smfc_auc = evaluate_predictions(y_val, smfc_y_prediction)

    for alpha in np.arange(from_alpha, to_alpha, step):
        # Ridge regression
        ridge_y_prediction = get_ridge_regression(x_train, y_train, x_val, alpha)
        _, _, ridge_auc = evaluate_predictions(y_val, ridge_y_prediction)

        # Lasso regression
        lasso_y_prediction = get_lasso_regression(x_train, y_train, x_val, alpha)
        _, _, lasso_auc = evaluate_predictions(y_val, lasso_y_prediction)

        # Save index values.
        evaluation_results_dict[ALPHA].append(alpha)

        # Save results of logistic regression.
        evaluation_results_dict[LOGISTIC_AUC].append(logistic_auc)

        # Save results of ridge regression.
        evaluation_results_dict[RIDGE_AUC].append(ridge_auc)

        # Save results of lasso regression.
        evaluation_results_dict[LASSO_AUC].append(lasso_auc)

        # Save results of lda.
        evaluation_results_dict[LDA_AUC].append(lda_auc)

        # Save results of qda.
        evaluation_results_dict[QDA_AUC].append(qda_auc)

        # Save results of gnb.
        evaluation_results_dict[GNB_AUC].append(gnb_auc)

        # Save results of rf.
        evaluation_results_dict[RF_AUC].append(rf_auc)

        # Save results of smfc.
        evaluation_results_dict[SMFC_AUC].append(smfc_auc)

    evaluation_results_df = pd.DataFrame(data=evaluation_results_dict)

    # Print the peak AUC points.
    for auc in REGRESSION_COMPARISON_AUCS:
        highest_auc_row = evaluation_results_df.loc[evaluation_results_df[auc].idxmax(), [auc, ALPHA]]
        print('The highest {}={} when alpha={}'.format(auc, highest_auc_row[auc], highest_auc_row[ALPHA]))

    evaluation_results_df = evaluation_results_df.set_index([ALPHA])
    evaluation_results_df.plot(title='Logistic vs. Ridge vs. Lasso vs. GNB vs. LDA vs. QDA vs. RF vs. SMFC',
                               grid=True, ylim=(0.0, 1))
    plt.show()
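

# A usage example (hedged): the alpha range and step below are illustrative
# values chosen to satisfy the docstring constraints (0 < from_alpha < to_alpha).
if __name__ == '__main__':
    draw_regression_comparison_graph(0.01, 1.0, 0.05)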
Example #6
def draw_combination_comparison_graph(regression_function, function_name, from_alpha, to_alpha, step):
    """

    :param regression_function: The regression function you want to see the graph.
    :param function_name: The name of the function.
    :param from_alpha: (float) from_alpha must be bigger than 0.
    :param to_alpha: (float) to_alpha must be bigger than from_alpha
    :param step: (float) The size of one step.
    """
    assert 0 < from_alpha
    assert from_alpha < to_alpha

    x_train, y_train, x_val, y_val = get_training_data(validation=True)
    full_combined_x_train = get_full_combinations(x_train)
    full_combined_x_val = get_full_combinations(x_val)
    self_full_combined_x_train = get_self_combinations(x_train, get_full_combinations)
    self_full_combined_x_val = get_self_combinations(x_val, get_full_combinations)
    sub_combined_x_train = get_sub_combinations(x_train)
    sub_combined_x_val = get_sub_combinations(x_val)
    div_combined_x_train = get_div_combinations(x_train)
    div_combined_x_val = get_div_combinations(x_val)
    log_div_combined_x_train = get_log_div_combinations(x_train)
    log_div_combined_x_val = get_log_div_combinations(x_val)
    # The original calls get_log_div_combinations here as well, which looks like a
    # copy-paste slip; get_root_div_combinations is the presumed intended helper.
    root_div_combined_x_train = get_root_div_combinations(x_train)
    root_div_combined_x_val = get_root_div_combinations(x_val)

    # Dictionary for saving results.
    evaluation_results_dict = {ALPHA: []}
    for auc in COMBINATION_COMPARISON_AUCS:
        evaluation_results_dict[auc] = []

    for alpha in np.arange(from_alpha, to_alpha, step):
        # Full combined ridge regression
        full_combined_ridge_y_prediction = \
            regression_function(full_combined_x_train, y_train, full_combined_x_val, alpha)
        _, _, full_combined_auc = evaluate_predictions(y_val, full_combined_ridge_y_prediction)

        # Self full combined ridge regression
        self_full_combined_ridge_y_prediction = \
            regression_function(self_full_combined_x_train, y_train, self_full_combined_x_val, alpha)
        _, _, self_full_combined_auc = evaluate_predictions(y_val, self_full_combined_ridge_y_prediction)

        # Sub combined ridge regression
        sub_combined_ridge_y_prediction = \
            regression_function(sub_combined_x_train, y_train, sub_combined_x_val, alpha)
        _, _, sub_combined_auc = evaluate_predictions(y_val, sub_combined_ridge_y_prediction)

        # Div combined ridge regression
        div_combined_ridge_y_prediction = \
            regression_function(div_combined_x_train, y_train, div_combined_x_val, alpha)
        _, _, div_combined_auc = evaluate_predictions(y_val, div_combined_ridge_y_prediction)

        # Log div combined ridge regression
        log_div_combined_ridge_y_prediction = \
            regression_function(log_div_combined_x_train, y_train, log_div_combined_x_val, alpha)
        _, _, log_div_combined_auc = evaluate_predictions(y_val, log_div_combined_ridge_y_prediction)

        # Root div combined ridge regression
        root_div_combined_ridge_y_prediction = \
            regression_function(root_div_combined_x_train, y_train, root_div_combined_x_val, alpha)
        _, _, root_div_combined_auc = evaluate_predictions(y_val, root_div_combined_ridge_y_prediction)

        # Save index values.
        evaluation_results_dict[ALPHA].append(alpha)

        # Save results of full combined ridge regression.
        evaluation_results_dict[FULL_COMBINED_AUC].append(full_combined_auc)

        # Save results of self full combined ridge regression.
        evaluation_results_dict[SELF_FULL_COMBINED_AUC].append(self_full_combined_auc)

        # Save results of sub combined ridge regression.
        evaluation_results_dict[SUB_COMBINED_AUC].append(sub_combined_auc)

        # Save results of div combined ridge regression.
        evaluation_results_dict[DIV_COMBINED_AUC].append(div_combined_auc)

        # Save results of log div combined ridge regression.
        evaluation_results_dict[LOG_DIV_COMBINED_AUC].append(log_div_combined_auc)

        # Save results of root div combined ridge regression.
        evaluation_results_dict[ROOT_DIV_COMBINED_AUC].append(root_div_combined_auc)

    evaluation_results_df = pd.DataFrame(data=evaluation_results_dict)

    evaluation_results_df = evaluation_results_df.set_index([ALPHA])
    evaluation_results_df.plot(
        title='{}: Full vs. SelfFull vs. Sub vs. Div vs. LogDiv vs. RootDiv'.format(function_name),
        grid=True,
        ylim=(0.0, 1)
    )
    plt.show()
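

# A usage example (hedged): get_ridge_regression and its
# (x_train, y_train, x_val, alpha) signature are taken from Example #5;
# the label and alpha range below are illustrative.
if __name__ == '__main__':
    draw_combination_comparison_graph(get_ridge_regression, 'Ridge regression',
                                      0.01, 1.0, 0.05)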