Exemplo n.º 1
0
def modelPrompt(x_train, y_train, x_standard, y, classifier, name):
    """Ask for a feature budget, run model-based selection, and plot a curve.

    Keeps prompting on stdin until the user enters a positive integer. That
    integer is handed to ``feature_selection.modelFeatSelect`` together with
    the training data and ``classifier``; the returned result is printed.
    When the result is not empty, a ``LogisticRegression(C=1)`` is fitted on
    the selected training columns and a learning curve is plotted over the
    selected columns of ``x_standard``.

    Args:
        x_train: Training features; indexed with a list of feature names
            (presumably a pandas DataFrame -- confirm against caller).
        y_train: Training targets.
        x_standard: Full standardized feature set; indexed the same way as
            ``x_train``.
        y: Targets matching ``x_standard``.
        classifier: Estimator forwarded to ``modelFeatSelect``.
        name: Label inserted into the plot title.

    Returns:
        None.
    """
    while True:
        raw_value = input(
            "Please choose the maximum number of features to select: ")
        try:
            max_features = int(raw_value)
        except ValueError:
            print("The input you entered is not an integer. Please try again.")
            continue

        if max_features <= 0:
            print(
                "The integer you entered is not positive. Please try again."
            )
            continue

        selection = feature_selection.modelFeatSelect(
            x_train, y_train, classifier, max_features)
        print(selection)

        # A valid number was processed, so the prompt loop is finished
        # whether or not any feature was actually selected.
        if selection.empty:
            return

        chosen = list(selection['Feature'])
        plot_title = "Learning Curves (" + name + ", log regression model)"
        full_subset = x_standard[chosen]
        train_subset = x_train[chosen]

        # The learning curve always uses logistic regression, independent of
        # the estimator that drove the selection.
        model = LogisticRegression(C=1)
        model.fit(train_subset, y_train)
        evaluate_model.plot_learning_curve(
            model, plot_title, full_subset, y, cv=5)
        return
Exemplo n.º 2
0
def uniPrompt(x_train, y_train, x_standard, y):
    """Ask for a feature count, run univariate selection, and plot a curve.

    Keeps prompting on stdin until the user enters a positive integer. That
    integer is passed to ``feature_selection.univariance`` with the training
    data and the returned result is printed. If at least one feature was
    selected, a ``LogisticRegression(C=1)`` is fitted on the selected
    training columns and a learning curve is plotted over the selected
    columns of ``x_standard``.

    Args:
        x_train: Training features; indexed with a list of feature names
            (presumably a pandas DataFrame -- confirm against caller).
        y_train: Training targets.
        x_standard: Full standardized feature set; indexed the same way as
            ``x_train``.
        y: Targets matching ``x_standard``.

    Returns:
        None.
    """
    print("Starting Univariance Feature Selection: ")
    while True:
        value = input("Please choose the number of features to select: ")
        try:
            k_value = int(value)
        except ValueError:
            print("The input you entered is not an integer. Please try again.")
            continue

        if k_value <= 0:
            print(
                "The integer you entered is not positive. Please try again."
            )
            continue

        result = feature_selection.univariance(x_train, y_train, k_value)
        print(result)

        features = list(result['Feature'])
        # Skip fitting when nothing was selected: a model cannot be trained
        # on zero columns. This mirrors the empty-result guards used by the
        # other selection prompts in this file.
        if features:
            title = "Learning Curves (univariance, log regression model)"
            x_selected = x_standard[features]
            x_train_selected = x_train[features]

            logreg = LogisticRegression(C=1)
            logreg.fit(x_train_selected, y_train)
            evaluate_model.plot_learning_curve(logreg,
                                               title,
                                               x_selected,
                                               y,
                                               cv=5)

        break
Exemplo n.º 3
0
def evalModelPrompt(x_train, x_test, y_train, y_test, x_standard, y,
                    classifier, name):
    """Find the best feature subset for ``classifier`` and plot its curve.

    Delegates the search to ``feature_selection.getBestK``, refits
    ``classifier`` on the chosen training columns, and plots a learning curve
    over the same columns of ``x_standard``.

    Args:
        x_train: Training features; indexed with a list of feature names
            (presumably a pandas DataFrame -- confirm against caller).
        x_test: Test features, forwarded to ``getBestK``.
        y_train: Training targets.
        y_test: Test targets, forwarded to ``getBestK``.
        x_standard: Full standardized feature set; indexed the same way as
            ``x_train``.
        y: Targets matching ``x_standard``.
        classifier: Estimator to evaluate; it is refitted in place.
        name: Label used in the progress message and the plot title.

    Returns:
        None.
    """
    print("Find the best number of features using " + name + "... ")
    best_features = feature_selection.getBestK(
        x_train, x_test, y_train, y_test, classifier, name)

    plot_title = "Learning Curves with " + name + " (use selected features)"
    full_subset = x_standard[list(best_features)]
    train_subset = x_train[list(best_features)]

    classifier.fit(train_subset, y_train)
    evaluate_model.plot_learning_curve(
        classifier, plot_title, full_subset, y, cv=5)
Exemplo n.º 4
0
def RFEPrompt(x_train, y_train, x_standard, y, classifier, name):
    """Run RFE-based feature selection and plot a learning curve.

    Calls ``feature_selection.RFEFeatSelect`` with the training data and
    ``classifier`` and prints the result. When the result is not empty,
    ``classifier`` is refitted on the selected training columns and a
    learning curve is plotted over the same columns of ``x_standard``.

    Args:
        x_train: Training features; indexed with a list of feature names
            (presumably a pandas DataFrame -- confirm against caller).
        y_train: Training targets.
        x_standard: Full standardized feature set; indexed the same way as
            ``x_train``.
        y: Targets matching ``x_standard``.
        classifier: Estimator used for selection; it is refitted in place.
        name: Label inserted into the plot title.

    Returns:
        None.
    """
    print("Starting RFE: ")

    selection = feature_selection.RFEFeatSelect(x_train, y_train, classifier)
    print(selection)

    # Nothing selected means there is nothing to fit or plot.
    if selection.empty:
        return

    chosen = list(selection['Feature'])
    plot_title = "Learning Curves with " + name + " (use current feature subset)"
    full_subset = x_standard[chosen]
    train_subset = x_train[chosen]

    classifier.fit(train_subset, y_train)
    evaluate_model.plot_learning_curve(
        classifier, plot_title, full_subset, y, cv=5)
Exemplo n.º 5
0
def main():
    """Drive the interactive, console-based feature selection workflow.

    Steps, in order:

    1. Prompt until the user supplies a path to an existing ``.csv`` file and
       load it with ``pd.read_csv(..., index_col=0)``.
    2. Prompt until the user names a column of that frame, then split it into
       features and target with ``preprocessing.separateVars``.
    3. Encode the target (``encodePrompt``), clean and standardize the
       features, visualize them, offer a transformation, and standardize
       again.
    4. If the user answers ``'y'``, split the data (30% held out,
       ``random_state=0``), compare either classifiers or regression models,
       plot their learning curves, and finally run one feature selection
       method chosen by the user.

    The ``RFE`` and ``optimal_classifier`` methods need the two classifiers
    produced by the classification step. If that step was not run (the user
    chose regression or typed an unrecognized answer), selecting one of those
    methods prints a message and re-prompts instead of failing on an unbound
    variable.

    Returns:
        None.
    """
    print("Begin Operation... ")
    print(
        "Notice: Non-numeric data besides the target class (can be categorical) will be ignored... "
    )

    # get path to csv file
    csv_path = None
    while True:
        csv_path = input("Please provide a path to a csv file: ")
        if not os.path.exists(csv_path):
            print("The path that you provided is incorrect. Please try again.")
        elif not os.path.isfile(csv_path):
            print(
                "The path that you provided is not a file. Please try again.")
        elif not csv_path.endswith('.csv'):
            print(
                "The path that you provided is not a csv file. Please try again."
            )
        else:
            break
    df = pd.read_csv(csv_path, index_col=0)

    # get target class
    while True:
        target_name = input("Please provide the name of the target class: ")
        # Membership on the column index states the intent directly; the
        # column read as the index (index_col=0) is not a valid target.
        if target_name in df.columns:
            x, y = preprocessing.separateVars(df, target_name)
            break
        else:
            print(
                "The name that you provided doesn't exist in this csv file. Please try again."
            )

    # get target class encoding
    y = encodePrompt(y, target_name)

    # remove non numeric data from feature columns
    x = preprocessing.cleanData(x)
    print("Non-numeric columns and columns with missing values removed.")
    # normalize data
    x_standard = preprocessing.standardization(x)
    print("Data Normalized (z-score).")

    # visualize data
    visualPrompt(x_standard, y)

    # prompt for feature transformation
    x_standard, y = transformationPrompt(x_standard, y)
    x_standard = preprocessing.standardization(x_standard)

    answer = input("Would you like to perform feature selection? (y/n) ")
    if answer == 'y':
        # split dataset to train and test
        x_train, x_test, y_train, y_test = train_test_split(x_standard,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=0)

        # Only the classification branch produces these; they stay None
        # otherwise so the selection loop below can detect their absence.
        classifier1 = None
        classifier2 = None

        method = input(
            "Would you like to perform classification or regression? (c/r) ")
        if method == 'c':
            # find optimal classifier
            print("Begin to test classifiers... ")
            classifier1, name1, classifier2, name2 = feature_selection.optimalClassifier(
                x_train, x_test, y_train, y_test)

            # plot learning curve of each selected classifier
            title = "Learning Curves (" + name1 + ")"
            evaluate_model.plot_learning_curve(classifier1,
                                               title,
                                               x_standard,
                                               y,
                                               cv=5)

            title = "Learning Curves (" + name2 + ")"
            evaluate_model.plot_learning_curve(classifier2,
                                               title,
                                               x_standard,
                                               y,
                                               cv=5)

            answer = input(
                "Do you want to find the best number of feature using " +
                name1 + "? (y/n) ")
            if answer == 'y':
                evalModelPrompt(x_train, x_test, y_train, y_test, x_standard,
                                y, classifier1, name1)
            else:
                print("Skip for " + name1 + ".")

            answer = input(
                "Do you want to find the best number of feature using " +
                name2 + "? (y/n) ")
            if answer == 'y':
                evalModelPrompt(x_train, x_test, y_train, y_test, x_standard,
                                y, classifier2, name2)
            else:
                print("Skip for " + name2 + ".")
        elif method == 'r':
            print("Begin to test regression models... ")
            model1, name1, model2, name2 = feature_selection.optimalRegression(
                x_train, x_test, y_train, y_test)

            # plot learning curve of each selected regression model
            title = "Learning Curves (" + name1 + ")"
            evaluate_model.plot_learning_curve(model1,
                                               title,
                                               x_standard,
                                               y,
                                               cv=5)

            title = "Learning Curves (" + name2 + ")"
            evaluate_model.plot_learning_curve(model2,
                                               title,
                                               x_standard,
                                               y,
                                               cv=5)
        else:
            print("Your input doesn't match classification or regression.")

        classifiers_ready = (classifier1 is not None
                             and classifier2 is not None)

        # feature selection
        while True:
            method = input(
                "Please choose a feature selection method(RFE/univariance/lasso/optimal_classifier/MRMR): "
            )
            if method in ('RFE', 'optimal_classifier') and not classifiers_ready:
                # Without this guard these methods would hit an unbound
                # classifier variable and abort the whole session.
                print(
                    "The method you chose needs the classifiers from the classification step, which was not run. Please try again. "
                )
            elif method == 'RFE':
                print("Using " + name1)
                RFEPrompt(x_train, y_train, x_standard, y, classifier1, name1)
                print("Using " + name2)
                RFEPrompt(x_train, y_train, x_standard, y, classifier2, name2)
                break
            elif method == 'univariance':
                uniPrompt(x_train, y_train, x_standard, y)
                break
            elif method == 'lasso':
                print("Using lasso")
                modelPrompt(x_train, y_train, x_standard, y, LassoCV(cv=5),
                            'Lasso')
                break
            elif method == 'optimal_classifier':
                print("Using " + name1)
                modelPrompt(x_train, y_train, x_standard, y, classifier1,
                            name1)
                print("Using " + name2)
                modelPrompt(x_train, y_train, x_standard, y, classifier2,
                            name2)
                break
            elif method == 'MRMR':
                result = feature_selection.mrmr(x_standard, y)
                print(result)
                break
            else:
                print(
                    "The method you provide is not an option. Please try again. "
                )
    else:
        print("Feature Selection is skipped. ")

    print("End of Operation.")