示例#1
0
def run_on_soybean(file):
    """
    This function runs logistic regression and naive bayes classifier on the soybean data set,
    collapsing the four classes into a binary target (D1/D2 = 0, D3/D4 = 1)
    :param file: input file
    """
    print("_______________________________")
    print("Reading in Soybean data set...")
    # Read in soybean data
    df_soybean = pd.read_csv(file, header=None)

    # Generate boolean classifiers (0 = D1/D2, 1 = D3/D4)
    df_soybean = df_soybean.rename(columns={35: 'Class'})
    df_soybean.loc[df_soybean["Class"] == "D1", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D2", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D3", "Class"] = 1
    df_soybean.loc[df_soybean["Class"] == "D4", "Class"] = 1

    # One hot encode the soybean data set for naive bayes
    columns_to_encode = df_soybean.columns.values.tolist()
    del columns_to_encode[35]  # leave the class column un-encoded
    df_soybean_encoded = one_hot_encoder(df_soybean, columns_to_encode)

    lg_averages = run_logistic_regression(df_soybean, 35)
    nb_averages = run_naive_bayes(df_soybean_encoded, 35)

    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Logistic Regression Averages = {lg_averages}%")
    print(f"Naive Bayes Averages = {nb_averages}%")
示例#2
0
def run_on_glass(file, num_hidden):
    """
    This function runs a backpropagation neural network on the glass data set,
    after reducing the problem to a binary target (Class_1 vs. the rest)
    :param file: input file
    :param num_hidden: number of hidden nodes for the network (coerced to int)
    """
    print("_______________________________")
    print("Reading in Glass data set...")
    # Read in glass data
    df_glass = pd.read_csv(file, header=None)
    df_glass.columns = [
        "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"
    ]

    # This data set has no missing values, so we will skip that step

    # Drop Id
    df_glass = df_glass.drop('Id', axis=1)

    # Encode the class: keep the Class_1 indicator as the binary target
    df_glass = one_hot_encoder(df_glass, ['Class'])
    df_glass = df_glass.rename(columns={"Class_1": "Class"})
    df_glass = df_glass.drop(
        columns=['Class_2', 'Class_3', 'Class_5', 'Class_6', 'Class_7'])

    averages = run_backpropagation(df_glass, 9, int(num_hidden))

    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Averages over all 5 experiments = {averages}%")
示例#3
0
def run_on_glass(file):
    """
    This function runs logistic regression and naive bayes classifier on the glass data set
    :param file: input file
    """
    print("_______________________________")
    print("Reading in Glass data set...")
    # Load the glass data and attach the documented column names
    glass = pd.read_csv(file, header=None)
    glass.columns = [
        "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"
    ]

    # No missing values in this data set, so no cleaning step is needed

    # The Id column carries no signal, so drop it
    glass = glass.drop('Id', axis=1)

    # Encode the class: keep only the Class_1 indicator as the binary target
    glass = one_hot_encoder(glass, ['Class'])
    glass = glass.rename(columns={"Class_1": "Class"})
    glass = glass.drop(
        columns=['Class_2', 'Class_3', 'Class_5', 'Class_6', 'Class_7'])
    print(glass.head())

    # One hot encode the features for naive bayes, then move Class to the end
    feature_columns = glass.columns.values.tolist()
    del feature_columns[9]
    glass_encoded = one_hot_encoder(glass, feature_columns)
    reordered = [c for c in glass_encoded if c != 'Class'] + ['Class']
    glass_encoded = glass_encoded[reordered]

    lg_averages = run_logistic_regression(glass, 9)
    nb_averages = run_naive_bayes(glass_encoded,
                                  len(glass_encoded.columns) - 1)

    print("----------------------------")
    print(f"Averages over 5 experiments")
    print("----------------------------")
    print(f"Logistic Regression Averages = {lg_averages}%")
    print(f"Naive Bayes Averages = {nb_averages}%")
示例#4
0
def run_on_breast(file):
    """
    This function runs logistic regression and naive bayes classifier on the breast data set, it encodes
    the classes to Benign = 0, Malignant = 1, and removes missing values
    :param file: input file
    """
    print("_______________________________")
    print("Reading in Breast data set...")
    df_breast = pd.read_csv(file, header=None)
    df_breast.columns = [
        "Sample Id", "Clump Thickness", "Uniformity of Cell Size",
        "Uniformity of Cell Shape", "Marginal Adhesion",
        "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
        "Normal Nucleoli", "Mitoses", "Class"
    ]

    # Find missing values and remove them, since there are so few
    # The documentation notes that there are 16 missing values denoted by '?'
    # I found 16 values in the Bare Nuclei column
    # Since there are so few missing values I dropped those rows
    df_breast = df_breast[df_breast["Bare Nuclei"] != '?']

    # Drop Sample Id
    df_breast = df_breast.drop('Sample Id', axis=1)

    # Generate boolean classifiers (0 = Benign, 1 = Malignant)
    df_breast.loc[df_breast["Class"] == 2, "Class"] = 0
    df_breast.loc[df_breast["Class"] == 4, "Class"] = 1

    # One hot encode breast data set for naive bayes
    columns_to_encode = df_breast.columns.values.tolist()
    del columns_to_encode[9]  # leave the class column un-encoded
    df_breast_encoded = one_hot_encoder(df_breast, columns_to_encode)

    lg_averages = run_logistic_regression(df_breast, 9)
    nb_averages = run_naive_bayes(df_breast_encoded, 9)

    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Logistic Regression Averages = {lg_averages}%")
    print(f"Naive Bayes Averages = {nb_averages}%")
示例#5
0
def run_on_iris(file):
    """
    Runs the SFS Algorithm (wrapping Naive Bayes) to reduce the Iris feature
    set, then runs K-means clustering on the data. To evaluate K-means as a
    classifier, Naive Bayes is run again on the cluster labels it produces.
    :param file: The file name that includes the data
    :return: Nothing
    """
    print("_______________________________")
    print("Reading in Iris data set...")
    # Load the iris data and attach descriptive column names
    df_iris = pd.read_csv(file, header=None)
    df_iris.columns = [
        "sepal length in cm", "sepal width in cm", "petal length in cm",
        "petal width in cm", "Class"
    ]

    # No missing values in this data set, so no cleaning step is needed

    # One hot code the classes so the existing naive bayes algorithm applies
    df_iris_encoded = one_hot_encoder(df_iris, ["Class"])

    # Split into test and training sets and renumber their rows from zero
    test_set, train_set = split_test_train(df_iris_encoded)
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)

    # Stepwise forward selection over the four measurement columns
    print(
        "Run Stepwise forward selection to reduce the feature set on Iris...")
    print("All features...")
    features = df_iris_encoded.columns.values.tolist()[0:4]
    print(features)
    sfs = StepwiseForwardSelection(features, train_set.iloc[:, 0:4],
                                   test_set.iloc[:, 0:4],
                                   train_set["Class_Iris-virginica"],
                                   test_set["Class_Iris-virginica"], nb.learn,
                                   nb.test)
    optimized_feature_set = sfs.run()

    cluster_and_classify(optimized_feature_set, test_set, train_set)
示例#6
0
def run_on_glass(file):
    """
    Runs the SFS Algorithm (wrapping Naive Bayes) to reduce the Glass feature
    set, then runs K-means clustering on the data. To evaluate K-means as a
    classifier, Naive Bayes is run again on the cluster labels it produces.
    :param file: The file name that includes the data
    :return: Nothing
    """
    print("_______________________________")
    print("Reading in Glass data set...")
    # Load the glass data and attach the documented column names
    glass = pd.read_csv(file, header=None)
    glass.columns = [
        "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"
    ]

    # No missing values in this data set, so no cleaning step is needed

    # The Id column carries no signal, so drop it
    glass_features = glass.drop('Id', axis=1)

    # One hot code the classes so the existing naive bayes algorithm applies
    glass_encoded = one_hot_encoder(glass_features, ["Class"])

    # Split into test and training sets and renumber their rows from zero
    test_set, train_set = split_test_train(glass_encoded)
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)

    # Stepwise forward selection over the nine measurement columns
    print("Running SFS on Glass data set...")
    features = glass_features.columns.values.tolist()[0:9]
    print("All features...")
    print(features)
    sfs = StepwiseForwardSelection(features, train_set.iloc[:, 0:9],
                                   test_set.iloc[:, 0:9], train_set["Class_1"],
                                   test_set["Class_1"], nb.learn, nb.test)
    optimized_feature_set = sfs.run()

    cluster_and_classify(optimized_feature_set, test_set, train_set)
def run_on_computer(file, k):
    """
    This function runs k-nearest neighbors on the Computer Hardware data set
    (regression, not classification)
    :param file: input file
    :param k: number of neighbors to use
    """
    print("_______________________________")
    print("Reading in Computer Hardware data set...")
    df_computer = pd.read_csv(file, header=None)
    print(df_computer.head())

    df_computer.columns = [
        "0", "1", "2", "3", "4", "5", "6", "7", "Class", "9"
    ]

    # One hot encode categorical values
    df_computer = one_hot_encoder(df_computer, ["0", "1"])
    print(df_computer.head())

    # classification=False: the target here is numeric, so KNN regresses
    run_k_nearest_neighbor_experiments(df_computer,
                                       k,
                                       False,
                                       classification=False)
示例#8
0
def run_on_soybean(file, num_hidden):
    """
    This function runs a backpropagation neural network on the soybean data set,
    collapsing the four classes into a binary target (D1/D2 = 0, D3/D4 = 1)
    :param file: input file
    :param num_hidden: number of hidden nodes for the network (coerced to int)
    """
    print("_______________________________")
    print("Reading in Soybean data set...")
    # Read in soybean data
    df_soybean = pd.read_csv(file, header=None)

    # Generate boolean classifiers (0 = D1/D2, 1 = D3/D4)
    df_soybean = df_soybean.rename(columns={35: 'Class'})
    df_soybean.loc[df_soybean["Class"] == "D1", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D2", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D3", "Class"] = 1
    df_soybean.loc[df_soybean["Class"] == "D4", "Class"] = 1

    # One hot encode the soybean data set for naive bayes
    columns_to_encode = df_soybean.columns.values.tolist()
    del columns_to_encode[35]  # leave the class column un-encoded
    df_soybean_encoded = one_hot_encoder(df_soybean, columns_to_encode)

    averages = run_backpropagation(df_soybean, 35, int(num_hidden))

    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Averages over all 5 experiments = {averages}%")