def run_on_soybean(file):
    """
    This function runs logistic regression and naive bayes classifier on the
    soybean data set. Classes D1/D2 are mapped to 0 and D3/D4 to 1 to form a
    binary classification problem.

    :param file: input file
    """
    print("_______________________________")
    print("Reading in Soybean data set...")
    # Read in soybean data
    df_soybean = pd.read_csv(file, header=None)

    # Generate boolean classifiers (0 = D1 or D2, 1 = D3 or D4)
    # (the original comment referenced Iris classes — copy/paste leftover)
    df_soybean = df_soybean.rename(columns={35: 'Class'})
    df_soybean.loc[df_soybean["Class"] == "D1", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D2", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D3", "Class"] = 1
    df_soybean.loc[df_soybean["Class"] == "D4", "Class"] = 1

    # One hot encode the soybean data set for naive bayes: encode every
    # column except the class column (index 35)
    columns_to_encode = df_soybean.columns.values.tolist()
    del columns_to_encode[35]
    df_soybean_encoded = one_hot_encoder(df_soybean, columns_to_encode)

    # NOTE(review): naive bayes is pointed at column index 35 of the encoded
    # frame — assumes one_hot_encoder leaves 'Class' at that position; the
    # glass driver reorders 'Class' to the end instead. TODO confirm.
    lg_averages = run_logistic_regression(df_soybean, 35)
    nb_averages = run_naive_bayes(df_soybean_encoded, 35)
    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Logistic Regression Averages = {lg_averages}%")
    print(f"Naive Bayes Averages = {nb_averages}%")
def run_on_glass(file, num_hidden):
    """
    This function runs a backpropagation neural network on the glass data
    set. (The original docstring claimed logistic regression / naive bayes,
    but the body only calls run_backpropagation.)

    :param file: input file
    :param num_hidden: number of hidden nodes for the backpropagation network
    """
    print("_______________________________")
    print("Reading in Glass data set...")
    # Read in glass data
    df_glass = pd.read_csv(file, header=None)
    df_glass.columns = [
        "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"
    ]
    # This data set has no missing values, so we will skip that step

    # Drop Id — it carries no predictive information
    df_glass = df_glass.drop('Id', axis=1)

    # Encode the class, then keep only the Class_1 indicator as a binary
    # target (renamed back to 'Class'); drop the other class indicators
    df_glass = one_hot_encoder(df_glass, ['Class'])
    df_glass = df_glass.rename(columns={"Class_1": "Class"})
    df_glass = df_glass.drop(
        columns=['Class_2', 'Class_3', 'Class_5', 'Class_6', 'Class_7'])

    averages = run_backpropagation(df_glass, 9, int(num_hidden))
    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Averages over all 5 experiments = {averages}%")
def run_on_glass(file):
    """
    This function runs logistic regression and naive bayes classifier on the
    glass data set

    :param file: input file
    """
    print("_______________________________")
    print("Reading in Glass data set...")
    # Read in glass data
    df_glass = pd.read_csv(file, header=None)
    df_glass.columns = [
        "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"
    ]
    # This data set has no missing values, so we will skip that step

    # Drop Id — it carries no predictive information
    df_glass = df_glass.drop('Id', axis=1)

    # Encode the class, then keep only the Class_1 indicator as a binary
    # target (renamed back to 'Class'); drop the other class indicators
    df_glass = one_hot_encoder(df_glass, ['Class'])
    df_glass = df_glass.rename(columns={"Class_1": "Class"})
    df_glass = df_glass.drop(
        columns=['Class_2', 'Class_3', 'Class_5', 'Class_6', 'Class_7'])
    # (removed leftover debug print of df_glass.head())

    # One hot encode the feature columns for naive bayes (every column
    # except the class column at index 9), then move 'Class' to the end
    columns_to_encode = df_glass.columns.values.tolist()
    del columns_to_encode[9]
    df_glass_encoded = one_hot_encoder(df_glass, columns_to_encode)
    df_glass_encoded = df_glass_encoded[
        [c for c in df_glass_encoded if c not in ['Class']] + ['Class']]

    lg_averages = run_logistic_regression(df_glass, 9)
    nb_averages = run_naive_bayes(df_glass_encoded,
                                  len(df_glass_encoded.columns) - 1)
    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Logistic Regression Averages = {lg_averages}%")
    print(f"Naive Bayes Averages = {nb_averages}%")
def run_on_breast(file):
    """
    This function runs logistic regression and naive bayes classifier on the
    breast data set, it encodes the classes to Benign = 0, Malignant = 1, and
    removes missing values

    :param file: input file
    """
    print("_______________________________")
    print("Reading in Breast data set...")
    df_breast = pd.read_csv(file, header=None)
    df_breast.columns = [
        "Sample Id", "Clump Thickness", "Uniformity of Cell Size",
        "Uniformity of Cell Shape", "Marginal Adhesion",
        "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
        "Normal Nucleoli", "Mitoses", "Class"
    ]

    # Find missing values and remove them, since there are so few
    # The documentation notes that there are 16 missing values in group 1
    # and 6 denoted by '?'
    # I found 16 values in Group 6
    # Since there are so few missing values I dropped those rows
    df_breast = df_breast[df_breast["Bare Nuclei"] != '?']

    # Drop Sample Id — it carries no predictive information
    df_breast = df_breast.drop('Sample Id', axis=1)

    # Generate boolean classifiers (0 = Benign, 1 = Malignant)
    df_breast.loc[df_breast["Class"] == 2, "Class"] = 0
    df_breast.loc[df_breast["Class"] == 4, "Class"] = 1

    # One hot encode the breast data set for naive bayes (every column
    # except the class column at index 9)
    columns_to_encode = df_breast.columns.values.tolist()
    del columns_to_encode[9]
    df_breast_encoded = one_hot_encoder(df_breast, columns_to_encode)

    # NOTE(review): naive bayes is pointed at column index 9 of the encoded
    # frame — assumes one_hot_encoder leaves 'Class' at that position; the
    # glass driver reorders 'Class' to the end instead. TODO confirm.
    lg_averages = run_logistic_regression(df_breast, 9)
    nb_averages = run_naive_bayes(df_breast_encoded, 9)
    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Logistic Regression Averages = {lg_averages}%")
    print(f"Naive Bayes Averages = {nb_averages}%")
def run_on_iris(file):
    """
    Reduce the iris feature set with stepwise forward selection (wrapping
    Naive Bayes), then cluster the data with k-means and evaluate the
    clustering as a classifier by running Naive Bayes on the cluster labels.

    :param file: The file name that includes the data
    :return: Nothing
    """
    print("_______________________________")
    print("Reading in Iris data set...")
    # Load the raw iris data and attach the documented column names
    iris_df = pd.read_csv(file, header=None)
    iris_df.columns = [
        "sepal length in cm", "sepal width in cm", "petal length in cm",
        "petal width in cm", "Class"
    ]
    # This data set has no missing values, so we will skip that step

    # One hot code the classes in order to use my previous naive bayes
    # algorithm
    encoded = one_hot_encoder(iris_df, ["Class"])

    # Split into test and training sets and renumber the rows of each
    x_test, x_train = split_test_train(encoded)
    x_test = x_test.reset_index(drop=True)
    x_train = x_train.reset_index(drop=True)

    # Run stepwise forward selection to reduce the feature set
    print(
        "Run Stepwise forward selection to reduce the feature set on Iris...")
    print("All features...")
    feature_names = encoded.columns.values.tolist()[0:4]
    print(feature_names)

    # Wrap Naive Bayes (learn/test) in SFS against the virginica indicator
    selector = StepwiseForwardSelection(feature_names,
                                        x_train.iloc[:, 0:4],
                                        x_test.iloc[:, 0:4],
                                        x_train["Class_Iris-virginica"],
                                        x_test["Class_Iris-virginica"],
                                        nb.learn, nb.test)
    best_features = selector.run()

    # Cluster on the reduced feature set and classify from the clusters
    cluster_and_classify(best_features, x_test, x_train)
def run_on_glass(file):
    """
    Reduce the glass feature set with stepwise forward selection (wrapping
    Naive Bayes), then cluster the data with k-means and evaluate the
    clustering as a classifier by running Naive Bayes on the cluster labels.

    :param file: The file name that includes the data
    :return: Nothing
    """
    print("_______________________________")
    print("Reading in Glass data set...")
    # Load the raw glass data and attach the documented column names
    raw_glass = pd.read_csv(file, header=None)
    raw_glass.columns = [
        "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"
    ]
    # This data set has no missing values, so we will skip that step

    # Drop Id — it carries no predictive information
    glass_df = raw_glass.drop('Id', axis=1)

    # One hot code the classes in order to use my previous naive bayes
    # algorithm
    encoded = one_hot_encoder(glass_df, ["Class"])

    # Split into test and training sets and renumber the rows of each
    x_test, x_train = split_test_train(encoded)
    x_test = x_test.reset_index(drop=True)
    x_train = x_train.reset_index(drop=True)

    # Run stepwise forward selection to reduce the feature set
    print("Running SFS on Glass data set...")
    feature_names = glass_df.columns.values.tolist()[0:9]
    print("All features...")
    print(feature_names)

    # Wrap Naive Bayes (learn/test) in SFS against the Class_1 indicator
    selector = StepwiseForwardSelection(feature_names,
                                        x_train.iloc[:, 0:9],
                                        x_test.iloc[:, 0:9],
                                        x_train["Class_1"],
                                        x_test["Class_1"],
                                        nb.learn, nb.test)
    best_features = selector.run()

    # Cluster on the reduced feature set and classify from the clusters
    cluster_and_classify(best_features, x_test, x_train)
def run_on_computer(file, k):
    """
    This function runs k-nearest neighbors on the Computer Hardware data set

    :param file: input file
    :param k: number of neighbors to use for k-nearest neighbors
    """
    print("_______________________________")
    print("Reading in Computer Hardware data set...")
    df_computer = pd.read_csv(file, header=None)
    print(df_computer.head())
    # NOTE(review): columns are positional; index 8 is treated as the target
    # ('Class') with one trailing column '9' kept — confirm against the data
    # set layout.
    df_computer.columns = [
        "0", "1", "2", "3", "4", "5", "6", "7", "Class", "9"
    ]

    # One hot encode categorical values (vendor and model columns)
    df_computer = one_hot_encoder(df_computer, ["0", "1"])
    print(df_computer.head())

    # Regression variant of kNN (classification=False)
    run_k_nearest_neighbor_experiments(df_computer,
                                       k,
                                       False,
                                       classification=False)
def run_on_soybean(file, num_hidden):
    """
    This function runs a backpropagation neural network on the soybean data
    set. Classes D1/D2 are mapped to 0 and D3/D4 to 1 to form a binary
    classification problem.

    :param file: input file
    :param num_hidden: number of hidden nodes for the backpropagation network
    """
    print("_______________________________")
    print("Reading in Soybean data set...")
    # Read in soybean data
    df_soybean = pd.read_csv(file, header=None)

    # Generate boolean classifiers (0 = D1 or D2, 1 = D3 or D4)
    df_soybean = df_soybean.rename(columns={35: 'Class'})
    df_soybean.loc[df_soybean["Class"] == "D1", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D2", "Class"] = 0
    df_soybean.loc[df_soybean["Class"] == "D3", "Class"] = 1
    df_soybean.loc[df_soybean["Class"] == "D4", "Class"] = 1

    # (removed an unused one-hot encoding step left over from the naive
    # bayes driver — backpropagation runs on the raw frame)
    averages = run_backpropagation(df_soybean, 35, int(num_hidden))
    print("----------------------------")
    print("Averages over 5 experiments")
    print("----------------------------")
    print(f"Averages over all 5 experiments = {averages}%")