def test_train_test_sizes(self):
    """Check that each fold's test and train parts add up to the full data set.

    Loads the "breastCancer" data set and, for every ``(test, train)`` pair
    produced by ``KFolds(breastCancer, 10)``, asserts that
    ``len(test) + len(train) == len(breastCancer)``.
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()

    for test, train in KFolds(breastCancer, 10):
        self.assertEqual(len(test) + len(train), len(breastCancer))
def test_train_test_independence(self):
    """Check that no row index appears in both the test and train part of a fold.

    Loads the "breastCancer" data set and, for every ``(test, train)`` pair
    produced by ``KFolds(breastCancer, 10)``, asserts that the two index sets
    have no element in common.
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()

    for test, train in KFolds(breastCancer, 10):
        # https://stackoverflow.com/questions/3170055/test-if-lists-share-any-items-in-python
        # isdisjoint() is True exactly when the intersection is empty.
        self.assertTrue(set(test.index).isdisjoint(train.index))
def test_q_calculation(self):
    """Build a NaiveBayes model on "breastCancer" and check the data menu.

    The classifier is only constructed; the ``calculateQ`` call is still
    disabled, so the single assertion is on ``getDataMenu()``.
    """
    retriever = DataRetriever("../Datasets/metadata.json")
    retriever.retrieveData("breastCancer")

    classifier = NaiveBayes(retriever.getDataSet(), retriever.getDataClass())
    # seperatedByClass = classifier.calculateQ()
    # print(seperatedByClass)

    available = retriever.getDataMenu()
    expected = ["breastCancer", "glass", "iris", "soybeanSmall", "vote"]
    self.assertEqual(available, expected, "should return list of data sets")
def test_proper_number_of_folds(self):
    """Check that ``KFolds(breastCancer, 10)`` yields exactly ten folds."""
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()

    # Each item is still unpacked as a (test, train) pair so a malformed
    # fold fails here rather than being silently counted.
    iterations = sum(1 for _test, _train in KFolds(breastCancer, 10))
    self.assertEqual(iterations, 10)
def test_calc_bins(self):
    """Compare ``BinDiscretizer`` with numpy's equal-width binning.

    Fits an 8-bin ``BinDiscretizer`` on the "clumpThickness" column and checks
    that (a) its ``bin_edges`` match ``np.histogram_bin_edges`` without the
    left-most edge and (b) the fitted values match ``np.digitize`` once the
    top bin has been folded back into range.
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    breastCancer = dataRetriever.retrieveData("breastCancer")

    bd = BinDiscretizer(breastCancer["clumpThickness"], bins=8)
    fitted_data = bd.train_fit()

    numpy_bins = np.histogram_bin_edges(breastCancer["clumpThickness"], bins=8)
    self.assertTrue(np.allclose(bd.bin_edges, numpy_bins[1:]),
                    "Should produce the same bins as np.histogram")

    # Numpy digitize has this weird shenanigan where values of array.max() are
    # considered the next highest bin:
    # https://stackoverflow.com/questions/4355132/numpy-digitize-returns-values-out-of-range
    # Fold those entries back so the max values belong to the right-most bin.
    numpy_digitize = np.digitize(breastCancer["clumpThickness"], numpy_bins)
    top_bin = numpy_digitize.max()
    numpy_digitize[numpy_digitize == top_bin] = top_bin - 1

    print("Shape", breastCancer.shape)
    self.assertTrue(np.allclose(numpy_digitize, fitted_data),
                    "Should produce the same results as np.digitize")
def test_data_retrieval(self):
    """Check that asking for an unknown data set name returns ``None``."""
    dataRetriever = DataRetriever("../Datasets/metadata.json")

    # This test is failing because the test itself isn't working
    # NOTE(review): presumably because comparing DataFrames with assertEqual
    # is element-wise; assertIsInstance would be the usual replacement once
    # the return type of retrieveData is confirmed.
    # self.assertEqual(dataRetriever.retrieveData("breastCancer"), pd.DataFrame() , "Should return a dataframe")

    # Identity check: passes only for None itself and never relies on the
    # returned object's __eq__.
    self.assertIsNone(dataRetriever.retrieveData("dogDiseases"),
                      "Should return null since no data exist")
def testKMeans(self):
    """Time a ``KMediods`` run on "computerHardware" and dump the result.

    Prints the elapsed time and the returned object, then writes it to
    ``kmedoids.csv``. No assertions are made.
    """
    retriever = DataRetriever("../Datasets/metadata.json")
    retriever.retrieveData("computerHardware")
    cluster_count = 15

    stopwatch = Timer()
    stopwatch.start()
    # The argument getters are evaluated inside the timed region, as before.
    medoid_frame = KMediods(
        retriever.getDataSet(),
        retriever.getDataClass(),
        retriever.getDescreteAttributes(),
        retriever.getContinuousAttributes(),
        retriever.getPredictionType(),
        cluster_count,
        100,
    )
    stopwatch.stop()

    print(f"Time: {stopwatch}")
    print(medoid_frame)
    medoid_frame.to_csv('kmedoids.csv', index=False)
def test_stratisfied(self):
    """Print the per-class row counts of every stratified fold.

    Iterates ``KFolds(breastCancer, 10, stratisfied=True)`` and prints how
    many rows with ``class == 2`` and ``class == 4`` land in the test and
    train parts of each fold.

    NOTE(review): this test contains no assertions, so it can only fail if
    the fold generation itself raises. The expected class ratio should be
    asserted once it is agreed on.
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()

    for test, train in KFolds(breastCancer, 10, stratisfied=True):
        print("TestLen 2", len(test[test['class'] == 2]),
              "TestLen 4", len(test[test['class'] == 4]))
        print("TrainLen 2", len(train[train['class'] == 2]),
              "TrainLen 4", len(train[train['class'] == 4]))
def test_test_set_coverage(self):
    """Check that the test parts of all folds together cover every row.

    A 1-based ``dummyIndex`` column is added to the data set; the values seen
    in the test part of each of the ten folds are collected and compared, as
    a set, with the full column.
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()

    tested_vals = []

    # Add a dummy index to the dataset so we can see which rows are selected each test
    breastCancer["dummyIndex"] = np.arange(len(breastCancer)) + 1

    for test, train in KFolds(breastCancer, 10):
        tested_vals.extend(test["dummyIndex"])

    # assertSetEqual reports the missing/extra elements on failure.
    self.assertSetEqual(set(tested_vals), set(breastCancer["dummyIndex"]))
def test_creation(self):
    """Check the constructor argument validation of ``BinDiscretizer``.

    * Calling it with no arguments must raise, and the message must mention
      'positional'.
    * Passing ``bins=None`` must raise ``TypeError``.
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    breastCancer = dataRetriever.retrieveData("breastCancer")

    with self.assertRaises(Exception) as context:
        BinDiscretizer()
    # context.exception is only available after the with block has exited.
    self.assertIn('positional', str(context.exception),
                  "Should Raise Error if no data is passed in")

    with self.assertRaises(TypeError):
        BinDiscretizer(breastCancer["clumpThickness"], bins=None)
def test_range_normalizer_bounds(self):
    """Check that range normalisation maps every column onto [0, 1].

    After ``RangeNormalizer.train_fit()`` the per-column maximum must be
    (numerically) 1 and the per-column minimum must be (numerically) 0.
    """
    # Initialization
    retriever = DataRetriever("../Datasets/metadata.json")
    cancer_frame = retriever.retrieveData("breastCancer")
    continuous_columns = [
        "clumpThickness",
        "uniformityOfCellSize",
        "uniformityOfCellShape",
        "marginalAdhesion",
        "singleEpithelialCellSize",
        "bareNuclei",
        "blandChromatin",
        "normalNucleoli",
        "mitoses",
        "class",
    ]

    normalizer = RangeNormalizer(cancer_frame[continuous_columns])
    scaled = normalizer.train_fit()

    # Check if the mins/maxes of all the fitted columns are 0/1, respectively
    maxima_are_one = np.allclose(np.ones(scaled.shape[1]), scaled.max())
    self.assertEqual(maxima_are_one, True)
    minima_are_zero = np.allclose(np.zeros(scaled.shape[1]), scaled.min())
    self.assertEqual(minima_are_zero, True)
def test_untrained(self):
    """Check that calling ``fit`` before training raises ``UntrainedUtilityError``.

    Applies to both ``RangeNormalizer`` and ``StandardNormalizer``.
    """
    # Initialization
    retriever = DataRetriever("../Datasets/metadata.json")
    cancer_frame = retriever.retrieveData("breastCancer")
    continuous_columns = [
        "clumpThickness",
        "uniformityOfCellSize",
        "uniformityOfCellShape",
        "marginalAdhesion",
        "singleEpithelialCellSize",
        "bareNuclei",
        "blandChromatin",
        "normalNucleoli",
        "mitoses",
        "class",
    ]

    # Both normalizers are constructed up front, range first, then standard.
    untrained_normalizers = (
        RangeNormalizer(cancer_frame[continuous_columns]),
        StandardNormalizer(cancer_frame[continuous_columns]),
    )

    for normalizer in untrained_normalizers:
        with self.assertRaises(UntrainedUtilityError):
            normalizer.fit(cancer_frame[continuous_columns])
def testOneHotEncoder(self):
    """Encode two 6-row samples of "breastCancer" and print the class column.

    The encoder is trained on one sample and applied to another; the class
    column of the encoded and the raw frames is printed for visual
    comparison. No assertions are made.
    """
    retriever = DataRetriever("../Datasets/metadata.json")
    loaded = retriever.retrieveData("breastCancer")
    frame = loaded.getDataSet()
    target_col = loaded.getDataClass()

    train_rows = frame.sample(n=6, random_state=69)
    test_rows = frame.sample(n=6, random_state=420)

    encoder = OneHotEncoder()
    encoded_train = encoder.train_fit(train_rows, loaded.getDescreteAttributes())
    encoding_map = encoder.encodedDict
    encoded_test = encoder.fit(test_rows)

    # print(encoded_train)
    # print(encoding_map)
    sections = (
        ("=============Train============", encoded_train, train_rows),
        ("=============Test=============", encoded_test, test_rows),
    )
    for banner, encoded, raw in sections:
        print(banner)
        print(encoded[target_col])
        print(raw[target_col])
# Work from the directory that contains ``abspath`` so the relative data path
# below is resolved from there.
dname = os.path.dirname(abspath)
os.chdir(dname)

# Start-up banner.
happiness = r""" ____ __ _ __ _____ ____ __ _ _ __ _ / __ \____ ______/ /_(_)____/ /__ / ___/ ______ __________ ___ / __ \____ / /_(_)___ ___ (_)___ ____ _/ /_(_)___ ____ / /_/ / __ `/ ___/ __/ / ___/ / _ \ \__ \ | /| / / __ `/ ___/ __ `__ \ / / / / __ \/ __/ / __ `__ \/ /_ / / __ `/ __/ / __ \/ __ \ / ____/ /_/ / / / /_/ / /__/ / __/ ___/ / |/ |/ / /_/ / / / / / / / / / /_/ / /_/ / /_/ / / / / / / / / /_/ /_/ / /_/ / /_/ / / / / /_/ \__,_/_/ \__/_/\___/_/\___/ /____/|__/|__/\__,_/_/ /_/ /_/ /_/ \____/ .___/\__/_/_/ /_/ /_/_/ /___/\__,_/\__/_/\____/_/ /_/ /_/ /_/ """
print(happiness)

# Load "abalone", discard rows with missing values and renumber the rows.
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("abalone")
dataset = dataRetriever.getDataSet().dropna().reset_index(drop=True)

# Hold out a reproducible 10% sample as the test set; the rest is for training.
# Both parts get a fresh 0..n-1 index.
test_set = dataset.sample(frac=0.1, random_state=69)
train_set = dataset.drop(test_set.index).reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)

# Run settings and the list that collects per-run metrics.
maxIter = 1
learning_rate = 1e-3
batch_size = 0.01
metrics = []
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.KNN.KMediods import KMediods
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer

""" This is the driver we used to create the medoids CSVs. We went with the archaic route and manually changed the data set for each generation. This was the method chosen since some data sets took longer to calculate There are a couple of lines that are data set specific """

# ---- Settings that are edited by hand for each data set ----
dataSetName = "computerHardware"
maxItter = 100
kValue = 78

# These are only used for image segmentation and abalone
# frac = .25
# random_state = 69
# kValue = m.floor(frac * kValue)

# ---- Load the data ----
data = DataRetriever("../Datasets/metadata.json")
print(f"Creating CSV for {dataSetName}")
data.retrieveData(dataSetName)

dataSetUnNormalized = data.getDataSet()

# Forest Fires only: log-transform the class column before normalising.
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)

# ---- Replace the continuous columns with the StandardNormalizer output ----
sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()
for i, x_row in enumerate(x_points.to_numpy()): for j, y_row in enumerate(y_points.to_numpy()): for k, col in enumerate(test_cols): x_val = "unseen" y_val = "unseen" if x_row[k] in self.probMatrix[col]: x_val = x_row[k] if y_row[k] in self.probMatrix[col]: y_val = y_row[k] self.distances[i, j] += self.probMatrix[col][x_val][y_val] if __name__ == "__main__": dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData("computerHardware") data = dataRetriever.getDataSet() data = data.dropna() data = data.reset_index(drop=True)[[ "venderName", "modelName", "myct", "mmin", "mmax", "cach", "chmin", "chmax", "prp", "erp" ]] test = data.sample(frac=0.2) train = data.drop(test.index) test = test.reset_index(drop=True) train = train.reset_index(drop=True) VDM = ValueDifferenceMetric( data,
/ /_/ // __// / / // __// /_ / // /__ / ___ / / // /_/ // /_/ // / / // /_ / / / // / / / / /(__ ) \____/ \___//_/ /_/ \___/ \__//_/ \___/ /_/ |_//_/ \__, / \____//_/ /_/ \__//_/ /_//_/ /_/ /_//____/ /____/ """ # ====================== Adjustable Variables ============================== current_data_set = "soybeanSmall" mutation_rate = .5 maxItter = 1000 batch_size = .6 population_size = 110 # =========================================================================== # ================ Data pre-processing ================================================= dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData(current_data_set) dataset = dataRetriever.getDataSet().dropna() discrete_attr = dataRetriever.getDescreteAttributes() cont_attributes = dataRetriever.getContinuousAttributes() # This line is used to normalize the data for Forest Fires if current_data_set == "forestFires": zeros = dataset[dataset[dataRetriever.getDataClass()] < 1].index print(len(zeros) / len(dataset)) dataset = dataset.drop(zeros) discrete_attr.remove('month') discrete_attr.remove('day') dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1 dataset["day"] = dataset['day'].apply(
from MLAlgorithms.KNN.KNearest import KNearestNeighbor, EditedKNN, CondensedKNN from MLAlgorithms.Utils.KFolds import KFolds from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer from MLAlgorithms.Utils.DataRetriever import DataRetriever from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer import numpy as np import json dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData("vote") data = dataRetriever.getDataSet() data = data.dropna() data = data.sample(frac=1.0, random_state=93) data = data.reset_index(drop=True) # data = data.drop('idNumber', axis=1) class_col = dataRetriever.getDataClass() # data[class_col] = np.log(data[class_col] + 0.001) contAttr = dataRetriever.getContinuousAttributes() discAttr = dataRetriever.getDescreteAttributes() predictionType = dataRetriever.getPredictionType() output_json = {} iter_num = 0 for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col): #KFolds doesn't have the capability of returning a validate set #K is set to desired k/2 and the validate set is half of the test set
def run_driver(current_data_set,
               mutation_rate=0.5,
               maxIter=1000,
               batch_size=0.6,
               population_size=110,
               network_architecture=None,
               pb_actor=None):
    """Run 10-fold cross validation of the genetic-algorithm driver on one data set.

    The data set is loaded and pre-processed, one ``multiprocess_func`` remote
    task is launched per fold, the per-fold metrics are gathered with
    ``ray.get`` and a JSON summary (parameters, metrics, mean, std) is written
    to ``../DataDump/GA_<data set>_layer<n>.json``.

    Args:
        current_data_set: Name of the data set; must be a key of the
            cost-function table below.
        mutation_rate: Forwarded to each fold task and recorded in the output.
        maxIter: Forwarded to each fold task and recorded in the output.
        batch_size: Forwarded to each fold task and recorded in the output.
        population_size: Forwarded to each fold task and recorded in the output.
        network_architecture: List describing the hidden layers; forwarded to
            each fold task. Defaults to ``[15]``.
        pb_actor: Forwarded to each fold task as ``pb_actor``
            (presumably a progress-bar actor — confirm against
            ``multiprocess_func``).

    Returns:
        None. Results are printed and written to disk.

    Raises:
        KeyError: If ``current_data_set`` has no entry in the cost-function
            table.
    """
    # Resolve the default here instead of using a shared mutable default.
    if network_architecture is None:
        network_architecture = [15]

    cost_func = {
        "breastCancer": "bin_cross",
        "glass": "log_cosh",
        "soybeanSmall": "log_cosh",
        "abalone": "log_cosh",
        "forestFires": "log_cosh",
        "computerHardware": "log_cosh"
    }

    title_text = r""" ______ __ _ ___ __ _ __ __ / ____/___ ____ ___ / /_ (_)_____ / / / /____ _ ____ _____ (_)/ /_ / /_ ____ ___ _____ / / __ / _ \ / __ \ / _ \ / __// // ___/ / /| / / // __ `// __ \ / ___// // __// __ \ / __ `__ \ / ___/ / /_/ // __// / / // __// /_ / // /__ / ___ / / // /_/ // /_/ // / / // /_ / / / // / / / / /(__ ) \____/ \___//_/ /_/ \___/ \__//_/ \___/ /_/ |_//_/ \__, / \____//_/ /_/ \__//_/ /_//_/ /_/ /_//____/ /____/ """

    output_json = {}

    # ====================== Adjustable Variables ==============================
    # current_data_set = "abalone"
    # mutation_rate = .5
    # maxIter = 10
    # batch_size = .6
    # population_size = 110
    # network_architecture = []
    # ===========================================================================

    output_json["parameters"] = {
        "mutation_rate": mutation_rate,
        "population_size": population_size,
        "network_architecture": network_architecture,
        "cost_func": cost_func[current_data_set],
        "maxIter": maxIter,
        "batch_size": batch_size
    }

    # ================ Data pre-processing =================================================
    dataRetriever = DataRetriever("../../Datasets/metadata.json")
    dataRetriever.retrieveData(current_data_set)
    dataset = dataRetriever.getDataSet().dropna()

    discrete_attr = dataRetriever.getDescreteAttributes()
    cont_attributes = dataRetriever.getContinuousAttributes()

    # Data-set specific feature engineering.
    if current_data_set == "forestFires":
        discrete_attr.remove('month')
        discrete_attr.remove('day')

        # Month / day abbreviations -> zero-based integers.
        dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1
        day_names = list(calendar.day_abbr)
        dataset["day"] = dataset['day'].apply(
            lambda x: day_names.index(x.capitalize()))

        # Sine/cosine pair per calendar feature. The *_cos columns must use
        # np.cos, otherwise they just duplicate the *_sin columns.
        # NOTE(review): the integers go into sin/cos unscaled (no 2*pi/period
        # factor) — confirm that is intended.
        dataset["month_sin"] = np.sin(dataset['month'])
        dataset["month_cos"] = np.cos(dataset['month'])
        dataset["day_sin"] = np.sin(dataset['day'])
        dataset["day_cos"] = np.cos(dataset['day'])

        dataset = dataset.drop('day', axis=1)
        dataset = dataset.drop('month', axis=1)
        cont_attributes.append('month_sin')
        cont_attributes.append('month_cos')
        cont_attributes.append('day_sin')
        cont_attributes.append('day_cos')

        # Log-transform the target; the small offset keeps log() finite at 0.
        dataset[dataRetriever.getDataClass()] = np.log(
            dataset[dataRetriever.getDataClass()] + 0.000001)
    elif current_data_set == "computerHardware":
        discrete_attr.remove('venderName')
        discrete_attr.remove('modelName')
        dataset = dataset.drop('venderName', axis=1)
        dataset = dataset.drop('modelName', axis=1)

    # dropna() may have left gaps in the index; renumber the rows.
    dataset = dataset.reset_index(drop=True)

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # ======================= Train Neural Network ================
    print(title_text)

    metrics = []
    for fold, (test_set, train_set) in enumerate(KFolds(dataset, 10), start=1):
        fitness_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_fitness.csv"
        output_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_output.csv"

        # Collect one future per fold; they are resolved together below.
        metrics.append(
            multiprocess_func.remote(test_set,
                                     train_set,
                                     fold,
                                     fitness_file,
                                     output_file,
                                     dataRetriever,
                                     cost_func[current_data_set],
                                     current_data_set,
                                     mutation_rate,
                                     maxIter,
                                     batch_size,
                                     population_size,
                                     network_architecture,
                                     pb_actor=pb_actor))

    metrics = ray.get(metrics)
    print(metrics)
    print("Average Performance: ", np.asarray(metrics).mean())

    output_json["Metrics"] = metrics
    output_json["Average"] = np.asarray(metrics, dtype=np.float64).mean()
    output_json["Std"] = np.asarray(metrics, dtype=np.float64).std()

    with open(
            f"../DataDump/GA_{current_data_set}_layer{len(network_architecture)}.json",
            'w') as f:
        json.dump(output_json, f, indent=4)
temp_train_with_unknown["unknown_col"] = self.unknown_col.values self.distance_matrix = DistanceMatrix(self.test_data, temp_train_with_unknown, self.contAttr, self.discAttr, len(self.contAttr), len(self.discAttr), self.predictionType, "unknown_col") self.neighbors = self.distance_matrix.distanceMatrix print(curr_iter) if __name__ == "__main__": dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData("imageSegmentation") data = dataRetriever.getDataSet() data = data.dropna() data = data.reset_index(drop=True) # data = data.drop('idNumber', axis=1) class_col = dataRetriever.getDataClass() # data[class_col] = np.log(data[class_col] + 0.001) contAttr = dataRetriever.getContinuousAttributes() discAttr = dataRetriever.getDescreteAttributes() test = data.sample(frac=0.2, random_state=17) train = data.drop(test.index) # sn = RangeNormalizer(train[contAttr]) # train[contAttr] = sn.train_fit()
def network_tuner(*nodes_per_hidden_layer):
    """
    Search for the best network architecture on the "glass" data set.

    The user should change the data set name below to the one they would like
    to operate with, and change the performance metric in accordance with the
    data set type, i.e. regression or classification (classification accuracy
    is currently active; the MSE variant is kept as comments).

    A network without hidden layers is trained first as the baseline. Then,
    for every prefix of ``nodes_per_hidden_layer``, every combination of node
    counts (``itertools.product``) is trained and scored on a fixed 10%
    hold-out sample. A candidate replaces the current best only when its
    accuracy is strictly higher.

    Args:
        *nodes_per_hidden_layer: One iterable of candidate node counts per
            hidden layer, e.g. ``network_tuner([2, 4], [3, 5])``.

    Returns:
        dict with keys ``'network'`` (the trained ``NeuralNetwork``),
        ``'acc'`` (its hold-out accuracy) and ``'arc'`` (``[0]`` for the
        baseline, otherwise the winning tuple of node counts).
    """
    bestNetwork = {}

    learning_rate = 0.0001
    maxItter = 500
    batch_size = .5

    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("glass")
    dataset = dataRetriever.getDataSet().dropna()
    dataset = dataset.reset_index(drop=True)

    # This line is used to normalize the data for Forest Fires
    # dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)

    # Z-score the continuous attributes (computed over the whole data set).
    dataset[dataRetriever.getContinuousAttributes()] = (
        dataset[dataRetriever.getContinuousAttributes()]
        - dataset[dataRetriever.getContinuousAttributes()].mean()
    ) / dataset[dataRetriever.getContinuousAttributes()].std()

    # Fixed, reproducible 10% hold-out.
    test_set = dataset.sample(frac=0.1, random_state=69)
    train_set = dataset.drop(test_set.index)
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)

    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # Encode with the filtered list so the class column is never one-hot encoded.
    datasetEncoded = ohe.train_fit(train_set, discrete_attr)
    testEncoded = ohe.fit(test_set)

    def _train_and_score(nn):
        """Train ``nn`` for ``maxItter`` rounds and return its hold-out accuracy."""
        for _ in range(maxItter):
            # We don't call an initial feedforward because backpropagate starts with a feedforward call
            # batch_size represents the number of data points per batch
            nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)

        final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
        # NOTE(review): the result of this call was never used; the call is
        # kept in case it updates state inside the network — confirm and remove.
        nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
        actual = testEncoded[dataRetriever.getDataClass()]

        ## ===================== Classification =================
        correct = sum(1 for i, row in enumerate(final) if row == actual.iloc[i])
        return correct / len(test_set)

        # Regression variant:
        # final = final.reshape(final.shape[0])
        # MSE = ((actual-final)**2).mean()

    # ---- Baseline: no hidden layers ----
    nn = NeuralNetwork(datasetEncoded, 0, [], dataRetriever.getPredictionType(),
                       dataRetriever.getDataClass())
    # Record the real baseline accuracy; leaving it at 0 would let any
    # candidate with non-zero accuracy replace the baseline.
    bestNetwork['network'] = nn
    bestNetwork['acc'] = _train_and_score(nn)
    bestNetwork['arc'] = [0]

    # ---- Candidates: every node-count combination for 1..N hidden layers ----
    numOfLayer = len(nodes_per_hidden_layer)
    print("Number of Hidden Layers: ", numOfLayer)

    for layer in range(numOfLayer):
        print(f"Layer Number: {layer + 1}")

        for combo in itertools.product(*nodes_per_hidden_layer[:layer+1]):
            print("Node Combination: ",list(combo))
            print(combo)

            # NOTE(review): ``layer`` is zero-based while ``combo`` has
            # ``layer + 1`` entries — confirm which value NeuralNetwork expects
            # as its second argument.
            nn = NeuralNetwork(datasetEncoded, layer, list(combo),
                               dataRetriever.getPredictionType(),
                               dataRetriever.getDataClass())
            acc = _train_and_score(nn)

            if acc > bestNetwork['acc']:
                bestNetwork['network'] = nn
                bestNetwork['acc'] = acc
                bestNetwork['arc'] = combo

            # Regression variant:
            # if MSE < bestNetwork['acc']:
            #     bestNetwork['network'] = nn
            #     bestNetwork['acc'] = MSE
            #     bestNetwork['arc'] = combo

    return bestNetwork#, MSEs
import numpy as np from scipy.stats import norm # Used for P score import matplotlib.pyplot as plt from tqdm import tqdm from MLAlgorithms.NeuralNetwork.NeuralNetwork import NeuralNetwork from MLAlgorithms.Utils.KFolds import KFolds from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer from MLAlgorithms.Utils.DataRetriever import DataRetriever from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer from MLAlgorithms.Utils.OneHotEncoder import OneHotEncoder dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData("breastCancer") dataset = dataRetriever.getDataSet().dropna() dataset = dataset.reset_index(drop=True) # This line is used to normalize the data for Forest Fires # dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1) maxIter = 1 learning_rate = 1e-3 batch_size = 0.01 metrics = [] fold = 0 # Ten-Fold Cross Validation for test_set, train_set in KFolds(dataset, 10): fold += 1
from MLAlgorithms.KNN.KNearest import KNearestNeighbor, EditedKNN, CondensedKNN
from MLAlgorithms.Utils.KFolds import KFolds
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer
import numpy as np
import pandas as pd
import json
import glob
import json

# Load the "glass" data set, drop rows with missing values, shuffle
# reproducibly (frac=1.0 keeps every row) and renumber the rows.
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("glass")
data = dataRetriever.getDataSet()
data = data.dropna()
data = data.sample(frac=1.0, random_state=93)
data = data.reset_index(drop=True)
# data = data.drop('region-pixel-count', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

contAttr = dataRetriever.getContinuousAttributes()
# contAttr.remove('region-pixel-count')
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

# Read the previously stored results. The with-block closes the file as soon
# as it has been parsed; the name ``f`` stays bound afterwards.
with open("glassPerf.json",'r') as f:
    output_json = json.load(f)
f += 1 print("The Percent of Correct Predictions is {t}%".format( t=round((t * 100 / len(answers)), 1))) print("The Percent of Incorrect Predictions is {f}%\n".format( f=round((f * 100 / len(answers)), 1))) dataRetriever = DataRetriever("../Datasets/metadata.json") ################################################ Un-Shuffled Data ################################################ # This first for loop performs the NaiveBayes algorithm for un-shuffled data jsonResults1 = {} for dataSet in dataRetriever.getDataMenu(): dataRetriever.retrieveData(dataSet) dataClass = dataRetriever.getDataClass() retrievedData = dataRetriever.getDataSet() numOfClassValues = len( retrievedData[dataRetriever.getDataClass()].unique()) method = "macro" foldNum = 1 jsonResults1[dataSet] = {} print(f"PRINTING RESULTS FOR THE CONTROL DATASET {dataSet}") for train, test in KFolds(retrievedData, 10): trainBin = BinDiscretizer( train[dataRetriever.getContinuousAttributes()], multi=True)