import numpy as np

from mysklearn.myclassifiers import MyRandomForestClassifier
from mysklearn.mypytable import MyPyTable  # assumed module path for MyPyTable
import mysklearn.myutils as myutils


def test_random_forest_fit():
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [["Senior", "Java", "no", "no", "False"],
                       ["Senior", "Java", "no", "yes", "False"],
                       ["Mid", "Python", "no", "no", "True"],
                       ["Junior", "Python", "no", "no", "True"],
                       ["Junior", "R", "yes", "no", "True"],
                       ["Junior", "R", "yes", "yes", "False"],
                       ["Mid", "R", "yes", "yes", "True"],
                       ["Senior", "Python", "no", "no", "False"],
                       ["Senior", "R", "yes", "no", "True"],
                       ["Junior", "Python", "yes", "no", "True"],
                       ["Senior", "Python", "yes", "yes", "True"],
                       ["Mid", "Python", "no", "yes", "True"],
                       ["Mid", "Java", "yes", "no", "True"],
                       ["Junior", "Python", "no", "yes", "False"]]
    myutils.prepend_attribute_label(interview_table, interview_header)

    # Split the table into features and the class label.
    interview_pytable = MyPyTable(column_names=interview_header,
                                  data=interview_table)
    y_col = interview_pytable.get_column("interviewed_well", False)
    x_cols = interview_pytable.drop_col("interviewed_well")

    # Fit the forest on a bootstrapped sample, holding out a 1/3 test split.
    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(x_cols, y_col)
    X_train, X_test, y_train, y_test = myutils.train_test_split(
        X_sample, y_sample, 0.33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    # Tally correct vs. incorrect predictions on the test fold.
    numCorrectPredictions = 0
    numWrongPredictions = 0
    for i in range(len(y_test)):
        values = [y_predicted[i], y_test[i]]  # predicted/actual
        if values[0] == values[1]:
            numCorrectPredictions += 1
        else:
            numWrongPredictions += 1

    accuracy = np.round(
        numCorrectPredictions / (numCorrectPredictions + numWrongPredictions),
        3)
    error_rate = np.round(
        numWrongPredictions / (numCorrectPredictions + numWrongPredictions),
        3)

    print("-----------------------------------------------------------")
    print("Accuracy and Error Rate")
    print("-----------------------------------------------------------")
    print()
    print("Random Forest: accuracy = {}, error rate = {}".format(
        accuracy, error_rate))
    print()
    print("Because of the random aspect of this classifier, "
          "this will not always pass the tests")
    print()
    print("Predicted table: " + str(y_predicted))
    print("Testing set: " + str(y_test))

    for i in range(len(y_test)):
        assert y_predicted[i] == y_test[i]
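# The assertions above compare a stochastic ensemble's predictions instance by
# instance, so (as the printout warns) the test will not always pass. A sketch
# of one way to make it repeatable: seed numpy's global RNG before running the
# test. This assumes the myutils sampling/splitting helpers draw their
# randomness from np.random, which this file does not confirm.
def test_random_forest_fit_seeded():
    np.random.seed(0)  # fix the RNG so the bootstrap and split are repeatable
    test_random_forest_fit()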
import os

from mysklearn.myclassifiers import MyDecisionTreeClassifier, MyNaiveBayesClassifier
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
from mysklearn.mypytable import MyPyTable  # assumed module path for MyPyTable

# "pickle" an object (AKA object serialization):
#   save a Python object to a binary file
# "unpickle" an object (AKA object de-serialization):
#   load a Python object from a binary file (back into memory)

# Get data from csv file
table = MyPyTable().load_from_file(
    os.path.join("input_files", "winequality-red.csv"))
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Use Naive Bayes to classify
testcase = MyNaiveBayesClassifier()

# Returns X INDEXES for each fold
X_train, X_test = myevaluation.stratified_kfold_cross_validation(
    x_cols, y_col, n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(
    X_train, X_test, x_cols, y_col)

# Normalize the first fold, then fit the classifier on it.
for i, fold in enumerate(X_train):
    train, test = myutils.normalize_values(X_train[i], X_test[i])
    testcase.fit(train, y_train[i])
    break
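# The comments at the top of this script describe pickling, but the snippet
# stops before anything is serialized. Below is a minimal sketch of the round
# trip using the stdlib pickle module; the filename "trained_classifier.p" and
# the choice to package (header, classifier) together are illustrative
# assumptions, not part of the original script.
import pickle

packaged_obj = (table.column_names, testcase)  # header + fitted classifier
with open("trained_classifier.p", "wb") as outfile:
    pickle.dump(packaged_obj, outfile)  # serialize to a binary file

with open("trained_classifier.p", "rb") as infile:
    header, classifier = pickle.load(infile)  # de-serialize back into memory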