from sklearn.ensemble import RandomForestClassifier
import metrics
import matplotlib.pyplot as plt

# Sweep the number of trees in a random forest and plot the mean
# repeated-cross-validation accuracy of every configuration that clears 0.9.
data, target = metrics.preprocess(k=13, fsiter=1000)

C_range = range(64, 258, 8)
# Estimator counts and accuracies are recorded together so the two plotted
# series always have the same length, even when some scores are filtered out.
plotted_estimators = []
accuracy_scores = []
for c in C_range:
    rf = RandomForestClassifier(n_estimators=c)
    scores = metrics.repeatedCrossValidatedScores(data, target, rf, cv=10, iterations=50)
    temp = scores['test_accuracy'].mean()
    if temp > 0.9:
        plotted_estimators.append(c)
        accuracy_scores.append(temp)
    # NOTE(review): the original indentation was lost; the averages are assumed
    # to be printed for every configuration, not only those above 0.9 - confirm.
    metrics.printAverages(c, scores)

plt.plot(plotted_estimators, accuracy_scores)
plt.title('Random Forest Optimization', size=11, fontweight='bold')
plt.xlabel('Number of Estimators', size=8)
plt.ylabel('Accuracy', size=8)
plt.show()
from sklearn.neighbors import KNeighborsClassifier

try:
    # scikit-learn deprecated its vendored joblib in 0.21 and removed it in
    # 0.23; the standalone package is the supported import.
    import joblib
except ImportError:
    # Fall back to the vendored copy on old installations without standalone joblib.
    from sklearn.externals import joblib

import metrics

# Evaluate a 1-nearest-neighbour classifier with repeated cross-validation,
# then fit it on the full preprocessed data set and persist it to disk.
data, target = metrics.preprocess(k=8, fsiter=1000)

clf = KNeighborsClassifier(n_neighbors=1)
scores = metrics.repeatedCrossValidatedScores(data, target, clf, iterations=1000, cv=10)
metrics.printAverages('clf', scores)

# Train on all available samples before saving the final model.
clf.fit(data, target)
joblib.dump(clf, 'classifier.pkl', compress=9)
import arff
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import metrics

# Evaluate logistic regression (C=1000) with repeated 10-fold cross-validation.
# Two independent evaluation runs are made: the first prints the averaged
# metrics via the project helper, the second prints an accuracy summary line.
data, target = metrics.preprocess(k=10, fsiter=1000)

print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")

# First run: averaged metrics, labelled with the value of C.
first_run_scores = metrics.repeatedCrossValidatedScores(
    data, target, LogisticRegression(C=1000), cv=10, iterations=50)
metrics.printAverages(1000, first_run_scores)

# Second run: a fresh LogisticRegression instance, evaluated the same way.
model = LogisticRegression(C=1000)
results = metrics.repeatedCrossValidatedScores(
    data, target, model, cv=10, iterations=50)

# Report the mean accuracy in percent, with twice the standard deviation
# (also in percent) as the spread.
test_accuracy = results['test_accuracy']
mean_percent = test_accuracy.mean() * 100
spread_percent = test_accuracy.std() * 200
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_percent, spread_percent))
import metrics
import warnings

# These two names are used below but were not imported in the visible script.
import numpy as np
from sklearn.neural_network import MLPClassifier

# Suppress every warning for the remainder of the run.
warnings.filterwarnings("ignore")

data, target = metrics.preprocess(k=8, fsiter=1000, scaling=False)

# default values, overwritten as soon as a configuration scores above 0
ideal = [0]
maxi = 0

# Try single-hidden-layer networks with 1..99 neurons and remember the size
# with the highest mean F1 score across the repeated cross-validation runs.
print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")
for x in range(1, 100):
    temp = metrics.repeatedCrossValidatedScores(
        data,
        target,
        MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=x,
            random_state=1,
        ),
        iterations=20,
        cv=10)
    metrics.printAverages(x, temp)
    mean_f1 = np.average(temp['test_f1'])
    if mean_f1 > maxi:
        maxi = mean_f1
        ideal = x

# The selection criterion is the mean F1 score (a fraction, not a percentage),
# so report it as such rather than as "% accuracy".
print(str(ideal) + " gives the highest average F1 score: " + str(maxi))
# Oversample with SMOTE before the search.
# NOTE(review): `fit_sample` is the legacy imbalanced-learn name (newer
# releases call it `fit_resample`) - confirm against the installed version.
data, target = SMOTE().fit_sample(data, target)

# default values to be overwritten
ideal = [0, 0, 0]
maxi = 0
# Layer configurations whose mean F1 exceeds 0.8. Kept as a plain list of
# tuples: np.append would flatten each triple into separate float entries.
best = []

# Check strictly decreasing three-layer configurations (x > y > z) and track
# the one with the highest mean F1 score.
for x in range(3, 50):
    for y in range(2, x):
        for z in range(1, y):
            layers = (x, y, z)
            temp = metrics.aveaccuracy(
                data,
                target,
                MLPClassifier(solver='lbfgs', alpha=1e-5,
                              hidden_layer_sizes=layers, random_state=1),
                iterations=10)
            metrics.printAverages(layers, temp)
            mean_f1 = np.average(temp['test_f1'])
            if mean_f1 > maxi:
                maxi = mean_f1
                ideal = layers
            # NOTE(review): original indentation was lost; this threshold check
            # is assumed to be independent of the best-so-far check - confirm.
            if mean_f1 > 0.8:
                best.append(layers)

# The selection criterion is the mean F1 score (a fraction, not a percentage).
print(str(ideal) + " gives the highest average F1 score: " + str(maxi))
for i in best:
    print(i)
import numpy as np
from sklearn import tree
import metrics
import matplotlib.pyplot as plt

# Evaluate a decision tree on 1..24 selected features, plot mean accuracy
# against the feature count and report the best-scoring count.
maxi = 0   # best mean accuracy seen so far (fraction)
ideal = 0  # feature count that produced `maxi`
num_features = []
accs = []
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    temp = metrics.repeatedCrossValidatedScores(data, target,
                                                tree.DecisionTreeClassifier(),
                                                iterations=100, cv=10)
    metrics.printAverages(features, temp)
    mean_accuracy = np.average(temp['test_accuracy'])
    num_features.append(features)
    accs.append(mean_accuracy)
    # Track the best configuration; without this the summary below would
    # always report the initial placeholder values.
    if mean_accuracy > maxi:
        maxi = mean_accuracy
        ideal = features

# Accuracy is stored as a fraction, so scale it to match the "%" in the message.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Decision Trees", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 6, 8, 12, 16, 20, 24])
plt.show()
# Fill missing values with the mean of the known values in the same column.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit_transform learns the column means and inserts them in one step, so a
# separate imp.fit(data) call beforehand is unnecessary.
data = imp.fit_transform(data)

# Oversample with SMOTE before the evaluation.
data, target = SMOTE().fit_sample(data, target)

# alpha_range = [float(i) / 100000 for i in range(1, 1001)]
alpha_range = [0.0005, 0.001, 0.005, 0.01]
alpha_accuracy = []
alpha_sensitivity = []
alpha_specificity = []
for x in alpha_range:
    temp = metrics.repeatedCrossValidatedScores(
        data,
        target,
        MLPClassifier(solver='lbfgs', alpha=x, hidden_layer_sizes=43,
                      random_state=1),
        iterations=1000,
        cv=10)
    metrics.printAverages('%.5f' % x, temp)
    alpha_accuracy.append(np.average(temp['test_accuracy']))
    # alpha_sensitivity.append(np.average(temp['test_sensitivity']))
    # alpha_specificity.append(np.average(temp['test_specificity']))

# Plot at evenly spaced positions (the alpha values are not evenly spaced)
# and label each position with the alpha it stands for. The positions are
# derived from alpha_range so the plot keeps working if the list changes.
positions = range(1, len(alpha_range) + 1)
plt.plot(positions, alpha_accuracy)
plt.xticks(positions, [str(alpha) for alpha in alpha_range])
plt.xlabel('Value of Alpha')
plt.ylabel('Cross-Validation Accuracy')
# Call grid(); assigning `plt.grid = True` would replace the function and
# draw no grid at all.
plt.grid(True)
plt.show()
# Evaluate k-nearest-neighbours on 1..24 selected features, plot mean accuracy
# against the feature count and report the best (features, neighbours) pair.
maxi = 0        # best mean accuracy seen so far (fraction)
ideal = (0, 0)  # (features, neighbours) that produced `maxi`
num_features = []
accs = []
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    for neighbours in [1]:
        temp = metrics.repeatedCrossValidatedScores(
            data, target, KNeighborsClassifier(n_neighbors=neighbours),
            iterations=100, cv=10)
        metrics.printAverages((features, neighbours), temp)
        mean_accuracy = np.average(temp['test_accuracy'])
        num_features.append(features)
        accs.append(mean_accuracy)
        # Track the best configuration; without this the summary below would
        # always report the initial placeholder values.
        if mean_accuracy > maxi:
            maxi = mean_accuracy
            ideal = (features, neighbours)

# Accuracy is stored as a fraction, so scale it to match the "%" in the message.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for KNN", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
plt.show()
'Neural Network': metrics.preprocess(k=8, fsiter=1000), 'Nearest Neighbours': metrics.preprocess(k=8, fsiter=1000) } accuracies = [] sensitivities = [] specificities = [] for key in classifiers.keys(): print(key) data, target = datasets[key] temp = metrics.repeatedCrossValidatedScores(data, target, classifiers[key], iterations=100, cv=10) metrics.printAverages(key, temp) accuracies.append(np.average(temp['test_accuracy']) - 0.9) sensitivities.append(np.average(temp['test_sensitivity']) - 0.9) specificities.append(np.average(temp['test_specificity']) - 0.9) plt.figure() ax = plt.subplot() plt.xticks([2, 6, 10, 14, 18, 22], classifiers.keys(), size=5.5) plt.yticks([0.9, 0.925, 0.95, 0.975, 1.0], ["90%", "92.5%", "95%", "97.5%", "100%"]) plt.title('Relative Success with optimal features') plt.xlabel('Algorithm', size=8) sens = ax.bar([1, 5, 9, 13, 17, 21], sensitivities, width=0.8, color='red',
# For every feature count from 1 to 24, evaluate single-hidden-layer networks
# with 40..59 neurons and keep the highest mean accuracy found; then print and
# plot that best accuracy against the feature count.
feature_counts = []
best_accuracies = []
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    maxacc = 0.0
    for neuron in range(40, 60):
        network = MLPClassifier(solver='lbfgs', alpha=0.001,
                                hidden_layer_sizes=neuron, random_state=1)
        temp = metrics.repeatedCrossValidatedScores(
            data, target, network, iterations=100, cv=10)
        metrics.printAverages((features, neuron), temp)
        mean_accuracy = np.average(temp['test_accuracy'])
        # Only a strictly better score replaces the running maximum.
        if mean_accuracy > maxacc:
            maxacc = mean_accuracy
    best_accuracies.append(maxacc)
    feature_counts.append(features)

# One (feature count, best accuracy) tuple per line.
for pair in zip(feature_counts, best_accuracies):
    print(pair)

(acc,) = plt.plot(feature_counts, best_accuracies, label='Accuracy')
plt.title("Feature Selection for Neural Networks", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
plt.show()