def run_4d_model():
    """Fit and evaluate an LDA classifier on four principal components.

    Loads the data through ``DataFeeder``, projects the features onto four
    components with ``run_pca``, trains on a 70/30 split, prints the
    accuracy, reports the F1 score via ``calculate_f1_score`` and shows a
    normalized confusion matrix.
    """
    print('\nLinear Discriminant Analysis - 4 dimensions\n')

    # Load the features and the target.
    feeder = DataFeeder()
    features, labels = feeder.get_data()

    # Project the features onto four components, named pc_1 .. pc_4.
    component_count = 4
    component_names = ['pc_%d' % idx for idx in range(1, component_count + 1)]
    features = run_pca(features,
                       n_components=component_count,
                       columns=component_names)

    # Hold out 30% of the samples for testing (fixed seed for repeatability).
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.3, random_state=1)

    # Train the classifier, then predict the held-out samples.
    lda = LinearDiscriminantAnalysis()
    lda.fit(train_x, train_y)
    predicted = lda.predict(test_x)

    # Report accuracy as a percentage, followed by the F1 score.
    print('# Accuracy score: %.2f' % (accuracy_score(test_y, predicted) * 100))
    calculate_f1_score(test_y, predicted)

    # Draw the confusion matrix and display the figure.
    plot_confusion_matrix(test_y,
                          predicted,
                          normalize=True,
                          title='Confusion Matrix')
    plt.show()
def main():
    """Run the penalized and unpenalized evaluation pipelines.

    Loads the data through ``DataFeeder``, reduces it to 10 principal
    components, searches for the best ``C`` with an F1 scorer
    (``pos_label=0``), then runs a train/test split and a cross validation,
    each once with ``penalty='none'`` and once with the helpers' default
    penalty. Every run is preceded by a printed header, and all figures are
    shown at the end.
    """
    separator = '#################################################'

    # Create the data feeder and fetch the target. The features returned
    # here are replaced by the PCA projection just below.
    dt = DataFeeder()
    features, target = dt.get_data()

    # Reduce dimensionality with PCA (10 components).
    features = dt.pca(10)

    # Search for the best hyperparameters, scoring by F1 with label 0 as the
    # positive class.
    scorer = make_scorer(f1_score, pos_label=0)
    params = find_parameters(features, target, scorer=scorer)
    best_c = params['C']

    # Train/test split without penalty.
    print(separator)
    print('Train test split without penalty')
    run_train_test_split(features, target, C=best_c,
                         penalty='none', solver='saga')

    # Train/test split with the helper's default penalty.
    # NOTE(review): presumably the default is L2, as the headers and plot
    # titles imply; confirm against run_train_test_split.
    print(separator)
    print('Train test split with L2 penalty')
    run_train_test_split(features, target, C=best_c)

    # Cross validation without penalty. The header now matches the
    # penalty='none' arguments and the plot title of this run; the two
    # cross-validation headers were previously swapped.
    print(separator)
    print('Cross Validation without penalty')
    run_cross_validation(features, target, C=best_c,
                         penalty='none', solver='saga',
                         title='Cross validation with no penalty')

    # Cross validation with the helper's default penalty (titled L2).
    print(separator)
    print('Cross Validation with L2 penalty')
    run_cross_validation(features, target, C=best_c,
                         title='Cross validation with l2 penalty')

    # Display every figure produced by the runs above.
    plt.show()
def main():
    """Train a Gaussian Naive Bayes classifier on one principal component.

    Loads the data through ``DataFeeder``, keeps a single PCA component,
    performs a stratified 70/30 split and prints the accuracy, F1, recall
    and precision scores as percentages.
    """
    # Load the data; only the target is kept, because the features are
    # replaced by the single-component PCA projection.
    feeder = DataFeeder()
    _, target = feeder.get_data()
    features = feeder.pca(n_components=1)

    # Stratified split: 70% for training, 30% for testing.
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        test_size=0.3,
        stratify=target,
        random_state=100,
    )

    # Fit on the training part, then predict the held-out part.
    fitted = GaussianNB().fit(train_x, train_y)
    predicted = fitted.predict(test_x)

    # Print each metric as a percentage, in a fixed order.
    reports = (
        ("Accuracy Score: %.2f", accuracy_score),
        ("F1 score: %.2f", f1_score),
        ("Recall score: %.2f", recall_score),
        ("Precision score: %.2f", precision_score),
    )
    for template, metric in reports:
        print(template % (metric(test_y, predicted) * 100))
def run_2d_model():
    """Fit and evaluate an LDA classifier and plot its decision regions.

    Loads the data through ``DataFeeder``, reduces it with ``run_pca`` using
    that helper's defaults, trains on a 70/30 split, prints the accuracy,
    reports the F1 score via ``calculate_f1_score``, then draws the decision
    regions over all samples and a normalized confusion matrix.
    """
    print(
        '\nLinear Discriminant Analysis - 2 dimensions with decision regions\n'
    )

    # Load the features and the target.
    feeder = DataFeeder()
    features, labels = feeder.get_data()

    # Reduce dimensionality with run_pca's default settings.
    # NOTE(review): presumably two components, per the banner above; confirm
    # against run_pca.
    features = run_pca(features)

    # Hold out 30% of the samples for testing (fixed seed for repeatability).
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.3, random_state=1)

    # Train the classifier, then predict the held-out samples.
    lda = LinearDiscriminantAnalysis()
    lda.fit(train_x, train_y)
    predicted = lda.predict(test_x)

    # Report accuracy as a percentage, followed by the F1 score.
    print('# Accuracy score: %.2f' % (accuracy_score(test_y, predicted) * 100))
    calculate_f1_score(test_y, predicted)

    # Stack train and test back together so the region plot shows every
    # sample.
    all_x = np.vstack((train_x, test_x))
    all_y = np.hstack((train_y, test_y))

    # Decision regions go on a figure of their own.
    plt.figure()
    plot_decision_regions(all_x, all_y, lda)

    # Draw the confusion matrix and display all figures.
    plot_confusion_matrix(test_y,
                          predicted,
                          normalize=True,
                          title='Confusion Matrix')
    plt.show()
def main():
    """Tune and evaluate a nearest-neighbours model on 10 PCA components.

    Plots the diagnosis histogram, reduces the features to 10 principal
    components, looks up the best ``n_neighbors`` and runs both the
    train/test-split and the cross-validation evaluations with it before
    showing all figures.
    """
    # Load the pre-processed data; the features are replaced by the PCA
    # projection further down, so only the target is kept here.
    feeder = DataFeeder()
    _, target = feeder.get_data()

    # Histogram of the target values.
    hist_options = {
        'xlabel': 'Diagnosis',
        'ylabel': 'Patient Records',
        'title': 'Patient Diagnosis Distribution',
        'xlim': ['M', 'B'],
    }
    plot_hist(target, **hist_options)

    # Reduce dimensionality with PCA (10 components).
    features = feeder.pca(n_components=10)

    # Pick the best neighbour count and report it.
    best = find_best_params(features, target)
    n_neighbors = best['n_neighbors']
    print("Best number of neighbors: %d" % n_neighbors)

    # Evaluate with a train/test split, then with cross validation.
    std_test_train_split(features, target, n_neighbors=n_neighbors)
    cross_validation(features, target, n_neighbors=n_neighbors)

    # Display every figure produced above.
    plt.show()
def main():
    """Tune ``max_depth`` and evaluate a model on two PCA components.

    Loads the data through ``DataFeeder``, reduces it to two principal
    components, searches for the best ``max_depth`` with an F1 scorer
    (``pos_label=0``), then runs the train/test-split and cross-validation
    evaluations with that depth and shows all figures.
    """
    # Load the data; the features are replaced by the PCA projection, so
    # only the target is kept from this call.
    feeder = DataFeeder()
    _, target = feeder.get_data()

    # Reduce dimensionality with PCA (2 components).
    features = feeder.pca(n_components=2)

    # Hyperparameter search, scoring by F1 with label 0 as the positive class.
    f1_scorer = make_scorer(f1_score, pos_label=0)
    best = find_best_params(features, target, scorer=f1_scorer)

    # Stratified split with a fixed seed.
    train_x, test_x, train_y, test_y = train_test_split(
        features, target, stratify=target, random_state=1)

    # The search result is cast to int before being used as the depth.
    depth = int(best['max_depth'])

    # Evaluate on the single split, then with cross validation.
    std_train_test_split(train_x, test_x, train_y, test_y, max_depth=depth)
    cross_validation(features, target, max_depth=depth)

    # Display every figure produced above.
    plt.show()
def main():
    """Plot the raw data, then tune and evaluate linear and RBF SVM models.

    Steps, in order: plot three distributions from the non-normalized data,
    reload the data with default settings, reduce it to 10 principal
    components, split it, search for the best linear and RBF parameters with
    an F1 scorer (``pos_label=0``), run K-fold cross validation and the two
    single-split evaluations, then show all figures.
    """
    # Data feeding and evaluation objects.
    feeder = DataFeeder()
    evaluator = Evaluator()

    # Distributions are plotted from the non-normalized data.
    raw_features, raw_target = feeder.get_data(normalize=False)
    Plotter.plot_distribution(raw_target,
                              ["M", "B"],
                              bins=2,
                              title="Diagnosis Distribution",
                              xlabel="Diagnosis",
                              ylabel="Records")
    feature_plots = (
        (1, "Texture Mean Distribution", "Texture Mean"),
        (2, "Perimeter Mean Distribution", "Perimeter Mean"),
    )
    for column, plot_title, axis_label in feature_plots:
        Plotter.plot_distribution(raw_features.iloc[:, column],
                                  bins=50,
                                  title=plot_title,
                                  xlabel=axis_label,
                                  ylabel="Records")

    # Reload with default settings; the features come from PCA (10
    # components), so only the target is kept from this call.
    _, target = feeder.get_data()
    features = feeder.pca(n_components=10)

    # Stratified train/test split.
    train_x, test_x, train_y, test_y = Evaluator.split(
        features, target, stratify=target)

    # Parameter search on the training part, scored by F1 with label 0 as
    # the positive class.
    f1_scorer = make_scorer(f1_score, pos_label=0)
    linear_params, rbf_params = Evaluator.find_best_params(
        train_x, train_y, n_folds=10, scoring=f1_scorer)

    # K-fold cross validation over the full data set.
    evaluator.k_fold_cv(features,
                        target,
                        n_splits=10,
                        linear_params=linear_params,
                        rbf_params=rbf_params)

    # Single-split evaluation of the linear and RBF models.
    evaluator.run_linear_svm(train_x, test_x, train_y, test_y,
                             params=linear_params)
    evaluator.run_rbf_svm(train_x, test_x, train_y, test_y,
                          params=rbf_params)

    # Display every figure produced above.
    plt.show()