def vizualizeData(X_svm, y_svm, y_kmeans, y_dbscan, y_agglomerative):
    """Plot three-component embeddings of the data under several labelings.

    The full dataset is shown twice for every labeling: once using the
    features returned by ``processData(..., reduceDim=3)`` (called ``PCA`` in
    the plot titles) and once using a three-component t-SNE embedding.  The
    SVM inputs are embedded separately with PCA and t-SNE.

    Args:
        X_svm: Feature matrix paired with ``y_svm``.
        y_svm: Labels plotted on the SVM embeddings.
        y_kmeans: Labels plotted on the full-dataset embeddings.
        y_dbscan: Labels plotted on the full-dataset embeddings.
        y_agglomerative: Labels plotted on the full-dataset embeddings.
    """
    use_interactive = isInteractive()

    # Full dataset already reduced to three components by processData.
    pca_points, _, _ = processData(encodeLabels=True, testSet=0, reduceDim=3)

    # Full dataset again without reduction; these labels are the ones plotted
    # as the reference labeling.
    features, true_labels, _ = processData(encodeLabels=True, testSet=0)
    tsne_points = TSNE(n_components=3).fit_transform(features)

    # Separate embeddings for the samples handed in by the caller.
    svm_pca_points = PCA(n_components=3).fit_transform(X_svm)
    svm_tsne_points = TSNE(n_components=3).fit_transform(X_svm)

    # Both back ends take (points, labels, title), so pick one up front.
    draw = plotInteractive if use_interactive else plotStationary

    figures = (
        (pca_points, true_labels, 'PCA'),
        (tsne_points, true_labels, 'TSNE'),
        (svm_pca_points, y_svm, 'Support Vector Machines PCA'),
        (svm_tsne_points, y_svm, 'Support Vector Machines TSNE'),
        (pca_points, y_kmeans, 'K-means Clustering PCA'),
        (tsne_points, y_kmeans, 'K-means Clustering TSNE'),
        (pca_points, y_dbscan, 'DBSCAN Clustering PCA'),
        (tsne_points, y_dbscan, 'DBSCAN Clustering TSNE'),
        (pca_points, y_agglomerative, 'Agglomerative Clustering PCA'),
        (tsne_points, y_agglomerative, 'Agglomerative Clustering TSNE'),
    )
    for points, labels, title in figures:
        draw(points, labels, title)
def analyseData():
    """Plot the most frequent words in the dataset for each author.

    Loads the dataset via ``processData`` with ``vectorizerName='None'`` and
    unencoded labels, gathers the books of each author, and passes them to
    ``get_top_n_words(..., n=10)``; the result is handed to
    ``plotDistribution`` together with the author.

    NOTE(review): the books of one author are assumed to be stored
    contiguously, 20 per author, in the same order as ``classes`` (this
    matches the commented-out shape assertion below) -- confirm against
    ``processData``.
    """
    X, y, classes = processData(encodeLabels=False, vectorizerName='None', testSet=0)
    # assert X.shape[0] == 400 and len(y) == 400 and len(classes) == 20

    booksPerAuthor = 20
    for i, author in enumerate(classes):
        # Author i owns the slice [i * 20, (i + 1) * 20).  The previous
        # formula, (i + 1) * (j + 1) - 1, only produced a contiguous run for
        # i == 0 and mixed other authors' books in for every later author.
        firstIdx = i * booksPerAuthor
        authorBooks = [X[firstIdx + j] for j in range(booksPerAuthor)]
        freq = get_top_n_words(authorBooks, n=10)
        plotDistribution(freq, author)
from linearRegression import linearRegression
from kNNRegression import kNNRegression
from evaluateClassification import evaluateClassification
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from analyseData import analyseData
from plotDecisionBoundry import plotDecisionBoundry
from evaluateRegression import evaluateRegression
from kmeans import kmeans
from dbscan import dbscan
from agglomerative import agglomerative
from evaluateClustering import evaluateClustering
from vizualizeData import vizualizeData

# Load the pre-processed data using processData's default arguments.
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData()

# Logistic regression: predict the test-set classes and score them.
yClass_lr = logisticRegression(X_train, X_test, yClass_train)
lrAc = accuracy_score(yClass_test, yClass_lr)
# Keep a reference to these test features; X_test is rebound by the reload
# directly below.
X_lr = X_test

# Reload the data, this time pre-processed with normalization='mms'.
# Every split variable above is overwritten by this call.
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData(
    normalization='mms')

# Random forest on the re-processed split.
yClass_rf = randomForest(X_train, X_test, yClass_train)
rfAc = accuracy_score(yClass_test, yClass_rf)
X_rf = X_test

# k-nearest neighbours on the same re-processed split.
yClass_knn = kNN(X_train, X_test, yClass_train)
knnAc = accuracy_score(yClass_test, yClass_knn)
def vizualizeData(X_lr, y_lr, X_rf, y_rf, X_knn, y_knn, X_linReg, y_linReg,
                  X_knnReg, y_knnReg, y_kmeans, y_dbscan, y_agglomerative):
    """Plot three-component PCA and t-SNE embeddings for every model.

    The full dataset (from ``processData(trainTestSplit=2)``) is embedded
    once and plotted with its classification targets, its regression targets
    and each clustering labeling.  Every supervised model's feature matrix is
    embedded on its own and plotted with that model's labels.

    Args:
        X_lr, y_lr: Features and labels plotted as "Logistic Regression".
        X_rf, y_rf: Features and labels plotted as "Random Forests".
        X_knn, y_knn: Features and labels plotted as "K-nearest neighbors".
        X_linReg, y_linReg: Features and values plotted as "Linear Regression".
        X_knnReg, y_knnReg: Features and values plotted as "KNN Regression".
        y_kmeans: Labels plotted on the full-dataset embeddings.
        y_dbscan: Labels plotted on the full-dataset embeddings.
        y_agglomerative: Labels plotted on the full-dataset embeddings.
    """
    use_interactive = isInteractive()

    features, class_targets, reg_targets = processData(trainTestSplit=2)
    # The targets expose ``.values``; flatten them to 1-D before plotting.
    class_targets = class_targets.values.ravel()
    reg_targets = reg_targets.values.ravel()

    def embed(data):
        # PCA is fitted before t-SNE, the same order as the call sequence
        # this helper replaces.
        pca_points = PCA(n_components=3).fit_transform(data)
        tsne_points = TSNE(n_components=3).fit_transform(data)
        return pca_points, tsne_points

    full_pca, full_tsne = embed(features)
    lr_pca, lr_tsne = embed(X_lr)
    rf_pca, rf_tsne = embed(X_rf)
    knn_pca, knn_tsne = embed(X_knn)
    lin_reg_pca, lin_reg_tsne = embed(X_linReg)
    knn_reg_pca, knn_reg_tsne = embed(X_knnReg)

    # Both back ends take (points, labels, title), so pick one up front.
    draw = plotInteractive if use_interactive else plotStationary

    figures = (
        (full_pca, class_targets, 'PCA Classification'),
        (full_tsne, class_targets, 'TSNE Classification'),
        (full_pca, reg_targets, 'PCA Regression'),
        (full_tsne, reg_targets, 'TSNE Regression'),
        (lr_pca, y_lr, 'Logistic Regression PCA'),
        (lr_tsne, y_lr, 'Logistic Regression TSNE'),
        (rf_pca, y_rf, 'Random Forests PCA'),
        (rf_tsne, y_rf, 'Random Forests TSNE'),
        (knn_pca, y_knn, 'K-nearest neighbors PCA'),
        (knn_tsne, y_knn, 'K-nearest neighbors TSNE'),
        (lin_reg_pca, y_linReg, 'Linear Regression PCA'),
        (lin_reg_tsne, y_linReg, 'Linear Regression TSNE'),
        (knn_reg_pca, y_knnReg, 'KNN Regression PCA'),
        (knn_reg_tsne, y_knnReg, 'KNN Regression TSNE'),
        (full_pca, y_kmeans, 'K-means Clustering PCA'),
        (full_tsne, y_kmeans, 'K-means Clustering TSNE'),
        (full_pca, y_dbscan, 'DBSCAN Clustering PCA'),
        (full_tsne, y_dbscan, 'DBSCAN Clustering TSNE'),
        (full_pca, y_agglomerative, 'Agglomerative Clustering PCA'),
        (full_tsne, y_agglomerative, 'Agglomerative Clustering TSNE'),
    )
    for points, labels, title in figures:
        draw(points, labels, title)
from dataPreprocessing import processData
from svm import svm
from kmeans import kmeans
from dbscan import dbscan
from agglomerative import agglomerative
from evaluateClassification import evaluateClassification
from evaluateClustering import evaluateClustering
from vizualizeData import vizualizeData
from analyseData import analyseData

# SVM classification run. It is currently disabled by being wrapped in a
# string literal, so X_test and y_svm are never defined.
"""
# loading pre-processed data
X_train, X_test, y_train, y_test, classes = processData()

y_svm = svm(X_train, X_test, y_train)
evaluateClassification(y_test, y_svm, 'Support Vector Machines', classes)
"""

# Load the dataset with testSet=0 and reduceDim=2000 for the clustering runs.
X, y, classes = processData(testSet=0, reduceDim=2000)

# Run each clustering algorithm on the same X and evaluate it against y.
y_kmeans = kmeans(X, init='random')
evaluateClustering(X, y, y_kmeans, 'K-means Clustering', classes)

y_dbscan = dbscan(X)
evaluateClustering(X, y, y_dbscan, 'DBSCAN Clustering', classes)

y_agglomerative = agglomerative(X)
evaluateClustering(X, y, y_agglomerative, 'Agglomerative Clustering', classes)

# Disabled: this call needs X_test and y_svm, which only exist inside the
# disabled SVM section above.
#vizualizeData(X_test, y_svm, y_kmeans, y_dbscan, y_agglomerative)

# Plot the word-frequency analysis.
analyseData()