def run_rp_2(X, dataset):
    """Sweep SparseRandomProjection over k = 2..min(n_features, 120) and plot
    mean +/- std reconstruction error across random restarts.

    Args:
        X: 2-D feature matrix, shape (n_samples, n_features).
        dataset: name embedded in the plot title and the output filename.

    Side effects:
        Saves the figure to 'plots/rp_distances_<dataset>.png'.
    """
    model = SparseRandomProjection()
    result_df = pd.DataFrame()
    iterations = 10  # random restarts per k: the projection is stochastic
    # Cap the sweep so very wide datasets stay tractable.
    k_max = min(X.shape[1], 120)
    for k in range(2, k_max + 1):
        LOGGER.info('rp: k={}'.format(k))
        # n_components is fixed for all restarts at this k, so set it once
        # (the original reset it on every inner iteration).
        model.set_params(n_components=k)
        vals = []
        for _ in range(iterations):
            model.fit_transform(X)
            # reconstruction_error is a module-level helper defined elsewhere
            # in this file; it scores the fitted model against X.
            vals.append(reconstruction_error(model, X))
        result_df.loc[k, 'mean'] = np.mean(vals)
        result_df.loc[k, 'std'] = np.std(vals)
    # The DataFrame is indexed by k, so the index is the x-axis directly —
    # no need to reset_index() for each plotting call.
    ks = result_df.index
    plt.clf()
    plt.title('RP_Reconstruction_Error_' + dataset)
    plt.xlabel('K')
    plt.ylabel('error')
    plt.grid()
    plt.plot(ks, result_df['mean'], label='mean', color='blue')
    plt.fill_between(ks, result_df['mean'],
                     result_df['mean'] + result_df['std'],
                     color='green', alpha=0.3)
    plt.fill_between(ks, result_df['mean'],
                     result_df['mean'] - result_df['std'],
                     color='green', alpha=0.3)
    plt.savefig('plots/' + 'rp_distances_' + dataset + '.png')
class assignment4:
    """Unsupervised-learning experiment driver: loads a dataset, then runs
    clustering (k-means, EM) and dimensionality reduction (PCA, ICA, RP,
    TSVD) experiments, printing metrics and saving plots as PNG files.

    Relies on module-level imports defined elsewhere in this file (csv, np,
    pd, plt, sklearn estimators, `sm` as sklearn.metrics, train_test_split).
    """

    def __init__(self):
        # data processing
        self.dataSetPath = './data_set/'   # directory all read_* methods prepend
        self.dataSetName = ""              # used in plot titles / filenames
        self.csv_delimiter = ','
        self.data = None                   # raw rows as read from CSV
        self.allFeatures = []              # becomes np.float32 ndarray after loading
        self.allTarget = []
        # not used
        self.XTrain = None
        self.XTest = None
        self.YTrain = None
        self.YTest = None
        # k-mean clustering
        self.kNum = range(1, 21)           # cluster counts swept in k_mean_cluster / EM
        self.kmean = None
        self.kmeanRD = None                # k-means refit on reduced data
        # expectation maximization
        self.em = None
        self.emRD = None                   # EM refit on reduced data
        # PCA
        self.pca = None
        self.pcaDims = range(1, 21)
        # ICA
        self.icaDims = range(1, 21)
        self.ica = None
        # RP
        self.rp = None
        self.rpDims = range(1, 21)
        # TSVD
        self.tsvd = None
        self.tsvdDims = range(1, 10)

    def read_data_voice(self, dataName):
        """Read a CSV under dataSetPath into self.data (list of rows) and
        print basic stats. Assumes the last column is the label (hence -1)."""
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=self.csv_delimiter)
            self.data = list(reader)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.data)))
        print('Number of attributes: {}'.format(len(self.data[0]) - 1))

    def read_data_haptX(self, dataName):
        """Read a features-only CSV and append every row (as a list of raw
        strings) to self.allFeatures. Conversion to float32 happens later,
        in read_data_haptY."""
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        print(len(self.data))
        for elim in self.data:
            # copy the row element-by-element into a fresh list
            feature = []
            for i in elim:
                feature.append(i)
            self.allFeatures.append(feature)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allFeatures)))
        print('Number of attributes: {}'.format(len(self.allFeatures[0])))

    def read_data_haptY(self, dataName):
        """Read the matching labels CSV into self.allTarget, then convert
        BOTH allFeatures and allTarget to float32 ndarrays and flatten the
        targets. Must be called after read_data_haptX."""
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        for elim in self.data:
            self.allTarget.append(elim)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allTarget)))
        print('Number of attributes: {}'.format(len(self.allTarget[0])))
        # From here on allFeatures/allTarget are ndarrays, not lists.
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.allTarget = self.allTarget.ravel()

    def split_data_to_train_test(self, testSize=0.3):
        """Parse self.data rows into float features + binary target (label
        '0' -> 0, anything else -> 1), then split into train/test sets.

        Intended for the read_data_voice path (raw string rows); the hapt
        path converts its data in read_data_haptY instead.
        """
        # in case the data set are very different in format
        sample_len = len(self.data[0])
        for elem in self.data:
            feature = elem[0:sample_len - 1]  # all columns except the label
            feature_vector = []
            for f in feature:
                feature_vector.append(float(f))
            self.allFeatures.append(feature_vector)
            # binarize the last column: '0' stays 0, every other label -> 1
            if elem[-1] == '0':
                val = 0
            else:
                val = 1
            self.allTarget.append((float(val)))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split(
            self.allFeatures, self.allTarget, test_size=testSize,
            random_state=42)
        print(
            'Total X train data -> {}%'.format(
                int((len(self.XTrain) / len(self.data)) * 100)),
            'Size:', len(self.XTrain))
        print(
            'Total X test data -> {}%'.format(
                int((len(self.XTest) / len(self.data)) * 100)),
            'Size:', len(self.XTest))
        print(
            'Total Y train data -> {}%'.format(
                int((len(self.YTrain) / len(self.data)) * 100)),
            'Size:', len(self.YTrain))
        # NOTE(review): 'Size' below lacks the colon its siblings have.
        print(
            'Total Y test data -> {}%'.format(
                int((len(self.YTest) / len(self.data)) * 100)),
            'Size', len(self.YTest))

    def get_max_idx(self, input):
        """Return the index of the maximum element of `input` (first wins on
        ties). NOTE(review): the parameter name shadows the builtin `input`;
        np.argmax would do the same job."""
        maxVal = input[0]
        maxIdx = 0
        for i in range(1, len(input)):
            if input[i] > maxVal:
                maxIdx = i
                maxVal = input[i]
        return maxIdx

    def pairwiseDistCorr(self, X1, X2):
        """Correlation between the flattened pairwise-distance matrices of X1
        and X2 — how well a projection preserves inter-point distances."""
        assert X1.shape[0] == X2.shape[0]
        d1 = pairwise_distances(X1)
        d2 = pairwise_distances(X2)
        return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    def k_mean_cluster(self):
        """Sweep KMeans over self.kNum cluster counts on standardized
        features, report the best run, and save an accuracy-vs-k plot.

        NOTE(review): accuracy_score is applied to raw cluster labels, which
        are an arbitrary permutation of class ids — scores are only
        meaningful if labels happen to align; confirm this is intended.
        """
        print("-" * 50)
        print('{}: K-mean clustering'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.kmean = KMeans(random_state=5, max_iter=1000)
        for i in self.kNum:
            self.kmean.set_params(n_clusters=i)
            self.kmean.fit(dataX)
            scores.append(sm.accuracy_score(self.allTarget,
                                            self.kmean.labels_))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, self.kmean.labels_))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:", confusionMatrix[bestScoreIdx])
        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('K-mean Cluster ({})'.format(self.dataSetName))
        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_KMEAN.png'.format(self.dataSetName))
        print("-" * 50)

    def k_mean_cluster_reduced(self, n_clusters, reduced_data, name):
        """Fit KMeans with a fixed cluster count on dimensionality-reduced
        data and print accuracy + confusion matrix against self.allTarget.

        NOTE(review): `dataX` is computed but never used — the fit runs on
        `reduced_data`; the scaling line looks like leftover copy/paste.
        """
        print("-" * 50)
        print('{}: K-mean clustering {}'.format(self.dataSetName, name))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.kmeanRD = KMeans(n_clusters=n_clusters, random_state=5,
                              max_iter=1000)
        self.kmeanRD.fit(reduced_data)
        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, self.kmeanRD.labels_)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_))
        print("-" * 50)

    def expectation_maximization_reduced(self, n_components, reduced_data,
                                         name):
        """Fit a GaussianMixture with a fixed component count on reduced data
        and print accuracy + confusion matrix against self.allTarget."""
        print("-" * 50)
        print('{}: Expectation maximization {}'.format(self.dataSetName,
                                                       name))
        self.emRD = GaussianMixture(n_components=n_components, random_state=5)
        self.emRD.fit(reduced_data)
        y_predict = self.emRD.predict(reduced_data)
        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, y_predict)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, y_predict))
        print("-" * 50)

    def expectation_maximization(self):
        """Sweep GaussianMixture over self.kNum component counts on
        standardized features, report the best run, and save an
        accuracy-vs-k plot (same label-permutation caveat as
        k_mean_cluster)."""
        print("-" * 50)
        print('{}: Expectation maximization'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.em = GaussianMixture(random_state=5)
        for i in self.kNum:
            self.em.set_params(n_components=i)
            self.em.fit(dataX)
            y_predict = self.em.predict(dataX)
            scores.append(sm.accuracy_score(self.allTarget, y_predict))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, y_predict))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:")
        print(confusionMatrix[bestScoreIdx])
        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('Expectation Maximum Cluster ({})'.format(self.dataSetName))
        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_EM.png'.format(self.dataSetName))
        print("-" * 50)

    def PCA(self):
        """Grid-search the PCA component count through a PCA->MLP pipeline,
        then plot cumulative explained variance and grid-search scores."""
        print("-" * 50)
        print('{}: Principal component analysis '.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.pca = PCA(random_state=5)
        grid = {'pca__n_components': self.pcaDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('pca', self.pca), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number PCA components:", search.best_params_)
        # Refit PCA on the full data (all components) for the variance curve.
        self.pca.fit(dataX)
        var = np.cumsum(
            np.round(self.pca.explained_variance_ratio_, decimals=3) * 100)
        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.ylim([0, 1])
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.pcaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def ICA(self):
        """Sweep FastICA over self.icaDims recording mean absolute kurtosis
        per dimensionality, grid-search the component count through an
        ICA->MLP pipeline, and save both plots."""
        print("-" * 50)
        print('{}: Independent component analysis '.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.ica = FastICA(random_state=5, max_iter=6000)
        # kurtosis
        kurt = []
        for dim in self.icaDims:
            self.ica.set_params(n_components=dim)
            tmp = self.ica.fit_transform(dataX)
            tmp = pd.DataFrame(tmp)
            tmp = tmp.kurt(axis=0)
            # mean |kurtosis| over components: higher = more non-Gaussian
            kurt.append(tmp.abs().mean())
        # grid search
        grid = {'ica__n_components': self.icaDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('ica', self.ica), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number ICA components:", search.best_params_)
        plt.figure()
        plt.ylabel('Kurtosis')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(kurt)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_kurtosis.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.icaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def RP(self):
        """Sweep SparseRandomProjection over self.rpDims recording pairwise-
        distance correlation with the original space, grid-search the
        component count through an RP->MLP pipeline, and save both plots."""
        print("-" * 50)
        print('{}: Random Projection'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        disCorr = []
        self.rp = SparseRandomProjection(random_state=5)
        for dim in self.rpDims:
            self.rp.set_params(n_components=dim)
            disCorr.append(
                self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX))
        print(disCorr)
        # grid search
        grid = {'rp__n_components': self.rpDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('rp', self.rp), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number RP components:", search.best_params_)
        plt.figure()
        plt.ylabel('Distance')
        plt.xlabel('# of Features')
        plt.title('RP Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(disCorr)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_distance.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_RP_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def TSVD(self):
        """Grid-search the TruncatedSVD component count through a TSVD->MLP
        pipeline, then plot cumulative explained variance and grid-search
        scores. (Variance plot is saved as *_TSD_VA.png — note the filename
        spelling vs the TSVD prefix used elsewhere.)"""
        print("-" * 50)
        print('{}: TruncatedSVD'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.tsvd = TruncatedSVD(random_state=5)
        # grid search
        grid = {'tsvd__n_components': self.tsvdDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number TSVD components:", search.best_params_)
        # Refit on full data for the cumulative variance curve.
        self.tsvd.fit(dataX)
        var = np.cumsum(
            np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100)
        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName))
        print("-" * 50)