def RCA_Experiment(X, title, folder=""): n_components_range = list(np.arange(2, X.shape[1], 1)) correlation_coefficient = defaultdict(dict) for i, n in product(range(5), n_components_range): rp = RCA(random_state=i, n_components=n) rp.fit(X) projections = rp.components_ if sparse.issparse(projections): projections = projections.todense() p = pinv(projections) reconstructed = ((p @ projections) @ (X.T)).T correlation_coefficient[n][i] = np.nanmean(np.square(X - reconstructed)) correlation_coefficient = pd.DataFrame(correlation_coefficient).T mean_recon = correlation_coefficient.mean(axis=1).tolist() std_recon = correlation_coefficient.std(axis=1).tolist() plt.plot(n_components_range, mean_recon) plt.xlabel('Random Components') plt.ylabel('Mean Reconstruction Correlation') plt.title( 'Sparse Random Projection for Mean Reconstruction Correlation: ' + title) plt.savefig(folder + '/RcaMeanRE.png') plt.close() plt.plot(n_components_range, std_recon) plt.xlabel('Random Components') plt.ylabel('STD Reconstruction Correlation') plt.title("Sparse Random Projection for STD Reconstruction Correlation: " + title) plt.savefig(folder + '/RcaStdRE.png') plt.close()
def run_RCA(X, title):
    """Plot mean/std pairwise-distance correlation across random-projection restarts."""
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)

    for i, dim in product(range(10), dims):
        rp = RCA(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)

    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()

    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 10 Restarts: " + title)
    fig.tight_layout()
    plt.show()
def run_RCA(X, y, title):
    """Variant of run_RCA that also accepts labels (y is unused in the sweep)."""
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)

    for i, dim in product(range(5), dims):
        rp = RCA(random_state=i, n_components=dim)
        # Pairwise-distance correlation between the projected and original data.
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)

    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()

    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title(title + " RCA")
    fig.tight_layout()
    plt.show()
def credit_risk_data():
    data_X = credit_data.drop([
        'credit_amount', 'other_parties', 'purpose', 'own_telephone',
        'foreign_worker'
    ], axis=1)
    data_y = credit_data[['class']]

    features_to_encode = [
        'personal_status', 'checking_status', 'credit_history',
        'savings_status', 'employment', 'property_magnitude',
        'other_payment_plans', 'housing', 'job', 'class'
    ]
    enc = my_encoder()
    enc.fit(data_X, features_to_encode)
    X_train = enc.transform(data_X)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Credit Data")
    run_ICA(X_train, "Credit Data")
    run_RCA(X_train, "Credit Data")

    pca_credit = PCA(n_components=3, random_state=5).fit_transform(X_train)
    ica_credit = ICA(n_components=2, random_state=5).fit_transform(X_train)
    rca_credit = RCA(n_components=29, random_state=5).fit_transform(X_train)

    run_kmeans(pca_credit, X_train, "KMEANS")
    run_kmeans(ica_credit, X_train, "KMEANS")
    run_kmeans(rca_credit, X_train, "KMEANS")
    run_EM(pca_credit, X_train, 'PCA Credit Risk Data')
    run_EM(ica_credit, X_train, 'ICA Credit Risk Data')
    run_EM(rca_credit, X_train, 'RCA Credit Risk Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)
    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    print('Silhouette Score: %.3f' % score)
    # kmeans_silhoutte_analysis(X_train)
    elbow_function(X_train)
    run_kmeans(X_train, y_km, "KMEANS")

    em = EM(n_components=2, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")
    # evaluate_EM(em, X_train, y_em)

    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y,
                                                        test_size=0.2,
                                                        random_state=0)
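
# my_encoder is not defined in this excerpt. A minimal stand-in, assuming its
# job is to one-hot encode the listed categorical columns and pass the
# remaining columns through unchanged, could look like this:
class my_encoder:
    def fit(self, X, features_to_encode):
        self.features_to_encode = features_to_encode
        # Remember the fitted dummy-column layout so transform() stays consistent.
        self.columns = pd.get_dummies(X, columns=features_to_encode).columns
        return self

    def transform(self, X):
        dummies = pd.get_dummies(X, columns=self.features_to_encode)
        # Align to the fitted layout; categories unseen at fit time are dropped,
        # missing ones become zero-filled columns.
        return dummies.reindex(columns=self.columns, fill_value=0)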
def chess_game_data():
    data_X = game_data.drop([
        'id', 'created_at', 'increment_code', 'black_id', 'white_id', 'moves'
    ], axis=1)
    data_y = game_data[['winner']]
    gd = data_X[:1000]

    features_to_encode = [
        'rated', 'victory_status', 'winner', 'opening_eco', 'opening_name'
    ]
    enc = my_encoder()
    enc.fit(gd, features_to_encode)
    X_train = enc.transform(gd)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Chess Data")
    run_ICA(X_train, "Chess Data")
    run_RCA(X_train, "Chess Data")

    pca_chess = PCA(random_state=5).fit_transform(X_train)
    # ica_chess = ICA(random_state=5).fit_transform(X_train)
    rca_chess = RCA(n_components=60, random_state=5).fit_transform(X_train)

    run_kmeans(pca_chess, X_train, "KMEANS")
    # run_kmeans(ica_chess, X_train, "KMEANS")
    run_kmeans(rca_chess, X_train, "KMEANS")
    run_EM(pca_chess, X_train, 'PCA Chess Game Data')
    # run_EM(ica_chess, X_train, 'ICA Chess Game Data')
    run_EM(rca_chess, X_train, 'RCA Chess Game Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)
    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    print('Silhouette Score: %.3f' % score)
    # kmeans_silhoutte_analysis(X_train)
    run_kmeans(X_train, y_km, "KMEANS")
    elbow_function(X_train)

    em = EM(n_components=4, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")

    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y,
                                                        test_size=0.2,
                                                        random_state=0)
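
# elbow_function is referenced above but not defined in this excerpt. A minimal
# sketch, assuming it draws the usual K-Means elbow plot (inertia versus k):
def elbow_function(X, k_range=range(2, 16)):
    sse = []
    for k in k_range:
        # Inertia = within-cluster sum of squared distances for this k.
        sse.append(KMeans(n_clusters=k, random_state=0).fit(X).inertia_)
    plt.plot(list(k_range), sse, 'o-')
    plt.xlabel('Number of clusters k')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method')
    plt.show()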
def main():
    df = pd.read_csv("../Dataset/winequality-white.csv", delimiter=";")
    seed = 200
    np.random.seed(seed)

    # Binarize quality: <= 6 becomes 0, > 6 becomes 1.
    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1

    X = np.array(df.iloc[:, 0:-1])
    wine_Y = np.array(df.iloc[:, -1])
    standardScalerX = StandardScaler()
    wine_x = standardScalerX.fit_transform(X)

    pca_wine = PCA(n_components=7, random_state=seed).fit_transform(wine_x)
    ica_wine = ICA(n_components=9, random_state=seed).fit_transform(wine_x)
    rca_wine = RCA(n_components=8, random_state=seed).fit_transform(wine_x)
    imp_wine, top_columns_wine = run_RFC(wine_x, wine_Y, df)
    rfc_wine = df[top_columns_wine]
    rfc_wine = np.array(rfc_wine.values, dtype='int64')

    # Train the same MLP on the full data and on each reduced representation.
    for data, title in [(wine_x, "FullDataset"), (pca_wine, "PCA"),
                        (ica_wine, "ICA"), (rca_wine, "RP"),
                        (rfc_wine, "RFC")]:
        X_train, X_test, y_train, y_test = train_test_split(
            np.array(data), np.array(wine_Y), test_size=0.30)
        learner = MLPClassifier(hidden_layer_sizes=(22,), activation='relu',
                                learning_rate_init=0.0051, random_state=seed)
        evaluate(learner, X_train, X_test, y_train, y_test, title=title)
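
# run_RFC is not defined in this excerpt. A hedged sketch, mirroring the
# commented-out random-forest feature-selection block later in this file:
# rank features by importance and keep the head of the ranking whose cumulative
# importance stays under a threshold (0.35 is taken from that commented-out
# code and is an assumption for the other call sites).
def run_RFC(X, Y, df_original, threshold=0.35):
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier(n_estimators=500,
                                 min_samples_leaf=max(1, round(len(X) * 0.01)),
                                 random_state=5, n_jobs=-1)
    # Assumes the feature columns come first in df_original.
    imp = pd.DataFrame(rfc.fit(X, Y).feature_importances_,
                       columns=['Feature Importance'],
                       index=df_original.columns[:X.shape[1]])
    imp.sort_values(by=['Feature Importance'], inplace=True, ascending=False)
    imp['Cum Sum'] = imp['Feature Importance'].cumsum()
    top_cols = imp[imp['Cum Sum'] <= threshold].index.tolist()
    return imp, top_cols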
from sklearn.datasets import load_digits
data_digits = load_digits()
#X1, Y1 = pd.DataFrame(data_digits["data"]), pd.Series(data_digits["target"])
#Dataset = "digits"

from sklearn.datasets import load_wine
data_wine = load_wine()
X1 = pd.DataFrame(data_wine["data"], columns=data_wine.feature_names)
Y1 = pd.Series(data_wine["target"])
Dataset = "wine"

Xraw = X1
Xpca = PCA(n_components=5, random_state=5).fit_transform(X1)
Xica = ICA(n_components=5, random_state=5).fit_transform(X1)
Xica /= Xica.std(axis=0)
Xrca = RCA(n_components=5, random_state=5).fit_transform(X1)

# Run RFC
#rfc = RFC(n_estimators=500, min_samples_leaf=round(len(X1)*.01), random_state=5, n_jobs=-1)
#imp = rfc.fit(X1, Y1).feature_importances_
#imp = pd.DataFrame(imp, columns=['Feature Importance'])
#imp.sort_values(by=['Feature Importance'], inplace=True, ascending=False)
#imp['Cum Sum'] = imp['Feature Importance'].cumsum()
#imp = imp[imp['Cum Sum'] <= 0.35]
#top_cols = imp.index.tolist()
#Xrfc = X1[top_cols]


def MLP_classifier(X, Y, datasource):
    param_range = range(1, 201, 20)
def dimensionality_reduction_analysis():
    X_p, Y_p, df_phish = get_phishing_data()

    run_PCA(X_p, Y_p, "Phishing Data")
    run_ICA(X_p, Y_p, "Phishing Data")
    run_RCA(X_p, Y_p, "Phishing Data")
    imp_phish, topcols_phish = run_RFC(X_p, Y_p, df_original=df_phish)

    pca_phish = PCA(n_components=32, random_state=5).fit_transform(X_p)
    ica_phish = ICA(n_components=32, random_state=5).fit_transform(X_p)
    rca_phish = RCA(n_components=32, random_state=5).fit_transform(X_p)
    rfc_phish = df_phish[topcols_phish]
    rfc_phish = np.array(rfc_phish.values, dtype='int64')[:, :32]

    # run_kmeans(pca_phish, Y_p, 'PCA Phishing Data')
    run_kmeans(ica_phish, Y_p, 'ICA Phishing Data')
    run_kmeans(rca_phish, Y_p, 'RCA Phishing Data')
    run_kmeans(rfc_phish, Y_p, 'RFC Phishing Data')

    evaluate_kmeans(KMeans(n_clusters=14, n_init=10, random_state=100, n_jobs=-1),
                    pca_phish, Y_p, title="PCA")
    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100, n_jobs=-1),
                    ica_phish, Y_p, title="ICA")
    evaluate_kmeans(KMeans(n_clusters=10, n_init=10, random_state=100, n_jobs=-1),
                    rca_phish, Y_p, title="RCA")
    evaluate_kmeans(KMeans(n_clusters=2, n_init=10, random_state=100, n_jobs=-1),
                    rfc_phish, Y_p, title="RFC")

    run_EM(pca_phish, Y_p, 'PCA Phishing Data')
    run_EM(ica_phish, Y_p, 'ICA Phishing Data')
    run_EM(rca_phish, Y_p, 'RCA Phishing Data')
    run_EM(rfc_phish, Y_p, 'RFC Phishing Data')

    evaluate_EM(EM(n_components=67, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                pca_phish, Y_p, title="PCA")
    evaluate_EM(EM(n_components=64, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                ica_phish, Y_p, title="ICA")
    evaluate_EM(EM(n_components=64, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                rca_phish, Y_p, title="RCA")
    evaluate_EM(EM(n_components=32, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                rfc_phish, Y_p, title="RFC")

    X_v, Y_v, df_vocal = get_vocal_data()

    run_PCA(X_v, Y_v, "Phone Me Data")
    run_ICA(X_v, Y_v, "Phone Me Data")
    run_RCA(X_v, Y_v, "Phone Me Data")
    imp_vocal, topcols_vocal = run_RFC(X_v, Y_v, df_original=df_vocal)

    pca_vocal = PCA(n_components=4, random_state=5).fit_transform(X_v)
    ica_vocal = ICA(n_components=4, random_state=5).fit_transform(X_v)
    rca_vocal = RCA(n_components=4, random_state=5).fit_transform(X_v)
    rfc_vocal = df_vocal[topcols_vocal]
    rfc_vocal = np.array(rfc_vocal.values, dtype='int64')[:, :4]

    run_kmeans(pca_vocal, Y_v, 'PCA Phone Me Data')
    run_kmeans(ica_vocal, Y_v, 'ICA Phone Me Data')
    run_kmeans(rca_vocal, Y_v, 'RCA Phone Me Data')
    run_kmeans(rfc_vocal, Y_v, 'RFC Phone Me Data')

    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100, n_jobs=-1),
                    pca_vocal, Y_v, title="PCA")
    evaluate_kmeans(KMeans(n_clusters=10, n_init=10, random_state=100, n_jobs=-1),
                    ica_vocal, Y_v, title="ICA")
    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100, n_jobs=-1),
                    rca_vocal, Y_v, title="RCA")
    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100, n_jobs=-1),
                    rfc_vocal, Y_v, title="RFC")

    run_EM(pca_vocal, Y_v, 'PCA Phone Me Data')
    run_EM(ica_vocal, Y_v, 'ICA Phone Me Data')
    run_EM(rca_vocal, Y_v, 'RCA Phone Me Data')
    run_EM(rfc_vocal, Y_v, 'RFC Phone Me Data')

    evaluate_EM(EM(n_components=58, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                pca_vocal, Y_v, title="PCA")
    evaluate_EM(EM(n_components=52, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                ica_vocal, Y_v, title="ICA")
    evaluate_EM(EM(n_components=56, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                rca_vocal, Y_v, title="RCA")
    evaluate_EM(EM(n_components=48, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                rfc_vocal, Y_v, title="RFC")

    # Comparing With NN
    representations = [("Full", X_p), ("PCA", pca_phish), ("ICA", ica_phish),
                       ("RCA", rca_phish), ("RFC", rfc_phish)]
    train_samp, train_score, fit_time, pred_time = {}, {}, {}, {}
    for name, data in representations:
        print("Original" if name == "Full" else name)
        X_train, X_test, y_train, y_test = train_test_split(np.array(data),
                                                            np.array(Y_p),
                                                            test_size=0.20)
        est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                            activation='logistic', learning_rate_init=0.01,
                            random_state=100)
        results = plot_learning_curve(est, X_train, y_train,
                                      title="Neural Net Phishing: " + name)
        train_samp[name], train_score[name], fit_time[name], pred_time[name] = results
        final_classifier_evaluation(est, X_train, X_test, y_train, y_test)

    compare_fit_time(train_samp["Full"], fit_time["Full"], fit_time["PCA"],
                     fit_time["ICA"], fit_time["RCA"], fit_time["RFC"],
                     'Phishing Dataset')
    compare_pred_time(train_samp["Full"], pred_time["Full"], pred_time["PCA"],
                      pred_time["ICA"], pred_time["RCA"], pred_time["RFC"],
                      'Phishing Dataset')
    compare_learn_time(train_samp["Full"], train_score["Full"], train_score["PCA"],
                       train_score["ICA"], train_score["RCA"], train_score["RFC"],
                       'Phishing Dataset')

    # Training NN on Projected data with cluster labels
    print("Training Clustered Label")
    km = KMeans(n_clusters=2, n_init=10, random_state=100, n_jobs=-1).fit(X_p)
    km_labels = km.labels_
    em = EM(n_components=30, covariance_type='diag', n_init=1, warm_start=True,
            random_state=100).fit(X_p)
    em_labels = em.predict(X_p)

    clustered = [("Full", addclusters(X_p, km_labels, em_labels)),
                 ("PCA", addclusters(pca_phish, km_labels, em_labels)),
                 ("ICA", addclusters(ica_phish, km_labels, em_labels)),
                 ("RCA", addclusters(rca_phish, km_labels, em_labels)),
                 ("RFC", addclusters(rfc_phish, km_labels, em_labels))]
    train_samp, train_score, fit_time, pred_time = {}, {}, {}, {}
    for name, data in clustered:
        print("Training Clustered - " + ("Original" if name == "Full" else name))
        X_train, X_test, y_train, y_test = train_test_split(np.array(data),
                                                            np.array(Y_p),
                                                            test_size=0.20)
        est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                            activation='logistic', learning_rate_init=0.01,
                            random_state=100)
        results = plot_learning_curve(
            est, X_train, y_train,
            title="Neural Net Phishing with Clusters: " + name)
        train_samp[name], train_score[name], fit_time[name], pred_time[name] = results
        final_classifier_evaluation(est, X_train, X_test, y_train, y_test)

    compare_fit_time(train_samp["Full"], fit_time["Full"], fit_time["PCA"],
                     fit_time["ICA"], fit_time["RCA"], fit_time["RFC"],
                     'Phishing Dataset')
    compare_pred_time(train_samp["Full"], pred_time["Full"], pred_time["PCA"],
                      pred_time["ICA"], pred_time["RCA"], pred_time["RFC"],
                      'Phishing Dataset')
    compare_learn_time(train_samp["Full"], train_score["Full"], train_score["PCA"],
                       train_score["ICA"], train_score["RCA"], train_score["RFC"],
                       'Phishing Dataset')
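
# addclusters is not defined in this excerpt. A minimal sketch, assuming it
# appends the K-Means and EM cluster assignments as two extra feature columns
# so the neural net trains on "data + cluster labels":
def addclusters(X, km_labels, em_labels):
    X = np.asarray(X)
    return np.column_stack((X, km_labels, em_labels))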
def pairwiseDistCorr(X1, X2):
    """Correlation between the pairwise-distance matrices of two representations."""
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]


# Run RCA
dims = list(np.arange(2, (X1.shape[1] - 1), 3))
dims.append(X1.shape[1])
tmp = defaultdict(dict)
for i, dim in product(range(5), dims):
    rp = RCA(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X1), X1)
tmp = pd.DataFrame(tmp).T
mean_recon = tmp.mean(axis=1).tolist()
std_recon = tmp.std(axis=1).tolist()

# Plot RCA
fig, ax1 = plt.subplots()
ax1.plot(dims, mean_recon, 'b-')
ax1.set_xlabel('Random Components')
ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
ax1.tick_params('y', colors='b')
plt.grid(False)

ax2 = ax1.twinx()
ax2.plot(dims, std_recon, 'm-')