def ica_experiment(X, name, dims, max_iter=5000, tol=1e-04):
    """Run ICA on a dataset and save kurtosis and reconstruction-loss results.

    For each candidate component count, fits FastICA on the standardized
    attributes, records the mean kurtosis-test statistic across components and
    the mean squared reconstruction error, and writes both series to
    ``results/ICA/<name>_kurtosis.csv``.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.
        max_iter (int): Maximum FastICA iterations per fit.
        tol (float): FastICA convergence tolerance.
    """
    ica = FastICA(random_state=0, max_iter=max_iter, tol=tol)
    kurt = []
    loss = []
    # Standardize once up front so every fit sees the same scaled data.
    X = StandardScaler().fit_transform(X)
    for dim in dims:
        print(dim)
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        # Mean kurtosis-test statistic across the recovered components.
        # (The original also built an unused per-component DataFrame kurtosis
        # here; that dead computation has been removed.)
        kurt.append(kurtosistest(tmp).statistic.mean())
        # Mean squared error between the data and its ICA reconstruction.
        proj = ica.inverse_transform(tmp)
        loss.append(((X - proj) ** 2).mean())

    res = pd.DataFrame({"kurtosis": kurt, "loss": loss})

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
def perform(self):
    """Run the ICA experiment: kurtosis scree data, then a grid-searched ICA+NN pipeline."""
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/ICA.py
    self.log("Performing {}".format(self.experiment_name()))

    # %% Data for 1: mean absolute kurtosis of the components for each dim.
    ica = FastICA(random_state=self._details.seed)
    kurt = {}
    for dim in self._dims:
        ica.set_params(n_components=dim)
        projected = pd.DataFrame(ica.fit_transform(self._details.ds.training_x))
        kurt[dim] = projected.kurt(axis=0).abs().mean()
    pd.Series(kurt).to_csv(
        self._out.format('{}_scree.csv'.format(self._details.ds_name)))

    # %% Data for 2: grid search component counts with NN hyperparameters.
    grid = {'ica__n_components': self._dims,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    ica = FastICA(random_state=self._details.seed)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    pipe = Pipeline([('ica', ica), ('NN', mlp)],
                    memory=experiments.pipeline_memory)
    gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
    self.log("Grid search complete")

    pd.DataFrame(gs.cv_results_).to_csv(
        self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
    self.log("Done")
def run_ica_2(X, dataset):
    """Sweep ICA component counts, record mean |kurtosis|, and plot/log the results."""
    model = FastICA(random_state=0)
    result_df = pd.DataFrame()
    # Cap the sweep at 120 components.
    k_max = min(X.shape[1], 120)
    for k in range(2, k_max + 1):
        model.set_params(n_components=k)
        # kurtosis of ica results
        component_kurt = pd.DataFrame(model.fit_transform(X)).kurt(axis=0)
        result_df.loc[k, 'mean_kurtosis'] = component_kurt.abs().mean()
    plt.clf()
    plt.title('ICA_Mean_Kurtosis_Per_K')
    plt.xlabel('K')
    plt.ylabel('Mean')
    plt.grid()
    plt.bar(range(2, result_df.shape[0] + 2),
            result_df['mean_kurtosis'],
            align='center',
            label='mean kurtosis')
    LOGGER.info('ica max kurtosis on {}: k={}'.format(
        dataset, result_df.idxmax(axis=0)['mean_kurtosis']))
    plt.savefig('plots/' + 'ica_kurt_' + dataset + '.png')
def ica_experiment(X, name, dims):
    """Run ICA on specified dataset and saves mean kurtosis results as CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.
    """
    ica = FastICA(random_state=0, max_iter=5000)
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        components = pd.DataFrame(ica.fit_transform(X))
        # Average absolute kurtosis across the recovered components.
        kurt[dim] = components.kurt(axis=0).abs().mean()

    res = pd.DataFrame.from_dict(kurt, orient='index')
    res.rename(columns={0: 'kurtosis'}, inplace=True)

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
def run_credit_ICA(creditX, creditY):
    """ICA kurtosis sweep on the credit data, then NN runs on the reduced data."""
    dims_digits = list(range(1, 24))
    print('Part 2B & 4B - Starting ICA for dataset...credit')
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_digits:
        ica.set_params(n_components=dim)
        transformed = pd.DataFrame(ica.fit_transform(creditX))
        kurt[dim] = transformed.kurt(axis=0).abs().mean()
    pd.Series(kurt).to_csv(
        './P2_Dimensionality_Reduction/Credit_ICA_kurtosis.csv')

    # Run Neural Networks
    # Transform X data
    creditX_tr = StandardScaler().fit_transform(creditX)
    nn_results = run_NN(dims_digits, ica, creditX_tr, creditY)
    nn_results.to_csv('./P4_Neural_Networks_Reduced/Credit_ICA_nn_results.csv')
def main():
    """Run the ICA dimensionality-reduction pass for both datasets."""
    decomp1 = FastICA(random_state=10)
    decomp2 = FastICA(random_state=10)
    # r_dims and c_dims are parallel lists of component counts.
    for idx in range(len(r_dims)):
        decomp1.set_params(n_components=r_dims[idx])
        decomp2.set_params(n_components=c_dims[idx])
        run_dim_alg(r_X, r_y, 'reviews', decomp1, r_dims[idx], OUT)
        run_dim_alg(c_X, c_y, 'cancer', decomp2, c_dims[idx], OUT)
def run_ica(X, dname, dims):
    """Write mean |kurtosis| per ICA component count to '<dname>_ica.csv'."""
    ica = FastICA(random_state=5, max_iter=5000)
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurt[dim] = comps.kurt(axis=0).abs().mean()
    pd.Series(kurt).to_csv(out + '{}_ica.csv'.format(dname))
def get_gnnl_ica():
    """Fit ICA on the gnnl training set and keep only high-kurtosis components.

    Returns the train and test projections restricted to components whose
    kurtosis exceeds 200.
    """
    best_ica_components = X_train_gnnl.shape[1]
    ica = FastICA(random_state=42, max_iter=500)
    print("Running ICA for {} components".format(best_ica_components))
    ica.set_params(n_components=best_ica_components)
    ica.fit(X_train_gnnl)
    train_proj = ica.transform(X_train_gnnl)
    test_proj = ica.transform(X_test_gnnl)
    # Per-component kurtosis of the projected training data.
    ica_kurt = pd.DataFrame(train_proj).kurt(axis=0)
    keep = ica_kurt > 200
    return train_proj[:, keep], test_proj[:, keep]
def perform(self):
    """ICA experiment: kurtosis scree per dim, then grid search an ICA+NN pipeline."""
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/ICA.py
    self.log("Performing {}".format(self.experiment_name()))

    # %% Data for 1: for each candidate dim, project the training data with
    # ICA and record the mean absolute kurtosis across the recovered
    # components (higher kurtosis = more non-Gaussian components).
    ica = FastICA(random_state=self._details.seed)
    kurt = {}
    for dim in self._dims:
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(self._details.ds.training_x))
        kurt[dim] = comps.kurt(axis=0).abs().mean()
    # {dim: avg kurtosis} -> e.g. './output/ICA/<ds>_scree.csv'
    pd.Series(kurt).to_csv(
        self._out.format('{}_scree.csv'.format(self._details.ds_name)))

    # %% Data for 2: learn a NN on the ICA-transformed features and
    # grid-search both the component count and the NN hyperparameters.
    grid = {
        'ica__n_components': self._dims,
        'NN__alpha': self._nn_reg,
        'NN__hidden_layer_sizes': self._nn_arch
    }
    ica = FastICA(random_state=self._details.seed)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=self._details.seed)
    pipe = Pipeline([('ica', ica), ('NN', mlp)],
                    memory=experiments.pipeline_memory)
    gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
    self.log("Grid search complete")

    pd.DataFrame(gs.cv_results_).to_csv(
        self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
    self.log("Done")
def performICA(X, title):
    """Plot mean |kurtosis| of ICA components for K in 2..13."""
    ica = FastICA(random_state=11, whiten=True)
    kurt = {}
    for d in range(2, 14):
        ica.set_params(n_components=d)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurt[d] = comps.kurt(axis=0).abs().mean()
    pd.Series(kurt).plot()
    plt.xlabel("K")
    plt.title("ICA on " + title)
    plt.show()
def ica(X, problem):
    """Save mean |kurtosis| per ICA dim for the given problem as a CSV."""
    ica = FastICA(random_state=5)
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    # The Blood dataset has few features, so sweep every possible dim instead.
    if 'Blood' in problem:
        dims = range(2, len(X[0]))
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurt[dim] = comps.kurt(axis=0).abs().mean()
    pd.Series(kurt).to_csv(out + problem + 'ICA.csv')
def ICA_experiment(X, y, title, folder=""):
    """Plot average |kurtosis| across ICA components for 2..n_features-1 components."""
    n_components_range = list(np.arange(2, X.shape[1], 1))
    ica = ICA(random_state=200)
    kurtosis_scores = []
    for n in n_components_range:
        ica.set_params(n_components=n)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurtosis_scores.append(comps.kurt(axis=0).abs().mean())
    plt.figure()
    plt.title("ICA Kurtosis: " + title)
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(n_components_range, kurtosis_scores)
    plt.savefig(folder + '/ICA.png')
    plt.close()
def run_ICA(X, title):
    """Plot avg |kurtosis| of ICA components over a range of component counts."""
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    ica = ICA(random_state=5)
    kurt = []
    for dim in dims:
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurt.append(comps.kurt(axis=0).abs().mean())
    plt.figure()
    plt.title("ICA Kurtosis: " + title)
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(dims, kurt, 'b-')
    plt.grid(False)
    plt.show()
def run_adult_ICA(adultX, adultY):
    """ICA kurtosis sweep on the adult data, then NN runs on the reduced data."""
    dims_digits = list(range(1, 15))
    print('Part 2B & 4B - Starting ICA for dataset...adult')
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_digits:
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(adultX))
        # taking the mean kurtosis of all components
        kurt[dim] = comps.kurt(axis=0).abs().mean()
    pd.Series(kurt).to_csv('./P2_Dimensionality_Reduction/Adult_ICA_kurtosis.csv')

    # Transform X data
    adultX_tr = StandardScaler().fit_transform(adultX)
    nn_results = run_NN(dims_digits, ica, adultX_tr, adultY)
    nn_results.to_csv('./P4_Neural_Networks_Reduced/Adult_ICA_nn_results.csv')
def run_ICA(X, y, plot_path):
    """Plot avg kurtosis across ICA components and save it under plot_path."""
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    ica = ICA(random_state=1, max_iter=10)
    kurt = []
    for dim in dims:
        print(dim)
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurt.append(comps.kurt(axis=0).abs().mean())
    plt.figure()
    plt.title("ICA Kurtosis")
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(dims, kurt, 'b-')
    plt.grid(False)
    plt.savefig(plot_path + '/ICA_DR')
def part2():
    """Write ICA kurtosis sweep CSVs for the cancer and housing datasets.

    The original duplicated the sweep loop verbatim for both datasets; it is
    factored into a single private helper here.
    """
    def _kurtosis_sweep(ica, X, dims):
        # Mean absolute kurtosis of the ICA components for each dim.
        kurt = {}
        for dim in dims:
            ica.set_params(n_components=dim)
            comps = pd.DataFrame(ica.fit_transform(X))
            kurt[dim] = comps.kurt(axis=0).abs().mean()
        return pd.Series(kurt)

    # Cancer: loose tolerance / capped iterations, dims 1..30.
    ica = FastICA(random_state=5, max_iter=1000, tol=0.75)
    _kurtosis_sweep(ica, cancer_x, range(1, 31)).to_csv(out + 'cancer part 2.csv')

    # Housing: default solver settings over the module-level dims_big list.
    ica = FastICA(random_state=5)
    _kurtosis_sweep(ica, housing_x, dims_big).to_csv(out + 'housing part 2.csv')
def run_ICA(X, y, title):
    """Plot avg |kurtosis| across ICA components; save under plotsdir/title."""
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    ica = ICA(random_state=randomSeed, whiten=True)
    kurt = []
    for dim in dims:
        ica.set_params(n_components=dim)
        comps = pd.DataFrame(ica.fit_transform(X))
        kurt.append(comps.kurt(axis=0).abs().mean())
    plt.figure()
    plt.title("ICA Kurtosis: " + title)
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(dims, kurt, 'b-')
    plt.grid(False)
    d = plotsdir + "/" + title
    if not os.path.exists(d):
        os.makedirs(d)
    plt.savefig(d + "/ICA Kurtosis.png")
from helpers.dim_reduction import run_dim_alg, get_data
from helpers.constants import ICA_DIMS

# Same component counts for both datasets.
r_dims = c_dims = ICA_DIMS
OUT = '{}/../../OUTPUT/ICA'.format(dir_path)
BASE = '{}/../../OUTPUT/BASE'.format(dir_path)

# get_data returns (X, y) pairs for the reviews and cancer datasets.
r, c = get_data(BASE)
r_X, r_y = r
c_X, c_y = c

# Kurtosis sweep for the reviews dataset: mean |kurtosis| per component count.
ica = FastICA(random_state=5)
kurt = {}
for dim in r_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(r_X)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()
kurt = pd.Series(kurt)
kurt.to_csv('{}/reviews kurtosis.csv'.format(OUT))

# Kurtosis sweep for the cancer dataset.
# NOTE(review): this chunk appears truncated — the loop body below lacks the
# kurt[dim] assignment present in the loop above; confirm against the full file.
ica = FastICA(random_state=5)
kurt = {}
for dim in c_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(c_X)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
#%% Baseline scores datasets={} datasets['Titanic']={'X_train':XT_train.copy(), 'y_train':yT_train.copy(), 'X_test':XT_test.copy(), 'y_test':yT_test.copy()} datasets['Wilt']={'X_train':XW_train.copy(), 'y_train':yW_train.copy(), 'X_test':XW_test.copy(), 'y_test':yW_test.copy()} clusters = [2,3,4,5,6,8,10,12,15,20,25,30,35,40,50] scores = hlp.explore_clustering(datasets, clusters) #%% Part 2 ICA # ICA for Titanic icaT = FastICA(random_state=54) dims = [2,3,4,5,6,7,8,9,10] kurtT = {} for dim in dims: icaT.set_params(n_components=dim) trans = icaT.fit_transform(XT) proj = icaT.inverse_transform(trans) tmp = pd.DataFrame(trans) tmp = tmp.kurt(axis=0) rec_err = ((XT - proj)**2).mean() kurtT[dim] = (round(tmp.abs().mean(),3), round(tmp.abs().min(),3), round(rec_err,3), tmp) kurtT = pd.Series(kurtT) kurtT # examine average and minimum kurtosis kurtT[7] kurtT[4] # check what kurt returns on a normal distribution: pd.DataFrame(np.random.normal(155, 72, 100000)).kurt(axis=0) icaW = FastICA(random_state=54)
def ulICA(X, y, random_seed, filename, verbose=False):
    """Plot ICA kurtosis per component count, then a reconstruction-error curve.

    Args:
        X: feature DataFrame (presumably — `.columns` is used; TODO confirm).
        y: labels (unused in this function).
        random_seed: seed passed to the decomposition estimators.
        filename: dataset label used in plot titles and image file names.
        verbose: when True, print intermediate score lists.
    """
    # Part 1: mean |kurtosis| of the ICA components for 1..n_features components.
    n_cols = len(X.columns)
    n_com = range(1, n_cols + 1)
    ica = FastICA(random_state=random_seed)
    kurt_scores = []
    for n in n_com:
        ica.set_params(n_components=n)
        icaX = ica.fit_transform(X)
        icaX = pd.DataFrame(icaX)
        icaX = icaX.kurt(axis=0)
        kurt_scores.append(icaX.abs().mean())
    if verbose:
        print(kurt_scores)
    plt.figure(0)
    plt.xlabel("# of Components", fontsize=16)
    plt.ylabel("Average Kurtosis", fontsize=16)
    plt.title(filename + ' ICA', fontsize=16)
    plt.plot(n_com, kurt_scores, 'b-')
    plt.xticks(range(1, n_cols + 1), fontsize=16)
    plt.yticks(fontsize=16)
    plt.grid(linestyle='-', linewidth=1, axis="x")
    plt.savefig("Images\\" + filename + " ICA Kurtosis")
    plt.show()
    plt.close()

    # Part 2: mean reconstruction error over 50 repetitions per component count.
    # NOTE(review): despite the 'ICA RE' labels and plot title, this pass fits
    # sklearn PCA, not FastICA — looks like a copy/paste from an RP/PCA variant;
    # confirm whether ICA reconstruction was intended.
    n_cols = len(X.columns)
    n_com = range(1, n_cols + 1)
    re = defaultdict(dict)
    for i, n in product(range(50), n_com):
        random_projection = PCA(random_state=random_seed, n_components=n)
        X_Reduced = random_projection.fit_transform(X)
        # Reconstruct via the pseudo-inverse of the component matrix.
        p_inverse = np.linalg.pinv(random_projection.components_.T)
        Recon_X = X_Reduced.dot(p_inverse)
        MSE_RE = metrics.mean_squared_error(X, Recon_X)
        re[n][i] = MSE_RE
    rec = pd.DataFrame(re).T
    re_mean = rec.mean(axis=1).tolist()
    re_std = rec.std(axis=1).tolist()
    # Build mean +/- std bands for the fill_between plot.
    lower_axis = []
    upper_axis = []
    zip_object = zip(re_mean, re_std)
    for list1_i, list2_i in zip_object:
        lower_axis.append(list1_i - list2_i)
        upper_axis.append(list1_i + list2_i)
    if verbose:
        print('ICA RE')
        print(re_mean)
        print(re_std)
    fig, ax1 = plt.subplots()
    ax1.plot(n_com, re_mean, 'b-')
    ax1.fill_between(n_com, lower_axis, upper_axis, alpha=0.2)
    ax1.set_xlabel('# of Components', fontsize=16)
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Error', color='b', fontsize=16)
    ax1.tick_params('y', colors='b', labelsize=16)
    ax1.tick_params('x', labelsize=16)
    plt.grid(False)
    plt.title(filename + " ICA Mean Reconstruction Error", fontsize=16)
    fig.tight_layout()
    plt.show()
def main():
    """ICA pipeline for the character and madelon datasets.

    Part 1 writes kurtosis scree CSVs; a deliberate bare `raise` then stops
    execution before parts 2 (NN grid search) and 3 (dump reduced datasets).
    """
    out = './BASES/'
    np.random.seed(0)
    # Load both datasets from the shared HDF store; last column is the label.
    character = pd.read_hdf('./BASES/datasets.hdf', 'character')
    character_X = character.drop('Class', 1).copy().values
    character_Y = character['Class'].copy().values
    madelon = pd.read_hdf('./BASES/datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', 1).copy().values
    madelon_Y = madelon['Class'].copy().values
    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)
    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dim_red = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims_red_s = [2, 4, 6, 8, 10, 12, 14, 16]

    # Data for 1: kurtosis scree for character.
    ################################
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_red_s:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(character_X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()
    kurt = pd.Series(kurt)
    kurt.to_csv(out + 'character_scree.csv')
    ################################
    # Kurtosis scree for madelon.
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dim_red:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(madelon_X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()
    kurt = pd.Series(kurt)
    kurt.to_csv(out + 'madelon_scree.csv')
    # NOTE(review): bare raise outside an except block triggers a RuntimeError —
    # presumably a deliberate stop so only part 1 runs; confirm before removing.
    raise
    # Data for 2: grid search ICA component count + NN hyperparameters.
    ##############################
    grid = {'ica__n_components': dims_red_s,
            'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
    ica = FastICA(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('ica', ica), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_dim_red.csv')
    ##############################
    grid = {'ica__n_components': dim_red,
            'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
    ica = FastICA(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('ica', ica), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon_dim_red.csv')
    # Data for 3: dump ICA-reduced datasets for the clustering script.
    ###############################
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 16
    ica = FastICA(n_components=dim, random_state=10)
    character_X2 = ica.fit_transform(character_X)
    # Append the label column back onto the reduced features.
    character_2 = pd.DataFrame(np.hstack((character_X2,
                                          np.atleast_2d(character_Y).T)))
    cols = list(range(character_2.shape[1]))
    cols[-1] = 'Class'
    character_2.columns = cols
    character_2.to_hdf(out + 'datasets.hdf', 'character',
                       complib='blosc', complevel=9)
    #################################
    dim = 45
    ica = FastICA(n_components=dim, random_state=10)
    madelon_X2 = ica.fit_transform(madelon_X)
    madelon_2 = pd.DataFrame(np.hstack((madelon_X2,
                                        np.atleast_2d(madelon_Y).T)))
    cols = list(range(madelon_2.shape[1]))
    cols[-1] = 'Class'
    madelon_2.columns = cols
    madelon_2.to_hdf(out + 'datasets.hdf', 'madelon',
                     complib='blosc', complevel=9)
def clustering_ica(cluster_range, ICA_component_, dataset, dir):
    """Run k-means, EM, and an NN experiment over ICA-reduced data.

    Writes a kurtosis scree CSV, then for every ICA component count times every
    cluster count records clustering accuracy and wall-clock time, and finally
    plots all four time/accuracy series.

    Args:
        cluster_range: iterable of cluster counts for k-means/EM.
        ICA_component_: iterable of ICA component counts to evaluate.
        dataset: project dataset object exposing .data, .x, .y, .dataset_name.
        dir: output directory prefix for CSVs and plots.
    """
    df = dataset.data
    x = (df.iloc[:, 0:-1])
    y = (df.iloc[:, -1])
    y = y.astype('int')
    x = StandardScaler().fit_transform(x)
    # NOTE(review): _dataset_ica appears twice in this global statement —
    # harmless but probably a typo.
    global _ica, x_ica, _dataset_ica, _dataset_ica
    NN_ICA_accuracy = defaultdict(dict)
    kmeans_accuracy_ICA = defaultdict(dict)
    kmeans_time_ICA = defaultdict(dict)
    em_accuracy_ICA = defaultdict(dict)
    em_time_ICA = defaultdict(dict)
    # Kurtosis scree over the raw dataset.x (not the standardized x).
    _data_ICA = FastICA(random_state=0)
    kurt = {}
    for dim in ICA_component_:
        _data_ICA.set_params(n_components=dim)
        tmp = _data_ICA.fit_transform(dataset.x)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()
    kurt = pd.Series(kurt)
    kurt.to_csv(dir + '{}_ica_scree.csv'.format(dataset.dataset_name))
    common_utils.plot_dim_red_scores(
        dir + '{}_ica_scree.csv'.format(dataset.dataset_name),
        dir, dataset.dataset_name, "ICA",
        multiple_runs=False, xlabel='Number of Clusters', ylabel=None)
    # Fit once more (last dim from the loop above) and print its kurtosis.
    _data_ICA_data = _data_ICA.fit_transform(x)
    _data_ICA_df = pd.DataFrame(data=_data_ICA_data)
    _data_ICA_kurtosis = _data_ICA_df.kurt()
    print(_data_ICA_kurtosis)
    for ICA_comp in ICA_component_:
        # Re-fit ICA at this component count for clustering input...
        _data_ICA = FastICA(n_components=ICA_comp, random_state=0)
        _data_ICA_data = _data_ICA.fit_transform(x)
        _data_ICA_df = pd.DataFrame(data=_data_ICA_data)
        # ...and again for the globals consumed by nn_experiment.
        _ica = FastICA(n_components=ICA_comp, random_state=0)
        x_ica = _ica.fit_transform(x)
        _dataset_ica = dataset
        _dataset_ica.x = x_ica
        _dataset_ica.y = y
        for cluster in cluster_range:
            # Kmeans
            start = datetime.now()
            myk_mean_ICA_prediction = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(_data_ICA_df)
            kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, myk_mean_ICA_prediction)
            end = datetime.now()
            kmeans_accuracy_ICA[ICA_comp][cluster] = kmeans_accuracy_for_k
            kmeans_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()
            # EM
            start = datetime.now()
            em_pca_prediction_y = GaussianMixture(
                n_components=cluster).fit(_data_ICA_df).predict(_data_ICA_df)
            em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, em_pca_prediction_y)
            end = datetime.now()
            em_accuracy_ICA[ICA_comp][cluster] = em_pca_accuracy_for_k
            em_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()
        # One NN run per ICA component count, on the globals set above.
        NN_ICA_accuracy[ICA_comp] = nn_experiment(_dataset_ica)
    common_utils.plot_feature_transformation_time(
        kmeans_time_ICA, "k-means ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_ICA, "k-means ICA clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        em_time_ICA, "EM ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_ICA, "EM ICA clusters vs accuracy", dir)
file_2.write(";") file_2.write("%1.9f" % pca_var_2[i]) file_2.write("\n") file_2.write("PCA_singular_2") for i in range(0, len(pca_sing_2)): file_2.write(";") file_2.write("%1.9f" % pca_sing_2[i]) file_2.write("\n") ############################## ICA ############################## ica = FastICA(random_state=5) error_rate_1 = np.zeros(np.shape(data1_X)[1]) for i in range(0, np.shape(data1_X)[1]): ica.set_params(n_components=i + 1) DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=0.005) error_rate_1[i] = sum( DT1.fit(ica.fit_transform(data1_X), data1_Y).predict( ica.fit_transform(data1_X)) == data1_Y) * 1.0 / n1 print i + 1 i1 = np.argmax(error_rate_1) + 1 ica.set_params(n_components=i1) temp1 = ica.fit_transform(data1_X) temp1 = pd.DataFrame(temp1) kurt1 = temp1.kurt(axis=0) error_rate_2 = np.zeros(np.shape(data2_X)[1]) for i in range(0, np.shape(data2_X)[1]): ica.set_params(n_components=i + 1)
class assignment4:
    """Unsupervised-learning assignment driver.

    Loads CSV datasets, then runs clustering (k-means, EM) and dimensionality
    reduction (PCA, ICA, RP, TSVD), each method printing results and saving
    plots to the working directory.
    """

    def __init__(self):
        # data processing
        self.dataSetPath = './data_set/'      # directory containing CSV inputs
        self.dataSetName = ""                 # label used in titles/filenames
        self.csv_delimiter = ','
        self.data = None                      # raw parsed CSV rows
        self.allFeatures = []                 # feature matrix (list -> ndarray)
        self.allTarget = []                   # label vector (list -> ndarray)
        # not used
        self.XTrain = None
        self.XTest = None
        self.YTrain = None
        self.YTest = None
        # k-mean clustering
        self.kNum = range(1, 21)              # cluster counts to sweep
        self.kmean = None
        self.kmeanRD = None                   # k-means on reduced data
        # expectation maximization
        self.em = None
        self.emRD = None                      # EM on reduced data
        # PCA
        self.pca = None
        self.pcaDims = range(1, 21)
        # ICA
        self.icaDims = range(1, 21)
        self.ica = None
        # RP
        self.rp = None
        self.rpDims = range(1, 21)
        # TSVD
        self.tsvd = None
        self.tsvdDims = range(1, 10)

    def read_data_voice(self, dataName):
        """Read a delimited CSV into self.data and print its dimensions."""
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=self.csv_delimiter)
            self.data = list(reader)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.data)))
        print('Number of attributes: {}'.format(len(self.data[0]) - 1))

    def read_data_haptX(self, dataName):
        """Read the HAPT feature CSV and append each row to allFeatures."""
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        print(len(self.data))
        for elim in self.data:
            feature = []
            for i in elim:
                feature.append(i)
            self.allFeatures.append(feature)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allFeatures)))
        print('Number of attributes: {}'.format(len(self.allFeatures[0])))

    def read_data_haptY(self, dataName):
        """Read the HAPT label CSV, then convert features/labels to float32 arrays."""
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        for elim in self.data:
            self.allTarget.append(elim)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allTarget)))
        print('Number of attributes: {}'.format(len(self.allTarget[0])))
        # Finalize both arrays here (assumes read_data_haptX ran first).
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.allTarget = self.allTarget.ravel()

    def split_data_to_train_test(self, testSize=0.3):
        """Parse self.data into float features / binary labels and split train/test."""
        # in case the data set are very different in format
        sample_len = len(self.data[0])
        for elem in self.data:
            feature = elem[0:sample_len - 1]
            feature_vector = []
            for f in feature:
                feature_vector.append(float(f))
            self.allFeatures.append(feature_vector)
            # Binarize the last column: '0' -> 0, anything else -> 1.
            if elem[-1] == '0':
                val = 0
            else:
                val = 1
            self.allTarget.append((float(val)))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split(
            self.allFeatures, self.allTarget, test_size=testSize,
            random_state=42)
        print(
            'Total X train data -> {}%'.format(
                int((len(self.XTrain) / len(self.data)) * 100)),
            'Size:', len(self.XTrain))
        print(
            'Total X test data -> {}%'.format(
                int((len(self.XTest) / len(self.data)) * 100)),
            'Size:', len(self.XTest))
        print(
            'Total Y train data -> {}%'.format(
                int((len(self.YTrain) / len(self.data)) * 100)),
            'Size:', len(self.YTrain))
        print(
            'Total Y test data -> {}%'.format(
                int((len(self.YTest) / len(self.data)) * 100)),
            'Size', len(self.YTest))

    def get_max_idx(self, input):
        """Return the index of the maximum element of `input`."""
        maxVal = input[0]
        maxIdx = 0
        for i in range(1, len(input)):
            if input[i] > maxVal:
                maxIdx = i
                maxVal = input[i]
        return maxIdx

    def pairwiseDistCorr(self, X1, X2):
        """Correlation between the pairwise-distance matrices of X1 and X2."""
        assert X1.shape[0] == X2.shape[0]
        d1 = pairwise_distances(X1)
        d2 = pairwise_distances(X2)
        return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    def k_mean_cluster(self):
        """Sweep k for k-means, score labels against targets, plot accuracy."""
        print("-" * 50)
        print('{}: K-mean clustering'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.kmean = KMeans(random_state=5, max_iter=1000)
        for i in self.kNum:
            self.kmean.set_params(n_clusters=i)
            self.kmean.fit(dataX)
            # NOTE(review): accuracy_score compares raw cluster labels to class
            # labels without any label alignment — scores are only meaningful
            # when cluster ids happen to match classes.
            scores.append(sm.accuracy_score(self.allTarget, self.kmean.labels_))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, self.kmean.labels_))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:", confusionMatrix[bestScoreIdx])
        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('K-mean Cluster ({})'.format(self.dataSetName))
        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_KMEAN.png'.format(self.dataSetName))
        print("-" * 50)

    def k_mean_cluster_reduced(self, n_clusters, reduced_data, name):
        """k-means at a fixed cluster count on dimensionality-reduced data."""
        print("-" * 50)
        print('{}: K-mean clustering {}'.format(self.dataSetName, name))
        # NOTE(review): dataX is computed but unused here; the fit runs on
        # reduced_data only.
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.kmeanRD = KMeans(n_clusters=n_clusters, random_state=5,
                              max_iter=1000)
        self.kmeanRD.fit(reduced_data)
        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, self.kmeanRD.labels_)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_))
        print("-" * 50)

    def expectation_maximization_reduced(self, n_components, reduced_data,
                                         name):
        """GaussianMixture at a fixed component count on reduced data."""
        print("-" * 50)
        print('{}: Expectation maximization {}'.format(self.dataSetName, name))
        self.emRD = GaussianMixture(n_components=n_components, random_state=5)
        self.emRD.fit(reduced_data)
        y_predict = self.emRD.predict(reduced_data)
        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, y_predict)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, y_predict))
        print("-" * 50)

    def expectation_maximization(self):
        """Sweep component counts for GaussianMixture and plot accuracy."""
        print("-" * 50)
        print('{}: Expectation maximization'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.em = GaussianMixture(random_state=5)
        for i in self.kNum:
            self.em.set_params(n_components=i)
            self.em.fit(dataX)
            y_predict = self.em.predict(dataX)
            scores.append(sm.accuracy_score(self.allTarget, y_predict))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, y_predict))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:")
        print(confusionMatrix[bestScoreIdx])
        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('Expectation Maximum Cluster ({})'.format(self.dataSetName))
        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_EM.png'.format(self.dataSetName))
        print("-" * 50)

    def PCA(self):
        """Grid-search PCA dims in an NN pipeline; plot variance and scores."""
        print("-" * 50)
        print('{}: Principal component analysis '.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.pca = PCA(random_state=5)
        grid = {'pca__n_components': self.pcaDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('pca', self.pca), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number PCA components:", search.best_params_)
        self.pca.fit(dataX)
        # Cumulative explained variance (percent) per component count.
        var = np.cumsum(
            np.round(self.pca.explained_variance_ratio_, decimals=3) * 100)
        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.ylim([0, 1])
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.pcaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def ICA(self):
        """Kurtosis sweep plus NN grid search over ICA component counts."""
        print("-" * 50)
        print('{}: Independent component analysis '.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.ica = FastICA(random_state=5, max_iter=6000)
        # kurtosis
        kurt = []
        for dim in self.icaDims:
            self.ica.set_params(n_components=dim)
            tmp = self.ica.fit_transform(dataX)
            tmp = pd.DataFrame(tmp)
            tmp = tmp.kurt(axis=0)
            kurt.append(tmp.abs().mean())
        # grid search
        grid = {'ica__n_components': self.icaDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('ica', self.ica), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number ICA components:", search.best_params_)
        plt.figure()
        plt.ylabel('Kurtosis')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(kurt)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_kurtosis.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.icaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def RP(self):
        """Pairwise-distance correlation sweep plus NN grid search for RP."""
        print("-" * 50)
        print('{}: Random Projection'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        disCorr = []
        self.rp = SparseRandomProjection(random_state=5)
        for dim in self.rpDims:
            self.rp.set_params(n_components=dim)
            disCorr.append(
                self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX))
        print(disCorr)
        # grid search
        grid = {'rp__n_components': self.rpDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('rp', self.rp), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number RP components:", search.best_params_)
        plt.figure()
        plt.ylabel('Distance')
        plt.xlabel('# of Features')
        plt.title('RP Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(disCorr)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_distance.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_RP_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def TSVD(self):
        """Grid-search TSVD dims in an NN pipeline; plot variance and scores."""
        print("-" * 50)
        print('{}: TruncatedSVD'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.tsvd = TruncatedSVD(random_state=5)
        # grid search
        grid = {'tsvd__n_components': self.tsvdDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number TSVD components:", search.best_params_)
        self.tsvd.fit(dataX)
        var = np.cumsum(
            np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100)
        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName))
        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName))
        print("-" * 50)
kurt1_test = np.zeros(np.shape(data1_X_test)[1]) DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=5, max_depth = None ) # DT1 = neighbors.KNeighborsClassifier(n_neighbors=5, algorithm='auto') # DT1 = svm.SVC(C=0.418, kernel='rbf', max_iter=-1) error_rate_train_DT_1 = sum( DT1.fit(data1_X_train, data1_y_train).predict(data1_X_train) == data1_y_train) * 1.0 / data1_y_train.shape[0] print "error_rate_train_DT_1", error_rate_train_DT_1 error_rate_test_DT_1 = sum( DT1.fit(data1_X_train, data1_y_train).predict(data1_X_test) == data1_y_test) * 1.0 / data1_y_test.shape[0] print "error_rate_test_DT_2", error_rate_test_DT_1 for i in range(0, np.shape(data1_X_train)[1]): print i start_time = time.time() ica.set_params(n_components=i + 1) data1_X_train_ica = ica.fit_transform(data1_X_train) # data2_X_train is observation, data2_X_train_ica is ICAed # A_1 = ica.mixing_ # Get estimated mixing matrix # # print "A_2", A_2 # data1_X_test_ica = np.dot(data1_X_test, A_1) data1_X_test_ica = ica.transform(data1_X_test) error_rate_train_1[i] = sum( DT1.fit(data1_X_train_ica, data1_y_train).predict(data1_X_train_ica) == data1_y_train) * 1.0 /data1_y_train.shape[0] print("error_rate_train_1[%f]" %i), error_rate_train_1[i] error_rate_test_1[i] = sum( DT1.fit(data1_X_train_ica, data1_y_train).predict(data1_X_test_ica) == data1_y_test) * 1.0 / data1_y_test.shape[0] print("error_rate_test_1[%f]" % i), error_rate_test_1[i] print "time consumed:", time.time()-start_time file_2.write("ICA_error_rate_train_1")
# Split the adult data into a 6000-row train slice and the remainder as test.
adultY = getAdultY()
adultX = getAdultX()
adultX, adultTestX = adultX.iloc[:6000, :], adultX.iloc[6000:, :]
adultY = getAdultY()
adultY, adultTestY = adultY[:6000, ], adultY[6000:, ]
dims1 = range(1, 8)
dims2 = range(1, 16)
#raise
#%% data for 1
# ICA sweep on the ecoli data: per-dim SVM validation accuracy plus mean
# |kurtosis| of the components.
svm = SVC(kernel="linear", random_state=0, C=6)
ica = FastICA(random_state=5)
kurt = {}
acc = {}
for dim in dims1:
    ica.set_params(n_components=dim, max_iter=500, tol=0.1)
    tmp = ica.fit_transform(ecoliX)
    svm.fit(tmp, ecoliY)
    testX = ica.transform(ecoliTestX)
    acc[dim] = accuracy_score(ecoliTestY, svm.predict(testX))
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()
kurt = pd.Series(kurt)
kurt.to_csv(out + 'ecoli scree.csv')
acc = pd.Series(acc)
acc.to_csv(out + 'ecoli svm validate.csv')
# NOTE(review): tmp here is the per-component kurtosis from the LAST loop
# iteration only — confirm whether all dims were meant to be saved.
tmp.to_csv(out + 'ecoli kurtosis.csv')
dt = DecisionTreeClassifier(random_state=0)