def part2():
    """Randomized-projection (RP) study: pairwise distance correlation and
    reconstruction error over 10 random seeds per target dimensionality.

    Each sweep is tabulated as a DataFrame (rows = n_components, columns =
    seed) and dumped to CSV under `out`. Relies on module-level names not
    visible in this chunk: out, cancer_x, housing_x, dims_big,
    pairwiseDistCorr, reconstructionError -- presumably defined elsewhere
    in this file.
    """
    # Pairwise distance correlation, cancer dataset (dims 1..30).
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancer_x), cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2.csv')

    # Pairwise distance correlation, housing dataset.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2.csv')

    # Reconstruction error, cancer dataset.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(cancer_x)
        tmp[dim][i] = reconstructionError(rp, cancer_x)
    tmp = pd.DataFrame(tmp).T
    # BUG FIX: this was written to 'cancer part2.csv', overwriting the
    # distance-correlation results saved above; use a distinct filename.
    tmp.to_csv(out + 'cancer part2 reconstruction.csv')

    # Reconstruction error, housing dataset.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(housing_x)
        tmp[dim][i] = reconstructionError(rp, housing_x)
    tmp = pd.DataFrame(tmp).T
    # BUG FIX: same clobber for housing ('housing part2.csv' was reused).
    tmp.to_csv(out + 'housing part2 reconstruction.csv')
# Neural networks on the ICA-reduced data, then the RP distance study.
nn_results = run_NN(dims_spam, ica, spamX, spamY)
nn_results.to_csv('./P4_Neural_Networks_Reduced/spam_ICA_nn_results.csv')

# Fresh ICA instance for the letter dataset.
ica = FastICA(random_state=5)
nn_results = run_NN(dims_letter, ica, letterX, letterY)
nn_results.to_csv('./P4_Neural_Networks_Reduced/letter_ICA_nn_results.csv')

#%% Part 2C & 4C - Run Dimensionality Reduction Algorithm RP, Run NN with reduced dims
print(
    'Part 2C - Starting RP, pairwise distance correlation, for spam dataset...'
)
# 10 seeds per candidate dimensionality; rows = dim, columns = seed.
corr_table = defaultdict(dict)
for seed in range(10):
    for dim in dims_spam:
        projector = SparseRandomProjection(random_state=seed, n_components=dim)
        corr_table[dim][seed] = pairwiseDistCorr(
            projector.fit_transform(spamX), spamX)
tmp = pd.DataFrame(corr_table).T
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RP_pairwise_distance_corr.csv')

print(
    'Part 2C - Starting RP, pairwise distance correlation, for letter dataset...'
)
# Only one seed for letter (larger dataset); progress printed per dim.
corr_table = defaultdict(dict)
for seed in range(1):
    for dim in dims_letter:
        print(dim)
        projector = SparseRandomProjection(random_state=seed, n_components=dim)
        corr_table[dim][seed] = pairwiseDistCorr(
            projector.fit_transform(letterX), letterX)
tmp = pd.DataFrame(corr_table).T
tmp.to_csv(
    './P2_Dimensionality_Reduction/letter_RP_pairwise_distance_corr.csv')
# Load madelon and standardize both feature matrices.
madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', 1).copy().values
madelonY = madelon['Class'].copy().values
madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
#raise
#%% data for 1
# Pairwise distance correlation under sparse RP, madelon: 10 seeds per dim.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree1.csv')

# Same study for digits.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

# Reconstruction error, madelon (tabulated/saved past this chunk).
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(madelonX)
    tmp[dim][i] = reconstructionError(rp, madelonX)
def main():
    """Randomized-projection experiment driver for madelon and letter.

    Part 1: scree data (pairwise distance correlation, reconstruction error).
    Part 2: grid search over RP dimensionality feeding a neural network.
    Part 3: dump RP-reduced datasets for the downstream clustering script.

    Relies on names not visible in this chunk (nn_reg, nn_arch,
    pairwiseDistCorr, reconstructionError) -- TODO confirm they are defined
    elsewhere in this file.
    """
    out = './BASE/'
    # NOTE(review): cmap and clusters are never used in the visible body.
    cmap = cm.get_cmap('Spectral')
    np.random.seed(0)

    # Load and standardize both datasets.
    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    letterX = letter.drop('Class', 1).copy().values
    letterY = letter['Class'].copy().values
    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', 1).copy().values
    madelonY = madelon['Class'].copy().values
    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]  # madelon dims
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]  # letter dims
    #raise
    #%% data for 1
    # Pairwise distance correlation over 10 seeds, madelon.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree1.csv')

    # Pairwise distance correlation over 10 seeds, letter.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree1.csv')

    # Reconstruction error, madelon.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(madelonX)
        tmp[dim][i] = reconstructionError(rp, madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree2.csv')

    # Reconstruction error, letter.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(letterX)
        tmp[dim][i] = reconstructionError(rp, letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree2.csv')

    #%% Data for 2
    # Grid search: RP dimensionality x NN hyperparameters, madelon.
    grid = {
        'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    # Same grid search for letter (letter-sized dimension list).
    grid = {
        'rp__n_components': dims2,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'letter dim red.csv')
    #raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 60
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    madelonX2 = rp.fit_transform(madelonX)
    # Append labels as the last column and persist for the clustering step.
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    madelon2.to_hdf(out + 'datasets.hdf',
                    'madelon',
                    complib='blosc',
                    complevel=9)
    # NOTE(review): the collapsed source reads '# dim = 16'; the '#' is
    # treated here as a bare separator comment and 'dim = 16' as live code
    # (dims2 tops out at 16, so letter plainly uses dim=16) -- confirm.
    #
    dim = 16
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    letterX2 = rp.fit_transform(letterX)
    letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T)))
    cols = list(range(letter2.shape[1]))
    cols[-1] = 'Class'
    letter2.columns = cols
    letter2.to_hdf(out + 'datasets.hdf', 'letter', complib='blosc', complevel=9)
# Standardize the wine and cancer feature matrices.
wineX = StandardScaler().fit_transform(wineX)
cancerX = StandardScaler().fit_transform(cancerX)

clusters = range(2, 10)
dims_wine = range(1, 12)
dims_cancer = range(1, 10)
#raise

#%% data for 1
# Distance preservation of sparse RP on wine: 10 seeds per dimensionality.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims_wine:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

# Same study on cancer.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims_cancer:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree1.csv')

# Reconstruction error on wine (tabulated/saved past this chunk).
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
# NOTE(review): this chunk opens mid-loop -- the statements through
# `kurt[dim] = ...` are the tail of an enclosing `for dim in ...` sweep
# (started in an earlier chunk) recording ICA kurtosis per component
# count; indentation cannot be reconstructed from this view.
ica.set_params(n_components=dim)
tmp = ica.fit_transform(creditX)
tmp = pd.DataFrame(tmp)
# Mean absolute kurtosis across components: higher => more non-Gaussian.
tmp = tmp.kurt(axis=0)
kurt[dim] = tmp.abs().mean()
# After the sweep: dump the kurtosis-vs-dim scree curve.
kurt = pd.Series(kurt)
kurt.to_csv(out + 'ICA/credit_scree.csv', header=False)
#raise
# Randomized projections ========================
# Pairwise distance correlation after sparse RP, wine: 10 seeds per dim.
tmp = defaultdict(dict)
for i, dim in product(range(10), wine_dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'RP/wine_scree1.csv')
# Same study for credit.
tmp = defaultdict(dict)
for i, dim in product(range(10), credit_dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(creditX), creditX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'RP/credit_scree1.csv')
# Reconstruction error, wine (tabulation continues past this chunk).
tmp = defaultdict(dict)
for i, dim in product(range(10), wine_dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
# NOTE(review): dangling tail of a read call (presumably pd.read_csv)
# that begins in an earlier chunk; kept verbatim.
sep=',', header=None)
# Standardize both feature matrices.
wineX = StandardScaler().fit_transform(wineX)
digitX = StandardScaler().fit_transform(digitX)
# Candidate cluster counts / projection dimensionalities.
clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
dims_wine = [i for i in range(2, 12)]
# data for 1
# Pairwise distance correlation under sparse RP, wine: 10 seeds per dim.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')
# Same study, digits.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitX), digitX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digit scree1.csv')
# Reconstruction error, wine (tabulation continues past this chunk).
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
# Labels for biodeg; standardize both feature matrices.
biodegY = biodeg['Class'].copy().values
biodegX = StandardScaler().fit_transform(biodegX)
digitsX = StandardScaler().fit_transform(digitsX)
clusters = [2,5,10,15,20,25,30,35,40]
dims = [2,5,10,15,20,25,30,35,40,45,50,55,60]
# biodeg-specific candidate dimensionalities.
dimsb = [2,5,7,10,15,20,25,30,35]
#raise
#%% data for 1
# Pairwise distance correlation after sparse RP, biodeg: 10 seeds per dim.
tmp = defaultdict(dict)
for i,dim in product(range(10),dimsb):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(biodegX), biodegX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out+'biodeg scree1.csv')
# Same study, digits.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out+'digits scree1.csv')
# Third sweep (biodeg, presumably reconstruction error); the loop body
# continues past this chunk -- only the projector construction is visible.
tmp = defaultdict(dict)
for i,dim in product(range(10),dimsb):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
# Imports for the randomized-projection experiments.
from matplotlib import cm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from itertools import product

out = './results/randomized_projections/'
perm_x, perm_y, housing_x, housing_y = load_data()  # perm, housing
# Deliberate guard so the script cannot be run accidentally.
raise Exception('Remove this line to run code')
#2
print(1)
# Pairwise distance correlation after sparse RP, perm dataset; `dims` and
# `dims_big` are defined elsewhere in this file -- TODO confirm.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(perm_x), perm_x)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'perm scree1.csv')
print(2)
# Same study for housing, with its own (larger) dimension list.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_big):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'housing scree1.csv')
print(3)
# Third sweep (perm, presumably reconstruction error); the loop body
# continues past this chunk -- only the projector construction is visible.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
# Standardize the contraceptive and cancer feature matrices.
contraX = StandardScaler().fit_transform(contraX)
cancerX = StandardScaler().fit_transform(cancerX)

clusters = range(2, 10)
dims_contra = range(1, 12)
dims_cancer = range(1, 10)
#raise

#%% data for 1
# Distance preservation of sparse RP on contra: 10 seeds per dimensionality.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims_contra:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(contraX), contraX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'contra scree1.csv')

# Same study on cancer.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims_cancer:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree1.csv')

# Reconstruction error on contra (tabulated/saved past this chunk).
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_contra):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(contraX)
    tmp[dim][i] = reconstructionError(rp, contraX)
# Labels for the breast-cancer set; standardize both feature matrices.
bcY = bc['diagnosis'].copy().values
faultsX = StandardScaler().fit_transform(faultsX)
bcX = StandardScaler().fit_transform(bcX)
clusters = [2,3,4,5,6,7,8,9,10]
dims = [2,3,4,5,6,7,8,9,10,12,15]
#raise
#%% data for 1
# Pairwise distance correlation after sparse RP, faults: 10 seeds per dim.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(faultsX), faultsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out1+'faults scree1.csv')
# Same study, breast cancer.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(bcX), bcX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out1+'bc scree1.csv')
# Third sweep (presumably reconstruction error); the loop body continues
# past this chunk -- only the projector construction is visible.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
# NOTE(review): `return ratio` is the tail of a helper function defined in
# an earlier chunk (presumably the accuracy routine); kept verbatim.
return ratio

# Breast Cancer Dataset
br = pd.read_csv('./BASE/breast.csv')
brX = br.drop('Class', 1).copy().values
brY = br['Class'].copy().values
brX = StandardScaler().fit_transform(brX)
cluster_range = range(1, 11)
dims = range(1, 30)
# RP scree: pairwise distance correlation, 10 seeds per dimensionality.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(brX), brX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./RP/breast_scree.csv')
# Accuracy vs. retained dimensionality, rendered as a bar plot.
ratio = calculate_accuracy(brX, brY, range(1, 30))
barplot_breast(ratio, range(1, 30))
# Chosen dimensionality; project and save the reduced dataset with labels
# appended as the final 'Class' column.
dim = 10
rp = SparseRandomProjection(n_components=dim, random_state=5)
brX2 = rp.fit_transform(brX)
br2 = pd.DataFrame(np.hstack((brX2, np.atleast_2d(brY).T)))
cols = list(range(br2.shape[1]))
cols[-1] = 'Class'
br2.columns = cols
br2.to_csv('./RP/breast.csv')
# Extract and standardize the abalone features; digits are pre-loaded.
abaloneX = abalone.drop('Class', 1).copy().values
abaloneY = abalone['Class'].copy().values
abaloneX = StandardScaler().fit_transform(abaloneX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
# NOTE(review): abalone_dims is defined but the sweeps below use `dims`
# for abalone as well -- confirm this is intended.
abalone_dims = range(1, 9)
#raise

#%% data for 1
# Distance preservation of sparse RP on abalone: 10 seeds per dimensionality.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(abaloneX), abaloneX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'abalone scree1.csv')

# Same study on digits.
tmp = defaultdict(dict)
for seed in range(10):
    for dim in dims:
        rp = SparseRandomProjection(random_state=seed, n_components=dim)
        tmp[dim][seed] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

# Reconstruction error on abalone (tabulated/saved past this chunk).
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(abaloneX)
    tmp[dim][i] = reconstructionError(rp, abaloneX)
blocks_Y = blocks_balanced['Class'].copy().values blocks_X= StandardScaler().fit_transform(blocks_X) print blocks_X.shape # Run RP for Loans 10 times #clusters = [2,5,10,15,20,25,30,35,40] dims = [2,4,6,9,12,15,18,21,26] tmp_distcorr = defaultdict(dict) tmp_recnstErr = defaultdict(dict) for i,dim in product(range(10),dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(loans_X) tmp_recnstErr[dim][i] = reconstructionError(rp, loans_X) tmp_distcorr[dim][i] = pairwiseDistCorr(rp.transform(loans_X), loans_X) tmp_distcorr =pd.DataFrame(tmp_distcorr).T tmp_distcorr['mean'] = np.mean(tmp_distcorr.iloc[:,0:10], axis = 1) tmp_distcorr['std'] = np.std(tmp_distcorr.iloc[:,0:10], axis = 1) tmp_recnstErr =pd.DataFrame(tmp_recnstErr).T tmp_recnstErr['mean'] = np.mean(tmp_recnstErr.iloc[:,0:10], axis = 1) tmp_recnstErr['std'] = np.std(tmp_recnstErr.iloc[:,0:10], axis =1) tmp_distcorr.to_csv(out+'loans_RP_distCorr.csv') tmp_recnstErr.to_csv(out+'loans_RP_reconstrErr.csv') #%% # Run RP for Pageblock 10 times dims = [2,3,4,5,6,7,8,9,10]