def _part2_distance_correlation(projection, X):
    """Fit ``projection`` on ``X`` and score the projected data with ``pairwiseDistCorr``."""
    return pairwiseDistCorr(projection.fit_transform(X), X)


def _part2_reconstruction_error(projection, X):
    """Fit ``projection`` on ``X`` and score it with ``reconstructionError``."""
    projection.fit(X)
    return reconstructionError(projection, X)


def _part2_sweep(X, dims, score):
    """Return a table of ``score`` values: one row per n_components, one column per seed."""
    results = defaultdict(dict)
    for seed, dim in product(range(10), dims):
        projection = SparseRandomProjection(random_state=seed, n_components=dim)
        results[dim][seed] = score(projection, X)
    return pd.DataFrame(results).T


def part2():
    """Sweep sparse random projections over the cancer and housing data.

    For each dataset, every (seed, n_components) pair drawn from ``range(10)``
    and the dataset's dimension list (``range(1, 31)`` for cancer,
    ``dims_big`` for housing) is scored twice: with ``pairwiseDistCorr`` on
    the projected data and with ``reconstructionError`` on the fitted
    projection.  Each table is written as CSV under the ``out`` prefix.

    ``cancer_x``, ``housing_x``, ``dims_big`` and ``out`` are not defined in
    this function and are read from the enclosing scope.

    Previously both tables of a dataset were written to the same
    ``'<name> part2.csv'`` path, so the distance-correlation table was
    silently overwritten.  The reconstruction-error table keeps that path
    (it is the content that used to survive on disk); the
    distance-correlation table is now saved to
    ``'<name> part2 distcorr.csv'`` instead of being lost.
    """
    cancer_dims = range(1, 31)

    # Pairwise-distance correlation tables (formerly overwritten).
    cancer_corr = _part2_sweep(cancer_x, cancer_dims, _part2_distance_correlation)
    cancer_corr.to_csv(out + 'cancer part2 distcorr.csv')
    housing_corr = _part2_sweep(housing_x, dims_big, _part2_distance_correlation)
    housing_corr.to_csv(out + 'housing part2 distcorr.csv')

    # Reconstruction-error tables keep the original file names.
    cancer_err = _part2_sweep(cancer_x, cancer_dims, _part2_reconstruction_error)
    cancer_err.to_csv(out + 'cancer part2.csv')
    housing_err = _part2_sweep(housing_x, dims_big, _part2_reconstruction_error)
    housing_err.to_csv(out + 'housing part2.csv')
def rp(X, problem):
    """Tabulate ``reconstructionError`` for sparse random projections of ``X``.

    Ten runs are recorded for every candidate number of components and the
    resulting table (rows: n_components, columns: run index) is written to
    ``out + problem + '_RP.csv'``; ``out`` is read from the enclosing scope.

    Args:
        X: Data to project.  ``len(X[0])`` is used as the exclusive upper
            bound on n_components when ``'Blood'`` occurs in ``problem``.
        problem: Name of the problem; selects the dimension list and forms
            part of the output file name.
    """
    if 'Blood' in problem:
        component_counts = range(2, len(X[0]))
    else:
        component_counts = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

    errors = defaultdict(dict)
    for run, n_components in product(range(10), component_counts):
        # No random_state is passed, so the run index only labels the column.
        projection = SparseRandomProjection(n_components=n_components)
        print(run, n_components)
        # NOTE(review): the projection is handed over without being fitted
        # here -- presumably reconstructionError fits it; confirm.
        errors[n_components][run] = reconstructionError(projection, X)

    table = pd.DataFrame(errors).T
    table.to_csv(out + problem + '_RP.csv')
# Pairwise-distance correlation between letterX and its sparse random
# projection, one entry per candidate dimension in dims_letter.
# product(range(1), ...) yields only i == 0, so a single seed is used.
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
# Transpose so each row is a dimension and each column a seed.
tmp = pd.DataFrame(tmp).T
tmp.to_csv(
    './P2_Dimensionality_Reduction/letter_RP_pairwise_distance_corr.csv')

# Reconstruction error of the fitted projection on spamX, per dimension in
# dims_spam (again a single seed, i == 0).
print('Part 2C - Starting RP, reconstruction error, for spam dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_spam):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(spamX)
    tmp[dim][i] = reconstructionError(rp, spamX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RP_reconstruction_error.csv')

# Same reconstruction-error sweep for letterX over dims_letter.
print('Part 2C - Starting RP, reconstruction error, for letter dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(letterX)
    tmp[dim][i] = reconstructionError(rp, letterX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/letter_RP_reconstruction_error.csv')

# Run Neural Networks
# Fresh projection with a fixed seed; n_components is left at its default.
rp = SparseRandomProjection(random_state=5)
tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'madelon scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(madelonX) tmp[dim][i] = reconstructionError(rp, madelonX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'madelon scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(digitsX) tmp[dim][i] = reconstructionError(rp, digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree2.csv') #%% Data for 2 grid = { 'rp__n_components': dims,
def _rp_distance_correlation(projection, X):
    """Fit ``projection`` on ``X`` and score the projected data with ``pairwiseDistCorr``."""
    return pairwiseDistCorr(projection.fit_transform(X), X)


def _rp_reconstruction_error(projection, X):
    """Fit ``projection`` on ``X`` and score it with ``reconstructionError``."""
    projection.fit(X)
    return reconstructionError(projection, X)


def _rp_scree_table(X, dims, score):
    """Return ``score`` for 10 seeds x ``dims``: one row per dimension, one column per seed."""
    results = defaultdict(dict)
    for seed, dim in product(range(10), dims):
        projection = SparseRandomProjection(random_state=seed, n_components=dim)
        results[dim][seed] = score(projection, X)
    return pd.DataFrame(results).T


def _rp_nn_grid_search(X, y, dims):
    """Grid-search an RP -> MLP pipeline over ``dims`` and return ``cv_results_`` as a frame."""
    grid = {
        'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch,
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(X, y)
    return pd.DataFrame(gs.cv_results_)


def _rp_project_and_store(X, y, dim, path, key):
    """Project ``X`` to ``dim`` components, append ``y`` as 'Class', store under ``key``."""
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    projected = rp.fit_transform(X)
    frame = pd.DataFrame(np.hstack((projected, np.atleast_2d(y).T)))
    cols = list(range(frame.shape[1]))
    cols[-1] = 'Class'
    frame.columns = cols
    # ``key`` is passed by keyword: positional use is deprecated in pandas.
    frame.to_hdf(path, key=key, complib='blosc', complevel=9)


def main():
    """Run the random-projection experiments on the madelon and letter data.

    Steps, in order:

    1. Load both datasets from ``./BASE/datasets.hdf``, split off the
       ``'Class'`` column and standardise the features.
    2. Write pairwise-distance-correlation (``scree1``) and
       reconstruction-error (``scree2``) tables for 10 seeds per dimension.
    3. Grid-search a SparseRandomProjection -> MLPClassifier pipeline for
       each dataset and save ``cv_results_``.
    4. Project each dataset with a fixed seed and store the result, with
       the label column named ``'Class'``, in ``out + 'datasets.hdf'``.

    ``nn_reg`` and ``nn_arch`` are not defined here and are read from the
    enclosing scope.
    """
    out = './BASE/'
    np.random.seed(0)

    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    # axis is passed by keyword: pandas >= 2.0 rejects a positional axis.
    letterX = letter.drop('Class', axis=1).copy().values
    letterY = letter['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]

    # -- data for 1: scree tables ------------------------------------------
    _rp_scree_table(madelonX, dims, _rp_distance_correlation).to_csv(
        out + 'madelon scree1.csv')
    _rp_scree_table(letterX, dims2, _rp_distance_correlation).to_csv(
        out + 'letter scree1.csv')
    _rp_scree_table(madelonX, dims, _rp_reconstruction_error).to_csv(
        out + 'madelon scree2.csv')
    _rp_scree_table(letterX, dims2, _rp_reconstruction_error).to_csv(
        out + 'letter scree2.csv')

    # -- data for 2: grid search over projection size and NN settings ------
    _rp_nn_grid_search(madelonX, madelonY, dims).to_csv(
        out + 'Madelon dim red.csv')
    _rp_nn_grid_search(letterX, letterY, dims2).to_csv(
        out + 'letter dim red.csv')

    # -- data for 3 --------------------------------------------------------
    # Set this from chart 2 and dump, use clustering script to finish up.
    # NOTE(review): ``out`` is './BASE/', so these writes target the same
    # 'datasets.hdf' keys that were read above -- confirm that replacing the
    # base datasets is intended.
    dim = 60
    _rp_project_and_store(madelonX, madelonY, dim, out + 'datasets.hdf', 'madelon')
    # NOTE(review): a separate ``dim = 16`` for letter was commented out in
    # the original, so letter is also projected to 60 components; confirm.
    _rp_project_and_store(letterX, letterY, dim, out + 'datasets.hdf', 'letter')
tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'wine scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_cancer): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'cancer scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_wine): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(wineX) tmp[dim][i] = reconstructionError(rp, wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'wine scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_cancer): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(cancerX) tmp[dim][i] = reconstructionError(rp, cancerX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'cancer scree2.csv') #%% Data for 2 grid = { 'rp__n_components': dims_wine,
diamondsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'diamonds scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims2): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims1): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(diamondsX) tmp[dim][i] = reconstructionError(rp, diamondsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'diamonds scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims2): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(digitsX) tmp[dim][i] = reconstructionError(rp, digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree2.csv') #%% task 4 grid = { 'rp__n_components': dims1,
tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'wine scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitX), digitX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digit scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_wine): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(wineX) tmp[dim][i] = reconstructionError(rp, wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'wine scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(digitX) tmp[dim][i] = reconstructionError(rp, digitX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digit scree2.csv') # Data for 2 grid = { 'rp__n_components': dims_wine,
tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'RP/wine_scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), credit_dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(creditX), creditX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'RP/credit_scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), wine_dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(wineX) tmp[dim][i] = reconstructionError(rp, wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'RP/wine_scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), credit_dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(creditX) tmp[dim][i] = reconstructionError(rp, creditX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'RP/credit_scree2.csv') # RF ==================================== rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
tmp.to_csv(out + 'perm scree1.csv') print(2) tmp = defaultdict(dict) for i, dim in product(range(10), dims_big): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'housing scree1.csv') print(3) tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(perm_x) tmp[dim][i] = reconstructionError(rp, perm_x) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'perm scree2.csv') print(4) tmp = defaultdict(dict) for i, dim in product(range(10), dims_big): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(housing_x) tmp[dim][i] = reconstructionError(rp, housing_x) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'housing scree2.csv') #4 grid = { 'rp__n_components': dims,
# Save the table currently held in ``tmp``; it is built by code that
# precedes this chunk.
tmp.to_csv(out+'biodeg scree1.csv')


# Pairwise-distance correlation between digitsX and its sparse random
# projection, for 10 seeds (i) and every dimension in dims.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
# Transpose so each row is a dimension and each column a seed.
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out+'digits scree1.csv')


# Reconstruction error of the fitted projection on biodegX over dimsb.
tmp = defaultdict(dict)
for i,dim in product(range(10),dimsb):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(biodegX)
    tmp[dim][i] = reconstructionError(rp, biodegX)
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out+'biodeg scree2.csv')


# Reconstruction error of the fitted projection on digitsX over dims.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out+'digits scree2.csv')

#%% Data for 2
# Parameter grid keyed by '<step>__<param>' for steps named 'rp' and 'NN';
# presumably consumed by a pipeline grid search further down -- confirm.
grid ={'rp__n_components':dimsb,'NN__alpha':nn_reg,'NN__hidden_layer_sizes':nn_arch}
tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(contraX), contraX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'contra scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_cancer): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'cancer scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_contra): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(contraX) tmp[dim][i] = reconstructionError(rp, contraX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'contra scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_cancer): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(cancerX) tmp[dim][i] = reconstructionError(rp, cancerX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'cancer scree2.csv') #%% Data for 2 grid = { 'rp__n_components': dims_contra,
# Save the table currently held in ``tmp``; it is built by code that
# precedes this chunk.
tmp.to_csv(out1+'faults scree1.csv')


# Pairwise-distance correlation between bcX and its sparse random
# projection, for 10 seeds (i) and every dimension in dims.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(bcX), bcX)
# Transpose so each row is a dimension and each column a seed.
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out1+'bc scree1.csv')


# Reconstruction error of the fitted projection on faultsX over dims.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(faultsX)
    tmp[dim][i] = reconstructionError(rp, faultsX)
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out1+'faults scree2.csv')


# Reconstruction error of the fitted projection on bcX over dims.
tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(bcX)
    tmp[dim][i] = reconstructionError(rp, bcX)
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out1+'bc scree2.csv')

#%% Data for 2
# Parameter grid keyed by '<step>__<param>' for steps named 'rp' and 'NN';
# presumably consumed by a pipeline grid search further down -- confirm.
grid ={'rp__n_components':dims,'NN__alpha':nn_reg,'NN__hidden_layer_sizes':nn_arch}
tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(abaloneX), abaloneX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'abalone scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(abaloneX) tmp[dim][i] = reconstructionError(rp, abaloneX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'abalone scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(digitsX) tmp[dim][i] = reconstructionError(rp, digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree2.csv') #%% Data for 2 grid = { 'rp__n_components': abalone_dims,
# Split the balanced page-blocks frame into features and labels, then
# standardise the features.  ``axis=1`` is passed by keyword because
# pandas >= 2.0 no longer accepts a positional axis in ``drop``.
blocks_X = blocks_balanced.drop('Class', axis=1).copy().values
blocks_Y = blocks_balanced['Class'].copy().values
blocks_X = StandardScaler().fit_transform(blocks_X)
# Single-argument call form prints the same under Python 2 and Python 3.
print(blocks_X.shape)

# Run RP for Loans 10 times
#clusters = [2,5,10,15,20,25,30,35,40]
dims = [2,4,6,9,12,15,18,21,26]
tmp_distcorr = defaultdict(dict)
tmp_recnstErr = defaultdict(dict)
for i,dim in product(range(10),dims):
    # One fit per (seed, dimension); both scores reuse the same fitted
    # projection.
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(loans_X)
    tmp_recnstErr[dim][i] = reconstructionError(rp, loans_X)
    tmp_distcorr[dim][i] = pairwiseDistCorr(rp.transform(loans_X), loans_X)

# Rows are dimensions, the first ten columns are the seeds 0..9.  The
# summary columns are computed from ``iloc[:, 0:10]`` so that 'std' does not
# pick up the freshly added 'mean' column.
tmp_distcorr = pd.DataFrame(tmp_distcorr).T
tmp_distcorr['mean'] = np.mean(tmp_distcorr.iloc[:,0:10], axis = 1)
tmp_distcorr['std'] = np.std(tmp_distcorr.iloc[:,0:10], axis = 1)
tmp_recnstErr = pd.DataFrame(tmp_recnstErr).T
tmp_recnstErr['mean'] = np.mean(tmp_recnstErr.iloc[:,0:10], axis = 1)
tmp_recnstErr['std'] = np.std(tmp_recnstErr.iloc[:,0:10], axis =1)
tmp_distcorr.to_csv(out+'loans_RP_distCorr.csv')
tmp_recnstErr.to_csv(out+'loans_RP_reconstrErr.csv')

#%%
# Run RP for Pageblock 10 times