def rp_experiment(X, y, name, dims):
    """Run Randomized Projections on the specified dataset and save the
    reconstruction error and pairwise distance correlation results as a
    CSV file.

    Args:
        X (numpy.ndarray): Attributes.
        y (numpy.ndarray): Labels (unused; kept for a uniform interface).
        name (str): Dataset name.
        dims (list(int)): Component counts to sweep.

    """
    re = defaultdict(dict)
    pdc = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

    re = pd.DataFrame(pd.DataFrame(re).T.mean(axis=1))
    re.rename(columns={0: 'recon_error'}, inplace=True)
    pdc = pd.DataFrame(pd.DataFrame(pdc).T.mean(axis=1))
    pdc.rename(columns={0: 'pairwise_dc'}, inplace=True)
    metrics = pd.concat((re, pdc), axis=1)

    # save results as CSV
    resdir = 'results/RP'
    resfile = get_abspath('{}_metrics.csv'.format(name), resdir)
    metrics.to_csv(resfile, index_label='n')
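# Hedged sketch, not part of the original sources: the helpers
# `reconstruction_error`/`reconstructionError` and
# `pairwise_dist_corr`/`pairwiseDistCorr` used throughout these snippets
# are conventionally defined along these lines (pseudo-inverse
# reconstruction, and correlation of pairwise distances before/after
# projection).
import numpy as np
import scipy.sparse as sps
from scipy.linalg import pinv
from sklearn.metrics import pairwise_distances


def reconstruction_error(projection, X):
    # Project back through the pseudo-inverse of the projection matrix
    # and measure the mean squared deviation from the original data.
    W = projection.components_
    if sps.issparse(W):
        W = W.todense()
    p = pinv(W)
    reconstructed = ((p @ W) @ (X.T)).T
    return np.nanmean(np.square(X - reconstructed))


def pairwise_dist_corr(X1, X2):
    # Correlation between pairwise distances before and after projection;
    # values near 1 mean distances are well preserved.
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]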
def perform(self):
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/RP.py
    self.log("Performing {}".format(self.experiment_name()))

    # TODO: Use a diff random state? Might be ok as-is

    # %% Data for 1
    tmp = defaultdict(dict)
    for i, dim in product(range(10), self._dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwise_dist_corr(
            rp.fit_transform(self._details.ds.training_x),
            self._details.ds.training_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(self._out.format('{}_scree1.csv'.format(self._details.ds_name)))

    tmp = defaultdict(dict)
    for i, dim in product(range(10), self._dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(self._details.ds.training_x)
        tmp[dim][i] = reconstruction_error(rp, self._details.ds.training_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(self._out.format('{}_scree2.csv'.format(self._details.ds_name)))

    # %% Data for 2
    grid = {'rp__n_components': self._dims,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    rp = SparseRandomProjection(random_state=self._details.seed)
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=self._details.seed)
    pipe = Pipeline([('rp', rp), ('NN', mlp)],
                    memory=experiments.pipeline_memory)
    gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
    self.log("Grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
    self.log("Done")
def part2():
    # Pairwise distance correlation and reconstruction error are written to
    # separate files; the original wrote both to the same 'part2.csv' path,
    # so the second pass clobbered the first.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancer_x), cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2 pairwise.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2 pairwise.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(cancer_x)
        tmp[dim][i] = reconstructionError(rp, cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2 reconstruction.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(housing_x)
        tmp[dim][i] = reconstructionError(rp, housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2 reconstruction.csv')
def get_best_dimensionality_reductions(x1, x2, best_features):
    dim_reds = {'pca': {}, 'rfc': {}, 'ica': {}, 'rp': {}}

    for d, x in {'wine': x1, 'pima': x2}.items():
        pca = PCA(n_components=0.95, whiten=True, random_state=42)
        pca.fit(x)
        dim_reds['pca'][d] = pca
        dim_reds['rfc'][d] = best_features[d]

    ica = FastICA(n_components=8, whiten=True, random_state=42)
    ica.fit(x1)
    dim_reds['ica']['wine'] = ica
    ica = FastICA(n_components=6, whiten=True, random_state=42)
    ica.fit(x2)
    dim_reds['ica']['pima'] = ica

    rp = SparseRandomProjection(random_state=42, n_components=8)
    rp.fit(x1)
    dim_reds['rp']['wine'] = rp
    rp = SparseRandomProjection(random_state=42, n_components=6)
    rp.fit(x2)
    dim_reds['rp']['pima'] = rp

    return dim_reds
def run_rp(dataset):
    x_train = data.DATA[dataset]['base']['x_train']

    k_values = []
    if dataset == 'fashion':
        k_values = [2, 5, 10, 20, 50, 100, 150, 200, 250, 300, 350, 400,
                    450, 500, 550, 600, 650, 700, 750, 784]
    if dataset == 'wine':
        k_values = range(2, 11)

    stats = []
    for k in k_values:
        print(f'Analyzing {dataset} with RP (k={k})')
        reconstruction_error = float('inf')
        for seed in range(10):
            rp = SparseRandomProjection(n_components=k, random_state=seed)
            rp.fit(x_train)
            # keep the best (lowest) error over the random restarts
            new_reconstruction_error = compute_rp_reconstruction_error(rp, x_train)
            reconstruction_error = min(reconstruction_error,
                                       new_reconstruction_error)
            if dataset == 'fashion' and k in (300, 500, 600, 650, 700) and seed == 0:
                plot_rp_reconstructed_data_fashion(rp, x_train, k)
        stats.append({'k': k, 'reconstruction_error': reconstruction_error})

    stats_df = pd.DataFrame(stats).set_index('k')
    plot_rp_reconstruction_error(dataset, stats_df)
def RCA_Experiment(X, title, folder=""):
    # Despite the original variable name, this measures reconstruction MSE
    # (via the pseudo-inverse of the projection matrix), not a correlation,
    # so the plots are labeled as reconstruction error.
    n_components_range = list(np.arange(2, X.shape[1], 1))
    reconstruction_error = defaultdict(dict)

    for i, n in product(range(5), n_components_range):
        rp = RCA(random_state=i, n_components=n)
        rp.fit(X)
        projections = rp.components_
        if sparse.issparse(projections):
            projections = projections.todense()
        p = pinv(projections)
        reconstructed = ((p @ projections) @ (X.T)).T
        reconstruction_error[n][i] = np.nanmean(np.square(X - reconstructed))

    reconstruction_error = pd.DataFrame(reconstruction_error).T
    mean_recon = reconstruction_error.mean(axis=1).tolist()
    std_recon = reconstruction_error.std(axis=1).tolist()

    plt.plot(n_components_range, mean_recon)
    plt.xlabel('Random Components')
    plt.ylabel('Mean Reconstruction Error')
    plt.title('Sparse Random Projection, Mean Reconstruction Error: ' + title)
    plt.savefig(folder + '/RcaMeanRE.png')
    plt.close()

    plt.plot(n_components_range, std_recon)
    plt.xlabel('Random Components')
    plt.ylabel('STD of Reconstruction Error')
    plt.title('Sparse Random Projection, STD of Reconstruction Error: ' + title)
    plt.savefig(folder + '/RcaStdRE.png')
    plt.close()
class SparseRandomProjectionImpl():

    def __init__(self, n_components='auto', density='auto', eps=0.1,
                 dense_output=False, random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'density': density,
            'eps': eps,
            'dense_output': dense_output,
            'random_state': random_state}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def RP_V(X, k):
    # random projection; overall O(M * k)
    transformer = SparseRandomProjection(n_components=k, random_state=0)  # O(M * k)
    transformer.fit(X)
    V = transformer.components_.T  # (n_features, k) projection matrix
    return V
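# Hedged usage sketch (toy data, not from the original source): since the
# projection is data-independent, applying it is just a matrix product with
# the V returned above. V may come back as a scipy sparse matrix, so it is
# densified before multiplying a plain numpy array.
import numpy as np
import scipy.sparse as sps

X = np.random.rand(100, 50)
V = RP_V(X, k=10)
if sps.issparse(V):
    V = V.toarray()
X_low = X @ V  # projected data, shape (100, 10)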
def random_project(weight, channel_num):
    # Flatten the conv weight to (out_channels, in_channels * kH * kW) and
    # randomly project it down to channel_num * kH * kW dimensions.
    A = weight.cpu().clone()
    A = A.view(A.size(0), -1)
    rp = SparseRandomProjection(
        n_components=channel_num * weight.size(2) * weight.size(3))
    rp.fit(A)
    return rp.transform(A)
def RCA_Reconstruction(X, ncomponent):
    start = time.time()
    rca = SparseRandomProjection(random_state=0, n_components=ncomponent)
    rca.fit(X)
    end = time.time()
    print("RCA took {} s".format(end - start))

    w = rca.components_
    if sps.issparse(w):
        w = w.todense()
    p = pinv(w)
    reconstructed = ((p @ w) @ (X.T)).T  # unproject projected data
    errors = np.square(X - reconstructed)
    return reconstructed, np.nanmean(errors)
class SparseRandomProjectionImpl:

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
class VCoder(object):

    def __init__(self, n_sketches, sketch_dim, input_dim):
        self.n_sketches = n_sketches
        self.sketch_dim = sketch_dim
        self.input_dim = input_dim
        self.standard_scaler = StandardScaler()
        # Gaussian projections are dense; fall back to a sparse projection
        # for very high-dimensional inputs.
        if self.input_dim < 10000:
            self.random_projection = GaussianRandomProjection(
                n_components=16 * n_sketches)
        else:
            self.random_projection = SparseRandomProjection(
                n_components=16 * n_sketches, density=1 / 3.0)

    def fit(self, v):
        self.standard_scaler = self.standard_scaler.fit(v)
        v = self.standard_scaler.transform(v)
        self.random_projection = self.random_projection.fit(v)
        v = self.random_projection.transform(v)
        self.init_biases(v)

    def transform(self, v):
        v = self.standard_scaler.transform(v)
        v = self.random_projection.transform(v)
        v = self.discretize(v)
        v = np.packbits(v, axis=-1)
        # reinterpret each pair of packed bytes as a uint16 sketch index
        v = np.frombuffer(
            np.ascontiguousarray(v), dtype=np.uint16
        ).reshape(v.shape[0], -1) % self.sketch_dim
        return v
def reducer_rand_proj_sparse(data, params):
    if params is None:
        params = {'n_components': 5}
    X = data['X_train']
    y = data['y_train']

    reducer = SparseRandomProjection(n_components=params['n_components'])
    reducer.fit(X)

    do = deepcopy(data)
    do['X_train'] = reducer.transform(data['X_train'])
    do['X_valid'] = reducer.transform(data['X_valid'])
    return do
class SparseRandomProjectionSLFN(SLFN):

    def __init__(self, X, n_neurons, density=0.1, ufunc=np.tanh,
                 random_state=None):
        self.n_neurons = n_neurons
        self.ufunc = ufunc
        self.projection = SparseRandomProjection(
            n_components=n_neurons, density=density, dense_output=True,
            random_state=random_state)
        self.projection.fit(X)

    def transform(self, X):
        return self.ufunc(self.projection.transform(X))
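# Hedged usage sketch: the projection is fixed at construction time, so this
# behaves as an ELM-style random hidden layer. The `SLFN` base class comes
# from the surrounding codebase and is assumed importable here.
import numpy as np

X = np.random.rand(200, 30)
layer = SparseRandomProjectionSLFN(X, n_neurons=64, random_state=0)
H = layer.transform(X)  # nonlinear random features, shape (200, 64)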
class DReduction:
    """Stacks five decompositions (PCA, tSVD, ICA, GRP, SRP), each with the
    same number of components, into a single feature frame."""

    def __init__(self, nComp):
        self._N_COMP = nComp  # number of decomposition components
        self._pca = PCA(n_components=self._N_COMP, random_state=17)
        self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
        self._ica = FastICA(n_components=self._N_COMP, random_state=17)
        self._grp = GaussianRandomProjection(n_components=self._N_COMP,
                                             eps=0.1, random_state=17)
        self._srp = SparseRandomProjection(n_components=self._N_COMP,
                                           dense_output=True, random_state=17)

    def fit(self, X):
        self._pca.fit(X)
        self._tsvd.fit(X)
        self._ica.fit(X)
        self._grp.fit(X)
        self._srp.fit(X)

    def transform(self, X):
        res_pca = self._pca.transform(X)
        res_tsvd = self._tsvd.transform(X)
        res_ica = self._ica.transform(X)
        res_grp = self._grp.transform(X)
        res_srp = self._srp.transform(X)

        df = pd.DataFrame()
        for i in range(1, self._N_COMP + 1):
            df['pca_' + str(i)] = res_pca[:, i - 1]
            df['tsvd_' + str(i)] = res_tsvd[:, i - 1]
            df['ica_' + str(i)] = res_ica[:, i - 1]
            df['grp_' + str(i)] = res_grp[:, i - 1]
            df['srp_' + str(i)] = res_srp[:, i - 1]
        return df
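# Hedged usage sketch with toy data (the feature count must be >= nComp for
# all five decompositions to be valid):
import numpy as np

X = np.random.rand(500, 20)
dr = DReduction(nComp=5)
dr.fit(X)
features = dr.transform(X)  # DataFrame with 5 methods * 5 components = 25 columns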
def process_file(file, model='distilbert-base-uncased', dim_reduction='auto',
                 output_path=None):
    # establish conventional file names for output
    save_dir = pathlib.Path(output_path) if output_path else _default_output_dir
    stem = pathlib.Path(file).stem
    vec_outpath = save_dir / f'{stem}_{model}_{dim_reduction}.npy'
    dim_reducer_outpath = save_dir / f'{stem}_{model}_{dim_reduction}.reducer.pkl'
    metadata_outpath = save_dir / f'{stem}_{model}_{dim_reduction}.metadata.json'

    # keep track of config
    metadata = {
        'model': model,
        'source_file': file,
        'embeddings_file': str(vec_outpath),
        'dim_reduction': dim_reduction,
        'dim_reduction_transformer_file':
            str(dim_reducer_outpath) if dim_reduction else None,
    }

    language_model = pipeline(task='feature-extraction', model=model)

    embedded_entries = []
    with open(file, 'r') as f:
        for current_line in f:
            entry = process_entry(json.loads(current_line), language_model)
            embedded_entries.append(entry)
    entries_vec = np.stack(embedded_entries, axis=0)
    print(f'Processed {len(embedded_entries)} entries from file {file}')

    dim_reducer = None
    if dim_reduction is not None:
        dim_reducer = SparseRandomProjection(n_components=dim_reduction)
        dim_reducer.fit(entries_vec)
        entries_vec = dim_reducer.transform(entries_vec)
        # save trained dim reducer
        with open(str(dim_reducer_outpath), 'wb') as f_out:
            pickle.dump(dim_reducer, f_out)

    # save embeddings
    np.save(vec_outpath, entries_vec)

    # save metadata
    with open(str(metadata_outpath), 'w') as f_out:
        json.dump(metadata, f_out)
def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10, dense_output=True,
                                    random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)

        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left as a sparse matrix instead
        rp = SparseRandomProjection(n_components=10, dense_output=False,
                                    random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)
        # output for sparse input will be sparse:
        assert sp.issparse(rp.transform(sparse_data))
def create_projection(F, D, projection_name, seed):
    if projection_name == "none" or projection_name == "sample":
        return
    elif projection_name == "gaussian":
        G = GaussianRandomProjection(n_components=F, random_state=seed)
        G.fit(np.zeros([1, D]))  # fit only fixes the input dimensionality
        return G
    elif "sparse" in projection_name:
        # density = 1/s: larger s gives a sparser projection matrix
        if projection_name == "very sparse":
            s = np.sqrt(D)
        elif projection_name == "very very sparse":
            s = D / np.log(D)
        elif projection_name == "sparse":
            s = 3
        proj = SparseRandomProjection(n_components=F, random_state=seed,
                                      density=1 / s)
    elif "DCT" in projection_name:
        proj = DCT(F=F, arrangement=projection_name[4:])
    else:
        raise Exception(f"projection name wrong: {projection_name}")
    proj.fit(np.zeros([1, D]))
    return proj
def run_SRP(X, y, title):
    dims = list(np.arange(1, X.shape[1] + 1))
    tmp1 = defaultdict(dict)
    for i, dim in product(range(3), dims):
        # vary the seed so each restart draws a different projection
        # (the original fixed random_state=5, making all restarts identical)
        rp = SRP(random_state=i, n_components=dim)
        rp = rp.fit(X)
        tmp1[dim][i] = reconstruction_error(rp, X)
    tmp1 = pd.DataFrame(tmp1).T

    plt.plot(dims, tmp1, 'm-')
    plt.ylabel('error')
    plt.xlabel('number of dimensions')
    plt.legend(loc="best")
    plt.title("Random Components for 3 Restarts: " + title)
    plt.show()
def test_random_sparse_encoder_load():
    train_data = np.random.rand(2000, input_dim)
    from sklearn.random_projection import SparseRandomProjection
    model = SparseRandomProjection(n_components=target_output_dim)
    filename = 'random_sparse_model.model'
    pickle.dump(model.fit(train_data), open(filename, 'wb'))

    encoder = TransformEncoder(model_path=filename)
    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)

    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)
    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
def visualize_rp(X, y, problem):
    pl.figure()
    colors = ['navy', 'darkorange']
    if 'Freddie' in problem:
        target_names = ['default', 'no default']
    else:
        target_names = ['donated', 'not donated']
    lw = 2

    rp = SparseRandomProjection(n_components=2)
    X_rp = rp.fit(X).transform(X)

    for color, i, target_name in zip(colors, [0, 1], target_names):
        pl.scatter(X_rp[y == i, 0], X_rp[y == i, 1], color=color, alpha=.8,
                   lw=lw, label=target_name)
    pl.legend(loc='best', shadow=False, scatterpoints=1)
    pl.title('RP of ' + problem)
    pl.show()
def applyRP(label, method, X, n_components, usen, reconstructimages=False):
    print("doing %s..." % method)
    pdiffms = []
    pdiffstds = []
    mse = []
    firstimages = []
    for n in n_components:
        model = SparseRandomProjection(n_components=n)
        Xt = model.fit_transform(X)
        Xr = reconstructit(model.components_, Xt)
        mse.append(mean_squared_error(X, Xr))
        firstimages.append(Xr[0, :])

        # relative change in pairwise distances after projection
        Xtd = pairwise_distances(Xt)
        Xd = pairwise_distances(X)
        nonzero = Xd != 0
        Xd = Xd[nonzero]
        pdiff = np.abs(Xtd[nonzero] - Xd) / Xd
        pdiffms.append(pdiff.mean())
        pdiffstds.append(pdiff.std())

    print("done. plotting...")
    plot_pdiff(label, method, np.array(pdiffms), np.array(pdiffstds),
               n_components)
    plot_re(label, method, mse, n_components)
    if reconstructimages:
        firstimages.insert(0, np.array(X.iloc[0, :]))
        plot_first_images(firstimages, n_components, method, label)

    model = SparseRandomProjection(n_components=usen)
    model = model.fit(X)
    return model
    tmp[dim][i] = pairwiseDistCorr_chunked(rp.fit_transform(diamondsX),
                                           diamondsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'diamonds scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims1):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(diamondsX)
    tmp[dim][i] = reconstructionError(rp, diamondsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'diamonds scree2.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree2.csv')

#%% task 4
grid = {
def _get_projection(n_samples, n_features, density='auto', eps=0.1):
    # fit() only uses the shape of X, so an empty sparse matrix suffices
    p = SparseRandomProjection(density=density, eps=eps)
    mat = csr_matrix((n_samples, n_features))
    return p.fit(mat)
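# Hedged usage sketch: with n_components left at its default 'auto', fit()
# picks the target dimensionality from the Johnson-Lindenstrauss bound,
# which can be checked directly (n_features must exceed that bound or
# scikit-learn raises a ValueError):
from sklearn.random_projection import johnson_lindenstrauss_min_dim

p = _get_projection(n_samples=10000, n_features=10000)
assert p.n_components_ == johnson_lindenstrauss_min_dim(10000, eps=0.1)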
def DecomposedFeatures(train, test, val, total, addtrain, addtest,
                       use_pca=0.0, use_tsvd=0.0, use_ica=0.0, use_fa=0.0,
                       use_grp=0.0, use_srp=0.0, use_KPCA=0.0, kernal="rbf"):
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []
    if addtrain is not None:
        train_decomposed = [addtrain]
        val_decomposed = [val]
    if addtest is not None:
        test_decomposed = [addtest]

    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP, whiten=True, svd_solver="full",
                  random_state=42)
        pca.fit(total)
        train_decomposed.append(pca.transform(train))
        test_decomposed.append(pca.transform(test))
        val_decomposed.append(pca.transform(val))

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd.fit(total)
        train_decomposed.append(tsvd.transform(train))
        test_decomposed.append(tsvd.transform(test))
        val_decomposed.append(tsvd.transform(val))

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica.fit(total)
        train_decomposed.append(ica.transform(train))
        test_decomposed.append(ica.transform(test))
        val_decomposed.append(ica.transform(val))

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa.fit(total)
        train_decomposed.append(fa.transform(train))
        test_decomposed.append(fa.transform(test))
        val_decomposed.append(fa.transform(val))

    if use_grp != 0.0:
        print("GRP")
        if use_grp > 0.0:
            N_COMP = int(use_grp * train.shape[1]) + 1
            eps = 10
        else:
            N_COMP = "auto"  # a negative use_grp encodes the JL eps
            eps = abs(use_grp)
        grp = GaussianRandomProjection(n_components=N_COMP, eps=eps,
                                       random_state=42)
        grp.fit(total)
        train_decomposed.append(grp.transform(train))
        test_decomposed.append(grp.transform(test))
        val_decomposed.append(grp.transform(val))

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True,
                                     random_state=42)
        srp.fit(total)
        train_decomposed.append(srp.transform(train))
        test_decomposed.append(srp.transform(test))
        val_decomposed.append(srp.transform(val))  # was pca.transform(val), a bug

    if use_KPCA > 0.0:
        print("KPCA")
        N_COMP = int(use_KPCA * train.shape[1]) + 1
        # N_COMP = None
        pls = KernelPCA(n_components=N_COMP, kernel=kernal)
        pls.fit(total)
        train_decomposed.append(pls.transform(train))
        test_decomposed.append(pls.transform(test))
        val_decomposed.append(pls.transform(val))

    gc.collect()
    print("Append decomposition components together...")
    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    val_decomposed = np.concatenate(val_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    # for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count',
    #                 'count_non_0', 'num_different', 'max', 'min']:
    #     train_with_only_decomposed_features[col] = train[col]
    #     test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    val_with_only_decomposed_features = val_with_only_decomposed_features.fillna(0)

    return (train_with_only_decomposed_features,
            test_with_only_decomposed_features,
            val_with_only_decomposed_features)
def _get_projection(n_samples, n_features, density="auto", eps=0.1): p = SparseRandomProjection() mat = lil_matrix((n_samples, n_features)) return p.fit(mat)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree2.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(cancerX)
    tmp[dim][i] = reconstructionError(rp, cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree2.csv')

#%% Data for 2
grid = {
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(madelonX)
    tmp[dim][i] = reconstructionError(rp, madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree2.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree2.csv')

#%% Data for 2
grid = {
def select_features_SparseRandomProjections(train_X, train_y, test_X, k):
    selector = SparseRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
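# Hedged usage sketch (toy arrays): despite the name, this projects the
# features to k dimensions rather than selecting a subset of them, and
# train_y is unused since random projections are data-independent.
import numpy as np

train_X = np.random.rand(100, 40)
test_X = np.random.rand(25, 40)
train_lo, test_lo = select_features_SparseRandomProjections(
    train_X, None, test_X, k=10)
assert train_lo.shape == (100, 10) and test_lo.shape == (25, 10)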
def main():
    out = './BASE/'
    cmap = cm.get_cmap('Spectral')
    np.random.seed(0)

    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    letterX = letter.drop('Class', axis=1).copy().values
    letterY = letter['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]
    # raise

    # %% data for 1
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(madelonX)
        tmp[dim][i] = reconstructionError(rp, madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(letterX)
        tmp[dim][i] = reconstructionError(rp, letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree2.csv')

    # %% Data for 2
    grid = {'rp__n_components': dims,
            'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {'rp__n_components': dims2,
            'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'letter dim red.csv')
    # raise

    # %% data for 3
    # Set this from chart 2 and dump; use the clustering script to finish up
    dim = 60
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    madelonX2 = rp.fit_transform(madelonX)
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    madelon2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc',
                    complevel=9)

    dim = 16
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    letterX2 = rp.fit_transform(letterX)
    letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T)))
    cols = list(range(letter2.shape[1]))
    cols[-1] = 'Class'
    letter2.columns = cols
    letter2.to_hdf(out + 'datasets.hdf', 'letter', complib='blosc',
                   complevel=9)
class STPM(pl.LightningModule):

    def __init__(self, model: torchvision.models, embedding_dir_path: str,
                 sample_path: str, input_image_size: int,
                 coreset_sampling_ratio: int, n_neighbors: int,
                 anomal_threshold: float, normalization_mean: list,
                 normalization_std: list):
        super(STPM, self).__init__()
        self.save_hyperparameters()
        self.init_features()

        # MODEL HYPERPARAMETERS
        self.input_image_size = input_image_size
        self.coreset_sampling_ratio = coreset_sampling_ratio
        self.n_neighbors = n_neighbors
        self.anomal_threshold = anomal_threshold
        self.embedding_dir_path = embedding_dir_path
        self.sample_path = sample_path
        # self.source_code_save_path = source_code_save_path

        def hook_t(module, input, output):
            self.features.append(output)

        self.model = model
        # self.model = wide_resnet50_2(pretrained=True, progress=True)
        for param in self.model.parameters():
            param.requires_grad = False
        self.model.layer2[-1].register_forward_hook(hook_t)
        self.model.layer3[-1].register_forward_hook(hook_t)

        # self.data_inv_transform = transforms.Normalize(
        #     mean=[-0.485/0.229, -0.456/0.224, -0.406/0.255],
        #     std=[1/0.229, 1/0.224, 1/0.255])
        self.data_inv_transform = transforms.Normalize(
            mean=[-normalization_mean[0] / normalization_std[0],
                  -normalization_mean[1] / normalization_std[1],
                  -normalization_mean[2] / normalization_std[2]],
            std=[1 / normalization_std[0],
                 1 / normalization_std[1],
                 1 / normalization_std[2]])

        # dummy loss; no parameter update is performed
        self.criterion = torch.nn.MSELoss(reduction='sum')
        self.init_results_list()

    def init_results_list(self):
        self.img_path_list = []
        self.mean_score_norm = []
        self.all_scores = []
        self.all_scores_mean_norm = []
        self.image_batch_list = []
        self.x_type_list = []
        self.y_true = []

    def init_features(self):
        self.features = []

    def forward(self, x_t):
        self.init_features()
        _ = self.model(x_t)
        return self.features

    def save_anomaly_map(self, anomaly_map, input_img, gt_img, file_name,
                         x_type):
        if anomaly_map.shape != input_img.shape:
            anomaly_map = cv2.resize(anomaly_map,
                                     (input_img.shape[0], input_img.shape[1]))
        anomaly_map_norm = min_max_norm(anomaly_map)
        anomaly_map_norm_hm = cvt2heatmap(anomaly_map_norm * 255)

        # anomaly map on image
        heatmap = cvt2heatmap(anomaly_map_norm * 255)
        hm_on_img = heatmap_on_image(heatmap, input_img)

        # save images
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}.jpg'),
            input_img)
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}_amap.jpg'),
            anomaly_map_norm_hm)
        cv2.imwrite(
            os.path.join(self.sample_path,
                         f'{x_type}_{file_name}_amap_on_img.jpg'),
            hm_on_img)

    def configure_optimizers(self):
        return None

    def on_train_start(self):
        self.model.eval()  # to stop running_var move (maybe not critical)
        self.embedding_list = []

    def on_test_start(self):
        self.init_results_list()
        self.embedding_coreset = pickle.load(
            open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                 'rb'))
        embedded = torch.tensor(self.embedding_coreset)
        train_jit = TrainFeature(embedded)
        traced_model = torch.jit.script(train_jit)
        torch.jit.save(traced_model, "patchcore_features.pt")

    def training_step(self, batch, batch_idx):
        # save locally aware patch features
        x, _, file_name, _ = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding = embedding_concat(embeddings[0], embeddings[1])
        self.embedding_list.extend(reshape_embedding(np.array(embedding)))
        gc.collect()

    def training_epoch_end(self, outputs):
        total_embeddings = np.array(self.embedding_list)

        # Random projection; n_components='auto' picks the dimensionality
        # from the Johnson-Lindenstrauss lemma
        self.randomprojector = SparseRandomProjection(n_components='auto',
                                                      eps=0.9)
        self.randomprojector.fit(total_embeddings)

        # Coreset subsampling
        selector = kCenterGreedy(total_embeddings, 0, 0)
        selected_idx = selector.select_batch(
            model=self.randomprojector, already_selected=[],
            N=int(total_embeddings.shape[0] *
                  float(self.coreset_sampling_ratio)))
        self.embedding_coreset = total_embeddings[selected_idx]

        print('initial embedding size : ', total_embeddings.shape)
        print('final embedding size : ', self.embedding_coreset.shape)
        with open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                  'wb') as f:
            pickle.dump(self.embedding_coreset, f)
        gc.collect()

    def test_step(self, batch, batch_idx):
        # nearest-neighbour search
        x, label, file_name, x_type = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding_ = embedding_concat(embeddings[0], embeddings[1])
        embedding_test = np.array(reshape_embedding(np.array(embedding_)))

        # NN
        knn = KNN(torch.from_numpy(self.embedding_coreset).cuda(),
                  k=self.n_neighbors)
        score_patches = knn(
            torch.from_numpy(embedding_test).cuda())[0].cpu().detach().numpy()
        self.img_path_list.extend(file_name)

        # support multiple input sizes
        block_size = int(np.sqrt(len(score_patches)))
        anomaly_map = score_patches[:, 0].reshape((block_size, block_size))
        self.all_scores.append(anomaly_map)
        self.image_batch_list.append(x)
        self.x_type_list.append(x_type)
        self.y_true.append(label.cpu().numpy()[0])

    def Find_Optimal_Cutoff(self, target, predicted):
        fpr, tpr, threshold = roc_curve(target, predicted, pos_label=1)
        i = np.arange(len(tpr))
        roc = pd.DataFrame({
            'tf': pd.Series(tpr - (1 - fpr), index=i),
            'threshold': pd.Series(threshold, index=i)})
        roc_t = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
        return list(roc_t['threshold']), threshold
        '''
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], '--', color='black')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
        '''

    def analyze_data(self):
        score_patches = np.array(self.all_scores)
        for i, val in enumerate(score_patches):
            self.all_scores_mean_norm.append(np.mean(val))

        min_score = np.min(score_patches)
        max_score = np.max(score_patches)
        print("MIN SCORE {}".format(min_score))
        print("MAX SCORE {}".format(max_score))
        scores = (score_patches - min_score) / (max_score - min_score)

        for i, heatmap in enumerate(scores):
            anomaly_map_resized = cv2.resize(
                heatmap, (self.input_image_size, self.input_image_size))
            max_ = np.max(heatmap)
            min_ = np.min(heatmap)
            anomaly_map_resized_blur = gaussian_filter(anomaly_map_resized,
                                                       sigma=4)
            anomaly_map_resized_blur[0][0] = 1.

            # save images
            x = self.image_batch_list[i]
            x = self.data_inv_transform(x)
            input_x = cv2.cvtColor(
                x.permute(0, 2, 3, 1).cpu().numpy()[0] * 255,
                cv2.COLOR_BGR2RGB)
            if anomaly_map_resized_blur.shape != input_x.shape:
                anomaly_map_resized_blur = cv2.resize(
                    anomaly_map_resized_blur,
                    (input_x.shape[0], input_x.shape[1]))

            if self.anomal_threshold != 0:
                anomaly_threshold_index = anomaly_map_resized_blur[
                    anomaly_map_resized_blur > self.anomal_threshold]
                anomaly_map_resized_blur[
                    anomaly_map_resized_blur < self.anomal_threshold] = 0
                anomaly_threshold_area = anomaly_threshold_index.size
                anomaly_threshold_area = anomaly_threshold_area / \
                    float(anomaly_map_resized_blur.size) * 100.
                self.all_scores_mean_norm[i] = anomaly_threshold_area

            # anomaly map on image
            heatmap = cvt2heatmap(anomaly_map_resized_blur * 255)
            hm_on_img = heatmap_on_image(heatmap, input_x)

            # save images
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}.jpg'),
                input_x)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap.jpg'),
                heatmap)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap_on_img.jpg'),
                hm_on_img)

    def test_epoch_end(self, outputs):
        self.analyze_data()
        best_th, threshold = self.Find_Optimal_Cutoff(
            self.y_true, self.all_scores_mean_norm)
        print(f'\nbest threshold={best_th}')

        ng_index = np.where(np.array(self.y_true) == 1)
        if len(ng_index[0]) == 0:
            ng_index = len(self.y_true)
        else:
            ng_index = ng_index[0][0]

        fig = plt.figure()
        sns.histplot(self.all_scores_mean_norm[:ng_index], kde=True,
                     color="blue", label="normal")
        sns.histplot(self.all_scores_mean_norm[ng_index:], kde=True,
                     color="red", label="abnormal")
        fig.legend(labels=['normal', 'abnormal'])
        plt.xlabel("Anomaly score")
        plt.ylabel("Count")
        plt.savefig('Anomaly_score_histplot.jpg')
)

tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/letter_RP_pairwise_distance_corr.csv')

print('Part 2C - Starting RP, reconstruction error, for spam dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_spam):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(spamX)
    tmp[dim][i] = reconstructionError(rp, spamX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RP_reconstruction_error.csv')

print('Part 2C - Starting RP, reconstruction error, for letter dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(letterX)
    tmp[dim][i] = reconstructionError(rp, letterX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/letter_RP_reconstruction_error.csv')

# Run Neural Networks
# randomized projection
tmp = defaultdict(dict)
dims = range(1, 22)
for i, dim in product(range(20), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('rp_mushroom_iterations.csv')

tmp_fit = defaultdict(dict)
for i, dim in product(range(20), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(X)
    tmp_fit[dim][i] = reconstructionError(rp, X)
tmp_fit = pd.DataFrame(tmp_fit).T
tmp_fit.to_csv('rp_mushroom_new_data.csv')

grid = {'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch}
rp = SparseRandomProjection(random_state=10)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True,
                    random_state=5)
pipe = Pipeline([('rp', rp), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(X, y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv('rp_mushroom_ann.csv')

# ndim = 3
dim = 7