def perform(self):
    """Benchmark the neural network on the dataset, then run clustering.

    Adapted from
    https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/benchmark.py

    Steps, in order:
      1. Grid-search an MLP over ``NN__alpha`` (``self._nn_reg``) and
         ``NN__hidden_layer_sizes`` (``self._nn_arch``), then write the
         ``cv_results_`` table to ``<ds_name>_nn_bmk.csv``.
      2. Repeat the search with ``self._details.best_nn_params`` (the best
         parameters from assignment 1) and write
         ``<ds_name>_ass1_nn_bmk.csv``.
      3. Launch the clustering sub-experiment under ``clustering/``.

    Output names are expanded through the ``self._out`` format template.
    """

    def build_pipeline():
        # Both searches use an identically configured single-step pipeline.
        network = MLPClassifier(activation='relu', max_iter=2000,
                                early_stopping=True,
                                random_state=self._details.seed)
        return Pipeline([('NN', network)],
                        memory=experiments.pipeline_memory)

    self.log("Performing {}".format(self.experiment_name()))

    # %% benchmarking for chart type 2
    param_grid = {
        'NN__alpha': self._nn_reg,
        'NN__hidden_layer_sizes': self._nn_arch,
    }
    pipeline = build_pipeline()
    # Only the search object is needed; the second return value is unused.
    search, _ = self.gs_with_best_estimator(pipeline, param_grid)
    self.log("Grid search complete")

    results = pd.DataFrame(search.cv_results_)
    csv_name = '{}_nn_bmk.csv'.format(self._details.ds_name)
    results.to_csv(self._out.format(csv_name))
    self.log("Done")

    # Benchmark based on best params from assignment 1.
    pipeline = build_pipeline()
    search, _ = self.gs_with_best_estimator(
        pipeline, self._details.best_nn_params, type='ass1')
    results = pd.DataFrame(search.cv_results_)
    csv_name = '{}_ass1_nn_bmk.csv'.format(self._details.ds_name)
    results.to_csv(self._out.format(csv_name))

    # Run clustering as a subexperiment.
    self.log("Running clustering sub-experiment")
    clustering_dir = self._out.format('clustering/')
    experiments.run_subexperiment(self, clustering_dir)
def perform_cluster(self, dim_param):
    """Evaluate the NN on PCA-reduced data, then cluster the reduced data.

    A PCA -> MLP pipeline is searched with ``self._details.best_nn_params``
    and its ``cv_results_`` are written to ``<ds_name>_ass1_dim_red.csv``.
    The PCA-transformed training features are then dumped via
    ``self.dump_for_clustering``, reloaded as a dataset, and handed to the
    clustering sub-experiment.

    :param dim_param: number of principal components to keep
        (passed to ``PCA(n_components=...)``).
    """
    self.log('Running clustering for {} with dim param {}'.format(
        self.experiment_name(), dim_param))

    # TODO: use an unsupervised method to pick the best dim_param.
    # %% Data for 3
    # Set this from chart 2 and dump, use clustering script to finish up.
    seed = self._details.seed
    reducer = PCA(n_components=dim_param, random_state=seed)

    # ANN based on best params from assignment 1.
    network = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True, random_state=seed)
    steps = [('pca', reducer), ('NN', network)]
    pipeline = Pipeline(steps, memory=experiments.pipeline_memory)
    search, _ = self.gs_with_best_estimator(
        pipeline, self._details.best_nn_params, type='ass1')

    results = pd.DataFrame(search.cv_results_)
    csv_name = '{}_ass1_dim_red.csv'.format(self._details.ds_name)
    results.to_csv(self._out.format(csv_name))

    def reduce_training_features(experiment):
        # Fit PCA on the experiment's training features and project them.
        return reducer.fit_transform(
            experiment.get_details().ds.training_x)

    hdf_path = self.dump_for_clustering(reduce_training_features)

    # Run clustering as a subexperiment on the reduced dataset.
    self.log("Running clustering sub-experiment")
    updated_ds = self._details.ds.reload_from_hdf(
        hdf_path=hdf_path,
        hdf_ds_name=self._details.ds_name,
        preprocess=False,
    )
    clustering_dir = self._out.format('clustering/')
    experiments.run_subexperiment(self, clustering_dir, updated_ds)
def perform_cluster(self, dim_param):
    """Evaluate the NN on importance-filtered data, then cluster that data.

    A random forest is fitted on the training set and wrapped in an
    ``ImportanceSelect`` filter configured with ``dim_param``. A
    filter -> MLP pipeline is searched with ``experiments.BEST_NN_PARAMS``
    and its ``cv_results_`` are written to ``<ds_name>_ass1_dim_red.csv``.
    The filtered training features are then dumped via
    ``self.dump_for_clustering``, reloaded as a dataset, and handed to the
    clustering sub-experiment.

    :param dim_param: second argument to ``ImportanceSelect``
        (presumably the number of features to keep -- confirm against
        ``ImportanceSelect``).
    """
    self.log('Running clustering for {} with dim param {}'.format(
        self.experiment_name(), dim_param))

    # TODO: use an unsupervised method to pick the best dim_param.
    # %% Data for 3
    # Set this from chart 2 and dump, use clustering script to finish up.
    forest = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=self._details.seed,
        n_jobs=self._details.threads,
    )
    # The forest is fitted before it is handed to the selector.
    forest.fit(self._details.ds.training_x, self._details.ds.training_y)
    selector = ImportanceSelect(forest, dim_param)

    # ANN based on best params from assignment 1.
    network = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
    steps = [('filter', selector), ('NN', network)]
    pipeline = Pipeline(steps, memory=experiments.pipeline_memory)
    search, _ = self.gs_with_best_estimator(
        pipeline, experiments.BEST_NN_PARAMS, type='ass1')

    results = pd.DataFrame(search.cv_results_)
    csv_name = '{}_ass1_dim_red.csv'.format(self._details.ds_name)
    results.to_csv(self._out.format(csv_name))

    def filter_training_features(experiment):
        # Fit the selector on the training data and return the kept columns.
        dataset = experiment.get_details().ds
        features = dataset.training_x
        labels = experiment.get_details().ds.training_y
        return selector.fit_transform(features, labels)

    hdf_path = self.dump_for_clustering(filter_training_features)

    # Run clustering as a subexperiment on the filtered dataset.
    self.log("Running clustering sub-experiment")
    updated_ds = self._details.ds.reload_from_hdf(
        hdf_path=hdf_path,
        hdf_ds_name=self._details.ds_name,
        preprocess=False,
    )
    clustering_dir = self._out.format('clustering/')
    experiments.run_subexperiment(self, clustering_dir, updated_ds)
def perform_cluster(self, dim_param):
    """Evaluate the NN on ICA-reduced data, then cluster the reduced data.

    A FastICA -> MLP pipeline is searched with
    ``experiments.BEST_NN_PARAMS`` and its ``cv_results_`` are written to
    ``<ds_name>_ass1_dim_red.csv``. The ICA-transformed training features
    are then dumped via ``self.dump_for_clustering``, reloaded as a
    dataset, and handed to the clustering sub-experiment.

    :param dim_param: user-requested number of independent components
        (passed to ``FastICA(n_components=...)``).
    """
    self.log('Running clustering for {} with dim param {}'.format(
        self.experiment_name(), dim_param))

    # TODO: use an unsupervised method to pick the best dim_param.
    # %% Data for 3
    # Set this from chart 2 and dump, use clustering script to finish up.
    seed = self._details.seed
    reducer = FastICA(n_components=dim_param, random_state=seed)

    # ANN based on best params from assignment 1.
    network = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True, random_state=seed)
    steps = [('ica', reducer), ('NN', network)]
    pipeline = Pipeline(steps, memory=experiments.pipeline_memory)

    # Run the search; only the search object is used here.
    search, _ = self.gs_with_best_estimator(
        pipeline, experiments.BEST_NN_PARAMS, type='ass1')

    # Persist the cross-validation table. The name says "dim_red", not
    # "ICA"; the algorithm is presumably identified by the directory that
    # self._out points at -- confirm against the caller.
    results = pd.DataFrame(search.cv_results_)
    csv_name = '{}_ass1_dim_red.csv'.format(self._details.ds_name)
    results.to_csv(self._out.format(csv_name))

    def reduce_training_features(experiment):
        # Fit ICA on the experiment's training features and project them.
        return reducer.fit_transform(
            experiment.get_details().ds.training_x)

    # dump_for_clustering returns the path of the file it wrote.
    hdf_path = self.dump_for_clustering(reduce_training_features)

    # Run clustering as a subexperiment on a dataset rebuilt from the dump.
    self.log("Running clustering sub-experiment")
    updated_ds = self._details.ds.reload_from_hdf(
        hdf_path=hdf_path,
        hdf_ds_name=self._details.ds_name,
        preprocess=False,
    )
    clustering_dir = self._out.format('clustering/')
    experiments.run_subexperiment(self, clustering_dir, updated_ds)
def perform_cluster(self, dim_param):
    """Run the clustering sub-experiment on the unmodified dataset.

    :param dim_param: accepted for signature compatibility; this
        implementation does not use it.
    """
    # Expand the output template to the clustering directory.
    clustering_dir = self._out.format('clustering/')
    experiments.run_subexperiment(self, clustering_dir)