def test_cv():
    """Run do_cv with method='PLS' over three n_components values.

    Loads test_data.csv, assigns folds with stratified_folds (nfolds=3,
    sorted by SiO2) and compares the predictions, RMSEC column, output shape
    and generated keys against stored reference values.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    spectra = stratified_folds(spectra, nfolds=3, sortby=('comp', 'SiO2'))

    grid = list(ParameterGrid({'n_components': [1, 2, 3], 'scale': [False]}))
    validator = cv.cv(grid)
    df_out, output, models, modelkeys, predictkeys = validator.do_cv(
        spectra,
        xcols='wvl',
        ycol=[('comp', 'SiO2')],
        method='PLS',
        yrange=[0, 100],
        calc_path=False,
        alphas=None,
    )

    # Reference values: first row of the 'predict' columns, and RMSEC per model.
    reference_predictions = [
        56.55707481, 57.93716105, 59.34785052,
        60.59708391, 55.83934129, 56.7456989,
    ]
    reference_rmsec = [18.6509206, 14.64015186, 13.80182457]

    first_row = np.array(df_out['predict'].iloc[0, :])
    np.testing.assert_array_almost_equal(reference_predictions, first_row)

    rmsec_column = np.array(output[('cv', 'RMSEC')])
    np.testing.assert_array_almost_equal(reference_rmsec, rmsec_column)

    # Bookkeeping: one model per grid point, two predict keys per model.
    assert output.shape == (3, 8)
    assert len(models) == 3
    assert len(modelkeys) == 3
    assert modelkeys[0] == 'PLS - SiO2 - (0, 100) {\'n_components\': 1, \'scale\': False}'
    assert len(predictkeys) == 6
    assert predictkeys[0] == '"PLS- CV -{\'n_components\': 1, \'scale\': False}"'
def test_cv_calc_path():
    """Run do_cv with method='LASSO', calc_path=True and ten alpha values.

    Checks a slice of the predictions and of the RMSEC column against stored
    reference values, plus the output shape and the generated keys.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    spectra = stratified_folds(spectra, nfolds=3, sortby=('comp', 'SiO2'))

    lasso_params = {
        'fit_intercept': [True, False],
        'max_iter': [1000],
        'tol': [1e-3],
        'precompute': [True],
        'copy_X': [True],
        'positive': [True, False],
        'selection': ['random'],
        'random_state': [1],
    }
    # Ten alphas, logarithmically spaced between 1e-7 and 1e-2.
    alphas = np.logspace(np.log10(0.0000001), np.log10(0.01), num=10)

    grid = list(ParameterGrid(lasso_params))
    validator = cv.cv(grid)
    df_out, output, models, modelkeys, predictkeys = validator.do_cv(
        spectra,
        xcols='wvl',
        ycol=[('comp', 'SiO2')],
        method='LASSO',
        yrange=[0, 100],
        calc_path=True,
        alphas=alphas,
    )

    reference_predictions = [
        57.87064, 57.868983, 57.868983, 57.868983, 57.868983,
        59.315111, 59.315113, 59.315114, 59.315114, 59.315114,
    ]
    reference_rmsec = [
        18.490365, 18.490365, 18.490365, 18.490365, 18.490365,
        7.042796, 6.986007, 6.967643, 6.959045, 6.953588,
    ]

    # Only columns / rows 5..14 are compared against the reference values.
    predicted_slice = np.array(df_out['predict'].iloc[0, 5:15])
    np.testing.assert_array_almost_equal(reference_predictions, predicted_slice)

    rmsec_slice = np.array(output[('cv', 'RMSEC')].iloc[5:15])
    np.testing.assert_array_almost_equal(reference_rmsec, rmsec_slice)

    assert output.shape == (40, 15)
    assert len(models) == 40
    assert len(modelkeys) == 40
    assert modelkeys[0] == 'LASSO - SiO2 - (0, 100) Alpha: 0.01, {\'copy_X\': True, \'fit_intercept\': True, \'max_iter\': 1000, \'positive\': True, \'precompute\': True, \'random_state\': 1, \'selection\': \'random\', \'tol\': 0.001}'
    assert len(predictkeys) == 80
    assert predictkeys[0] == '"LASSO - SiO2 - CV - Alpha:0.01 - {\'copy_X\': True, \'fit_intercept\': True, \'max_iter\': 1000, \'positive\': True, \'precompute\': True, \'random_state\': 1, \'selection\': \'random\', \'tol\': 0.001}"'
def test_cv_nofolds():
    """do_cv is expected to return 0 when the data has no folds assigned.

    Unlike the other tests, stratified_folds is deliberately not called here.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])

    grid = list(ParameterGrid({'n_components': [1, 2, 3], 'scale': [False]}))
    validator = cv.cv(grid)
    results = validator.do_cv(
        spectra,
        xcols='wvl',
        ycol=[('comp', 'SiO2')],
        method='PLS',
        yrange=[0, 100],
        calc_path=False,
        alphas=None,
    )

    print(results)
    assert results == 0
def test_cv_local_regression():
    """Run do_cv with method='Local Regression' for two n_neighbors values.

    Uses only the first 20 rows of test_data.csv and compares predictions,
    RMSEC, output shape and generated keys against stored reference values.
    """
    spectra = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    # make data set smaller so this test runs faster
    spectra = spectra.iloc[:20]
    spectra = stratified_folds(spectra, nfolds=3, sortby=('comp', 'SiO2'))

    local_params = {
        'n_neighbors': [5, 6],
        'fit_intercept': [True],
        'positive': [False],
        'random_state': [1],
        'tol': [1e-2],
    }
    grid = list(ParameterGrid(local_params))
    validator = cv.cv(grid)
    df_out, output, models, modelkeys, predictkeys = validator.do_cv(
        spectra,
        xcols='wvl',
        ycol=[('comp', 'SiO2')],
        method='Local Regression',
        yrange=[0, 100],
        calc_path=False,
        alphas=None,
    )

    reference_predictions = [51.30212, 54.25293063, 48.54834655, 54.18676067]
    reference_rmsec = [10.32151211, 10.89018268]

    # Row at position 5 of the 'predict' columns is the one that was recorded.
    sixth_row = np.array(df_out['predict'].iloc[5, :])
    np.testing.assert_array_almost_equal(reference_predictions, sixth_row)

    rmsec_column = np.array(output[('cv', 'RMSEC')])
    np.testing.assert_array_almost_equal(reference_rmsec, rmsec_column)

    assert output.shape == (2, 11)
    assert len(models) == 2
    assert len(modelkeys) == 2
    assert modelkeys[0] == 'Local Regression - SiO2 - (0, 100) {\'fit_intercept\': True, \'positive\': False, \'random_state\': 1, \'tol\': 0.01} n_neighbors: 5'
    assert len(predictkeys) == 4
    assert predictkeys[0] == '"Local Regression- CV -{\'fit_intercept\': True, \'positive\': False, \'random_state\': 1, \'tol\': 0.01} n_neighbors: 5"'
def run(self):
    """Cross validate the selected algorithm and store the results.

    Reads the algorithm, data set, x/y variables and y range from the
    widgets, keeps only the rows whose y value lies strictly inside the
    range, and calls ``do_cv`` on them using the folds stored in
    ``('meta', 'Folds')``.  The returned data frame replaces
    ``self.data[datakey].df``; the models, model keys and predict keys are
    registered on ``self``; coefficients of every non-'GP' model are appended
    to ``self.data['Model Coefficients']``; and the CV results are stored
    under a unique 'CV Results ...' key.

    If the folds column cannot be read, a message is printed and the method
    returns without running cross validation.
    """
    method = self.chooseAlgorithmComboBox.currentText()
    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]

    # Warning: Params passing through cv.cv(params) needs to be in lists
    # Example: {'n_components': [4], 'scale': [False]}
    params, modelkey = self.alg[
        self.chooseAlgorithmComboBox.currentText()].run()

    # if the method supports it, separate out alpha from the other
    # parameters and prepare for calculating path
    path_methods = ['Elastic Net', 'LASSO']  # , 'Ridge']
    if method in path_methods:
        calc_path = True
        alphas = params.pop('alpha')
    else:
        alphas = None
        calc_path = False

    y = np.array(self.data[datakey].df[yvars])
    match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    # DataFrame.ix no longer exists in current pandas; .loc accepts the same
    # boolean mask.
    data_for_cv = spectral_data(self.data[datakey].df.loc[match])

    # create a grid of parameter permutations
    paramgrid = list(ParameterGrid(params))
    cv_obj = cv.cv(paramgrid)

    try:
        # create an iterator for cross validation based on the predefined
        # folds
        cv_iterator = LeaveOneGroupOut().split(
            data_for_cv.df[xvars], data_for_cv.df[yvars],
            data_for_cv.df[('meta', 'Folds')])
        n_folds = LeaveOneGroupOut().get_n_splits(
            groups=data_for_cv.df[('meta', 'Folds')])
    except Exception:
        print('***No folds found! Did you remember to define folds before running cross validation?***')
        # Without folds cv_iterator / n_folds are unbound, so continuing
        # would only fail with a NameError.
        return

    (self.data[datakey].df, self.cv_results, cvmodels, cvmodelkeys,
     cvpredictkeys) = cv_obj.do_cv(
        data_for_cv.df, cv_iterator, xcols=xvars, ycol=yvars, yrange=yrange,
        method=method, alphas=alphas, calc_path=calc_path, n_folds=n_folds)

    for key in cvpredictkeys:
        self.list_amend(self.predictkeys, len(self.predictkeys), key)

    for n, key in enumerate(cvmodelkeys):
        # NOTE(review): the key is passed to list_amend and then appended as
        # well; confirm against list_amend that this does not register it
        # twice.
        self.list_amend(self.modelkeys, len(self.modelkeys), key)
        self.modelkeys.append(key)
        self.models[key] = cvmodels[n]
        self.model_xvars[key] = xvars
        self.model_yvars[key] = yvars
        if method != 'GP':
            # One row of coefficients per model, with the x columns as
            # column labels.
            coef = np.squeeze(cvmodels[n].model.coef_)
            coef = pd.DataFrame(coef)
            coef.index = pd.MultiIndex.from_tuples(
                self.data[datakey].df[xvars].columns.values)
            coef = coef.T
            coef[('meta', 'Model')] = key
            try:
                coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
            except Exception:
                # Best effort: the intercept column is simply left out.
                pass
            try:
                self.data['Model Coefficients'] = spectral_data(
                    pd.concat([self.data['Model Coefficients'].df, coef]))
            except Exception:
                # Appending failed (e.g. no table yet): start a new one.
                self.data['Model Coefficients'] = spectral_data(coef)
                self.datakeys.append('Model Coefficients')

    # Pick a 'CV Results ...' key that is not in use yet by adding a counter.
    number = 1
    cvid = str('CV Results ' + modelkey + ' - ' + yvars[0][1])
    while cvid in self.datakeys:
        number += 1
        cvid = str('CV Results ' + modelkey + ' - '
                   + yvars[0][1]) + ' - ' + str(number)
    self.datakeys.append(cvid)
    self.data[cvid] = self.cv_results
def setup(self):
    """Pre-register the keys and placeholder data for a cross validation.

    Builds the predict keys and model keys from the current widget settings,
    one CV key and one Cal key per parameter combination (and per alpha for
    the path methods), registers them on ``self``, and fills the matching
    'predict' columns and the 'Model Coefficients' table with dummy values
    (9999, intercept 0) until cross validation is actually run.

    Any error raised while doing so is swallowed, so setup is best effort.
    """
    try:
        method = self.chooseAlgorithmComboBox.currentText()
        datakey = self.chooseDataComboBox.currentText()
        xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
        yvars = [('comp', str(y.text()))
                 for y in self.yVariableList.selectedItems()]
        yrange = [self.yMinDoubleSpinBox.value(),
                  self.yMaxDoubleSpinBox.value()]

        # Warning: Params passing through cv.cv(params) needs to be in lists
        # Example: {'n_components': [4], 'scale': [False]}
        params, modelkey = self.alg[
            self.chooseAlgorithmComboBox.currentText()].run()

        # if the method supports it, separate out alpha from the other
        # parameters and prepare for calculating path
        path_methods = ['Elastic Net', 'LASSO']  # , 'Ridge']
        if method in path_methods:
            alphas = params.pop('alpha')
        else:
            alphas = None

        # create a grid of parameter permutations
        paramgrid = list(ParameterGrid(params))
        # NOTE(review): cv_obj is not used below; kept in case constructing
        # it matters to callers. Confirm before removing.
        cv_obj = cv.cv(paramgrid)

        ycol_name = yvars[0][-1]
        cvpredictkeys = []
        cvmodelkeys = []
        for grid_point in paramgrid:
            if alphas is not None:
                for alpha in alphas:
                    cvpredictkeys.append(
                        '"{} - {} - CV - Alpha:{} - {}"'.format(
                            method, ycol_name, alpha, grid_point))
                    cvpredictkeys.append(
                        '"{} - {} - Cal - Alpha:{} - {}"'.format(
                            method, ycol_name, alpha, grid_point))
                    cvmodelkeys.append(
                        "{} - {} - ({}, {}) Alpha: {}, {}".format(
                            method, ycol_name, yrange[0], yrange[1], alpha,
                            grid_point))
            else:
                # One CV key and one Cal key, mirroring the alpha branch.
                # Both used to be '- Cal -', so the CV key was never
                # registered.
                cvpredictkeys.append(
                    '"{}- CV -{}"'.format(method, grid_point))
                cvpredictkeys.append(
                    '"{}- Cal -{}"'.format(method, grid_point))
                cvmodelkeys.append("{} - {} - ({}, {}) {}".format(
                    method, ycol_name, yrange[0], yrange[1], grid_point))

        for key in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys), key)
            # Need to fill the data frame with dummy values until CV is
            # actually run
            self.data[datakey].df[('predict', key)] = 9999

        for key in cvmodelkeys:
            # NOTE(review): the key is passed to list_amend and then appended
            # as well; confirm this does not register it twice.
            self.list_amend(self.modelkeys, len(self.modelkeys), key)
            self.modelkeys.append(key)
            self.model_xvars[key] = xvars
            self.model_yvars[key] = yvars
            if method != 'GP':
                # Fill with dummy coeffs before model is run
                coef = self.data[datakey].df[
                    xvars[0]].columns.values * 0.0 + 9999
                coef = pd.DataFrame(coef)
                coef.index = pd.MultiIndex.from_tuples(
                    self.data[datakey].df[xvars].columns.values)
                coef = coef.T
                coef[('meta', 'Model')] = key
                try:
                    # Fill intercept with zeros prior to model run
                    coef[('meta', 'Intercept')] = 0
                except Exception:
                    pass
                try:
                    self.data['Model Coefficients'] = spectral_data(
                        pd.concat(
                            [self.data['Model Coefficients'].df, coef]))
                except Exception:
                    # Appending failed (e.g. no table yet): start a new one.
                    self.data['Model Coefficients'] = spectral_data(coef)
                    self.datakeys.append('Model Coefficients')

        self.list_amend(self.datakeys, len(self.datakeys),
                        'CV Results ' + modelkey)
    except Exception:
        # Deliberately best effort: setup must not fail when the widgets or
        # data are not ready yet.
        pass
def run(self):
    """Cross validate every checked regression method on the selected data.

    For each checked method a parameter grid is built from the method's
    widget, ``do_cv`` is run on the rows whose y value lies strictly inside
    the chosen y range, and the results are concatenated into
    ``self.cv_results_combined``.  Models, model keys and predict keys are
    registered on ``self``; coefficients are appended to
    ``self.data['Model Coefficients']`` where they can be extracted.  The
    combined results are stored under a unique 'CV Results - <ycol>' key and
    the final predictions data frame under a '...-CV Predictions' key.

    If no method is checked a message is printed and the method returns
    before any results are stored.
    """
    # clear previous results in case of re-run
    self.cv_results_combined = None
    if 'Model Coefficients' not in self.datakeys:
        Modules.data_count += 1
        self.coef_index = Modules.data_count
        self.list_amend(self.datakeys, self.coef_index, 'Model Coefficients')

    Modules.data_count += 1
    self.results_index = Modules.data_count

    # (checkbox attribute, key in self.alg, run() returns (params, alphas)).
    # The order is the order in which the methods are cross validated.
    # if self.GPcheckBox.isChecked():
    #     paramgrids.append(list(ParameterGrid(self.alg['GP - Gaussian Processes'][0].run())))
    method_table = (
        ('ARDcheckbox', 'ARD', False),
        ('BRRcheckbox', 'BRR', False),
        ('ENetcheckbox', 'Elastic Net', True),
        ('LARScheckbox', 'LARS', False),
        ('LASSOcheckBox', 'LASSO', True),
        ('OLScheckBox', 'OLS', False),
        ('OMPcheckBox', 'OMP', False),
        ('PLScheckBox', 'PLS', False),
        ('RidgecheckBox', 'Ridge', False),
        ('SVRcheckBox', 'SVR', False),
        ('LocalcheckBox', 'Local Regression', False),
        ('GBRcheckBox', 'GBR', False),
        ('RFcheckBox', 'RF', False),
    )
    paramgrids = {}
    for checkbox_name, alg_name, has_alphas in method_table:
        if not getattr(self, checkbox_name).isChecked():
            continue
        params = self.alg[alg_name][0].run()
        if has_alphas:
            # Fold the alpha values into the grid as an ordinary parameter.
            alpha_values = params[1]
            params = params[0]
            params['alpha'] = alpha_values
        paramgrids[alg_name] = list(ParameterGrid(params))

    if not paramgrids:
        # Nothing to cross validate; continuing would fail further down
        # because no predictions data frame is ever produced.
        print('No regression methods selected for cross validation.')
        return

    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]
    y = np.array(self.data[datakey].df[yvars])
    match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    data_for_cv = spectral_data(self.data[datakey].df.loc[match])

    for method, paramgrid in paramgrids.items():
        print('===== Cross validating ' + method + ' =====')
        cv_obj = cv.cv(paramgrid)

        (data_for_cv_out, cv_results, cvmodels, cvmodelkeys,
         cvpredictkeys) = cv_obj.do_cv(
            data_for_cv.df, xcols=xvars, ycol=yvars, yrange=yrange,
            method=method)

        try:
            cv_results[('cv', 'Data_file')] = self.datafiles[datakey]
        except Exception:
            # Best effort: the data file column is simply left out.
            pass
        cv_results[('cv', 'ymin')] = yrange[0]
        cv_results[('cv', 'ymax')] = yrange[1]
        cv_results[('cv', 'ycol')] = yvars[0][1]

        # The next method runs on the frame returned by this one.
        data_for_cv = spectral_data(data_for_cv_out)

        # On the first pass cv_results_combined is None, which pd.concat
        # drops silently.
        self.cv_results_combined = pd.concat(
            (self.cv_results_combined, cv_results))

        for predictkey in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys),
                            predictkey)

        for n, cvmodelkey in enumerate(cvmodelkeys):
            Modules.model_count += 1
            self.list_amend(self.modelkeys, Modules.model_count, cvmodelkey)
            self.models[cvmodelkey] = cvmodels[n]
            self.model_xvars[cvmodelkey] = xvars
            self.model_yvars[cvmodelkey] = yvars
            if method != 'GP':
                try:
                    coef = np.squeeze(cvmodels[n].model.coef_)
                    coef = pd.DataFrame(coef)
                    coef.index = pd.MultiIndex.from_tuples(
                        self.data[datakey].df[xvars].columns.values)
                    coef = coef.T
                    coef[('meta', 'Model')] = cvmodelkey
                    try:
                        coef[('meta', 'Intercept')] = \
                            cvmodels[n].model.intercept_
                    except Exception:
                        pass
                    try:
                        self.data['Model Coefficients'] = spectral_data(
                            pd.concat(
                                [self.data['Model Coefficients'].df, coef]))
                    except Exception:
                        # Appending failed (e.g. no table yet): start a new
                        # one.
                        self.data['Model Coefficients'] = spectral_data(coef)
                except Exception:
                    # Best effort: no coefficient row is stored for a model
                    # whose coefficients cannot be extracted.
                    pass

    # Pick a 'CV Results - ...' key that is not in use yet by adding a
    # counter.
    number = 1
    cvid = str('CV Results - ' + yvars[0][1])
    while cvid in self.datakeys:
        number += 1
        cvid = str('CV Results - ' + yvars[0][1]) + ' - ' + str(number)
    self.list_amend(self.datakeys, self.results_index, cvid)
    self.data[cvid] = spectral_data(self.cv_results_combined)

    Modules.data_count += 1
    new_datakey = (datakey + '-' + str(yvars) + ' ' + str(yrange)
                   + '-CV Predictions')
    self.list_amend(self.datakeys, Modules.data_count, new_datakey)
    self.data[new_datakey] = spectral_data(data_for_cv_out)
def run(self):
    """Cross validate every checked regression method on the selected data.

    For each checked method a parameter grid is built from the method's
    widget ('Elastic Net' and 'LASSO' keep their alpha values separate so
    the path can be calculated), ``do_cv`` is run on the rows whose y value
    lies strictly inside the chosen y range, and the results are
    concatenated into ``self.cv_results_combined``.  Models, model keys and
    predict keys are registered on ``self``; coefficients of every non-'GP'
    model are appended to ``self.data['Model Coefficients']``; and the
    combined results are stored under a unique 'CV Results - <ycol>' key.
    """
    # (checkbox attribute, key in self.alg, run() returns (params, alphas)).
    # The order is the order in which the methods are cross validated.
    # if self.GPcheckBox.isChecked():
    #     paramgrids.append(list(ParameterGrid(self.alg['GP - Gaussian Processes'][0].run())))
    method_table = (
        ('ARDcheckbox', 'ARD', False),
        ('BRRcheckbox', 'BRR', False),
        ('ENetcheckbox', 'Elastic Net', True),
        ('LARScheckbox', 'LARS', False),
        ('LASSOcheckBox', 'LASSO', True),
        ('OLScheckBox', 'OLS', False),
        ('OMPcheckBox', 'OMP', False),
        ('PLScheckBox', 'PLS', False),
        ('RidgecheckBox', 'Ridge', False),
        ('SVRcheckBox', 'SVR', False),
        ('LocalcheckBox', 'Local Regression', False),
    )
    paramgrids = {}
    for checkbox_name, alg_name, has_alphas in method_table:
        if not getattr(self, checkbox_name).isChecked():
            continue
        alg_output = self.alg[alg_name][0].run()
        if has_alphas:
            paramgrids[alg_name] = {
                'alphas': alg_output[1],
                'params': list(ParameterGrid(alg_output[0])),
            }
        else:
            paramgrids[alg_name] = list(ParameterGrid(alg_output))

    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]
    y = np.array(self.data[datakey].df[yvars])
    match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    # DataFrame.ix no longer exists in current pandas; .loc accepts the same
    # boolean mask.
    data_for_cv = spectral_data(self.data[datakey].df.loc[match])

    # if the method supports it, separate out alpha from the other
    # parameters and prepare for calculating path
    path_methods = ['Elastic Net', 'LASSO']  # , 'Ridge']

    for method, grid_entry in paramgrids.items():
        print('===== Cross validating ' + method + ' =====')
        if method in path_methods:
            calc_path = True
            alphas = grid_entry['alphas']
            paramgrid = grid_entry['params']
        else:
            alphas = None
            calc_path = False
            paramgrid = grid_entry

        progbar = QtWidgets.QProgressBar()
        cv_obj = cv.cv(paramgrid, progressbar=progbar)
        (self.data[datakey].df, cv_results, cvmodels, cvmodelkeys,
         cvpredictkeys) = cv_obj.do_cv(
            data_for_cv.df, xcols=xvars, ycol=yvars, yrange=yrange,
            method=method, alphas=alphas, calc_path=calc_path)

        try:
            self.cv_results_combined = pd.concat(
                (self.cv_results_combined, cv_results))
        except Exception:
            # Nothing to append to yet (or appending failed): start from
            # this method's results.
            self.cv_results_combined = cv_results

        for predictkey in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys),
                            predictkey)

        for n, cvmodelkey in enumerate(cvmodelkeys):
            self.list_amend(self.modelkeys, len(self.modelkeys), cvmodelkey)
            self.models[cvmodelkey] = cvmodels[n]
            self.model_xvars[cvmodelkey] = xvars
            self.model_yvars[cvmodelkey] = yvars
            if method != 'GP':
                coef = np.squeeze(cvmodels[n].model.coef_)
                coef = pd.DataFrame(coef)
                coef.index = pd.MultiIndex.from_tuples(
                    self.data[datakey].df[xvars].columns.values)
                coef = coef.T
                coef[('meta', 'Model')] = cvmodelkey
                try:
                    coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
                except Exception:
                    # Best effort: the intercept column is simply left out.
                    pass
                try:
                    self.data['Model Coefficients'] = spectral_data(
                        pd.concat([self.data['Model Coefficients'].df, coef]))
                except Exception:
                    # Appending failed (e.g. no table yet): start a new one.
                    self.data['Model Coefficients'] = spectral_data(coef)
                    self.datakeys.append('Model Coefficients')

    # Pick a 'CV Results - ...' key that is not in use yet by adding a
    # counter.
    number = 1
    cvid = str('CV Results - ' + yvars[0][1])
    while cvid in self.datakeys:
        number += 1
        cvid = str('CV Results - ' + yvars[0][1]) + ' - ' + str(number)
    self.datakeys.append(cvid)
    self.data[cvid] = spectral_data(self.cv_results_combined)