def test_icp_regression_tree(self):
    """Print ICP regression intervals for a decision tree, plain and normalized.

    The data set is shuffled once and cut into three consecutive thirds
    (training, calibration, test).  An inductive conformal regressor is then
    fitted, calibrated and used to predict at significance 0.1 twice: first
    with a plain absolute-error nonconformity function, then with the same
    function combined with a 1-nearest-neighbour normalizer.  Each run prints
    a table with the columns ``min``, ``max``, ``truth`` and ``size``.
    """
    # NOTE(review): load_boston is no longer shipped by recent scikit-learn
    # releases -- confirm the pinned version before relying on this test.
    data = load_boston()
    features, target = data.data, data.target

    # -------------------------------------------------------------------------
    # Training, calibration and test indices
    # -------------------------------------------------------------------------
    shuffled = np.random.permutation(target.size)
    first_cut = int(shuffled.size / 3)
    second_cut = int(2 * shuffled.size / 3)
    train = shuffled[:first_cut]
    calibrate = shuffled[first_cut:second_cut]
    test = shuffled[second_cut:]

    def fit_and_report(nc_function):
        # Train, calibrate and predict with the given nonconformity
        # function, then print the interval table.
        icp = IcpRegressor(nc_function)
        icp.fit(features[train, :], target[train])
        icp.calibrate(features[calibrate, :], target[calibrate])

        intervals = icp.predict(features[test, :], significance=0.1)
        widths = intervals[:, 1] - intervals[:, 0]
        table = np.vstack([intervals.T, target[test], widths.T]).T
        print(pd.DataFrame(table, columns=["min", "max", "truth", "size"]))

    # -------------------------------------------------------------------------
    # Without normalization
    # -------------------------------------------------------------------------
    plain_tree = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
    fit_and_report(RegressorNc(plain_tree, AbsErrorErrFunc()))

    # -------------------------------------------------------------------------
    # With normalization
    # -------------------------------------------------------------------------
    normalized_tree = RegressorAdapter(
        DecisionTreeRegressor(min_samples_leaf=5))
    neighbour_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=1))
    normalizer = RegressorNormalizer(normalized_tree, neighbour_model,
                                     AbsErrorErrFunc())
    fit_and_report(RegressorNc(normalized_tree, AbsErrorErrFunc(), normalizer))
def test_acp_regression_tree(self):
    """Fit three aggregated conformal regressors and print their results.

    The diabetes data is shuffled once; the first two thirds are used for
    training and the rest for testing.  One ``AggregatedCp`` is built per
    sampler (random sub-sampling, cross sampling, bootstrap), each around a
    decision-tree ICP regressor.  For every model the name, the error rate
    reported by ``reg_mean_errors`` and a ``min``/``max``/``truth`` table at
    significance 0.1 are printed.
    """
    # -------------------------------------------------------------------------
    # Experiment setup
    # -------------------------------------------------------------------------
    data = load_diabetes()
    shuffled = np.random.permutation(data.target.size)
    cut = int(2 * shuffled.size / 3)
    train, test = shuffled[:cut], shuffled[cut:]
    truth = data.target[test]
    columns = ["min", "max", "truth"]
    significance = 0.1

    # -------------------------------------------------------------------------
    # Define models: one aggregated predictor per sampling strategy
    # -------------------------------------------------------------------------
    sampler_classes = (
        ("ACP-RandomSubSampler", RandomSubSampler),
        ("ACP-CrossSampler", CrossSampler),
        ("ACP-BootstrapSampler", BootstrapSampler),
    )
    models = {
        label: AggregatedCp(
            IcpRegressor(
                RegressorNc(RegressorAdapter(DecisionTreeRegressor()))),
            sampler_cls(),
        )
        for label, sampler_cls in sampler_classes
    }

    # -------------------------------------------------------------------------
    # Train, predict and evaluate
    # -------------------------------------------------------------------------
    test_features = data.data[test, :]
    for name, model in models.items():
        model.fit(data.data[train, :], data.target[train])

        # Prediction without a significance level feeds the error-rate
        # metric; the one with a significance level gives the intervals.
        prediction = model.predict(test_features)
        prediction_sign = model.predict(test_features,
                                        significance=significance)

        df = pd.DataFrame(np.vstack((prediction_sign.T, truth)).T,
                          columns=columns)
        print("\n{}".format(name))
        print("Error rate: {}".format(
            reg_mean_errors(prediction, truth, significance)))
        print(df)
def CF_QuanVal(X, Y, estimator, conformalSignificance):
    """Fit a bootstrap-aggregated conformal regressor and print its intervals.

    The first 30 rows of ``X``/``Y`` are used for fitting; the remaining
    rows are predicted twice: once without a significance level and once at
    a fixed significance of 0.25.  The mean interval width, the individual
    widths and both prediction arrays are printed.

    Parameters
    ----------
    X, Y :
        Feature matrix and target vector; both are sliced with ``[:30]`` and
        ``[30:]``.
    estimator :
        Underlying regressor, wrapped in a ``RegressorAdapter``.
    conformalSignificance :
        Currently unused.
        NOTE(review): the significance is hard-coded to 0.25 below --
        confirm whether this argument was meant to be used instead.

    Returns
    -------
    The fitted ``AggregatedCp`` instance.
    """
    print("Starting quantitative conformal prediction validation")
    icp = AggregatedCp(
        IcpRegressor(RegressorNc(RegressorAdapter(estimator))),
        BootstrapSampler())

    # Alternative set-up (normalized nonconformity + cross-validated
    # scoring), kept for reference:
    # icp = AggregatedCp(IcpRegressor(RegressorNc(RegressorAdapter(estimator),
    #     AbsErrorErrFunc(), RegressorNormalizer(estimator,
    #     RegressorAdapter(copy.copy(estimator)), AbsErrorErrFunc()))))
    # icp_cv = RegIcpCvHelper(icp)
    # scores = conformal_cross_val_score(icp_cv,
    #     X,
    #     Y,
    #     iterations=5,
    #     folds=5,
    #     scoring_funcs=[reg_mean_errors, reg_median_size, reg_mean_size],
    #     significance_levels=[0.05, 0.1, 0.2, conformalSignificance])

    icp.fit(X[:30], Y[:30])
    prediction = icp.predict(X[30:])
    prediction_sign = icp.predict(X[30:], significance=0.25)

    # Column 0 is the lower bound and column 1 the upper bound, so the
    # width is upper - lower.  The previous lower - upper produced negative
    # widths.
    interval = prediction_sign[:, 1] - prediction_sign[:, 0]
    print(np.mean(interval))
    print(interval)
    print("\n")
    print(prediction)
    print(prediction_sign)
    return icp
def CF_quantitative_validation(self):
    ''' Performs internal validation for conformal quantitative models.

    For each entry in ``seeds`` the data is split 75/25, a bootstrap
    aggregated conformal regressor is fitted on the training part and the
    test part is predicted at ``self.conformalSignificance``.  The mean
    interval width and the fraction of test values lying strictly inside
    their interval are averaged over the runs, rounded to two decimals and
    stored in ``self.conformal_mean_interval`` and
    ``self.conformal_accuracy``.

    Returns
    -------
    (True, (results, )) where ``results`` is a list of
    (key, description, value) tuples.

    Raises
    ------
    Any exception raised while splitting, fitting or predicting is logged
    and re-raised.
    '''
    # Make a copy of original matrices.
    X = self.X.copy()
    Y = self.Y.copy()

    # One external validation per seed.
    seeds = [5, 7, 35]
    # Interval means for each aggregated conformal estimator
    interval_means = []
    # Accuracies for each aggregated conformal estimator
    accuracies = []
    results = []

    try:
        for seed in seeds:
            # Generate training and test sets.
            # NOTE(review): with shuffle=False scikit-learn takes the split
            # in order and does not use random_state, so every iteration
            # sees the same partition -- confirm this is intended.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=0.25, random_state=seed, shuffle=False)

            # Create the aggregated conformal regressor.
            conformal_pred = AggregatedCp(
                IcpRegressor(RegressorNc(RegressorAdapter(
                    self.estimator))),
                BootstrapSampler())

            # Fit conformal regressor to the data
            conformal_pred.fit(X_train, Y_train)

            # Perform prediction on test set
            prediction = conformal_pred.predict(
                X_test, self.conformalSignificance)
            lower, upper = prediction[:, 0], prediction[:, 1]

            # Mean interval width.  The previous |lower| - |upper| is not a
            # width: it is 0 for [-1, 1] and negative for positive bounds.
            interval_means.append(np.mean(np.abs(upper - lower)))

            Y_test = Y_test.reshape(-1, 1)
            # Boolean mask of instances strictly inside their interval.
            inside_interval = ((lower.reshape(-1, 1) < Y_test) &
                               (upper.reshape(-1, 1) > Y_test))
            # Fraction of instances inside the interval.
            accuracy = np.sum(inside_interval) / len(Y_test)
            accuracies.append(accuracy)
    except Exception as e:
        LOG.error(f'Quantitative conformal validation'
                  f' failed with exception: {e}')
        raise e

    # Compute mean interval_means and accuracy.
    interval_means = np.mean(interval_means)
    accuracies = np.mean(accuracies)

    # Cut into two decimals.
    self.conformal_accuracy = float("{0:.2f}".format(accuracies))
    self.conformal_mean_interval = float("{0:.2f}".format(interval_means))

    # Add quality metrics to results.
    results.append(('Conformal_mean_interval',
                    'Conformal mean interval',
                    self.conformal_mean_interval))
    results.append(('Conformal_accuracy',
                    'Conformal accuracy',
                    self.conformal_accuracy))
    return True, (results, )
def build(self):
    """Build a quantitative PLSR model, optionally tuned and/or conformal.

    Returns
    -------
    (True, results) on success, where ``results`` is a list of
    (key, description, value) tuples, or (False, message) when the model is
    not quantitative or ``self.failed`` is set.
    """
    if not self.quantitative:
        print("PLSR only applies to quantitative data")
        return False, "PLSR only applies to quantitative data"

    if self.failed:
        return False, "Error initiating model"

    X = self.X.copy()
    Y = self.Y.copy()

    results = []
    results.append(('nobj', 'number of objects', self.nobj))
    results.append(('nvarx', 'number of predictor variables', self.nvarx))

    if self.cv:
        self.cv = getCrossVal(self.cv, 46, self.n, self.p)

    if self.tune:
        if self.optimiz == 'auto':
            super(PLSR, self).optimize(X, Y, PLS_r(
                **self.estimator_parameters), self.tune_parameters)
        elif self.optimiz == 'manual':
            self.optimize(X, Y, PLS_r(
                **self.estimator_parameters), self.tune_parameters)
        # NOTE(review): any other value of self.optimiz skips optimization
        # but is still reported as optimized -- confirm.
        results.append(
            ('model', 'model type', 'PLSR quantitative (optimized)'))
    else:
        print("Building Quantitative PLSR")
        self.estimator = PLS_r(**self.estimator_parameters)
        results.append(('model', 'model type', 'PLSR quantitative'))

    if self.conformal:
        underlying_model = RegressorAdapter(self.estimator)
        # The normalizer wraps the same estimator.  An earlier
        # 1-nearest-neighbour assignment was overwritten immediately and
        # has been dropped as dead code.
        normalizing_model = RegressorAdapter(self.estimator)
        normalizer = RegressorNormalizer(
            underlying_model, normalizing_model, AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                           BootstrapSampler())
        # self.conformal_pred = AggregatedCp(IcpRegressor(RegressorNc(RegressorAdapter(self.estimator))),
        #                                    BootstrapSampler())
        self.conformal_pred.fit(X, Y)

        # overrides non-conformal
        results.append(
            ('model', 'model type', 'conformal PLSR quantitative'))

    self.estimator.fit(X, Y)
    return True, results
def CF_QuanCal(X, Y, estimator):
    """Fit an aggregated conformal regressor with a normalized error function.

    The nonconformity function combines an adapter around ``estimator``, an
    absolute-error function and a ``RegressorNormalizer`` built from
    ``estimator`` and a shallow copy of it.

    Parameters
    ----------
    X, Y :
        Training features and targets, passed unchanged to ``fit``.
    estimator :
        Underlying regressor.

    Returns
    -------
    The fitted ``AggregatedCp`` instance.
    """
    nc = RegressorNc(
        RegressorAdapter(estimator),
        AbsErrorErrFunc(),
        RegressorNormalizer(estimator,
                            copy.copy(estimator),
                            AbsErrorErrFunc()))

    # The sampler belongs to AggregatedCp.  A misplaced parenthesis used to
    # pass it as the second positional argument of IcpRegressor, leaving
    # AggregatedCp on whatever its default sampler is.
    acp = AggregatedCp(IcpRegressor(nc), RandomSubSampler())
    acp.fit(X, Y)
    return acp
def __init__(self, model, sklearn_model: bool):
    r"""Wrap `model` in an inductive conformal predictor.

    The model is stored in ``self.model``.  When a matching branch exists,
    an ``IcpClassifier`` or ``IcpRegressor`` built around it is stored in
    ``self.icp``; if no branch matches, ``self.icp`` is left unset.

    Parameters
    ----------
    model:
        Underlying model used to generate predictions and confidence
        intervals.  Per the original documentation this can be a scikit
        learn model, LGBMRegressor, LGBMClassifier, XGBRegressor,
        XGBClassifier, CatBoostRegressor or CatBoostClassifier.  Objects
        whose class is named ``Booster`` are treated as regressors.

    sklearn_model: bool
        Indicates whether the model belongs to scikit learn.  If true the
        nonconformity function comes from ``NcFactory.create_nc``;
        otherwise the model is wrapped in a ``NonConformistAdapter`` first.

    Examples
    --------
    >>> model = lightgbm.LGBMRegressor()
    >>> cp = Adapt_to_CP(model, sklearn_model=False)
    """
    self.model = model

    if sklearn_model:
        # First matching estimator kind wins; classifier is tested first.
        candidates = (
            (is_classifier, IcpClassifier),
            (is_regressor, IcpRegressor),
        )
        for matches, icp_cls in candidates:
            if matches(model):
                self.icp = icp_cls(NcFactory.create_nc(model))
                break
        return

    model_adapter = NonConformistAdapter(model)
    if is_classifier(model):
        self.icp = IcpClassifier(ClassifierNc(model_adapter))
    elif is_regressor(model) or model.__class__.__name__ == "Booster":
        self.icp = IcpRegressor(RegressorNc(model_adapter))
def test_oob_calibration(self):
    """Cross-validate out-of-bag conformal predictors and print mean scores.

    Runs 5x5-fold ``cross_val_score`` at significance levels 0.05, 0.1 and
    0.2 for a random-forest OOB classifier on iris and a random-forest OOB
    regressor on diabetes, printing the scores averaged per significance
    level.
    """

    def report(title, cv_helper, dataset, scoring_funcs):
        # Score the wrapped predictor, then print the mean of each metric
        # per significance level.
        scores = cross_val_score(
            cv_helper,
            dataset.data,
            dataset.target,
            iterations=5,
            folds=5,
            scoring_funcs=scoring_funcs,
            significance_levels=[0.05, 0.1, 0.2],
        )
        print(title)
        per_level = scores.drop(["fold", "iter"], axis=1)
        print(per_level.groupby(["significance"]).mean())

    # -------------------------------------------------------------------------
    # Classification
    # -------------------------------------------------------------------------
    iris = load_iris()
    oob_classifier = OobCpClassifier(
        ClassifierNc(
            OobClassifierAdapter(
                RandomForestClassifier(n_estimators=100, oob_score=True))))
    report("Classification: iris",
           ClassIcpCvHelper(oob_classifier),
           iris,
           [class_mean_errors, class_avg_c])

    # -------------------------------------------------------------------------
    # Regression, absolute error
    # -------------------------------------------------------------------------
    diabetes = load_diabetes()
    oob_regressor = OobCpRegressor(
        RegressorNc(
            OobRegressorAdapter(
                RandomForestRegressor(n_estimators=100, oob_score=True))))
    report("Absolute error regression: diabetes",
           RegIcpCvHelper(oob_regressor),
           diabetes,
           [reg_mean_errors, reg_median_size])
def build(self):
    '''Build a quantitative PLSR model, optionally tuned and/or conformal.

    When the ``tune`` parameter is set, the estimator is optimized with the
    method named by ``estimator_parameters['optimize']`` (``'auto'`` or
    ``'manual'``); otherwise a ``PLS_r`` is instantiated directly.  The
    estimator is then fitted.  When the ``conformal`` parameter is set,
    ``self.estimator`` is replaced by a fitted bootstrap-aggregated
    conformal regressor built around a copy of it (kept in
    ``self.estimator_temp``).

    Returns
    -------
    (True, results) on success, where ``results`` is a list of
    (key, description, value) tuples, or (False, message) on failure.
    '''
    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = []
    results.append(('nobj', 'number of objects', self.nobj))
    results.append(('nvarx', 'number of predictor variables', self.nvarx))

    if self.param.getVal('tune'):
        # A missing 'optimize' key falls through to the "not recognized"
        # branch instead of raising KeyError.
        tune_mode = self.estimator_parameters.get('optimize')

        # Optimize estimator using sklearn-gridsearch
        if tune_mode == 'auto':
            try:
                LOG.info('Optimizing PLSR using SK-LearnGridSearch')
                # Remove optimize key from parameter dictionary
                # to avoid sklearn estimator error (unexpected keyword)
                self.estimator_parameters.pop("optimize", None)
                super(PLSR, self).optimize(
                    X, Y,
                    PLS_r(**self.estimator_parameters),
                    self.param.getDict('PLSR_optimize'))
            except Exception as e:
                LOG.error(f'Error performing SK-LearnGridSearch'
                          f' on PLSR estimator with exception {e}')
                return False, f'Error performing SK-LearnGridSearch on PLSR estimator with exception {e}'

        # Optimize using flame implementation (recommended)
        elif tune_mode == 'manual':
            LOG.info('Optimizing PLSR using manual method')
            # Remove optimize key from parameter dictionary
            # to avoid sklearn estimator error (unexpected keyword)
            self.estimator_parameters.pop("optimize", None)
            success, message = self.optimize(
                X, Y,
                PLS_r(**self.estimator_parameters),
                self.param.getDict('PLSR_optimize'))
            if not success:
                return False, message

        else:
            LOG.error('Type of tune not recognized, check the input')
            return False, 'Type of tune not recognized, check the input'

        results.append(('model', 'model type',
                        'PLSR quantitative (optimized)'))
    else:
        LOG.info('Building Quantitative PLSR with no optimization')
        try:
            # Remove optimize key from parameters, as the sklearn estimator
            # does not accept it.  An absent key is not an error.
            self.estimator_parameters.pop("optimize", None)
            self.estimator = PLS_r(**self.estimator_parameters)
        except Exception as e:
            LOG.error(f'Error at PLS_r instantiation with '
                      f'exception {e}')
            # The message used to name PLS_da (copy-paste slip); this
            # branch instantiates PLS_r.
            return False, f'Error at PLS_r instantiation with exception {e}'
        results.append(('model', 'model type', 'PLSR quantitative'))

    # Fit estimator to the data
    self.estimator.fit(X, Y)

    if not self.param.getVal('conformal'):
        return True, results

    self.estimator_temp = copy(self.estimator)
    try:
        LOG.info('Building PLSR aggregated conformal predictor')

        underlying_model = RegressorAdapter(self.estimator_temp)
        # normalizing_model = RegressorAdapter(
        #     KNeighborsRegressor(n_neighbors=1))
        normalizing_model = RegressorAdapter(self.estimator_temp)
        normalizer = RegressorNormalizer(underlying_model,
                                         normalizing_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        self.estimator = AggregatedCp(IcpRegressor(nc), BootstrapSampler())
    except Exception as e:
        LOG.error(f'Error building aggregated PLSR conformal'
                  f' regressor with exception: {e}')
        return False, f'Error building aggregated PLSR conformal regressor with exception: {e}'

    # self.conformal_pred = AggregatedCp(IcpRegressor(
    #     RegressorNc(RegressorAdapter(self.estimator))),
    #     BootstrapSampler())

    # Fit conformal estimator to the data
    self.estimator.fit(X, Y)

    # overrides non-conformal
    results.append(('model', 'model type', 'conformal PLSR quantitative'))
    return True, results
def build(self):
    '''Build a new XGBOOST model with the X and Y numpy matrices.

    Depending on the ``tune``, ``quantitative`` and ``conformal``
    parameters, an ``XGBRegressor`` or ``XGBClassifier`` is optimized or
    fitted directly.  For conformal models ``self.estimator`` is then
    replaced by a fitted bootstrap-aggregated conformal predictor built
    around a copy of it (kept in ``self.estimator_temp``).

    Returns
    -------
    (True, results) on success, where ``results`` is a list of
    (key, description, value) tuples, or (False, message) on failure.
    '''
    try:
        from xgboost.sklearn import XGBClassifier
        from xgboost.sklearn import XGBRegressor
    except Exception:
        return False, 'XGboost not found, please revise your environment'

    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = []
    results.append(('nobj', 'number of objects', self.nobj))
    results.append(('nvarx', 'number of predictor variables', self.nvarx))

    # If tune then call gridsearch to optimize the estimator
    if self.param.getVal('tune'):
        LOG.info("Optimizing XGBOOST estimator")
        try:
            # Check type of model
            if self.param.getVal('quantitative'):
                self.estimator = XGBRegressor(**self.estimator_parameters)
                self.optimize(X, Y, self.estimator, self.tune_parameters)
                results.append(('model', 'model type',
                                'XGBOOST quantitative (optimized)'))
            else:
                # A former `params['num_class'] = 2` edited a dict returned
                # by get_params() that was then discarded, so it had no
                # effect and has been removed.
                self.estimator = XGBClassifier(**self.estimator_parameters)
                self.optimize(X, Y, self.estimator, self.tune_parameters)
                results.append(('model', 'model type',
                                'XGBOOST qualitative (optimized)'))
        except Exception as e:
            return False, f'Exception optimizing XGBOOST estimator with exception {e}'
    else:
        try:
            if self.param.getVal('quantitative'):
                LOG.info("Building Quantitative XGBOOST model")
                # params = {
                #     'objective': 'reg:squarederror',
                #     'missing': -99.99999,
                #     # 'max_depth': 20,
                #     # 'learning_rate': 1.0,
                #     # 'silent': 1,
                #     # 'n_estimators': 25
                # }
                # self.estimator = XGBRegressor(**params)
                self.estimator = XGBRegressor(**self.estimator_parameters)
                results.append(('model', 'model type',
                                'XGBOOST quantitative'))
            else:
                LOG.info("Building Qualitative XGBOOST model")
                # params = {
                #     'objective': 'binary:logistic',
                #     'max_depth': 3,
                #     # 'learning_rate': 0.7,
                #     # 'silent': 1,
                #     'n_estimators': 100
                # }
                self.estimator = XGBClassifier(**self.estimator_parameters)
                results.append(('model', 'model type',
                                'XGBOOST qualitative'))

            self.estimator.fit(X, Y)
            print(self.estimator)
        except Exception as e:
            # A stray `raise e` used to make this error return unreachable.
            LOG.error(f'Exception building XGBOOST estimator'
                      f' with exception {e}')
            return False, f'Exception building XGBOOST estimator with exception {e}'

    self.estimator_temp = copy(self.estimator)

    if not self.param.getVal('conformal'):
        return True, results

    # Create the conformal estimator
    try:
        # Conformal regressor
        if self.param.getVal('quantitative'):
            LOG.info("Building conformal Quantitative XGBOOST model")

            underlying_model = RegressorAdapter(self.estimator_temp)
            # normalizing_model = RegressorAdapter(
            #     KNeighborsRegressor(n_neighbors=5))
            normalizing_model = RegressorAdapter(self.estimator_temp)
            normalizer = RegressorNormalizer(
                underlying_model, normalizing_model, AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                             normalizer)

            # self.conformal_pred = AggregatedCp(IcpRegressor
            # (RegressorNc(RegressorAdapter(self.estimator))),
            #                                    BootstrapSampler())
            self.estimator = AggregatedCp(IcpRegressor(nc),
                                          BootstrapSampler())
            self.estimator.fit(X, Y)
            results.append(('model', 'model type',
                            'conformal XGBOOST quantitative'))

        # Conformal classifier
        else:
            LOG.info("Building conformal Qualitative XGBOOST model")

            self.estimator = AggregatedCp(
                IcpClassifier(
                    ClassifierNc(
                        ClassifierAdapter(self.estimator_temp),
                        MarginErrFunc()
                    )
                ),
                BootstrapSampler())

            # Fit estimator to the data
            self.estimator.fit(X, Y)
            results.append(('model', 'model type',
                            'conformal XGBOOST qualitative'))
    except Exception as e:
        # As above: the error return is restored in place of `raise e`.
        LOG.error(f'Exception building conformal XGBOOST estimator'
                  f' with exception {e}')
        return False, f'Exception building conformal XGBOOST estimator with exception {e}'

    return True, results

## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
def build(self):
    '''Build a new DL model with the X and Y numpy matrices.

    Depending on the ``tune``, ``quantitative`` and ``conformal``
    parameters, a ``KerasRegressor`` or ``KerasClassifier`` is optimized or
    fitted directly.  For conformal models ``self.estimator`` is then
    replaced by a fitted bootstrap-aggregated conformal predictor built
    around a clone of it (kept in ``self.estimator_temp``).

    Returns
    -------
    (True, results) on success, where ``results`` is a list of
    (key, description, value) tuples, or (False, message) on failure.
    '''
    try:
        from keras.wrappers.scikit_learn import KerasClassifier
        from keras.wrappers.scikit_learn import KerasRegressor
    except Exception:
        return False, 'Keras not found, please revise your environment'

    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = []
    results.append(('nobj', 'number of objects', self.nobj))
    results.append(('nvarx', 'number of predictor variables', self.nvarx))

    # If tune then call gridsearch to optimize the estimator
    if self.param.getVal('tune'):
        LOG.info("Optimizing Keras estimator")
        try:
            # Check type of model
            if self.param.getVal('quantitative'):
                self.estimator = KerasRegressor(**self.estimator_parameters)
                self.optimize(X, Y, self.estimator, self.tune_parameters)
                results.append(('model', 'model type',
                                'KERAS quantitative (optimized)'))
            else:
                self.estimator = KerasClassifier(
                    **self.estimator_parameters)
                # params = self.estimator.get_params()
                # params['num_class'] = 2
                self.optimize(X, Y, self.estimator, self.tune_parameters)
                results.append(('model', 'model type',
                                'KERAS qualitative (optimized)'))
        except Exception as e:
            return False, f'Exception optimizing KERAS estimator with exception {e}'
    else:
        try:
            if self.param.getVal('quantitative'):
                LOG.info("Building Quantitative KERAS mode")
                self.estimator = KerasRegressor(
                    build_fn=self.create_model,
                    **self.estimator_parameters,
                    verbose=0)
                results.append(
                    ('model', 'model type', 'Keras quantitative'))
            else:
                LOG.info("Building Qualitative Keras model")
                self.estimator = KerasClassifier(
                    build_fn=self.create_model,
                    dim=self.X.shape[1],
                    **self.estimator_parameters,
                    verbose=0)
                results.append(
                    ('model', 'model type', 'Keras qualitative'))

            self.estimator.fit(X, Y)
            print(self.estimator)
        except Exception as e:
            # A stray `raise e` used to make this error return unreachable.
            LOG.error(f'Exception building Keras estimator'
                      f' with exception {e}')
            return False, f'Exception building Keras estimator with exception {e}'

    self.estimator_temp = clone(self.estimator)

    if not self.param.getVal('conformal'):
        return True, results

    # Create the conformal estimator
    try:
        # Conformal regressor
        if self.param.getVal('quantitative'):
            LOG.info("Building conformal Quantitative Keras model")

            underlying_model = RegressorAdapter(self.estimator_temp)
            normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=15))
            # normalizing_model = RegressorAdapter(self.estimator_temp)
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                             normalizer)

            # self.conformal_pred = AggregatedCp(IcpRegressor
            # (RegressorNc(RegressorAdapter(self.estimator))),
            #                                    BootstrapSampler())
            self.estimator = AggregatedCp(IcpRegressor(nc),
                                          BootstrapSampler())
            self.estimator.fit(X, Y)
            results.append(
                ('model', 'model type', 'conformal Keras quantitative'))

        # Conformal classifier
        else:
            LOG.info("Building conformal Qualitative Keras model")

            self.estimator = AggregatedCp(
                IcpClassifier(
                    ClassifierNc(ClassifierAdapter(self.estimator_temp),
                                 MarginErrFunc())),
                BootstrapSampler())

            # Fit estimator to the data
            print('build finished')
            self.estimator.fit(X, Y)
            results.append(
                ('model', 'model type', 'conformal Keras qualitative'))
    except Exception as e:
        # As above: the error return is restored in place of `raise e`.
        LOG.error(f'Exception building conformal Keras estimator'
                  f' with exception {e}')
        return False, f'Exception building conformal Keras estimator with exception {e}'

    # Return the collected results; an empty list used to be returned here,
    # discarding the entries gathered above.
    return True, results
def test_cross_validation(self):
    """Cross-validate several ICP configurations and print mean scores.

    Each configuration is scored with 5 iterations of 5-fold
    ``cross_val_score`` at significance levels 0.05, 0.1 and 0.2, and the
    scores averaged per significance level are printed.  Covered set-ups:
    a margin-error classifier on iris, and on diabetes a random-forest
    regressor with absolute error, normalized absolute error, signed error
    and normalized signed error.
    """

    def evaluate(title, cv_helper, dataset, scoring_funcs):
        # Score the wrapped predictor, then print the mean of each metric
        # per significance level.
        scores = cross_val_score(
            cv_helper,
            dataset.data,
            dataset.target,
            iterations=5,
            folds=5,
            scoring_funcs=scoring_funcs,
            significance_levels=[0.05, 0.1, 0.2],
        )
        print(title)
        per_level = scores.drop(["fold", "iter"], axis=1)
        print(per_level.groupby(["significance"]).mean())

    regression_metrics = [reg_mean_errors, reg_median_size]

    # -------------------------------------------------------------------------
    # Classification
    # -------------------------------------------------------------------------
    data = load_iris()
    classifier_icp = IcpClassifier(
        ClassifierNc(
            ClassifierAdapter(RandomForestClassifier(n_estimators=100)),
            MarginErrFunc()))
    evaluate("Classification: iris",
             ClassIcpCvHelper(classifier_icp),
             data,
             [class_mean_errors, class_avg_c])

    # -------------------------------------------------------------------------
    # Regression, absolute error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    abs_icp = IcpRegressor(
        RegressorNc(
            RegressorAdapter(RandomForestRegressor(n_estimators=100)),
            AbsErrorErrFunc()))
    evaluate("Absolute error regression: diabetes",
             RegIcpCvHelper(abs_icp),
             data,
             regression_metrics)

    # -------------------------------------------------------------------------
    # Regression, normalized absolute error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    forest = RegressorAdapter(RandomForestRegressor(n_estimators=100))
    difficulty_forest = RegressorAdapter(
        RandomForestRegressor(n_estimators=100))
    abs_normalizer = RegressorNormalizer(forest, difficulty_forest,
                                         AbsErrorErrFunc())
    norm_abs_icp = IcpRegressor(
        RegressorNc(forest, AbsErrorErrFunc(), abs_normalizer))
    evaluate("Normalized absolute error regression: diabetes",
             RegIcpCvHelper(norm_abs_icp),
             data,
             regression_metrics)

    # -------------------------------------------------------------------------
    # Regression, signed error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    signed_icp = IcpRegressor(
        RegressorNc(
            RegressorAdapter(RandomForestRegressor(n_estimators=100)),
            SignErrorErrFunc()))
    evaluate("Signed error regression: diabetes",
             RegIcpCvHelper(signed_icp),
             data,
             regression_metrics)

    # -------------------------------------------------------------------------
    # Regression, normalized signed error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    forest = RegressorAdapter(RandomForestRegressor(n_estimators=100))
    difficulty_forest = RegressorAdapter(
        RandomForestRegressor(n_estimators=100))
    # The normalization model can use a different error function than is
    # used to measure errors on the underlying model
    signed_normalizer = RegressorNormalizer(forest, difficulty_forest,
                                            AbsErrorErrFunc())
    norm_signed_icp = IcpRegressor(
        RegressorNc(forest, SignErrorErrFunc(), signed_normalizer))
    evaluate("Normalized signed error regression: diabetes",
             RegIcpCvHelper(norm_signed_icp),
             data,
             regression_metrics)
idx = np.random.permutation(data.target.size) train = idx[:int(2 * idx.size / 3)] test = idx[int(2 * idx.size / 3):] truth = data.target[test] columns = ['min', 'max', 'truth'] significance = 0.1 # ----------------------------------------------------------------------------- # Define models # ----------------------------------------------------------------------------- models = { 'ACP-RandomSubSampler': AggregatedCp( IcpRegressor(RegressorNc(RegressorAdapter(DecisionTreeRegressor()))), RandomSubSampler()), 'ACP-CrossSampler': AggregatedCp( IcpRegressor(RegressorNc(RegressorAdapter(DecisionTreeRegressor()))), CrossSampler()), 'ACP-BootstrapSampler': AggregatedCp( IcpRegressor(RegressorNc(RegressorAdapter(DecisionTreeRegressor()))), BootstrapSampler()) } # ----------------------------------------------------------------------------- # Train, predict and evaluate # ----------------------------------------------------------------------------- for name, model in models.iteritems():
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv


def split_data(data, n_train, n_test):
    """Randomly split *data* into two parts in the ratio n_train:n_test.

    The first part receives ``n_train * len(data) // (n_train + n_test)``
    randomly chosen rows, the second part all remaining rows.
    """
    total = len(data)
    first_size = n_train * total // (n_train + n_test)
    order = np.random.permutation(total)
    return data[order[:first_size]], data[order[first_size:total]]


# Load the data set and impute missing values.
data = Orange.data.Table("auto-mpg")
imp = Impute()
data = imp(data)

# For each significance level, average the error rate and the interval
# size of an inductive conformal regressor over 10 random splits.
for sig in np.linspace(0.01, 0.1, 10):
    errs = []
    szs = []
    for rep in range(10):
        # 2:1 train/test split, then 2:1 proper-training/calibration split.
        train, test = split_data(data, 2, 1)
        train, calib = split_data(train, 2, 1)

        nc_function = RegressorNc(DecisionTreeRegressor(),
                                  abs_error, abs_error_inv)
        icp = IcpRegressor(nc_function)
        icp.fit(train.X, train.Y)
        icp.calibrate(calib.X, calib.Y)
        pred = icp.predict(test.X, significance=sig)

        # Fraction of true values covered by their predicted interval.
        covered = sum(p[0] <= y <= p[1] for p, y in zip(pred, test.Y))
        acc = covered / len(pred)
        err = 1 - acc
        # Mean interval width.
        total_width = sum(p[1] - p[0] for p in pred)
        sz = total_width / len(pred)

        errs.append(err)
        szs.append(sz)
    print(sig, np.mean(errs), np.mean(szs))
def run_equalized_coverage_experiment(dataset_name,
                                      method,
                                      seed,
                                      save_to_csv=True,
                                      test_ratio=0.2):
    """Run one equalized-coverage experiment and optionally log it to CSV.

    The dataset is split into train/test, the training part is split again
    into a proper-training half and a calibration half, and three conformal
    variants are evaluated for the chosen model family: marginal,
    conditional (joint model) and conditional (one model per group).

    Relies on the module-level names ``condition`` and ``append_statistics``.

    Parameters
    ----------
    dataset_name : string, name handed to ``datasets.GetDataset``
    method : string, ``"net"`` (mean-regression network with signed-error
             scores) or ``"qnet"`` (quantile network with asymmetric
             quantile scores); any other value evaluates nothing
    seed : integer, seeds ``random``, numpy and torch and is also the
           ``random_state`` of the train/test split
    save_to_csv : boolean, append the collected rows to
                  ``./results/results.csv`` (True) or not (False)
    test_ratio : float, ``test_size`` passed to ``train_test_split``

    Returns
    -------
    None
    """
    random_state_train_test = seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # The presence of '/scratch' selects the cluster data location; otherwise
    # the local-machine location is used.
    if os.path.isdir('/scratch'):
        dataset_base_path = '/scratch/users/yromano/data/regression_data/'
    else:
        dataset_base_path = '/Users/romano/mydata/regression_data/'

    # desired miscoverage error
    alpha = 0.1

    # names of the two groups, forwarded to append_statistics
    dataset_name_group_0 = dataset_name + "_non_white"
    dataset_name_group_1 = dataset_name + "_white"

    # load the dataset
    X, y = datasets.GetDataset(dataset_name, dataset_base_path)

    # divide the dataset into test and train based on the test_ratio parameter
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state_train_test)

    # compute input dimensions
    n_train = x_train.shape[0]
    in_shape = x_train.shape[1]

    # divide the data into proper training set and calibration set
    idx = np.random.permutation(n_train)
    n_half = int(np.floor(n_train / 2))
    idx_train, idx_cal = idx[:n_half], idx[n_half:2 * n_half]

    # zero mean and unit variance scaling; the scaler is fitted on the
    # proper training half only and then applied to all features
    scalerX = StandardScaler()
    scalerX = scalerX.fit(x_train[idx_train])
    x_train = scalerX.transform(x_train)
    x_test = scalerX.transform(x_test)

    # responses are modelled on the log(1 + y) scale
    y_train = np.log(1.0 + y_train)
    y_test = np.log(1.0 + y_test)

    # reshape the data
    x_train = np.asarray(x_train)
    y_train = np.squeeze(np.asarray(y_train))
    x_test = np.asarray(x_test)
    y_test = np.squeeze(np.asarray(y_test))

    # display basic information
    print("Dataset: %s" % (dataset_name))
    print(
        "Dimensions: train set (n=%d, p=%d) ; test set (n=%d, p=%d)" %
        (x_train.shape[0], x_train.shape[1], x_test.shape[0],
         x_test.shape[1]))

    dataset_name_vec = []
    method_vec = []
    coverage_vec = []
    length_vec = []
    seed_vec = []
    test_ratio_vec = []

    # Network hyper-parameters, identical for both model families.
    nn_learn_func = torch.optim.Adam  # pytorch's optimizer object
    epochs = 1000                     # number of epochs
    lr = 0.0005                       # learning rate
    batch_size = 64                   # mini-batch size
    hidden_size = 64                  # hidden dimension of the network
    dropout = 0.1                     # dropout regularization rate
    wd = 1e-6                         # weight decay regularization
    cv_test_ratio = 0.1               # held-out ratio handed to the adapters
    cv_random_state = 1               # random_state handed to the adapters
    quantiles_net = [0.05, 0.95]      # quantile levels of the "qnet" family

    def _make_mse_net():
        # Fresh mean-regression network adapter.
        return helper.MSENet_RegressorAdapter(model=None,
                                              fit_params=None,
                                              in_shape=in_shape,
                                              hidden_size=hidden_size,
                                              learn_func=nn_learn_func,
                                              epochs=epochs,
                                              batch_size=batch_size,
                                              dropout=dropout,
                                              lr=lr,
                                              wd=wd,
                                              test_ratio=cv_test_ratio,
                                              random_state=cv_random_state)

    def _make_quantile_net():
        # Fresh quantile-regression network adapter.
        return helper.AllQNet_RegressorAdapter(model=None,
                                               fit_params=None,
                                               in_shape=in_shape,
                                               hidden_size=hidden_size,
                                               quantiles=quantiles_net,
                                               learn_func=nn_learn_func,
                                               epochs=epochs,
                                               batch_size=batch_size,
                                               dropout=dropout,
                                               lr=lr,
                                               wd=wd,
                                               test_ratio=cv_test_ratio,
                                               random_state=cv_random_state,
                                               use_rearrangement=False)

    def _evaluate(method_name, y_lower, y_upper):
        # Compute coverage/length for one band and append it to the
        # result vectors.
        coverage_sample, length_sample = helper.compute_coverage_per_sample(
            y_test, y_lower, y_upper, alpha, method_name, x_test, condition)
        append_statistics(coverage_sample, length_sample, method_name,
                          dataset_name_vec, method_vec, coverage_vec,
                          length_vec, seed_vec, test_ratio_vec, seed,
                          test_ratio, dataset_name_group_0,
                          dataset_name_group_1)

    def _run_variants(make_estimator, make_err_func, marginal_name,
                      joint_name, groupwise_name):
        # Evaluate the marginal, joint-conditional and groupwise-conditional
        # procedures for one model family, in that order.

        # marginal: one model, calibration ignores the group
        nc = RegressorNc(make_estimator(), make_err_func())
        y_lower, y_upper = helper.run_icp(nc, x_train, y_train, x_test,
                                          idx_train, idx_cal, alpha)
        _evaluate(marginal_name, y_lower, y_upper)

        # conditional (joint): one model, `condition` is passed to run_icp
        nc = RegressorNc(make_estimator(), make_err_func())
        y_lower, y_upper = helper.run_icp(nc, x_train, y_train, x_test,
                                          idx_train, idx_cal, alpha,
                                          condition)
        _evaluate(joint_name, y_lower, y_upper)

        # conditional (groupwise): one model per distinct group label
        category_map = np.array([
            condition((x_train[i, :], None))
            for i in range(x_train.shape[0])
        ])
        categories = np.unique(category_map)
        nc_list = [
            RegressorNc(make_estimator(), make_err_func())
            for _ in categories
        ]
        y_lower, y_upper = helper.run_icp_sep(nc_list, x_train, y_train,
                                              x_test, idx_train, idx_cal,
                                              alpha, condition)
        _evaluate(groupwise_name, y_lower, y_upper)

    if method == "net":
        _run_variants(_make_mse_net, SignErrorErrFunc,
                      "Marginal Conformal Neural Network",
                      "Conditional Conformal Neural Network (joint)",
                      "Conditional Conformal Neural Network (groupwise)")
    elif method == "qnet":
        _run_variants(_make_quantile_net, QuantileRegAsymmetricErrFunc,
                      "Marginal CQR Neural Network",
                      "Conditional CQR Neural Network (joint)",
                      "Conditional CQR Neural Network (groupwise)")

    ############### Summary

    coverage_str = 'Coverage (expected ' + str(100 - alpha * 100) + '%)'

    if save_to_csv:
        outdir = './results/'
        # exist_ok avoids failing when the directory appears between a
        # check and the creation (e.g. parallel runs).
        os.makedirs(outdir, exist_ok=True)

        out_name = outdir + 'results.csv'

        df = pd.DataFrame({
            'name': dataset_name_vec,
            'method': method_vec,
            coverage_str: coverage_vec,
            'Avg. Length': length_vec,
            'seed': seed_vec,
            'train test ratio': test_ratio_vec
        })

        # Append to earlier results instead of overwriting them.
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)
def build(self):
    '''Build a new RF model with the X and Y numpy matrices.

    Selects a random-forest regressor (quantitative) or classifier
    (qualitative), optionally tunes it through self.optimize, optionally
    wraps it in an aggregated conformal predictor stored in
    self.conformal_pred, and finally fits self.estimator on the data.

    Returns a bare False when self.failed is set; otherwise the tuple
    (True, results), where results is a list of (key, label, value) tuples.
    NOTE(review): the two return shapes differ -- callers that unpack the
    result must check for False first.
    '''

    if self.failed:
        return False

    # Work on copies of the data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
    ]

    if self.cv:
        self.cv = getCrossVal(self.cv,
                              self.estimator_parameters["random_state"],
                              self.n, self.p)

    if self.tune:
        # presumably self.optimize stores the tuned model in self.estimator,
        # since it is fitted at the end of this method -- TODO confirm
        if self.quantitative:
            self.optimize(X, Y, RandomForestRegressor(),
                          self.tune_parameters)
            results.append(
                ('model', 'model type', 'RF quantitative (optimized)'))
        else:
            self.optimize(X, Y, RandomForestClassifier(),
                          self.tune_parameters)
            results.append(
                ('model', 'model type', 'RF qualitative (optimized)'))
    else:
        if self.quantitative:
            log.info("Building Quantitative RF model")
            # class_weight is dropped before building the regressor
            self.estimator_parameters.pop('class_weight', None)
            self.estimator = RandomForestRegressor(
                **self.estimator_parameters)
            results.append(('model', 'model type', 'RF quantitative'))
        else:
            log.info("Building Qualitative RF model")
            self.estimator = RandomForestClassifier(
                **self.estimator_parameters)
            results.append(('model', 'model type', 'RF qualitative'))

    if self.conformal:
        if self.quantitative:
            underlying_model = RegressorAdapter(self.estimator)
            # The normalizing model wraps the same estimator as the
            # underlying model (a KNN-based adapter was previously built
            # here and immediately overwritten; that dead store is gone).
            normalizing_model = RegressorAdapter(self.estimator)
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                             normalizer)

            self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                               BootstrapSampler())
            self.conformal_pred.fit(X, Y)

            # overrides non-conformal
            results.append(
                ('model', 'model type', 'conformal RF quantitative'))
        else:
            self.conformal_pred = AggregatedCp(
                IcpClassifier(
                    ClassifierNc(ClassifierAdapter(self.estimator),
                                 MarginErrFunc())),
                BootstrapSampler())
            self.conformal_pred.fit(X, Y)

            # overrides non-conformal
            results.append(
                ('model', 'model type', 'conformal RF qualitative'))

    # The plain estimator is always fitted, conformal or not
    self.estimator.fit(X, Y)

    return True, results

#### Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
def run_experiment(dataset_name, test_method, random_state_train_test,
                   save_to_csv=True):
    """ Estimate prediction intervals and print the average length and coverage

    The dataset is split into train/test, the training part is split again
    into a proper-training half and a calibration half, and the requested
    method is evaluated on the test set. A summary table is printed and,
    optionally, the measured rows are appended to ``./results/results.csv``.

    Relies on the module-level names ``base_dataset_path`` and
    ``plot_results``.

    Parameters
    ----------

    dataset_name : string, name of the dataset handed to
                   ``datasets.GetDataset``
    test_method  : string, method to be tested, estimating
                   the 90% prediction interval; one of 'linear',
                   'neural_net', 'random_forest', 'quantile_net',
                   'cqr_quantile_net', 'cqr_asymmetric_quantile_net',
                   'rearrangement', 'cqr_rearrangement',
                   'cqr_asymmetric_rearrangement', 'quantile_forest',
                   'cqr_quantile_forest', 'cqr_asymmetric_quantile_forest'.
                   Any other value evaluates nothing.
    random_state_train_test : integer, random seed to be used
    save_to_csv : boolean, save average length and coverage to csv (True)
                  or not (False)

    Returns
    -------
    None. Returns early, after printing a message, when the dataset cannot
    be loaded.

    """

    dataset_name_vec = []
    method_vec = []
    coverage_vec = []
    length_vec = []
    seed_vec = []

    # (coverage, length) per evaluated method label; methods that were not
    # run show up as 0 in the printed summary.
    stats = {}

    seed = random_state_train_test
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # determines the size of test set
    test_ratio = 0.2

    # conformal prediction miscoverage level
    significance = 0.1
    # desired quantile levels, used by the quantile regression methods
    quantiles = [0.05, 0.95]

    # Random forests parameters (shared by conditional quantile random forests
    # and conditional mean random forests regression).
    n_estimators = 1000  # usual random forests n_estimators parameter
    min_samples_leaf = 1  # default parameter of sklearn

    # Quantile random forests parameters.
    # See QuantileForestRegressorAdapter class for more details
    quantiles_forest = [5, 95]
    CV_qforest = True
    coverage_factor = 0.85
    cv_test_ratio = 0.05
    cv_random_state = 1
    cv_range_vals = 30
    cv_num_vals = 10

    # Neural network parameters (shared by conditional quantile neural network
    # and conditional mean neural network regression)
    # See AllQNet_RegressorAdapter and MSENet_RegressorAdapter in helper.py
    nn_learn_func = torch.optim.Adam
    epochs = 1000
    lr = 0.0005
    hidden_size = 64
    batch_size = 64
    dropout = 0.1
    wd = 1e-6

    # Ask for a reduced coverage when tuning the network parameters by
    # cross-validation to avoid too conservative initial estimation of the
    # prediction interval. This estimation will be conformalized by CQR.
    quantiles_net = [0.1, 0.9]

    # local conformal prediction parameter.
    # See RegressorNc class for more details.
    beta = 1
    beta_net = 1

    # local conformal prediction parameter. The local ridge regression method
    # uses nearest neighbor regression as the MAD estimator.
    # Number of neighbors used by nearest neighbor regression.
    n_neighbors = 11

    print(dataset_name)
    sys.stdout.flush()

    try:
        # load the dataset
        X, y = datasets.GetDataset(dataset_name, base_dataset_path)
    except Exception as exc:
        # `except Exception` (not a bare except) so that KeyboardInterrupt
        # and SystemExit still propagate.
        print("CANNOT LOAD DATASET!")
        print("Reason: %s" % exc)
        return

    # Dataset is divided into test and train data based on test_ratio parameter
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state_train_test)

    # zero mean and unit variance scaling of the train and test features
    scalerX = StandardScaler()
    scalerX = scalerX.fit(X_train)
    X_train = scalerX.transform(X_train)
    X_test = scalerX.transform(X_test)

    # scale the labels by dividing each by the mean absolute response
    max_ytrain = np.mean(np.abs(y_train))
    y_train = y_train/max_ytrain
    y_test = y_test/max_ytrain

    # fit a simple ridge regression model (sanity check)
    model = linear_model.RidgeCV()
    model = model.fit(X_train, y_train)
    predicted_data = model.predict(X_test).astype(np.float32)

    # calculate the normalized mean squared error
    print("Ridge relative error: %f" %
          (np.sum((y_test-predicted_data)**2)/np.sum(y_test**2)))
    sys.stdout.flush()

    # reshape the data
    X_train = np.asarray(X_train)
    y_train = np.squeeze(np.asarray(y_train))
    X_test = np.asarray(X_test)
    y_test = np.squeeze(np.asarray(y_test))

    # input dimensions
    n_train = X_train.shape[0]
    in_shape = X_train.shape[1]

    print("Size: train (%d, %d), test (%d, %d)" %
          (X_train.shape[0], X_train.shape[1],
           X_test.shape[0], X_test.shape[1]))
    sys.stdout.flush()

    # set seed for splitting the data into proper train and calibration
    np.random.seed(seed)
    idx = np.random.permutation(n_train)

    # divide the data into proper training set and calibration set
    n_half = int(np.floor(n_train/2))
    idx_train, idx_cal = idx[:n_half], idx[n_half:2*n_half]

    def _make_mse_net():
        # Fresh conditional-mean network adapter.
        return helper.MSENet_RegressorAdapter(model=None,
                                              fit_params=None,
                                              in_shape=in_shape,
                                              hidden_size=hidden_size,
                                              learn_func=nn_learn_func,
                                              epochs=epochs,
                                              batch_size=batch_size,
                                              dropout=dropout,
                                              lr=lr,
                                              wd=wd,
                                              test_ratio=cv_test_ratio,
                                              random_state=cv_random_state)

    def _make_quantile_net(net_quantiles, use_rearrangement):
        # Fresh conditional-quantile network adapter.
        return helper.AllQNet_RegressorAdapter(
            model=None,
            fit_params=None,
            in_shape=in_shape,
            hidden_size=hidden_size,
            quantiles=net_quantiles,
            learn_func=nn_learn_func,
            epochs=epochs,
            batch_size=batch_size,
            dropout=dropout,
            lr=lr,
            wd=wd,
            test_ratio=cv_test_ratio,
            random_state=cv_random_state,
            use_rearrangement=use_rearrangement)

    def _make_quantile_forest(forest_quantiles, use_cv):
        # Fresh quantile random forest adapter.
        # NOTE(review): "random_state" used to be set to 0 and then
        # overwritten with cv_random_state under the same key, so
        # cv_random_state is the value the adapter has always received.
        # Confirm whether two separate seeds were intended.
        params_qforest = {
            "random_state": cv_random_state,
            "min_samples_leaf": min_samples_leaf,
            "n_estimators": n_estimators,
            "max_features": X_train.shape[1],
            "CV": use_cv,
            "coverage_factor": coverage_factor,
            "test_ratio": cv_test_ratio,
            "range_vals": cv_range_vals,
            "num_vals": cv_num_vals,
        }
        return helper.QuantileForestRegressorAdapter(
            model=None,
            fit_params=None,
            quantiles=forest_quantiles,
            params=params_qforest)

    def _make_forest():
        # Fresh conditional-mean random forest.
        return RandomForestRegressor(n_estimators=n_estimators,
                                     min_samples_leaf=min_samples_leaf,
                                     random_state=0)

    def _conformalize(nc):
        # Run split conformal prediction with the given nonconformity scorer.
        return helper.run_icp(nc, X_train, y_train, X_test,
                              idx_train, idx_cal, significance)

    def _predict_band(model_full):
        # Fit on the whole training set and read the band off the two
        # prediction columns (no conformal calibration).
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        return tmp[:, 0], tmp[:, 1]

    def _record(method_label, y_lower, y_upper):
        # Plot (if enabled), score and store one prediction band. The same
        # label is used for the plot, the coverage report and the CSV row.
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, method_label)
        coverage, length = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, method_label)
        dataset_name_vec.append(dataset_name)
        method_vec.append(method_label)
        coverage_vec.append(coverage)
        length_vec.append(length)
        seed_vec.append(seed)
        stats[method_label] = (coverage, length)

    ######################## Linear

    if 'linear' == test_method:

        nc = RegressorNc(linear_model.RidgeCV())
        y_lower, y_upper = _conformalize(nc)
        _record('Ridge', y_lower, y_upper)

        # locally adaptive variant with a nearest-neighbour normalizer
        nc = NcFactory.create_nc(
            linear_model.RidgeCV(),
            normalizer_model=KNeighborsRegressor(n_neighbors=n_neighbors)
        )
        y_lower, y_upper = _conformalize(nc)
        _record('Ridge-L', y_lower, y_upper)

    ######################### Neural net

    elif 'neural_net' == test_method:

        nc = RegressorNc(_make_mse_net())
        y_lower, y_upper = _conformalize(nc)
        _record('Net', y_lower, y_upper)

        # locally adaptive variant: a second network is the normalizer
        normalizer_adapter = _make_mse_net()
        adapter = _make_mse_net()
        normalizer = RegressorNormalizer(adapter,
                                         normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer,
                         beta=beta_net)
        y_lower, y_upper = _conformalize(nc)
        _record('Net-L', y_lower, y_upper)

    ################## Random Forest

    elif 'random_forest' == test_method:

        nc = RegressorNc(_make_forest(), AbsErrorErrFunc())
        y_lower, y_upper = _conformalize(nc)
        _record('RF', y_lower, y_upper)

        # locally adaptive variant: a second forest is the normalizer
        normalizer_adapter = _make_forest()
        adapter = _make_forest()
        normalizer = RegressorNormalizer(adapter,
                                         normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta)
        y_lower, y_upper = _conformalize(nc)
        _record('RF-L', y_lower, y_upper)

    ################## Quantile Net

    elif 'quantile_net' == test_method:

        y_lower, y_upper = _predict_band(
            _make_quantile_net(quantiles, False))
        _record('QNet', y_lower, y_upper)

    elif 'cqr_quantile_net' == test_method:

        nc = RegressorNc(_make_quantile_net(quantiles_net, False),
                         QuantileRegErrFunc())
        y_lower, y_upper = _conformalize(nc)
        _record('CQR Net', y_lower, y_upper)

    elif 'cqr_asymmetric_quantile_net' == test_method:

        nc = RegressorNc(_make_quantile_net(quantiles_net, False),
                         QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = _conformalize(nc)
        _record('CQR Sign Net', y_lower, y_upper)

    ################### Rearrangement Quantile Net

    elif 'rearrangement' == test_method:

        y_lower, y_upper = _predict_band(
            _make_quantile_net(quantiles, True))
        _record('Rearrange QNet', y_lower, y_upper)

    elif 'cqr_rearrangement' == test_method:

        nc = RegressorNc(_make_quantile_net(quantiles_net, True),
                         QuantileRegErrFunc())
        y_lower, y_upper = _conformalize(nc)
        _record('Rearrange CQR Net', y_lower, y_upper)

    elif 'cqr_asymmetric_rearrangement' == test_method:

        nc = RegressorNc(_make_quantile_net(quantiles_net, True),
                         QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = _conformalize(nc)
        # The coverage report used to be labelled "Rearrange CQR Net" here,
        # the name of the symmetric variant; it now carries its own label.
        _record('Rearrange CQR Sign Net', y_lower, y_upper)

    ################### Quantile Random Forest

    elif 'quantile_forest' == test_method:

        # the forest adapter takes percentiles, hence the factor of 100
        y_lower, y_upper = _predict_band(
            _make_quantile_forest(np.dot(100, quantiles), False))
        _record('QRF', y_lower, y_upper)

    elif 'cqr_quantile_forest' == test_method:

        nc = RegressorNc(_make_quantile_forest(quantiles_forest, CV_qforest),
                         QuantileRegErrFunc())
        y_lower, y_upper = _conformalize(nc)
        _record('CQR RF', y_lower, y_upper)

    elif 'cqr_asymmetric_quantile_forest' == test_method:

        nc = RegressorNc(_make_quantile_forest(quantiles_forest, CV_qforest),
                         QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = _conformalize(nc)
        _record('CQR Sign RF', y_lower, y_upper)

    ############### Summary

    # (row title in the printed summary, method label used when recording)
    summary_rows = (
        ('CP Linear', 'Ridge'),
        ('CP Linear Local', 'Ridge-L'),
        ('CP Neural Net', 'Net'),
        ('CP Neural Net Local', 'Net-L'),
        ('CP Random Forest', 'RF'),
        ('CP Random Forest Local', 'RF-L'),
        ('CP Quantile Net', 'CQR Net'),
        ('CP Asymmetric Quantile Net', 'CQR Sign Net'),
        ('Quantile Net', 'QNet'),
        ('CP Rearrange Quantile Net', 'Rearrange CQR Net'),
        ('CP Asymmetric Rearrange Quantile Net', 'Rearrange CQR Sign Net'),
        ('Rearrange Quantile Net', 'Rearrange QNet'),
        ('CP Quantile Random Forest', 'CQR RF'),
        ('CP Asymmetric Quantile Random Forest', 'CQR Sign RF'),
        ('Quantile Random Forest', 'QRF'),
    )

    coverage_str = 'Coverage (expected ' + str(100 - significance*100) + '%)'

    table = [[dataset_name, coverage_str, 'Avg. Length', 'Seed']]
    for row_title, method_label in summary_rows:
        coverage, length = stats.get(method_label, (0, 0))
        table.append([row_title, coverage, length, seed])
    results = np.array(table)

    results_ = pd.DataFrame(data=results[1:, 1:],
                            index=results[1:, 0],
                            columns=results[0, 1:])

    print("== SUMMARY == ")
    print("dataset name: " + dataset_name)
    print(results_)
    sys.stdout.flush()

    if save_to_csv:
        outdir = './results/'
        # exist_ok avoids failing when the directory appears between a
        # check and the creation (e.g. parallel runs).
        os.makedirs(outdir, exist_ok=True)

        out_name = outdir + 'results.csv'

        df = pd.DataFrame({'name': dataset_name_vec,
                           'method': method_vec,
                           coverage_str: coverage_vec,
                           'Avg. Length': length_vec,
                           'seed': seed_vec})

        # Append to earlier results instead of overwriting them.
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)
def build(self):
    '''Fit a Random Forest estimator on copies of self.X and self.Y.

    A regressor is used when the 'quantitative' parameter is set, a
    classifier otherwise. With 'tune' set the estimator is handed to
    self.optimize; otherwise it is fitted directly. With 'conformal' set
    the fitted estimator is then wrapped in an aggregated conformal
    predictor, which replaces self.estimator.

    Returns:
        (True, results) on success, where results is a list of
        (key, label, value) tuples, or (False, message) when any stage
        raises.
    '''
    # Work on copies of the data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
        ('model', 'model type', 'RF'),
    ]

    conformal = self.param.getVal('conformal')

    if self.param.getVal('tune'):
        # Grid-search path: build the bare estimator and let optimize() tune it
        LOG.info("Optimizing RF estimator")
        try:
            is_quantitative = self.param.getVal('quantitative')
            forest_cls = (RandomForestRegressor if is_quantitative
                          else RandomForestClassifier)
            self.estimator = forest_cls(**self.estimator_parameters)
            self.optimize(X, Y, self.estimator, self.tune_parameters)
        except Exception as e:
            return False, f'Exception optimizing RF estimator with exception {e}'
    else:
        # Plain path: build and fit with the configured parameters
        try:
            if self.param.getVal('quantitative'):
                forest_cls = RandomForestRegressor
                banner = "Building Quantitative RF model"
            else:
                forest_cls = RandomForestClassifier
                banner = "Building Qualitative RF model"
            self.estimator = forest_cls(**self.estimator_parameters)
            # The conformal stage logs its own message further down
            if not conformal:
                LOG.info(banner)
            self.estimator.fit(X, Y)
        except Exception as e:
            return False, f'Exception building RF estimator with exception {e}'

    if not conformal:
        return True, results

    # Keep a copy of the non-conformal estimator; it is what gets wrapped
    self.estimator_temp = copy(self.estimator)

    try:
        if self.param.getVal('quantitative'):
            # Conformal regressor, normalized by a KNN model
            settings = self.param.getDict('conformal_settings')
            LOG.info("Building conformal Quantitative RF model")
            base_adapter = RegressorAdapter(self.estimator_temp)
            knn = KNeighborsRegressor(n_neighbors=settings['KNN_NN'])
            self.normalizing_model = RegressorAdapter(knn)
            normalizer = RegressorNormalizer(base_adapter,
                                             copy(self.normalizing_model),
                                             AbsErrorErrFunc())
            nc = RegressorNc(base_adapter, AbsErrorErrFunc(), normalizer)
            icp = IcpRegressor(nc)
        else:
            # Conformal classifier
            LOG.info("Building conformal Qualitative RF model")
            nc = ClassifierNc(ClassifierAdapter(self.estimator_temp),
                              MarginErrFunc())
            icp = IcpClassifier(nc)

        # Aggregate over bootstrap samples and fit to the data
        self.estimator = AggregatedCp(icp, BootstrapSampler())
        self.estimator.fit(X, Y)
    except Exception as e:
        return False, f'Exception building conformal RF estimator with exception {e}'

    return True, results

## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
iterations=5, folds=5, scoring_funcs=[class_mean_errors, class_avg_c], significance_levels=[0.05, 0.1, 0.2]) print('Classification: iris') scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean()) # ----------------------------------------------------------------------------- # Regression, absolute error # ----------------------------------------------------------------------------- data = load_diabetes() icp = IcpRegressor( RegressorNc(RegressorAdapter(RandomForestRegressor(n_estimators=100)), AbsErrorErrFunc())) icp_cv = RegIcpCvHelper(icp) scores = cross_val_score(icp_cv, data.data, data.target, iterations=5, folds=5, scoring_funcs=[reg_mean_errors, reg_median_size], significance_levels=[0.05, 0.1, 0.2]) print('Absolute error regression: diabetes') scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean()) # -----------------------------------------------------------------------------
def cv(df, parameters):
    """Evaluate plain and normalized conformal intervals on three time splits.

    For each split ratio an inductive conformal regressor is trained,
    calibrated and used to predict on the following block of rows, once
    with a KNN-normalized nonconformity function ("NCP") and once without
    ("CP"). The mean `qd_objective` of each variant is appended, together
    with the hyper-parameters, as one row to '<algorithm>_cv.csv'.

    Args:
        df: data frame with at least the columns 'NetPosUsd' (target) and
            'QdfTime' (dropped before normalization).
        parameters: dict with the keys 'algorithm', 'randomized_calibration'
            and 'alpha_', plus the keyword arguments of the chosen model.

    NOTE(review): the source layout was lost, so the nesting of the
    `d = {}` / `d = p` statements below is a best guess - confirm against
    the original file.
    """
    # The last 120 rows are never used by any of the splits below.
    end = len(df) - 120
    out = np.zeros(3)   # CP loss per split
    out2 = np.zeros(3)  # NCP loss per split
    # p holds only the model keyword arguments.
    p = parameters.copy()
    p.pop('algorithm')
    p.pop('randomized_calibration')
    p.pop('alpha_')
    if parameters.get('algorithm') == 'RandomForest':
        algorithm = RandomForestRegressor(**p)
        d = {'n_estimators': parameters.get('n_estimators'),
             "criterion": parameters.get("criterion"),
             "max_features": parameters.get("max_features"),
             "min_samples_split": parameters.get("min_samples_split"),
             "min_samples_leaf": parameters.get("min_samples_leaf")
             }
    if parameters.get('algorithm') == 'K-NearestNeighbours':
        algorithm = KNeighborsRegressor(**p)
        d = {
            'n_neighbours': parameters.get('n_neighbours'),
            'weights': parameters.get('weights'),
            'metric': parameters.get('metric')
        }
    if parameters.get('algorithm') == 'LightGBM':
        algorithm = LGBMRegressor(**p)
        d = {"metric": parameters.get("metric"),
             "num_leaves": parameters.get('num_leaves'),
             "learning_rate": parameters.get('learning_rate'),
             "feature_fraction": parameters.get('feature_fraction'),
             "bagging_fraction": parameters.get('bagging_fraction'),
             "bagging_freq": parameters.get('bagging_freq'),
             }
    if parameters.get('algorithm') == 'LassoRegression':
        algorithm = Lasso(**p)
        d = {'alpha_': parameters.get('alpha_')}
    if parameters.get('algorithm') == 'NeuralNetwork':
        # Unlike the other models, this one receives the dict itself.
        algorithm = NeuralNetworkAlgorithm(p)
    if parameters.get('algorithm') == 'LSTM':
        algorithm = BiLSTM(**p)
        d = {}
    # NOTE(review): placed at function level because the 'NeuralNetwork'
    # branch never assigns d. With this nesting every per-algorithm dict
    # built above is overwritten, and d aliases p (so p gains 'alpha_' too).
    d = p
    d['alpha_'] = parameters.get('alpha_')
    # Target statistics taken before normalization, used to undo the scaling.
    m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
    df=df.drop(['QdfTime' ], axis=1)
    mean = df.mean(axis=0)
    std = df.std(axis=0)
    # Standardize every remaining column, target included.
    df = (df - mean) / std
    for i, ratio in enumerate(([.5, 0.66, .84])):
        if parameters.get('randomized_calibration') == True:
            # Calibration rows are drawn at random (without replacement)
            # from the first end*ratio rows; the rest of them train.
            train_ = df.drop([ 'NetPosUsd'], axis=1).iloc[:int(end * ratio), :].values
            choose = np.random.choice(len(train_), int(end / 6), replace=False)
            calibrate = train_[choose, :]
            mask = np.ones(len(train_), dtype=bool)
            mask[choose] = False
            train = train_[mask, :]
            test = (df.drop([ 'NetPosUsd'], axis=1)).iloc[int(end * ratio):int(end * ratio) + int(end / 6), :].values
            ytrain_ = df['NetPosUsd'][:int(end * ratio)].values
            ycalibrate = ytrain_[choose]
            ytrain = ytrain_[mask]
            # Left as a pandas Series here; the other branch takes .values.
            ytest = df['NetPosUsd'].iloc[int(end * ratio):int(end * ratio) + int(end / 6)]
        else:
            # Calibration is the end/6 rows immediately before the test block.
            train = df.drop([ 'NetPosUsd'], axis=1).iloc[:int(end * ratio) - int(end / 6), :].values
            calibrate = df.drop([ 'NetPosUsd'], axis=1).iloc[int(end * ratio) - int(end / 6):int(end * ratio), :].values
            test = df.drop([ 'NetPosUsd'], axis=1).iloc[int(end * ratio):int(end * ratio) + int(end / 6), :].values
            ytrain = df['NetPosUsd'][:int(end * ratio) - int(end / 6)].values
            ycalibrate = df['NetPosUsd'][int(end * ratio) - int(end / 6):int(end * ratio)].values
            ytest = df['NetPosUsd'][int(end * ratio):int(end * ratio) + int(end / 6)].values
        # print(len(train),len(ytrain),len(calibrate),len(ycalibrate),len(test),len(ytest))
        # Train and calibrate
        # -----------------------------------------------------------------------------
        # Normalized variant: nonconformity scaled by a 50-NN model.
        underlying_model = RegressorAdapter(algorithm)
        normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
        normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)
        # -----------------------------------------------------------------------------
        # Predict
        # -----------------------------------------------------------------------------
        prediction = icp.predict(test, significance=parameters.get('alpha_'))
        header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        # Despite its name, `size` is the interval midpoint, stored in the
        # 'prediction' column.
        size = prediction[:, 1] / 2 + prediction[:, 0] / 2
        # Map everything back to the original target scale.
        prediction = prediction * s + m
        ytest = ytest * s + m
        size = size * s + m
        table = np.vstack([prediction.T, ytest, size.T]).T
        dfncp = pd.DataFrame(table, columns=header)
        # Plain variant: same `algorithm` object, so it is fitted a second time.
        underlying_model = RegressorAdapter(algorithm)
        nc = RegressorNc(underlying_model, AbsErrorErrFunc())
        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)
        prediction = icp.predict(test, significance=parameters.get('alpha_'))
        header = ['cp_lower', 'cp_upper']
        prediction = prediction * s + m
        table = np.vstack([prediction.T]).T
        dfcp = pd.DataFrame(table, columns=header)
        dfncp['CP_lower'] = dfcp['cp_lower']
        dfncp['CP_upper'] = dfcp['cp_upper']
        out[i] = qd_objective(dfncp.NetPosUsd, dfncp['CP_lower'], dfncp['CP_upper'], parameters.get('alpha_'))
        out2[i] = qd_objective(dfncp.NetPosUsd, dfncp['NCP_lower'], dfncp['NCP_upper'], parameters.get('alpha_'))
    # Average both losses over the three splits.
    d['CP_loss'] = np.mean(out)
    d['NCP_loss'] = np.mean(out2)
    # Append without a header when the file exists, otherwise create it.
    if os.path.exists(parameters.get('algorithm') + '_cv.csv') == True:
        pd.DataFrame(data=d, index=[0]).to_csv(parameters.get('algorithm') + '_cv.csv', mode='a', header=False, index=False)
    else:
        pd.DataFrame(data=d, index=[0]).to_csv(parameters.get('algorithm') + '_cv.csv', encoding='utf-8', index=False)
def run_experiment(cur_test_method, cur_dataset_name, cur_batch_size,
                   cur_lr_loss, cur_lr_dis, cur_loss_steps, cur_dis_steps,
                   cur_mu_val, cur_epochs, cur_model_type,
                   cur_regression_type, cur_random_state, cur_second_scale,
                   num_experiments):
    """Run repeated fairness experiments with group-conditional conformal intervals.

    For each of `num_experiments` repetitions the data is re-split with
    seed `cur_random_state + i`, the chosen learner is fitted, an inductive
    conformal regressor conditioned on the group attribute is calibrated,
    and coverage, interval length, MSE and a fairness p-value are printed
    and appended as one row to './results/results.csv'.

    Args:
        cur_test_method: one of "AdversarialDebiasing", "FairDummies",
            "HGR" or "Baseline".
        cur_dataset_name: dataset name passed to
            `get_dataset.get_train_test_data`.
        cur_batch_size, cur_lr_loss, cur_loss_steps, cur_dis_steps,
        cur_mu_val, cur_epochs, cur_model_type, cur_second_scale:
            hyper-parameters forwarded to the learner.
        cur_lr_dis: only recorded in the results file.
        cur_regression_type: only "mreg" is supported.
        cur_random_state: base seed for all random number generators.
        num_experiments: number of repetitions.

    Raises:
        ValueError: if `cur_regression_type` or `cur_test_method` is not
            supported. This is checked before any seeding or data loading;
            the original used a bare `raise` (an opaque RuntimeError) and
            failed with a NameError for unknown methods.
    """
    supported_methods = ("AdversarialDebiasing", "FairDummies", "HGR",
                         "Baseline")
    if cur_regression_type != "mreg":
        raise ValueError(
            "unsupported regression type %r (expected 'mreg')"
            % (cur_regression_type,))
    if cur_test_method not in supported_methods:
        raise ValueError(
            "unsupported method %r (expected one of %s)"
            % (cur_test_method, ", ".join(supported_methods)))

    method = cur_test_method
    seed = cur_random_state

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    dataset = cur_dataset_name
    batch_size = cur_batch_size
    # step size to minimize loss
    lr_loss = cur_lr_loss
    # inner epochs to fit loss
    loss_steps = cur_loss_steps
    # inner epochs to fit GAN's classifier
    dis_steps = cur_dis_steps
    # total number of epochs
    epochs = cur_epochs

    # utility loss ("mreg" is the only supported regression type)
    cost_pred = torch.nn.MSELoss()
    out_shape = 1

    model_type = cur_model_type
    metric = "equalized_odds"
    significance = 0.1

    print(dataset)
    print(method)
    sys.stdout.flush()

    def build_learner(in_shape):
        """Instantiate the learner selected by `method`."""
        if method == "AdversarialDebiasing":
            return adv_debiasing.AdvDebiasingRegLearner(
                lr=lr_loss,
                N_CLF_EPOCHS=loss_steps,
                N_ADV_EPOCHS=dis_steps,
                N_EPOCH_COMBINED=epochs,
                cost_pred=cost_pred,
                in_shape=in_shape,
                batch_size=batch_size,
                model_type=model_type,
                out_shape=out_shape,
                lambda_vec=cur_mu_val)
        if method == "FairDummies":
            return fair_dummies_learning.EquiRegLearner(
                lr=lr_loss,
                pretrain_pred_epochs=0,
                pretrain_dis_epochs=0,
                epochs=epochs,
                loss_steps=loss_steps,
                dis_steps=dis_steps,
                cost_pred=cost_pred,
                in_shape=in_shape,
                batch_size=batch_size,
                model_type=model_type,
                lambda_vec=cur_mu_val,
                second_moment_scaling=cur_second_scale,
                out_shape=out_shape)
        if method == "HGR":
            return continuous_fairness.HGR_Reg_Learner(
                lr=lr_loss,
                epochs=epochs,
                mu=cur_mu_val,
                cost_pred=cost_pred,
                in_shape=in_shape,
                out_shape=out_shape,
                batch_size=batch_size,
                model_type=model_type)
        # "Baseline": prediction-only pre-training, no fairness penalty
        return fair_dummies_learning.EquiRegLearner(
            lr=lr_loss,
            pretrain_pred_epochs=epochs,
            pretrain_dis_epochs=0,
            epochs=0,
            loss_steps=0,
            dis_steps=0,
            cost_pred=cost_pred,
            in_shape=in_shape,
            batch_size=batch_size,
            model_type=model_type,
            lambda_vec=0,
            second_moment_scaling=0,
            out_shape=out_shape)

    # function that extracts the group identifier (first input column)
    def condition(x, y=None):
        return int(x[0][0] > 0)

    avg_length_0 = np.zeros(num_experiments)
    avg_length_1 = np.zeros(num_experiments)
    avg_coverage_0 = np.zeros(num_experiments)
    avg_coverage_1 = np.zeros(num_experiments)
    avg_p_val = np.zeros(num_experiments)
    mse = np.zeros(num_experiments)

    for i in range(num_experiments):
        # Split into train, calibration and test
        X, A, Y, X_cal, A_cal, Y_cal, X_test, A_test, Y_test = \
            get_dataset.get_train_test_data(base_path, dataset, seed + i)
        in_shape = X.shape[1]

        print("n train = " + str(X.shape[0]) + " p = " + str(X.shape[1]))
        print("n calibration = " + str(X_cal.shape[0]))
        print("n test = " + str(X_test.shape[0]))
        sys.stdout.flush()

        # One adapter for every method: only the wrapped learner differs.
        # The constructor signature is kept as in the original adapters.
        class RegAdapter(RegressorAdapter):
            def __init__(self, model=None, fit_params=None, params=None):
                super(RegAdapter, self).__init__(model, fit_params)
                # Instantiate model
                self.learner = build_learner(in_shape)

            def fit(self, x, y):
                self.learner.fit(x, y)

            def predict(self, x):
                return self.learner.predict(x)

        fairness_reg = RegAdapter(model=None)
        nc = RegressorNc(fairness_reg, AbsErrorErrFunc())
        icp = IcpRegressor(nc, condition=condition)

        # The group attribute is prepended so `condition` can read it.
        input_data_train = np.concatenate((A[:, np.newaxis], X), 1)
        icp.fit(input_data_train, Y)

        input_data_cal = np.concatenate((A_cal[:, np.newaxis], X_cal), 1)
        icp.calibrate(input_data_cal, Y_cal)

        input_data_test = np.concatenate((A_test[:, np.newaxis], X_test), 1)
        Yhat_test = icp.predict(input_data_test, significance=significance)

        # compute and print average coverage and average length
        coverage_sample, length_sample = compute_coverage_per_sample(
            Y_test, Yhat_test[:, 0], Yhat_test[:, 1], significance, method,
            input_data_test, condition)
        avg_coverage, avg_length = compute_coverage_len(
            Y_test, Yhat_test[:, 0], Yhat_test[:, 1])

        avg_length_0[i] = np.mean(length_sample[0])
        avg_coverage_0[i] = np.mean(coverage_sample[0])
        avg_length_1[i] = np.mean(length_sample[1])
        avg_coverage_1[i] = np.mean(coverage_sample[1])

        Yhat_out_cal = fairness_reg.learner.predict(input_data_cal)
        Yhat_out_test = fairness_reg.learner.predict(input_data_test)

        mse[i] = np.mean((Yhat_out_test - Y_test)**2)
        MSE_trivial = np.mean((np.mean(Y_test) - Y_test)**2)
        print("MSE = " + str(mse[i]) + "MSE Trivial = " + str(MSE_trivial))

        p_val = utility_functions.fair_dummies_test_regression(
            Yhat_out_cal, A_cal, Y_cal, Yhat_out_test, A_test, Y_test,
            num_reps=1, num_p_val_rep=1000, reg_func_name="Net")
        avg_p_val[i] = p_val

        print("experiment = " + str(i + 1))
        print("Coverage 0 = " + str(avg_coverage_0[i]))
        print("Coverage 1 = " + str(avg_coverage_1[i]))
        print("Length 0 = " + str(avg_length_0[i]))
        print("Length 1 = " + str(avg_length_1[i]))
        print("MSE = " + str(mse[i]))
        print("p_val = " + str(p_val))
        sys.stdout.flush()

        outdir = './results/'
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(outdir, exist_ok=True)
        out_name = outdir + 'results.csv'

        full_name = (cur_test_method + "_" + cur_model_type + "_"
                     + cur_regression_type)

        df = pd.DataFrame({
            'method': [cur_test_method],
            'dataset': [cur_dataset_name],
            'batch_size': [cur_batch_size],
            'lr_loss': [cur_lr_loss],
            'lr_dis': [cur_lr_dis],
            'loss_steps': [cur_loss_steps],
            'dis_steps': [cur_dis_steps],
            'mu_val': [cur_mu_val],
            'epochs': [cur_epochs],
            'random_state': [seed + i],
            'model_type': [cur_model_type],
            'metric': [metric],
            'cur_second_scale': [cur_second_scale],
            'regression_type': [cur_regression_type],
            'avg_length': [avg_length],
            'avg_coverage': [avg_coverage],
            'avg_length_0': [avg_length_0[i]],
            'avg_length_1': [avg_length_1[i]],
            'mse': [mse[i]],
            'avg_coverage_0': [avg_coverage_0[i]],
            'avg_coverage_1': [avg_coverage_1[i]],
            'p_val': [p_val],
            'full_name': [full_name]
        })

        # Append to any previous results rather than overwriting them
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)
        df.to_csv(out_name, index=False)

        print(full_name)
        # Running averages over the experiments completed so far
        print(
            "Num experiments %02d | Avg MSE = %.4f | Avg Length 0 = %.4f | Avg Length 1 = %.4f | Avg Coverage 0 = %.4f | Avg Coverage 1 = %.4f | Avg p_val = %.4f | min p_val = %.4f"
            % (i + 1, np.mean(mse[:i + 1]), np.mean(avg_length_0[:i + 1]),
               np.mean(avg_length_1[:i + 1]),
               np.mean(avg_coverage_0[:i + 1]),
               np.mean(avg_coverage_1[:i + 1]),
               np.mean(avg_p_val[:i + 1]), np.min(avg_p_val[:i + 1])))
        # NOTE(review): the source layout was lost; this banner is assumed
        # to sit inside the loop like the prints above - confirm.
        print("======== Done =========")
        sys.stdout.flush()
def evaluate(model_filepath, train_filepath, test_filepath,
             calibrate_filepath, significance=0.05):
    """Evaluate model to estimate power.

    When the "calibrate_split" entry of the "split" section in params.yaml
    is 0, the stored model predicts directly. Otherwise the model is wrapped
    in an inductive conformal regressor, fitted on the train set and
    calibrated on the calibrate set; the point prediction is then the
    midpoint of each interval, and the intervals are saved and plotted.
    MSE and R2 are printed and written to METRICS_FILE_PATH as JSON.

    Args:
        model_filepath (str): Path to model.
        train_filepath (str): Path to train set.
        test_filepath (str): Path to test set.
        calibrate_filepath (str): Path to calibrate set.
        significance (float): Conformal prediction error level. Defaults to
            0.05, the value that was previously hard-coded.
    """
    METRICS_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Load parameters. The file is read once and closed; only the "split"
    # section is used here.
    with open("params.yaml") as params_file:
        params_split = yaml.safe_load(params_file)["split"]

    # np.load keeps the .npz archive open, so read inside a with-block.
    with np.load(test_filepath) as test:
        X_test = test["X"]
        y_test = test["y"]

    if params_split["calibrate_split"] == 0:
        model = models.load_model(model_filepath)
        y_pred = model.predict(X_test)
    else:
        trained_model = models.load_model(model_filepath)
        mycustommodel = MyCustomModel(trained_model)

        # Absolute-error nonconformity function, no normalizer. The CNN the
        # original built for a commented-out normalizer was never used and
        # has been dropped.
        nc = RegressorNc(mycustommodel, err_func=AbsErrorErrFunc())
        model = IcpRegressor(nc)

        # Fit on the train set (targets flattened to 1-D).
        with np.load(train_filepath) as train:
            X_train = train["X"]
            y_train = train["y"]
        y_train = y_train.reshape((y_train.shape[0], ))
        model.fit(X_train, y_train)

        # Calibrate model.
        with np.load(calibrate_filepath) as calibrate:
            X_calibrate = calibrate["X"]
            y_calibrate = calibrate["y"]
        y_calibrate = y_calibrate.reshape((y_calibrate.shape[0], ))
        model.calibrate(X_calibrate, y_calibrate)
        print(f"Calibration: {X_calibrate.shape}")

        # Predictions will contain the intervals. We need to compute the
        # middle points to get the actual predictions y.
        predictions = model.predict(X_test, significance=significance)
        y_pred = predictions[:, 0] + (predictions[:, 1] - predictions[:, 0]) / 2
        # Reshape to put it in the same format as without calibration set.
        y_pred = y_pred.reshape((y_pred.shape[0], 1))

        # Build data frame with predictions and ground truth.
        my_results = list(
            zip(np.reshape(y_test, (y_test.shape[0], )),
                np.reshape(y_pred, (y_pred.shape[0], )),
                predictions[:, 0],
                predictions[:, 1]))
        df_predictions = pd.DataFrame(my_results,
                                      columns=[
                                          'ground_truth', 'predicted',
                                          'lower_bound', 'upper_bound'
                                      ])
        # NOTE(review): intervals exist only on this path, so saving and
        # plotting are kept inside the branch - confirm against the
        # original layout.
        save_predictions(df_predictions)
        plot_intervals(df_predictions)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("MSE: {}".format(mse))
    print("R2: {}".format(r2))

    plot_prediction(y_test, y_pred, inputs=X_test, info="(R2: {})".format(r2))
    plot_individual_predictions(y_test, y_pred)

    with open(METRICS_FILE_PATH, "w") as f:
        json.dump(dict(mse=mse, r2=r2), f)
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Shuffle all sample indices and cut them into three consecutive thirds
idx = np.random.permutation(data.target.size)
first_cut = int(idx.size / 3)
second_cut = int(2 * idx.size / 3)
train = idx[:first_cut]
calibrate = idx[first_cut:second_cut]
test = idx[second_cut:]

# -----------------------------------------------------------------------------
# Without normalization
# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
underlying_model = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
nc = RegressorNc(underlying_model, AbsErrorErrFunc())
icp = IcpRegressor(nc)
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)

# One row per test sample: interval bounds, true target, interval width
header = ['min','max','truth','size']
size = prediction[:, 1] - prediction[:, 0]
table = np.column_stack([prediction, data.target[test], size])
df = pd.DataFrame(table, columns=header)
print(df)
# -----------------------------------------------------------------------------
def train_and_test_quantile_QCP(parameters):
    """Run conformalized quantile regression on the 29 hourly data files.

    For each file the columns are standardized, the last 120 rows are held
    out as the test set, and the last `validation` training rows are used
    for calibration. The calibrated bounds are mapped back to the original
    target scale and written to
    'QCP<algorithm>_<alpha>_<validation>.csv' (created for file 0,
    appended to afterwards).

    Args:
        parameters: dict with the keys 'algorithm' (one of
            'QuantileGradientBoosting', 'QuantileLightGBM',
            'QuantileRegression', 'QuantileRandomForest', 'QuantileKNN'),
            'alpha_' and 'validation'; the remaining entries are forwarded
            to the estimator as `params`. The dict is not modified.

    Raises:
        ValueError: if 'algorithm' is not one of the supported names.
    """
    # Maps the algorithm name to the name of its class in `helper`.
    estimator_names = {
        'QuantileGradientBoosting': 'QuantileGradientBoosting',
        'QuantileLightGBM': 'QuantileLightGBM',
        'QuantileRegression': 'QuantileRegression',
        'QuantileRandomForest': 'QuantileForestRegressorAdapterNew',
        'QuantileKNN': 'QuantileKNN',
    }

    params = parameters.copy()
    algorithm = params.pop('algorithm')
    alpha_percent = params.pop('alpha_')
    validation = params.pop('validation')

    if algorithm not in estimator_names:
        raise ValueError(
            "unsupported algorithm %r (expected one of %s)"
            % (algorithm, ", ".join(sorted(estimator_names))))
    estimator_cls = getattr(helper, estimator_names[algorithm])

    # Lower and upper quantile levels, in percent.
    quantiles_forest = [(alpha_percent / 2), (100 - alpha_percent / 2)]
    # NOTE(review): the original passed an undefined name `alpha` to
    # run_icp. 'alpha_' is treated as a percentage above, so the
    # significance is assumed to be alpha_ / 100 - confirm.
    significance = alpha_percent / 100

    out_name = ('QCP' + algorithm + '_'
                + str(np.round(alpha_percent).astype(int)) + '_'
                + str(validation) + '.csv')

    for i in tqdm(range(29)):
        # Backslash escaped explicitly: '\E' is an invalid escape sequence.
        path = 'data\\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'], axis=1).fillna(0)

        # Target statistics before normalization, used to undo the scaling.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
        df = (df - df.mean(axis=0)) / df.std(axis=0)

        train_test_split = len(df) - 120
        features = df.drop(['NetPosUsd'], axis=1)
        train = 1 * features.iloc[:train_test_split, :].values
        test = 1 * features.iloc[train_test_split:, :].values
        ytrain = df['NetPosUsd'][:train_test_split].values
        ytest = df['NetPosUsd'].iloc[train_test_split:].values

        # The last `validation` training rows form the calibration set.
        idx_train = np.arange(train_test_split - validation)
        idx_cal = np.arange(train_test_split - validation, train_test_split)

        quantile_estimator = estimator_cls(model=None,
                                           quantiles=quantiles_forest,
                                           params=params)
        nc = RegressorNc(quantile_estimator, QuantileRegErrFunc())

        # run CQR procedure
        lower, upper = helper.run_icp(nc, train, ytrain, test, idx_train,
                                      idx_cal, significance)

        # Back to the original target scale.
        lower = lower * s + m
        upper = upper * s + m
        ytest = ytest * s + m

        header = ['QCP_lower', 'QCP_upper', 'NetPosUsd', 'prediction']
        # Interval midpoint, stored in the 'prediction' column.
        size = upper / 2 + lower / 2
        table = np.vstack([lower, upper, ytest, size]).T
        dfncp = pd.DataFrame(table, columns=header)

        if i == 0:
            dfncp.to_csv(out_name, encoding='utf-8', index=False)
        else:
            dfncp.to_csv(out_name, mode='a', header=False, index=False)
def CF_quantitative_validation(self):
    ''' Performs internal validation for conformal quantitative models.

    An aggregated conformal regressor wrapping self.estimator_temp is
    fitted on every training fold of a shuffled K-fold split
    ('ModelValidationN' folds) and predicts an interval for each held-out
    object at the 'conformalSignificance' level.

    Sets self.conformal_interval_medians, self.conformal_accuracy and
    self.conformal_mean_interval.

    Returns:
        (True, results), where results['quality'] is a list of
        (key, label, value) tuples.

    Raises:
        Any exception raised while fitting or predicting; it is logged
        and re-raised.
    '''
    # Make a copy of original matrices.
    X = self.X.copy()
    Y = self.Y.copy()

    kf = KFold(n_splits=self.param.getVal('ModelValidationN'),
               shuffle=True, random_state=46)

    # One (lower, upper) row per object. Every object falls in exactly one
    # test fold, so every row is filled.
    Y_pred = np.zeros((len(Y), 2))

    try:
        for train_index, test_index in kf.split(X):
            # Create the aggregated conformal regressor.
            conformal_pred = AggregatedCp(
                IcpRegressor(
                    RegressorNc(RegressorAdapter(self.estimator_temp))),
                BootstrapSampler())

            # Fit on the training fold, predict the held-out fold and
            # store each interval at its original index.
            conformal_pred.fit(X[train_index], Y[train_index])
            Y_pred[test_index] = conformal_pred.predict(
                X[test_index], self.param.getVal('conformalSignificance'))
    except Exception as e:
        LOG.error(f'Quantitative conformal validation'
                  f' failed with exception: {e}')
        # Bare raise keeps the original traceback.
        raise

    lower = Y_pred[:, 0]
    upper = Y_pred[:, 1]

    # Mean width of the validation intervals.
    interval_mean = np.mean(np.abs(lower - upper))

    # Boolean mask of instances whose true value lies strictly inside its
    # own interval. Y is flattened so the comparison is element-wise for
    # both (n,) and (n, 1) targets; comparing an (n, 1) column against a
    # 1-D Y would broadcast to an n x n matrix.
    y_true = np.ravel(Y)
    inside_interval = (lower < y_true) & (upper > y_true)

    # Compute the accuracy (fraction of instances inside their interval).
    accuracy = np.sum(inside_interval) / len(Y)

    # Interval midpoints, then accuracy and width cut to two decimals.
    self.conformal_interval_medians = (np.mean(Y_pred, axis=1))
    self.conformal_accuracy = float("{0:.2f}".format(accuracy))
    self.conformal_mean_interval = float("{0:.2f}".format(interval_mean))

    # Add quality metrics to results.
    info = [
        ('Conformal_mean_interval', 'Conformal mean interval',
         self.conformal_mean_interval),
        ('Conformal_accuracy', 'Conformal accuracy',
         self.conformal_accuracy),
        ('Conformal_interval_medians', 'Conformal interval medians',
         self.conformal_interval_medians),
        ('Conformal_prediction_ranges', 'Conformal prediction ranges',
         Y_pred),
    ]

    results = {}
    results['quality'] = info
    return True, results
def train_and_test_cp_algo(i):
    """Build plain and normalized conformal intervals with a BiLSTM on file `i`.

    The target series is transformed with `mlog_trans`, cut into sliding
    windows of 96 values, and split in time order into train, calibration
    (3480 windows) and test (last 120 windows). Two inductive conformal
    regressors are fitted on the same BiLSTM instance: one normalized by a
    50-NN model ("NCP") and one plain ("CP"). For every significance level
    from 5% to 95% in steps of 5 the intervals are mapped back with
    `mlog_inverse` and written to one CSV file per level and variant
    (created when i == 0, appended to otherwise).

    Args:
        i: index of the input file; also decides create versus append.
    """
    window = 96
    n_test = 120
    n_calibration = 3480

    p = {'window': window}
    algorithm = BiLSTM(p)

    # Backslash escaped explicitly: '\E' is an invalid escape sequence.
    path = 'data\\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
    df = pd.read_csv(path).drop(['QdfTime', 'Unnamed: 0'], axis=1).fillna(0)

    # Raw targets of the test rows, copied before the column is transformed.
    y_raw_test = df.NetPosUsd.values[-n_test:].copy()
    median_ = df.NetPosUsd.median()
    mad_ = mad(df.NetPosUsd.values)
    df.NetPosUsd = mlog_trans(df.NetPosUsd.values)
    data = df.NetPosUsd.values

    # Row k holds the `window` values preceding position window + k,
    # which is the target y[k].
    X = np.array([data[stop - window:stop]
                  for stop in range(window, data.shape[0])])
    y = data[window:]

    train_test_split = X.shape[0] - n_test - n_calibration
    train = X[:train_test_split, :]
    calibrate = X[train_test_split:train_test_split + n_calibration, :]
    test = X[-n_test:]
    ytrain = y[:train_test_split]
    ycalibrate = y[train_test_split:train_test_split + n_calibration]

    # Normalized conformal predictor
    underlying_model = RegressorAdapter(algorithm)
    normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
    normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                     AbsErrorErrFunc())
    nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
    icp = IcpRegressor(nc)
    icp.fit(train, ytrain)
    icp.calibrate(calibrate, ycalibrate)

    # Plain conformal predictor.
    # NOTE(review): both adapters wrap the same `algorithm` object, so this
    # fit appears to retrain the model that `icp` was calibrated with -
    # confirm this is intended.
    underlying_model2 = RegressorAdapter(algorithm)
    nc2 = RegressorNc(underlying_model2, AbsErrorErrFunc())
    icp2 = IcpRegressor(nc2)
    icp2.fit(train, ytrain)
    icp2.calibrate(calibrate, ycalibrate)

    def interval_frame(predictor, prefix, level):
        """Predict at `level` percent and return the back-transformed table."""
        prediction = predictor.predict(test, significance=level / 100)
        lower = mlog_inverse(prediction[:, 0], median_, mad_)
        upper = mlog_inverse(prediction[:, 1], median_, mad_)
        # Interval midpoint, stored in the 'prediction' column.
        midpoint = upper / 2 + lower / 2
        header = [prefix + '_lower', prefix + '_upper', 'NetPosUsd',
                  'prediction']
        table = np.vstack([lower, upper, y_raw_test, midpoint]).T
        return pd.DataFrame(table, columns=header)

    def write_frame(frame, prefix, level):
        """Create the per-level CSV for the first file, append afterwards."""
        out_name = (prefix + '_cudaLSTM_' + str(np.round(level).astype(int))
                    + '_calibrationwindow' + str(n_calibration) + '.csv')
        if i == 0:
            frame.to_csv(out_name, encoding='utf-8', index=False)
        else:
            frame.to_csv(out_name, mode='a', header=False, index=False)

    # The original also re-applied mlog_inverse to the transformed test
    # targets twice per iteration; the result was never used and compounded
    # across iterations, so that step is gone. The tables use y_raw_test.
    for a in tqdm(np.linspace(5, 95, 19)):
        dfncp = interval_frame(icp, 'NCP', a)
        dfcp = interval_frame(icp2, 'CP', a)
        write_frame(dfcp, 'CP', a)
        write_frame(dfncp, 'NCP', a)
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc, SignErrorErrFunc

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Shuffle all sample indices and cut them into three consecutive thirds
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
# Decision tree with the signed-error nonconformity function
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(DecisionTreeRegressor()),
                SignErrorErrFunc()))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.05)

# The header is passed as column names. Stacking it on top of the table,
# as before, turned every number into a string and made the header a
# data row.
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
df = pd.DataFrame(table, columns=header)
print(df)
data.data, data.target, iterations=5, folds=5, scoring_funcs=[class_mean_errors, class_avg_c], significance_levels=[0.05, 0.1, 0.2]) print('Classification: iris') scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean()) # ----------------------------------------------------------------------------- # Regression, absolute error # ----------------------------------------------------------------------------- data = load_diabetes() icp = OobCpRegressor(RegressorNc(OobRegressorAdapter(RandomForestRegressor(n_estimators=100, oob_score=True)))) icp_cv = RegIcpCvHelper(icp) scores = cross_val_score(icp_cv, data.data, data.target, iterations=5, folds=5, scoring_funcs=[reg_mean_errors, reg_median_size], significance_levels=[0.05, 0.1, 0.2]) print('Absolute error regression: diabetes') scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean())
def build(self):
    '''Build a new SVM model with the X and Y numpy matrices.

    Works on copies of ``self.X`` and ``self.Y``. When the ``tune`` parameter
    is set, ``self.optimize`` is called with an ``svm.SVR`` (quantitative) or
    ``svm.SVC`` (qualitative) estimator and ``self.tune_parameters``;
    otherwise the estimator is instantiated directly from
    ``self.estimator_parameters``. The estimator is then fitted, a copy is
    kept in ``self.estimator_temp`` and, when the ``conformal`` parameter is
    set, ``self.estimator`` is replaced by an aggregated conformal predictor
    built around that copy and fitted on the same data.

    Exceptions raised inside the optimize / build / conformal ``try`` blocks
    are logged through ``LOG.error`` and execution continues.

    Returns
    -------
    tuple
        ``(True, results)`` where ``results`` is a list of
        ``(key, label, value)`` tuples describing the model.
    '''
    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = []
    results.append(('nobj', 'number of objects', self.nobj))
    results.append(('nvarx', 'number of predictor variables', self.nvarx))

    # If tune then call gridsearch to optimize the estimator
    if self.param.getVal('tune'):
        try:
            # Check type of model
            if self.param.getVal('quantitative'):
                self.optimize(X, Y, svm.SVR(**self.estimator_parameters),
                              self.tune_parameters)
                results.append(('model', 'model type',
                                'SVM quantitative (optimized)'))
            else:
                self.optimize(X, Y, svm.SVC(**self.estimator_parameters),
                              self.tune_parameters)
                results.append(
                    ('model', 'model type', 'SVM qualitative (optimized)'))
            LOG.debug('SVM estimator optimized')
        except Exception as e:
            # The two literals are concatenated, so the first one must end
            # with a space to keep the words apart in the log.
            LOG.error(f'Exception optimizing SVM '
                      f'estimator with exception {e}')
    else:
        try:
            LOG.info("Building SVM model")
            if self.param.getVal('quantitative'):
                LOG.info("Building Quantitative SVM-R model")
                self.estimator = svm.SVR(**self.estimator_parameters)
                results.append(('model', 'model type', 'SVM quantitative'))
            else:
                self.estimator = svm.SVC(**self.estimator_parameters)
                results.append(('model', 'model type', 'SVM qualitative'))
        except Exception as e:
            LOG.error(f'Exception building SVM '
                      f'estimator with exception {e}')

    # Fit estimator to the data
    # NOTE(review): this runs on the tuned path as well -- confirm that
    # self.optimize() leaves self.estimator set.
    self.estimator.fit(X, Y)

    # Keep a copy of the plain estimator; the conformal wrappers below are
    # built around this copy.
    self.estimator_temp = copy(self.estimator)

    if self.param.getVal('conformal'):
        try:
            LOG.info("Building aggregated conformal SVM model")

            if self.param.getVal('quantitative'):
                underlying_model = RegressorAdapter(self.estimator_temp)
                # NOTE(review): the normalizing model wraps the same
                # estimator object as the underlying model -- confirm this
                # is intended rather than a separate normalizer model.
                normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)
                self.estimator = AggregatedCp(IcpRegressor(nc),
                                              BootstrapSampler())
                # overrides non-conformal
                self.estimator.fit(X, Y)
                results.append(
                    ('model', 'model type', 'conformal SVM quantitative'))
            else:
                self.estimator = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(
                            ClassifierAdapter(self.estimator_temp),
                            MarginErrFunc())),
                    BootstrapSampler())
                # overrides non-conformal
                self.estimator.fit(X, Y)
                results.append(
                    ('model', 'model type', 'conformal SVM qualitative'))
        except Exception as e:
            LOG.error(f'Exception building aggregated conformal SVM '
                      f'estimator with exception {e}')

    return True, results
def _split_cp_dataset(df, calibration_size, randomized):
    """Split a standardized frame into train / calibration / test parts.

    The last 120 rows are always the test set and ``NetPosUsd`` is the target
    column. With ``randomized`` set, ``calibration_size`` rows are drawn
    without replacement from the rows before the test set and the rest of
    those rows are used for training; otherwise the calibration set is the
    ``calibration_size`` rows immediately preceding the test set.

    Returns ``(train, ytrain, calibrate, ycalibrate, test, ytest)``. All parts
    are NumPy arrays except ``ytest``, which stays a pandas Series.
    """
    test_size = 120
    features = df.drop(['NetPosUsd'], axis=1)
    target = df['NetPosUsd']

    if randomized:
        split = len(df) - test_size
        pool_x = features.iloc[:split, :].values
        pool_y = target.iloc[:split].values
        chosen = np.random.choice(len(pool_x), calibration_size,
                                  replace=False)
        # Boolean mask of the rows that were NOT drawn for calibration.
        keep = np.ones(len(pool_x), dtype=bool)
        keep[chosen] = False
        return (pool_x[keep, :], pool_y[keep],
                pool_x[chosen, :], pool_y[chosen],
                features.iloc[split:, :].values, target.iloc[split:])

    split = len(df) - test_size - calibration_size
    cal_end = split + calibration_size
    return (features.iloc[:split, :].values, target.iloc[:split].values,
            features.iloc[split:cal_end, :].values,
            target.iloc[split:cal_end].values,
            features.iloc[-test_size:, :].values,
            target.iloc[-test_size:])


def train_and_test_cp_algo(parameters):
    """Fit an inductive conformal regressor on 29 data files and save intervals.

    ``parameters`` must contain the control keys ``algorithm``,
    ``randomized_calibration``, ``alpha_``, ``calibration_size`` and
    ``WhichCP``; every other entry is forwarded to the constructor of the
    selected algorithm. The input dict is not modified.

    For ``i`` in ``0..28`` the file
    ``data/EURUSD_NETPOSUSD_hourly_for_regresion<i>.csv`` is read, the columns
    ``Unnamed: 0`` and ``QdfTime`` are dropped, missing values are replaced by
    0 and every column is standardized with its whole-file mean and standard
    deviation. The conformal regressor is fitted and calibrated, and intervals
    are predicted for the last 120 rows at significance ``alpha_``. The
    intervals, the true target and the interval midpoint (column
    ``prediction``) are converted back to the original ``NetPosUsd`` scale and
    written to a single CSV in the working directory, named
    ``<WhichCP>_<algorithm>_<alpha*100>_calibrationwindow<size>.csv``; the
    first file creates it with a header and later files are appended.

    ``WhichCP == 'NCP'`` selects a normalized nonconformity function with a
    50-nearest-neighbour normalizer; any other value selects the plain
    absolute-error one.

    Raises
    ------
    KeyError
        If one of the control keys is missing from ``parameters``.
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    # Local import so the block does not rely on the file-level import list.
    import os

    control_keys = ('algorithm', 'randomized_calibration', 'alpha_',
                    'calibration_size', 'WhichCP')
    p = parameters.copy()
    for key in control_keys:
        p.pop(key)

    algorithm_name = parameters.get('algorithm')
    which_cp = parameters.get('WhichCP')
    alpha = parameters.get('alpha_')
    calibration_size = parameters.get('calibration_size')
    randomized = parameters.get('randomized_calibration') == True

    # Lazy factories: a model class is only looked up when it is selected.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        # This wrapper takes the parameter dict positionally, not as kwargs.
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
        'GradientBoosting': lambda: GradientBoostingRegressor(**p),
    }
    # Fail fast with a clear message instead of hitting an unbound
    # ``algorithm`` name after the first file has already been loaded.
    if algorithm_name not in factories:
        raise ValueError(
            f'Unknown algorithm {algorithm_name!r}; '
            f'expected one of {sorted(factories)}')

    out_path = (which_cp + '_' + algorithm_name + '_'
                + str(np.round(alpha * 100).astype(int)) + '_'
                + 'calibrationwindow' + str(calibration_size) + '.csv')

    for i in tqdm(range(29)):
        # A fresh model is built for every file.
        algorithm = factories[algorithm_name]()

        # os.path.join avoids the invalid "\E" escape and the Windows-only
        # separator of a hand-written 'data\...' literal.
        path = os.path.join(
            'data', 'EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv')
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'],
                                    axis=1).fillna(0)

        # Remember the target's scale so results can be mapped back later.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
        df = (df - df.mean(axis=0)) / df.std(axis=0)

        train, ytrain, calibrate, ycalibrate, test, ytest = _split_cp_dataset(
            df, calibration_size, randomized)

        underlying_model = RegressorAdapter(algorithm)
        if which_cp == 'NCP':
            normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=50))
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        else:
            nc = RegressorNc(underlying_model, AbsErrorErrFunc())
            header = ['CP_lower', 'CP_upper', 'NetPosUsd', 'prediction']

        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        # ---------------------------------------------------------------------
        # Predict
        # ---------------------------------------------------------------------
        prediction = icp.predict(test, significance=alpha)
        # The midpoint of each interval is reported as the point prediction.
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Undo the standardization of the target.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m

        table = np.vstack([prediction.T, ytest, midpoint.T]).T
        dfncp = pd.DataFrame(table, columns=header)

        # The first file creates the CSV with a header; later ones append.
        first = i == 0
        dfncp.to_csv(out_path, mode='w' if first else 'a', header=first,
                     index=False, encoding='utf-8')
        del algorithm