Example #1
    def __init__(self,
                 data_set_name,
                 metric=RootMeanSquaredError,
                 models=None,
                 ensembles=None,
                 benchmark_id=None):
        """Initializes benchmark environment."""
        self.benchmark_id = benchmark_id
        self.data_set_name = data_set_name
        # Creates file name as combination of data set name, benchmark ID, and date.
        self.file_name = self.data_set_name + "_" + self.benchmark_id + "__" + _now.strftime(
            "%Y_%m_%d__%H_%M_%S")
        # Loads samples into object. Earlier per-index loaders kept for reference:
        # self.samples = [load_samples(data_set_name, index) for index in range(10)]
        # self.samples = [load_samples_no_val(data_set_name, index) for index in range(10)]
        self.samples = load_standardized_samples(data_set_name)
        self.metric = metric
        self.ensembles = ensembles
        self.models = models
        # If the data set is a classification problem, remove the regression models;
        # otherwise remove the classification models.
        if is_classification(self.samples):
            self.classification = True
            for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
                self.models.pop(key, None)
        else:
            self.classification = False
            for key in ('mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd'):
                self.models.pop(key, None)
        # If the models include an MLP, remove Random Independent Weighting.
        if self.ensembles is not None:
            if 'mlpc_lbfgs' in self.models or 'mlpr_lbfgs' in self.models:
                self.ensembles.pop('riw', None)
        # Create results dictionary with models under study.
        self.results = {
            k: [None for _ in range(_OUTER_FOLDS)]
            for k in self.models
        }
        if self.ensembles is not None:
            self.results_ensemble = {
                ensemble: [None for _ in range(_OUTER_FOLDS)]
                for ensemble in self.ensembles
            }
        self.best_result = [None for _ in range(_OUTER_FOLDS)]
        # Serialize benchmark environment.
        benchmark_to_pickle(self)
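
As a reading aid, here is a minimal, self-contained sketch of the pattern above: drop the models that do not match the problem type, then allocate one result slot per outer fold. The function name build_results, its arguments, and the default of 10 outer folds are illustrative assumptions, not part of the benchmark code.

def build_results(models, classification, outer_folds=10):
    """Drops mismatched models and allocates one result slot per outer fold."""
    models = dict(models)  # work on a copy so the caller's registry is untouched
    dropped = ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd') if classification else (
        'mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd')
    for key in dropped:
        models.pop(key, None)
    return models, {k: [None] * outer_folds for k in models}

# Example: build_results({'mlpc_adam': object(), 'mlpr_adam': object()}, classification=True)
# keeps only 'mlpc_adam' and maps it to a list of 10 empty result slots.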
def _standardize_data_set(data_set):
    """"""
    data_set_ext = _remove_unary_features(data_set)
    cols = data_set_ext.columns
    if is_classification(data_set_ext):
        data_set_ext[cols[-1]] = data_set_ext[cols[-1]].astype(float)
        cols = cols[:-1]
    for c in cols:
        data_set_ext[c] = (data_set_ext[c] -
                           data_set_ext[c].mean()) / data_set_ext[c].std()
    return data_set_ext
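
For illustration, a small self-contained version of the same z-scoring step on a toy frame. The explicit target_is_class flag below is only a stand-in for the codebase's is_classification check, whose internals are not shown here.

import pandas as pd

def standardize_features(df, target_is_class=False):
    """Z-scores every column; if the last column is a class label it is
    cast to float and left unstandardized, as in _standardize_data_set."""
    df = df.copy()
    cols = df.columns
    if target_is_class:
        df[cols[-1]] = df[cols[-1]].astype(float)
        cols = cols[:-1]
    for c in cols:
        df[c] = (df[c] - df[c].mean()) / df[c].std()
    return df

# Toy usage: two features plus a binary label.
toy = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [10.0, 20.0, 30.0], 'y': [0, 1, 0]})
print(standardize_features(toy, target_is_class=True))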
Example #3
    def __init__(self,
                 data_set_name,
                 metric=RootMeanSquaredError,
                 models=_MODELS):
        """Initializes benchmark environment."""

        self.data_set_name = data_set_name
        # Creates file name as combination of data set name and date.
        self.file_name = self.data_set_name + "__" + _now.strftime(
            "%Y_%m_%d__%H_%M_%S")
        # Loads samples into object.
        self.samples = [
            load_samples(data_set_name, index) for index in range(30)
        ]
        self.metric = metric
        self.models = models
        # If the data set is a classification problem, remove the regression models;
        # otherwise remove the classification models.
        if is_classification(self.samples[0][0]):
            for key in ('svr', 'mlpr', 'rfr'):
                self.models.pop(key, None)
        else:
            for key in ('svc', 'mlpc', 'rfc'):
                self.models.pop(key, None)
        # Create results dictionary with models under study.
        self.results = {
            k: [None for i in self.samples]
            for k in self.models.keys()
        }
        # Serialize benchmark environment.
        benchmark_to_pickle(self)
Example #4
    def __init__(self,
                 dataset_name,
                 learning_metric=None,
                 selection_metric=None,
                 models=None,
                 ensembles=None,
                 benchmark_id=None,
                 file_path=None):
        """Initializes benchmark environment."""

        self.benchmark_id = benchmark_id
        self.dataset_name = dataset_name
        # Creates file name as combination of dataset name, benchmark ID, and date.
        self.file_name = self.dataset_name + "_" + self.benchmark_id + "__" + _now.strftime(
            "%Y_%m_%d__%H_%M_%S")

        # Loads samples into object
        self.samples = load_standardized_samples(dataset_name, file_path)
        self.ensembles = ensembles
        self.models = models

        # TODO: make this data-set-specific handling more general.
        if self.dataset_name == 'data_batch_1':
            print(self.samples.keys())
            labels = self.samples[b'labels']
            self.samples = pd.DataFrame(self.samples[b'data'])
            self.samples = self.samples / 255
            self.samples[3072] = labels
            # Keeps only the first 1000 rows; remove this to use the full data set.
            self.samples = self.samples.head(1000)
            print(self.samples.shape)

        # If the dataset is a classification problem, remove the regression models;
        # otherwise remove the classification models.
        if is_binary(self.samples):
            self.classification = True
            self.binary = True

            if learning_metric is not None:
                self.learning_metric = learning_metric
            else:
                self.learning_metric = RootMeanSquaredError

            if selection_metric is not None:
                self.selection_metric = selection_metric
            else:
                self.selection_metric = AUROC

            for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
                self.models.pop(key, None)

        elif is_classification(self.samples):
            self.classification = True
            self.binary = False

            if learning_metric is not None:
                self.learning_metric = learning_metric
            else:
                self.learning_metric = RootMeanSquaredError

            if selection_metric is not None:
                self.selection_metric = selection_metric
            else:
                self.selection_metric = AUROC

        else:
            self.classification = False

            if learning_metric is not None:
                self.learning_metric = learning_metric
            else:
                self.learning_metric = RootMeanSquaredError

            if selection_metric is not None:
                self.selection_metric = selection_metric
            else:
                self.selection_metric = RootMeanSquaredError

            for key in ('mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd'):
                self.models.pop(key, None)

        # If the models include an MLP, remove Random Independent Weighting.
        if self.ensembles is not None:
            if 'mlpc_lbfgs' in self.models or 'mlpr_lbfgs' in self.models:
                self.ensembles.pop('riw', None)

        # Create results dictionary with models under study.
        self.results = {
            k: [None for i in range(_OUTER_FOLDS)]
            for k in self.models.keys()
        }

        if self.ensembles is not None:
            self.results_ensemble = {
                ensemble: [None for i in range(_OUTER_FOLDS)]
                for ensemble in self.ensembles.keys()
            }

        self.best_result = [None for i in range(_OUTER_FOLDS)]

        # Serialize benchmark environment.
        benchmark_to_pickle(self)
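
The data_batch_1 branch above turns a pickled image batch (a dict with b'data' and b'labels', pixel values scaled by 255) into a labeled DataFrame. A toy, self-contained equivalent with synthetic data follows; the array sizes and the subsample of 50 rows are illustrative only.

import numpy as np
import pandas as pd

# Synthetic stand-in for such a batch: 100 rows of 3072 pixel values plus labels.
batch = {b'data': np.random.randint(0, 256, size=(100, 3072)),
         b'labels': np.random.randint(0, 10, size=100).tolist()}

frame = pd.DataFrame(batch[b'data']) / 255   # scale pixels to [0, 1]
frame[3072] = batch[b'labels']               # append labels as the last column
frame = frame.head(50)                       # optional subsample, as in the code above
print(frame.shape)                           # (50, 3073)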
    def __init__(self, dataset_name, learning_metric=None, selection_metric=None, models=None, ensembles=None, benchmark_id=None):
        """Initializes benchmark environment."""
        
        self.benchmark_id = benchmark_id
        self.data_set_name = dataset_name
        # Creates file name as combination of dataset name, benchmark ID, and date; this is the benchmark name.
        self.file_name = 'c_' + self.data_set_name + "_" + self.benchmark_id + "__" + _now.strftime("%Y_%m_%d__%H_%M_%S")

        self.samples = load_pmlb_samples(dataset_name)
        self.ensembles = ensembles
        self.models = models
        
        # If dataset is classification problem, remove regression models. Else, vice versa.
        if is_classification(self.samples):
            self.classification = True
            
            if learning_metric is not None:
                self.learning_metric = learning_metric
            else:
                self.learning_metric = RootMeanSquaredError
            
            if selection_metric is not None:
                self.selection_metric = selection_metric
            else:
                self.selection_metric = AUROC
            
            for key in ('mlpr_lbfgs', 'mlpr_adam', 'mlpr_sgd'):
                self.models.pop(key, None)
        else:
            self.classification = False
            
            if learning_metric is not None:
                self.learning_metric = learning_metric
            else:
                self.learning_metric = RootMeanSquaredError
            
            if selection_metric is not None:
                self.selection_metric = selection_metric
            else:
                self.selection_metric = RootMeanSquaredError

            for key in ('mlpc_lbfgs', 'mlpc_adam', 'mlpc_sgd'):
                self.models.pop(key, None)
        
        # If the models include an MLP, remove Random Independent Weighting.
        if self.ensembles is not None:
            if 'mlpc_lbfgs' in self.models or 'mlpr_lbfgs' in self.models:
                self.ensembles.pop('riw', None)
        
        # Create results dictionary with models under study.
        self.results = {k: [None for i in range(_OUTER_FOLDS)] for k in self.models.keys()}
        
        if self.ensembles is not None:
            self.results_ensemble = {ensemble: [None for i in range(_OUTER_FOLDS)] for ensemble in self.ensembles.keys()}
        
        self.best_result = [None for i in range(_OUTER_FOLDS)]
        
        # Serialize benchmark environment.
        benchmark_to_pickle(self)
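
One design consequence shared by every variant above: the constructor deletes entries from the models (and ensembles) dictionaries it receives, so a caller that reuses one registry, such as the module-level _MODELS default in Example #3, across several benchmarks will see it shrink. A plain-dict illustration of the pitfall follows; the names and values are made up for the demo.

_MODELS = {'mlpc_adam': 'clf', 'mlpr_adam': 'reg'}

def init_benchmark(models):
    models.pop('mlpr_adam', None)   # same in-place deletion as the __init__ methods above
    return models

init_benchmark(_MODELS)
print(_MODELS)                      # {'mlpc_adam': 'clf'} -- the shared registry was mutated
init_benchmark(dict(_MODELS))       # passing a copy leaves the original intact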