Пример #1
0
    def test_normalized_score_becomes_none(self):
        """Check that a discarded tunable is never tried again.

        Regression test for a scenario observed after commit
        ``6a08dc3cf1b68b35630cae6a87783aec4e2c9f83``:

        - One tunable produces a score at least once and then fails every later trial.
        - None of the other tunables ever produces a score.
        - Once all the tuners are created, only the one that produced a score is used.
        - After enough errors that one is discarded, so ``_normalized_errors`` is empty.
        - Because ``random.choice`` was applied to the list of tunables, which still
          contained the discarded one, the discarded tunable was eventually tried again.

        The test guards against that by checking that the number of errors recorded
        for each tunable is exactly ``max_errors`` once tuning stops.
        """
        successful_calls = []

        def scorer(name, proposal):
            """Return a score on the first call only; raise on every later call."""
            if successful_calls:
                raise Exception()

            # A list serves as the "already scored" flag: rebinding a plain
            # boolean from inside this closure would require ``nonlocal``.
            successful_calls.append(1)
            return 1

        # Two tunables with an identical single integer parameter.
        tunables = {
            tunable_name: {
                'a_parameter': {'type': 'int', 'default': 0, 'range': [0, 10]}
            }
            for tunable_name in ('a_tunable', 'another_tunable')
        }

        session = BTBSession(tunables, scorer, max_errors=3)

        with pytest.raises(StopTuning):
            session.run(8)

        expected_errors = {'a_tunable': 3, 'another_tunable': 3}
        assert session.errors == expected_errors
Пример #2
0
    def test_stop(self):
        """Check that ``run()`` without an iteration count ends with ``StopTuning``."""
        # Single tunable with one integer parameter in the range [0, 2].
        parameter_spec = {'type': 'int', 'default': 0, 'range': [0, 2]}
        tunables = {'a_tunable': {'a_parameter': parameter_spec}}

        session = BTBSession(tunables, self.scorer)

        with pytest.raises(StopTuning):
            session.run()
Пример #3
0
    def test_errors(self):
        """Check that a tunable whose scoring always fails does not hide the best of the other."""
        # Both tunables expose the same integer parameter in the range [0, 2].
        tunables = {
            tunable_name: {
                'a_parameter': {'type': 'int', 'default': 0, 'range': [0, 2]}
            }
            for tunable_name in ('a_tunable', 'another_tunable')
        }

        def scorer(name, proposal):
            """Score ``a_tunable`` by its parameter value; always fail for ``another_tunable``."""
            if name != 'another_tunable':
                return proposal['a_parameter']

            raise Exception()

        session = BTBSession(tunables, scorer)

        best = session.run(4)

        assert best['name'] == 'a_tunable'
        assert best['config'] == {'a_parameter': 2}
Пример #4
0
    def tune(self, X, y, max_evals=10, scoring=None, verbose=False):
        """Search the pipeline hyper-parameters and apply the best configuration found.

        The flat tunable hyper-parameters of ``self._pipeline`` are registered in a
        ``BTBSession`` under the single key ``'0'``. Every proposal is scored with
        ``self.k_fold_validation`` and the ``'config'`` of the proposal returned by
        the session is set on the pipeline.

        Args:
            X (pandas.DataFrame or ndarray):
                Inputs to the pipeline.
            y (pandas.Series or ndarray):
                Target values.
            max_evals (int):
                Number of iterations the session is run for. The same value is
                passed to the session as ``max_errors``.
            scoring (str):
                The name of the scoring function, forwarded to
                ``k_fold_validation``.
            verbose (bool):
                Forwarded to ``BTBSession`` as its ``verbose`` flag.
        """
        flat_hyperparameters = self._pipeline.get_tunable_hyperparameters(flat=True)

        def score_proposal(_, hyparam):
            # The tunable name (first argument) is ignored: only one is registered.
            return self.k_fold_validation(hyparam, X=X, y=y, scoring=scoring)

        session = BTBSession(
            {'0': flat_hyperparameters},
            score_proposal,
            max_errors=max_evals,
            verbose=verbose,
        )

        winner = session.run(max_evals)
        self._pipeline.set_hyperparameters(winner['config'])
Пример #5
0
    def test_allow_duplicates(self):
        """Check that a session with ``allow_duplicates=True`` still returns the best proposal.

        A single tunable with one integer parameter in the range [0, 2] is run
        for 10 iterations with duplicates allowed.
        """
        tunables = {
            'a_tunable': {
                'a_parameter': {
                    'type': 'int',
                    'default': 0,
                    'range': [0, 2]
                }
            }
        }

        session = BTBSession(tunables, self.scorer, allow_duplicates=True)

        best = session.run(10)

        # Only 'a_tunable' is registered, so it is the only name the best
        # proposal can carry; the previous expectation of 'another_tunable'
        # referred to a tunable that this test never passes to the session.
        # NOTE(review): if the intent was to exercise two tunables, add
        # 'another_tunable' to ``tunables`` instead — confirm with the author.
        assert best['name'] == 'a_tunable'
        assert best['config'] == {'a_parameter': 2}
Пример #6
0
    def test_run_score_none(self):
        """Check that ``run`` records a ``None`` score when the scorer raises."""
        # setup: a mocked session whose scorer fails on every call
        tunable_name = 'test'

        session = MagicMock(spec_set=BTBSession)
        session.iterations = 0
        session._range = range
        session.best_proposal = {'test': 'config'}
        session.propose.return_value = (tunable_name, {'hp': 'test'})
        session._scorer.side_effect = Exception()

        # run
        returned = BTBSession.run(session, 1)

        # assert
        session._scorer.assert_called_once_with(tunable_name, {'hp': 'test'})
        session.record.assert_called_once_with(tunable_name, {'hp': 'test'}, None)
        assert returned == {'test': 'config'}
        assert session.iterations == 1
Пример #7
0
    def test_run_score(self):
        """Check that ``run`` records the scorer's value and returns ``best_proposal``."""
        # setup: a mocked session whose scorer always returns 1
        tunable_name, config = 'test', 'config'

        session = MagicMock(spec_set=BTBSession)
        session.iterations = 0
        session._range = range
        session.best_proposal = {'test': 'config'}
        session.propose.return_value = (tunable_name, config)
        session._scorer.return_value = 1

        # run
        returned = BTBSession.run(session, 1)

        # assert
        session._scorer.assert_called_once_with(tunable_name, config)
        session.record.assert_called_once_with(tunable_name, config, 1)
        assert returned == {'test': 'config'}
        assert session.iterations == 1
Пример #8
0
    def test_minimize(self):
        """Check that with ``maximize=False`` the best proposal is the parameter value 0."""
        # Single tunable with one integer parameter in the range [0, 2].
        parameter_spec = {'type': 'int', 'default': 0, 'range': [0, 2]}
        tunables = {'a_tunable': {'a_parameter': parameter_spec}}

        session = BTBSession(tunables, self.scorer, maximize=False)

        best = session.run(3)

        # The returned proposal must be the one stored on the session.
        assert best == session.best_proposal
        assert best['name'] == 'a_tunable'
        assert best['config'] == {'a_parameter': 0}
Пример #9
0
    def test_allow_errors(self):
        """Check that a session with ``max_errors=10`` survives failures and finds the scoring config."""
        # Single tunable with one integer parameter in the range [0, 1].
        parameter_spec = {'type': 'int', 'default': 0, 'range': [0, 1]}
        tunables = {'a_tunable': {'a_parameter': parameter_spec}}

        def scorer(name, proposal):
            """Return 1 for any non-zero ``a_parameter``; raise when it is 0."""
            if proposal['a_parameter'] != 0:
                return 1

            raise Exception()

        session = BTBSession(tunables, scorer, max_errors=10)

        best = session.run(10)

        assert best['name'] == 'a_tunable'
        assert best['config'] == {'a_parameter': 1}
Пример #10
0
    def test_multiple(self):
        """Check that the best proposal is selected across two tunables."""
        # Both tunables expose the same integer parameter in the range [0, 2].
        tunables = {
            tunable_name: {
                'a_parameter': {'type': 'int', 'default': 0, 'range': [0, 2]}
            }
            for tunable_name in ('a_tunable', 'another_tunable')
        }

        session = BTBSession(tunables, self.scorer)

        best = session.run(6)

        assert best['name'] == 'another_tunable'
        assert best['config'] == {'a_parameter': 2}
Пример #11
0
def train_btb(X_train,X_test,y_train,y_test,mtype,common_name_model,problemtype,classes,default_featurenames,transform_model,settings,model_session):
	"""Tune candidate models with a BTBSession, refit the best one and pickle it.

	A fresh ``btb_session`` working folder is created in the current directory,
	the session's train/test CSV files are copied into it, and a dataset folder
	is prepared from the output of ``create_json``. Depending on ``mtype`` a
	classification (``'c'``) or regression (``'r'``) search is then run for 100
	iterations, the best proposal is rebuilt, fitted on the training data,
	evaluated on the test data and saved as ``<common_name_model>.pickle`` in
	the original working directory.

	Args:
		X_train, y_train: Training features and targets, used for
			cross-validated scoring and for the final fit.
		X_test, y_test: Held-out features and targets, used only for the
			final printed score.
		mtype (str): ``'c'`` for classification or ``'r'`` for regression.
			For any other value no model is trained or saved, and the working
			directory is left inside the dataset folder.
		common_name_model (str): Base name of the model; the text before the
			first ``'_'`` is used as the CSV file prefix.
		problemtype, classes, default_featurenames, transform_model, settings:
			Not used by this function.
		model_session (str): Name of the session folder (relative to the
			current directory) that contains the ``data`` directory.

	Returns:
		tuple: ``(model_name, model_dir, files)`` where ``model_name`` is the
		pickle file name, ``model_dir`` is the working directory at return
		time and ``files`` is ``[model_name, 'btb_session']``.
	"""

	# create file names
	model_name = common_name_model + '.pickle'
	folder = 'btb_session'
	csvname = common_name_model.split('_')[0]
	curdir = os.getcwd()
	files = []

	# make a temporary folder for the training session, replacing a stale one
	try:
		os.mkdir(folder)
	except OSError:
		shutil.rmtree(folder)
		os.mkdir(folder)
	os.chdir(folder)

	# get training and testing data; prefer the transformed files and fall
	# back to the raw ones when they cannot be copied
	data_prefix = curdir + '/' + model_session + '/data/' + csvname
	try:
		shutil.copy(data_prefix + '_train_transformed.csv', os.getcwd() + '/train.csv')
		shutil.copy(data_prefix + '_test_transformed.csv', os.getcwd() + '/test.csv')
	except OSError:
		shutil.copy(data_prefix + '_train.csv', os.getcwd() + '/train.csv')
		shutil.copy(data_prefix + '_test.csv', os.getcwd() + '/test.csv')

	# create required .JSON
	dataset_id, filename = create_json(folder, 'train.csv')
	os.mkdir(dataset_id)
	os.chdir(dataset_id)
	os.mkdir('tables')
	shutil.copy(curdir + '/' + folder + '/train.csv', os.getcwd() + '/tables/train.csv')

	if mtype == 'c':

		models = {
			'RF': RandomForestClassifier,
			'SVC': SVC,
		}

		def build_model(name, hyperparameters):
			model_class = models[name]
			return model_class(random_state=0, **hyperparameters)

		def score_model(name, hyperparameters):
			model = build_model(name, hyperparameters)
			scores = cross_val_score(model, X_train, y_train)
			return scores.mean()

		rf_hyperparams = {
			'n_estimators': IntHyperParam(min=10, max=500),
			'max_depth': IntHyperParam(min=10, max=500),
		}

		rf_tunable = Tunable(rf_hyperparams)
		print(rf_tunable)

		svc_hyperparams = {
			'C': FloatHyperParam(min=0.01, max=10.0),
			'gamma': FloatHyperParam(0.000000001, 0.0000001),
		}

		svc_tunable = Tunable(svc_hyperparams)
		print(svc_tunable)

		tuners = {
			'RF': rf_tunable,
			'SVC': svc_tunable,
		}

		print(tuners)

		session = BTBSession(tuners, score_model, verbose=True)
		best_proposal = session.run(iterations=100)
		best_model = build_model(best_proposal['name'], best_proposal['config'])
		best_model.fit(X_train, y_train)
		accuracy = best_model.score(X_test, y_test)

		print('ACCURACY:')
		print(accuracy)

		# now save the model in .pickle
		os.chdir(curdir)
		with open(model_name, 'wb') as f:
			pickle.dump(best_model, f)

	elif mtype == 'r':

		tunables = {
			'random_forest': {
				'n_estimators': {'type': 'int', 'default': 2, 'range': [1, 1000]},
				'max_features': {'type': 'str', 'default': 'log2', 'range': [None, 'auto', 'log2', 'sqrt']},
				'min_samples_split': {'type': 'int', 'default': 2, 'range': [2, 20]},
				'min_samples_leaf': {'type': 'int', 'default': 2, 'range': [1, 20]},
			},
			'extra_trees': {
				'n_estimators': {'type': 'int', 'default': 2, 'range': [1, 1000]},
				'max_features': {'type': 'str', 'default': 'log2', 'range': [None, 'auto', 'log2', 'sqrt']},
				'min_samples_split': {'type': 'int', 'default': 2, 'range': [2, 20]},
				'min_samples_leaf': {'type': 'int', 'default': 2, 'range': [1, 20]},
			}
		}

		models = {
			'random_forest': RandomForestRegressor,
			'extra_trees': ExtraTreesRegressor,
		}

		def build_model(name, hyperparameters):
			model_class = models[name]
			return model_class(random_state=0, **hyperparameters)

		def score_model(name, hyperparameters):
			model = build_model(name, hyperparameters)
			r2_scorer = make_scorer(r2_score)
			scores = cross_val_score(model, X_train, y_train, scoring=r2_scorer)
			return scores.mean()

		session = BTBSession(tunables, score_model, verbose=True)
		best_proposal = session.run(iterations=100)
		best_model = build_model(best_proposal['name'], best_proposal['config'])

		best_model.fit(X_train, y_train)
		pred = best_model.predict(X_test)

		# The result must not be bound to the name ``r2_score``: assigning it
		# anywhere in this function would make ``r2_score`` a local variable,
		# leaving the reference inside ``score_model`` (and this very call)
		# unbound instead of resolving to the imported metric function.
		r2 = r2_score(y_test, pred)

		print('R2 score!!')
		print(r2)

		# now save the model in .pickle
		os.chdir(curdir)
		with open(model_name, 'wb') as f:
			pickle.dump(best_model, f)

	files.append(model_name)
	files.append(folder)
	model_dir = os.getcwd()

	return model_name, model_dir, files
Пример #12
0
def test_session():
    """Smoke-run a two-iteration ``BTBSession`` over two tree-ensemble regressors."""
    dataset = load_dataset()

    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.3,
        random_state=0,
    )

    estimators = {
        'random_forest': RandomForestRegressor,
        'extra_trees': ExtraTreesRegressor,
    }

    def search_space():
        """Return a fresh copy of the search space used for both estimators."""
        return {
            'n_estimators': {'type': 'int', 'default': 2, 'range': [1, 1000]},
            'max_features': {
                'type': 'str',
                'default': 'log2',
                'range': [None, 'auto', 'log2', 'sqrt'],
            },
            'min_samples_split': {'type': 'int', 'default': 2, 'range': [2, 20]},
            'min_samples_leaf': {'type': 'int', 'default': 2, 'range': [1, 20]},
        }

    def score_model(name, hyperparameters):
        """Return the mean cross-validated R2 of the named estimator on the training split."""
        estimator = estimators[name](random_state=0, **hyperparameters)
        fold_scores = cross_val_score(
            estimator, X_train, y_train, scoring=make_scorer(r2_score))
        return fold_scores.mean()

    tunables = {
        estimator_name: search_space()
        for estimator_name in ('random_forest', 'extra_trees')
    }

    session = BTBSession(tunables, score_model, verbose=True)
    session.run(2)