def test_fit_biginc(self):
    """Fit with a fit_increment larger than max_evals and check the trial count."""
    estimator_kwargs = dict(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        max_evals=5,
        trial_timeout=5.0,
        fit_increment=20,
    )
    model = hyperopt_estimator(**estimator_kwargs)
    model.fit(self.X, self.Y)

    # -- the increment (20) exceeds max_evals (5); exactly 5 trials
    #    must still be recorded
    n_trials = len(model.trials.trials)
    assert n_trials == 5
def test_sparse_random_projection(self):
    """Smoke-test fit/score with a sparse random projection preprocessor.

    The test draws its own 8-feature data set and derives the binary labels
    from the sign of the first feature of that same data.
    """
    # restrict n_components to be less than or equal to data dimension
    # to prevent sklearn warnings from printing during tests
    n_components = scope.int(hp.quniform(
        'preprocessing.n_components', low=1, high=8, q=1
    ))
    model = hyperopt_estimator(
        classifier=components.gaussian_nb('classifier'),
        preprocessing=[
            components.sparse_random_projection(
                'preprocessing',
                n_components=n_components,
            )
        ],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    X_train = np.random.randn(1000, 8)
    # Labels come from the features that are actually fitted; previously
    # they were read from self.X_train, an unrelated array that only worked
    # if it happened to have the same number of rows.
    Y_train = (X_train[:, 0] > 0).astype('int')
    X_test = np.random.randn(1000, 8)
    Y_test = (X_test[:, 0] > 0).astype('int')
    model.fit(X_train, Y_train)
    model.score(X_test, Y_test)
def test_fit_iter_basic(self):
    """Step through fit_iter and check the trials object after each step."""
    model = hyperopt_estimator(verbose=1, trial_timeout=5.0)
    step = 0
    for trials in model.fit_iter(self.X, self.Y):
        # Each step must yield the estimator's own trials object, holding
        # as many trials as steps taken so far.
        assert trials is model.trials
        assert len(trials.trials) == step
        if step == 10:
            break
        step += 1
def sklearn_digits(classifier, algorithm, max_evals=100, seed=1,
                   filename='none', preproc=[], loss=None):
    """Run a hyperparameter search on the sklearn digits data set.

    A seeded random permutation holds out 20% of the samples as the test
    split. The estimator and both splits are handed to ``find_model``
    together with ``filename + '.out'``. Unless the module-level
    ``suppress_output`` flag is set, ``filename + '.dump'`` is passed to the
    estimator as its ``fit_increment_dump_filename``.
    """
    global suppress_output
    dump_file = None if suppress_output else filename + '.dump'

    estim = hyperopt_estimator(
        classifier=classifier,
        algo=algorithm,
        preprocessing=preproc,
        max_evals=max_evals,
        trial_timeout=60,
        fit_increment_dump_filename=dump_file,
        loss_fn=loss,
        verbose=1,
    )

    out_path = filename + '.out'

    digits = load_digits()
    X, y = digits.data, digits.target

    test_size = int(0.2 * len(y))
    np.random.seed(seed)
    order = np.random.permutation(len(X))
    train_idx, test_idx = order[:-test_size], order[-test_size:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    print(y_train.shape)
    print(y_test.shape)

    find_model(X_train, y_train, X_test, y_test, estim, out_path)
def test_continuous_loss_fn():
    """
    Demonstrate using a custom loss function with the continuous_loss_fn
    option.
    """
    from sklearn.metrics import log_loss

    # Generate some random data: two stacked 1000-row blocks with means 0
    # and 1, followed column-wise by 10 zero-mean columns.
    block_mean0 = np.random.normal(0, 1, size=(1000, 10))
    block_mean1 = np.random.normal(1, 1, size=(1000, 10))
    extra_columns = np.random.normal(0, 1, size=(2000, 10))
    X = np.hstack([np.vstack([block_mean0, block_mean1]), extra_columns])
    y = np.zeros(2000)
    y[:1000] = 1

    def loss_function(targ, pred):
        # hyperopt_estimator flattens the prediction when saving it.  This
        # also affects multilabel classification.
        two_column = pred.reshape((-1, 2))
        return log_loss(targ, two_column[:, 1])

    # Try to fit an SGD model using log_loss as the loss function
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
        loss_fn=loss_function,
        continuous_loss_fn=True,
    )
    cls.fit(X, y, cv_shuffle=True)
def train_one_model(
    name_classifier,
    X: np.array,
    y: np.array,
    mix_algo,
    max_evals: int,
    timeout: int,
    n: int,
) -> Tuple:
    """Optimize one classifier with hyperopt and return its refitted learner.

    Arguments:
        name_classifier -- pair (name, classifier factory); the factory is
            called with the label "classifier"
        X -- features
        y -- labels
        mix_algo -- suggestion algorithm passed to the estimator as ``algo``
        max_evals -- maximum number of hyperopt evaluations
        timeout -- per-trial timeout passed as ``trial_timeout``
        n -- number of cross-validation folds

    Returns:
        tuple -- (name, learner of the best model after retraining on X, y)
    """
    name, classifier = name_classifier
    trainlogger.info("i'm using a timeout of {}".format(timeout))

    estimator = hyperopt_estimator(
        classifier=classifier("classifier"),
        algo=mix_algo,
        trial_timeout=timeout,
        preprocessing=[],
        max_evals=max_evals,
        # f1 macro is probably more meaningful than accuracy
        loss_fn=f1lossfn,
        # continuous_loss_fn = True,
        seed=RANDOM_SEED,
    )

    trainlogger.info("training {}".format(name))

    # hyperopt-sklearn takes care of the cross validations
    estimator.fit(X, y, cv_shuffle=True, n_folds=n)
    estimator.retrain_best_model_on_full_data(X, y)

    learner = estimator.best_model()["learner"]
    return name, learner
def sklearn_digits(classifier=None):
    """Fit a hyperopt estimator on the sklearn digits data and print results.

    When no classifier is given, ``any_classifier('any')`` is used. The last
    50 samples of a seeded random permutation form the test split.
    """
    if classifier is None:
        classifier = any_classifier('any')
    estim = hyperopt_estimator(classifier=classifier)

    digits = load_digits()
    X, y = digits.data, digits.target

    test_size = 50
    np.random.seed(0)
    order = np.random.permutation(len(X))
    train_idx, test_idx = order[:-test_size], order[-test_size:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    estim.fit(X_train, y_train)
    pred = estim.predict(X_test)

    # Report predictions, ground truth, their score and the chosen model.
    print(pred)
    print(y_test)
    print(score(pred, y_test))
    print(estim.best_model())
def test_sparse_input():
    """
    Ensure the estimator can handle sparse X matrices.
    """
    import scipy.sparse as ss

    # Generate some random sparse data: every row gets `nnz` ones placed in
    # distinct, randomly chosen columns.
    nrows, ncols, nnz = 100, 50, 10
    ntrue = nrows // 2

    columns_per_row = [
        np.random.choice(range(ncols), size=nnz, replace=False)
        for _ in range(nrows)
    ]
    C = [col for feats in columns_per_row for col in feats]
    R = [r for r in range(nrows) for _ in range(nnz)]
    D = [1] * (nrows * nnz)

    X = ss.csr_matrix((D, (R, C)), shape=(nrows, ncols))
    y = np.zeros(nrows)
    y[:ntrue] = 1

    # Try to fit an SGD model
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls.fit(X, y)
def test_sparse_input():
    """
    Ensure the estimator can handle sparse X matrices.
    """
    import scipy.sparse as ss

    # Generate some random sparse data: every row gets `nnz` ones placed in
    # distinct, randomly chosen columns.
    nrows, ncols, nnz = 100, 50, 10
    ntrue = nrows // 2

    columns_per_row = [
        np.random.choice(range(ncols), size=nnz, replace=False)
        for _ in range(nrows)
    ]
    C = [col for feats in columns_per_row for col in feats]
    R = [r for r in range(nrows) for _ in range(nnz)]
    D = [1] * (nrows * nnz)

    X = ss.csr_matrix((D, (R, C)), shape=(nrows, ncols))
    y = np.zeros(nrows)
    y[:ntrue] = 1

    # Try to fit an SGD model
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls.fit(X, y)
def test_sparse_random_projection(self):
    """Smoke-test fit/score with a sparse random projection preprocessor.

    The test draws its own 8-feature data set and derives the binary labels
    from the sign of the first feature of that same data.
    """
    # restrict n_components to be less than or equal to data dimension
    # to prevent sklearn warnings from printing during tests
    n_components = scope.int(hp.quniform(
        'preprocessing.n_components', low=1, high=8, q=1
    ))
    model = hyperopt_estimator(
        classifier=components.gaussian_nb('classifier'),
        preprocessing=[
            components.sparse_random_projection(
                'preprocessing',
                n_components=n_components,
            )
        ],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    X_train = np.random.randn(1000, 8)
    # Labels come from the features that are actually fitted; previously
    # they were read from self.X_train, an unrelated array that only worked
    # if it happened to have the same number of rows.
    Y_train = (X_train[:, 0] > 0).astype('int')
    X_test = np.random.randn(1000, 8)
    Y_test = (X_test[:, 0] > 0).astype('int')
    model.fit(X_train, Y_train)
    model.score(X_test, Y_test)
def test_continuous_loss_fn():
    """
    Demonstrate using a custom loss function with the continuous_loss_fn
    option.
    """
    from sklearn.metrics import log_loss

    # Generate some random data: two stacked 1000-row blocks with means 0
    # and 1, followed column-wise by 10 zero-mean columns.
    block_mean0 = np.random.normal(0, 1, size=(1000, 10))
    block_mean1 = np.random.normal(1, 1, size=(1000, 10))
    extra_columns = np.random.normal(0, 1, size=(2000, 10))
    X = np.hstack([np.vstack([block_mean0, block_mean1]), extra_columns])
    y = np.zeros(2000)
    y[:1000] = 1

    def loss_function(targ, pred):
        # hyperopt_estimator flattens the prediction when saving it.  This
        # also affects multilabel classification.
        two_column = pred.reshape((-1, 2))
        return log_loss(targ, two_column[:, 1])

    # Try to fit an SGD model using log_loss as the loss function
    cls = hyperopt_estimator(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
        loss_fn=loss_function,
        continuous_loss_fn=True,
    )
    cls.fit(X, y, cv_shuffle=True)
def test_smoke(self):
    """Verify the space argument is accepted and runs."""
    search_space = components.generic_space()
    model = hyperopt_estimator(
        space=search_space,
        verbose=1,
        max_evals=10,
        trial_timeout=5,
    )
    model.fit(self.X, self.Y)
def test_fit(self):
    """Fit over any classifier and check that max_evals trials were run."""
    expected_trials = 5
    model = hyperopt_estimator(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        max_evals=5,
        trial_timeout=5.0,
    )
    model.fit(self.X, self.Y)
    recorded = model.trials.trials
    assert len(recorded) == expected_trials
def test_regressor(self):
    """Smoke-test fitting and scoring with the regressor built by reg_fn."""
    settings = dict(
        regressor=reg_fn('regressor'),
        preprocessing=[],
        algo=rand.suggest,
        trial_timeout=50.0,
        max_evals=2,
        verbose=True,
    )
    model = hyperopt_estimator(**settings)
    model.fit(self.X_train, self.Y_train)
    model.score(self.X_test, self.Y_test)
def hyperopt_850_556():
    """Run a hyperopt-sklearn search on ten files of the '850+556' data.

    For each of the ten data files (numbered 1 through 10) an estimator is
    fitted on the training part and evaluated on the test part. Per-file
    results and the final means are written to
    ``outcome_hyperopt_any.any.txt``; only the initial path message goes to
    the real standard output.

    The redirect of ``sys.stdout`` is undone, and the report file closed,
    when the function finishes, whether it returns or raises.
    """
    # Load data
    dir_key = '1406'
    data_key = '850+556'
    dir_path = dir_path_dict[dir_key]
    data_str = dir_path + data_str_dict[data_key]

    # Redirect stdout to file
    stdout_path = 'outcome_hyperopt_any.any.txt'
    print('[INFO] stdout_path:\t{}'.format(stdout_path))
    original_stdout = sys.stdout
    with open(stdout_path, 'w') as report:
        sys.stdout = report
        try:
            print("[INFO] params:\talgo=tpe.suggest")

            # Train
            scores = []
            sensis = []
            specis = []
            for i in range(10):
                # Load data
                data_path = data_str.format(i + 1)
                print(data_path)
                trainset, testset = get_dataset(data_path=data_path)
                train_data, train_label = trainset
                test_data, test_label = testset

                # Create the estimator object.  A restricted alternative
                # that was tried before:
                # estim = hyperopt_estimator(classifier=any_classifier('mySVC'),
                #                            algo=tpe.suggest,
                #                            preprocessing=[standard_scaler('std_scl')])
                estim = hyperopt_estimator(algo=tpe.suggest, seed=RANDOM_SEED)

                # Search the space of classifiers and preprocessing steps and
                # their respective hyperparameters in sklearn to fit a model
                # to the data
                estim.fit(train_data, train_label)

                # show instances of the best classifier
                model = estim.best_model()
                print(model)

                # Make a prediction using the optimized model
                prediction = estim.predict(test_data)
                # float() forces true division; int / int truncates to an
                # integer on Python 2.
                error = (np.count_nonzero(prediction - test_label)
                         / float(test_data.shape[0]))
                # presumably sensitivity and specificity -- confirm against
                # my_scores
                sensi, speci = my_scores(test_label, prediction)
                print('{} {} {}'.format(1 - error, sensi, speci))

                # Report the accuracy of the classifier on a given set of data
                score = estim.score(test_data, test_label)
                print(score)

                scores.append(score)
                sensis.append(sensi)
                specis.append(speci)

            print(scores)
            print("accur:\t{}\tstd:\t{}".format(np.mean(scores), np.std(scores)))
            print("sensi:\t{}".format(np.mean(sensis)))
            print("speci:\t{}".format(np.mean(specis)))
        finally:
            # Always hand the real stdout back to the rest of the program.
            sys.stdout = original_stdout
def test_classifier(self):
    """Smoke-test the classifier built by clf_fn on the multilabel targets."""
    settings = dict(
        classifier=clf_fn('classifier'),
        preprocessing=[],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    model = hyperopt_estimator(**settings)
    model.fit(self.X_train, self.Y_train_multilabel)
    model.score(self.X_test, self.Y_test_multilabel)
def test_regressor(self):
    """Smoke-test fitting and scoring with the regressor built by reg_fn."""
    settings = dict(
        regressor=reg_fn('regressor'),
        preprocessing=[],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    model = hyperopt_estimator(**settings)
    model.fit(self.X_train, self.Y_train)
    model.score(self.X_test, self.Y_test)
def test_preprocessing(self):
    """Smoke-test the preprocessor built by pre_fn ahead of a Gaussian NB."""
    preprocessing_steps = [pre_fn('preprocessing')]
    model = hyperopt_estimator(
        classifier=components.gaussian_nb('classifier'),
        preprocessing=preprocessing_steps,
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    ) if False else hyperopt_estimator(
        classifier=components.gaussian_nb('classifier'),
        preprocessing=preprocessing_steps,
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    model.fit(self.X_train, self.Y_train)
    model.score(self.X_test, self.Y_test)
def test_multinomial_nb(self):
    """Smoke-test the multinomial naive Bayes component."""
    settings = dict(
        classifier=components.multinomial_nb('classifier'),
        preprocessing=[],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    model = hyperopt_estimator(**settings)

    # Inputs for MultinomialNB must be non-negative
    nonneg_train = np.abs(self.X_train)
    model.fit(nonneg_train, self.Y_train)
    nonneg_test = np.abs(self.X_test)
    model.score(nonneg_test, self.Y_test)
def sklearn_newsgroups(classifier, algorithm, max_evals=100, seed=1,
                       filename='none', preproc=[], loss=None):
    """Run a hyperparameter search on the 20 newsgroups data set.

    The estimator always uses a single ``tfidf`` preprocessing step; the
    ``preproc`` and ``seed`` arguments are accepted but not used here.
    ``REMOVE_HEADERS`` controls whether headers, footers and quotes are
    stripped when fetching, and ``PRE_VECTORIZE`` whether the documents are
    turned into TF-IDF matrices before being handed to ``find_model``.
    """
    global suppress_output
    dump_file = None if suppress_output else filename + '.dump'

    estim = hyperopt_estimator(
        classifier=classifier,
        algo=algorithm,
        preprocessing=[tfidf('tfidf')],
        max_evals=max_evals,
        trial_timeout=240,
        fit_increment_dump_filename=dump_file,
        loss_fn=loss,
    )

    out_path = filename + '.out'

    # Only pass `remove` when stripping was requested.
    fetch_options = {}
    if REMOVE_HEADERS:
        fetch_options['remove'] = ('headers', 'footers', 'quotes')
    train = fetch_20newsgroups(subset='train', **fetch_options)
    test = fetch_20newsgroups(subset='test', **fetch_options)

    if PRE_VECTORIZE:
        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(train.data)
        y_train = train.target
        X_test = vectorizer.transform(test.data)
        y_test = test.target
    else:
        X_train, y_train = train.data, train.target
        X_test, y_test = test.data, test.target

    print(y_train.shape)
    print(y_test.shape)

    find_model(X_train, y_train, X_test, y_test, estim, out_path)
def test_one_hot_encoder(self):
    """Smoke-test the one-hot encoder ahead of a multinomial naive Bayes.

    The model is fitted and scored on the same integer-converted copy of
    ``self.X_test``.
    """
    # requires a classifier that can handle sparse data
    model = hyperopt_estimator(
        classifier=components.multinomial_nb('classifier'),
        preprocessing=[components.one_hot_encoder('preprocessing')],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )

    # Inputs for one_hot_encoder must be non-negative integers.
    # The builtin `int` replaces the `np.int` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    X_nonneg_int = np.abs(np.round(self.X_test).astype(int))
    model.fit(X_nonneg_int, self.Y_test)
    model.score(X_nonneg_int, self.Y_test)
def test_warm_start(self):
    """Check that warm_start=True adds new trials to the existing ones."""
    model = hyperopt_estimator(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        max_evals=5,
        trial_timeout=5.0,
    )

    def check_params(expected_algo, expected_max_evals):
        # Read the parameters back through the estimator's public getter.
        current = model.get_params()
        assert current['algo'] == expected_algo
        assert current['max_evals'] == expected_max_evals

    check_params(rand.suggest, 5)
    model.fit(self.X, self.Y, warm_start=False)
    assert len(model.trials.trials) == 5

    model.set_params(algo=tpe.suggest, max_evals=10)
    check_params(tpe.suggest, 10)
    model.fit(self.X, self.Y, warm_start=True)
    # 5 trials from the first fit + 10 from the second = 15.
    assert len(model.trials.trials) == 15
def sklearn_newsgroups(classifier, algorithm, max_evals=100, seed=1,
                       filename='none', preproc=[], loss=None):
    """Run a hyperparameter search on the 20 newsgroups data set.

    The estimator always uses a single ``tfidf`` preprocessing step; the
    ``preproc`` and ``seed`` arguments are accepted but not used here.
    ``REMOVE_HEADERS`` controls whether headers, footers and quotes are
    stripped when fetching, and ``PRE_VECTORIZE`` whether the documents are
    turned into TF-IDF matrices before being handed to ``find_model``.
    """
    global suppress_output
    dump_file = None if suppress_output else filename + '.dump'

    estim = hyperopt_estimator(
        classifier=classifier,
        algo=algorithm,
        preprocessing=[tfidf('tfidf')],
        max_evals=max_evals,
        trial_timeout=240,
        fit_increment_dump_filename=dump_file,
        loss_fn=loss,
    )

    out_path = filename + '.out'

    # Only pass `remove` when stripping was requested.
    fetch_options = {}
    if REMOVE_HEADERS:
        fetch_options['remove'] = ('headers', 'footers', 'quotes')
    train = fetch_20newsgroups(subset='train', **fetch_options)
    test = fetch_20newsgroups(subset='test', **fetch_options)

    if PRE_VECTORIZE:
        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(train.data)
        y_train = train.target
        X_test = vectorizer.transform(test.data)
        y_test = test.target
    else:
        X_train, y_train = train.data, train.target
        X_test, y_test = test.data, test.target

    print(y_train.shape)
    print(y_test.shape)

    find_model(X_train, y_train, X_test, y_test, estim, out_path)
def sklearn_convex(classifier, algorithm, max_evals=100, seed=1,
                   filename='none', preproc=[], loss=None):
    """Run a hyperparameter search on the 'convex' classification problem.

    The data set is downloaded through ``dataset_store``. The training and
    validation splits are concatenated, and the combined set is handed to
    ``find_model`` together with the test split and ``filename + '.out'``.
    The ``seed`` argument is accepted but not used here.
    """
    global suppress_output
    dump_file = None if suppress_output else filename + '.dump'

    estim = hyperopt_estimator(
        classifier=classifier,
        algo=algorithm,
        preprocessing=preproc,
        max_evals=max_evals,
        trial_timeout=240,
        fit_increment_dump_filename=dump_file,
        loss_fn=loss,
    )

    out_path = filename + '.out'

    dataset_store.download('convex')
    splits = dataset_store.get_classification_problem('convex')
    trainset, validset, testset = splits

    X_train, y_train = trainset.data.mem_data[0], trainset.data.mem_data[1]
    X_valid, y_valid = validset.data.mem_data[0], validset.data.mem_data[1]
    X_test, y_test = testset.data.mem_data[0], testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    for labels in (y_train, y_valid, y_test):
        print(labels.shape)

    # The search runs on train + validation combined, not on train alone.
    find_model(X_fulltrain, y_fulltrain, X_test, y_test, estim, out_path)
def sklearn_digits(classifier, algorithm, max_evals=100, seed=1,
                   filename='none', preproc=[], loss=None):
    """Run a hyperparameter search on the sklearn digits data set.

    A seeded random permutation holds out 20% of the samples as the test
    split. The estimator and both splits are handed to ``find_model``
    together with ``filename + '.out'``. Unless the module-level
    ``suppress_output`` flag is set, ``filename + '.dump'`` is passed to the
    estimator as its ``fit_increment_dump_filename``.
    """
    global suppress_output
    dump_file = None if suppress_output else filename + '.dump'

    estim = hyperopt_estimator(
        classifier=classifier,
        algo=algorithm,
        preprocessing=preproc,
        max_evals=max_evals,
        trial_timeout=60,
        fit_increment_dump_filename=dump_file,
        loss_fn=loss,
        verbose=1,
    )

    out_path = filename + '.out'

    digits = load_digits()
    X, y = digits.data, digits.target

    test_size = int(0.2 * len(y))
    np.random.seed(seed)
    order = np.random.permutation(len(X))
    train_idx, test_idx = order[:-test_size], order[-test_size:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    print(y_train.shape)
    print(y_test.shape)

    find_model(X_train, y_train, X_test, y_test, estim, out_path)
def test_tfidf(self):
    """Smoke-test the tfidf preprocessor on a four-document corpus."""
    # requires a classifier that can handle sparse data
    settings = dict(
        classifier=components.multinomial_nb('classifier'),
        preprocessing=[components.tfidf('preprocessing')],
        algo=rand.suggest,
        trial_timeout=5.0,
        max_evals=5,
    )
    model = hyperopt_estimator(**settings)

    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    labels = [0, 1, 2, 0]
    X = np.array(documents)
    Y = np.array(labels)

    model.fit(X, Y)
    model.score(X, Y)
def test_crossvalidation():
    """
    Demonstrate performing a k-fold CV using the fit() method.
    """
    # Generate some random data: two stacked 1000-row blocks with means 0
    # and 1, followed column-wise by 10 zero-mean columns.
    block_mean0 = np.random.normal(0, 1, size=(1000, 10))
    block_mean1 = np.random.normal(1, 1, size=(1000, 10))
    extra_columns = np.random.normal(0, 1, size=(2000, 10))
    X = np.hstack([np.vstack([block_mean0, block_mean1]), extra_columns])
    y = np.zeros(2000)
    y[:1000] = 1

    # Try to fit a model
    sgd_search = dict(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls = hyperopt_estimator(**sgd_search)
    cls.fit(X, y, cv_shuffle=True, n_folds=5)
def test_crossvalidation():
    """
    Demonstrate performing a k-fold CV using the fit() method.
    """
    # Generate some random data: two stacked 1000-row blocks with means 0
    # and 1, followed column-wise by 10 zero-mean columns.
    block_mean0 = np.random.normal(0, 1, size=(1000, 10))
    block_mean1 = np.random.normal(1, 1, size=(1000, 10))
    extra_columns = np.random.normal(0, 1, size=(2000, 10))
    X = np.hstack([np.vstack([block_mean0, block_mean1]), extra_columns])
    y = np.zeros(2000)
    y[:1000] = 1

    # Try to fit a model
    sgd_search = dict(
        classifier=components.sgd('sgd', loss='log'),
        preprocessing=[],
    )
    cls = hyperopt_estimator(**sgd_search)
    cls.fit(X, y, cv_shuffle=True, n_folds=5)
def sklearn_convex(classifier, algorithm, max_evals=100, seed=1,
                   filename='none', preproc=[], loss=None):
    """Run a hyperparameter search on the 'convex' classification problem.

    The data set is downloaded through ``dataset_store``. The training and
    validation splits are concatenated, and the combined set is handed to
    ``find_model`` together with the test split and ``filename + '.out'``.
    The ``seed`` argument is accepted but not used here.
    """
    global suppress_output
    dump_file = None if suppress_output else filename + '.dump'

    estim = hyperopt_estimator(
        classifier=classifier,
        algo=algorithm,
        preprocessing=preproc,
        max_evals=max_evals,
        trial_timeout=240,
        fit_increment_dump_filename=dump_file,
        loss_fn=loss,
    )

    out_path = filename + '.out'

    dataset_store.download('convex')
    splits = dataset_store.get_classification_problem('convex')
    trainset, validset, testset = splits

    X_train, y_train = trainset.data.mem_data[0], trainset.data.mem_data[1]
    X_valid, y_valid = validset.data.mem_data[0], validset.data.mem_data[1]
    X_test, y_test = testset.data.mem_data[0], testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    for labels in (y_train, y_valid, y_test):
        print(labels.shape)

    # The search runs on train + validation combined, not on train alone.
    find_model(X_fulltrain, y_fulltrain, X_test, y_test, estim, out_path)
def mnist_digits():
    """Fit a hyperopt estimator on 'MNIST original' and print the results.

    The last 20% of a seeded random permutation of the samples forms the
    test split.
    """
    estim = hyperopt_estimator(classifier=any_classifier('hai'))

    digits = fetch_mldata('MNIST original')
    X, y = digits.data, digits.target

    test_size = int(0.2 * len(y))
    np.random.seed(0)
    order = np.random.permutation(len(X))
    train_idx, test_idx = order[:-test_size], order[-test_size:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    estim.fit(X_train, y_train)
    pred = estim.predict(X_test)

    # Report predictions, ground truth, their score and the chosen model.
    print(pred)
    print(y_test)
    print(score(pred, y_test))
    print(estim.best_model())
def main():
    """Optimize every classifier in CLASSIFIERS, then evaluate an ensemble.

    Steps, in order: create the comet experiment, load and standardize the
    feature splits (saving the fitted scaler), load the labels, run one
    hyperopt search per classifier, evaluate the optimized models, and
    finally evaluate a soft-voting ensemble of them calibrated on the
    validation split.
    """
    experiment = Experiment(
        api_key=os.getenv("COMET_API_KEY", None),
        project_name="mof-oxidation-states",
    )

    print("Loading Data")
    X_train = np.load(FEAT_TRAIN_PATH)
    X_valid = np.load(FEAT_VALID_PATH)
    X_test = np.load(FEAT_TEST_PATH)

    # The scaler is fitted on the training split only and then applied to
    # the other two splits.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(X_test)

    scaler_path = os.path.join("models", STARTTIMESTRING + "_" + "scaler.joblib")
    joblib.dump(scaler, scaler_path)

    # A VarianceThreshold(0.1) filter, fitted on the training split and
    # dumped next to the scaler, is currently disabled:
    # vt = VarianceThreshold(0.1)
    # X_train = vt.fit_transform(X_train)
    # X_valid = vt.transform(X_valid)
    # X_test = vt.transform(X_test)
    # joblib.dump(vt, os.path.join("models", STARTTIMESTRING + "_" + "vt.joblib"))

    y_train = np.load(LABEL_TRAIN_PATH)
    y_valid = np.load(LABEL_VALID_PATH)
    y_test = np.load(LABEL_TEST_PATH)

    optimized_models = []

    strategy_weights = [
        (0.15, rand.suggest),
        (0.7, tpe.suggest),
        (0.15, anneal.suggest),
    ]
    mix_algo = partial(mix.suggest, p_suggest=strategy_weights)

    print("Optimizing classifiers")
    for name, classifier in CLASSIFIERS:
        search = hyperopt_estimator(
            classifier=classifier("classifier"),
            algo=mix_algo,
            trial_timeout=TIMEOUT,
            loss_fn=f1_loss,
            preprocessing=[],
            max_evals=MAX_EVALS,
            seed=RANDOM_SEED,
        )

        # Random undersampling to make the base estimators even more
        # uncorrelated
        X_train_, y_train_ = undersample_2(X_train, y_train)

        # The validation rows are appended after the undersampled training
        # rows, and shuffling is disabled.
        X_search = np.vstack([X_train_, X_valid])
        y_search = np.vstack(
            [y_train_.reshape(-1, 1), y_valid.reshape(-1, 1)])
        search.fit(
            X_search,
            y_search,
            valid_size=len(X_valid),
            cv_shuffle=False,
        )

        search.retrain_best_model_on_full_data(X_train_, y_train_)

        learner = search.best_model()["learner"]
        optimized_models.append((name, learner))

    output_dirs = dict(outdir_metrics="metrics", outdir_models="models")

    model_eval(
        optimized_models, X_train, y_train, X_test, y_test, **output_dirs)

    vc = VotingClassifier(optimized_models, voting="soft")
    vc._calibrate_base_estimators("sigmoid", X_valid, y_valid)

    model_eval(
        [("ensemble", vc)], X_train, y_train, X_test, y_test, **output_dirs)
def test_fit(self):
    """Fit with default components and check that max_evals trials ran."""
    expected_trials = 5
    model = hyperopt_estimator(trial_timeout=5.0, max_evals=5, verbose=1)
    model.fit(self.X, self.Y)
    recorded = model.trials.trials
    assert len(recorded) == expected_trials
def test_smoke(self):
    """Verify the space argument is accepted and runs."""
    search_space = components.generic_space()
    model = hyperopt_estimator(
        space=search_space,
        verbose=1,
        max_evals=10,
        trial_timeout=5,
    )
    model.fit(self.X, self.Y)
def tune_fit(  # pylint:disable=dangerous-default-value
    models: list,
    X: np.ndarray,
    y: np.ndarray,
    max_evals: int = 400,
    timeout: int = 10 * 60,
    mix_ratios: dict = {"rand": 0.1, "tpe": 0.8, "anneal": 0.1},
    valid_size: float = VALID_SIZE,
) -> list:
    """Tune model hyperparameters using hyperopt using a mixed strategy.

    Make sure when using this function that no data leakage happens.
    This data here should be separate from training and test set.

    Arguments:
        models {list} -- list of (name, classifier factory) pairs that
            should be optimized
        X {np.ndarray} -- features
        y {np.ndarray} -- labels
        max_evals {int} -- maximum number of evaluations of hyperparameter
            optimizations
        timeout {int} -- timeout in seconds, passed to the estimator as
            its per-trial timeout
        mix_ratios {dict} -- dictionary which provides the ratios of the
            different optimization algorithms; it must have exactly the
            keys "rand", "tpe" and "anneal" (in any order) and its values
            must sum to 1. The dictionary is only read, never modified.
        valid_size {float} -- fraction of the last part of the training set
            used for validation

    Returns:
        list -- list of tuples (name, model) of optimized models

    Raises:
        AssertionError -- if mix_ratios has the wrong keys or its values do
            not sum to 1
    """
    # Decimal fractions are not exact in binary floating point, so ratios
    # that are valid on paper can sum to 0.9999999999999999; compare with a
    # tolerance instead of exact equality.
    assert abs(sum(mix_ratios.values()) - 1) < 1e-9
    # Compare as a set so the insertion order of the caller's dict does not
    # matter.
    assert set(mix_ratios) == {"rand", "tpe", "anneal"}

    trainlogger.debug("performing hyperparameter optimization")

    optimized_models = []

    mix_algo = partial(
        mix.suggest,
        p_suggest=[
            (mix_ratios["rand"], rand.suggest),
            (mix_ratios["tpe"], tpe.suggest),
            (mix_ratios["anneal"], anneal.suggest),
        ],
    )

    # The training part is the leading (1 - valid_size) fraction of the
    # data; it is the same for every model, so slice it once.
    n_train = int(len(y) * (1 - valid_size))
    X_train = X[:n_train]
    y_train = y[:n_train]

    for name, classifier in models:
        m = hyperopt_estimator(
            classifier=classifier("classifier"),
            algo=mix_algo,
            trial_timeout=timeout,
            preprocessing=[],
            max_evals=max_evals,
            seed=RANDOM_SEED,
            # n_jobs=-1, # todo fix installation to use my forks
        )

        # avoid shuffling to have the same validation set for the ensemble
        # stage
        m.fit(X, y, valid_size=valid_size, cv_shuffle=False)

        # choose the model with best hyperparameters and train it
        m.retrain_best_model_on_full_data(X_train, y_train)

        m = m.best_model()["learner"]

        optimized_models.append((name, m))

    return optimized_models