import glob
import sys
import time

import pandas as pd
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe
# DataLoader and ClassificationMetrics come from the RAISE utilities used
# throughout these scripts; the exact import paths are an assumption.
from raise_utils.data import DataLoader
from raise_utils.metrics import ClassificationMetrics


def main():
    for dataset in glob.glob('../../../Dodge/data/UCI/*.csv'):
        df = pd.read_csv(dataset)
        target = df.columns[-1]
        sys.stdout = open(f'./hyperopt-log/{dataset.split("/")[-1]}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(dataset, target=target,
                                        col_start=0, col_stop=-1)
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            # surface any failure instead of silently skipping the dataset
            raise
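# Several of these scripts pass `loss_fn=loss` without defining it. The sketch
# below is an assumption, not the original authors' code: hpsklearn calls
# loss_fn(y_target, y_prediction) and minimizes the returned float, and the
# scripts report d2h, so a binary d2h loss is a plausible fit. (The later
# snippet that wraps it as partial(loss, dat, time_) would need a variant
# taking two extra leading arguments.)
import numpy as np
from sklearn.metrics import confusion_matrix


def loss(y_true, y_pred):
    # pd (recall) and pf (false-alarm rate) from the binary confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    pf = fp / (fp + tn) if (fp + tn) else 0.0
    # d2h: distance from (recall, pf) to the ideal point (1, 0), scaled to [0, 1]
    return np.sqrt((1.0 - recall) ** 2 + pf ** 2) / np.sqrt(2.0)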
import os
import pickle

from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing
from hyperopt import tpe
from sklearn.metrics import mean_squared_error


def applyHPSKLEARN(X_train, y_train, X_test, y_test, SavePath,
                   max_evals=100, trial_timeout=100, useSavedModels=True):
    if not useSavedModels or not os.path.isfile(SavePath + ".pckl"):
        HPSKLEARNModel = HyperoptEstimator(regressor=any_regressor('reg'),
                                           preprocessing=any_preprocessing('pre'),
                                           loss_fn=mean_squared_error,
                                           max_evals=max_evals,
                                           trial_timeout=trial_timeout,
                                           algo=tpe.suggest)
        # perform the search
        HPSKLEARNModel.fit(X_train, y_train)
        pickle.dump(HPSKLEARNModel, open(SavePath + ".pckl", 'wb'))
    else:
        HPSKLEARNModel = pickle.load(open(SavePath + ".pckl", 'rb'))

    # summarize performance (score delegates to the best model's score method)
    score = HPSKLEARNModel.score(X_test, y_test)
    y_hat = HPSKLEARNModel.predict(X_test)
    print("HPSKLEARN - Score: %.4f" % score)
    # summarize the best model
    print(HPSKLEARNModel.best_model())
    return y_hat
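# Hypothetical usage of applyHPSKLEARN -- the synthetic dataset and save path
# below are illustrative assumptions, not part of the original script:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=1)
y_hat = applyHPSKLEARN(X_train, y_train, X_test, y_test, './hpsklearn_reg',
                       max_evals=25, trial_timeout=60, useSavedModels=False)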
def main():
    for dataset in ['DataClass.csv', 'FeatureEnvy.csv',
                    'GodClass.csv', 'LongMethod.csv']:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(f'../../../Dodge/data/smell/{dataset}',
                                        target='SMELLS', col_start=0, col_stop=-1)
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            continue
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe


def _create_estimator_random_classifier(classifier=any_classifier('my_clf'),
                                        preprocessing=any_preprocessing('my_pre'),
                                        max_evals=100,
                                        trial_timeout=120,
                                        seed=None,
                                        algo=tpe.suggest):
    """Build an unfitted HyperoptEstimator over a classifier search space.

    :param classifier: hyperopt search space for the classifier
    :param preprocessing: hyperopt search space for preprocessing steps
    :param max_evals: maximum number of search iterations
    :param trial_timeout: per-trial timeout in seconds
    :param seed: random seed for reproducibility
    :param algo: hyperopt search algorithm (e.g. tpe.suggest)
    :return: an unfitted HyperoptEstimator
    """
    estim = HyperoptEstimator(classifier=classifier,
                              preprocessing=preprocessing,
                              algo=algo,
                              max_evals=max_evals,
                              trial_timeout=trial_timeout,
                              ex_preprocs=None,
                              regressor=None,
                              space=None,
                              loss_fn=None,
                              continuous_loss_fn=False,
                              verbose=False,
                              fit_increment=1,
                              fit_increment_dump_filename=None,
                              seed=seed,
                              use_partial_fit=False,
                              refit=True)
    return estim
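# Illustrative call of the factory above on a toy dataset (an assumed usage,
# not from the original source):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
est = _create_estimator_random_classifier(max_evals=10, trial_timeout=60, seed=42)
est.fit(X_tr, y_tr)
print(est.score(X_te, y_te))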
import numpy as np
from sklearn.datasets import load_iris
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe


def test():
    iris = load_iris()
    X = iris.data
    y = iris.target

    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]

    # Instantiate a HyperoptEstimator with the search space and number of evaluations
    estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                              preprocessing=any_preprocessing('my_pre'),
                              algo=tpe.suggest,
                              max_evals=100,
                              trial_timeout=120)

    # Search the hyperparameter space based on the data
    estim.fit(X_train, y_train)

    # Show the results
    print(estim.score(X_test, y_test))  # 1.0
    print(estim.best_model())
import os
import pickle

from hpsklearn import HyperoptEstimator, any_classifier, any_regressor, any_preprocessing
from hyperopt import tpe


def train_hypsklearn(X_train, X_test, y_train, y_test, mtype,
                     common_name_model, problemtype, classes,
                     default_featurenames, transform_model, settings,
                     model_session):
    modelname = common_name_model + '.pickle'
    files = list()

    if mtype in ['classification', 'c']:
        estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)
        # Search the hyperparameter space based on the data
        estim.fit(X_train, y_train)
    elif mtype in ['regression', 'r']:
        # the regressor search space must be passed via the `regressor`
        # keyword, not `classifier`
        estim = HyperoptEstimator(regressor=any_regressor('my_clf'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)
        # Search the hyperparameter space based on the data
        estim.fit(X_train, y_train)

    # Show the results
    print(estim.score(X_test, y_test))
    print(estim.best_model())
    scores = estim.score(X_test, y_test)
    bestmodel = str(estim.best_model())

    print('saving classifier to disk')
    f = open(modelname, 'wb')
    pickle.dump(estim, f)
    f.close()

    files.append(modelname)
    modeldir = os.getcwd()

    return modelname, modeldir, files
import time

from hpsklearn import HyperoptEstimator, any_preprocessing, svc
from hyperopt import tpe


def bayesian_opt_pipeline():
    X, y = generate_dataset()
    estimator = HyperoptEstimator(
        classifier=svc("hyperopt_svc"),
        preprocessing=any_preprocessing("hyperopt_preprocess"),
        algo=tpe.suggest,
        max_evals=100,
        trial_timeout=120)
    start_time = time.time()
    estimator.fit(X, y)
    print(f"Time taken for fitting: {time.time() - start_time} seconds")
    print("best model:")
    print(estimator.best_model())
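# generate_dataset is not defined in the snippet above; a minimal stand-in
# (an assumption) so the pipeline runs end to end:
from sklearn.datasets import make_classification


def generate_dataset():
    return make_classification(n_samples=300, n_features=20, n_informative=10,
                               random_state=0)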
def hyper_bot(self):
    """Fit a hyperopt-sklearn model and print its test accuracy.

    :return: None
    """
    model = HyperoptEstimator(
        classifier=any_classifier("cla"),
        preprocessing=any_preprocessing("pre"),
        algo=tpe.suggest,
        max_evals=20,
        trial_timeout=30,
    )
    model.fit(self.x_train, self.y_train)
    # evaluate accuracy against the held-out test labels
    accuracy = model.score(self.x_test, self.y_test)
    print(f"Accuracy: {accuracy}")
def main():
    file_dic = {"ivy":      ["ivy-1.4.csv", "ivy-2.0.csv"],
                "lucene":   ["lucene-2.0.csv", "lucene-2.2.csv"],
                "lucene2":  ["lucene-2.2.csv", "lucene-2.4.csv"],
                "poi":      ["poi-1.5.csv", "poi-2.5.csv"],
                "poi2":     ["poi-2.5.csv", "poi-3.0.csv"],
                "synapse":  ["synapse-1.0.csv", "synapse-1.1.csv"],
                "synapse2": ["synapse-1.1.csv", "synapse-1.2.csv"],
                "camel":    ["camel-1.2.csv", "camel-1.4.csv"],
                "camel2":   ["camel-1.4.csv", "camel-1.6.csv"],
                "xerces":   ["xerces-1.2.csv", "xerces-1.3.csv"],
                "jedit":    ["jedit-3.2.csv", "jedit-4.0.csv"],
                "jedit2":   ["jedit-4.0.csv", "jedit-4.1.csv"],
                "log4j":    ["log4j-1.0.csv", "log4j-1.1.csv"],
                "xalan":    ["xalan-2.4.csv", "xalan-2.5.csv"]}
    for dataset in file_dic:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        print(f'Running {dataset}')
        print('=' * 20)
        data = DataLoader.from_files(base_path='./issue_close_time/',
                                     files=file_dic[dataset])
        try:
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            continue
def main():
    directories = ["1 day", "7 days", "14 days", "30 days",
                   "90 days", "180 days", "365 days"]
    datasets = ["camel", "cloudstack", "cocoon", "hadoop", "deeplearning",
                "hive", "node", "ofbiz", "qpid"]
    for dat in datasets:
        for time_ in directories:
            sys.stdout = open(f'./hyperopt-log/{dat}-{time_}.txt', 'w')
            print(f'Running {dat}-{time_}')
            print('=' * 30)
            data = DataLoader.from_file(
                "/Users/ryedida/PycharmProjects/raise-package/issue_close_time/" +
                time_ + "/" + dat + ".csv",
                target="timeOpen", col_start=0)
            try:
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=partial(loss, dat, time_),
                    trial_timeout=30)
                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()
                print('Completed in', b - a, 'seconds.')
            except Exception:
                continue
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe

# define dataset
X, y = make_classification(n_samples=100, n_features=10, n_informative=5,
                           n_redundant=5, random_state=1)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)
# define search
model = HyperoptEstimator(classifier=any_classifier("cla"),
                          preprocessing=any_preprocessing("pre"),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=30)
# perform the search
model.fit(X_train, y_train)
# summarize performance
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")
# summarize the best model (best_model is a method, so it must be called)
print(model.best_model())
# the problem is that hyperopt sklearn is not advancing
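# One plausible reason a search "is not advancing" (an assumption, not verified
# here) is that every trial exceeds trial_timeout and is discarded; a longer
# timeout plus verbose output makes progress visible:
model = HyperoptEstimator(classifier=any_classifier("cla"),
                          preprocessing=any_preprocessing("pre"),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=120,
                          verbose=True)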
import importlib
import json
import logging
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             make_scorer, mean_squared_error,
                             plot_confusion_matrix, plot_precision_recall_curve,
                             plot_roc_curve, precision_score, recall_score)
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe
# Project-specific helpers assumed available in this codebase:
# logger, load_dataset, get_class_distribution


def build_model(dataset, pipeline, experiment, param_grid=None, cv=5,
                scoring='accuracy', n_jobs='auto', test_size=0.3,
                use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)

    if ',' in scoring:
        scoring = scoring.split(',')
    # if scoring is precision, make the scorer manually to suppress
    # zero_division warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)

    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )

    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]

    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}

    if n_jobs == 'auto':
        n_jobs = os.cpu_count()

    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)

    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                              index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target

        # Drop columns whose values are all NaN, as well as rows with ANY NaN
        # value, then replace infinity values with NaN so that they can later
        # be imputed to a finite value
        features = features.dropna(axis='columns', how='all').dropna() \
                           .replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]
        features = features.replace([np.inf, -np.inf], np.nan)
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index,
                                columns=features.columns)

        X_train, X_test, y_train, y_test = train_test_split(
            features.values, target.values, shuffle=False, test_size=test_size)

        # Summarize distribution
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
            # cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                preprocessing=any_preprocessing('my_pre'),
                                algo=tpe.suggest,
                                max_evals=100,
                                trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}

        # Plot learning curve for the classifier
        # est = p.estimator
        # est.set_params(**best_params)
        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        # plt.tight_layout()
        _train_ax = [axes[0][0], axes[0][1], axes[0][2]]
        # plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym),
        #                     X_train, y_train, axes=_train_ax, cv=cv)
        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])
        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])
        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()

        # Test the model's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(
            classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set:\n{}".format(
            classification_report(y_test, predictions2)))

        report = {
            'training_set': {
                'features': X_train.shape[1],
                'records': X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true': [y for y in y_train],
                'y_pred': [y for y in predictions1]
            },
            'test_set': {
                'features': X_test.shape[1],
                'records': X_test.shape[0],
                'class_distribution': get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }

        # If the classifier has a feature_importances_ attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') \
                and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v
                           for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v)
                       for k, v in sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s
                                      for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i]
                                         for i, s in enumerate(clf.support_) if s]

        train_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct'])
                      for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct'])
                     for k, d in get_class_distribution(y_test).items()]
        logger.info('Model evaluation: \n'
                    '== Training set ==\n'
                    '\t# Features: {} | # Records: {}\n'
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    '== Test set ==\n'
                    '\t# Features: {} | # Records: {}\n'
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                            report['training_set']['accuracy'],
                            report['training_set']['precision'],
                            report['training_set']['mse'],
                            report['training_set']['recall'],
                            report['training_set']['f1'],
                            X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                            report['test_set']['accuracy'],
                            report['test_set']['precision'],
                            report['test_set']['mse'],
                            report['test_set']['recall'],
                            report['test_set']['f1']))

        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model': model_path,
            'params': params_path,
            'report': report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
y_test = y[indices[-test_size:]]

import pandas as pd

y_train_ohe = pd.get_dummies(y_train)

# Baseline: plain logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

# Baseline: LightGBM with default hyperparameters
import lightgbm as lgb

model_lgb = lgb.LGBMClassifier()
model_lgb.fit(X_train, y_train)

# Instantiate a HyperoptEstimator with the search space and number of evaluations.
# HyperoptEstimator expects a hyperopt search space for `classifier`, not a
# fitted sklearn/LightGBM estimator, so a search space is used here instead.
estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                          preprocessing=any_preprocessing('standard_scaler'),
                          algo=tpe.suggest,
                          max_evals=100,
                          trial_timeout=120)

# Search the hyperparameter space based on the data
# (fit takes no random_state argument; for reproducibility, pass seed=50 to
# the HyperoptEstimator constructor instead)
estim.fit(X_train, y_train)
# estim.fit(X_train, y_train_ohe)

# Show the results
print(estim.score(X_test, y_test))  # 1.0
print(estim.best_model())
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, random_state=42)
print('Prepared data: X_train: %s y_train: %s' % (X_train.shape, y_train.shape))
print('Prepared data: X_test: %s y_test: %s' % (X_test.shape, y_test.shape))

# replace training dataset
X = X_train
y = y_train

""" ESTIMATOR WITH BAYESIAN TUNING """
from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing
from hyperopt import tpe

# Instantiate a HyperoptEstimator with the search space and number of evaluations
clf = HyperoptEstimator(regressor=any_regressor('my_clf'),
                        preprocessing=any_preprocessing('my_pre'),
                        algo=tpe.suggest,
                        max_evals=250,
                        trial_timeout=300)
clf.fit(X, y)
print(clf.best_model())

# test
y_hat = clf.predict(X_test)
dscores = metrics_regression(y_test, y_hat, X.shape[1])
tf = t.since('test')
print('\nBayesian tuning - test: bias = %.3f mae = %.3f r2 = %.3f (time: %s)' %
      (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))

# training
y_hat = clf.predict(X)
waferlabel = waferlabel['result']
wafer = pd.DataFrame(wafer, columns=['cnnResult', 'svmResult'])

# Split the data into training and test sets
X = wafer.values
y = waferlabel.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25,
                                                    random_state=1)

estim = HyperoptEstimator(
    preprocessing=any_preprocessing('pp'),
    classifier=any_classifier('clf'),
    algo=tpe.suggest,
    trial_timeout=200.0,  # seconds
    max_evals=10,
    seed=1
)
estim.fit(X_train, y_train)
print(estim.score(X_test, y_test), estim.best_model())

# Persist the fitted ensemble, then reload it for prediction
joblib.dump(estim, 'ensemble4.m')
ensemble = joblib.load('ensemble4.m')
ensemble.predict(X_test)