def test_all_failed_trials(self):
    spark_trials = SparkTrials(parallelism=1)
    with patch_logger("hyperopt-spark", logging.DEBUG) as output:
        fmin(
            fn=fn_succeed_within_range,
            space=hp.uniform("x", 5, 10),
            algo=anneal.suggest,
            max_evals=1,
            trials=spark_trials,
            return_argmin=False,
        )
        log_output = output.getvalue().strip()
    self.assertEqual(spark_trials.count_failed_trials(), 1)
    self.assert_task_failed(log_output, 0)

    spark_trials = SparkTrials(parallelism=4)
    # Here return_argmin is True (by default) and an exception should be thrown
    with self.assertRaisesRegexp(Exception, "There are no evaluation tasks"):
        fmin(
            fn=fn_succeed_within_range,
            space=hp.uniform("x", 5, 8),
            algo=anneal.suggest,
            max_evals=2,
            trials=spark_trials,
        )
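# Note: several tests in this section call fn_succeed_within_range, which is defined elsewhere
# in the test module. A minimal sketch of what such a helper could look like, inferred from the
# assertions above and below (returns loss 1.0 inside a success window, raises RuntimeError
# outside it); the exact -3 < x < 3 boundary is an assumption:
def fn_succeed_within_range(x):
    if -3 < x < 3:
        return 1.0
    raise RuntimeError("x is outside the success range")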
def test_task_maxFailures_warning(self):
    # With quick trials, do not print warning.
    with patch_logger('hyperopt-spark', logging.DEBUG) as output:
        fmin(fn=fn_succeed_within_range,
             space=hp.uniform('x', -1, 1),
             algo=anneal.suggest,
             max_evals=1,
             trials=SparkTrials())
        log_output = output.getvalue().strip()
    self.assertNotIn(
        "spark.task.maxFailures",
        log_output,
        """ "spark.task.maxFailures" warning should not appear in log: {log_output}"""
        .format(log_output=log_output))

    # With slow trials, print warning.
    ORIG_LONG_TRIAL_DEFINITION_SECONDS = _SparkFMinState._LONG_TRIAL_DEFINITION_SECONDS
    try:
        _SparkFMinState._LONG_TRIAL_DEFINITION_SECONDS = 0
        with patch_logger('hyperopt-spark', logging.DEBUG) as output:
            fmin(fn=fn_succeed_within_range,
                 space=hp.uniform('x', -1, 1),
                 algo=anneal.suggest,
                 max_evals=1,
                 trials=SparkTrials())
            log_output = output.getvalue().strip()
        self.assertIn(
            "spark.task.maxFailures",
            log_output,
            """ "spark.task.maxFailures" warning missing from log: {log_output}"""
            .format(log_output=log_output))
    finally:
        _SparkFMinState._LONG_TRIAL_DEFINITION_SECONDS = ORIG_LONG_TRIAL_DEFINITION_SECONDS
def test_invalid_timeout(self):
    with self.assertRaisesRegexp(
        Exception,
        "timeout argument should be None or a positive value. Given value: -1",
    ):
        SparkTrials(parallelism=4, timeout=-1)
    with self.assertRaisesRegexp(
        Exception,
        "timeout argument should be None or a positive value. Given value: True",
    ):
        SparkTrials(parallelism=4, timeout=True)
def test_no_retry_for_long_tasks(self):
    NUM_TRIALS = 2
    output_dir = tempfile.mkdtemp()

    def fn(_):
        with open(os.path.join(output_dir, str(timeit.default_timer())), "w") as f:
            f.write("1")
            raise Exception("Failed!")

    spark_trials = SparkTrials(parallelism=2)
    try:
        fmin(
            fn=fn,
            space=hp.uniform("x", 0, 1),
            algo=anneal.suggest,
            max_evals=NUM_TRIALS,
            trials=spark_trials,
            show_progressbar=False,
            return_argmin=False,
        )
    except BaseException as e:
        self.assertEqual(
            "There are no evaluation tasks, cannot return argmin of task losses.",
            str(e),
        )

    call_count = len(os.listdir(output_dir))
    self.assertEqual(NUM_TRIALS, call_count)
def run_fmin(self, online=True, upload=True, objective=objective_success,
             max_evals=3, wrap=None, **kwargs):
    project = 'hyperopt-integration-test'
    if wrap == 'mongo':
        trials = MongoTrials('mongo://mongodb:27017/foo_db/jobs',
                             exp_key=str(uuid.uuid4()))
    elif wrap == 'spark':
        trials = SparkTrials()
    else:
        trials = None
    trials = SigOptTrials(project=project, online=(online and upload), trials=trials)
    try:
        best = fmin(objective,
                    space={
                        'x': hp.uniform('x', -10, 10),
                        'y': hp.uniform('y', -10, 10)
                    },
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials,
                    **kwargs)
    except hyperopt.exceptions.AllTrialsFailed:
        best = None
    if upload and not online:
        trials.upload()
    return trials, best
def train_keras_model(train_desc, test_desc, train_cat, test_cat,
                      distributed=False, shop="all"):
    with mlflow.start_run(run_name="keras", nested=True):
        if distributed:
            from hyperopt import SparkTrials
            trials = SparkTrials()
        else:
            trials = Trials()

        run_keras = RunKeras(train_desc, test_desc, train_cat, test_cat)
        argmin = fmin(run_keras.keras_model,
                      get_search_space(),
                      algo=tpe.suggest,
                      max_evals=10,
                      show_progressbar=True,
                      trials=trials)
        best_params = space_eval(get_search_space(), argmin)
        best_model, f1 = run_keras.train_model(best_params)
        # mlflow.keras.log_model(best_model, 'model')
        mlflow.log_metric("f1", f1)
        # mlflow.log_metric("delta_version", delta_version)
        mlflow.set_tag("shop", shop)
        mlflow.set_tag("model", "keras_classifier")
    return argmin
def test_timeout_without_job_cancellation(self):
    timeout = 4
    spark_trials = SparkTrials(parallelism=1, timeout=timeout)
    spark_trials._spark_supports_job_cancelling = False

    def fn(x):
        time.sleep(0.5)
        return x

    with patch_logger('hyperopt-spark', logging.DEBUG) as output:
        fmin(fn=fn,
             space=hp.uniform('x', -1, 1),
             algo=anneal.suggest,
             max_evals=10,
             trials=spark_trials,
             max_queue_len=1,
             show_progressbar=False,
             return_argmin=False)
        log_output = output.getvalue().strip()

    self.assertTrue(spark_trials._fmin_cancelled)
    self.assertEqual(spark_trials._fmin_cancelled_reason, "fmin run timeout")
    self.assertGreater(spark_trials.count_successful_trials(), 0)
    self.assertGreater(spark_trials.count_cancelled_trials(), 0)
    self.assertIn(
        "fmin is cancelled, so new trials will not be launched",
        log_output,
        """ "fmin is cancelled, so new trials will not be launched" missing from log: {log_output}"""
        .format(log_output=log_output))
    self.assertIn(
        "SparkTrials will block",
        log_output,
        """ "SparkTrials will block" missing from log: {log_output}"""
        .format(log_output=log_output))
    self.assert_task_succeeded(log_output, 0)
def test_trial_run_info(self):
    spark_trials = SparkTrials(parallelism=4)
    with patch_logger("hyperopt-spark") as output:
        fmin(
            fn=fn_succeed_within_range,
            space=hp.uniform("x", -5, 5),
            algo=anneal.suggest,
            max_evals=8,
            return_argmin=False,
            trials=spark_trials,
            rstate=np.random.RandomState(99),
        )
        self.check_run_status(
            spark_trials, output, num_total=8, num_success=7, num_failure=1
        )

    expected_result = {"loss": 1.0, "status": "ok"}
    for trial in spark_trials._dynamic_trials:
        if trial["state"] == base.JOB_STATE_DONE:
            self.assertEqual(
                trial["result"],
                expected_result,
                "Wrong result has been saved: Expected {e} but got {r}.".format(
                    e=expected_result, r=trial["result"]
                ),
            )
        elif trial["state"] == base.JOB_STATE_ERROR:
            err_message = trial["misc"]["error"][1]
            self.assertIn(
                "RuntimeError",
                err_message,
                "Missing {e} in {r}.".format(e="RuntimeError", r=err_message),
            )
            self.assertIn(
                "Traceback (most recent call last)",
                err_message,
                "Missing {e} in {r}.".format(e="Traceback", r=err_message),
            )

    num_success = spark_trials.count_by_state_unsynced(base.JOB_STATE_DONE)
    self.assertEqual(
        num_success,
        7,
        "Wrong number of successful trial runs: Expected {e} but got {r}.".format(
            e=7, r=num_success
        ),
    )
    num_failure = spark_trials.count_by_state_unsynced(base.JOB_STATE_ERROR)
    self.assertEqual(
        num_failure,
        1,
        "Wrong number of failed trial runs: Expected {e} but got {r}.".format(
            e=1, r=num_failure
        ),
    )
def test_exception_when_spark_not_available(self):
    import hyperopt
    orig_have_spark = hyperopt.spark._have_spark
    hyperopt.spark._have_spark = False
    try:
        with self.assertRaisesRegexp(Exception, "cannot import pyspark"):
            SparkTrials(parallelism=4)
    finally:
        hyperopt.spark._have_spark = orig_have_spark
def test_accepting_sparksession(self):
    spark_trials = SparkTrials(
        parallelism=2, spark_session=SparkSession.builder.getOrCreate())
    fmin(fn=lambda x: x + 1,
         space=hp.uniform('x', 5, 8),
         algo=anneal.suggest,
         max_evals=2,
         trials=spark_trials)
def test_pin_thread_on(self):
    if not self._pin_mode_enabled:
        raise unittest.SkipTest()
    spark_trials = SparkTrials(parallelism=2)
    self.assertTrue(spark_trials._spark_pinned_threads_enabled)
    self.assertTrue(spark_trials._spark_supports_job_cancelling)
    fmin(
        fn=lambda x: x + 1,
        space=hp.uniform("x", -1, 1),
        algo=rand.suggest,
        max_evals=5,
        trials=spark_trials,
    )
    self.assertEqual(spark_trials.count_successful_trials(), 5)
def test_all_successful_trials(self):
    spark_trials = SparkTrials(parallelism=1)
    with patch_logger('hyperopt-spark', logging.DEBUG) as output:
        fmin(fn=fn_succeed_within_range,
             space=hp.uniform('x', -1, 1),
             algo=anneal.suggest,
             max_evals=1,
             trials=spark_trials)
        log_output = output.getvalue().strip()
    self.assertEqual(spark_trials.count_successful_trials(), 1)
    self.assertIn(
        "fmin thread exits normally",
        log_output,
        """Debug info "fmin thread exits normally" missing from log: {log_output}"""
        .format(log_output=log_output))
    self.assert_task_succeeded(log_output, 0)
def test_trial_run_info(self):
    spark_trials = SparkTrials(parallelism=4)
    with patch_logger('hyperopt-spark') as output:
        fmin(fn=fn_succeed_within_range,
             space=hp.uniform('x', -5, 5),
             algo=anneal.suggest,
             max_evals=8,
             return_argmin=False,
             trials=spark_trials)
        self.check_run_status(spark_trials, output, num_total=8,
                              num_success=7, num_failure=1)

    expected_result = {'loss': 1.0, 'status': 'ok'}
    for trial in spark_trials._dynamic_trials:
        if trial['state'] == base.JOB_STATE_DONE:
            self.assertEqual(
                trial['result'], expected_result,
                "Wrong result has been saved: Expected {e} but got {r}."
                .format(e=expected_result, r=trial['result']))
        elif trial['state'] == base.JOB_STATE_ERROR:
            err_message = trial['misc']['error'][1]
            self.assertIn(
                "RuntimeError", err_message,
                "Missing {e} in {r}.".format(e="RuntimeError", r=err_message))

    num_success = spark_trials.count_by_state_unsynced(base.JOB_STATE_DONE)
    self.assertEqual(
        num_success, 7,
        "Wrong number of successful trial runs: Expected {e} but got {r}."
        .format(e=7, r=num_success))
    num_failure = spark_trials.count_by_state_unsynced(base.JOB_STATE_ERROR)
    self.assertEqual(
        num_failure, 1,
        "Wrong number of failed trial runs: Expected {e} but got {r}."
        .format(e=1, r=num_failure))
def train(df, experiment_name, run_name):
    mlflow.set_experiment(experiment_name)
    data = df.toPandas()
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["quality"], axis=1),
        data[["quality"]].values.ravel(),
        random_state=42)

    search_space = {
        'n_estimators': hp.uniform('n_estimators', 10, 100),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 1, 20),
        'max_depth': hp.uniform('max_depth', 2, 10),
    }

    spark_trials = SparkTrials(parallelism=4)

    with mlflow.start_run(run_name=run_name):
        fmin(
            fn=evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test),
            space=search_space,
            algo=tpe.suggest,
            max_evals=10,
            trials=spark_trials,
        )
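# evaluate_hyperparams_wrapper is defined elsewhere in the notebook this snippet comes from.
# A hypothetical sketch of such a wrapper, assuming the "quality" column is predicted with a
# random-forest regressor and that the float-valued search space above is cast to ints:
from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def evaluate_hyperparams_wrapper(X_train, X_test, y_train, y_test):
    def evaluate(params):
        # Cast hp.uniform samples to ints before handing them to sklearn.
        model = RandomForestRegressor(
            n_estimators=int(params['n_estimators']),
            min_samples_leaf=int(params['min_samples_leaf']),
            max_depth=int(params['max_depth']),
            random_state=42,
        )
        model.fit(X_train, y_train)
        loss = mean_squared_error(y_test, model.predict(X_test))
        return {'loss': loss, 'status': STATUS_OK}
    return evaluate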
                             validation_steps, device)
    return val_loss

# COMMAND ----------

# DBTITLE 1,Hyperopt
BATCH_SIZE = 100
NUM_EPOCHS = 1

def train_fn(lr):
    loss = train_and_evaluate(lr)
    return {'loss': loss, 'status': STATUS_OK}

search_space = hp.loguniform('lr', -10, -4)

argmin = fmin(fn=train_fn,
              space=search_space,
              algo=tpe.suggest,
              max_evals=1,
              trials=SparkTrials(parallelism=8))

# COMMAND ----------

argmin

# COMMAND ----------
# COMMAND ----------

# MAGIC %md
# MAGIC ### Hyperparameter Tuning
# MAGIC Use HyperOpt with Spark trials to run distributed hyperparameter tuning across workers in parallel

# COMMAND ----------

spark.conf.set("spark.databricks.mlflow.trackHyperopt.enabled", False)

# COMMAND ----------

from functools import partial
from hyperopt import SparkTrials, hp, fmin, tpe, STATUS_FAIL, STATUS_OK

spark_trials = SparkTrials()
hyperopt_algo = tpe.suggest

n_components_range = np.arange(4, 12, 1, dtype=int)
max_depth_range = np.arange(3, 8, 1, dtype=int)
learning_rate_range = np.arange(0.01, 0.15, 0.01)
n_estimators_range = np.arange(500, 2000, 1, dtype=int)

params = {
    'pca_params': {
        'n_components': hp.choice('n_components', n_components_range)
    },
    'algo_params': {
        'max_depth': hp.choice('max_depth', max_depth_range),
        'learning_rate': hp.choice('learning_rate', learning_rate_range),
        'n_estimators': hp.choice('n_estimators', n_estimators_range),
def test_timeout_with_job_cancellation(self):
    if not self.sparkSupportsJobCancelling():
        print(
            "Skipping timeout test since this Apache PySpark version does not "
            "support cancelling jobs by job group ID.")
        return

    timeout = 2
    spark_trials = SparkTrials(parallelism=4, timeout=timeout)

    def fn(x):
        if x < 0:
            time.sleep(timeout + 20)
            raise Exception("Task should have been cancelled")
        else:
            time.sleep(1)
        return x

    # Test 1 cancelled trial. Examine logs.
    with patch_logger("hyperopt-spark", logging.DEBUG) as output:
        fmin(
            fn=fn,
            space=hp.uniform("x", -2, 0),
            algo=anneal.suggest,
            max_evals=1,
            trials=spark_trials,
            max_queue_len=1,
            show_progressbar=False,
            return_argmin=False,
            rstate=np.random.RandomState(4),
        )
        log_output = output.getvalue().strip()

    self.assertTrue(spark_trials._fmin_cancelled)
    self.assertEqual(spark_trials._fmin_cancelled_reason, "fmin run timeout")
    self.assertEqual(spark_trials.count_cancelled_trials(), 1)
    self.assertIn(
        "Cancelling all running jobs",
        log_output,
        """ "Cancelling all running jobs" missing from log: {log_output}"""
        .format(log_output=log_output),
    )
    self.assertIn(
        "trial task 0 cancelled",
        log_output,
        """ "trial task 0 cancelled" missing from log: {log_output}"""
        .format(log_output=log_output),
    )
    self.assertNotIn(
        "Task should have been cancelled",
        log_output,
        """ "Task should have been cancelled" should not be in log: {log_output}"""
        .format(log_output=log_output),
    )
    self.assert_task_failed(log_output, 0)

    # Test mix of successful and cancelled trials.
    spark_trials = SparkTrials(parallelism=4, timeout=4)
    fmin(
        fn=fn,
        space=hp.uniform("x", -0.25, 5),
        algo=anneal.suggest,
        max_evals=6,
        trials=spark_trials,
        max_queue_len=1,
        show_progressbar=False,
        return_argmin=True,
        rstate=np.random.RandomState(4),
    )
    time.sleep(2)
    self.assertTrue(spark_trials._fmin_cancelled)
    self.assertEqual(spark_trials._fmin_cancelled_reason, "fmin run timeout")
    # There are 2 finished trials, 1 cancelled running trial and 1 cancelled
    # new trial. We do not need to check the new trial since it is not started yet.
    self.assertGreaterEqual(
        spark_trials.count_successful_trials(),
        1,
        "Expected at least 1 successful trial but found none.",
    )
    self.assertGreaterEqual(
        spark_trials.count_cancelled_trials(),
        1,
        "Expected at least 1 cancelled trial but found none.",
    )
    try:
        class_att = AttentionTFIDFClassifier(**params, nepochs=25, _verbose=False)
        print(class_att)
        class_att.fit(fold.X_train, fold.y_train, fold.X_val, fold.y_val)
        return {
            "loss": class_att._loss,
            "status": STATUS_OK,
            "model": class_att.to('cpu')
        }
    except:
        return {"status": STATUS_FAIL}

# trials = Trials()
trials = SparkTrials(parallelism=cpu_count())
best = fmin(fn=hyperparameter_tuning_try,
            space=space,
            algo=tpe.suggest,
            max_evals=15 * cpu_count(),
            trials=trials)
print("Best: {}".format(best))

class_att = trials.best_trial['result']['model']
y_pred = class_att.predict(fold.X_test)

with open(path.join(path_result, f'fold{i}'), 'w') as file_writer:
    file_writer.write(';'.join(map(str, y_pred)))

print(
def test_quadratic1_tpe(self):
    # TODO: Speed this up or remove it since it is slow (1 minute on laptop)
    spark_trials = SparkTrials(parallelism=4)
    test_quadratic1_tpe(spark_trials)
# COMMAND ----------

single_node_epochs = 20
num_classes = 10

# Search space for hyperparameter tuning
space = {
    'stride': hp.quniform('stride', 2, 4, 1),
    'batch_size': hp.uniform('batch_size', 32, 128),
    'learning_rate': hp.uniform('learning_rate', -10, 0),
    'optimizer': hp.choice('optimizer', ['adadelta', 'adam', 'rmsprop'])
}

dbutils.fs.rm('/mnt/ved-demo/mlmodels/mnist', True)
dbutils.fs.mkdirs('/mnt/ved-demo/mlmodels/mnist')

spark_trials = SparkTrials(parallelism=parallelism)

with mlflow.start_run():
    argmin = fmin(fn=runCNN,
                  space=space,
                  algo=tpe.suggest,
                  max_evals=32,
                  show_progressbar=False,
                  trials=spark_trials)  # install keras separately

# COMMAND ----------

# MAGIC %md
# MAGIC #### Return the set of hyperparams that minimized the loss
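# COMMAND ----------

# The cell promised by the heading above is not shown; a minimal sketch of what it would
# typically contain, assuming the `space` and `argmin` defined earlier. space_eval maps the
# raw values returned by fmin (e.g. the index chosen for the 'optimizer' hp.choice) back to
# concrete hyperparameter values.
from hyperopt import space_eval

best_hyperparams = space_eval(space, argmin)
print(best_hyperparams)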
}

# COMMAND ----------

from math import factorial
from hyperopt import fmin, tpe, STATUS_OK, SparkTrials
import numpy as np

# Set the parallelism of the search
cluster_nodes = 3
node_cores = 4
num_parallelism = min((cluster_nodes * node_cores), factorial(len(params)))

# Creating a parent run
with mlflow.start_run():
    num_evals = 100  # max models to evaluate
    trials = SparkTrials(num_parallelism)
    best_hyperparam = fmin(fn=objective_function,
                           space=params,
                           algo=tpe.suggest,
                           max_evals=num_evals,
                           trials=trials)

    # Log param and metric for the best model
    for name, value in best_hyperparam.items():
        mlflow.log_param(name, value)
    mlflow.log_metric("loss", trials.best_trial["result"]["loss"])

# COMMAND ----------