def test_model_log(self):
    """Log a pickled linear model as a pyfunc and compare the reloaded predictions."""
    with TempDir(chdr=True, remove_on_exit=True) as tmp:
        pickled_model = tmp.path("linear.pkl")
        with open(pickled_model, "wb") as out:
            pickle.dump(self._linear_lr, out)

        tracking_dir = os.path.abspath(tmp.path("mlruns"))
        tracking.set_tracking_uri("file://{}".format(tracking_dir))
        tracking.start_run()
        try:
            # This file is passed as the loader module; drop its last three
            # characters (presumably the ".py" suffix).
            this_module = os.path.basename(__file__)[:-3]
            pyfunc.log_model(
                artifact_path="linear",
                data_path=pickled_model,
                loader_module=this_module,
                code_path=[__file__],
            )
            run_id = tracking.active_run().info.run_uuid
            log_dir = tracking._get_model_log_dir("linear", run_id)
            model_conf = Model.load(os.path.join(log_dir, "MLmodel"))
            print(model_conf.__dict__)

            loaded = pyfunc.load_pyfunc("linear", run_id=run_id)
            predictions = loaded.predict(self._X)
            np.testing.assert_array_equal(self._linear_lr_predict, predictions)
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(None)
            # Remove the log directory in order to avoid adding new tests to pytest.
            shutil.rmtree(tracking_dir)
def test_log_model(sequential_model, data, sequential_predicted):
    """Log a PyTorch model and check that reloading it via a ``runs:/`` URI predicts the same."""
    original_uri = tracking.get_tracking_uri()
    # Cover both paths: log_model() with and without an already started run.
    for should_start_run in (False, True):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri(tmp.path("test"))
                if should_start_run:
                    mlflow.start_run()

                artifact_path = "pytorch"
                mlflow.pytorch.log_model(sequential_model, artifact_path=artifact_path)
                active_run_id = mlflow.active_run().info.run_id
                model_uri = "runs:/{}/{}".format(active_run_id, artifact_path)

                # Load the model back and score the same data.
                reloaded = mlflow.pytorch.load_model(model_uri=model_uri)
                reloaded_predictions = _predict(reloaded, data)
                np.testing.assert_array_equal(reloaded_predictions, sequential_predicted)
            finally:
                mlflow.end_run()
                tracking.set_tracking_uri(original_uri)
def test_set_experiment():
    """Check ``set_experiment`` argument validation and that runs use the selected experiment.

    The tracking URI, any run left active by a failed assertion, and the fluent
    active-experiment id are all restored on exit so this test does not leak
    global state into later tests.
    """
    with pytest.raises(TypeError):
        mlflow.set_experiment()  # called without the experiment name

    with pytest.raises(Exception):
        mlflow.set_experiment(None)

    with pytest.raises(Exception):
        mlflow.set_experiment("")

    old_uri = tracking.get_tracking_uri()
    try:
        with TempDir() as tracking_uri:
            tracking.set_tracking_uri(tracking_uri.path())
            try:
                name = "random_exp"
                exp_id = mlflow.create_experiment(name)
                mlflow.set_experiment(name)
                run = start_run()
                assert run.info.experiment_id == exp_id
                end_run()

                # This experiment is never created explicitly before being selected.
                another_name = "another_experiment"
                mlflow.set_experiment(another_name)
                exp_id2 = mlflow.tracking.MlflowClient().get_experiment_by_name(
                    another_name)
                another_run = start_run()
                assert another_run.info.experiment_id == exp_id2.experiment_id
                end_run()
            finally:
                # If an assertion failed, its run is still active; close it while
                # the temporary store is still present.
                # NOTE(review): relies on end_run() being a no-op without an active run.
                end_run()
    finally:
        # Need to do this to clear active experiment to restore state
        mlflow.tracking.fluent._active_experiment_id = None
        # Do not leave the global URI pointing at the temporary directory.
        tracking.set_tracking_uri(old_uri)
def test_dnn():
    """Run the data-download and DNN-regression projects, then score the exported model."""
    saved_uri = tracking.get_tracking_uri()
    try:
        with TempDir(chdr=False, remove_on_exit=True) as tmp:
            diamonds = tmp.path("diamonds")
            estimator = tmp.path("estimator")
            artifacts = tmp.path("artifacts")
            for directory in (diamonds, estimator, artifacts):
                os.mkdir(directory)
            tracking.set_tracking_uri(artifacts)

            # Options shared by both project launches.
            launch_options = dict(
                entry_point="main", version=None, mode="local", cluster_spec=None,
                git_username=None, git_password=None, use_conda=True, storage_dir=None)
            test_parquet = os.path.join(diamonds, "test_diamonds.parquet")

            # Download the diamonds dataset via mlflow run
            run(".",
                parameters={"dest-dir": diamonds},
                experiment_id=tracking._get_experiment_id(),
                **launch_options)

            # Run the main dnn app via mlflow
            dnn_parameters = {
                "model-dir": estimator,
                "train": os.path.join(diamonds, "train_diamonds.parquet"),
                "test": test_parquet,
                "hidden-units": "30,30",
                "label-col": "price",
                "steps": 5000,
                "batch-size": 128,
            }
            run("apps/dnn-regression",
                parameters=dnn_parameters,
                experiment_id=tracking._get_experiment_id(),
                **launch_options)

            # Load the first entry found in the model directory as a pyfunc.
            exported_model = os.path.join(estimator, os.listdir(estimator)[0])
            loaded_pyfunc = tensorflow.load_pyfunc(exported_model)
            predict_df = loaded_pyfunc.predict(pandas.read_parquet(test_parquet))

            assert 'predictions' in predict_df
            assert isinstance(predict_df['predictions'][0][0], numpy.float32)
    finally:
        tracking.set_tracking_uri(saved_uri)
def test_log_saved_model(self):
    """Train a small DNNRegressor on two iris features and compare pyfunc predictions."""
    # This tests model logging capabilities on the sklearn.iris dataset.
    iris = datasets.load_iris()
    X = iris.data[:, :2]  # only the first two features are used
    y = iris.target

    trainingFeatures = {}
    for idx in range(2):
        # TensorFlow is fickle about feature names, so drop spaces and parentheses.
        for bad_char in (" ", "(", ")"):
            iris.feature_names[idx] = iris.feature_names[idx].replace(bad_char, "")
        trainingFeatures[iris.feature_names[idx]] = iris.data[:, idx:idx + 1]

    feature_names = iris.feature_names[:2]
    # TensorFlow-specific numeric columns for the model input.
    tf_feat_cols = [tf.feature_column.numeric_column(col) for col in iris.feature_names[:2]]

    # Input function used both for training and for the reference predictions.
    input_train = tf.estimator.inputs.numpy_input_fn(
        trainingFeatures, y, shuffle=False, batch_size=1)
    estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols, hidden_units=[1])
    estimator.train(input_train, steps=10)

    # Reference predictions on the training data; assume the DNNRegressor
    # produces a single output column named 'predictions'.
    pred_col = "predictions"
    estimator_preds = [row[pred_col] for row in estimator.predict(input_train)]
    estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})

    old_tracking_uri = tracking.get_tracking_uri()
    # Cover both paths: log_model() with and without an already started run.
    for should_start_run in (False, True):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                # Map each feature name (str) to a placeholder tensor.
                feature_spec = {
                    name: tf.placeholder("float", name=name, shape=[150])
                    for name in feature_names
                }
                tracking.set_tracking_uri("test")
                if should_start_run:
                    tracking.start_run()
                input_df = pandas.DataFrame(data=X, columns=feature_names)
                pyfunc_preds_df = self.helper(feature_spec, tmp, estimator, input_df)

                # The loaded model must reproduce the estimator's predictions.
                assert estimator_preds_df.equals(pyfunc_preds_df)
            finally:
                # Restore the previous logging location.
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_uri)
def test_no_nested_run():
    """Starting a run while another run is active must raise."""
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        outer_run = start_run()
        with outer_run, pytest.raises(Exception):
            start_run()
    finally:
        tracking.set_tracking_uri(None)
def test_log_saved_model(self):
    """Train a DNNRegressor on two iris features and compare pyfunc predictions.

    The estimator's own predictions on the training data are compared exactly
    against the values returned by ``self.helper`` for the logged model.
    """
    # This tests model logging capabilities on the sklearn.iris dataset.
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        iris = datasets.load_iris()
        X = iris.data[:, :2]  # we only take the first two features.
        y = iris.target
        trainingFeatures = {}
        for i in range(2):
            # TensorFlow is fickle about feature names, so we remove offending characters
            for bad_char in (" ", "(", ")"):
                iris.feature_names[i] = iris.feature_names[i].replace(bad_char, "")
            trainingFeatures[iris.feature_names[i]] = iris.data[:, i:i + 1]
        # Taken only after sanitising; the earlier pre-sanitising copy was never read.
        feature_names = iris.feature_names[:2]
        # Creating TensorFlow-specific numeric columns for input.
        tf_feat_cols = [tf.feature_column.numeric_column(col) for col in feature_names]
        # Creating input training function.
        input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                         y,
                                                         shuffle=False,
                                                         batch_size=1)
        # Creating Deep Neural Network Regressor.
        estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols,
                                              hidden_units=[1])
        # Training and creating expected predictions on training dataset.
        estimator.train(input_train, steps=100)
        # Iterated only inside the try block below.
        estimator_preds = estimator.predict(input_train)

        # Setting the logging such that it is in the temp folder and deleted after the test.
        old_tracking_dir = tracking.get_tracking_uri()
        tracking_dir = os.path.abspath(tmp.path("mlruns"))
        tracking.set_tracking_uri("file://%s" % tracking_dir)
        tracking.start_run()
        try:
            # Creating dict of features names (str) to placeholders (tensors)
            feature_spec = {
                name: tf.placeholder("float", name=name, shape=[150])
                for name in feature_names
            }
            saved = [s['predictions'] for s in estimator_preds]
            results = self.helper(feature_spec, tmp, estimator,
                                  pandas.DataFrame(data=X, columns=feature_names))

            # Asserting that the loaded model predictions are as expected.
            np.testing.assert_array_equal(saved, results)
        finally:
            # Restoring the old logging location.
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_dir)
def test_log_metric_validation():
    """Logging a non-numeric metric value must leave the run without metrics."""
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        run_handle = start_run()
        run_uuid = run_handle.info.run_uuid
        with run_handle:
            mlflow.log_metric("name_1", "apple")

        stored_run = tracking.MlflowClient().get_run(run_uuid)
        metric_count = len(stored_run.data.metrics)
        assert metric_count == 0
    finally:
        tracking.set_tracking_uri(None)
def test_model_log(self):
    """Log the linear sklearn model to a local store and compare reloaded predictions."""
    with TempDir(chdr=True, remove_on_exit=True):
        tracking.set_tracking_uri("mlruns")
        tracking.start_run()
        try:
            sklearn.log_model(sk_model=self._linear_lr, artifact_path="linear")
            active_run_id = tracking.active_run().info.run_uuid
            reloaded = sklearn.load_model("linear", run_id=active_run_id)
            reloaded_predictions = reloaded.predict(self._X)
            np.testing.assert_array_equal(self._linear_lr_predict, reloaded_predictions)
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(None)
def test_gbt():
    """Run the data-download and GBT-regression projects, then score the logged model.

    The run directory created by the GBT project is found by listing the
    experiment-0 directory before and after the launch. The test fails with an
    explicit message if no new directory appears, instead of an opaque error
    from ``os.path.join``.
    """
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            artifacts = tmp.path("artifacts")
            os.mkdir(diamonds)
            os.mkdir(artifacts)
            tracking.set_tracking_uri(artifacts)
            # Download the diamonds dataset via mlflow run
            run(".", entry_point="main", version=None,
                parameters={"dest-dir": diamonds}, experiment_id=0,
                mode="local", cluster_spec=None, git_username=None, git_password=None,
                use_conda=True, storage_dir=None)

            # Snapshot the existing run folders so the new one can be told apart.
            initial = os.path.join(artifacts, "0")
            dir_list = os.listdir(initial)

            # Run the main gbt app via mlflow
            run("apps/gbt-regression", entry_point="main", version=None,
                parameters={"train": os.path.join(diamonds, "train_diamonds.parquet"),
                            "test": os.path.join(diamonds, "test_diamonds.parquet"),
                            "n-trees": 10,
                            "m-depth": 3,
                            "learning-rate": .1,
                            "loss": "rmse",
                            "label-col": "price"},
                experiment_id=0, mode="local",
                cluster_spec=None, git_username=None, git_password=None, use_conda=True,
                storage_dir=None)

            # Identifying the new run's folder
            new_items = [item for item in os.listdir(initial) if item not in dir_list]
            assert new_items, "the gbt-regression run created no new folder in %s" % initial
            # As before, the last new entry in listing order is used.
            main = new_items[-1]

            pyfunc = load_pyfunc(os.path.join(initial, main, "artifacts/model/model.pkl"))
            df = pandas.read_parquet(os.path.join(diamonds, "test_diamonds.parquet"))
            # Removing the price column from the DataFrame so we can use the features to predict
            df = df.drop(columns="price")
            # Predicting from the saved pyfunc
            predict = pyfunc.predict(df)
            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float32)
        finally:
            tracking.set_tracking_uri(old_uri)
def test_start_and_end_run():
    """Use start_run()/end_run() without a ``with`` block and check the logged metric."""
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        # Explicit start/end calls instead of the context-manager form.
        run_handle = start_run()
        mlflow.log_metric("name_1", 25)
        end_run()

        stored_run = tracking.MlflowClient().get_run(run_handle.info.run_uuid)
        logged_metrics = stored_run.data.metrics
        # Validate metrics
        assert len(logged_metrics) == 1
        expected_pairs = {"name_1": 25}
        for logged in logged_metrics:
            assert expected_pairs[logged.key] == logged.value
    finally:
        tracking.set_tracking_uri(None)
def test_model_log(tmpdir):
    """Log a Spark ML pipeline and compare pyfunc and reloaded-model predictions.

    :param tmpdir: pytest temporary-directory fixture used for the conda
                   environment file and the tracking store.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data  # every feature column is used (no slicing)
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    # NOTE(review): the config key looks like a renamed "spark.python.worker.reuse" - confirm.
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    # NOTE(review): this directory is created but never used below.
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Reference predictions from the in-memory pipeline model.
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        # Created before the try block so the cleanup in ``finally`` never hits
        # an unbound name if directory creation itself fails.
        tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
        try:
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid

            # Predictions through the generic pyfunc interface.
            x = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = x.predict(pandas_df)
            assert preds1 == preds2

            # Predictions through the reloaded Spark model.
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            # Removed so the next iteration can create "mlruns" again.
            shutil.rmtree(tracking_dir)
def test_model_log(self):
    """Log the linear sklearn model with and without an active run and compare predictions."""
    original_uri = tracking.get_tracking_uri()
    # Cover both paths: log_model() with and without an already started run.
    for should_start_run in (False, True):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri("test")
                if should_start_run:
                    tracking.start_run()
                sklearn.log_model(sk_model=self._linear_lr, artifact_path="linear")
                active_run_id = tracking.active_run().info.run_uuid
                reloaded = sklearn.load_model("linear", run_id=active_run_id)
                reloaded_predictions = reloaded.predict(self._X)
                np.testing.assert_array_equal(self._linear_lr_predict, reloaded_predictions)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(original_uri)
def test_create_experiment():
    """Check ``create_experiment`` argument validation and that a valid name yields an id."""
    with pytest.raises(TypeError):
        mlflow.create_experiment()  # called without the experiment name

    with pytest.raises(Exception):
        mlflow.create_experiment(None)

    with pytest.raises(Exception):
        mlflow.create_experiment("")

    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        # Integer bound: randint() rejects float arguments such as 1e6 on
        # Python 3.12+ (and warns about them on 3.10/3.11).
        exp_id = mlflow.create_experiment(
            "Some random experiment name %d" % random.randint(1, 10 ** 6))
        assert exp_id is not None
    finally:
        tracking.set_tracking_uri(None)
def test_log_saved_model(self):
    """Export the DNN as a SavedModel, log it, and compare pyfunc predictions."""
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        # Log into the temp folder so everything is deleted after the test.
        previous_uri = tracking.get_tracking_uri()
        tracking_dir = os.path.abspath(tmp.path("mlruns"))
        tracking.set_tracking_uri("file://{}".format(tracking_dir))
        tracking.start_run()
        try:
            # Map each feature name (str) to a placeholder tensor.
            feature_spec = {
                name: tf.placeholder("float", name=name, shape=[150])
                for name in self._feature_names
            }
            # Receiver function required for exporting the model.
            receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
                feature_spec)

            export_root = tmp.path("model")
            os.makedirs(export_root)
            os.makedirs(tmp.path("hello"))
            # export_savedmodel returns bytes; decode to get the model path.
            saved_model_path = self._dnn.export_savedmodel(
                export_root, receiver_fn).decode("utf-8")

            # Log the TensorFlow model that was just saved.
            tensorflow.log_saved_model(saved_model_dir=saved_model_path,
                                       signature_def_key="predict",
                                       artifact_path=tmp.path("hello"))

            # Load the saved model as a pyfunc and predict on the iris data.
            loaded_pyfunc = pyfunc.load_pyfunc(saved_model_path)
            input_df = pandas.DataFrame(data=self._X, columns=self._feature_names)
            xpred = loaded_pyfunc.predict(input_df)

            saved = [entry['predictions'] for entry in self._dnn_predict]
            loaded = [row for _, row in xpred.iterrows()]
            # The loaded model must reproduce the expected predictions.
            np.testing.assert_array_equal(saved, loaded)
        finally:
            # Restore the previous logging location.
            tracking.end_run()
            tracking.set_tracking_uri(previous_uri)
def test_model_log(self):
    """Log the H2O GBM with and without an active run and compare reloaded predictions.

    The comparison is reduced over the cell values. Calling the builtin
    ``all()`` on a pandas DataFrame iterates its column labels, which made the
    previous assertion pass regardless of the predicted values.
    """
    old_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        with TempDir(chdr=True, remove_on_exit=True):
            try:
                tracking.set_tracking_uri("test")
                if should_start_run:
                    tracking.start_run()
                mlflow.h2o.log_model(self.gbm, artifact_path="gbm")

                # Load model
                gbm_loaded = mlflow.h2o.load_model(
                    "gbm", run_id=tracking.active_run().info.run_uuid)
                # NOTE(review): assumes as_data_frame() returns a pandas DataFrame - confirm.
                matches = gbm_loaded.predict(self.test).as_data_frame() == self.predicted
                assert matches.values.all()
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(old_uri)
def test_log_param():
    """Log params (one key twice) and check the three stored key/value pairs."""
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        run_handle = start_run()
        run_uuid = run_handle.info.run_uuid
        params_to_log = [
            ("name_1", "a"),
            ("name_2", "b"),
            ("name_1", "c"),
            ("nested/nested/name", 5),
        ]
        with run_handle:
            for key, value in params_to_log:
                mlflow.log_param(key, value)

        stored_run = tracking.MlflowClient().get_run(run_uuid)
        stored_params = stored_run.data.params
        # Validate params
        assert len(stored_params) == 3
        expected_pairs = {"name_1": "c", "name_2": "b", "nested/nested/name": "5"}
        for stored in stored_params:
            assert expected_pairs[stored.key] == stored.value
    finally:
        tracking.set_tracking_uri(None)
def test_log_metric():
    """Log metrics (one key twice) and check the three stored key/value pairs."""
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        run_handle = start_run()
        run_uuid = run_handle.info.run_uuid
        metrics_to_log = [
            ("name_1", 25),
            ("name_2", -3),
            ("name_1", 30),
            ("nested/nested/name", 40),
        ]
        with run_handle:
            for key, value in metrics_to_log:
                mlflow.log_metric(key, value)

        stored_run = tracking.MlflowClient().get_run(run_uuid)
        stored_metrics = stored_run.data.metrics
        # Validate metrics
        assert len(stored_metrics) == 3
        expected_pairs = {"name_1": 30, "name_2": -3, "nested/nested/name": 40}
        for stored in stored_metrics:
            assert expected_pairs[stored.key] == stored.value
    finally:
        tracking.set_tracking_uri(None)
def test_log_model(model, data, predicted):
    """Log a PyTorch model and check that reloading it by run id predicts the same."""
    original_uri = tracking.get_tracking_uri()
    # Cover both paths: log_model() with and without an already started run.
    for should_start_run in (False, True):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri(tmp.path("test"))
                if should_start_run:
                    mlflow.start_run()
                mlflow.pytorch.log_model(model, artifact_path="pytorch")

                # Load the model back from the active run.
                active_run_id = mlflow.active_run().info.run_uuid
                reloaded = mlflow.pytorch.load_model("pytorch", run_id=active_run_id)
                reloaded_predictions = _predict(reloaded, data)
                assert np.all(reloaded_predictions == predicted)
            finally:
                mlflow.end_run()
                tracking.set_tracking_uri(original_uri)
def test_log_artifact():
    """Check ``log_artifact`` and ``log_artifacts`` with and without a parent directory.

    Two small files are created in a temporary source directory. After each
    run finishes, the contents of the directory returned by
    ``get_artifact_uri`` are compared against the source files.
    """
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        artifact_src_dir = tempfile.mkdtemp()
        # Create artifacts
        artifact_paths = []
        for i in range(2):
            fd, path = tempfile.mkstemp(dir=artifact_src_dir)
            # mkstemp() returns an open descriptor; write through it so that it
            # is closed instead of leaked.
            with os.fdopen(fd, "w") as handle:
                handle.write("%s" % str(i))
            artifact_paths.append(path)
        path0 = artifact_paths[0]

        # Log an artifact, verify it exists in the directory returned by get_artifact_uri
        # after the run finishes
        artifact_parent_dirs = ["some_parent_dir", None]
        for parent_dir in artifact_parent_dirs:
            with start_run():
                run_artifact_dir = mlflow.get_artifact_uri()
                mlflow.log_artifact(path0, parent_dir)
            expected_dir = os.path.join(run_artifact_dir, parent_dir) \
                if parent_dir is not None else run_artifact_dir
            assert os.listdir(expected_dir) == [os.path.basename(path0)]
            # Join with the base name only: path0 is absolute, and joining an
            # absolute path discards expected_dir, which would compare the
            # source file with itself.
            logged_artifact_path = os.path.join(expected_dir, os.path.basename(path0))
            assert filecmp.cmp(logged_artifact_path, path0, shallow=False)

        # Log multiple artifacts, verify they exist in the directory returned by get_artifact_uri
        for parent_dir in artifact_parent_dirs:
            with start_run():
                run_artifact_dir = mlflow.get_artifact_uri()
                mlflow.log_artifacts(artifact_src_dir, parent_dir)
            # Check that the logged artifacts match
            expected_artifact_output_dir = os.path.join(run_artifact_dir, parent_dir) \
                if parent_dir is not None else run_artifact_dir
            dir_comparison = filecmp.dircmp(artifact_src_dir, expected_artifact_output_dir)
            assert len(dir_comparison.left_only) == 0
            assert len(dir_comparison.right_only) == 0
            assert len(dir_comparison.diff_files) == 0
            assert len(dir_comparison.funny_files) == 0
    finally:
        tracking.set_tracking_uri(None)
def test_start_run_context_manager():
    """Check run persistence and the final status of a clean run and a failing run."""
    try:
        tracking.set_tracking_uri(tempfile.mkdtemp())
        client_get_run = lambda uuid: tracking.MlflowClient().get_run(uuid)  # noqa: E731

        first_run = start_run()
        first_uuid = first_run.info.run_uuid
        with first_run:
            # start_run() must already have persisted the run information in the store.
            persisted_run = client_get_run(first_uuid)
            assert persisted_run is not None
            assert persisted_run.info == first_run.info
        assert client_get_run(first_uuid).info.status == RunStatus.FINISHED

        # A second run that fails must get a different UUID and end up FAILED.
        second_run = start_run()
        assert second_run.info.run_uuid != first_uuid
        with pytest.raises(Exception):
            with second_run:
                raise Exception("Failing run!")
        failed_run = client_get_run(second_run.info.run_uuid)
        assert failed_run.info.status == RunStatus.FAILED
    finally:
        tracking.set_tracking_uri(None)
def test_model_log(model, data, predicted):
    """Log a Keras model and compare predictions from the native and pyfunc loaders."""
    x, _ = data
    original_uri = tracking.get_tracking_uri()
    # Cover both paths: log_model() with and without an already started run.
    for should_start_run in (False, True):
        with TempDir(chdr=True, remove_on_exit=True) as tmp:
            try:
                tracking.set_tracking_uri("test")
                if should_start_run:
                    tracking.start_run()
                mlflow.keras.log_model(model, artifact_path="keras_model")

                # Reload through the Keras flavor.
                keras_run_id = tracking.active_run().info.run_uuid
                model_loaded = mlflow.keras.load_model("keras_model", run_id=keras_run_id)
                assert all(model_loaded.predict(x) == predicted)

                # Reload through the generic pyfunc flavor.
                pyfunc_run_id = tracking.active_run().info.run_uuid
                pyfunc_loaded = mlflow.pyfunc.load_pyfunc("keras_model", run_id=pyfunc_run_id)
                assert all(pyfunc_loaded.predict(x).values == predicted)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(original_uri)
def setup_mlflow_tracking(self, URI, experiment_name, run_name):
    """Point MLflow at ``URI``, resolve or create the experiment, and start a run.

    Sets ``self.client`` and ``self.run_id`` as side effects.

    :param URI: tracking URI passed to ``set_tracking_uri``.
    :param experiment_name: name of the experiment to reuse when it already
                            exists, otherwise to create.
    :param run_name: name given to the run that is started.
    """
    # select URI for server tracking
    set_tracking_uri(uri=URI)
    if is_tracking_uri_set():
        logging.debug('MLFlow URI: %s', get_tracking_uri())

    # CRUD interface
    self.client = MlflowClient(tracking_uri=get_tracking_uri())

    # Experiment setup: look the experiment up once and reuse the result
    # rather than querying the tracking backend a second time.
    exp = self.client.get_experiment_by_name(name=experiment_name)
    if exp is None:
        exp_id = self.client.create_experiment(name=experiment_name)
    else:
        exp_id = exp.experiment_id

    # Run setup
    mlflow.start_run(experiment_id=exp_id, run_name=run_name)
    self.run_id = mlflow.active_run().info.run_id
    data = self.client.get_run(self.run_id).data
    logging.info('MLFlow tracking started - Experiment: %s - Run: %s',
                 experiment_name, data.tags["mlflow.runName"])
def test_categorical_columns(self):
    """
    Check model logging on a dataset that contains a categorical column.

    See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/get_started/regression/imports85.py
    for reference code.
    """
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        path = os.path.abspath("tests/data/uci-autos-imports-85.data")
        # Column order matters for the CSV reader, hence the OrderedDicts.
        defaults = collections.OrderedDict([
            ("body-style", [""]),
            ("curb-weight", [0.0]),
            ("highway-mpg", [0.0]),
            ("price", [0.0]),
        ])
        types = collections.OrderedDict(
            (column, type(default[0])) for column, default in defaults.items())
        df = pandas.read_csv(path, names=types.keys(), dtype=types, na_values="?")
        df = df.dropna()

        # Split the label off the features frame.
        y_train = df.pop("price")

        # Input function used for training and for the reference predictions.
        trainingFeatures = {column: df[column].values for column in df}
        input_train = tf.estimator.inputs.numpy_input_fn(
            trainingFeatures, y_train.values, shuffle=False, batch_size=1)

        # Feature columns required by the DNNRegressor.
        body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
        body_style = tf.feature_column.categorical_column_with_vocabulary_list(
            key="body-style", vocabulary_list=body_style_vocab)
        feature_columns = [
            tf.feature_column.numeric_column(key="curb-weight"),
            tf.feature_column.numeric_column(key="highway-mpg"),
            # A DNN needs dense input: wrap the categorical column in an
            # indicator_column to get a one-hot vector.
            tf.feature_column.indicator_column(body_style),
        ]

        # DNNRegressor with two 20-unit hidden layers over the columns above.
        estimator = tf.estimator.DNNRegressor(
            hidden_units=[20, 20], feature_columns=feature_columns)
        estimator.train(input_fn=input_train, steps=10)

        # Reference predictions on the training data; assume the DNNRegressor
        # produces a single output column named 'predictions'.
        pred_col = "predictions"
        estimator_preds = [row[pred_col] for row in estimator.predict(input_train)]
        estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})

        # Log into the temp folder so everything is deleted after the test.
        previous_uri = tracking.get_tracking_uri()
        tracking_dir = os.path.abspath(tmp.path("mlruns"))
        tracking.set_tracking_uri("file://{}".format(tracking_dir))
        tracking.start_run()
        try:
            # Map each feature name (str) to a placeholder tensor.
            feature_spec = {
                "body-style": tf.placeholder("string", name="body-style", shape=[None]),
                "curb-weight": tf.placeholder("float", name="curb-weight", shape=[None]),
                "highway-mpg": tf.placeholder("float", name="highway-mpg", shape=[None]),
            }
            pyfunc_preds_df = self.helper(feature_spec, tmp, estimator, df)

            # Compare with a tolerance; some imprecision is expected with TensorFlow.
            pandas.testing.assert_frame_equal(
                pyfunc_preds_df, estimator_preds_df, check_less_precise=6)
        finally:
            # Restore the previous logging location.
            tracking.end_run()
            tracking.set_tracking_uri(previous_uri)
def test_linear():
    """Run the data-download and linear-regression projects, then score the logged model."""
    saved_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            root_tracking_dir = tmp.path("root_tracking_dir")
            for directory in (diamonds, root_tracking_dir):
                os.mkdir(directory)
            tracking.set_tracking_uri(root_tracking_dir)

            # Options shared by both project launches.
            launch_options = dict(
                entry_point="main", version=None, mode="local", cluster_spec=None,
                git_username=None, git_password=None, use_conda=True, storage_dir=None)
            test_parquet = os.path.join(diamonds, "test_diamonds.parquet")

            # Download the diamonds dataset via mlflow run
            mlflow.set_experiment("test-experiment")
            run(".", parameters={"dest-dir": diamonds}, **launch_options)

            # Run the main linear app via mlflow
            linear_parameters = {
                "train": os.path.join(diamonds, "train_diamonds.parquet"),
                "test": test_parquet,
                "alpha": .001,
                "l1-ratio": .5,
                "label-col": "price",
            }
            submitted_run = run(
                "apps/linear-regression", parameters=linear_parameters, **launch_options)

            loaded_pyfunc = load_pyfunc(path="model", run_id=submitted_run.run_id)
            # Drop the label column so only the features are used for prediction.
            features_df = pandas.read_parquet(test_parquet).drop(columns="price")
            predictions = loaded_pyfunc.predict(features_df)
            # Make sure the data is of the right type
            assert isinstance(predictions[0], numpy.float64)
        finally:
            tracking.set_tracking_uri(saved_uri)
# %%
start = datetime.now()
prepared_data = prepared_data.spark.persist()

# Change #6: using scikit-learn with Koalas requires MLflow.
# Training still runs on pandas DataFrames; only prediction can be done with Koalas.
prepared_data = prepared_data.to_pandas()

# Prepare the MLflow environment accordingly.
from mlflow.tracking import MlflowClient, set_tracking_uri
import mlflow.sklearn
from tempfile import mkdtemp

d = mkdtemp("koalas_mlflow")
set_tracking_uri("file:{}".format(d))
client = MlflowClient()
exp = mlflow.create_experiment("my_experiment")
mlflow.set_experiment("my_experiment")

# Train/test split
from sklearn.model_selection import train_test_split

X = prepared_data.loc[:, prepared_data.columns != 'new_confirmed']
y = prepared_data['new_confirmed'].ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Scale the values
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
def test_categorical_columns(self):
    """
    Check model logging on a downloaded dataset that contains a categorical column.

    See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/get_started/regression/imports85.py
    for reference code.
    """
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        # Fetch the data file; it is read into a pandas DataFrame below.
        URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
        path = tf.contrib.keras.utils.get_file(URL.rsplit("/", 1)[-1], URL)
        # Column order matters for the CSV reader, hence the OrderedDicts.
        defaults = collections.OrderedDict([
            ("body-style", [""]),
            ("curb-weight", [0.0]),
            ("highway-mpg", [0.0]),
            ("price", [0.0]),
        ])
        types = collections.OrderedDict(
            (column, type(default[0])) for column, default in defaults.items())
        df = pandas.read_csv(path, names=types.keys(), dtype=types, na_values="?")
        df = df.dropna()

        # Split the label off the features frame.
        y_train = df.pop("price")

        # Input function used for training and for the reference predictions.
        trainingFeatures = {column: df[column].values for column in df}
        input_train = tf.estimator.inputs.numpy_input_fn(
            trainingFeatures, y_train.values, shuffle=False, batch_size=1)

        # Feature columns required by the DNNRegressor.
        body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
        body_style = tf.feature_column.categorical_column_with_vocabulary_list(
            key="body-style", vocabulary_list=body_style_vocab)
        feature_columns = [
            tf.feature_column.numeric_column(key="curb-weight"),
            tf.feature_column.numeric_column(key="highway-mpg"),
            # A DNN needs dense input: wrap the categorical column in an
            # indicator_column to get a one-hot vector.
            tf.feature_column.indicator_column(body_style),
        ]

        # DNNRegressor with two 20-unit hidden layers over the columns above.
        estimator = tf.estimator.DNNRegressor(
            hidden_units=[20, 20], feature_columns=feature_columns)
        estimator.train(input_fn=input_train, steps=100)
        # Reference predictions on the training data; iterated inside the try block.
        estimator_preds = estimator.predict(input_train)

        # Log into the temp folder so everything is deleted after the test.
        previous_uri = tracking.get_tracking_uri()
        tracking_dir = os.path.abspath(tmp.path("mlruns"))
        tracking.set_tracking_uri("file://{}".format(tracking_dir))
        tracking.start_run()
        try:
            # Map each feature name (str) to a placeholder tensor.
            feature_spec = {
                "body-style": tf.placeholder("string", name="body-style", shape=[None]),
                "curb-weight": tf.placeholder("float", name="curb-weight", shape=[None]),
                "highway-mpg": tf.placeholder("float", name="highway-mpg", shape=[None]),
            }
            saved = [row['predictions'] for row in estimator_preds]
            results = self.helper(feature_spec, tmp, estimator, df)

            # TensorFlow is known to have precision errors, hence the almost-equal check.
            np.testing.assert_array_almost_equal(saved, results, decimal=2)
        finally:
            # Restore the previous logging location.
            tracking.end_run()
            tracking.set_tracking_uri(previous_uri)
def test_model_log(tmpdir):
    """Log a Spark ML pipeline and compare pyfunc, reloaded-model and UDF predictions."""
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    pip_deps = ["pyspark=={}".format(pyspark_version)]
    _mlflow_conda_env(conda_env, additional_pip_deps=pip_deps)

    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    # Plain string column names, needed to make spark_udf work.
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)
    pandas_df['label'] = pd.Series(iris.target)

    session_builder = pyspark.sql.SparkSession.builder
    session_builder = session_builder.config(
        key="spark_session.python.worker.reuse", value=True)
    spark_session = session_builder.master("local-cluster[2, 1, 1024]").getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)

    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = Pipeline(stages=[assembler, lr]).fit(spark_df)
    # Reference predictions from the in-memory pipeline model.
    preds_df = model.transform(spark_df)
    preds1 = [row.prediction for row in preds_df.select("prediction").collect()]

    old_tracking_uri = tracking.get_tracking_uri()
    cnt = 0
    # Cover both paths: log_model() with and without an already started run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                tracking.set_tracking_uri("file://{}".format(tracking_dir))
                if should_start_run:
                    tracking.start_run()

                # Each combination logs under its own artifact path.
                artifact_path = "model%d" % cnt
                cnt += 1
                log_kwargs = dict(artifact_path=artifact_path, spark_model=model)
                if dfs_tmp_dir:
                    log_kwargs["dfs_tmpdir"] = dfs_tmp_dir
                sparkm.log_model(**log_kwargs)
                run_id = tracking.active_run().info.run_uuid

                # test pyfunc
                loaded_pyfunc = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = loaded_pyfunc.predict(pandas_df)
                assert preds1 == preds2

                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id)
                reloaded_preds_df = reloaded_model.transform(spark_df)
                preds3 = [
                    row.prediction
                    for row in reloaded_preds_df.select("prediction").collect()
                ]
                assert preds1 == preds3

                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4

                # make sure we did not leave any temp files behind
                dfs_dir = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(dfs_dir)
                assert not os.listdir(dfs_dir)
                shutil.rmtree(dfs_dir)
            finally:
                tracking.end_run()
                tracking.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)