import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, roc_auc_score, mean_squared_error,
                             mean_absolute_error, r2_score)

# LendingClubDataProvider is the project's data-loading class (defined elsewhere);
# its run() method returns the train/test feature matrices and labels.


class LendingClubTrainingPipeline():
    def __init__(self, spark, input_path, model_name, limit=None):
        self.spark = spark
        self.input_path = input_path
        self.model_name = model_name
        self.limit = limit
        self.data_provider = LendingClubDataProvider(spark, input_path, limit)

    def run(self):
        X_train, X_test, Y_train, Y_test = self.data_provider.run()
        self.train(X_train, X_test, Y_train, Y_test)

    def train(self, X_train, X_test, Y_train, Y_test):
        cl = LogisticRegression(random_state=42, max_iter=100)
        # cl = RandomForestClassifier(random_state=42)
        cl.fit(X_train, Y_train)

        # Log evaluation metrics and the fitted model to a new MLflow run
        with mlflow.start_run(run_name="Training") as run:
            self.eval_and_log_metrics(cl, X_test, Y_test)
            mlflow.sklearn.log_model(cl, "model")

    def eval_and_log_metrics(self, estimator, X, Y):
        predictions = estimator.predict(X)

        # Calc metrics
        acc = accuracy_score(Y, predictions)
        roc = roc_auc_score(Y, predictions)
        mse = mean_squared_error(Y, predictions)
        mae = mean_absolute_error(Y, predictions)
        r2 = r2_score(Y, predictions)

        # Print metrics
        print(" acc: {}".format(acc))
        print(" roc: {}".format(roc))
        print(" mse: {}".format(mse))
        print(" mae: {}".format(mae))
        print(" R2: {}".format(r2))

        # Log metrics
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("roc", roc)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        # Mark this run as a candidate for the evaluation/deployment pipeline
        mlflow.set_tag('candidate', 'true')
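# A minimal usage sketch for the training pipeline. The experiment path,
# input path, and model name below are hypothetical placeholders, and
# `spark` is assumed to be an existing SparkSession (e.g. in a Databricks
# notebook); adjust them to your environment.
mlflow.set_experiment("/Shared/lending_club")  # hypothetical experiment path
training_pipeline = LendingClubTrainingPipeline(
    spark,
    input_path="dbfs:/path/to/lending_club/parquet/",  # placeholder input path
    model_name="lending_club_scoring",                 # placeholder registry name
    limit=10000)
training_pipeline.run()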
import time

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from mlflow.exceptions import RestException
from sklearn.metrics import roc_auc_score


class LendingClubModelEvaluationPipeline():
    def __init__(self, spark, experimentID, model_name, input_path, limit=None):
        self.spark = spark
        self.input_path = input_path
        self.model_name = model_name
        self.limit = limit
        self.experimentID = experimentID
        self.data_provider = LendingClubDataProvider(spark, input_path, limit)

    def run(self):
        mlflow_client = MlflowClient()
        _, X_test, _, Y_test = self.data_provider.run()

        # Score all candidate runs on the test set and pick the best one
        cand_run_ids = self.get_candidate_models()
        best_cand_roc, best_cand_run_id = self.get_best_model(cand_run_ids, X_test, Y_test)
        print('Best ROC (candidate models): ', best_cand_roc)

        # Score the current Production model(s); fall back to -1 if the
        # registered model does not exist yet
        try:
            versions = mlflow_client.get_latest_versions(self.model_name, stages=['Production'])
            prod_run_ids = [v.run_id for v in versions]
            best_prod_roc, best_prod_run_id = self.get_best_model(prod_run_ids, X_test, Y_test)
        except RestException:
            best_prod_roc = -1
        print('ROC (production models): ', best_prod_roc)

        if best_cand_roc >= best_prod_roc:
            # deploy new model
            model_version = mlflow.register_model(
                "runs:/" + best_cand_run_id + "/model", self.model_name)
            time.sleep(5)
            mlflow_client.transition_model_version_stage(
                name=self.model_name, version=model_version.version, stage="Production")
            print('Deployed version: ', model_version.version)

        # remove candidate tags
        for run_id in cand_run_ids:
            mlflow_client.set_tag(run_id, 'candidate', 'false')

    def get_best_model(self, run_ids, X, Y):
        best_roc = -1
        best_run_id = None
        for run_id in run_ids:
            roc = self.evaluate_model(run_id, X, Y)
            if roc > best_roc:
                best_roc = roc
                best_run_id = run_id
        return best_roc, best_run_id

    def get_candidate_models(self):
        # Read the MLflow experiment via the Databricks "mlflow-experiment"
        # Spark data source and keep only runs tagged candidate='true'
        spark_df = self.spark.read.format("mlflow-experiment").load(self.experimentID)
        pdf = spark_df.where("tags.candidate='true'").select("run_id").toPandas()
        return pdf['run_id'].values

    def evaluate_model(self, run_id, X, Y):
        model = mlflow.sklearn.load_model('runs:/{}/model'.format(run_id))
        predictions = model.predict(X)
        # acc = accuracy_score(Y, predictions)
        roc = roc_auc_score(Y, predictions)
        return roc
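# A minimal usage sketch for the evaluation pipeline. The experiment ID,
# input path, and model name are hypothetical placeholders; the experiment ID
# must refer to the MLflow experiment the candidate training runs were
# logged to, and `spark` is assumed to be an existing SparkSession.
evaluation_pipeline = LendingClubModelEvaluationPipeline(
    spark,
    experimentID="1234567890",                         # placeholder experiment ID
    model_name="lending_club_scoring",                 # placeholder registry name
    input_path="dbfs:/path/to/lending_club/parquet/",  # placeholder input path
    limit=10000)
evaluation_pipeline.run()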