def run_training():
    """Train the model."""
    # read in data
    data = load_dataset()

    # fit the training pipeline on the translated headline text
    # ("traducción") and the fake/real label
    pipeline.fake_news_pipeline.fit(data["traducción"].values, data["fake"].values)

    save_pipeline(pipeline_to_save=pipeline.fake_news_pipeline)
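Once run_training has persisted the fitted pipeline, it can score new headlines. A minimal usage sketch; load_pipeline is assumed here as the counterpart to save_pipeline and is not shown in the source:

# load_pipeline is a hypothetical counterpart to the save_pipeline call above
trained_pipeline = load_pipeline()

# the pipeline was fitted on raw text from the "traducción" column,
# so it accepts an iterable of strings
predictions = trained_pipeline.predict(["Ejemplo de titular de noticia"])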
import math


def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
    assert math.ceil(subject.get('predictions')[0]) == 112476
def run_training():
    """Train the model."""
    # read in data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # clean the target variable
    data = clean_target_variable(data)

    # split training & validation data
    X_train, X_val, y_train, y_val = split_dataset(data)

    _logger.info('...fitting model')
    pipeline.interest_rate_pipe.fit(X_train[config.FEATURES], y_train)

    _logger.info(f"...saving model version: {_version}")
    save_pipeline(pipeline.interest_rate_pipe)
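The training scripts above delegate persistence to a save_pipeline helper that is not shown. A minimal sketch using joblib; the directory, version string, and logger are illustrative stand-ins for whatever the real project keeps in its config module:

import logging
from pathlib import Path

import joblib

_logger = logging.getLogger(__name__)

# illustrative stand-ins for values the real project reads from config
TRAINED_MODEL_DIR = Path("trained_models")
_version = "0.1.0"


def save_pipeline(pipeline_to_persist) -> None:
    """Persist a fitted pipeline to disk under a versioned file name."""
    TRAINED_MODEL_DIR.mkdir(exist_ok=True)
    save_file_name = f"model_version_{_version}.pkl"
    joblib.dump(pipeline_to_persist, TRAINED_MODEL_DIR / save_file_name)
    _logger.info(f"saved pipeline: {save_file_name}")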
def test_make_multiple_predictions():
    # Given
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    multiple_test_input = test_data

    # When
    subject = make_prediction(input_data=multiple_test_input)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 1451

    # We expect some rows to be filtered out
    assert len(subject.get('predictions')) != original_data_length
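Both tests call a make_prediction entry point that is not shown here. The sketch below captures only the behaviour the assertions rely on: a dict holding a 'predictions' list of floats, with input rows that fail validation dropped, which is why the multiple-prediction count can be smaller than the raw row count. The feature names, validation rule, and model path are assumptions for illustration:

import joblib
import pandas as pd

# illustrative placeholders; the real project takes these from its config module
REQUIRED_FEATURES = ["GrLivArea", "OverallQual"]
_trained_pipe = joblib.load("trained_models/model_version_0.1.0.pkl")


def make_prediction(*, input_data: pd.DataFrame) -> dict:
    """Validate input rows and return model predictions (hypothetical sketch)."""
    # drop rows the pipeline cannot score; this is what makes
    # len(predictions) differ from the original row count
    validated_data = input_data.dropna(subset=REQUIRED_FEATURES)

    predictions = _trained_pipe.predict(validated_data)
    return {"predictions": predictions.tolist()}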
def run_training() -> None:
    """Train the model."""
    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # transform the target
    data[config.TARGET] = data[config.PRE_TARGET].map(
        {'Fully Paid': 1, 'Charged Off': 0})
    data = data.drop(config.PRE_TARGET, axis=1)

    # divide train and test, setting the random_state
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)

    # set the MLflow experiment
    mlflow.set_experiment(config.EXPERIMENT_NAME)

    with mlflow.start_run():
        loan_pipe.fit(X_train[config.FEATURES], y_train)

        # log the dataset location
        dataset_full_path_name = dataset_location()
        mlflow.set_tag("dataset", dataset_full_path_name)

        # log the model parameters
        mlflow.log_param("penalty", model_config.PENALTY)
        mlflow.log_param("dual", model_config.DUAL)
        mlflow.log_param("C", model_config.C)
        mlflow.log_param("fit_intercept", model_config.FIT_INTERCEPT)
        mlflow.log_param("random_state", model_config.RANDOM_STATE)
        mlflow.log_param("class_weight", model_config.CLASS_WEIGHT)
        mlflow.log_param("max_iter", model_config.MAX_ITER)
        mlflow.log_param("multi_class", model_config.MULTI_CLASS)

        # log the training score as a metric
        score = loan_pipe.score(X_train, y_train)
        mlflow.log_metric("score", score)

        # save the sklearn pipeline as an MLflow model
        conda_env = mlflow.sklearn.get_default_conda_env()
        mlflow.sklearn.log_model(loan_pipe, "sklearn_pipeline", conda_env=conda_env)
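After the run completes, the logged pipeline can be loaded back through MLflow's standard model-URI scheme and used for scoring. A short usage sketch; <run_id> is a placeholder for the ID assigned to the training run:

import mlflow.sklearn

# "runs:/<run_id>/sklearn_pipeline" points at the artifact logged above;
# replace <run_id> with the run ID shown in the MLflow tracking UI
loaded_pipe = mlflow.sklearn.load_model("runs:/<run_id>/sklearn_pipeline")
predictions = loaded_pipe.predict(X_test[config.FEATURES])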
def run_training() -> None:
    """Train the model."""
    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)  # we are setting the seed here

    # transform the target
    y_train = np.log(y_train)
    y_test = np.log(y_test)

    pipeline.price_pipe.fit(X_train[config.FEATURES], y_train)
    save_pipeline(pipeline_to_persist=pipeline.price_pipe)
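Because price_pipe is fitted against np.log(y_train), its raw predictions come back in log space; callers need np.exp to recover prices on the original scale. For example, evaluating on the held-out split from the script above:

import numpy as np

# predictions are log-prices, so exponentiate to recover actual prices
log_predictions = pipeline.price_pipe.predict(X_test[config.FEATURES])
price_predictions = np.exp(log_predictions)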