def train(msg):
    """Train a random forest regressor on the auto insurance claims data.

    Reads the full dataset plus its pre-split ``-train``/``-test``
    companions, fits a CategoricalEncoder on the full feature set, trains
    on the encoded train split, scores R^2 on the test split, and pickles
    the model together with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv"
    )
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("Total Claim Amount", axis=1)
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_train = train_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)
    y_test = test_data["Total Claim Amount"]

    # Fit the encoder on the entire dataset so train and test share one
    # category mapping, then encode both splits.
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # Train and evaluate; R^2 on the held-out test split is the reported
    # metric.  (The original also computed an unused squared-error sum,
    # removed here as dead code.)
    rf = RandomForestRegressor(
        n_estimators=10, max_depth=11, bootstrap=True, random_state=RANDOM_SEED
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        rf, scaler, "Random forest", err, "Random forest Regressor", model_binary
    )
    print(err)
    return f"model: {model_binary}"
def train(msg):
    """Train a decision tree classifier on the bank marketing dataset.

    Fits an Encoder on the full feature set, trains on the encoded
    ``-train`` split, scores accuracy on the ``-test`` split, and pickles
    the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/bank_marketing-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("deposit", axis=1)
    X_train_df = train_data.drop("deposit", axis=1)
    y_train = train_data["deposit"]
    X_test_df = test_data.drop("deposit", axis=1)
    y_test = test_data["deposit"]

    # Fit the encoder on the entire dataset so train and test share one
    # encoding, then encode both splits.
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_SEED)
    dtree.fit(X_train.values, y_train.values)
    dtree_acc = dtree.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        dtree,
        scaler,
        "Decision Tree",
        dtree_acc,
        "Decision Tree Classifier",
        model_binary,
    )
    print(dtree_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a decision tree classifier on the diabetes dataset.

    Each CSV is passed through ``prep_diabetes_dataset`` before use.  A
    WrappedStandardScaler is fit on the full feature set and applied to
    both splits; accuracy on the test split is pickled with the model.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    )
    test_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))
    )

    # Separate outcome from features.
    X = data.drop("Outcome", axis=1)
    X_train_df = train_data.drop("Outcome", axis=1)
    y_train = train_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)
    y_test = test_data["Outcome"]

    # Fit the scaler on the entire dataset so train and test share one
    # scaling, then scale both splits.
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_SEED)
    dtree.fit(X_train, y_train)
    dtree_acc = dtree.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    # Fixed: description string had a stray trailing apostrophe
    # ("Basic Decision Tree model'").
    pickle_model(
        dtree,
        scaler,
        "Decision Tree",
        dtree_acc,
        "Basic Decision Tree model",
        model_binary,
    )
    print(dtree_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a gradient boosting classifier on the customer churn dataset.

    Fits an Encoder on the full feature set, trains on the encoded
    ``-train`` split, scores accuracy on the ``-test`` split, and pickles
    the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/customer_churn-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("Exited", axis=1)
    X_train_df = train_data.drop("Exited", axis=1)
    y_train = train_data["Exited"]
    X_test_df = test_data.drop("Exited", axis=1)
    y_test = test_data["Exited"]

    # Fit the encoder on the entire dataset so train and test share one
    # encoding, then encode both splits.
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    # NOTE(review): loss="deviance" was renamed to "log_loss" and removed
    # in scikit-learn >= 1.3 -- confirm the pinned sklearn version.
    gbMod = GradientBoostingClassifier(
        loss="deviance", n_estimators=200, random_state=RANDOM_SEED
    )
    gbMod.fit(X_train, y_train)
    gb_acc = gbMod.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(gbMod, scaler, "GBM", gb_acc, "Gradient Boosting Model", model_binary)
    print(gb_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a support vector regressor on the auto insurance claims data.

    Fits a CategoricalEncoder on the full feature set, trains on the
    encoded ``-train`` split, scores R^2 on the ``-test`` split, and
    pickles the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv"
    )
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("Total Claim Amount", axis=1)
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_train = train_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)
    y_test = test_data["Total Claim Amount"]

    # Fit the encoder on the entire dataset so train and test share one
    # category mapping, then encode both splits.
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    svr = SVR()
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        svr,
        scaler,
        "Support vector regressor",
        err,
        "Support vector regressor",
        model_binary,
    )
    print(err)
    return f"model: {model_binary}"
def train(msg):
    """Train a logistic regression classifier on the adult income dataset.

    Fits an Encoder on the full feature set, trains on the encoded
    ``-train`` split, scores accuracy on the ``-test`` split, and pickles
    the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/adult_income-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("income", axis=1)
    X_train_df = train_data.drop("income", axis=1)
    y_train = train_data["income"]
    X_test_df = test_data.drop("income", axis=1)
    y_test = test_data["income"]

    # Fit the encoder on the entire dataset so train and test share one
    # encoding, then encode both splits.
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    logit = LogisticRegression(random_state=RANDOM_SEED, solver="lbfgs", max_iter=1000)
    logit.fit(X_train.values, y_train.values)
    logit_acc = logit.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        logit, scaler, "LR", logit_acc, "Logistic Regression Classifier", model_binary
    )
    print(logit_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a logistic regression classifier on the diabetes dataset.

    Each CSV is passed through ``prep_diabetes_dataset`` before use.  A
    WrappedStandardScaler is fit on the full feature set and applied to
    both splits; accuracy on the test split is pickled with the model.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    )
    test_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))
    )

    # Separate outcome from features.
    X = data.drop("Outcome", axis=1)
    X_train_df = train_data.drop("Outcome", axis=1)
    y_train = train_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)
    y_test = test_data["Outcome"]

    # Fit the scaler on the entire dataset so train and test share one
    # scaling, then scale both splits.
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training (the original scored the test split twice;
    # a single evaluation is sufficient)
    logit = LogisticRegression(random_state=RANDOM_SEED, solver="lbfgs")
    logit.fit(X_train, y_train)
    logit_acc = logit.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        logit, scaler, "LR", logit_acc, "Logistic Regression Classifier", model_binary
    )
    print(logit_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train an MLP classifier on the German credit dataset.

    Fits an Encoder on the full feature set, trains on the encoded
    ``-train`` split, scores accuracy on the ``-test`` split, and pickles
    the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/german_credit-decoded.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("outcome", axis=1)
    X_train_df = train_data.drop("outcome", axis=1)
    y_train = train_data["outcome"]
    X_test_df = test_data.drop("outcome", axis=1)
    y_test = test_data["outcome"]

    # Fit the encoder on the entire dataset, then encode both splits.
    # NOTE(review): sibling scripts construct Encoder(X); here it is
    # Encoder() with no argument -- confirm which form Encoder expects.
    scaler = Encoder()
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    mlp = MLPClassifier(
        hidden_layer_sizes=(20, 20), max_iter=2000, random_state=RANDOM_SEED
    )
    mlp.fit(X_train.values, y_train.values)
    mlp_acc = mlp.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(mlp, scaler, "MLP", mlp_acc, "Basic MLP model", model_binary)
    print(mlp_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train an MLP classifier on the customer churn dataset.

    Fits an Encoder on the full feature set, trains on the encoded
    ``-train`` split, scores accuracy on the ``-test`` split, and pickles
    the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/customer_churn-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("Exited", axis=1)
    X_train_df = train_data.drop("Exited", axis=1)
    y_train = train_data["Exited"]
    X_test_df = test_data.drop("Exited", axis=1)
    y_test = test_data["Exited"]

    # Fit the encoder on the entire dataset so train and test share one
    # encoding, then encode both splits.
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training (the original scored the test split twice;
    # a single evaluation is sufficient)
    mlp = MLPClassifier(random_state=RANDOM_SEED)
    mlp.fit(X_train, y_train)
    mlp_acc = mlp.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(mlp, scaler, "MLP", mlp_acc, "Basic MLP classifier", model_binary)
    print(mlp_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a support vector machine classifier on the churn dataset.

    Fits an Encoder on the full feature set, trains on the encoded
    ``-train`` split, scores accuracy on the ``-test`` split, and pickles
    the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/customer_churn-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("Exited", axis=1)
    X_train_df = train_data.drop("Exited", axis=1)
    y_train = train_data["Exited"]
    X_test_df = test_data.drop("Exited", axis=1)
    y_test = test_data["Exited"]

    # Fit the encoder on the entire dataset so train and test share one
    # encoding, then encode both splits.
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training (local renamed from SVM to clf: an UPPER_CASE
    # local looked like a constant and was confusable with the svm module)
    clf = svm.SVC(gamma="scale", random_state=RANDOM_SEED)
    clf.fit(X_train.values, y_train.values)
    svm_acc = clf.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(clf, scaler, "SVM", svm_acc, "Support Vector Machine", model_binary)
    print(svm_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a random forest classifier on the heart disease dataset.

    Fits a CategoricalEncoder on the full feature set, trains on the
    encoded ``-train`` split, scores accuracy on the ``-test`` split, and
    pickles the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/heart_disease_multiclass-prepped.csv"
    )
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("class_att", axis=1)
    X_train_df = train_data.drop("class_att", axis=1)
    y_train = train_data["class_att"]
    X_test_df = test_data.drop("class_att", axis=1)
    y_test = test_data["class_att"]

    # Fit the encoder on the entire dataset so train and test share one
    # category mapping, then encode both splits.
    scaler = CategoricalEncoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training -- random_state added for consistency with the
    # sibling training scripts, which all seed their estimators explicitly
    rf = RandomForestClassifier(random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)
    rf_acc = rf.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(rf, scaler, "RF", rf_acc, "Random Forest Classifier", model_binary)
    print(rf_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train an L1-regularized linear (Lasso) regressor on the auto
    insurance claims data.

    Fits a CategoricalEncoder on the full feature set, trains on the
    encoded ``-train`` split, scores R^2 on the ``-test`` split, and
    pickles the model with its encoder.

    Args:
        msg: message whose ``payload`` dict may carry ``"$ref"`` (path to
            the full CSV) and ``"model_name"`` (basename for the pickle).

    Returns:
        ``"model: models/<model_name>.pkl"``.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv"
    )
    save_model_as = msg.payload.get("model_name")

    # The -train/-test splits are expected alongside the full dataset.
    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features.
    X = data.drop("Total Claim Amount", axis=1)
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_train = train_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)
    y_test = test_data["Total Claim Amount"]

    # Fit the encoder on the entire dataset so train and test share one
    # category mapping, then encode both splits.
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training.  normalize=False was dropped: it was the
    # default, and the parameter was removed in scikit-learn 1.2, so
    # passing it raises TypeError on current versions.
    lmodel_l1 = Lasso(
        alpha=1e-4,
        copy_X=True,
        fit_intercept=True,
        max_iter=1000,
        positive=False,
        precompute=False,
        random_state=RANDOM_SEED,
        selection="cyclic",
        tol=0.0001,
        warm_start=False,
    )
    lmodel_l1.fit(X_train, y_train)
    y_pred = lmodel_l1.predict(X_test)
    # R^2 on the held-out test split.  (The original also computed an
    # unused squared-error sum, removed here as dead code.)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        lmodel_l1,
        scaler,
        "Linear L1",
        err,
        "Linear regression with L1 regularization",
        model_binary,
    )
    print(err)
    return f"model: {model_binary}"