def train(msg):
    """Fit an XGBoost income classifier and serialize it with its encoder."""
    # Seed both RNGs so repeated runs build the same model.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    csv_uri = msg.payload.get("$ref", "./data/adult_income-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # The prepared dataset ships with fixed "-train"/"-test" companion splits.
    full_frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "income"
    y = full_frame[target]
    X = full_frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Fit the encoder on the entire dataset, then apply it per split
    # (the test split is encoded only to compute the accuracy metric).
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Train the booster and evaluate on the held-out split.
    xgbt = xgb.XGBClassifier(objective="binary:logistic",
                             random_state=RANDOM_SEED)
    xgbt.fit(X_train, y_train)
    predictions = [round(value) for value in xgbt.predict(X_test)]
    xgbt_acc = accuracy_score(y_test, predictions)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        xgbt,
        scaler,
        "XGBoost",
        xgbt_acc,
        "Extreme Gradient Boosting Classifier",
        model_binary,
    )
    print(xgbt_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a random-forest regressor for total claim amount and pickle it.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    # Companion splits follow the "<name>-train.csv" / "<name>-test.csv" convention.
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = pd.read_csv(train_dataset)
    test_data = pd.read_csv(test_dataset)

    # Separate outcome
    y = data["Total Claim Amount"]
    X = data.drop("Total Claim Amount", axis=1)
    y_train_df = train_data["Total Claim Amount"]
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_test_df = test_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)

    # create encoder on entire dataset
    scaler = CategoricalEncoder()
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    rf = RandomForestRegressor(n_estimators=10,
                               max_depth=11,
                               bootstrap=True,
                               random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Prediction error: R^2 on the held-out test split.
    # (A sum-of-squares error was previously computed here but never used; removed.)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(rf, scaler, "Random forest", err,
                 "Random forest Regressor", model_binary)
    print(err)
    return f"model: {model_binary}"
def train(msg):
    """Fit a decision-tree deposit classifier and serialize it with its encoder."""
    # Deterministic runs: seed the stdlib and NumPy generators.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    csv_uri = msg.payload.get("$ref", "./data/bank_marketing-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # Load the full dataset and its pre-built train/test splits.
    frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "deposit"
    y = frame[target]
    X = frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Encoder is fitted on the entire dataset; each split is then transformed
    # (the test split only so the accuracy metric can be computed).
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Train and score on raw ndarrays, as the estimator expects here.
    dtree = DecisionTreeClassifier(criterion="entropy",
                                   random_state=RANDOM_SEED)
    dtree.fit(X_train.values, y_train.values)
    dtree_acc = dtree.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        dtree,
        scaler,
        "Decision Tree",
        dtree_acc,
        "Decision Tree Classifier",
        model_binary,
    )
    print(dtree_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a decision-tree diabetes classifier and pickle it with its scaler.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    # Each CSV goes through the shared diabetes prep step before use.
    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = prep_diabetes_dataset(pd.read_csv(train_dataset))
    test_data = prep_diabetes_dataset(pd.read_csv(test_dataset))

    # Separate outcome
    y = data["Outcome"]
    X = data.drop("Outcome", axis=1)
    y_train_df = train_data["Outcome"]
    X_train_df = train_data.drop("Outcome", axis=1)
    y_test_df = test_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)

    # create encoder on entire dataset
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    dtree = DecisionTreeClassifier(criterion="entropy",
                                   random_state=RANDOM_SEED)
    dtree.fit(X_train, y_train)
    dtree_acc = dtree.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        dtree,
        scaler,
        "Decision Tree",
        dtree_acc,
        # Fixed: description previously contained a stray trailing apostrophe.
        "Basic Decision Tree model",
        model_binary,
    )
    print(dtree_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a random-forest churn classifier and pickle it with its encoder.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref",
                                        "./data/customer_churn-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = pd.read_csv(train_dataset)
    test_data = pd.read_csv(test_dataset)

    # Separate outcome
    y = data["Exited"]
    X = data.drop("Exited", axis=1)
    y_train_df = train_data["Exited"]
    X_train_df = train_data.drop("Exited", axis=1)
    y_test_df = test_data["Exited"]
    X_test_df = test_data.drop("Exited", axis=1)

    # create encoder on entire dataset
    scaler = Encoder(X)
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    rfMod = RandomForestClassifier(n_estimators=10,
                                   criterion="gini",
                                   random_state=RANDOM_SEED)
    rfMod.fit(X_train, y_train)

    # Compute the model accuracy on the given test data and labels
    rf_acc = rfMod.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    # Fixed: metadata previously mislabeled this random forest as a
    # gradient-boosting model ("GBM" / "Gradient Boosting Model").
    pickle_model(rfMod, scaler, "RF", rf_acc,
                 "Random Forest Classifier", model_binary)
    print(rf_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a support-vector regressor for claim amounts and pickle it."""
    # Deterministic runs: seed the stdlib and NumPy generators.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    csv_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    save_model_as = msg.payload.get("model_name")

    # Full dataset plus its fixed "-train"/"-test" companion splits.
    frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "Total Claim Amount"
    y = frame[target]
    X = frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Encoder is fitted on the full feature set, then applied per split;
    # the test split is transformed only so the model can be scored.
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Fit the regressor and score it with R^2 on the held-out split.
    svr = SVR()
    svr.fit(X_train, y_train)
    err = r2_score(y_test, svr.predict(X_test))

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(svr, scaler, "Support vector regressor", err,
                 "Support vector regressor", model_binary)
    print(err)
    return f"model: {model_binary}"
def train(msg):
    """Train a logistic-regression diabetes classifier and pickle it.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    # Each CSV goes through the shared diabetes prep step before use.
    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = prep_diabetes_dataset(pd.read_csv(train_dataset))
    test_data = prep_diabetes_dataset(pd.read_csv(test_dataset))

    # Separate outcome
    y = data["Outcome"]
    X = data.drop("Outcome", axis=1)
    y_train_df = train_data["Outcome"]
    X_train_df = train_data.drop("Outcome", axis=1)
    y_test_df = test_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)

    # create encoder on entire dataset
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    logit = LogisticRegression(random_state=RANDOM_SEED,
                               solver="lbfgs").fit(X_train, y_train)
    # Score once; a duplicate discarded score() call was removed.
    logit_acc = logit.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(logit, scaler, "LR", logit_acc,
                 "Logistic Regression Classifier", model_binary)
    print(logit_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train an MLP deposit classifier and pickle it with its encoder.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref",
                                        "./data/bank_marketing-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = pd.read_csv(train_dataset)
    test_data = pd.read_csv(test_dataset)

    # Separate outcome
    y = data["deposit"]
    X = data.drop("deposit", axis=1)
    y_train_df = train_data["deposit"]
    X_train_df = train_data.drop("deposit", axis=1)
    y_test_df = test_data["deposit"]
    X_test_df = test_data.drop("deposit", axis=1)

    # create encoder on entire dataset
    scaler = Encoder(X)
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    mlp = MLPClassifier(random_state=RANDOM_SEED)
    mlp.fit(X_train, y_train)
    # Score once; a duplicate discarded score() call was removed.
    mlp_acc = mlp.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(mlp, scaler, "MLP", mlp_acc, "Basic MLP classifier",
                 model_binary)
    print(mlp_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train an MLP diabetes classifier and pickle it with its scaler.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    # Each CSV goes through the shared diabetes prep step before use.
    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = prep_diabetes_dataset(pd.read_csv(train_dataset))
    test_data = prep_diabetes_dataset(pd.read_csv(test_dataset))

    # Separate outcome
    y = data["Outcome"]
    X = data.drop("Outcome", axis=1)
    y_train_df = train_data["Outcome"]
    X_train_df = train_data.drop("Outcome", axis=1)
    y_test_df = test_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)

    # create encoder on entire dataset
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    mlp = MLPClassifier(hidden_layer_sizes=(20, 20),
                        max_iter=1000,
                        random_state=RANDOM_SEED)
    mlp.fit(X_train, y_train)
    # Score once; a duplicate discarded score() call was removed.
    mlp_acc = mlp.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(mlp, scaler, "MLP", mlp_acc, "Basic MLP model", model_binary)
    print(mlp_acc)
    return f"model: {model_binary}"
def train(msg):
    """Fit a logistic-regression credit classifier and serialize it."""
    # Seed both RNGs so repeated runs build the same model.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    csv_uri = msg.payload.get("$ref", "./data/german_credit-decoded.csv")
    save_model_as = msg.payload.get("model_name")

    # Load the full dataset and its pre-built train/test splits.
    frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "outcome"
    y = frame[target]
    X = frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Encoder is fitted on the entire dataset, then applied to each split
    # (the test split only so the accuracy metric can be computed).
    scaler = Encoder()
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Train and score on raw ndarrays, as done throughout this project.
    logit = LogisticRegression(random_state=RANDOM_SEED,
                               solver="lbfgs",
                               max_iter=1000)
    logit.fit(X_train.values, y_train.values)
    logit_acc = logit.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(logit, scaler, "LR", logit_acc,
                 "Logistic Regression Classifier", model_binary)
    print(logit_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a random-forest heart-disease classifier and pickle it.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/heart_disease_multiclass-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = pd.read_csv(train_dataset)
    test_data = pd.read_csv(test_dataset)

    # Separate outcome
    y = data["class_att"]
    X = data.drop("class_att", axis=1)
    y_train_df = train_data["class_att"]
    X_train_df = train_data.drop("class_att", axis=1)
    y_test_df = test_data["class_att"]
    X_test_df = test_data.drop("class_att", axis=1)

    # create encoder on entire dataset
    scaler = CategoricalEncoder(X)
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    # Pass the seed explicitly for reproducibility, consistent with the
    # project's other trainers (previously relied only on the global np seed).
    rf = RandomForestClassifier(random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)
    rf_acc = rf.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(rf, scaler, 'RF', rf_acc, 'Random Forest Classifier',
                 model_binary)
    print(rf_acc)
    return f"model: {model_binary}"
def train(msg):
    """Fit a logistic-regression heart-disease classifier and serialize it."""
    # Deterministic runs: seed the stdlib and NumPy generators.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    csv_uri = msg.payload.get(
        "$ref", "./data/heart_disease_multiclass-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    # Full dataset plus its fixed "-train"/"-test" companion splits.
    frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "class_att"
    y = frame[target]
    X = frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Encoder is fitted on the entire dataset, then applied to each split
    # (the test split only so the accuracy metric can be computed).
    scaler = CategoricalEncoder(X)
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Train and score on raw ndarrays, as done throughout this project.
    logit = LogisticRegression(random_state=RANDOM_SEED, solver='lbfgs')
    logit.fit(X_train.values, y_train.values)
    logit_acc = logit.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(logit, scaler, 'LR', logit_acc,
                 'Logistic Regression Classifier', model_binary)
    print(logit_acc)
    return f"model: {model_binary}"
def train(msg):
    """Fit an SVM credit classifier and serialize it with its encoder."""
    # Seed both RNGs so repeated runs build the same model.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    csv_uri = msg.payload.get("$ref", "./data/german_credit-decoded.csv")
    save_model_as = msg.payload.get("model_name")

    # Load the full dataset and its pre-built train/test splits.
    frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "outcome"
    y = frame[target]
    X = frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Encoder is fitted on the entire dataset, then applied to each split
    # (the test split only so the accuracy metric can be computed).
    scaler = Encoder()
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Train an SVC with probability estimates enabled, on raw ndarrays.
    SVM = svm.SVC(gamma="scale", random_state=RANDOM_SEED, probability=True)
    SVM.fit(X_train.values, y_train.values)
    svm_acc = SVM.score(X_test.values, y_test.values)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(SVM, scaler, "SVM", svm_acc, "Basic SVM model", model_binary)
    print(svm_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train an L2-regularized linear regressor for claim amounts and pickle it.

    Args:
        msg: message whose ``payload`` may carry ``$ref`` (training CSV path)
            and ``model_name`` (basename for the pickled model).

    Returns:
        str: ``"model: <path>"`` pointing at the serialized model binary.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_dataset = training_data_uri.replace(".csv", "-train.csv")
    test_dataset = training_data_uri.replace(".csv", "-test.csv")
    train_data = pd.read_csv(train_dataset)
    test_data = pd.read_csv(test_dataset)

    # Separate outcome
    y = data["Total Claim Amount"]
    X = data.drop("Total Claim Amount", axis=1)
    y_train_df = train_data["Total Claim Amount"]
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_test_df = test_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)

    # create encoder on entire dataset
    scaler = CategoricalEncoder()
    scaler.fit(X)

    # apply encoding to train and test data features
    # applied on test data to calculate accuracy metric
    X_train = scaler.transform(X_train_df)
    y_train = y_train_df
    X_test = scaler.transform(X_test_df)
    y_test = y_test_df

    # start model training
    # NOTE: `normalize=False` was dropped — the parameter was removed from
    # sklearn's Ridge in 1.2, and False was the default, so behavior is
    # unchanged on older versions too.
    lmodel_l2 = Ridge(
        alpha=0.1,
        copy_X=True,
        fit_intercept=True,
        max_iter=1000,
        random_state=RANDOM_SEED,
        tol=0.0001,
    )
    lmodel_l2.fit(X_train, y_train)
    y_pred = lmodel_l2.predict(X_test)

    # Prediction error: R^2 on the held-out test split.
    # (A sum-of-squares error was previously computed here but never used; removed.)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        lmodel_l2,
        scaler,
        "Linear L2",
        err,
        "Linear regression with L2 regularization",
        model_binary,
    )
    print(err)
    return f"model: {model_binary}"
def train(msg):
    """Fit a dense feed-forward regression network and pickle a wrapper for it."""
    # Seed Python, NumPy, and the DL framework for reproducible training.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    set_random_seed(RANDOM_SEED)

    csv_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    save_model_as = msg.payload.get("model_name")

    # Full dataset plus its fixed "-train"/"-test" companion splits.
    frame = pd.read_csv(csv_uri)
    train_frame = pd.read_csv(csv_uri.replace(".csv", "-train.csv"))
    test_frame = pd.read_csv(csv_uri.replace(".csv", "-test.csv"))

    target = "Total Claim Amount"
    y = frame[target]
    X = frame.drop(target, axis=1)
    y_train = train_frame[target]
    y_test = test_frame[target]

    # Encoder is fitted on the full feature set, then applied per split;
    # the test split is transformed only so the model can be scored.
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(train_frame.drop(target, axis=1))
    X_test = scaler.transform(test_frame.drop(target, axis=1))

    # Build the 1000 -> 200 -> 50 -> 1 fully-connected regression head.
    NN = Sequential()
    NN.add(Dense(1000, input_dim=X_train.shape[1], activation="relu"))
    NN.add(Dense(200, activation="relu"))
    NN.add(Dense(50, activation="relu"))
    NN.add(Dense(1))
    NN.summary()

    NN.compile(loss="mse", optimizer="adam", metrics=["mse", "mae"])
    NN.fit(X_train, y_train, epochs=500, batch_size=300, verbose=0)
    err = r2_score(y_test, NN.predict(X_test))

    model_binary = f"models/{save_model_as}.pkl"
    # Wrap the network so it can be pickled alongside the encoder.
    pickle_model(
        NNPredictWrapper(NN),
        scaler,
        "Neural Network",
        err,
        "Four-layer neural network",
        model_binary,
    )
    print(err)
    return f"model: {model_binary}"