def hdbscan_clustering(data,
                       min_cluster_size,
                       min_samples=None,
                       excluded_variables=[],
                       prepare_data=True,
                       random_state=0,
                       max_iter=1000,
                       display=True,
                       dims=3,
                       reduction_algorithm="isomap",
                       evaluate_clusters=False,
                       dim_red_params=None):
    """
    Cluster the data with HDBSCAN and optionally plot / evaluate the result

    :param data: data to cluster; it (or, when ``prepare_data`` is set, the
        result of ``create_training_data``) must expose ``.values``
    :param min_cluster_size: forwarded to ``hdbscan.HDBSCAN``
    :param min_samples: forwarded to ``hdbscan.HDBSCAN``
    :param excluded_variables: forwarded to ``create_training_data``
        (only read when ``prepare_data`` is set)
    :param prepare_data: run ``create_training_data`` on ``data`` first
    :param random_state: accepted for signature parity with the other
        clustering helpers; not used by this function
    :param max_iter: accepted for signature parity; not used
    :param display: call ``plot_clusters`` on the labelled data
    :param dims: forwarded to ``plot_clusters``
    :param reduction_algorithm: forwarded to ``plot_clusters``
    :param evaluate_clusters: pretty-print the ``compare_clusters`` result
    :param dim_red_params: accepted but not used
    :return: the labels produced by ``fit_predict``
    """
    if prepare_data:
        data = create_training_data(data,
                                    excluded_variables=excluded_variables,
                                    test_train_split=False)
    feature_matrix = data.values

    clusterer = hdbscan.HDBSCAN(min_samples=min_samples,
                                min_cluster_size=min_cluster_size)
    labels = clusterer.fit_predict(feature_matrix)
    print(labels)

    if display:
        plot_clusters(feature_matrix,
                      labels,
                      dims=dims,
                      reduction_algorithm=reduction_algorithm)

    if evaluate_clusters:
        per_cluster_frames = extract_clusters(data, labels)
        pprint(compare_clusters(data, per_cluster_frames))

    return labels
def logistic_regression(data=None,
                        num_cols=None,
                        cat_cols=None,
                        target=None,
                        train_data=None,
                        train_labels=None):
    """
    Train a classifier using elastic-net regularised logistic regression

    Parameters
    ----------
    data :
        raw data, forwarded to ``create_training_data`` when no prepared
        training set is supplied
    num_cols :
        forwarded to ``create_training_data``
    cat_cols :
        forwarded to ``create_training_data``
    target :
        forwarded to ``create_training_data``
    train_data :
        prepared training features; used directly when ``train_labels`` is
        given as well
    train_labels :
        prepared training labels

    Returns
    -------
    the fitted ``LogisticRegression`` estimator
    """
    if train_data is None or train_labels is None:
        # No complete prepared training set was passed in: build one.
        x_train, x_test, y_train, y_test = create_training_data(
            data, num_cols, cat_cols, target)
        train_data, train_labels = x_train, y_train

    # The elastic-net penalty is only implemented by the "saga" solver and
    # needs an explicit l1_ratio; without both, fit() raises.  The values
    # mirror the defaults used by linear_model() in this module.
    clf = LogisticRegression(penalty="elasticnet",
                             solver="saga",
                             l1_ratio=0.2,
                             max_iter=1000)
    clf.fit(train_data, train_labels)
    # TODO display results
    return clf
def k_means_cluster(data,
                    n_clusters,
                    excluded_variables=[],
                    prepare_data=True,
                    random_state=0,
                    max_iter=1000,
                    display=True,
                    dims=3,
                    reduction_algorithm="isomap",
                    evaluate_clusters=False,
                    dim_red_params=None):
    """
    Run k-means on the data and optionally plot / evaluate the clusters

    :param data: data to cluster; it (or, when ``prepare_data`` is set, the
        result of ``create_training_data``) must expose ``.values``
    :param n_clusters: forwarded to ``KMeans``
    :param excluded_variables: forwarded to ``create_training_data``
    :param prepare_data: run ``create_training_data`` on ``data`` first
    :param random_state: forwarded to ``KMeans``
    :param max_iter: forwarded to ``KMeans``
    :param display: call ``plot_clusters`` on the labelled data
    :param dims: forwarded to ``plot_clusters``
    :param reduction_algorithm: forwarded to ``plot_clusters``
    :param evaluate_clusters: pretty-print the ``compare_clusters`` result
    :param dim_red_params: accepted but not used
    :return: the labels predicted for every row of the input
    """
    if prepare_data:
        data = create_training_data(data,
                                    excluded_variables=excluded_variables,
                                    test_train_split=False)
    feature_matrix = data.values

    # TODO give more configuration options
    model = KMeans(n_clusters=n_clusters,
                   random_state=random_state,
                   max_iter=max_iter)
    model.fit(feature_matrix)
    labels = model.predict(feature_matrix)

    if display:
        plot_clusters(feature_matrix,
                      labels,
                      dims=dims,
                      reduction_algorithm=reduction_algorithm)

    if evaluate_clusters:
        per_cluster_frames = extract_clusters(data, labels)
        pprint(compare_clusters(data, per_cluster_frames))

    return labels
def random_forest_regressor(data=None,
                            num_cols=None,
                            cat_cols=None,
                            target=None,
                            train_data=None,
                            train_labels=None):
    """
    Train a random forest regressor

    When both ``train_data`` and ``train_labels`` are given the model is
    only fitted and returned.  Otherwise the data is split with
    ``create_training_data``, the model is fitted, its test score is printed
    and the feature importances are plotted.

    :param data: raw data, forwarded to ``create_training_data``
    :param num_cols: forwarded to ``create_training_data``
    :param cat_cols: forwarded to ``create_training_data``
    :param target: forwarded to ``create_training_data``
    :param train_data: prepared training features
    :param train_labels: prepared training labels
    :return: the fitted ``RandomForestRegressor``
    """
    clf = RandomForestRegressor(random_state=0)

    if train_data is not None and train_labels is not None:
        clf.fit(train_data, train_labels)
        return clf

    x_train, x_test, y_train, y_test = create_training_data(
        data, num_cols, cat_cols, target)
    y_train = y_train.astype(float)
    y_test = y_test.astype(float)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.score(x_test, y_test))

    # TODO make this more general
    if len(y_train.unique()) == 2:
        # A regressor has no predict_proba; its continuous predictions are
        # used as the ranking score.  (The previous code referenced an
        # undefined name here and raised NameError.)
        create_roc_auc_plot(y_test.values, y_pred)
    # TODO plot regression result for non-binary targets

    plot_feature_importances(x_train, clf.feature_importances_)
    return clf
def linear_regression(data, num_cols, cat_cols, target):
    """
    Fit a basic linear regression model to the data

    Prints the sizes of the train and test split, the score on the test
    split and the fitted coefficients.

    :param data: raw data, forwarded to ``create_training_data``
    :param num_cols: forwarded to ``create_training_data``
    :param cat_cols: forwarded to ``create_training_data``
    :param target: forwarded to ``create_training_data``
    :return: the fitted ``LinearRegression`` estimator
    """
    x_train, x_test, y_train, y_test = create_training_data(
        data, num_cols, cat_cols, target)
    print(len(x_train), len(x_test))

    regr = LinearRegression()
    regr.fit(x_train, y_train)
    print(regr.score(x_test, y_test))
    print(regr.coef_)
    # Return the model like every other trainer in this module does, so the
    # caller can actually use what was fitted.
    return regr
def elastic_net(data=None,
                num_cols=None,
                cat_cols=None,
                target=None,
                train_data=None,
                train_labels=None):
    """
    Fit an elastic net regression model

    With ``train_data`` and ``train_labels`` both given, the model is fitted
    on them and returned straight away.  Otherwise the data is split with
    ``create_training_data``, the test score is printed and a SHAP kernel
    explanation of the training set is computed, printed and plotted.

    :param data: raw data, forwarded to ``create_training_data``
    :param num_cols: forwarded to ``create_training_data``
    :param cat_cols: forwarded to ``create_training_data``
    :param target: forwarded to ``create_training_data``
    :param train_data: prepared training features
    :param train_labels: prepared training labels
    :return: the fitted ``ElasticNet`` estimator
    """
    prepared = train_data is not None and train_labels is not None
    if prepared:
        model = ElasticNet()
        model.fit(train_data, train_labels)
        return model

    x_train, x_test, y_train, y_test = create_training_data(
        data, num_cols, cat_cols, target)
    y_train, y_test = y_train.astype(float), y_test.astype(float)

    model = ElasticNet()
    model.fit(x_train, y_train)
    model.predict(x_test)  # predictions are not used further yet
    print(model.score(x_test, y_test))

    # Model-agnostic explanation, with the training set as background data.
    kernel_explainer = shap.KernelExplainer(model.predict, x_train)
    contributions = kernel_explainer.shap_values(x_train, nsamples=100)
    print(contributions)
    plt.tight_layout()

    baseline = kernel_explainer.expected_value
    print(baseline)
    # Force plot for the first training row only.
    shap.force_plot(kernel_explainer.expected_value, contributions[0],
                    x_train.iloc[0, :])
    plt.show()
    return model
def random_forest_classifier(data=None,
                             num_cols=None,
                             cat_cols=None,
                             target=None,
                             train_data=None,
                             train_labels=None):
    """
    Train a random forest classifier

    When both ``train_data`` and ``train_labels`` are given the classifier
    is only fitted and returned.  Otherwise the data is split with
    ``create_training_data``, the labels are cast to ``str``, the model is
    fitted, its test score and per-row confidence are printed, a ROC plot is
    created for two-class targets and the feature importances are plotted.

    :param data: raw data, forwarded to ``create_training_data``
    :param num_cols: forwarded to ``create_training_data``
    :param cat_cols: forwarded to ``create_training_data``
    :param target: forwarded to ``create_training_data``
    :param train_data: prepared training features
    :param train_labels: prepared training labels
    :return: the fitted ``RandomForestClassifier``
    """
    clf = RandomForestClassifier(random_state=0)

    # Only train the classifier
    if train_data is not None and train_labels is not None:
        clf.fit(train_data, train_labels)
        return clf

    x_train, x_test, y_train, y_test = create_training_data(
        data, num_cols, cat_cols, target)
    y_train = y_train.astype("str")
    y_test = y_test.astype("str")
    print(x_train)
    print(list(y_train))

    clf.fit(x_train, y_train)
    clf.predict(x_test)  # predictions are not used further yet
    print(clf.score(x_test, y_test))

    class_probs = clf.predict_proba(x_test)
    # Confidence of the predicted class for every test row.
    probs = np.max(class_probs, axis=1)
    print(probs)

    # TODO make this more general
    if len(y_train.unique()) == 2:
        # A ROC curve needs a score that ranks rows by how likely they are to
        # belong to ONE fixed class.  The per-row maximum mixes both classes,
        # so the probability of clf.classes_[1] is used instead.
        create_roc_auc_plot(y_test.values, class_probs[:, 1])
    # TODO plot confusion matrix for multi-class targets

    plot_feature_importances(x_train, clf.feature_importances_)
    return clf
def svm_regression(data=None,
                   num_cols=None,
                   cat_cols=None,
                   target=None,
                   train_data=None,
                   train_labels=None):
    """
    Train a support vector regressor

    With ``train_data`` and ``train_labels`` both given, the model is fitted
    on them and returned.  Otherwise the data is split with
    ``create_training_data`` (``na_strategy="fill"``), the targets are cast
    to float, the model is fitted and its test score is printed.

    :param data: raw data, forwarded to ``create_training_data``
    :param num_cols: forwarded to ``create_training_data``
    :param cat_cols: forwarded to ``create_training_data``
    :param target: forwarded to ``create_training_data``
    :param train_data: prepared training features
    :param train_labels: prepared training labels
    :return: the fitted ``SVR`` estimator
    """
    prepared = train_data is not None and train_labels is not None
    if prepared:
        regressor = SVR()
        regressor.fit(train_data, train_labels)
        return regressor

    x_train, x_test, y_train, y_test = create_training_data(
        data, num_cols, cat_cols, target, na_strategy="fill")
    y_train, y_test = y_train.astype(float), y_test.astype(float)

    regressor = SVR()
    regressor.fit(x_train, y_train)
    regressor.predict(x_test)  # predictions are not used further yet
    print(regressor.score(x_test, y_test))
    return regressor
def svm_classifier(data=None,
                   num_cols=None,
                   cat_cols=None,
                   target=None,
                   train_data=None,
                   train_labels=None):
    """
    Train a support vector classifier

    With ``train_data`` and ``train_labels`` both given, a default ``SVC`` is
    fitted on them and returned.  Otherwise the data is split with
    ``create_training_data`` (``na_strategy="fill"``), the labels are cast to
    ``str``, an ``SVC(gamma="auto")`` is fitted and the inputs, predictions
    and test score are printed.

    :param data: raw training data, forwarded to ``create_training_data``
    :param num_cols: forwarded to ``create_training_data``
    :param cat_cols: forwarded to ``create_training_data``
    :param target: target column, forwarded to ``create_training_data``
    :param train_data: prepared training features
    :param train_labels: prepared training labels
    :return: the fitted ``SVC`` estimator
    """
    prepared = train_data is not None and train_labels is not None
    if prepared:
        # NOTE(review): this path uses the default gamma while the path
        # below uses gamma="auto" - confirm the difference is intended.
        classifier = SVC()
        classifier.fit(train_data, train_labels)
        return classifier

    x_train, x_test, y_train, y_test = create_training_data(
        data, num_cols, cat_cols, target, na_strategy="fill")
    # TODO multiclass classification
    y_train, y_test = y_train.astype("str"), y_test.astype("str")
    print(x_train)
    print(list(y_train))

    classifier = SVC(gamma="auto")
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    print(predicted)
    print(classifier.score(x_test, y_test))
    return classifier
def vbgmm_cluster(data,
                  n_clusters,
                  covariance_type="full",
                  weight_concentration_prior=None,
                  excluded_variables=[],
                  prepare_data=True,
                  random_state=0,
                  max_iter=1000,
                  display=True,
                  dims=3,
                  reduction_algorithm="isomap",
                  evaluate_clusters=False,
                  dim_red_params=None):
    """
    Perform clustering using a variational bayesian gaussian mixture

    :param data: data to cluster; it (or, when ``prepare_data`` is set, the
        result of ``create_training_data``) must expose ``.values``
    :param n_clusters: number of mixture components (``n_components``)
    :param covariance_type: forwarded to ``BayesianGaussianMixture``
    :param weight_concentration_prior: forwarded to
        ``BayesianGaussianMixture``
    :param excluded_variables: forwarded to ``create_training_data``
    :param prepare_data: run ``create_training_data`` on ``data`` first
    :param random_state: seed forwarded to ``BayesianGaussianMixture`` so
        repeated runs give the same clustering
    :param max_iter: forwarded to ``BayesianGaussianMixture``
    :param display: call ``plot_clusters`` on the labelled data
    :param dims: forwarded to ``plot_clusters``
    :param reduction_algorithm: forwarded to ``plot_clusters``
    :param evaluate_clusters: pretty-print the ``compare_clusters`` result
    :param dim_red_params: accepted but not used
    :return: the component label predicted for every row of the input
    """
    if prepare_data:
        data = create_training_data(data,
                                    excluded_variables=excluded_variables,
                                    test_train_split=False)
    numpy_data = data.values

    # TODO give more configuration options
    # random_state used to be silently dropped here, which made the result
    # differ between runs even though the caller supplied a seed.
    bgmm = BayesianGaussianMixture(
        n_components=n_clusters,
        covariance_type=covariance_type,
        weight_concentration_prior=weight_concentration_prior,
        max_iter=max_iter,
        random_state=random_state).fit(numpy_data)

    # Get the number/names of the cluster
    labels = bgmm.predict(numpy_data)

    if display:
        plot_clusters(numpy_data,
                      labels,
                      dims=dims,
                      reduction_algorithm=reduction_algorithm)

    if evaluate_clusters:
        cluster_dfs = extract_clusters(data, labels)
        comparison_result = compare_clusters(data, cluster_dfs)
        pprint(comparison_result)

    return labels
def gmm_cluster(data,
                n_clusters,
                covariance_type="full",
                excluded_variables=[],
                prepare_data=True,
                random_state=0,
                max_iter=1000,
                display=True,
                dims=3,
                reduction_algorithm="isomap",
                evaluate_clusters=False,
                dim_red_params=None):
    """
    Attempt clustering using a Gaussian Mixture Model

    :param data: data to cluster; it (or, when ``prepare_data`` is set, the
        result of ``create_training_data``) must expose ``.values``
    :param n_clusters: number of mixture components (``n_components``)
    :param covariance_type: forwarded to ``GaussianMixture``
    :param excluded_variables: forwarded to ``create_training_data``
    :param prepare_data: run ``create_training_data`` on ``data`` first
    :param random_state: seed forwarded to ``GaussianMixture`` so repeated
        runs give the same clustering
    :param max_iter: forwarded to ``GaussianMixture``
    :param display: call ``plot_clusters`` on the labelled data
    :param dims: forwarded to ``plot_clusters``
    :param reduction_algorithm: forwarded to ``plot_clusters``
    :param evaluate_clusters: pretty-print the ``compare_clusters`` result
    :param dim_red_params: accepted but not used
    :return: the component label predicted for every row of the input
    """
    if prepare_data:
        data = create_training_data(data,
                                    excluded_variables=excluded_variables,
                                    test_train_split=False)
    numpy_data = data.values

    # random_state used to be silently dropped here, which made the result
    # differ between runs even though the caller supplied a seed.
    gmm = GaussianMixture(n_components=n_clusters,
                          covariance_type=covariance_type,
                          max_iter=max_iter,
                          random_state=random_state).fit(numpy_data)

    # Get the number/names of the cluster
    labels = gmm.predict(numpy_data)

    # Create a figure with the data grouped by associated cluster
    if display:
        plot_clusters(numpy_data,
                      labels,
                      dims=dims,
                      reduction_algorithm=reduction_algorithm)

    if evaluate_clusters:
        cluster_dfs = extract_clusters(data, labels)
        comparison_result = compare_clusters(data, cluster_dfs)
        pprint(comparison_result)

    return labels
def svm(data,
        target,
        excluded_variables=[],
        prediction_type=None,
        kernel='rbf',
        C=1.0,
        degree=3,
        cv=True,
        cv_params=None,
        display=True,
        shap=True,
        prepare_data=True):
    """
    Train a support vector machine for classification or regression

    :param data: raw data, or ``[x_train, x_test]`` when ``prepare_data`` is
        False
    :param target: target passed to ``create_training_data``, or
        ``[y_train, y_test]`` when ``prepare_data`` is False
    :param excluded_variables: forwarded to ``create_training_data``
    :param prediction_type: model subtype; when falsy it is determined with
        ``detect_prediction_type``.  "binary" and "multi-class" select an
        ``SVC``, anything else an ``SVR``
    :param kernel: forwarded to the estimator
    :param C: forwarded to the estimator
    :param degree: forwarded to the estimator
    :param cv: tune hyper parameters with ``cross_validation_tuning``
        instead of a plain ``fit``
    :param cv_params: parameter grid for the tuning; a default grid over
        ``C``, ``kernel`` and ``gamma`` is used when falsy
    :param display: show model performance and feature importances;
        otherwise only the test score is printed
    :param shap: return ``(model, shap_values)`` instead of only the model
    :param prepare_data: split ``data`` with ``create_training_data``
    :return: the trained model, or ``(model, shap_values)`` when ``shap`` is
        truthy; ``shap_values`` is ``None`` when ``display`` is False
        because the values are only computed while displaying
    """
    if prediction_type:
        model_subtype = prediction_type
    else:
        model_subtype = detect_prediction_type(data, target)

    if prepare_data:
        x_train, x_test, y_train, y_test = create_training_data(
            data, target, excluded_variables)
    else:
        x_train, x_test = data[0], data[1]
        y_train, y_test = target[0], target[1]

    print(f"Creating a svm {model_subtype} model")
    if model_subtype in ["binary", "multi-class"]:
        # probability=True is required for predict_proba further down.
        pred = SVC(kernel=kernel, C=C, degree=degree, probability=True)
        y_train = y_train.astype("str")
        y_test = y_test.astype("str")
    else:
        pred = SVR(kernel=kernel, C=C, degree=degree)
        y_train = y_train.astype("float")
        y_test = y_test.astype("float")

    if cv:
        # Perform cross validation hyper parameter tuning
        if not cv_params:
            cv_params = {
                "C": [1, 10, 100],
                "kernel": ["linear", "poly", "rbf", "sigmoid"],
                "gamma": ["auto", "scale"]
            }
        pred, _, param_results = cross_validation_tuning(
            pred, cv_params, x_train, y_train)
        print(param_results)
    else:
        pred.fit(x_train, y_train)

    # Pre-bind so that display=False together with shap=True no longer
    # fails with an unbound local at the return statement.
    shap_values = None
    if display:
        display_model_performance(pred, model_subtype, x_test, y_test,
                                  target)
        # Pass a prediction callable in both cases, as linear_model() and
        # multi_model_predictor() do; the regression path used to pass the
        # estimator object itself.
        if model_subtype != "regression":
            explain_fn = pred.predict_proba
        else:
            explain_fn = pred.predict
        shap_values = display_feature_importances(explain_fn,
                                                  x_train,
                                                  x_test,
                                                  return_shap=shap)
    else:
        print(f"Score: {pred.score(x_test, y_test)}")
        # TODO print additional information

    if shap:
        return pred, shap_values
    return pred
def linear_model(data,
                 target,
                 excluded_variables=[],
                 prediction_type=None,
                 l1_ratio=0.2,
                 max_iter=1000,
                 cv=True,
                 cv_params=None,
                 display=True,
                 shap=True,
                 prepare_data=True):
    """
    Train an elastic-net regularised linear model

    :param data: raw data, or ``[x_train, x_test]`` when ``prepare_data`` is
        False
    :param target: target passed to ``create_training_data``, or
        ``[y_train, y_test]`` when ``prepare_data`` is False
    :param excluded_variables: forwarded to ``create_training_data``
    :param prediction_type: model subtype; when falsy it is determined with
        ``detect_prediction_type``.  "binary" and "multi-class" select a
        ``LogisticRegression``, anything else an ``ElasticNet``
    :param l1_ratio: forwarded to the estimator
    :param max_iter: forwarded to the estimator
    :param cv: tune hyper parameters with ``cross_validation_tuning``
        instead of a plain ``fit``
    :param cv_params: parameter grid for the tuning; a default grid over
        ``l1_ratio`` and ``max_iter`` is used when falsy
    :param display: show model performance and feature importances;
        otherwise only the test score is printed
    :param shap: return ``(model, shap_values)`` instead of only the model
    :param prepare_data: split ``data`` with ``create_training_data``
    :return: the trained model, or ``(model, shap_values)`` when ``shap`` is
        truthy; ``shap_values`` is ``None`` when ``display`` is False
        because the values are only computed while displaying
    """
    if prediction_type:
        model_subtype = prediction_type
    else:
        model_subtype = detect_prediction_type(data, target)

    if prepare_data:
        x_train, x_test, y_train, y_test = create_training_data(
            data, target, excluded_variables)
    else:
        x_train, x_test = data[0], data[1]
        y_train, y_test = target[0], target[1]

    print(f"Creating a linear {model_subtype} model")
    if model_subtype in ["binary", "multi-class"]:
        # "saga" is the solver that implements the elastic-net penalty.
        pred = LogisticRegression(penalty="elasticnet",
                                  l1_ratio=l1_ratio,
                                  max_iter=max_iter,
                                  solver="saga")
        y_train = y_train.astype("str")
        y_test = y_test.astype("str")
    else:
        pred = ElasticNet(l1_ratio=l1_ratio, max_iter=max_iter)
        y_train = y_train.astype("float")
        y_test = y_test.astype("float")

    if cv:
        # Perform cross validation hyper parameter tuning
        if not cv_params:
            cv_params = {
                "l1_ratio": [0, 0.2, 0.5, 0.75, 1],
                "max_iter": [100, 1000, 10000]
            }
        pred, _, param_results = cross_validation_tuning(
            pred, cv_params, x_train, y_train)
        print(param_results)
    else:
        pred.fit(x_train, y_train)

    # Pre-bind so that display=False together with shap=True no longer
    # fails with an unbound local at the return statement.
    shap_values = None
    if display:
        display_model_performance(pred, model_subtype, x_test, y_test,
                                  target)
        if model_subtype != "regression":
            explain_fn = pred.predict_proba
        else:
            explain_fn = pred.predict
        shap_values = display_feature_importances(explain_fn,
                                                  x_train,
                                                  x_test,
                                                  return_shap=shap)
    else:
        print(f"Score: {pred.score(x_test, y_test)}")
        # TODO print additional information

    if shap:
        return pred, shap_values
    return pred
def random_forest(data,
                  target,
                  excluded_variables=[],
                  prediction_type=None,
                  n_estimators=100,
                  criterion=None,
                  max_depth=None,
                  max_features=None,
                  min_samples_leaf=1,
                  cv=True,
                  cv_params=None,
                  display=True,
                  shap=True,
                  prepare_data=True):
    """
    Train a random forest for classification or regression

    :param data: raw data, or ``[x_train, x_test]`` when ``prepare_data`` is
        False
    :param target: target passed to ``create_training_data``, or
        ``[y_train, y_test]`` when ``prepare_data`` is False
    :param excluded_variables: forwarded to ``create_training_data``
    :param prediction_type: model subtype; when falsy it is determined with
        ``detect_prediction_type``.  "binary" and "multi-class" select a
        ``RandomForestClassifier``, anything else a
        ``RandomForestRegressor``
    :param n_estimators: forwarded to the estimator
    :param criterion: forwarded to the estimator only when truthy, so the
        estimator's own default applies otherwise
    :param max_depth: forwarded to the estimator
    :param max_features: forwarded to the estimator
    :param min_samples_leaf: forwarded to the estimator
    :param cv: tune hyper parameters with ``cross_validation_tuning``
        instead of a plain ``fit``
    :param cv_params: parameter grid for the tuning; a default grid is used
        when falsy
    :param display: show model performance and feature importances;
        otherwise only the test score is printed
    :param shap: return ``(model, shap_values)`` instead of only the model
    :param prepare_data: split ``data`` with ``create_training_data``
    :return: the trained model, or ``(model, shap_values)`` when ``shap`` is
        truthy; ``shap_values`` is ``None`` when ``display`` is False
        because the values are only computed while displaying
    """
    if prediction_type:
        model_subtype = prediction_type
    else:
        model_subtype = detect_prediction_type(data, target)

    if prepare_data:
        x_train, x_test, y_train, y_test = create_training_data(
            data, target, excluded_variables)
    else:
        x_train, x_test = data[0], data[1]
        y_train, y_test = target[0], target[1]

    print(f"Creating a random forest {model_subtype} model")

    # Shared constructor arguments; criterion is only passed when set
    # because its valid values differ between classifier and regressor.
    forest_kwargs = {
        "random_state": 0,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
    }
    if criterion:
        forest_kwargs["criterion"] = criterion

    if model_subtype in ["binary", "multi-class"]:
        pred = RandomForestClassifier(**forest_kwargs)
        # TODO check if this is really necessary
        y_train = y_train.astype("str")
        y_test = y_test.astype("str")
    else:
        pred = RandomForestRegressor(**forest_kwargs)
        y_train = y_train.astype("float")
        y_test = y_test.astype("float")

    if cv:
        if not cv_params:
            cv_params = {
                "n_estimators": [10, 100, 500],
                "max_depth": [None, 6, 8],
                # "auto" was deprecated and later removed from
                # scikit-learn; "sqrt" is accepted by old and new releases.
                "max_features": [None, "sqrt", "log2"],
                "min_samples_leaf": [1, 5, 10]
            }
        pred, _, param_results = cross_validation_tuning(
            pred, cv_params, x_train, y_train)
        print(param_results)
    else:
        pred.fit(x_train, y_train)

    # Pre-bind so that display=False together with shap=True no longer
    # fails with an unbound local at the return statement.
    shap_values = None
    if display:
        display_model_performance(pred, model_subtype, x_test, y_test,
                                  target)
        shap_values = display_feature_importances(pred,
                                                  x_train,
                                                  x_test,
                                                  model_type="tree",
                                                  return_shap=shap)
    else:
        print(f"Score: {pred.score(x_test, y_test)}")
        # TODO print additional information

    if shap:
        return pred, shap_values
    return pred
def multi_model_predictor(data,
                          target,
                          excluded_variables=[],
                          prediction_type=None,
                          linear_model_params=None,
                          svm_params=None,
                          random_forest_params=None,
                          gradient_boosting_params=None,
                          cv=True,
                          display=True,
                          shap=True,
                          prepare_data=True,
                          all_models=True):
    """
    Train several model families and return the best scoring one

    A linear model, an SVM, a random forest and gradient boosted trees are
    trained, their test scores are printed and the model with the highest
    score is selected.

    :param data: raw data, or ``[x_train, x_test]`` when ``prepare_data`` is
        False.  It is handed to ``gradient_boosted_trees`` unchanged
    :param target: target passed to ``create_training_data``, or
        ``[y_train, y_test]`` when ``prepare_data`` is False
    :param excluded_variables: forwarded to ``create_training_data``
    :param prediction_type: model subtype; when falsy it is determined with
        ``detect_prediction_type``
    :param linear_model_params: extra keyword arguments for ``linear_model``
    :param svm_params: extra keyword arguments for ``svm``
    :param random_forest_params: extra keyword arguments for
        ``random_forest``
    :param gradient_boosting_params: extra keyword arguments for
        ``gradient_boosted_trees``
    :param cv: forwarded to every model trainer
    :param display: show performance and feature importances of the best
        model
    :param shap: additionally return the shap values of the best model
    :param prepare_data: split ``data`` with ``create_training_data``
    :param all_models: accepted but not used
    :return: ``(top_pred, models, shap_values)`` when ``shap`` is truthy,
        else ``(top_pred, models)``.  ``models`` is a list of
        ``(name, score, model)`` tuples; ``shap_values`` is ``None`` when
        ``display`` is False because the values are only computed while
        displaying
    """
    if prediction_type:
        model_subtype = prediction_type
    else:
        model_subtype = detect_prediction_type(data, target)

    if prepare_data:
        x_train, x_test, y_train, y_test, train_ind, test_ind = \
            create_training_data(data,
                                 target,
                                 excluded_variables,
                                 test_train_indices=True)
    else:
        x_train, x_test = data[0], data[1]
        y_train, y_test = target[0], target[1]

    # Extract data for catboost pool
    # TODO

    # create the models
    print("Training models")
    features = [x_train, x_test]
    labels = [y_train, y_test]
    # The sub-models run silently on the already prepared split; only the
    # winner is displayed below.
    shared = {
        "prediction_type": model_subtype,
        "cv": cv,
        "display": False,
        "shap": False,
    }

    lin_m = linear_model(features, labels, prepare_data=False, **shared,
                         **(linear_model_params or {}))
    svm_m = svm(features, labels, prepare_data=False, **shared,
                **(svm_params or {}))
    rf_m = random_forest(features, labels, prepare_data=False, **shared,
                         **(random_forest_params or {}))
    gb_m, gb_m_score = gradient_boosted_trees(
        data=data, target=target, **shared,
        **(gradient_boosting_params or {}), score=True)

    # Display scores
    lm_score = lin_m.score(x_test, y_test)
    print(f"Linear model score: {lm_score}")
    rf_score = rf_m.score(x_test, y_test)
    print(f"Random forest model score: {rf_score}")
    svm_score = svm_m.score(x_test, y_test)
    print(f"SVM model score: {svm_score}")
    print(f"Catboost model score: {gb_m_score}")

    models = [("linear_model", lm_score, lin_m),
              ("random forest", rf_score, rf_m),
              ("svm", svm_score, svm_m),
              ("catboost", gb_m_score, gb_m)]
    # A stable sort followed by [-1] keeps the original tie-breaking: among
    # equal scores the entry listed last wins.
    top_model = sorted(models, key=lambda x: x[1])[-1]
    print(top_model)
    top_pred = top_model[2]

    # Pre-bind so that display=False together with shap=True no longer
    # fails with an unbound local at the return statement.
    shap_values = None
    if display:
        display_model_performance(top_pred, model_subtype, x_test, y_test,
                                  target)
        if top_model[0] in {"linear_model", "svm"}:
            # These models are explained through a prediction callable.
            if model_subtype == "regression":
                explain_fn = top_pred.predict
            else:
                explain_fn = top_pred.predict_proba
            shap_values = display_feature_importances(explain_fn,
                                                      x_train,
                                                      x_test,
                                                      return_shap=shap)
        else:
            shap_values = display_feature_importances(top_pred,
                                                      x_train,
                                                      x_test,
                                                      model_type="tree",
                                                      return_shap=shap)

    if shap:
        return top_pred, models, shap_values
    return top_pred, models
def train_models(self, tuning=False, verbose=True, test_set=True):
    """
    Train all available models that fit the selected target and display
    the results

    Parameters
    ----------
    tuning : bool
        accepted but currently not used
    verbose : bool
        print a progress line and the test score for every model
    test_set : bool
        split the data into a training and a testing set.  Only ``True`` is
        supported: the models are scored and compared on the test split

    Returns
    -------
    None
        the predictions are handed to ``self.display_results``

    Raises
    ------
    NotImplementedError
        if ``test_set`` is falsy
    """
    if not test_set:
        # This path used to fail later on with an unbound-local error
        # because no split was ever created; fail early and explicitly.
        raise NotImplementedError(
            "train_models requires test_set=True: the models are evaluated "
            "on the held-out test split")

    x_train, x_test, y_train, y_test = create_training_data(
        self.data, self.num_cols, self.cat_cols, self.target)

    # TODO add gradient boosting models
    # Each entry: (name in the results, progress message, score label,
    # training function).
    if self.mode == "regression":
        trainers = [
            ("elastic_net", "Training ElasticNet model",
             "Elasticnet result", elastic_net),
            ("svm", "Training SVM model",
             "SVM model result", svm_regression),
            ("random forest", "Training Random Forest Regressor",
             "Random Forest Results", random_forest_regressor),
        ]
    elif self.mode in ("binary", "multi-class"):
        # Both classification modes train the same three classifiers; only
        # the first progress message differs between them.
        if self.mode == "binary":
            logreg_message = "Training Logistic regression classifier"
        else:
            logreg_message = "Training logistic regression"
        trainers = [
            # The score label used to read "SVM classifier result" here.
            ("logistic regression", logreg_message,
             "Logistic regression result", logistic_regression),
            ("svm", "Training SVM classifier",
             "SVM classifier result", svm_classifier),
            ("random forest", "Training RF Classifier",
             "RF classifier score", random_forest_classifier),
        ]
    else:
        trainers = []

    fitted = []
    for name, start_message, score_label, trainer in trainers:
        if verbose:
            print(start_message)
        model = trainer(train_data=x_train, train_labels=y_train)
        if verbose:
            print(f"{score_label}: {model.score(x_test, y_test)}")
        fitted.append((name, model))

    predictions = [(name, model.predict(x_test)) for name, model in fitted]
    self.display_results(predictions, y_test, self.mode)