Пример #1
0
    def test_vDF_score(self, base, titanic_vd):
        """
        Fit a logistic regression on public.titanic, store its predictions
        in 'survived_pred' and check the auc, mse, roc and prc results
        returned by titanic_vd.score.
        """
        from verticapy.learn.linear_model import LogisticRegression

        settings = {
            "name": "public.LR_titanic",
            "cursor": base.cursor,
            "tol": 1e-4,
            "C": 1.0,
            "max_iter": 100,
            "solver": "CGD",
            "l1_ratio": 0.5,
        }
        model = LogisticRegression(**settings)

        def score(method):
            # Every check below compares the same two columns.
            return titanic_vd.score(
                y_true="survived", y_score="survived_pred", method=method
            )

        model.drop()  # drop the model in case it already exists
        model.fit("public.titanic", ["fare", "age"], "survived")
        model.predict(titanic_vd, name="survived_pred")

        approx = pytest.approx

        # Scalar metrics: AUC, then MSE
        assert score("auc") == approx(0.697476274)
        assert score("mse") == approx(0.224993557)

        # ROC curve: (row, threshold, false_positive, true_positive)
        roc_res = score("roc")
        roc_expected = (
            (3, 0.003, 1.0, 1.0),
            (300, 0.3, approx(0.9900826446), approx(0.9974424552)),
            (900, 0.9, approx(0.01818181818), approx(0.06649616368)),
        )
        for row, threshold, fpr, tpr in roc_expected:
            assert roc_res["threshold"][row] == threshold
            assert roc_res["false_positive"][row] == fpr
            assert roc_res["true_positive"][row] == tpr

        # PRC curve: (row, threshold, recall, precision)
        prc_res = score("prc")
        prc_expected = (
            (3, 0.002, 1.0, approx(0.3925702811)),
            (300, 0.299, approx(1.0), approx(0.3949494949)),
            (900, 0.899, approx(0.06649616368), approx(0.7027027027)),
        )
        for row, threshold, recall, precision in prc_expected:
            assert prc_res["threshold"][row] == threshold
            assert prc_res["recall"][row] == recall
            assert prc_res["precision"][row] == precision

        # remove the model created by this test
        model.drop()
    def test_vDF_score(self, base, titanic_vd):
        """
        Fit an ENet-penalised logistic regression on public.titanic, store
        its predictions in 'survived_pred' and check the auc, mse, roc and
        prc results returned by titanic_vd.score (curves use nbins=1000).
        """
        from verticapy.learn.linear_model import LogisticRegression

        settings = {
            "name": "public.LR_titanic",
            "cursor": base.cursor,
            "tol": 1e-4,
            "C": 1.0,
            "max_iter": 100,
            "solver": "CGD",
            "penalty": "ENet",
            "l1_ratio": 0.5,
        }
        model = LogisticRegression(**settings)

        def score(method, **extra):
            # Every check below compares the same two columns.
            return titanic_vd.score(
                y_true="survived", y_score="survived_pred", method=method, **extra
            )

        model.drop()  # drop the model in case it already exists
        model.fit("public.titanic", ["fare", "age"], "survived")
        model.predict(titanic_vd, name="survived_pred")

        approx = pytest.approx

        # Scalar metrics: AUC, then MSE
        assert score("auc") == approx(0.7051784997146537)
        assert score("mse") == approx(0.228082579110535)

        # ROC curve: (row, threshold, false_positive, true_positive)
        roc_res = score("roc", nbins=1000)
        roc_expected = (
            (3, 0.003, 1.0, 1.0),
            (300, 0.3, approx(1.0), approx(1.0)),
            (900, 0.9, approx(0.0148760330578512), approx(0.061381074168798)),
        )
        for row, threshold, fpr, tpr in roc_expected:
            assert roc_res["threshold"][row] == threshold
            assert roc_res["false_positive"][row] == fpr
            assert roc_res["true_positive"][row] == tpr

        # PRC curve: (row, threshold, recall, precision)
        prc_res = score("prc", nbins=1000)
        prc_expected = (
            (3, 0.002, 1.0, approx(0.3925702811)),
            (300, 0.299, approx(1.0), approx(0.392570281124498)),
            (900, 0.899, approx(0.061381074168798), approx(0.727272727272727)),
        )
        for row, threshold, recall, precision in prc_expected:
            assert prc_res["threshold"][row] == threshold
            assert prc_res["recall"][row] == recall
            assert prc_res["precision"][row] == precision

        # remove the model created by this test
        model.drop()
Пример #3
0
 def test_contour(self, titanic_vd):
     """Fit 'model_contour' on age/fare and check the artist count of its contour plot."""
     contour_model = LogisticRegression("model_contour",)
     contour_model.drop()  # start from a clean state
     contour_model.fit(titanic_vd, ["age", "fare"], "survived")
     axes = contour_model.contour()
     artists = axes.get_default_bbox_extra_artists()
     assert len(artists) == 38
     contour_model.drop()
Пример #4
0
    def test_report(self, model):
        """
        Check the rows returned by model.report(), then the report of a
        StandardScaler + LogisticRegression Pipeline fitted on
        public.winequality.
        """
        reg_rep = model.report()

        expected_index = [
            "explained_variance",
            "max_error",
            "median_absolute_error",
            "mean_absolute_error",
            "mean_squared_error",
            "root_mean_squared_error",
            "r2",
            "r2_adj",
            "aic",
            "bic",
        ]
        assert reg_rep["index"] == expected_index

        # Expected values, listed in the same order as the index above.
        expected_values = [
            0.219816,
            3.592465,
            0.496031,
            0.609075,
            0.594856,
            0.7712695123858948,
            0.219816,
            0.21945605202370688,
            -3366.7617912479104,
            -3339.65156943384,
        ]
        for position, expected in enumerate(expected_values):
            assert reg_rep["value"][position] == pytest.approx(expected, abs=1e-6)

        scaler_step = (
            "NormalizerWine",
            StandardScaler("logstd_model_test", cursor=model.cursor),
        )
        classifier_step = (
            "LogisticRegressionWine",
            LogisticRegression("logreg_model_test", cursor=model.cursor),
        )
        model_class = Pipeline([scaler_step, classifier_step])
        model_class.drop()
        model_class.fit("public.winequality", ["alcohol"], "good")
        cls_rep1 = model_class.report().transpose()

        # metric name -> expected value found at position 0 of that column
        expected_metrics = {
            "auc": 0.7642901826299067,
            "prc_auc": 0.45326090911518313,
            "accuracy": 0.8131445282438048,
            "log_loss": 0.182720882885624,
            "precision": 0.5595463137996219,
            "recall": 0.2317932654659358,
            "f1_score": 0.37307094353346476,
            "mcc": 0.2719537880298097,
            "informedness": 0.18715725014026519,
            "markedness": 0.3951696381964047,
            "csi": 0.19602649006622516,
            "cutoff": 0.5,
        }
        for metric, expected in expected_metrics.items():
            assert cls_rep1[metric][0] == pytest.approx(expected)

        model_class.drop()
    def test_model_from_vDF(self, base, titanic_vd):
        """Fit a model from a vDataFrame and check that it is listed in the models table."""
        cursor = base.cursor
        cursor.execute("DROP MODEL IF EXISTS logreg_from_vDF")

        model_test = LogisticRegression("logreg_from_vDF", cursor=base.cursor)
        model_test.fit(titanic_vd, ["age", "fare"], "survived")

        lookup_query = (
            "SELECT model_name FROM models WHERE model_name = 'logreg_from_vDF'"
        )
        cursor.execute(lookup_query)
        found_row = cursor.fetchone()
        assert found_row[0] == "logreg_from_vDF"

        model_test.drop()
Пример #6
0
    def test_drop(self, base):
        """Check that a fitted model is listed in models and disappears after drop()."""
        cursor = base.cursor
        lookup_query = (
            "SELECT model_name FROM models WHERE model_name = 'logreg_model_test_drop'"
        )

        cursor.execute("DROP MODEL IF EXISTS logreg_model_test_drop")
        model_test = LogisticRegression("logreg_model_test_drop", cursor=base.cursor)
        model_test.fit("public.titanic", ["age", "fare"], "survived")

        # The model must be visible once fitted...
        cursor.execute(lookup_query)
        row_before = cursor.fetchone()
        assert row_before[0] == "logreg_model_test_drop"

        # ...and gone once dropped.
        model_test.drop()
        cursor.execute(lookup_query)
        row_after = cursor.fetchone()
        assert row_after is None
Пример #7
0
    def test_drop(self):
        """Check that a fitted model is listed in models and disappears after drop()."""
        lookup_query = (
            "SELECT model_name FROM models WHERE model_name = 'logreg_model_test_drop'"
        )

        current_cursor().execute("DROP MODEL IF EXISTS logreg_model_test_drop")
        model_test = LogisticRegression("logreg_model_test_drop",)
        model_test.fit("public.titanic", ["age", "fare"], "survived")

        # The model must be visible once fitted...
        current_cursor().execute(lookup_query)
        row_before = current_cursor().fetchone()
        assert row_before[0] == "logreg_model_test_drop"

        # ...and gone once dropped.
        model_test.drop()
        current_cursor().execute(lookup_query)
        row_after = current_cursor().fetchone()
        assert row_after is None
Пример #8
0
def train(churn):
    """
    Fit the 'churn_model' logistic regression on `churn`, print its AUC,
    add the predictions as 'pred_probs', then sort on that column, call
    dropna on it and return `churn`.
    """

    def predictors():
        # Every column except the response and the customer identifier.
        return churn.get_columns(exclude_columns=["churn", "customerID"])

    drop(name="public.churn_model")
    model = LogisticRegression(
        "churn_model",
        penalty="L2",
        tol=1e-6,
        max_iter=1000,
        solver="BFGS",
    )
    # Optional cross-validation step, currently disabled:
    # print("Running cross_validate function\n")
    # cross_validate(model, churn, churn.get_columns(exclude_columns = ["churn"]), 'churn')
    print("Fitting logistic regression model...")
    model.fit(churn, predictors(), "churn")
    print("Success! demo.churn_model created")
    auc = model.score(method="auc")
    print("Model AUC: " + str(auc) + "\n")

    # Prediction step
    model.predict(churn, X=predictors(), name="pred_probs")
    churn.sort({"pred_probs": "desc"})
    churn["pred_probs"].dropna()
    return churn
Пример #9
0
def load_model(name: str, cursor=None, input_relation: str = "", test_relation: str = ""):
    """
---------------------------------------------------------------------------
Loads a Vertica model and returns the associated object.

Two paths exist. When does_model_exist returns 2, the model attributes are 
read from the 'verticapy.attr' table. Otherwise the model is rebuilt by 
parsing the call string (or, for kmeans, the model summary) returned by 
Vertica.

Parameters
----------
name: str
    Model Name.
cursor: DBcursor, optional
    Vertica database cursor.
input_relation: str, optional
    Some automated functions may depend on the input relation. If the 
    load_model function cannot find the input relation from the call string, 
    you should fill it manually.
test_relation: str, optional
    Relation to use to do the testing. All the methods will use this relation 
    for the scoring. If empty, the training relation will be used as testing.
    It is only read on the call-string path; models loaded from 
    'verticapy.attr' use the test relation that was saved with them.

Returns
-------
model
    The model.

Raises
------
AssertionError
    If the model does not exist. The check is an assert statement, so it 
    is skipped when Python runs with -O.
    """
    check_types([("name", name, [str],), 
                 ("test_relation", test_relation, [str],),
                 ("input_relation", input_relation, [str],),])
    cursor = check_cursor(cursor)[0]
    does_exist = does_model_exist(name=name, cursor=cursor, raise_error=False)
    schema, name = schema_relation(name)
    # Drop the first and last character of each part returned by schema_relation.
    schema, name = schema[1:-1], name[1:-1]
    # Kept as an assert so that callers still see an AssertionError.
    assert does_exist, NameError("The model '{}' doesn't exist.".format(name))
    if does_exist == 2:
        cursor.execute(
            "SELECT attr_name, value FROM verticapy.attr WHERE LOWER(model_name) = LOWER('{}')".format(
                str_column(name.lower())
            )
        )
        result = cursor.fetchall()
        model_save = {}
        for elem in result:
            ldic = {}
            # NOTE(security): exec evaluates the text stored in verticapy.attr
            # as Python source. Anyone able to write to that table can run
            # arbitrary code here.
            try:
                exec("result_tmp = {}".format(elem[1]), globals(), ldic)
            except Exception:
                # Not a valid Python expression: keep the value as a string.
                exec(
                    "result_tmp = '{}'".format(elem[1].replace("'", "''")),
                    globals(),
                    ldic,
                )
            result_tmp = ldic["result_tmp"]
            try:
                result_tmp = float(result_tmp)
            except (TypeError, ValueError, OverflowError):
                # Non numerical values are kept as they are.
                pass
            if result_tmp is None:
                result_tmp = "None"
            model_save[elem[0]] = result_tmp
        if model_save["type"] == "NearestCentroid":
            from verticapy.learn.neighbors import NearestCentroid

            model = NearestCentroid(name, cursor, model_save["p"])
            model.centroids_ = tablesample(model_save["centroids"])
            model.classes_ = model_save["classes"]
        elif model_save["type"] == "KNeighborsClassifier":
            from verticapy.learn.neighbors import KNeighborsClassifier

            model = KNeighborsClassifier(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
            model.classes_ = model_save["classes"]
        elif model_save["type"] == "KNeighborsRegressor":
            from verticapy.learn.neighbors import KNeighborsRegressor

            model = KNeighborsRegressor(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
        elif model_save["type"] == "KernelDensity":
            from verticapy.learn.neighbors import KernelDensity

            model = KernelDensity(
                name,
                cursor,
                model_save["bandwidth"],
                model_save["kernel"],
                model_save["p"],
                model_save["max_leaf_nodes"],
                model_save["max_depth"],
                model_save["min_samples_leaf"],
                model_save["nbins"],
                model_save["xlim"],
            )
            model.y = "KDE"
            model.map = model_save["map"]
            model.tree_name = model_save["tree_name"]
        elif model_save["type"] == "LocalOutlierFactor":
            from verticapy.learn.neighbors import LocalOutlierFactor

            model = LocalOutlierFactor(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
            model.n_errors_ = model_save["n_errors"]
        elif model_save["type"] == "DBSCAN":
            from verticapy.learn.cluster import DBSCAN

            model = DBSCAN(
                name,
                cursor,
                model_save["eps"],
                model_save["min_samples"],
                model_save["p"],
            )
            model.n_cluster_ = model_save["n_cluster"]
            model.n_noise_ = model_save["n_noise"]
        elif model_save["type"] == "CountVectorizer":
            from verticapy.learn.preprocessing import CountVectorizer

            model = CountVectorizer(
                name,
                cursor,
                model_save["lowercase"],
                model_save["max_df"],
                model_save["min_df"],
                model_save["max_features"],
                model_save["ignore_special"],
                model_save["max_text_size"],
            )
            model.vocabulary_ = model_save["vocabulary"]
            model.stop_words_ = model_save["stop_words"]
        elif model_save["type"] == "SARIMAX":
            from verticapy.learn.tsa import SARIMAX

            model = SARIMAX(
                name,
                cursor,
                model_save["p"],
                model_save["d"],
                model_save["q"],
                model_save["P"],
                model_save["D"],
                model_save["Q"],
                model_save["s"],
                model_save["tol"],
                model_save["max_iter"],
                model_save["solver"],
                model_save["max_pik"],
                model_save["papprox_ma"],
            )
            model.transform_relation = model_save["transform_relation"]
            model.coef_ = tablesample(model_save["coef"])
            model.ma_avg_ = model_save["ma_avg"]
            if isinstance(model_save["ma_piq"], dict):
                model.ma_piq_ = tablesample(model_save["ma_piq"])
            else:
                model.ma_piq_ = None
            model.ts = model_save["ts"]
            model.exogenous = model_save["exogenous"]
            model.deploy_predict_ = model_save["deploy_predict"]
        elif model_save["type"] == "VAR":
            from verticapy.learn.tsa import VAR

            model = VAR(
                name,
                cursor,
                model_save["p"],
                model_save["tol"],
                model_save["max_iter"],
                model_save["solver"],
            )
            model.transform_relation = model_save["transform_relation"]
            # One coefficient table is stored per predictor ('coef_0', 'coef_1', ...).
            model.coef_ = [
                tablesample(model_save["coef_{}".format(i)])
                for i in range(len(model_save["X"]))
            ]
            model.ts = model_save["ts"]
            model.deploy_predict_ = model_save["deploy_predict"]
        # Attributes shared by every saved model type. This section used to
        # be nested in the VAR branch, where its type checks could never
        # match and where no other model type received these attributes.
        if not input_relation:
            model.input_relation = model_save["input_relation"]
        else:
            model.input_relation = input_relation
        model.X = model_save["X"]
        if model_save["type"] in (
            "KNeighborsRegressor",
            "KNeighborsClassifier",
            "NearestCentroid",
            "SARIMAX",
        ):
            model.y = model_save["y"]
            model.test_relation = model_save["test_relation"]
        elif (
            model_save["type"] not in ("CountVectorizer", "VAR")
            and "key_columns" in model_save
        ):
            # NOTE(review): restored only when it was saved, because this
            # path never ran before and it is not confirmed that every
            # remaining model type persists 'key_columns'.
            model.key_columns = model_save["key_columns"]
    else:
        model_type = does_model_exist(
            name="{}.{}".format(schema, name),
            cursor=cursor,
            raise_error=False,
            return_model_type=True,
        )
        if model_type.lower() == "kmeans":
            cursor.execute(
                "SELECT GET_MODEL_SUMMARY (USING PARAMETERS model_name = '"
                + name
                + "')"
            )
            info = cursor.fetchone()[0].replace("\n", " ")
            info = "kmeans(" + info.split("kmeans(")[1]
        elif model_type.lower() == "normalize_fit":
            from verticapy.learn.preprocessing import Normalizer

            model = Normalizer(name, cursor)
            model.param_ = model.get_attr("details")
            model.X = [
                '"' + item + '"' for item in model.param_.values["column_name"]
            ]
            # The normalization method is deduced from the stored columns.
            if "avg" in model.param_.values:
                model.parameters["method"] = "zscore"
            elif "max" in model.param_.values:
                model.parameters["method"] = "minmax"
            else:
                model.parameters["method"] = "robust_zscore"
            return model
        else:
            cursor.execute(
                "SELECT GET_MODEL_ATTRIBUTE (USING PARAMETERS model_name = '"
                + name
                + "', attr_name = 'call_string')"
            )
            info = cursor.fetchone()[0].replace("\n", " ")
        # Split 'func(args USING PARAMETERS k=v, ...)' into the function
        # name, the positional arguments and the parameters.
        if "SELECT " in info:
            info = info.split("SELECT ")[1].split("(")
        else:
            info = info.split("(")
        model_type = info[0].lower()
        info = info[1].split(")")[0].replace(" ", "").split("USINGPARAMETERS")
        if model_type == "svm_classifier" and "class_weights='none'" not in " ".join(info).lower():
            # class_weights is itself a comma separated value, so it is cut
            # out before the remaining parameters are split on commas.
            parameters = "".join(info[1].split("class_weights=")[1].split("'"))
            parameters = parameters[3 : len(parameters)].split(",")
            del parameters[0]
            parameters += [
                "class_weights=" + info[1].split("class_weights=")[1].split("'")[1]
            ]
        elif model_type != "svd":
            parameters = info[1].split(",")
        else:
            parameters = []
        parameters = [item.split("=") for item in parameters]
        parameters_dict = {}
        for item in parameters:
            if len(item) > 1:
                parameters_dict[item[0]] = item[1]
        info = info[0]
        for elem in parameters_dict:
            if isinstance(parameters_dict[elem], str):
                parameters_dict[elem] = parameters_dict[elem].replace("'", "")
        if model_type == "rf_regressor":
            from verticapy.learn.ensemble import RandomForestRegressor

            model = RandomForestRegressor(
                name,
                cursor,
                int(parameters_dict["ntree"]),
                int(parameters_dict["mtry"]),
                int(parameters_dict["max_breadth"]),
                float(parameters_dict["sampling_size"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["min_leaf_size"]),
                float(parameters_dict["min_info_gain"]),
                int(parameters_dict["nbins"]),
            )
        elif model_type == "rf_classifier":
            from verticapy.learn.ensemble import RandomForestClassifier

            model = RandomForestClassifier(
                name,
                cursor,
                int(parameters_dict["ntree"]),
                int(parameters_dict["mtry"]),
                int(parameters_dict["max_breadth"]),
                float(parameters_dict["sampling_size"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["min_leaf_size"]),
                float(parameters_dict["min_info_gain"]),
                int(parameters_dict["nbins"]),
            )
        elif model_type == "xgb_classifier":
            from verticapy.learn.ensemble import XGBoostClassifier

            model = XGBoostClassifier(
                name,
                cursor,
                int(parameters_dict["max_ntree"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["nbins"]),
                parameters_dict["objective"],
                parameters_dict["split_proposal_method"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["learning_rate"]),
                float(parameters_dict["min_split_loss"]),
                float(parameters_dict["weight_reg"]),
                float(parameters_dict["sampling_size"]),
            )
        elif model_type == "xgb_regressor":
            from verticapy.learn.ensemble import XGBoostRegressor

            model = XGBoostRegressor(
                name,
                cursor,
                int(parameters_dict["max_ntree"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["nbins"]),
                parameters_dict["objective"],
                parameters_dict["split_proposal_method"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["learning_rate"]),
                float(parameters_dict["min_split_loss"]),
                float(parameters_dict["weight_reg"]),
                float(parameters_dict["sampling_size"]),
            )
        elif model_type == "logistic_reg":
            from verticapy.learn.linear_model import LogisticRegression

            model = LogisticRegression(
                name,
                cursor,
                parameters_dict["regularization"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["lambda"]),
                int(parameters_dict["max_iterations"]),
                parameters_dict["optimizer"],
                float(parameters_dict["alpha"]),
            )
        elif model_type == "linear_reg":
            from verticapy.learn.linear_model import (
                LinearRegression,
                Lasso,
                Ridge,
                ElasticNet,
            )
            # The regularization selects which linear model class is built.
            if parameters_dict["regularization"] == "none":
                model = LinearRegression(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            elif parameters_dict["regularization"] == "l1":
                model = Lasso(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            elif parameters_dict["regularization"] == "l2":
                model = Ridge(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            else:
                model = ElasticNet(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                    float(parameters_dict["alpha"]),
                )
        elif model_type == "naive_bayes":
            from verticapy.learn.naive_bayes import NaiveBayes

            model = NaiveBayes(name, cursor, float(parameters_dict["alpha"]))
        elif model_type == "svm_regressor":
            from verticapy.learn.svm import LinearSVR

            model = LinearSVR(
                name,
                cursor,
                float(parameters_dict["epsilon"]),
                float(parameters_dict["C"]),
                True,
                float(parameters_dict["intercept_scaling"]),
                parameters_dict["intercept_mode"],
                float(parameters_dict["error_tolerance"]),
                int(parameters_dict["max_iterations"]),
            )
        elif model_type == "svm_classifier":
            from verticapy.learn.svm import LinearSVC

            class_weights = parameters_dict["class_weights"].split(",")
            for idx, elem in enumerate(class_weights):
                try:
                    class_weights[idx] = float(elem)
                except (TypeError, ValueError):
                    # Weights that are not numbers are stored as None.
                    class_weights[idx] = None
            model = LinearSVC(
                name,
                cursor,
                float(parameters_dict["epsilon"]),
                float(parameters_dict["C"]),
                True,
                float(parameters_dict["intercept_scaling"]),
                parameters_dict["intercept_mode"],
                class_weights,
                int(parameters_dict["max_iterations"]),
            )
        elif model_type == "kmeans":
            from verticapy.learn.cluster import KMeans

            model = KMeans(
                name,
                cursor,
                int(info.split(",")[-1]),
                parameters_dict["init_method"],
                int(parameters_dict["max_iterations"]),
                float(parameters_dict["epsilon"]),
            )
            model.cluster_centers_ = model.get_attr("centers")
            # The metrics attribute is a text block; each value is read from
            # the text that follows its label, up to the end of the line.
            result = model.get_attr("metrics").values["metrics"][0]
            values = {
                "index": [
                    "Between-Cluster Sum of Squares",
                    "Total Sum of Squares",
                    "Total Within-Cluster Sum of Squares",
                    "Between-Cluster SS / Total SS",
                    "converged",
                ]
            }
            values["value"] = [
                float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]),
                float(result.split("Total Sum of Squares: ")[1].split("\n")[0]),
                float(
                    result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]
                ),
                float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0])
                / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]),
                result.split("Converged: ")[1].split("\n")[0] == "True",
            ]
            model.metrics_ = tablesample(values)
        elif model_type == "bisecting_kmeans":
            from verticapy.learn.cluster import BisectingKMeans

            model = BisectingKMeans(
                name,
                cursor,
                int(info.split(",")[-1]),
                int(parameters_dict["bisection_iterations"]),
                parameters_dict["split_method"],
                int(parameters_dict["min_divisible_cluster_size"]),
                parameters_dict["distance_method"],
                parameters_dict["kmeans_center_init_method"],
                int(parameters_dict["kmeans_max_iterations"]),
                float(parameters_dict["kmeans_epsilon"]),
            )
            model.metrics_ = model.get_attr("Metrics")
            model.cluster_centers_ = model.get_attr("BKTree")
        elif model_type == "pca":
            from verticapy.learn.decomposition import PCA

            # The parameter is parsed text: bool() of any non-empty string is
            # True (even for "false"), so the word itself is interpreted.
            scale = parameters_dict["scale"].strip().lower() in ("true", "t", "1")
            model = PCA(name, cursor, 0, scale)
            model.components_ = model.get_attr("principal_components")
            model.explained_variance_ = model.get_attr("singular_values")
            model.mean_ = model.get_attr("columns")
        elif model_type == "svd":
            from verticapy.learn.decomposition import SVD

            model = SVD(name, cursor)
            model.singular_values_ = model.get_attr("right_singular_vectors")
            model.explained_variance_ = model.get_attr("singular_values")
        elif model_type == "one_hot_encoder_fit":
            from verticapy.learn.preprocessing import OneHotEncoder

            model = OneHotEncoder(name, cursor)
            # Try both category attributes at once, then each one separately.
            try:
                model.param_ = to_tablesample(
                    query="SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) VERTICAPY_SUBTABLE UNION ALL SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(
                        model.name, model.name
                    ),
                    cursor=model.cursor,
                )
            except Exception:
                try:
                    model.param_ = model.get_attr("integer_categories")
                except Exception:
                    model.param_ = model.get_attr("varchar_categories")
        # 'info' now holds the comma separated positional arguments of the call.
        if not input_relation:
            model.input_relation = info.split(",")[1].replace("'", "").replace("\\", "")
        else:
            model.input_relation = input_relation
        model.test_relation = test_relation if (test_relation) else model.input_relation
        # NOTE(review): 'bisecting_kmeans' is not in the tuple below, and the
        # 'normalizer' / 'bisectingkmeans' spellings in the elif do not match
        # any model_type tested in the chain above, so bisecting k-means
        # models take the first branch and get a 'y'. Confirm the intent.
        if model_type not in ("kmeans", "pca", "svd", "one_hot_encoder_fit"):
            model.X = info.split(",")[3 : len(info.split(","))]
            model.X = [item.replace("'", "").replace("\\", "") for item in model.X]
            model.y = info.split(",")[2].replace("'", "").replace("\\", "")
        elif model_type in (
            "svd",
            "pca",
            "one_hot_encoder_fit",
            "normalizer",
            "kmeans",
            "bisectingkmeans",
        ):
            model.X = info.split(",")[2 : len(info.split(","))]
            model.X = [item.replace("'", "").replace("\\", "") for item in model.X]
        if model_type in ("naive_bayes", "rf_classifier", "xgb_classifier"):
            try:
                cursor.execute(
                    "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".format(
                        model.y, model.input_relation, model.y
                    )
                )
                classes = cursor.fetchall()
                model.classes_ = [item[0] for item in classes]
            except Exception:
                # Fall back to binary classes when the query fails.
                model.classes_ = [0, 1]
        elif model_type in ("svm_classifier", "logistic_reg"):
            model.classes_ = [0, 1]
        if model_type in ("svm_classifier", "svm_regressor", "logistic_reg", "linear_reg",):
            model.coef_ = model.get_attr("details")
    return model
Пример #10
0
def model(base, titanic_vd):
    """
    Yield the 'logreg_model_test' logistic regression fitted on
    public.titanic (age, fare -> survived); drop it once the consumer is done.
    """
    cursor = base.cursor
    cursor.execute("DROP MODEL IF EXISTS logreg_model_test")
    fitted_model = LogisticRegression("logreg_model_test", cursor=cursor)
    fitted_model.fit("public.titanic", ["age", "fare"], "survived")
    yield fitted_model
    # Teardown: runs when the generator is resumed after the yield.
    fitted_model.drop()
Пример #11
0
 def test_repr(self, model):
     """Check __repr__ for the `model` argument and for a model that was just dropped."""
     fitted_text = model.__repr__()
     assert "predictor|coefficient|std_err" in fitted_text

     model_repr = LogisticRegression("model_repr")
     model_repr.drop()
     dropped_text = model_repr.__repr__()
     assert dropped_text == "<LogisticRegression>"
Пример #12
0
 def test_get_plot(self, base, winequality_vd):
     """Check the artist count of model.plot() with one and with two predictors."""
     base.cursor.execute("DROP MODEL IF EXISTS model_test_plot")
     model_test = LogisticRegression("model_test_plot", cursor=base.cursor)

     # (predictors, expected number of artists): the 1D case, then the 2D case
     scenarios = (
         (["alcohol"], 11),
         (["alcohol", "residual_sugar"], 5),
     )
     for predictors, expected_artists in scenarios:
         model_test.fit(winequality_vd, predictors, "good")
         result = model_test.plot(color="r")
         artists = result.get_default_bbox_extra_artists()
         assert len(artists) == expected_artists
         plt.close("all")
         model_test.drop()
Пример #13
0
def model(titanic_vd):
    """
    Yield the 'logreg_model_test' logistic regression fitted on
    public.titanic (age, fare -> survived); drop it once the consumer is done.
    """
    fitted_model = LogisticRegression("logreg_model_test",)
    fitted_model.drop()  # remove any leftover model with the same name
    fitted_model.fit("public.titanic", ["age", "fare"], "survived")
    yield fitted_model
    # Teardown: runs when the generator is resumed after the yield.
    fitted_model.drop()