示例#1
0
def het_white(
    vdf: vDataFrame, eps: str, X: list,
):
    """
---------------------------------------------------------------------------
White’s Lagrange Multiplier Test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: str
    Exogenous Variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)
    X_0 = ["1"] + X
    variables = []
    variables_names = []
    for i in range(len(X_0)):
        for j in range(i, len(X_0)):
            if i != 0 or j != 0:
                variables += ["{} * {} AS var_{}_{}".format(X_0[i], X_0[j], i, j)]
                variables_names += ["var_{}_{}".format(i, j)]
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    if len(X) > 1:
        k = 2 * len(X) + math.factorial(len(X)) / 2 / (math.factorial(len(X) - 2))
    else:
        k = 1
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
示例#2
0
def het_goldfeldquandt(
    vdf: vDataFrame, y: str, X: list, idx: int = 0, split: float = 0.5
):
    """
---------------------------------------------------------------------------
Goldfeld-Quandt homoscedasticity test.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
y: str
    Response Column.
X: list
    Exogenous Variables.
idx: int, optional
    Column index of variable according to which observations are sorted 
    for the split.
split: float, optional
    Float to indicate where to split (Example: 0.5 to split on the median).

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """

    def model_fit(input_relation, X, y, model):
        var = []
        for vdf_tmp in input_relation:
            model.drop()
            model.fit(vdf_tmp, X, y)
            model.predict(vdf_tmp, name="verticapy_prediction")
            vdf_tmp["residual_0"] = vdf_tmp[y] - vdf_tmp["verticapy_prediction"]
            var += [vdf_tmp["residual_0"].var()]
            model.drop()
        return var

    check_types(
        [
            ("y", y, [str],),
            ("X", X, [list],),
            ("idx", idx, [int, float],),
            ("split", split, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([y] + X, vdf)
    y = vdf_columns_names([y], vdf)[0]
    X = vdf_columns_names(X, vdf)
    split_value = vdf[X[idx]].quantile(split)
    vdf_0_half = vdf.search(vdf[X[idx]] < split_value)
    vdf_1_half = vdf.search(vdf[X[idx]] > split_value)
    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
    except:
        try:
            model.set_params({"solver": "bfgs"})
            var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
        except:
            model.drop()
            raise
    n, m = vdf_0_half.shape()[0], vdf_1_half.shape()[0]
    F = var0 / var1
    f_pvalue = f.sf(F, n, m)
    result = tablesample({"index": ["F Value", "f_p_value",], "value": [F, f_pvalue],})
    return result
示例#3
0
def het_arch(
    vdf: vDataFrame, eps: str, ts: str, by: list = [], p: int = 1,
):
    """
---------------------------------------------------------------------------
Engle’s Test for Autoregressive Conditional Heteroscedasticity (ARCH).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("p", p, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    X = []
    X_names = []
    for i in range(0, p + 1):
        X += [
            "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS lag_{}".format(
                eps, i, ("PARTITION BY " + ", ".join(by)) if (by) else "", ts, i
            )
        ]
        X_names += ["lag_{}".format(i)]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_lags, X_names[1:], X_names[0])
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_lags, X_names[1:], X_names[0])
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    k = len(X)
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
示例#4
0
def het_breuschpagan(
    vdf: vDataFrame, eps: str, X: list,
):
    """
---------------------------------------------------------------------------
Breusch-Pagan test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: list
    Exogenous Variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    vdf_copy = vdf.copy()
    vdf_copy["VERTICAPY_TEMP_eps2"] = vdf_copy[eps] ** 2
    try:
        model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    k = len(X)
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
示例#5
0
def variance_inflation_factor(
    vdf: vDataFrame, X: list, X_idx: int = None,
):
    """
---------------------------------------------------------------------------
Computes the variance inflation factor (VIF). It can be used to detect
multicollinearity in an OLS Regression Analysis.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
X: list
    Input Variables.
X_idx: int
    Index of the exogenous variable in X. If left to None, a tablesample will
    be returned with all the variables VIF.

Returns
-------
float
    VIF.
    """
    check_types(
        [
            ("X_idx", X_idx, [int],),
            ("X", X, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check(X, vdf)
    X = vdf_columns_names(X, vdf)

    if isinstance(X_idx, str):
        columns_check([X_idx], vdf)
        for i in range(len(X)):
            if str_column(X[i]) == str_column(X_idx):
                X_idx = i
                break
    if isinstance(X_idx, (int, float)):
        X_r = []
        for i in range(len(X)):
            if i != X_idx:
                X_r += [X[i]]
        y_r = X[X_idx]

        from verticapy.learn.linear_model import LinearRegression

        schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
        if not (schema_writing):
            schema_writing = "public"
        name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
            get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
        )
        model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
        try:
            model.fit(vdf, X_r, y_r)
            R2 = model.score("r2")
            model.drop()
        except:
            try:
                model.set_params({"solver": "bfgs"})
                model.fit(vdf, X_r, y_r)
                R2 = model.score("r2")
                model.drop()
            except:
                model.drop()
                raise
        if 1 - R2 != 0:
            return 1 / (1 - R2)
        else:
            return np.inf
    elif X_idx == None:
        VIF = []
        for i in range(len(X)):
            VIF += [variance_inflation_factor(vdf, X, i)]
        return tablesample({"X_idx": X, "VIF": VIF})
    else:
        raise ParameterError(
            f"Wrong type for Parameter X_idx.\nExpected integer, found {type(X_idx)}."
        )