def het_white(
    vdf: vDataFrame, eps: str, X: list,
):
    """
    ---------------------------------------------------------------------------
    White’s Lagrange Multiplier Test for heteroscedasticity.

    The squared residuals are regressed on the exogenous variables, their
    squares and their pairwise cross-products. The R2 of that auxiliary
    regression is turned into a Lagrange Multiplier statistic and an F
    statistic.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    X: list
        Exogenous Variables to test the heteroscedasticity on.

    Returns
    -------
    tablesample
        An object containing the result (LM statistic, its p-value, F value
        and its p-value). For more information, see utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)
    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]

    # "1" plays the role of the intercept: multiplying it with each column
    # yields the linear terms, the remaining pairs yield squares and
    # cross-products. The pair (0, 0) is the constant itself and is skipped.
    X_0 = ["1"] + X
    variables, variables_names = [], []
    for i, left in enumerate(X_0):
        for j in range(i, len(X_0)):
            if i == 0 and j == 0:
                continue
            alias = "var_{}_{}".format(i, j)
            variables.append("{} * {} AS {}".format(left, X_0[j], alias))
            variables_names.append(alias)
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=cursor)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"] or "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(cursor)
    )
    model = LinearRegression(name, cursor=cursor)

    def fit_and_score():
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        return model.score("r2")

    try:
        try:
            R2 = fit_and_score()
        except Exception:
            # The first attempt failed: retry once with the "bfgs" solver.
            model.set_params({"solver": "bfgs"})
            R2 = fit_and_score()
    finally:
        # The temporary model must never outlive the test, whatever happened.
        model.drop()

    n = vdf.shape()[0]
    # Degrees of freedom = number of regressors of the auxiliary regression
    # (constant excluded). This equals 2 * len(X) + C(len(X), 2) and is also
    # right for a single variable (x and x^2 -> 2), where 1 was used before.
    k = len(variables_names)
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    return tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
def het_goldfeldquandt(
    vdf: vDataFrame, y: str, X: list, idx: int = 0, split: float = 0.5
):
    """
    ---------------------------------------------------------------------------
    Goldfeld-Quandt homoscedasticity test.

    The rows are separated in two groups around a quantile of one exogenous
    variable (rows strictly below / strictly above the split value). A linear
    regression is fitted on each group and the ratio of the two residual
    variances is used as F statistic.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    y: str
        Response Column.
    X: list
        Exogenous Variables.
    idx: int, optional
        Column index of variable according to which observations are sorted
        for the split.
    split: float, optional
        Float to indicate where to split (Example: 0.5 to split on the median).

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """

    def model_fit(input_relation, predictors, response, estimator):
        # One residual variance per relation; the model is dropped before
        # each fit and once more at the very end.
        variances = []
        for relation in input_relation:
            estimator.drop()
            estimator.fit(relation, predictors, response)
            estimator.predict(relation, name="verticapy_prediction")
            relation["residual_0"] = (
                relation[response] - relation["verticapy_prediction"]
            )
            variances.append(relation["residual_0"].var())
        estimator.drop()
        return variances

    check_types(
        [
            ("y", y, [str],),
            ("X", X, [list],),
            ("idx", idx, [int, float],),
            ("split", split, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([y] + X, vdf)
    y = vdf_columns_names([y], vdf)[0]
    X = vdf_columns_names(X, vdf)

    # Rows equal to the split value belong to neither half.
    pivot = X[idx]
    split_value = vdf[pivot].quantile(split)
    vdf_0_half = vdf.search(vdf[pivot] < split_value)
    vdf_1_half = vdf.search(vdf[pivot] > split_value)

    from verticapy.learn.linear_model import LinearRegression

    settings = vdf._VERTICAPY_VARIABLES_
    schema_writing = settings["schema_writing"] or "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(settings["cursor"])
    )
    model = LinearRegression(name, cursor=settings["cursor"])

    try:
        var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
    except BaseException:
        # Any failure triggers one retry with the "bfgs" solver.
        try:
            model.set_params({"solver": "bfgs"})
            var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
        except BaseException:
            model.drop()
            raise

    n = vdf_0_half.shape()[0]
    m = vdf_1_half.shape()[0]
    F = var0 / var1
    f_pvalue = f.sf(F, n, m)
    return tablesample(
        {"index": ["F Value", "f_p_value",], "value": [F, f_pvalue],}
    )
def het_arch(
    vdf: vDataFrame, eps: str, ts: str, by: list = None, p: int = 1,
):
    """
    ---------------------------------------------------------------------------
    Engle’s Test for Autoregressive Conditional Heteroscedasticity (ARCH).

    The squared residual is regressed on its own p first lags; the R2 of
    that regression is turned into a Lagrange Multiplier statistic and an
    F statistic.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    ts: str
        vcolumn used as timeline. It will be used to order the data. It can
        be a numerical or type date like (date, datetime, timestamp...)
        vcolumn.
    by: list, optional
        vcolumns used in the partition. None (the default) means no
        partition.
    p: int, optional
        Number of lags to consider in the test.

    Returns
    -------
    tablesample
        An object containing the result (LM statistic, its p-value, F value
        and its p-value). For more information, see utilities.tablesample.
    """
    # None replaces the former mutable default ([]); behaviour is the same.
    if by is None:
        by = []
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("by", by, [list],),
            ("p", p, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]

    # The trailing space keeps the partition list separated from ORDER BY
    # in the generated SQL.
    partition = "PARTITION BY {} ".format(", ".join(by)) if by else ""
    # lag_0 is the squared residual itself (the response), lag_1..lag_p are
    # the predictors.
    X_names = ["lag_{}".format(i) for i in range(0, p + 1)]
    X = [
        "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS {}".format(
            eps, i, partition, ts, alias
        )
        for i, alias in enumerate(X_names)
    ]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=cursor)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"] or "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(cursor)
    )
    model = LinearRegression(name, cursor=cursor)

    def fit_and_score():
        model.fit(vdf_lags, X_names[1:], X_names[0])
        return model.score("r2")

    try:
        try:
            R2 = fit_and_score()
        except Exception:
            # The first attempt failed: retry once with the "bfgs" solver.
            model.set_params({"solver": "bfgs"})
            R2 = fit_and_score()
    finally:
        # The temporary model must never outlive the test, whatever happened.
        model.drop()

    n = vdf.shape()[0]
    # NOTE(review): n - p assumes p rows are lost to the lags, i.e. a single
    # series; with `by` partitions each partition presumably loses p rows -
    # confirm whether the count should be adjusted.
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    return tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
def het_breuschpagan(
    vdf: vDataFrame, eps: str, X: list,
):
    """
    ---------------------------------------------------------------------------
    Breusch-Pagan test for heteroscedasticity.

    The squared residuals are regressed on the exogenous variables and the
    R2 of that regression is converted into a Lagrange Multiplier statistic
    and an F statistic.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    X: list
        Exogenous Variables to test the heteroscedasticity on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    from verticapy.learn.linear_model import LinearRegression

    settings = vdf._VERTICAPY_VARIABLES_
    schema_writing = settings["schema_writing"] or "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(settings["cursor"])
    )
    model = LinearRegression(name, cursor=settings["cursor"])

    # Work on a copy so the squared-residual column is not added to `vdf`.
    vdf_copy = vdf.copy()
    vdf_copy["VERTICAPY_TEMP_eps2"] = vdf_copy[eps] ** 2

    def fit_score_drop():
        model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
        score = model.score("r2")
        model.drop()
        return score

    try:
        R2 = fit_score_drop()
    except BaseException:
        # Any failure triggers one retry with the "bfgs" solver.
        try:
            model.set_params({"solver": "bfgs"})
            R2 = fit_score_drop()
        except BaseException:
            model.drop()
            raise

    n = vdf.shape()[0]
    k = len(X)
    residual_dof = n - k - 1
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = residual_dof * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, residual_dof)
    labels = [
        "Lagrange Multiplier Statistic",
        "lm_p_value",
        "F Value",
        "f_p_value",
    ]
    return tablesample({"index": labels, "value": [LM, lm_pvalue, F, f_pvalue]})
def variance_inflation_factor(
    vdf: vDataFrame, X: list, X_idx: int = None,
):
    """
    ---------------------------------------------------------------------------
    Computes the variance inflation factor (VIF). It can be used to detect
    multicollinearity in an OLS Regression Analysis.

    The selected variable is regressed on all the other variables of X and
    the VIF is 1 / (1 - R2) of that regression.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    X: list
        Input Variables.
    X_idx: int / str, optional
        Index of the exogenous variable in X (a negative index counts from
        the end of X) or the name of one of the variables of X. If left to
        None, a tablesample will be returned with all the variables VIF.

    Returns
    -------
    float / tablesample
        The VIF of the selected variable (inf when R2 is exactly 1), or a
        tablesample with the VIF of every variable when X_idx is None.

    Raises
    ------
    ParameterError
        If X_idx is neither None, a number, nor the name of a variable of X.
    """
    # NOTE(review): only int is listed for X_idx although None and str are
    # handled below - confirm whether check_types should be widened.
    check_types(
        [
            ("X_idx", X_idx, [int],),
            ("X", X, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check(X, vdf)
    X = vdf_columns_names(X, vdf)
    if isinstance(X_idx, str):
        # Translate a column name into its position in X.
        columns_check([X_idx], vdf)
        for i, column in enumerate(X):
            if str_column(column) == str_column(X_idx):
                X_idx = i
                break
    if isinstance(X_idx, (int, float)):
        # A valid negative index must be normalised: otherwise it never
        # equals a loop position and the target column would stay among its
        # own regressors. Out-of-range values are left untouched so that
        # X[X_idx] still raises IndexError.
        if -len(X) <= X_idx < 0:
            X_idx += len(X)
        X_r = [column for i, column in enumerate(X) if i != X_idx]
        y_r = X[X_idx]

        from verticapy.learn.linear_model import LinearRegression

        cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
        schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"] or "public"
        name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
            get_session(cursor)
        )
        model = LinearRegression(name, cursor=cursor)

        def fit_and_score():
            model.fit(vdf, X_r, y_r)
            return model.score("r2")

        try:
            try:
                R2 = fit_and_score()
            except Exception:
                # The first attempt failed: retry once with the "bfgs" solver.
                model.set_params({"solver": "bfgs"})
                R2 = fit_and_score()
        finally:
            # The temporary model must never outlive the call.
            model.drop()
        if 1 - R2 != 0:
            return 1 / (1 - R2)
        return np.inf
    if X_idx is None:
        VIF = [variance_inflation_factor(vdf, X, i) for i in range(len(X))]
        return tablesample({"X_idx": X, "VIF": VIF})
    raise ParameterError(
        f"Wrong type for Parameter X_idx.\nExpected integer, found {type(X_idx)}."
    )