def het_white(
    vdf: vDataFrame, eps: str, X: list,
):
    """
    ---------------------------------------------------------------------------
    White's Lagrange Multiplier Test for heteroscedasticity.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    X: list
        Exogenous Variables to test the heteroscedasticity on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)
    # Build the auxiliary regressors of White's test: each X_i, each X_i^2 and
    # each cross product X_i * X_j (the constant * constant term is skipped).
    X_0 = ["1"] + X
    variables = []
    variables_names = []
    for i in range(len(X_0)):
        for j in range(i, len(X_0)):
            if i != 0 or j != 0:
                variables += ["{} * {} AS var_{}_{}".format(X_0[i], X_0[j], i, j)]
                variables_names += ["var_{}_{}".format(i, j)]
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        # Fall back to the BFGS solver if the default solver fails to converge.
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    # Degrees of freedom: 2 * len(X) regressors (levels and squares) plus the
    # C(len(X), 2) cross products.
    if len(X) > 1:
        k = 2 * len(X) + math.factorial(len(X)) / 2 / (math.factorial(len(X) - 2))
    else:
        k = 1
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
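# Usage sketch for het_white (illustrative only): the vDataFrame `vdf`, the
# residual column "residual" and the predictors "x1", "x2" are assumptions and
# are not defined in this module.
#
#     white = het_white(vdf, eps="residual", X=["x1", "x2"])
#     # white.values["value"] holds [LM statistic, LM p-value, F statistic,
#     # F p-value]; a small LM p-value suggests heteroscedasticity linked to X.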
def het_breuschpagan(
    vdf: vDataFrame, eps: str, X: list,
):
    """
    ---------------------------------------------------------------------------
    Breusch-Pagan test for heteroscedasticity.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    X: list
        Exogenous Variables to test the heteroscedasticity on.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    # Regress the squared residuals on the exogenous variables.
    vdf_copy = vdf.copy()
    vdf_copy["VERTICAPY_TEMP_eps2"] = vdf_copy[eps] ** 2
    try:
        model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        # Fall back to the BFGS solver if the default solver fails to converge.
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    k = len(X)
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
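# Usage sketch for het_breuschpagan (illustrative only; the same assumed `vdf`
# and column names as in the het_white sketch above):
#
#     bp = het_breuschpagan(vdf, eps="residual", X=["x1", "x2"])
#     # bp.values["value"] = [LM statistic, LM p-value, F statistic, F p-value]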
def adfuller(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: int = 1,
    with_trend: bool = False,
    regresults: bool = False,
):
    """
    ---------------------------------------------------------------------------
    Augmented Dickey-Fuller test (Time Series stationarity).

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    column: str
        Input vcolumn to test.
    ts: str
        vcolumn used as timeline. It is used to order the data. It can be a
        numerical or a date-like (date, datetime, timestamp...) vcolumn.
    by: list, optional
        vcolumns used in the partition.
    p: int, optional
        Number of lags to consider in the test.
    with_trend: bool, optional
        Adds a trend term to the regression.
    regresults: bool, optional
        If True, the full regression results are returned.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """

    def critical_value(alpha, N, with_trend):
        # Approximate Dickey-Fuller critical values, keyed by the significance
        # level and bucketed by sample size. The first table is for the model
        # with a constant only, the second for the model with constant + trend.
        if not (with_trend):
            tables = [
                (25, {0.01: -3.75, 0.025: -3.33, 0.05: -3.00, 0.10: -2.62}),
                (50, {0.01: -3.58, 0.025: -3.22, 0.05: -2.93, 0.10: -2.60}),
                (100, {0.01: -3.51, 0.025: -3.17, 0.05: -2.89, 0.10: -2.58}),
                (250, {0.01: -3.46, 0.025: -3.14, 0.05: -2.88, 0.10: -2.57}),
                (500, {0.01: -3.44, 0.025: -3.13, 0.05: -2.87, 0.10: -2.57}),
                (float("inf"), {0.01: -3.43, 0.025: -3.12, 0.05: -2.86, 0.10: -2.57}),
            ]
        else:
            tables = [
                (25, {0.01: -4.38, 0.025: -3.95, 0.05: -3.60, 0.10: -3.24}),
                (50, {0.01: -4.15, 0.025: -3.80, 0.05: -3.50, 0.10: -3.18}),
                (100, {0.01: -4.04, 0.025: -3.73, 0.05: -3.45, 0.10: -3.15}),
                (250, {0.01: -3.99, 0.025: -3.69, 0.05: -3.43, 0.10: -3.13}),
                (500, {0.01: -3.98, 0.025: -3.68, 0.05: -3.42, 0.10: -3.13}),
                (float("inf"), {0.01: -3.96, 0.025: -3.66, 0.05: -3.41, 0.10: -3.12}),
            ]
        for N_max, critical_values in tables:
            if N <= N_max:
                # Any alpha other than 1%, 2.5% and 10% falls back to the
                # 5% critical value.
                return critical_values.get(alpha, critical_values[0.05])

    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("p", p, [int, float],),
            ("by", by, [list],),
            ("with_trend", with_trend, [bool],),
            ("regresults", regresults, [bool],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([ts, column] + by, vdf)
    ts = vdf_columns_names([ts], vdf)[0]
    column = vdf_columns_names([column], vdf)[0]
    by = vdf_columns_names(by, vdf)
    schema = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema):
        schema = "public"
    name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        schema, gen_name([column]).upper()
    )
    relation_name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_VIEW_{}".format(
        schema, gen_name([column]).upper()
    )
    try:
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP MODEL IF EXISTS {}".format(name)
        )
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP VIEW IF EXISTS {}".format(relation_name)
        )
    except:
        pass
    # Build the ADF regressors: the first lag of the series, the lagged
    # differences delta1..deltap, and the first difference as the response.
    lag = [
        "LAG({}, 1) OVER ({}ORDER BY {}) AS lag1".format(
            column, "PARTITION BY {}".format(", ".join(by)) if (by) else "", ts
        )
    ]
    lag += [
        "LAG({}, {}) OVER ({}ORDER BY {}) - LAG({}, {}) OVER ({}ORDER BY {}) AS delta{}".format(
            column,
            i,
            "PARTITION BY {}".format(", ".join(by)) if (by) else "",
            ts,
            column,
            i + 1,
            "PARTITION BY {}".format(", ".join(by)) if (by) else "",
            ts,
            i,
        )
        for i in range(1, p + 1)
    ]
    lag += [
        "{} - LAG({}, 1) OVER ({}ORDER BY {}) AS delta".format(
            column, column, "PARTITION BY {}".format(", ".join(by)) if (by) else "", ts
        )
    ]
    query = "CREATE VIEW {} AS SELECT {}, {} AS ts FROM {}".format(
        relation_name,
        ", ".join(lag),
        "TIMESTAMPDIFF(SECOND, {}, MIN({}) OVER ())".format(ts, ts)
        if vdf[ts].isdate()
        else ts,
        vdf.__genSQL__(),
    )
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(query)

    from verticapy.learn.linear_model import LinearRegression

    model = LinearRegression(
        name, vdf._VERTICAPY_VARIABLES_["cursor"], solver="Newton", max_iter=1000
    )
    predictors = ["lag1"] + ["delta{}".format(i) for i in range(1, p + 1)]
    if with_trend:
        predictors += ["ts"]
    model.fit(
        relation_name, predictors, "delta",
    )
    coef = model.coef_
    vdf._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(name))
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(
        "DROP VIEW IF EXISTS {}".format(relation_name)
    )
    if regresults:
        return coef
    coef = coef.transpose()
    DF = coef.values["lag1"][0] / (max(coef.values["lag1"][1], 1e-99))
    p_value = coef.values["lag1"][3]
    count = vdf.shape()[0]
    result = tablesample(
        {
            "index": [
                "ADF Test Statistic",
                "p_value",
                "# Lags used",
                "# Observations Used",
                "Critical Value (1%)",
                "Critical Value (2.5%)",
                "Critical Value (5%)",
                "Critical Value (10%)",
                "Stationarity (alpha = 1%)",
            ],
            "value": [
                DF,
                p_value,
                p,
                count,
                critical_value(0.01, count, with_trend),
                critical_value(0.025, count, with_trend),
                critical_value(0.05, count, with_trend),
                critical_value(0.10, count, with_trend),
                DF < critical_value(0.01, count, with_trend) and p_value < 0.01,
            ],
        }
    )
    return result
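# Usage sketch for adfuller (illustrative only): `vdf` and the column names
# "sales" / "date" are assumptions. The series is reported as stationary when
# the test statistic falls below the 1% critical value and the p-value is
# below 0.01.
#
#     adf = adfuller(vdf, column="sales", ts="date", p=3, with_trend=True)
#     # adf.values["value"][-1] is the boolean stationarity decision;
#     # pass regresults=True to get the underlying regression coefficients.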
def het_arch(
    vdf: vDataFrame, eps: str, ts: str, by: list = [], p: int = 1,
):
    """
    ---------------------------------------------------------------------------
    Engle's Test for Autoregressive Conditional Heteroscedasticity (ARCH).

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    ts: str
        vcolumn used as timeline. It is used to order the data. It can be a
        numerical or a date-like (date, datetime, timestamp...) vcolumn.
    by: list, optional
        vcolumns used in the partition.
    p: int, optional
        Number of lags to consider in the test.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("by", by, [list],),
            ("p", p, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    # Build the squared residual and its p lags: lag_0 is the current squared
    # residual, lag_1..lag_p are the regressors.
    X = []
    X_names = []
    for i in range(0, p + 1):
        X += [
            "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS lag_{}".format(
                eps, i, ("PARTITION BY " + ", ".join(by)) if (by) else "", ts, i
            )
        ]
        X_names += ["lag_{}".format(i)]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_lags, X_names[1:], X_names[0])
        R2 = model.score("r2")
        model.drop()
    except:
        # Fall back to the BFGS solver if the default solver fails to converge.
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_lags, X_names[1:], X_names[0])
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    k = len(X)
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
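# Usage sketch for het_arch (illustrative only; `vdf`, "residual" and "date"
# are assumptions). The test regresses the squared residuals on their own p
# lags, so a small LM p-value points to ARCH effects.
#
#     arch = het_arch(vdf, eps="residual", ts="date", p=4)
#     # arch.values["value"] = [LM statistic, LM p-value, F statistic, F p-value]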