# Modules these tests rely on. The exact import layout is an assumption based on
# the VerticaPy package structure these helpers come from (toolbox / utilities);
# scipy provides the normal survival function used by the Mann Kendall test.
from scipy.stats import norm

from verticapy import vDataFrame
from verticapy.toolbox import check_types, columns_check, vdf_columns_names
from verticapy.utilities import tablesample


def durbin_watson(
    vdf: vDataFrame, eps: str, ts: str, by: list = [],
):
    """
---------------------------------------------------------------------------
Durbin Watson test (residuals autocorrelation).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline. It will be used to order the data. It can be
    a numerical or date-like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.

Returns
-------
float
    Durbin Watson statistic
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("eps", eps, [str],),
            ("by", by, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps] + [ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    # Build a relation containing each residual and its lag, ordered by the
    # timeline (and partitioned by the 'by' vcolumns when provided).
    query = "(SELECT et, LAG(et) OVER({}ORDER BY {}) AS lag_et FROM (SELECT {} AS et, {}{} FROM {}) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE".format(
        "PARTITION BY {} ".format(", ".join(by)) if (by) else "",
        ts,
        eps,
        ts,
        (", " + ", ".join(by)) if (by) else "",
        vdf.__genSQL__(),
    )
    # Durbin Watson statistic: d = SUM((e_t - e_{t-1})^2) / SUM(e_t^2)
    vdf.__executeSQL__(
        "SELECT SUM(POWER(et - lag_et, 2)) / SUM(POWER(et, 2)) FROM {}".format(query),
        title="Computes the Durbin Watson d.",
    )
    d = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    return d
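
# --- Usage sketch (not part of the library) --------------------------------
# A minimal, hypothetical example of calling durbin_watson(); "vdf", the
# residual vcolumn "residual", the timestamp vcolumn "time" and the partition
# vcolumn "store_id" are assumed names that are not defined in this module.
# A statistic near 2 indicates no first-order autocorrelation of the
# residuals; values toward 0 or 4 suggest positive or negative
# autocorrelation respectively.
#
#     d = durbin_watson(vdf, eps="residual", ts="time", by=["store_id"])
#     print(d)
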
def mkt(vdf: vDataFrame, column: str, ts: str, alpha: float = 0.05):
    """
---------------------------------------------------------------------------
Mann Kendall test (Time Series trend).

\u26A0 Warning : This test is computationally expensive. It uses a CROSS JOIN
               during the computation. The complexity is O(n * k), n being
               the total count of the vDataFrame and k the number of rows
               used for the test.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline. It will be used to order the data. It can be
    a numerical or date-like (date, datetime, timestamp...) vcolumn.
alpha: float, optional
    Significance level of the test.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("alpha", alpha, [int, float],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([column, ts], vdf)
    column = vdf_columns_names([column], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    table = "(SELECT {}, {} FROM {})".format(column, ts, vdf.__genSQL__())
    # S statistic: sum of the signs of all pairwise differences, the pairs
    # being ordered by the timeline vcolumn.
    query = "SELECT SUM(SIGN(y.{} - x.{})) FROM {} x CROSS JOIN {} y WHERE y.{} > x.{}".format(
        column, column, table, table, ts, ts
    )
    vdf.__executeSQL__(query, title="Computes the Mann Kendall S.")
    S = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        S = float(S)
    except:
        S = None
    n = vdf[column].count()
    # Standard deviation of S, used to normalize the statistic.
    query = "SELECT SQRT(({} * ({} - 1) * (2 * {} + 5) - SUM(row * (row - 1) * (2 * row + 5))) / 18) FROM (SELECT row FROM (SELECT ROW_NUMBER() OVER (PARTITION BY {}) AS row FROM {}) VERTICAPY_SUBTABLE GROUP BY row) VERTICAPY_SUBTABLE".format(
        n, n, n, column, vdf.__genSQL__()
    )
    vdf.__executeSQL__(query, title="Computes the Mann Kendall S standard deviation.")
    STDS = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        STDS = float(STDS)
    except:
        STDS = None
    if STDS in (None, 0) or S is None:
        return None
    # Normalized test statistic ZMK and two-sided p-value.
    if S > 0:
        ZMK = (S - 1) / STDS
        trend = "increasing"
    elif S < 0:
        ZMK = (S + 1) / STDS
        trend = "decreasing"
    else:
        ZMK = 0
        trend = "no trend"
    pvalue = 2 * norm.sf(abs(ZMK))
    # The trend is considered significant when the p-value is below alpha.
    result = bool(pvalue < alpha)
    if not result:
        trend = "no trend"
    result = tablesample(
        {
            "index": [
                "Mann Kendall Test Statistic",
                "S",
                "STDS",
                "p_value",
                "Monotonic Trend",
                "Trend",
            ],
            "value": [ZMK, S, STDS, pvalue, result, trend],
        }
    )
    return result
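
# --- Usage sketch (not part of the library) --------------------------------
# A minimal, hypothetical example of calling mkt(); "vdf", the vcolumn "sales"
# and the timestamp vcolumn "time" are assumed names that are not defined in
# this module. The returned tablesample reports the normalized statistic ZMK,
# S, its standard deviation, the p-value, whether a monotonic trend is
# detected at the given alpha, and the trend direction.
#
#     result = mkt(vdf, column="sales", ts="time", alpha=0.05)
#     print(result)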