def intersect(
    vdf: vDataFrame, index: str, gid: str, g: str = "", x: str = "", y: str = "",
):
    """
---------------------------------------------------------------------------
Spatially intersects a point or points with a set of polygons.

Parameters
----------
vdf: vDataFrame
    vDataFrame to use to compute the spatial join.
index: str
    Name of the index.
gid: str
    An integer column or integer that uniquely identifies the spatial
    object(s) of g or x and y.
g: str, optional
    A geometry or geography (WGS84) column that contains points. The g
    column can contain only point geometries or geographies.
x: str, optional
    x-coordinate or longitude.
y: str, optional
    y-coordinate or latitude.

Returns
-------
vDataFrame
    object containing the result of the intersection.
    """
    check_types(
        [
            ("vdf", vdf, [vDataFrame],),
            ("gid", gid, [str],),
            ("g", g, [str],),
            ("x", x, [str],),
            ("y", y, [str],),
            ("index", index, [str],),
        ]
    )
    table = vdf.__genSQL__()
    columns_check([gid], vdf)
    if g:
        columns_check([g], vdf)
        g = vdf_columns_names([g], vdf)[0]
        query = f"(SELECT STV_Intersect({gid}, {g} USING PARAMETERS index='{index}') OVER (PARTITION BEST) AS (point_id, polygon_gid) FROM {table}) x"
    elif x and y:
        columns_check([x, y], vdf)
        x, y = vdf_columns_names([x, y], vdf)
        query = f"(SELECT STV_Intersect({gid}, {x}, {y} USING PARAMETERS index='{index}') OVER (PARTITION BEST) AS (point_id, polygon_gid) FROM {table}) x"
    else:
        raise ParameterError("Either 'g', or both 'x' and 'y', must not be empty.")
    return vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

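# A minimal usage sketch for intersect(). It assumes a spatial index named
# "us_states_idx" already built with create_index(), and a vDataFrame
# "points_vdf" exposing columns "pid", "lon", and "lat"; all of these names
# are illustrative, not part of the library:
#
#     matches = intersect(points_vdf, index="us_states_idx", gid="pid",
#                         x="lon", y="lat")
#     # matches holds one row per (point_id, polygon_gid) intersection pair.
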
def durbin_watson(
    vdf: vDataFrame, eps: str, ts: str, by: list = [],
):
    """
---------------------------------------------------------------------------
Durbin Watson test (residuals autocorrelation).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline to order the data. It can be a numerical or
    date-like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.

Returns
-------
float
    Durbin Watson statistic
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("eps", eps, [str],),
            ("by", by, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps] + [ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    query = "(SELECT et, LAG(et) OVER({}ORDER BY {}) AS lag_et FROM (SELECT {} AS et, {}{} FROM {}) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE".format(
        "PARTITION BY {} ".format(", ".join(by)) if (by) else "",
        ts,
        eps,
        ts,
        (", " + ", ".join(by)) if (by) else "",
        vdf.__genSQL__(),
    )
    vdf.__executeSQL__(
        "SELECT SUM(POWER(et - lag_et, 2)) / SUM(POWER(et, 2)) FROM {}".format(query),
        title="Computes the Durbin Watson d.",
    )
    d = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    return d

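# A minimal usage sketch for durbin_watson(), assuming a vDataFrame
# "residuals_vdf" with a residual column "eps" and a timestamp column "date"
# (illustrative names):
#
#     d = durbin_watson(residuals_vdf, eps="eps", ts="date")
#     # d is close to 2 when residuals are uncorrelated; values well below 2
#     # suggest positive autocorrelation, well above 2 negative autocorrelation.
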
def create_index(
    vdf: vDataFrame,
    gid: str,
    g: str,
    index: str,
    overwrite: bool = False,
    max_mem_mb: int = 256,
    skip_nonindexable_polygons: bool = False,
):
    """
---------------------------------------------------------------------------
Creates a spatial index on a set of polygons to speed up spatial
intersection with a set of points.

Parameters
----------
vdf: vDataFrame
    vDataFrame to use to compute the spatial join.
gid: str
    Name of an integer column that uniquely identifies the polygon.
    The gid cannot be NULL.
g: str
    Name of a geometry or geography (WGS84) column or expression that
    contains polygons and multipolygons. Only polygon and multipolygon
    can be indexed. Other shape types are excluded from the index.
index: str
    Name of the index.
overwrite: bool, optional
    BOOLEAN value that specifies whether to overwrite the index, if an
    index exists.
max_mem_mb: int, optional
    A positive integer that assigns a limit to the amount of memory in
    megabytes that create_index can allocate during index construction.
skip_nonindexable_polygons: bool, optional
    In rare cases, intricate polygons (for instance, with too high
    resolution or anomalous spikes) cannot be indexed. These polygons
    are considered non-indexable. When set to False, non-indexable
    polygons cause the index creation to fail. When set to True, index
    creation can succeed by excluding non-indexable polygons from the
    index.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("vdf", vdf, [vDataFrame],),
            ("gid", gid, [str],),
            ("index", index, [str],),
            ("g", g, [str],),
            ("overwrite", overwrite, [bool],),
            ("max_mem_mb", max_mem_mb, [int],),
            ("skip_nonindexable_polygons", skip_nonindexable_polygons, [bool],),
        ]
    )
    columns_check([gid, g], vdf)
    gid, g = vdf_columns_names([gid, g], vdf)
    query = "SELECT STV_Create_Index({}, {} USING PARAMETERS index='{}', overwrite={}, max_mem_mb={}, skip_nonindexable_polygons={}) OVER() FROM {}"
    query = query.format(
        gid,
        g,
        index,
        overwrite,
        max_mem_mb,
        skip_nonindexable_polygons,
        vdf.__genSQL__(),
    )
    return to_tablesample(query, vdf._VERTICAPY_VARIABLES_["cursor"])

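# A minimal usage sketch for create_index(), assuming a vDataFrame
# "states_vdf" with an integer key "state_gid" and a polygon geometry column
# "geom" (illustrative names). The named index can then be passed to
# intersect() through its "index" parameter:
#
#     create_index(states_vdf, gid="state_gid", g="geom",
#                  index="us_states_idx", overwrite=True)
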
def het_white(
    vdf: vDataFrame, eps: str, X: list,
):
    """
---------------------------------------------------------------------------
White's Lagrange Multiplier Test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: list
    Exogenous variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)
    X_0 = ["1"] + X
    variables = []
    variables_names = []
    for i in range(len(X_0)):
        for j in range(i, len(X_0)):
            if i != 0 or j != 0:
                variables += ["{} * {} AS var_{}_{}".format(X_0[i], X_0[j], i, j)]
                variables_names += ["var_{}_{}".format(i, j)]
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    if len(X) > 1:
        k = 2 * len(X) + math.factorial(len(X)) / 2 / (math.factorial(len(X) - 2))
    else:
        k = 1
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result

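# A minimal usage sketch for het_white(), assuming a vDataFrame "reg_vdf"
# holding a residual column "eps" and regressors "x1" and "x2" (illustrative
# names). The auxiliary regression uses the regressors, their squares, and
# their cross products, so for m regressors the chi-square test has
# 2 * m + m * (m - 1) / 2 degrees of freedom:
#
#     white = het_white(reg_vdf, eps="eps", X=["x1", "x2"])
#     # white["value"] lists [LM, lm_p_value, F, f_p_value].
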
def adfuller(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: int = 1,
    with_trend: bool = False,
    regresults: bool = False,
):
    """
---------------------------------------------------------------------------
Augmented Dickey Fuller test (Time Series stationarity).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline to order the data. It can be a numerical or
    date-like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.
with_trend: bool, optional
    Adds a trend in the Regression.
regresults: bool, optional
    If True, the full regression results are returned.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """

    def critical_value(alpha, N, with_trend):
        # Dickey-Fuller critical value lookup, keyed by (with_trend, alpha);
        # each row lists the values for N <= 25, 50, 100, 250, 500, and larger.
        # Any alpha other than 0.01, 0.025, or 0.10 falls back to the 5% row.
        thresholds = (25, 50, 100, 250, 500)
        table = {
            (False, 0.01): (-3.75, -3.58, -3.51, -3.46, -3.44, -3.43),
            (False, 0.025): (-3.33, -3.22, -3.17, -3.14, -3.13, -3.12),
            (False, 0.05): (-3.00, -2.93, -2.89, -2.88, -2.87, -2.86),
            (False, 0.10): (-2.62, -2.60, -2.58, -2.57, -2.57, -2.57),
            (True, 0.01): (-4.38, -4.15, -4.04, -3.99, -3.98, -3.96),
            (True, 0.025): (-3.95, -3.80, -3.73, -3.69, -3.68, -3.66),
            (True, 0.05): (-3.60, -3.50, -3.45, -3.43, -3.42, -3.41),
            (True, 0.10): (-3.24, -3.18, -3.15, -3.13, -3.13, -3.12),
        }
        row = table.get((with_trend, alpha), table[(with_trend, 0.05)])
        for threshold, value in zip(thresholds, row):
            if N <= threshold:
                return value
        return row[-1]

    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("p", p, [int, float],),
            ("by", by, [list],),
            ("with_trend", with_trend, [bool],),
            ("regresults", regresults, [bool],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([ts, column] + by, vdf)
    ts = vdf_columns_names([ts], vdf)[0]
    column = vdf_columns_names([column], vdf)[0]
    by = vdf_columns_names(by, vdf)
    schema = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema):
        schema = "public"
    name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        schema, gen_name([column]).upper()
    )
    relation_name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_VIEW_{}".format(
        schema, gen_name([column]).upper()
    )
    try:
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP MODEL IF EXISTS {}".format(name)
        )
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP VIEW IF EXISTS {}".format(relation_name)
        )
    except:
        pass
    lag = [
        "LAG({}, 1) OVER ({}ORDER BY {}) AS lag1".format(
            column, "PARTITION BY {}".format(", ".join(by)) if (by) else "", ts
        )
    ]
    lag += [
        "LAG({}, {}) OVER ({}ORDER BY {}) - LAG({}, {}) OVER ({}ORDER BY {}) AS delta{}".format(
            column,
            i,
            "PARTITION BY {}".format(", ".join(by)) if (by) else "",
            ts,
            column,
            i + 1,
            "PARTITION BY {}".format(", ".join(by)) if (by) else "",
            ts,
            i,
        )
        for i in range(1, p + 1)
    ]
    lag += [
        "{} - LAG({}, 1) OVER ({}ORDER BY {}) AS delta".format(
            column, column, "PARTITION BY {}".format(", ".join(by)) if (by) else "", ts
        )
    ]
    query = "CREATE VIEW {} AS SELECT {}, {} AS ts FROM {}".format(
        relation_name,
        ", ".join(lag),
        "TIMESTAMPDIFF(SECOND, {}, MIN({}) OVER ())".format(ts, ts)
        if vdf[ts].isdate()
        else ts,
        vdf.__genSQL__(),
    )
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(query)

    from verticapy.learn.linear_model import LinearRegression

    model = LinearRegression(
        name, vdf._VERTICAPY_VARIABLES_["cursor"], solver="Newton", max_iter=1000
    )
    predictors = ["lag1"] + ["delta{}".format(i) for i in range(1, p + 1)]
    if with_trend:
        predictors += ["ts"]
    model.fit(
        relation_name, predictors, "delta",
    )
    coef = model.coef_
    vdf._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(name))
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(
        "DROP VIEW IF EXISTS {}".format(relation_name)
    )
    if regresults:
        return coef
    coef = coef.transpose()
    DF = coef.values["lag1"][0] / (max(coef.values["lag1"][1], 1e-99))
    p_value = coef.values["lag1"][3]
    count = vdf.shape()[0]
    result = tablesample(
        {
            "index": [
                "ADF Test Statistic",
                "p_value",
                "# Lags used",
                "# Observations Used",
                "Critical Value (1%)",
                "Critical Value (2.5%)",
                "Critical Value (5%)",
                "Critical Value (10%)",
                "Stationarity (alpha = 1%)",
            ],
            "value": [
                DF,
                p_value,
                p,
                count,
                critical_value(0.01, count, with_trend),
                critical_value(0.025, count, with_trend),
                critical_value(0.05, count, with_trend),
                critical_value(0.10, count, with_trend),
                DF < critical_value(0.01, count, with_trend) and p_value < 0.01,
            ],
        }
    )
    return result

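# A minimal usage sketch for adfuller(), assuming a vDataFrame "series_vdf"
# with a value column "value" and a timestamp column "date" (illustrative
# names):
#
#     adf = adfuller(series_vdf, column="value", ts="date", p=3)
#     # The last row, "Stationarity (alpha = 1%)", is True when the ADF
#     # statistic is below the 1% critical value and p_value < 0.01.
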
def het_arch(
    vdf: vDataFrame, eps: str, ts: str, by: list = [], p: int = 1,
):
    """
---------------------------------------------------------------------------
Engle's Test for Autoregressive Conditional Heteroscedasticity (ARCH).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline to order the data. It can be a numerical or
    date-like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("p", p, [int, float],),
            ("by", by, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    X = []
    X_names = []
    for i in range(0, p + 1):
        X += [
            "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS lag_{}".format(
                eps, i, ("PARTITION BY " + ", ".join(by)) if (by) else "", ts, i
            )
        ]
        X_names += ["lag_{}".format(i)]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_lags, X_names[1:], X_names[0])
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_lags, X_names[1:], X_names[0])
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result

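# A minimal usage sketch for het_arch(), assuming a vDataFrame
# "residuals_vdf" with a residual column "eps" and a timestamp column "date"
# (illustrative names). The test regresses the squared residuals on their
# own p lags:
#
#     arch = het_arch(residuals_vdf, eps="eps", ts="date", p=2)
#     # Small lm_p_value / f_p_value values indicate ARCH effects.
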
def mkt(vdf: vDataFrame, column: str, ts: str, alpha: float = 0.05):
    """
---------------------------------------------------------------------------
Mann Kendall test (Time Series trend).

\u26A0 Warning : This test is computationally expensive because it uses a
CROSS JOIN during the computation. The complexity is O(n * k), n being the
total count of the vDataFrame and k the number of rows used in the test.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline to order the data. It can be a numerical or
    date-like (date, datetime, timestamp...) vcolumn.
alpha: float, optional
    Significance level. Probability of rejecting the null hypothesis
    (no trend) when it is true.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("alpha", alpha, [int, float],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([column, ts], vdf)
    column = vdf_columns_names([column], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    table = "(SELECT {}, {} FROM {})".format(column, ts, vdf.__genSQL__())
    query = "SELECT SUM(SIGN(y.{} - x.{})) FROM {} x CROSS JOIN {} y WHERE y.{} > x.{}".format(
        column, column, table, table, ts, ts
    )
    vdf.__executeSQL__(query, title="Computes the Mann Kendall S.")
    S = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        S = float(S)
    except:
        S = None
    n = vdf[column].count()
    query = "SELECT SQRT(({} * ({} - 1) * (2 * {} + 5) - SUM(row * (row - 1) * (2 * row + 5))) / 18) FROM (SELECT row FROM (SELECT ROW_NUMBER() OVER (PARTITION BY {}) AS row FROM {}) VERTICAPY_SUBTABLE GROUP BY row) VERTICAPY_SUBTABLE".format(
        n, n, n, column, vdf.__genSQL__()
    )
    vdf.__executeSQL__(query, title="Computes the Mann Kendall S standard deviation.")
    STDS = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        STDS = float(STDS)
    except:
        STDS = None
    if STDS in (None, 0) or S is None:
        return None
    if S > 0:
        ZMK = (S - 1) / STDS
        trend = "increasing"
    elif S < 0:
        ZMK = (S + 1) / STDS
        trend = "decreasing"
    else:
        ZMK = 0
        trend = "no trend"
    pvalue = 2 * norm.sf(abs(ZMK))
    # Two-sided test: reject H0 (no trend) when the p-value is below alpha.
    result = pvalue < alpha
    if not (result):
        trend = "no trend"
    result = tablesample(
        {
            "index": [
                "Mann Kendall Test Statistic",
                "S",
                "STDS",
                "p_value",
                "Monotonic Trend",
                "Trend",
            ],
            "value": [ZMK, S, STDS, pvalue, result, trend],
        }
    )
    return result

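# A minimal usage sketch for mkt(), assuming a vDataFrame "series_vdf" with a
# value column "value" and a timestamp column "date" (illustrative names);
# note the warning above about the CROSS JOIN cost on large inputs:
#
#     trend = mkt(series_vdf, column="value", ts="date", alpha=0.05)
#     # trend["value"] lists [ZMK, S, STDS, p_value, monotonic?, trend label].
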