def test_union(self):
    x = ["fi", "fo", "fum"]
    y = ["fi", "yo", "sum"]
    z = ["fi", "fe", "sun"]
    assert np.all(
        utils.union(x, y) == pd.Index(["fi", "fo", "fum", "sum", "yo"]))
    assert np.all(
        utils.union(x, y, z) == pd.Index(
            ["fe", "fi", "fo", "fum", "sum", "sun", "yo"]))
def _write_json(self, filename: str):
    # update meta information
    self.update_meta()
    # columns found via meta_map are redundant and are dropped
    redundant_meta = union(list(default_columns().keys()), list(self.mapper_.keys()))
    reduced_meta = self.meta_.drop(redundant_meta, axis=1, errors="ignore")
    # encode data
    stringed_data = self.df_.to_json(double_precision=12)
    stringed_meta = (
        reduced_meta.to_json(double_precision=12)
        if reduced_meta.shape[1] > 0
        else "{}"
    )
    # generate checksum - using just the column names.
    checksum = hashlib.sha256(
        json.dumps(self.df_.columns.tolist()).encode()
    ).hexdigest()
    # compile the full JSON document as a string
    compile_string = (
        '{"data":%s,"meta":%s,"name":%s,"cache":%s,"mapper":%s,"checksum":%s}'
        % (
            stringed_data,
            stringed_meta,
            json.dumps(self.name_),
            json.dumps(self.selectors_),
            json.dumps(self.mapper_),
            json.dumps(checksum),
        )
    )
    # determine file name.
    fn = filename if filename is not None else self.name_ + ".json"
    with open(fn, "wb") as f:
        f.write(compile_string.encode())
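# A minimal sketch of reading back a file produced by _write_json and verifying
# its checksum. `read_metapanda_json` is a hypothetical helper, not part of this
# module; only the on-disk layout written above is assumed.
import hashlib
import json

def read_metapanda_json(filename: str) -> dict:
    with open(filename, "rb") as f:
        doc = json.loads(f.read().decode())
    # the checksum is over the JSON-encoded list of column names; in the default
    # "columns" orient written by DataFrame.to_json, those are the top-level
    # keys of doc["data"] (dict insertion order is preserved in Python 3.7+)
    cols = list(doc["data"].keys())
    expected = hashlib.sha256(json.dumps(cols).encode()).hexdigest()
    if doc["checksum"] != expected:
        raise IOError("checksum mismatch in '{}'".format(filename))
    return doc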
def _partial_bicorr_inner(data: pd.DataFrame, x, y, covar,
                          tail: str = "two-sided",
                          method: str = "spearman",
                          output: str = "score",
                          verbose: int = 0):
    """Internal method for partial bivariate correlation."""
    if verbose > 0:
        print("partial {}:{}\\{}".format(x, y, covar))
    # select all of the columns of interest
    col = union(x, y, covar)
    # drop rows with NaN in any of the selected columns
    _data = data[col].dropna()
    # fit linear models on the covariates to obtain residuals for x, y
    px, r_x = lm(_data[covar], _data[x])
    py, r_y = lm(_data[covar], _data[y])
    # wrap residuals as series;
    # if one is a boolean operation, we must preserve structure
    res_x = pd.Series(r_x, name=x)
    res_y = pd.Series(r_y, name=y)
    # perform bivariate correlation on the residuals as normal
    if output == "score":
        return _bicorr_inner_score(res_x, res_y, method)
    else:
        return _bicorr_inner_full(res_x, res_y, method=method, tail=tail)
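# A self-contained sketch of the residual approach above, assuming `lm` fits
# ordinary least squares: regress x and y on the covariates, then correlate the
# residuals. Names here are illustrative, not the library's API.
import numpy as np
from scipy import stats

def _partial_spearman_sketch(data, x, y, covar):
    # design matrix of covariates with an intercept column
    Z = np.column_stack([np.ones(len(data)), data[covar].values])
    beta_x, *_ = np.linalg.lstsq(Z, data[x].values, rcond=None)
    beta_y, *_ = np.linalg.lstsq(Z, data[y].values, rcond=None)
    # residuals carry the variation in x, y not explained by the covariates
    r_x = data[x].values - Z @ beta_x
    r_y = data[y].values - Z @ beta_y
    return stats.spearmanr(r_x, r_y)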
def _create_new_metamap(df, meta, selectors, mapper, name, meta_set):
    # for each selector, get the group view.
    if isinstance(meta_set, (list, tuple)):
        cnames = [inspect(df, meta, selectors, sel, mode="view") for sel in meta_set]
    else:
        raise TypeError("'meta_set' must be of type {list, tuple}")
    # calculate the pairwise intersection between all the cnames
    igrid = union(*pairwise(intersect, cnames))
    if len(igrid) == 0:
        new_grid = pd.concat(
            [pd.Series(n, index=val) for n, val in zip(meta_set, cnames)],
            sort=False,
            axis=0,
        )
        new_grid.name = name
    else:
        raise ValueError("shared terms: {} discovered for meta_map.".format(igrid))
    # merge into meta
    cat = object_to_categorical(new_grid, meta_set)
    cat.name = name
    # assign as a column directly; pd.concat does not align correctly here
    meta[name] = cat
    # store meta_map for future reference.
    mapper[name] = meta_set
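# An illustrative equivalent of the overlap check above, assuming `pairwise`
# applies a binary function to every pair of its inputs: the union of all
# pairwise intersections is empty exactly when the meta groups are disjoint.
from itertools import combinations

def _shared_terms_sketch(cnames):
    shared = set()
    for a, b in combinations(cnames, 2):
        # any column appearing in two different meta groups is 'shared'
        shared |= set(a) & set(b)
    return sorted(shared)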
def preprocess_continuous_X_y(df, xcols, ycols, for_sklearn=True):
    """Preprocess and split a dataframe into machine-learning ready X and y datasets.

    Prepared specifically for scikit-learn estimator `fit` methods.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset.
    xcols : list of str
        Subset of the columns to choose as predictors.
    ycols : str, list of str
        Subset of the columns for the target.
    for_sklearn : bool, default=True
        Returns np.ndarray objects if True, else pd.Series/DataFrame.

    Returns
    -------
    _x : np.ndarray/pd.DataFrame
        Design matrix. X is reshaped ready for scikit-learn.
    _y : np.ndarray/pd.Series
        Target variable.
    """
    __data = preprocess_continuous_X(df, union(xcols, ycols))
    if for_sklearn:
        # return np.ndarray objects properly configured
        _x = np.asarray(__data[xcols])
        _y = np.asarray(__data[ycols])
        # scikit-learn expects a 2D design matrix, even for a single feature
        if isinstance(xcols, str) or (isinstance(xcols, (list, tuple)) and len(xcols) == 1):
            _x = _x.reshape(-1, 1)
        return _x, _y
    else:
        return __data[xcols], __data[ycols]
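# Hypothetical usage sketch: with a single predictor column, the design matrix
# comes back reshaped to (n, 1) as scikit-learn expects, e.g.
#   df = pd.DataFrame({"x1": [1., 2., 3.], "y": [2., 4., 6.]})
#   X, y = preprocess_continuous_X_y(df, ["x1"], "y")
#   X.shape   # -> (3, 1)
#   LinearRegression().fit(X, y)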
def _integrate_terms(a, b):
    """where a, b are packaged (term, op)"""
    t1, op = a
    t2, op2 = b
    if op == '&':
        # return a 2-tuple
        return (intersect(t1, t2), op2)
    elif op == '|':
        # return a 2-tuple
        return (union(t1, t2), op2)
    else:
        return t1
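# A sketch of folding a full chain of packaged terms with _integrate_terms,
# assuming the final term carries a non-operator sentinel (e.g. None):
from functools import reduce

def _fold_terms_sketch(packaged):
    # e.g. [(a, '&'), (b, '|'), (c, None)] evaluates left-to-right as
    # union(intersect(a, b), c)
    out = reduce(_integrate_terms, packaged)
    return out[0] if isinstance(out, tuple) else out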
def _extract_coefficients_from_model(cv, x, pkg_name):
    """Accepted packages: linear_model, tree, ensemble, svm."""
    if pkg_name == "sklearn.linear_model" or pkg_name == "sklearn.svm":
        cof = np.vstack([m.coef_ for m in cv["estimator"]])
        if cof.shape[-1] == 1:
            cof = cof.flatten()
        res = pd.DataFrame(cof, columns=listify(x))
        res["intercept"] = np.vstack([m.intercept_ for m in cv["estimator"]]).flatten()
        # columns are already named [*x, 'intercept']; avoid re-assigning them
        # via union(), which sorts and can mislabel the columns
        return res
    elif pkg_name == "sklearn.tree" or pkg_name == "sklearn.ensemble":
        cof = np.vstack([m.feature_importances_ for m in cv["estimator"]])
        if cof.shape[-1] == 1:
            cof = cof.flatten()
        res = pd.DataFrame(cof, columns=listify(x))
        return res
    else:
        return []
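# Hypothetical usage sketch with scikit-learn's cross_validate, which stores
# the fitted estimators under the "estimator" key when return_estimator=True:
#   from sklearn.linear_model import LinearRegression
#   from sklearn.model_selection import cross_validate
#   cv = cross_validate(LinearRegression(), X, y, return_estimator=True)
#   coefs = _extract_coefficients_from_model(cv, ["x1"], "sklearn.linear_model")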
def select(self, sc: str) -> pd.Index:
    """View a subset of columns using a flexible `eval`-like string.

    Select merely returns the columns of interest selected using this selector.

    Selections of columns can be done by:
        type [object, int, float, numpy.dtype*, pandas.CategoricalDtype]
        callable (function) that returns [bool list] of length p
        pd.Index
        str [regex, df.column name, cached name, meta.column name
            (that references a boolean column)]
        list/tuple of the above

    .. note:: We do not currently incorporate the use of brackets.

    Parameters
    ----------
    sc : str-like
        The selection string to find an optimal subset of columns.

    Warnings
    --------
    UserWarning
        If the selection returned is empty.

    Returns
    -------
    sel : pd.Index
        The list of column names selected, or empty.

    See Also
    --------
    view : View a selection of columns in `df_`.
    search : View the intersection of search terms, for columns in `df_`.

    Examples
    --------
    You can use string names of types to select columns of a certain type:

    >>> import turbopanda as turb
    >>> import pandas as pd
    >>> mdf = turb.MetaPanda(pd.DataFrame({'a': [1., 2.], 'b': [3, 4]}))
    >>> mdf.select("float")
    Index(['a'], dtype='object', name='colnames')

    Inverses can also be selected using the tilde `~`:

    >>> mdf.select("~float")
    Index(['b'], dtype='object', name='colnames')

    Multiple terms can be joined together, including regex expressions that do
    not themselves contain `&` or `|`. For instance, to select all float
    columns whose names contain x1, x2 or x3:

    >>> mdf.select("float & x[1-3]")
    """
    instance_check(sc, str)
    terms = [c.strip() for c in re.split("[&|]", sc)]
    operator = re.findall("[&|]", sc)
    if len(terms) < 1:
        return pd.Index([])
    else:
        # resolve each term to a set of columns, inverting where `~` prefixes
        grp = [
            self.view_not(t[1:]) if t.startswith("~") else self.view(t)
            for t in terms
        ]
        # fold the groups together left-to-right using the parsed operators
        full = grp[0]
        for mg, op in zip(grp[1:], operator):
            if op == "&":
                full = intersect(full, mg)
            elif op == "|":
                full = union(full, mg)
        return full
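# Design note: the loop above folds terms strictly left-to-right, so there is
# no operator precedence; "a & b | c" evaluates as union(intersect(a, b), c),
# consistent with the documented lack of bracket support.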
def _get_notes_all():
    return union(_get_notes_flat(), _get_notes_sharp())