def build_column(self, data):
    """Build a string-distance column between two configured columns.

    Chooses a similarity implementation based on ``cfg["algo"]`` (optionally
    normalized to [0, 1]) and applies it row-wise to the left/right columns.
    """
    left_col, right_col, algo = (self.cfg.get(p) for p in ["left", "right", "algo"])
    normalized = self.cfg.get("normalized", False)
    if algo == "levenshtein":
        similarity = (
            strsimpy.normalized_levenshtein.NormalizedLevenshtein()
            if normalized
            else strsimpy.levenshtein.Levenshtein()
        )
    elif algo == "damerau-leveneshtein":  # NOTE(review): spelling matches caller config key
        similarity = strsimpy.damerau.Damerau()
        if normalized:
            similarity = SimilarityNormalizeWrapper(similarity)
    elif algo == "jaro-winkler":
        similarity = JaroWinkler()
    elif algo == "jaccard":
        similarity = strsimpy.jaccard.Jaccard(int(self.cfg.get("k", 3)))
        if normalized:
            similarity = SimilarityNormalizeWrapper(similarity)
    # NaNs are treated as empty strings before computing distances
    distances = apply(
        data[[left_col, right_col]].fillna(""),
        lambda rec: similarity.distance(*rec),
        axis=1,
    )
    return pd.Series(distances, index=data.index, name=self.name)
def build(self, parent):
    """Build histogram analysis data for the selected column.

    Without a target, returns histogram data plus a KDE overlay; with a
    target, returns one histogram per target value. Either way, describe
    statistics (augmented with skew/kurt when available) are attached.
    """
    if parent.classifier == "D":
        # presumably converts date values to numeric timestamps for binning
        # — confirm json_timestamp semantics
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp
        )
    kde_code = []
    if self.target is None:
        return_data, hist_labels = self.build_histogram_data(
            parent.data[parent.selected_col]
        )
        kde, kde_code = build_kde(
            parent.data[parent.selected_col], hist_labels, parent.selected_col
        )
        if kde is not None:
            return_data["kde"] = kde
    else:
        return_data = {"targets": [], "labels": list(range(self.bins))}
        target_formatter = find_dtype_formatter(find_dtype(parent.data[self.target]))
        grouped = parent.data[[self.target, parent.selected_col]].groupby(self.target)
        for target, target_data in grouped:
            target_data, _ = self.build_histogram_data(
                target_data[parent.selected_col]
            )
            target_data["target"] = target_formatter(target, as_string=True)
            return_data["targets"].append(target_data)
    desc, desc_code = load_describe(parent.data[parent.selected_col])
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    for stat in ["skew", "kurt"]:
        if stat in dtype_info:
            desc[stat] = dtype_info[stat]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def build(self, parent):
    """Build a numpy histogram (with KDE overlay) for the selected column."""
    if parent.classifier == "D":
        # date columns are first converted via json_timestamp so they can be binned
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp
        )
    col_data = parent.data[parent.selected_col]
    hist_data, hist_labels = np.histogram(col_data, bins=self.bins)
    return_data = dict(
        # drop the first edge: it is only the minimum of the range
        labels=["{0:.1f}".format(lbl) for lbl in hist_labels[1:]],
        data=[json_float(h) for h in hist_data],
    )
    kde, kde_code = build_kde(col_data, hist_labels, parent.selected_col)
    if kde is not None:
        return_data["kde"] = kde
    desc, desc_code = load_describe(col_data)
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    for stat in ["skew", "kurt"]:
        if stat in dtype_info:
            desc[stat] = dtype_info[stat]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def build(self, parent):
    """Build Q-Q plot points (vs. a normal distribution) with formatted bounds."""
    series = parent.data[parent.selected_col]
    if parent.classifier == "D":
        series = apply(series, json_timestamp)
    qq_x, qq_y = sts.probplot(series, dist="norm", fit=False)
    qq = pd.DataFrame(dict(x=qq_x, y=qq_y))
    formatter = grid_formatter(grid_columns(qq), nan_display=None)
    # formatter.fmts[0][-1] is the formatting callable for the first ("x") column
    x_fmt = formatter.fmts[0][-1]
    return_data = dict(data=formatter.format_dicts(qq.itertuples()))
    return_data["min"] = x_fmt(qq.min()[0].min(), None)
    return_data["max"] = x_fmt(qq.max()[0].max(), None)
    return return_data, self._build_code(parent)
def build(self, parent):
    """Build Q-Q plot data plus an OLS trendline (as x2/y2 series)."""
    series = parent.data[parent.selected_col]
    if parent.classifier == "D":
        series = apply(series, json_timestamp)
    qq_x, qq_y = sts.probplot(series, dist="norm", fit=False)
    qq = pd.DataFrame(dict(x=qq_x, y=qq_y))
    return_data = grid_formatter(grid_columns(qq), nan_display=None).format_lists(qq)
    # the second trace of the plotly express figure holds the OLS trendline
    ols_trace = px.scatter(x=qq_x, y=qq_y, trendline="ols").data[1]
    trend = pd.DataFrame(dict(x=ols_trace["x"], y=ols_trace["y"]))
    trend_fmt = grid_formatter(grid_columns(trend), nan_display=None).format_lists(trend)
    return_data["x2"] = trend_fmt["x"]
    return_data["y2"] = trend_fmt["y"]
    return return_data, self._build_code(parent)
def build(self, parent):
    """Build histogram data; with a target, bucket counts per target value.

    Without a target this mirrors the plain histogram+KDE output; with a
    target, the selected column is cut into ``self.bins`` intervals and each
    target value gets per-bin counts.
    """
    if parent.classifier == "D":
        parent.data.loc[:, parent.selected_col] = apply(
            parent.data[parent.selected_col], json_timestamp
        )
    kde_code = []
    if self.target is None:
        return_data, hist_labels = self.build_histogram_data(
            parent.data[parent.selected_col]
        )
        kde, kde_code = build_kde(
            parent.data[parent.selected_col], hist_labels, parent.selected_col
        )
        if kde is not None:
            return_data["kde"] = kde
    else:
        bin_vals = pd.cut(parent.data[parent.selected_col], bins=self.bins)
        labels = ["{}".format(c) for c in bin_vals.dtype.categories]
        # stringified bins are stored on the frame so we can group/count on them
        parent.data.loc[:, "bin"] = bin_vals.astype("str")
        return_data = {"targets": [], "labels": labels}
        target_formatter = find_dtype_formatter(find_dtype(parent.data[self.target]))
        for target, target_data in parent.data[[self.target, "bin"]].groupby(
            self.target
        ):
            # reindex so every bin appears, even those with zero rows
            counts = target_data["bin"].value_counts().reindex(labels, fill_value=0)
            return_data["targets"].append(
                dict(
                    target=target_formatter(target, as_string=True),
                    data=[int(tc) for tc in counts.values],
                )
            )
    desc, desc_code = load_describe(parent.data[parent.selected_col])
    dtype_info = global_state.get_dtype_info(parent.data_id, parent.selected_col)
    for stat in ["skew", "kurt"]:
        if stat in dtype_info:
            desc[stat] = dtype_info[stat]
    return_data["desc"] = desc
    return return_data, self._build_code(parent, kde_code, desc_code)
def clean(s, cleaner, cfg):
    """Apply a single named text cleaner to a string series.

    :param s: pandas Series of strings to clean
    :param cleaner: name of the cleaning operation to run
    :param cfg: dict of cleaner-specific options ("stopwords", "language",
                "caseType", ...)
    :return: cleaned pandas Series
    :raises Exception: if the "nltk_stopwords" cleaner is used without nltk installed

    Note: every ``str.replace`` call passes ``regex=True`` explicitly because
    pandas >= 2.0 changed the default to ``regex=False`` (literal matching),
    which would silently break these pattern-based replacements.
    """
    if cleaner == "drop_multispace":
        # collapse runs of spaces down to a single space
        return s.str.replace(r"[ ]+", " ", regex=True)
    elif cleaner == "drop_punctuation":
        if six.PY3:
            return apply(
                s, lambda x: x.translate(str.maketrans("", "", string.punctuation))
            )
        return apply(s, lambda x: x.translate(None, string.punctuation))
    elif cleaner == "stopwords":
        stopwords = cfg.get("stopwords") or []

        def clean_stopwords(x):
            return " ".join([w for w in x.split(" ") if w not in stopwords])

        return apply(s, clean_stopwords)
    elif cleaner == "nltk_stopwords":
        language = cfg.get("language") or "english"
        try:
            import nltk

            # download corpora on demand; no-ops when already cached locally
            nltk.download("stopwords")
            nltk.download("punkt")
            nltk_stopwords_set = set(nltk.corpus.stopwords.words(language))

            def clean_nltk_stopwords(x):
                return " ".join(
                    [
                        w
                        for w in nltk.tokenize.word_tokenize(x)
                        if w not in nltk_stopwords_set
                    ]
                )

            return apply(s.fillna(""), clean_nltk_stopwords)
        except ImportError:
            raise Exception(
                "You must install the 'nltk' package in order to use this cleaner!"
            )
    elif cleaner == "drop_numbers":
        return s.str.replace(r"[0-9]+", "", regex=True)
    elif cleaner == "keep_alpha":
        return apply(s, lambda x: "".join(c for c in x if c.isalpha()))
    elif cleaner == "normalize_accents":
        # decompose accented characters and strip the non-ASCII combining marks
        return apply(
            s,
            lambda x: unicodedata.normalize("NFKD", u"{}".format(x))
            .encode("ASCII", "ignore")
            .decode("utf-8"),
        )
    elif cleaner == "drop_all_space":
        return s.str.replace(r"[ ]+", "", regex=True)
    elif cleaner == "drop_repeated_words":

        def drop_repeats(val):
            def _load():
                val_segs = val.split(" ")
                for i, v2 in enumerate(val_segs):
                    # keep a word only when it differs from its predecessor
                    if i == 0:
                        yield v2
                    elif val_segs[i - 1] != v2:
                        yield v2

            return " ".join(list(_load()))

        return apply(s, drop_repeats)
    elif cleaner == "add_word_number_space":
        # surround every integer/decimal number with spaces
        return s.str.replace(r"(\d+(\.\d+)?)", r" \1 ", regex=True)
    elif cleaner == "drop_repeated_chars":

        def drop_repeats(val):
            def _load():
                for i, v2 in enumerate(val):
                    # keep a char only when it differs from its predecessor
                    if i == 0:
                        yield v2
                    elif val[i - 1] != v2:
                        yield v2

            return "".join(list(_load()))

        return apply(s, drop_repeats)
    elif cleaner == "update_case":
        case = cfg.get("caseType")
        return getattr(s.str, case)()
def build_column(self, data):
    """Convert the configured column from one dtype to another.

    Reads ``col``/``from``/``to`` (and optionally ``fmt``, ``unit``) from
    ``self.cfg`` and returns a new ``pd.Series`` named ``self.name`` aligned
    to the source index.

    Fix: the date -> int (epoch seconds) branch previously returned a Series
    without ``name``/``index``, unlike every sibling return.

    :raises NotImplementedError: when ``from`` is an unsupported dtype
    """
    col, from_type, to_type = (self.cfg.get(p) for p in ["col", "from", "to"])
    s = data[col]
    classifier = classify_type(from_type)
    if (
        classifier == "S"
    ):  # col can be (str or category) -> date, int, float, bool, category
        if to_type == "date":
            date_kwargs = {}
            if self.cfg.get("fmt"):
                date_kwargs["format"] = self.cfg["fmt"]
            else:
                date_kwargs["infer_datetime_format"] = True
            return pd.Series(
                pd.to_datetime(s, **date_kwargs), name=self.name, index=s.index
            )
        elif to_type == "int":
            # hex-looking strings ("0x...") are parsed base-16; otherwise go
            # through float first so strings like "1.0" still convert
            if s.str.startswith("0x").any():

                def str_hex_to_int(v):
                    return v if pd.isnull(v) else int(v, base=16)

                return pd.Series(
                    apply(s, str_hex_to_int), name=self.name, index=s.index
                )
            return pd.Series(
                s.astype("float").astype("int"), name=self.name, index=s.index
            )
        elif to_type == "float":
            if s.str.startswith("0x").any():
                return pd.Series(
                    apply(s, float.fromhex), name=self.name, index=s.index
                )
            return pd.Series(
                pd.to_numeric(s, errors="coerce"), name=self.name, index=s.index
            )
        else:
            if from_type.startswith("mixed"):
                if to_type == "float":
                    return pd.Series(
                        pd.to_numeric(s, errors="coerce"),
                        name=self.name,
                        index=s.index,
                    )
                elif to_type == "bool":

                    def _process_mixed_bool(v):
                        if isinstance(v, bool):
                            return v
                        if isinstance(v, six.string_types):
                            return dict(true=True, false=False).get(
                                v.lower(), np.nan
                            )
                        return np.nan

                    return pd.Series(
                        apply(s, _process_mixed_bool), name=self.name, index=s.index
                    )
            return pd.Series(s.astype(to_type), name=self.name, index=s.index)
    elif classifier == "I":  # date, float, category, str, bool
        if to_type == "date":
            unit = self.cfg.get("unit") or "D"
            if unit == "YYYYMMDD":
                # e.g. 20200131 -> Timestamp("2020-01-31")
                return pd.Series(
                    apply(s.astype("str"), pd.Timestamp),
                    name=self.name,
                    index=s.index,
                )
            return pd.Series(
                pd.to_datetime(s, unit=unit), name=self.name, index=s.index
            )
        elif to_type == "hex":

            def int_to_hex(v):
                return v if pd.isnull(v) else hex(v)

            return pd.Series(apply(s, int_to_hex), name=self.name, index=s.index)
        return pd.Series(s.astype(to_type), name=self.name, index=s.index)
    elif classifier == "F":  # str, int
        if to_type == "hex":
            return pd.Series(apply(s, float.hex), name=self.name, index=s.index)
        return pd.Series(s.astype(to_type), name=self.name, index=s.index)
    elif classifier == "D":  # str, int
        if to_type == "int":
            unit = self.cfg.get("unit")
            if unit == "YYYYMMDD":
                return pd.Series(
                    s.dt.strftime("%Y%m%d").astype(int),
                    name=self.name,
                    index=s.index,
                )
            # epoch seconds; name/index added for consistency with other branches
            return pd.Series(
                apply(s, lambda x: time.mktime(x.timetuple())).astype(int),
                name=self.name,
                index=s.index,
            )
        return pd.Series(
            s.dt.strftime(self.cfg.get("fmt") or "%Y%m%d"),
            name=self.name,
            index=s.index,
        )
    elif classifier == "B":
        return pd.Series(s.astype(to_type), name=self.name, index=s.index)
    raise NotImplementedError(
        "data type conversion not supported for dtype: {}".format(from_type)
    )