def _fit(self, dataset: Dataset) -> Preprocessor:
    def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
        def get_token_counts(col):
            token_series = df[col].apply(self.tokenization_fn)
            tokens = token_series.sum()
            return Counter(tokens)

        return [get_token_counts(col) for col in self.columns]

    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    total_counts = [Counter() for _ in self.columns]
    for batch in value_counts.iter_batches():
        for i, col_value_counts in enumerate(batch):
            total_counts[i].update(col_value_counts)

    def most_common(counter: Counter, n: int):
        return Counter(dict(counter.most_common(n)))

    top_counts = [
        most_common(counter, self.max_features) for counter in total_counts
    ]
    self.stats_ = {
        f"token_counts({col})": counts
        for (col, counts) in zip(self.columns, top_counts)
    }
    return self
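# Hedged illustration (toy data, no Ray involved): how the per-batch helper above
# reduces to pandas + Counter, assuming `tokenization_fn` is a plain whitespace split.
import pandas as pd
from collections import Counter

df = pd.DataFrame({"text": ["a b a", "b c"]})
token_series = df["text"].apply(lambda s: s.split())
tokens = token_series.sum()  # summing lists concatenates them: ["a", "b", "a", "b", "c"]
counts = Counter(tokens)  # Counter({"a": 2, "b": 2, "c": 1})
top_2 = Counter(dict(counts.most_common(2)))  # keep only the 2 most frequent tokens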
def _get_unique_value_indices(
    dataset: Dataset,
    columns: List[str],
    drop_na_values: bool = False,
    key_format: str = "unique_values({0})",
) -> Dict[str, Dict[str, int]]:
    """If drop_na_values is True, will silently drop NA values."""

    def get_pd_unique_values(df: pd.DataFrame) -> List[Dict[str, set]]:
        return [{col: set(df[col].unique()) for col in columns}]

    uniques = dataset.map_batches(get_pd_unique_values, batch_format="pandas")
    final_uniques = {col: set() for col in columns}
    for batch in uniques.iter_batches():
        for col_uniques in batch:
            for col, col_unique_values in col_uniques.items():
                final_uniques[col].update(col_unique_values)

    for col, unique_values in final_uniques.items():
        if drop_na_values:
            final_uniques[col] = {v for v in unique_values if not pd.isnull(v)}
        else:
            if any(pd.isnull(v) for v in unique_values):
                raise ValueError(
                    f"Unable to fit column '{col}' because it contains null values. "
                    "Consider imputing missing values first."
                )

    unique_values_with_indices = {
        key_format.format(column): {
            k: j for j, k in enumerate(sorted(final_uniques[column]))
        }
        for column in columns
    }
    return unique_values_with_indices
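# Hedged illustration (toy data, no Ray involved): the sorted-then-enumerate step
# above that turns each column's set of unique values into a value -> index mapping.
final_uniques = {"color": {"red", "blue", "green"}}
key_format = "unique_values({0})"
unique_values_with_indices = {
    key_format.format(col): {v: i for i, v in enumerate(sorted(values))}
    for col, values in final_uniques.items()
}
assert unique_values_with_indices == {
    "unique_values(color)": {"blue": 0, "green": 1, "red": 2}
}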
def _fit(self, dataset: Dataset) -> Preprocessor:
    low = self.quantile_range[0]
    med = 0.50
    high = self.quantile_range[1]

    num_records = dataset.count()
    max_index = num_records - 1
    split_indices = [
        int(percentile * max_index) for percentile in (low, med, high)
    ]

    self.stats_ = {}

    # TODO(matt): Handle case where quantile lands between 2 numbers.
    # The current implementation will simply choose the closest index.
    # This will affect the results of small datasets more than large datasets.
    for col in self.columns:
        filtered_dataset = dataset.map_batches(
            lambda df: df[[col]], batch_format="pandas"
        )
        sorted_dataset = filtered_dataset.sort(col)
        _, low_ds, med_ds, high_ds = sorted_dataset.split_at_indices(split_indices)

        def _get_first_value(ds: Dataset, c: str):
            return ds.take(1)[0][c]

        low_val = _get_first_value(low_ds, col)
        med_val = _get_first_value(med_ds, col)
        high_val = _get_first_value(high_ds, col)

        self.stats_[f"low_quantile({col})"] = low_val
        self.stats_[f"median({col})"] = med_val
        self.stats_[f"high_quantile({col})"] = high_val

    return self
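# Hedged illustration (toy numbers, no Ray involved): how quantile fractions are
# mapped to row indices of the sorted column. With 11 records, max_index is 10,
# so a (0.25, 0.75) quantile_range lands on indices 2, 5, and 7 because int()
# truncates rather than interpolating (the TODO above).
quantile_range = (0.25, 0.75)
low, med, high = quantile_range[0], 0.50, quantile_range[1]
num_records = 11
max_index = num_records - 1
split_indices = [int(percentile * max_index) for percentile in (low, med, high)]
assert split_indices == [2, 5, 7]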
def _get_unique_value_indices(
    dataset: Dataset,
    columns: List[str],
    drop_na_values: bool = False,
    key_format: str = "unique_values({0})",
    limit: Optional[Dict[str, int]] = None,
) -> Dict[str, Dict[str, int]]:
    """If drop_na_values is True, will silently drop NA values."""

    limit = limit or {}
    for column in limit:
        if column not in columns:
            raise ValueError(
                f"You set limit for {column}, which is not present in {columns}."
            )

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        return [
            {
                col: Counter(df[col].value_counts(dropna=False).to_dict())
                for col in columns
            }
        ]

    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches():
        for col_value_counts in batch:
            for col, counts in col_value_counts.items():
                final_counters[col] += counts

    # Check whether any column contains NA values.
    for col in columns:
        if drop_na_values:
            counter = final_counters[col]
            sanitized_dict = {k: v for k, v in counter.items() if not pd.isnull(k)}
            final_counters[col] = Counter(sanitized_dict)
        else:
            if any(pd.isnull(k) for k in final_counters[col]):
                raise ValueError(
                    f"Unable to fit column '{col}' because it contains null"
                    " values. Consider imputing missing values first."
                )

    unique_values_with_indices = {}
    for column in columns:
        if column in limit:
            # Output is ordered by descending frequency and capped at the limit.
            unique_values_with_indices[key_format.format(column)] = {
                k[0]: j
                for j, k in enumerate(
                    final_counters[column].most_common(limit[column])
                )
            }
        else:
            # Output is ordered by sorted value.
            unique_values_with_indices[key_format.format(column)] = {
                k: j for j, k in enumerate(sorted(final_counters[column].keys()))
            }
    return unique_values_with_indices
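# Hedged illustration (toy data, no Ray involved): the two orderings produced above.
# A column listed in `limit` is indexed by descending frequency and capped; any
# other column is indexed by sorted value.
from collections import Counter

counter = Counter({"b": 5, "a": 2, "c": 1})
by_frequency = {k: j for j, (k, _) in enumerate(counter.most_common(2))}
assert by_frequency == {"b": 0, "a": 1}
by_value = {k: j for j, k in enumerate(sorted(counter.keys()))}
assert by_value == {"a": 0, "b": 1, "c": 2}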
def _transform(self, dataset: Dataset) -> Dataset:
    # TODO(matt): Expose `batch_size` or similar configurability.
    # The default may be too small for some datasets and too large for others.
    dataset_format = dataset._dataset_format()
    if dataset_format not in ("pandas", "arrow"):
        raise ValueError(
            f"Unsupported Dataset format: '{dataset_format}'. Only 'pandas' and "
            "'arrow' Dataset formats are supported."
        )

    transform_type = self._determine_transform_to_use(dataset_format)
    if transform_type == "pandas":
        return dataset.map_batches(self._transform_pandas, batch_format="pandas")
    elif transform_type == "arrow":
        return dataset.map_batches(self._transform_arrow, batch_format="pyarrow")
    else:
        raise ValueError(
            "Invalid transform type returned from _determine_transform_to_use; "
            f'"pandas" and "arrow" allowed, but got: {transform_type}'
        )
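# Hedged sketch, not part of the original code: a minimal subclass relying on the
# dispatch above to route pandas batches into `_transform_pandas`. The class name,
# constructor, and column arithmetic here are illustrative assumptions only.
class AddOnePreprocessor(Preprocessor):
    def __init__(self, columns: List[str]):
        self.columns = columns

    def _transform_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
        # Runs once per pandas batch produced by map_batches.
        for col in self.columns:
            df[col] = df[col] + 1
        return df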
def _get_most_frequent_values(
    dataset: Dataset, *columns: str
) -> Dict[str, Union[str, Number]]:
    columns = list(columns)

    def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]:
        return [Counter(df[col].value_counts().to_dict()) for col in columns]

    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    final_counters = [Counter() for _ in columns]
    for batch in value_counts.iter_batches():
        for i, col_value_counts in enumerate(batch):
            final_counters[i] += col_value_counts

    return {
        f"most_frequent({column})": final_counters[i].most_common(1)[0][0]
        for i, column in enumerate(columns)
    }
def _get_most_frequent_values(
    dataset: Dataset, *columns: str
) -> Dict[str, Union[str, Number]]:
    columns = list(columns)

    def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]:
        return [
            {col: Counter(df[col].value_counts().to_dict()) for col in columns}
        ]

    value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas")
    final_counters = {col: Counter() for col in columns}
    for batch in value_counts.iter_batches():
        for col_value_counts in batch:
            for col, counts in col_value_counts.items():
                final_counters[col] += counts

    return {
        f"most_frequent({column})": final_counters[column].most_common(1)[0][0]
        for column in columns
    }
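# Hedged illustration (toy data, no Ray involved): Counter addition is what merges
# per-batch value counts into one global count before `most_common(1)` is taken.
from collections import Counter

batch_1 = Counter({"red": 3, "blue": 1})
batch_2 = Counter({"blue": 4, "green": 2})
merged = batch_1 + batch_2  # Counter({"blue": 5, "red": 3, "green": 2})
assert merged.most_common(1)[0][0] == "blue"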
def _get_most_frequent_values(
    dataset: Dataset, *columns: str
) -> Dict[str, Union[str, Number]]:
    # TODO(matt): Optimize this.
    results = {}
    for column in columns:
        # Remove nulls.
        nonnull_dataset = dataset.map_batches(
            lambda df: df.dropna(subset=[column]), batch_format="pandas"
        )
        # Count values.
        counts = nonnull_dataset.groupby(column).count()
        # Find max count.
        max_aggregate = counts.aggregate(Max("count()"))
        max_count = max_aggregate["max(count())"]
        # Find values with max_count.
        most_frequent_values = counts.map_batches(
            lambda df: df.drop(df[df["count()"] < max_count].index),
            batch_format="pandas",
        )
        # Take first (sorted) value.
        most_frequent_value_count = most_frequent_values.take(1)[0]
        most_frequent_value = most_frequent_value_count[column]
        results[f"most_frequent({column})"] = most_frequent_value
    return results
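# Hedged illustration (pandas stand-in, no Ray involved): the same logic as the
# groupby/aggregate pipeline above, i.e. drop nulls, count values, keep the values
# tied for the max count, and break ties by taking the first value in sorted order.
import pandas as pd

s = pd.Series(["b", "a", "b", "a", None])
counts = s.dropna().value_counts()  # b: 2, a: 2
max_count = counts.max()
tied = sorted(counts[counts == max_count].index)  # ["a", "b"]
assert tied[0] == "a"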
def _transform(self, dataset: Dataset) -> Dataset:
    # TODO(matt): Expose `batch_size` or similar configurability.
    # The default may be too small for some datasets and too large for others.
    return dataset.map_batches(self._transform_pandas, batch_format="pandas")