def _fit(self, dataset: Dataset) -> Preprocessor:
    """Record the low, median and high quantile value of every column.

    The two bounds in ``self.quantile_range`` and the fixed median (0.50) are
    turned into row indices once, from the total record count. Each column in
    ``self.columns`` is then projected out, sorted and split at those
    indices; the first row of each of the last three splits supplies the
    value written to ``self.stats_``.

    Args:
        dataset: Dataset to compute the statistics from.

    Returns:
        This preprocessor, with ``stats_`` holding the keys
        ``low_quantile(<col>)``, ``median(<col>)`` and
        ``high_quantile(<col>)`` for each column.
    """

    def first_value_of(ds: Dataset, c: str):
        return ds.take(1)[0][c]

    quantiles = (self.quantile_range[0], 0.50, self.quantile_range[1])
    last_index = dataset.count() - 1
    split_indices = [int(quantile * last_index) for quantile in quantiles]

    self.stats_ = {}

    # TODO(matt): Handle case where quantile lands between 2 numbers.
    # The current implementation will simply choose the closest index.
    # This will affect the results of small datasets more than large datasets.
    for col in self.columns:
        single_column = dataset.map_batches(
            lambda df: df[[col]], batch_format="pandas"
        )
        ordered = single_column.sort(col)
        # The leading split (rows before the low index) is not needed.
        _, low_part, med_part, high_part = ordered.split_at_indices(split_indices)

        # Fetch all three values before touching ``stats_``.
        low_val, med_val, high_val = (
            first_value_of(part, col) for part in (low_part, med_part, high_part)
        )
        self.stats_.update(
            {
                f"low_quantile({col})": low_val,
                f"median({col})": med_val,
                f"high_quantile({col})": high_val,
            }
        )

    return self
def train_test_split(
    dataset: Dataset,
    test_size: Union[int, float],
    *,
    shuffle: bool = False,
    seed: Optional[int] = None,
) -> Tuple[Dataset, Dataset]:
    """Split a Dataset into train and test subsets.

    Example:

        .. code-block:: python

            import ray
            from ray.ml import train_test_split

            ds = ray.data.range(8)
            train, test = train_test_split(ds, test_size=0.25)
            print(train.take())  # [0, 1, 2, 3, 4, 5]
            print(test.take())  # [6, 7]

    Args:
        dataset: Dataset to split.
        test_size: If float, should be between 0.0 and 1.0 and represent the
            proportion of the dataset to include in the test split. If int,
            represents the absolute number of test samples. The train split
            will always be the complement of the test split.
        shuffle: Whether or not to globally shuffle the dataset before
            splitting. Defaults to False. This may be a very expensive
            operation with large datasets.
        seed: Fix the random seed to use for shuffle, otherwise one will be
            chosen based on system randomness. Ignored if ``shuffle=False``.

    Returns:
        Train and test subsets as two Datasets.

    Raises:
        TypeError: If ``test_size`` is neither an int nor a float.
        ValueError: If a float ``test_size`` is not strictly between 0 and 1
            (NaN included), or an int ``test_size`` is not strictly between
            0 and the number of records in ``dataset``.
    """
    if not isinstance(test_size, (int, float)):
        raise TypeError(f"`test_size` must be int or float got {type(test_size)}.")

    split_by_fraction = isinstance(test_size, float)

    # Validate everything before shuffling: the shuffle may be very expensive
    # and must not be paid for (or fail) when the arguments are invalid.
    if split_by_fraction:
        # A chained comparison also rejects NaN, which compares False to
        # every bound and would slip through `<= 0 or >= 1`.
        if not 0 < test_size < 1:
            raise ValueError(
                "If `test_size` is a float, it must be bigger than 0 and smaller than "
                f"1. Got {test_size}."
            )
    else:
        # Shuffling does not change the record count, so it is safe to count
        # the dataset before shuffling it.
        dataset_length = dataset.count()
        if test_size <= 0 or test_size >= dataset_length:
            raise ValueError(
                "If `test_size` is an int, it must be bigger than 0 and smaller than "
                f"the size of the dataset ({dataset_length}). Got {test_size}."
            )

    if shuffle:
        dataset = dataset.random_shuffle(seed=seed)

    if split_by_fraction:
        return dataset.split_proportionately([1 - test_size])
    return dataset.split_at_indices([dataset_length - test_size])
def _get_unique_value_indices( dataset: Dataset, columns: List[str], drop_na_values: bool = False, key_format: str = "unique_values({0})", ) -> Dict[str, Dict[str, int]]: """If drop_na_values is True, will silently drop NA values.""" def get_pd_unique_values(df: pd.DataFrame) -> List[Dict[str, set]]: return [{col: set(df[col].unique()) for col in columns}] uniques = dataset.map_batches(get_pd_unique_values, batch_format="pandas") final_uniques = {col: set() for col in columns} for batch in uniques.iter_batches(): for col_uniques in batch: for col, uniques in col_uniques.items(): final_uniques[col].update(uniques) for col, uniques in final_uniques.items(): if drop_na_values: final_uniques[col] = {v for v in uniques if not pd.isnull(v)} else: if any(pd.isnull(v) for v in uniques): raise ValueError( f"Unable to fit column '{col}' because it contains null values. " f"Consider imputing missing values first.") unique_values_with_indices = { key_format.format(column): {k: j for j, k in enumerate(sorted(final_uniques[column]))} for column in columns } return unique_values_with_indices
def _fit(self, dataset: Dataset) -> Preprocessor: def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]: def get_token_counts(col): token_series = df[col].apply(self.tokenization_fn) tokens = token_series.sum() return Counter(tokens) return [get_token_counts(col) for col in self.columns] value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") total_counts = [Counter() for _ in self.columns] for batch in value_counts.iter_batches(): for i, col_value_counts in enumerate(batch): total_counts[i].update(col_value_counts) def most_common(counter: Counter, n: int): return Counter(dict(counter.most_common(n))) top_counts = [ most_common(counter, self.max_features) for counter in total_counts ] self.stats_ = { f"token_counts({col})": counts for (col, counts) in zip(self.columns, top_counts) } return self
def _get_unique_value_indices( dataset: Dataset, columns: List[str], drop_na_values: bool = False, key_format: str = "unique_values({0})", limit: Optional[Dict[str, int]] = None, ) -> Dict[str, Dict[str, int]]: """If drop_na_values is True, will silently drop NA values.""" limit = limit or {} for column in limit: if column not in columns: raise ValueError( f"You set limit for {column}, which is not present in {columns}." ) def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]: result = [ { col: Counter(df[col].value_counts(dropna=False).to_dict()) for col in columns } ] return result value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") final_counters = {col: Counter() for col in columns} for batch in value_counts.iter_batches(): for col_value_counts in batch: for col, value_counts in col_value_counts.items(): final_counters[col] += value_counts # Inspect if there is any NA values. for col in columns: if drop_na_values: counter = final_counters[col] counter_dict = dict(counter) sanitized_dict = {k: v for k, v in counter_dict.items() if not pd.isnull(k)} final_counters[col] = Counter(sanitized_dict) else: if any(pd.isnull(k) for k in final_counters[col]): raise ValueError( f"Unable to fit column '{col}' because it contains null" f" values. Consider imputing missing values first." ) unique_values_with_indices = dict() for column in columns: if column in limit: # Output sorted by freq. unique_values_with_indices[key_format.format(column)] = { k[0]: j for j, k in enumerate(final_counters[column].most_common(limit[column])) } else: # Output sorted by column name. unique_values_with_indices[key_format.format(column)] = { k: j for j, k in enumerate(sorted(dict(final_counters[column]).keys())) } return unique_values_with_indices
def _fit(self, dataset: Dataset) -> Preprocessor: if self.strategy == "mean": aggregates = [Mean(col) for col in self.columns] self.stats_ = dataset.aggregate(*aggregates) elif self.strategy == "most_frequent": self.stats_ = _get_most_frequent_values(dataset, *self.columns) return self
def _transform(self, dataset: Dataset) -> Dataset: # TODO(matt): Expose `batch_size` or similar configurability. # The default may be too small for some datasets and too large for others. dataset_format = dataset._dataset_format() if dataset_format not in ("pandas", "arrow"): raise ValueError( f"Unsupported Dataset format: '{dataset_format}'. Only 'pandas' and " "'arrow' Dataset formats are supported.") transform_type = self._determine_transform_to_use(dataset_format) if transform_type == "pandas": return dataset.map_batches(self._transform_pandas, batch_format="pandas") elif transform_type == "arrow": return dataset.map_batches(self._transform_arrow, batch_format="pyarrow") else: raise ValueError( "Invalid transform type returned from _determine_transform_to_use; " f'"pandas" and "arrow" allowed, but got: {transform_type}')
def _get_most_frequent_values(dataset: Dataset, *columns: str) -> Dict[str, Union[str, Number]]: columns = list(columns) def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]: return [Counter(df[col].value_counts().to_dict()) for col in columns] value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") final_counters = [Counter() for _ in columns] for batch in value_counts.iter_batches(): for i, col_value_counts in enumerate(batch): final_counters[i] += col_value_counts return { f"most_frequent({column})": final_counters[i].most_common(1)[0][0] for i, column in enumerate(columns) }
def _get_most_frequent_values(dataset: Dataset, *columns: str) -> Dict[str, Union[str, Number]]: columns = list(columns) def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]: return [{ col: Counter(df[col].value_counts().to_dict()) for col in columns }] value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") final_counters = {col: Counter() for col in columns} for batch in value_counts.iter_batches(): for col_value_counts in batch: for col, value_counts in col_value_counts.items(): final_counters[col] += value_counts return { f"most_frequent({column})": final_counters[column].most_common(1)[0][0] for column in columns }
def _get_most_frequent_values(dataset: Dataset, *columns: str) -> Dict[str, Union[str, Number]]:
    """Find the most frequent non-null value of each requested column.

    For every column, rows with a null in that column are dropped, the rest
    are grouped by value and counted, and the first remaining row whose
    count equals the maximum count supplies the result.

    Args:
        dataset: Dataset to scan.
        *columns: Names of the columns to inspect.

    Returns:
        A dict mapping ``most_frequent(<column>)`` to the selected value.
    """
    # TODO(matt): Optimize this.
    results = {}
    for column in columns:

        def drop_missing(df):
            return df.dropna(subset=[column])

        # Remove nulls, then count how often each value occurs.
        value_counts = (
            dataset.map_batches(drop_missing, batch_format="pandas")
            .groupby(column)
            .count()
        )

        # Highest count observed for any single value.
        highest = value_counts.aggregate(Max("count()"))["max(count())"]

        def keep_top(df):
            below_max = df[df["count()"] < highest]
            return df.drop(below_max.index)

        # Keep only the values that reach the highest count.
        top_rows = value_counts.map_batches(keep_top, batch_format="pandas")

        # Take first (sorted) value.
        first_top_row = top_rows.take(1)[0]
        results[f"most_frequent({column})"] = first_top_row[column]

    return results
def _transform(self, dataset: Dataset) -> Dataset: # TODO(matt): Expose `batch_size` or similar configurability. # The default may be too small for some datasets and too large for others. return dataset.map_batches(self._transform_pandas, batch_format="pandas")
def _fit(self, dataset: Dataset) -> Preprocessor:
    """Compute the minimum and maximum of every column in ``self.columns``.

    Args:
        dataset: Dataset to aggregate.

    Returns:
        This preprocessor, with ``stats_`` set to the aggregation result.
    """
    # All ``Min`` aggregates come first, followed by all ``Max`` aggregates.
    aggregates = []
    for aggregate_cls in [Min, Max]:
        for name in self.columns:
            aggregates.append(aggregate_cls(name))

    self.stats_ = dataset.aggregate(*aggregates)
    return self
def _fit(self, dataset: Dataset) -> Preprocessor: mean_aggregates = [Mean(col) for col in self.columns] std_aggregates = [Std(col, ddof=self.ddof) for col in self.columns] self.stats_ = dataset.aggregate(*mean_aggregates, *std_aggregates) return self
def _get_first_value(ds: Dataset, c: str): return ds.take(1)[0][c]
def _get_unique_values(dataset: Dataset, column: str) -> Set[str]: agg_ds = dataset.groupby(column).count() # TODO: Support an upper limit by using `agg_ds.take(N)` instead. return {row[column] for row in agg_ds.iter_rows()}
def get_max(ds: Dataset):
    """Return the result of aggregating ``Max`` over the ``"value"`` column.

    Args:
        ds: Dataset to aggregate.

    Returns:
        Whatever ``ds.aggregate`` returns for the single ``Max`` aggregate.
    """
    max_of_value = Max("value")
    return ds.aggregate(max_of_value)
def get_max_a(ds: Dataset):
    """Return the result of aggregating ``Max`` over column ``"A"``.

    Args:
        ds: Dataset to aggregate.

    Returns:
        Whatever ``ds.aggregate`` returns for the single ``Max`` aggregate.
    """
    # Calculate max value for column A.
    return ds.aggregate(Max("A"))
def execute_if_needed(self, ds: Dataset) -> Dataset:
    """Fully execute ``ds`` unless its UUID has already been recorded.

    Args:
        ds: Dataset to check against ``self.set``.

    Returns:
        ``ds`` itself when its ``_uuid`` is already in ``self.set``;
        otherwise the result of ``ds.fully_executed()``, whose ``_uuid`` is
        added to ``self.set`` before returning.
    """
    if ds._uuid in self.set:
        return ds

    executed = ds.fully_executed()
    # The UUID recorded is the one carried by the executed dataset.
    self.set.add(executed._uuid)
    return executed