def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
    """Join pre-computed groupby statistics onto ``df`` for each selected
    categorical column (or multi-column group), preserving row order.

    Parameters
    ----------
    col_selector : ColumnSelector
        Columns / column groups to transform.
    df : DataFrameType
        Input partition (pandas- or cudf-like); left otherwise unmodified.

    Returns
    -------
    DataFrameType
        A new dataframe containing only the joined statistic columns.
    """
    new_df = type(df)()
    tmp = "__tmp__"  # Temporary column for sorting
    # Row-order marker: merges don't preserve order, so we sort on this after.
    df[tmp] = _arange(len(df), like_df=df, dtype="int32")
    cat_names, multi_col_group = nvt_cat._get_multicolumn_names(
        col_selector, df.columns, self.name_sep
    )
    _read_pq_func = _read_parquet_dispatch(df)
    for name in cat_names:
        storage_name = self.storage_name.get(name, name)
        # A multi-column group is stored under a joined name; recover the tuple.
        name = multi_col_group.get(name, name)
        path = self.categories[storage_name]
        # Left keys come from df; right keys from the stats file
        # (single columns are stored under their storage name).
        selection_l = list(name) if isinstance(name, tuple) else [name]
        selection_r = list(name) if isinstance(name, tuple) else [storage_name]
        stat_df = nvt_cat._read_groupby_stat_df(path, storage_name, self.cat_cache, _read_pq_func)
        tran_df = df[selection_l + [tmp]].merge(
            stat_df, left_on=selection_l, right_on=selection_r, how="left"
        )
        # Restore the original row order, then drop the join keys and marker.
        tran_df = tran_df.sort_values(tmp)
        tran_df.drop(columns=selection_l + [tmp], inplace=True)
        # Only keep statistic columns we haven't already collected.
        new_cols = [c for c in tran_df.columns if c not in new_df.columns]
        new_part = tran_df[new_cols].reset_index(drop=True)
        new_df = _concat_columns([new_df, new_part])
    # Clean the temporary marker off the caller's dataframe.
    df.drop(columns=[tmp], inplace=True)
    return new_df
def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
    """Apply target encoding to each categorical column group in ``columns``.

    Parameters
    ----------
    columns : ColumnNames
        Iterable of column names, tuples of names, or lists of names.
    df : DataFrameType
        Input partition; temporary helper columns are removed before return.

    Returns
    -------
    DataFrameType
        Encoded output columns (plus the fold column when folds are kept).
    """
    # Add temporary column for sorting
    tmp = "__tmp__"
    df[tmp] = _arange(len(df), like_df=df, dtype="int32")
    fit_folds = self.kfold > 1
    if fit_folds:
        df[self.fold_name] = _add_fold(df.index, self.kfold, self.fold_seed)
    # Need mean of continuous target column
    y_mean = self.target_mean or self.means
    # Loop over categorical-column groups and apply logic
    new_df = None
    for ind, cat_group in enumerate(columns):
        # Normalize each group to a list of column names.
        if isinstance(cat_group, tuple):
            cat_group = list(cat_group)
        elif isinstance(cat_group, str):
            cat_group = [cat_group]
        if new_df is None:
            new_df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
        else:
            _df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
            new_df = _concat_columns([new_df, _df])
    # Drop temporary columns. Use self.fold_name (not a hardcoded "__fold__")
    # so the drop stays consistent with the column added above.
    df.drop(
        columns=[tmp, self.fold_name] if fit_folds and self.drop_folds else [tmp],
        inplace=True,
    )
    if fit_folds and not self.drop_folds:
        new_df[self.fold_name] = df[self.fold_name]
    return new_df
def _transform_partition(root_df, column_groups):
    """Transforms a single partition by applying all operators in a ColumnGroup"""
    output = None
    for column_group in column_groups:
        unique_flattened_cols = _get_unique(column_group.flattened_columns)
        # collect dependencies recursively if we have parents
        if column_group.parents:
            df = None
            columns = None
            for parent in column_group.parents:
                unique_flattened_cols_parent = _get_unique(parent.flattened_columns)
                parent_df = _transform_partition(root_df, [parent])
                if df is None or not len(df):
                    df = parent_df[unique_flattened_cols_parent]
                    columns = set(unique_flattened_cols_parent)
                else:
                    # Only append columns we haven't already collected.
                    new_columns = set(unique_flattened_cols_parent) - columns
                    df = _concat_columns([df, parent_df[list(new_columns)]])
                    columns.update(new_columns)
        else:
            # otherwise select the input from the root df
            df = root_df[unique_flattened_cols]

        # apply the operator if necessary
        if column_group.op:
            try:
                df = column_group.op.transform(column_group.input_column_names, df)
            except Exception:
                LOG.exception("Failed to transform operator %s", column_group.op)
                raise
            if df is None:
                raise RuntimeError(
                    "Operator %s didn't return a value during transform" % column_group.op
                )

        # dask needs output to be in the same order defined as meta, reorder partitions here
        # this also selects columns (handling the case of removing columns from the output using
        # "-" overload)
        # NOTE: must compare against None explicitly — pandas/cudf DataFrames
        # raise ValueError on truth-value evaluation ("if not output" would
        # crash on the second column group).
        if output is None:
            output = df[unique_flattened_cols]
        else:
            output = _concat_columns([output, df[unique_flattened_cols]])
    return output
def _concat_tensors(self, tensors, kind):
    """Merge a list of per-operator results into a single object.

    Dataframe-kind inputs are column-concatenated into a new frame; otherwise
    the inputs are treated as dict-like tensor collections and folded into the
    first one (which is mutated in place and returned).
    """
    dataframe_kinds = Supports.GPU_DATAFRAME | Supports.CPU_DATAFRAME
    if kind & dataframe_kinds:
        return _concat_columns(tensors)
    merged, *rest = tensors
    for extra in rest:
        merged.update(extra)
    return merged
def _transform_partition(root_df, workflow_nodes, additional_columns=None):
    """Transforms a single partition by applying all operators in a WorkflowNode

    Parameters
    ----------
    root_df : dataframe-like
        The full input partition; columns are selected from it for input
        nodes and for dependency columns not produced by any parent.
    workflow_nodes : list
        Nodes to evaluate; each node's parents are transformed recursively.
    additional_columns : list, optional
        Extra column names to pass through from ``root_df`` onto the output.

    Returns
    -------
    dataframe-like
        Column-concatenated outputs of all nodes, in output-schema order.
    """
    output = None
    for node in workflow_nodes:
        node_input_cols = _get_unique(node.input_schema.column_names)
        node_output_cols = _get_unique(node.output_schema.column_names)
        addl_input_cols = set(node.dependency_columns.names)

        # Build input dataframe
        if node.parents_with_dependencies:
            # If there are parents, collect their outputs
            # to build the current node's input
            input_df = None
            seen_columns = None

            for parent in node.parents_with_dependencies:
                parent_output_cols = _get_unique(parent.output_schema.column_names)
                parent_df = _transform_partition(root_df, [parent])
                # First non-empty parent result seeds the input frame;
                # later parents contribute only columns not already seen.
                if input_df is None or not len(input_df):
                    input_df = parent_df[parent_output_cols]
                    seen_columns = set(parent_output_cols)
                else:
                    new_columns = set(parent_output_cols) - seen_columns
                    input_df = _concat_columns([input_df, parent_df[list(new_columns)]])
                    seen_columns.update(new_columns)

            # Check for additional input columns that aren't generated by parents
            # and fetch them from the root dataframe
            unseen_columns = set(node.input_schema.column_names) - seen_columns
            addl_input_cols = addl_input_cols.union(unseen_columns)

            # TODO: Find a better way to remove dupes
            addl_input_cols = addl_input_cols - set(input_df.columns)
            if addl_input_cols:
                input_df = _concat_columns([input_df, root_df[list(addl_input_cols)]])
        else:
            # If there are no parents, this is an input node,
            # so pull columns directly from root df
            input_df = root_df[node_input_cols + list(addl_input_cols)]

        # Compute the node's output
        if node.op:
            try:
                # use input_columns to ensure correct grouping (subgroups)
                selection = node.input_columns.resolve(node.input_schema)
                output_df = node.op.transform(selection, input_df)
            except Exception:
                # Log with traceback, then re-raise so the failure surfaces.
                LOG.exception("Failed to transform operator %s", node.op)
                raise
            if output_df is None:
                raise RuntimeError("Operator %s didn't return a value during transform" % node.op)
        else:
            # Identity node: pass the assembled input straight through.
            output_df = input_df

        # Combine output across node loop iterations
        # dask needs output to be in the same order defined as meta, reorder partitions here
        # this also selects columns (handling the case of removing columns from the output using
        # "-" overload)
        if output is None:
            output = output_df[node_output_cols]
        else:
            output = _concat_columns([output, output_df[node_output_cols]])

    if additional_columns:
        output = _concat_columns([output, root_df[_get_unique(additional_columns)]])

    return output