def fit(self, X, y=None):
    """Build the wrapped scikit-learn model from the stored hyperparameters and fit it.

    ``y`` is forwarded only when one is supplied. Returns ``self`` so calls
    can be chained, following the scikit-learn convention.
    """
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is None:
        self._sklearn_model.fit(X)
    else:
        self._sklearn_model.fit(X, y)
    return self
def __init__(self, method='yeo-johnson', standardize=True, copy=True):
    """Record the constructor arguments and eagerly build the wrapped model.

    The hyperparameters are kept on the instance so the underlying
    scikit-learn model can be reconstructed later from the same settings.
    """
    self._hyperparams = dict(method=method, standardize=standardize, copy=copy)
    self._wrapped_model = SKLModel(**self._hyperparams)
class PowerTransformerImpl():
    """Thin wrapper delegating to a scikit-learn model (``SKLModel``).

    Stores the hyperparameters at construction time and builds the
    underlying model lazily on :meth:`fit`.
    """

    def __init__(self, method='yeo-johnson', standardize=True, copy=True):
        # Only stash the settings here; the model is created in fit().
        self._hyperparams = {
            'method': method,
            'standardize': standardize,
            'copy': copy,
        }

    def fit(self, X, y=None):
        """Instantiate the underlying model and fit it, returning ``self``."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is None:
            self._sklearn_model.fit(X)
        else:
            self._sklearn_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate the transformation to the fitted underlying model."""
        return self._sklearn_model.transform(X)
def get_transformed_numeric_data(self):
    """Transforms the numeric data from the dask dataframe contained in
    'self.dask_df', selected based on the contents of 'self.num_labels'.

    Each column is Box-Cox-transformed with a per-column ``PowerTransformer``
    (fitted and persisted during training, reloaded during prediction), then
    the whole numeric block is passed through a persisted imputer/normalizer/
    scaler ``Pipeline``.

    Returns:
        A dask dataframe with the scaled and transformed columns.
    """
    boxcoxed_columns = {}
    for label in self.num_labels:
        # Add 1 to shift the data right and avoid zeros; compute() pulls the
        # column out of dask so its values can be reshaped into a 2-D array.
        shifted = self.dask_df[label].compute().values.reshape(-1, 1) + 1
        # Per-column file holding the 'PowerTransformer' with the fitted
        # lambdas, keyed by model directory, day of the last training
        # (current day when preprocessing for training) and a unique hash.
        pkl_file = f'{self.models_dir}/{self.day_as_str}_{self.unique_hash}_ga_chp_box_cox_{label}.pkl'
        if self.training_or_prediction == 'prediction':
            # Predicting: reuse the transformer fitted at training time.
            transformer = joblib.load(pkl_file)
        else:
            # Training: fit a fresh Box-Cox transformer and persist it.
            transformer = PowerTransformer(method='box-cox')
            transformer.fit(shifted)
            joblib.dump(transformer, pkl_file)
        boxcoxed_columns[label] = transformer.transform(shifted).T.tolist()[0]

    # Reassemble the Box-Cox-transformed columns into a single array and
    # build a dask dataframe from it.
    bc_array = np.array([boxcoxed_columns[label] for label in self.num_labels]).transpose()
    transformed_bc_data = dd.from_array(bc_array, chunksize=200000, columns=self.num_labels)

    # Analogous .pkl path for the 'Pipeline' object with the fitted
    # hyperparameters.
    pkl_file = f'{self.models_dir}/{self.day_as_str}_{self.unique_hash}_ga_chp_pipeline.pkl'
    if self.training_or_prediction == 'prediction':
        # Predicting: reuse the pipeline fitted at training time.
        pipeline = joblib.load(pkl_file)
    else:
        # Training: build, fit and persist the scaling pipeline.
        pipeline = Pipeline([
            # Replace zeros with the mean value.
            ('imputer', SimpleImputer(strategy="mean", missing_values=0)),
            # Scale into the (0, 1) interval.
            ('normalizer', Normalizer()),
            # Subtract the mean and divide by the variance.
            ('scaler', StandardScaler()),
        ])
        pipeline.fit(transformed_bc_data)
        joblib.dump(pipeline, pkl_file)
    transformed_numeric = pipeline.transform(transformed_bc_data)
    return dd.from_array(transformed_numeric, chunksize=200000, columns=self.num_labels)