def get_reduced_embeddings_df(data, embedder: embeddings.EmbeddingVectorizer, reducer: base.BaseEstimator):
    """Embed `data`, then project the embeddings to a lower dimension.

    The pipeline has two stages:

    1. ``embedder.transform(data)`` performs the feature extraction.
    2. ``reducer.fit_transform(...)`` is fitted on those embeddings and
       applied to them in the same call.

    Args:
        data: Raw input handed directly to ``embedder.transform``.
        embedder: Vectorizer used for feature extraction; only its
            ``transform`` method is called here.
        reducer: Dimensionality-reduction estimator; it is fitted on the
            embeddings by this call.

    Returns:
        Whatever ``reducer.fit_transform`` returns, passed through as-is
        (this function does not wrap it in another container).
    """
    return reducer.fit_transform(embedder.transform(data))
def _impute(self, imputer: BaseEstimator, target: str) -> None:
    """Fill in missing values for the numeric feature columns, in place.

    Every column listed in ``self.numeric_features`` except `target` is
    passed through ``imputer.fit_transform`` and the result is written back
    into the same columns of ``self.processed_data``. The target column is
    never imputed. Nothing happens when no such column remains.

    Args:
        imputer (BaseEstimator): Instance used to impute the data. Must
            provide a working ``fit_transform`` method.
        target (str): Column name of the target variable to leave untouched.
    """
    # Set difference drops the target (and de-duplicates the column names).
    columns = list(set(self.numeric_features).difference({target}))
    if not columns:
        return

    imputed = imputer.fit_transform(self.processed_data.loc[:, columns])
    self.processed_data.loc[:, columns] = imputed
def _scale_data(self, scaler: BaseEstimator, target: str, scale_target: bool) -> None:
    """Scale the numeric columns of ``self.processed_data`` in place.

    The selected columns are passed through ``scaler.fit_transform`` and the
    result is written back into the same columns. Nothing happens when the
    selection is empty.

    Args:
        scaler (BaseEstimator): Instance used to scale the data. Must
            provide a working ``fit_transform`` method.
        target (str): Column name of the target variable.
        scale_target (bool): When true, every entry of
            ``self.numeric_features`` is scaled; when false, `target` is
            removed from the selection first.
    """
    columns = (
        self.numeric_features
        if scale_target
        # Set difference drops the target (and de-duplicates the names).
        else list(set(self.numeric_features).difference({target}))
    )
    if not columns:
        return

    scaled = scaler.fit_transform(self.processed_data.loc[:, columns])
    self.processed_data.loc[:, columns] = scaled
def transform_dataset(self, algorithm: BaseEstimator, n_folds: int = 5) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """Apply `algorithm` to the configured dataset and return the result.

    The dataset is loaded with
    ``self.dataset.load(self.s3_config, self.s3_bucket)`` and split into the
    features ``X`` (every column except ``self.dataset.class_column``) and
    the labels ``y``.

    * If ``is_classifier(algorithm)`` is true, labels are predicted with
      ``cross_val_predict`` using `n_folds` folds, evaluation metrics are
      computed from those predictions, and the predictions are appended to
      ``X`` as one extra column.
    * Otherwise ``fit_transform(X, y)`` is called when the algorithm has
      that method, else ``fit(X, y).transform(X)``; the output is wrapped
      in a DataFrame.

    In both cases the returned frame's columns are renumbered ``0..k-1``.

    Args:
        algorithm (BaseEstimator): Classifier or transformer to apply.
        n_folds (int): Number of cross-validation folds used for the
            classifier predictions. Ignored for transformers.

    Returns:
        A ``(frame, metrics)`` tuple. For classifiers ``metrics`` holds
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``neg_log_loss``
        and ``roc_auc``; for transformers it is an empty dict.
        ``neg_log_loss`` is currently always NaN (see the note in the body).
    """
    # Load the input dataset and split it into features and labels.
    df = self.dataset.load(self.s3_config, self.s3_bucket)
    class_column = self.dataset.class_column
    X, y = df.drop(class_column, axis=1), df[class_column]

    if is_classifier(algorithm):
        # Predict labels with n-fold cross validation.
        y_pred = cross_val_predict(algorithm, X, y, cv=n_folds)

        # Evaluation metrics derived from the hard label predictions.
        accuracy = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred, average='weighted')
        recall = recall_score(y, y_pred, average='weighted')
        f1 = f1_score(y, y_pred, average='weighted')
        # TODO: compute the real log loss. The previous code returned a
        # `log_loss` name that was never assigned in this function. Until a
        # proper computation exists, report NaN so the key stays present and
        # the value stays a float.
        neg_log_loss = float('nan')
        roc_auc = multiclass_roc_auc_score(y, y_pred, average='weighted')

        # Reuse X's index for the predictions: pd.concat(axis=1) aligns on
        # the index, so a fresh 0..n-1 index would misalign the rows (and
        # introduce NaN rows) whenever the loaded frame is not
        # default-indexed.
        predictions = pd.Series(y_pred, index=X.index)
        X = pd.concat([X, predictions], axis=1)
        X.columns = range(X.shape[1])

        return X, {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'neg_log_loss': neg_log_loss,
            'roc_auc': roc_auc,
        }

    # Not a classifier: use fit_transform when the algorithm has it,
    # otherwise fit on (X, y) and then transform X. The transformed dataset
    # replaces X.
    if hasattr(algorithm, 'fit_transform'):
        X = algorithm.fit_transform(X, y)
    else:
        # noinspection PyUnresolvedReferences
        X = algorithm.fit(X, y).transform(X)

    X = pd.DataFrame(data=X, index=range(X.shape[0]), columns=range(X.shape[1]))
    return X, {}