Пример #1
0
class FeatureUnion_df(TransformerMixin, BaseEstimator):
    '''
    Wrapper of FeatureUnion that returns a DataFrame.

    The column order follows the concatenation done by FeatureUnion.

    Parameters
    ----------
    transformer_list : list of (name, Pipeline) tuples
        The last step of every pipeline must expose ``get_features_name()``;
        the names it returns become the output column labels.
    n_jobs, transformer_weights, verbose
        Forwarded unchanged to the wrapped FeatureUnion.
    '''

    # Constructor arguments that are mirrored between this wrapper and the
    # wrapped FeatureUnion.
    _UNION_PARAMS = ('transformer_list', 'n_jobs', 'transformer_weights', 'verbose')

    def __init__(self, transformer_list, n_jobs=None, transformer_weights=None, verbose=False):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose  # these are necessary to work inside of GridSearch or similar
        # Keyword arguments: everything after `transformer_list` is
        # keyword-only in current scikit-learn releases, so the positional
        # form raises TypeError there.
        self.feat_un = FeatureUnion(transformer_list=self.transformer_list,
                                    n_jobs=self.n_jobs,
                                    transformer_weights=self.transformer_weights,
                                    verbose=self.verbose)

    def fit(self, X, y=None):
        '''Fit the wrapped FeatureUnion on X (and y) and return self.'''
        self.feat_un.fit(X, y)
        return self

    def transform(self, X, y=None):
        '''
        Transform X with the wrapped FeatureUnion and return a DataFrame.

        X must expose ``.index``; it is reused as the index of the result.
        `y` is accepted for API compatibility and ignored.
        '''
        X_tr = self.feat_un.transform(X)

        # Collect the names in transformer order so they line up with the
        # horizontally stacked output.
        columns = []
        for trsnf in self.transformer_list:
            columns.extend(trsnf[1].steps[-1][1].get_features_name())

        return pd.DataFrame(X_tr, index=X.index, columns=columns)

    def get_params(self, deep=True):  # necessary to well behave in GridSearch
        '''Return the parameters reported by the wrapped FeatureUnion.'''
        return self.feat_un.get_params(deep=deep)

    def set_params(self, **params):
        '''
        Set parameters on the wrapped FeatureUnion and return self.

        get_params() reports the union's parameters, so they have to be set
        on the union as well; the inherited implementation would only set
        attributes on this wrapper and leave the union unchanged.
        '''
        self.feat_un.set_params(**params)
        # Mirror the values back so the wrapper's attributes (used by
        # transform) never drift from the object doing the work.
        for name in self._UNION_PARAMS:
            setattr(self, name, getattr(self.feat_un, name))
        return self
Пример #2
0
class DataCleaner(object):
    """Build and run a cleaning pipeline driven by a feature-metadata CSV.

    The CSV at `path2meta` must contain a 'name' and a 'datatype' column.
    Rows whose datatype is 'numeric', 'bool' or 'categorical' populate
    `numeric_feat_list`, `bool_feat_list` and `categorical_feat_list`.
    Each group gets its own sub-pipeline, and the three are combined in
    that order by a FeatureUnion stored in `CleaningPipeline`.
    """

    def __init__(self, path2meta, verbose=0):
        meta = pd.read_csv(path2meta)

        def names_of(kind):
            # Feature names of the metadata rows with the given datatype.
            return list(meta[meta['datatype'] == kind]['name'])

        self.numeric_feat_list = names_of('numeric')
        self.bool_feat_list = names_of('bool')
        self.categorical_feat_list = names_of('categorical')

        # Numeric features: mean imputation, then scaling.
        numeric_pipe = Pipeline([
            ('selector', ItemSelector(key_list=self.numeric_feat_list)),
            ('Imputer', Imputer(strategy='mean', verbose=verbose)),
            ('scalar', StandardScaler()),
        ])

        # Boolean features: most-frequent imputation, then scaling.
        bool_pipe = Pipeline([
            ('selector', ItemSelector(key_list=self.bool_feat_list)),
            ('Imputer', Imputer(strategy='most_frequent', verbose=verbose)),
            ('scalar', StandardScaler()),
        ])

        # Categorical features: most-frequent imputation, encoding, scaling.
        cat_pipe = Pipeline([
            ('selector', ItemSelector(key_list=self.categorical_feat_list)),
            ('Imputer', Imputer(strategy='most_frequent', verbose=verbose)),
            ('encoder', OneHotEncoder()),
            ('scalar', StandardScaler()),
        ])

        self.CleaningPipeline = FeatureUnion(transformer_list=[
            ('numeric_pipe', numeric_pipe),
            ('bool_pipe', bool_pipe),
            ('cat_pipe', cat_pipe),
        ])

    def get_output_col_neams(self):
        """Return the post-cleaning column names, in output order.

        The order is numeric names, bool names, then the expanded
        categorical names supplied by the encoder's
        `get_new_column_names`.

        Raises:
            ValueError: if the encoder's `is_fitted` flag is falsy.
        """
        encoder = self.CleaningPipeline.get_params()['cat_pipe__encoder']

        if not encoder.is_fitted:
            raise ValueError(
                "Cleaner must be fitted before new column names can be retrieved"
            )

        expanded_cat_names = encoder.get_new_column_names(
            self.categorical_feat_list)
        return self.numeric_feat_list + self.bool_feat_list + expanded_cat_names

    def get_clean_dataframe(self, input_df):
        """Fit the pipeline on `input_df` and return the cleaned DataFrame.

        The pipeline is re-fitted on every call. Fitting happens before
        the column names are requested, because the names are only
        available from a fitted encoder.
        """
        cleaned = self.CleaningPipeline.fit_transform(input_df)
        col_names = self.get_output_col_neams()
        return pd.DataFrame(cleaned, columns=col_names)
Пример #3
0
class FeatureUnionDataFrame(TransformerMixin):
    """FeatureUnion wrapper whose ``transform`` returns a pandas DataFrame.

    All constructor arguments are forwarded to FeatureUnion. The output
    columns are labelled with the feature names reported by the union.
    """

    def __init__(self, *args, **kwargs):
        self.fu = FeatureUnion(*args, **kwargs)

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped FeatureUnion and return self."""
        self.fu.fit(X, y, **kwargs)
        return self

    def transform(self, X, y=None, **fit_params):
        """Transform X and wrap the result in a DataFrame.

        `y` and `fit_params` are accepted for API compatibility and ignored.
        """
        return pd.DataFrame(self.fu.transform(X), columns=self.get_feature_names())

    def get_feature_names(self):
        """Return the feature names of the wrapped FeatureUnion.

        Uses ``get_feature_names`` when the installed scikit-learn still
        provides it; otherwise falls back to ``get_feature_names_out``
        (returned as a list) so the wrapper keeps working on releases that
        removed the old method.
        """
        if hasattr(self.fu, 'get_feature_names'):
            return self.fu.get_feature_names()
        return list(self.fu.get_feature_names_out())

    def set_params(self, **kwargs):
        """Set parameters on the wrapped FeatureUnion and return self.

        Returning self is part of the scikit-learn estimator contract:
        callers chain on the result, e.g. ``clone(est).set_params(...)``.
        """
        self.fu.set_params(**kwargs)
        return self

    def get_params(self, deep=False):
        """Return the parameters reported by the wrapped FeatureUnion."""
        return self.fu.get_params(deep)
Пример #4
0
class FeatureUnioner(DfTransformer):
    """
        Joins all dataframes coming from multiple transformers into one dataframe

        transformer_list: list of (name, pipeline) tuples; the last step of
            every pipeline must expose get_feature_names(), which supplies
            the output column labels
        n_jobs: forwarded to the wrapped FeatureUnion
    """
    def __init__(self, transformer_list, n_jobs=-1):
        self.name = "FeatureUnioner"
        super().log_start(self.name)

        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        # n_jobs is keyword-only in current scikit-learn releases, so it
        # must not be passed positionally.
        self.feature_union = FeatureUnion(self.transformer_list, n_jobs=self.n_jobs)
        self.columns = []

    def fit(self, X, y=None):
        """
            Fit the wrapped FeatureUnion and return self
        """
        # Forward y so transformers that use the target during fit get it.
        self.feature_union.fit(X, y)
        return self

    def transform(self, X, y=None):
        """
            Transform X and return the result as a dataframe that keeps
            the index of X; y is ignored
        """
        X_transform = self.feature_union.transform(X)
        self.concat_df_columns()
        # The index must come from the input: the raw FeatureUnion output
        # is an array and has no index of its own.
        X_transform = pd.DataFrame(X_transform, index=X.index, columns=self.columns)

        super().log_end(self.name)
        return X_transform

    def concat_df_columns(self):
        """
            Rebuild self.columns from the last step of every pipeline,
            in transformer order
        """
        # Start from an empty list on every call; appending to the previous
        # value would duplicate the names each time transform() runs.
        columns = []
        for transformer in self.transformer_list:
            columns.extend(transformer[1].steps[-1][1].get_feature_names())
        self.columns = columns

    def get_params(self, deep = True):
        """
            used for gridsearch
        """
        # NOTE(review): this reports the FeatureUnion's parameters, which
        # may include names __init__ does not accept -- confirm cloning
        # works as intended.
        return self.feature_union.get_params(deep=deep)