예제 #1
0
 def fit(self, X, y=None):
     """Fit one ``LabelEncoder`` per column listed by ``object_cols(X)``.

     Every fitted encoder is stored in ``self.encoders`` under the name of
     the column it was fitted on. ``y`` is accepted but not used.

     Returns:
         self
     """
     for column in object_cols(X):
         encoder = LabelEncoder()
         encoder.fit(X[column])
         self.encoders[column] = encoder
     message = 'index encoders {}'.format(self.encoders)
     logger.debug(message)
     return self
예제 #2
0
def show_freq(df, max_card=20):
    """Show a horizontal bar chart of value counts per low-cardinality column.

    The candidate columns come from ``object_cols(df)``. A column is plotted
    only when its number of unique values is strictly below ``max_card``;
    each chart is titled with the column name and displayed right away.
    """
    assert isinstance(df, pd.DataFrame)

    for name in object_cols(df):
        series = df[name]
        # Skip columns whose cardinality is not below the threshold.
        if not (series.nunique() < max_card):
            continue
        counts = series.value_counts()
        counts.plot(kind='barh')
        plt.title(name)
        plt.show()
예제 #3
0
def get_pipeline(est, is_tree, is_regressor, params):
    """Wrap ``est`` in a preprocessing pipeline and extend its search grid.

    One of three layouts is built, checked in this order:

    * the model name starts with ``'Dummy'``: a ``FunctionTransformer`` whose
      function selects ``numeric_cols(x)``, followed by the estimator;
    * ``is_tree`` is true: ``DateEncoder``, ``OrdinalEncoder``, a
      ``FunctionTransformer`` applying ``fillna(-999)`` and ``SelectKBest2``,
      followed by the estimator;
    * otherwise: ``DateEncoder``, a union of a numeric branch (column
      selection, ``Imputer``, wrapped ``StandardScaler``) and a categorical
      branch (column selection, ``SparseCatEncoder``), then a union of
      ``SelectKBest2`` and ``TruncatedSVD2``, followed by the estimator.

    Args:
        est: estimator used as the final ``'mo'`` step.
        is_tree: selects the tree layout; also forwarded to ``get_selector``.
        is_regressor: forwarded to ``get_selector``.
        params: dict of search-grid entries. It is updated in place with the
            entries for the preprocessing steps and is also returned.

    Returns:
        Tuple ``(name, ppl, params)``: the result of ``model_name(est)``, the
        assembled pipeline and the updated ``params`` dict.
    """
    name = model_name(est)
    if name.startswith('Dummy'):
        ppl = Pipeline([
            ('ft', FunctionTransformer()),
            ('mo', est),
        ])
        params['ft__func'] = [lambda x: x[numeric_cols(x)]]
        params['ft__validate'] = [False]
    elif is_tree:
        ppl = Pipeline([
            ('da', DateEncoder()),
            ('du', OrdinalEncoder()),
            ('ft', FunctionTransformer()),
            ('se', SelectKBest2()),
            ('mo', est),
        ])
        params['da__ascategory'] = [False]
        params['du__drop_invariant'] = [True]
        # Missing values are replaced by a sentinel instead of being imputed.
        params['ft__func'] = [lambda x: x.fillna(-999)]
        params['ft__validate'] = [False]
        params['se__score_func'] = get_selector(is_regressor, is_tree)
        # Same candidates as the 'fu__se__k' grid of the non-tree layout.
        # 1000 used to be listed twice here, which would only repeat the
        # same candidate in an exhaustive grid.
        params['se__k'] = [0.2, 0.5, 0.8, 1000]
    else:
        numeric_branch = Pipeline([
            ('ft', FunctionTransformer()),
            ('in', Imputer()),
            ('sc', TransformerWrap(StandardScaler())),
        ])
        categorical_branch = Pipeline([
            ('ft', FunctionTransformer()),
            ('sc', SparseCatEncoder()),
        ])
        ppl = Pipeline([
            ('da', DateEncoder()),
            ('en', FeatureUnion([
                ('nu', numeric_branch),
                ('ca', categorical_branch),
            ])),
            ('fu', FeatureUnion([
                ('se', SelectKBest2()),
                ('dr', TruncatedSVD2()),
            ])),
            ('mo', est),
        ])

        # Each branch of the 'en' union first narrows the frame to its own
        # column subset.
        params['en__nu__ft__func'] = [lambda x: x[numeric_cols(x)]]
        params['en__nu__ft__validate'] = [False]
        params['en__ca__ft__func'] = [lambda x: x[object_cols(x)]]
        params['en__ca__ft__validate'] = [False]
        params['fu__se__score_func'] = get_selector(is_regressor, is_tree)
        # NOTE(review): values below 1 are presumably fractions of the
        # feature count -- confirm against SelectKBest2 / TruncatedSVD2.
        params['fu__se__k'] = [0.2, 0.5, 0.8, 1000]
        params['fu__dr__k'] = [0.2, 0.5, 0.8, 1000]

    return name, ppl, params
예제 #4
0
 def transform(self, X, y=None):
     """Return ``X.copy()`` with the ``object_cols(X)`` columns encoded.

     Each of those columns is replaced by the output of the encoder stored
     for it in ``self.encoders``. ``y`` is accepted but not used.
     """
     encoded = X.copy()
     for column in object_cols(X):
         encoder = self.encoders[column]
         encoded[column] = encoder.transform(encoded[column])
     logger.debug('')
     return encoded
예제 #5
0
#x, y = get_iris()

# Summary of the input before it goes through the pipeline.
print_summary(x)

# Parameters applied through set_params further down. The
# 'functiontransformer-1' / 'functiontransformer-2' keys address the two
# FunctionTransformer steps of the 'ca' branch, which are named by
# make_pipeline.
params = {}
params['en__nu__ft__func'] = lambda frame: frame[numeric_cols(frame)]
params['en__nu__ft__validate'] = False
params['en__ca__functiontransformer-1__func'] = (
    lambda frame: frame[object_cols(frame)])
params['en__ca__functiontransformer-1__validate'] = False
# The second transformer keeps only columns with more than one distinct value.
params['en__ca__functiontransformer-2__func'] = (
    lambda frame: frame.loc[:, frame.nunique() > 1])
params['en__ca__functiontransformer-2__validate'] = False

ppl = Pipeline([
    ('in', ConstantInputer()),
    ('da', DateEncoder()),
    ('en', FeatureUnion([
        ('nu', Pipeline([
            ('ft', FunctionTransformer()),
            ('sc', TransformerWrap(StandardScaler())),
        ])),
        ('ca', make_pipeline(
            FunctionTransformer(),
            SparseCatEncoder(),
            FunctionTransformer(),
        )),
    ])),
    ('fi', make_union(SelectKBest2(), TruncatedSVD2())),
])
ppl.set_params(**params)

xt = ppl.fit_transform(x, y)

# Summary of the transformed output.
print_summary(xt)
예제 #6
0
def _compute_pearson_correls(df, columns):
    """Return the Pearson correlation of every column with ``columns``.

    Columns reported by ``object_cols`` and ``date_cols`` are replaced by the
    output of ``_index_encode`` before the correlation matrix is computed.
    The encoding is applied to a copy, so the caller's ``df`` is not
    modified.

    Args:
        df: expected to be a pandas DataFrame; its column labels are used
            for both axes of the result.
        columns: column label(s) selected from the correlation matrix.

    Returns:
        The correlation matrix as a DataFrame indexed by the columns of
        ``df``, restricted to ``columns``.
    """
    # Work on a copy: encoding in place would silently rewrite the caller's
    # object and date columns.
    encoded = df.copy()
    for c in object_cols(df) + date_cols(df):
        encoded[c] = _index_encode(encoded[c])
    # rowvar=False: columns are the variables, rows are the observations.
    cor = np.corrcoef(encoded, rowvar=False)
    res = pd.DataFrame(cor, index=encoded.columns, columns=encoded.columns)
    return res[columns]