def fit(self, X, y=None):
    """Fit one ``LabelEncoder`` for every column listed by ``object_cols(X)``.

    Each fitted encoder is stored in ``self.encoders`` under its column
    name. ``self.encoders`` must already exist as a mapping (presumably
    created in ``__init__`` -- confirm against the class definition).

    Args:
        X: Table indexable by column name (``X[column]``).
        y: Ignored by this method.

    Returns:
        ``self``.
    """
    for column in object_cols(X):
        encoder = LabelEncoder()
        encoder.fit(X[column])
        self.encoders[column] = encoder

    # Log the complete column -> encoder mapping once all columns are fitted.
    message = 'index encoders {}'.format(self.encoders)
    logger.debug(message)
    return self
def show_freq(df, max_card=20):
    """Plot value counts of low-cardinality object columns as horizontal bars.

    For every column returned by ``object_cols(df)`` that has fewer than
    ``max_card`` distinct values, a horizontal bar chart of
    ``value_counts()`` is drawn, titled with the column name and displayed
    immediately with ``plt.show()``.

    Args:
        df: ``pandas.DataFrame`` to inspect.
        max_card: Exclusive upper bound on the number of distinct values a
            column may have to be plotted.

    Raises:
        TypeError: If ``df`` is not a ``pandas.DataFrame``.
    """
    # Explicit check rather than ``assert`` so the validation is not
    # stripped when Python runs with ``-O``.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            'df must be a pandas DataFrame, got {}'.format(type(df).__name__))

    for c in object_cols(df):
        if df[c].nunique() < max_card:
            df[c].value_counts().plot(kind='barh')
            plt.title(c)
            plt.show()
def get_pipeline(est, is_tree, is_regressor, params):
    """Assemble the pipeline and parameter settings that go with ``est``.

    One of three layouts is chosen:

    * the model name starts with ``'Dummy'``: a ``FunctionTransformer``
      that keeps ``numeric_cols`` followed by the estimator;
    * ``is_tree`` is truthy: ``DateEncoder`` -> ``OrdinalEncoder`` ->
      ``FunctionTransformer`` (``fillna(-999)``) -> ``SelectKBest2`` ->
      estimator;
    * otherwise: ``DateEncoder`` -> union of a numeric branch
      (``Imputer`` + wrapped ``StandardScaler``) and a categorical branch
      (``SparseCatEncoder``) -> union of ``SelectKBest2`` and
      ``TruncatedSVD2`` -> estimator.

    Args:
        est: Estimator used as the final ``'mo'`` step.
        is_tree: Selects the tree layout when truthy; also forwarded to
            ``get_selector``.
        is_regressor: Forwarded to ``get_selector``.
        params: Mapping that is updated in place with ``step__param``
            entries. Most values are wrapped in lists (presumably
            candidate lists for a parameter search -- confirm with caller).

    Returns:
        Tuple ``(name, pipeline, params)`` where ``name`` is
        ``model_name(est)`` and ``params`` is the same object passed in.
    """
    name = model_name(est)

    if name.startswith('Dummy'):
        pipeline = Pipeline([
            ('ft', FunctionTransformer()),
            ('mo', est),
        ])
        params['ft__func'] = [lambda x:x[numeric_cols(x)]]
        params['ft__validate'] = [False]
        return name, pipeline, params

    if is_tree:
        tree_steps = [
            ('da', DateEncoder()),
            ('du', OrdinalEncoder()),
            ('ft', FunctionTransformer()),
            ('se', SelectKBest2()),
            ('mo', est),
        ]
        pipeline = Pipeline(tree_steps)
        params['da__ascategory'] = [False]
        params['du__drop_invariant'] = [True]
        params['ft__func'] = [lambda x:x.fillna(-999)]
        params['ft__validate'] = [False]
        # The selector result is stored as returned, not wrapped in a list.
        params['se__score_func'] = get_selector(is_regressor, is_tree)
        # NOTE(review): 1000 appears twice here but only once in the
        # non-tree settings below -- confirm whether that is intentional.
        params['se__k'] = [0.2, 0.5, 0.8, 1000, 1000]
        return name, pipeline, params

    date_encoder = DateEncoder()
    numeric_branch = Pipeline([
        ('ft', FunctionTransformer()),
        ('in', Imputer()),
        ('sc', TransformerWrap(StandardScaler())),
    ])
    categorical_branch = Pipeline([
        ('ft', FunctionTransformer()),
        ('sc', SparseCatEncoder()),
    ])
    encoding_union = FeatureUnion([
        ('nu', numeric_branch),
        ('ca', categorical_branch),
    ])
    reduction_union = FeatureUnion([
        ('se', SelectKBest2()),
        ('dr', TruncatedSVD2()),
    ])
    pipeline = Pipeline([
        ('da', date_encoder),
        ('en', encoding_union),
        ('fu', reduction_union),
        ('mo', est),
    ])

    # Each branch of the 'en' union first narrows the frame to its columns.
    params['en__nu__ft__func'] = [lambda x:x[numeric_cols(x)]]
    params['en__nu__ft__validate'] = [False]
    params['en__ca__ft__func'] = [lambda x:x[object_cols(x)]]
    params['en__ca__ft__validate'] = [False]
    params['fu__se__score_func'] = get_selector(is_regressor, is_tree)
    params['fu__se__k'] = [0.2, 0.5, 0.8, 1000]
    params['fu__dr__k'] = [0.2, 0.5, 0.8, 1000]
    return name, pipeline, params
def transform(self, X, y=None):
    """Return a copy of ``X`` with its object columns run through the encoders.

    Every column listed by ``object_cols(X)`` is replaced, in the copy, by
    the output of the matching entry of ``self.encoders``. ``X`` itself is
    not assigned to.

    Args:
        X: Table supporting ``copy()`` and column access by name.
        y: Ignored by this method.

    Returns:
        The transformed copy of ``X``.

    Note:
        A column without an entry in ``self.encoders`` fails at lookup
        (``KeyError`` if ``self.encoders`` is a plain dict).
    """
    encoded = X.copy()
    for column in object_cols(X):
        encoder = self.encoders[column]
        encoded[column] = encoder.transform(encoded[column])
    logger.debug('')
    return encoded
# Smoke run of the preprocessing pipeline: summarise the input, push it
# through imputation, date encoding, numeric/categorical encoding and
# feature selection/reduction, then summarise the result.
# NOTE(review): x and y are not defined in this snippet; the commented-out
# get_iris() call below is one candidate source -- confirm.
#x, y = get_iris()
print_summary(x)

constant_imputer = ConstantInputer()
date_encoder = DateEncoder()

# Numeric branch: column filter followed by the wrapped StandardScaler.
numeric_branch = Pipeline([
    ('ft', FunctionTransformer()),
    ('sc', TransformerWrap(StandardScaler())),
])

# Categorical branch: make_pipeline auto-names the two FunctionTransformer
# steps 'functiontransformer-1' and 'functiontransformer-2', which the
# parameter keys below rely on.
categorical_branch = make_pipeline(
    FunctionTransformer(),
    SparseCatEncoder(),
    FunctionTransformer(),
)

ppl = Pipeline([
    ('in', constant_imputer),
    ('da', date_encoder),
    ('en', FeatureUnion([
        ('nu', numeric_branch),
        ('ca', categorical_branch),
    ])),
    ('fi', make_union(SelectKBest2(), TruncatedSVD2())),
])

params = {
    # numeric branch keeps the numeric columns
    'en__nu__ft__func': lambda x: x[numeric_cols(x)],
    'en__nu__ft__validate': False,
    # categorical branch keeps the object columns before encoding ...
    'en__ca__functiontransformer-1__func': lambda x: x[object_cols(x)],
    'en__ca__functiontransformer-1__validate': False,
    # ... and afterwards keeps only columns with more than one distinct value
    'en__ca__functiontransformer-2__func': lambda x: x.loc[:, x.nunique() > 1],
    'en__ca__functiontransformer-2__validate': False,
}
ppl.set_params(**params)

xt = ppl.fit_transform(x, y)
print_summary(xt)
def _compute_pearson_correls(df, columns):
    """Return the correlation of every column of ``df`` with ``columns``.

    Columns reported by ``object_cols(df)`` and ``date_cols(df)`` are
    replaced by the output of ``_index_encode`` before the correlation
    matrix is computed with ``np.corrcoef`` (presumably to make them
    numeric -- confirm against ``_index_encode``). The encoding is applied
    to a copy, so the caller's frame is left unmodified.

    Args:
        df: ``pandas.DataFrame`` whose columns are treated as variables.
        columns: Column label or list of labels selecting which columns of
            the correlation matrix to return.

    Returns:
        The correlation matrix indexed by all columns of ``df`` and
        restricted to ``columns`` (a Series when ``columns`` is a single
        label, following pandas indexing rules).
    """
    # Work on a copy: the original overwrote the caller's columns in place.
    encoded = df.copy()
    for c in object_cols(encoded) + date_cols(encoded):
        encoded[c] = _index_encode(encoded[c])

    # rowvar=False: each column is a variable, each row an observation.
    cor = np.corrcoef(encoded, rowvar=False)
    res = pd.DataFrame(cor, index=encoded.columns, columns=encoded.columns)
    return res[columns]