Example #1
0
 def __init__(self):
     """Build the four preprocessing pipelines used by this object.

     Two categorical pipelines (group rare levels, then encode; optionally
     followed by PCA) and two continuous pipelines (rank-gaussianize, then
     standardize; optionally followed by PCA).
     """
     # Local factories so each pipeline gets its own, independently
     # fitted estimator instances (sharing instances would entangle fits).
     def cat_steps():
         return [e.CategoricalGrouper(), e.CategoricalEncoder()]

     def cont_steps():
         return [QuantileTransformer(output_distribution='normal',
                                     random_state=0),
                 StandardScaler()]

     def pca():
         return PCA(n_components=6, random_state=0)

     self.catPreprocessor = make_pipeline(*cat_steps())
     self.catPCA = make_pipeline(*cat_steps(), pca())
     self.contPreprocessor = make_pipeline(*cont_steps())
     self.contPCA = make_pipeline(*cont_steps(), pca())
Example #2
0
LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
# Gaussianize the target; power_transform expects a 2-D array, so reshape
# in and flatten back out, preserving the original index.
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss', index=y.index)

LOGGER.info('Load categorical features to drop')
# Use a context manager: the original json.load(open(...)) leaked the
# file handle.
with open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r') as fh:
    noVarFeatures = json.load(fh)

LOGGER.info('Process categorical features')
# Hoist the repeated filter/drop so it is computed once, not twice.
catX = X.filter(like='cat').drop(labels=noVarFeatures, axis=1)
catf = pd.DataFrame(
    data=make_pipeline(
        e.CategoricalGrouper(),
        e.CategoricalEncoder()
    ).fit_transform(catX, y),
    columns=catX.columns,
    index=X.index
)

LOGGER.info('Process continuous features')
# Rank-gaussianize then standardize the continuous columns.
# NOTE: the original snippet never closed the pd.DataFrame( call — the
# closing parenthesis below restores the form seen in the sibling examples.
contX = X.filter(like='cont')
contf = pd.DataFrame(
    data=scale(quantile_transform(
        X=contX,
        output_distribution='normal',
        random_state=0
    )),
    columns=contX.columns,
    index=X.index
)
Example #3
0
LOGGER.info('Load correlated features')
# Context manager ensures the JSON handle is closed deterministically
# (the original json.load(open(...)) leaked it).
with open(file=p.joinpath('src', 'meta', 'Correlated.json'), mode='r') as fh:
    CORRELATED = json.load(fh)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.filter(CORRELATED)
y = df['loss'].copy()

LOGGER.info('Process target')
# power_transform needs a 2-D array; flatten back and keep the index.
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Process categorical features')
# Hoist the repeated X.filter(like='cat') so it is computed once.
catX = X.filter(like='cat')
catf = pd.DataFrame(data=make_pipeline(
    e.CategoricalGrouper(),
    e.CategoricalEncoder()).fit_transform(catX, y),
                    columns=catX.columns,
                    index=X.index)

LOGGER.info('Process continuous features')
# Hoist the repeated X.filter(like='cont') as well.
contX = X.filter(like='cont')
contf = pd.DataFrame(data=scale(
    quantile_transform(X=contX,
                       output_distribution='normal',
                       random_state=0)),
                     columns=contX.columns,
                     index=X.index)

LOGGER.info(r'Figure 1: Correlations above 75%')
X = catf.join(contf)
Example #4
0
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
# power_transform needs a 2-D array; flatten back and keep the index.
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Load categorical features to drop')
# Context manager so the JSON handle is closed deterministically
# (the original json.load(open(...)) leaked it).
with open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r') as fh:
    noVarFeatures = json.load(fh)

LOGGER.info('Process categorical features')
# Hoist the repeated filter/drop so it is computed once, not twice.
catX = X.filter(like='cat').drop(labels=noVarFeatures, axis=1)
catf = pd.DataFrame(data=make_pipeline(
    e.CategoricalGrouper(), e.CategoricalEncoder()).fit_transform(catX, y),
                    columns=catX.columns,
                    index=X.index)

LOGGER.info('Process continuous features')
# Hoist the repeated X.filter(like='cont') as well.
contX = X.filter(like='cont')
contf = pd.DataFrame(data=scale(
    quantile_transform(X=contX,
                       output_distribution='normal',
                       random_state=0)),
                     columns=contX.columns,
                     index=X.index)

LOGGER.info('Find correlations')
corr = catf.join(contf).corr()
Example #5
0
def make_pipeline(model: Estimator) -> Pipeline:
    """Wrap *model* in the standard categorical preprocessing pipeline.

    The pipeline groups rare categorical levels, encodes them, and then
    hands the result to the supplied estimator under the 'clf' step name.
    """
    steps = [
        ('grouper', e.CategoricalGrouper()),
        ('encoder', e.CategoricalEncoder()),
        ('clf', model),
    ]
    return Pipeline(steps)