Exemplo n.º 1
0
    def define_pipeline(self, columns, dtypes):
        """Build the feature pipeline and store it in ``self.pipeline``.

        Columns are first split by dtype: a column whose dtype is
        ``category`` or ``object`` is treated as categorical; every other
        column except ``TransactionAmt`` and ``TransactionDT`` is treated as
        numeric.  The two resulting lists feed the label-encoding and scaling
        steps; all remaining steps use hard-coded column names.

        Args:
            columns: Iterable of column names to classify.
            dtypes: Object indexable by column name that yields the column's
                dtype (presumably a pandas ``DataFrame.dtypes`` -- TODO
                confirm against the caller).

        Returns:
            None.  The assembled ``dfp.DataframePipeline`` is assigned to
            ``self.pipeline``.
        """
        cat_columns = []
        num_columns = []
        for c in columns:
            # ``np.str`` was only an alias of the builtin ``str`` and has been
            # removed from NumPy, so the builtin is used directly.
            if str(dtypes[c]) == 'category' or dtypes[c] == 'object':
                cat_columns.append(c)
            elif c not in ('TransactionAmt', 'TransactionDT'):
                num_columns.append(c)

        # D-columns that get the derived 'day' column subtracted from them.
        day_relative = ['D4', 'D6', 'D7', 'D8', 'D10',
                        'D11', 'D12', 'D13', 'D14', 'D15']

        # Columns that receive a '<name>_FE' frequency-encoded companion.
        freq_columns = ['addr1', 'card1', 'card2', 'card3', 'P_emaildomain',
                        'card1_addr1', 'card1_addr1_P_emaildomain']

        # Group statistics: every value column is aggregated over every
        # grouping key, with outputs named '<value>_<key>_<func>'.
        agg_values = ['TransactionAmt', 'D9', 'D11']
        agg_keys = ['card1', 'card1_addr1', 'card1_addr1_P_emaildomain']

        def make_aggregator(func):
            # Fresh lists per call so the two aggregators share no state.
            return dfp.Aggregator(
                inputs=[v for v in agg_values for _ in agg_keys],
                outputs=[v + '_' + k + '_' + func
                         for v in agg_values for k in agg_keys],
                groupby=agg_keys * len(agg_values),
                func=func)

        self.pipeline = dfp.DataframePipeline(steps=[
            # 86400.0 is the divisor applied to TransactionDT to obtain 'day'
            # (presumably seconds -> days; verify the unit of TransactionDT).
            dfp.FunctionTransformer(inputs=['TransactionDT'], outputs=['day'],
                                    func=lambda x: x / 86400.0),
            dfp.FunctionTransformer(inputs=[(d, 'day') for d in day_relative],
                                    outputs=list(day_relative),
                                    func=lambda x, y: x - y),
            dfp.ComplementLabelEncoder(inputs=cat_columns, outputs=cat_columns),
            dfp.Scaler(inputs=num_columns, outputs=num_columns, strategy='min'),
            # dfp.Imputer(inputs=num_columns, outputs=num_columns, val=-1),
            # The second concatenation consumes 'card1_addr1', the output of
            # the first pair.
            dfp.StringConcatenator(inputs=[('card1', 'addr1'), ('card1_addr1', 'P_emaildomain')],
                                   outputs=['card1_addr1', 'card1_addr1_P_emaildomain'],
                                   separator='_'),
            dfp.ComplementLabelEncoder(inputs=['card1_addr1', 'card1_addr1_P_emaildomain'],
                                       outputs=['card1_addr1', 'card1_addr1_P_emaildomain']),
            dfp.FrequencyEncoder(inputs=list(freq_columns),
                                 outputs=[c + '_FE' for c in freq_columns],
                                 normalize=True),
            make_aggregator('mean'),
            make_aggregator('std'),
            # Fractional part of the transaction amount.
            dfp.FunctionTransformer(inputs=['TransactionAmt'],
                                    outputs=['cents'],
                                    func=lambda x: x - np.floor(x)),
            dfp.ColumnSelector(columns=['TransactionDT', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',
                                        'C3', 'M5', 'id_08', 'id_33', 'card4', 'id_07', 'id_14',
                                        'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26',
                                        'id_27', 'id_30', 'id_32', 'id_34'],
                               drop=True),
        ])
Exemplo n.º 2
0
    def define_pipeline(self, columns, dtypes):
        """Build the preprocessing pipeline and store it in ``self.pipeline``.

        All column names are hard-coded (``bin_*``, ``ord_*``, ``nom_*``,
        ``day``, ``month``); the arguments are accepted but not read here.

        Args:
            columns: Unused in this method.
            dtypes: Unused in this method.

        Returns:
            None.  The assembled ``dfp.DataframePipeline`` is assigned to
            ``self.pipeline``.
        """
        ordvar = ['ord_' + str(i) for i in range(6)]    # ord_0 .. ord_5
        nomvar = ['nom_' + str(i) for i in range(10)]   # nom_0 .. nom_9
        dmvar = ['day', 'month']

        self.pipeline = dfp.DataframePipeline(steps=[
            # Binary flags stored as letters are mapped to 0/1.
            dfp.MapTransformer(
                inputs=['bin_3'], outputs=['bin_3'], dict={
                    'F': 0,
                    'T': 1
                }),
            dfp.MapTransformer(
                inputs=['bin_4'], outputs=['bin_4'], dict={
                    'N': 0,
                    'Y': 1
                }),
            # bin_0 is shifted down by one here and dropped by the final
            # ColumnSelector below.
            dfp.FunctionTransformer(
                inputs=['bin_0'], outputs=['bin_0'], func=lambda x: x - 1),
            # Ordinal labels are mapped to explicit integer ranks.
            dfp.MapTransformer(inputs=['ord_1'],
                               outputs=['ord_1'],
                               dict={
                                   'Novice': 0,
                                   'Contributor': 1,
                                   'Expert': 2,
                                   'Master': 3,
                                   'Grandmaster': 4
                               }),
            dfp.MapTransformer(inputs=['ord_2'],
                               outputs=['ord_2'],
                               dict={
                                   'Freezing': 0,
                                   'Cold': 1,
                                   'Warm': 2,
                                   'Hot': 3,
                                   'Boiling Hot': 4,
                                   'Lava Hot': 5
                               }),
            dfp.ComplementLabelEncoder(inputs=['ord_3', 'ord_4', 'ord_5'],
                                       outputs=['ord_3', 'ord_4', 'ord_5']),
            dfp.Scaler(inputs=ordvar, outputs=ordvar, strategy='standard'),
            # NOTE(review): the meaning of index/keep is defined by
            # dfp.StringSplitter and is not visible here -- confirm before
            # changing these values.
            dfp.StringSplitter(
                inputs=['nom_5'], outputs=['nom_5'], index=8, keep=-1),
            dfp.StringSplitter(inputs=['nom_6', 'nom_7', 'nom_8', 'nom_9'],
                               outputs=['nom_6', 'nom_7', 'nom_8', 'nom_9'],
                               index=3,
                               keep=-1),
            # dfp.OneHotTransformer(onehot_columns=nomvar+dmvar),
            # Only nom_0..nom_6 (plus day/month) are one-hot encoded, while
            # the selector below drops all of nom_0..nom_9.
            dfp.OneHotEncoder(columns=[
                'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6'
            ] + dmvar),
            dfp.ColumnSelector(columns=['bin_0'] + nomvar + dmvar, drop=True)
        ])
Exemplo n.º 3
0
def test_np_sqrt():
    """A FunctionTransformer using np.sqrt (col3 -> col4) must yield np_sqrt_df."""
    transformer = dfp.FunctionTransformer(
        inputs=['col3'],
        outputs=['col4'],
        func=np.sqrt,
    )
    # Work on a copy so the module-level fixture frame stays untouched.
    frame = df.copy()
    result = transformer.fit_transform(frame)
    assert_frame_equal(result, np_sqrt_df)
Exemplo n.º 4
0
def test_add_two_columns():
    """Summing the (col1, col2) pair into col4 must yield add_two_columns_df."""
    column_pairs = [('col1', 'col2')]
    transformer = dfp.FunctionTransformer(
        inputs=column_pairs,
        outputs=['col4'],
        func=lambda x, y: x + y,
    )
    # Work on a copy so the module-level fixture frame stays untouched.
    frame = df.copy()
    result = transformer.fit_transform(frame)
    assert_frame_equal(result, add_two_columns_df)
Exemplo n.º 5
0
def test_add_constant():
    """Adding 1 to col1 and writing it to col4 must yield add_constant_df."""
    transformer = dfp.FunctionTransformer(
        inputs=['col1'],
        outputs=['col4'],
        func=lambda x: x + 1,
    )
    # Work on a copy so the module-level fixture frame stays untouched.
    frame = df.copy()
    result = transformer.fit_transform(frame)
    assert_frame_equal(result, add_constant_df)