Example #1
import pickle
from pathlib import Path

import category_encoders as ce


def fit(X, y, output_dir, **kwargs):
    """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit M-Estimate target encoder], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    ----------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - a pickle file containing a pre-fit M-Estimate target encoder] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir
        so that the trained object can be used during scoring inside transform()
    """

    # Encode categorical columns into numeric values with an M-Estimate target encoder
    encoder_mest = ce.MEstimateEncoder(cols=X.columns, randomized=True, m=0.50)
    encoder_mest.fit(X, y)

    # Dump the trained object
    # into an artifact [in this example - mest.pkl]
    # and save it into output_dir so that it can be used later to transform new data
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        with open("{}/mest.pkl".format(output_dir), "wb") as fp:
            pickle.dump(encoder_mest, fp)
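
For reference, the scoring-side hooks that consume this artifact could look like the sketch below. This is a minimal illustration assuming DataRobot's custom-task template signatures load_model(code_dir) and transform(data, transformer); it is not the original project's code.

import pickle
from pathlib import Path


def load_model(code_dir):
    # Deserialize the artifact written by fit() above
    with open(Path(code_dir) / "mest.pkl", "rb") as fp:
        return pickle.load(fp)


def transform(data, transformer):
    # Apply the pre-fit M-Estimate target encoder to new data
    return transformer.transform(data)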
Example #2
def cal_moe(df_tr, col):
    # Fit an M-Estimate encoder on the training frame (feature_col and the
    # 'isDefault' target column are defined elsewhere in the source)
    enc = ce.MEstimateEncoder(cols=[col]).fit(df_tr[feature_col],
                                              df_tr['isDefault'])
    tmp = pd.DataFrame({
        col: df_tr[col],
        f'moe_{col}': enc.transform(df_tr[feature_col],
                                    df_tr['isDefault'])[col],
    })
    return tmp.groupby([col])[f'moe_{col}'].mean(), f'moe_{col}'
Example #3
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: name of the encoding scheme to select
        :param columns_name: list, a list of feature (column) names to encode
        """
        if encoder_type == "BackwardDe":  # 反向差分编码
            self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)

        elif encoder_type == "BaseN":  # BaseN编码
            self.encoder = ce.BaseNEncoder(cols=columns_name)

        elif encoder_type == "Binary":  # 二值编码
            self.encoder = ce.BinaryEncoder(cols=columns_name)

        elif encoder_type == "Catboost":
            self.encoder = ce.CatBoostEncoder(cols=columns_name)

        elif encoder_type == "Hash":
            self.encoder = ce.HashingEncoder(cols=columns_name)

        elif encoder_type == "Helmert":
            self.encoder = ce.HelmertEncoder(cols=columns_name)

        elif encoder_type == "JamesStein":
            self.encoder = ce.JamesSteinEncoder(cols=columns_name)

        elif encoder_type == "LOO":  # LeaveOneOutEncoder  编码
            self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)

        elif encoder_type == "ME":
            self.encoder = ce.MEstimateEncoder(cols=columns_name)  # M估计编码器

        elif encoder_type == "OneHot":
            self.encoder = ce.OneHotEncoder(cols=columns_name)

        elif encoder_type == "OridinalEncoder":  # 原始编码
            self.encoder = ce.OrdinalEncoder(cols=columns_name)

        elif encoder_type == "Sum":  # 求和编码
            self.encoder = ce.SumEncoder(cols=columns_name)

        elif encoder_type == "Polynomial":  # 多项式编码
            self.encoder = ce.PolynomialEncoder(cols=columns_name)

        elif encoder_type == "Target":  # 目标编码
            self.encoder = ce.TargetEncoder(cols=columns_name)

        elif encoder_type == "WOE":  # WOE 编码器
            self.encoder = ce.WOEEncoder(cols=columns_name)

        else:
            raise ValueError("请选择正确的编码方式")
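
The if/elif chain above is equivalent to a name-to-class lookup. A minimal dict-based sketch of the same factory (the function name make_encoder is illustrative, not from the original):

import category_encoders as ce

_ENCODERS = {
    "BackwardDe": ce.BackwardDifferenceEncoder,
    "BaseN": ce.BaseNEncoder,
    "Binary": ce.BinaryEncoder,
    "Catboost": ce.CatBoostEncoder,
    "Hash": ce.HashingEncoder,
    "Helmert": ce.HelmertEncoder,
    "JamesStein": ce.JamesSteinEncoder,
    "LOO": ce.LeaveOneOutEncoder,
    "ME": ce.MEstimateEncoder,
    "OneHot": ce.OneHotEncoder,
    "Ordinal": ce.OrdinalEncoder,
    "Sum": ce.SumEncoder,
    "Polynomial": ce.PolynomialEncoder,
    "Target": ce.TargetEncoder,
    "WOE": ce.WOEEncoder,
}


def make_encoder(encoder_type, columns_name=None):
    # Look up the encoder class by name and instantiate it on the given columns
    if encoder_type not in _ENCODERS:
        raise ValueError("Please choose a valid encoding type")
    return _ENCODERS[encoder_type](cols=columns_name)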
Example #4
    def test_reference_m0(self):
        x = ['A', 'A', 'B', 'B']
        y = [1, 1, 0, 1]
        x_t = ['A', 'B', 'C']

        encoder = encoders.MEstimateEncoder(m=0,
                                            handle_unknown='value',
                                            handle_missing='value')
        encoder.fit(x, y)
        scored = encoder.transform(x_t)

        expected = [[1], [0.5], [3. / 4.]]  # A -> 1.0, B -> 0.5, unknown 'C' -> the prior probability
        self.assertEqual(scored.values.tolist(), expected)
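
The expected values follow from the M-estimate formula x_hat = (n_plus + m * prior) / (n + m). With m=0 each seen category collapses to its observed target mean, while the unknown level 'C' falls back to the prior. A quick hand check, as a sketch:

# Hand-computing the expectations of the test above (illustrative only)
y = [1, 1, 0, 1]
prior = sum(y) / len(y)            # 3/4, the prior probability of the target

m = 0
enc_A = (2 + m * prior) / (2 + m)  # both 'A' rows have y=1 -> 1.0
enc_B = (1 + m * prior) / (2 + m)  # one of two 'B' rows has y=1 -> 0.5
enc_C = prior                      # unseen level falls back to the prior

assert [enc_A, enc_B, enc_C] == [1.0, 0.5, 0.75]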
Example #5
    def get_encoder(self) -> BaseEstimator:
        return ce.MEstimateEncoder(cols=self.target_columns)
Example #6
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff',
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff',
            'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large...


# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(),
             category_encoders.BinaryEncoder(),
             category_encoders.HashingEncoder(),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(),
             category_encoders.LeaveOneOutEncoder(),
             category_encoders.MEstimateEncoder(),
             category_encoders.OneHotEncoder(),
             category_encoders.OrdinalEncoder(),
             # category_encoders.PolynomialEncoder(),
             # category_encoders.SumEncoder(),
             category_encoders.TargetEncoder(),
             category_encoders.WOEEncoder()]

encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(handle_missing='value'),
             category_encoders.BaseNEncoder(handle_missing='indicator'),
             category_encoders.BinaryEncoder(handle_missing='value'),
             category_encoders.BinaryEncoder(handle_missing='indicator'),
             # category_encoders.HashingEncoder(handle_missing='value'),
             # category_encoders.HashingEncoder(handle_missing='indicator'),
             # category_encoders.HelmertEncoder(),
Example #7
def encode_all(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
    # Fit each requested encoder on the training frame (df) and apply it to
    # the train, validation (dfv), and test (dfk) frames alike
    encoders_used = {}

    for col in encoder_to_use:

        if encoder_to_use[col] == 'ColumnDropper':
            df = df.drop(columns = col)
            dfv = dfv.drop(columns = col)
            dfk = dfk.drop(columns = col)
            encoders_used[col] = 'ColumnDropper'    
                
        if encoder_to_use[col]=='BackwardDifferenceEncoder':
            encoder=ce.BackwardDifferenceEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BaseNEncoder':
            encoder=ce.BaseNEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,base=3) 
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BinaryEncoder':
            encoder=ce.BinaryEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='CatBoostEncoder':
            encoder=ce.CatBoostEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None,a=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

    #     if encoder_to_use[col]=='HashingEncoder':
    #         encoder=ce.HashingEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
    #         encoder.fit(X=df,y=df['set_clicked'])
    #         df=encoder.transform(df)
    #         encoders_used[col]=encoder

        if encoder_to_use[col]=='HelmertEncoder':
            encoder=ce.HelmertEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='JamesSteinEncoder':
            encoder=ce.JamesSteinEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing, model='binary')
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='LeaveOneOutEncoder':
            encoder=ce.LeaveOneOutEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='MEstimateEncoder':
            encoder=ce.MEstimateEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None,m=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)  # was missing: keep the test frame in sync with the other encoders
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OneHotEncoder':
            encoder=ce.OneHotEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,use_cat_names=True)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OrdinalEncoder':
            encoder=ce.OrdinalEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='SumEncoder':
            encoder=ce.SumEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='PolynomialEncoder':
            encoder=ce.PolynomialEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='TargetEncoder':
            encoder=ce.TargetEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,min_samples_leaf=10, smoothing=5)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder


        if encoder_to_use[col]=='WOEEncoder':
            encoder=ce.WOEEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder
            
#         print("Encoding done for - ",col)
    
    print("Completed encoder - ",datetime.datetime.now())
    
    return df, dfv, dfk, encoders_used
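
A hypothetical call of encode_all(), assuming df, dfv, and dfk are the train, validation, and test frames, each containing the set_clicked target the encoders are fit against; the column names and encoder choices below are illustrative:

encoder_to_use = {
    'user_id': 'MEstimateEncoder',
    'item_category': 'TargetEncoder',
    'session_os': 'OneHotEncoder',
    'debug_flag': 'ColumnDropper',
}

df, dfv, dfk, encoders_used = encode_all(
    df, dfv, dfk, encoder_to_use, handle_missing='return_nan')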
Example #8
datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff',
            'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff',
            'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [category_encoders.BackwardDifferenceEncoder(),
            category_encoders.BaseNEncoder(),
            category_encoders.BinaryEncoder(),
            category_encoders.HashingEncoder(),
            category_encoders.HelmertEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.LeaveOneOutEncoder(),
            category_encoders.MEstimateEncoder(),
            category_encoders.OneHotEncoder(),
            category_encoders.OrdinalEncoder(),
            category_encoders.PolynomialEncoder(),
            category_encoders.SumEncoder(),
            category_encoders.TargetEncoder(),
            category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
Example #9
value_count, hashes, te, drop_col = {}, [], [], []  # assumed initialization; the original snippet begins mid-loop
for i in cat_col:  # assumed loop over the categorical columns
    dct = df[i].value_counts()
    if next(iter(dct)) / 13731 < .66:  # keep columns whose top value covers less than 66% of the 13731 rows
        value_count[i] = dict(df[i].value_counts())
        if len(dct) > 10:
            hashes.append(i)
        else:
            te.append(i)
    else:
        drop_col.append(i)
df.drop(drop_col, axis=1, inplace=True)
cat_col = [x for x in cat_col if x not in drop_col]

X = df.copy()
y = df.Score
X.drop('Score', axis=1, inplace=True)
leoo = ce.LeaveOneOutEncoder(cols=te)
me = ce.MEstimateEncoder(cols=hashes) 
X = leoo.fit_transform(X, y)
X = me.fit_transform(X, y)
X = pca(X, cat_col, 30)
X.drop(cat_col, axis=1, inplace=True)

rf = RandomForestRegressor()
rf.fit(X,y)
plt.plot(rf.feature_importances_)

model_col = ['col84', 'col85', 'col917', 'col2612', 'col3109', 'col3108', 'col87', 'col118', 'col918', 'col2613', 'col3111', 'col88', 'col2614', 'pca0', 'pca1']
X = df[model_col]
y = df.Score
X_train, X_test, y_train, y_test = train_test_split(X[model_col], y, train_size=0.80)
print(local_test(X_train, y_train, X_test, y_test, baseline_two))
Example #10
def get_model(PARAMS):
    """return model for provided params

    :param PARAMS: dictionary with model params
    :type PARAMS: dicr
    :return: model pipeline
    :rtype: sklearn pipeline
    """

    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']), 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
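
A hypothetical PARAMS dictionary covering the keys get_model() reads; the values are illustrative assumptions, and any extra keys are forwarded to LGBMRegressor via **PARAMS:

PARAMS = {
    'te_producer': 'TargetEncoder',
    'te_road': 'CatBoostEncoder',
    'te_neighbourhood': 'MEstimateEncoder',
    'te_suburb': 'JamesSteinEncoder',
    'te_postcode': 'TargetEncoder',
    'txt_name__ngram_range': 2,
    'txt_name__max_features': 500,
    'txt_name__binary': False,
    'txt_name__use_idf': True,
    'txt_dscr__ngram_range': 2,
    'txt_dscr__max_features': 1000,
    'txt_dscr__binary': False,
    'txt_dscr__use_idf': True,
}

model = get_model(PARAMS)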
Example #11
test_mix = pd.concat([test_normal, df_anomaly]).reset_index(drop=True)

########################################################################
df_normal_sample = train.sample(frac=sample_size)
df_normal_sample.head(n=5)

Y_train = df_normal_sample['class']
X_train = df_normal_sample.drop('class', axis = 1)

test_mix_copy = test_mix.copy()
#test_mix_copy['class'] = -1
Y_test = test_mix_copy['class']
X_test = test_mix.drop('class', axis = 1)


ce_mestimator = ce.MEstimateEncoder()
X_train_encode = ce_mestimator.fit_transform(X_train, Y_train)
X_test_encode = ce_mestimator.transform(X_test)  # transform only; re-fitting on test labels would leak the target

#X_train[:, 1] = LabelEncoder.fit_transform(X_train[:, 1])
#X_test_encode = LabelEncoder.fit_transform(X_test)



scaler = StandardScaler().fit(X_train_encode)  # fit the scaler on training data only
train_x = scaler.transform(X_train_encode)
train_x.shape
test_x = scaler.transform(X_test_encode)
test_x.shape


#Autoencoder Layer Structure and Parameters
Example #12
def generate_candidates(adjusted_cols):
    return [
        (
            "bsplitz_method",
            Pipeline([
                ("cate", category_encoders.OrdinalEncoder(cols=adjusted_cols)),
                (
                    "ordinal_encoder",
                    BsplitZClassifier(adjusted_cols,
                                      random_state=10,
                                      num_samples=100),
                ),
            ]),
        ),
        (
            "target_encoder",
            Pipeline([
                (
                    "target_encoder",
                    category_encoders.TargetEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        #            ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "m_encoder",
            Pipeline([
                (
                    "m_encoder",
                    category_encoders.MEstimateEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        #        ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "cat_encoder",
            Pipeline([
                (
                    "m_encoder",
                    category_encoders.CatBoostEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('backward_encoder', Pipeline([
        #     ('backward_encoder', category_encoders.BackwardDifferenceEncoder(
        #         cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])), #skip because of too slow
        (
            "basen_encoder",
            Pipeline([
                (
                    "basen_encoder",
                    category_encoders.BaseNEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "binary_encoder",
            Pipeline([
                (
                    "basen_encoder",
                    category_encoders.BinaryEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        (
            "count_encoder",
            Pipeline([
                (
                    "basen_encoder",
                    category_encoders.CountEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('hashing_encoder', Pipeline([
        #    ('basen_encoder', category_encoders.HashingEncoder(
        #        cols=adjusted_cols)),
        #    ('clf', BsplitZClassifier())])), #skip because of too slow
        # ('woe_encoder', Pipeline([
        #     ('woe_encoder', category_encoders.WOEEncoder(
        #         cols=adjusted_cols)),
        #     ('clf', BsplitZClassifier())])), #skip because of binary target only
        (
            "jamesstein_encoder",
            Pipeline([
                (
                    "js_encoder",
                    category_encoders.JamesSteinEncoder(cols=adjusted_cols),
                ),
                ("clf", BsplitZClassifier()),
            ]),
        ),
        # ('clf', DecisionTreeClassifier(max_depth=1))])),
        # ('helmert_encoder', Pipeline([
        #    ('helmert_encoder', category_encoders.HelmertEncoder(
        #        cols=adjusted_cols)),
        #    ('clf', BsplitZClassifier())])), #skip because of too slow
    ]
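
The candidates can then be scored uniformly; a minimal sketch, assuming X, y, adjusted_cols, and BsplitZClassifier come from the surrounding project:

from sklearn.model_selection import cross_val_score

for name, pipeline in generate_candidates(adjusted_cols):
    # 5-fold cross-validation per candidate (name, pipeline) pair
    scores = cross_val_score(pipeline, X, y, cv=5)
    print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")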