def test_backward_difference(self):
    """Round-trip fit/transform checks for BackwardDifferenceEncoder variants.

    Covers: explicit cols, default cols, drop_invariant, and
    return_df=False (which must yield a numpy array).
    """
    cols = ['C1', 'D', 'E', 'F']
    X = self.create_dataset(n_rows=1000)
    X_t = self.create_dataset(n_rows=100)

    # Three configurations that all must produce numeric DataFrame output.
    for extra in ({'cols': cols}, {}, {'drop_invariant': True}):
        enc = encoders.BackwardDifferenceEncoder(verbose=1, **extra)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

    # return_df=False switches the output type to a raw ndarray.
    enc = encoders.BackwardDifferenceEncoder(verbose=1, return_df=False)
    enc.fit(X, None)
    self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
def transform_categorical(self):
    """Backward-difference encode the two categorical columns and persist the result as CSV."""
    data = pd.read_csv(self.file_path)
    # Each column gets its own encoder; the second is fitted on the
    # output of the first, exactly as two chained fit_transform calls.
    for column in ('most_freq_url', 'most_freq_hashtag'):
        data = ce.BackwardDifferenceEncoder(cols=[column]).fit_transform(data)
    data.to_csv(self.file_path + '_categorical_transformed',
                index=False, encoding="utf-8")
def fit(X, y, output_dir, **kwargs):
    """DataRobot training hook for this transform task.

    Fits a BackwardDifferenceEncoder over every column of X and pickles
    the trained encoder into output_dir so transform() can reuse it.

    Parameters
    ----------
    X: pd.DataFrame
        Training data passed by DataRobot.
    y: pd.Series
        Project's target column (None for unsupervised projects).
    output_dir: str
        Folder into which the trained artifact (backdiff.pkl) is saved.

    Returns
    -------
    None
        The trained object is persisted as an artifact instead of returned.
    """
    encoder = ce.BackwardDifferenceEncoder(cols=X.columns)
    encoder.fit(X, y)

    # Persist the fitted encoder only when the destination directory exists.
    destination = Path(output_dir)
    if destination.exists() and destination.is_dir():
        with open("{}/backdiff.pkl".format(output_dir), "wb") as fp:
            pickle.dump(encoder, fp)
def backward_difference():
    """Smoke-test BackwardDifferenceEncoder on the mushroom dataset, printing frame info."""
    features, _, _ = get_mushroom_data()
    print(features.info())

    encoder = ce.BackwardDifferenceEncoder()
    encoder.fit(features, None)
    encoded = encoder.transform(features)
    print(encoded.info())

    # Free the large intermediates explicitly (mirrors sibling benchmarks).
    del encoder, _, features, encoded
def apply_backward_difference_encoding(df, categorical_columns):
    """Backward-difference encode `categorical_columns` of `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; anything else raises DataFrameTypeError.
    categorical_columns : list
        Column names to encode.

    Returns
    -------
    pd.DataFrame
        Encoded frame with the encoder's 'intercept' column dropped.
    """
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # Fix: fit on the DataFrame itself, not `df.values`. Fitting on the
    # bare ndarray discards column names, so cols=categorical_columns
    # could never be matched against the fitted data.
    encoder = ce.BackwardDifferenceEncoder(cols=categorical_columns).fit(df)
    X_transformed = encoder.transform(df)
    X_transformed.drop(['intercept'], inplace=True, axis=1)
    return X_transformed
def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self):
    """handle_unknown='indicator' reserves an extra contrast column even when only known labels are fitted."""
    train = ['A', 'B']
    result = encoders.BackwardDifferenceEncoder(
        handle_unknown='indicator').fit_transform(train)
    expected = [
        [1, -2 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, -1 / 3.0],
    ]
    self.assertEqual(result.values.tolist(), expected)
def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self):
    """A NaN seen at fit time with handle_missing='indicator' is encoded as its own level."""
    train = ['A', 'B', np.nan]
    result = encoders.BackwardDifferenceEncoder(
        handle_missing='indicator').fit_transform(train)
    expected = [
        [1, -2 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, 2 / 3.0],
    ]
    self.assertEqual(result.values.tolist(), expected)
def test_backwards_difference_encoder_preserve_dimension_1(self):
    """Unseen test labels map to all-zero contrasts, keeping the output width fixed."""
    train = ['A', 'B', 'C']
    test = ['A', 'D', 'E']

    enc = encoders.BackwardDifferenceEncoder()
    enc.fit(train)
    transformed = enc.transform(test)

    expected = [
        [1, -2 / 3.0, -1 / 3.0],
        [1, 0, 0],
        [1, 0, 0],
    ]
    self.assertEqual(transformed.values.tolist(), expected)
def test_backwards_difference_encoder_2cols(self):
    """Two categorical columns each expand into their own pair of contrast columns."""
    train = [['A', 'A'], ['B', 'B'], ['C', 'C']]

    enc = encoders.BackwardDifferenceEncoder()
    enc.fit(train)
    obtained = enc.transform(train)

    expected = [
        [1, -2 / 3.0, -1 / 3.0, -2 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, -1 / 3.0, 1 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, 2 / 3.0, 1 / 3.0, 2 / 3.0],
    ]
    self.assertEqual(obtained.values.tolist(), expected)
def create_features(self, df_train, df_test):
    """Append backward-difference encodings of self.columns to self.train / self.test."""
    encoder = ce.BackwardDifferenceEncoder(cols=self.columns)
    encoder.fit(df_train[self.columns],
                df_train[self.target_column].values.tolist())

    encoded_train = encoder.transform(df_train[self.columns])
    encoded_test = encoder.transform(df_test[self.columns])

    # Suffix marks the provenance of each generated feature.
    suffix = '_BackwardDifferenceEncoder'
    for column in encoded_train.columns:
        self.train[column + suffix] = encoded_train[column]
        self.test[column + suffix] = encoded_test[column]
def test_backwards_difference_encoder_preserve_dimension_4(self):
    """Both unknown ('D') and missing (None) test values map to all-zero contrasts."""
    train = ['A', 'B', 'C']
    test = ['D', 'B', 'C', None]

    enc = encoders.BackwardDifferenceEncoder(handle_unknown='value',
                                             handle_missing='value')
    enc.fit(train)
    got = enc.transform(test)

    expected = [
        [1, 0, 0],
        [1, 1 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, 2 / 3.0],
        [1, 0, 0],
    ]
    self.assertEqual(got.values.tolist(), expected)
def test_backward_difference_np(self):
    """Numpy-array input must fit and transform to fully numeric output."""
    train = self.create_array(n_rows=1000)
    test = self.create_array(n_rows=100)

    enc = encoders.BackwardDifferenceEncoder(verbose=1)
    enc.fit(train, None)
    self.verify_numeric(enc.transform(test))
def main(params, inputs, outputs):
    """Pipeline node: backward-difference encode the configured columns.

    Parameters
    ----------
    params : object
        Carries `columns`, a comma-separated string of column names.
    inputs : object
        Carries `data`, the path of the input pickle.
    outputs : object
        Carries `data_new`, the path for the encoded pickle.
    """
    # str.split already returns a list; the original wrapped it in a
    # redundant comprehension ([col for col in s.split(",")]).
    columns = params.columns.split(",")

    frame = pd.read_pickle(inputs.data)
    encoder = ce.BackwardDifferenceEncoder(cols=columns)
    encoded = encoder.fit_transform(frame)
    encoded.to_pickle(outputs.data_new)
def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self):
    """A label first seen at transform time ('C') lands in the indicator contrast."""
    train = ['A', 'B']
    test = ['A', 'B', 'C']

    enc = encoders.BackwardDifferenceEncoder(handle_unknown='indicator')
    enc.fit(train)
    result = enc.transform(test)

    expected = [
        [1, -2 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, -1 / 3.0],
        [1, 1 / 3.0, 2 / 3.0],
    ]
    self.assertEqual(result.values.tolist(), expected)
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the category_encoders encoder to build
    :param columns_name: list of feature (column) names to encode
    """
    # Dispatch table replacing the original if/elif chain; every entry
    # is constructed as EncoderClass(cols=columns_name).
    encoder_classes = {
        "BackwardDe": ce.BackwardDifferenceEncoder,  # backward difference coding
        "BaseN": ce.BaseNEncoder,                    # base-N coding
        "Binary": ce.BinaryEncoder,                  # binary coding
        "Catboost": ce.CatBoostEncoder,
        "Hash": ce.HashingEncoder,
        "Helmert": ce.HelmertEncoder,
        "JamesStein": ce.JamesSteinEncoder,
        "LOO": ce.LeaveOneOutEncoder,                # leave-one-out coding
        "ME": ce.MEstimateEncoder,                   # M-estimate encoder
        "OneHot": ce.OneHotEncoder,
        "OridinalEncoder": ce.OrdinalEncoder,        # ordinal coding (key kept as-is)
        "Sum": ce.SumEncoder,                        # sum coding
        "Polynomial": ce.PolynomialEncoder,          # polynomial coding
        "Target": ce.TargetEncoder,                  # target coding
        "WOE": ce.WOEEncoder,                        # weight-of-evidence coding
    }
    if encoder_type not in encoder_classes:
        raise ValueError("请选择正确的编码方式")
    self.encoder = encoder_classes[encoder_type](cols=columns_name)
def test_backwards_difference_encoder_2StringCols_ExpectCorrectOrder(self):
    """Encoded output keeps the original column order; string columns expand in place."""
    train = pd.DataFrame(
        {
            'col1': [1, 2, 3, 4],
            'col2': ['A', 'B', 'C', 'D'],
            'col3': [1, 2, 3, 4],
            'col4': ['A', 'B', 'C', 'A'],
        },
        columns=['col1', 'col2', 'col3', 'col4'],
    )
    expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2',
                        'col3', 'col4_0', 'col4_1']

    enc = encoders.BackwardDifferenceEncoder(handle_unknown='value',
                                             handle_missing='value')
    enc.fit(train)
    got = enc.transform(train).columns.values

    self.assertTrue(np.array_equal(expected_columns, got))
def test_backward_difference(self):
    """Every dtype produced by fit_transform must be numeric (int or float).

    :return:
    """
    cols = ['C1', 'D', 'E', 'F']
    enc = encoders.BackwardDifferenceEncoder(verbose=1, cols=cols)
    X = self.create_dataset(n_rows=1000)
    X_test = enc.fit_transform(X, None)
    for dt in X_test.dtypes:
        # Same predicate as the original flag variable, asserted directly.
        self.assertTrue(dt == int or dt == float)
def fit(self, X, y=None):
    """Instantiate the encoder selected by self.type, fit it on (X, y), return self."""
    # Lazy factories so only the selected encoder is ever constructed,
    # matching the original if-chain's behavior exactly.
    builders = {
        'backdiff': lambda: ce.BackwardDifferenceEncoder(handle_unknown='ignore'),
        'binenc': lambda: ce.BinaryEncoder(handle_unknown='impute'),
        'hashenc': lambda: ce.HashingEncoder(),
        'helmenc': lambda: ce.HelmertEncoder(handle_unknown='impute'),
        'onehot': lambda: ce.OneHotEncoder(handle_unknown='ignore'),
        'ordenc': lambda: ce.OrdinalEncoder(handle_unknown='impute'),
        'sumenc': lambda: ce.SumEncoder(handle_unknown='ignore'),
        'polyenc': lambda: ce.PolynomialEncoder(handle_unknown='impute'),
    }
    build = builders.get(self.type)
    # Unknown type leaves self.encoder untouched, as in the original.
    if build is not None:
        self.encoder = build()
    self.encoder.fit(X, y)
    return self
# solar.flare2.arff Medium impact # soybean.arff Large impact # sick.arff # spectrometer.arff Medium impact (contains an ID) # sponge.arff Large impact # tic-tac-toe.arff # trains.arff Medium impact (tiny dataset -> with high variance) datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff', 'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff', 'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff', 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff'] # We painstakingly initialize each encoder here because that gives us the freedom to initialize the # encoders with any setting we want. encoders = [category_encoders.BackwardDifferenceEncoder(), category_encoders.BaseNEncoder(), category_encoders.BinaryEncoder(), category_encoders.HashingEncoder(), category_encoders.HelmertEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.LeaveOneOutEncoder(), category_encoders.MEstimateEncoder(), category_encoders.OneHotEncoder(), category_encoders.OrdinalEncoder(), category_encoders.PolynomialEncoder(), category_encoders.SumEncoder(), category_encoders.TargetEncoder(), category_encoders.WOEEncoder()] # Initialization
def pipeline(df, target, cat_columns, models):
    """Benchmark categorical-encoding strategies for every model in `models`.

    For each model: run a baseline with no per-column encoding, then for each
    categorical column try pandas one-hot, a battery of category_encoders /
    custom encoders, and several bayesian target encoders, recording RMSE,
    MAE and fit time (absolute and relative to the baseline) via the
    module-level helpers `model(...)` and `_append_metric(...)`.

    Returns a pandas DataFrame with one row per (model, column, encoder) run.
    """
    n_rows, n_cols = df.shape
    # Accumulator: each key becomes one column of the result DataFrame.
    metrics = {
        "n_rows": [],
        "n_cols": [],
        "cardinality": [],
        "model": [],
        "column": [],
        "encoder": [],
        "rmse": [],
        "mae": [],
        "fit_time": [],
        "rmse_change": [],
        "mae_change": [],
        "fit_time_change": [],
    }
    columns = cat_columns
    for model_name in models:
        # Baseline run (no encoder, no column): the reference point for the
        # *_change metrics of every later run with this model.
        base_rmse, base_mae, base_fit_time = model(
            df=df,
            target=target,
            encoder=np.nan,
            col=np.nan,
            model_name=model_name,
            encoder_type="basic",
            encoder_name=[],
        )
        _append_metric(
            row_list=metrics,
            n_rows=n_rows,
            n_cols=n_cols,
            cardinality=np.nan,
            model_name=model_name,
            column=np.nan,
            name="basic",
            rmse=base_rmse,
            mae=base_mae,
            fit_time=base_fit_time,
            base_rmse=base_rmse,
            base_mae=base_mae,
            base_fit_time=base_fit_time,
        )
        for column in columns:
            print()
            print(column)
            cardinality = df[column].nunique()
            # pandas one-hot (handled inside model() via encoder_type="basic").
            print("ohe")
            rmse, mae, fit_time = model(
                df=df,
                target=target,
                encoder=np.nan,
                col=column,
                model_name=model_name,
                encoder_type="basic",
                encoder_name="One Hot Encoder (pd.dummies)",
            )
            _append_metric(
                row_list=metrics,
                n_rows=n_rows,
                n_cols=n_cols,
                cardinality=cardinality,
                model_name=model_name,
                column=column,
                name="One Hot Encoder (pd.dummies)",
                rmse=rmse,
                mae=mae,
                fit_time=fit_time,
                base_rmse=base_rmse,
                base_mae=base_mae,
                base_fit_time=base_fit_time,
            )
            # sklearn-style encoder instances, built fresh per column.
            encoders = [
                ("Sum Encoder(sleepmind)", SumEncoder()),
                ("BinaryEncoder", ce.BinaryEncoder(cols=[column])),
                ("HashingEncoder", ce.HashingEncoder(cols=[column])),
                ("OneHotEncoder", ce.OneHotEncoder(cols=[column])),
                ("OrdinalEncoder", ce.OrdinalEncoder(cols=[column])),
                ("BaseNEncoder", ce.BaseNEncoder(cols=[column])),
                (
                    "BackwardDifferenceEncoder",
                    ce.BackwardDifferenceEncoder(cols=[column]),
                ),
                ("HelmertEncoder", ce.HelmertEncoder(cols=[column])),
                ("SumEncoder", ce.SumEncoder(cols=[column])),
                ("PolynomialEncoder", ce.PolynomialEncoder(cols=[column])),
                ("TargetEncoder", ce.TargetEncoder(cols=[column])),
                ("LeaveOneOutEncoder", ce.LeaveOneOutEncoder(cols=[column])),
                (
                    "XAM_bayesian_targetEncoder",
                    BayesianTargetEncoder(columns=[column], prior_weight=3, suffix=""),
                ),
            ]
            for name, encoder in encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=encoder,
                    col=column,
                    model_name=model_name,
                    encoder_type="sklearn_encoding",
                    encoder_name=name,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )
            # Bayesian / high-cardinality encoders: passed as callables
            # (uninstantiated) and flagged with hcc_ind=1 for model().
            bayes_encoders = [
                ("hcc_BayesEncoding", BayesEncoding),
                ("hcc_BayesEncodingKfold", BayesEncodingKfold),
                ("LOOEncoding", LOOEncoding),
                ("LOOEncodingKfold", LOOEncodingKfold),
            ]
            for name, bayes_encoder in bayes_encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=bayes_encoder,
                    col=column,
                    model_name=model_name,
                    encoder_name=name,
                    encoder_type="basic",
                    hcc_ind=1,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )
    results = pd.DataFrame(metrics)
    return results
# X0 = X0.rename(index=str, columns={ # "diameter": "x1", # "color": "x2" # }) y = DF_ORIGINAL['ct_wins'] X1 = DF_ORIGINAL.drop(['ct_wins', 't_wins'], axis=1) X0 = renameX(X1) encoder = ce.OneHotEncoder(cols=nominal_columns).fit(X0, y) X = encoder.transform(X0) writeFile(describeData(X, 'OneHot')) X = pd.get_dummies(X0) writeFile(describeData(X, 'Dummy')) encoder = ce.BackwardDifferenceEncoder(cols=nominal_columns).fit(X0, y) X = encoder.transform(X0) X = X.drop(['intercept'], axis=1) writeFile(describeData(X, 'BackwardDifference')) encoder = ce.BaseNEncoder(base=3, cols=nominal_columns).fit(X0, y) X = encoder.transform(X0) writeFile(describeData(X, 'BaseN')) encoder = ce.BinaryEncoder(cols=nominal_columns).fit(X0, y) X = encoder.transform(X0) writeFile(describeData(X, 'Binary')) encoder = ce.HelmertEncoder(cols=nominal_columns).fit(X0, y) X = encoder.transform(X0) X.drop(['intercept'], inplace=True, axis=1)
def __init__(self, name: str):
    """Register a backward-difference encoded feature for column `name`.

    Delegates to the base class with the '_bde_' name infix and the
    [-1.0, 1.0] value range used by this feature family.
    """
    encoder = ce.BackwardDifferenceEncoder(cols=[name])
    super().__init__(name, '_bde_', encoder, -1.0, 1.0)
def get_factors(model, df, fnum, fname, nvalues, dtype, encoder, rounding,
                sentinel):
    r"""Convert the original feature to a factor.

    Parameters
    ----------
    model : alphapy.Model
        Model object with the feature specifications.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    fnum : int
        Feature number, strictly for logging purposes
    fname : str
        Name of the text column in the dataframe ``df``.
    nvalues : int
        The number of unique values.
    dtype : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    encoder : alphapy.features.Encoders
        Type of encoder to apply.
    rounding : int
        Number of places to round.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    all_features : numpy array
        The features that have been transformed to factors.

    Raises
    ------
    ValueError
        If ``encoder`` is not one of the supported Encoders members.
    RuntimeError
        If no encoding strategy produced any output.
    """
    logger.info("Feature %d: %s is a factor of type %s with %d unique values",
                fnum, fname, dtype, nvalues)
    logger.info("Encoding: %s", encoder)

    # Extract model data
    feature_map = model.feature_map
    model_type = model.specs['model_type']
    target_value = model.specs['target_value']

    # get feature
    feature = df[fname]

    # convert float to factor: bucket continuous values by rounding so a
    # categorical encoder can be applied to them.
    if dtype == 'float64':
        logger.info("Rounding: %d", rounding)
        feature = feature.apply(float_factor, args=[rounding])

    # encoders: either a pandas-native strategy fills `pd_features`, or a
    # category_encoders instance is assigned to `enc` and fitted below.
    pd_features = pd.DataFrame()
    enc = None
    ef = pd.DataFrame(feature)
    if encoder == Encoders.factorize:
        pd_factors = pd.factorize(feature)[0]
        pd_features = pd.DataFrame(pd_factors)
    elif encoder == Encoders.onehot:
        pd_features = pd.get_dummies(feature)
    elif encoder == Encoders.ordinal:
        enc = ce.OrdinalEncoder(cols=[fname])
    elif encoder == Encoders.binary:
        enc = ce.BinaryEncoder(cols=[fname])
    elif encoder == Encoders.helmert:
        enc = ce.HelmertEncoder(cols=[fname])
    elif encoder == Encoders.sumcont:
        enc = ce.SumEncoder(cols=[fname])
    elif encoder == Encoders.polynomial:
        enc = ce.PolynomialEncoder(cols=[fname])
    elif encoder == Encoders.backdiff:
        enc = ce.BackwardDifferenceEncoder(cols=[fname])
    else:
        raise ValueError("Unknown Encoder %s" % encoder)

    # If encoding worked, calculate target percentages for classifiers.
    pd_exists = not pd_features.empty
    enc_exists = enc is not None
    all_features = None
    if pd_exists or enc_exists:
        if pd_exists:
            all_features = pd_features
        elif enc_exists:
            all_features = enc.fit_transform(ef, None)
        # Calculate target percentages for factors
        if (model_type == ModelType.classification
                and fname in feature_map['crosstabs']):
            # Get the crosstab for this feature
            ct = feature_map['crosstabs'][fname]
            # map target percentages to the new feature
            ct_map = ct.to_dict()[target_value]
            ct_feature = df[[fname]].applymap(ct_map.get)
            # impute sentinel for any values that could not be mapped
            ct_feature.fillna(value=sentinel, inplace=True)
            # concatenate all generated features
            all_features = np.column_stack((all_features, ct_feature))
            logger.info("Applied target percentages for %s", fname)
    else:
        raise RuntimeError("Encoding for feature %s failed" % fname)
    return all_features
def __init__(
    self,
    encoder_name,
    reduction_method=None,
    ngram_range=(2, 4),
    categories="auto",
    dtype=np.float64,
    handle_unknown="ignore",
    clf_type=None,
    n_components=None,
):
    """Configure a benchmark wrapper that selects an encoder by name.

    Builds `self.encoders_dict`, mapping encoder names to ready-to-fit
    instances (or pipelines), and `self.list_1D_array_methods`, the names
    whose implementations expect 1-D array input.

    Parameters are stored as-is; `encoder_name` picks the entry used later,
    `ngram_range`/`n_components`/`clf_type` parameterize the entries below.
    """
    self.ngram_range = ngram_range
    self.encoder_name = encoder_name
    self.categories = categories
    self.dtype = dtype
    self.clf_type = clf_type
    self.handle_unknown = handle_unknown
    self.reduction_method = reduction_method
    self.n_components = n_components
    # NOTE: every entry is instantiated eagerly here, even though only
    # the one named by `encoder_name` will be used.
    self.encoders_dict = {
        "OneHotEncoder": OneHotEncoder(handle_unknown="ignore"),
        "OneHotEncoder-1": OneHotEncoderRemoveOne(handle_unknown="ignore"),
        "Categorical": None,
        "OneHotEncoderDense": OneHotEncoder(handle_unknown="ignore",
                                            sparse=False),
        "OneHotEncoderDense-1": OneHotEncoderRemoveOne(handle_unknown="ignore",
                                                       sparse=False),
        "SimilarityEncoder": SimilarityEncoder(ngram_range=self.ngram_range,
                                               random_state=10),
        "NgramNaiveFisherKernel": NgramNaiveFisherKernel(
            ngram_range=self.ngram_range, random_state=10),
        "ngrams_hot_vectorizer": [],
        "NgramsCountVectorizer": CountVectorizer(analyzer="char",
                                                 ngram_range=self.ngram_range),
        "NgramsTfIdfVectorizer": TfidfVectorizer(analyzer="char",
                                                 ngram_range=self.ngram_range,
                                                 smooth_idf=False),
        "WordNgramsTfIdfVectorizer": TfidfVectorizer(analyzer="word",
                                                     ngram_range=(1, 1),
                                                     smooth_idf=False),
        "TargetEncoder": TargetEncoder(clf_type=self.clf_type,
                                       handle_unknown="ignore"),
        "MDVEncoder": MDVEncoder(self.clf_type),
        # Plain category_encoders baselines, all with default settings.
        "BackwardDifferenceEncoder": cat_enc.BackwardDifferenceEncoder(),
        "BinaryEncoder": cat_enc.BinaryEncoder(),
        "HashingEncoder": cat_enc.HashingEncoder(),
        "HelmertEncoder": cat_enc.HelmertEncoder(),
        "SumEncoder": cat_enc.SumEncoder(),
        "PolynomialEncoder": cat_enc.PolynomialEncoder(),
        "BaseNEncoder": cat_enc.BaseNEncoder(),
        "LeaveOneOutEncoder": cat_enc.LeaveOneOutEncoder(),
        # Topic-model pipelines over character / word n-gram counts.
        "NgramsLDA": Pipeline([
            (
                "ngrams_count",
                CountVectorizer(analyzer="char",
                                ngram_range=self.ngram_range),
            ),
            (
                "LDA",
                LatentDirichletAllocation(n_components=self.n_components,
                                          learning_method="batch"),
            ),
        ]),
        "NMF": Pipeline([
            (
                "ngrams_count",
                CountVectorizer(analyzer="char",
                                ngram_range=self.ngram_range),
            ),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "WordNMF": Pipeline([
            ("ngrams_count",
             CountVectorizer(analyzer="word", ngram_range=(1, 1))),
            ("NMF", NMF(n_components=self.n_components)),
        ]),
        "NgramsMultinomialMixture":
        NgramsMultinomialMixture(n_topics=self.n_components, max_iters=10),
        "AdHocNgramsMultinomialMixture":
        AdHocNgramsMultinomialMixture(n_iters=0),
        "AdHocIndependentPDF": AdHocIndependentPDF(),
        # Gamma-Poisson factorization variants: numbered entries differ in
        # rho/r schedule, batch size, and k-means init as parameterized below.
        "OnlineGammaPoissonFactorization":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            rho=0.99,
            r=None,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=10,
        ),
        "OnlineGammaPoissonFactorization2":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            rho=None,
            batch_size=256,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=20,
        ),
        "OnlineGammaPoissonFactorization3":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            rho=None,
            batch_size=256,
            tol=1e-4,
            random_state=18,
            init="k-means",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=20,
        ),
        "OnlineGammaPoissonFactorization4":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=None,
            rho=0.95,
            batch_size=256,
            tol=1e-4,
            random_state=18,
            init="k-means",
            ngram_range=self.ngram_range,
            rescale_W=True,
            max_iter_e_step=20,
        ),
        "WordOnlineGammaPoissonFactorization":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            ngram_range=(1, 1),
            analizer="word",
            rescale_W=True,
            max_iter_e_step=10,
        ),
        "OnlineGammaPoissonFactorization_fast":
        gamma_poisson_factorization.OnlineGammaPoissonFactorization(
            n_topics=self.n_components,
            r=0.3,
            ngram_range=(3, 3),
            max_iter=1,
            min_iter=1,
            tol=1e-4,
            random_state=18,
            init="k-means++",
            rescale_W=False,
        ),
        "MinHashEncoder": MinHashEncoder(n_components=self.n_components),
        "PretrainedFastText":
        PretrainedFastText(n_components=self.n_components),
        "PretrainedFastText_fr":
        PretrainedFastText(n_components=self.n_components,
                           language="french"),
        "PretrainedFastText_hu":
        PretrainedFastText(n_components=self.n_components,
                           language="hungarian"),
        None: FunctionTransformer(None, validate=True),
        "Passthrough": PasstroughEncoder(),
    }
    # Encoder names whose implementations expect 1-D array input.
    self.list_1D_array_methods = [
        "NgramsCountVectorizer",
        "NgramsTfIdfVectorizer",
        "WordNgramsTfIdfVectorizer",
        "ngrams_hot_vectorizer",
        "NgramsLDA",
        "NMF",
        "WordNMF",
        "NgramsMultinomialMixture",
        "NgramsMultinomialMixtureKMeans2",
        "AdHocNgramsMultinomialMixture",
        "AdHocIndependentPDF",
        "GammaPoissonFactorization",
        "OnlineGammaPoissonFactorization",
        "WordOnlineGammaPoissonFactorization",
        "OnlineGammaPoissonFactorization2",
        "OnlineGammaPoissonFactorization3",
        "OnlineGammaPoissonFactorization4",
        "OnlineGammaPoissonFactorization_fast",
        "MinHashEncoder",
        "MinMeanMinHashEncoder",
    ]
def get_model(PARAMS):
    """return model for provided params

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline, or None when construction fails
    :rtype: sklearn pipeline
    """
    try:
        # Available target-encoder implementations, selected per
        # categorical column via the PARAMS['te_*'] keys below.
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                # Numeric passthrough / imputation.
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                # One configurable target encoder per categorical column.
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']),
                 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                # TF-IDF over the two free-text fields.
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1, PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']),
                 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1, PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            # Regress on log1p(target), invert with expm1 at predict time.
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe
    except Exception as e:
        # Fix: was `except BaseException`, which also swallowed
        # KeyboardInterrupt and SystemExit. Exception is the broadest
        # class that should be logged and suppressed here.
        LOG.error(e)
        return None
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the 'carrier' column with sklearn's LabelBinarizer and
# append the binarized columns to the original frame.
lb = LabelBinarizer()
lb_results = lb.fit_transform(cat_df_flights_onehot_sklearn['carrier'])
lb_results_df = pd.DataFrame(lb_results, columns=lb.classes_)
print(lb_results_df.head())
result_df = pd.concat([cat_df_flights_onehot_sklearn, lb_results_df], axis=1)
print(result_df.head())

# Same column encoded with category_encoders: binary, then backward
# difference, each on a fresh copy of the flights frame.
cat_df_flights_ce = cat_df_flights.copy()
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['carrier'])
df_binary = encoder.fit_transform(cat_df_flights_ce)
df_binary.head()

encoder = ce.BackwardDifferenceEncoder(cols=['carrier'])
df_bd = encoder.fit_transform(cat_df_flights_ce)
df_bd.head()

# Manual handling of a binned numeric feature: split '0-20' style ranges
# into start/end columns...
dummy_df_age = pd.DataFrame({'age': ['0-20', '20-40', '40-60', '60-80']})
dummy_df_age['start'], dummy_df_age['end'] = zip(
    *dummy_df_age['age'].map(lambda x: x.split('-')))
dummy_df_age.head()

# ...or collapse each range to its midpoint.
dummy_df_age = pd.DataFrame({'age': ['0-20', '20-40', '40-60', '60-80']})

def split_mean(x):
    """Return the midpoint of a 'lo-hi' range string."""
    split_list = x.split('-')
    mean = (float(split_list[0]) + float(split_list[1])) / 2
    return mean

dummy_df_age['age_mean'] = dummy_df_age['age'].apply(lambda x: split_mean(x))
dummy_df_age.head()
def encode_all(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
    """Encode categorical columns of the train/validation/test frames.

    For each column in `encoder_to_use` (a mapping column -> encoder name),
    either drops the column ('ColumnDropper') or fits the named
    category_encoders encoder on `df` (target: df['set_clicked']) and
    transforms `df`, `dfv` and `dfk` with it. Unrecognized names (e.g. the
    disabled 'HashingEncoder') leave the frames untouched, as before.

    Bug fix vs. the original: the MEstimateEncoder branch used to skip
    transforming `dfk`; every encoder now transforms all three frames.

    Returns the three transformed frames plus a dict of the fitted encoder
    (or the string 'ColumnDropper') per column.
    """

    def build_encoder(kind, col):
        # One factory per supported encoder name; all share the
        # cols/return_df/drop_invariant/handle_missing base settings.
        factories = {
            'BackwardDifferenceEncoder': lambda: ce.BackwardDifferenceEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing),
            'BaseNEncoder': lambda: ce.BaseNEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, base=3),
            'BinaryEncoder': lambda: ce.BinaryEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing),
            'CatBoostEncoder': lambda: ce.CatBoostEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, sigma=None, a=2),
            # 'HashingEncoder' intentionally absent: it was commented out
            # in the original implementation.
            'HelmertEncoder': lambda: ce.HelmertEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing),
            'JamesSteinEncoder': lambda: ce.JamesSteinEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, model='binary'),
            'LeaveOneOutEncoder': lambda: ce.LeaveOneOutEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, sigma=None),
            'MEstimateEncoder': lambda: ce.MEstimateEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, randomized=True,
                sigma=None, m=2),
            'OneHotEncoder': lambda: ce.OneHotEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, use_cat_names=True),
            'OrdinalEncoder': lambda: ce.OrdinalEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing),
            'SumEncoder': lambda: ce.SumEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing),
            'PolynomialEncoder': lambda: ce.PolynomialEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing),
            'TargetEncoder': lambda: ce.TargetEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, min_samples_leaf=10,
                smoothing=5),
            'WOEEncoder': lambda: ce.WOEEncoder(
                cols=[col], return_df=1, drop_invariant=1,
                handle_missing=handle_missing, randomized=True, sigma=None),
        }
        maker = factories.get(kind)
        return maker() if maker is not None else None

    encoders_used = {}
    for col in encoder_to_use:
        kind = encoder_to_use[col]
        if kind == 'ColumnDropper':
            df = df.drop(columns=col)
            dfv = dfv.drop(columns=col)
            dfk = dfk.drop(columns=col)
            encoders_used[col] = 'ColumnDropper'
        else:
            encoder = build_encoder(kind, col)
            if encoder is not None:
                encoder.fit(X=df, y=df['set_clicked'])
                # Uniformly transform all three frames (this is where the
                # original MEstimateEncoder branch dropped dfk).
                df = encoder.transform(df)
                dfv = encoder.transform(dfv)
                dfk = encoder.transform(dfk)
                encoders_used[col] = encoder
        # print("Encoding done for - ",col)
        print("Completed encoder - ", datetime.datetime.now())
    return df, dfv, dfk, encoders_used
def backward_encode(cat_data):
    """Backward-difference encode every column of `cat_data` and return the encoded frame."""
    all_columns = list(cat_data.columns)
    encoder = ce.BackwardDifferenceEncoder(cols=all_columns)
    return encoder.fit_transform(cat_data)
def __init__(self, encoder_name, reduction_method=None, ngram_range=(2, 4),
             categories='auto', dtype=np.float64, handle_unknown='ignore',
             clf_type=None, n_components=None):
    """Store the configuration and build the registry of candidate encoders.

    Parameters
    ----------
    encoder_name : str or None
        Key into ``self.encoders_dict`` selecting the encoder to use
        (``None`` maps to an identity ``FunctionTransformer``).
    reduction_method : optional
        Dimensionality-reduction method; only stored here.
    ngram_range : tuple of int
        Character/word n-gram range forwarded to the n-gram based encoders.
    categories : str
        Category-handling mode; only stored here.
    dtype : numpy dtype
        Requested output dtype; only stored here.
    handle_unknown : str
        Unknown-category policy; only stored here (several registry entries
        hard-code ``handle_unknown='ignore'`` regardless of this value).
    clf_type : optional
        Classifier type forwarded to ``TargetEncoder`` and ``MDVEncoder``.
    n_components : int, optional
        Number of components/topics for the factorization-based encoders.
    """
    # Plain attribute copies of the constructor arguments.
    self.ngram_range = ngram_range
    self.encoder_name = encoder_name
    self.categories = categories
    self.dtype = dtype
    self.clf_type = clf_type
    self.handle_unknown = handle_unknown
    self.reduction_method = reduction_method
    self.n_components = n_components
    # Registry mapping encoder_name -> (unfitted) encoder instance.
    # All instances are built eagerly here, even though only one is used.
    self.encoders_dict = {
        # One-hot variants (sparse by default; *Dense* forces dense output,
        # *-1 variants drop one reference column).
        'OneHotEncoder': OneHotEncoder(handle_unknown='ignore'),
        'OneHotEncoder-1': OneHotEncoderRemoveOne(handle_unknown='ignore'),
        'Categorical': None,
        'OneHotEncoderDense': OneHotEncoder(handle_unknown='ignore',
                                            sparse=False),
        'OneHotEncoderDense-1': OneHotEncoderRemoveOne(
            handle_unknown='ignore', sparse=False),
        # String-similarity based encoders (project-local).
        'SimilarityEncoder': SimilarityEncoder(ngram_range=self.ngram_range,
                                               random_state=10),
        'NgramNaiveFisherKernel': NgramNaiveFisherKernel(
            ngram_range=self.ngram_range, random_state=10),
        # Placeholder list; populated elsewhere — confirm against the
        # class's fit/transform code.
        'ngrams_hot_vectorizer': [],
        # Bag-of-n-grams text vectorizers.
        'NgramsCountVectorizer': CountVectorizer(
            analyzer='char', ngram_range=self.ngram_range),
        'NgramsTfIdfVectorizer': TfidfVectorizer(
            analyzer='char', ngram_range=self.ngram_range, smooth_idf=False),
        'WordNgramsTfIdfVectorizer': TfidfVectorizer(
            analyzer='word', ngram_range=(1, 1), smooth_idf=False),
        # Target-statistics encoders (project-local).
        'TargetEncoder': TargetEncoder(clf_type=self.clf_type,
                                       handle_unknown='ignore'),
        'MDVEncoder': MDVEncoder(self.clf_type),
        # Encoders from the category_encoders package (imported as cat_enc),
        # all with default settings.
        'BackwardDifferenceEncoder': cat_enc.BackwardDifferenceEncoder(),
        'BinaryEncoder': cat_enc.BinaryEncoder(),
        'HashingEncoder': cat_enc.HashingEncoder(),
        'HelmertEncoder': cat_enc.HelmertEncoder(),
        'SumEncoder': cat_enc.SumEncoder(),
        'PolynomialEncoder': cat_enc.PolynomialEncoder(),
        'BaseNEncoder': cat_enc.BaseNEncoder(),
        'LeaveOneOutEncoder': cat_enc.LeaveOneOutEncoder(),
        # Topic-model pipelines: char/word n-gram counts followed by a
        # decomposition with n_components topics.
        'NgramsLDA': Pipeline([
            ('ngrams_count', CountVectorizer(analyzer='char',
                                             ngram_range=self.ngram_range)),
            ('LDA', LatentDirichletAllocation(
                n_components=self.n_components, learning_method='batch'),)
        ]),
        'NMF': Pipeline([
            ('ngrams_count', CountVectorizer(analyzer='char',
                                             ngram_range=self.ngram_range)),
            ('NMF', NMF(n_components=self.n_components))
        ]),
        'WordNMF': Pipeline([
            ('ngrams_count', CountVectorizer(analyzer='word',
                                             ngram_range=(1, 1))),
            ('NMF', NMF(n_components=self.n_components))
        ]),
        # Mixture-model encoders (project-local).
        'NgramsMultinomialMixture': NgramsMultinomialMixture(
            n_topics=self.n_components, max_iters=10),
        'AdHocNgramsMultinomialMixture': AdHocNgramsMultinomialMixture(
            n_iters=0),
        'AdHocIndependentPDF': AdHocIndependentPDF(),
        # Gamma-Poisson factorization variants; they differ only in the
        # r/rho regularization pair, batch size, init scheme and E-step cap.
        'OnlineGammaPoissonFactorization':
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, rho=.99, r=None, tol=1e-4,
                random_state=18, init='k-means++',
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=10),
        'OnlineGammaPoissonFactorization2':
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=.3, rho=None, batch_size=256,
                tol=1e-4, random_state=18, init='k-means++',
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=20),
        'OnlineGammaPoissonFactorization3':
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=.3, rho=None, batch_size=256,
                tol=1e-4, random_state=18, init='k-means',
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=20),
        'OnlineGammaPoissonFactorization4':
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=None, rho=.95, batch_size=256,
                tol=1e-4, random_state=18, init='k-means',
                ngram_range=self.ngram_range, rescale_W=True,
                max_iter_e_step=20),
        # NOTE(review): 'analizer' looks like a typo for 'analyzer' — if
        # OnlineGammaPoissonFactorization does not accept **kwargs this
        # raises TypeError; confirm against its signature before fixing.
        'WordOnlineGammaPoissonFactorization':
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=.3, tol=1e-4, random_state=18,
                init='k-means++', ngram_range=(1, 1), analizer='word',
                rescale_W=True, max_iter_e_step=10),
        'OnlineGammaPoissonFactorization_fast':
            gamma_poisson_factorization.OnlineGammaPoissonFactorization(
                n_topics=self.n_components, r=.3, ngram_range=(3, 3),
                max_iter=1, min_iter=1, tol=1e-4, random_state=18,
                init='k-means++', rescale_W=False),
        'MinHashEncoder': MinHashEncoder(n_components=self.n_components),
        # Pretrained fastText embeddings (default / French / Hungarian).
        'PretrainedFastText': PretrainedFastText(
            n_components=self.n_components),
        'PretrainedFastText_fr': PretrainedFastText(
            n_components=self.n_components, language='french'),
        'PretrainedFastText_hu': PretrainedFastText(
            n_components=self.n_components, language='hungarian'),
        # encoder_name=None -> identity transform.
        None: FunctionTransformer(None, validate=True),
        'Passthrough': PasstroughEncoder(),
    }
    # Encoders whose transform expects a 1-D array of raw strings rather
    # than a 2-D column matrix.
    # NOTE(review): 'NgramsMultinomialMixtureKMeans2', 'GammaPoissonFactorization'
    # and 'MinMeanMinHashEncoder' have no matching key in encoders_dict above —
    # possibly stale entries; confirm before relying on this list.
    self.list_1D_array_methods = [
        'NgramsCountVectorizer',
        'NgramsTfIdfVectorizer',
        'WordNgramsTfIdfVectorizer',
        'ngrams_hot_vectorizer',
        'NgramsLDA',
        'NMF',
        'WordNMF',
        'NgramsMultinomialMixture',
        'NgramsMultinomialMixtureKMeans2',
        'AdHocNgramsMultinomialMixture',
        'AdHocIndependentPDF',
        'GammaPoissonFactorization',
        'OnlineGammaPoissonFactorization',
        'WordOnlineGammaPoissonFactorization',
        'OnlineGammaPoissonFactorization2',
        'OnlineGammaPoissonFactorization3',
        'OnlineGammaPoissonFactorization4',
        'OnlineGammaPoissonFactorization_fast',
        'MinHashEncoder',
        'MinMeanMinHashEncoder',
    ]
features = dataset
target = _

"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

# Registry of candidate categorical encoders, keyed by short name.
# Insertion order matters only for reporting; behavior is per-encoder.
Encoders = {}
Encoders['Ordinal'] = ce.OrdinalEncoder()
Encoders['Polynomial'] = ce.PolynomialEncoder()
Encoders['OneHot'] = ce.OneHotEncoder()
Encoders['BackwardDifference'] = ce.BackwardDifferenceEncoder()
Encoders['Helmert'] = ce.HelmertEncoder()
Encoders['EntityEmbedding'] = EntityEmbeddingEncoder()
Encoders['TargetEnc'] = ce.TargetEncoder()
Encoders['WOE'] = ce.WOEEncoder()
Encoders['CENG'] = CENGEncoder(verbose=0)
Encoders['GeneticPP'] = GeneticPPEncoder(num_predictors=2)
Encoders['AgingPP'] = AgingPPEncoder(num_predictors=2)
Encoders['SimplePP'] = SimplePPEncoder(num_predictors=2)
Encoders['CESAMOEncoder'] = CESAMOEncoder()

# Supervised encoders require a target column; remove them when the
# dataset provides none (target_flag == 0).
if target_flag == 0:
    for supervised_name in ('EntityEmbedding', 'TargetEnc', 'WOE'):
        del Encoders[supervised_name]