예제 #1
0
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Compare round-trip (fit/transform/inverse_transform) wall-clock time of
    CustomOrdinalFeatureEncoder against sklearn's OrdinalEncoder.

    Args:
        X: 2-D array of categorical values; a small default sample when None.
        iterations: number of timed round trips to average over.
        verbose: when truthy, print the two mean times.

    Returns:
        (custom_encoder_time, ordinal_encoder_time): mean seconds per round trip.
    """
    # BUG FIX: `if not X:` raises "truth value of an array is ambiguous" for
    # any non-empty numpy array passed by the caller; test against None.
    if X is None:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])

    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()

    ordinal_encoder_time = []
    custom_encoder_time = []
    for _ in range(iterations):
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
예제 #2
0
def test_ordinal_encoder_inverse():
    """Round-tripping through OrdinalEncoder restores the original data."""
    data = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    encoded = enc.fit_transform(data)
    expected = np.array(data, dtype=object)
    assert_array_equal(enc.inverse_transform(encoded), expected)

    # a wrong number of columns must raise
    bad = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(bad)
예제 #3
0
def make_prediction():
    """Train a LogisticRegression on Train.csv, predict `netgain` for
    Test.csv, and write the (id, netgain) pairs to Results.csv.

    Features and the target are ordinal-encoded with separate encoders so the
    target labels can be decoded back after prediction.
    """
    oe = OrdinalEncoder()
    oe2 = OrdinalEncoder()
    train_data = pd.read_csv("Train.csv")
    test_data = pd.read_csv("Test.csv")

    model = LogisticRegression()
    train_cols = [col for col in train_data.columns if col != "netgain"]
    X = oe.fit_transform(train_data[train_cols]).astype("int")

    # BUG FIX: the encoder was previously fit on the test set first and then
    # re-fit on the training set, so the test-set codes the model saw at
    # predict time did not match the codes it was trained on. Fit once on
    # train and transform test with the same fitted encoder.
    # NOTE(review): assumes Test.csv has exactly the train feature columns and
    # no unseen categories (default handle_unknown='error') — confirm.
    test_X = oe.transform(test_data).astype("int")

    Y = train_data["netgain"].values.reshape(-1, 1)
    Y = oe2.fit_transform(Y).astype("int")

    # fit the model with the training features and the encoded target
    # (ravel() avoids sklearn's column-vector warning)
    model.fit(X, Y.ravel())

    test_data2 = pd.read_csv("Test.csv")

    test_data2["netgain"] = oe2.inverse_transform(
        model.predict(test_X).astype("int").reshape(-1, 1))
    # Write without the index column directly instead of round-tripping
    # through the file just to delete the numbering column afterwards.
    test_data2[["id", "netgain"]].to_csv("Results.csv", index=False)
예제 #4
0
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas categorical columns."""
    if pd_nan_type == 'pd.NA':
        # pd.NA was introduced in pandas 1.0
        pd = pytest.importorskip('pandas', minversion="1.0")
        missing = pd.NA
    else:  # np.nan
        pd = pytest.importorskip('pandas')
        missing = np.nan

    col = pd.Series(['c', 'a', missing, 'b', 'a'], dtype='category')
    df = pd.DataFrame({'col1': col})

    oe = OrdinalEncoder().fit(df)
    # one feature; sorted categories with the missing marker last
    assert len(oe.categories_) == 1
    cats = oe.categories_[0]
    assert_array_equal(cats[:3], ['a', 'b', 'c'])
    assert np.isnan(cats[-1])

    encoded = oe.transform(df)
    assert_allclose(encoded, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    restored = oe.inverse_transform(encoded)
    assert restored.shape == (5, 1)
    assert_array_equal(restored[:2, 0], ['c', 'a'])
    assert_array_equal(restored[3:, 0], ['b', 'a'])
    assert np.isnan(restored[2, 0])
예제 #5
0
        def set_miss_values(df, complete_index):
            """Fill missing values of one categorical column with predictions
            from a RandomForestRegressor trained on the other columns.

            Args:
                df: source DataFrame; modified in place and also returned.
                complete_index: column names — the FIRST entry is the column
                    whose NaNs are imputed, the remainder are the features.
            """
            enc_label = OrdinalEncoder()
            enc_fea = OrdinalEncoder()
            missing_index = complete_index[0]

            # Take out the existing numerical data (no NaN) and throw them in Random Forest Regressor
            train_df = df[complete_index]
            # split rows by whether the target column is known or missing
            known_values = np.array(train_df[train_df[missing_index].notnull()])
            unknow_values = np.array(train_df[train_df[missing_index].isnull()])

            # y is the known target column, encoded to ordinal codes
            y = known_values[:, 0].reshape(-1, 1)
            enc_label.fit(y)
            y = enc_label.transform(y)

            # X are the features
            X = known_values[:, 1:]
            test_X = unknow_values[:, 1:]
            # fit the feature encoder on train+test rows so transform() below
            # never sees an unknown category
            all_X = np.row_stack((X, test_X))
            enc_fea.fit(all_X)
            X = enc_fea.transform(X)

            # fit
            rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
            rfr.fit(X, y.ravel())
            # predict
            predicted_values = rfr.predict(enc_fea.transform(unknow_values[:, 1:]))
            # NOTE(review): the regressor outputs continuous values while
            # OrdinalEncoder.inverse_transform expects integer category codes;
            # predictions may need rounding/clipping first — confirm.
            predicted_values = enc_label.inverse_transform(predicted_values.reshape(-1, 1))
            # fill in with predicted values
            df.loc[(df[missing_index].isnull()), missing_index] = predicted_values
            return df
예제 #6
0
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas categorical columns."""
    pd = pytest.importorskip("pandas")

    if pd_nan_type == "pd.NA":
        missing = pd.NA
    else:
        missing = np.nan

    frame = pd.DataFrame(
        {
            "col1": pd.Series(["c", "a", missing, "b", "a"], dtype="category"),
        }
    )

    oe = OrdinalEncoder().fit(frame)
    # one feature; sorted categories with the missing marker last
    assert len(oe.categories_) == 1
    assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
    assert np.isnan(oe.categories_[0][-1])

    codes = oe.transform(frame)
    assert_allclose(codes, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    recovered = oe.inverse_transform(codes)
    assert recovered.shape == (5, 1)
    assert_array_equal(recovered[:2, 0], ["c", "a"])
    assert_array_equal(recovered[3:, 0], ["b", "a"])
    assert np.isnan(recovered[2, 0])
예제 #7
0
class SklearnEncoder(object):
    """Thin wrapper unifying sklearn's LabelEncoder, OneHotEncoder and
    OrdinalEncoder behind a single fit/transform interface.

    LabelEncoder operates on a 1-D target `y`, while the other two operate on
    a 2-D feature matrix `X`; the branches below paper over that difference.
    """

    def __init__(self, encoder_type):
        self.encoder_type = encoder_type
        if self.encoder_type == "Label":
            self.encoder_module = LabelEncoder()

        elif self.encoder_type == "OneHot":
            self.encoder_module = OneHotEncoder()

        elif self.encoder_type == "Ordinal":  # ordinal (integer) encoding
            self.encoder_module = OrdinalEncoder()

        else:
            # BUG FIX: an unknown type previously left the instance without an
            # encoder_module, deferring the failure to the first method call.
            raise ValueError(f"unknown encoder_type: {encoder_type!r}")

    def _fit(self, x, y=None):
        if self.encoder_type == "Label":
            self.encoder_module.fit(y=x)
        else:
            self.encoder_module.fit(X=x, y=y)

    def _transform(self, x):
        if self.encoder_type == "Label":
            return self.encoder_module.transform(y=x)
        else:
            return self.encoder_module.transform(X=x)

    def _fit_transform(self, x, y=None):
        if self.encoder_type == "Label":
            return self.encoder_module.fit_transform(y=x)
        else:
            return self.encoder_module.fit_transform(X=x, y=y)

    def _reversal(self, x):  # the exact inverse of _transform
        # BUG FIX: LabelEncoder.inverse_transform takes `y`, not `X`, so the
        # unconditional X= keyword raised TypeError for the "Label" case.
        if self.encoder_type == "Label":
            return self.encoder_module.inverse_transform(y=x)
        return self.encoder_module.inverse_transform(X=x)
예제 #8
0
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Transformer that one-hot encodes categorical data.

    Values are first mapped to ordinal codes, then those codes are one-hot
    encoded; `categories_` exposes the per-column category arrays after
    `transform` has run.
    """
    def __init__(self):
        self.categories_ = None
        self._one_hot_encoder = OneHotEncoder(sparse=False, categories='auto')
        self._ordinal_encoder = OrdinalEncoder()

    def fit(self, X, y=None):  # pylint: disable=invalid-name,unused-argument
        """No-op; present only to satisfy the sklearn estimator API.
        :param X:
        :param y:
        :return: self
        """
        return self

    def transform(self, X):  # pylint: disable=invalid-name,no-self-use
        """Return X with every column expanded into one-hot indicator columns.
        :param X:
        :return:
        """
        codes = self._ordinal_encoder.fit_transform(X)
        self.categories_ = self._ordinal_encoder.categories_
        return self._one_hot_encoder.fit_transform(codes)

    def inverse_transform(self, X):  # pylint: disable=invalid-name,no-self-use
        """Map one-hot encoded rows back to the original categorical values."""
        codes = self._one_hot_encoder.inverse_transform(X)
        return self._ordinal_encoder.inverse_transform(codes)
예제 #9
0
파일: data.py 프로젝트: pbr142/xai
def impute_cat_column(y: pd.Series, X: pd.DataFrame) -> pd.Series:
    """Impute missing values of a categorical pandas Series using a catboost classifier.
    Missing values of categorical features in X are imputed using their mode.

    Args:
        y (pd.Series): Series for which to impute missing values
        X (pd.DataFrame): Features to use for imputation

    Returns:
        pd.Series: copy of y with missing values imputed
    """
    # BUG FIX: work on copies so the caller's X and y are not mutated in place
    # (the original silently mode-filled the caller's X and wrote into y).
    X = X.copy()
    y = y.copy()

    cat_features = X.select_dtypes('object').columns
    X[cat_features] = X[cat_features].fillna(X[cat_features].mode().iloc[0])

    idx_valid = y.notnull()
    y_valid = y[idx_valid]

    # Object-typed targets are ordinal-encoded for the classifier and decoded
    # back after prediction.
    if y.dtype == 'O':
        enc = OrdinalEncoder()
        y_valid = enc.fit_transform(y_valid.values.reshape(-1, 1))

    model = cb.CatBoostClassifier()
    _ = model.fit(X[idx_valid], y_valid, cat_features=cat_features, verbose=0)

    y_pred = model.predict(X[~idx_valid])
    if y.dtype == 'O':
        y_pred = enc.inverse_transform(y_pred)

    # y is a local copy, so no chained-assignment suppression is needed.
    y.loc[~idx_valid] = y_pred.reshape(-1)
    return y
예제 #10
0
def test_ordinal_encoder_sparse():
    """Check that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    """
    dense = np.array([[3, 2, 1], [0, 1, 1]])
    as_sparse = sparse.csr_matrix(dense)

    encoder = OrdinalEncoder()

    err_msg = "A sparse matrix was passed, but dense data is required"
    # both fitting entry points must reject sparse input
    for attempt in (encoder.fit, encoder.fit_transform):
        with pytest.raises(TypeError, match=err_msg):
            attempt(as_sparse)

    transformed = encoder.fit_transform(dense)
    with pytest.raises(TypeError, match=err_msg):
        encoder.inverse_transform(sparse.csr_matrix(transformed))
예제 #11
0
def test_ordinal_encoder_inverse():
    """Round trip through OrdinalEncoder and error on wrong shape."""
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    # `assert_raises_regex` is deprecated in numpy.testing; use the
    # pytest.raises context manager like the sibling tests in this file.
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)
예제 #12
0
def test_ordinal_encoder_handle_unknowns_string():
    """Unknown string categories map to -2 and invert to None."""
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2)
    fit_data = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object)
    new_data = np.array([['c', 'xy'], ['bla', 'y'], ['a', 'x']], dtype=object)
    enc.fit(fit_data)

    encoded = enc.transform(new_data)
    assert_array_equal(
        encoded, np.array([[2, -2], [-2, 1], [0, 0]], dtype='int64'))

    decoded = enc.inverse_transform(encoded)
    assert_array_equal(
        decoded,
        np.array([['c', None], [None, 'y'], ['a', 'x']], dtype=object))
예제 #13
0
def test_ordinal_encoder_handle_unknowns_numeric(dtype):
    """Unknown numeric categories map to -999 and invert to None."""
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
    fit_data = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
    new_data = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
    enc.fit(fit_data)

    encoded = enc.transform(new_data)
    assert_array_equal(
        encoded, np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64"))

    decoded = enc.inverse_transform(encoded)
    assert_array_equal(
        decoded, np.array([[3, None], [None, 8], [1, 7]], dtype=object))
예제 #14
0
def test_ordinal_encoder_handle_unknowns_string():
    """Unknown string categories map to -2 and invert to None."""
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2)
    seen = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
    unseen = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)
    enc.fit(seen)

    codes = enc.transform(unseen)
    expected_codes = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64")
    assert_array_equal(codes, expected_codes)

    recovered = enc.inverse_transform(codes)
    expected_values = np.array(
        [["c", None], [None, "y"], ["a", "x"]], dtype=object)
    assert_array_equal(recovered, expected_values)
예제 #15
0
def test_ordinal_encoder_passthrough_missing_values_float():
    """Test ordinal encoder with nan on float dtypes."""
    X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
    oe = OrdinalEncoder().fit(X)

    # single feature; nan sorts after the real categories
    assert len(oe.categories_) == 1
    assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])

    encoded = oe.transform(X)
    assert_allclose(encoded, [[np.nan], [1.0], [0.0], [1.0]])

    # the round trip restores the original column, nan included
    assert_allclose(oe.inverse_transform(encoded), X)
예제 #16
0
class DataEncoder(object):
    """Wrapper exposing three sklearn encoders (one_hot, label, Ordinal)
    behind one small uniform interface."""

    def __init__(self, encoder_type):
        assert encoder_type in {"one_hot", "label", "Ordinal"}
        self.encoder_type = encoder_type
        if self.encoder_type == "one_hot":  # one-hot (dummy) encoding
            self.encodermodule = OneHotEncoder(categories='auto', drop=None, sparse=True,
                                               dtype=np.float64, handle_unknown='error')
            # categories: "auto" or an explicit list of categories per feature
            # drop: {'first', 'if_binary'}, None, or array[i] to drop column i
            #   ('first' drops the first level of every feature)
            # sparse: return a sparse matrix instead of a dense array
            # handle_unknown: {'error', 'ignore'}, default='error'
        elif self.encoder_type == "label":
            self.encodermodule = LabelEncoder()

        elif self.encoder_type == "Ordinal":  # ordinal (integer) encoding
            self.encodermodule = OrdinalEncoder(categories="auto", dtype=np.float64)
            # `categories` behaves the same as for OneHotEncoder
        else:
            raise ValueError("please select a correct encoder_type")

    def fit_transform(self, data):
        return self.encodermodule.fit_transform(data)

    def fit(self, data):
        self.encodermodule.fit(data)
        return self  # sklearn convention: fit returns the estimator

    def transform(self, data):
        # BUG FIX: the transformed result was computed but never returned.
        return self.encodermodule.transform(data)

    def set_params(self, params):
        self.encodermodule.set_params(**params)

    def get_params(self):
        return self.encodermodule.get_params(deep=True)

    def inverse_transform(self, data):
        return self.encodermodule.inverse_transform(data)

    def get_classes(self):
        # only LabelEncoder exposes classes_
        assert self.encoder_type in {"label"}
        return self.encodermodule.classes_

    def get_category(self):
        # only the 2-D encoders expose categories_ (a list of arrays)
        assert self.encoder_type in {"one_hot", "Ordinal"}
        return self.encodermodule.categories_

    def get_feature_names(self, output_feature):
        """Return the generated dummy-column names (one_hot only)."""
        assert self.encoder_type in {"one_hot"}
        return self.encodermodule.get_feature_names(output_feature)
예제 #17
0
class DecoderXGBoost(BaseEstimator, TransformerMixin):
    """Decode per-class score rows back into their original class-name labels."""

    def __init__(self):
        # Encoder mapping the class names to the numeric values the
        # classifier works with internally.
        self.enc = OrdinalEncoder()
        self.enc.fit([['EXCELENTE'], ['MUITO_BOM'], ['HUMANAS'], ['EXATAS'], ['DIFICULDADE']])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # For every row, take the index of the best-scoring class, shaped as
        # a column vector.
        winners = np.asarray([[np.argmax(row) for row in X]]).transpose()
        # Decode those indices back to the original PERFIL labels (object
        # dtype) and return them as a np.array of predictions.
        return self.enc.inverse_transform(winners)
예제 #18
0
class CatSklearnAttacker(PrivacyAttackerModel):
    """Base class for categorical attacker based on sklearn models.

    Attributes:
        key_type (CategoricalType):
            Required key attribute type (class_num or one_hot) by the learner.
        sensitive_type (CategoricalType):
            Required sensitive attribute type (class_num or one_hot) by the learner.
        skl_learner (Class):
            A (wrapped) sklearn classifier class that can be called with no arguments.
    """
    KEY_TYPE = None
    SENSITIVE_TYPE = None
    SKL_LEARNER = None

    def __init__(self):
        self.predictor = self.SKL_LEARNER()
        # Pick an encoder per attribute group: ordinal codes for CLASS_NUM,
        # one-hot vectors otherwise.
        if self.KEY_TYPE == CategoricalType.CLASS_NUM:
            self.key_processor = OrdinalEncoder()
        else:
            self.key_processor = OneHotEncoder()
        if self.SENSITIVE_TYPE == CategoricalType.CLASS_NUM:
            self.sensitive_processor = OrdinalEncoder()
        else:
            self.sensitive_processor = OneHotEncoder()

    def fit(self, synthetic_data, key, sensitive):
        """Fit the encoders and the predictor on the synthetic table."""
        keys = allow_nan(synthetic_data[key])
        sensitives = allow_nan(synthetic_data[sensitive])
        self.key_processor.fit(keys)
        self.sensitive_processor.fit(sensitives)

        self.predictor.fit(self.key_processor.transform(keys),
                           self.sensitive_processor.transform(sensitives))

    def predict(self, key_data):
        """Predict the sensitive attributes for one key tuple, or None when
        the key contains values never seen in the synthetic table."""
        key_row = allow_nan_array(key_data)  # de-nan key attributes
        try:
            # key attributes in ML ready format
            encoded = self.key_processor.transform([key_row])
        except ValueError:  # some attribute values never appeared during fit
            return None
        pred = self.predictor.predict(encoded)
        if len(np.array(pred).shape) == 1:
            pred = [pred]

        # predicted sensitive attributes decoded to the original format
        return tuple(self.sensitive_processor.inverse_transform(pred)[0])
예제 #19
0
    def _fit_resample(self, X, y):
        """Oversample each class by interpolating between encoded neighbours
        found with the Value Difference Metric."""
        # FIXME: to be removed in 0.12
        if self.n_jobs is not None:
            warnings.warn(
                "The parameter `n_jobs` has been deprecated in 0.10 and will be "
                "removed in 0.12. You can pass an nearest neighbors estimator where "
                "`n_jobs` is already set instead.",
                FutureWarning,
            )

        self._validate_estimator()

        parts_X, parts_y = [X.copy()], [y.copy()]

        encoder = OrdinalEncoder(dtype=np.int32)
        X_encoded = encoder.fit_transform(X)

        n_categories = [len(cats) for cats in encoder.categories_]
        vdm = ValueDifferenceMetric(n_categories=n_categories).fit(X_encoded, y)

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            rows = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X_encoded, rows)

            distances = vdm.pairwise(X_class)
            self.nn_k_.fit(distances)
            # the kneighbors search includes the sample itself, which is
            # what the original algorithm expects
            neighbours = self.nn_k_.kneighbors(distances,
                                               return_distance=False)
            X_new, y_new = self._make_samples(X_class, class_sample, y.dtype,
                                              neighbours, n_samples)

            parts_X.append(encoder.inverse_transform(X_new))
            parts_y.append(y_new)

        return np.vstack(parts_X), np.hstack(parts_y)
예제 #20
0
def best_features(train, test, perc):
    """Drop from train/test the feature columns outside the top `perc`
    percentile of ANOVA F-scores.

    Args:
        train: DataFrame with the feature columns plus a 'target' column.
        test: DataFrame sharing the feature columns.
        perc: percentile of features to keep (SelectPercentile).

    Returns:
        (train, test) with low-scoring feature columns dropped in place.
    """
    temp_trans = OrdinalEncoder(dtype='int')
    cat_cols = ['protocol_type', 'service', 'flag', 'target']
    train[cat_cols] = temp_trans.fit_transform(train[cat_cols])
    trans = SelectPercentile(f_classif, percentile=perc)
    features = train.drop('target', axis='columns')
    trans.fit(features, train['target'])
    train[cat_cols] = temp_trans.inverse_transform(train[cat_cols])
    # BUG FIX: get_support() is indexed over the feature columns only (target
    # was dropped before fitting), so map the mask over `features.columns`,
    # not `train.columns` — the old code mislabelled columns whenever
    # 'target' was not the last column.
    bad_features = [col for col, keep in zip(features.columns,
                                             trans.get_support()) if not keep]
    train.drop(bad_features, axis='columns', inplace=True)
    test.drop(bad_features, axis='columns', inplace=True)
    return train, test
예제 #21
0
    def _fit_resample(self, X, y):
        """Oversample each class using VDM-based nearest neighbours over the
        ordinal-encoded features."""
        self._validate_estimator()

        resampled_X = [X.copy()]
        resampled_y = [y.copy()]

        encoder = OrdinalEncoder(dtype=np.int32)
        codes = encoder.fit_transform(X)

        category_sizes = [len(cat) for cat in encoder.categories_]
        vdm = ValueDifferenceMetric(n_categories=category_sizes).fit(codes, y)

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            class_codes = _safe_indexing(codes, np.flatnonzero(y == class_sample))

            pairwise_dist = vdm.pairwise(class_codes)
            self.nn_k_.fit(pairwise_dist)
            # the kneighbors search includes the sample itself, which is
            # what the original algorithm expects
            neighbour_idx = self.nn_k_.kneighbors(pairwise_dist,
                                                  return_distance=False)
            X_new, y_new = self._make_samples(
                class_codes, class_sample, y.dtype, neighbour_idx, n_samples
            )

            resampled_X.append(encoder.inverse_transform(X_new))
            resampled_y.append(y_new)

        return np.vstack(resampled_X), np.hstack(resampled_y)
예제 #22
0
    def run_unsupervised_simulation(self, N=1000):
        """Run (or return the cached) sensitivity analysis.

        Args:
            N: number of sampler draws.

        Returns:
            (S1T, S2): DataFrame of first/total-order indices, and the
            second-order index matrix.
        """
        self.N = N
        # BUG FIX: missing instance attributes raise AttributeError, not
        # NameError, so the old `try/except NameError` cache check never
        # caught anything and the AttributeError propagated to the caller.
        if hasattr(self, 'S1T') and hasattr(self, 'S2'):
            return self.S1T, self.S2

        # If calc_second_order is True, the resulting matrix has N * (2D + 2) rows.
        encoder = OrdinalEncoder().fit(self.df)
        sample = self.sampler.sample(problem=self.problem, N=N)
        sample = pd.DataFrame(encoder.inverse_transform(np.rint(sample)),
                              columns=self.df.columns)
        # WARNING: hard coded!!
        predictions = self.bn.predict(
            sample, 'response').applymap(lambda x: 1 if x == 'pCR' else 0)

        # BUG FIX: `problem` was an undefined bare name; use self.problem.
        # NOTE(review): `predictions` is computed but unused — the analyzer is
        # handed the input sample; confirm whether it should receive the
        # predicted outputs instead.
        si = self.analyzer.analyze(self.problem, sample)
        self.S1T = pd.DataFrame({
            'S1': si['S1'],
            'ST': si['ST']
        },
                                index=self.df.columns)
        self.S2 = pd.DataFrame(si['S2'],
                               index=self.df.columns,
                               columns=self.df.columns)
        return self.S1T, self.S2
예제 #23
0
class NumericTransformer(object):
    """General purpose numeric conversion for pandas dataframes.

    All categorical data and levels must be passed to .fit().
    If new categorical series or levels are present in .transform() it won't work!

    Currently datetimes cannot be inverse_transformed back to datetime

    Args:
        na_strings (list): list of strings to replace as pd.NA
        categorical_fillna (str): how to fill NaN for categorical variables (numeric NaN are unaltered)
            "ffill" - uses forward and backward filling to supply na values
            "indicator" or anything else currently results in all missing replaced with str "missing_value"
        handle_unknown (str): passed through to scikit-learn OrdinalEncoder
        verbose (int): greater than 0 to print some messages
    """

    def __init__(
        self,
        na_strings: list = ['', ' '],  # 'NULL', 'NA', 'NaN', 'na', 'nan'
        categorical_fillna: str = "ffill",
        handle_unknown: str = 'use_encoded_value',
        verbose: int = 0,
    ):
        self.na_strings = na_strings
        self.verbose = verbose
        self.categorical_fillna = categorical_fillna
        self.handle_unknown = handle_unknown
        # set to True by _fit() when at least one non-numeric column is found
        self.categorical_flag = False
        # set to False by _fit() when the data is already fully numeric,
        # which turns transform() into a pass-through
        self.needs_transformation = True

    def _fit(self, df):
        """Fit categorical to numeric and return the converted frame."""
        # test if any columns aren't numeric
        if not isinstance(df, pd.DataFrame):  # basically just Series inputs
            df = pd.DataFrame(df)

        if df.shape[1] == df.select_dtypes(include=np.number).shape[1]:
            self.needs_transformation = False
            if self.verbose > 2:
                print("All data is numeric, skipping NumericTransformer")

        if self.needs_transformation:
            # replace some common nan datatypes from strings to nan
            # NOTE(review): inplace=True mutates the caller's DataFrame here
            df.replace(self.na_strings, np.nan, inplace=True)  # pd.NA in future

            # convert series to numeric which can be readily converted.
            df = df.apply(pd.to_numeric, errors='ignore')

            # record which columns are which dtypes
            self.column_order = df.columns
            self.numeric_features = df.select_dtypes(
                include=[np.number]
            ).columns.tolist()
            # NOTE(review): the set() difference makes the ordering of
            # categorical_features nondeterministic between runs
            self.categorical_features = list(
                set(df.columns.tolist()) - set(self.numeric_features)
            )

            if len(self.categorical_features) > 0:
                self.categorical_flag = True
            if self.categorical_flag:
                # imported lazily so fully-numeric usage needs no sklearn
                from sklearn.preprocessing import OrdinalEncoder

                df_enc = df[self.categorical_features]
                if self.categorical_fillna == "ffill":
                    df_enc = df_enc.fillna(method='ffill').fillna(method='bfill')
                df_enc = df_enc.fillna('missing_value')
                self.cat_transformer = OrdinalEncoder(
                    handle_unknown=self.handle_unknown, unknown_value=np.nan
                )
                # the + 1 makes it compatible with remove_leading_zeroes
                df_enc = self.cat_transformer.fit_transform(df_enc) + 1

                # per-column bounds of the encoded values, used to clip in
                # inverse_transform()
                self.cat_max = df_enc.max(axis=0)
                self.cat_min = df_enc.min(axis=0)
                if self.verbose > 0:
                    print("Categorical features converted to numeric")
                # reassemble numeric + encoded categorical columns in the
                # original column order
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        return df.astype(float)

    def fit(self, df):
        """Learn behavior of data to change.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        self._fit(df)
        return self

    def fit_transform(self, df):
        """Fits and Returns *Magical* DataFrame.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        return self._fit(df)

    def transform(self, df):
        """Convert categorical dataset to numeric using the fitted encoder."""
        if self.needs_transformation:
            if not isinstance(df, pd.DataFrame):
                df = pd.DataFrame(df)
            # same cleaning pipeline as _fit(), but using transform() on the
            # already-fitted categorical encoder
            df.replace(self.na_strings, np.nan, inplace=True)
            df = df.apply(pd.to_numeric, errors='ignore')
            if self.categorical_flag:
                df_enc = (df[self.categorical_features]).fillna(method='ffill')
                df_enc = df_enc.fillna(method='bfill').fillna('missing_value')
                # + 1 to stay consistent with the offset applied in _fit()
                df_enc = self.cat_transformer.transform(df_enc) + 1
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        try:
            df = df.astype(float)
        except ValueError as e:
            raise ValueError(
                f"NumericTransformer.transform() could not convert data to float. {str(e)}."
            )
        return df

    def inverse_transform(self, df, convert_dtypes: bool = False):
        """Convert numeric back to categorical.
        Args:
            df (pandas.DataFrame): df
            convert_dtypes (bool): whether to use pd.convert_dtypes after inverse
        """
        if self.categorical_flag:
            if not isinstance(df, pd.DataFrame):  # basically just Series inputs
                df = pd.DataFrame(df)
            # clip to the encoded range seen at fit time, then undo the +1
            # offset before decoding
            df_enc = (
                df[self.categorical_features].clip(
                    upper=self.cat_max, lower=self.cat_min, axis=1
                )
                - 1
            )
            df_enc = self.cat_transformer.inverse_transform(df_enc)
            df = pd.concat(
                [
                    pd.DataFrame(
                        df[self.numeric_features], columns=self.numeric_features
                    ),
                    pd.DataFrame(
                        df_enc, columns=self.categorical_features, index=df.index
                    ),
                ],
                axis=1,
            )[self.column_order]
        if convert_dtypes:
            df = df.convert_dtypes()
        return df
예제 #24
0
data_.iloc[:,1:-1] = enc.fit_transform(data_.iloc[:,1:-1]) # fit and transform in a single step

# In[]:
from sklearn.preprocessing import OneHotEncoder

X = data_.iloc[:,1:-1]

enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()

# fit_transform also gets there in one step; the three-step version above is
# kept to demonstrate the fitted model's attributes
OneHotEncoder(categories='auto').fit_transform(X).toarray()

# the encoding can also be reversed
pd.DataFrame(enc.inverse_transform(result))

print(enc.get_feature_names()) # names of the dummy-variable columns produced by the encoding

# axis=1 concatenates the two tables side by side; axis=0 would stack them vertically
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)
newdata.drop(["Sex","Embarked"],axis=1,inplace=True)
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]



# In[]:
# 5. Continuous-variable conversion:
# binarize the age column
data_2 = data.copy()

예제 #25
0
class GeneralizeCategorical(GeneralizeContinuous):
    """Generalize (bin) categorical features under differential privacy.

    Categories are ordinal-encoded to integers, binned by the parent
    ``GeneralizeContinuous`` logic, and a DP estimate of each column's
    marginal distribution is stored so that ``inverse_transform`` can
    sample plausible concrete categories for each bin.
    """
    def __init__(self,
                 epsilon=1.0,
                 n_bins=5,
                 strategy='uniform',
                 max_cardinality=10):
        super().__init__(n_bins=n_bins, strategy=strategy)
        self.epsilon = epsilon  # total privacy budget for fit()
        self.max_cardinality = max_cardinality

    def fit(self, X, y=None):
        """ Steps:
        1. Transform categorical to continuous
        2. Store DP marginal counts for optional inverse transform
        3. Run super().fit() to get groups
        """
        self._ordinalencoder = OrdinalEncoder().fit(X)
        #todo: turn into numpy -> df needed for marginal distribution
        X_enc = self._ordinalencoder.transform(X)
        X_enc = pd.DataFrame(X_enc, columns=X.columns)

        # Split the total budget evenly across columns (sequential
        # composition) so the whole fit satisfies self.epsilon.
        # todo turn into list of arrays
        local_epsilon = self.epsilon / X.shape[1]
        self.marginals_ = []
        for jj, c in enumerate(X.columns):
            self.marginals_.append(
                dp_marginal_distribution(X_enc.loc[:, c],
                                         local_epsilon).values)

        return super().fit(X_enc, y)

    def transform(self, X):
        """Equivalent to continuous transform but we still need to encode the data beforehand"""
        X_enc = self._ordinalencoder.transform(X)
        return super().transform(X_enc)

    def inverse_transform(self, Xt):
        """Map binned values back to concrete categories.

        For every cell, the candidate categories falling in the cell's bin
        are looked up and one is sampled with probability proportional to
        the stored DP marginal. Returns a DataFrame with the fitted columns.
        """
        # NOTE(review): self._header is assumed to be set by the parent's
        # fit() -- confirm against GeneralizeContinuous.
        assert set(Xt.columns) == set(
            self._header), "input contains different columns than seen in fit"

        X_enc = check_array(Xt,
                            copy=True,
                            dtype=FLOAT_DTYPES,
                            force_all_finite='allow-nan')
        n_records, n_features = X_enc.shape
        # Bug fix: the original compared X_enc.shape[1] against a value
        # unpacked from X_enc.shape itself, so the check could never fail.
        # Validate against the number of columns seen during fit instead.
        if n_features != len(self.marginals_):
            raise ValueError("Incorrect number of features. Expecting {}, "
                             "received {}.".format(len(self.marginals_),
                                                   n_features))

        self._marginal_group_alloc = []

        for jj, c in enumerate(Xt.columns):
            bin_edges = self.bin_edges_[jj]
            marginals = self.marginals_[jj]
            marginals_idx = np.arange(len(marginals))

            # Values close to a bin edge are numerically unstable; nudge by
            # eps so they land in the intended bin (see numpy.isclose for
            # the meaning of rtol/atol).
            rtol = 1.e-5
            atol = 1.e-8
            eps = atol + rtol * np.abs(marginals)
            marginal_group_alloc = np.digitize(marginals_idx + eps,
                                               bin_edges[1:])
            np.clip(marginal_group_alloc,
                    0,
                    self.n_bins_[jj] - 1,
                    out=marginal_group_alloc)

            self._marginal_group_alloc.append(marginal_group_alloc)

            for i in range(n_records):
                # np.where returns a 1d tuple, hence index 0.
                marginal_candidate_idx = np.where(
                    X_enc[i, jj] == marginal_group_alloc)[0]
                marginal_candidate_probs = marginals[marginal_candidate_idx]
                marginal_candidate_probs_normalized = dp_normalize(
                    marginal_candidate_probs)

                # Sample an encoded (numerical) value according to the
                # normalized DP marginal probabilities within this bin.
                X_enc[i, jj] = np.random.choice(
                    marginal_candidate_idx,
                    p=marginal_candidate_probs_normalized)

        # Decode the sampled ordinal values back to the original categories.
        X_inv = self._ordinalencoder.inverse_transform(X_enc)
        return pd.DataFrame(X_inv, columns=self._header)
예제 #26
0
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """
    First the categories are encoded as ordinal, then the resulting integers are converted into the binary code,
    then the digits from the binary string are split into separate columns.

    Decoding is done with O(n) complexity by selecting the value of parameter, binary representation of which is the
    closest to one of the existing categories in terms of Euclidean distance.

    Choices object should be hashable.
    """
    def __init__(self, categories: Union[str, List[object]] = 'auto'):
        # Bug fix: `pd.np` was removed in pandas >= 2.0; import numpy
        # locally instead of going through the pandas alias.
        import numpy as np

        self.__categories = categories
        self.__transformer = OrdinalEncoder(categories=self.__categories,
                                            dtype=np.int64)
        self.__encode_mapping = {}  # column idx -> {ordinal code -> bit tuple}
        self.__decode_mapping = {}  # column idx -> {bit tuple -> ordinal code}
        self.__n_bits = {}          # column idx -> number of binary digits
        self._enc_suffix = f"_{self.__class__.__name__}"

    def fit(self, df: pd.DataFrame, y=None):
        """Fit the underlying ordinal encoder and precompute bit mappings."""
        self.__transformer.fit(X=df, y=y)
        # __n_bits reflects how many bits are needed to encode categories of
        # the corresponding (by index) column.
        # NOTE(review): this uses len(categories) rather than
        # len(categories) - 1, so a power-of-two category count gets one
        # extra always-zero leading bit. Harmless but wasteful; preserved so
        # the transformed output width stays backward compatible.
        self.__n_bits = {
            col_idx: len(format(len(col_cats), 'b'))
            for col_idx, col_cats in enumerate(self.__transformer.categories_)
        }

        # Precompute binary encodings in both directions.
        for idx, column_categories in enumerate(
                self.__transformer.categories_):
            self.__encode_mapping[idx] = dict()
            self.__decode_mapping[idx] = dict()
            for cat_idx, category in enumerate(column_categories):
                encoding = tuple(
                    float(x)
                    for x in format(cat_idx, f'0{self.__n_bits[idx]}b'))
                self.__encode_mapping[idx][cat_idx] = encoding
                self.__decode_mapping[idx][encoding] = cat_idx

        return self

    def transform(self, df: pd.DataFrame) -> 'np.ndarray':
        """Encode *df* into a binary (0/1) integer matrix."""
        import numpy as np  # pd.np was removed in pandas >= 2.0

        if len(df.keys()) != len(self.__n_bits):
            # Bug fix: the original interpolated the whole __n_bits dict
            # into the message instead of the number of fitted columns.
            raise TypeError(
                f"Transformer was fit to data with {len(self.__n_bits)} columns, "
                f"but given data with {len(df.keys())} columns.")
        # Convert to OrdinalEncoding
        pre_transformed = self.__transformer.transform(
            X=df)  # In OrdinalEncoding
        # Convert to BinaryEncoding
        n_out_columns = sum(self.__n_bits.values())
        n_out_rows = len(pre_transformed)
        transformed = np.empty(shape=(n_out_rows, n_out_columns),
                               dtype=np.int64)
        for row_idx, p_row in enumerate(pre_transformed):
            row = []
            for idx, cat_idx in enumerate(p_row):
                row.extend(self.__encode_mapping[idx][cat_idx])
            transformed[row_idx] = row
        return transformed

    def inverse_transform(self, df: pd.DataFrame) -> 'np.ndarray':
        """Decode a binary matrix back to the original categories.

        Each group of bit columns is snapped to the closest valid bit
        pattern (Euclidean distance), mapped to its ordinal code, then
        decoded by the ordinal encoder.
        """
        import numpy as np  # pd.np was removed in pandas >= 2.0

        # convert back from Binary to OrdinalEncoding
        ordinal_encoded = pd.DataFrame(columns=self.__encode_mapping.keys())
        # decode per original column
        left_pointer = 0  # first bit column of the current original column
        for column in self.__encode_mapping.keys():
            columns_idxs = slice(left_pointer,
                                 left_pointer + self.__n_bits[column])
            left_pointer += self.__n_bits[column]
            bin_columns_raw = df.iloc[:, columns_idxs].to_numpy()
            bin_columns_real = np.apply_along_axis(
                self._closest_euclidean,
                axis=1,
                arr=bin_columns_raw,
                vectors=self.__decode_mapping[column].keys())

            ord_column = np.apply_along_axis(
                lambda enc: self.__decode_mapping[column][tuple(enc)],
                axis=1,
                arr=bin_columns_real)

            ordinal_encoded[column] = ord_column
        # convert back from OrdinalEncoding to original one
        decoded = self.__transformer.inverse_transform(ordinal_encoded)
        return decoded

    @staticmethod
    def _closest_euclidean(vector: 'np.ndarray',
                           vectors: 'List[np.ndarray]') -> 'np.ndarray':
        """
        finds closest vector from a list of provided vectors by minimizing Euclidean distance
        :param vector:
        :param vectors:
        :return:
        """
        min_found_distance = float('inf')
        closest_vector = None
        for existing_vector in vectors:
            dist = sum(((x - y)**2 for x, y in zip(vector, existing_vector)))
            if dist == 0:
                # Exact match -- cannot do better, stop early.
                closest_vector = existing_vector
                break
            elif dist < min_found_distance:
                closest_vector = existing_vector
                min_found_distance = dist
        return closest_vector
예제 #27
0
class NameClassifier(object):
    '''
    ML algorithm to classify names' nationality
    this class is NameClassifier model class

    Attributes:
        Vectorizer: to vectorize the data for prediction, CountVectorizer
        model: classifier for decision making, based on Naive Bayes

    Methods:
        load_data
        train
        evaluate
        predict
        get_word_dict
        get_label_str
        plot_confusion
        save_model
        load_model
    '''
    def __init__(self):
        # declare the model variables, classifier (clf) and vectorizer, if training new one
        self.model = MultinomialNB()

    ### Some utility functions for data preprocess etc
    # load data from csv on pandas, not tied to class
    def load_data(self, file_names, test_size=0.3):
        '''Load the data, encode the labels, and split into train and test set.

        Params:
            file_names(string or list): file path(s) to the csv file(s)
            test_size(float): ratio of testing set, between 0 & 1

        Return: x_train, x_test(as pandas series of names), y_train, y_test(as numpy arr of labels)
            These elements will be returned on the order above.
            Pandas Series: name data, X_train and X_test
            ndarray: encoded labels, y_train and y_test
        '''

        # if 2 filenames (japanese and foreign) are entered, convert labels into fr(non-japanese) for binary classification
        if isinstance(file_names, list):
            df = pd.concat([pd.read_csv(f) for f in file_names])
            df.loc[df.code != 'jp_JP', 'code'] = 'fr'
        else:
            df = pd.read_csv(file_names)

        # OrdinalEncoder expects a 2D array, hence the reshape.
        labels = df['code'].values.reshape(-1, 1)
        self.label_encoder = OrdinalEncoder().fit(labels)
        labels = self.label_encoder.transform(labels)

        return train_test_split(df['name'],
                                labels.ravel(),
                                test_size=test_size,
                                shuffle=True)

    def train(self, X_train, y_train):
        '''given training data, this method will fit the vectorizer(bag of words) and train the naive bayes model.

        Param:
            X_train(Pandas Series): training name dataset
            y_train(ndarray): training labels dataset
        '''

        # fit the vectorizer
        print('Fitting the vectorizer and training the model...')
        self.vec = CountVectorizer().fit(X_train)
        self.word_vec = self.vec.transform(X_train)
        # train the ML model
        self.model.fit(self.word_vec, y_train)
        print('training completed!')

    def predict(self, names, label_str=False):
        '''Predict name's origin based on the test data. Returns encoded label by default,
        but returns label strings when label_str=True

        Param:
            names(ndarray/Pandas Series/list): containing names
            label_str(bool): default False, to return label integers, set it to True to return label strings

        Return:
            array: containing label integers or strings.
        '''

        name_vector = self.vec.transform(names)
        pred = self.model.predict(name_vector)
        if not label_str:
            return pred
        else:
            # inverse_transform needs a 2D array; ravel back to 1D after.
            return self.label_encoder.inverse_transform(pred.reshape(
                -1, 1)).ravel()

    def evaluate(self, names, labels):
        '''make prediction, and evaluate the model's
        - accuracy
        - precision: each element in returned vector represents precision for each class.
        - recall: same as above, except recall for each class.
        for each class and overall.

        You can take average to get model wise precision and recall.

        Params:
            names(list/Pandas Series/ndarray): names data
            labels(ndarray): ground truth
        '''
        prediction = self.predict(names)
        cm = confusion_matrix(labels, prediction)
        # recall: true positives over row sums (actual class counts)
        recall = np.diag(cm) / np.sum(cm, axis=1)
        # precision: true positives over column sums (predicted class counts)
        precision = np.diag(cm) / np.sum(cm, axis=0)

        acc = (prediction == labels).mean()

        return {'accuracy': acc, 'precision': precision, 'recall': recall}

    def get_word_dict(self, corpus=None):
        '''This method returns word frequency dictionary, from the training data
        of the model or given corpus if any.

        Params:
            corpus(list/Series): python list or pandas series of names.This is default to
            None, in which case frequency dictionary is created on the data the model was trained on.

        Returns:
            dictionary: python dictionary with names as keys, and their frequencies as values.
        '''
        freq_dic = {}
        if corpus is None:
            vector = self.vec
            bag_words = self.word_vec
        else:
            vector = CountVectorizer().fit(corpus)
            bag_words = vector.transform(corpus)

        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # in favor of get_feature_names_out() -- confirm installed version.
        feature = vector.get_feature_names()
        sum_words = bag_words.sum(axis=0).tolist()[0]  # list within list

        for i, word in enumerate(feature):
            freq_dic[word] = sum_words[i]

        return freq_dic

    def get_label_str(self, labels):
        '''accepts numerically encoded labels and returns corresponding label strings
            param:
                labels(ndarray): ndarray containing numerical labels
            returns:
                ndarray: containing label strings
        '''
        return self.label_encoder.inverse_transform(labels.reshape(-1,
                                                                   1)).ravel()

    def plot_confusion(self, yt, prediction_test):
        '''Plot confusion matrix, based on given labels and prediction

        Param:
            yt(ndarray): array of ground truth labels
            prediction_test(ndarray): predicted labels
        '''

        self.cm = confusion_matrix(yt, prediction_test)
        fig = plt.figure(figsize=(10, 8))
        plt.imshow(self.cm, interpolation='nearest')
        plt.colorbar()
        axis_font = {'size': 13, 'color': 'black'}
        # categories_ is a list of per-feature arrays; labels were a single
        # column, so index 0 holds the class names.
        self.cat = self.label_encoder.categories_[0]
        num_class = len(self.cat)
        classNames = [self.cat[i] for i in range(num_class)]
        plt.title("Confusion Matrix by class", fontdict=axis_font)
        plt.ylabel("True Label", fontdict=axis_font)
        plt.xlabel("Predicted Label", fontdict=axis_font)
        tick_marks = np.arange(len(classNames))
        plt.xticks(tick_marks, classNames, rotation=45)
        plt.yticks(tick_marks, classNames)
        fdic = {'size': 10, 'color': 'white', 'weight': 'heavy'}
        for i in range(num_class):
            for j in range(num_class):
                plt.text(j,
                         i,
                         str(self.cm[i, j]),
                         fontdict=fdic,
                         horizontalalignment='center',
                         verticalalignment='center')
        plt.show()

    @classmethod
    def load_model(cls, file_name):
        '''Load saved model obj for use.

        Param:
            file_name(string): path to the model file(pickle).

        Return:
            NameClassifier: the loaded class obj for use.
        '''
        # https://stackoverflow.com/questions/2709800/how-to-pickle-yourself
        # Bug fix: the original left the file handle open; the context
        # manager closes it even if unpickling raises.
        # SECURITY NOTE: pickle.load executes arbitrary code -- only load
        # model files from trusted sources.
        print('loading the model')
        with open(file_name, 'rb') as f:
            return pickle.load(f)

    def save_model(self, file_name):
        '''Save a trained model obj for future use.

        Param:
            file_name(string): path to the model file(pickle).
        '''
        # Pickle this instance itself; the context manager fixes the
        # leaked file handle of the original implementation.
        with open(file_name, 'wb') as f:
            pickle.dump(self, f)
예제 #28
0
class NumericTransformer(object):
    """Convert a mixed-type DataFrame to all-numeric values and back.

    Columns that can be parsed as numbers are coerced with ``pd.to_numeric``;
    the remaining (categorical) columns are ordinal-encoded with a +1 offset.
    """
    def __init__(
            self,
            na_strings: list = ['', ' ', 'NULL', 'NA', 'NaN', 'na', 'nan'],
            categorical_impute_strategy: str = 'constant',
            verbose: int = 0):
        # na_strings is treated as read-only, so the shared mutable default
        # is safe here.
        self.na_strings = na_strings
        self.categorical_impute_strategy = categorical_impute_strategy
        self.verbose = verbose
        self.categorical_flag = False  # set True in fit() if any non-numeric column exists

    @staticmethod
    def _to_numeric_safe(df):
        """Coerce each column to numeric, leaving unparseable columns as-is.

        Drop-in replacement for the deprecated
        ``df.apply(pd.to_numeric, errors='ignore')``.
        """
        def _coerce(col):
            try:
                return pd.to_numeric(col)
            except (ValueError, TypeError):
                return col

        return df.apply(_coerce)

    def fit(self, df):
        """Learn column dtypes and fit the categorical encoder."""
        # Bug fix: operate on a copy -- the original mutated the caller's
        # DataFrame via ``replace(..., inplace=True)``.
        df = df.replace(self.na_strings, np.nan)

        # convert series to numeric which can be readily converted.
        df = self._to_numeric_safe(df)

        # record which columns are which dtypes
        self.column_order = df.columns
        self.numeric_features = (df.select_dtypes(
            include=[np.number]).columns.tolist())
        self.categorical_features = list(
            set(df.columns.tolist()) - set(self.numeric_features))

        if len(self.categorical_features) > 0:
            self.categorical_flag = True
        if self.categorical_flag:
            from sklearn.preprocessing import OrdinalEncoder
            # Impute NaNs (ffill, then bfill, then a sentinel) so the
            # encoder never sees missing values; .ffill()/.bfill() replace
            # the deprecated fillna(method=...) form.
            df_enc = df[self.categorical_features].ffill()
            df_enc = df_enc.bfill().fillna('missing_value')
            self.cat_transformer = OrdinalEncoder()
            self.cat_transformer.fit(df_enc)

            # the + 1 makes it compatible with remove_leading_zeroes
            df_enc = self.cat_transformer.transform(df_enc) + 1
            self.cat_max = df_enc.max(axis=0)
            self.cat_min = df_enc.min(axis=0)
            if self.verbose >= 0:
                print("Categorical features converted to numeric")
        return self

    def transform(self, df):
        """Convert categorical dataset to numeric (float DataFrame)."""
        # Copy-safe replacements -- no caller mutation (bug fix).
        df = df.replace(self.na_strings, np.nan)
        df = self._to_numeric_safe(df)
        if self.categorical_flag:
            df_enc = df[self.categorical_features].ffill()
            df_enc = df_enc.bfill().fillna('missing_value')
            df_enc = self.cat_transformer.transform(df_enc) + 1
            df = pd.concat([
                pd.DataFrame(df[self.numeric_features],
                             columns=self.numeric_features),
                pd.DataFrame(
                    df_enc, columns=self.categorical_features, index=df.index)
            ],
                           axis=1)[self.column_order]
        return df.astype(float)

    def inverse_transform(self, df):
        """Convert numeric values back to the original categories."""
        if self.categorical_flag:
            # Clip to the fitted range, then undo the +1 offset.
            df_enc = df[self.categorical_features].clip(
                upper=self.cat_max, lower=self.cat_min, axis=1) - 1
            df_enc = self.cat_transformer.inverse_transform(df_enc)
            df = pd.concat([
                pd.DataFrame(df[self.numeric_features],
                             columns=self.numeric_features),
                pd.DataFrame(
                    df_enc, columns=self.categorical_features, index=df.index)
            ],
                           axis=1)[self.column_order]
        return df
예제 #29
0
        self.assertListEqual(enc_df.categories_[1].tolist(),
                             ["Unknown", "Big", "Small", "Other"])

        self.assertTrue((X_df_tran == np.array([[2, 1], [1, 1], [1, 2],
                                                [0, 3]])).all())
        self.assertTrue((X_df_invtran == X).all())


if __name__ == "__main__":
    unittest.main()

# Demo: round-trip a small mixed string/number array through OrdinalEncoder.
labelenc = OrdinalEncoder()
X = np.array([['Male', 1], ['Female', 1], ['Female', 2]])
labelenc.fit(X)
X_tran = labelenc.transform(X)
X_invtran = labelenc.inverse_transform(X_tran)
for shown in (labelenc.categories_, X_tran, X_invtran):
    print(shown)

print("========")

# Demo: the same round trip with the project's FrequencyEncoder.
enc = FrequencyEncoder()
X = np.array([['Male', 1], ['Female', 2], ['Female', 2]])
enc.fit(X)
X_tran = enc.transform(X)
X_invtran = enc.inverse_transform(X_tran)
for shown in (enc.categories_, X_tran, X_invtran):
    print(shown)
예제 #30
0
# > One possible option for dealing with NaNs is to recognize them as such and assign them their own category

# Here we see how the NaN becomes the text "nan" and therefore is a "new" color
df['eye_color'].astype(str).unique()

# Convert nulls to the string 'nan', i.e. one more possible value, so the encoder does not blow up
df[['eye_color_encoded',
    'gender_encoded']] = oe.fit_transform(df[columns_to_encode].astype(str))

df[['eye_color', 'eye_color_encoded', 'gender', 'gender_encoded']]

# > A VERY interesting feature of many of the sklearn classes that help with transformations
# is that they also provide the INVERSE transformation!

oe.inverse_transform(df[['eye_color_encoded', 'gender_encoded']])

# **Million-dollar question**:
# - Is everything OK with this transformation??
# - Can I use the columns 'eye_color_encoded' and 'gender_encoded' ??

# #### Label Encoder
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

# Exactly the same idea, but expecting a single variable, since it is used to encode the target variable of a predictive model

le = LabelEncoder()
# Convert nulls to the string 'nan', i.e. one more possible value
df['alignment_encoded'] = le.fit_transform(df['alignment'].astype(str))

df[['alignment', 'alignment_encoded']]
예제 #31
0
# # Method 2: map the labels with sklearn's LabelEncoder() class
# class_le = LabelEncoder()
# class_le.fit(df["classlabel"])
# print(class_le.classes_)
# df["classlabel"]= class_le.fit_transform(df["classlabel"])
# print(df)
# print(class_le.inverse_transform(df["classlabel"]))


# Method 3: encode with sklearn's OrdinalEncoder
size_oe = OrdinalEncoder()
size_oe.fit(df['size'].values.reshape(-1,1))
size_new = size_oe.fit_transform(df['size'].values.reshape(-1,1))
print(size_oe.categories_)
print(size_new)
print(size_oe.inverse_transform(size_new))


# # Method 4: one-hot encoding
# # Why use one-hot encoding?
# # For the color feature, converting to numbers with the two methods above would introduce a
# # spurious ordering between categories caused by the differing numeric magnitudes
# # 1) use pandas get_dummies() (dummy variables)
# pf = pd.get_dummies(df["color"])
# df = pd.concat([df,pf], axis=1)
# df.drop(["color"], axis = 1,inplace = True)
# print(df)

# 2) use sklearn's OneHotEncoder
# NOTE(review): the `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4 -- confirm the installed version.
color_ohe = OneHotEncoder(sparse=False) # sparse=False returns a numpy.ndarray; True returns a scipy.sparse.csr.csr_matrix
color_ohe.fit(df["color"].values.reshape(-1,1))
color_New = color_ohe.fit_transform(df["color"].values.reshape(-1,1))