예제 #1
0
def test_ordinal_encoder_raise_missing(X):
    """Check that fit, fit_transform and transform all reject ``X``.

    Each call is expected to raise ``ValueError`` with a message matching
    "Input contains NaN".
    """
    nan_message = "Input contains NaN"
    encoder = OrdinalEncoder()

    # Both fitting entry points must refuse the data.
    for fit_call in (encoder.fit, encoder.fit_transform):
        with pytest.raises(ValueError, match=nan_message):
            fit_call(X)

    # Fit on the first row only so that transform can be exercised afterwards.
    encoder.fit(X[:1, :])

    with pytest.raises(ValueError, match=nan_message):
        encoder.transform(X)
class OrdinalEncodeCategoricalVariables(BaseEstimator, TransformerMixin):
    """Ordinal-encode a chosen set of DataFrame columns.

    Args:
        variables: Column name, or list of column names, to encode. A single
            (non-list) value is wrapped in a one-element list.

    Attributes:
        variables: The list of column names that will be encoded.
        enc: The ``OrdinalEncoder`` fitted in :meth:`fit`.
    """

    def __init__(self, variables=None):

        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Fit an ``OrdinalEncoder`` on the configured columns of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        self.enc = OrdinalEncoder()
        self.enc.fit(X[self.variables])

        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns are ordinal codes."""
        # Work on a copy so the caller's DataFrame is never modified in place.
        X = X.copy()
        X[self.variables] = self.enc.transform(X[self.variables])

        return X
예제 #3
0
def load_data(path,path_on):
    """Load the offline and online TSV files and ordinal-encode the offline features.

    Args:
        path: Path of the tab-separated offline (labelled) data file.
        path_on: Path of the tab-separated online data file.

    Returns:
        A tuple ``(x, y, oe, cols)``: the encoded offline feature matrix, the
        ``label`` column, the fitted ``OrdinalEncoder`` and the feature column
        index.
    """
    # After analysing the model's feature_importance, some features with very
    # low importance were removed.
    low_importance = ['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5']

    df = pd.read_csv(path, sep="\t")
    df = precession(df)
    # Keyword form: the positional axis argument is rejected by pandas 2.x.
    df.drop(columns=low_importance, inplace=True)
    # TODO (to optimise): missing values are filled with "empty", the filler
    # string the raw data already uses for missing values; this keeps things
    # consistent and turns "missing" into a category of its own.
    df = df.fillna("empty")

    # Offline training data
    x = df.drop(['label','sid'],axis=1)
    y = df['label']
    cols = x.columns

    # Online data
    df_on = pd.read_csv(path_on, sep="\t")
    df_on = precession(df_on)
    df_on.drop(columns=low_importance, inplace=True)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # Merge offline and online data so that no value gets a code from one set
    # while being unknown in the other.
    x_all = pd.concat([x, x_on], axis=0)

    # Encode every string value as a number
    oe = OrdinalEncoder()
    oe.fit(x_all)  # passed in directly; object columns are handled by the encoder
    x = oe.transform(x)
    print(x.shape)
    return x,y,oe,cols
예제 #4
0
def get_sklearn_accuracy(mode):
    """
    Compute the accuracy of a sklearn decision tree classifier.

    Input:
    mode: "train" to score on the training set; any other truthy value
          scores on the test set
    Output:
    accuracy score for the sklearn model
    """
    if not mode:
        raise ValueError("Specify mode")

    # Load both CSV files
    train_frame = pd.read_csv("data/small_train.csv")
    test_frame = pd.read_csv("data/small_test.csv")

    # Ordinal-encode every column, fitting on the training data only
    encoder = OrdinalEncoder()
    train_matrix = encoder.fit_transform(train_frame)
    test_matrix = encoder.transform(test_frame)

    # The last column is the target, everything before it is a feature
    x_train, y_train = train_matrix[:, :-1], train_matrix[:, -1]
    x_test, y_test = test_matrix[:, :-1], test_matrix[:, -1]

    # Train the classifier on the training split
    classifier = DecisionTreeClassifier().fit(x_train, y_train)

    # Pick the split to score on
    if mode == "train":
        features, targets = x_train, y_train
    else:
        features, targets = x_test, y_test
    return accuracy_score(targets, classifier.predict(features))
예제 #5
0
파일: models.py 프로젝트: pajoshi/pycbr
class MatrixOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is defined by a matrix

    Args:
        values: Sequence of comparable values; the position of a value is its
            encoded index and its row/column in ``matrix``.
        matrix: Nested sequence indexed as ``matrix[x][y]`` giving the
            similarity of two encoded values.
        undefined: Values which are recognized but not comparable; they are
            encoded after the regular values and yield a nan similarity.
    """

    def __init__(self, values, matrix, undefined=("n.a.",)):
        super().__init__()
        self.values = values
        self.matrix = matrix
        self.undefined = undefined

        self.n = len(values)
        self.encoder = None

    def get_description(self):
        """Return a dict describing the class and its constructor arguments."""
        return {"__class__": self.__class__.__module__ + "." + self.__class__.__name__,
                "values": self.values, "matrix": self.matrix,
                "undefined": self.undefined}

    def fit(self, X, y=None):
        """Build the encoder from the configured values; ``X`` and ``y`` are unused."""
        # list(...) on both parts so tuple-valued ``values`` work as well.
        known = list(self.values) + list(self.undefined)
        # ``categories`` must be given by keyword: OrdinalEncoder's constructor
        # is keyword-only in scikit-learn >= 1.0.
        self.encoder = OrdinalEncoder(categories=[known], dtype=int)
        self.encoder.fit([[x] for x in known])  # Argument irrelevant
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the fitted encoder."""
        return self.encoder.transform(X)

    def similarity(self, x, y):
        """Return ``matrix[x][y]``, or nan if either code is an undefined value."""
        # Codes >= n belong to the ``undefined`` values appended after ``values``.
        if x >= self.n or y >= self.n:
            return np.nan
        return self.matrix[x][y]
예제 #6
0
파일: models.py 프로젝트: pajoshi/pycbr
class LinearOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is linear with respect to a scale"""

    def __init__(self, order, undefined=("n.a.",)):
        """

        Args:
            order (list): List of values, defining their ordering.
            undefined (iterable): Values which are recognized, but not comparable to the ranking. When such a value is
                                  found, the similarity returned is nan.
        """
        super().__init__()
        self.order = order
        self.undefined = undefined

        self.n = len(order)
        self.encoder = None

    def get_description(self):
        """Return a dict describing the class and its constructor arguments."""
        return {"__class__": self.__class__.__module__ + "." + self.__class__.__name__,
                "order": self.order, "undefined": self.undefined}

    def fit(self, X, y=None):
        """Build the encoder from the configured order; ``X`` and ``y`` are unused."""
        # list(...) on both parts so tuple-valued ``order`` works as well.
        known = list(self.order) + list(self.undefined)
        # ``categories`` must be given by keyword: OrdinalEncoder's constructor
        # is keyword-only in scikit-learn >= 1.0.
        self.encoder = OrdinalEncoder(categories=[known])
        self.encoder.fit([[x] for x in known])  # Argument irrelevant
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the fitted encoder."""
        return self.encoder.transform(X)

    def similarity(self, x, y):
        """Return ``1 - |x - y| / (n - 1)``, or nan if either code is undefined."""
        # Codes >= n belong to the ``undefined`` values appended after ``order``.
        if x >= self.n or y >= self.n:
            return np.nan
        # A one-value scale has no spread; both codes are that single value,
        # and dividing by (n - 1) would be a division by zero.
        if self.n == 1:
            return 1.0
        return 1 - abs(x - y) / (self.n - 1)
def encode_categories(final_db):
    '''Take needed features from the dataset and encode string into categorical numbers
    Inputs:
        - final_db (Pandas dataframe): cleaned dataframe 
    Outputs:
        - X (Pandas dataframe): feature matrix dimension NxD, where N is the datapoints number and D the number of features
        - y (numpy array): values of the ``score`` column of ``final_db``
        - enc (sklearn OrdinalEncoder): ordinal encoder used (to be used in decoding after)
    '''
    # Columns that hold strings and need ordinal encoding.
    categorical = [
        'exposure_type', 'conc1_type', 'species', 'class', 'tax_order',
        'family', 'genus'
    ]
    # Every column kept in the feature matrix, in output order.
    features = [
        'exposure_type', 'obs_duration_mean', 'conc1_type', 'species', 'class',
        'tax_order', 'family', 'genus', 'atom_number', 'alone_atom_number',
        'tripleBond', 'doubleBond', 'bonds_number', 'ring_number', 'Mol',
        'MorganDensity', 'LogP'
    ]

    # Loading data: select first, then copy, so X is an independent frame and
    # the assignment below cannot trigger SettingWithCopyWarning.
    X = final_db[features].copy()
    y = final_db.score.copy().values

    # Encoding phase
    enc = OrdinalEncoder(dtype=int)
    enc.fit(X[categorical])
    X[categorical] = enc.transform(X[categorical])

    return X, y, enc
예제 #8
0
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Time fit/transform/inverse_transform of the custom encoder against OrdinalEncoder.

    Args:
        X: 2-D array of categorical values; a small built-in sample is used
            when omitted.
        iterations: Number of timing rounds per encoder.
        verbose: When truthy, print the mean timings.

    Returns:
        ``(custom_encoder_time, ordinal_encoder_time)``, the mean seconds per
        round for each encoder.
    """
    # ``X is None`` rather than ``not X``: the truth value of a NumPy array
    # with more than one element is ambiguous and raises ValueError.
    if X is None:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])

    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()

    ordinal_encoder_time = []
    custom_encoder_time = []
    for _ in range(iterations):
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
 def labelEncoding(self, data_column):
     """Log the start of encoding, then ordinal-encode ``data_column`` and return it."""
     timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
     logger.info('[{}] : [INFO] Label encoding ...'.format(timestamp))
     # A fresh encoder is fitted on, and applied to, the same data.
     encoder = OrdinalEncoder()
     encoder.fit(data_column)
     return encoder.transform(data_column)
def load_data_rasar(DATA_PATH, encoding, seed=42):
    """Load the dataset, scale/encode its columns and split it into train and test.

    Args:
        DATA_PATH: Path of the CSV file to read.
        encoding: ``'binary'`` or ``'multiclass'`` selects how ``conc1_mean``
            is turned into a target; any other value leaves it unchanged.
        seed: Random state passed to ``train_test_split``.

    Returns:
        ``(X_try, X_train, X_test, y_train, y_test, len_X_train)`` where
        ``X_try`` is the train rows followed by the test rows and
        ``len_X_train`` is the number of train rows.
    """

    db = pd.read_csv(DATA_PATH).drop(
        columns=['Unnamed: 0', 'test_cas', 'smiles'])

    numerical = [
        'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP',
        'alone_atom_number', 'doubleBond', 'tripleBond', 'ring_number',
        'oh_count', 'MeltingPoint', 'WaterSolubility'
    ]

    # Categorical columns + obs_duration_mean (already numeric)
    categorical = [
        'conc1_type', 'exposure_type', 'control_type', 'media_type',
        'application_freq_unit', 'species', 'class', 'tax_order', 'family',
        'genus'
    ]

    # MinMax transform for numerical variables. MinMaxScaler scales each
    # column independently, so one call covers all of them.
    db[numerical] = MinMaxScaler().fit_transform(db[numerical])

    # Ordinal Encoding for categorical variables (codes shifted to start at 1)
    encoder = OrdinalEncoder(dtype=int)
    encoder.fit(db[categorical])
    db[categorical] = encoder.transform(db[categorical]) + 1

    # Expand the pubchem fingerprint: inserting a space at every position and
    # splitting on spaces yields one token per character, stored in columns
    # pub1..pub881.
    pubchem_tokens = pd.DataFrame(db['pubchem2d'].values).apply(
        lambda x: x.str.replace('', ' ').str.strip().str.split(' '),
        axis=1)[0].to_list()
    pubchem = pd.DataFrame(pubchem_tokens,
                           columns=['pub' + str(i) for i in range(1, 882)])
    db = pd.concat([db, pubchem], axis=1)

    db.drop(columns=['fish'], inplace=True)

    # Encoding for target variable: binary and multiclass
    if encoding == 'binary':
        db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 0, 1)

    elif encoding == 'multiclass':
        t = db['conc1_mean'].copy()
        db['conc1_mean'] = multiclass_encoding(t)

    X = db.drop(columns='conc1_mean')
    y = db['conc1_mean'].values

    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=seed)

    # Rejoin train with test. DataFrame.append was removed in pandas 2.0;
    # pd.concat gives the same row-wise stacking.
    X_try = pd.concat([X_train, X_test])

    # Keep track of the length of the train set
    len_X_train = len(X_train)

    return X_try, X_train, X_test, y_train, y_test, len_X_train
예제 #11
0
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas."""
    # Exercise a pandas dataframe holding a categorical feature with a missing entry.
    pd = pytest.importorskip("pandas")

    if pd_nan_type == "pd.NA":
        missing = pd.NA
    else:
        missing = np.nan

    column = pd.Series(["c", "a", missing, "b", "a"], dtype="category")
    df = pd.DataFrame({"col1": column})

    oe = OrdinalEncoder().fit(df)
    learned = oe.categories_
    assert len(learned) == 1
    # The three real categories come first, the missing marker last.
    assert_array_equal(learned[0][:3], ["a", "b", "c"])
    assert np.isnan(learned[0][-1])

    df_trans = oe.transform(df)

    expected_codes = [[2.0], [0.0], [np.nan], [1.0], [0.0]]
    assert_allclose(df_trans, expected_codes)

    # Round trip: everything but the missing row maps back to its label.
    X_inverse = oe.inverse_transform(df_trans)
    assert X_inverse.shape == (5, 1)
    assert_array_equal(X_inverse[:2, 0], ["c", "a"])
    assert_array_equal(X_inverse[3:, 0], ["b", "a"])
    assert np.isnan(X_inverse[2, 0])
예제 #12
0
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas."""
    # Exercise a pandas dataframe holding a categorical feature with a missing entry.
    wants_pd_na = pd_nan_type == 'pd.NA'
    if wants_pd_na:
        # pd.NA is in pandas 1.0
        pd = pytest.importorskip('pandas', minversion="1.0")
        missing = pd.NA
    else:  # np.nan
        pd = pytest.importorskip('pandas')
        missing = np.nan

    column = pd.Series(['c', 'a', missing, 'b', 'a'], dtype='category')
    df = pd.DataFrame({'col1': column})

    oe = OrdinalEncoder().fit(df)
    learned = oe.categories_
    assert len(learned) == 1
    # The three real categories come first, the missing marker last.
    assert_array_equal(learned[0][:3], ['a', 'b', 'c'])
    assert np.isnan(learned[0][-1])

    df_trans = oe.transform(df)

    expected_codes = [[2.0], [0.0], [np.nan], [1.0], [0.0]]
    assert_allclose(df_trans, expected_codes)

    # Round trip: everything but the missing row maps back to its label.
    X_inverse = oe.inverse_transform(df_trans)
    assert X_inverse.shape == (5, 1)
    assert_array_equal(X_inverse[:2, 0], ['c', 'a'])
    assert_array_equal(X_inverse[3:, 0], ['b', 'a'])
    assert np.isnan(X_inverse[2, 0])
예제 #13
0
def setup():
    """Fetch the 30-days-of-ml data if needed and return ordinal-encoded features.

    Returns:
        ``(X, y, X_test)``: the training features with every column whose name
        contains ``'cat'`` ordinal-encoded, the ``target`` column, and the test
        features encoded with the same encoder.

    Raises:
        subprocess.CalledProcessError: if the download or the unzip step fails.
    """
    import subprocess

    if not path.isfile(".data/30-days-of-ml.zip"):
        # ``-p .data`` makes the archive land where the check above and the
        # unzip below expect it; without it the kaggle CLI downloads into the
        # current working directory.
        subprocess.run(
            ["kaggle", "competitions", "download", "-c", "30-days-of-ml",
             "-p", ".data"],
            check=True)
    if not path.isdir(".data/30-days-of-ml/"):
        subprocess.run(
            ["unzip", ".data/30-days-of-ml.zip", "-d", ".data/30-days-of-ml"],
            check=True)

    # Load the training data
    train = pd.read_csv(".data/30-days-of-ml/train.csv", index_col=0)
    test = pd.read_csv(".data/30-days-of-ml/test.csv", index_col=0)

    # Separate target from features
    y = train['target']
    features = train.drop(['target'], axis=1)

    # List of categorical columns
    object_cols = [col for col in features.columns if 'cat' in col]

    # ordinal-encode categorical columns; the encoder is fitted on the
    # training features only and reused for the test set
    X = features.copy()
    X_test = test.copy()
    ordinal_encoder = OrdinalEncoder()
    X[object_cols] = ordinal_encoder.fit_transform(features[object_cols])
    X_test[object_cols] = ordinal_encoder.transform(test[object_cols])

    return X, y, X_test
예제 #14
0
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
    """Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    """

    X = np.array([["b"], ["a"]], dtype=input_dtype)
    # Categories are deliberately listed as "b" then "a", so "b" -> 0, "a" -> 1.
    categories = [np.array(["b", "a"], dtype=category_dtype)]
    X_test = _convert_container(
        [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
    )

    # One-hot encoding: column 0 flags "b", column 1 flags "a".
    one_hot = OneHotEncoder(categories=categories, sparse=False).fit(X)
    one_hot_out = one_hot.transform(X_test)
    assert_allclose(one_hot_out, np.array([[0, 1], [0, 1], [1, 0], [0, 1]]))

    # Ordinal encoding of the same data.
    ordinal = OrdinalEncoder(categories=categories).fit(X)
    ordinal_out = ordinal.transform(X_test)
    assert_array_equal(ordinal_out, np.array([[1], [1], [0], [1]]))
예제 #15
0
def load_data(path, path_on):
    """Load the offline and online CSV files and ordinal-encode the offline features.

    Args:
        path: Path of the offline (labelled) CSV file.
        path_on: Path of the online CSV file.

    Returns:
        A tuple ``(x, y, oe, cols)``: the encoded offline feature matrix, the
        ``label`` column, the fitted ``OrdinalEncoder`` and the feature column
        index.
    """
    df = pd.read_csv(path)
    df = precession(df)
    # TODO (to optimise): missing values are filled with "empty", the filler
    # string the raw data already uses for missing values; this keeps things
    # consistent and turns "missing" into a category of its own.
    df = df.fillna("empty")

    # Offline training data
    x = df.drop(['label', 'sid'], axis=1)
    y = df['label']
    cols = x.columns

    # Online data
    df_on = pd.read_csv(path_on)
    df_on = precession(df_on)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # Merge offline and online data so that no value gets a code from one set
    # while being unknown in the other. ``axis`` is passed by keyword because
    # pandas 2.x no longer accepts it positionally.
    x_all = pd.concat([x, x_on], axis=0)
    print(x_all.shape)

    # Encode every string value as a number
    oe = OrdinalEncoder()
    oe.fit(x_all)
    x = oe.transform(x)
    print(x.shape)
    return x, y, oe, cols
예제 #16
0
 def _encode_feature(self, mat, feature_column):
     """Ordinal-encode one column of ``mat`` in place and remember its encoder.

     The fitted encoder is stored in ``self.feature_encoders`` under the
     column name; ``mat`` is returned.
     """
     # The encoder needs a 2-D input, hence the single-column reshape.
     column_2d = mat[feature_column].to_numpy().reshape(-1, 1)
     encoder = OrdinalEncoder()
     encoder.fit(column_2d)
     self.feature_encoders[feature_column] = encoder
     encoded = encoder.transform(column_2d)
     mat.loc[:, feature_column] = encoded
     return mat
def ordinal_encoder(params):
    """Ordinal-encode a train/test pair of 1-D arrays.

    ``params[0]`` is the training array and ``params[1]`` the test array. Both
    are cast to strings; the encoder is fitted on the training values only.
    Returns the two encoded arrays flattened back to 1-D.
    """
    train_str = params[0].astype('str')
    test_str = params[1].astype('str')
    encoder = OrdinalEncoder()
    # The encoder expects a 2-D input: one column, one row per value.
    train_codes = encoder.fit_transform(train_str.reshape(-1, 1))
    test_codes = encoder.transform(test_str.reshape(-1, 1))
    return train_codes.flatten(), test_codes.flatten()
예제 #18
0
    def new_data_encoding(self, types_dict=None):
        """Encode columns of ``self.data`` and persist the fitted encoders.

        Args:
            types_dict: Mapping of column name to encoding kind: ``0`` for
                ordinal encoding, ``1`` for one-hot encoding. Other values are
                ignored. Defaults to an empty mapping.

        Ordinal columns are replaced in place. Each one-hot column is replaced,
        at its original position, by one column per category. Both encoders
        are dumped with joblib under ``datahandle/``.
        """
        # None instead of a shared mutable ``{}`` default.
        if types_dict is None:
            types_dict = {}
        ordinal_list = [key for key, kind in types_dict.items() if kind == 0]
        onehot_list = [key for key, kind in types_dict.items() if kind == 1]
        ordinal = OrdinalEncoder()
        onehot = OneHotEncoder()
        result = None
        print(ordinal_list)
        print(onehot_list)

        if ordinal_list:
            temp_o = self.data.loc[:, ordinal_list]
            # A single column is fitted as a bare 2-D array, several columns
            # as a DataFrame.
            ordinal_input = (temp_o.values.reshape(-1, 1)
                             if len(ordinal_list) == 1 else temp_o)
            ordinal.fit(ordinal_input)
            self.data.loc[:, ordinal_list] = ordinal.transform(ordinal_input)
        joblib.dump(ordinal, 'datahandle/ordinal.pkl')

        if onehot_list:
            temp_hot = self.data.loc[:, onehot_list]
            hot_input = (temp_hot.values.reshape(-1, 1)
                         if len(onehot_list) == 1 else temp_hot)
            onehot.fit(hot_input)
            # Densify in every case (the single-column path used to assign the
            # sparse matrix directly) and reuse self.data's index so the
            # concat below lines rows up correctly.
            result = pd.DataFrame(onehot.transform(hot_input).toarray(),
                                  index=self.data.index)
            result.columns = [label for cats in onehot.categories_
                              for label in cats]
        joblib.dump(onehot, 'datahandle/onehot.pkl')

        # Only a fitted encoder has ``categories_``; nothing to expand otherwise.
        if onehot_list:
            for feature, cats in zip(onehot_list, onehot.categories_):
                dummies = result.loc[:, list(cats)]
                pos = self.data.columns.get_loc(feature)
                # Splice the indicator columns in where the original column was.
                left = self.data.iloc[:, 0:pos]
                right = self.data.iloc[:, pos + 1:]
                self.data = pd.concat([left, dummies, right], axis=1)
        print(self.data)
예제 #19
0
def test_value_difference_metric_property(dtype, k, r, y_type, encode_label):
    # Verify the property of the VDM distance described in "Improved
    # Heterogeneous Distance Functions", D.R. Wilson and T.R. Martinez,
    # Journal of Artificial Intelligence Research 6 (1997) 1-34
    # https://arxiv.org/pdf/cs/9701101.pdf
    #
    # "if an attribute color has three values red, green and blue, and the
    # application is to identify whether or not an object is an apple, red and
    # green would be considered closer than red and blue because the former two
    # both have similar correlations with the output class apple."

    # single colour feature: ten samples of each colour
    colours = ["green"] * 10 + ["red"] * 10 + ["blue"] * 10
    X = np.array(colours).reshape(-1, 1)
    # 0 - not an apple / 1 - an apple
    class_index = np.array([1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1])
    y_labels = np.array(["not apple", "apple"], dtype=object)
    y = _convert_container(y_labels[class_index], y_type)
    if encode_label:
        y = LabelEncoder().fit_transform(y)

    encoder = OrdinalEncoder(dtype=dtype)
    X_encoded = encoder.fit_transform(X)

    vdm = ValueDifferenceMetric(k=k, r=r)
    vdm.fit(X_encoded, y)

    sample_green = encoder.transform([["green"]])
    sample_red = encoder.transform([["red"]])
    sample_blue = encoder.transform([["blue"]])

    # the distance between a sample and itself (same category) must be null
    for sample in (sample_green, sample_red, sample_blue):
        self_distance = vdm.pairwise(sample).squeeze()
        assert self_distance == pytest.approx(0)

    # check the property explained in the introduction example
    green_red = vdm.pairwise(sample_green, sample_red).squeeze()
    blue_red = vdm.pairwise(sample_blue, sample_red).squeeze()
    blue_green = vdm.pairwise(sample_blue, sample_green).squeeze()

    # green and red are very close
    # blue is closer to red than green
    assert green_red < blue_red
    assert green_red < blue_green
    assert blue_red < blue_green
예제 #20
0
def encode_column(encoding_type, col_name):
    """Encode one column of the module-level ``df`` in place.

    Args:
        encoding_type: ``'label'`` for label encoding, or ``'ordinal'`` for
            the fixed academic-rank order AsstProf < AssocProf < Prof.
            Any other value leaves ``df`` untouched.
        col_name: Name of the column of ``df`` to encode.

    Returns:
        None; ``df[col_name]`` is overwritten with the integer codes.
    """
    if encoding_type=='label':
        # fit_transform fits and encodes in one pass; a separate fit is redundant.
        df[col_name] = LabelEncoder().fit_transform(df[col_name])
        print("Label Encoded")
    if encoding_type=='ordinal':
        # Explicit category order gives AsstProf -> 0, AssocProf -> 1, Prof -> 2.
        rank_order = ["AsstProf", "AssocProf", "Prof"]
        oe = OrdinalEncoder(categories=[rank_order], dtype=int)
        # The encoder needs a 2-D input (one column), and its result has to be
        # written back to the frame to have any effect.
        df[col_name] = oe.fit_transform(df[[col_name]]).ravel()
        print("Ordinal Encoded")

    return
예제 #21
0
class CatSklearnAttacker(PrivacyAttackerModel):
    """Base class for categorical attacker based on sklearn models.

    Attributes:
        KEY_TYPE (CategoricalType):
            Required key attribute type (class_num or one_hot) by the learner.
        SENSITIVE_TYPE (CategoricalType):
            Required sensitive attribute type (class_num or one_hot) by the learner.
        SKL_LEARNER (Class):
            A (wrapped) sklearn classifier class that can be called with no arguments.
    """
    KEY_TYPE = None
    SENSITIVE_TYPE = None
    SKL_LEARNER = None

    def __init__(self):
        self.predictor = self.SKL_LEARNER()

        # CLASS_NUM selects ordinal codes; anything else selects one-hot.
        if self.KEY_TYPE == CategoricalType.CLASS_NUM:
            self.key_processor = OrdinalEncoder()
        else:
            self.key_processor = OneHotEncoder()

        if self.SENSITIVE_TYPE == CategoricalType.CLASS_NUM:
            self.sensitive_processor = OrdinalEncoder()
        else:
            self.sensitive_processor = OneHotEncoder()

    def fit(self, synthetic_data, key, sensitive):
        """Fit both encoders and the predictor on the synthetic table."""
        key_table = allow_nan(synthetic_data[key])
        sensitive_table = allow_nan(synthetic_data[sensitive])

        self.key_processor.fit(key_table)
        self.sensitive_processor.fit(sensitive_table)

        encoded_keys = self.key_processor.transform(key_table)
        encoded_sensitive = self.sensitive_processor.transform(sensitive_table)
        self.predictor.fit(encoded_keys, encoded_sensitive)

    def predict(self, key_data):
        """Predict the sensitive attributes for one row of key attributes.

        Returns a tuple of predicted values, or None when the key encoder
        raises ValueError for the row.
        """
        cleaned_keys = allow_nan_array(key_data)  # de-nan key attributes
        try:
            # key attributes in ML ready format
            encoded_keys = self.key_processor.transform([cleaned_keys])
        except ValueError:  # Some attributes of the input haven't appeared in synthetic tables
            return None

        prediction = self.predictor.predict(encoded_keys)
        # The decoder needs a 2-D input; wrap a flat prediction in a list.
        if np.array(prediction).ndim == 1:
            prediction = [prediction]

        # predicted sensitive attributes in original format
        decoded = self.sensitive_processor.inverse_transform(prediction)
        return tuple(decoded[0])
예제 #22
0
def encode_batters(data):
    """Replace the batter and base-runner id columns of ``data`` with integer codes.

    Missing ids are filled with ``''``. One encoder is fitted on the union of
    ids found in all four columns, so an id gets the same code wherever it
    appears. ``data`` is modified in place and returned.
    """
    id_columns = ['BAT_ID', 'BASE1_RUN_ID', 'BASE2_RUN_ID', 'BASE3_RUN_ID']

    data[id_columns] = data[id_columns].fillna('')

    # Every distinct id across the four columns, as one column for fitting.
    batters = np.unique(data[id_columns].values.reshape(-1))
    encoder = OrdinalEncoder().fit(batters.reshape(-1, 1))

    for column in id_columns:
        codes = encoder.transform(data[column].values.reshape(-1, 1))
        data[column] = codes.reshape(-1).astype(int)
    return data
예제 #23
0
def test_ordinalencoder():
    """A converted OrdinalEncoder must transform data like the sklearn original."""
    datasets = (
        [["Male", 1], ["Female", 3], ["Female", 2]],
        [["Male", 1], ["Female", 27], ["Bananas", 2]],
    )
    for rows in datasets:
        reference = OrdinalEncoder()
        reference.fit(rows)
        converted = convert_estimator(reference)
        assert np.allclose(reference.transform(rows), converted.transform(rows))
def ordinal_encode_df(df, encoder=None):
    """Ordinal-encode the object columns of ``df`` in place, keeping NaN as NaN.

    Args:
        df: DataFrame whose ``object`` dtype columns are encoded.
        encoder: Optional already-fitted ``OrdinalEncoder`` for those columns.
            When omitted, a new encoder is fitted on ``df``.

    Returns:
        None; the object columns of ``df`` are overwritten with their codes.
    """
    object_cols = df.select_dtypes('object').columns
    if len(object_cols) == 0:
        # Nothing to encode.
        return

    # NaN is temporarily turned into the string 'nan' so it can be encoded.
    filled = df[object_cols].replace(np.nan, 'nan')
    ordinal_enc = OrdinalEncoder().fit(filled) if encoder is None else encoder

    # Keep df's index and column names so the write-back aligns row by row.
    codes = pd.DataFrame(ordinal_enc.transform(filled),
                         index=df.index,
                         columns=object_cols).astype('int')

    # we put back in the nan values: per column, look up the code given to
    # 'nan' (if that column has one) instead of assuming three columns.
    for col, cats in zip(object_cols, ordinal_enc.categories_):
        column_codes = codes[col]
        nan_positions = np.flatnonzero(np.asarray(cats, dtype=object) == 'nan')
        if nan_positions.size:
            column_codes = column_codes.replace(int(nan_positions[0]), np.nan)
        df[col] = column_codes
예제 #25
0
def encodeOrdinal(data, col_names):
    """Ordinal-encode ``col_names`` of ``data`` in place.

    Returns ``data`` together with the fitted encoder.
    """
    # Fit a fresh encoder on the selected columns
    selected = data[col_names]
    ordinal_encoder = OrdinalEncoder()
    ordinal_encoder.fit(selected)

    # Overwrite the original columns with their numeric codes
    encoded = ordinal_encoder.transform(data[col_names])
    data[col_names] = encoded
    return data, ordinal_encoder
예제 #26
0
def loadCar():
    """Load ``car.csv`` and return encoded train and held-out splits.

    The first 1401 rows form the main set, the remaining rows the final test
    set. Columns 0-5 are features and column 6 is the label. Both encoders
    are fitted on the main set only.

    Returns:
        ``(X, y, finalTestX, finalTestY)``
    """
    data = pd.read_csv('car.csv')

    main_rows = data.values[:1401]
    held_out_rows = data.values[1401:]
    X, y = main_rows[:, 0:6], main_rows[:, 6]
    finalTestX, finalTestY = held_out_rows[:, 0:6], held_out_rows[:, 6]

    print("Size of car data: ", len(X))

    # Features: ordinal codes
    feature_encoder = OrdinalEncoder()
    feature_encoder.fit(X)
    X = feature_encoder.transform(X)
    finalTestX = feature_encoder.transform(finalTestX)

    # Labels: label codes
    label_encoder = LabelEncoder()
    label_encoder.fit(y)
    y = label_encoder.transform(y)
    finalTestY = label_encoder.transform(finalTestY)
    return X, y, finalTestX, finalTestY
예제 #27
0
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Checks that ordinal encoder transforms string dtypes. Non-regression
    test for #19872."""
    unknown_code = -9
    encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', unknown_value=unknown_code
    )
    encoder.fit(X_train)

    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[-9, 0]])
예제 #28
0
def id3_adapret(**kwargs):
    """Train an entropy-based decision tree and score it on the test set.

    Keyword Args:
        train: Training DataFrame with a ``class`` column of 'yes'/'no'.
        test: Test DataFrame with the same columns.
        tolorance: Value passed as ``max_depth`` to the tree.

    Returns:
        dict with ``score`` (percentage of correct predictions) and the
        confusion-matrix counts ``TP``, ``TN``, ``FP`` and ``FN``.
    """
    # getting data from kwargs
    train = kwargs['train']
    test = kwargs['test']
    t = kwargs['tolorance']
    # building the encoder on train + test together, so every value that
    # appears in either set gets a code
    merged_data = pd.concat([train,test])
    # ``columns=`` keyword: pandas 2.x no longer accepts a positional axis
    merged_data_without_class = merged_data.drop(columns='class')
    encoder = OrdinalEncoder()
    encoder.fit(merged_data_without_class)
    # separating the classification column from the datasets
    train_without_class = train.drop(columns='class')
    test_without_class = test.drop(columns='class')
    train_classifications = train['class']
    test_classifications = test['class']
    # encoding them all
    encoded_train_without_class = encoder.transform(train_without_class)
    encoded_test_without_class = encoder.transform(test_without_class)
    encoded_train_classifications = train_classifications.map({'yes':1,'no':0})
    encoded_test_classifications = test_classifications.map({'yes':1,'no':0})
    # building the classification tree
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=t)
    clf.fit(encoded_train_without_class, encoded_train_classifications)
    # predicting with the tree
    predictions = clf.predict(encoded_test_without_class)
    # building the confusion matrix and calculating the score
    correct = 0
    TP,TN,FP,FN = 0,0,0,0
    for classif, predic in zip(encoded_test_classifications, predictions):
        if classif == predic:
            correct += 1
        if classif == 1 and predic == 1:
            TP += 1
        elif classif == 0 and predic == 0:
            TN += 1
        elif classif == 0 and predic == 1:
            FP += 1
        elif classif == 1 and predic == 0:
            FN += 1
    total = len(predictions)
    # returning a dict in the shape the adapter expects
    return {'score':( correct / total ) *100,
            'TP':TP,
            'TN':TN,
            'FP':FP,
            'FN':FN}
예제 #29
0
def clean_data(data: DataFrame):
    """Select the model columns and ordinal-encode ``form_field47``.

    Args:
        data: Input frame containing at least the ``form_field*`` columns
            listed below.

    Returns:
        A new DataFrame with the selected columns, where ``form_field47`` is
        encoded as 'charge' -> 0 and 'lending' -> 1.
    """
    columns = ['form_field1', 'form_field2', 'form_field3', 'form_field4', 'form_field5', 'form_field6', 'form_field7', 'form_field8', 'form_field9', 'form_field10', 'form_field12', 'form_field13', 'form_field14', 'form_field16', 'form_field17', 'form_field18', 'form_field19', 'form_field20', 'form_field21', 'form_field22', 'form_field24', 'form_field25', 'form_field26', 'form_field27', 'form_field28', 'form_field29', 'form_field32', 'form_field33', 'form_field34', 'form_field36', 'form_field37', 'form_field38', 'form_field39', 'form_field42', 'form_field43', 'form_field44', 'form_field46', 'form_field47', 'form_field48', 'form_field49', 'form_field50']
    categories = [array(['charge', 'lending'], dtype=object)]

    # Copy so the assignment below writes to an independent frame rather than
    # to a slice of the caller's data.
    df = data[columns].copy()
    # Hand the fixed categories to the constructor and fit, instead of setting
    # ``categories_`` on an unfitted encoder: fit() also sets the other
    # internal attributes that transform() relies on.
    enc = OrdinalEncoder(categories=categories)
    df['form_field47'] = enc.fit_transform(df[['form_field47']]).ravel()
    return df
예제 #30
0
def test_ordinal_encoder_handle_unknowns_nan():
    # unknown_value=np.nan must be usable as the code for unseen categories
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan
    )

    seen = np.array([[1], [2], [3]])
    encoder.fit(seen)

    # 4 was not part of the fitted data, so it should come back as nan
    encoded = encoder.transform([[1], [2], [4]])
    assert_array_equal(encoded, [[0], [1], [np.nan]])
예제 #31
0
def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
    """Test the interaction between missing values and handle_unknown"""
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1
    )

    fitted_output = encoder.fit_transform(X)
    assert_allclose(fitted_output, expected_X_trans)

    # The test sample is expected to be encoded as the unknown value.
    unseen_output = encoder.transform(X_test)
    assert_allclose(unseen_output, [[-1.0]])