Example #1
def train_my_model(train_dataset):
    clean_data, rusher_char = cleaning_blue_print(train_dataset, save=False)
    spatial_data = spatial_blue_print(train_dataset, save=False)
    # for testing purposes
    # clean_data = pd.read_csv('datasets/train_cleaned_data_v1_1.csv')
    # spatial_data = pd.read_csv('datasets/train_spatial_data_v1_1.csv')
    # print(clean_data.dtypes['GameSnap'], spatial_data.dtypes['_GameSnap'])
    total_data = pd.merge(clean_data,
                          spatial_data,
                          left_on='GameSnap',
                          right_on='_GameSnap',
                          how='left')
    dataset = total_data[total_data.QB1_offense_mean_distance.notnull()].drop(
        ['GameSnap', '_GameSnap'],
        axis=1)  # rows where QB1_offense_mean_distance is not populated are dropped
    train, test = train_test_split(dataset, test_size=0.3, random_state=123)
    # print(train.isnull().sum().to_string())
    x_train = train.drop(['Yards'], axis=1)
    x_test = test.drop(['Yards'], axis=1)
    y_train = train['Yards']
    y_test = test['Yards']

    # finding the categorical variables
    cat_features = train.select_dtypes(include=['object']).columns
    # Ordinal encoding of the categorical variables
    enc = OrdinalEncoder()
    enc.fit(dataset[cat_features])
    x_train[cat_features] = enc.transform(x_train[cat_features])
    # print(cat_features)
    # Random Forest model
    RF_model = RandomForestRegressor()
    RF_model.fit(x_train, y_train)
    return (RF_model, enc, cat_features, rusher_char)
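
The helpers cleaning_blue_print and spatial_blue_print are not shown above, so here is a minimal, self-contained sketch of the same encode-then-fit pattern on made-up data (column names and values are assumptions, not from the original project):

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

toy = pd.DataFrame({
    'Team': ['home', 'away', 'home', 'away'],   # assumed categorical feature
    'Distance': [3.2, 7.5, 1.1, 4.8],           # assumed numeric feature
    'Yards': [4, 2, 6, 1],                      # target
})
cat_features = toy.select_dtypes(include=['object']).columns
enc = OrdinalEncoder()
toy[cat_features] = enc.fit_transform(toy[cat_features])

model = RandomForestRegressor()
model.fit(toy.drop(['Yards'], axis=1), toy['Yards'])
print(model.predict(toy.drop(['Yards'], axis=1)[:2]))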
Example #2
 def test_ordinal_encoder_mixed_string_int_drop(self):
     data = [
         ["c0.4", "c0.2", 3],
         ["c1.4", "c1.2", 0],
         ["c0.2", "c2.2", 1],
         ["c0.2", "c2.2", 1],
         ["c0.2", "c2.2", 1],
         ["c0.2", "c2.2", 1],
     ]
     test = [["c0.2", "c2.2", 1]]
     model = OrdinalEncoder(categories="auto")
     model.fit(data)
     inputs = [
         ("input1", StringTensorType([None, 2])),
         ("input2", Int64TensorType([None, 1])),
     ]
     model_onnx = convert_sklearn(model,
                                  "ordinal encoder",
                                  inputs,
                                  target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         test,
         model,
         model_onnx,
         basename="SklearnOrdinalEncoderMixedStringIntDrop",
         allow_failure="StrictVersion("
         "onnxruntime.__version__)"
         "<= StrictVersion('0.5.0')",
     )
Example #3
File: models.py Project: pajoshi/pycbr
class MatrixOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is defined by a matrix"""

    def __init__(self, values, matrix, undefined=("n.a.",)):
        super().__init__()
        self.values = values
        self.matrix = matrix
        self.undefined = undefined

        self.n = len(values)
        self.encoder = None

    def get_description(self):
        return {"__class__": self.__class__.__module__ + "." + self.__class__.__name__,
                "values": self.values, "matrix": self.matrix,
                "undefined": self.undefined}

    def fit(self, X, y=None):
        self.encoder = OrdinalEncoder([self.values + list(self.undefined)], dtype=int)
        self.encoder.fit([[x] for x in self.values + list(self.undefined)])  # Argument irrelevant
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X)

    def similarity(self, x, y):
        if x >= self.n or y >= self.n:
            return np.nan
        return self.matrix[x][y]
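
The pycbr Attribute base class is not included here, so the snippet below is a standalone sketch (made-up values and similarity matrix) of the encode-then-look-up idea used by MatrixOrdinalAttribute:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

values = ["red", "green", "blue"]     # assumed category list
undefined = ["n.a."]
matrix = [[1.0, 0.5, 0.0],            # assumed pairwise similarities
          [0.5, 1.0, 0.5],
          [0.0, 0.5, 1.0]]

encoder = OrdinalEncoder([values + undefined], dtype=int)
encoder.fit([[v] for v in values + undefined])  # argument irrelevant, as above

x, y = encoder.transform([["red"], ["blue"]]).ravel()
similarity = matrix[x][y] if x < len(values) and y < len(values) else np.nan
print(similarity)  # 0.0 under the assumed matrix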
Example #4
def load_data(path,path_on):
    df = pd.read_csv(path, sep="\t")
    df = precession(df)
    # after inspecting the model's feature_importance, drop some features with very low importance
    df.drop(['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5'], axis=1, inplace=True)
    # TODO (to improve): missing values are filled with "empty", the same filler string the original
    # data uses for missing values, so the filler itself also becomes one category of the feature
    df = df.fillna("empty")

    # offline training data
    x = df.drop(['label','sid'],axis=1)
    y = df['label']
    cols = x.columns

    # online (serving) data
    df_on = pd.read_csv(path_on, sep="\t")
    df_on = precession(df_on)
    df_on.drop(['imeimd5', 'openudidmd5', 'os', 'adidmd5', 'idfamd5'], axis=1, inplace=True)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # merge the offline and online data so that categories that appear only online still get a code
    x_all = pd.concat([x, x_on], axis=0)

    # encode all strings as numbers
    oe = OrdinalEncoder()
    oe.fit(x_all)  # pass the whole frame; object-typed columns are encoded automatically
    x = oe.transform(x)
    print(x.shape)
    return x,y,oe,cols
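
A small self-contained sketch (synthetic frames, assumed column names) of the fit-on-the-union trick above: fitting the encoder on the offline and online data together means categories that only appear online still get a code:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

x = pd.DataFrame({'city': ['beijing', 'shanghai'], 'os_ver': ['9', '10']})
x_on = pd.DataFrame({'city': ['shenzhen', 'beijing'], 'os_ver': ['10', '11']})

oe = OrdinalEncoder()
oe.fit(pd.concat([x, x_on], axis=0))  # 'shenzhen' and '11' appear only in x_on
print(oe.transform(x))                # transforms without unknown-category errors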
Example #5
File: models.py Project: pajoshi/pycbr
class LinearOrdinalAttribute(Attribute):
    """A (possibly) categorical attribute whose similarity is linear with respect to a scale"""

    def __init__(self, order, undefined=("n.a.",)):
        """

        Args:
            order (list): List of values, defining their ordering.
            undefined (iterable): Values which are recognized, but not comparable to the ranking. When such a value is
                                  found, the similarity returned is nan.
        """
        super().__init__()
        self.order = order
        self.undefined = undefined

        self.n = len(order)
        self.encoder = None

    def get_description(self):
        return {"__class__": self.__class__.__module__ + "." + self.__class__.__name__,
                "order": self.order, "undefined": self.undefined}

    def fit(self, X, y=None):
        self.encoder = OrdinalEncoder([self.order + list(self.undefined)])
        self.encoder.fit([[x] for x in self.order + list(self.undefined)])  # Argument irrelevant
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X)

    def similarity(self, x, y):
        if x >= self.n or y >= self.n:
            return np.nan
        return 1 - abs(x - y) / (self.n - 1)
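
As with Example #3, here is a standalone sketch (made-up order values) of the linear similarity used by LinearOrdinalAttribute: encode against an explicit order and rescale the index distance to [0, 1]:

from sklearn.preprocessing import OrdinalEncoder

order = ["low", "medium", "high"]   # assumed ordered scale
enc = OrdinalEncoder([order])
enc.fit([[v] for v in order])       # argument irrelevant, as above

x, y = enc.transform([["low"], ["high"]]).ravel()
print(1 - abs(x - y) / (len(order) - 1))  # 0.0: the two extremes of the scale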
Example #6
def encode_categories(final_db):
    '''Take the needed features from the dataset and encode strings as categorical numbers
    Inputs:
        - final_db (Pandas dataframe): cleaned dataframe 
    Outputs:
        - X (Pandas dataframe): feature matrix dimension NxD, where N is the datapoints number and D the number of features
        - y (numpy array): labels array (binary or multiclass), dimension Nx1 
        - enc (sklearn OrdinalEncoder): ordinal encoder used (to be used in decoding after)
    '''

    # Loading data
    X = final_db.copy()
    X = X[[
        'exposure_type', 'obs_duration_mean', 'conc1_type', 'species', 'class',
        'tax_order', 'family', 'genus', 'atom_number', 'alone_atom_number',
        'tripleBond', 'doubleBond', 'bonds_number', 'ring_number', 'Mol',
        'MorganDensity', 'LogP'
    ]]
    y = final_db.score.copy().values

    # Encoding phase
    enc = OrdinalEncoder(dtype=int)
    enc.fit(X[[
        'exposure_type', 'conc1_type', 'species', 'class', 'tax_order',
        'family', 'genus'
    ]])
    X[['exposure_type', 'conc1_type', 'species', 'class', 'tax_order', 'family', 'genus']] = \
        enc.transform(X[['exposure_type', 'conc1_type', 'species', 'class', 'tax_order', 'family', 'genus']])

    return X, y, enc
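
The docstring says the returned encoder is meant for decoding later; a minimal sketch of that step (assumed, simplified column names) using inverse_transform:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({'species': ['trout', 'carp', 'trout'], 'LogP': [1.2, 0.4, 2.2]})
enc = OrdinalEncoder(dtype=int)
X[['species']] = enc.fit_transform(X[['species']])
print(enc.inverse_transform(X[['species']]))  # back to the original string labels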
Example #7
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    if X is None:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])

    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()

    ordinal_encoder_time = []
    custom_encoder_time = []
    for i in range(iterations):
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
Example #8
def test_OrdinalEncoder():
    expected = pd.DataFrame({"name": nominal, "feature": nominal})

    oe = OrdinalEncoder()
    oe.fit(X[nominal])

    assert feat(oe, nominal).equals(expected)
Example #9
class NewOrdinalEncoder(OrdinalEncoder):
    """
    comparable with null value & numerical input
    """
    def __init__(self, category_cols: List[str], begin_idx=0) -> None:
        super(OrdinalEncoder, self).__init__()
        self.ordinal_encoder = OrdinalEncoder(
            # handle_unknown='use_encoded_value', unknown_value='null'
        )
        self.category_cols = category_cols
        # self.null_map = {col: 'null' for col in self.category_cols}
        self.begin_idx = begin_idx

    def fit(self, X, y=None):
        # X.fillna(self.null_map, inplace=True)
        # X[self.category_cols] = X[self.category_cols].astype('str')
        self.ordinal_encoder.fit(X[self.category_cols])
        return self

    def transform(self, X):
        # X[self.category_cols] = X[self.category_cols].astype('str')
        # X.fillna(self.null_map, inplace=True)
        X.loc[:, self.category_cols] = self.ordinal_encoder.transform(
            X[self.category_cols]).astype('int') + self.begin_idx
        return X
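
A self-contained sketch (made-up frame) of what NewOrdinalEncoder does: ordinal-encode the selected columns in place and shift the integer codes so they start at begin_idx:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({'color': ['red', 'blue', 'red'],
                   'size': ['S', 'M', 'L'],
                   'price': [1.0, 2.0, 3.0]})
category_cols, begin_idx = ['color', 'size'], 1
df[category_cols] = OrdinalEncoder().fit_transform(df[category_cols]).astype(int) + begin_idx
print(df)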
Example #10
def load_data(path, path_on):
    df = pd.read_csv(path)
    df = precession(df)
    # after inspecting the model's feature_importance, drop some features with very low importance
    # TODO (to improve): missing values are filled with "empty", the same filler string the original
    # data uses for missing values, so the filler itself also becomes one category of the feature
    df = df.fillna("empty")

    # offline training data
    x = df.drop(['label', 'sid'], axis=1)
    y = df['label']
    cols = x.columns

    # online (serving) data
    df_on = pd.read_csv(path_on)
    df_on = precession(df_on)
    x_on = df_on.drop(['sid'], axis=1)
    x_on = x_on.fillna("empty")
    # merge the offline and online data so that categories that appear only online still get a code
    x_all = pd.concat([x, x_on], axis=0)
    print(x_all.shape)

    # encode all strings as numbers
    oe = OrdinalEncoder()
    oe.fit(x_all)
    x = oe.transform(x)
    print(x.shape)
    return x, y, oe, cols
Example #11
        def set_miss_values(df, complete_index):
            enc_label = OrdinalEncoder()
            enc_fea = OrdinalEncoder()
            missing_index = complete_index[0]

            # Take out the complete rows (no NaN) and feed them to a Random Forest Regressor
            train_df = df[complete_index]
            # known & unknown values
            known_values = np.array(train_df[train_df[missing_index].notnull()])
            unknown_values = np.array(train_df[train_df[missing_index].isnull()])

            # y is the known missing_index column
            y = known_values[:, 0].reshape(-1, 1)
            enc_label.fit(y)
            y = enc_label.transform(y)

            # X are the features
            X = known_values[:, 1:]
            test_X = unknown_values[:, 1:]
            all_X = np.row_stack((X, test_X))
            enc_fea.fit(all_X)
            X = enc_fea.transform(X)

            # fit
            rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
            rfr.fit(X, y.ravel())
            # predict
            predicted_values = rfr.predict(enc_fea.transform(unknown_values[:, 1:]))
            predicted_values = enc_label.inverse_transform(predicted_values.reshape(-1, 1))
            # fill in with predicted values
            df.loc[(df[missing_index].isnull()), missing_index] = predicted_values
            return df
Example #12
def load_data_knn(DATA_PATH, encoding, seed=42):

    db = pd.read_csv(DATA_PATH).drop(
        columns=['Unnamed: 0', 'test_cas', 'smiles'])

    numerical = [
        'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP',
        'alone_atom_number', 'doubleBond', 'tripleBond', 'ring_number',
        'oh_count', 'MeltingPoint', 'WaterSolubility'
    ]

    # Categorical variables + obs_duration_mean (already numeric)
    categorical = [
        'conc1_type', 'exposure_type', 'control_type', 'media_type',
        'application_freq_unit', 'species', 'class', 'tax_order', 'family',
        'genus'
    ]

    # MinMax transform for numerical variables
    for nc in numerical:
        minmax = MinMaxScaler()
        minmax.fit(db[[nc]])
        db[[nc]] = minmax.transform(db[[nc]])

    # Ordinal Encoding for categorical variables
    encoder = OrdinalEncoder(dtype=int)
    encoder.fit(db[categorical])
    db[categorical] = encoder.transform(db[categorical]) + 1

    # Expand the pubchem2d strings into separate columns
    db = pd.concat([db, pd.DataFrame(pd.DataFrame(db['pubchem2d'].values).\
                                     apply(lambda x: x.str.replace('', ' ').str.strip().str.split(' '), axis = 1)[0].to_list(),
                   columns = ['pub'+ str(i) for i in range(1,882)])], axis = 1)

    db.drop(columns=['fish'], inplace=True)

    # Encoding for target variable: binary and multiclass
    if encoding == 'binary':
        db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 0, 1)

    elif encoding == 'multiclass':
        t = db['conc1_mean'].copy()
        db['conc1_mean'] = multiclass_encoding(t)

    X = db.drop(columns='conc1_mean')
    y = db['conc1_mean'].values

    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=seed)

    # rejoin the train set with the test set
    X_try = pd.concat([X_train, X_test])

    # keep track of the train set length
    len_X_train = len(X_train)

    return X_try, X_train, X_test, y_train, y_test, len_X_train
Example #13
def loadUnswNb15(folder, shuffleDataset=False, randomState=None):
    xEncoder = OrdinalEncoder()
    folder = Path(folder)
    trainingSetPath = folder / 'UNSW_NB15_training-set.csv'
    testingSetPath = folder / 'UNSW_NB15_testing-set.csv'
    trainingSet = pd.read_csv(str(trainingSetPath))
    testingSet = pd.read_csv(str(testingSetPath))

    trainingY = trainingSet['attack_cat'].values
    trainingX = trainingSet.drop(columns=['id', 'attack_cat', 'label'])
    testingY = testingSet['attack_cat'].values
    testingX = testingSet.drop(columns=['id', 'attack_cat', 'label'])

    xEncoder.fit(
        pd.concat([
            trainingX[['proto', 'service', 'state']],
            testingX[['proto', 'service', 'state']]
        ],
                  ignore_index=True))
    trainingX[['proto', 'service', 'state']] = \
        xEncoder.transform(trainingX[['proto', 'service', 'state']])
    trainingX = trainingX.values
    testingX[['proto', 'service', 'state']] = \
        xEncoder.transform(testingX[['proto', 'service', 'state']])
    testingX = testingX.values

    if shuffleDataset:
        trainingX, trainingY = shuffle(trainingX,
                                       trainingY,
                                       random_state=randomState)
        testingX, testingY = shuffle(testingX,
                                     testingY,
                                     random_state=randomState)

    return trainingX, testingX, trainingY, testingY
Example #14
 def _encode_feature(self, mat, feature_column):
     feat = mat[feature_column].to_numpy().reshape(-1, 1)
     enc = OrdinalEncoder()
     enc.fit(feat)
     self.feature_encoders[feature_column] = enc
     mat.loc[:,feature_column] = enc.transform(feat)
     return mat
Example #15
def test_ordinal_encoder_handle_unknowns_raise(params, err_type, err_msg):
    # Check error message when validating input parameters
    X = np.array([['a', 'x'], ['b', 'y']], dtype=object)

    encoder = OrdinalEncoder(**params)
    with pytest.raises(err_type, match=err_msg):
        encoder.fit(X)
Example #16
def dataWash(city, path: str):
    weather = pd.read_csv(path)

    X = weather.iloc[:, :-1]
    Y = weather.loc[:, ("Location", "RainTomorrow")]
    X = X.loc[X.loc[:, "Location"] == city]
    Y = Y.loc[Y.loc[:, "Location"] == city]
    Y = Y.drop(['Location'], axis=1)
    X = X.drop(['Location'], axis=1)

    #get month
    X["Date"] = X["Date"].apply(lambda x: int(x.split("/")[1]))
    X = X.rename(columns={"Date": "Month"})

    #fill Null object-data up with most frequent value
    cate = X.columns[X.dtypes == "object"].tolist()
    si = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
    si.fit(X.loc[:, cate])
    X.loc[:, cate] = si.transform(X.loc[:, cate])

    #encode object data
    oe = OrdinalEncoder()
    oe = oe.fit(X.loc[:, cate])
    X.loc[:, cate] = oe.transform(X.loc[:, cate])

    oe = oe.fit(Y.loc[:, :])
    Y.loc[:, :] = oe.transform(Y.loc[:, :])

    #fill float data up with mean value.
    col = X.columns[X.dtypes == "float64"].tolist()
    impmean = SimpleImputer(missing_values=np.nan, strategy="mean")
    impmean = impmean.fit(X.loc[:, col])
    X.loc[:, col] = impmean.transform(X.loc[:, col])

    return X, Y
Example #17
 def labelEncoding(self, data_column):
     logger.info('[{}] : [INFO] Label encoding ...'.format(
         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
     enc = OrdinalEncoder()
     enc.fit(data_column)
     enc_data_column = enc.transform(data_column)
     return enc_data_column
Example #18
def regplot(X, y):
    '''Function for plotting the variables of input X against the target y'''

    fig, axes = plt.subplots(2, 3, figsize=(16, 8))
    fig.suptitle('charges for insurance')

    sns.regplot(ax=axes[0, 0], x='bmi', y=y, data=X)
    #axes[0].set_xlabel('bmi')
    sns.regplot(ax=axes[0, 1], x='children', y=y, data=X)

    sns.regplot(ax=axes[1, 0], x='age', y=y, data=X)

    sns.regplot(ax=axes[1, 1], x='age', y=y, data=X)

    ###### encoding of ordinal data
    ordinal_encoder = OrdinalEncoder(categories=[['no', 'yes']])
    ordinal_encoder.fit(X[['smoker']])
    smoker_encoded = pd.DataFrame(ordinal_encoder.transform(X[['smoker']]))
    ######
    sns.regplot(ax=axes[0, 2], x=smoker_encoded, y=y)
    axes[0, 2].set_xlabel('smoker (0: No, 1: Yes)')

    ordinal_encoder1 = OrdinalEncoder(categories=[['female', 'male']])
    ordinal_encoder1.fit(X[['sex']])
    sex_encoded = pd.DataFrame(ordinal_encoder1.transform(X[['sex']]))

    sns.regplot(ax=axes[1, 2], x=sex_encoded, y=y)
    axes[1, 2].set_xlabel('sex (0: female, 1: male)')
    axes[1, 2].set_xlim(0, 1)
    axes[1, 2].set_ylim(0, 60000)
    plt.show()
Example #19
class SklearnEncoder(object):
    def __init__(self, encoder_type):
        self.encoder_type = encoder_type
        if self.encoder_type == "Label":
            self.encoder_module = LabelEncoder()

        elif self.encoder_type == "OneHot":
            self.encoder_module = OneHotEncoder()

        elif self.encoder_type == "Ordinal":  # ordinal encoding
            self.encoder_module = OrdinalEncoder()

    def _fit(self, x, y=None):
        if self.encoder_type == "Label":
            self.encoder_module.fit(y=x)
        else:
            self.encoder_module.fit(X=x, y=y)

    def _transform(self, x):
        if self.encoder_type == "Label":
            return self.encoder_module.transform(y=x)
        else:
            return self.encoder_module.transform(X=x)

    def _fit_transform(self, x, y=None):
        if self.encoder_type == "Label":
            return self.encoder_module.fit_transform(y=x)
        else:
            return self.encoder_module.fit_transform(X=x, y=y)

    def _reversal(self, x):  # exactly the inverse of transform
        return self.encoder_module.inverse_transform(X=x)
Example #20
def preprocess(df):
	cat_vars = df.columns[df.dtypes == object]
	c = cat_vars.tolist()
	c.remove('STATUS')
	c.append('REHIRE')
	c.append('JOB_SATISFACTION')
	for var in c:
		cat_list ='var'+'_'+var
		cat_list = pd.get_dummies(df[var], prefix=var)
		data1 = df.join(cat_list)
		df = data1

	data_vars = df.columns.values.tolist()
	to_keep = [i for i in data_vars if i not in c]
	to_keep.remove('TERMINATION_YEAR')
	to_keep.remove('EMP_ID')
	
	data_final = df[to_keep]
	col1 = data_final.columns.tolist()
	col1.remove('STATUS')
	col2 = 'STATUS'

	X = data_final[col1]
	y = data_final[col2]

	temp = np.array(y).reshape(-1,1)
	encoder = OrdinalEncoder()
	encoder.fit(temp)
	y = encoder.transform(temp)
	y = y.ravel()
	return X, y
Example #21
class OrdinalEncodeCategoricalVariables(BaseEstimator, TransformerMixin):
    # order and encode categorical variables
    # self.variables --> CATEGORICAL_VARIABLES

    def __init__(self, variables=None):

        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        # get_dummies isn't appropriate here, so use an ordinal encoding
        # add points column to X so groupby works!

        #X = X.copy()
        #print()
        #print(X.dtypes)

        self.enc = OrdinalEncoder()
        self.enc.fit(X[self.variables])

        return self

    def transform(self, X):

        X[self.variables] = self.enc.transform(X[self.variables])
        #print()
        #print(X.dtypes)

        return X
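
A hypothetical usage of the transformer above (data and imports for the class are assumed: BaseEstimator, TransformerMixin and OrdinalEncoder must already be in scope); because it implements fit/transform it also drops into a scikit-learn Pipeline:

import pandas as pd

df = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal'], 'carat': [0.2, 0.3, 0.4]})
encoder = OrdinalEncodeCategoricalVariables(variables=['cut'])
print(encoder.fit(df).transform(df))  # 'cut' replaced by its integer codes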
Example #22
def change_Categorical_ord_5_Data(input_train_data, input_test_data):
    encoder = OrdinalEncoder(categories='auto')
    encoder.fit(input_train_data.ord_5.values.reshape(-1, 1))
    input_train_data.ord_5 = encoder.transform(
        input_train_data.ord_5.values.reshape(-1, 1))
    input_test_data.ord_5 = encoder.transform(
        input_test_data.ord_5.values.reshape(-1, 1))
    return input_train_data, input_test_data
Example #23
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Checks that ordinal encoder transforms string dtypes. Non-regression
    test for #19872."""
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-9)
    enc.fit(X_train)

    X_trans = enc.transform(X_test)
    assert_allclose(X_trans, [[-9, 0]])
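
The pytest parametrization for X_train and X_test is not shown above; one concrete pair (an assumption, not the original fixture) for which the assertion holds:

import numpy as np
from numpy.testing import assert_allclose
from sklearn.preprocessing import OrdinalEncoder

X_train = np.array([['AA', 'B']], dtype='U')  # string dtype, as in the test name
X_test = np.array([['A', 'B']], dtype='U')    # 'A' was never seen during fit
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-9)
enc.fit(X_train)
assert_allclose(enc.transform(X_test), [[-9, 0]])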
Example #24
def test_ordinalencoder():
    X0 = [["Male", 1], ["Female", 3], ["Female", 2]]
    X1 = [["Male", 1], ["Female", 27], ["Bananas", 2]]
    for X in [X0, X1]:
        ohe = OrdinalEncoder()
        ohe.fit(X)
        ohe_ = convert_estimator(ohe)
        assert np.allclose(ohe.transform(X), ohe_.transform(X))
Example #25
def encodeOrdinal(data, col_names):
    # creating instance of encoder
    ordinal_encoder = OrdinalEncoder()

    # Assigning numerical values and storing in another column
    ordinal_encoder.fit(data[col_names])
    data[col_names] = ordinal_encoder.transform(data[col_names])
    return data, ordinal_encoder
Example #26
def test_ordinal_encoder_raise_categories_shape():

    X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
    cats = ['Low', 'Medium', 'High']
    enc = OrdinalEncoder(categories=cats)
    msg = ("Shape mismatch: if categories is an array,")

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)
Example #28
def test_ordinal_encoder_handle_unknowns_nan():
    # Make sure unknown_value=np.nan properly works

    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)

    X_fit = np.array([[1], [2], [3]])
    enc.fit(X_fit)
    X_trans = enc.transform([[1], [2], [4]])
    assert_array_equal(X_trans, [[0], [1], [np.nan]])
Example #29
def get_ordinalencoder(df: pd.DataFrame) -> OrdinalEncoder:
    ordcol = set(df.columns)
    ordcol = list(ordcol)
    ordcol.sort()

    enc = OrdinalEncoder()
    enc.fit(df[ordcol].values)

    return enc, ordcol
Example #30
def test_ordinal_encoder_raise_categories_shape():

    X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
    cats = ["Low", "Medium", "High"]
    enc = OrdinalEncoder(categories=cats)
    msg = "Shape mismatch: if categories is an array,"

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)
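
For a single column the categories argument must be a list of per-column category lists; a short sketch of the corrected call that makes the fit above succeed instead of raising:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
enc_ok = OrdinalEncoder(categories=[["Low", "Medium", "High"]])  # note the outer list
print(enc_ok.fit_transform(X).ravel())  # [0. 1. 2. 1. 0.]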
Example #31
class OrdinalEncoding(AutoSklearnPreprocessingAlgorithm):
    def __init__(
        self,
        random_state: Optional[np.random.RandomState] = None,
    ):
        self.random_state = random_state

    def fit(self,
            X: PIPELINE_DATA_DTYPE,
            y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'OrdinalEncoding':
        if not scipy.sparse.issparse(X):
            self.preprocessor = OrdinalEncoder(
                categories='auto',
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            )
            self.preprocessor.fit(X, y)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        if scipy.sparse.issparse(X):
            # Sparse data should be float dtype, which means we do not need
            # to further encode it.
            return X
        if self.preprocessor is None:
            raise NotImplementedError()
        # Note: the output is shifted by 1 so that the unknown value -1 becomes 0
        # and the fitted categories 0, ..., cat become 1, ..., cat + 1.
        # This is done because the later category-shift step requires non-negative
        # integers. Consider removing this if that step is removed.
        return self.preprocessor.transform(X) + 1

    @staticmethod
    def get_properties(
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
    ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {
            'shortname': 'OrdinalEncoder',
            'name': 'Ordinal Encoder',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            # TODO find out if this is right!
            'handles_sparse': True,
            'handles_dense': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (INPUT, ),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        return ConfigurationSpace()
Example #32
def test_ordinal_encoder_raise_missing(X):
    ohe = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    ohe.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)
Example #33
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
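
The pytest parametrization is not included above; one set of values (an assumption, not the original fixture) under which the assertions pass:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

X = np.array([['a', 'b']], dtype=object).T    # training data
X2 = np.array([['a', 'd']], dtype=object).T   # contains the unknown category 'd'
cats = [['a', 'b', 'c']]                      # manually specified categories
cat_dtype = np.object_

enc = OrdinalEncoder(categories=cats)
print(enc.fit_transform(X))         # [[0.], [1.]]
print(enc.categories_[0].dtype)     # object, same dtype as the input data
# enc.fit(X2) would raise ValueError: Found unknown categories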