Python EasyPreprocessor примеры использования

Язык программирования: Python

Пространство имен/Пакет: dabl.preprocessing

Класс/Тип: EasyPreprocessor

Примеров на hotexamples.com: 9

Python EasyPreprocessor — найдено 9 примеров. Это лучшие примеры кода на Python для dabl.preprocessing.EasyPreprocessor, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

EasyPreprocessor(9)

fit(6)

fit_transform(3)

get_feature_names(3)

transform(2)

Пример #1

Показать файл

Файл: test_preprocessing.py Проект: sanjaradylov/dabl

def test_easy_preprocessor_cat_cols():
    """Check EasyPreprocessor output for dirty categorical columns.

    Builds a frame with three categorical columns (containing empty strings
    and NaNs) and one continuous column, runs ``fit_transform`` and asserts
    that:

    * the number of output columns equals the total number of expected
      categories plus the one continuous column, and
    * the category sets found by the fitted one-hot encoder match the
      expected ones.
    """
    # Create dataframe with 1 numerical and 3 categorical features.
    cat_1_unique = ['a', 'b', 'c']
    cat_2_unique = ['D', 'E', 'F', 'G']
    cat_3_unique = [-2.0, -1.0]
    # ``np.nan`` is spelled in lowercase on purpose: the ``np.NaN`` alias was
    # removed in NumPy 2.0, while ``np.nan`` works on every NumPy version.
    data = pd.DataFrame({
        'cat_1': ['a', 'b', 'a', '', '', 'c', 'a', 'c', 'a', 'a'],
        'cat_2': ['D', 'D', 'E', np.nan, 'D', np.nan, 'E', 'F', 'F', 'G'],
        'cat_3': [-2., -1., -1., -2., -1., '', -1., -2., '', np.nan],
        'num':
        np.sin(range(10)),  # Valid continuous feature.
    })

    # Preprocess data, i.e. replace empty strings with NaNs, impute NaNs, and
    # encode categorical variables with OneHotEncoder.
    ep = EasyPreprocessor()
    data_t = ep.fit_transform(data)

    cat_all = [cat_1_unique, cat_2_unique, cat_3_unique]
    n_unique_cats = sum(len(cats) for cats in cat_all)
    # The number of features after preprocessing must be equal to
    # the number of unique categories within the dataframe + the number of
    # remaining valid features.
    assert data_t.shape[1] == n_unique_cats + 1

    cat_pipe = ep.ct_.named_transformers_['categorical']
    ohe = cat_pipe.named_steps['onehotencoder']
    # The category sets detected by OneHotEncoder inside EasyPreprocessor
    # must match the ones specified in cat_all.
    assert len(ohe.categories_) == len(cat_all)
    for cat_list, ohe_cat_list in zip(cat_all, ohe.categories_):
        assert set(cat_list) == set(ohe_cat_list)

Пример #2

Показать файл

def test_simple_preprocessor_imputed_features():
    """Pin the feature names produced for a categorical column with a NaN.

    Column ``A`` is forced to the ``categorical`` type via ``type_hints``;
    after fitting, ``get_feature_names()`` must return one name per category
    plus the two ``A_imputed_*`` indicator names, in the listed order.
    """
    # Issue: 211

    # ``np.nan`` replaces the ``np.NaN`` alias, which was removed in NumPy 2.0.
    # NOTE(review): ``dtype=int`` is requested although the column holds a
    # NaN; how pandas resolves that may depend on the pandas version - confirm.
    data = pd.DataFrame({'A': [0, 1, 2, 1, np.nan]}, dtype=int)
    types = detect_types(data, type_hints={'A': 'categorical'})

    ep = EasyPreprocessor(types=types)
    ep.fit(data)

    expected_names = ['A_0', 'A_1', 'A_2', 'A_imputed_False', 'A_imputed_True']
    assert ep.get_feature_names() == expected_names

Пример #3

Показать файл

def test_simple_preprocessor():
    """Fit and transform ``X_cat`` in two steps, then fit on the iris data.

    The transformed ``X_cat`` must have shape ``(3, 7)``; the iris part only
    checks that fitting completes without raising.
    """
    preprocessor = EasyPreprocessor()
    preprocessor.fit(X_cat)
    transformed = preprocessor.transform(X_cat)
    expected_shape = (3, 7)  # FIXME should be 6?
    assert transformed.shape == expected_shape

    # Fitting a fresh preprocessor on the iris data must simply succeed.
    iris_features = load_iris().data
    iris_preprocessor = EasyPreprocessor()
    iris_preprocessor.fit(iris_features)

Пример #4

Показать файл

def test_boolean_and_nan(null_object):
    """Check a boolean column with one missing entry.

    ``null_object`` is the value used for the missing entry. The column must
    be detected as categorical, the preprocessed output must have four
    columns, and its unique values must be exactly 0 and 1.
    """
    frame = pd.DataFrame({'a': [True, False, True, False, null_object]})
    detected = detect_types(frame)
    assert detected.categorical.a

    preprocessor = EasyPreprocessor()
    encoded = preprocessor.fit_transform(frame)
    n_columns = encoded.shape[1]
    assert n_columns == 4
    observed_values = np.unique(encoded)
    assert all(observed_values == [0, 1])

Пример #5

Показать файл

Файл: test_preprocessing.py Проект: NicolasHug/dabl

def test_titanic_feature_names():
    """Pin the feature names EasyPreprocessor yields for cleaned titanic data.

    The titanic CSV is read relative to this file, the ``survived`` column is
    dropped, the rest is cleaned and fitted, and ``get_feature_names()`` must
    equal the hard-coded list below (order included).
    """
    here = os.path.dirname(__file__)
    csv_path = os.path.join(here, '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_path)

    preprocessor = EasyPreprocessor()
    features = clean(titanic.drop('survived', axis=1))
    preprocessor.fit(features)

    wanted = [
        # continuous columns
        'age_dabl_continuous', 'body_dabl_continuous', 'fare_dabl_continuous',
        # "?" indicator columns for age and body
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
        # pclass
        'pclass_1', 'pclass_2', 'pclass_3',
        # sex
        'sex_female', 'sex_male',
        # embarked
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        # boat
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D',
    ]
    assert preprocessor.get_feature_names() == wanted

Пример #6

Показать файл

def test_titanic_feature_names():
    """Pin titanic feature names and check the un-cleaned data transforms.

    First part: fit on the cleaned titanic frame (without ``survived``) and
    compare ``get_feature_names()`` with the hard-coded list, accepting
    either the float-suffixed or the int-suffixed spelling of the last four
    names. Second part: ``fit_transform`` the un-cleaned frame and require
    that the result contains no NaN.
    """
    here = os.path.dirname(__file__)
    csv_path = os.path.join(here, '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_path)

    preprocessor = EasyPreprocessor()
    cleaned_features = clean(titanic.drop('survived', axis=1))
    preprocessor.fit(cleaned_features)

    expected_names = [
        'sibsp', 'parch',
        'age_dabl_continuous', 'fare_dabl_continuous', 'body_dabl_continuous',
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
        'sibsp_8',
        'parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5',
        'parch_6', 'parch_9',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D',
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
    ]
    try:
        assert preprocessor.get_feature_names() == expected_names
    except AssertionError:
        # OHE uses int in newer versions: swap the last four entries
        # (indices 57-60) for their integer-suffixed spelling and retry.
        expected_names[57:61] = [
            'age_?_0', 'age_?_1', 'body_?_0', 'body_?_1',
        ]
        assert preprocessor.get_feature_names() == expected_names

    # without clean
    raw_features = titanic.drop('survived', axis=1)
    transformed = preprocessor.fit_transform(raw_features)
    # FIXME can't do that yet
    # assert preprocessor.get_feature_names() == expected_names_no_clean

    contains_nan = np.isnan(transformed).any()
    assert not contains_nan

Пример #7

Показать файл

Файл: test_preprocessing.py Проект: sanjaradylov/dabl

def test_simple_preprocessor():
    """Run ``fit_transform`` on ``X_cat``, then fit on the iris data.

    The transformed ``X_cat`` must have shape ``(3, 6)``; the iris part only
    checks that fitting completes without raising.
    """
    preprocessor = EasyPreprocessor()
    transformed = preprocessor.fit_transform(X_cat)
    expected_shape = (3, 6)
    assert transformed.shape == expected_shape

    # Fitting a fresh preprocessor on the iris data must simply succeed.
    iris_features = load_iris().data
    iris_preprocessor = EasyPreprocessor()
    iris_preprocessor.fit(iris_features)

Пример #8

Показать файл

def test_easy_preprocessor_transform():
    """Smoke-test EasyPreprocessor as the first step of a pipeline.

    Cleans the titanic data, splits it (stratified on ``survived``, fixed
    ``random_state``), fits an EasyPreprocessor + LogisticRegression pipeline
    on the training part and calls ``predict`` on both parts. There are no
    assertions; the test passes if nothing raises.
    """
    cleaned = clean(load_titanic())
    features = cleaned.drop("survived", axis=1)
    target = cleaned.survived

    split = train_test_split(features, target, stratify=target,
                             random_state=42)
    X_train, X_val, y_train, _ = split

    model = make_pipeline(EasyPreprocessor(), LogisticRegression(C=0.1))
    model.fit(X_train, y_train)
    # Predict on the training data first, then on the held-out data.
    for subset in (X_train, X_val):
        model.predict(subset)

Пример #9

Показать файл

def test_simple_preprocessor_dirty_float():
    """Check how EasyPreprocessor expands a dirty float column.

    The frame built from ``make_dirty_float()`` must transform to shape
    ``(100, 3)``; the column sums at index 1 and 2 must be 1 and 9
    respectively. Finally, the fitted preprocessor must accept a clean
    single-column frame without raising.
    """
    dirty_frame = pd.DataFrame(make_dirty_float())
    preprocessor = EasyPreprocessor()
    preprocessor.fit(dirty_frame)
    transformed = preprocessor.transform(dirty_frame)
    assert transformed.shape == (100, 3)

    # Sum over axis 0, i.e. one total per output column.
    column_totals = transformed.sum(axis=0)
    # count of "garbage"
    assert column_totals[1] == 1
    # count of "missing"
    assert column_totals[2] == 9

    # make sure we can transform a clean column
    clean_frame = pd.DataFrame(['0', '1', '2'], columns=['a_column'])
    preprocessor.transform(clean_frame)