def test_transform_selected_2():
    """Assert that _transform_selected returns the original X when selected is a list of False values."""
    ohe = OneHotEncoder(categorical_features=[False, False, False])
    X = _transform_selected(dense1, ohe._fit_transform,
                            ohe.categorical_features, copy=True)
    assert np.allclose(X, dense1)
def test_transform():
    """Test OneHotEncoder with both dense and sparse matrices."""
    # Dense input: the test data contains values (6 and 7) unseen during fit.
    input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    ohe = OneHotEncoder()
    ohe.fit(input)
    test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    output = ohe.transform(test_data).todense()
    assert np.sum(output) == 5

    # Sparse (CSR) input with the same fit and transform data.
    input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    ips = scipy.sparse.csr_matrix(input)
    ohe = OneHotEncoder()
    ohe.fit(ips)
    test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    tds = scipy.sparse.csr_matrix(test_data)
    output = ohe.transform(tds).todense()
    assert np.sum(output) == 3
def test_transform_selected():
    """Assert that _transform_selected returns the original X when selected is an empty list."""
    ohe = OneHotEncoder(categorical_features=[])
    X = _transform_selected(dense1, ohe._fit_transform,
                            ohe.categorical_features, copy=True)
    assert np.allclose(X, dense1)
def fit_then_transform_dense(expected, input, categorical_features='all',
                             minimum_fraction=None):
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        sparse=False, minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    assert_array_almost_equal(expected, transformation)

    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         sparse=False, minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    assert_array_almost_equal(expected, transformation)
def test_k_fold_cv():
    """Test OneHotEncoder with categorical_features='auto'."""
    boston = load_boston()
    clf = make_pipeline(
        OneHotEncoder(categorical_features='auto', sparse=False,
                      minimum_fraction=0.05),
        LinearRegression()
    )
    cross_val_score(clf, boston.data, boston.target,
                    cv=KFold(n_splits=10, shuffle=True))
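# Hedged illustration (not part of the test suite above): with
# categorical_features='auto', the encoder is assumed to treat a column as
# categorical when it has at most `threshold` distinct values and to pass the
# remaining columns through unchanged. The toy array below is an assumption
# for illustration, not an assertion taken from the original tests.
import numpy as np
from tpot.builtins import OneHotEncoder

X = np.array([[0, 1.50],
              [1, 2.25],
              [2, 3.75],
              [0, 4.10]])
enc = OneHotEncoder(categorical_features='auto', sparse=False, threshold=3)
Xt = enc.fit_transform(X)
# The first column (3 distinct values) should be expanded into indicator
# columns, so Xt is expected to be wider than the 2-column input.
print(X.shape, Xt.shape)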
def fit_then_transform(expected, input, categorical_features='all',
                       minimum_fraction=None):
    # Test fit_transform
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    assert_array_almost_equal(expected.astype(float), transformation.todense())

    # Test fit, and afterwards transform
    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    assert_array_almost_equal(expected, transformation.todense())
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Normalizer
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBClassifier
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.84550605863897
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
            RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.5,
                                               n_estimators=100), step=0.2),
            MinMaxScaler()
        ),
        FunctionTransformer(copy)
    ),
    Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01, max_depth=6, min_child_weight=7,
                  n_estimators=600, nthread=1, subsample=0.9500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from tpot.builtins import OneHotEncoder, StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9699834298007317
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="gini",
                                                       max_features=0.05, min_samples_leaf=1,
                                                       min_samples_split=6, n_estimators=100)),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    MinMaxScaler(),
    LogisticRegression(C=25.0, dual=False, penalty="l1")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
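# Hedged sketch (not part of the export above): StackingEstimator wraps a
# classifier so that its predictions are appended to the feature matrix that
# the next pipeline step receives. The toy data below is an assumption for
# illustration; the exact layout of the added columns is an implementation
# detail of TPOT and is not assumed here.
import numpy as np
from sklearn.linear_model import LogisticRegression
from tpot.builtins import StackingEstimator

X = np.random.RandomState(42).rand(20, 3)
y = (X[:, 0] > 0.5).astype(int)
stack = StackingEstimator(estimator=LogisticRegression())
Xt = stack.fit_transform(X, y)
print(X.shape, Xt.shape)  # Xt is expected to gain extra prediction columns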
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8896296296296295
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
            FunctionTransformer(copy)
        )
    ),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    LinearSVC(C=20.0, dual=True, loss="hinge", penalty="l2", tol=0.0001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def test_refit_on_new_data():
    """Test that OneHotEncoder can refit on two data sets."""
    ohe = OneHotEncoder()
    ohe.fit(dense1)
    ohe.fit(dense2)
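# Hedged illustration (assumption: dense1 and dense2 are small dense fixtures
# defined elsewhere in the test module). The point of the test above is that a
# second fit() rebuilds the encoder's state from scratch, so the same instance
# can be reused on data sets with different columns and categories.
import numpy as np
from tpot.builtins import OneHotEncoder

first = np.array([[0, 1], [1, 0]])
second = np.array([[2, 0, 1], [0, 1, 2]])
enc = OneHotEncoder(sparse=False)
enc.fit(first)
enc.fit(second)  # refit: categories learned from `first` are discarded
print(enc.transform(second).shape)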
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.3656802383316783
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            SelectFwe(score_func=f_classif, alpha=0.008),
            OneHotEncoder(minimum_fraction=0.1),
            SelectPercentile(score_func=f_classif, percentile=13)
        ),
        FunctionTransformer(copy)
    ),
    LinearSVC(C=0.1, dual=False, loss="squared_hinge", penalty="l2", tol=0.01)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9866666666666667
exported_pipeline = make_pipeline(
    OneHotEncoder(minimum_fraction=0.2, sparse=False),
    Normalizer(norm="max"),
    LogisticRegression(C=25.0, dual=True, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, PolynomialFeatures
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: 1.0
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    Nystroem(gamma=0.05, kernel="poly", n_components=7),
    OneHotEncoder(minimum_fraction=0.05, sparse=False),
    GaussianNB()
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8390633822699041
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=90),
            StackingEstimator(
                estimator=LogisticRegression(C=0.01, dual=True, penalty="l2")),
            SelectPercentile(score_func=f_classif, percentile=76)
        ),
        FunctionTransformer(copy)
    ),
    StackingEstimator(estimator=LinearSVC(
        C=25.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)),
    SelectPercentile(score_func=f_classif, percentile=70),
    OneHotEncoder(minimum_fraction=0.1, sparse=False),
    StandardScaler(),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.01)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)