def test_transform_selected_2():
    """Assert that _transform_selected returns the original X when selected is a list of False values."""
    ohe = OneHotEncoder(categorical_features=[False, False, False])
    X = _transform_selected(dense1, ohe._fit_transform,
                            ohe.categorical_features, copy=True)
    assert np.allclose(X, dense1)
def test_transform():
    """Test OneHotEncoder with both dense and sparse matrices."""
    # Dense input: the test data contains values (6 and 7) unseen during fit.
    input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    ohe = OneHotEncoder()
    ohe.fit(input)
    test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    output = ohe.transform(test_data).todense()
    assert np.sum(output) == 5

    # Sparse (CSR) input with the same fit and transform data.
    input = np.array(((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))).transpose()
    ips = scipy.sparse.csr_matrix(input)
    ohe = OneHotEncoder()
    ohe.fit(ips)
    test_data = np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()
    tds = scipy.sparse.csr_matrix(test_data)
    output = ohe.transform(tds).todense()
    assert np.sum(output) == 3
def test_transform_selected():
    """Assert that _transform_selected returns the original X when selected is an empty list."""
    ohe = OneHotEncoder(categorical_features=[])
    X = _transform_selected(dense1, ohe._fit_transform,
                            ohe.categorical_features, copy=True)
    assert np.allclose(X, dense1)
def fit_then_transform_dense(expected, input, categorical_features='all',
                             minimum_fraction=None):
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        sparse=False, minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    assert_array_almost_equal(expected, transformation)

    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         sparse=False, minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    assert_array_almost_equal(expected, transformation)
def test_k_fold_cv():
    """Test OneHotEncoder with categorical_features='auto'."""
    boston = load_boston()
    clf = make_pipeline(
        OneHotEncoder(categorical_features='auto', sparse=False,
                      minimum_fraction=0.05),
        LinearRegression()
    )
    cross_val_score(clf, boston.data, boston.target,
                    cv=KFold(n_splits=10, shuffle=True))
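# Hedged illustration (not part of the test suite above): with
# categorical_features='auto', the encoder is assumed to treat a column as
# categorical when it has at most `threshold` distinct values and to pass the
# remaining columns through unchanged. The toy array below is an assumption
# for illustration, not an assertion taken from the original tests.
import numpy as np
from tpot.builtins import OneHotEncoder

X = np.array([[0, 1.50],
              [1, 2.25],
              [2, 3.75],
              [0, 4.10]])
enc = OneHotEncoder(categorical_features='auto', sparse=False, threshold=3)
Xt = enc.fit_transform(X)
# The first column (3 distinct values) should be expanded into indicator
# columns, so Xt is expected to be wider than the 2-column input.
print(X.shape, Xt.shape)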
def fit_then_transform(expected, input, categorical_features='all',
                       minimum_fraction=None):
    # Test fit_transform
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    assert_array_almost_equal(expected.astype(float), transformation.todense())

    # Test fit, and afterwards transform
    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    assert_array_almost_equal(expected, transformation.todense())
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Normalizer
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBClassifier
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.84550605863897
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
            RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.5,
                                               n_estimators=100), step=0.2),
            MinMaxScaler()
        ),
        FunctionTransformer(copy)
    ),
    Normalizer(norm="max"),
    XGBClassifier(learning_rate=0.01, max_depth=6, min_child_weight=7,
                  n_estimators=600, nthread=1, subsample=0.9500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from tpot.builtins import OneHotEncoder, StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9699834298007317
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="gini",
                                                       max_features=0.05, min_samples_leaf=1,
                                                       min_samples_split=6, n_estimators=100)),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    MinMaxScaler(),
    LogisticRegression(C=25.0, dual=False, penalty="l1")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
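# Hedged sketch (not part of the export above): StackingEstimator wraps a
# classifier so that its predictions are appended to the feature matrix that
# the next pipeline step receives. The toy data below is an assumption for
# illustration; the exact layout of the added columns is an implementation
# detail of TPOT and is not assumed here.
import numpy as np
from sklearn.linear_model import LogisticRegression
from tpot.builtins import StackingEstimator

X = np.random.RandomState(42).rand(20, 3)
y = (X[:, 0] > 0.5).astype(int)
stack = StackingEstimator(estimator=LogisticRegression())
Xt = stack.fit_transform(X, y)
print(X.shape, Xt.shape)  # Xt is expected to gain extra prediction columns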
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8896296296296295
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
            FunctionTransformer(copy)
        )
    ),
    OneHotEncoder(minimum_fraction=0.25, sparse=False),
    LinearSVC(C=20.0, dual=True, loss="hinge", penalty="l2", tol=0.0001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def test_refit_on_new_data():
    """Test that OneHotEncoder can refit on two data sets."""
    ohe = OneHotEncoder()
    ohe.fit(dense1)
    ohe.fit(dense2)
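# Hedged illustration (assumption: dense1 and dense2 are small dense fixtures
# defined elsewhere in the test module). The point of the test above is that a
# second fit() rebuilds the encoder's state from scratch, so the same instance
# can be reused on data sets with different columns and categories.
import numpy as np
from tpot.builtins import OneHotEncoder

first = np.array([[0, 1], [1, 0]])
second = np.array([[2, 0, 1], [0, 1, 2]])
enc = OneHotEncoder(sparse=False)
enc.fit(first)
enc.fit(second)  # refit: categories learned from `first` are discarded
print(enc.transform(second).shape)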
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.3656802383316783
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            SelectFwe(score_func=f_classif, alpha=0.008),
            OneHotEncoder(minimum_fraction=0.1),
            SelectPercentile(score_func=f_classif, percentile=13)
        ),
        FunctionTransformer(copy)
    ),
    LinearSVC(C=0.1, dual=False, loss="squared_hinge", penalty="l2", tol=0.01)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9866666666666667
exported_pipeline = make_pipeline(
    OneHotEncoder(minimum_fraction=0.2, sparse=False),
    Normalizer(norm="max"),
    LogisticRegression(C=25.0, dual=True, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, PolynomialFeatures
from tpot.builtins import OneHotEncoder

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: 1.0
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    Nystroem(gamma=0.05, kernel="poly", n_components=7),
    OneHotEncoder(minimum_fraction=0.05, sparse=False),
    GaussianNB()
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8390633822699041
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            SelectPercentile(score_func=f_classif, percentile=90),
            StackingEstimator(
                estimator=LogisticRegression(C=0.01, dual=True, penalty="l2")),
            SelectPercentile(score_func=f_classif, percentile=76)
        ),
        FunctionTransformer(copy)
    ),
    StackingEstimator(estimator=LinearSVC(
        C=25.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)),
    SelectPercentile(score_func=f_classif, percentile=70),
    OneHotEncoder(minimum_fraction=0.1, sparse=False),
    StandardScaler(),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.01)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)