def fit(self, data, args):
    """Construct a Binarizer, fit it on the training split, and return
    the wall-clock time spent inside the fit call."""
    self.model = Binarizer()
    with Timer() as timer:
        self.model.fit(data.X_train, data.y_train)
    return timer.interval
def test_transform_selected_retain_order():
    """Check `_transform_selected(..., retain_order=True)` semantics.

    retain_order must (1) reject sparse input, (2) reject transforms that
    change the array's dimensions, and (3) otherwise leave unselected
    columns in their original positions and values.

    Fix: ``np.int`` was a deprecated alias of the builtin ``int``
    (removed in NumPy 1.24); use ``int`` directly.
    """
    X = [[-1, 1], [2, -2]]

    # (1) Sparse matrices cannot have columns reassembled in place.
    assert_raise_message(ValueError,
                         "The retain_order option can only be set to True "
                         "for dense matrices.",
                         _transform_selected, sparse.csr_matrix(X),
                         Binarizer().transform, dtype=int,
                         selected=[0], retain_order=True)

    def transform(X):
        # Deliberately changes the number of columns.
        return np.hstack((X, [[0], [0]]))

    # (2) A shape-changing transform is incompatible with retain_order.
    assert_raise_message(ValueError,
                         "The retain_order option can only be set to True "
                         "if the dimensions of the input array match the "
                         "dimensions of the transformed array.",
                         _transform_selected, X, transform, dtype=int,
                         selected=[0], retain_order=True)

    # (3a) Binarize only column 1; column 0 keeps its original values.
    X_expected = [[-1, 1], [2, 0]]
    Xtr = _transform_selected(X, Binarizer().transform, dtype=int,
                              selected=[1], retain_order=True)
    assert_array_equal(toarray(Xtr), X_expected)

    # (3b) Binarize only column 0; column 1 keeps its original values.
    X_expected = [[0, 1], [1, -2]]
    Xtr = _transform_selected(X, Binarizer().transform, dtype=int,
                              selected=[0], retain_order=True)
    assert_array_equal(toarray(Xtr), X_expected)
class BinarizerImpl():
    """Thin delegating wrapper around the scikit-learn Binarizer (SKLModel)."""

    def __init__(self, threshold=0.0, copy=True):
        # Keep the hyperparameters around for introspection, then build
        # the wrapped estimator from them.
        self._hyperparams = dict(threshold=threshold, copy=copy)
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, forwarding ``y`` only when one was given."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate the transformation to the wrapped model."""
        return self._wrapped_model.transform(X)
def test_fit_transform():
    """fit(X).transform(X) and fit_transform(X) must give identical results."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for estimator in (StandardScaler(), Normalizer(), Binarizer()):
        via_fit_then_transform = estimator.fit(X).transform(X)
        via_fit_transform = estimator.fit_transform(X)
        assert_array_equal(via_fit_then_transform, via_fit_transform)
def fit(self, X, y=None):
    """Build the underlying sklearn model from the stored hyperparameters
    and fit it, forwarding ``y`` only when one was supplied."""
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is None:
        self._sklearn_model.fit(X)
    else:
        self._sklearn_model.fit(X, y)
    return self
class CreateBinarizer(CreateModel):
    """Benchmark harness entry: times fitting and applying a Binarizer."""

    def fit(self, data, args):
        """Fit a fresh Binarizer on the training split; return elapsed seconds."""
        self.model = Binarizer()
        with Timer() as timer:
            self.model.fit(data.X_train, data.y_train)
        return timer.interval

    def test(self, data):
        """Apply the fitted model to the test split."""
        assert self.model is not None
        return self.model.transform(data.X_test)

    def predict(self, data):
        """Time the transform of the test split and stash the result.

        Marks the task as regression so downstream scoring treats the
        transformed output accordingly.
        """
        with Timer() as timer:
            self.predictions = self.test(data)
        data.learning_task = LearningTask.REGRESSION
        return timer.interval
def make_models(X, y, y_bin):
    """Build one fitted instance of every estimator exercised by the suite.

    Parameters: X is the feature matrix, y a multi-class target, y_bin a
    binary target. Returns a dict mapping short names to estimators.
    NOTE(review): `bin`, `n1`, `n2`, `n3` are constructed but never fitted —
    presumably because Binarizer/Normalizer are stateless; confirm callers
    don't expect a fitted flag on them.
    """
    return dict(
        # Linear / logistic models
        ols=LinearRegression().fit(X, y),
        lr_bin=LogisticRegression().fit(X, y_bin),
        lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
        lr_mn=LogisticRegression(solver='lbfgs',
                                 multi_class='multinomial').fit(X, y),
        # Support vector machines
        svc=SVC(kernel='linear').fit(X, y_bin),
        svr=SVR(kernel='linear').fit(X, y),
        # Trees and tree ensembles
        dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
        dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
        rfc=RandomForestClassifier(n_estimators=3, max_depth=3,
                                   random_state=1).fit(X, y),
        rfr=RandomForestRegressor(n_estimators=3, max_depth=3,
                                  random_state=1).fit(X, y),
        gbc=GradientBoostingClassifier(n_estimators=3, max_depth=3,
                                       random_state=1).fit(X, y),
        gbr=GradientBoostingRegressor(n_estimators=3, max_depth=3,
                                      random_state=1).fit(X, y),
        # AdaBoost: both algorithms, multi-class and binary targets
        abc=AdaBoostClassifier(algorithm='SAMME', n_estimators=3,
                               random_state=1).fit(X, y),
        abc2=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3,
                                random_state=1).fit(X, y),
        abc3=AdaBoostClassifier(algorithm='SAMME', n_estimators=3,
                                random_state=1).fit(X, y_bin),
        abc4=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3,
                                random_state=1).fit(X, y_bin),
        # Clustering and decomposition
        km=KMeans(1).fit(X),
        km2=KMeans(5).fit(X),
        pc1=PCA(1).fit(X),
        pc2=PCA(2).fit(X),
        pc3=PCA(2, whiten=True).fit(X),
        # Multi-layer perceptrons with assorted depths / activations
        mlr1=MLPRegressor([2], 'relu').fit(X, y),
        mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
        mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
        mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
        mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
        # Preprocessing transformers
        bin=Binarizer(0.5),
        mms=MinMaxScaler().fit(X),
        mas=MaxAbsScaler().fit(X),
        ss1=StandardScaler().fit(X),
        ss2=StandardScaler(with_mean=False).fit(X),
        ss3=StandardScaler(with_std=False).fit(X),
        n1=Normalizer('l1'),
        n2=Normalizer('l2'),
        n3=Normalizer('max'))
def _check_transform_selected(X, X_expected, dtype, sel):
    """Run _transform_selected on both the dense input and its CSR form,
    asserting the densified result equals ``X_expected`` each time."""
    for matrix in (X, sparse.csr_matrix(X)):
        result = _transform_selected(matrix, Binarizer().transform, dtype, sel)
        assert_array_equal(toarray(result), X_expected)
def test_binarizer():
    """Exercise Binarizer thresholds and copy semantics across container types.

    Covers dense arrays, plain lists, and CSR/CSC sparse matrices; checks
    the binarized value counts, that ``copy=True`` returns a new object,
    that ``copy=False`` returns the input itself (only possible for
    array/sparse inputs, not lists), and that negative thresholds are
    rejected for sparse input.
    """
    X_ = np.array([[1, 0, 5], [2, 3, -1]])
    for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
        X = init(X_.copy())

        # threshold=2.0: only values strictly greater than 2 become 1.
        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)
        # Output sparsity must mirror input sparsity.
        X_bin = binarizer.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(X_bin))

        # Default threshold (0.0) after a no-op fit; copy=True returns new data.
        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        # Same, without calling fit first (Binarizer is stateless).
        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        # copy=False: the input object itself is modified and returned
        # (impossible for plain lists, which must be converted).
        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        if init is not list:
            assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

    # Negative threshold: 0 entries now binarize to 1 for dense input.
    binarizer = Binarizer(threshold=-0.5, copy=True)
    for init in (np.array, list):
        X = init(X_.copy())
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 1)
        assert_equal(np.sum(X_bin == 1), 5)
        X_bin = binarizer.transform(X)
    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
def __init__(self, threshold=0.0, copy=True):
    """Record the hyperparameters and construct the wrapped sklearn model."""
    self._hyperparams = dict(threshold=threshold, copy=copy)
    self._wrapped_model = SKLModel(**self._hyperparams)
warnings.filterwarnings("ignore", category=DeprecationWarning) clf_dict = {'ARDRegression':ARDRegression(), 'AdaBoostClassifier':AdaBoostClassifier(), 'AdaBoostRegressor':AdaBoostRegressor(), 'AdditiveChi2Sampler':AdditiveChi2Sampler(), 'AffinityPropagation':AffinityPropagation(), 'AgglomerativeClustering':AgglomerativeClustering(), 'BaggingClassifier':BaggingClassifier(), 'BaggingRegressor':BaggingRegressor(), 'BayesianGaussianMixture':BayesianGaussianMixture(), 'BayesianRidge':BayesianRidge(), 'BernoulliNB':BernoulliNB(), 'BernoulliRBM':BernoulliRBM(), 'Binarizer':Binarizer(), 'Birch':Birch(), 'CCA':CCA(), 'CalibratedClassifierCV':CalibratedClassifierCV(), 'DBSCAN':DBSCAN(), 'DPGMM':DPGMM(), 'DecisionTreeClassifier':DecisionTreeClassifier(), 'DecisionTreeRegressor':DecisionTreeRegressor(), 'DictionaryLearning':DictionaryLearning(), 'ElasticNet':ElasticNet(), 'ElasticNetCV':ElasticNetCV(), 'EmpiricalCovariance':EmpiricalCovariance(), 'ExtraTreeClassifier':ExtraTreeClassifier(), 'ExtraTreeRegressor':ExtraTreeRegressor(), 'ExtraTreesClassifier':ExtraTreesClassifier(), 'ExtraTreesRegressor':ExtraTreesRegressor(),
for i in range(1, 11): data.loc[data['Region'] == i, 'expensive than average region'] = data.loc[data['Region'] == i, 'Price'] - \ data.loc[data['Region'] == i, 'Price'].mean() for i in range(1, 8): data.loc[data['Weekday'] == i, 'expensive than average weekday'] = data.loc[data['Weekday'] == i, 'Price'] - \ data.loc[data['Weekday'] == i, 'Price'].mean() for i in range(1, 366): data.loc[data['Date'] == i, 'expensive than average date'] = data.loc[data['Date'] == i, 'Price'] - \ data.loc[data['Date'] == i, 'Price'].mean() for i in range(2): data.loc[data['Apartment'] == i, 'expensive than average apartment'] = data.loc[data['Apartment'] == i, 'Price'] - \ data.loc[data['Apartment'] == i, 'Price'].mean() for i in range(1, 5): data.loc[data['Beds'] == i, 'expensive than average bed'] = data.loc[data['Beds'] == i, 'Price'] - \ data.loc[data['Beds'] == i, 'Price'].mean() threshold1 = Binarizer(threshold=3.0) res1 = pd.DataFrame(threshold1.transform(data['Review'].values.reshape(-1, 1))) threshold2 = Binarizer(threshold=80) res2 = pd.DataFrame(threshold2.transform(data['Price'].values.reshape(-1, 1))) pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False) res3 = pd.DataFrame( pf.fit_transform( data[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']])) encoder = OneHotEncoder() data_region1hot = encoder.fit_transform(data['Region'].values.reshape(-1, 1)) data_region = pd.DataFrame(data_region1hot.toarray()) data_weekday1hot = encoder.fit_transform(data['Weekday'].values.reshape(-1, 1)) data_weekday = pd.DataFrame(data_weekday1hot.toarray()) data_reformed = pd.concat(