Example #1
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
Example #2
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
Example #3
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch
    savename = args.savename if args.savename is not None else 'model-' + str(
        n_rollout) + 'unroll'

    np.random.seed(1098)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel,
         effort) = [[np.array(val) for val in f[name].values()]
                    for name in names]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

    x = np.concatenate((x_target, x_first, x_speed), axis=1)

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate(effort, axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x,
                                                               y,
                                                               aux_output,
                                                               test_size=0.2)

    y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)]
    y_aux_mask, y_aux_test_mask = [
        np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test)
    ]

    model = MyModel(train=[x, [y, y_aux]],
                    val=[x_test, [y_test, y_aux_test]],
                    train_mask=[y_mask, y_aux_mask],
                    val_mask=[y_test_mask, y_aux_test_mask],
                    max_unroll=n_rollout,
                    name=savename)

    if not os.path.exists('save'):
        os.makedirs('save')

    if args.train:
        model.fit(nb_epoch=n_epoch, batch_size=32)
    elif args.resume:
        model.resume(nb_epoch=n_epoch, batch_size=32)
Example #4
def make_models(X, y, y_bin):
    return dict(ols=LinearRegression().fit(X, y),
                lr_bin=LogisticRegression().fit(X, y_bin),
                lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
                lr_mn=LogisticRegression(solver='lbfgs',
                                         multi_class='multinomial').fit(X, y),
                svc=SVC(kernel='linear').fit(X, y_bin),
                svr=SVR(kernel='linear').fit(X, y),
                dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
                dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
                rfc=RandomForestClassifier(n_estimators=3,
                                           max_depth=3,
                                           random_state=1).fit(X, y),
                rfr=RandomForestRegressor(n_estimators=3,
                                          max_depth=3,
                                          random_state=1).fit(X, y),
                gbc=GradientBoostingClassifier(n_estimators=3,
                                               max_depth=3,
                                               random_state=1).fit(X, y),
                gbr=GradientBoostingRegressor(n_estimators=3,
                                              max_depth=3,
                                              random_state=1).fit(X, y),
                abc=AdaBoostClassifier(algorithm='SAMME',
                                       n_estimators=3,
                                       random_state=1).fit(X, y),
                abc2=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y),
                abc3=AdaBoostClassifier(algorithm='SAMME',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                abc4=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                km=KMeans(1).fit(X),
                km2=KMeans(5).fit(X),
                pc1=PCA(1).fit(X),
                pc2=PCA(2).fit(X),
                pc3=PCA(2, whiten=True).fit(X),
                mlr1=MLPRegressor([2], 'relu').fit(X, y),
                mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
                mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
                mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
                mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
                bin=Binarizer(0.5),
                mms=MinMaxScaler().fit(X),
                mas=MaxAbsScaler().fit(X),
                ss1=StandardScaler().fit(X),
                ss2=StandardScaler(with_mean=False).fit(X),
                ss3=StandardScaler(with_std=False).fit(X),
                n1=Normalizer('l1'),
                n2=Normalizer('l2'),
                n3=Normalizer('max'))
Example #5
def test_scaler_without_copy():
    """Check that StandardScaler.fit does not change input"""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_copy = X.copy()
    StandardScaler(copy=False).fit(X)
    assert_array_equal(X, X_copy)

    X_csr_copy = X_csr.copy()
    StandardScaler(with_mean=False, copy=False).fit(X_csr)
    assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
Example #6
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sparse.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = StandardScaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
Example #7
    def prepare_time_data(data):
        data_scaler = StandardScaler()
        data_concat = np.concatenate(data, axis=0)
        data_scaler.fit(data_concat)
        new_data = [data_scaler.transform(data_) for data_ in data]

        return data_scaler, new_data
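
A minimal usage sketch (not from the original source): it assumes prepare_time_data is reachable as a plain function and that data is a list of 2-D arrays of shape (timesteps, n_features), where timesteps may differ between sequences.

import numpy as np

# Two sequences of different lengths but with the same number of features.
data = [np.random.randn(10, 3), np.random.randn(7, 3)]
scaler, scaled = prepare_time_data(data)

# The scaler is fitted once on the concatenation of all sequences, so every
# sequence is standardized with the same per-feature mean and variance.
assert all(s.shape == d.shape for s, d in zip(scaled, data))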
Example #8
def boston_DBSCAN(class_num=0):
    '''Return cluster `class_num` of the Boston housing data set as
    standardized data columns.

    :param class_num: cluster label, as assigned by the DBSCAN step below
    :returns:
        x_boston: independent variables of cluster `class_num`,
            standardized, 13 columns
        y_boston: target variable of cluster `class_num`,
            standardized, 1 column
    '''
    # Load the full data set
    bostondata = load_boston()
    boston_X = bostondata.data
    boston_y = bostondata.target
    boston_full = np.c_[boston_X, boston_y]
    # Standardize the full data set
    scale = StandardScaler()
    boston_full = scale.fit_transform(boston_full)
    # Reduce to 3 dimensions so the clustering is easier to tune visually
    pca = PCA(n_components=3)
    boston_full3 = pca.fit_transform(boston_full)
    # Cluster
    clt = DBSCAN(eps=0.8, min_samples=5, n_jobs=4)
    label3 = clt.fit_predict(X=boston_full3)
    # Select the requested cluster and split features from the target
    group_boston = boston_full[label3 == class_num]
    x_boston = group_boston[:, 0:-1]
    y_boston = group_boston[:, -1]
    return x_boston, y_boston
Example #9
    def fit(self, x_train, y_train):

        self.processing_steps = [StandardScaler()]

        ann = MLPRegressor()
        params = {
            'hidden_layer_sizes': sp_randint(20, 150),
            'alpha': sp_uniform(0, 100),
            'max_iter': sp_randint(100, 2000),
            'solver': ['lbfgs'],
            # 'identity', 'logistic', 'tanh', 'relu'
            'activation': ['relu']
        }

        if 'hidden_layer_sizes' in self.kwargs:
            self.kwargs['hidden_layer_sizes'] = self.parsefunction(
                self.kwargs['hidden_layer_sizes'])

        params.update(self.kwargs)
        clf = RandomizedSearchCV(estimator=ann,
                                 param_distributions=params,
                                 n_iter=10,
                                 scoring=self.score['function'],
                                 cv=3,
                                 iid=True)

        self._update_pipeline_and_fit(x_train, y_train, [clf])
Example #10
    def fit(self, x_train, y_train):
        self.processing_steps = [StandardScaler()]
        svr = SVR(kernel='rbf', gamma=0.1)

        # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
        # C = [2**i for i in np.arange(start=-5, stop=16, step=2)]
        # gamma = [2**i for i in np.arange(start=-15, stop=4, step=2)]
        # https://stats.stackexchange.com/questions/43943/
        # which-search-range-for-determining-svm-optimal-c-
        # and-gamma-parameters

        C = [2**i for i in [-3, -2, -1, 0, 1, 2, 3, 4, 5]]
        gamma = [2**i for i in [-5, -4, -3, -2, -1, 0, 1, 2, 3]]

        params = {"C": sp_uniform(0.125, 32), "gamma": sp_uniform(0.03125, 8)}
        params.update(self.kwargs)

        reg = RandomizedSearchCV(estimator=svr,
                                 param_distributions=params,
                                 n_iter=10,
                                 scoring=self.score['function'],
                                 cv=3,
                                 iid=True)

        clf = MultiOutputRegressor(reg)
        self._update_pipeline_and_fit(x_train, y_train, [clf])
Example #11
def make_model(classifier, **params):
    pipeline = Pipeline([
        ('feature_extractor', FeatureExtractor()),
        ('scaler', StandardScaler()),
        ('model', classifier(**params)),
    ])
    return pipeline
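
A usage sketch, not part of the original snippet: FeatureExtractor is the project's own transformer, so a hypothetical pass-through stand-in is defined here purely so the pipeline can be exercised end to end, assuming the sketch is appended to the same script as the snippet above.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression


class FeatureExtractor(BaseEstimator, TransformerMixin):
    # Hypothetical stand-in: the project's real transformer is not shown here.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X, dtype=float)


X = np.random.randn(20, 4)
y = np.random.randint(0, 2, size=20)

model = make_model(LogisticRegression, C=1.0)  # **params go to the classifier
model.fit(X, y)
print(model.predict(X[:3]))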
Example #12
    def fit(self, data, args):
        self.model = StandardScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
Example #13
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in ((StandardScaler(), Normalizer(), Binarizer())):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
Example #14
    def normalize_features(self, scaler: StandardScaler = None) -> StandardScaler:
        '''
        Normalizes the features of the dataset using a StandardScaler
        (subtract mean, divide by standard deviation).

        If a scaler is provided, uses that scaler to perform the normalization.
        Otherwise fits a scaler to the features in the dataset and then
        performs the normalization.

        :param scaler: A fitted StandardScaler. Used if provided.
        Otherwise a StandardScaler is fit on this dataset and is then used.
        :return: A fitted StandardScaler. If a scaler is provided, this is the
        same scaler. Otherwise, this is a scaler fit on this dataset.
        '''
        if not self.data or not self.data[0].features:
            return None

        if scaler is None:
            scaler = StandardScaler()
            features = np.vstack([d.features for d in self.data])
            scaler.fit(features)

        for d in self.data:
            d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

        return scaler
Example #15
def load_UCI_Credit_Card_data(infile=None, balanced=True, seed=5):

    X = []
    y = []
    sids = []

    with open(infile, "r") as fi:
        fi.readline()
        reader = csv.reader(fi)
        for row in reader:
            sids.append(row[0])
            X.append(row[1:-1])
            y0 = int(row[-1])
            if y0 == 0:
                y0 = -1
            y.append(y0)
    y = np.array(y)

    if balanced:
        X, y = balance_X_y(X, y, seed)

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    encoder = OneHotEncoder(categorical_features=[1, 2, 3])
    encoder.fit(X)
    X = encoder.transform(X).toarray()

    X, y = shuffle_X_y(X, y, seed)

    scale_model = StandardScaler()
    X = scale_model.fit_transform(X)

    return X, np.expand_dims(y, axis=1)
Example #16
def get_data(args, logger, debug):
    '''Get data.'''
    # Get data:
    train_data, val_data, test_data = _get_data(args, logger)

    debug(f'train size = {len(train_data):,} | val size = {len(val_data):,} |'
          f' test size = {len(test_data):,}')

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(args.data_df)
        debug('Class sizes')
        debug(class_sizes)

    # Scale features:
    if args.features_scaling:
        features_scaler = train_data.normalize_features()
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    # Initialise scaler and scale training targets by subtracting mean and
    # dividing standard deviation (regression only):
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler()
        targets = scaler.fit_transform(train_data.targets())
        train_data.set_targets(targets)
    else:
        scaler = None

    return train_data, val_data, test_data, scaler, features_scaler
Example #17
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
Example #18
    def _create_scaler(self, positivity):
        self.scaler_positivity = positivity
        if positivity is True:
            eps = 1e-9
            self._scaler = MinMaxScaler(feature_range=(eps, 1))
        else:
            self._scaler = StandardScaler()
        self.scaler_is_fitted = False
Example #19
    def __stdScaler(self):
        all_cols = list(self.data_df.columns.values)
        for col in all_cols:
            if col not in self.non_numeric_cols and col != 'time_to_failure':
                stdScaler = StandardScaler()
                stdScaler.fit(self.data_df[[col]])
                self.data_df[col] = stdScaler.transform(self.data_df[[col]])
        print('Standard Scaler applied ... ')
Example #20
    def preprocess(self):
        sc = StandardScaler()
        sc.fit(self.X_train)
        X_train_std = sc.transform(self.X_train)
        X_test_std = sc.transform(self.X_test)
        self.train_dataset = self.Dataset(data=X_train_std,
                                          target=self.y_train)
        self.test_dataset = self.Dataset(data=X_test_std, target=self.y_test)
Example #21
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[3],
                          trainers=[dict(algo='rmsprop', **impatient)]),
        **regressor_params)
    check_regression(
        TheanetsRegressor(scaler=StandardScaler(),
                          trainers=[dict(algo='rmsprop', **impatient)]),
        **regressor_params)
Example #22
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[20],
                          trainers=[{
                              'optimize': 'rmsprop',
                              'min_improvement': 0.1
                          }]), **regressor_params)
    check_regression(TheanetsRegressor(scaler=StandardScaler()),
                     **regressor_params)
Example #23
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[3],
                          trainers=[{
                              'algo': 'rmsprop',
                              'learning_rate': 0.1
                          }]), **regressor_params)
    check_regression(TheanetsRegressor(scaler=StandardScaler()),
                     **regressor_params)
Example #24
def load_scalers(path: str) -> Tuple[StandardScaler, StandardScaler]:
    '''
    Loads the scalers a model was trained with.

    :param path: Path where model checkpoint is saved.
    :return: A tuple with the data scaler and the features scaler.
    '''
    state = torch.load(path, map_location=lambda storage, loc: storage)

    scaler = StandardScaler(state['data_scaler']['means'],
                            state['data_scaler']['stds']) \
        if state['data_scaler'] else None

    features_scaler = StandardScaler(state['features_scaler']['means'],
                                     state['features_scaler']['stds']) \
        if state['features_scaler'] else None

    return scaler, features_scaler
Example #25
    def _proccess_input(self, target_pos, target_speed, pos, vel, effort):
        x_target = np.array(target_pos)
        x_first = np.array([pos_[0] for pos_ in pos])
        x_speed = np.array(target_speed).reshape((-1, 1))
        aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

        x = np.concatenate((x_target, x_first, x_speed), axis=1)

        input_scaler = StandardScaler()
        x = input_scaler.fit_transform(x)
        output_scaler = StandardScaler()
        effort_concat = np.concatenate(effort, axis=0)
        output_scaler.fit(effort_concat)
        effort = [output_scaler.transform(eff) for eff in effort]

        y = pad_sequences(effort, padding='post', value=0.)
        aux_output = pad_sequences(aux_output, padding='post', value=0.)
        x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(
            x, y, aux_output, test_size=0.2)
        return x, x_test, y, y_test, y_aux, y_aux_test
Example #26
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "assumes floating point values as input, got uint8"

    clean_warning_registry()
    assert_warns_message(UserWarning, w, scale, X)
    assert_warns_message(UserWarning, w, StandardScaler().fit, X)
    assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
Example #27
def imputeAndScale(X_train, X_test):
    # Fill missing values with statistics learned from the training split
    imp = Imputer()
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)

    # Standardize both splits using statistics fitted on the training data
    scaler = StandardScaler().fit(X_train)
    X_test = scaler.transform(X_test)
    X_train = scaler.transform(X_train)

    return X_train, X_test
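
A small usage sketch with made-up data containing NaNs; note that Imputer is the older scikit-learn API (replaced by SimpleImputer in recent releases), so this assumes a version where Imputer is still importable.

import numpy as np

X_train = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]])
X_test = np.array([[np.nan, 2.0], [3.0, 4.0]])

# Missing values are filled with the training column means, then both splits
# are standardized with statistics learned from the training data only.
X_train_p, X_test_p = imputeAndScale(X_train, X_test)
print(X_train_p.mean(axis=0))  # approximately zero per column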
Example #28
def test_warning_scaling_integers():
    # Check warning when scaling integer data
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "Data with input dtype uint8 was converted to float64"

    clean_warning_registry()
    assert_warns_message(DataConversionWarning, w, scale, X)
    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
Example #29
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, StandardScaler().fit, X)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, MinMaxScaler().fit, X)
Example #30
def scale_vars(df, mapper=None):
    # TODO Try RankGauss: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629
    warnings.filterwarnings('ignore',
                            category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        # is_numeric_dtype will exclude categorical columns
        map_f = [([n], StandardScaler()) for n in df.columns
                 if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df).astype(np.float32)
    return mapper