def test_numpy_inplace():
    ary = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]],
                   dtype=float)
    # write the standardized column back into the original array
    ary[:, [1]] = standardize(ary, [1])
    ary_expc = np.array([[1, 1.46385],
                         [2, 0.87831],
                         [3, 0.29277],
                         [4, -0.29277],
                         [5, -0.87831],
                         [6, -1.46385]])
    np.testing.assert_allclose(ary, ary_expc, rtol=1e-03)
def test_numpy_single_feat():
    ary = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])
    ary_actu = standardize(ary, [1])
    ary_expc = np.array([[1.46385],
                         [0.87831],
                         [0.29277],
                         [-0.29277],
                         [-0.87831],
                         [-1.46385]])
    np.testing.assert_allclose(ary_actu, ary_expc, rtol=1e-03)
def test_standardize_all_columns_ndarray():
    ary = np.array([[0, 10], [0, 9], [0, 8], [0, 7], [0, 6], [0, 5]])
    ary_actu = standardize(ary, columns=None)
    ary_expc = np.array([[0.0, 1.46385],
                         [0.0, 0.87831],
                         [0.0, 0.29277],
                         [0.0, -0.29277],
                         [0.0, -0.87831],
                         [0.0, -1.46385]])
    np.testing.assert_allclose(ary_actu, ary_expc, rtol=1e-03)
def test_zero_division_numpy():
    ary = np.array([[0, 10], [0, 9], [0, 8], [0, 7], [0, 6], [0, 5]])
    ary_actu = standardize(ary, columns=[0, 1])
    ary_expc = np.array([[0.0, 1.46385],
                         [0.0, 0.87831],
                         [0.0, 0.29277],
                         [0.0, -0.29277],
                         [0.0, -0.87831],
                         [0.0, -1.46385]])
    np.testing.assert_allclose(ary_actu, ary_expc, rtol=1e-03)
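# The zero-division tests in this file rely on standardize leaving
# zero-variance columns at 0 instead of dividing by a zero standard
# deviation. A minimal sketch of that guard in plain NumPy (not
# mlxtend's actual implementation):
import numpy as np

def _zscore_guarded(ary):
    """Column-wise z-score; zero-variance columns map to 0 (sketch)."""
    ary = ary.astype(float)
    mu = ary.mean(axis=0)
    sigma = ary.std(axis=0)  # ddof=0, matching the expected values
    sigma = np.where(sigma == 0.0, 1.0, sigma)  # avoid division by zero
    return (ary - mu) / sigma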
def test_numpy_single_dim():
    ary = np.array([1, 2, 3, 4, 5, 6])
    ary_actu = standardize(ary, [0])
    ary_expc = np.array([[-1.46385],
                         [-0.87831],
                         [-0.29277],
                         [0.29277],
                         [0.87831],
                         [1.46385]])
    np.testing.assert_allclose(ary_actu, ary_expc, rtol=1e-03)
def test_numpy_standardize():
    ary = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])
    ary_actu = standardize(ary, columns=[0, 1])
    ary_expc = np.array([[-1.46385, 1.46385],
                         [-0.87831, 0.87831],
                         [-0.29277, 0.29277],
                         [0.29277, -0.29277],
                         [0.87831, -0.87831],
                         [1.46385, -1.46385]])
    np.testing.assert_allclose(ary_actu, ary_expc, rtol=1e-03)
def test_zero_division_pandas():
    s1 = pd.Series([0, 0, 0, 0, 0, 0], index=range(6))
    s2 = pd.Series([10, 9, 8, 7, 6, 5], index=range(6))
    df = pd.DataFrame(s1, columns=['s1'])
    df['s2'] = s2
    df_out1 = standardize(df, ['s1', 's2'])
    ary_out1 = np.array([[0.0, 1.46385],
                         [0.0, 0.87831],
                         [0.0, 0.29277],
                         [0.0, -0.29277],
                         [0.0, -0.87831],
                         [0.0, -1.46385]])
    np.testing.assert_allclose(df_out1.values, ary_out1, rtol=1e-03)
def test_pandas_standardize():
    s1 = pd.Series([1, 2, 3, 4, 5, 6], index=range(6))
    s2 = pd.Series([10, 9, 8, 7, 6, 5], index=range(6))
    df = pd.DataFrame(s1, columns=['s1'])
    df['s2'] = s2
    df_out1 = standardize(df, ['s1', 's2'])
    ary_out1 = np.array([[-1.46385, 1.46385],
                         [-0.87831, 0.87831],
                         [-0.29277, 0.29277],
                         [0.29277, -0.29277],
                         [0.87831, -0.87831],
                         [1.46385, -1.46385]])
    np.testing.assert_allclose(df_out1.values, ary_out1, rtol=1e-03)
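# For reference, the expected values in the tests above are plain
# population z-scores (ddof=0); a quick standalone check:
import numpy as np

ary = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]], dtype=float)
z = (ary - ary.mean(axis=0)) / ary.std(axis=0)
# z[:, 0] == [-1.46385, -0.87831, -0.29277, 0.29277, 0.87831, 1.46385]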
from mlxtend.data import mnist_data
from mlxtend.preprocessing import shuffle_arrays_unison, standardize
from mlxtend.classifier import MultiLayerPerceptron as MLP
import matplotlib.pyplot as plt

X, y = mnist_data()
X, y = shuffle_arrays_unison((X, y), random_seed=1)
X_train, y_train = X[:500], y[:500]
X_test, y_test = X[500:], y[500:]


def plot_digit(X, y, idx):
    img = X[idx].reshape(28, 28)  # 784 => 28 * 28
    plt.imshow(img, cmap='Greys', interpolation='nearest')
    plt.title('true label: %d' % y[idx])
    plt.show()


plot_digit(X, y, 3500)

X_train_std, params = standardize(X_train,
                                  columns=range(X_train.shape[1]),
                                  return_params=True)
X_test_std = standardize(X_test,
                         columns=range(X_test.shape[1]),
                         params=params)

nn1 = MLP(hidden_layers=[150],
          l2=0.00,
          l1=0.0,
          epochs=100,
          eta=0.005,
          momentum=0.0,
          decrease_const=0.0,
          minibatches=100,
          random_seed=1,
          print_progress=3)
nn1.fit(X_train_std, y_train)

plt.plot(range(len(nn1.cost_)), nn1.cost_)
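# Note the pattern above: X_test is standardized with the *training*
# means and standard deviations (via `params`), never with its own
# statistics. A minimal manual equivalent in plain NumPy, assuming a
# zero-variance guard for all-black pixel columns:
import numpy as np

mu, sigma = X_train.mean(axis=0), X_train.std(axis=0)
sigma = np.where(sigma == 0.0, 1.0, sigma)
X_train_std_manual = (X_train - mu) / sigma
X_test_std_manual = (X_test - mu) / sigma  # training statistics reused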
# Sebastian Raschka 2014-2019
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
from mlxtend.utils import assert_raises
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from mlxtend.data import iris_data
from mlxtend.preprocessing import standardize

X, y = iris_data()
X_std = standardize(X)


def test_default_components():
    pca = PCA()
    res = pca.fit(X_std).transform(X_std)
    assert res.shape[1] == 4


def test_whitening():
    pca = PCA(n_components=2)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 3.9, diagonals_sum

    pca = PCA(n_components=2, whitening=True)
args = parser.parse_args()

train_data = pd.read_csv("train.data.csv")
label_list = [i for i in train_data['class']]
label = np.empty(len(label_list))
for i in range(label.shape[0]):
    # note the leading space in the raw adult-dataset label
    if label_list[i] == ' >50K':
        label[i] = 1
    else:
        label[i] = -1
# DataFrame.as_matrix was removed in pandas 1.0; use to_numpy() instead
feature = train_data.to_numpy()[:, :-1]

# Preprocess the data
feature = preprocess(feature)
# Standardize columns in train_feature
feature = standardize(feature)

# Train-validation split
train_feature, val_feature, train_label, val_label = train_test_split(
    feature, label, test_size=0.1)

plt.figure()
for lamda in [0.001, 0.01, 0.1, 1]:
    # Initialize a, b
    a = np.random.randn(train_feature.shape[1], 1)  # shape=(14, 1)
    b = 0
    accuracy_list = []
    mag_list = []
    loss_list = []
    for epoch in range(args.num_epochs):
        actual_train_feature, held_out_feature, actual_train_label, held_out_label = train_test_split(
            train_feature,
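# The loop above -- regularization strengths `lamda`, a weight vector `a`
# of shape (14, 1), a bias `b`, and fresh held-out splits per epoch -- is
# the standard setup for a linear SVM trained with SGD on hinge loss.
# A hypothetical sketch of the per-sample update such a loop performs;
# the function name and `eta` are assumptions, not from the original script:
def sgd_svm_step(a, b, x, y, eta, lamda):
    """One SGD step for L2-regularized hinge loss (sketch)."""
    margin = float(y * (x @ a + b))
    if margin >= 1:
        grad_a, grad_b = lamda * a, 0.0
    else:
        grad_a, grad_b = lamda * a - y * x[:, None], -y
    return a - eta * grad_a, b - eta * grad_b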
scaler = StandardScaler()
wine_subset = wine[['ash', 'alcalinity', 'magnesium']]
wine_subset.apply(np.var)  # raw column variances differ by orders of magnitude
wine_subset_scaled = scaler.fit_transform(wine_subset)
np.var(wine_subset_scaled)  # after scaling, every column has unit variance

# Standardizing using `mlxtend`
## Also creates a z-score, but easier when working w/ Pandas DFs
from mlxtend.preprocessing import standardize
standardize(wine, columns=['ash', 'alcalinity', 'magnesium'])
standardize(wine_subset)

# MinMaxScaler
## X_norm = (X - X_min) / (X_max - X_min)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
wine_minmax_scaled = scaler.fit_transform(wine_subset)
np.var(wine_minmax_scaled)

# Using mlxtend version of MinMaxScaler
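# Hedged sketch of the min-max formula quoted above, applied by hand to
# the same three-column frame:
mins, maxs = wine_subset.min(), wine_subset.max()
wine_minmax_manual = (wine_subset - mins) / (maxs - mins)  # each column in [0, 1]
# mlxtend's equivalent helper is minmax_scaling, e.g.:
# from mlxtend.preprocessing import minmax_scaling
# minmax_scaling(wine, columns=['ash', 'alcalinity', 'magnesium'])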
import pandas as pd
import numpy as np
from mlxtend.preprocessing import standardize

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=range(6))
s2 = pd.Series([10, 9, 8, 7, 6, 5], index=range(6))
df = pd.DataFrame(s1, columns=['s1'])
df['s2'] = s2
print(df)
print(standardize(df, columns=['s1', 's2']))

X = np.array([[1, 10], [2, 9], [3, 8], [4, 7], [5, 6], [6, 5]])
print(standardize(X, columns=[0, 1]))

# re-using parameters
X_train = np.array([[1, 10], [4, 7], [3, 8]])
X_test = np.array([[1, 2], [3, 4], [5, 6]])
X_train_std, params = standardize(X_train, columns=[0, 1],
                                  return_params=True)
X_test_std = standardize(X_test, columns=[0, 1], params=params)
print(params)
print(X_train_std)
print(X_test_std)
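# `params` above is a plain dict of per-column means and standard
# deviations (the mlxtend keys are 'avgs' and 'stds' if memory serves --
# treat the key names as an assumption and print `params` to confirm).
# Reapplying them by hand:
X_test_std_manual = (X_test - params['avgs']) / params['stds']  # assumed keys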
# Sebastian Raschka 2014-2017
# mlxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from numpy.testing import assert_almost_equal
from numpy.testing import assert_allclose
from mlxtend.utils import assert_raises
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from mlxtend.data import iris_data
from mlxtend.preprocessing import standardize

X, y = iris_data()
X = standardize(X)


def test_default_components():
    pca = PCA()
    res = pca.fit(X).transform(X)
    assert res.shape[1] == 4


def test_default_2components():
    pca = PCA(n_components=2)
    res = pca.fit(X).transform(X)
    assert res.shape[1] == 2


def test_eigen_vs_svd():