def test_load_iris():
    """Check ``load_iris_df`` with the target excluded and custom names.

    Loads the frame with ``include_tgt=False`` and four caller-supplied
    names, then asserts that the result is a ``pd.DataFrame`` of shape
    ``(150, 4)``, that it has no ``species`` column, and that every
    requested name is present as a column.
    """
    names = ['a', 'b', 'c', 'd']
    iris = load_iris_df(include_tgt=False, names=names)

    assert isinstance(iris, pd.DataFrame)
    assert 'species' not in iris.columns
    assert iris.shape == (150, 4)

    # assert on the names: every requested name has to be a column, not
    # only the first one. ``name in frame`` tests column membership.
    missing = [name for name in names if name not in iris]
    assert not missing, missing
# Author: Taylor Smith <*****@*****.**> from __future__ import print_function import numpy as np import pandas as pd from skoot.datasets import load_iris_df from skoot.utils.testing import assert_raises from skoot.feature_selection import (FeatureFilter, SparseFeatureFilter, MultiCorrFilter, NearZeroVarianceFilter) from numpy.testing import assert_array_equal, assert_array_almost_equal # get some datasets defined for use later iris = load_iris_df(include_tgt=False, names=['a', 'b', 'c', 'd']) sparse = pd.DataFrame.from_records(data=[[1., 2., np.nan], [2., 3., np.nan], [np.nan, 4., 5.]], columns=['a', 'b', 'c']) def test_nzv_constant_col(): X = pd.DataFrame.from_records(data=np.array([[1, 2, 3], [4, 5, 3], [6, 7, 3], [8, 9, 3]]), columns=['a', 'b', 'c']) flt = NearZeroVarianceFilter(freq_cut=25) trans = flt.fit_transform(X) # show that the output is one column shorter
# -*- coding: utf-8 -*- from __future__ import absolute_import from skoot.exploration.multivariate import summarize from skoot.datasets import load_iris_df import numpy as np from numpy.testing import assert_array_almost_equal, assert_array_equal # used throughout nan = np.nan float_fields = ["a", "b", "c", "d"] # load iris and add a string field iris = load_iris_df(include_tgt=True, names=float_fields, tgt_name="species") iris["cls"] = ["A" if x == 0 else "B" if x == 1 else "C" for x in iris["species"]] def test_summarize_all_continuous(): cont = iris[float_fields] summary = summarize(cont) # show we get the stats we expect expected = [ [5.843333, 3.054000, 3.758667, 1.198667], # mean [5.800000, 3.000000, 4.350000, 1.300000], # median [7.900000, 4.400000, 6.900000, 2.500000], # max [4.300000, 2.000000, 1.000000, 0.100000], # min [0.685694, 0.188004, 3.113179, 0.582414], # variance [0.311753, 0.330703, -0.271712, -0.103944], # skewness
# -*- coding: utf-8 -*-

from __future__ import absolute_import

from skoot.datasets import load_iris_df
from skoot.preprocessing.schema import SchemaNormalizer
from skoot.utils.testing import assert_persistable

# Fixtures shared by the tests below: the iris frame as returned by the
# loader's defaults, and a schema mapping one of its columns to ``int``.
X = load_iris_df()
schema = {'petal width (cm)': int}


def test_normalizer():
    """Fit and apply a ``SchemaNormalizer``, then check the mapped dtype.

    After the transform, the dtype name of the column listed in ``schema``
    must start with ``"int"``. The full dtype listing is used as the
    assertion message to make a failure easy to diagnose.
    """
    fitted = SchemaNormalizer(schema).fit(X)
    dtypes = fitted.transform(X).dtypes

    width_dtype_name = dtypes['petal width (cm)'].name
    assert width_dtype_name.startswith("int"), dtypes


def test_schema_persistable():
    """Run the shared persistence check against a ``SchemaNormalizer``."""
    normalizer = SchemaNormalizer(schema)
    assert_persistable(normalizer, "location.pkl", X)
# -*- coding: utf-8 -*- # # Author: Taylor Smith <*****@*****.**> from __future__ import absolute_import from sklearn.preprocessing import RobustScaler from skoot.preprocessing import SelectiveScaler from skoot.datasets import load_iris_df from numpy.testing import assert_array_almost_equal import numpy as np X = load_iris_df(include_tgt=False) def test_selective_scale(): original = X cols = [original.columns[0]] # Only perform on first... # original_means = np.mean(X, axis=0) # array([ 5.84333333, 3.054 , 3.75866667, 1.19866667]) # original_std = np.std(X, axis=0) # array([ 0.82530129, 0.43214658, 1.75852918, 0.76061262]) transformer = SelectiveScaler(cols=cols).fit(original) transformed = transformer.transform(original) # expected: array([ 0. , 3.054 , 3.75866667, 1.19866667])
"""
=================
Example summarize
=================

Demonstrates how to use the ``summarize`` function to get a quick summary
of your dataset.

.. raw:: html

   <br/>
"""
print(__doc__)

# Author: Taylor Smith <*****@*****.**>

from skoot.exploration import summarize
from skoot.datasets import load_iris_df

# #############################################################################
# load data (target column included)
iris = load_iris_df(include_tgt=True)

# Append a string feature that holds one single level in every row, to
# demonstrate that the summary reports on uninformative features as well.
iris["x5"] = "Level1"

# Build the dataset summary, then print it.
summary = summarize(iris)
print(summary)
# -*- coding: utf-8 -*- from __future__ import absolute_import from skoot.preprocessing import BinningTransformer from skoot.datasets import load_iris_df from skoot.utils.testing import assert_raises import numpy as np from numpy.testing import assert_array_equal iris = load_iris_df(include_tgt=False, names=["a", "b", "c", "d"]) def test_binning_simple(): binner = BinningTransformer(cols=["a"], n_bins=3, strategy="uniform", return_bin_label=True, overwrite=True) binner.fit(iris) trans = binner.transform(iris) # show the dfs are not the same assert trans is not iris # show the columns stayed the same, though assert trans.columns.tolist() == iris.columns.tolist() # show we have a string datatype now assert trans.dtypes['a'].name == 'object'
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <*****@*****.**>

from __future__ import absolute_import

from skoot.utils.dataframe import get_numeric_columns
from skoot.datasets import load_iris_df

# iris with short feature names and the target column named 'e'
iris = load_iris_df(names=['a', 'b', 'c', 'd'], tgt_name='e')


def test_get_numeric():
    """The numeric view of iris equals iris but is a distinct object."""
    numeric = get_numeric_columns(iris)

    assert numeric.equals(iris)
    assert numeric is not iris


def test_get_numeric_subset():
    """Casting ``'e'`` to ``str`` changes the shape of the numeric subset."""
    # ``assign`` returns a new frame, so the shared ``iris`` is not mutated
    with_str_col = iris.assign(e=iris['e'].astype(str))
    numeric = get_numeric_columns(with_str_col)

    assert numeric.shape != with_str_col.shape
# # Author: Taylor Smith <*****@*****.**> from __future__ import print_function, absolute_import, division from numpy.testing import assert_array_almost_equal from skoot.datasets import load_iris_df from skoot.utils.testing import assert_transformer_asdf, assert_persistable from skoot.decomposition import (SelectivePCA, SelectiveTruncatedSVD, SelectiveNMF, SelectiveKernelPCA, SelectiveIncrementalPCA) # Def data for testing names = ['a', 'b', 'c', 'd'] X = load_iris_df(include_tgt=False, names=names) def test_selective_pca(): # create a copy of the original original = X.copy() # set the columns we'll fit to just be the first cols = [names[0]] # 'a' # the "other" names, and their corresponding matrix comp_column_names = names[1:] compare_cols = original[comp_column_names].as_matrix() # now fit PCA on the first column only transformer = SelectivePCA(cols=cols,
<br/> """ print(__doc__) # Author: Taylor Smith <*****@*****.**> # ############################################################################# # Introduce an interesting scenario from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from skoot.preprocessing import SelectiveStandardScaler from skoot.base import make_transformer from skoot.datasets import load_iris_df X = load_iris_df(tgt_name="target") y = X.pop('target') X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2) # Let's say we want to scale our features with the StandardScaler, but # for whatever reason we only want the ABSOLUTE value of the scaled values. # We *could* create a transformer or split our pipeline, but either case is # klunky and could interrupt our CV process in a grid search. # # So we'll instead define a simple commutative function that will be wrapped # in an "anonymous" transformer def make_abs(X):
between scikit-learn and skoot. .. raw:: html <br/> """ print(__doc__) # Author: Taylor Smith <*****@*****.**> # ############################################################################# # Skoot is laid out much like scikit-learn. That is, many of the same modules # exist in skoot that are present in scikit. For example: from skoot import decomposition print(dir(decomposition)) # many are similar to sklearn classes print("") # ############################################################################# # Skoot also has a dataset interface, like sklearn. Except it returns # dataframes rather than numpy arrays: from skoot.datasets import load_iris_df df = load_iris_df(include_tgt=True, tgt_name='Species') print(df.head()) print("") # ############################################################################# # All skoot transformers are based on the BasePDTransformer: from skoot.base import BasePDTransformer print(BasePDTransformer.__doc__)