    def test_pipeline_pca(self):
        X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
        exp = Pipeline([PcaTransformer(rank=2)])
        infos = exp.get_fit_info(X)[0]
        for inf in infos:
            if 'operator' in inf:
                del inf['operator']
        exp = [{
            'name': None,
            'schema_after': ['F0', 'F1', 'F2'],
            'type': 'start',
            'outputs': ['F0', 'F1', 'F2']
        }, {
            'name': 'TypeConverter',
            'inputs': ['F0', 'F1', 'F2'],
            'type': 'transform',
            'outputs': ['F0', 'F1', 'F2'],
            'schema_after': ['F0', 'F1', 'F2']
        }, {
            'name': 'PcaTransformer',
            'inputs': ['temp_'],
            'type': 'transform',
            'outputs': ['temp_'],
            'schema_after': ['F0', 'F1', 'F2', 'temp_']
        }]
        # These ids depend on id(node) and differ at each execution,
        # so they are replaced with a fixed placeholder.
        infos[-1]["inputs"] = ["temp_"]
        infos[-1]["outputs"] = ["temp_"]
        infos[-1]["schema_after"][-1] = ["temp_"]
        self.assertTrue(any(x != y for x, y in zip(exp, infos)))
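
# A minimal standalone sketch (not part of the test above; assumes numpy,
# Pipeline and PcaTransformer are imported as in the surrounding tests) of
# what get_fit_info returns: one dict per pipeline node, describing its
# inputs, outputs and the schema produced after it runs. This is the
# structure the expected list in the test mirrors.
def inspect_pca_fit_info():
    X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
    pipe = Pipeline([PcaTransformer(rank=2)])
    for node in pipe.get_fit_info(X)[0]:
        print(node.get('name'), '->', node.get('schema_after'))
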
    def test_PcaTransformer_int(self):
        df_ = get_dataset("infert").as_df()
        res = {}
        dt = {}
        for ty in (int, float):
            df = df_.copy()
            df['age'] = df['age'].astype(ty)
            df['parity'] = df['parity'].astype(ty)
            df['spontaneous'] = df['spontaneous'].astype(ty)
            df['stratum'] = df['stratum'].astype(ty)
            X = ['age', 'parity', 'spontaneous', 'stratum']
            pipe = Pipeline([
                ColumnConcatenator() << {'X': X},
                PcaTransformer(rank=3) << 'X'
            ])
            y = pipe.fit_transform(df[X], verbose=0)
            res[ty] = y.sum().sum()
            dt[ty] = list(y.dtypes)
        vals = list(res.values())
        assert_almost_equal(vals[0], vals[1])
        dt = list(dt.values())
        dt[0].sort()
        dt[1].sort()
        assert dt[0] != dt[1]
    def test_PcaTransformer_no_concat(self):
        df = get_dataset("infert").as_df()
        X = [
            'age', 'parity', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        pipe = Pipeline([
            PcaTransformer(rank=3) << [
                'age', 'parity', 'spontaneous', 'stratum'
            ]
        ])
        y = pipe.fit_transform(df[X].astype(numpy.float32))
        assert y is not None
    def test_PcaTransformer(self):
        df = get_dataset("infert").as_df()
        X = [
            'age', 'parity', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        pipe = Pipeline([
            ColumnConcatenator() << {'X': X},
            PcaTransformer(rank=3) << 'X'
        ])
        y = pipe.fit_transform(df[X].astype(numpy.float32))
        y = y[['X.0', 'X.1', 'X.2']]
        assert_almost_equal(
            y.sum().sum(),
            11.293087,
            decimal=3,
            err_msg="Sum should be %s" % 11.293087)
        use_probabilities=True,
        feature=['age', 'education_str.0-5yrs',
                 'education_str.6-11yrs', 'education_str.12+ yrs'],
        label='induced'),
    'OneVsRestClassifier(LinearSvmBinaryClassifier)':
        OneVsRestClassifier(LinearSvmBinaryClassifier(),
                            use_probabilities=True,
                            feature=['age', 'education_str.0-5yrs',
                                     'education_str.6-11yrs',
                                     'education_str.12+ yrs'],
                            label='induced'),
    'PcaAnomalyDetector': PcaAnomalyDetector(rank=3),
    'PcaTransformer': PcaTransformer(rank=2),
    'PixelExtractor': Pipeline([
        Loader(columns={'ImgPath': 'Path'}),
        PixelExtractor(columns={'ImgPixels': 'ImgPath'}),
    ]),
    'PrefixColumnConcatenator': PrefixColumnConcatenator(
        columns={'Features': 'Sepal_'}),
    'Resizer': Pipeline([
        Loader(columns={'ImgPath': 'Path'}),
        Resizer(image_width=227, image_height=227,
                columns={'ImgResize': 'ImgPath'})
    ]),
    'SkipFilter': SkipFilter(count=5),
    'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
                                         seasonal_window_size=2),
    'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
                                                     seasonal_window_size=2),
# 1     2      0.0  42.0     1.0      1.0   1.0          0.0      2.0
#
#    pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs

train_file = get_dataset("infert").as_filepath()
schema = "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 " \
         "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 " \
         "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 " \
         "sep=, header=+"
fds = FileDataStream(train_file, schema=schema)

# target and feature columns
y = 'case'
X = ['age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooledstratum']

# Observe the gradual impact of dimensionality reduction on AUC.
# Reducing the number of dimensions should degrade the signal gradually,
# while preserving the traits of the original dataset as much as possible.
for rank in range(len(X), 2, -1):
    print('Number of dimensions=', rank)
    pipe = Pipeline([
        # X becomes a VectorDataViewType column
        ColumnConcatenator() << {'X': X},
        # find the principal components of X
        PcaTransformer(rank=rank) << 'X',
        LightGbmBinaryClassifier()
    ])
    metrics, scores = pipe.fit(fds, y).test(fds, y)
    print('AUC=', metrics['AUC'].values)
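
# A minimal companion sketch (an assumption, not part of the original example;
# it reuses fds, X and y defined above): train on the raw concatenated
# features without PCA to obtain a baseline AUC against which the gradually
# degrading values printed by the loop can be compared.
baseline = Pipeline([
    ColumnConcatenator() << {'X': X},
    LightGbmBinaryClassifier()
])
metrics, scores = baseline.fit(fds, y).test(fds, y)
print('Baseline AUC (no PCA)=', metrics['AUC'].values)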
###############################################################################
# PcaTransformer
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.decomposition import PcaTransformer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32)

# transform data
feature_columns = ['age', 'parity', 'induced', 'spontaneous']
pipe = PcaTransformer(rank=3, columns={'features': feature_columns})
print(pipe.fit_transform(data).head())

#     age  case education  features.0  features.1  features.2  induced ...
# 0  26.0   1.0    0-5yrs   -5.675901   -3.964389   -1.031570      1.0 ...
# 1  42.0   1.0    0-5yrs   10.364552    0.875251    0.773911      1.0 ...
# 2  39.0   1.0    0-5yrs    7.336117   -4.073389    1.128798      2.0 ...
# 3  34.0   1.0    0-5yrs    2.340584   -2.130528    1.248973      2.0 ...
# 4  35.0   1.0   6-11yrs    3.343876   -1.088401   -0.100063      1.0 ...
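
# A minimal follow-up sketch (the estimator and the 'case' label choice are
# illustrative assumptions, not part of the documented example): the reduced
# 'features' vector produced by PcaTransformer can be fed directly to a
# learner inside a Pipeline.
from nimbusml import Pipeline
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

clf = Pipeline([
    PcaTransformer(rank=3, columns={'features': feature_columns}),
    LogisticRegressionBinaryClassifier(feature='features', label='case')
])
clf.fit(data)
print(clf.predict(data).head())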