Пример #1
0
    def test_pipeline_pca(self):
        """get_fit_info on a one-step PCA pipeline reports the expected
        graph nodes (start, implicit TypeConverter, PcaTransformer)."""
        X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
        pipe = Pipeline([PcaTransformer(rank=2)])
        infos = pipe.get_fit_info(X)[0]
        # The 'operator' entry holds live objects that cannot be compared
        # against a literal expectation, so drop it before comparing.
        for inf in infos:
            if 'operator' in inf:
                del inf['operator']
        expected = [{
            'name': None,
            'schema_after': ['F0', 'F1', 'F2'],
            'type': 'start',
            'outputs': ['F0', 'F1', 'F2']
        }, {
            'name': 'TypeConverter',
            'inputs': ['F0', 'F1', 'F2'],
            'type': 'transform',
            'outputs': ['F0', 'F1', 'F2'],
            'schema_after': ['F0', 'F1', 'F2']
        }, {
            'name': 'PcaTransformer',
            'inputs': ['temp_'],
            'type': 'transform',
            'outputs': ['temp_'],
            'schema_after': ['F0', 'F1', 'F2', 'temp_']
        }]
        # The generated temp column names embed id(node), which differs on
        # every run, so normalize them to the stable placeholder 'temp_'.
        infos[-1]["inputs"] = ["temp_"]
        infos[-1]["outputs"] = ["temp_"]
        # BUG FIX: the last schema_after entry is a string; the original
        # assigned the list ["temp_"] here, so it could never match expected.
        infos[-1]["schema_after"][-1] = "temp_"

        # BUG FIX: the original asserted any(x != y ...) — i.e. that SOME
        # pair differed — which the normalization bug above made vacuously
        # true. Assert full equality of the normalized node descriptions.
        self.assertEqual(expected, infos)
Пример #2
0
 def test_PcaTransformer_int(self):
     """PCA sums agree for int and float inputs, but output dtypes differ."""
     base = get_dataset("infert").as_df()
     feature_cols = ['age', 'parity', 'spontaneous', 'stratum']
     sums = {}
     dtypes = {}
     for dtype in (int, float):
         frame = base.copy()
         # Cast every feature column to the dtype under test.
         for col in feature_cols:
             frame[col] = frame[col].astype(dtype)
         pipeline = Pipeline([
             ColumnConcatenator() << {
                 'X': feature_cols
             },
             PcaTransformer(rank=3) << 'X'
         ])
         transformed = pipeline.fit_transform(frame[feature_cols], verbose=0)
         sums[dtype] = transformed.sum().sum()
         dtypes[dtype] = list(transformed.dtypes)
     # Numeric results should match regardless of the input dtype...
     totals = list(sums.values())
     assert_almost_equal(totals[0], totals[1])
     # ...but the resulting column dtypes should not be identical.
     observed = list(dtypes.values())
     observed[0].sort()
     observed[1].sort()
     assert observed[0] != observed[1]
Пример #3
0
 def test_PcaTransformer_no_concat(self):
     """PcaTransformer accepts a column list directly, without an explicit
     ColumnConcatenator step, and produces a non-empty result."""
     frame = get_dataset("infert").as_df()
     cols = [
         'age', 'parity', 'induced', 'spontaneous', 'stratum',
         'pooled.stratum'
     ]
     reducer = PcaTransformer(rank=3) << [
         'age', 'parity', 'spontaneous', 'stratum'
     ]
     result = Pipeline([reducer]).fit_transform(
         frame[cols].astype(numpy.float32))
     assert result is not None
Пример #4
0
 def test_PcaTransformer(self):
     """End-to-end PCA: concatenate features, reduce to rank 3, and check
     the sum of the projected components against a known value."""
     frame = get_dataset("infert").as_df()
     features = [
         'age', 'parity', 'induced', 'spontaneous', 'stratum',
         'pooled.stratum'
     ]
     steps = [
         ColumnConcatenator() << {
             'X': features
         },
         PcaTransformer(rank=3) << 'X',
     ]
     transformed = Pipeline(steps).fit_transform(
         frame[features].astype(numpy.float32))
     components = transformed[['X.0', 'X.1', 'X.2']]
     expected_sum = 11.293087
     assert_almost_equal(components.sum().sum(),
                         expected_sum,
                         decimal=3,
                         err_msg="Sum should be %s" % expected_sum)
Пример #5
0
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'PcaAnomalyDetector': PcaAnomalyDetector(rank=3),
 'PcaTransformer':  PcaTransformer(rank=2),
 'PixelExtractor': Pipeline([
     Loader(columns={'ImgPath': 'Path'}),
     PixelExtractor(columns={'ImgPixels': 'ImgPath'}),
 ]),
 'PrefixColumnConcatenator': PrefixColumnConcatenator(columns={'Features': 'Sepal_'}),
 'Resizer': Pipeline([
     Loader(columns={'ImgPath': 'Path'}),
     Resizer(image_width=227, image_height=227,
             columns={'ImgResize': 'ImgPath'})
 ]),
 'SkipFilter': SkipFilter(count=5),
 'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
                                      seasonal_window_size=2),
 'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
                                                 seasonal_window_size=2),
Пример #6
0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs

train_file = get_dataset("infert").as_filepath()
# Explicit column schema for the raw CSV file.
schema = (
    "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 "
    "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 "
    "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 "
    "sep=, header=+"
)
fds = FileDataStream(train_file, schema=schema)

# target and features columns
y = 'case'
X = ['age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooledstratum']

# Watch AUC as PCA keeps fewer and fewer principal components:
# the signal should degrade gradually while the reduced features
# retain as much of the original structure as possible.
for rank in reversed(range(3, len(X) + 1)):
    print('Number of dimensions=', rank)
    steps = [
        # X becomes a single VectorDataViewType column
        ColumnConcatenator() << {
            'X': X
        },
        # project the concatenated vector onto `rank` principal components
        PcaTransformer(rank=rank) << 'X',
        LightGbmBinaryClassifier(),
    ]
    metrics, scores = Pipeline(steps).fit(fds, y).test(fds, y)
    print('AUC=', metrics['AUC'].values)
Пример #7
0
###############################################################################
# PcaTransformer
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.decomposition import PcaTransformer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32)

# Project four numeric columns onto their first three principal
# components; the result appears as new columns features.0..features.2.
feature_columns = ['age', 'parity', 'induced', 'spontaneous']
pipe = PcaTransformer(rank=3, columns={'features': feature_columns})

print(pipe.fit_transform(data).head())
#     age  case education  features.0  features.1  features.2  induced  ...
# 0  26.0   1.0    0-5yrs   -5.675901   -3.964389   -1.031570      1.0  ...
# 1  42.0   1.0    0-5yrs   10.364552    0.875251    0.773911      1.0  ...
# 2  39.0   1.0    0-5yrs    7.336117   -4.073389    1.128798      2.0  ...
# 3  34.0   1.0    0-5yrs    2.340584   -2.130528    1.248973      2.0  ...
# 4  35.0   1.0   6-11yrs    3.343876   -1.088401   -0.100063      1.0  ...