def test_PcaAnomalyDetector(self):
    """Fit PcaAnomalyDetector(rank=3) on iris and check the known score sum."""
    expected_sum = 4.181632
    np.random.seed(0)

    # Load iris and keep only the numeric feature columns plus 'Label'.
    frame = get_dataset("iris").as_df()
    frame.drop(['Species'], inplace=True, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        frame.loc[:, frame.columns != 'Label'], frame['Label'])

    detector = PcaAnomalyDetector(rank=3)
    detector.fit(X_train, y_train)
    predictions = detector.predict(X_test)

    # Regression check: the total of all anomaly scores is pinned.
    assert_almost_equal(predictions.sum().sum(),
                        expected_sum,
                        decimal=7,
                        err_msg="Sum should be %s" % expected_sum)
# ---- Example #2 ----
     OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'PcaAnomalyDetector': PcaAnomalyDetector(rank=3),
 'PcaTransformer':  PcaTransformer(rank=2),
 'PixelExtractor': Pipeline([
     Loader(columns={'ImgPath': 'Path'}),
     PixelExtractor(columns={'ImgPixels': 'ImgPath'}),
 ]),
 'PrefixColumnConcatenator': PrefixColumnConcatenator(columns={'Features': 'Sepal_'}),
 'Resizer': Pipeline([
     Loader(columns={'ImgPath': 'Path'}),
     Resizer(image_width=227, image_height=227,
             columns={'ImgResize': 'ImgPath'})
 ]),
 'SkipFilter': SkipFilter(count=5),
 'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
                                      seasonal_window_size=2),
 'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
# ---- Example #3 ----
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

# Learners whose trained models cannot yet be summarized; most are blocked on
# nimbusml implementing ICanGetSummaryAsIDataView (see the per-item notes).
# Commented-out entries marked "crashes" are excluded pending upstream fixes.
learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # PcaTransformer(), # REVIEW: crashes
    GamBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    GamRegressor(
    ),  # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    LightGbmClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # LightGbmRanker(), # REVIEW: crashes
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    OneVsRestClassifier(FastLinearBinaryClassifier()),
]


class TestModelSummary(unittest.TestCase):
# ---- Example #4 ----
# Example: PcaAnomalyDetector inside a Pipeline, fed from a FileDataStream.

# Load the 'infert' sample data set straight from its on-disk path.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Training pipeline: one-hot encode 'education' into 'edu', then fit a
# rank-3 PCA anomaly detector over 'induced' and the encoded column.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    PcaAnomalyDetector(rank=3, feature=['induced', 'edu'])
])

# Fit on the stream, then evaluate against 'case' as the label column;
# output_scores=True also returns the per-row anomaly scores.
fitted = pipeline.fit(data)
metrics, predictions = fitted.test(data,
                                   'case',
                                   output_scores=True)
#      Score
# 0  0.026155
# 1  0.026155
# 2  0.018055
# 3  0.018055
# 4  0.004043

# Show the first few scored rows.
print(predictions.head())
from sklearn.model_selection import train_test_split

# Example: train PcaAnomalyDetector on iris features, then score a test set
# that has been salted with rows that are deliberately not iris-like.
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
df = get_dataset("iris").as_df()
df.drop(['Label', 'Setosa', 'Species'], axis=1, inplace=True)

X_train, X_test = train_test_split(df)

# scikit-style APIs expect a y column; for one-class anomaly training a
# constant label vector is enough (and fit below is called without it).
y_train = np.ones(len(X_train))

svm = PcaAnomalyDetector(rank=3)
svm.fit(X_train)

# Hand-built outlier rows appended to the test set so the detector has
# genuine anomalies to flag.
not_iris = pandas.DataFrame({'Sepal_Length': [2.5, 2.6],
                             'Sepal_Width': [.75, .9],
                             'Petal_Length': [2.5, 2.5],
                             'Petal_Width': [.8, .7]})

merged_test = pandas.concat([X_test, not_iris], sort=False)

scores = svm.predict(merged_test)

# The non-iris rows sit at the end, so their scores show up in the tail.
print(scores.tail())