Пример #1
0
 def test_defaults(self):
     """Run ``check_cv`` on a one-hot + KMeansPlusPlus pipeline over infert.

     The schema is read from ``infert_file`` with ``np.float32`` as the
     numeric dtype, the file is opened as a ``FileDataStream`` using that
     schema, and the pipeline steps are handed to ``check_cv`` together
     with the stream.
     """
     infert_schema = DataSchema.read_schema(
         infert_file, numeric_dtype=np.float32)
     infert_stream = FileDataStream.read_csv(
         infert_file, schema=infert_schema)

     # The vectorizer is configured with the 'edu'/'education' column
     # mapping; 'edu' is then listed among the clustering features.
     encoder = OneHotVectorizer(columns={'edu': 'education'})
     clusterer = KMeansPlusPlus(
         n_clusters=5,
         feature=['edu', 'age', 'parity', 'spontaneous', 'stratum'])

     check_cv([encoder, clusterer], infert_stream)
Пример #2
0
 def test_get_fit_info_clustering(self):
     """Check what ``get_fit_info`` reports for a single-step clustering pipeline.

     A three-cluster ``KMeansPlusPlus`` pipeline is fitted on nine points
     whose x, y and z coordinates are identical. The last entry of the
     first element returned by ``get_fit_info`` must list the predicted
     label followed by one score column per cluster, and ``predict`` must
     return one row per training point.
     """
     coords = [0, 1, 2, 10, 11, 12, -10, -11, -12]
     # Each column gets its own copy of the same coordinate values.
     X_train = pandas.DataFrame(
         data={axis: list(coords) for axis in ('x', 'y', 'z')})
     y_train = pandas.DataFrame(
         data={'clusterid': [0, 0, 0, 1, 1, 1, 2, 2, 2]})

     clustering = Pipeline([KMeansPlusPlus(n_clusters=3)])
     clustering.fit(X_train, y_train, verbose=0)
     predictions = clustering.predict(X_train)

     fit_info = clustering.get_fit_info(X_train, y_train)
     final_entry = fit_info[0][-1]
     assert final_entry['outputs'] == [
         'PredictedLabel', 'Score.0', 'Score.1', 'Score.2']
     assert len(predictions) == 9
Пример #3
0
    def test_score_clusterer(self):
        """Score a two-cluster KMeansPlusPlus pipeline on iris.

        The 'Species' column is dropped, 'Label' is reduced to 1 / 0, and
        the data is split with ``train_test_split`` after seeding numpy's
        random generator. The value returned by ``Pipeline.score`` must
        match the pinned reference to five decimals.
        """
        expected_score = 0.36840763005544264

        np.random.seed(0)
        iris = get_dataset("iris").as_df()
        iris.drop(['Species'], inplace=True, axis=1)
        # Binarize the label: 1 stays 1, every other value becomes 0.
        iris.Label = [1 if value == 1 else 0 for value in iris.Label]

        features = iris.loc[:, iris.columns != 'Label']
        labels = iris['Label']
        X_train, X_test, y_train, y_test = train_test_split(features, labels)

        clusterer = KMeansPlusPlus(n_clusters=2,
                                   init_algorithm="Random",
                                   train_threads=1)
        pipeline = Pipeline([clusterer])
        pipeline.fit(X_train, y_train.to_frame())

        metrics = pipeline.score(X_test, y_test)
        print(metrics)
        assert_almost_equal(
            metrics,
            expected_score,
            decimal=5,
            err_msg="NMI loss should be %s" % expected_score)
Пример #4
0
    def test_non_label_based_predictor_does_not_have_label_column_automatically_removed(
            self):
        """Verify that 'Label' stays among the features fed to KMeansPlusPlus.

        The pipeline is fitted with ``dry_run=True`` and the result is
        parsed as JSON. The first node must be the feature combiner
        receiving all four columns (including 'Label'), and the second
        node must be the KMeans++ trainer reading the 'Features' column.
        """
        train_df = pd.DataFrame({
            'c1': [2, 3, 4, 5],
            'c2': [3, 4, 5, 6],
            'c3': [4, 5, 6, 7],
            'Label': [0, 1, 2, 1]
        })

        pipeline = Pipeline([KMeansPlusPlus(n_clusters=5)])
        graph = json.loads(pipeline.fit(train_df, dry_run=True))
        nodes = graph['nodes']

        combiner = nodes[0]
        self.assertEqual(combiner["Name"], "Transforms.FeatureCombiner")
        # The order-insensitive comparison has a different method name on
        # Python 2 and Python 3; only the selected attribute is looked up.
        assert_same_items = (self.assertItemsEqual if six.PY2
                             else self.assertCountEqual)
        assert_same_items(combiner["Inputs"]["Features"],
                          ['c1', 'c2', 'c3', 'Label'])

        trainer = nodes[1]
        self.assertEqual(trainer["Name"], "Trainers.KMeansPlusPlusClusterer")
        self.assertEqual(trainer["Inputs"]["FeatureColumnName"], "Features")
Пример #5
0
# Locate the 'infert' dataset on disk and open it as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)

print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Training pipeline: the vectorizer is configured with the 'edu'/'education'
# column mapping, and 'edu' is one of the three clustering features.
encoder = OneHotVectorizer(columns={'edu': 'education'})
clusterer = KMeansPlusPlus(n_clusters=5, feature=['induced', 'edu', 'parity'])
pipeline = Pipeline([encoder, clusterer])

# Train, then predict and evaluate on the same data.
# TODO: Replace with CV
trained = pipeline.fit(data)
metrics, predictions = trained.test(data, 'induced', output_scores=True)

# Show the first predictions.
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2   Score.3   Score.4
# 0               4  2.732253  2.667988  2.353899  2.339244  0.092014
# 1               4  2.269290  2.120064  2.102576  2.222578  0.300347
# 2               4  3.482253  3.253153  2.425328  2.269245  0.258680
# 3               4  3.130401  2.867317  2.158132  2.055911  0.175347
Пример #6
0
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'EnsembleClassifier':
    EnsembleClassifier(num_models=3),
    'EnsembleRegressor':
    EnsembleRegressor(num_models=3),
    'FactorizationMachineBinaryClassifier':
    FactorizationMachineBinaryClassifier(shuffle=False),
    'KMeansPlusPlus':
    KMeansPlusPlus(n_clusters=2),
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SgdBinaryClassifier':
Пример #7
0
     FromKey(columns=['Sepal_Length'])
 ]),
 # GlobalContrastRowScaler currently requires a vector input to work
 'GlobalContrastRowScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']),
 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                                  label='rank',
                                  group_id='group'),
 'Loader': Loader(columns={'ImgPath': 'Path'}),
 'LpScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
Пример #8
0
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # PcaTransformer(), # REVIEW: crashes
    GamBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    GamRegressor(
    ),  # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    LightGbmClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # LightGbmRanker(), # REVIEW: crashes
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    OneVsRestClassifier(FastLinearBinaryClassifier()),
Пример #9
0
###############################################################################
# KMeansPlusPlus
import pandas
from nimbusml import Pipeline
from nimbusml.cluster import KMeansPlusPlus

# Training set: three groups of points whose x, y and z coordinates are equal,
# giving centroids (1,1,1), (11,11,11) and (-11,-11,-11).
train_values = [0, 1, 2, 10, 11, 12, -10, -11, -12]
X_train = pandas.DataFrame(
    data={axis: list(train_values) for axis in ('x', 'y', 'z')})

# Test points; each should clearly belong to just 1 of the 3 clusters.
test_values = [-1, 3, 9, 13, -13, -20]
X_test = pandas.DataFrame(
    data={axis: list(test_values) for axis in ('x', 'y', 'z')})

y_test = pandas.DataFrame(data={'clusterid': [2, 2, 1, 1, 0, 0]})

# Fit on the training points only, then evaluate against the test labels.
kmeans = KMeansPlusPlus(n_clusters=3)
pipe = Pipeline([kmeans]).fit(X_train)
metrics, predictions = pipe.test(X_test, y_test, output_scores=True)

# Show the first predictions.
print(predictions.head())

# Show the evaluation metrics.
print(metrics)