def test_pipeline_with_no_columns(self):
    """Fit an n-gram + LightGBM pipeline without assigning columns to steps.

    The same two-step pipeline is built twice and fitted on a six-row
    frame: first with the label given as a pandas Series (after checking
    the shape of ``get_fit_info``), then with the label given as a numpy
    array.
    """
    frame = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one",
        ],
    })

    def build_pipeline():
        # Both LightGBM minimums are set to 1; the frame has only six rows.
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        learner = LightGbmClassifier(min_data_per_leaf=1,
                                     min_data_per_group=1)
        return Pipeline([featurizer, learner])

    # First pass: label supplied as a pandas Series.
    ppl = build_pipeline()
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(frame[["SentimentText"]], frame["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3
    ppl.fit(frame[["SentimentText"]], frame["Sentiment"])

    # Second pass: fresh pipeline, label supplied as a numpy array.
    ppl = build_pipeline()
    assert ppl is not None
    ppl.fit(frame[["SentimentText"]], np.array(frame["Sentiment"]))
def test_lightgbmclassifier(self):
    """Train LightGbmClassifier on n-gram features and check its accuracy.

    Loads the ``wiki_detox_train`` dataset, splits it, featurizes the
    ``SentimentText`` column and asserts that accuracy on the held-out
    split is greater than 0.58.
    """
    np.random.seed(0)

    dataset_path = get_dataset('wiki_detox_train').as_filepath()
    features, target = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t',
                               encoding="utf-8")
    X_train, X_test, y_train, y_test = train_test_split(
        features['SentimentText'], target)

    # map text reviews to vector space
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = featurizer.fit_transform(X_train, max_slots=5000)
    X_test = featurizer.transform(X_test, max_slots=5000)

    fitted = LightGbmClassifier().fit(X_train, y_train, verbose=0)
    predicted = fitted.predict(X_test)

    # Fraction of test rows whose predicted label equals the true label.
    matches = y_test.values.ravel() == predicted.values
    accuracy = np.mean(matches)

    threshold = 0.58
    assert_greater(
        accuracy,
        threshold,
        "accuracy should be greater than %s" % threshold)
def test_pipeline_with_no_columns_raise(self):
    """Check that a default LightGbmClassifier fails to fit on six rows.

    The pipeline uses LightGbmClassifier with its default settings;
    ``get_fit_info`` must still report the expected shape, while ``fit``
    is expected to raise ``RuntimeError``.
    """
    frame = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one",
        ],
    })

    featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
    learner = LightGbmClassifier()
    ppl = Pipeline([featurizer, learner])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(frame[["SentimentText"]], frame["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3

    with self.assertRaises(RuntimeError):
        # Message
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        ppl.fit(frame[["SentimentText"]], frame["Sentiment"])
def test_pass_predict_proba_multiclass_with_pipeline(self):
    """Check ``proba_sum`` for each multiclass learner wrapped in a Pipeline.

    Every learner is placed alone in a Pipeline and the value returned by
    ``proba_sum`` must equal 38.0 to three decimals.
    """
    learners = (
        LogisticRegressionClassifier(),
        FastLinearClassifier(),
        LightGbmClassifier(),
    )
    for learner in learners:
        total = proba_sum(Pipeline([learner]))
        assert_almost_equal(
            total,
            38.0,
            decimal=3,
            err_msg=invalid_predict_proba_output)
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None,
                        predict_X_type=None):
    """Fit, test and predict with a Binner pipeline on converted inputs.

    The fixed 4x3 numeric data and its labels are passed through
    ``transform_data`` with the requested type names before use.

    :param fit_X_type: type name handed to ``transform_data`` for the
        training features; ``"sparse"`` selects LightGbmClassifier as the
        learner, any other value selects LogisticRegressionBinaryClassifier.
    :param fit_Y_type: type name handed to ``transform_data`` for the labels.
    :param predict_X_type: type name handed to ``transform_data`` for the
        features used at prediction time.
    :return: tuple ``(predictions, scores, metrics)``.
    """
    rows = [[1.0, 2.0, 3.0],
            [2.0, 3.0, 4.0],
            [1.0, 2.0, 3.0],
            [2.0, 2.0, 2.0]]
    targets = [1, 0, 1, 1]

    steps = [Binner()]
    steps.append(
        LightGbmClassifier(minimum_example_count_per_leaf=1)
        if fit_X_type == "sparse"
        else LogisticRegressionBinaryClassifier())
    model = Pipeline(steps)

    fit_X = transform_data(rows, fit_X_type)
    fit_y = transform_data(targets, fit_Y_type)
    model.fit(fit_X, fit_y)
    metrics, scores = model.test(fit_X, fit_y, output_scores=True)

    predict_X = transform_data(rows, predict_X_type)
    predictions = model.predict(predict_X)
    return predictions, scores, metrics
def test_syntax1_passing(self):
    """Check column passing with ``<<`` from vectorizers into the learner.

    Two OneHotVectorizer steps write columns ``f1`` and ``f3``; the
    LightGbmClassifier step is bound to those two columns. After fitting,
    ``transform`` must return a result of shape (5, 16).
    """
    _, X, y = self.get_simple_df()

    encode_education = OneHotVectorizer() << {'f1': 'education2'}
    encode_workclass = OneHotVectorizer(max_num_terms=2) << {
        'f3': 'workclass'}
    learner = LightGbmClassifier(min_data_per_leaf=1) << ['f1', 'f3']

    exp = Pipeline([encode_education, encode_workclass, learner])
    exp.fit(X, y)

    transformed = exp.transform(X)
    assert transformed.shape == (5, 16)
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None,
                        predict_X_type=None):
    """Fit, test and predict with an n-gram + LightGBM pipeline.

    The four fixed sentences and their labels are passed through
    ``transform_data`` with the requested type names before use.

    :param fit_X_type: type name handed to ``transform_data`` for the
        training sentences.
    :param fit_Y_type: type name handed to ``transform_data`` for the labels.
    :param predict_X_type: type name handed to ``transform_data`` for the
        sentences used at prediction time.
    :return: tuple ``(predictions, scores, metrics)``.
    """
    sentences = [
        "This is sentence 1",
        "Talk about second",
        "Thrid one",
        "Final example.",
    ]
    targets = [1, 0, 1, 1]

    featurizer = NGramFeaturizer()
    learner = LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
    model = Pipeline([featurizer, learner])

    fit_X = transform_data(sentences, fit_X_type)
    fit_y = transform_data(targets, fit_Y_type)
    model.fit(fit_X, fit_y)
    metrics, scores = model.test(fit_X, fit_y, output_scores=True)

    predict_X = transform_data(sentences, predict_X_type)
    predictions = model.predict(predict_X)
    return predictions, scores, metrics
def test_pipeline_name_error(self):
    """Check that unknown LightGbmClassifier keyword arguments raise NameError.

    The n-gram featurizer is first run on the text column (its output is
    not used), then LightGbmClassifier is constructed with a mix of
    parameter names and the construction is expected to raise NameError.
    """
    frame = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one",
        ],
    })

    featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
    featurizer.fit_transform(frame[["SentimentText"]])

    # NOTE: in the context-manager form of assertRaises, ``msg`` is only the
    # text reported when no NameError is raised; it is not compared with
    # the text of the raised exception.
    msg = ("Parameters ['NumLeaves', 'min_data', 'min_data_in_bin', "
           "'minsplit'] are not allowed")
    with self.assertRaises(NameError, msg=msg):
        LightGbmClassifier(
            min_data=1,
            min_data_in_bin=1,
            min_data_per_leaf=1,
            minsplit=1,
            NumLeaves=2)
def train_data_type_single(fit_X_type="dataframe", fit_Y_type=None,
                           predict_X_type=None):
    """Fit a single learner on converted inputs and return its predictions.

    The fixed 4x3 integer data and its labels are passed through
    ``transform_data`` with the requested type names before use.

    :param fit_X_type: type name handed to ``transform_data`` for the
        training features; ``"sparse"`` selects LightGbmClassifier, any
        other value selects LogisticRegressionBinaryClassifier.
    :param fit_Y_type: type name handed to ``transform_data`` for the labels.
    :param predict_X_type: type name handed to ``transform_data`` for the
        features used at prediction time.
    :return: the value returned by ``model.predict``.
    """
    rows = [[1, 2, 3],
            [2, 3, 4],
            [1, 2, 3],
            [2, 2, 2]]
    targets = [1, 0, 1, 1]

    model = (LightGbmClassifier(minimum_example_count_per_leaf=1)
             if fit_X_type == "sparse"
             else LogisticRegressionBinaryClassifier())

    fit_X = transform_data(rows, fit_X_type)
    fit_y = transform_data(targets, fit_Y_type)
    model.fit(fit_X, fit_y)

    predict_X = transform_data(rows, predict_X_type)
    return model.predict(predict_X)
NOBINARY_CHECKS = [ 'check_estimator_sparse_data', 'check_dtype_object', 'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), 'IidSpikeDetector': IidSpikeDetector(columns=['F0']), 'IidChangePointDetector':
###############################################################################
# LightGbmClassifier
#
# Example script: train LightGbmClassifier on the 'iris' data set and print
# the accuracy measured on the held-out split.
import numpy as np
import pandas as pd
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmClassifier
from sklearn.model_selection import train_test_split

np.random.seed(0)

# use 'iris' data set to create test and train data
df = get_dataset("iris").as_df()
print(df.head())
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0

# 'Species' is dropped before the split; every remaining column except
# 'Label' is used as a feature.
df.drop(['Species'], inplace=True, axis=1)

feature_mask = df.columns != 'Label'
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, feature_mask], df['Label'])

lr = LightGbmClassifier().fit(X_train, y_train)

scores = lr.predict(X_test)
scores = pd.to_numeric(scores)

# evaluate the model
# The scores are turned into a plain list before the element-wise comparison
# with y_test.
print('Accuracy:', np.mean(y_test == list(scores)))
# Learners for which summary() is not supported yet.
# For each of them: fix in nimbusml, needs to implement
# ICanGetSummaryAsIDataView.
# Left out of the list entirely (REVIEW: crashes): PcaTransformer(),
# LightGbmRanker().
learners_not_supported = [
    NaiveBayesClassifier(),
    KMeansPlusPlus(),
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
    GamBinaryClassifier(),
    GamRegressor(),
    LightGbmClassifier(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
]


class TestModelSummary(unittest.TestCase):
    """Checks that ``summary()`` can be called on fitted pipelines."""

    def test_model_summary(self):
        """Fit one pipeline per entry of ``learners`` and call ``summary()``.

        Each pipeline one-hot encodes ``categorical_columns`` before the
        learner and is trained on a fresh FileDataStream over ``train_file``.
        """
        for learner in learners:
            encoder = OneHotVectorizer() << categorical_columns
            pipe = Pipeline([encoder, learner])
            stream = FileDataStream(train_file, schema=file_schema)
            pipe.fit(stream, label_column)
            pipe.summary()
OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf' NOBINARY_CHECKS = [ 'check_estimator_sparse_data', 'check_dtype_object', 'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmRanker': LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TensorFlowScorer': TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples', 'frozen_saved_model.pb'), columns={'c': ['a', 'b']}), } MULTI_OUTPUT_EX = [
# Example: LightGbmClassifier with a Dart booster on the 'infert' data set.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
# 'education' is one-hot encoded into 'edu', which is then used together
# with 'parity' as the features for predicting 'induced'.
education_encoder = OneHotVectorizer(columns={'edu': 'education'})
dart_learner = LightGbmClassifier(
    feature=['parity', 'edu'],
    label='induced',
    booster=Dart(reg_lambda=0.1))
pipeline = Pipeline([education_encoder, dart_learner])

# train, predict, and evaluate
fitted = pipeline.fit(data)
metrics, predictions = fitted.test(data, output_scores=True)

# print predictions
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.070722  0.145439  0.783839
# 1               0  0.737733  0.260116  0.002150
# 2               2  0.070722  0.145439  0.783839
# 3               0  0.490715  0.091749  0.417537
# 4               0  0.562419  0.197818  0.239763

# print evaluation metrics