def test_syntax12_mixed2(self):
    X = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             weight=[10., 1., 1., 1., 1.],
             y=[1.1, 2.2, 1.24, 3.4, 3.4]))
    exp = Pipeline([
        OneHotVectorizer(columns=['workclass', 'education']),
        Concat(columns={'Feature': ['workclass', 'education']}),
        FastTreesRegressor(num_trees=5, feature='Feature',
                           weight='weight') << {Role.Label: 'y'}
    ])
    exp.fit(X, verbose=0)
    assert exp.nodes[-1].feature_column_ == 'Feature'
    assert exp.nodes[-1].label_column_ == 'y'
    assert exp.nodes[-1].weight_column_ == 'weight'
    # y is required here as well as weight.
    # Both are replaced by fake values.
    # The test does not fail but the weight is not taken into account.
    X['y'] = -5
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
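# Equivalent role assignment done entirely through the << operator instead
# of constructor arguments (a sketch; Role.Feature and Role.Weight are
# assumed to behave like Role.Label above):
#
#     FastTreesRegressor(num_trees=5) << {
#         Role.Label: 'y',
#         Role.Feature: 'Feature',
#         Role.Weight: 'weight'}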
def test_metrics_evaluate_regressor(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
    lr = FastTreesRegressor()
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame(), verbose=0)
    metrics, _ = e.test(X_test, y_test)
    # TODO: debug fluctuations, then increase the decimal precision of
    # these checks.
    assert_almost_equal(
        metrics['L1(avg)'][0], 0.107, decimal=1,
        err_msg="L1(avg) should be %s" % 0.107)
    assert_almost_equal(
        metrics['L2(avg)'][0], 0.0453, decimal=1,
        err_msg="L2(avg) should be %s" % 0.0453)
    assert_almost_equal(
        metrics['Loss-fn(avg)'][0], 0.0453, decimal=1,
        err_msg="Loss-fn(avg) should be %s" % 0.0453)
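# A cross-check sketch (not part of the original suite): the L1(avg) and
# L2(avg) columns reported by Pipeline.test() should agree with sklearn's
# mean absolute/squared error computed on the raw scores. This assumes
# predict() returns the one-column 'Score' frame seen in the prediction
# tests above.
def _check_metrics_against_sklearn(self, e, X_test, y_test, metrics):
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    scores = e.predict(X_test)['Score']
    assert_almost_equal(metrics['L1(avg)'][0],
                        mean_absolute_error(y_test, scores), decimal=3)
    assert_almost_equal(metrics['L2(avg)'][0],
                        mean_squared_error(y_test, scores), decimal=3)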
def test_label_column_defaults_to_label_when_label_column_in_input_data(
        self):
    train_data = {
        'c1': [2, 3, 4, 5],
        'c2': [3, 4, 5, 6],
        'c3': [4, 5, 6, 7],
        'Label': [0, 1, 2, 1]
    }
    train_df = pd.DataFrame(train_data)

    predictor = FastTreesRegressor()
    pipeline = Pipeline([predictor])
    result = json.loads(pipeline.fit(train_df, dry_run=True))

    self.verify_regressor_nodes(
        result, "Label", ["c1", "c2", "c3"],
        "Trainers.FastTreeRegressor")
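# A hedged companion sketch (assumed, based on the role syntax used
# elsewhere in these tests; Role comes from `from nimbusml import Role`):
# when the label column is not named 'Label', it can be assigned
# explicitly with the << operator.
def test_label_column_can_be_overridden_sketch(self):
    train_df = pd.DataFrame({
        'c1': [2, 3, 4, 5],
        'c2': [3, 4, 5, 6],
        'target': [0, 1, 2, 1]})
    pipeline = Pipeline([FastTreesRegressor() << {Role.Label: 'target'}])
    result = json.loads(pipeline.fit(train_df, dry_run=True))
    self.verify_regressor_nodes(
        result, "target", ["c1", "c2"], "Trainers.FastTreeRegressor")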
def test_score_regressor(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
    lr = FastTreesRegressor(train_threads=1)
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame())
    metrics = e.score(X_test, y_test)
    print(metrics)
    assert_almost_equal(
        metrics, 0.814061733686017, decimal=5,
        err_msg="R-squared should be %s" % 0.814061733686017)
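# Hedged note: for regressors, Pipeline.score is assumed to return the
# coefficient of determination (R^2), so the scalar above should agree
# with sklearn:
#
#     from sklearn.metrics import r2_score
#     r2 = r2_score(y_test, e.predict(X_test)['Score'])
#     assert_almost_equal(metrics, r2, decimal=5)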
def test_syntax12_group(self):
    # This test checks that the group_id role is correctly mapped to the
    # entrypoint's row group column (and that the legacy attribute names
    # are gone after fitting).
    X = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             gr=[0, 0, 1, 1, 1],
             y=[1.1, 2.2, 1.24, 3.4, 3.4]))
    exp = Pipeline([
        OneHotVectorizer(columns=['workclass', 'education']),
        Concat(columns={'Feature': ['workclass', 'education']}),
        # the group column must be key-typed before it can act as a
        # group id, hence the conversion
        ToKey() << 'gr',
        FastTreesRegressor(
            number_of_trees=5, feature='Feature',
            group_id='gr') << {Role.Label: 'y'}
    ])
    exp.fit(X, verbose=0)
    assert not hasattr(exp.nodes[-1], 'feature_')
    assert not hasattr(exp.nodes[-1], 'group_id_')
    assert exp.nodes[-1].feature_column_name_ == 'Feature'
    assert exp.nodes[-1].label_column_name_ == 'y'
    # assert not hasattr(exp.nodes[-1], 'row_group_column_name_')
    assert not hasattr(exp.nodes[-1], 'group_id_column')
    assert not hasattr(exp.nodes[-1], 'groupid_column_')
    assert not hasattr(exp.nodes[-1], 'groupid_column')
    if not hasattr(exp.nodes[-1], 'row_group_column_name_'):
        raise AssertionError("Attribute not found: {0}".format(
            ", ".join(sorted(dir(exp.nodes[-1])))))
    assert exp.nodes[-1].row_group_column_name_ == 'gr'
    # y is required here as well as weight.
    # Both are replaced by fake values.
    # The test does not fail but the weight is not taken into account.
    X['y'] = -5
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
train_file = get_dataset("uciadult_train").as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'ethnicity', 'sex', 'native-country-region'
]
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \
              'col=education:TX:2 col=marital-status:TX:3 ' \
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]
    # Error on linux:
    # Unable to load shared library 'SymSgdNative' or one of its
    # dependencies.
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2),
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]

learners_not_supported = [
    # PcaTransformer(),  # REVIEW: crashes
]


class TestModelSummary(unittest.TestCase):

    def test_model_summary(self):
        for learner in learners:
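            # Hedged sketch of the loop body (truncated in the original):
            # it assumes each learner is trained on the adult data stream
            # defined above and that Pipeline.summary() is the call under
            # test.
            pipeline = Pipeline([
                OneHotVectorizer() << categorical_columns,
                learner
            ])
            train_stream = FileDataStream(train_file, schema=file_schema)
            pipeline.fit(train_stream, label_column)
            summary = pipeline.summary()
            assert summary is not None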
###############################################################################
# FastTreesRegressor
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

df = get_dataset("airquality").as_df().fillna(0)
df = df[df.Ozone.notnull()]

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# train a model and score
ftree = FastTreesRegressor().fit(X_train, y_train)
scores = ftree.predict(X_test)

# evaluate the model
print('R-squared fit:', r2_score(y_test, scores))
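# A hedged follow-up: wrapping the same learner in a nimbusml Pipeline
# exposes test(), which reports the built-in regression metrics
# (L1(avg), L2(avg), ...) alongside the sklearn R-squared above. This
# reuses the train/test split from this example.
from nimbusml import Pipeline

pipe = Pipeline([FastTreesRegressor()]).fit(X_train, y_train)
metrics, _ = pipe.test(X_test, y_test)
print(metrics)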
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesRegressor
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ...  row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...        1            2  ...
# 1   42     1    0-5yrs        1       1  ...        2            0  ...
# 2   39     1    0-5yrs        2       6  ...        3            0  ...
# 3   34     1    0-5yrs        2       4  ...        4            0  ...
# 4   35     1   6-11yrs        1       3  ...        5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastTreesRegressor(feature=['induced', 'edu'], label='age')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#        Score
# 0  35.171112
# 1  35.171112
# 2  34.118595
# 3  34.118595
# 4  32.484325

# print evaluation metrics
print(metrics)
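# A hedged sketch for the "Replace with CV" TODO above, assuming the
# nimbusml.model_selection.CV wrapper and its dict-of-results output
# (exact key names may differ across versions):
from nimbusml.model_selection import CV

cv_pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastTreesRegressor(feature=['induced', 'edu'], label='age')
])
cv_results = CV(cv_pipeline).fit(data, cv=3)
print(cv_results['metrics'])  # per-fold metrics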