from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import GamRegressor
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ...  row_num  spontaneous ...
# 0   26     1    0-5yrs        1       6 ...        1            2 ...
# 1   42     1    0-5yrs        1       1 ...        2            0 ...
# 2   39     1    0-5yrs        2       6 ...        3            0 ...
# 3   34     1    0-5yrs        2       4 ...        4            0 ...
# 4   35     1   6-11yrs        1       3 ...        5            1 ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    GamRegressor(feature=['induced', 'edu'], label='age')
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#        Score
# 0  35.390697
# 1  35.390697
# 2  34.603725
# 3  34.603725
# 4  32.455437

# print evaluation metrics
print(metrics)
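# for a regression task, the metrics frame reports averaged L1 and L2 loss,
# RMS error and R Squared (exact column names may vary by nimbusml version)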
learners = [
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # Error on linux:
    # Unable to load shared library 'SymSgdNative' or one of its dependencies
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2),
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]
    PoissonRegressionRegressor()
]

learners_not_supported = [
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    PcaAnomalyDetector(),
    # PcaTransformer(),  # REVIEW: crashes
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    GamBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    GamRegressor(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    LightGbmClassifier(),
    # LightGbmRanker(),  # REVIEW: crashes
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    OneVsRestClassifier(FastLinearBinaryClassifier()),
]


class TestModelSummary(unittest.TestCase):

    def test_model_summary(self):
        for learner in learners:
            pipeline = Pipeline(
                [OneHotVectorizer() << categorical_columns, learner])
            train_stream = FileDataStream(train_file, schema=file_schema)
            pipeline.fit(train_stream, label_column)
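# The test fragment above stops right after fit(); given the class name
# TestModelSummary, the step being exercised is asking the fitted pipeline for
# its model summary. A minimal self-contained sketch of that call follows; the
# dataset, learner and column choices are illustrative, not taken from the
# test file.
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import FastLinearBinaryClassifier

# illustrative data: the built-in 'infert' dataset with 'case' as the binary label
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)

pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastLinearBinaryClassifier(feature=['age', 'parity', 'edu'], label='case')
])
pipeline.fit(data)

# summary() returns the learned model statistics as a DataFrame for learners
# that expose a summary (the supported `learners` list above)
print(pipeline.summary())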
###############################################################################
# GamRegressor
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import GamRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)
df = get_dataset("airquality").as_df().fillna(0)
df = df[df.Ozone.notnull()]

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# train a model and score
ftree = GamRegressor().fit(X_train, y_train)
scores = ftree.predict(X_test)

# evaluate the model
print('R-squared fit:', r2_score(y_test, scores))
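# For comparison with the FileDataStream example earlier in this section, the
# same split can also be evaluated with nimbusml's built-in regression metrics
# by wrapping the learner in a Pipeline and calling test(). This is a minimal
# sketch under the assumption that Pipeline.fit()/test() accept the pandas
# frames directly; it is not part of the original example.
import numpy as np
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import GamRegressor
from sklearn.model_selection import train_test_split

np.random.seed(0)
df = get_dataset("airquality").as_df().fillna(0)
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# wrap the learner in a Pipeline and use test() for built-in metrics
pipeline = Pipeline([GamRegressor()])
pipeline.fit(X_train, y_train)
metrics, scores = pipeline.test(X_test, y_test, output_scores=True)
print(metrics)  # regression metrics (L1/L2 loss, RMS, R Squared)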