def test_pipeline_with_no_columns_raise(self):
    """Check that an n-gram + LightGBM pipeline describes itself but fails to fit.

    ``get_fit_info`` is expected to return a two-element result whose first
    element has three entries, while ``fit`` on the same six-row data set is
    expected to raise ``RuntimeError``.
    """
    sentiments = [0, 1, 1, 0, 1, 1]
    sentences = [
        "this is train ",
        "review ",
        "sentence ",
        "an apple",
        "sentence 22",
        "another one one one",
    ]
    trainData = pd.DataFrame({
        "Sentiment": sentiments,
        "SentimentText": sentences,
    })

    featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
    ppl = Pipeline([featurizer, LightGbmClassifier()])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(
        trainData[["SentimentText"]],
        trainData["Sentiment"],
    )
    assert len(info) == 2
    assert len(info[0]) == 3

    # Message carried by the expected error:
    # System.InvalidOperationException:
    # 'LightGBM Error, code is -1, error message is
    # 'Cannot construct Dataset since there are not useful features.
    # It should be at least two unique rows.
    # If the num_row (num_data) is small,
    # you can set min_data=1 and min_data_in_bin=1 to fix this.
    # Otherwise please make sure you are using the right dataset.'
    with self.assertRaises(RuntimeError):
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
def test_clone_sweep(self):
    """Grid-search a pipeline, clone it, grid-search the clone and compare.

    Both searches must select the same ``learner__number_of_trees``.
    """
    np.random.seed(0)

    read_options = {'sep': ',', 'encoding': 'utf-8'}
    X_train, y_train = get_X_y(train_file, label_column, **read_options)
    # The test split is loaded as well, although no assertion below uses it.
    X_test, y_test = get_X_y(test_file, label_column, **read_options)

    cat = OneHotHashVectorizer() << categorical_columns
    learner = FastTreesBinaryClassifier(
        number_of_trees=100,
        number_of_leaves=5,
    )
    pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])
    param_grid = {'learner__number_of_trees': [1, 5, 10]}

    # First sweep on the original pipeline.
    grid = GridSearchCV(pipe, param_grid)
    grid.fit(X_train, y_train)

    # Second sweep on a clone of that pipeline.
    pipe1 = pipe.clone()
    grid1 = GridSearchCV(pipe1, param_grid)
    grid1.fit(X_train, y_train)

    swept = 'learner__number_of_trees'
    assert grid.best_params_[swept] == grid1.best_params_[swept]
def test_test(self):
    """``Pipeline.test`` must agree on both outputs of ``transform_data``.

    The comparison is made twice: once after fitting on the first object
    returned by ``transform_data`` and once after fitting on the second one
    split into features and the ``age`` label.
    """
    transformed_data, transformed_data_df = transform_data()
    regressor = FastLinearRegressor(
        feature=['parity', 'in', 'sp', 'stratum'],
        label='age',
    )
    flpipe = Pipeline([regressor])

    def assert_same_evaluation():
        # Evaluate on both representations and require identical results.
        metrics, scores = flpipe.test(transformed_data, output_scores=True)
        metrics_df, scores_df = flpipe.test(
            transformed_data_df, output_scores=True)
        assert_array_equal(scores, scores_df)
        assert_array_equal(metrics, metrics_df)

    flpipe.fit(transformed_data)
    assert_same_evaluation()

    features_only = transformed_data_df.drop('age', axis=1)
    flpipe.fit(features_only, transformed_data_df['age'])
    assert_same_evaluation()
def test_get_fit_info_fastl(self):
    """Check the first two ``get_fit_info`` entries for Filter + FastLinear.

    The ``operator`` key is stripped from every entry before comparing,
    so only names, columns and types are checked.
    """
    train_file = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(train_file)
    data = FileDataStream(train_file, schema)

    pipeline = Pipeline([
        Filter(columns=['Ozone']),
        FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone'),
    ])
    info = pipeline.get_fit_info(data)

    def all_columns():
        # Fresh list each time so the expected entries share no state.
        return ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp', 'Month',
                'Day']

    start_entry = {
        'name': None,
        'outputs': all_columns(),
        'schema_after': all_columns(),
        'type': 'start',
    }
    filter_entry = {
        'inputs': ['Ozone'],
        'name': 'Filter',
        'outputs': ['Ozone'],
        'schema_after': all_columns(),
        'type': 'transform',
    }
    exp = [start_entry, filter_entry]

    for entry in info[0]:
        entry.pop('operator', None)

    self.assertEqual(exp, info[0][:2])
def test_pipeline_info(self):
    """Compare ``get_fit_info`` with the expected step-by-step description.

    The ``operator`` key is removed from each entry first; any remaining
    difference raises an ``Exception`` carrying the actual description.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
    })

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        },
    ])

    infos = pipeline.get_fit_info(df)[0]
    for step in infos:
        step.pop('operator', None)

    expected = [
        {
            'name': None,
            'schema_after': ['education', 'workclass', 'yy'],
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy'],
        },
        {
            'name': 'TypeConverter',
            'inputs': ['yy'],
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            'type': 'transform',
        },
        {
            'name': 'MeanVarianceScaler',
            'inputs': ['new_y'],
            'type': 'transform',
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'OneHotVectorizer',
            'inputs': ['workclass', 'education'],
            'type': 'transform',
            'outputs': ['workclass', 'education'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'ColumnDropper',
            'type': 'transform',
            'schema_after': ['education', 'workclass', 'new_y'],
            'inputs': ['education', 'workclass', 'yy', 'new_y'],
            'outputs': ['education', 'workclass', 'new_y'],
        },
        {
            'name': 'FastLinearRegressor',
            'inputs': ['Feature:education,workclass', 'Label:new_y'],
            'type': 'regressor',
            'outputs': ['Score'],
            'schema_after': ['Score'],
        },
    ]

    if infos != expected:
        raise Exception(infos)
def test_syntax12_mixed2(self):
    """Mix constructor arguments and ``<<`` to assign feature/weight/label.

    After fitting, the last node must report the ``Feature``, ``y`` and
    ``weight`` columns, and prediction must yield a single ``Score`` column.
    """
    X = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'weight': [10., 1., 1., 1., 1.],
        'y': [1.1, 2.2, 1.24, 3.4, 3.4],
    })

    encoder = OneHotVectorizer(columns=['workclass', 'education'])
    concat = Concat(columns={'Feature': ['workclass', 'education']})
    regressor = FastTreesRegressor(
        num_trees=5, feature='Feature', weight='weight'
    ) << {Role.Label: 'y'}

    exp = Pipeline([encoder, concat, regressor])
    exp.fit(X, verbose=0)

    trained = exp.nodes[-1]
    assert trained.feature_column_ == 'Feature'
    assert trained.label_column_ == 'y'
    assert trained.weight_column_ == 'weight'

    # Prediction still requires the y and weight columns to be present.
    # They are overwritten with fake values here: the test does not fail,
    # but the weight is not taken into account.
    X['y'] = -5
    X['weight'] = -5

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
def test_syntax8_label(self):
    """Train on a label column (``new_y``) produced inside the pipeline.

    Checks the roles recorded on the last node and that the predicted
    scores stay within [0.4, 2.0].
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
    })
    X = df.drop('yy', axis=1)

    steps = [
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y',
        },
    ]
    exp = Pipeline(steps)
    exp.fit(df, verbose=0)

    trained = exp.nodes[-1]
    assert trained.feature_column_ == 'Features'
    assert trained.label_column_ == 'new_y'

    # The pipeline transforms 'yy' along the way, so the column has to be
    # present at prediction time too; a placeholder value is enough.
    X['yy'] = 0.0
    prediction = exp.predict(X, verbose=0)

    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

    scores = prediction['Score']
    if scores.min() < 0.4:
        raise Exception(prediction)
    if scores.max() > 2.00:
        raise Exception(prediction)
def nimbus_pred(model_path, test_set_path):
    """Load a saved model, predict on a CSV file and print the predictions.

    :param model_path: path handed to ``Pipeline.load_model``.
    :param test_set_path: path of the CSV file read with ``pd.read_csv``;
        its ``c`` column is converted to the pandas ``category`` dtype.
    """
    features = pd.read_csv(test_set_path)
    features['c'] = features['c'].astype("category")

    model = Pipeline()
    model.load_model(model_path)

    print(model.predict(features))
def test_syntax4_dict(self):
    """Rename transform outputs with dicts and concatenate them explicitly.

    The classifier is trained on the ``Inputs`` column built by ``Concat``
    and must return the three usual binary-classification columns.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)
    y = df['y']

    steps = [
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
        FastLinearBinaryClassifier(max_iterations=1) << 'Inputs',
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    output_columns = sorted(list(prediction.columns))
    assert output_columns == ['PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)
def test_ensemble_supports_get_fit_info(self):
    """Inspect the last ``get_fit_info`` entry of a VotingRegressor pipeline.

    The entry must describe the ensemble itself: its inputs, name,
    operator instance, outputs, resulting schema and type.
    """
    df = pd.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
    })

    # The same role mapping is applied to every ensemble member.
    col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}
    r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
    r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
    r3 = LightGbmRegressor(normalize="Yes") << col_info

    pipeline = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        ColumnDropper() << 'yy',
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average'),
    ])

    last_info_node = pipeline.get_fit_info(df)[0][-1]

    expected_inputs = ['Feature:education,workclass', 'Label:new_y']
    self.assertEqual(last_info_node['inputs'], expected_inputs)
    self.assertEqual(last_info_node['name'], 'VotingRegressor')
    self.assertTrue(
        isinstance(last_info_node['operator'], VotingRegressor))
    for key, wanted in (('outputs', ['Score']),
                        ('schema_after', ['Score']),
                        ('type', 'regressor')):
        self.assertEqual(last_info_node[key], wanted)
def test_syntax6_change_role(self):
    """Drop every column except the concatenated ``Features`` before training.

    REVIEW: the pipeline drops all columns but one, yet nimbusml still
    considers education and workclass to be the features. It does not
    detect on its own that the only remaining column should play that
    role (possibly because the label column is still present, even though
    the only remaining column without a role is Features). The learner is
    therefore pointed at ``Features`` explicitly.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)
    y = df['y']

    intermediate = ['f%d' % i for i in range(1, 4)]
    learner = FastLinearBinaryClassifier(maximum_number_of_iterations=1)
    exp = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': intermediate},
        Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
        learner << ['Features'],
    ])
    exp.fit(X, y, verbose=0)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    output_columns = sorted(list(prediction.columns))
    assert output_columns == ['PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)
def test_ensemble_supports_user_defined_transforms(self):
    """A VotingRegressor placed after a user transform averages its members.

    Three regressors are fitted and evaluated individually, then fresh
    instances of the same regressors are combined in a ``VotingRegressor``
    behind a ``RangeFilter`` on ``c1``. The pipeline prediction must contain
    three rows whose ``Score`` equals the mean of the individual predictions
    (to five decimal places).
    """
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    # DataFrame.append was removed in pandas 2.0. pd.concat is the supported
    # equivalent and, like append, keeps the original row labels.
    test2_df = pd.concat([test_df.copy(deep=True), extra_rows])

    # Reference predictions from each regressor on its own.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    # Unfitted copies of the same regressors for the ensemble.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)

    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average'),
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    self.assertEqual(len(result4), 3)

    for row in range(3):
        average = (result1[row] + result2[row] + result3[row]) / 3
        self.assertAlmostEqual(average, result4.loc[row, 'Score'], places=5)
def test_predictor_loaded_from_zip_has_feature_contributions(self):
    """Feature contributions must survive a save_model / load_model round trip.

    The model is saved to ``nimbusml_model.zip``, reloaded into an empty
    ``Pipeline``, and the contributions computed by the reloaded model are
    compared column by column with those of the original model.
    """
    features = ['age', 'education-num', 'hours-per-week']
    model_nimbusml = FastLinearBinaryClassifier(feature=features)
    model_nimbusml.fit(train, label)
    fc = model_nimbusml.get_feature_contributions(test)

    # Save the model to zip
    model_filename = 'nimbusml_model.zip'
    model_nimbusml.save_model(model_filename)

    try:
        # Load the model from zip
        model_nimbusml_zip = Pipeline()
        model_nimbusml_zip.load_model(model_filename)
        fc_zip = model_nimbusml_zip.get_feature_contributions(test)

        contribution_columns = [
            'FeatureContributions.' + feature for feature in features
        ]
        # all() is required here: asserting on the bare list would always
        # pass, because a non-empty list is truthy whatever it contains.
        assert all(
            column in fc_zip.columns for column in contribution_columns)
        assert all(
            fc[column].equals(fc_zip[column])
            for column in contribution_columns)
    finally:
        # Remove the saved model even when an assertion above fails.
        os.remove(model_filename)
def test_PcaTransformer_int(self):
    """Run PcaTransformer on the same columns cast to ``int`` and ``float``.

    The summed outputs must be almost equal for both casts, while the
    sorted lists of output dtypes must differ.
    """
    df_ = get_dataset("infert").as_df()
    totals = {}
    output_types = {}

    for ty in (int, float):
        X = ['age', 'parity', 'spontaneous', 'stratum']
        df = df_.copy()
        for column in X:
            df[column] = df[column].astype(ty)

        pipe = Pipeline([
            ColumnConcatenator() << {'X': X},
            PcaTransformer(rank=3) << 'X',
        ])
        y = pipe.fit_transform(df[X], verbose=0)

        totals[ty] = y.sum().sum()
        output_types[ty] = list(y.dtypes)

    first_total, second_total = list(totals.values())
    assert_almost_equal(first_total, second_total)

    first_types, second_types = list(output_types.values())
    first_types.sort()
    second_types.sort()
    assert first_types != second_types
def test_metrics_evaluate_clusterer(self):
    """Evaluate a two-cluster KMeansPlusPlus pipeline on the iris data set.

    The label is binarised (1 stays 1, everything else becomes 0), the data
    is split with ``train_test_split`` and the ``NMI`` and ``AvgMinScore``
    metrics returned by ``Pipeline.test`` are checked against reference
    values with a loose tolerance.
    """
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = train_test_split(
        df.loc[:, df.columns != 'Label'], df['Label'])

    lr = KMeansPlusPlus(n_clusters=2, initialization_algorithm="Random")
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame(), verbose=0)
    metrics, _ = e.test(X_test, y_test)

    expected_nmi = 0.7
    expected_avg_min_score = 0.032
    assert_almost_equal(
        metrics['NMI'][0], expected_nmi, decimal=0,
        err_msg="NMI loss should be %s" % expected_nmi)
    # The message reports the value actually checked; it used to quote a
    # stale 0.014 while the comparison was made against 0.032.
    assert_almost_equal(
        metrics['AvgMinScore'][0], expected_avg_min_score, decimal=2,
        err_msg="AvgMinScore should be %s" % expected_avg_min_score)
def test_syntax4_fail(self):
    """Passing several columns straight to a learner must fail with a hint.

    ``fit`` is expected to raise ``RuntimeError`` whose message suggests
    the ``ConcatTransform`` expression to use instead.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)
    y = df['y']

    learner = FastLinearBinaryClassifier(max_iterations=1)
    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        learner << ['edu1', 'edu2', 'wki'],
    ])

    try:
        exp.fit(X, y)
    except RuntimeError as err:
        hint = "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}"
        assert hint in str(err)
    else:
        # Fitting succeeded although an error was expected.
        assert False
def test_syntax10_weights(self):
    """Pass example weights to ``fit`` through the ``weight`` argument.

    Checks the roles recorded on the last node and that predictions fall
    in [1.0, 3.6] with at least four distinct values.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'weight': [1., 1., 1., 2., 1.],
        'y': [1.1, 2.2, 1.24, 3.4, 3.4],
    })
    X = df.drop(['y', 'weight'], axis=1)
    y = df['y']
    w = df['weight']

    encoder = OneHotVectorizer() << ['workclass', 'education']
    exp = Pipeline([encoder, FastLinearRegressor()])
    exp.fit(X, y, weight=w, verbose=0)

    trained = exp.nodes[-1]
    assert trained.feature_column == 'Features'
    assert trained.label_column == 'y'
    assert trained.weight_column == 'weight'

    # A weight column is supplied for prediction with a placeholder value.
    X['weight'] = -5
    prediction = exp.predict(X)

    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

    scores = prediction['Score']
    if scores.min() < 1.:
        raise Exception(prediction)
    if scores.max() > 3.6:
        raise Exception(prediction)
    if len(set(scores)) < 4:
        raise Exception(prediction)
def test_syntax4_fail2(self):
    """Referencing an unknown feature column must fail with a clear message.

    The learner is pointed at ``edu4``, which no step produces, so ``fit``
    is expected to raise an exception mentioning that column.
    """
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        FastLinearBinaryClassifier(max_iterations=1)
        << ['edu1', 'edu4', 'wki'],
    ])

    try:
        exp.fit(X, y)
    except Exception as e:
        assert "Feature column 'edu4' not found" in str(e)
    else:
        # Raised from the else branch so the broad handler above cannot
        # catch it and replace it with an unrelated message assertion.
        raise AssertionError("The test should not reach this line.")
def test_syntax7_rename(self):
    """Encode a string label, convert it to R4 and train on the result.

    Error messages are usually not informative enough here: a missing
    column gives no indication of the other available columns. The error
    this pipeline is meant to work around (one transform should handle it)
    is:
    'The label column 'y' of the training data has a data type
    not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
    Type must be Bool, R4, R8 or Key with two classes.
    """
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=['red', 'white', 'red', 'white', 'white']))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << 'y',
        OneHotVectorizer() << ['workclass', 'education'],
        TypeConverter(result_type='R4') << {'yi': 'y'},
        Drop() << 'y',
        FastLinearBinaryClassifier(max_iterations=1) << 'yi',
    ])
    exp.fit(X, y, verbose=0)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

    # Reduce the Score column, not the whole frame: DataFrame.min() returns
    # a Series, and pandas refuses to take the truth value of a Series.
    scores = prediction['Score']
    assert scores.min() > 0.01
    assert scores.max() < 0.05
def test_syntax5_regular_expression(self):
    """Select the columns to concatenate with the pattern ``f[0-9]+``.

    REVIEW: not implemented yet. The best would be to handle regular
    expressions inside nimbusml. It could be handled in entrypoint.py just
    before calling nimbusml, or inside Pipeline if it is aware of the
    input schema.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)
    y = df['y']

    steps = [
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': 'f[0-9]+'},
        FastLinearBinaryClassifier(max_iterations=1) << 'Features',
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    output_columns = sorted(list(prediction.columns))
    assert output_columns == ['PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)
def test_unpickled_pipeline_has_feature_contributions(self):
    """Feature contributions must survive a pickle round trip of a Pipeline.

    The fitted pipeline is pickled to a temporary file, unpickled, and the
    contributions computed by the unpickled pipeline are compared column by
    column with those of the original pipeline.
    """
    features = ['age', 'education-num', 'hours-per-week']
    model_nimbusml = Pipeline(
        steps=[FastLinearBinaryClassifier(feature=features)])
    model_nimbusml.fit(train, label)
    fc = model_nimbusml.get_feature_contributions(test)

    pickle_filename = get_temp_file(suffix='.p')
    try:
        # Save with pickle
        with open(pickle_filename, 'wb') as f:
            pickle.dump(model_nimbusml, f)

        # Unpickle model (the file was written by this test just above).
        with open(pickle_filename, "rb") as f:
            model_nimbusml_pickle = pickle.load(f)

        fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

        contribution_columns = [
            'FeatureContributions.' + feature for feature in features
        ]
        # all() is required here: asserting on the bare list would always
        # pass, because a non-empty list is truthy whatever it contains.
        assert all(
            column in fc_pickle.columns for column in contribution_columns)
        assert all(
            fc[column].equals(fc_pickle[column])
            for column in contribution_columns)
    finally:
        # Remove the pickle file even when an assertion above fails.
        os.remove(pickle_filename)
def test_syntax6_regular_expression(self):
    """Concatenate f1..f3 into ``Features`` and pass ``'~Features'`` to Drop.

    The learner gets no explicit column assignment and must still produce
    the three usual binary-classification output columns.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)
    y = df['y']

    intermediate = ['f%d' % i for i in range(1, 4)]
    steps = [
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': intermediate},
        Drop() << '~Features',
        FastLinearBinaryClassifier(max_iterations=1),
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    output_columns = sorted(list(prediction.columns))
    assert output_columns == ['PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)
def test_pipeline_pca(self):
    """Inspect ``get_fit_info`` for a PcaTransformer fed a numpy array.

    The ``operator`` key is stripped from every entry, the generated
    temporary column name of the last entry is overwritten, and the result
    is compared entry by entry with a hand-written description.
    """
    X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
    pipeline = Pipeline([PcaTransformer(rank=2)])

    infos = pipeline.get_fit_info(X)[0]
    for inf in infos:
        if 'operator' in inf:
            del inf['operator']

    start_entry = {
        'name': None,
        'schema_after': ['F0', 'F1', 'F2'],
        'type': 'start',
        'outputs': ['F0', 'F1', 'F2'],
    }
    converter_entry = {
        'name': 'TypeConverter',
        'inputs': ['F0', 'F1', 'F2'],
        'type': 'transform',
        'outputs': ['F0', 'F1', 'F2'],
        'schema_after': ['F0', 'F1', 'F2'],
    }
    pca_entry = {
        'name': 'PcaTransformer',
        'inputs': ['temp_'],
        'type': 'transform',
        'outputs': ['temp_'],
        'schema_after': ['F0', 'F1', 'F2', 'temp_'],
    }
    expected = [start_entry, converter_entry, pca_entry]

    # The temporary column id depends on id(node) and differs at each
    # execution, so the three places where it appears are overwritten.
    last_entry = infos[-1]
    last_entry["inputs"] = ["temp_"]
    last_entry["outputs"] = ["temp_"]
    last_entry["schema_after"][-1] = ["temp_"]

    # NOTE(review): the line above stores a list where the expectation has
    # the string 'temp_', and the assertion below only requires that some
    # entry differs. That looks unintended (an equality check seems more
    # likely) -- confirm against the real get_fit_info output.
    self.assertTrue(any(x != y for x, y in zip(expected, infos)))
def test_syntax11_learner(self):
    """Assign features and label to the learner with a single role dict.

    The pipeline is fitted on the full frame (label included) and predicts
    on the frame without the label column.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)

    roles = {'Features': ['edu1', 'edu2'], Role.Label: 'y'}
    steps = [
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        FastLinearBinaryClassifier(max_iterations=1) << roles,
    ]
    exp = Pipeline(steps)
    exp.fit(df)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    output_columns = sorted(list(prediction.columns))
    assert output_columns == ['PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)
def test_datetime_column_parsed_from_string(self):
    """A datetime column parsed from CSV strings must pass through unchanged.

    Two dates are written to a temporary CSV file, read back with
    ``parse_dates`` and sent through a pipeline whose only step works on
    ``c2``. Column ``c1`` of the result must still hold the same dates
    with the ``datetime64[ns]`` dtype.
    """
    dates = ["2018-01-02", "2018-02-01"]
    df = pd.DataFrame({'c1': dates, 'c2': [3, 4]})

    file_name = get_temp_file('.csv')
    try:
        df.to_csv(file_name)
        df = pd.read_csv(file_name, parse_dates=['c1'], index_col=0)

        # .iloc makes the positional lookup explicit; plain [0] on the
        # label-indexed dtypes Series is deprecated in recent pandas.
        self.assertEqual(df.dtypes.iloc[0], np.dtype('datetime64[ns]'))

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        # (year, month, day) expected on each row; time parts must be zero.
        expected_dates = [(2018, 1, 2), (2018, 2, 1)]
        for row, (year, month, day) in enumerate(expected_dates):
            stamp = result.loc[row, 'c1']
            self.assertEqual(stamp.year, year)
            self.assertEqual(stamp.month, month)
            self.assertEqual(stamp.day, day)
            self.assertEqual(stamp.hour, 0)
            self.assertEqual(stamp.minute, 0)
            self.assertEqual(stamp.second, 0)

        self.assertEqual(len(result), 2)
        self.assertEqual(result.dtypes.iloc[0], np.dtype('datetime64[ns]'))
    finally:
        # Remove the temporary CSV even when an assertion above fails.
        os.remove(file_name)
def test_syntax3(self):
    """Mix a renamed output (``edu1``) with in-place encoded columns.

    The learner gets no explicit column assignment and must return the
    three usual binary-classification output columns.
    """
    df = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    X = df.drop('y', axis=1)
    y = df['y']

    steps = [
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << 'education',
        OneHotVectorizer(max_num_terms=2) << 'workclass',
        # Currently the learner does not use edu1 unless it is specified
        # explicitly, so nimbusml does not do what the syntax implicitly
        # suggests. Either the bridge has to be changed to look into every
        # column available at a given step, or edu1 must be named.
        FastLinearBinaryClassifier(max_iterations=1),
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    output_columns = sorted(list(prediction.columns))
    assert output_columns == ['PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)
def test_model_summary_not_supported(self):
    """``summary`` must raise ``TypeError`` for each unsupported learner.

    Every learner in ``learners_not_supported`` is fitted behind a one-hot
    encoder on the training file before ``summary`` is called.
    """
    for learner in learners_not_supported:
        encoder = OneHotVectorizer() << categorical_columns
        pipeline = Pipeline([encoder, learner])

        train_stream = FileDataStream(train_file, schema=file_schema)
        pipeline.fit(train_stream, label_column)

        assert_raises(TypeError, pipeline.summary)
def test_metrics_evaluate_regressor(self):
    """Evaluate a FastTreesRegressor pipeline on the binarised iris label.

    The L1, L2 and loss-function averages returned by ``Pipeline.test``
    are compared with reference values to one decimal place.
    """
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]

    feature_frame = df.loc[:, df.columns != 'Label']
    X_train, X_test, y_train, y_test = train_test_split(
        feature_frame, df['Label'])

    lr = FastTreesRegressor()
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame(), verbose=0)
    metrics, _ = e.test(X_test, y_test)

    # TODO: debug the fluctuations and increase the decimal precision of
    # these checks.
    reference = (
        ('L1(avg)', 0.107, "L1 loss should be %s"),
        ('L2(avg)', 0.0453, "L2(avg) should be %s"),
        ('Loss-fn(avg)', 0.0453, "Loss-fn(avg)loss should be %s"),
    )
    for metric_name, value, template in reference:
        assert_almost_equal(
            metrics[metric_name][0], value, decimal=1,
            err_msg=template % value)
def test_globalcontrastrowscaler(self):
    """Concatenate three columns, apply GlobalContrastRowScaler, check the sum.

    The sum over the concatenated and the normalised output columns must
    fall strictly inside a narrow reference interval.
    """
    in_df = pd.DataFrame(
        data=dict(Sepal_Length=[2.5, 1, 2.1, 1.0],
                  Sepal_Width=[.75, .9, .8, .76],
                  Petal_Length=[0, 2.5, 2.6, 2.4],
                  Species=["setosa", "viginica", "setosa", 'versicolor']))
    in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float32)

    # Gather the three numeric columns into one column, 'concated_columns'.
    concat = ColumnConcatenator() << {
        'concated_columns': ['Petal_Length', 'Sepal_Width', 'Sepal_Length']
    }

    # Performs a global contrast normalization on input values:
    # Y = (s * X - M) / D, where s is a scale, M is mean and D is either
    # L2 norm or standard deviation
    normed = GlobalContrastRowScaler() << {
        'normed_columns': 'concated_columns'
    }

    pipeline = Pipeline([concat, normed])
    out_df = pipeline.fit_transform(in_df)

    slots = ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
    cols = ['concated_columns.' + s for s in slots]
    cols.extend(['normed_columns.' + s for s in slots])

    # Named 'total' rather than 'sum' to avoid shadowing the builtin.
    total = out_df[cols].sum().sum()
    # Messages are built from the bounds themselves so they cannot drift
    # from the values checked (the upper message used to say 17.31).
    total_range = (17.309, 17.3102)
    assert_greater(total, total_range[0],
                   "sum should be greater than %s" % total_range[0])
    assert_less(total, total_range[1],
                "sum should be less than %s" % total_range[1])
def test_lpscaler_automatically_converts_to_single(self):
    """Feed float64 columns through ColumnConcatenator and LpScaler.

    The sum over the concatenated and the scaled output columns must fall
    strictly inside a narrow reference interval.
    """
    in_df = pd.DataFrame(data={
        'Sepal_Length': [2.5, 1, 2.1, 1.0],
        'Sepal_Width': [.75, .9, .8, .76],
        'Petal_Length': [0, 2.5, 2.6, 2.4],
        'Species': ["setosa", "viginica", "setosa", 'versicolor'],
    })
    in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float64)

    src_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
    concat_step = ColumnConcatenator() << {'concat': src_cols}
    scale_step = LpScaler() << {'norm': 'concat'}
    pipeline = Pipeline([concat_step, scale_step])
    out_df = pipeline.fit_transform(in_df)

    # Output slots are named '<column>.<source column>'.
    cols = ['concat.' + s for s in src_cols]
    cols.extend(['norm.' + s for s in src_cols])

    total = out_df[cols].sum().sum()
    lower, upper = (23.24, 23.25)
    assert_greater(total, lower, "sum should be greater than %s" % lower)
    assert_less(total, upper, "sum should be less than %s" % upper)