    def test_syntax12_mixed2(self):
        X = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                  workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                  weight=[10., 1., 1., 1., 1.],
                                  y=[1.1, 2.2, 1.24, 3.4, 3.4]))
        exp = Pipeline([
            OneHotVectorizer(columns=['workclass', 'education']),
            Concat(columns={'Feature': ['workclass', 'education']}),
            FastTreesRegressor(num_trees=5, feature='Feature',
                               weight='weight') << {Role.Label: 'y'}
        ])
        exp.fit(X, verbose=0)
        assert exp.nodes[-1].feature_column_ == 'Feature'
        assert exp.nodes[-1].label_column_ == 'y'
        assert exp.nodes[-1].weight_column_ == 'weight'
        # y is required here as well as weight.
        # Both are replaced by fake values at prediction time.
        # The test does not fail, but the weight is not taken into account.
        X['y'] = -5
        X['weight'] = -5
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)

    def test_syntax6_regular_expression(self):
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']
        exp = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
            Drop() << '~Features',
            FastLinearBinaryClassifier(max_iterations=1)
        ])
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)

    def test_syntax5_regular_expression(self):
        # REVIEW: not implemented yet.
        # The best would be to handle regular expressions inside nimbusml.
        # The expansion could be done in entrypoint.py just before calling
        # nimbusml, or inside Pipeline if it were aware of the input schema.
        # (A plain-Python expansion of the pattern is sketched in the
        # next method.)
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']
        exp = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': 'f[0-9]+'},
            FastLinearBinaryClassifier(max_iterations=1) << 'Features'
        ])
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)

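    # The sketch below is not part of the original suite. It only
    # illustrates how the unsupported 'f[0-9]+' pattern used above could
    # be expanded with the standard `re` module against the intermediate
    # column names before building the Concat. The column list is a
    # hypothetical stand-in for the pipeline's schema.
    def test_syntax5_regular_expression_expansion_sketch(self):
        import re
        # Hypothetical intermediate schema after the three vectorizers.
        produced = ['education', 'workclass', 'f1', 'f2', 'f3']
        pattern = re.compile('f[0-9]+')
        selected = [c for c in produced if pattern.fullmatch(c)]
        # The expanded list is what Concat would receive explicitly,
        # instead of the unsupported pattern.
        assert selected == ['f1', 'f2', 'f3']
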
    def test_syntax4_dict(self):
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']
        exp = Pipeline([
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
            FastLinearBinaryClassifier(max_iterations=1) << 'Inputs'
        ])
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)

    def test_syntax6_change_role(self):
        # REVIEW: the pipeline drops all columns but one -->
        # nimbusml still thinks the Features are education, workclass
        # and does not automatically detect that the only remaining
        # column should play that role
        # (maybe because the label column is still there even though
        # the only remaining column without a role is Features).
        # (This point is sketched in the next method.)
        df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                   workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                   y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']
        exp = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << ['Features']
        ])
        exp.fit(X, y, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)

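    # The sketch below is not part of the original suite. It illustrates
    # the REVIEW note above: after the Drop step only one column is left
    # without a role, so it could in principle be detected as the feature
    # column automatically; since nimbusml does not do that, the test
    # names 'Features' explicitly. The column lists are hypothetical
    # stand-ins for the pipeline's schema.
    def test_syntax6_change_role_remaining_column_sketch(self):
        produced = ['education', 'workclass', 'f1', 'f2', 'f3',
                    'Features', 'y']
        dropped = ['education', 'workclass', 'f1', 'f2', 'f3']
        with_role = ['y']  # the label already has a role
        remaining = [c for c in produced
                     if c not in dropped and c not in with_role]
        # Only 'Features' is left, which is the column passed explicitly
        # to the learner in test_syntax6_change_role.
        assert remaining == ['Features']
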
    def test_syntax_concat_slots(self):
        X = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                  workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                  weight=[10., 1., 1., 1., 1.],
                                  y=[1.1, 2.2, 1.24, 3.4, 3.4]))
        exp = Pipeline([
            OneHotVectorizer() << ['workclass', 'education'],
            Concat() << {'newcol': ['workclass', 'education']},
        ])
        exp.fit(X, verbose=0)
        exp.predict(X)

    def test_syntax12_fail(self):
        # This test checks that a learner raises an exception
        # if a role is not allowed by the entrypoint.
        X = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                  workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                  weight=[10., 1., 1., 1., 1.],
                                  y=[1.1, 2.2, 1.24, 3.4, 3.4]))
        try:
            pipe = Pipeline([
                OneHotVectorizer(columns=['workclass', 'education']),
                Concat(columns={'Feature': ['workclass', 'education']}),
                FastLinearBinaryClassifier(
                    feature='Feature',
                    group_id='weight') << {Role.Label: 'y'}
            ])
            # fit must be called on the pipeline instance,
            # not on the Pipeline class itself.
            pipe.fit(X)
            assert False
        except (RuntimeError, NameError) as e:
            exp = "Parameter 'group_id' is not allowed " \
                  "for class 'FastLinearBinaryClassifier'"
            if exp not in str(e):
                raise e

    def test_syntax12_group(self):
        # This test checks that the group_id role is accepted by the
        # learner and mapped to the expected column attribute.
        X = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 gr=[0, 0, 1, 1, 1],
                 y=[1.1, 2.2, 1.24, 3.4, 3.4]))
        exp = Pipeline([
            OneHotVectorizer(columns=['workclass', 'education']),
            Concat(columns={'Feature': ['workclass', 'education']}),
            ToKey() << 'gr',
            FastTreesRegressor(
                number_of_trees=5, feature='Feature',
                group_id='gr') << {Role.Label: 'y'}
        ])
        exp.fit(X, verbose=0)
        assert not hasattr(exp.nodes[-1], 'feature_')
        assert not hasattr(exp.nodes[-1], 'group_id_')
        assert exp.nodes[-1].feature_column_name_ == 'Feature'
        assert exp.nodes[-1].label_column_name_ == 'y'
        # assert not hasattr(exp.nodes[-1], 'row_group_column_name_')
        assert not hasattr(exp.nodes[-1], 'group_id_column')
        assert not hasattr(exp.nodes[-1], 'groupid_column_')
        assert not hasattr(exp.nodes[-1], 'groupid_column')
        if not hasattr(exp.nodes[-1], 'row_group_column_name_'):
            raise AssertionError("Attribute not found: {0}".format(
                ", ".join(sorted(dir(exp.nodes[-1])))))
        assert exp.nodes[-1].row_group_column_name_ == 'gr'
        # y is required here as well as weight.
        # Both are replaced by fake values at prediction time.
        # The test does not fail, but the weight is not taken into account.
        X['y'] = -5
        X['weight'] = -5
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)