def test_combined_models_support_decision_function(self):
    """Decision-function scores from a combined pipeline must match the
    scores produced by running the transform and predictor separately."""
    path = get_dataset('infert').as_filepath()

    # Fit the transform and the predictor as two separate IDV steps.
    stream = FileDataStream.read_csv(path)
    onehot = OneHotVectorizer(columns={'edu': 'education'})
    idv = onehot.fit_transform(stream, as_binary_data_stream=True)

    features = ['parity', 'edu', 'age', 'induced',
                'spontaneous', 'stratum', 'pooled.stratum']
    model = LogisticRegressionBinaryClassifier(feature=features,
                                               label='case')
    model.fit(idv)

    # Score via the two-step path.
    stream = FileDataStream.read_csv(path)
    idv = onehot.transform(stream, as_binary_data_stream=True)
    scores_separate = model.decision_function(idv)

    # Score via the combined pipeline.
    stream = FileDataStream.read_csv(path)
    combined = Pipeline.combine_models(onehot, model)
    scores_combined = combined.decision_function(stream)

    self.assertTrue(np.array_equal(scores_separate, scores_combined))
def test_sparse_vector_column_combined_with_single_value_columns(self):
    """A CSR transform of a one-hot (sparse vector) column combined with
    a plain numeric column must match the dense DataFrame output.

    Fix: replaces ``type(x) == T`` comparisons with ``assertIsInstance``,
    which is the idiomatic type check and gives clearer failure messages.
    """
    train_data = {'c0': [0, 1, 0, 3], 'c1': ['a', 'b', 'a', 'b']}
    train_df = pd.DataFrame(train_data).astype({'c0': np.float32})

    xf = OneHotVectorizer(columns={'c1': 'c1'})
    xf.fit(train_df)

    # Dense reference output.
    expected_result = xf.transform(train_df)
    self.assertIsInstance(expected_result, pd.DataFrame)

    result = xf.transform(train_df, as_csr=True)
    # 2 stored values from 'c0' (its two zeros are not stored) plus one
    # one-hot indicator per row for 'c1' (4) gives 6 non-zeros.
    self.assertEqual(result.nnz, 6)
    self.assertIsInstance(result, csr_matrix)

    # Densify and compare against the DataFrame path.
    result = pd.DataFrame(result.todense(),
                          columns=['c0', 'c1.a', 'c1.b'])
    self.assertTrue(result.equals(expected_result))
def test_combine_with_classifier_trained_with_filedatastream(self):
    """A pipeline combined from a separately trained transform and
    classifier must predict the same labels as the two-step approach."""
    path = get_dataset('infert').as_filepath()

    stream = FileDataStream.read_csv(path)
    onehot = OneHotVectorizer(columns={'edu': 'education'})
    idv = onehot.fit_transform(stream, as_binary_data_stream=True)

    features = ['parity', 'edu', 'age', 'induced',
                'spontaneous', 'stratum', 'pooled.stratum']
    classifier = LogisticRegressionBinaryClassifier(feature=features,
                                                    label='case')
    classifier.fit(idv)

    # Two-step prediction.
    stream = FileDataStream.read_csv(path)
    idv = onehot.transform(stream, as_binary_data_stream=True)
    predicted_separate = classifier.predict(idv)

    # Combined-pipeline prediction.
    stream = FileDataStream.read_csv(path)
    combined = Pipeline.combine_models(onehot, classifier)
    predicted_combined = combined.predict(stream)

    # Normalize dtypes before comparing the two result series.
    predicted_separate = predicted_separate.astype(np.int32)
    predicted_combined = predicted_combined['PredictedLabel'].astype(
        np.int32)
    self.assertTrue(predicted_separate.equals(predicted_combined))
def test_combine_with_classifier_trained_with_y_arg(self):
    """
    Tests a sequence where the initial transform is computed
    using both X and y input args. Note, any steps after the
    initial transform will be operating on data where the X
    and y have been combined into one dataset.
    """
    np.random.seed(0)
    df = get_dataset("infert").as_df()
    X = df.loc[:, df.columns != 'case']
    y = df['case']

    encoder = OneHotVectorizer() << 'education_str'
    # Fit with both X and y so the label is joined into the IDV output.
    idv = encoder.fit_transform(X, y, as_binary_data_stream=True)

    # The label column must be named explicitly because the feature and
    # label data were joined in the previous step.
    classifier = LogisticRegressionBinaryClassifier(label='case',
                                                    feature=list(X.columns))
    classifier.fit(idv)

    idv = encoder.transform(X, as_binary_data_stream=True)
    result_1 = classifier.predict(idv)

    # Combine the models and perform a prediction
    combined_pipeline = Pipeline.combine_models(encoder, classifier)
    result_2 = combined_pipeline.predict(X)

    result_2 = result_2['PredictedLabel'].astype(np.float64)
    self.assertTrue(result_1.equals(result_2))
def test_sparse_vector_column(self):
    """One-hot encoding two string columns and transforming to CSR must
    produce the same values as the dense DataFrame output.

    Fix: replaces ``type(x) == T`` comparisons with ``assertIsInstance``,
    which is the idiomatic type check and gives clearer failure messages.
    """
    train_data = {'c0': ['a', 'b', 'a', 'b'],
                  'c1': ['c', 'd', 'd', 'c']}
    train_df = pd.DataFrame(train_data)

    xf = OneHotVectorizer(columns={'c0': 'c0', 'c1': 'c1'})
    xf.fit(train_df)

    # Dense reference output.
    expected_result = xf.transform(train_df)
    self.assertIsInstance(expected_result, pd.DataFrame)

    result = xf.transform(train_df, as_csr=True)
    # One non-zero indicator per column per row: 4 rows * 2 columns.
    self.assertEqual(result.nnz, 8)
    self.assertIsInstance(result, csr_matrix)

    # Densify and compare against the DataFrame path.
    result = pd.DataFrame(result.todense(),
                          columns=['c0.a', 'c0.b', 'c1.c', 'c1.d'])
    self.assertTrue(result.equals(expected_result))
def test_combine_transform_and_predictor(self):
    """Combining a fitted transform with a fitted predictor must yield
    the same scores as chaining the two models manually."""
    encoder = OneHotVectorizer() << 'c0'
    idv = encoder.fit_transform(train_df, as_binary_data_stream=True)

    regressor = OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
    regressor.fit(idv)

    # Two-step scoring path.
    idv = encoder.transform(test_df, as_binary_data_stream=True)
    scores_separate = regressor.predict(idv)

    # Combined-pipeline scoring path.
    combined = Pipeline.combine_models(encoder, regressor)
    scores_combined = combined.predict(test_df)

    self.assertEqual(scores_separate[0], scores_combined.loc[0, 'Score'])
    self.assertEqual(scores_separate[1], scores_combined.loc[1, 'Score'])
def test_combine_transform_and_transform(self):
    """Two transforms combined into one pipeline must produce the same
    output as applying them one after the other."""
    row_filter = RangeFilter(min=0.0, max=4.5) << 'c2'
    filtered = row_filter.fit_transform(train_df)

    encoder = OneHotVectorizer() << 'c0'
    encoder.fit(filtered)

    # Manual chaining of the two fitted transforms.
    filtered = row_filter.transform(test_df)
    expected = encoder.transform(filtered)

    # Combined pipeline; no predictor is present.
    combined = Pipeline.combine_models(row_filter,
                                       encoder,
                                       contains_predictor=False)
    actual = combined.transform(test_df)

    self.assertTrue(expected.equals(actual))
def test_fit_predictor_with_idv(self):
    """A predictor pipeline trained on the BinaryDataStream produced by
    a transform pipeline must score the same as the equivalent plain
    transform + predictor chain."""
    train_df = pd.DataFrame({
        'c0': ['a', 'b', 'a', 'b'],
        'c1': [1, 2, 3, 4],
        'c2': [2, 3, 4, 5]
    }).astype({'c1': np.float64, 'c2': np.float64})

    test_df = pd.DataFrame({
        'c0': ['a', 'b', 'b'],
        'c1': [1.5, 2.3, 3.7],
        'c2': [2.2, 4.9, 2.7]
    }).astype({'c1': np.float64, 'c2': np.float64})

    # Fit a transform pipeline to the training data.
    transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'])
    transform_pipeline.fit(train_df)
    idv = transform_pipeline.transform(train_df,
                                       as_binary_data_stream=True)

    # Fit a predictor pipeline given a transformed BinaryDataStream.
    predictor_pipeline = Pipeline([
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ])
    predictor_pipeline.fit(idv)

    # Score the test data through the IDV-based pipelines.
    idv = transform_pipeline.transform(test_df,
                                       as_binary_data_stream=True)
    result_1 = predictor_pipeline.predict(idv)

    # Build the expected result with plain (non-IDV) fit/transform.
    encoder = OneHotVectorizer() << 'c0'
    encoded = encoder.fit_transform(train_df)
    regressor = OnlineGradientDescentRegressor(
        label='c2', feature=['c0.a', 'c0.b', 'c1'])
    regressor.fit(encoded)
    expected_result = regressor.predict(encoder.transform(test_df))

    self.assertTrue(result_1.loc[:, 'Score'].equals(expected_result))
def test_combine_with_classifier_trained_with_joined_X_and_y(self):
    """A combined pipeline must reproduce the predictions of a transform
    and classifier trained on a single joined (X + y) dataset."""
    np.random.seed(0)
    infert_df = get_dataset("infert").as_df()
    feature_cols = [c for c in infert_df.columns if c != 'case']

    encoder = OneHotVectorizer() << 'education_str'
    idv = encoder.fit_transform(infert_df, as_binary_data_stream=True)

    classifier = LogisticRegressionBinaryClassifier(label='case',
                                                    feature=feature_cols)
    classifier.fit(idv)

    # Two-step prediction.
    idv = encoder.transform(infert_df, as_binary_data_stream=True)
    expected = classifier.predict(idv)

    # Combine the models and perform a prediction
    combined = Pipeline.combine_models(encoder, classifier)
    actual = combined.predict(infert_df)

    actual = actual['PredictedLabel'].astype(np.float64)
    self.assertTrue(expected.equals(actual))
def test_syntax4_passing(self):
    """One-hot encoding 'education' into 'edu1' keeps the row count and
    expands the frame to the expected number of columns.

    Fix: uses ``self.assertEqual`` instead of a bare ``assert`` — bare
    asserts are stripped under ``python -O`` and are inconsistent with
    the ``self.assert*`` convention used by the rest of this file.
    """
    df, X, y = self.get_simple_df()
    vec = OneHotVectorizer() << {'edu1': ['education']}
    vec.fit(X)
    res = vec.transform(X)
    self.assertEqual(res.shape, (5, 5))