def test_passing_in_a_single_transform_returns_new_pipeline(self):
    """Combining a single fitted transform must yield a usable Pipeline.

    A OneHotVectorizer bound to 'c0' is fitted on ``train_df`` and handed
    alone to ``Pipeline.combine_models``. The combined object must
    transform ``test_df`` into 3 rows and 4 columns, the first two of
    which are named with the 'c0.' prefix, and must itself be a
    ``Pipeline`` instance.
    """
    transform = OneHotVectorizer() << 'c0'
    transform.fit(train_df)

    combined_pipeline = Pipeline.combine_models(
        transform, contains_predictor=False)
    result = combined_pipeline.transform(test_df)

    self.assertEqual(len(result), 3)
    self.assertEqual(len(result.columns), 4)
    self.assertTrue(result.columns[0].startswith('c0.'))
    self.assertTrue(result.columns[1].startswith('c0.'))
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(combined_pipeline, Pipeline)
def test_combine_transform_and_transform(self):
    """Two chained transforms and their combined pipeline must agree.

    A RangeFilter on 'c2' and a OneHotVectorizer on 'c0' are fitted in
    sequence, applied one after the other to ``test_df``, and the result
    is compared with what the pipeline built by
    ``Pipeline.combine_models`` produces for the same input.
    """
    # Fit the filter first; its output is what the encoder is fitted on.
    range_filter = RangeFilter(min=0.0, max=4.5) << 'c2'
    filtered = range_filter.fit_transform(train_df)

    one_hot = OneHotVectorizer() << 'c0'
    one_hot.fit(filtered)

    # Reference result: apply the two fitted transforms by hand.
    filtered = range_filter.transform(test_df)
    expected = one_hot.transform(filtered)

    # Result under test: the same two transforms merged into one pipeline.
    merged = Pipeline.combine_models(
        range_filter, one_hot, contains_predictor=False)
    actual = merged.transform(test_df)

    self.assertTrue(expected.equals(actual))
def test_sparse_vector_column_combined_with_single_value_columns(self):
    """CSR output must match dense output when a scalar column is kept.

    Only 'c1' is one-hot encoded; the float32 column 'c0' is passed
    alongside it. The transform is run once with default output and once
    with ``as_csr=True``; the densified sparse result, relabelled with
    the columns ['c0', 'c1.a', 'c1.b'], must equal the default result.
    """
    train_data = {'c0': [0, 1, 0, 3], 'c1': ['a', 'b', 'a', 'b']}
    train_df = pd.DataFrame(train_data).astype({'c0': np.float32})

    xf = OneHotVectorizer(columns={'c1': 'c1'})
    xf.fit(train_df)

    expected_result = xf.transform(train_df)
    self.assertIsInstance(expected_result, pd.DataFrame)

    result = xf.transform(train_df, as_csr=True)
    # Check the type before touching csr-specific attributes so that a
    # wrong return type fails as an assertion, not an AttributeError.
    self.assertIsInstance(result, csr_matrix)
    # NOTE(review): presumably 2 non-zero 'c0' values plus one indicator
    # per row (4 rows) - confirm against the encoder's output layout.
    self.assertEqual(result.nnz, 6)

    result = pd.DataFrame(result.todense(), columns=['c0', 'c1.a', 'c1.b'])
    self.assertTrue(result.equals(expected_result))
def test_sparse_vector_column(self):
    """CSR output must match dense output for two encoded columns.

    Both string columns 'c0' and 'c1' are one-hot encoded. The transform
    is run once with default output and once with ``as_csr=True``; the
    densified sparse result, relabelled with the columns
    ['c0.a', 'c0.b', 'c1.c', 'c1.d'], must equal the default result.
    """
    train_data = {'c0': ['a', 'b', 'a', 'b'], 'c1': ['c', 'd', 'd', 'c']}
    train_df = pd.DataFrame(train_data)

    xf = OneHotVectorizer(columns={'c0': 'c0', 'c1': 'c1'})
    xf.fit(train_df)

    expected_result = xf.transform(train_df)
    self.assertIsInstance(expected_result, pd.DataFrame)

    result = xf.transform(train_df, as_csr=True)
    # Check the type before touching csr-specific attributes so that a
    # wrong return type fails as an assertion, not an AttributeError.
    self.assertIsInstance(result, csr_matrix)
    # NOTE(review): presumably one indicator per encoded column per row
    # (2 columns x 4 rows) - confirm against the encoder's output layout.
    self.assertEqual(result.nnz, 8)

    result = pd.DataFrame(
        result.todense(), columns=['c0.a', 'c0.b', 'c1.c', 'c1.d'])
    self.assertTrue(result.equals(expected_result))
def test_fit_transform(self):
    """``fit_transform`` must equal ``fit`` followed by ``transform``.

    The 'infert' dataset is loaded as a FileDataStream and three of its
    columns are one-hot encoded under new output names; both call styles
    must produce equal frames.
    """
    # Input is read through a FileDataStream rather than a DataFrame.
    infert_path = get_dataset('infert').as_filepath()
    stream = FileDataStream.read_csv(infert_path)

    # Output column name -> source column name.
    column_map = {
        'edu': 'education',
        'in': 'induced',
        'sp': 'spontaneous',
    }
    encoder = OneHotVectorizer(columns=column_map)

    # One-step call first, then the two-step equivalent on the same object.
    one_step = encoder.fit_transform(stream)
    two_step = encoder.fit(stream).transform(stream)

    assert_frame_equal(one_step, two_step)
def test_syntax4_passing(self):
    """Check the ``<< {output: [input]}`` column syntax on OneHotVectorizer.

    The encoder is bound with ``{'edu1': ['education']}``, fitted on the
    feature frame from ``get_simple_df`` and applied to it; the result
    must have shape (5, 5).
    """
    # Only the feature frame (second item) is needed here.
    _, features, _ = self.get_simple_df()

    encoder = OneHotVectorizer() << {'edu1': ['education']}
    encoder.fit(features)
    encoded = encoder.transform(features)

    expected_shape = (5, 5)
    assert encoded.shape == expected_shape