示例#1
0
    def test_init(self):
        """Check that a ``Map`` can be constructed from two ``replace`` expressions.

        Only construction is exercised: the resulting operator is discarded
        without being fitted or applied.
        """
        gender_lookup = {"m": "Male", "f": "Female"}
        state_lookup = {"NY": "New York", "CA": "California"}
        replacements = [
            replace(it.gender, gender_lookup),
            replace(it.state, state_lookup),
        ]
        # Bind to ``_`` to make explicit that the instance itself is unused.
        _ = Map(columns=replacements)
示例#2
0
 def test_transform_replace_list(self):
     """Fit and apply a ``Map`` of two ``replace`` expressions to a pandas frame.

     The input has three columns (``gender``, ``state``, ``status``) and five
     rows. The test asserts that the transformed frame has shape ``(5, 3)``
     and that the first ``gender`` and ``state`` entries were replaced by
     their long forms.
     """
     frame = pd.DataFrame(
         data={
             "gender": ["m", "f", "m", "m", "f"],
             "state": ["NY", "NY", "CA", "NY", "CA"],
             "status": [0, 1, 1, 0, 1],
         }
     )
     gender_lookup = {"m": "Male", "f": "Female"}
     state_lookup = {"NY": "New York", "CA": "California"}
     replacements = [
         replace(it.gender, gender_lookup),
         replace(it.state, state_lookup),
     ]
     # Fit and transform on the same frame.
     result = Map(columns=replacements).fit(frame).transform(frame)
     self.assertEqual(result.shape, (5, 3))
     self.assertEqual(result["gender"][0], "Male")
     self.assertEqual(result["state"][0], "New York")
示例#3
0
    def test_with_hyperopt(self):
        """Run ``Hyperopt`` over a ``Relational >> LogisticRegression`` pipeline.

        The pipeline wraps two ``Scan`` operators (tables ``main`` and
        ``delay``) combined with ``&`` and piped into a ``Map`` of two
        ``replace`` expressions. The search is fitted on the iris data with
        ``cv=3`` and ``max_evals=5``; the fitted result is not inspected.
        """
        from sklearn.datasets import load_iris

        features, labels = load_iris(return_X_y=True)
        gender_lookup = {"m": "Male", "f": "Female"}
        state_lookup = {"NY": "New York", "CA": "California"}
        replacements = [
            replace(it.gender, gender_lookup),
            replace(it.state, state_lookup),
        ]
        map_step = Map(columns=replacements, remainder="drop")
        # Assemble the pipeline one stage at a time instead of one nested expression.
        scans = Scan(table=it.main) & Scan(table=it.delay)
        relational_step = Relational(operator=scans >> map_step)
        full_pipeline = relational_step >> LogisticRegression()
        optimizer = Hyperopt(estimator=full_pipeline, cv=3, max_evals=5)
        # Only the call itself is under test; its return value is discarded.
        _ = optimizer.fit(features, labels)
示例#4
0
 def test_transform_spark_replace_list(self):
     """Fit and apply a ``Map`` of two ``replace`` expressions to a Spark frame.

     When ``spark_installed`` is falsy the test body is skipped entirely and
     the test returns without asserting anything. Otherwise a five-row,
     three-column pandas frame is converted through ``self.sqlCtx`` and the
     test asserts the transformed frame's row/column counts and the first two
     fields of its first row.
     """
     if not spark_installed:
         # Nothing can be checked without Spark; return without assertions.
         return
     gender_lookup = {"m": "Male", "f": "Female"}
     state_lookup = {"NY": "New York", "CA": "California"}
     pandas_frame = pd.DataFrame(
         data={
             "gender": ["m", "f", "m", "m", "f"],
             "state": ["NY", "NY", "CA", "NY", "CA"],
             "status": [0, 1, 1, 0, 1],
         }
     )
     spark_frame = self.sqlCtx.createDataFrame(pandas_frame)
     replacements = [
         replace(it.gender, gender_lookup),
         replace(it.state, state_lookup),
     ]
     result = Map(columns=replacements).fit(spark_frame).transform(spark_frame)
     observed_shape = (result.count(), len(result.columns))
     self.assertEqual(observed_shape, (5, 3))
     self.assertEqual(result.head()[0], "Male")
     self.assertEqual(result.head()[1], "New York")
示例#5
0
 def _build_transformer(self):
     """Build a ``Map`` with one output column per (feature, category) pair.

     For every name in ``self.feature_names_in_`` and every value in the
     matching entry of ``self.categories_``, an output column named
     ``<feature>_<category>`` is defined as a ``replace`` expression that
     maps that category value to ``1``. The expression is created with
     ``handle_unknown="use_encoded_value"`` and ``unknown_value=0``.

     Returns:
         The ``Map`` operator holding all generated column expressions.
     """
     indicator_columns = {}
     for position, feature in enumerate(self.feature_names_in_):
         for category in self.categories_[position]:
             # Compute the key first, mirroring dict-comprehension ordering.
             column_key = f"{feature}_{category}"
             indicator_columns[column_key] = replace(
                 it[feature],
                 {category: 1},
                 handle_unknown="use_encoded_value",
                 unknown_value=0,
             )
     return Map(columns=indicator_columns)
示例#6
0
 def _build_transformer(self):
     """Build a ``Map`` that replaces the missing-value marker in each feature.

     For every name in ``self.feature_names_in_`` the output column of the
     same name is a ``replace`` expression mapping
     ``self._hyperparams["missing_values"]`` to the entry of
     ``self.statistics_`` at the same position.

     Returns:
         The ``Map`` operator holding one replacement expression per feature.
     """
     fill_columns = {}
     for position, feature in enumerate(self.feature_names_in_):
         # Looked up per feature, exactly as in the comprehension form, so an
         # empty feature list never touches the hyperparameters.
         marker = self._hyperparams["missing_values"]
         fill_columns[feature] = replace(
             it[feature],
             {marker: self.statistics_[position]},
         )
     return Map(columns=fill_columns)
示例#7
0
 def _build_transformer(self):
     """Build a ``Map`` that replaces each category value by its position.

     For every name in ``self.feature_names_in_`` the output column of the
     same name is a ``replace`` expression whose mapping sends each value in
     the matching entry of ``self.categories_`` to its index in that entry.
     The expression is created with ``handle_unknown="use_encoded_value"``
     and ``unknown_value=self._hyperparams["unknown_value"]``.

     Returns:
         The ``Map`` operator holding one encoding expression per feature.
     """
     encoded_columns = {}
     for position, feature in enumerate(self.feature_names_in_):
         # Category value -> index within this feature's category list.
         value_to_code = {}
         for code, category in enumerate(self.categories_[position]):
             value_to_code[category] = code
         encoded_columns[feature] = replace(
             it[feature],
             value_to_code,
             handle_unknown="use_encoded_value",
             unknown_value=self._hyperparams["unknown_value"],
         )
     return Map(columns=encoded_columns)