예제 #1
0
 def test_categories_to_integers_grid_search(self):
     data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data",
                         "adult_set.txt")
     df = pandas.read_csv(data, sep="\t")
     X = df.drop('income', axis=1)
     y = df['income']
     pipe = make_pipeline(CategoriesToIntegers(), LogisticRegression())
     self.assertRaise(lambda: test_sklearn_grid_search_cv(lambda: pipe, df),
                      ValueError)
     self.assertRaise(
         lambda: test_sklearn_grid_search_cv(
             lambda: pipe, X, y, categoriestointegers__single=[True, False]
         ), ValueError, "Unable to find category value")
     pipe = make_pipeline(CategoriesToIntegers(),
                          Imputer(strategy='most_frequent'),
                          LogisticRegression())
     res = test_sklearn_grid_search_cv(
         lambda: pipe,
         X,
         y,
         categoriestointegers__single=[True, False],
         categoriestointegers__skip_errors=[True])
     self.assertIn('model', res)
     self.assertIn('score', res)
     self.assertGreater(res['score'], 0)
     self.assertLesser(res['score'], 1)
    def test_categories_to_integers(self):
        data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data",
                            "adult_set.txt")
        df = pandas.read_csv(data, sep="\t")

        trans = CategoriesToIntegers()
        trans.fit(df)
        self.assertIsInstance(str(trans), str)
        newdf = trans.transform(df)
        exp = [
            'age', 'final_weight', 'education_num', 'capital_gain',
            'capital_loss', 'hours_per_week', 'marital_status= Divorced',
            'marital_status= Married-AF-spouse',
            'marital_status= Married-civ-spouse',
            'marital_status= Married-spouse-absent',
            'marital_status= Never-married', 'marital_status= Separated',
            'marital_status= Widowed', 'sex= Female', 'sex= Male',
            'education= 10th', 'education= 11th', 'education= 12th',
            'education= 1st-4th', 'education= 5th-6th', 'education= 7th-8th',
            'education= 9th', 'education= Assoc-acdm', 'education= Assoc-voc',
            'education= Bachelors', 'education= Doctorate',
            'education= HS-grad', 'education= Masters', 'education= Preschool',
            'education= Prof-school', 'education= Some-college',
            'native_country= ?', 'native_country= Cambodia',
            'native_country= Canada', 'native_country= China',
            'native_country= Columbia', 'native_country= Cuba',
            'native_country= Dominican-Republic', 'native_country= Ecuador',
            'native_country= El-Salvador', 'native_country= England',
            'native_country= France', 'native_country= Germany',
            'native_country= Guatemala', 'native_country= Haiti',
            'native_country= Honduras', 'native_country= India',
            'native_country= Iran', 'native_country= Italy',
            'native_country= Jamaica', 'native_country= Laos',
            'native_country= Mexico', 'native_country= Philippines',
            'native_country= Poland', 'native_country= Portugal',
            'native_country= Puerto-Rico', 'native_country= South',
            'native_country= Taiwan', 'native_country= Thailand',
            'native_country= United-States', 'race= Amer-Indian-Eskimo',
            'race= Asian-Pac-Islander', 'race= Black', 'race= Other',
            'race= White', 'relationship= Husband',
            'relationship= Not-in-family', 'relationship= Other-relative',
            'relationship= Own-child', 'relationship= Unmarried',
            'relationship= Wife', 'workclass= ?', 'workclass= Federal-gov',
            'workclass= Local-gov', 'workclass= Private',
            'workclass= Self-emp-inc', 'workclass= Self-emp-not-inc',
            'workclass= State-gov', 'income= <=50K', 'income= >50K',
            'occupation= ?', 'occupation= Adm-clerical',
            'occupation= Armed-Forces', 'occupation= Craft-repair',
            'occupation= Exec-managerial', 'occupation= Farming-fishing',
            'occupation= Handlers-cleaners', 'occupation= Machine-op-inspct',
            'occupation= Other-service', 'occupation= Priv-house-serv',
            'occupation= Prof-specialty', 'occupation= Protective-serv',
            'occupation= Sales', 'occupation= Tech-support',
            'occupation= Transport-moving'
        ]
        exp.sort()
        ret = list(newdf.columns)
        ret.sort()
        self.assertEqual(len(ret), len(exp))
        self.assertEqual(exp, ret)
 def test_categories_to_integers_grid_search(self):
     data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data",
                         "adult_set.txt")
     df = pandas.read_csv(data, sep="\t")
     X = df.drop('income', axis=1)
     y = df['income']  # pylint: disable=E1136
     pipe = make_pipeline(CategoriesToIntegers(), LogisticRegression())
     self.assertRaise(lambda: test_sklearn_grid_search_cv(lambda: pipe, df),
                      ValueError)
     if (compare_module_version(sklver, "0.24") >= 0 and  # pylint: disable=R1716
             compare_module_version(pandas.__version__, "1.3") < 0):
         self.assertRaise(
             lambda: test_sklearn_grid_search_cv(
                 lambda: pipe,
                 X,
                 y,
                 categoriestointegers__single=[True, False]), ValueError,
             "Unable to find category value")
     pipe = make_pipeline(CategoriesToIntegers(),
                          Imputer(strategy='most_frequent'),
                          LogisticRegression(n_jobs=1))
     try:
         res = test_sklearn_grid_search_cv(
             lambda: pipe,
             X,
             y,
             categoriestointegers__single=[True, False],
             categoriestointegers__skip_errors=[True])
     except AttributeError as e:
         if compare_module_version(sklver, "0.24") < 0:
             return
         raise e
     self.assertIn('model', res)
     self.assertIn('score', res)
     self.assertGreater(res['score'], 0)
     self.assertLesser(res['score'], 1)
    def test_categories_to_integers_big(self):
        data = os.path.join(os.path.abspath(
            os.path.dirname(__file__)), "data", "adult_set.txt")
        df = pandas.read_csv(data, sep="\t")

        trans = CategoriesToIntegers(single=True)
        trans.fit(df)
        newdf = trans.transform(df)
        self.assertEqual(len(newdf.columns), len(df.columns))
        self.assertEqual(list(newdf.columns), list(df.columns))
        newdf2 = trans.fit_transform(df)
        self.assertEqual(newdf, newdf2)
        rep = repr(trans)
        self.assertEqual("CategoriesToIntegers(columns=None,remove=None,single=True,skip_errors=False)",
                         rep.replace(" ", "").replace("\n", ""))
    def test_categories_to_integers_big(self):
        data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data",
                            "adult_set.txt")
        df = pandas.read_csv(data, sep="\t")

        trans = CategoriesToIntegers(single=True)
        trans.fit(df)
        newdf = trans.transform(df)
        self.assertEqual(len(newdf.columns), len(df.columns))
        self.assertEqual(list(newdf.columns), list(df.columns))  # pylint: disable=E1101
        newdf2 = trans.fit_transform(df)
        self.assertEqual(newdf, newdf2)
        rep = repr(trans)
        self.assertStartsWith("CategoriesToIntegers(",
                              rep.replace(" ", "").replace("\n", ""))
        self.assertIn("single=True", rep.replace(" ", "").replace("\n", ""))
예제 #6
0
 def test_categories_to_integers_clone(self):
     self.maxDiff = None
     test_sklearn_clone(lambda: CategoriesToIntegers())
예제 #7
0
 def test_categories_to_integers_pickle(self):
     data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data",
                         "adult_set.txt")
     df = pandas.read_csv(data, sep="\t")
     test_sklearn_pickle(lambda: CategoriesToIntegers(skip_errors=True), df)