示例#1
0
文件: test_aif360.py 项目: sks95/lale
 def test_scorers_warn(self):
     """Check that the disparate-impact scorer warns and returns NaN.

     A one-hot + passthrough + logistic-regression pipeline is fitted on the
     ``creditg_pd_cat`` training split, then scored on the test split with a
     disparate-impact scorer whose protected attribute is ``age`` with
     privileged group ``1``. The call must emit a ``UserWarning`` matching
     "disparate_impact is ill-defined" and the score must be NaN.
     """
     encoded_strings = Project(columns={"type": "string"}) >> OneHotEncoder(
         handle_unknown="ignore"
     )
     raw_numbers = Project(columns={"type": "number"})
     trainable = (
         (encoded_strings & raw_numbers)
         >> ConcatFeatures
         >> LogisticRegression(max_iter=1000)
     )
     data = self.creditg_pd_cat
     trained = trainable.fit(data["train_X"], data["train_y"])
     scorer = lale.lib.aif360.disparate_impact(
         favorable_labels=["good"],
         protected_attributes=[{"feature": "age", "privileged_groups": [1]}],
     )
     with self.assertWarnsRegex(UserWarning, "disparate_impact is ill-defined"):
         impact = scorer(trained, data["test_X"], data["test_y"])
     self.assertTrue(np.isnan(impact))
示例#2
0
    def test_using_pipeline(self):
        """Tune a pipeline with Hyperopt, then refine its last step with NSGA2.

        Steps:
          1. Fetch the un-preprocessed ``credit-g`` classification dataset.
          2. Let Hyperopt pick the best instance of a planned pipeline.
          3. Pass that pipeline to ``OptimizeLast`` with ``NSGA2`` as the
             last-step optimizer, scoring on ROC AUC and a false-positive-rate
             scorer built from ``compute_fpr``.
          4. Assert the summary is non-empty, that ``loss1`` and ``loss2`` lie
             in [0, 1] for every row, and that the refined pipeline has the
             same type as the Hyperopt result.
        """
        import lale.datasets.openml

        (X_train, y_train), (X_test, y_test) = lale.datasets.openml.fetch(
            'credit-g', 'classification', preprocess=False)

        project_nums = Project(columns={'type': 'number'})
        project_cats = Project(columns={'type': 'string'})
        # `>>` binds tighter than `&`, so the original unparenthesized form
        # already grouped this way; the parentheses only make it explicit.
        planned_pipeline = (
            ((project_nums >> (Normalizer | NoOp))
             & (project_cats >> OneHotEncoder))
            >> ConcatFeatures
            >> (LGBMClassifier | GradientBoostingClassifier))

        # Let's first use Hyperopt to find the best pipeline
        opt = Hyperopt(estimator=planned_pipeline, max_evals=3)
        res = opt.fit(X_train, y_train)
        best_pipeline = res.get_pipeline()

        # Now let's use NSGA2 to perform multi-objective
        # optimization on the last step (i.e., classifier)
        # in the best pipeline returned by Hyperopt
        fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
        nsga2_args = {
            'scoring': ['roc_auc', fpr_scorer],
            'best_score': [1, 0],
            'cv': 3,
            'max_evals': 20,
            'population_size': 10,
        }
        opt_last = OptimizeLast(estimator=best_pipeline,
                                last_optimizer=NSGA2,
                                optimizer_args=nsga2_args)

        res_last = opt_last.fit(X_train, y_train)
        df_summary = res_last.summary()
        print(df_summary)
        self.assertGreater(df_summary.shape[0], 0)

        # check if summary contains valid loss values
        valid_objs = all(
            0 <= record[column] <= 1
            for _, record in df_summary.iterrows()
            for column in ('loss1', 'loss2'))
        self.assertTrue(valid_objs, msg="Invalid loss values in summary")

        # Only checks that prediction runs; the output is not inspected.
        _ = res_last.predict(X_test)
        best_pipeline2 = res_last.get_pipeline()
        self.assertEqual(type(best_pipeline), type(best_pipeline2))

        auc_scorer = get_scorer('roc_auc')
        auc_value = auc_scorer(best_pipeline2, X_test, y_test)
        fpr_value = fpr_scorer(best_pipeline2, X_test, y_test)
        print('test_using_pipeline: \n'
              'AUC, FPR scorer values on test split - '
              f'{auc_value:.3f} {fpr_value:.3f}')
示例#3
0
    def test_decision_function_binary(self):
        """Smoke-test ``decision_function`` on a numeric-only pipeline.

        Fits ``Project(number columns) >> LogisticRegression()`` on the
        ``_creditG`` data and calls ``decision_function`` on the same data.
        """
        from lale.lib.lale import Project

        features = self._creditG["X"]
        labels = self._creditG["y"]
        numbers_only = Project(columns={"type": "number"})
        fitted = (numbers_only >> LogisticRegression()).fit(features, labels)
        # Only checks that the call completes; the result is not inspected.
        fitted.decision_function(features)
示例#4
0
    def test_keep_numbers(self):
        """Check the schema produced by projecting onto numeric columns.

        ``Project(columns={"type": "number"})`` is fitted and applied to the
        ``_creditG`` features; the schema of the result must describe a
        670-row array whose rows hold exactly the numeric columns listed
        below, in that order.
        """
        from lale.datasets.data_schemas import to_schema
        from lale.lib.lale import Project

        # Project is fitted without labels, so only the features are read.
        train_X = self._creditG["X"]
        trained = Project(columns={"type": "number"}).fit(train_X)
        transformed_schema = to_schema(trained.transform(train_X))

        numeric_columns = [
            "duration",
            "credit_amount",
            "installment_commitment",
            "residence_since",
            "age",
            "existing_credits",
            "num_dependents",
        ]
        transformed_expected = {
            "type": "array",
            "minItems": 670,
            "maxItems": 670,
            "items": {
                "type": "array",
                # Row width is derived from the list so the two cannot drift.
                "minItems": len(numeric_columns),
                "maxItems": len(numeric_columns),
                "items": [
                    {"description": name, "type": "number"}
                    for name in numeric_columns
                ],
            },
        }
        # Show the full diff if the schemas differ.
        self.maxDiff = None
        self.assertEqual(transformed_schema, transformed_expected)
示例#5
0
    def test_multimodal(self):
        """Pretty-print a two-branch pipeline and compare it to expected source.

        The pipeline normalizes the numeric columns, one-hot encodes the
        string columns, concatenates both branches and feeds a LinearSVC.
        Its pretty-printed form is checked against ``expected`` through
        ``self._roundtrip``.
        """
        # NOTE(review): the aliases below (Cat, Norm, OneHot) also appear in
        # the expected text, so the pretty-printer presumably picks them up
        # from this scope -- confirm before renaming or moving these imports.
        from lale.lib.lale import ConcatFeatures as Cat
        from lale.lib.lale import Project
        from lale.lib.sklearn import LinearSVC
        from lale.lib.sklearn import Normalizer as Norm
        from lale.lib.sklearn import OneHotEncoder as OneHot

        project_0 = Project(columns={"type": "number"})
        project_1 = Project(columns={"type": "string"})
        linear_svc = LinearSVC(C=29617.4, dual=False, tol=0.005266)
        pipeline = (
            ((project_0 >> Norm()) & (project_1 >> OneHot())) >> Cat >> linear_svc
        )
        # The expected text starts at column 0 on purpose: it is the literal
        # source the printer should emit, so it must not be re-indented.
        expected = """from lale.lib.lale import Project
from sklearn.preprocessing import Normalizer as Norm
from sklearn.preprocessing import OneHotEncoder as OneHot
from lale.lib.lale import ConcatFeatures as Cat
from sklearn.svm import LinearSVC
import lale

lale.wrap_imported_operators()
project_0 = Project(columns={"type": "number"})
project_1 = Project(columns={"type": "string"})
linear_svc = LinearSVC(C=29617.4, dual=False, tol=0.005266)
pipeline = (
    ((project_0 >> Norm()) & (project_1 >> OneHot())) >> Cat >> linear_svc
)"""
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
示例#6
0
 def test_scorers_np_cat(self):
     """Run the fairness scorers on a pipeline trained on array data.

     Column indices of ``train_X`` are split into numeric and categorical by
     trying to convert each column to ``float64``. Categorical columns are
     one-hot encoded, numeric columns are cast to ``float64``, both branches
     are concatenated and fed to logistic regression. The fitted pipeline is
     then handed to ``self._attempt_scorers`` with the test split.
     """
     data = self.creditg_np_cat
     fairness_info = data["fairness_info"]
     train_X = data["train_X"]
     train_y = data["train_y"]

     def converts_to_float(index):
         # A column counts as numeric when its float64 conversion succeeds.
         try:
             train_X[:, index].astype(np.float64)
         except ValueError:
             return False
         return True

     cat_columns, num_columns = [], []
     for index in range(train_X.shape[1]):
         bucket = num_columns if converts_to_float(index) else cat_columns
         bucket.append(index)

     categorical_branch = Project(columns=cat_columns) >> OneHotEncoder(
         handle_unknown="ignore"
     )
     numeric_branch = Project(columns=num_columns) >> FunctionTransformer(
         func=lambda x: x.astype(np.float64)
     )
     trainable = (
         (categorical_branch & numeric_branch)
         >> ConcatFeatures
         >> LogisticRegression(max_iter=1000)
     )
     trained = trainable.fit(train_X, train_y)
     self._attempt_scorers(fairness_info, trained, data["test_X"], data["test_y"])
示例#7
0
 def _prep_pd_cat(cls):
     """Build the preprocessing pipeline for mixed string/number columns.

     String-typed columns are one-hot encoded (unknown categories ignored),
     number-typed columns pass through unchanged, and the two branches are
     joined with ``ConcatFeatures``.
     """
     encoded_strings = Project(columns={"type": "string"}) >> OneHotEncoder(
         handle_unknown="ignore"
     )
     raw_numbers = Project(columns={"type": "number"})
     return (encoded_strings & raw_numbers) >> ConcatFeatures
示例#8
0
 def test_preprocessing_union(self):
     """Smoke-test Hyperopt on a pipeline with two preprocessing branches.

     Number columns are normalized, all non-number columns are one-hot
     encoded, the branches are concatenated and passed to a random forest.
     A single Hyperopt evaluation is fitted on the credit-g training split;
     the test only checks that fitting completes.
     """
     from lale.datasets import openml
     from lale.lib.lale import ConcatFeatures as Concat
     from lale.lib.lale import Hyperopt, Project
     from lale.lib.sklearn import Normalizer, OneHotEncoder
     from lale.lib.sklearn import RandomForestClassifier as Forest

     # The test split is fetched but not used here.
     (train_X, train_y), (_, _) = openml.fetch(
         'credit-g', 'classification', preprocess=False)
     numeric_branch = Project(columns={'type': 'number'}) >> Normalizer
     other_branch = (
         Project(columns={'not': {'type': 'number'}})
         >> OneHotEncoder(sparse=False))
     planned = (numeric_branch & other_branch) >> Concat >> Forest
     optimizer = Hyperopt(estimator=planned, max_evals=1)
     optimizer.fit(train_X, train_y)
示例#9
0
 def test_keep_non_numbers(self):
     """Check the schema produced by projecting onto non-numeric columns.

     ``Project(columns={'not': {'type': 'number'}})`` is fitted and applied
     to the ``_creditG`` features; the schema of the result must describe a
     670-row array whose rows hold exactly the enum columns listed below, in
     that order and with those enum values.
     """
     from lale.datasets.data_schemas import to_schema
     from lale.lib.lale import Project
     # Project is fitted without labels, so only the features are read.
     train_X = self._creditG['X']
     trained = Project(columns={'not': {'type': 'number'}}).fit(train_X)
     transformed_schema = to_schema(trained.transform(train_X))
     # (column name, allowed values) in the expected column order.
     enum_columns = [
         ('checking_status', ['<0', '0<=X<200', '>=200', 'no checking']),
         ('credit_history', [
             'no credits/all paid', 'all paid',
             'existing paid', 'delayed previously',
             'critical/other existing credit']),
         ('purpose', [
             'new car', 'used car', 'furniture/equipment',
             'radio/tv', 'domestic appliance', 'repairs',
             'education', 'vacation', 'retraining', 'business',
             'other']),
         ('savings_status', [
             '<100', '100<=X<500', '500<=X<1000', '>=1000',
             'no known savings']),
         ('employment', ['unemployed', '<1', '1<=X<4', '4<=X<7', '>=7']),
         ('personal_status', [
             'male div/sep', 'female div/dep/mar', 'male single',
             'male mar/wid', 'female single']),
         ('other_parties', ['none', 'co applicant', 'guarantor']),
         ('property_magnitude', [
             'real estate', 'life insurance', 'car',
             'no known property']),
         ('other_payment_plans', ['bank', 'stores', 'none']),
         ('housing', ['rent', 'own', 'for free']),
         ('job', [
             'unemp/unskilled non res', 'unskilled resident',
             'skilled', 'high qualif/self emp/mgmt']),
         ('own_telephone', ['none', 'yes']),
         ('foreign_worker', ['yes', 'no']),
     ]
     transformed_expected = {
         'type': 'array', 'minItems': 670, 'maxItems': 670,
         'items': {
             'type': 'array',
             # Row width is derived from the list so the two cannot drift.
             'minItems': len(enum_columns),
             'maxItems': len(enum_columns),
             'items': [
                 {'description': name, 'enum': values}
                 for name, values in enum_columns]}}
     # Show the full diff if the schemas differ.
     self.maxDiff = None
     self.assertEqual(transformed_schema, transformed_expected)
示例#10
0
 def _fit_gbt_num(self, X, y):
     """Try a gradient-boosted-tree pipeline restricted to numeric columns.

     The pipeline keeps only number-typed columns, imputes missing values
     with the column mean, and ends with the operator obtained by calling
     the result of ``auto_gbt(self.prediction_type)``. It is handed to
     ``self._try_and_add`` under the name ``'gbt_num'``.
     """
     from lale.lib.lale import Project
     from lale.lib.sklearn import SimpleImputer
     gbt_factory = auto_gbt(self.prediction_type)
     numbers_only = Project(columns={'type': 'number'})
     mean_imputer = SimpleImputer(strategy='mean')
     candidate = numbers_only >> mean_imputer >> gbt_factory()
     self._try_and_add('gbt_num', candidate, X, y)
示例#11
0
    def _fit_gbt_num(self, X, y):
        """Build and register the numeric-only gradient-boosting candidate.

        Number-typed columns are selected, mean-imputed, and passed to the
        operator created from ``auto_gbt(self.prediction_type)``; the
        resulting pipeline goes to ``self._try_and_add`` as ``"gbt_num"``.
        """
        from lale.lib.lale import Project
        from lale.lib.sklearn import SimpleImputer

        make_gbt = auto_gbt(self.prediction_type)
        select_numbers = Project(columns={"type": "number"})
        impute_mean = SimpleImputer(strategy="mean")
        self._try_and_add(
            "gbt_num", select_numbers >> impute_mean >> make_gbt(), X, y
        )
示例#12
0
 def test_keep_numbers(self):
     """Check the schema produced by projecting onto numeric columns.

     ``Project(columns={'type': 'number'})`` is fitted and applied to the
     ``_creditG`` features; the schema of the result must describe a 670-row
     array whose rows hold exactly the numeric columns listed below, in that
     order.
     """
     from lale.datasets.data_schemas import to_schema
     from lale.lib.lale import Project
     # Project is fitted without labels, so only the features are read.
     train_X = self._creditG['X']
     trained = Project(columns={'type': 'number'}).fit(train_X)
     transformed_schema = to_schema(trained.transform(train_X))
     numeric_columns = [
         'duration',
         'credit_amount',
         'installment_commitment',
         'residence_since',
         'age',
         'existing_credits',
         'num_dependents',
     ]
     transformed_expected = {
         'type': 'array',
         'minItems': 670,
         'maxItems': 670,
         'items': {
             'type': 'array',
             # Row width is derived from the list so the two cannot drift.
             'minItems': len(numeric_columns),
             'maxItems': len(numeric_columns),
             'items': [
                 {'description': name, 'type': 'number'}
                 for name in numeric_columns
             ],
         },
     }
     # Show the full diff if the schemas differ.
     self.maxDiff = None
     self.assertEqual(transformed_schema, transformed_expected)
示例#13
0
 def test_text_and_structured(self):
     """Smoke-test Hyperopt on a pipeline mixing text and numeric features.

     The ``review`` column is vectorized with TF-IDF (100 features), the
     number-typed columns pass through, and the concatenated features go to
     a choice between linear regression and a random-forest regressor. One
     Hyperopt evaluation scored with ``r2`` is fitted; the test only checks
     that fitting completes.
     """
     from sklearn.model_selection import train_test_split
     from lale.datasets.uci.uci_datasets import fetch_drugscom
     from lale.lib.lale import ConcatFeatures as Cat
     from lale.lib.lale import Hyperopt, Project
     from lale.lib.sklearn import LinearRegression as LinReg
     from lale.lib.sklearn import RandomForestRegressor as Forest
     from lale.lib.sklearn import TfidfVectorizer as Tfidf

     # The test split is fetched but not used here.
     train_X_all, train_y_all, _, _ = fetch_drugscom()
     # Train on a 1% subset to speed up debugging; the rest is discarded.
     train_X, _, train_y, _ = train_test_split(
         train_X_all, train_y_all, train_size=0.01, random_state=42)
     text_branch = Project(columns=['review']) >> Tfidf(max_features=100)
     numeric_branch = Project(columns={'type': 'number'})
     planned = (text_branch & numeric_branch) >> Cat >> (LinReg | Forest)
     hyperopt_regressor = Hyperopt(
         estimator=planned, max_evals=1, scoring='r2')
     hyperopt_regressor.fit(train_X, train_y)
示例#14
0
def auto_prep(X):
    """Return a preprocessing operator chosen from the column mix of ``X``.

    ``len(categorical()(X))`` is compared with ``X.shape[1]``:

    * zero: mean imputation only;
    * equal to the column count: most-frequent imputation followed by
      one-hot encoding (unknown categories ignored);
    * otherwise: both of the above on separate column projections, joined
      with ``ConcatFeatures``.

    NOTE(review): ``categorical()(X)`` presumably returns the columns
    detected as categorical -- confirm against lale's documentation.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    total_columns = X.shape[1]
    categorical_count = len(categorical()(X))
    numeric_prep = SimpleImputer(strategy="mean")
    categorical_prep = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore")

    if categorical_count == 0:
        return numeric_prep
    if categorical_count == total_columns:
        return categorical_prep

    # Mixed data: route each kind of column through its own branch.
    numeric_branch = (
        Project(columns={"type": "number"}, drop_columns=categorical())
        >> numeric_prep)
    categorical_branch = Project(columns=categorical()) >> categorical_prep
    return (numeric_branch & categorical_branch) >> ConcatFeatures
示例#15
0
文件: test_aif360.py 项目: sks95/lale
 def test_scorers_pd_cat(self):
     """Run the fairness scorers on a pipeline trained on ``creditg_pd_cat``.

     String columns are one-hot encoded, number columns pass through, the
     branches are concatenated and fed to logistic regression. The fitted
     pipeline, the dataset's ``fairness_info`` and the test split are handed
     to ``self._attempt_scorers``.
     """
     fairness_info = self.creditg_pd_cat["fairness_info"]
     string_branch = Project(columns={"type": "string"}) >> OneHotEncoder(
         handle_unknown="ignore"
     )
     number_branch = Project(columns={"type": "number"})
     classifier = LogisticRegression(max_iter=1000)
     trainable = (string_branch & number_branch) >> ConcatFeatures >> classifier
     dataset = self.creditg_pd_cat
     trained = trainable.fit(dataset["train_X"], dataset["train_y"])
     self._attempt_scorers(
         fairness_info, trained, dataset["test_X"], dataset["test_y"]
     )
示例#16
0
def auto_prep(X):
    """Pick a preprocessing operator based on how many columns are categorical.

    With no categorical columns the result is a mean imputer; when every
    column is categorical it is a most-frequent imputer followed by a
    one-hot encoder; otherwise both are applied to separate projections of
    ``X`` and concatenated.

    NOTE(review): the categorical count is ``len(categorical()(X))``; what
    exactly that call returns is defined by lale -- confirm there.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    column_count = X.shape[1]
    cat_count = len(categorical()(X))
    impute_mean = SimpleImputer(strategy='mean')
    impute_and_encode = (
        SimpleImputer(strategy='most_frequent')
        >> OneHotEncoder(handle_unknown='ignore'))

    if cat_count == 0:
        return impute_mean
    if cat_count == column_count:
        return impute_and_encode

    # Mixed data: numbers and categoricals each get their own branch.
    numbers = Project(columns={'type': 'number'}, drop_columns=categorical())
    categoricals = Project(columns=categorical())
    return (
        (numbers >> impute_mean) & (categoricals >> impute_and_encode)
    ) >> ConcatFeatures
示例#17
0
    def test_keep_non_numbers(self):
        """Check the schema produced by projecting onto non-numeric columns.

        ``Project(columns={"not": {"type": "number"}})`` is fitted and
        applied to the ``_creditG`` features; the schema of the result must
        describe a 670-row array of 13 enum columns with the names and
        values listed below, in that order.
        """
        from lale.datasets.data_schemas import to_schema
        from lale.lib.lale import Project

        def enum_column(name, *values):
            # One expected per-column schema entry.
            return {"description": name, "enum": list(values)}

        train_X = self._creditG["X"]
        trained = Project(columns={"not": {"type": "number"}}).fit(train_X)
        transformed_schema = to_schema(trained.transform(train_X))

        expected_columns = [
            enum_column(
                "checking_status", "<0", "0<=X<200", ">=200", "no checking"
            ),
            enum_column(
                "credit_history",
                "no credits/all paid",
                "all paid",
                "existing paid",
                "delayed previously",
                "critical/other existing credit",
            ),
            enum_column(
                "purpose",
                "new car",
                "used car",
                "furniture/equipment",
                "radio/tv",
                "domestic appliance",
                "repairs",
                "education",
                "vacation",
                "retraining",
                "business",
                "other",
            ),
            enum_column(
                "savings_status",
                "<100",
                "100<=X<500",
                "500<=X<1000",
                ">=1000",
                "no known savings",
            ),
            enum_column(
                "employment", "unemployed", "<1", "1<=X<4", "4<=X<7", ">=7"
            ),
            enum_column(
                "personal_status",
                "male div/sep",
                "female div/dep/mar",
                "male single",
                "male mar/wid",
                "female single",
            ),
            enum_column("other_parties", "none", "co applicant", "guarantor"),
            enum_column(
                "property_magnitude",
                "real estate",
                "life insurance",
                "car",
                "no known property",
            ),
            enum_column("other_payment_plans", "bank", "stores", "none"),
            enum_column("housing", "rent", "own", "for free"),
            enum_column(
                "job",
                "unemp/unskilled non res",
                "unskilled resident",
                "skilled",
                "high qualif/self emp/mgmt",
            ),
            enum_column("own_telephone", "none", "yes"),
            enum_column("foreign_worker", "yes", "no"),
        ]
        transformed_expected = {
            "type": "array",
            "minItems": 670,
            "maxItems": 670,
            "items": {
                "type": "array",
                "minItems": 13,
                "maxItems": 13,
                "items": expected_columns,
            },
        }
        # Show the full diff if the schemas differ.
        self.maxDiff = None
        self.assertEqual(transformed_schema, transformed_expected)
示例#18
0
 def test_decision_function_binary(self):
     """Smoke-test ``decision_function`` on a numeric-only pipeline.

     Fits ``Project(number columns) >> LogisticRegression()`` on the
     ``_creditG`` data and calls ``decision_function`` on the same data.
     """
     from lale.lib.lale import Project
     features, labels = self._creditG['X'], self._creditG['y']
     numbers_only = Project(columns={'type': 'number'})
     fitted = (numbers_only >> LogisticRegression()).fit(features, labels)
     # Only checks that the call completes; the result is not inspected.
     _ = fitted.decision_function(features)