def test_compose4(self):
    """Compose a three-stage planned pipeline: (OHE|NoOp) >> (PCA|Nystroem) >> (LR|KNN)."""
    from lale.operators import make_choice

    _digits = sklearn.datasets.load_digits()
    encoder = OneHotEncoder(handle_unknown=OneHotEncoder.handle_unknown.ignore)
    encoder.get_params()  # smoke-check parameter access on a configured operator
    preprocess_step = encoder | NoOp()
    reduce_step = PCA() | Nystroem()
    classify_step = LogisticRegression() | KNeighborsClassifier()
    _plan = preprocess_step >> reduce_step >> classify_step
def test_inverse_transform(self):
    """Round-trip transform/inverse_transform for OneHotEncoder and OrdinalEncoder."""
    from lale.lib.sklearn import OneHotEncoder, OrdinalEncoder

    trained_ohe = OneHotEncoder(handle_unknown="ignore").fit(self.X_train, self.y_train)
    encoded_ohe = trained_ohe.transform(self.X_test)
    # OneHotEncoder's inverse lives on the wrapped sklearn model.
    recovered_ohe = trained_ohe._impl._wrapped_model.inverse_transform(encoded_ohe)

    trained_oe = OrdinalEncoder(handle_unknown="ignore").fit(self.X_train, self.y_train)
    encoded_oe = trained_oe.transform(self.X_test)
    recovered_oe = trained_oe._impl.inverse_transform(encoded_oe)

    self.assertEqual(recovered_ohe.all(), recovered_oe.all())
def test_scorers_np_cat(self):
    """Run the shared scorer checks on the numpy credit-g data with mixed column types."""
    fairness_info = self.creditg_np_cat["fairness_info"]
    train_X = self.creditg_np_cat["train_X"]
    train_y = self.creditg_np_cat["train_y"]
    # Classify each column as numeric iff it parses as float64.
    cat_columns = []
    num_columns = []
    for col in range(train_X.shape[1]):
        try:
            _ = train_X[:, col].astype(np.float64)
        except ValueError:
            cat_columns.append(col)
        else:
            num_columns.append(col)
    cat_branch = Project(columns=cat_columns) >> OneHotEncoder(handle_unknown="ignore")
    num_branch = Project(columns=num_columns) >> FunctionTransformer(
        func=lambda x: x.astype(np.float64)
    )
    trainable = (
        (cat_branch & num_branch)
        >> ConcatFeatures
        >> LogisticRegression(max_iter=1000)
    )
    trained = trainable.fit(train_X, train_y)
    test_X = self.creditg_np_cat["test_X"]
    test_y = self.creditg_np_cat["test_y"]
    self._attempt_scorers(fairness_info, trained, test_X, test_y)
def test_scorers_warn(self):
    """disparate_impact should warn as ill-defined and return NaN for this setup."""
    fairness_info = {
        "favorable_labels": ["good"],
        "protected_attributes": [{"feature": "age", "privileged_groups": [1]}],
    }
    string_branch = Project(columns={"type": "string"}) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    number_branch = Project(columns={"type": "number"})
    trainable = (
        (string_branch & number_branch)
        >> ConcatFeatures
        >> LogisticRegression(max_iter=1000)
    )
    trained = trainable.fit(
        self.creditg_pd_cat["train_X"], self.creditg_pd_cat["train_y"]
    )
    disparate_impact_scorer = lale.lib.aif360.disparate_impact(**fairness_info)
    with self.assertWarnsRegex(UserWarning, "disparate_impact is ill-defined"):
        impact = disparate_impact_scorer(
            trained, self.creditg_pd_cat["test_X"], self.creditg_pd_cat["test_y"]
        )
    self.assertTrue(np.isnan(impact))
def test_compose5(self):
    """Fit/predict a bare LogisticRegression, then an OHE >> LR pipeline, on digits."""
    encoder = OneHotEncoder(handle_unknown=OneHotEncoder.handle_unknown.ignore)
    digits = sklearn.datasets.load_digits()
    classifier = LogisticRegression()
    trained_classifier = classifier.fit(digits.data, digits.target)
    trained_classifier.predict(digits.data)
    pipeline = encoder >> classifier
    trained_pipeline = pipeline.fit(digits.data, digits.target)
    trained_pipeline.predict(digits.data)
def _prep_pd_cat(cls):
    """Preprocessing for pandas data: one-hot encode string columns, pass numbers through."""
    string_branch = Project(columns={"type": "string"}) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    number_branch = Project(columns={"type": "number"})
    return (string_branch & number_branch) >> ConcatFeatures
def test_bool_label(self):
    """Training Enc >> Clf succeeds on a tiny dataset with a boolean label column."""
    import pandas as pd

    field_names = ('IS_TENT', 'GENDER', 'AGE', 'MARITAL_STATUS', 'PROFESSION')
    field_values = [
        (False, 'M', 20, 'Single', 'Sales'),
        (False, 'M', 20, 'Single', 'Sales'),
        (False, 'F', 37, 'Single', 'Other'),
        (False, 'M', 42, 'Married', 'Other'),
        (True, 'F', 24, 'Married', 'Retail'),
        (False, 'F', 24, 'Married', 'Retail'),
        (False, 'M', 29, 'Single', 'Retail'),
        (False, 'M', 29, 'Single', 'Retail'),
        (True, 'M', 43, 'Married', 'Trades'),
        (False, 'M', 43, 'Married', 'Trades'),
    ]
    data_records = [dict(zip(field_names, row)) for row in field_values]
    df = pd.DataFrame.from_records(data_records)
    X = df.drop(['IS_TENT'], axis=1).values
    y = df['IS_TENT'].values
    from lale.lib.sklearn import OneHotEncoder as Enc
    from lale.lib.sklearn import GradientBoostingClassifier as Clf

    trained = (Enc() >> Clf()).fit(X, y)
def test_preprocessing_union(self):
    """Hyperopt (1 eval) over a numeric/categorical preprocessing union plus forest."""
    from lale.datasets import openml

    (train_X, train_y), (test_X, test_y) = openml.fetch(
        'credit-g', 'classification', preprocess=False)
    from lale.lib.lale import ConcatFeatures as Concat
    from lale.lib.lale import Hyperopt, Project
    from lale.lib.sklearn import Normalizer, OneHotEncoder
    from lale.lib.sklearn import RandomForestClassifier as Forest

    numeric_prep = Project(columns={'type': 'number'}) >> Normalizer
    categorical_prep = (
        Project(columns={'not': {'type': 'number'}}) >> OneHotEncoder(sparse=False)
    )
    planned = (numeric_prep & categorical_prep) >> Concat >> Forest
    optimizer = Hyperopt(estimator=planned, max_evals=1)
    _best_found = optimizer.fit(train_X, train_y)
def dont_test_smac_choice(self):
    """Disabled test (``dont_`` prefix): optimize a planned lale pipeline with SMAC.

    Builds a planned pipeline containing operator choices, converts it to a
    SMAC configuration space via ``get_smac_space``, and runs SMAC for a
    single evaluation using a TAE built by ``test_iris_fmin_tae``.
    """
    import numpy as np
    from sklearn import svm, datasets
    from sklearn.model_selection import cross_val_score

    # Import ConfigSpace and different types of parameters
    from smac.configspace import ConfigurationSpace

    # Import SMAC-utilities
    from smac.tae.execute_func import ExecuteTAFuncDict
    from smac.scenario.scenario import Scenario
    from smac.facade.smac_facade import SMAC

    # Choice of transformer, then choice of classifier, behind an optional OHE.
    tfm = PCA() | Nystroem() | NoOp()
    planned_pipeline1 = (
        (OneHotEncoder(handle_unknown='ignore', sparse=False) | NoOp())
        >> tfm
        >> (LogisticRegression() | KNeighborsClassifier())
    )
    cs: ConfigurationSpace = get_smac_space(planned_pipeline1, lale_num_grids=5)

    # Scenario object
    scenario = Scenario({
        "run_obj": "quality",   # we optimize quality (alternatively runtime)
        "runcount-limit": 1,    # maximum function evaluations
        "cs": cs,               # configuration space
        "deterministic": "true"
    })

    # Optimize, using a SMAC-object
    tae = test_iris_fmin_tae(planned_pipeline1, num_folds=2)
    print(
        "Optimizing! Depending on your machine, this might take a few minutes."
    )
    smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
                tae_runner=tae)
    incumbent = smac.optimize()
    inc_value = tae(incumbent)
    print("Optimized Value: %.2f" % (inc_value))
def auto_prep(X):
    """Return a preprocessing pipeline adapted to the column types of X.

    All-numeric data gets mean imputation; all-categorical data gets
    most-frequent imputation followed by one-hot encoding; mixed data routes
    each column group through the matching branch and concatenates the results.
    """
    from lale.lib.lale import ConcatFeatures, Project, categorical
    from lale.lib.sklearn import OneHotEncoder, SimpleImputer

    total_cols = X.shape[1]
    cat_cols = len(categorical()(X))
    numeric_prep = SimpleImputer(strategy="mean")
    categorical_prep = SimpleImputer(strategy="most_frequent") >> OneHotEncoder(
        handle_unknown="ignore")
    if cat_cols == 0:
        return numeric_prep
    if cat_cols == total_cols:
        return categorical_prep
    numeric_branch = (
        Project(columns={"type": "number"}, drop_columns=categorical())
        >> numeric_prep)
    categorical_branch = Project(columns=categorical()) >> categorical_prep
    return (numeric_branch & categorical_branch) >> ConcatFeatures
def test_scorers_pd_cat(self):
    """Run the shared scorer checks on the pandas categorical credit-g data."""
    fairness_info = self.creditg_pd_cat["fairness_info"]
    string_branch = Project(columns={"type": "string"}) >> OneHotEncoder(
        handle_unknown="ignore"
    )
    number_branch = Project(columns={"type": "number"})
    trainable = (
        (string_branch & number_branch)
        >> ConcatFeatures
        >> LogisticRegression(max_iter=1000)
    )
    trained = trainable.fit(
        self.creditg_pd_cat["train_X"], self.creditg_pd_cat["train_y"]
    )
    self._attempt_scorers(
        fairness_info,
        trained,
        self.creditg_pd_cat["test_X"],
        self.creditg_pd_cat["test_y"],
    )
def auto_prep(X):
    """Build a preprocessing pipeline that adapts to how many columns of X are categorical."""
    from lale.lib.lale import ConcatFeatures
    from lale.lib.lale import Project
    from lale.lib.lale import categorical
    from lale.lib.sklearn import OneHotEncoder
    from lale.lib.sklearn import SimpleImputer

    num_all = X.shape[1]
    num_cat = len(categorical()(X))
    impute_numeric = SimpleImputer(strategy='mean')
    impute_then_encode = (
        SimpleImputer(strategy='most_frequent')
        >> OneHotEncoder(handle_unknown='ignore'))
    if num_cat == 0:
        return impute_numeric
    if num_cat == num_all:
        return impute_then_encode
    numeric_side = (
        Project(columns={'type': 'number'}, drop_columns=categorical())
        >> impute_numeric)
    categorical_side = Project(columns=categorical()) >> impute_then_encode
    return (numeric_side & categorical_side) >> ConcatFeatures
def test_bool_label(self):
    """Fitting Enc >> Clf succeeds when the target column is boolean."""
    import pandas as pd

    field_names = ("IS_TENT", "GENDER", "AGE", "MARITAL_STATUS", "PROFESSION")
    field_values = [
        (False, "M", 20, "Single", "Sales"),
        (False, "M", 20, "Single", "Sales"),
        (False, "F", 37, "Single", "Other"),
        (False, "M", 42, "Married", "Other"),
        (True, "F", 24, "Married", "Retail"),
        (False, "F", 24, "Married", "Retail"),
        (False, "M", 29, "Single", "Retail"),
        (False, "M", 29, "Single", "Retail"),
        (True, "M", 43, "Married", "Trades"),
        (False, "M", 43, "Married", "Trades"),
    ]
    data_records = [dict(zip(field_names, row)) for row in field_values]
    df = pd.DataFrame.from_records(data_records)
    X = df.drop(["IS_TENT"], axis=1).values
    y = df["IS_TENT"].values
    from lale.lib.sklearn import GradientBoostingClassifier as Clf
    from lale.lib.sklearn import OneHotEncoder as Enc

    _ = (Enc() >> Clf()).fit(X, y)
def test_shallow_impl(self):
    """shallow_impl exposes the lale wrapper class, not the sklearn model."""
    import lale.lib.sklearn.one_hot_encoder as lohe

    encoder = OneHotEncoder()
    self.assertIsInstance(encoder.shallow_impl, lohe._OneHotEncoderImpl)
def test_impl(self):
    """impl drills through the lale wrapper down to the sklearn estimator."""
    import sklearn.preprocessing._encoders as skohe

    encoder = OneHotEncoder()
    self.assertIsInstance(encoder.impl, skohe.OneHotEncoder)
def fetch(dataset_name, task_type, verbose=False, preprocess=True):
    """Fetch a registered ARFF dataset, optionally preprocess it, and split it.

    Downloads the ARFF file on first use and caches it under
    ``download_data_dir``. With ``preprocess=True``, features are imputed and
    one-hot encoded through two chained ColumnTransformers; with
    ``preprocess=False``, raw pandas dataframes are returned. The target
    column is always label-encoded, and the data is split 67/33 with a fixed
    random state.

    Parameters:
        dataset_name: key into ``experiments_dict``.
        task_type: expected task type; must match the registered one.
        verbose: print progress details when True.
        preprocess: impute/encode features when True.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` with lale schemas attached.

    Raises:
        KeyError: if ``dataset_name`` is not in ``experiments_dict``.
        ValueError: if ``task_type`` does not match the registered task type.
    """
    if verbose:
        print('Loading dataset:', dataset_name)
    # Check that the dataset name exists in experiments_dict.
    # FIX: use a membership test instead of try/except around both the lookup
    # and the ValueError raise; the old form also swallowed the KeyError context.
    if dataset_name not in experiments_dict:
        raise KeyError("Dataset name {} not found in the supported datasets".format(dataset_name))
    if experiments_dict[dataset_name]['task_type'] != task_type.lower():
        raise ValueError("The task type {} does not match with the given datasets task type {}"
                         .format(task_type, experiments_dict[dataset_name]['task_type']))

    data_file_name = os.path.join(download_data_dir, dataset_name + ".arff")
    if verbose:
        print(data_file_name)
    if not os.path.exists(data_file_name):
        # Download the data on first use.
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print('created directory {}'.format(download_data_dir))
        urllib.request.urlretrieve(
            experiments_dict[dataset_name]['download_arff_url'], data_file_name)
    assert os.path.exists(data_file_name)

    with open(data_file_name) as f:
        dataDictionary = arff.load(f)
    # FIX: removed redundant f.close(); the with-block already closes the file.

    from lale.datasets.data_schemas import liac_arff_to_schema
    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]['target']

    if preprocess:
        arffData = pd.DataFrame(dataDictionary['data'])
        # arffData = arffData.fillna(0)
        attributes = dataDictionary['attributes']
        if verbose:
            print(attributes)
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                # Remove it from attributes so that the next loop's indices
                # are adjusted accordingly.
                del attributes[i]
                y = arffData.iloc[:, target_indx]
                arffData = arffData.drop(i, axis=1)
                # FIX: stop after the (unique) target column; continuing to
                # iterate a list that was just mutated skips the next item.
                break
        for i, item in enumerate(attributes):
            X_columns.append(i)
            # Attribute types are either a string type name or a list of
            # nominal values; anything non-numeric (and not 'class') is
            # treated as categorical.
            if (((isinstance(item[1], str) and item[1].lower() not in numeric_data_types_list)
                 or isinstance(item[1], list)) and (item[0].lower() != 'class')):
                categorical_cols.append(i)
            elif (isinstance(item[1], str) and item[1].lower() in numeric_data_types_list) and (item[0].lower() != 'class'):
                numeric_cols.append(i)
        if verbose:
            print(f'categorical columns: {categorical_cols}')
            print(f'numeric columns: {numeric_cols}')
        X = arffData.iloc[:, X_columns]

        # Check whether there is any error
        num_classes_from_last_row = len(list(set(y)))
        if verbose:
            print('num_classes_from_last_row', num_classes_from_last_row)

        # Stage 1: impute (most-frequent for categoricals, mean for numerics).
        transformers1 = [
            ('imputer_str',
             SimpleImputer(missing_values=None, strategy='most_frequent'),
             categorical_cols),
            ('imputer_num', SimpleImputer(strategy='mean'), numeric_cols)]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)
        # Stage 2: one-hot encode the (now leading) categorical columns and
        # pass the numeric columns through unchanged.
        transformers2 = [
            ('ohe', OneHotEncoder(sparse=False),
             list(range(len(categorical_cols)))),
            ('no_op', 'passthrough',
             list(range(len(categorical_cols),
                        len(categorical_cols) + len(numeric_cols))))]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from lale.operators import make_pipeline
        preprocessing = make_pipeline(txm1, txm2)
        X = preprocessing.fit(X).transform(X)
        if verbose:
            print("Shape of X after preprocessing", X.shape)
    else:
        col_names = [attr[0] for attr in dataDictionary['attributes']]
        df_all = pd.DataFrame(dataDictionary['data'], columns=col_names)
        y = df_all[target_col]
        y = y.squeeze()
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.33, random_state=0)
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes: X {X_test.shape}, y {y_test.shape}')
    X_train, X_test, y_train, y_test = add_schemas(
        schema_orig, target_col, X_train, X_test, y_train, y_test)
    return (X_train, y_train), (X_test, y_test)