def do1DTest(self, trainable, train_X, train_y, test_X, test_y):
    """Exercise ``trainable`` on 1-D input taken from column 0 of the data.

    The first column of ``train_X`` / ``test_X`` is pushed through
    ``(trainable & NoOp()) >> ConcatFeatures() >> float32_transform() >> LR()``,
    once with a plain fit/predict and once via a single-evaluation
    ``Hyperopt`` search.  ``test_y`` is accepted but not used.
    """
    # Test for 1-D array as input to the transformers
    first_col_train = train_X[:, 0]
    first_col_test = test_X[:, 0]

    branches = trainable & NoOp()
    pipeline = branches >> ConcatFeatures() >> float32_transform() >> LR()

    # Plain fit / predict on the trainable pipeline.
    pipeline.fit(first_col_train, train_y).predict(first_col_test)

    # Same pipeline, driven through Hyperopt with a single evaluation.
    optimizer = Hyperopt(estimator=pipeline, max_evals=1)
    optimizer.fit(first_col_train, train_y).predict(first_col_test)
def run_hyperopt_on_planned_pipeline(planned_pipeline, max_iters=1):
    """Fit a ``Hyperopt`` search over ``planned_pipeline`` on the iris data.

    ``max_iters`` is forwarded to ``Hyperopt`` as ``max_evals``.  The fitted
    optimizer is discarded; nothing is returned.
    """
    # Data
    from sklearn.datasets import load_iris

    iris_X, iris_y = load_iris(return_X_y=True)

    # Optimizer set-up and run
    from lale.lib.lale.hyperopt import Hyperopt

    optimizer = Hyperopt(estimator=planned_pipeline, max_evals=max_iters)
    optimizer.fit(iris_X, iris_y)
def test_nested_pipeline1(self):
    """Tune a choice between KNN and a nested imputer >> LR pipeline on iris."""
    from sklearn.datasets import load_iris
    from lale.lib.lale import Hyperopt
    from sklearn.metrics import accuracy_score

    iris = load_iris()
    features, target = iris.data, iris.target

    # The right-hand alternative of the choice is itself a two-step pipeline.
    knn = KNeighborsClassifier()
    search_space = knn | (SimpleImputer() >> LogisticRegression())

    optimizer = Hyperopt(estimator=search_space, max_evals=1)
    fitted = optimizer.fit(features, target)
    print(accuracy_score(target, fitted.predict(features)))
def test_runtime_limit_zero_time_hor(self):
    """With ``max_opt_time=0.0`` the regression search must yield no pipeline."""
    scalers = MinMaxScaler | Normalizer
    search_space = scalers >> LinearRegression

    # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm the
    # scikit-learn version this suite is pinned to.
    from sklearn.datasets import load_boston

    features, target = load_boston(return_X_y=True)

    optimizer = Hyperopt(
        estimator=search_space,
        max_evals=1,
        cv=3,
        max_opt_time=0.0,
        scoring='r2',
    )
    fitted = optimizer.fit(features, target)
    assert fitted.get_pipeline() is None
def _fit_hyperopt(self, X, y):
    """Run a Hyperopt search over a planned pipeline and record its trials.

    The search space is ``auto_prep(X)`` followed by a choice between a
    tree-based branch (optional dimensionality reduction, then a tree
    estimator) and a non-tree branch (scaling, optional dimensionality
    reduction, then SGD or k-NN).  Regressors or classifiers are picked
    according to ``self.prediction_type``.

    Side effects: may update ``self._name_of_best``, appends the search
    summary to ``self._summary``, and stores one pipeline per trial whose
    status is ``hyperopt.STATUS_OK`` in ``self._pipelines``.

    NOTE(review): this reads ``self._summary.shape`` unconditionally, so it
    presumably requires ``self._summary`` to already be a DataFrame, and
    the remaining-time value passed as ``max_opt_time`` is not checked for
    being non-positive -- confirm against the caller.
    """
    from lale.lib.lale import Hyperopt, NoOp
    from lale.lib.sklearn import (
        PCA,
        DecisionTreeClassifier,
        DecisionTreeRegressor,
        KNeighborsClassifier,
        KNeighborsRegressor,
        MinMaxScaler,
        RandomForestClassifier,
        RandomForestRegressor,
        RobustScaler,
        SelectKBest,
        SGDClassifier,
        SGDRegressor,
        StandardScaler,
    )

    # Building blocks of the search space; NoOp makes a step optional.
    prep = auto_prep(X)
    scale = MinMaxScaler | StandardScaler | RobustScaler | NoOp
    reduce_dims = PCA | SelectKBest | NoOp
    gbt = auto_gbt(self.prediction_type)
    if self.prediction_type == "regression":
        estim_trees = gbt | DecisionTreeRegressor | RandomForestRegressor
        estim_notree = SGDRegressor | KNeighborsRegressor
    else:
        estim_trees = gbt | DecisionTreeClassifier | RandomForestClassifier
        estim_notree = SGDClassifier | KNeighborsClassifier
    # Only the non-tree branch gets a scaling step.
    model_trees = reduce_dims >> estim_trees
    model_notree = scale >> reduce_dims >> estim_notree
    planned = prep >> (model_trees | model_notree)
    trainable = Hyperopt(
        estimator=planned,
        # Evaluations already recorded in the summary count against the budget.
        max_evals=self.max_evals - self._summary.shape[0],
        scoring=self.scoring,
        best_score=self.best_score,
        # Time budget left since self._start_fit.
        max_opt_time=self.max_opt_time - (time.time() - self._start_fit),
        max_eval_time=self.max_eval_time,
        verbose=self.verbose,
        show_progressbar=False,
    )
    trained = trainable.fit(X, y)
    best_trial = trained._impl._trials.best_trial
    if "loss" in best_trial["result"]:
        # Promote the new best trial only if it beats the recorded best loss.
        if (best_trial["result"]["loss"] < self._summary.at[self._name_of_best, "loss"]):
            self._name_of_best = f'p{best_trial["tid"]}'
    summary = trained.summary()
    self._summary = pd.concat([self._summary, summary])
    for name in summary.index:
        assert name not in self._pipelines
        if summary.at[name, "status"] == hyperopt.STATUS_OK:
            self._pipelines[name] = trained.get_pipeline(name)
def doTest(self, trainable, train_X, train_y, test_X, test_y):
    """Run the standard checks on a transformer ``trainable``.

    Covers fit/transform, the deprecation warning raised when calling
    ``transform`` on the trainable operator, JSON export, use inside a
    pipeline ending in ``LR()``, and a one-evaluation ``Hyperopt`` search
    over that pipeline.  ``test_y`` is accepted but not used.
    """
    # Fit, then transform with the trained operator.
    trainable.fit(train_X, train_y).transform(test_X)

    # Transforming with the (untrained) trainable operator must warn.
    with self.assertWarns(DeprecationWarning):
        trainable.transform(train_X)

    trainable.to_json()

    # Use the transformer as the first step of a classifier pipeline.
    pipeline = trainable >> float32_transform() >> LR()
    pipeline.fit(train_X, train_y).predict(test_X)

    # Tune that same pipeline with a single Hyperopt evaluation.
    optimizer = Hyperopt(estimator=pipeline, max_evals=1)
    optimizer.fit(train_X, train_y).predict(test_X)
def test_with_hyperopt(self):
    """Run Hyperopt on ``KMeans`` using a constant custom scorer and no labels."""
    from lale.lib.lale import Hyperopt

    def my_scorer(estimator, X, y=None):
        # Constant score: every trial is rated the same.
        return 1

    optimizer = Hyperopt(
        estimator=KMeans(n_clusters=3),
        max_evals=5,
        verbose=True,
        scoring=my_scorer,
    )
    fitted = optimizer.fit(self.X_train)
    fitted.predict(self.X_test)
def test_runtime_limit_zero_time_hoc(self):
    """With ``max_opt_time=0.0`` the classification search must yield no pipeline."""
    scalers = MinMaxScaler | Normalizer
    classifiers = LogisticRegression | KNeighborsClassifier
    search_space = scalers >> classifiers

    from sklearn.datasets import load_iris

    features, target = load_iris(return_X_y=True)

    optimizer = Hyperopt(
        estimator=search_space,
        max_evals=1,
        cv=3,
        scoring='accuracy',
        max_opt_time=0.0,
    )
    fitted = optimizer.fit(features, target)
    assert fitted.get_pipeline() is None
def test_feature_preprocessor(self):
    """Run the standard checks on the operator named by ``fproc_name``.

    ``fproc_name`` is a dotted ``module.Class`` path taken from the
    enclosing scope.  The operator is instantiated with defaults and then
    checked for schema validity, fit/transform, JSON export, use in a
    pipeline with ``LogisticRegression`` and tuning with ``Hyperopt``.
    """
    X_train, y_train = self.X_train, self.y_train
    import importlib

    # Split "pkg.mod.Class" into module path and class name.
    module_name, _, class_name = fproc_name.rpartition(".")
    operator_class = getattr(importlib.import_module(module_name), class_name)
    fproc = operator_class()

    from lale.lib.sklearn.one_hot_encoder import OneHotEncoder

    if isinstance(fproc, OneHotEncoder):  # type: ignore
        # HACK: OneHotEncoder is swapped for PCA here; the original author
        # intended OneHotEncoder(handle_unknown='ignore') once that is fixed.
        fproc = PCA()

    # test_schemas_are_schemas
    for schema_method in (
        "input_schema_fit",
        "input_schema_transform",
        "output_schema_transform",
        "hyperparam_schema",
    ):
        lale.type_checking.validate_is_schema(getattr(fproc, schema_method)())

    # test_init_fit_transform
    fitted = fproc.fit(self.X_train, self.y_train)
    fitted.transform(self.X_test)

    # test_predict_on_trainable
    fproc.fit(X_train, y_train)
    fproc.transform(X_train)

    # test_to_json
    fproc.to_json()

    # test_in_a_pipeline
    # This test assumes that the output of feature processing is compatible
    # with LogisticRegression.
    from lale.lib.sklearn import LogisticRegression

    pipeline = fproc >> LogisticRegression()
    pipeline.fit(self.X_train, self.y_train).predict(self.X_test)

    # Tune the pipeline with LR using Hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=pipeline, max_evals=1, verbose=True, cv=3)
    optimizer.fit(self.X_train, self.y_train).predict(self.X_test)
def test_with_hyperopt(self):
    """Run Hyperopt on ``IsolationForest`` with a constant scorer and no labels."""

    def my_scorer(estimator, X, y=None):
        # Constant score: every trial is rated the same.
        return 1

    from lale.lib.lale import Hyperopt

    forest = IsolationForest(max_features=1.0, max_samples=1.0)
    optimizer = Hyperopt(
        estimator=forest,
        max_evals=5,
        verbose=True,
        scoring=my_scorer,
    )
    fitted = optimizer.fit(self.X_train)
    fitted.predict(self.X_test)
def test_with_hyperopt(self):
    """Tune ``OrdinalEncoder() >> LogisticRegression()`` with one Hyperopt trial.

    The data comes from ``self.X_train`` / ``self.y_train`` / ``self.X_test``.
    The never-read local copies of the fixtures and the unused
    ``predictions`` binding have been dropped.
    """
    from lale.lib.sklearn import OrdinalEncoder

    fproc = OrdinalEncoder()
    from lale.lib.sklearn import LogisticRegression

    pipeline = fproc >> LogisticRegression()

    # Tune the pipeline with LR using Hyperopt
    from lale.lib.lale import Hyperopt

    hyperopt = Hyperopt(estimator=pipeline, max_evals=1)
    trained = hyperopt.fit(self.X_train, self.y_train)
    # The prediction is only exercised, not inspected.
    _ = trained.predict(self.X_test)
def test_with_hyperopt(self):
    """Tune an ``OrdinalEncoder(handle_unknown="ignore") >> LR`` pipeline once."""
    from lale.lib.sklearn import OrdinalEncoder

    encoder = OrdinalEncoder(handle_unknown="ignore")

    from lale.lib.sklearn import LogisticRegression

    encoded_lr = encoder >> LogisticRegression()

    # Tune the pipeline with LR using Hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=encoded_lr, max_evals=1)
    optimizer.fit(self.X_train, self.y_train).predict(self.X_test)
def test_preprocessing_union(self):
    """Tune a pipeline that preprocesses numeric and non-numeric columns apart.

    Uses the unpreprocessed OpenML ``credit-g`` data: numeric columns go
    through ``Normalizer``, the others through ``OneHotEncoder``, and both
    are concatenated before a random-forest classifier.
    """
    from lale.datasets import openml

    (train_X, train_y), (_test_X, _test_y) = openml.fetch(
        'credit-g', 'classification', preprocess=False
    )

    from lale.lib.lale import Project
    from lale.lib.sklearn import Normalizer, OneHotEncoder
    from lale.lib.lale import ConcatFeatures as Concat
    from lale.lib.sklearn import RandomForestClassifier as Forest

    # Column selection is expressed as a schema: "number" vs. "not number".
    numeric_branch = Project(columns={'type': 'number'}) >> Normalizer
    categorical_branch = (
        Project(columns={'not': {'type': 'number'}}) >> OneHotEncoder(sparse=False)
    )
    search_space = (numeric_branch & categorical_branch) >> Concat >> Forest

    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=search_space, max_evals=1)
    optimizer.fit(train_X, train_y)
def test_custom_scoring(self):
    """Hyperopt accepts a ``make_scorer`` object; predictions must agree.

    Predicting through the fitted result and through the optimizer itself
    is expected to give identical arrays.
    """
    from sklearn.metrics import f1_score, make_scorer

    estimator = LogisticRegression()
    optimizer = Hyperopt(
        estimator=estimator,
        scoring=make_scorer(f1_score, average="macro"),
        cv=5,
        max_evals=1,
    )
    fitted = optimizer.fit(self.X_train, self.y_train)

    via_fitted = fitted.predict(self.X_test)
    via_optimizer = optimizer.predict(self.X_test)
    assert np.array_equal(via_optimizer, via_fitted)
def test_regressor(self):
    """Run the standard checks on the regressor named by ``clf_name``.

    ``clf_name`` is a dotted ``module.Class`` path taken from the enclosing
    scope.  Stacking and voting regressors get a single ``SGDRegressor``
    base estimator; every other class is built with defaults.  Hyperopt is
    only exercised when the operator is a ``Ridge``.
    """
    X_train, y_train = self.X_train, self.y_train
    import importlib

    # Split "pkg.mod.Class" into module path and class name.
    module_name, _, class_name = clf_name.rpartition(".")
    regressor_class = getattr(importlib.import_module(module_name), class_name)

    # Ensemble regressors cannot be built without base estimators.
    needs_estimators = class_name in ["StackingRegressor", "VotingRegressor"]
    regr = (
        regressor_class(estimators=[("base", SGDRegressor())])
        if needs_estimators
        else regressor_class()
    )

    # test_schemas_are_schemas
    for schema_method in (
        "input_schema_fit",
        "input_schema_predict",
        "output_schema_predict",
        "hyperparam_schema",
    ):
        lale.type_checking.validate_is_schema(getattr(regr, schema_method)())

    # test_init_fit_predict
    fitted = regr.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)

    # test score
    fitted.score(self.X_test, self.y_test)

    # test_predict_on_trainable
    regr.fit(X_train, y_train)
    regr.predict(X_train)

    # test_to_json
    regr.to_json()

    # test_in_a_pipeline
    pipeline = NoOp() >> regr
    pipeline.fit(self.X_train, self.y_train).predict(self.X_test)

    # test_with_hyperopt
    from lale.lib.sklearn.ridge import Ridge

    if isinstance(regr, Ridge):  # type: ignore
        from lale.lib.lale import Hyperopt

        optimizer = Hyperopt(estimator=pipeline, max_evals=1)
        optimizer.fit(self.X_train, self.y_train).predict(self.X_test)
def test_J48_for_car_dataset(self):
    """Tune a one-step ``J48`` pipeline on the car dataset and print accuracy.

    Labels are integer-encoded with a ``LabelEncoder`` fitted on the
    training labels.  A stand-alone ``J48()`` instance that was assigned
    and immediately overwritten, and an unused ``NoOp`` import, have been
    removed.
    """
    from lalegpl.datasets.auto_weka import fetch_car

    (X_train, y_train), (X_test, y_test) = fetch_car()

    from sklearn.preprocessing import LabelEncoder

    # Fit the encoder on the training labels only, then reuse it for test.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    from sklearn.metrics import accuracy_score
    from lale.lib.lale import Hyperopt
    from lale.operators import make_pipeline

    clf = Hyperopt(estimator=make_pipeline(J48()), max_evals=1)
    trained_clf = clf.fit(X_train, y_train)
    print(accuracy_score(y_test, trained_clf.predict(X_test)))
def test_grammar_all_combinator(self):
    """Build a grammar using ``|``, ``>>`` and ``&``, then tune its expansions.

    Both the depth-7 unfolding and a depth-7 sample must be planned
    operators.  Each is then handed to ``Hyperopt``; a ``ValueError`` from
    the search is tolerated (no trial succeeded).
    """
    g = Grammar()
    g.start = g.estimator
    # Parentheses only spell out Python's precedence: ">>" and "&" bind
    # tighter than "|".
    g.estimator = g.term_est | (g.transformer >> g.term_est)
    g.term_est = g.prim_est | g.ensemble
    g.ensemble = Boost(base_estimator=LR)
    g.transformer = g.union_tfm | (g.union_tfm >> g.transformer)
    g.union_tfm = g.prim_tfm | (g.union_body >> Concat)
    g.union_body = g.transformer | (g.transformer & g.union_body)
    g.prim_est = LR | KNN
    g.prim_tfm = PCA | Scaler
    g.ensembler = Boost

    generated = g.unfold(7)
    sample = g.sample(7)
    assert isinstance(generated, PlannedOperator)
    assert isinstance(sample, PlannedOperator)

    # Train: first the full unfolding, then the random sample.
    for planned in (generated, sample):
        try:
            optimizer = Hyperopt(estimator=planned, max_evals=3, scoring="r2")
            fitted = optimizer.fit(self.train_X, self.train_y)
            assert isinstance(fitted.get_pipeline(), TrainedOperator)
        except ValueError:
            # None of the trials succeeded
            pass
def test_custom_scorer(self):
    """Hyperopt forwards ``args_to_scorer`` to a user-defined scorer.

    The scorer itself asserts that it received ``factor=0.5`` rather than
    its default, and predictions through the fitted result and through the
    optimizer must be identical.
    """
    from sklearn.metrics import f1_score, make_scorer

    pca_lr = PCA() >> LogisticRegression()

    def custom_scorer(estimator, X, y, factor=0.1):
        # This is a custom scorer for demonstrating the use of kwargs.
        # It just applies some factor to the accuracy.
        from sklearn.metrics import accuracy_score

        predicted = estimator.predict(X)
        self.assertEqual(factor, 0.5)
        return factor * accuracy_score(y, predicted)

    optimizer = Hyperopt(
        estimator=pca_lr,
        scoring=custom_scorer,
        cv=5,
        max_evals=1,
        args_to_scorer={'factor': 0.5},
    )
    fitted = optimizer.fit(self.X_train, self.y_train)

    via_fitted = fitted.predict(self.X_test)
    via_optimizer = optimizer.predict(self.X_test)
    assert np.array_equal(via_optimizer, via_fitted)
def test_decision_function_1(self):
    """The best pipeline found for ``IsolationForest`` has ``decision_function``."""

    def my_scorer(estimator, X, y=None):
        # Constant score: every trial is rated the same.
        return 1

    from lale.lib.lale import Hyperopt

    forest = IsolationForest(max_features=1.0, max_samples=1.0)
    optimizer = Hyperopt(
        estimator=forest,
        max_evals=5,
        verbose=True,
        scoring=my_scorer,
    )
    fitted = optimizer.fit(self.X_train)

    best = fitted.get_pipeline()
    assert best is not None
    best.decision_function(self.X_test)
def doTestPipeline(self, trainable_pipeline, train_X, train_y, test_X, test_y,
                   optimization=False):
    """Fit and predict with ``trainable_pipeline``; optionally tune it.

    Prints an adjusted SMAPE for the prediction, checks that predicting
    with the trainable pipeline raises a ``DeprecationWarning``, exports
    the pipeline to JSON and, when ``optimization`` is true, runs a
    two-evaluation ``Hyperopt`` search with ``TimeSeriesSplit``
    cross-validation.  ``test_y`` is accepted but not used.
    """

    def adjusted_smape(y_true, y_pred):
        """Return SMAPE in percent, or ``None`` when the lengths differ."""
        y_true = np.array(y_true).ravel()
        y_pred = np.array(y_pred).ravel()
        if len(y_true) != len(y_pred):
            print(
                "Size of Ground Truth and Predicted Values do not match!, returning None."
            )
            # Raising here might interfere with daub execution if one
            # pipeline fails, so the mismatch is only reported.
            return None
        numerator = 2.0 * np.abs(cast(float, y_true - y_pred))
        denominator = np.abs(y_true) + np.abs(y_pred)
        # Avoid dividing by (near) zero.
        denominator[denominator < 1e-12] = 1.0
        ratios = np.array(numerator / denominator, dtype=float)
        return np.nanmean(ratios) * 100.0

    fitted = trainable_pipeline.fit(train_X, train_y)
    predicted = fitted.predict(test_X[:-1])

    # The reference the prediction is compared against depends on the mode.
    reference = test_X[:-1] if optimization else test_X[-1]
    print(adjusted_smape(reference, predicted))

    with self.assertWarns(DeprecationWarning):
        trainable_pipeline.predict(train_X)

    trainable_pipeline.to_json()

    if not optimization:
        return

    optimizer = Hyperopt(
        estimator=trainable_pipeline,
        max_evals=2,
        verbose=True,
        cv=TimeSeriesSplit(),
        scoring=make_scorer(adjusted_smape),
    )
    optimizer.fit(train_X, train_y).predict(test_X)
def test_with_hyperopt(self):
    """Fit Hyperopt over a ``Relational`` step followed by ``LogisticRegression``.

    The relational part combines two ``Scan`` operators and a ``Map`` that
    rewrites the ``gender`` and ``state`` columns via ``replace``.
    """
    from sklearn.datasets import load_iris

    features, target = load_iris(return_X_y=True)

    gender_map = {"m": "Male", "f": "Female"}
    state_map = {"NY": "New York", "CA": "California"}
    replacements = [
        replace(it.gender, gender_map),
        replace(it.state, state_map),
    ]
    map_replace = Map(columns=replacements, remainder="drop")

    scans = Scan(table=it.main) & Scan(table=it.delay)
    relational = Relational(operator=scans >> map_replace)
    pipeline = relational >> LogisticRegression()

    optimizer = Hyperopt(estimator=pipeline, cv=3, max_evals=5)
    optimizer.fit(features, target)
def test_text_and_structured(self):
    """Tune a regressor over combined TF-IDF text and numeric features.

    Uses a 1% subsample of the drugs.com training data.  The ``review``
    column is vectorised with TF-IDF, numeric columns are passed through,
    and the concatenation feeds a choice of linear regression or random
    forest, scored with ``r2``.
    """
    from lale.datasets.uci.uci_datasets import fetch_drugscom
    from sklearn.model_selection import train_test_split

    all_train_X, all_train_y, _test_X, _test_y = fetch_drugscom()

    # Subset to speed up debugging.
    train_X, _rest_X, train_y, _rest_y = train_test_split(
        all_train_X, all_train_y, train_size=0.01, random_state=42
    )

    from lale.lib.lale import Project
    from lale.lib.lale import ConcatFeatures as Cat
    from lale.lib.sklearn import TfidfVectorizer as Tfidf
    from lale.lib.sklearn import LinearRegression as LinReg
    from lale.lib.sklearn import RandomForestRegressor as Forest

    text_branch = Project(columns=['review']) >> Tfidf(max_features=100)
    numeric_branch = Project(columns={'type': 'number'})
    search_space = (text_branch & numeric_branch) >> Cat >> (LinReg | Forest)

    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=search_space, max_evals=1, scoring='r2')
    optimizer.fit(train_X, train_y)
def test_with_concat_features1(self):
    """Tune a choice between a PCA/Nystroem feature union >> LR and plain KNN.

    The pipeline is fitted on iris with one Hyperopt evaluation and the
    training accuracy is printed.  Warnings are silenced only for the
    duration of the test.
    """
    import warnings

    # catch_warnings() restores the previous filter state on exit, including
    # when the test fails part-way.  The earlier filterwarnings("ignore") /
    # resetwarnings() pair left "ignore" installed on failure and, on
    # success, wiped every filter configured elsewhere.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        from sklearn.datasets import load_iris
        from lale.lib.lale import Hyperopt
        from sklearn.metrics import accuracy_score

        data = load_iris()
        X, y = data.data, data.target

        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        # Either the concatenated-features LR pipeline or a bare KNN.
        pipeline = ((pca & nys) >> concat >> lr) | KNeighborsClassifier()

        clf = Hyperopt(estimator=pipeline, max_evals=1)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
def test_regressor(self):
    """Run the standard checks on the regressor named by ``clf_name``.

    ``clf_name`` is a dotted ``module.Class`` path taken from the enclosing
    scope.  Hyperopt is exercised for every regressor whose implementation
    class is not ``RidgeImpl``.
    """
    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test
    import importlib

    # Split "pkg.mod.Class" into module path and class name.
    module_name, _, class_name = clf_name.rpartition(".")
    regressor_class = getattr(importlib.import_module(module_name), class_name)
    regr = regressor_class()

    # test_schemas_are_schemas
    for schema_method in (
        "input_schema_fit",
        "input_schema_predict",
        "output_schema_predict",
        "hyperparam_schema",
    ):
        lale.type_checking.validate_is_schema(getattr(regr, schema_method)())

    # test_init_fit_predict
    regr.fit(self.X_train, self.y_train).predict(self.X_test)

    # test_predict_on_trainable
    regr.fit(X_train, y_train)
    regr.predict(X_train)

    # test_to_json
    regr.to_json()

    # test_in_a_pipeline
    pipeline = NoOp() >> regr
    pipeline.fit(self.X_train, self.y_train).predict(self.X_test)

    # test_with_hyperopt
    from lale.lib.sklearn.ridge import RidgeImpl

    if regr._impl_class() != RidgeImpl:
        from lale.lib.lale import Hyperopt

        optimizer = Hyperopt(estimator=pipeline, max_evals=1)
        optimizer.fit(self.X_train, self.y_train).predict(self.X_test)
def test_runtime_limit_hor(self):
    """A regression search must not overrun ``max_opt_time`` by 20% or more."""
    import time

    search_space = (MinMaxScaler | Normalizer) >> LinearRegression

    # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm the
    # scikit-learn version this suite is pinned to.
    from sklearn.datasets import load_boston

    features, target = load_boston(return_X_y=True)

    max_opt_time = 3.0
    optimizer = Hyperopt(
        estimator=search_space,
        max_evals=1,
        cv=3,
        max_opt_time=max_opt_time,
        scoring='r2',
    )

    # Wall-clock time of the fit on the first 500 rows.
    began = time.time()
    optimizer.fit(features[:500, :], target[:500])
    opt_time = time.time() - began

    # Overshoot relative to the budget; negative when finishing early.
    rel_diff = (opt_time - max_opt_time) / max_opt_time
    assert rel_diff < 0.2, (
        'Max time: {}, Actual time: {}, relative diff: {}'.format(
            max_opt_time, opt_time, rel_diff))
def test_runtime_limit_hoc(self):
    """A classification search must not overrun ``max_opt_time`` by 50% or more."""
    import time

    scalers = MinMaxScaler | Normalizer
    classifiers = LogisticRegression | KNeighborsClassifier
    search_space = scalers >> classifiers

    from sklearn.datasets import load_iris

    features, target = load_iris(return_X_y=True)

    max_opt_time = 2.0
    optimizer = Hyperopt(
        estimator=search_space,
        max_evals=1,
        cv=3,
        scoring='accuracy',
        max_opt_time=max_opt_time,
    )

    # Wall-clock time of the fit.
    began = time.time()
    optimizer.fit(features, target)
    opt_time = time.time() - began

    # Overshoot relative to the budget; negative when finishing early.
    rel_diff = (opt_time - max_opt_time) / max_opt_time
    assert rel_diff < 0.5, (
        'Max time: {}, Actual time: {}, relative diff: {}'.format(
            max_opt_time, opt_time, rel_diff))
def _fit_hyperopt(self, X, y):
    """Run a Hyperopt search over a planned pipeline and record its trials.

    Returns immediately when the time budget (``self.max_opt_time`` minus
    the time elapsed since ``self._start_fit``) is exhausted, or when the
    search summary consists of a single trial with status ``"new"``.

    The search space is ``auto_prep(X)`` followed by a choice between a
    tree-based branch (optional dimensionality reduction, then a tree
    estimator) and a non-tree branch (scaling, optional dimensionality
    reduction, then SGD or k-NN).  Regressors or classifiers are picked
    according to ``self.prediction_type``.

    Side effects: may update ``self._name_of_best``, sets or extends
    ``self._summary``, and stores one pipeline per trial whose status is
    ``hyperopt.STATUS_OK`` in ``self._pipelines``.
    """
    from lale.lib.lale import Hyperopt, NoOp
    from lale.lib.sklearn import (
        PCA,
        DecisionTreeClassifier,
        DecisionTreeRegressor,
        KNeighborsClassifier,
        KNeighborsRegressor,
        MinMaxScaler,
        RandomForestClassifier,
        RandomForestRegressor,
        RobustScaler,
        SelectKBest,
        SGDClassifier,
        SGDRegressor,
        StandardScaler,
    )

    # Nothing to do once the overall time budget has been used up.
    remaining_time = self.max_opt_time - (time.time() - self._start_fit)
    if remaining_time <= 0:
        return
    # Building blocks of the search space; NoOp makes a step optional.
    prep = auto_prep(X)
    scale = MinMaxScaler | StandardScaler | RobustScaler | NoOp
    reduce_dims = PCA | SelectKBest | NoOp
    gbt = auto_gbt(self.prediction_type)
    if self.prediction_type == "regression":
        estim_trees = gbt | DecisionTreeRegressor | RandomForestRegressor
        estim_notree = SGDRegressor | KNeighborsRegressor
    else:
        estim_trees = gbt | DecisionTreeClassifier | RandomForestClassifier
        estim_notree = SGDClassifier | KNeighborsClassifier
    # Only the non-tree branch gets a scaling step.
    model_trees = reduce_dims >> estim_trees
    model_notree = scale >> reduce_dims >> estim_notree
    planned = prep >> (model_trees | model_notree)
    # Evaluations already recorded in the summary count against the budget.
    prior_evals = self._summary.shape[0] if self._summary is not None else 0
    trainable = Hyperopt(
        estimator=planned,
        max_evals=self.max_evals - prior_evals,
        scoring=self.scoring,
        best_score=self.best_score,
        max_opt_time=remaining_time,
        max_eval_time=self.max_eval_time,
        verbose=self.verbose,
        show_progressbar=False,
    )
    trained = trainable.fit(X, y)
    # The static types are not currently smart enough to verify
    # that the conditionally defined summary method is actually present
    # But it must be, since the hyperopt impl type provides it
    summary: pd.DataFrame = trained.summary()  # type: ignore
    if list(summary.status) == ["new"]:
        return  # only one trial and that one timed out
    best_trial = trained._impl._trials.best_trial
    if "loss" in best_trial["result"]:
        # With no earlier summary the new best trial wins by default;
        # otherwise it must beat the recorded best loss.
        if (self._summary is None or best_trial["result"]["loss"] < self._summary.at[self._name_of_best, "loss"]):
            self._name_of_best = f'p{best_trial["tid"]}'
    if self._summary is None:
        self._summary = summary
    else:
        self._summary = pd.concat([self._summary, summary])
    for name in summary.index:
        assert name not in self._pipelines
        if summary.at[name, "status"] == hyperopt.STATUS_OK:
            self._pipelines[name] = trained.get_pipeline(name)
def test_classifier(self):
    """Run the standard checks on the classifier named by ``clf_name``.

    ``clf_name`` is a dotted ``module.Class`` path taken from the enclosing
    scope.  Covers schema validity, fit/predict, Hyperopt, cross
    validation, grid search, predicting on the trainable operator, JSON
    export and use inside a pipeline.
    """
    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test
    import importlib

    # Split "pkg.mod.Class" into module path and class name.
    module_name, _, class_name = clf_name.rpartition('.')
    classifier_class = getattr(importlib.import_module(module_name), class_name)
    clf = classifier_class()

    # test_schemas_are_schemas
    for schema_method in (
        "input_schema_fit",
        "input_schema_predict",
        "output_schema_predict",
        "hyperparam_schema",
    ):
        lale.type_checking.validate_is_schema(getattr(clf, schema_method)())

    # test_init_fit_predict
    clf.fit(self.X_train, self.y_train).predict(self.X_test)

    # test_with_hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=clf, max_evals=1)
    optimizer.fit(self.X_train, self.y_train).predict(self.X_test)

    # test_cross_validation
    from lale.helpers import cross_val_score

    fold_scores = cross_val_score(clf, X_train, y_train, cv=2)
    self.assertEqual(len(fold_scores), 2)

    # test_with_gridsearchcv_auto_wrapped
    from sklearn.metrics import accuracy_score, make_scorer

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from lale.lib.sklearn.gradient_boosting_classifier import GradientBoostingClassifierImpl
        from lale.lib.sklearn.mlp_classifier import MLPClassifierImpl

        if clf._impl_class() == GradientBoostingClassifierImpl:
            # Exponential loss does not work with the iris dataset because
            # it is not a binary classification problem.
            import lale.schemas as schemas

            deviance_only = schemas.Enum(default='deviance', values=['deviance'])
            clf = clf.customize_schema(loss=deviance_only)
        searcher = lale.lib.lale.GridSearchCV(
            estimator=clf,
            lale_num_samples=1,
            lale_num_grids=1,
            cv=2,
            scoring=make_scorer(accuracy_score),
        )
        searcher.fit(X_train, y_train)

    # test_predict_on_trainable
    clf.fit(X_train, y_train)
    clf.predict(X_train)

    # test_to_json
    clf.to_json()

    # test_in_a_pipeline
    pipeline = NoOp() >> clf
    pipeline.fit(self.X_train, self.y_train).predict(self.X_test)
def test_with_hyperopt2(self):
    """Fit Hyperopt over an explicit relational pipeline graph plus a classifier.

    A ``Relational`` operator is built with ``make_pipeline_graph`` from
    scans, joins, maps and one aggregate whose outputs all feed
    ``ConcatFeatures() >> Map(string_indexer ...)``.  It is followed by a
    choice of ``KNeighborsClassifier`` or ``LogisticRegression`` and handed
    to ``Hyperopt`` with two evaluations on the iris data.

    NOTE(review): the graph refers to tables such as "main" and "customers"
    while the fit data is iris -- presumably only the optimizer plumbing is
    being exercised; confirm.
    """
    # These imports deliberately shadow the built-ins max/min/sum inside
    # this function with lale expression builders.
    from lale.expressions import (
        count,
        it,
        max,
        mean,
        min,
        string_indexer,
        sum,
        variance,
    )

    wrap_imported_operators()
    # main joined with customers: identity features.
    scan = Scan(table=it["main"])
    scan_0 = Scan(table=it["customers"])
    join = Join(pred=[(it["main"]["group_customer_id"] == it["customers"]["group_customer_id"])])
    map = Map(
        columns={
            "[main](group_customer_id)[customers]|number_children|identity": it["number_children"],
            "[main](group_customer_id)[customers]|name|identity": it["name"],
            "[main](group_customer_id)[customers]|income|identity": it["income"],
            "[main](group_customer_id)[customers]|address|identity": it["address"],
            "[main](group_customer_id)[customers]|age|identity": it["age"],
        },
        remainder="drop",
    )
    pipeline_4 = join >> map
    # main joined with purchase: aggregates grouped by row_id.
    scan_1 = Scan(table=it["purchase"])
    join_0 = Join(
        pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
        join_limit=50.0,
    )
    aggregate = Aggregate(
        columns={
            "[main](group_id)[purchase]|price|variance": variance(it["price"]),
            "[main](group_id)[purchase]|time|sum": sum(it["time"]),
            "[main](group_id)[purchase]|time|mean": mean(it["time"]),
            "[main](group_id)[purchase]|time|min": min(it["time"]),
            "[main](group_id)[purchase]|price|sum": sum(it["price"]),
            "[main](group_id)[purchase]|price|count": count(it["price"]),
            "[main](group_id)[purchase]|price|mean": mean(it["price"]),
            "[main](group_id)[purchase]|price|min": min(it["price"]),
            "[main](group_id)[purchase]|price|max": max(it["price"]),
            "[main](group_id)[purchase]|time|max": max(it["time"]),
            "[main](group_id)[purchase]|time|variance": variance(it["time"]),
        },
        group_by=it["row_id"],
    )
    pipeline_5 = join_0 >> aggregate
    # Columns taken from main itself.
    map_0 = Map(
        columns={
            "[main]|group_customer_id|identity": it["group_customer_id"],
            "[main]|transaction_id|identity": it["transaction_id"],
            "[main]|group_id|identity": it["group_id"],
            "[main]|comments|identity": it["comments"],
            "[main]|id|identity": it["id"],
            "prefix_0_id": it["prefix_0_id"],
            "next_purchase": it["next_purchase"],
            "[main]|time|identity": it["time"],
        },
        remainder="drop",
    )
    # main joined with transactions and products.
    scan_2 = Scan(table=it["transactions"])
    scan_3 = Scan(table=it["products"])
    join_1 = Join(pred=[
        (it["main"]["transaction_id"] == it["transactions"]["transaction_id"]),
        (it["transactions"]["product_id"] == it["products"]["product_id"]),
    ])
    map_1 = Map(
        columns={
            "[main](transaction_id)[transactions](product_id)[products]|price|identity": it["price"],
            "[main](transaction_id)[transactions](product_id)[products]|type|identity": it["type"],
        },
        remainder="drop",
    )
    pipeline_6 = join_1 >> map_1
    # main joined with transactions only.
    join_2 = Join(pred=[(it["main"]["transaction_id"] == it["transactions"]["transaction_id"])])
    map_2 = Map(
        columns={
            "[main](transaction_id)[transactions]|description|identity": it["description"],
            "[main](transaction_id)[transactions]|product_id|identity": it["product_id"],
        },
        remainder="drop",
    )
    pipeline_7 = join_2 >> map_2
    # Final step: concatenate all branches and index the string columns.
    map_3 = Map(columns=[
        string_indexer(it["[main]|comments|identity"]),
        string_indexer(
            it["[main](transaction_id)[transactions]|description|identity"]
        ),
        string_indexer(it[
            "[main](transaction_id)[transactions](product_id)[products]|type|identity"]
        ),
        string_indexer(
            it["[main](group_customer_id)[customers]|name|identity"]),
        string_indexer(
            it["[main](group_customer_id)[customers]|address|identity"]),
    ])
    pipeline_8 = ConcatFeatures() >> map_3
    # Each edge is (source step, destination step); every branch ends in
    # pipeline_8.
    relational = Relational(operator=make_pipeline_graph(
        steps=[
            scan,
            scan_0,
            pipeline_4,
            scan_1,
            pipeline_5,
            map_0,
            scan_2,
            scan_3,
            pipeline_6,
            pipeline_7,
            pipeline_8,
        ],
        edges=[
            (scan, pipeline_4),
            (scan, pipeline_5),
            (scan, map_0),
            (scan, pipeline_6),
            (scan, pipeline_7),
            (scan_0, pipeline_4),
            (pipeline_4, pipeline_8),
            (scan_1, pipeline_5),
            (pipeline_5, pipeline_8),
            (map_0, pipeline_8),
            (scan_2, pipeline_6),
            (scan_2, pipeline_7),
            (scan_3, pipeline_6),
            (pipeline_6, pipeline_8),
            (pipeline_7, pipeline_8),
        ],
    ))
    pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    from lale.lib.lale import Hyperopt
    opt = Hyperopt(estimator=pipeline, max_evals=2)
    opt.fit(X, y)
def test_resampler(self):
    """Run the standard checks on the resampler named by ``res_name``.

    ``res_name`` is a dotted ``module.Class`` path taken from the enclosing
    scope.  The resampler wraps an inner ``operator``; constructing it
    without one must fail schema validation.  The test then covers schema
    validity, fit/predict in several pipeline shapes, Hyperopt, cross
    validation and JSON export.
    """
    from lale.lib.sklearn import PCA, LogisticRegression

    X_train, y_train = self.X_train, self.y_train
    X_test = self.X_test
    import importlib

    # Split "pkg.mod.Class" into module path and class name.
    module_name, _, class_name = res_name.rpartition(".")
    class_ = getattr(importlib.import_module(module_name), class_name)

    # Instantiating without the required operator argument must be rejected.
    with EnableSchemaValidation(), self.assertRaises(ValidationError):
        _ = class_()

    # test_schemas_are_schemas
    for schema_method in (
        "input_schema_fit",
        "input_schema_predict",
        "output_schema_predict",
        "hyperparam_schema",
    ):
        lale.type_checking.validate_is_schema(getattr(class_, schema_method)())

    # test_init_fit_predict
    from lale.operators import make_pipeline

    pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
    pipeline1.fit(X_train, y_train).predict(X_test)

    pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
    pipeline2.fit(X_train, y_train).predict(X_test)

    # test_with_hyperopt
    from lale.lib.lale import Hyperopt

    # Resampler as the last step after a planned PCA.
    optimizer = Hyperopt(
        estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())),
        max_evals=1,
        show_progressbar=False,
    )
    optimizer.fit(X_train, y_train).predict(X_test)

    # Resampler wrapping a whole pipeline that contains a feature union.
    pipeline3 = class_(
        operator=PCA() >> (Nystroem & NoOp) >> ConcatFeatures >> LogisticRegression()
    )
    optimizer = Hyperopt(estimator=pipeline3, max_evals=1, show_progressbar=False)
    optimizer.fit(X_train, y_train).predict(X_test)

    # Resamplers inside both branches of a feature union.
    resampled_after_pca = PCA >> class_(operator=make_pipeline(Nystroem()))
    resampled_only = class_(operator=make_pipeline(Nystroem()))
    pipeline4 = (
        (resampled_after_pca & resampled_only)
        >> ConcatFeatures
        >> LogisticRegression()
    )
    optimizer = Hyperopt(
        estimator=pipeline4, max_evals=1, scoring="roc_auc", show_progressbar=False
    )
    optimizer.fit(X_train, y_train).predict(X_test)

    # test_cross_validation
    from lale.helpers import cross_val_score

    fold_scores = cross_val_score(pipeline1, X_train, y_train, cv=2)
    self.assertEqual(len(fold_scores), 2)

    # test_to_json
    pipeline1.to_json()