def train_heart_disease(**kwargs):
    from hypernets.tabular.datasets import dsutils
    from sklearn.model_selection import train_test_split

    X = dsutils.load_heart_disease_uci()
    y = X.pop('target')

    # hold out 30% for final testing, then 30% of the remainder for evaluation
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=randint())
    X_train, X_eval, y_train, y_eval = \
        train_test_split(X_train, y_train, test_size=0.3, random_state=randint())

    kwargs = {'reward_metric': 'auc', 'max_trials': 10, **kwargs}
    hm, model = train(X_train, y_train, X_eval, y_eval, const.TASK_BINARY, **kwargs)

    print('-' * 50)
    scores = model.evaluate(X_test, y_test,
                            metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'])
    print('scores:', scores)

    # report the best trials with their rewards and estimator configurations
    trials = hm.get_top_trials(10)
    models = [hm.load_estimator(t.model_file) for t in trials]
    msgs = [f'{t.trial_no},{t.reward},{m.cls.__name__} {m.model_args}'
            for t, m in zip(trials, models)]
    print('top trials:')
    print('\n'.join(msgs))
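# Usage sketch: assuming this function's module already imports `train`,
# `const`, and `randint` from Hypernets, it can be run directly; the
# `max_trials` override below is illustrative, not part of the original.
if __name__ == '__main__':
    train_heart_disease(max_trials=5)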
def detect(self, X, method=None):
    X_shape = X.shape
    sample_limit = cfg.multi_collinearity_sample_limit
    if X_shape[0] > sample_limit:
        logger.info(f'{X_shape[0]} rows data found, sample to {sample_limit}')
        frac = sample_limit / X_shape[0]
        from . import get_tool_box
        X, _ = get_tool_box(X).train_test_split(X, train_size=frac, random_state=randint())

    # drop constant columns: they carry no information and distort the correlation matrix
    n_values = self._value_counts(X)
    one_values = [n.name for n in n_values if len(n) <= 1]
    if len(one_values) > 0:
        X = X[[c for c in X.columns if c not in one_values]]

    logger.info('computing correlation')
    corr = self._corr(X, method)

    logger.info('computing cluster')
    # cluster features hierarchically on their correlation profiles,
    # then keep the first feature of each cluster as its representative
    corr_linkage = hierarchy.ward(corr)
    cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
    cluster_id_to_feature_ids = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append(idx)

    selected = [X.columns[v[0]] for v in cluster_id_to_feature_ids.values()]
    unselected = list(set(X.columns.to_list()) - set(selected)) + one_values
    feature_clusters = [[X.columns[i] for i in v]
                        for v in cluster_id_to_feature_ids.values()]
    return feature_clusters, selected, unselected
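# A minimal standalone sketch of the same idea (ward-linkage clustering on a
# feature correlation matrix, keeping one representative per cluster); the
# function name, threshold, and toy data are illustrative, not the class's API:
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.cluster import hierarchy

def select_uncorrelated(df, threshold=1.0):
    corr = df.corr().to_numpy()
    linkage = hierarchy.ward(corr)
    cluster_ids = hierarchy.fcluster(linkage, threshold, criterion='distance')
    clusters = defaultdict(list)
    for idx, cid in enumerate(cluster_ids):
        clusters[cid].append(df.columns[idx])
    return [cols[0] for cols in clusters.values()]

rng = np.random.default_rng(0)
a = rng.normal(size=200)
df = pd.DataFrame({'a': a,
                   'b': a * 2 + 0.01 * rng.normal(size=200),
                   'c': rng.normal(size=200)})
print(select_uncorrelated(df))  # 'a' and 'b' collapse into one cluster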
def dtr(self):
    return dict(
        cls=DecisionTreeRegressor,
        splitter=Choice(["best", "random"]),
        max_depth=Choice([None, 3, 5, 10, 20, 50]),
        random_state=randint(),
    )
def dt(self):
    return dict(
        cls=DecisionTreeClassifier,
        criterion=Choice(["gini", "entropy"]),
        splitter=Choice(["best", "random"]),
        max_depth=Choice([None, 3, 5, 10, 20, 50]),
        random_state=randint(),
    )
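# Sketch of how these search-space dicts might be consumed: pop `cls`, draw a
# value from each Choice, and instantiate the estimator. The `sample_estimator`
# helper is illustrative, and `Choice.options` is assumed to expose the
# candidate values; this is not the framework's real sampling path.
import random

def sample_estimator(space):
    space = dict(space)
    cls = space.pop('cls')
    params = {k: (random.choice(v.options) if isinstance(v, Choice) else v)
              for k, v in space.items()}
    return cls(**params)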
def __init__(self, preprocessor=None, estimator=None, random_state=None):
    self.preprocessor = preprocessor
    self.estimator_ = estimator
    self.random_state = random_state if random_state is not None else randint()

    self.auc_ = None
    self.feature_names_ = None
    self.feature_importances_ = None
    self.fitted = False
def prepare_data(self):
    if self.task == const.TASK_BINARY:
        X, y = make_classification(n_samples=self.n_samples,
                                   n_features=self.n_features,
                                   n_classes=2, random_state=randint())
    elif self.task == const.TASK_MULTICLASS:
        # make_classification requires n_classes * n_clusters_per_class <= 2**n_informative,
        # so 5 classes need n_informative >= 4 (the default of 2 raises a ValueError)
        X, y = make_classification(n_samples=self.n_samples,
                                   n_features=self.n_features,
                                   n_classes=5, n_informative=4,
                                   random_state=randint())
    else:
        X, y = make_regression(n_samples=self.n_samples,
                               n_features=self.n_features,
                               random_state=randint())
    X = pd.DataFrame(X, columns=[f'c{i}' for i in range(X.shape[1])])
    return X, y
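# Quick standalone check of the same pattern (sizes and seed are illustrative):
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=8, n_classes=2, random_state=0)
X = pd.DataFrame(X, columns=[f'c{i}' for i in range(X.shape[1])])
print(X.shape, set(y))  # (100, 8) {0, 1}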
def nn(self):
    solver = Choice(['lbfgs', 'sgd', 'adam'])
    return dict(
        cls=MLPClassifier,
        max_iter=Int(500, 5000, step=500),
        activation=Choice(['identity', 'logistic', 'tanh', 'relu']),
        solver=solver,
        learning_rate=Choice(['constant', 'invscaling', 'adaptive']),
        # learning_rate_init only applies to some solvers, so it cascades on the sampled solver
        learning_rate_init_stub=Cascade(partial(self._cascade, self._nn_learning_rate_init, 'slvr'),
                                        slvr=solver),
        random_state=randint(),
    )
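# The Cascade above makes learning_rate_init conditional on the sampled
# solver; sklearn's MLPClassifier only uses learning_rate_init with 'sgd'
# and 'adam'. In plain Python the dependency is roughly this (the helper
# name and value are illustrative, not the original `_nn_learning_rate_init`):
def pick_learning_rate_init(solver_value):
    if solver_value in ('sgd', 'adam'):
        return {'learning_rate_init': 0.001}
    return {}  # 'lbfgs' ignores the parameter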
def default_gbm(task_):
    est_cls = lightgbm.LGBMRegressor if task_ == const.TASK_REGRESSION else lightgbm.LGBMClassifier
    return est_cls(n_estimators=50,
                   num_leaves=15,
                   max_depth=5,
                   subsample=0.5,
                   subsample_freq=1,
                   colsample_bytree=0.8,
                   reg_alpha=1,
                   reg_lambda=1,
                   importance_type='gain',
                   random_state=randint(),
                   verbose=-1)
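# Usage sketch: the factory returns a plain scikit-learn-compatible LightGBM
# estimator (this assumes the module-level `const` and `randint` are in place;
# any task constant other than TASK_REGRESSION yields a classifier):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
gbm = default_gbm(const.TASK_BINARY)
gbm.fit(X, y)
print(gbm.score(X, y))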
def lr(self):
    # geometric max_iter schedule, each step ~25% larger and rounded to hundreds:
    # [1000, 1200, 1500, 1900, 2400, 3000, 3800, 4800, 6000, 7500, 9400]
    iters = [1000]
    while iters[-1] < 9000:
        iters.append(int(round(iters[-1] * 1.25, -2)))

    solver = Choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    # penalty depends on the sampled solver, and l1_ratio in turn on the penalty
    penalty = Cascade(partial(self._cascade, self._lr_penalty_fn, 'slvr'), slvr=solver)
    l1_ratio = Cascade(partial(self._cascade, self._lr_l1_ratio, 'penalty'), penalty=penalty)

    return dict(
        cls=LogisticRegression,
        max_iter=Choice(iters),
        solver=solver,
        penalty_stub=penalty,
        l1_ratio_stub=l1_ratio,
        random_state=randint(),
    )
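# The cascades mirror sklearn's documented solver/penalty compatibility:
# newton-cg, lbfgs and sag support only 'l2'; liblinear supports 'l1'/'l2';
# saga additionally supports 'elasticnet', and l1_ratio is meaningful only
# when penalty == 'elasticnet'. The helper below is an illustrative
# reconstruction of that rule, not the original `_lr_penalty_fn`:
def lr_penalty_options(slvr):
    if slvr in ('newton-cg', 'lbfgs', 'sag'):
        return ['l2']
    if slvr == 'liblinear':
        return ['l1', 'l2']
    return ['l1', 'l2', 'elasticnet']  # saga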
def __init__(self, remove_shift_variable=True, variable_shift_threshold=0.7,
             variable_shift_scorer=None, auc_threshold=0.55, min_features=10,
             remove_size=0.1, sample_balance=True, max_test_samples=None,
             cv=5, random_state=None, callbacks=None):
    self.remove_shift_variable = remove_shift_variable
    self.variable_shift_threshold = variable_shift_threshold
    self.variable_shift_scorer = variable_shift_scorer
    self.auc_threshold = auc_threshold
    self.min_features = min_features
    self.remove_size = remove_size
    self.sample_balance = sample_balance
    self.max_test_samples = max_test_samples
    self.cv = cv
    self.random_state = random_state if random_state is not None else randint()
    self.callbacks = callbacks
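# The parameters above describe an adversarial-validation loop: label train
# rows 0 and test rows 1, fit a classifier to tell them apart, and treat
# cross-validated AUC above `auc_threshold` as evidence of covariate shift.
# A minimal sketch of that core idea (function name and estimator choice are
# illustrative, not the class's implementation):
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def drift_auc(X_train, X_test, cv=5):
    X = pd.concat([X_train, X_test], ignore_index=True)
    y = np.r_[np.zeros(len(X_train)), np.ones(len(X_test))]
    clf = RandomForestClassifier(min_samples_leaf=20, random_state=0)
    return cross_val_score(clf, X, y, cv=cv, scoring='roc_auc').mean()
# AUC near 0.5 means train and test are indistinguishable; well above the
# default auc_threshold of 0.55 flags drifting features for removal.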
def default_rf(task_):
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    est_cls = RandomForestRegressor if task_ == const.TASK_REGRESSION else RandomForestClassifier
    return est_cls(min_samples_leaf=20, min_impurity_decrease=0.01, random_state=randint())
def default_dt(task_):
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    est_cls = DecisionTreeRegressor if task_ == const.TASK_REGRESSION else DecisionTreeClassifier
    return est_cls(min_samples_leaf=20, min_impurity_decrease=0.01, random_state=randint())
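# Usage sketch for the default factories (assumes the same `const` and
# `randint` imports as the snippets above; data and sizes are illustrative):
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
for factory in (default_rf, default_dt):
    est = factory(const.TASK_REGRESSION)
    print(type(est).__name__, est.fit(X, y).score(X, y))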