def fit_forest(X_train, y_train, n_estimators=10, dirichlet=0.5, step=1.0):
    """Create a ``ForestClassifier`` and fit it on the given training data.

    Parameters
    ----------
    X_train
        Training inputs, handed unchanged to ``ForestClassifier.fit``.

    y_train
        Training labels, handed unchanged to ``ForestClassifier.fit``.

    n_estimators : default=10
        Forwarded to ``ForestClassifier`` as ``n_estimators``.

    dirichlet : default=0.5
        Forwarded to ``ForestClassifier`` as ``dirichlet``.

    step : default=1.0
        Forwarded to ``ForestClassifier`` as ``step``.

    Returns
    -------
    output : ForestClassifier
        The classifier, after ``fit`` has been called on it.

    Notes
    -----
    ``random_state`` is not a parameter of this function: the name is resolved
    from the enclosing (module) scope at call time. ``min_samples_split`` is
    fixed to 2 and ``n_jobs`` to 1.
    """
    # The options are gathered first so that ``random_state`` is looked up
    # before the classifier is constructed.
    forest_options = dict(
        n_estimators=n_estimators,
        min_samples_split=2,
        random_state=random_state,
        n_jobs=1,
        step=step,
        dirichlet=dirichlet,
    )
    forest = ForestClassifier(**forest_options)
    forest.fit(X_train, y_train)
    return forest
def fit(
    self,
    params,
    X_train,
    y_train,
    Xy_val,
    sample_weight,
    n_estimators=None,
    seed=None,
):
    """Fit a ``ForestClassifier`` built from ``params`` on the training data.

    Parameters
    ----------
    params : dict
        Keyword arguments for the ``ForestClassifier`` constructor. This
        mapping is NOT modified: overrides are applied to a copy.

    X_train
        Training inputs, passed to ``ForestClassifier.fit``.

    y_train
        Training labels, passed to ``ForestClassifier.fit``.

    Xy_val
        Not used by this implementation.

    sample_weight
        Passed to ``ForestClassifier.fit`` as ``sample_weight``.

    n_estimators : optional
        When not ``None``, overrides ``params["n_estimators"]``.

    seed : optional
        When not ``None``, overrides ``params["random_state"]``.

    Returns
    -------
    output : tuple
        ``(clf, None)`` where ``clf`` is the fitted classifier.

    Notes
    -----
    ``n_jobs=-1`` is always passed to the constructor, so ``params`` must not
    contain an ``"n_jobs"`` key (Python raises ``TypeError`` on the duplicate
    keyword argument).
    """
    # Work on a shallow copy so that per-call overrides (seed, n_estimators)
    # do not leak into the caller's dict and into later calls reusing it.
    clf_params = dict(params)
    if seed is not None:
        clf_params["random_state"] = seed
    if n_estimators is not None:
        clf_params["n_estimators"] = n_estimators

    clf = ForestClassifier(**clf_params, n_jobs=-1)
    clf.fit(
        X_train,
        y_train,
        sample_weight=sample_weight,
        categorical_features=self.categorical_features,
    )
    return clf, None
def fit_forest(
    X_train,
    y_train,
    aggregation=True,
    n_estimators=10,
    dirichlet=0.5,
    step=1.0,
    min_samples_split=2,
    n_jobs=1,
):
    """Create a ``ForestClassifier`` and fit it on the given training data.

    Parameters
    ----------
    X_train
        Training inputs, handed unchanged to ``ForestClassifier.fit``.

    y_train
        Training labels, handed unchanged to ``ForestClassifier.fit``.

    aggregation : default=True
        Forwarded to ``ForestClassifier`` as ``aggregation``.

    n_estimators : default=10
        Forwarded to ``ForestClassifier`` as ``n_estimators``.

    dirichlet : default=0.5
        Forwarded to ``ForestClassifier`` as ``dirichlet``.

    step : default=1.0
        Forwarded to ``ForestClassifier`` as ``step``.

    min_samples_split : default=2
        Forwarded to ``ForestClassifier`` as ``min_samples_split``.

    n_jobs : default=1
        Forwarded to ``ForestClassifier`` as ``n_jobs``.

    Returns
    -------
    output : ForestClassifier
        The classifier, after ``fit`` has been called on it.

    Notes
    -----
    ``random_state`` is not a parameter of this function: the name is resolved
    from the enclosing (module) scope at call time. ``max_features`` is fixed
    to 2.
    """
    # The options are gathered first so that ``random_state`` is looked up
    # before the classifier is constructed.
    forest_options = dict(
        n_estimators=n_estimators,
        aggregation=aggregation,
        min_samples_split=min_samples_split,
        random_state=random_state,
        n_jobs=n_jobs,
        step=step,
        dirichlet=dirichlet,
        max_features=2,
    )
    forest = ForestClassifier(**forest_options)
    forest.fit(X_train, y_train)
    return forest
def launch_run(*, run_config, experiment_id):
    """Train and evaluate a ``ForestClassifier`` on one dataset inside an
    mlflow run, logging its parameters, metrics and timings.

    Parameters
    ----------
    run_config : dict
        The configuration of the run. Must contain the keys ``"dataset"``
        (a key of ``loader_from_name``) and ``"dataset_random_state"``
        (used for the train/test split). Every key starting with
        ``"wildwood_"`` is passed to ``ForestClassifier`` with that prefix
        removed.

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run: the result of ``compute_metrics``
        plus ``fit_time``, ``predict_train_time`` and ``predict_test_time``
        (the same dict that is logged to mlflow).
    """
    # Only strip the *leading* prefix: ``str.replace`` would also remove any
    # later occurrence of "wildwood_" inside the key, and matching on
    # "wildwood" without the underscore would let unrelated keys through.
    prefix = "wildwood_"
    wildwood_kwargs = {
        key[len(prefix):]: val
        for key, val in run_config.items()
        if key.startswith(prefix)
    }

    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    loader = loader_from_name[dataset_name]

    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task

    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y
    )

    # The encoder is fitted on the training labels only and then applied to
    # the test labels.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    kwargs_one_tree = wildwood_kwargs.copy()
    kwargs_one_tree["n_estimators"] = 1

    # Fit a single tree on the full dataset to force pre-compilation (doing so
    # on a subset often fails), so that it is not counted in the timed fit.
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)

    # Instantiate again just to be sure
    clf = ForestClassifier(**wildwood_kwargs)

    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        # TODO: include computations with and without categorical features ?
        tic = time()
        clf.fit(X_train, y_train)
        toc = time()
        fit_time = toc - tic
        # NOTE(review): ``filename`` is not defined in this function;
        # presumably a module-level name -- confirm it exists.
        logging.info(f"Fitted for experiment {filename} in {fit_time}s")

        # Predict and timing
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic

        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic

        logging.info(
            f"Predicted for experiment {filename} on train in {predict_train_time}s and test in {predict_test_time}s"
        )

        # Hard predictions are computed outside the timed sections.
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )

        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )
        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )

        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)

    # The docstring promises the metrics of the run; previously nothing was
    # returned.
    return mlflow_metrics
"random_state": random_state, "n_jobs": -1, "dirichlet": 1e-5, "step": 2.0, "aggregation": True } clf = ForestClassifier(**clf_kwargs) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) tic = time() clf.fit(X_train, y_train) toc = time() print("time to fit: ", toc - tic) tic = time() y_scores = clf.predict_proba(X_test) toc = time() print("time to predict_proba: ", toc - tic) tic = time() y_pred = clf.predict(X_test) toc = time() print("time to predict: ", toc - tic) cm = confusion_matrix(y_test, y_pred) acc = accuracy_score(y_test, y_pred)