def launch_run(*, run_config, experiment_id):
    """Fit and evaluate one forest on a dataset and log the run to mlflow.

    The data is split with ``train_test_split`` (stratified on the labels),
    a single-tree forest is fitted first as a warm-up, then the configured
    forest is fitted and evaluated inside an mlflow run.

    Parameters
    ----------
    run_config : dict
        The configuration of the run. The keys ``"dataset"`` and
        ``"dataset_random_state"`` are required. Every key starting with
        ``"wildwood_"`` is forwarded to ``ForestClassifier`` with that prefix
        removed.

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run, i.e. exactly what is logged to
        mlflow: the output of ``compute_metrics`` plus ``fit_time``,
        ``predict_train_time`` and ``predict_test_time``.
    """
    # Match the exact "wildwood_" prefix and strip it by slicing:
    # str.replace would also remove later occurrences inside the key, and a
    # bare "wildwood" test would let through keys that carry no prefix at all.
    prefix = "wildwood_"
    wildwood_kwargs = {
        key[len(prefix):]: val
        for key, val in run_config.items()
        if key.startswith(prefix)
    }

    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    loader = loader_from_name[dataset_name]

    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task

    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y
    )

    # The encoder is fitted on the training labels only and reused as-is on
    # the test labels.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    kwargs_one_tree = wildwood_kwargs.copy()
    kwargs_one_tree["n_estimators"] = 1
    # Fit a single tree on the whole training split to force pre-compilation
    # (doing so on a subset often fails), so that the timings measured below
    # do not include it.
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)

    # Instantiate again just to be sure
    clf = ForestClassifier(**wildwood_kwargs)

    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        tic = time()
        # TODO: include computations with and without categorical features ?
        clf.fit(X_train, y_train)
        toc = time()
        fit_time = toc - tic
        # NOTE(review): `filename` is not defined in this function; it is
        # presumably a module-level name -- confirm it exists.
        logging.info(f"Fitted for experiment {filename} in {fit_time}s")

        # Predict and timing
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic

        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic

        logging.info(
            f"Predicted for experiment {filename} on train in {predict_train_time}s and test in {predict_test_time}s"
        )

        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )

        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )

        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )

        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)

    # The docstring has always advertised a metrics dict; actually return it.
    return mlflow_metrics
# Pin the current panel to the extent of the mesh and hide its tick marks.
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

# Right-hand panel of the 1x2 figure: the classifier is fitted here and its
# predictions are drawn over the mesh.
ax = plt.subplot(1, 2, 2)
clf.fit(X_train, y_train)

# One row per mesh node: column 0 holds the x coordinate, column 1 the y one.
truc = np.empty((xx.size, 2))
truc[:, 0], truc[:, 1] = xx.ravel(), yy.ravel()

# Column 1 of predict_proba, folded back to the mesh shape for contouring.
# (Alternatives left disabled by the author: per-tree probabilities through
# predict_proba_trees, and a ROC AUC score on the test split.)
Z = clf.predict_proba(truc)[:, 1].reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)

# Same framing as the previous panel.
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

plt.tight_layout()
"aggregation": True }

# Build the forest from the keyword arguments collected in `clf_kwargs`
# (the dict literal that closes just above).
clf = ForestClassifier(**clf_kwargs)

# Hold out part of the data for evaluation (test_size=0.3) with a fixed seed.
# Presumably scikit-learn's train_test_split -- confirm against the imports.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Wall-clock time of the fit.
tic = time()
clf.fit(X_train, y_train)
toc = time()
print("time to fit: ", toc - tic)

# Wall-clock time of predict_proba on the held-out split.
tic = time()
y_scores = clf.predict_proba(X_test)
toc = time()
print("time to predict_proba: ", toc - tic)

# Wall-clock time of predict on the held-out split.
tic = time()
y_pred = clf.predict(X_test)
toc = time()
print("time to predict: ", toc - tic)

# Confusion matrix and accuracy of the predictions on the held-out split.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(cm)
print(acc)
random_state=data_random_state)

# Number of trees, shared by the two forests built below.
n_estimators = 100

# First forest: the categorical columns reported by the dataset object are
# passed to the estimator through `categorical_features`.
clf = ForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
    aggregation=False,
    max_features=None,
    categorical_features=dataset.categorical_features_,
    n_jobs=1,
    class_weight="balanced",
    criterion="entropy",
)
clf.fit(X_train, y_train)
y_scores_train = clf.predict_proba(X_train)
y_scores_test = clf.predict_proba(X_test)
# Average precision is computed from column 1 of the predicted probabilities
# (assumes a binary problem whose positive class sits at index 1 -- TODO confirm).
avg_prec_train = average_precision_score(y_train, y_scores_train[:, 1])
avg_prec_test = average_precision_score(y_test, y_scores_test[:, 1])
print("Categorical")
print("AP(train):", avg_prec_train, "AP(test):", avg_prec_test)

# Second forest: same settings, but `categorical_features` is left commented
# out so the estimator is not told which columns are categorical.
clf = ForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
    aggregation=False,
    max_features=None,
    # categorical_features=dataset.categorical_features_,
    criterion="entropy",
    n_jobs=1,
    class_weight="balanced",