def train(self, importance_cutoff=0.15):
    classes, self.class_names = self.get_labels()
    self.class_names = sort_class_names(self.class_names)

    # Get items and labels, filtering out those for which we have no labels.
    X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

    # Extract features from the items.
    X = self.extraction_pipeline.fit_transform([item for item in X_iter])

    # Calculate labels.
    y = np.array(y_iter)

    print(f"X: {X.shape}, y: {y.shape}")

    is_multilabel = isinstance(y[0], np.ndarray)

    # Split dataset in training and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0
    )

    if self.sampler is not None:
        pipeline = make_pipeline(self.sampler, self.clf)
    else:
        pipeline = self.clf

    tracking_metrics = {}

    # Use k-fold cross validation to evaluate results.
    if self.cross_validation_enabled:
        scorings = ["accuracy"]
        if len(self.class_names) == 2:
            scorings += ["precision", "recall"]

        scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

        print("Cross Validation scores:")
        for scoring in scorings:
            score = scores[f"test_{scoring}"]
            tracking_metrics[f"test_{scoring}"] = {
                "mean": score.mean(),
                "std": score.std() * 2,
            }
            print(f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})")

    # Train on the resampled dataset if a sampler is provided.
    if self.sampler is not None:
        X_train, y_train = self.sampler.fit_resample(X_train, y_train)

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    self.clf.fit(X_train, y_train)

    feature_names = self.get_human_readable_feature_names()
    if self.calculate_importance and len(feature_names):
        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X_train)

        shap.summary_plot(
            shap_values,
            X_train.toarray(),
            feature_names=feature_names,
            class_names=self.class_names,
            plot_type="layered_violin" if not isinstance(shap_values, list) else None,
            show=False,
        )

        matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )

        self.print_feature_importances(important_features, feature_names)

        # Save the important features in the metric report too.
        feature_report = self.save_feature_importances(
            important_features, feature_names
        )

        tracking_metrics["feature_report"] = feature_report

    print("Test Set scores:")
    # Evaluate results on the test set.
    y_pred = self.clf.predict(X_test)

    if is_multilabel:
        assert isinstance(
            y_pred[0], np.ndarray
        ), "The predictions should be multilabel"

    print(f"No confidence threshold - {len(y_test)} classified")
    if is_multilabel:
        confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
    else:
        confusion_matrix = metrics.confusion_matrix(
            y_test, y_pred, labels=self.class_names
        )

    print(
        classification_report_imbalanced(y_test, y_pred, labels=self.class_names)
    )
    report = classification_report_imbalanced_values(
        y_test, y_pred, labels=self.class_names
    )

    tracking_metrics["report"] = report

    print_labeled_confusion_matrix(
        confusion_matrix, self.class_names, is_multilabel=is_multilabel
    )

    tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

    # Evaluate results on the test set for some confidence thresholds.
    for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
        y_pred_probas = self.clf.predict_proba(X_test)

        confidence_class_names = self.class_names + ["__NOT_CLASSIFIED__"]

        y_pred_filter = []
        classified_indices = []
        for i in range(0, len(y_test)):
            argmax = np.argmax(y_pred_probas[i])
            if y_pred_probas[i][argmax] < confidence_threshold:
                if not is_multilabel:
                    y_pred_filter.append("__NOT_CLASSIFIED__")
                continue

            classified_indices.append(i)
            if is_multilabel:
                y_pred_filter.append(y_pred[i])
            else:
                y_pred_filter.append(argmax)

        if not is_multilabel:
            y_pred_filter = np.array(y_pred_filter)
            y_pred_filter[classified_indices] = self.le.inverse_transform(
                np.array(y_pred_filter[classified_indices], dtype=int)
            )

        print(
            f"\nConfidence threshold > {confidence_threshold} - {len(y_test)} classified"
        )
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(
                y_test[classified_indices], np.asarray(y_pred_filter)
            )
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test.astype(str),
                y_pred_filter.astype(str),
                labels=confidence_class_names,
            )
            print(
                classification_report_imbalanced(
                    y_test.astype(str),
                    y_pred_filter.astype(str),
                    labels=confidence_class_names,
                )
            )

        print_labeled_confusion_matrix(
            confusion_matrix, confidence_class_names, is_multilabel=is_multilabel
        )

    joblib.dump(self, self.__class__.__name__.lower())

    return tracking_metrics
# Alternative variant of train() without multilabel support or SHAP summary plots.
def train(self, importance_cutoff=0.15):
    classes, class_names = self.get_labels()
    class_names = sorted(list(class_names), reverse=True)

    # Get items and labels, filtering out those for which we have no labels.
    X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

    # Extract features from the items.
    X = self.extraction_pipeline.fit_transform(X_iter)

    # Calculate labels.
    y = np.array(y_iter)

    print(f"X: {X.shape}, y: {y.shape}")

    # Split dataset in training and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0
    )

    if self.sampler is not None:
        pipeline = make_pipeline(self.sampler, self.clf)
    else:
        pipeline = self.clf

    tracking_metrics = {}

    # Use k-fold cross validation to evaluate results.
    if self.cross_validation_enabled:
        scorings = ["accuracy"]
        if len(class_names) == 2:
            scorings += ["precision", "recall"]

        scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

        print("Cross Validation scores:")
        for scoring in scorings:
            score = scores[f"test_{scoring}"]
            tracking_metrics[f"test_{scoring}"] = {
                "mean": score.mean(),
                "std": score.std() * 2,
            }
            print(f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})")

    # Train on the resampled dataset if a sampler is provided.
    if self.sampler is not None:
        X_train, y_train = self.sampler.fit_resample(X_train, y_train)

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    self.clf.fit(X_train, y_train)

    feature_names = self.get_feature_names()
    if self.calculate_importance and len(feature_names):
        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X_train)

        # TODO: Actually implement feature importance visualization for multiclass problems.
        if isinstance(shap_values, list):
            shap_values = np.sum(np.abs(shap_values), axis=0)

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )

        print(f"\nTop {len(important_features)} Features:")
        for i, [importance, index, is_positive] in enumerate(important_features):
            print(
                f'{i + 1}. \'{feature_names[int(index)]}\' ({"+" if (is_positive) else "-"}{importance})'
            )

    print("Test Set scores:")
    # Evaluate results on the test set.
    y_pred = self.clf.predict(X_test)

    print(f"No confidence threshold - {len(y_test)} classified")
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_names)
    print(confusion_matrix)
    tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

    print(classification_report_imbalanced(y_test, y_pred, labels=class_names))
    report = classification_report_imbalanced_values(
        y_test, y_pred, labels=class_names
    )
    tracking_metrics["report"] = report

    # Evaluate results on the test set for some confidence thresholds.
    for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
        y_pred_probas = self.clf.predict_proba(X_test)

        y_test_filter = []
        y_pred_filter = []
        for i in range(0, len(y_test)):
            argmax = np.argmax(y_pred_probas[i])
            if y_pred_probas[i][argmax] < confidence_threshold:
                continue

            y_test_filter.append(y_test[i])
            y_pred_filter.append(argmax)

        y_pred_filter = self.le.inverse_transform(y_pred_filter)

        print(
            f"\nConfidence threshold > {confidence_threshold} - {len(y_test_filter)} classified"
        )
        print(
            metrics.confusion_matrix(y_test_filter, y_pred_filter, labels=class_names)
        )
        print(
            classification_report_imbalanced(
                y_test_filter, y_pred_filter, labels=class_names
            )
        )

    joblib.dump(self, self.__class__.__name__.lower())

    return tracking_metrics
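# For reference, the third-party imports both train() variants above rely on,
# inferred from the calls they make; these would normally sit at the top of the
# module. This is a sketch only: the project-internal helpers used above
# (split_tuple_iterator, sort_class_names, classification_report_imbalanced_values,
# print_labeled_confusion_matrix) are assumed to come from elsewhere in the same
# package, and their exact module paths are not shown here.
import joblib
import matplotlib.pyplot
import numpy as np
import shap
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline  # accepts a sampler step, unlike sklearn's
from sklearn import metrics
from sklearn.model_selection import cross_validate, train_test_split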