def plot_anomalies(X, y, sample_size=256, n_trees=100, desired_TPR=None,
                   percentile=None, normal_ymax=None, bins=20):
    """Fit an isolation-tree ensemble on X, print metrics, and plot scores.

    The ensemble is fit and scored on ``X`` (both steps are timed and the
    timings printed).  A score threshold is then chosen either from
    ``desired_TPR`` (via ``find_TPR_threshold``) or, when ``desired_TPR`` is
    None, as the ``percentile``-th percentile of the scores.  The confusion
    matrix, F1 score and average precision are printed, and two stacked
    histograms (normal samples on top, anomalies below) are drawn with the
    threshold marked, saved as an SVG file, and shown.

    Args:
        X: Samples to fit and score; must support ``len()``.
        y: Ground-truth labels; ``y == 0`` selects normal samples and
            ``y == 1`` anomalies (used as a boolean mask on the scores, so
            presumably a NumPy array or pandas Series -- confirm at caller).
        sample_size: Passed to ``IsolationTreeEnsemble``.
        n_trees: Passed to ``IsolationTreeEnsemble``.
        desired_TPR: Target true-positive rate used to pick the threshold.
            Takes precedence over ``percentile`` when both are given.
        percentile: Percentile of the scores used as threshold when
            ``desired_TPR`` is None.
        normal_ymax: Optional upper y-limit for the normal-sample histogram.
        bins: Histogram bin specification forwarded to ``Axes.hist``.

    Raises:
        ValueError: If both ``desired_TPR`` and ``percentile`` are None.

    Note:
        The output filename is built from a module-level ``datafile`` name
        that is not a parameter of this function; it must be defined by the
        surrounding module before calling.
    """
    # Fail fast: without either option there is no way to pick a threshold,
    # and discovering that only after an expensive fit is wasteful.
    if desired_TPR is None and percentile is None:
        raise ValueError(
            "plot_anomalies requires either desired_TPR or percentile")

    N = len(X)
    it = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

    fit_start = time.time()
    it.fit(X)
    fit_time = time.time() - fit_start
    print(f"fit time {fit_time:3.2f}s")

    score_start = time.time()
    scores = it.anomaly_score(X)
    score_time = time.time() - score_start
    print(f"score time {score_time:3.2f}s")

    if desired_TPR is not None:
        threshold, FPR = find_TPR_threshold(y, scores, desired_TPR)
        print(f"Computed {desired_TPR:.4f} TPR threshold {threshold:.4f} with FPR {FPR:.4f}")
    else:
        threshold = np.percentile(scores, percentile)

    y_pred = it.predict_from_anomaly_scores(scores, threshold=threshold)
    confusion = confusion_matrix(y, y_pred)
    print(confusion)
    TN, FP, FN, TP = confusion.flat
    # Rates are recomputed from the confusion matrix so they are available
    # on the percentile path as well.
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)

    normal = scores[y == 0]
    anomalies = scores[y == 1]
    anomaly_ratio = len(anomalies) / len(normal)

    F1 = f1_score(y, y_pred)
    PR = average_precision_score(y, scores)
    print(f"Proportion anomalies/normal = {len(anomalies)}/{len(normal)} = {anomaly_ratio*100:.1f}%")
    print(f"F1 score {F1:.4f}, avg PR {PR:.4f}")

    fig, axes = plt.subplots(2, 1, sharex=True)
    counts0, _, _ = axes[0].hist(normal, color='#c7e9b4', bins=bins)
    counts1, _, _ = axes[1].hist(anomalies, color='#fee090', bins=bins)
    top0 = max(counts0)
    top1 = max(counts1)

    axes[1].set_xlabel("Anomaly score")
    axes[0].set_ylabel("Normal sample count")
    axes[1].set_ylabel("Anomalous sample count")

    # Dashed vertical line at the chosen threshold on both histograms.
    axes[0].plot([threshold, threshold], [0, top0], '--', color='grey')
    axes[1].plot([threshold, threshold], [0, top1], '--', color='grey')

    # Right-aligned summary block near the right edge of the top plot.
    text_xr = 0.97 * axes[0].get_xlim()[1]
    axes[0].text(text_xr, .85 * top0, f"N {N}, {n_trees} trees",
                 horizontalalignment='right')
    axes[0].text(text_xr, .75 * top0, f"F1 score {F1:.4f}, avg PR {PR:.4f}",
                 horizontalalignment='right')
    axes[0].text(text_xr, .65 * top0, f"TPR {TPR:.4f}, FPR {FPR:.4f}",
                 horizontalalignment='right')

    # Annotations just to the right of the threshold line.
    axes[0].text(threshold + .005, .20 * top0,
                 f"score threshold {threshold:.3f}")
    axes[0].text(threshold + .005, .10 * top0,
                 f"True anomaly rate {anomaly_ratio:.4f}")

    if normal_ymax is not None:
        axes[0].set_ylim(0, normal_ymax)

    plt.tight_layout()

    # The filename tag used to be int(desired_TPR*100) unconditionally, which
    # raised TypeError on the percentile path; tag those runs with the
    # percentile instead.  Filenames for desired_TPR runs are unchanged.
    if desired_TPR is not None:
        tag = int(desired_TPR * 100)
    else:
        tag = f"p{percentile:g}"
    plt.savefig(f"{datafile.split('.')[0]}-{n_trees}-{tag}.svg",
                bbox_inches='tight', pad_inches=0)
    plt.show()
def score(X, y, n_trees, desired_TPR, datafile, sample_size,
          reqd_fit_time, reqd_score_time, reqd_FPR, reqd_n_nodes):
    """Fit and score an isolation-tree ensemble, then grade it against limits.

    Prints INFO lines for fit time, total node count and scoring time, then a
    FAIL line for every requirement that is missed, and finally a single
    SUCCESS or ERRORS summary line.  Nothing is returned.

    Requirements checked (each with its tolerance):
        * fit time      <= 2 x ``reqd_fit_time``
        * score time    <= 2 x ``reqd_score_time``
        * TPR           >= 0.9 x ``desired_TPR``
        * FPR           <= 1.3 x ``reqd_FPR``
        * total nodes   <= 1.15 x ``reqd_n_nodes``

    Note:
        ``improved`` is not a parameter; it is read from the enclosing
        module scope and forwarded to ``fit`` (presumably a module-level
        flag -- confirm where it is set).
    """
    forest = IsolationTreeEnsemble(sample_size=sample_size, n_trees=n_trees)

    # --- timed fit -------------------------------------------------------
    t0 = time.time()
    forest.fit(X, improved=improved)
    fit_time = time.time() - t0
    print(f"INFO {datafile} fit time {fit_time:3.2f}s")

    n_nodes = sum(tree.n_nodes for tree in forest.trees)
    print(f"INFO {datafile} {n_nodes} total nodes in {n_trees} trees")

    # --- timed scoring ---------------------------------------------------
    t0 = time.time()
    scores = forest.anomaly_score(X)
    score_time = time.time() - t0
    print(f"INFO {datafile} score time {score_time:3.2f}s")

    # Only the threshold is needed here; the rates are recomputed below from
    # the confusion matrix of the actual predictions.
    threshold, _ = find_TPR_threshold(y, scores, desired_TPR)
    y_pred = forest.predict_from_anomaly_scores(scores, threshold=threshold)
    TN, FP, FN, TP = confusion_matrix(y, y_pred).flat
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)

    failures = []

    def report(message):
        # Emit the FAIL line immediately and remember it for the final tally.
        print(message)
        failures.append(message)

    if fit_time > reqd_fit_time * 2:
        report(f"FAIL {datafile} fit time {fit_time:.1f} > {reqd_fit_time}")
    if score_time > reqd_score_time * 2:
        report(f"FAIL {datafile} score time {score_time:.1f} > {reqd_score_time}")
    # TPR may fall at most 10% short of the target (higher is always fine).
    if TPR < desired_TPR * .9:
        report(f"FAIL {datafile} TPR {TPR:.2f} < {desired_TPR} +- 10%")
    # FPR may exceed the required value by at most 30%.
    if FPR > reqd_FPR * 1.3:
        report(f"FAIL {datafile} FPR {FPR:.4f} > {reqd_FPR} +- 30%")
    # Node count may exceed the required value by at most 15%.
    if n_nodes > reqd_n_nodes * 1.15:
        report(f"FAIL {datafile} n_nodes {n_nodes} > {reqd_n_nodes} +- 15%")

    errors = len(failures)
    if not failures:
        print(f"SUCCESS {datafile} {n_trees} trees at desired TPR {desired_TPR*100.0:.1f}% getting FPR {FPR:.4f}%")
    else:
        print(f"ERRORS {datafile} {errors} errors {n_trees} trees at desired TPR {desired_TPR*100.0:.1f}% getting FPR {FPR:.4f}%")