def test_kdd_cup():
    def detectors():
        return [DAGMM(num_epochs=10, sequence_length=1)]

    evaluator = Evaluator(
        [KDDCup(21), KDDCup(22), KDDCup(23), KDDCup(24), KDDCup(25)],
        detectors)
    df_evaluation = pd.DataFrame(columns=[
        'dataset', 'algorithm', 'accuracy', 'precision', 'recall',
        'F1-score', 'F0.1-score'
    ])

    evaluator.evaluate()
    df = evaluator.benchmarks()
    df_evaluation = df_evaluation.append(df)
    print(df_evaluation.to_string())

    assert (df_evaluation == 0).sum().sum() == 0  # No zeroes in the DataFrame
    assert df_evaluation['F1-score'].std() > 0  # Not always the same value
    # Values reported in the paper -1% each
    assert df_evaluation['precision'].mean() >= 0.91
    assert df_evaluation['recall'].mean() >= 0.93
    assert df_evaluation['F1-score'].mean() >= 0.92
from src.helper.data_structures import izip
from src.evaluation.print_methods import print_evaluator


if __name__ == "__main__":
    benchmark_name = parameters["benchmark"]
    benchmark_subset = SUBSETS[parameters["set"]]

    benchmark = Benchmark(benchmark_name, subset=benchmark_subset)
    sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)

    if parameters["file"] == "corrupt.txt":
        predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
    else:
        predicted_sequences = benchmark.get_predicted_sequences(parameters["file"])

    evaluator = Evaluator()

    for s_i, (correct, corrupt), predicted in izip(sequence_pairs, predicted_sequences):
        if s_i == parameters["sequences"]:
            break
        evaluator.evaluate(benchmark, s_i, correct, corrupt, predicted,
                           evaluate_ed=False)
        evaluator.print_sequence()

    print_evaluator(evaluator)
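# `izip` is imported from src.helper.data_structures but not defined in this excerpt.
# Judging by how its result is unpacked here (and again in the Wikipedia evaluation
# further below), it appears to behave like an index-yielding zip. A minimal sketch
# under that assumption, not the repository's actual implementation:

def izip(*iterables):
    """Yield (index, item_1, ..., item_n), i.e. enumerate() applied to zip()."""
    for index, items in enumerate(zip(*iterables)):
        yield (index, *items)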
def run_experiment_evaluation(detectors, seeds, runs, output_dir, anomaly_type, steps=5,
                              outlier_type='extreme_1', store_results=True):
    # get synthetic anomaly dataset from agots
    datasets = list(get_datasets_for_multiple_runs(anomaly_type, seeds, steps, outlier_type))
    results = pd.DataFrame()
    evaluator = None

    for index, seed in enumerate(seeds):
        evaluator = Evaluator(datasets[index], detectors, output_dir, seed=seed)
        evaluator.evaluate()
        result = evaluator.benchmarks()
        evaluator.plot_roc_curves(store=store_results)
        evaluator.plot_threshold_comparison(store=store_results)
        evaluator.plot_scores(store=store_results)
        evaluator.set_benchmark_results(result)
        evaluator.export_results(f'experiment-run-{index}-{seed}')
        results = results.append(result, ignore_index=True)

    if not store_results:
        return

    # Set averaged results from the multiple pipeline runs for evaluation
    avg_results = results.groupby(['dataset', 'algorithm'], as_index=False).mean()
    evaluator.set_benchmark_results(avg_results)
    evaluator.export_results(f'experiment-{anomaly_type}')

    # Plots which need the whole data (not averaged)
    evaluator.create_boxplots(runs=runs, data=results, detectorwise=True, store=store_results)
    evaluator.create_boxplots(runs=runs, data=results, detectorwise=False, store=store_results)
    evaluator.gen_merged_tables(results, f'for_{anomaly_type}', store=store_results)

    # Plots using 'self.benchmark_results' -> using the averaged results
    evaluator.create_bar_charts(runs=runs, detectorwise=True, store=store_results)
    evaluator.create_bar_charts(runs=runs, detectorwise=False, store=store_results)
    evaluator.plot_auroc(title=f'Area under the curve for differing {anomaly_type} anomalies',
                         store=store_results)

    # Plots using 'self.results' (need the score) -> only from the last run
    evaluator.plot_threshold_comparison(store=store_results)
    evaluator.plot_scores(store=store_results)
    evaluator.plot_roc_curves(store=store_results)

    return evaluator
def run_different_window_sizes_evaluator(detectors, seeds, runs):
    results = pd.DataFrame()
    for seed in seeds:
        datasets = [
            SyntheticDataGenerator.long_term_dependencies_width(seed),
            SyntheticDataGenerator.long_term_dependencies_height(seed),
            SyntheticDataGenerator.long_term_dependencies_missing(seed)
        ]
        evaluator = Evaluator(datasets, detectors, seed=seed)
        evaluator.evaluate()
        evaluator.plot_scores()
        result = evaluator.benchmarks()
        results = results.append(result, ignore_index=True)

    evaluator.set_benchmark_results(results)
    evaluator.export_results('run_different_windows')
    evaluator.create_boxplots(runs=runs, data=results, detectorwise=False)
    evaluator.create_boxplots(runs=runs, data=results, detectorwise=True)
    return evaluator
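# A hedged driver sketch for the window-size experiment above. The detector configuration
# is borrowed from the KDD Cup test further up; the seed values and the run count are
# arbitrary placeholders, not values taken from the original experiments.

if __name__ == '__main__':
    seeds = [42, 4711, 1337]
    detectors = [DAGMM(num_epochs=10, sequence_length=1)]
    run_different_window_sizes_evaluator(detectors, seeds, runs=len(seeds))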
if len(sys.argv) == 1 or "-h" in sys.argv or "-help" in sys.argv or "help" in sys.argv:
    print_help()
    exit(0)

benchmark, subset, file_name = get_arguments()
benchmark = Benchmark(benchmark, subset)

correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
if file_name == "corrupt.txt":
    predicted_sequences = corrupt_sequences
else:
    predicted_sequences = benchmark.get_predicted_sequences(file_name)
original_sequences = correct_sequences

evaluator = Evaluator()

for seq_id, (original, correct, corrupt, predicted) in \
        enumerate(zip(original_sequences, correct_sequences, corrupt_sequences, predicted_sequences)):
    if benchmark.name == "acl" and original.startswith("#"):
        print(original)
        continue
    correct_processed, corrupt_processed, predicted_processed = \
        tolerant_preprocess_sequences(original, correct, corrupt, predicted)
    evaluator.evaluate(None, None,
                       original_sequence=correct_processed,
                       corrupt_sequence=corrupt_processed,
                       predicted_sequence=predicted_processed,
                       evaluate_ed=False)
benchmark = Benchmark(benchmark_name, benchmark_subset)
sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)

if file_name == "corrupt.txt":
    predicted_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    mean_runtime = 0
else:
    try:
        predicted_sequences = benchmark.get_predicted_sequences(file_name)[:len(sequence_pairs)]
        mean_runtime = benchmark.get_mean_runtime(file_name)
    except FileNotFoundError:
        predicted_sequences = []
        mean_runtime = 0

if len(predicted_sequences) == len(sequence_pairs):
    evaluator = Evaluator()
    for i, (correct, corrupt) in enumerate(sequence_pairs):
        predicted = predicted_sequences[i]
        evaluator.evaluate(file_name=None,
                           line=None,
                           original_sequence=correct,
                           corrupt_sequence=corrupt,
                           predicted_sequence=predicted,
                           evaluate_ed=False)
    f1 = evaluator.f1()
    acc = evaluator.sequence_accuracy()
    print("f1 = %2.2f" % (f1 * 100))
    print("acc = %2.2f" % (acc * 100))
    print("t = %.2f" % mean_runtime)
else:
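# The internals of evaluator.f1() and evaluator.sequence_accuracy() are not shown in this
# excerpt. As a rough sketch of their assumed semantics (hypothetical helper names, not the
# repository's actual implementation), the two metrics could be computed from per-sequence
# counts like this:

def token_f1(true_positives: int, false_positives: int, false_negatives: int) -> float:
    """F1 = 2*TP / (2*TP + FP + FN); returns 0.0 when there is nothing to score."""
    denominator = 2 * true_positives + false_positives + false_negatives
    return 2 * true_positives / denominator if denominator > 0 else 0.0


def sequence_accuracy(predicted_sequences, correct_sequences) -> float:
    """Fraction of sequences that were predicted exactly right."""
    exact_matches = sum(p == c for p, c in zip(predicted_sequences, correct_sequences))
    return exact_matches / len(correct_sequences) if correct_sequences else 0.0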
for benchmark in benchmarks:
    original_sequences = {
        Subset.TUNING: read_sequences(paths.WIKI_TUNING_SENTENCES),
        Subset.DEVELOPMENT: Wikipedia.development_sequences(),
        Subset.TEST: Wikipedia.test_sequences()
    }[subset]

    sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)

    if predictions_file_name == "corrupt.txt":
        predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
    else:
        predicted_sequences = benchmark.get_predicted_sequences(predictions_file_name)

    evaluator = Evaluator()

    for s_i, original, (correct, corrupt), predicted in izip(original_sequences,
                                                             sequence_pairs,
                                                             predicted_sequences):
        if s_i == n_sequences:
            break
        correct_processed, corrupt_processed, predicted_processed = \
            tolerant_preprocess_sequences(original, correct, corrupt, predicted)
        evaluator.evaluate(predictions_file_name, s_i,
                           original_sequence=correct_processed,
                           corrupt_sequence=corrupt_processed,
                           predicted_sequence=predicted_processed,
                           evaluate_ed=False)
def main(params):
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    model = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    trainer = Trainer(model, data, params)
    evaluator = Evaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps
            trainer.mlm_step(params.lambda_mlm)
            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
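# `set_sampling_probs` is not shown in this excerpt. In multilingual language-model training
# it typically turns per-language corpus sizes into smoothed sampling probabilities
# p_i proportional to (n_i / sum_j n_j)^alpha, so low-resource languages are sampled more often
# than their raw share. A hedged sketch of that rescaling (the helper name and the default
# alpha value are assumptions, not taken from the code above):

def smoothed_sampling_probs(sentence_counts, alpha=0.5):
    """Return multinomial sampling probabilities from per-language sentence counts."""
    total = sum(sentence_counts)
    weights = [(count / total) ** alpha for count in sentence_counts]
    norm = sum(weights)
    return [weight / norm for weight in weights]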