def stephenRun():
    # Use the BPIC15_x_sorted.csv to generate new training and test datafiles with anomalies introduced
    # After running this once you can comment this line out
    # preProcessData("../Data/")

    # Indicate which are the training and test files
    train_file = "../Data/BPIC15_train_1.csv"
    test_file = "../Data/BPIC15_test_1.csv"

    # Load logfile to use as training data
    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly"])

    # Train the model
    model = edbn.train(train_data)

    # Test the model and save the scores in ../Data/output.csv
    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None, trace_attr="Case",
                        values=train_data.values)
    edbn.test(test_data, "../Data/output.csv", model, label="Anomaly", normal_val="0")

    # Plot the ROC curve based on the results
    plot.plot_single_roc_curve("../Data/output.csv")

def compare_bpic_total(path):
    train = path + "BPIC15_train_total.csv"
    test = path + "BPIC15_test_total.csv"
    output = path + "Output/BPIC_15_output_total.csv"
    output_edbn = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall = path + "Output/prec_recall_total.png"
    roc = path + "Output/roc_total.png"

    if not os.path.exists(path + "Output"):
        os.mkdir(path + "Output")

    train_data = LogFile(train, ",", 0, 500000, "Time", "Case", activity_attr="Activity", convert=False)
    train_data.remove_attributes(["Anomaly", "Type", "Time"])
    test_data = LogFile(test, ",", 0, 500000, "Time", "Case", activity_attr="Activity",
                        values=train_data.values, convert=False)

    bohmer_model = bmr.train(train_data)
    bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val=0)

    train_data.convert2int()
    test_data.convert2int()

    edbn_model = edbn_train(train_data)
    edbn_test(test_data, output_edbn, edbn_model, label="Anomaly", normal_val="0")

    plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "EDBN"], save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "EDBN"], roc)

def test_file_full(file):
    split_dataset(file + "_data.csv", file + "_labels.csv", file + "_train.csv", file + "_test.csv", None)

    train_data = LogFile(file + "_train.csv", ",", 0, 1000000, None, "case_id", "name")
    train_data.remove_attributes(["label"])

    model = edbn.train(train_data)

    test_data = LogFile(file + "_test.csv", ",", 0, 1000000, None, "case_id", "name", values=train_data.values)
    edbn.test(test_data, file + "_output_full.csv", model, "label", "0", train_data)

    plot.plot_single_roc_curve(file + "_output_full.csv", file,
                               save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_roc.png")
    plot.plot_single_prec_recall_curve(file + "_output_full.csv", file,
                                       save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_precrec.png")

def test_lin(dataset_folder, model_folder):
    from RelatedMethods.Lin.model import predict_next

    logfile = LogFile(dataset_folder + "full_log.csv", ",", 0, None, None, "case",
                      activity_attr="event", convert=True, k=0)
    test_log = LogFile(dataset_folder + "test_log.csv", ",", 0, None, None, "case",
                       activity_attr="event", convert=True, k=0, values=logfile.values)

    # Use the most recent Keras checkpoint in the model folder
    model_file = sorted([model_file for model_file in os.listdir(model_folder)
                         if model_file.endswith(".h5")])[-1]

    acc = predict_next(os.path.join(model_folder, model_file), test_log.data, test_log.trace, test_log.activity)

    with open(os.path.join(model_folder, "results_next_event.log"), "a") as fout:
        fout.write("Accuracy: (%s) %s\n" % (time.strftime("%d-%m-%y %H:%M:%S", time.localtime()), acc))

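# Hedged usage sketch (added for illustration, not part of the original code): test_lin
# expects a dataset folder containing full_log.csv and test_log.csv, plus a model folder
# holding the Keras .h5 checkpoints produced by the corresponding Lin training step.
# The folder paths below are assumptions chosen only to show the call shape.
#
#   test_lin("../Data/PredictionData/Helpdesk/", "../Models/Lin/Helpdesk/")
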
def test_file_bohmer(file):
    split_dataset(file + "_data.csv", file + "_labels.csv", file + "_train.csv", file + "_test.csv", 10000)

    train_data = LogFile(file + "_train.csv", ",", 0, 1000000, None, "case_id", "name", convert=False)
    train_data.remove_attributes(["label"])

    model = bohmer.train(train_data, 3, 4, 1)

    test_data = LogFile(file + "_test.csv", ",", 0, 1000000, None, "case_id", "name", convert=False,
                        values=train_data.values)
    bohmer.test(test_data, file + "_output_bohmer.csv", model, "label", 0)

    plot.plot_single_roc_curve(file + "_output_bohmer.csv", file,
                               save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_roc_bohmer.png")
    plot.plot_single_prec_recall_curve(file + "_output_bohmer.csv", file,
                                       save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_precrec_bohmer.png")

def train_vars_and_test(model, alias, filename, event_emit_obj):
    file = UPLOAD_FOLDER + "/" + alias + "/" + filename
    folder = UPLOAD_FOLDER + "/" + alias + "/"
    train_file = get_constructed_file(file)
    test_file = get_constructed_file(file, type="test")

    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])
    event_emit_obj('score_resp', {'step': 2, "msg": "Data loaded."})

    train_data.create_k_context()
    event_emit_obj('score_resp', {'step': 3, "msg": "Built k-context for data."})

    model_trained_on_data = edbn.train_seperate(train_data, model)
    event_emit_obj('score_resp', {'step': 4, "msg": "Finished training data."})

    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None, trace_attr="Case",
                        values=train_data.values)
    edbn.test(test_data, folder + "output.csv", model_trained_on_data, label="Anomaly", normal_val="0")
    event_emit_obj('score_resp', {'step': 5, "msg": "Finished testing."})

    # # Plot the ROC curve based on the results
    # plot.plot_single_roc_curve(experiment_folder + "output.csv")

    event_emit_obj('score_resp', {'step': 6, "msg": "Preparing to score."})
    scores = get_event_scores(test_data.data, model_trained_on_data)

    # Print one randomly chosen case as a sanity check
    r = list(scores.keys())
    one = np.random.randint(0, len(r))
    random_key = r[one]
    print(random_key)
    print(test_data.convert_int2string('Case', int(random_key)))

    # results = plottable(scores)
    event_emit_obj('score_resp', {'step': 7, "msg": "Finished scoring!"})
    print("Finished scoring...")

    # plot_single_scores(scores)
    # r, ps = plot_pvalues(scores, 20)
    return scores

def compare_bpics(path):
    for i in range(1, 6):
        # Input Files
        train = path + "BPIC15_train_%i.csv" % (i)
        test = path + "BPIC15_test_%i.csv" % (i)
        output = path + "Output/BPIC15_output_%i.csv" % (i)
        output_edbn = path + "Output/BPIC15_edbn_output_%i.csv" % (i)
        prec_recall = path + "Output/prec_recall_%i.png" % (i)
        roc = path + "Output/roc_%i.png" % (i)

        train_data = LogFile(train, ",", 0, 500000, "Time", "Case", activity_attr="Activity", convert=False)
        train_data.remove_attributes(["Anomaly", "Type", "Time"])
        test_data = LogFile(test, ",", 0, 500000, "Time", "Case", activity_attr="Activity",
                            values=train_data.values, convert=False)

        bohmer_model = bmr.train(train_data)
        bmr.test(test_data, output, bohmer_model, label="Anomaly", normal_val="0")

        train_data.convert2int()
        test_data.convert2int()

        edbn_model = edbn.train(train_data)
        edbn.test(test_data, output_edbn, edbn_model, label="Anomaly", normal_val="0")

        plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "EDBN"], save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "EDBN"], roc)

def _test1():
    data = "../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, None, case_attr, activity_attr=act_attr, convert=False, k=5)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    # logfile.filter_case_length(5)
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test")

    model = edbn_train(train_log)
    acc = predict_next_event(model, test_log)
    acc_update = predict_next_event_update(model, test_log)
    print("ACC:", acc, acc_update)

def categorical_test():
    path = "../Data/Experiments/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]

    anoms_rates = []
    for train_rate in train_rates:
        for test_rate in test_rates:
            anoms_rates.append((train_rate, test_rate))

    for i in range(len(anoms_rates)):
        print(anoms_rates[i])
        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            generator.create_shipment_data(10000, 10000, anoms_rates[i][0], anoms_rates[i][1], train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, None, "Case")
            train_data.remove_attributes(["Anomaly"])
            test_data = LogFile(test_file, ",", 0, 1000000, None, "Case", values=train_data.values)

            model = edbn.train(train_data)
            edbn.test(test_data, path + "Output_%i_%i.csv" % anoms_rates[i], model, "Anomaly", "0")

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
            output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]

            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (anoms_rates[i][0], anoms_rates[i][1]))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))

def experiment_standard():
    data = LogFile("../Data/bpic2018.csv", ",", 0, 3000, "startTime", "case")
    # data_str = pd.read_csv("../Data/bpic2018_ints.csv", delimiter=",", header=0, dtype=int, nrows=3000)

    # data.remove_attributes(["eventid", "identity_id", "event_identity_id", "year", "penalty_",
    #                         "amount_applied", "payment_actual", "penalty_amount", "risk_factor",
    #                         "cross_compliance", "selected_random", "selected_risk", "selected_manually",
    #                         "rejected"])

    # model = create_model(data, data)
    # with open("model_30000b", "wb") as fout:
    #     pickle.dump(model, fout)

    with open("model_30000b", "rb") as fin:
        model = pickle.load(fin)

    data = pd.read_csv("../Data/bpic2018_ints.csv", delimiter=",", header=0, dtype=int)
    data = filter_attributes(data, ["eventid", "identity_id", "event_identity_id", "year", "penalty_",
                                    "amount_applied", "payment_actual", "penalty_amount", "risk_factor",
                                    "cross_compliance", "selected_random", "selected_risk",
                                    "selected_manually", "rejected"])

    print("Get Scores")
    scores = get_event_scores(data, model)
    plot_single_scores(scores)
    plot_pvalues(scores, 800)

    y = []
    for key in sorted(scores.keys()):
        if sum(scores[key]) != 0:
            y.append(math.log10(sum(scores[key]) / len(scores[key])))

    kernel = stats.gaussian_kde(y)
    plt.plot(np.linspace(0, max(y), 1000), kernel(np.linspace(0, max(y), 1000)))
    plt.show()

def run(default_dataset="edbn/Data/BPIC15_1_sorted.csv", default_alias="run/"):
    # Use the BPIC15_x_sorted.csv to generate new training and test datafiles with anomalies introduced
    # After running this once you can comment this line out
    # which_dataset = "edbn/Data/BPIC15_1_sorted.csv"
    # preprocess_folder = "run/"
    which_dataset = default_dataset
    preprocess_folder = default_alias

    train_file, test_file, experiment_folder = preProcessFile(which_dataset, preprocess_folder)

    # Indicate which are the training and test files
    # train_file = "../Data/{}BPIC15_train_1.csv".format(preprocess_folder)
    # test_file = "../Data/{}BPIC15_test_1.csv".format(preprocess_folder)

    # Load logfile to use as training data
    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])

    # Train the model
    model = edbn.train(train_data)

    # Test the model and save the scores in ../Data/output.csv
    test_data = LogFile(test_file, ",", header=0, rows=500000, time_attr=None, trace_attr="Case",
                        values=train_data.values)
    edbn.test(test_data, experiment_folder + "output.csv", model, label="Anomaly", normal_val="0")

    # # Plot the ROC curve based on the results
    # plot.plot_single_roc_curve(experiment_folder + "output.csv")

    scores = get_event_scores(test_data.data, model)
    print("Finished scoring...")
    # plot_single_scores(scores)
    r, ps = plot_pvalues(scores, 20)
    return scores, (r, ps), model

def train_camargo(data_folder, model_folder, architecture):
    import RelatedMethods.Camargo.embedding_training as em
    import RelatedMethods.Camargo.model_training as mo

    logfile = LogFile(data_folder + "full_log.csv", ",", 0, None, None, "case",
                      activity_attr="event", convert=False, k=0)
    train_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                        activity_attr="event", convert=False, k=0)
    test_log = LogFile(data_folder + "test_log.csv", ",", 0, None, None, "case",
                       activity_attr="event", convert=False, k=0)

    args = {}
    args["file_name"] = "data"
    args["model_type"] = architecture  # Choose from 'joint', 'shared', 'concatenated', 'specialized', 'shared_cat'
    args["norm_method"] = "lognorm"  # Choose from 'lognorm' or 'max'
    args["n_size"] = 5  # n-gram size
    args['lstm_act'] = None  # LSTM activation function, see keras doc
    args['l_size'] = 100  # LSTM layer sizes
    args['imp'] = 1  # keras lstm implementation 1 cpu, 2 gpu
    args['dense_act'] = None  # dense-layer activation function, see keras doc
    args['optim'] = 'Nadam'  # optimization function, see keras doc

    em.training_model(logfile, model_folder)
    mo.training_model(logfile, train_log, test_log, model_folder, args)

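# Hedged usage sketch (added for illustration, not part of the original code): train_camargo
# reads full_log.csv, train_log.csv and test_log.csv from the data folder and writes the
# embedding and LSTM models into the model folder. The paths below are assumptions; the
# architecture argument must be one of the options listed in the comment above, e.g. 'shared_cat'.
#
#   train_camargo("../Data/PredictionData/Helpdesk/", "../Models/Camargo/Helpdesk/", "shared_cat")
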
def compare_bpic_total(path):
    train = path + "BPIC15_train_total.csv"
    test = path + "BPIC15_test_total.csv"
    output = path + "Output/BPIC_15_output_total.csv"
    output_edbn = path + "Output/BPIC15_edbn_output_total.csv"
    prec_recall = path + "Output/prec_recall_total.png"
    roc = path + "Output/roc_total.png"

    # bohmer_model = bmr.train(train, header=0, length=5000000)
    # bmr.test(train, test, output, bohmer_model, ",", 5000000, skip=0)

    train_data = LogFile(train, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly"])
    test_data = LogFile(test, ",", 0, 500000, None, "Case", train_data.string_2_int, train_data.int_2_string)

    edbn_model = edbn.train(train_data)
    edbn.test(test_data, output_edbn, edbn_model, "Anomaly", "0")

    plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], save_file=prec_recall)
    plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], roc)

def only_train(default_dataset="edbn/Data/BPIC15_1_sorted.csv", default_alias="run/"):
    which_dataset = default_dataset
    preprocess_folder = default_alias

    train_file, test_file, experiment_folder = preProcessFile(which_dataset, preprocess_folder)

    # Indicate which are the training and test files
    # train_file = "../Data/{}BPIC15_train_1.csv".format(preprocess_folder)
    # test_file = "../Data/{}BPIC15_test_1.csv".format(preprocess_folder)

    # Load logfile to use as training data
    train_data = LogFile(train_file, ",", 0, 500000, None, "Case")
    train_data.remove_attributes(["Anomaly", "time"])

    # Train the model
    model = edbn.train(train_data)
    return model

def test_edbn(dataset_folder, model_folder, k):
    from eDBN_Prediction import predict_next_event

    model_file = os.path.join(model_folder, "model")
    with open(model_file, "rb") as pickle_file:
        model = pickle.load(pickle_file)
    model.print_parents()

    if k is None:
        with open(os.path.join(model_folder, "k")) as finn:
            k = int(finn.readline())
    print("K=", k)

    train_log = LogFile(dataset_folder + "train_log.csv", ",", 0, None, None, "case",
                        activity_attr="event", convert=True, k=k)
    test_log = LogFile(dataset_folder + "test_log.csv", ",", 0, None, None, "case",
                       activity_attr="event", convert=True, k=k, values=train_log.values)
    test_log.create_k_context()

    acc = predict_next_event(model, test_log)

    with open(os.path.join(model_folder, "results_next_event.log"), "a") as fout:
        fout.write("Accuracy: (%s) %s\n" % (time.strftime("%d-%m-%y %H:%M:%S", time.localtime()), acc))

def compare_bpics(path):
    for i in range(1, 6):
        # Input Files
        train = path + "BPIC15_train_%i.csv" % (i)
        test = path + "BPIC15_test_%i.csv" % (i)
        output = path + "Output/BPIC15_output_%i.csv" % (i)
        output_edbn = path + "Output/BPIC15_edbn_output_%i.csv" % (i)
        prec_recall = path + "Output/prec_recall_%i.png" % (i)
        roc = path + "Output/roc_%i.png" % (i)

        # bohmer_model = bmr.train(train + "_ints", header=0, length=500000)
        # bmr.test(train + "_ints", test + "_ints", output, bohmer_model, ",", 500000, skip=0)

        train_data = LogFile(train, ",", 0, 500000, None, "Case")
        train_data.remove_attributes(["Anomaly"])
        test_data = LogFile(test, ",", 0, 500000, None, "Case", train_data.string_2_int, train_data.int_2_string)

        edbn_model = edbn.train(train_data)
        edbn.test(test_data, output_edbn, edbn_model, "Anomaly", "0")

        plt.plot_compare_prec_recall_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], save_file=prec_recall)
        plt.plot_compare_roc_curve([output, output_edbn], ["Likelihood Graph", "eDBN"], roc)

def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage):
    logfile = LogFile(data, ",", 0, None, None, "case", activity_attr="event", convert=False, k=prefix_size)

    if add_end_event:
        logfile.add_end_events()

    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(train_percentage, case=split_cases, method=split_method)

    with open("Baseline/results.txt", "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------")

        baseline_acc = test(test_log, train(train_log, epochs=100, early_stop=10))
        fout.write("\nBaseline: " + str(baseline_acc))
        fout.write("\n")
        fout.write("====================================\n\n")

def run_sdl():
    from Methods.SDL.sdl import train, test

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    log = LogFile(labeled_logfile, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=10)

    columns = ["label", "Case_ID", "Activity", "monitoringResource", "question",
               "org_resource", "Responsible_actor", "SUMleges"]
    log.keep_attributes(columns)
    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")
    train_log.ignoreHistoryAttributes.add("label")
    test_log.ignoreHistoryAttributes.add("label")

    model = train(train_log, 200, 42)
    print(test(test_log, model))

    results1 = []
    results2 = []
    for case in test_log.get_cases():
        pass

model = edbn_train(train_log)
acc = predict_next_event(model, test_log)
acc_update = predict_next_event_update(model, test_log)
print("ACC:", acc, acc_update)


if __name__ == "__main__":
    data = "../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, "completeTime", case_attr, activity_attr=act_attr, convert=False, k=5)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    weeks = logfile.split_days("%Y-%m-%d %H:%M:%S")
    weeks_sorted = sorted(weeks.keys())
    num_weeks = len(weeks_sorted)

    # Train one eDBN model per time window
    for i in range(num_weeks):
        weeks[weeks_sorted[i]]["model"] = edbn_train(weeks[weeks_sorted[i]]["data"])

for train_rate in train_rates:
    for test_rate in test_rates:
        anoms_rates.append((train_rate, test_rate))

for i in range(len(anoms_rates)):
    print(anoms_rates[i])
    scores = []
    for run in range(1):
        print("Run %i" % run)
        train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
        test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
        generator.create_shipment_data(10000, 10000, anoms_rates[i][0], anoms_rates[i][1], train_file, test_file)

        train_data = LogFile(train_file, ",", 0, 1000000, None, "Case")
        train_data.remove_attributes(["Anomaly"])
        test_data = LogFile(test_file, ",", 0, 1000000, None, "Case",
                            string_2_int=train_data.string_2_int, int_2_string=train_data.int_2_string)

        model = edbn.train(train_data)
        edbn.test(test_data, path + "Output_%i_%i.csv" % anoms_rates[i], model, "Anomaly", "0")

        output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
        output_roc = path + "roc_%i_%i.png" % anoms_rates[i]

              validation_split=0.2, batch_size=train_log.k)
    return results


if __name__ == "__main__":
    # data = "../../Data/Helpdesk.csv"
    data = "../../Data/BPIC12W.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, time_attr="completeTime", trace_attr=case_attr,
                      activity_attr=act_attr, convert=False, k=5)
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(80, case=True, method="random")

    model = train(train_log, epochs=100, early_stop=10)
    acc = test(test_log, model)
    print(acc)

def train(log):
    return edbn_train(log)


def test(log, model):
    return predict_next_event(model, log)


if __name__ == "__main__":
    # data = "../Data/Helpdesk.csv"
    # data = "../../Data/Taymouri_bpi_12_w.csv"
    data = "../Data/BPIC12W.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, None, case_attr, activity_attr=act_attr, convert=False, k=4)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    # logfile.filter_case_length(5)
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True, method="test-train")

    model = train(train_log)
    acc = test(test_log, model)
    print(acc)

    import base_adapter
    model2 = base_adapter.train(train_log, 100, 10)
    acc2 = base_adapter.test(test_log, model2)
    print(acc2)

def train_edbn(data_folder, model_folder, k=None, next_event=True):
    from EDBN.Execute import train
    from Predictions.eDBN_Prediction import learn_duplicated_events, predict_next_event, predict_suffix

    if k is None:
        # Select the best k value on an internal 80/20 validation split
        best_model = {}
        for k in range(1, 6):
            train_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                                activity_attr="event", convert=False, k=k)
            train_train_log, train_test_log = train_log.splitTrainTest(80)

            train_train_log.add_end_events()
            train_train_log.convert2int()
            train_train_log.create_k_context()

            train_test_log.values = train_train_log.values
            train_test_log.add_end_events()
            train_test_log.convert2int()
            train_test_log.create_k_context()

            model = train(train_train_log)
            # Train average number of duplicated events
            model.duplicate_events = learn_duplicated_events(train_train_log)

            if next_event:
                acc = predict_next_event(model, train_test_log)
            else:
                acc = predict_suffix(model, train_test_log)
            print("Testing k=", k, " | Validation acc:", acc)

            if "Acc" not in best_model or best_model["Acc"] < acc:
                best_model["Acc"] = acc
                best_model["Model"] = model
                best_model["k"] = k

        print("Best k value:", best_model["k"], " | Validation acc of", best_model["Acc"])
        k = best_model["k"]

    # Retrain on the full training log with the selected k
    train_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                        activity_attr="event", convert=False, k=k)
    train_log.add_end_events()
    train_log.convert2int()
    train_log.create_k_context()

    model = train(train_log)
    # Train average number of duplicated events
    model.duplicate_events = learn_duplicated_events(train_log)

    with open(os.path.join(model_folder, "model"), "wb") as pickle_file:
        pickle.dump(model, pickle_file)
    with open(os.path.join(model_folder, "k"), "w") as outfile:
        outfile.write(str(k))

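# Hedged usage sketch (added for illustration, not part of the original code): with k=None,
# train_edbn picks k on its internal validation split, then pickles the model and the chosen
# k into the model folder, which test_edbn reads back. The folder paths are assumptions
# chosen only to show the call shape.
#
#   train_edbn("../Data/PredictionData/Helpdesk/", "../Models/EDBN/Helpdesk/", k=None, next_event=True)
#   test_edbn("../Data/PredictionData/Helpdesk/", "../Models/EDBN/Helpdesk/", k=None)
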
def get_data(dataset, dataset_size, k, add_end, reduce_tasks, resource_pools, remove_resource):
    filename_parts = [dataset, str(dataset_size), str(k)]
    for v in [add_end, reduce_tasks, resource_pools, remove_resource]:
        if v:
            filename_parts.append(str(1))
        else:
            filename_parts.append(str(0))
    print(filename_parts)
    cache_file = LOGFILE_PATH + "/" + "_".join(filename_parts)

    colTitles = []
    if os.path.exists(cache_file):
        print("Loading file from cache")
        with open(cache_file, "rb") as pickle_file:
            preprocessed_log = pickle.load(pickle_file)
    else:
        resource_attr = None
        if dataset == BPIC15_1 or dataset == BPIC15:
            logfile = LogFile("../Data/BPIC15_1_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp",
                              "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_2:
            logfile = LogFile("../Data/BPIC15_2_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp",
                              "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_3:
            logfile = LogFile("../Data/BPIC15_3_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp",
                              "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_4:
            logfile = LogFile("../Data/BPIC15_4_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp",
                              "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_5:
            logfile = LogFile("../Data/BPIC15_5_sorted_new.csv", ",", 0, dataset_size, "Complete Timestamp",
                              "Case ID", activity_attr="Activity", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC12:
            logfile = LogFile("../Data/BPIC12.csv", ",", 0, dataset_size, "completeTime", "case",
                              activity_attr="event", convert=False, k=k)
            resource_attr = "org:resource"
            colTitles = ["case", "event", "org:resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC12W:
            logfile = LogFile("../Data/BPIC12W.csv", ",", 0, dataset_size, "completeTime", "case",
                              activity_attr="event", convert=False, k=k)
            resource_attr = "org:resource"
            colTitles = ["case", "event", "org:resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == HELPDESK:
            logfile = LogFile("../Data/Helpdesk.csv", ",", 0, dataset_size, "completeTime", "case",
                              activity_attr="event", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["case", "event", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(3)
        elif dataset == BPIC18:
            logfile = LogFile("../Data/bpic2018.csv", ",", 0, dataset_size, "startTime", "case",
                              activity_attr="event", convert=False, k=k)
            colTitles = ["case", "event", "subprocess"]
            logfile.keep_attributes(colTitles)
        else:
            print("Unknown Dataset")
            return None

        preprocessed_log = preprocess(logfile, add_end, reduce_tasks, resource_pools, resource_attr, remove_resource)
        preprocessed_log.create_k_context()
        with open(cache_file, "wb") as pickle_file:
            pickle.dump(preprocessed_log, pickle_file)

    return preprocessed_log, "_".join(filename_parts)

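# Hedged usage sketch (added for illustration, not part of the original code): get_data
# caches the preprocessed log under LOGFILE_PATH, keyed by its parameters. BPIC15_1 is one
# of the dataset constants defined elsewhere in this module; the argument values below are
# assumptions meant only to show the call shape.
#
#   log, cache_name = get_data(BPIC15_1, None, 5, True, False, False, False)
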
                         hidden_size=2 * len(selected_columns), num_layers=2, num_directions=1)
    optimizerD = torch.optim.Adam(rnnD.parameters(), lr=0.0002, betas=(0.5, 0.999))

    # Training and testing
    ep.train(rnnD=rnnD, rnnG=rnnG, optimizerD=optimizerD, optimizerG=optimizerG, obj=train_data, epoch=epoch)

    return rnnG


def test(model, test_log, batch_size=5):
    test_data = adapted_Input()
    test_data.run(test_log, batch_size, False)

    rnng_validation = torch.load(test_data.path + "/rnnG(validation).m")

    print("EVAL model")
    acc = ep.model_eval_test(modelG=model, mode='test', obj=test_data)
    print("EVAL model from validation")
    ep.model_eval_test(modelG=rnng_validation, mode='test', obj=test_data)
    return acc


if __name__ == "__main__":
    from data import Data
    import setting
    from metric import ACCURACY

    d = Data("Helpdesk", LogFile("../../Data/Helpdesk.csv", ",", 0, None, "completeTime", "case",
                                 activity_attr="event", convert=False))
    d.logfile.keep_attributes(["event", "role", "completeTime"])
    d.prepare(setting.STANDARD)

    r = test(train(d.train), d.test_orig)
    print("Accuracy:", ACCURACY.calculate(r))

def duration_test_discretize():
    path = "../Data/Experiments_Discretize/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]

    anoms_rates = []
    for train_rate in train_rates:
        for test_rate in test_rates:
            anoms_rates.append((train_rate, test_rate))

    for i in range(len(anoms_rates)):
        print(anoms_rates[i])
        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            duration_generator.generate(10000, 10000, anoms_rates[i][0], anoms_rates[i][1], train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, "date", "trace", convert=False)
            train_data.remove_attributes(["Anomaly"])
            train_data.keep_attributes(["event", "date", "trace", "process", "resource", "random"])
            train_data.convert2int()
            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            bins = train_data.discretize("duration_0", bins=10)

            test_data = LogFile(test_file, ",", 0, 1000000, "date", "trace", values=train_data.values, convert=False)
            test_data.keep_attributes(["event", "date", "trace", "process", "resource", "random", "anomaly"])
            test_data.convert2int()
            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)
            edbn.test(test_data, path + "Output_%i_%i.csv" % anoms_rates[i], model, "anomaly", "0")

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
            output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]

            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (anoms_rates[i][0], anoms_rates[i][1]))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))

def train_lin(data_folder, model_folder):
    from RelatedMethods.Lin.model import train

    logfile = LogFile(data_folder + "full_log.csv", ",", 0, None, None, "case",
                      activity_attr="event", convert=False, k=0)
    logfile.add_end_events()
    logfile.convert2int()

    train_log = LogFile(data_folder + "train_log.csv", ",", 0, None, None, "case",
                        activity_attr="event", convert=False, k=0, values=logfile.values)
    train_log.add_end_events()
    train_log.convert2int()

    train(logfile, train_log, model_folder)

def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage,
                   filename="results.txt"):
    data = DATA_FOLDER + data
    logfile = LogFile(data, ",", 0, None, "completeTime", "case", activity_attr="event",
                      convert=False, k=prefix_size)

    if prefix_size is None:
        prefix_size = max(logfile.data.groupby(logfile.trace).size())
        if prefix_size > 40:
            prefix_size = 40
        logfile.k = prefix_size

    if add_end_event:
        logfile.add_end_events()

    # logfile.keep_attributes(["case", "event", "role", "completeTime"])
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(train_percentage, case=split_cases, method=split_method)

    with open(filename, "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------\n")

    processes = []
    processes.append(Process(target=execute_tax, args=(train_log, test_log, filename), name="Tax"))
    processes.append(Process(target=execute_taymouri, args=(train_log, test_log, filename), name="Taymouri"))
    processes.append(Process(target=execute_camargo, args=(train_log, test_log, filename), name="Camargo"))
    processes.append(Process(target=execute_lin, args=(train_log, test_log, filename), name="Lin"))
    processes.append(Process(target=execute_dimauro, args=(train_log, test_log, filename), name="Di Mauro"))
    processes.append(Process(target=execute_pasquadibisceglie, args=(train_log, test_log, filename),
                             name="Pasquadibisceglie"))
    processes.append(Process(target=execute_edbn, args=(train_log, test_log, filename), name="EDBN"))
    processes.append(Process(target=execute_baseline, args=(train_log, test_log, filename), name="Baseline"))
    # processes.append(Process(target=execute_new_method, args=(train_log, test_log, filename), name="New Method"))

    print("Starting Processes")
    for p in processes:
        p.start()
        print(p.name, "started")
    print("All processes running")

    for p in processes:
        p.join()
        print(p.name, "stopped")

    with open(filename, "a") as fout:
        fout.write("====================================\n\n")

    print("All processes stopped")

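# Hedged usage sketch (added for illustration, not part of the original code): run_experiment
# builds the k-context once, splits the log, and then runs every related method in its own
# multiprocessing.Process, appending results to the given file. The argument values below
# are assumptions meant only to show the call shape.
#
#   run_experiment("BPIC12W.csv", 5, True, "train-test", True, 70, filename="results_bpic12w.txt")
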
from RelatedMethods.Lin.model import create_model, predict_next
from Utils.LogFile import LogFile


def train(log, epochs=200, early_stop=42):
    return create_model(log, "tmp", epochs, early_stop)


def test(log, model):
    return predict_next(log, model)


if __name__ == "__main__":
    data = "../../Data/BPIC15_5_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, None, case_attr, activity_attr=act_attr, convert=False, k=1)
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test")

    # model = train(train_log, epochs=100, early_stop=5)
    # Note: load_model (Keras) and Modulator (the custom layer used by the Lin model) are
    # assumed to be imported elsewhere in this module.
    model = load_model("../../Predictions/tmp/model_001-4.51.h5", custom_objects={'Modulator': Modulator})

    acc = test(test_log, model)
    print(acc)

def run_edbn():
    from eDBN_Prediction import get_probabilities
    from Methods.EDBN.Train import train

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    log = LogFile(labeled_logfile, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=1)

    columns = ["label", "Case_ID", "time_timestamp", "Activity", "monitoringResource", "question",
               "org_resource", "Responsible_actor", "SUMleges"]
    log.keep_attributes(columns)
    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")
    train_log.ignoreHistoryAttributes.add("label")

    model = train(train_log)

    results1 = []
    results2 = []
    for case in test_log.get_cases():
        case_df = case[1]
        case_probs = {1: 1, 2: 1}
        ground = 0
        for row in case_df.iterrows():
            ground = getattr(row[1], "label")

            parents = model.variables["label"].conditional_table.parents
            value = []
            for parent in parents:
                value.append(getattr(row[1], parent.attr_name))
            tuple_val = tuple(value)

            activity_var = model.variables["label"]
            probs, unknown = get_probabilities(activity_var, tuple_val, parents)
            case_probs[1] += probs.get(1, 0)
            case_probs[2] += probs.get(2, 0)

        # correct_prob = sum(case_probs) / len(case_probs)
        if ground == 1:
            if case_probs[1] > case_probs[2]:
                results1.append(1)
            else:
                results1.append(0)
        if ground == 2:
            if case_probs[2] > case_probs[1]:
                results2.append(1)
            else:
                results2.append(0)

    print(len(results1), sum(results1) / len(results1))
    print(len(results2), sum(results2) / len(results2))
