def _test1():
    data = "../Data/BPIC15_1_sorted_new.csv"
    case_attr = "case"
    act_attr = "event"

    logfile = LogFile(data, ",", 0, None, None, case_attr,
                      activity_attr=act_attr, convert=False, k=5)
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    # logfile.filter_case_length(5)
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test")

    model = edbn_train(train_log)
    acc = predict_next_event(model, test_log)
    acc_update = predict_next_event_update(model, test_log)
    print("ACC:", acc, acc_update)
def duration_test():
    path = "../Data/Experiments_Duration/"
    train_rates = [0, 5, 10, 25]
    test_rates = [1, 5, 10, 25, 50, 100, 250, 500]

    anoms_rates = []
    for train_rate in train_rates:
        for test_rate in test_rates:
            anoms_rates.append((train_rate, test_rate))

    for i in range(len(anoms_rates)):
        print(anoms_rates[i])
        scores = []
        for run in range(RUNS):
            print("Run %i" % run)
            train_file = path + "%i_train_%i.csv" % (i, anoms_rates[i][0])
            test_file = path + "%i_test_%i.csv" % (i, anoms_rates[i][1])
            duration_generator.generate(10000, 10000, anoms_rates[i][0], anoms_rates[i][1],
                                        train_file, test_file)

            train_data = LogFile(train_file, ",", 0, 1000000, "date", "trace")
            train_data.remove_attributes(["Anomaly"])
            test_data = LogFile(test_file, ",", 0, 1000000, "date", "trace",
                                values=train_data.values)

            train_data.keep_attributes(["event", "date", "trace", "process", "resource", "random"])

            # Derive durations from the k-context and discretize train and test with the same bins.
            train_data.create_k_context()
            train_data.add_duration_to_k_context()
            bins = train_data.discretize("duration_0")

            test_data.create_k_context()
            test_data.add_duration_to_k_context()
            test_data.discretize("duration_0", bins)

            model = edbn.train(train_data)
            edbn.test(test_data, path + "Output_%i_%i.csv" % anoms_rates[i], model, "anomaly", "0")

            output_file = path + "Output_%i_%i.csv" % anoms_rates[i]
            output_roc = path + "roc_%i_%i.png" % anoms_rates[i]
            output_prec = path + "prec_recall_%i_%i.png" % anoms_rates[i]

            score = plt.get_roc_auc(output_file)
            scores.append(score)
            print("Score = %f" % score)

        with open(path + "results.txt", "a") as fout:
            fout.write("Testing:\ntrain rate: %i\ntest rate: %i\n" % (anoms_rates[i][0], anoms_rates[i][1]))
            fout.write("Result: " + str(scores) + "\n")
            fout.write("Mean: %f Median: %f\n" % (np.mean(scores), np.median(scores)))
            fout.write("Variance: %f\n\n" % np.var(scores))
def run_sdl():
    from Methods.SDL.sdl import train, test

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    log = LogFile(labeled_logfile, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=10)

    columns = ["label", "Case_ID", "Activity", "monitoringResource", "question",
               "org_resource", "Responsible_actor", "SUMleges"]
    log.keep_attributes(columns)
    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")
    train_log.ignoreHistoryAttributes.add("label")
    test_log.ignoreHistoryAttributes.add("label")

    model = train(train_log, 200, 42)
    print(test(test_log, model))

    # Unfinished per-case evaluation stub (cf. run_edbn); the loop body was never filled in.
    results1 = []
    results2 = []
    for case in test_log.get_cases():
        pass
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage):
    logfile = LogFile(data, ",", 0, None, None, "case", activity_attr="event",
                      convert=False, k=prefix_size)
    if add_end_event:
        logfile.add_end_events()
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(train_percentage, case=split_cases,
                                                 method=split_method)

    with open("Baseline/results.txt", "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------")

        baseline_acc = test(test_log, train(train_log, epochs=100, early_stop=10))
        fout.write("\nBaseline: " + str(baseline_acc))

        fout.write("\n")
        fout.write("====================================\n\n")
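# Hedged usage sketch (not part of the original file): the dataset path and the split
# settings below are assumptions that mirror the preprocessing used in _test1().
def _example_baseline_run():
    run_experiment("../Data/BPIC15_1_sorted_new.csv", prefix_size=5, add_end_event=False,
                   split_method="train-test", split_cases=True, train_percentage=70)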
if __name__ == "__main__": data = "../Data/BPIC15_1_sorted_new.csv" case_attr = "case" act_attr = "event" logfile = LogFile(data, ",", 0, None, "completeTime", case_attr, activity_attr=act_attr, convert=False, k=5) logfile.keep_attributes(["case", "event", "role"]) logfile.convert2int() logfile.create_k_context() weeks = logfile.split_days("%Y-%m-%d %H:%M:%S") weeks_sorted = sorted(weeks.keys()) num_weeks = len(weeks_sorted) for i in range(num_weeks): weeks[weeks_sorted[i]]["model"] = edbn_train( weeks[weeks_sorted[i]]["data"]) # # accs1 = [] # for i in range(1, num_weeks): # accs1.append(predict_next_event_multi([weeks[w]["model"] for w in weeks_sorted[:i]], weeks[weeks_sorted[i]]["data"])) #
def run_edbn():
    from eDBN_Prediction import get_probabilities
    from Methods.EDBN.Train import train

    labeled_logfile = "../Data/Outcome_Prediction/BPIC15_1_f2.csv"
    log = LogFile(labeled_logfile, ";", 0, None, "time_timestamp", "Case_ID",
                  activity_attr="label", convert=True, k=1)

    columns = ["label", "Case_ID", "time_timestamp", "Activity", "monitoringResource",
               "question", "org_resource", "Responsible_actor", "SUMleges"]
    log.keep_attributes(columns)
    log.create_k_context()

    train_log, test_log = log.splitTrainTest(80, True, "train-test")
    train_log.ignoreHistoryAttributes.add("label")

    model = train(train_log)

    results1 = []
    results2 = []
    for case in test_log.get_cases():
        case_df = case[1]
        case_probs = {1: 1, 2: 1}
        ground = 0
        for row in case_df.iterrows():
            ground = getattr(row[1], "label")

            # Collect the values of the parents of the "label" variable for this event.
            parents = model.variables["label"].conditional_table.parents
            value = []
            for parent in parents:
                value.append(getattr(row[1], parent.attr_name))
            tuple_val = tuple(value)

            # Accumulate the predicted probabilities of both outcome classes over the case.
            activity_var = model.variables["label"]
            probs, unknown = get_probabilities(activity_var, tuple_val, parents)
            case_probs[1] += probs.get(1, 0)
            case_probs[2] += probs.get(2, 0)

        # correct_prob = sum(case_probs) / len(case_probs)
        if ground == 1:
            if case_probs[1] > case_probs[2]:
                results1.append(1)
            else:
                results1.append(0)
        if ground == 2:
            if case_probs[2] > case_probs[1]:
                results2.append(1)
            else:
                results2.append(0)

    print(len(results1), sum(results1) / len(results1))
    print(len(results2), sum(results2) / len(results2))
if __name__ == "__main__": data = "../../Data/Helpdesk.csv" # data = "../../Data/BPIC15_1_sorted_new.csv" case_attr = "case" act_attr = "event" logfile = LogFile(data, ",", 0, None, "completeTime", case_attr, activity_attr=act_attr, convert=False, k=5) logfile.keep_attributes(["case", "event", "role", "completeTime"]) logfile.convert2int() logfile.create_k_context() train_log, test_log = logfile.splitTrainTest(70, case=True, method="train-test") create_data(train_log, test_log, "helpdesk/") # model = train(train_log, 5, 20) # model = keras.models.load_model("premiere_model") # print("Accuracy:", test(test_log, model))
def run_experiment(data, prefix_size, add_end_event, split_method, split_cases, train_percentage,
                   filename="results.txt"):
    data = DATA_FOLDER + data
    logfile = LogFile(data, ",", 0, None, "completeTime", "case", activity_attr="event",
                      convert=False, k=prefix_size)

    # If no prefix size is given, use the longest trace length, capped at 40.
    if prefix_size is None:
        prefix_size = max(logfile.data.groupby(logfile.trace).size())
        if prefix_size > 40:
            prefix_size = 40
        logfile.k = prefix_size

    if add_end_event:
        logfile.add_end_events()

    # logfile.keep_attributes(["case", "event", "role", "completeTime"])
    logfile.keep_attributes(["case", "event", "role"])
    logfile.convert2int()
    logfile.create_k_context()

    train_log, test_log = logfile.splitTrainTest(train_percentage, case=split_cases,
                                                 method=split_method)

    with open(filename, "a") as fout:
        fout.write("Data: " + data)
        fout.write("\nPrefix Size: " + str(prefix_size))
        fout.write("\nEnd event: " + str(add_end_event))
        fout.write("\nSplit method: " + split_method)
        fout.write("\nSplit cases: " + str(split_cases))
        fout.write("\nTrain percentage: " + str(train_percentage))
        fout.write("\nDate: " + time.strftime("%d.%m.%y-%H.%M", time.localtime()))
        fout.write("\n------------------------------------\n")

    # Run every prediction method in its own process on the same train/test split.
    processes = []
    processes.append(Process(target=execute_tax, args=(train_log, test_log, filename), name="Tax"))
    processes.append(Process(target=execute_taymouri, args=(train_log, test_log, filename), name="Taymouri"))
    processes.append(Process(target=execute_camargo, args=(train_log, test_log, filename), name="Camargo"))
    processes.append(Process(target=execute_lin, args=(train_log, test_log, filename), name="Lin"))
    processes.append(Process(target=execute_dimauro, args=(train_log, test_log, filename), name="Di Mauro"))
    processes.append(Process(target=execute_pasquadibisceglie, args=(train_log, test_log, filename),
                             name="Pasquadibisceglie"))
    processes.append(Process(target=execute_edbn, args=(train_log, test_log, filename), name="EDBN"))
    processes.append(Process(target=execute_baseline, args=(train_log, test_log, filename), name="Baseline"))
    # processes.append(Process(target=execute_new_method, args=(train_log, test_log, filename), name="New Method"))

    print("Starting Processes")
    for p in processes:
        p.start()
        print(p.name, "started")
    print("All processes running")

    for p in processes:
        p.join()
        print(p.name, "stopped")

    with open(filename, "a") as fout:
        fout.write("====================================\n\n")
    print("All processes stopped")
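# Hedged usage sketch (not part of the original file): it assumes "Helpdesk.csv" resolves
# under DATA_FOLDER; prefix_size=None exercises the capped maximum-trace-length logic above.
def _example_full_comparison():
    run_experiment("Helpdesk.csv", prefix_size=None, add_end_event=True,
                   split_method="train-test", split_cases=True, train_percentage=70,
                   filename="results.txt")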
def get_data(dataset, dataset_size, k, add_end, reduce_tasks, resource_pools, remove_resource):
    # Encode the preprocessing options in the cache filename.
    filename_parts = [dataset, str(dataset_size), str(k)]
    for v in [add_end, reduce_tasks, resource_pools, remove_resource]:
        if v:
            filename_parts.append(str(1))
        else:
            filename_parts.append(str(0))
    print(filename_parts)
    cache_file = LOGFILE_PATH + "/" + "_".join(filename_parts)

    colTitles = []
    if os.path.exists(cache_file):
        print("Loading file from cache")
        with open(cache_file, "rb") as pickle_file:
            preprocessed_log = pickle.load(pickle_file)
    else:
        resource_attr = None
        if dataset == BPIC15_1 or dataset == BPIC15:
            logfile = LogFile("../Data/BPIC15_1_sorted_new.csv", ",", 0, dataset_size,
                              "Complete Timestamp", "Case ID", activity_attr="Activity",
                              convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_2:
            logfile = LogFile("../Data/BPIC15_2_sorted_new.csv", ",", 0, dataset_size,
                              "Complete Timestamp", "Case ID", activity_attr="Activity",
                              convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_3:
            logfile = LogFile("../Data/BPIC15_3_sorted_new.csv", ",", 0, dataset_size,
                              "Complete Timestamp", "Case ID", activity_attr="Activity",
                              convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_4:
            logfile = LogFile("../Data/BPIC15_4_sorted_new.csv", ",", 0, dataset_size,
                              "Complete Timestamp", "Case ID", activity_attr="Activity",
                              convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC15_5:
            logfile = LogFile("../Data/BPIC15_5_sorted_new.csv", ",", 0, dataset_size,
                              "Complete Timestamp", "Case ID", activity_attr="Activity",
                              convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["Case ID", "Activity", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC12:
            logfile = LogFile("../Data/BPIC12.csv", ",", 0, dataset_size, "completeTime", "case",
                              activity_attr="event", convert=False, k=k)
            resource_attr = "org:resource"
            colTitles = ["case", "event", "org:resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == BPIC12W:
            logfile = LogFile("../Data/BPIC12W.csv", ",", 0, dataset_size, "completeTime", "case",
                              activity_attr="event", convert=False, k=k)
            resource_attr = "org:resource"
            colTitles = ["case", "event", "org:resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(5)
        elif dataset == HELPDESK:
            logfile = LogFile("../Data/Helpdesk.csv", ",", 0, dataset_size, "completeTime", "case",
                              activity_attr="event", convert=False, k=k)
            resource_attr = "Resource"
            colTitles = ["case", "event", "Resource"]
            logfile.keep_attributes(colTitles)
            logfile.filter_case_length(3)
        elif dataset == BPIC18:
            logfile = LogFile("../Data/bpic2018.csv", ",", 0, dataset_size, "startTime", "case",
                              activity_attr="event", convert=False, k=k)
            colTitles = ["case", "event", "subprocess"]
            logfile.keep_attributes(colTitles)
        else:
            print("Unknown Dataset")
            return None

        preprocessed_log = preprocess(logfile, add_end, reduce_tasks, resource_pools,
                                      resource_attr, remove_resource)
        preprocessed_log.create_k_context()

        with open(cache_file, "wb") as pickle_file:
            pickle.dump(preprocessed_log, pickle_file)

    return preprocessed_log, "_".join(filename_parts)
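# Hedged usage sketch (not part of the original file): dataset_size=None is assumed to mean
# "no row limit" in LogFile, and the preprocessing flags are illustrative only.
def _example_get_data():
    preprocessed_log, cache_name = get_data(HELPDESK, None, 5, add_end=True,
                                            reduce_tasks=False, resource_pools=False,
                                            remove_resource=True)
    print("Cached as:", cache_name)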