def test_file_full(file): split_dataset(file + "_data.csv", file + "_labels.csv", file + "_train.csv", file + "_test.csv", None) train_data = LogFile(file + "_train.csv", ",", 0, 1000000, None, "case_id", "name") train_data.remove_attributes(["label"]) model = edbn.train(train_data) test_data = LogFile(file + "_test.csv", ",", 0, 1000000, None, "case_id", "name", values=train_data.values) edbn.test(test_data, file + "_output_full.csv", model, "label", "0", train_data) plot.plot_single_roc_curve(file + "_output_full.csv", file, save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_roc.png") plot.plot_single_prec_recall_curve(file + "_output_full.csv", file, save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_precrec.png")
def test_file_bohmer(file): split_dataset(file + "_data.csv", file + "_labels.csv", file + "_train.csv", file + "_test.csv", 10000) train_data = LogFile(file + "_train.csv", ",", 0, 1000000, None, "case_id", "name", convert=False) train_data.remove_attributes(["label"]) model = bohmer.train(train_data, 3, 4, 1) test_data = LogFile(file + "_test.csv", ",", 0, 1000000, None, "case_id", "name", convert=False, values=train_data.values) bohmer.test(test_data, file + "_output_bohmer.csv", model, "label", 0) plot.plot_single_roc_curve(file + "_output_bohmer.csv", file, save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_roc_bohmer.png") plot.plot_single_prec_recall_curve(file + "_output_bohmer.csv", file, save_file="../Data/Nolle_Graphs/" + file.split("/")[-1] + "_precrec_bohmer.png")
def experiment_department(): input = LogFile("../Data/bpic2018.csv", ",", 0, None, "startTime", "case", convert=False) input.remove_attributes([ "eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected" ]) input.convert2int() data = input.filter_copy("self.data.department == 1") model = cd.create_model(data, data) print("Starting writing model to file") with open("model_department", "wb") as fout: pickle.dump(model, fout) print("Done") with open("model_department", "rb") as fin: model = pickle.load(fin) for dept in [1, 2, 3, 4]: data = input.filter_copy("self.data.department == " + str(dept)) scores = cd.get_event_detailed_scores(data, model) cd.plot_attribute_graph(scores, model.current_variables)
def learn_and_dump_model(): train = LogFile("../Data/bpic2018.csv", ",", 0, 30000, "startTime", "case", activity_attr=None, integer_input=False, convert=False) train.remove_attributes(["eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected"]) train.convert2int() model = cd.create_model(train, train) with open("model_30000b", "wb") as fout: pickle.dump(model, fout)
def analyze(): train = LogFile("../Data/bpic2018.csv", ",", 0, None, "startTime", "case", activity_attr=None, integer_input=False, convert=False) print("Num of attributes:", len(train.data.columns)) train.remove_attributes(["eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected"]) print("Num of attributes:", len(train.data.columns)) print(train.data.columns) for attr in train.data.columns: print(attr, len(train.data[attr].value_counts()))
def experiment_standard(): with open("model_30000b", "rb") as fin: model = pickle.load(fin) print("Get Scores") train = LogFile("../Data/bpic2018.csv", ",", 0, 30000, "startTime", "case", activity_attr=None, integer_input=False, convert=False) train.remove_attributes(["eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected"]) train.convert2int() data = LogFile("../Data/bpic2018.csv", ",", 0, None, "startTime", "case", convert=False, values=train.values, integer_input=False) data.remove_attributes(["eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected"]) data.convert2int() scores = cd.get_event_scores(data, model) cd.plot_single_scores(scores) cd.plot_pvalues(scores, 400)
def experiment_outliers(): with open("model_30000", "rb") as fin: model = pickle.load(fin) train = LogFile("../Data/bpic2018.csv", ",", 0, 30000, "startTime", "case", activity_attr=None, integer_input=False, convert=False) train.remove_attributes([ "eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected" ]) train.convert2int() attr_dicts = [] data = LogFile("../Data/bpic2018.csv", ",", 0, None, "startTime", "case", convert=False, values=train.values) data.filter("self.data.year == 1") data.remove_attributes([ "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected" ]) scores = cd.get_event_scores(data, model) for s in scores: if sum(scores[s]) != 0: score = math.log10(sum(scores[s]) / len(scores[s])) if score < -12: for case in attr_dicts[0]: if attr_dicts[0][case] == s: print(s, case, score)
def experiment_clusters(): with open("model_30000", "rb") as fin: model = pickle.load(fin) train = LogFile("../Data/bpic2018.csv", ",", 0, 30000, "startTime", "case", activity_attr=None, integer_input=False, convert=False) train.remove_attributes(["eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected"]) train.convert2int() data = LogFile("../Data/bpic2018.csv", ",", 0, None, "startTime", "case", convert=False, values=train.values) data.remove_attributes(["event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected"]) data.convert2int() data.filter("self.data.year == 1") scores = cd.get_event_detailed_scores(data, model) # First calculate score per trace attributes = list(scores.keys()) num_traces = len(scores[attributes[0]]) upper = {} lower = {} for a in attributes: upper[a] = [] lower[a] = [] for trace_ix in range(num_traces): score = 1 for a in scores: a_score = scores[a][trace_ix] if a_score == -5: score = 0 break score *= a_score if -8 < score < -10: for a in scores: upper[a].append(scores[a][trace_ix]) elif -10 < score < -12: for a in scores: lower[a].append(scores[a][trace_ix]) print(attributes) print(upper) cd.plot_attribute_graph(upper, attributes) cd.plot_attribute_graph(lower, attributes)
def experiment_attributes_standard(): with open("model_30000b", "rb") as fin: model = pickle.load(fin) train = LogFile("../Data/bpic2018.csv", ",", 0, 30000, "startTime", "case", activity_attr=None, integer_input=False, convert=False) train.remove_attributes([ "eventid", "identity_id", "event_identity_id", "year", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected" ]) train.convert2int() input = LogFile("../Data/bpic2018.csv", ",", 0, 10000000, "startTime", "case", convert=False, values=train.values) input.remove_attributes([ "eventid", "identity_id", "event_identity_id", "penalty_", "amount_applied", "payment_actual", "penalty_amount", "risk_factor", "cross_compliance", "selected_random", "selected_risk", "selected_manually", "rejected" ]) input.convert2int() data = input.filter_copy("self.data.year == 1") scores_year1 = cd.get_event_detailed_scores(data, model) cd.plot_attribute_graph(scores_year1, model.current_variables) data = input.filter_copy("self.data.year == 2") scores_year2 = cd.get_event_detailed_scores(data, model) cd.plot_attribute_graph(scores_year2, model.current_variables) data = input.filter_copy("self.data.year == 3") scores_year3 = cd.get_event_detailed_scores(data, model) cd.plot_attribute_graph(scores_year3, model.current_variables) p_vals_year1_2 = [] p_vals_year2_3 = [] p_vals_year1_3 = [] for key in sorted(scores_year1.keys()): p_vals_year1_2.append( stats.ks_2samp(scores_year1[key], scores_year2[key]).pvalue) p_vals_year2_3.append( stats.ks_2samp(scores_year2[key], scores_year3[key]).pvalue) p_vals_year1_3.append( stats.ks_2samp(scores_year1[key], scores_year3[key]).pvalue) def tmp(x): if x == 0: return x else: return 1 p_vals_year1_2 = [tmp(x) for x in p_vals_year1_2] plt.plot(sorted(scores_year1.keys()), p_vals_year1_2, "o") plt.xticks(rotation='vertical') plt.show() p_vals_year2_3 = [tmp(x) for x in p_vals_year2_3] plt.plot(sorted(scores_year1.keys()), p_vals_year2_3, "o") plt.xticks(rotation='vertical') plt.show() p_vals_year1_3 = [tmp(x) for x in p_vals_year1_3] plt.plot(sorted(scores_year1.keys()), p_vals_year1_3, "o") plt.xticks(rotation='vertical') plt.show() x = [] y_1 = [] y_2 = [] y_3 = [] for key in sorted(scores_year1.keys()): x.append(key) y_1.append(np.median(scores_year1[key])) y_2.append(np.median(scores_year2[key])) y_3.append(np.median(scores_year3[key])) plt.plot(x, y_1, "o") plt.plot(x, y_2, "o") plt.plot(x, y_3, "o") plt.xticks(rotation='vertical') plt.xlabel("Attributes") plt.ylabel("Median Score") plt.legend(["2015", "2016", "2017"]) plt.show() p_vals_year1_2 = [] p_vals_year2_3 = [] p_vals_year1_3 = [] for key in sorted(scores_year1.keys()): p_vals_year1_2.append( stats.ks_2samp(scores_year1[key], [np.median(scores_year1[key])]).pvalue) p_vals_year2_3.append( stats.ks_2samp(scores_year2[key], [np.median(scores_year2[key])]).pvalue) p_vals_year1_3.append( stats.ks_2samp(scores_year1[key], [np.median(scores_year3[key])]).pvalue) def tmp(x): if x == 0: return x else: return 1 p_vals_year1_2 = [tmp(x) for x in p_vals_year1_2] plt.plot(sorted(scores_year1.keys()), p_vals_year1_2, "o") plt.xticks(rotation='vertical') plt.show() p_vals_year2_3 = [tmp(x) for x in p_vals_year2_3] plt.plot(sorted(scores_year1.keys()), p_vals_year2_3, "o") plt.xticks(rotation='vertical') plt.show() p_vals_year1_3 = [tmp(x) for x in p_vals_year1_3] plt.plot(sorted(scores_year1.keys()), p_vals_year1_3, "o") plt.xticks(rotation='vertical') plt.show()