from grm import GRM, preprocessing
from pm4py.objects.log.util import sampling

# Load the BPI 2020 permit log; the binary label is the "(case) Overspent" attribute.
data_raw = preprocessing.import_data("data", "BPI2020_PermitLog.csv",
                                     separator=";", quote='',
                                     case_id="Case ID", activity="Activity",
                                     time_stamp="Complete Timestamp",
                                     target="(case) Overspent")

# Build a fresh GRM (GGNN) model; a single epoch keeps this example fast.
hyper_params = {'num_epochs': 1}
grm_model = GRM.GRM(data_raw, params=hyper_params)

# Train the GGNN model.
grm_model.train()

# Evaluate the trained model and report the metrics.
evaluation_metrics = grm_model.testing_log()
print(evaluation_metrics)

# Visualize relevance scores as a directly-follows graph on a 100-trace sample.
sampled_log = sampling.sample(data_raw, n=100)
grm_model.visualize_dfg(save_file=False, log=sampled_log, file_name="multi")
# Hyper-parameters and fold count matching the restored model's training setup.
hyper_params = {'num_epochs': 1000}
k = 10

# Load the event log (column names are supplied by the surrounding script).
log = preprocessing.import_data("../data", log_file, separator=";", quote='"',
                                case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp,
                                target=name_of_label)

# Restore the trained GGNN model from its pickled checkpoint.
grm_model = GRM.GRM(
    log, get_activities(log),
    restore_file="../predictive_quality/logged_models/2020-07-11-08-12_best_model.pickle",
    params=hyper_params)

# Process model over the full log (no relevance filter).
grm_model.visualize_dfg(save_file=True, log=log, file_name="bpi2020_all_",
                        variant="all")

# Process model filtered to the top 5 most relevant activities.
grm_model.visualize_dfg(save_file=True, log=log, file_name="bpi2020_5_",
                        variant="all", topK=5)

# create process model with filter (top 10 most relevant)
# Hyper-parameters and fold count for this analysis run.
hyper_params = {'num_epochs': 1000}
k = 10

# Load the event log (column names are supplied by the surrounding script).
log = preprocessing.import_data("../data", logfile, separator=";", quote='',
                                case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp,
                                target=name_of_label)

# filter out most relevant activity
model_path = '../best_models/sp2020/2020-05-05-14-59_best_model.pickle'
activities = get_activities(log)
grm_model = GRM.GRM(log, activities, restore_file=model_path)

filtered_log = EventLog()
for trace in log:
    # predict() appears to yield (case id, prediction, per-activity relevance
    # scores) — TODO confirm against GRM.predict's definition.
    case_id, pred, rel_scores = grm_model.predict(trace)
    if len(rel_scores) > 1:
        # Activity with the highest relevance score in this trace.
        most_relevant = max(rel_scores.items(), key=operator.itemgetter(1))[0]
        log_trace = attributes_filter.apply_events(
            log, [case_id],
            parameters={
                attributes_filter.PARAMETER_CONSTANT_ATTRIBUTE_KEY: name_of_case_id,
                "positive": True
            })
        # NOTE(review): the source is truncated here — the argument list of this
        # call (presumably removing `most_relevant` from the trace) is not visible.
        trace_without_most = attributes_filter.apply_events(
def run_experiment(data_raw, hyper_params=None, k=10, ml_flow_uri="databricks",
                   ml_flow_exp="/Shared/grm-review",
                   ml_flow_run_name_prefix="Experiment", save_artifact=True):
    """
    Train and evaluate a GRM (GGNN) model with k-fold cross-validation and log
    parameters, per-fold metrics, aggregated metrics and artifacts to MLflow.

    :param data_raw: raw data from event log file (indexable collection of traces).
    :param hyper_params: dict of hyper-parameters forwarded to the model, or None.
    :param k: number of folds for k-fold cross-validation.
    :param ml_flow_uri: MLflow tracking server URI.
    :param ml_flow_exp: MLflow experiment name/path to log runs under.
    :param ml_flow_run_name_prefix: prefix for the generated parent run name.
    :param save_artifact: set False if the model artifact is too large to upload (>1GB).
    :return: None.
    """
    # init ml flow
    mlflow.set_tracking_uri(ml_flow_uri)
    mlflow.set_experiment(ml_flow_exp)

    # load event log
    activities = get_activities(data_raw)
    num_activities = len(activities)

    # Parent run wrapping all folds; a uuid suffix keeps run names unique.
    with mlflow.start_run(run_name=ml_flow_run_name_prefix + "_" + str(uuid.uuid1())):
        if hyper_params:
            for key, value in hyper_params.items():
                log_param(key, value)
        log_param("k", k)
        log_metric("number of activities", num_activities)

        # Per-metric lists of fold results, e.g. {"accuracy": [fold1, fold2, ...]}.
        results_measures = dict()

        # Perform k-fold cross-validation; one nested MLflow run per fold.
        kf = KFold(n_splits=k, shuffle=True)
        for i, (train_idx, test_idx) in enumerate(kf.split(data_raw), start=1):
            data_training = [data_raw[j] for j in train_idx]
            data_testing = [data_raw[j] for j in test_idx]
            with mlflow.start_run(nested=True, run_name="run_%d" % i):
                print("Starting Run " + str(i))
                # Create a fresh GGNN model for this fold.
                grm_model = GRM.GRM(data_training, activities, restore_file=None,
                                    params=hyper_params)
                # Train GGNN model
                grm_model.train()
                # Evaluate on the held-out fold and record every reported metric.
                measures = grm_model.testing_log(data_testing)
                for key, value in measures.items():
                    log_metric(key, value, i)
                    results_measures.setdefault(key, []).append(value)
                    print(key + " of run " + str(i) + ": " + str(round(value, 3)))
                if save_artifact is True:
                    log_artifact(grm_model.best_model_file)
                log_artifact('../results/cm.pdf')

        # Aggregate each metric across folds.
        for key, fold_values in results_measures.items():
            overall_measure = mean(fold_values)
            log_metric(key, overall_measure)
            print("Overall " + key + ": " + str(overall_measure))
        overall_st_dev = stdev(results_measures["accuracy"])
        log_metric("st_dev", overall_st_dev)
        print("Standard deviation: " + str(overall_st_dev))

        # NOTE: the visualizations below use the model from the final fold.
        """ Relevance visualisation for one instance """
        # Extract one random instance from the log
        single_instance_log = sampling.sample(data_raw, n=1)
        # Visualization as direct follower graph (DFG) with evaluation data
        filenames = grm_model.visualize_dfg(save_file=True,
                                            log=single_instance_log,
                                            file_name="single")
        for file in filenames:
            log_artifact(file)

        """ Relevance visualisation for 1000 instances """
        # Extract 1000 instances from the event log
        multi_instance_log = sampling.sample(data_raw, n=1000)
        # Visualization as DFG (with evaluation data)
        for file in grm_model.visualize_dfg(save_file=True,
                                            log=multi_instance_log,
                                            file_name="multi"):
            log_artifact(file)