Example #1
 def test_applyAlphaMinerToCSV(self):
     # assign a dummy instance attribute to avoid static-method warnings;
     # the unittest package requires tests to be defined as instance methods
     self.dummy_variable = "dummy_value"
     # calculate and compare Petri nets obtained on the same log to verify that instances
     # are working correctly
     log1, net1, marking1, fmarking1 = self.obtainPetriNetThroughAlphaMiner(
         os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     log2, net2, marking2, fmarking2 = self.obtainPetriNetThroughAlphaMiner(
         os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     log1 = sorting.sort_timestamp(log1)
     log1 = sampling.sample(log1)
     log1 = index_attribute.insert_trace_index_as_event_attribute(log1)
     log2 = sorting.sort_timestamp(log2)
     log2 = sampling.sample(log2)
     log2 = index_attribute.insert_trace_index_as_event_attribute(log2)
     petri_exporter.export_net(
         net1, marking1,
         os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     self.assertEqual(len(net1.places), len(net2.places))
     self.assertEqual(len(net1.transitions), len(net2.transitions))
     self.assertEqual(len(net1.arcs), len(net2.arcs))
     final_marking = petri.petrinet.Marking()
     for p in net1.places:
         if not p.out_arcs:
             final_marking[p] = 1
     aligned_traces = token_replay.apply_log(log1, net1, marking1,
                                             final_marking)
     # smoke check only: the token-based replay above must complete without raising
     self.assertEqual(aligned_traces, aligned_traces)
 def test_importExportCSVtoCSV(self):
     # assign a dummy instance attribute to avoid static-method warnings;
     # the unittest package requires tests to be defined as instance methods
     self.dummy_variable = "dummy_value"
     event_log = csv_importer.import_event_stream(
         os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     event_log = sorting.sort_timestamp(event_log)
     event_log = sampling.sample(event_log)
     event_log = index_attribute.insert_event_index_as_event_attribute(
         event_log)
     log = log_conv_fact.apply(event_log)
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     event_log_transformed = log_conv_fact.apply(
         log, variant=log_conv_fact.TO_EVENT_STREAM)
     csv_exporter.export(
         event_log_transformed,
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
     event_log_imported_after_export = csv_importer.import_event_stream(
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
     log_imported_after_export = log_conv_fact.apply(
         event_log_imported_after_export)
     self.assertEqual(len(log), len(log_imported_after_export))
     os.remove(os.path.join(OUTPUT_DATA_DIR,
                            "running-example-exported.csv"))
Example #3
 def test_importExportCSVtoCSV(self):
     # assign a dummy instance attribute to avoid static-method warnings;
     # the unittest package requires tests to be defined as instance methods
     self.dummy_variable = "dummy_value"
     df = pd.read_csv(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     df = dataframe_utils.convert_timestamp_columns_in_df(df)
     event_log = log_conversion.apply(
         df, variant=log_conversion.TO_EVENT_STREAM)
     event_log = sorting.sort_timestamp(event_log)
     event_log = sampling.sample(event_log)
     event_log = index_attribute.insert_event_index_as_event_attribute(
         event_log)
     log = log_conversion.apply(event_log)
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     event_log_transformed = log_conversion.apply(
         log, variant=log_conversion.TO_EVENT_STREAM)
     df = log_conversion.apply(event_log_transformed,
                               variant=log_conversion.TO_DATA_FRAME)
     df.to_csv(os.path.join(OUTPUT_DATA_DIR,
                            "running-example-exported.csv"))
     df = pd.read_csv(
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
     df = dataframe_utils.convert_timestamp_columns_in_df(df)
     event_log_imported_after_export = log_conversion.apply(
         df, variant=log_conversion.TO_EVENT_STREAM)
     log_imported_after_export = log_conversion.apply(
         event_log_imported_after_export)
     self.assertEqual(len(log), len(log_imported_after_export))
     os.remove(os.path.join(OUTPUT_DATA_DIR,
                            "running-example-exported.csv"))
 def test_importExportCSVtoXES(self):
     # assign a dummy instance attribute to avoid static-method warnings;
     # the unittest package requires tests to be defined as instance methods
     self.dummy_variable = "dummy_value"
     event_log = csv_importer.import_event_stream(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
     event_log = sorting.sort_timestamp(event_log)
     event_log = sampling.sample(event_log)
     event_log = index_attribute.insert_event_index_as_event_attribute(event_log)
     log = log_transform.transform_event_stream_to_event_log(event_log)
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     xes_exporter.export_log(log, os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
     log_imported_after_export = xes_importer.import_log(
         os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
     self.assertEqual(len(log), len(log_imported_after_export))
     os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
def select_attributes_from_log_for_tree(log, max_cases_for_attr_selection=DEFAULT_MAX_CASES_FOR_ATTR_SELECTION,
                                        max_diff_occ=DEFAULT_MAX_CASES_FOR_ATTR_SELECTION / 4):
    """
    Select attributes from log for tree

    Parameters
    ------------
    log
        Log
    max_cases_for_attr_selection
        Maximum number of cases to consider for attribute selection
    max_diff_occ
        Maximum number of different occurrences

    Returns
    ------------
    string_trace_attributes_to_consider
        String trace attributes to consider for the tree
    string_event_attributes_to_consider
        String event attributes to consider for the tree
    numeric_trace_attributes_to_consider
        Numeric trace attributes to consider for the tree
    numeric_event_attributes_to_consider
        Numeric event attributes to consider for the tree
    """
    if len(log) > max_cases_for_attr_selection:
        filtered_log = sampling.sample(log, max_cases_for_attr_selection)
    else:
        filtered_log = log
    event_attributes = get_all_event_attributes_from_log(filtered_log)
    trace_attributes = get_all_trace_attributes_from_log(filtered_log)
    event_attributes_values = {}
    trace_attributes_values = {}
    for attr in event_attributes:
        event_attributes_values[attr] = set(get_attribute_values(log, attr).keys())
    for attr in trace_attributes:
        trace_attributes_values[attr] = set(get_trace_attribute_values(log, attr).keys())

    numeric_event_attributes_to_consider = list()
    string_event_attributes_to_consider = list()
    numeric_trace_attributes_to_consider = list()
    string_trace_attributes_to_consider = list()

    for attr in event_attributes_values:
        first_value = list(event_attributes_values[attr])[0]
        if type(first_value) is int or type(first_value) is float:
            numeric_event_attributes_to_consider.append(attr)
        elif type(first_value) is str and len(event_attributes_values[attr]) < max_diff_occ:
            string_event_attributes_to_consider.append(attr)

    for attr in trace_attributes_values:
        first_value = list(trace_attributes_values[attr])[0]
        if type(first_value) is int or type(first_value) is float:
            numeric_trace_attributes_to_consider.append(attr)
        elif type(first_value) is str and len(trace_attributes_values[attr]) < max_diff_occ:
            string_trace_attributes_to_consider.append(attr)

    numeric_event_attributes_to_consider = check_event_attributes_presence(log,
                                                                           numeric_event_attributes_to_consider)
    string_event_attributes_to_consider = check_event_attributes_presence(log,
                                                                          string_event_attributes_to_consider)
    numeric_trace_attributes_to_consider = check_trace_attributes_presence(log,
                                                                           numeric_trace_attributes_to_consider)
    string_trace_attributes_to_consider = check_trace_attributes_presence(log,
                                                                          string_trace_attributes_to_consider)

    return string_trace_attributes_to_consider, string_event_attributes_to_consider, numeric_trace_attributes_to_consider, numeric_event_attributes_to_consider
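
A minimal usage sketch: the function returns the four lists in the order string-trace, string-event, numeric-trace, numeric-event attributes (log is assumed to be an EventLog loaded elsewhere, and the module-level helpers and constants referenced above are assumed to be available):

# unpack in the same order as the return statement above
str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = select_attributes_from_log_for_tree(log)
print("string trace attributes:", str_tr_attr)
print("numeric event attributes:", num_ev_attr)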
Example #6
 def test_alphaMinerVisualizationFromXES(self):
     # assign a dummy instance attribute to avoid static-method warnings;
     # the unittest package requires tests to be defined as instance methods
     self.dummy_variable = "dummy_value"
     log, net, marking, fmarking = self.obtainPetriNetThroughAlphaMiner(
         os.path.join(INPUT_DATA_DIR, "running-example.xes"))
     log = sorting.sort_timestamp(log)
     log = sampling.sample(log)
     log = index_attribute.insert_trace_index_as_event_attribute(log)
     petri_exporter.apply(net, marking, os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
     gviz = pn_viz.graphviz_visualization(net)
     # smoke check only: the visualization above must be produced without raising
     self.assertEqual(gviz, gviz)
     final_marking = petri.petrinet.Marking()
     for p in net.places:
         if not p.out_arcs:
             final_marking[p] = 1
     aligned_traces = token_replay.apply(log, net, marking, fmarking)
     # smoke check only: the replay above must complete without raising
     self.assertEqual(aligned_traces, aligned_traces)
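
To actually render the Petri net rather than only build the gviz object, the old visualization factory can be used; a minimal sketch assuming the pm4py 1.x factory API (not shown in the example above):

from pm4py.visualization.petrinet import factory as pn_vis_factory

gviz = pn_vis_factory.apply(net, marking, fmarking)
pn_vis_factory.view(gviz)  # opens the rendered Petri net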
Example #7
def sample_cases(log: Union[EventLog, pd.DataFrame],
                 num_cases: int) -> Union[EventLog, pd.DataFrame]:
    """
    Randomly sample a given number of cases from the event log.

    Parameters
    ---------------
    log
        Event log / Pandas dataframe
    num_cases
        Number of cases to sample

    Returns
    ---------------
    sampled_log
        Sampled event log (containing the specified number of cases)
    """
    if isinstance(log, EventLog):
        from pm4py.objects.log.util import sampling
        return sampling.sample(log, num_cases)
    elif isinstance(log, pd.DataFrame):
        from pm4py.objects.log.util import dataframe_utils
        return dataframe_utils.sample_dataframe(
            log, parameters={"max_no_cases": num_cases})
Example #8
from grm import GRM, preprocessing
from pm4py.objects.log.util import sampling

# import data
data_raw = preprocessing.import_data("data",
                                     "BPI2020_PermitLog.csv",
                                     separator=";",
                                     quote='',
                                     case_id="Case ID",
                                     activity="Activity",
                                     time_stamp="Complete Timestamp",
                                     target="(case) Overspent")

# Create new GRM model object
hyper_params = {'num_epochs': 1}
grm_model = GRM.GRM(data_raw, params=hyper_params)

# Train GGNN model
grm_model.train()

# Evaluation of the GGNN model
evaluation_metrics = grm_model.testing_log()
print(evaluation_metrics)

# Visualization as DFG (with sample of evaluation data)
multi_instance_log = sampling.sample(data_raw, n=100)
grm_model.visualize_dfg(save_file=False,
                        log=multi_instance_log,
                        file_name="multi")
Example #9
from grm import preprocessing, GRM
from grm.util import get_activities
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.util import constants
from pm4py.objects.log.util import sampling

model_path = '../best_models/sp2020/2020-05-06-05-40_best_model.pickle'
logfile = "sp2020.csv"
name_of_case_id = "CASE_ID"
name_of_activity = "ACTIVITY"
name_of_timestamp = "TIMESTAMP"
name_of_label = "REPAIR_IN_TIME_5D"

log = preprocessing.import_data("data", logfile, separator=";", quote='"', case_id=name_of_case_id,
                                activity=name_of_activity,
                                time_stamp=name_of_timestamp, target=name_of_label)

activities = get_activities(log)
grm_model = GRM.GRM(log, activities, restore_file=model_path)

log = attributes_filter.apply(log, [0],
                              parameters={constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: "label", "positive": True})
log = sampling.sample(log, n=5000)
grm_model.visualize_dfg(save_file=True, log=log, file_name="sp2020_", variant="all")
Example #10
import os
from pm4py.objects.log.adapters.pandas import csv_import_adapter
from pm4py.objects.conversion.log import factory as conversion_factory
dataframe = csv_import_adapter.import_dataframe_from_path(
    os.path.join("pmdata", "running-example.csv"), sep=',')
dataframe
dataframe.head()
dataframe.describe()
log = conversion_factory.apply(dataframe)

from pm4py.objects.log.exporter.csv import factory as csv_exporter
# the CSV exporter expects an event stream, so convert the dataframe first
event_stream = conversion_factory.apply(dataframe,
                                        variant=conversion_factory.TO_EVENT_STREAM)
csv_exporter.export(event_stream, "data/outputFile1.csv")

# sorting
log
from pm4py.objects.log.util import sorting
log = sorting.sort_timestamp(log)
log
sorted_log = sorting.sort_lambda(log,
                                 lambda x: x.attributes["concept:name"],
                                 reverse=False)
sorted_log

# sampling
from pm4py.objects.log.util import sampling
sampled_log = sampling.sample(log, n=50)
sampled_log

# links
# http://www.processmining.org/event_logs_and_models_used_in_book
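
The adapters and factories in this example come from the pm4py 1.x API. In pm4py 2.x the same import-and-convert step is usually written with pandas plus the log converter, mirroring Example #3; a hedged sketch:

import os
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter

df = pd.read_csv(os.path.join("pmdata", "running-example.csv"))
df = dataframe_utils.convert_timestamp_columns_in_df(df)
log = log_converter.apply(df)  # DataFrame -> EventLog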
Example #11
File: evaluation.py Project: fau-is/grm
def run_experiment(data_raw,
                   hyper_params=None,
                   k=10,
                   ml_flow_uri="databricks",
                   ml_flow_exp="/Shared/grm-review",
                   ml_flow_run_name_prefix="Experiment",
                   save_artifact=True):
    """
    Performs experiment.
    :param data_raw: raw data from event log file.
    :param hyper_params: set of hyper-parameters.
    :param k: index of k-fold cross-validation.
    :param ml_flow_uri: ??
    :param ml_flow_exp: ??
    :param ml_flow_run_name_prefix: ??
    :param save_artifact: set False if >1GB
    :return: none.
    """

    # init ml flow
    mlflow.set_tracking_uri(ml_flow_uri)
    mlflow.set_experiment(ml_flow_exp)

    # load event log
    activities = get_activities(data_raw)
    num_activities = len(activities)
    with mlflow.start_run(run_name=ml_flow_run_name_prefix + "_" +
                          str(uuid.uuid1())) as run:
        if hyper_params:
            for key, value in hyper_params.items():
                log_param(key, value)

        log_param("k", k)
        log_metric("number of activities", num_activities)
        results_measures = dict()
        i = 0

        # Perform k-fold cross-validation
        kf = KFold(n_splits=k, shuffle=True)
        for train_idx, test_idx in kf.split(data_raw):
            i += 1
            data_training = [data_raw[j] for j in train_idx]
            data_testing = [data_raw[j] for j in test_idx]

            with mlflow.start_run(nested=True,
                                  run_name="run_%d" % i) as run_cv:
                print("Starting Run " + str(i))

                # Create new GGNN model object
                grm_model = GRM.GRM(data_training,
                                    activities,
                                    restore_file=None,
                                    params=hyper_params)

                # Train GGNN model
                grm_model.train()

                # Perform evaluation
                measures = grm_model.testing_log(data_testing)
                for key in measures.keys():
                    log_metric(key, measures[key], i)
                    if key not in results_measures:
                        results_measures[key] = []
                    results_measures[key].append(measures[key])
                    print(key + " of run " + str(i) + ": " +
                          str(round(measures[key], 3)))

                if save_artifact:
                    log_artifact(grm_model.best_model_file)
                log_artifact('../results/cm.pdf')

        for key in results_measures.keys():
            overall_measure = mean(results_measures[key])
            log_metric(key, overall_measure)
            print("Overall " + key + ": " + str(overall_measure))

        overall_st_dev = stdev(results_measures["accuracy"])
        log_metric("st_dev", overall_st_dev)
        print("Standard deviation: " + str(overall_st_dev))
        """ Relevance visualisation for one instance """
        # Extract one random instance from the log
        single_instance_log = sampling.sample(data_raw, n=1)

        # Visualization as direct follower graph (DFG) with evaluation data
        filenames = grm_model.visualize_dfg(save_file=True,
                                            log=single_instance_log,
                                            file_name="single")
        for file in filenames:
            log_artifact(file)
        """ Relevance visualisation for 1000 instances """
        # Extract 1000 instances from the event log
        multi_instance_log = sampling.sample(data_raw, n=1000)

        # Visualization as DFG (with evaluation data)
        for file in grm_model.visualize_dfg(save_file=True,
                                            log=multi_instance_log,
                                            file_name="multi"):
            log_artifact(file)
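
A hedged invocation sketch using only the parameters shown in the signature above; data_raw is assumed to be the preprocessed log from Examples #8 and #9, and the MLflow defaults from the signature are left untouched:

# run a 5-fold experiment without uploading large model artifacts
run_experiment(data_raw,
               hyper_params={"num_epochs": 1},
               k=5,
               save_artifact=False)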