Example #1
def apply(df, obj_type, act1, act2, minp, maxp, parameters=None):
    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except:
        pass

    if parameters is None:
        parameters = {}

    cols = [x for x in df.columns if x.startswith("event_")] + [obj_type]
    red_df = df[cols].dropna(subset=[obj_type])
    red_df = red_df.sort_values([obj_type, "event_timestamp"])
    red_df_shifted = red_df.shift(-1)
    red_df_shifted.columns = [
        str(col) + '_2' for col in red_df_shifted.columns
    ]
    stacked_df = pd.concat([red_df, red_df_shifted], axis=1)
    stacked_df["@@path"] = stacked_df["event_activity"] + "," + stacked_df[
        "event_activity_2"]
    stacked_df["@@diff"] = (
        stacked_df["event_timestamp_2"] -
        stacked_df["event_timestamp"]).astype('timedelta64[s]')
    stacked_df = stacked_df[stacked_df["@@path"] == act1 + "," + act2]
    stacked_df = stacked_df[minp <= stacked_df["@@diff"]]
    stacked_df = stacked_df[stacked_df["@@diff"] <= maxp]
    filt_df = red_df[red_df["event_id"].isin(stacked_df["event_id"])
                     | red_df["event_id"].isin(stacked_df["event_id_2"])]
    return filter_metaclass.do_filtering(df, filt_df)
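The heart of this filter is pairing every event with its successor via shift(-1) plus concat, then selecting on the concatenated path string and on the time difference. A minimal pandas-only sketch of the same idea, on toy data (column values are made up):

import pandas as pd

# toy exploded-style log: one object ("o1") with three events
df = pd.DataFrame({
    "event_id": [1, 2, 3],
    "event_activity": ["create", "check", "ship"],
    "event_timestamp": pd.to_datetime(
        ["2024-01-01 09:00", "2024-01-01 09:30", "2024-01-01 12:00"]),
    "order": ["o1", "o1", "o1"],
})
df = df.sort_values(["order", "event_timestamp"])

# pair every event with its successor on the same sorted frame
shifted = df.shift(-1)
shifted.columns = [str(c) + "_2" for c in shifted.columns]
stacked = pd.concat([df, shifted], axis=1)

stacked["@@path"] = stacked["event_activity"] + "," + stacked["event_activity_2"]
stacked["@@diff"] = (stacked["event_timestamp_2"]
                     - stacked["event_timestamp"]).dt.total_seconds()

# keep only "check,ship" transitions that took between one and four hours
sel = stacked[(stacked["@@path"] == "check,ship")
              & (stacked["@@diff"] >= 3600) & (stacked["@@diff"] <= 4 * 3600)]
print(sel[["event_id", "@@path", "@@diff"]])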
Example #2
def do_negative_filtering(dataframe, fd0, parameters=None):
    if parameters is None:
        parameters = {}

    try:
        if dataframe.type == "succint":
            dataframe = succint_mdl_to_exploded_mdl.apply(dataframe)
    except:
        pass

    fd1 = dataframe[dataframe["event_id"].isin(fd0["event_id"])]
    cols = [x for x in fd1.columns if not x.startswith("event_")]
    cols_values = {}
    collation = []
    for c in cols:
        cols_values[c] = list(fd1.dropna(subset=[c])[c].unique())
        collation.append(dataframe[dataframe[c].isin(cols_values[c])])

    if collation:
        df2 = pd.concat(collation)
    else:
        df2 = pd.DataFrame()

    i1 = dataframe.index
    i2 = df2.index

    return dataframe[~i1.isin(i2)]
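The idea of the function: take the rows marked for removal (fd0), collect every object identifier they mention, pull in all rows mentioning any of those identifiers, and drop that whole index set. A pandas-only sketch with a single toy object column:

import pandas as pd

df = pd.DataFrame({
    "event_id": [1, 2, 3, 4],
    "event_activity": ["a", "b", "c", "d"],
    "order": ["o1", "o1", "o2", "o2"],
})
# rows selected for removal (here: event 1) and the objects they mention
fd0 = df[df["event_id"] == 1]
bad_orders = fd0["order"].dropna().unique()

# every row touching any of those objects
df2 = df[df["order"].isin(bad_orders)]

# keep everything whose index is not in the removal set
result = df[~df.index.isin(df2.index)]
print(result)  # events 3 and 4 survive: they share no object with event 1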
Example #3
    def __init__(self, name, mdl_path, shared_logs):
        self.shared_logs = shared_logs
        self.shared_logs_names = []
        self.parent = self
        self.obj_types_str = None
        self.act_obj_types = None
        self.initial_act_obj_types = None
        self.activities = []
        self.obj_types = []
        self.clusters = {}
        self.clustersrepr = ""
        self.clusterid = str(id(self))
        self.stream = None
        self.nodes = None
        self.events_corr = None
        self.events_corr2 = None
        self.matrix = None
        self.powered_matrix = None
        self.powered_matrix_2 = None
        self.graph = None
        self.row_sum = None
        self.overall_sum = 0
        self.selected_act_obj_types = None
        self.name = name
        self.mdl_path = mdl_path
        if "ocel" in self.mdl_path:
            self.succint_dataframe, odataframe = ocel_importer.apply(
                self.mdl_path)
        else:
            self.succint_dataframe = mdl_importer.apply(self.mdl_path)
        self.succint_dataframe = self.succint_dataframe.dropna(
            subset=["event_activity"])
        self.succint_dataframe.type = "succint"
        self.exploded_dataframe = succint_mdl_to_exploded_mdl.apply(
            self.succint_dataframe)
        self.exploded_dataframe.type = "exploded"
        self.session_objects = {}

        self.possible_model_types = {
            "mvp_frequency": "MVP (frequency)",
            "mvp_performance": "MVP (performance)",
            "process_tree": "oc-PTree",
            "petri_alpha": "oc-Net-Alpha",
            "petri_inductive": "oc-Net-Inductive",
            "dfg": "oc-DFG",
            "multigraph": "OC Multigraph"
        }
        self.selected_model_type = defaults.DEFAULT_MODEL_TYPE
        self.possible_classifiers = {"activity", "combined"}
        self.selected_classifier = "activity"
        self.selected_aggregation_measure = "events"
        self.selected_decoration_measure = "frequency"
        self.selected_projection = "no"
        self.selected_min_acti_count = 800
        self.selected_min_edge_freq_count = 800
        self.epsilon = 0.0
        self.noise_threshold = 0.0
        self.model_view = ""
Example #4
def apply(df, min_acti_freq=0):
    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except:
        pass
    activ = dict(df.groupby("event_id").first()["event_activity"].value_counts())
    activ = [x for x, y in activ.items() if y >= min_acti_freq]
    return df[df["event_activity"].isin(activ)]
Example #5
    def get_log_obj_type(self, objtype):
        columns = [x for x in self.exploded_dataframe.columns if x.startswith("event_")] + [objtype]
        dataframe = self.exploded_dataframe[columns].dropna(how="any", subset=[objtype])
        dataframe = succint_mdl_to_exploded_mdl.apply(dataframe)
        dataframe = dataframe.rename(columns={"event_activity": "concept:name", "event_timestamp": "time:timestamp",
                                              objtype: "case:concept:name"})
        # 'records' spelled out: the 'r' shorthand is not accepted by recent pandas
        stream = EventStream(dataframe.to_dict('records'))
        log = log_conv_factory.apply(stream)
        log = sorting.sort_timestamp(log, "time:timestamp")
        exported_log = base64.b64encode(xes_exporter.export_log_as_string(log)).decode("utf-8")
        return self.name + "_" + objtype, "xes", exported_log
Example #6
def filter_paths(df, paths, parameters=None):
    """
    Apply a filter on traces containing / not containing a path

    Parameters
    ----------
    df
        Dataframe
    paths
        Paths to filter on
    parameters
        Possible parameters of the algorithm, including:
            case_id_glue -> Case ID column in the dataframe
            attribute_key -> Attribute we want to filter
            positive -> Specifies if the filter should be applied including traces (positive=True)
            or excluding traces (positive=False)
    Returns
    ----------
    df
        Filtered dataframe
    """
    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except:
        pass
    if parameters is None:
        parameters = {}
    paths = [path[0] + "," + path[1] for path in paths]
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    attribute_key = parameters[
        PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
    df = df.sort_values([case_id_glue, "event_timestamp"])
    positive = parameters["positive"] if "positive" in parameters else True
    filt_df = df[[case_id_glue, attribute_key, "event_id"]]
    filt_df_shifted = filt_df.shift(-1)
    filt_df_shifted.columns = [
        str(col) + '_2' for col in filt_df_shifted.columns
    ]
    stacked_df = pd.concat([filt_df, filt_df_shifted], axis=1)
    stacked_df["@@path"] = stacked_df[attribute_key] + "," + stacked_df[
        attribute_key + "_2"]
    stacked_df = stacked_df[stacked_df["@@path"].isin(paths)]
    i1 = df.set_index("event_id").index
    i2 = stacked_df.set_index("event_id").index
    i3 = stacked_df.set_index("event_id_2").index
    if positive:
        return df[i1.isin(i2) | i1.isin(i3)]
    else:
        return df[~i1.isin(i2) & ~i1.isin(i3)]
Example #7
def get_stream_from_dataframe(df, parameters=None):
    if parameters is None:
        parameters = {}

    df_type = getattr(df, "type", None)  # the custom attribute may be absent
    df = df.sort_values(["event_timestamp", "event_id"])
    if df_type == "succint":
        df = succint_mdl_to_exploded_mdl.apply(df)

    columns = [x for x in df.columns if not x.startswith("event") or x == "event_activity" or x == "event_id" or x == "event_timestamp"]
    df = df[columns]

    stream = converter.apply(df, variant=converter.Variants.TO_EVENT_STREAM)

    return stream
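The column filter reads a little backwards: it keeps every column that does not start with "event" (the object-type columns) plus the three standard event columns. A quick check of what survives, with hypothetical column names:

cols = ["event_id", "event_activity", "event_timestamp",
        "event_cost", "order", "item"]
keep = [c for c in cols
        if not c.startswith("event")
        or c in ("event_activity", "event_id", "event_timestamp")]
print(keep)  # drops only "event_cost"; object columns pass through untouched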
Example #8
def get(df):
    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except:
        pass
    activ = dict(
        df.groupby("event_id").first()["event_activity"].value_counts())
    max_activ_freq = max(activ.values()) if len(activ.values()) > 0 else 0
    max_edge_freq = 0
    # measure the DFG edge frequencies once per object type (non-event column),
    # using that object type as the case notion
    persps = [x for x in df.columns if not x.startswith("event_")]
    for persp in persps:
        red_df = df.dropna(subset=[persp])
        dfg = df_statistics.get_dfg_graph(red_df,
                                          activity_key="event_activity",
                                          timestamp_key="event_timestamp",
                                          case_id_glue=persp)
        if len(dfg) > 0:
            max_edge_freq = max(max_edge_freq, max(dfg.values()))
    return {"max_activ_freq": max_activ_freq, "max_edge_freq": max_edge_freq}
Example #9
def apply(df,
          model_type_variant=MODEL1,
          rel_ev_variant=REL_DFG,
          node_freq_variant=TYPE1,
          edge_freq_variant=TYPE11,
          parameters=None):
    if parameters is None:
        parameters = {}

    conversion_needed = False

    try:
        if df.type == "succint":
            conversion_needed = True
    except:
        pass

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    if conversion_needed:
        df = succint_mdl_to_exploded_mdl.apply(df)

    df = clean_objtypes.perfom_cleaning(df, parameters=parameters)

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    model = model_factory.apply(df, variant=model_type_variant)
    rel_ev = rel_ev_factory.apply(df, model, variant=rel_ev_variant)
    rel_act = rel_act_factory.apply(df, model, rel_ev)
    node_freq = node_freq_factory.apply(df,
                                        model,
                                        rel_ev,
                                        rel_act,
                                        variant=node_freq_variant)
    edge_freq = edge_freq_factory.apply(df,
                                        model,
                                        rel_ev,
                                        rel_act,
                                        variant=edge_freq_variant)

    model.set_rel_ev(rel_ev)
    model.set_rel_act(rel_act)
    model.set_node_freq(node_freq)
    model.set_edge_freq(edge_freq)

    return model
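If this apply is the gen_framework discovery entry point (Example #17 below imports one as pm4pymdl.algo.mvp.gen_framework.algorithm), a typical call would chain importer, discovery and visualizer; the log path is the one used in Example #17:

from pm4pymdl.objects.mdl.importer import importer as mdl_importer
from pm4pymdl.algo.mvp.gen_framework import algorithm as discovery
from pm4pymdl.visualization.mvp.gen_framework import visualizer as vis_factory

# import a succint MDL table and discover a model with the default variants
# (model_type_variant=MODEL1, rel_ev_variant=REL_DFG, ...)
df = mdl_importer.apply("../example_logs/mdl/order_management.mdl")
model = discovery.apply(df)
gviz = vis_factory.apply(model)
vis_factory.view(gviz)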
Example #10
def filter_float(df, act, attr, v1, v2, parameters=None):
    if parameters is None:
        parameters = {}

    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except:
        pass

    red_df = df[df["event_activity"] == act]
    red_df = red_df[df[attr] >= v1]
    red_df = red_df[df[attr] <= v2]

    red_df = red_df.dropna(how="all", axis=1)

    return filter_metaclass.do_filtering(df, red_df, parameters=parameters)
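After restricting to one activity and one numeric range, dropna(how="all", axis=1) discards object columns that are entirely empty for the surviving rows. A toy pandas-only illustration (column names are made up):

import pandas as pd

df = pd.DataFrame({
    "event_id": [1, 2],
    "event_activity": ["pay", "ship"],
    "event_amount": [50.0, None],
    "order": ["o1", None],
    "package": [None, "p1"],
})
red = df[df["event_activity"] == "pay"]
red = red[(red["event_amount"] >= 10) & (red["event_amount"] <= 100)]
# drop object columns that are empty for this activity (here: "package")
red = red.dropna(how="all", axis=1)
print(red.columns.tolist())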
Example #11
def filter_ot(df, act, ot, v1, v2, parameters=None):
    if parameters is None:
        parameters = {}

    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except:
        pass

    red_df0 = df[df["event_activity"] == act]
    red_df = red_df0.dropna(subset=[ot])

    dct = red_df.groupby("event_id").size().to_dict()
    lst = [x for x, y in dct.items() if v1 <= y <= v2]

    red_df = red_df0[red_df0["event_id"].isin(lst)]

    return filter_metaclass.do_filtering(df, red_df, parameters=parameters)
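Here the group size per event_id equals the number of related objects of type ot, because the exploded table holds one row per event-object pair. A pandas-only sketch of the counting step:

import pandas as pd

# exploded rows: event 1 relates to two items, event 2 to one
df = pd.DataFrame({
    "event_id": [1, 1, 2],
    "event_activity": ["pick", "pick", "pick"],
    "item": ["i1", "i2", "i3"],
})
counts = df.dropna(subset=["item"]).groupby("event_id").size().to_dict()
keep = [eid for eid, n in counts.items() if 2 <= n <= 5]
print(df[df["event_id"].isin(keep)])  # only event 1 (two items) survives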
Example #12
def apply(df, file_path, obj_df=None, parameters=None):
    if parameters is None:
        parameters = {}

    if file_path.endswith(".csv") or file_path.endswith(".mdl"):
        conversion_needed = True
        try:
            if df.type == "succint":
                conversion_needed = False
        except:
            pass

        if conversion_needed:
            df = exploded_mdl_to_succint_mdl.apply(df)

        if obj_df is not None:
            df = pd.concat([df, obj_df])

        df.to_csv(file_path, index=False, sep=',', quotechar='\"')
    else:
        from pm4pymdl.util.parquet_exporter import exporter as parquet_exporter

        conversion_needed = False
        try:
            if df.type == "succint":
                conversion_needed = True
        except:
            pass

        if conversion_needed:
            df = succint_mdl_to_exploded_mdl.apply(df)

        if obj_df is not None:
            df = pd.concat([df, obj_df])

        new_parameters = deepcopy(parameters)
        new_parameters["compression"] = "gzip"
        parquet_exporter.export_df(df, file_path, parameters=new_parameters)
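A minimal round-trip through the public importer/exporter wrappers (module paths as used in Examples #14 and #17; the output filename is hypothetical):

from pm4pymdl.objects.mdl.importer import importer as mdl_importer
from pm4pymdl.objects.mdl.exporter import exporter as mdl_exporter

# re-export a succint table: the ".mdl" suffix selects the CSV branch above
df = mdl_importer.apply("example_logs/mdl/mdl-running-example.mdl")
mdl_exporter.apply(df, "roundtrip.mdl")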
Example #13
def preprocess(df, parameters=None):
    if parameters is None:
        parameters = {}

    conversion_needed = False

    try:
        if df.type == "succint":
            conversion_needed = True
    except:
        pass

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    if conversion_needed:
        df = succint_mdl_to_exploded_mdl.apply(df)

    #df = clean_objtypes.perfom_cleaning(df, parameters=parameters)

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    return df
Example #14
from pm4pymdl.objects.mdl.importer import importer as mdl_importer
from pm4pymdl.algo.mvp.utils import succint_mdl_to_exploded_mdl
from pm4pymdl.objects.mdl.exporter import exporter as mdl_exporter
import random
import pandas as pd

succint_df = mdl_importer.apply("example_logs/mdl/mdl-running-example.mdl")
df = succint_mdl_to_exploded_mdl.apply(succint_df)
products = df["products"].dropna().unique()
customers = df["customers"].dropna().unique()

objects = []
for p in products:
    objects.append({
        "object_id": p,
        "object_type": "products",
        "object_cost": random.randrange(100, 500),
        "object_producer": random.choice(["A", "B", "C"])
    })
for c in customers:
    objects.append({
        "object_id": c,
        "object_type": "customers",
        "object_age": random.randrange(30, 60),
        "object_bankaccount": random.randrange(1000, 100000)
    })

print(objects)

obj_df = pd.DataFrame(objects)
mdl_exporter.apply(df, "mdl-running-example-w-objects.mdl", obj_df=obj_df)
Example #15
def apply(df, discovery_algorithm=discover_inductive, parameters=None):
    if parameters is None:
        parameters = {}

    allowed_activities = parameters[
        "allowed_activities"] if "allowed_activities" in parameters else None
    debug = parameters["debug"] if "debug" in parameters else True

    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
            df.type = "exploded"
    except:
        pass

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    min_node_freq = parameters[
        "min_node_freq"] if "min_node_freq" in parameters else 0
    min_edge_freq = parameters[
        "min_edge_freq"] if "min_edge_freq" in parameters else 0

    df = clean_frequency.apply(df, min_node_freq)
    df = clean_arc_frequency.apply(df, min_edge_freq)

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    persps = [x for x in df.columns if not x.startswith("event_")]

    ret = {}
    ret["nets"] = {}
    ret["act_count"] = {}
    ret["replay"] = {}
    ret["group_size_hist"] = {}
    ret["act_count_replay"] = {}
    ret["group_size_hist_replay"] = {}
    ret["aligned_traces"] = {}
    ret["place_fitness_per_trace"] = {}
    ret["aggregated_statistics_frequency"] = {}
    ret["aggregated_statistics_performance_min"] = {}
    ret["aggregated_statistics_performance_max"] = {}
    ret["aggregated_statistics_performance_median"] = {}
    ret["aggregated_statistics_performance_mean"] = {}

    diff_log = 0
    diff_model = 0
    diff_token_replay = 0
    diff_performance_annotation = 0
    diff_basic_stats = 0

    for persp in persps:
        aa = time.time()
        if debug:
            print(persp, "getting log")
        log = algorithm.apply(df, persp, parameters=parameters)
        if debug:
            print(len(log))

        if allowed_activities is not None:
            if persp not in allowed_activities:
                continue
            filtered_log = attributes_filter.apply_events(
                log, allowed_activities[persp])
        else:
            filtered_log = log
        bb = time.time()

        diff_log += (bb - aa)

        # filtered_log = variants_filter.apply_auto_filter(deepcopy(filtered_log), parameters={"decreasingFactor": 0.5})

        if debug:
            print(len(log))
            print(persp, "got log")

        cc = time.time()
        #net, im, fm = inductive_miner.apply(filtered_log)
        net, im, fm = discovery_algorithm(filtered_log)
        """if persp == "items":
            trans_map = {t.label:t for t in net.transitions}
            source_place_it = list(trans_map["item out of stock"].in_arcs)[0].source
            target_place_re = list(trans_map["reorder item"].out_arcs)[0].target
            skip_trans_1 = PetriNet.Transition(str(uuid.uuid4()), None)
            net.transitions.add(skip_trans_1)
            add_arc_from_to(source_place_it, skip_trans_1, net)
            add_arc_from_to(skip_trans_1, target_place_re, net)"""

        #net = reduce_petri_net(net)
        dd = time.time()

        diff_model += (dd - cc)

        # net, im, fm = alpha_miner.apply(filtered_log)
        if debug:
            print(persp, "got model")

        xx1 = time.time()
        activ_count = algorithm.apply(df,
                                      persp,
                                      variant="activity_occurrence",
                                      parameters=parameters)
        if debug:
            print(persp, "got activ_count")
        xx2 = time.time()

        ee = time.time()
        variants_idx = variants_module.get_variants_from_log_trace_idx(log)
        # variants = variants_module.convert_variants_trace_idx_to_trace_obj(log, variants_idx)
        # parameters_tr = {PARAM_ACTIVITY_KEY: "concept:name", "variants": variants}

        if debug:
            print(persp, "got variants")

        aligned_traces, place_fitness_per_trace, transition_fitness_per_trace, notexisting_activities_in_model = tr_factory.apply(
            log,
            net,
            im,
            fm,
            parameters={
                "enable_pltr_fitness": True,
                "disable_variants": True
            })

        if debug:
            print(persp, "done tbr")

        element_statistics = performance_map.single_element_statistics(
            log, net, im, aligned_traces, variants_idx)

        if debug:
            print(persp, "done element_statistics")
        ff = time.time()

        diff_token_replay += (ff - ee)

        aggregated_statistics = performance_map.aggregate_statistics(
            element_statistics)

        if debug:
            print(persp, "done aggregated_statistics")

        element_statistics_performance = performance_map.single_element_statistics(
            log, net, im, aligned_traces, variants_idx)

        if debug:
            print(persp, "done element_statistics_performance")

        gg = time.time()

        aggregated_statistics_performance_min = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="min")
        aggregated_statistics_performance_max = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="max")
        aggregated_statistics_performance_median = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="median")
        aggregated_statistics_performance_mean = performance_map.aggregate_statistics(
            element_statistics_performance,
            measure="performance",
            aggregation_measure="mean")

        hh = time.time()

        diff_performance_annotation += (hh - ee)

        if debug:
            print(persp, "done aggregated_statistics_performance")

        group_size_hist = algorithm.apply(df,
                                          persp,
                                          variant="group_size_hist",
                                          parameters=parameters)

        if debug:
            print(persp, "done group_size_hist")

        occurrences = {}
        for trans in transition_fitness_per_trace:
            occurrences[trans.label] = set()
            for trace in transition_fitness_per_trace[trans]["fit_traces"]:
                if trace not in transition_fitness_per_trace[trans]["underfed_traces"]:
                    case_id = trace.attributes["concept:name"]
                    for event in trace:
                        if event["concept:name"] == trans.label:
                            occurrences[trans.label].add(
                                (case_id, event["event_id"]))
            # print(transition_fitness_per_trace[trans])

        len_different_ids = {}
        for act in occurrences:
            len_different_ids[act] = len(set(x[1] for x in occurrences[act]))

        eid_acti_count = {}
        for act in occurrences:
            eid_acti_count[act] = {}
            for x in occurrences[act]:
                # x = (case_id, event_id): count fit events per case for this activity
                if x[0] not in eid_acti_count[act]:
                    eid_acti_count[act][x[0]] = 0
                eid_acti_count[act][x[0]] = eid_acti_count[act][x[0]] + 1
            eid_acti_count[act] = sorted(list(eid_acti_count[act].values()))

        ii = time.time()

        diff_basic_stats += (ii - hh) + (xx2 - xx1)

        ret["nets"][persp] = [net, im, fm]
        ret["act_count"][persp] = activ_count
        ret["aligned_traces"][persp] = aligned_traces
        ret["place_fitness_per_trace"][persp] = place_fitness_per_trace
        ret["aggregated_statistics_frequency"][persp] = aggregated_statistics
        ret["aggregated_statistics_performance_min"][
            persp] = aggregated_statistics_performance_min
        ret["aggregated_statistics_performance_max"][
            persp] = aggregated_statistics_performance_max
        ret["aggregated_statistics_performance_median"][
            persp] = aggregated_statistics_performance_median
        ret["aggregated_statistics_performance_mean"][
            persp] = aggregated_statistics_performance_mean

        ret["replay"][persp] = aggregated_statistics
        ret["group_size_hist"][persp] = group_size_hist
        ret["act_count_replay"][persp] = len_different_ids
        ret["group_size_hist_replay"][persp] = eid_acti_count

    ret["computation_statistics"] = {
        "diff_log": diff_log,
        "diff_model": diff_model,
        "diff_token_replay": diff_token_replay,
        "diff_performance_annotation": diff_performance_annotation,
        "diff_basic_stats": diff_basic_stats
    }

    return ret
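A hypothetical invocation, assuming this apply is exposed as a module-level discovery function; the parameter keys come directly from the body above, and df stands for an exploded MDL table such as the one built in Example #17:

# df: an exploded MDL table, e.g. the filtered_table_2 built in Example #17
parameters = {
    "min_node_freq": 5,   # forwarded to clean_frequency.apply
    "min_edge_freq": 5,   # forwarded to clean_arc_frequency.apply
    "debug": False,       # silence the per-perspective progress prints
}
ret = apply(df, parameters=parameters)
for persp, (net, im, fm) in ret["nets"].items():
    print(persp, "->", len(net.transitions), "transitions")
print(ret["computation_statistics"])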
Example #16
from pm4pymdl.objects.mdl.importer import importer as mdl_importer
from pm4pymdl.algo.mvp.utils import succint_mdl_to_exploded_mdl, succint_stream_to_exploded_stream
from copy import deepcopy
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import networkx as nx
from networkx.algorithms.community import asyn_lpa_communities
from networkx.algorithms.community import quality

log0 = mdl_importer.apply("example_logs/mdl/log_opp_red.mdl")
log = succint_mdl_to_exploded_mdl.apply(log0)
stream = log.to_dict('records')  # the 'r' shorthand is not accepted by recent pandas
nodes = dict()
for ev in stream:
    ev2 = {x: y for x, y in ev.items() if str(y) != "nan"}
    id = "event_id=" + str(ev2["event_id"])
    activity = "event_activity=" + ev2["event_activity"]
    if id not in nodes:
        nodes[id] = len(nodes)
    if activity not in nodes:
        nodes[activity] = len(nodes)
    for col in ev2:
        if not col.startswith("event_"):
            val = ev2[col]
            oid = "object_id=" + str(val)
            cla = "class=" + str(col)
            if oid not in nodes:
                nodes[oid] = len(nodes)
            if cla not in nodes:
                nodes[cla] = len(nodes)
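The snippet stops right after numbering the nodes, but its networkx imports suggest the intended continuation: connect events to their activities and objects and run label-propagation community detection. A speculative sketch of that next step (the edge construction is an assumption, not part of the original script):

import networkx as nx
from networkx.algorithms.community import asyn_lpa_communities

G = nx.Graph()
G.add_nodes_from(nodes.values())
for ev in stream:
    ev2 = {x: y for x, y in ev.items() if str(y) != "nan"}
    ev_node = nodes["event_id=" + str(ev2["event_id"])]
    G.add_edge(ev_node, nodes["event_activity=" + ev2["event_activity"]])
    for col in ev2:
        if not col.startswith("event_"):
            G.add_edge(ev_node, nodes["object_id=" + str(ev2[col])])

communities = list(asyn_lpa_communities(G))
print(len(communities), "communities")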
Example #17
from pm4pymdl.objects.mdl.importer import importer as mdl_importer
from pm4pymdl.algo.mvp.utils import succint_mdl_to_exploded_mdl
from pm4pymdl.algo.mvp.gen_framework import algorithm as discovery
from pm4pymdl.visualization.mvp.gen_framework import visualizer as vis_factory
from pm4pymdl.objects.mdl.exporter import exporter as mdl_exporter

# import a succint MDL table
succint_table = mdl_importer.apply("../example_logs/mdl/order_management.mdl")
print(len(succint_table), succint_table.type)
# convert it into an exploded MDL table
exploded_table = succint_mdl_to_exploded_mdl.apply(succint_table)
print(len(exploded_table), exploded_table.type)
# keep only events related to orders that have a profit >= 200;
# to filter on the exploded table, follow this procedure:
f0 = exploded_table[exploded_table["event_profit"] >= 200]
f1 = exploded_table[exploded_table["order"].isin(f0["order"])]
filtered_exploded_table = exploded_table[exploded_table["event_id"].isin(
    f1["event_id"])]

# suppose we also want the packages related to the filtered orders; then:
f2 = exploded_table[exploded_table["package"].isin(
    filtered_exploded_table["package"])]
filtered_table_2 = exploded_table[
    exploded_table["event_id"].isin(filtered_exploded_table["event_id"])
    | exploded_table["event_id"].isin(f2["event_id"])]

# mine a process model out of the filtered table
model = discovery.apply(filtered_table_2)
gviz = vis_factory.apply(model)
vis_factory.view(gviz)