Example #1
def apply(log: Union[EventLog, EventStream],
          activity: str,
          parameters: Optional[Dict[Any, Any]] = None) -> EventLog:
    """
    Filters the event log, keeping for each trace the suffix that follows the selected (first or last) occurrence of the given activity

    Parameters
    ----------------
    log
        Event log
    activity
        Target activity
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY => the attribute to be used as activity key.
        - Parameters.STRICT => applies the filter strictly (the selected occurrence of the activity is itself cut from the suffix).
        - Parameters.FIRST_OR_LAST => decides whether the first or the last occurrence of the activity should be selected.

    Returns
    ----------------
    filtered_log
        Filtered event log
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log,
                              variant=log_converter.Variants.TO_EVENT_LOG,
                              parameters=parameters)

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    first_or_last = exec_utils.get_param_value(Parameters.FIRST_OR_LAST,
                                               parameters, "first")
    strict = exec_utils.get_param_value(Parameters.STRICT, parameters, True)

    filtered_log = EventLog(attributes=log.attributes,
                            extensions=log.extensions,
                            globals=log.omni_present,
                            classifiers=log.classifiers,
                            properties=log.properties)

    for trace in log:
        activities = [
            x[activity_key] if activity_key in x else None for x in trace
        ]
        if activity in activities:
            if first_or_last == "first":
                op = min
            else:
                op = max
            idx_activity = op(i for i in range(len(activities))
                              if activities[i] == activity)
            if strict:
                idx_activity = idx_activity + 1
            filtered_trace = Trace(attributes=trace.attributes,
                                   properties=trace.properties)
            for i in range(idx_activity, len(trace)):
                filtered_trace.append(trace[i])
            filtered_log.append(filtered_trace)

    return filtered_log
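A minimal usage sketch of the suffix filter above (the importer module path follows recent pm4py releases; the file name and the activity "check ticket" are assumptions for illustration):

# hypothetical usage: keep, for each trace, the events after the LAST
# occurrence of "check ticket"; with STRICT=True that occurrence itself is cut
from pm4py.objects.log.importer.xes import importer as xes_importer

log = xes_importer.apply("running-example.xes")
suffix_log = apply(log, "check ticket",
                   parameters={Parameters.STRICT: True,
                               Parameters.FIRST_OR_LAST: "last"})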
Example #2
def import_from_context(context, num_traces, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None

    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST) or elem.tag.endswith(xes_constants.TAG_CONTAINER):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [x for x in classifier_value.split("'")
                                                                             if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
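A sketch of how the iterparse context expected above could be built with lxml and fed to import_from_context (within pm4py this is normally wrapped by the importer's own apply function; the file name is an assumption):

from lxml import etree

xes_file = "running-example.xes"  # assumed path

# first pass: count the <trace> elements so the progress bar has a total
num_traces = 0
for _, elem in etree.iterparse(xes_file, events=("end",)):
    if elem.tag.endswith(xes_constants.TAG_TRACE):
        num_traces += 1
    elem.clear()

# second pass: the start/end context consumed by import_from_context
context = etree.iterparse(xes_file, events=("start", "end"))
log = import_from_context(context, num_traces)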
Example #3
def get_log_traces_until_activity(log, activity, parameters=None):
    """
    Gets a reduced version of the log containing, for each trace, only the events before a
    specified activity

    Parameters
    -------------
    log
        Event log
    activity
        Activity to reach
    parameters
        Possible parameters of the algorithm, including:
            PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute to be used as activity
            PARAMETER_CONSTANT_TIMESTAMP_KEY -> attribute to be used as timestamp
            "duration" -> attribute to read the elapsed time from, instead of the timestamp difference
            "use_future_attributes" -> if True, also appends the events following the target activity,
            stripped of their activity attribute

    Returns
    -------------
    new_log
        New log, with each kept trace cut just before the target activity
    traces_interlapsed_time_to_act
        For each kept trace, the time elapsed between the event preceding the target activity and the target
        activity itself (or the value of the duration attribute, if provided)
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters[
        constants.
        PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        constants.
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    duration_attribute = parameters[
        "duration"] if "duration" in parameters else None
    use_future_attributes = parameters[
        "use_future_attributes"] if "use_future_attributes" in parameters else False

    new_log = EventLog()
    traces_interlapsed_time_to_act = []

    i = 0
    while i < len(log):
        ev_in_tr_w_act = sorted([
            j for j in range(len(log[i]))
            if log[i][j][activity_key] == activity
        ])
        if ev_in_tr_w_act and ev_in_tr_w_act[0] > 0:
            new_trace = Trace(log[i][0:ev_in_tr_w_act[0]])
            for attr in log[i].attributes:
                new_trace.attributes[attr] = log[i].attributes[attr]

            if duration_attribute is None:
                try:
                    curr_trace_interlapsed_time_to_act = log[i][ev_in_tr_w_act[0]][timestamp_key].timestamp() - \
                                                         log[i][ev_in_tr_w_act[0] - 1][timestamp_key].timestamp()
                except Exception:
                    curr_trace_interlapsed_time_to_act = log[i][ev_in_tr_w_act[0]][timestamp_key] - \
                                                         log[i][ev_in_tr_w_act[0] - 1][timestamp_key]
                    logging.error("timestamp_key value is not a datetime; using the raw difference")
            else:
                curr_trace_interlapsed_time_to_act = log[i][
                    ev_in_tr_w_act[0]][duration_attribute]

            traces_interlapsed_time_to_act.append(
                curr_trace_interlapsed_time_to_act)

            if use_future_attributes:
                for j in range(ev_in_tr_w_act[0] + 1, len(log[i])):
                    new_ev = deepcopy(log[i][j])
                    if activity_key in new_ev:
                        del new_ev[activity_key]
                    new_trace.append(new_ev)

            new_log.append(new_trace)
        i = i + 1

    return new_log, traces_interlapsed_time_to_act
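A short usage sketch (the activity name "decide" and the already-loaded log are assumptions); note that traces in which the activity is missing, or is the very first event, are dropped:

# hypothetical usage: cut each trace just before the first "decide" event and
# collect the time elapsed from the preceding event to "decide"
prefix_log, elapsed_times = get_log_traces_until_activity(log, "decide")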
Example #4
    def read_event(self):
        """
        Gets the next event from the iterator

        Returns
        ------------
        event
            Event
        """
        tree = self.tree
        while True:
            tree_event, elem = next(self.context)

            if tree_event == _EVENT_START:
                parent = tree[
                    elem.getparent()] if elem.getparent() in tree else None

                if elem.tag.endswith(xes_constants.TAG_TRACE):
                    self.trace = Trace()
                    tree[elem] = self.trace.attributes
                    self.reading_trace = True
                    continue

                if elem.tag.endswith(xes_constants.TAG_EVENT):
                    self.event = Event()
                    tree[elem] = self.event
                    self.reading_event = True
                    continue

                if self.reading_event or self.reading_trace:
                    if elem.tag.endswith(xes_constants.TAG_STRING):
                        if parent is not None:
                            tree = parse_attribute(
                                elem, parent, elem.get(xes_constants.KEY_KEY),
                                elem.get(xes_constants.KEY_VALUE), tree)
                        continue

                    elif elem.tag.endswith(xes_constants.TAG_DATE):
                        try:
                            dt = self.date_parser.apply(
                                elem.get(xes_constants.KEY_VALUE))
                            tree = parse_attribute(
                                elem, parent, elem.get(xes_constants.KEY_KEY),
                                dt, tree)
                        except TypeError:
                            logging.info(
                                "failed to parse date: " +
                                str(elem.get(xes_constants.KEY_VALUE)))
                        except ValueError:
                            logging.info(
                                "failed to parse date: " +
                                str(elem.get(xes_constants.KEY_VALUE)))
                        continue

                    elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                        if parent is not None:
                            try:
                                val = float(elem.get(xes_constants.KEY_VALUE))
                                tree = parse_attribute(
                                    elem, parent,
                                    elem.get(xes_constants.KEY_KEY), val, tree)
                            except ValueError:
                                logging.info(
                                    "failed to parse float: " +
                                    str(elem.get(xes_constants.KEY_VALUE)))
                        continue

                    elif elem.tag.endswith(xes_constants.TAG_INT):
                        if parent is not None:
                            try:
                                val = int(elem.get(xes_constants.KEY_VALUE))
                                tree = parse_attribute(
                                    elem, parent,
                                    elem.get(xes_constants.KEY_KEY), val, tree)
                            except ValueError:
                                logging.info(
                                    "failed to parse int: " +
                                    str(elem.get(xes_constants.KEY_VALUE)))
                        continue

                    elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                        if parent is not None:
                            try:
                                val0 = elem.get(xes_constants.KEY_VALUE)
                                val = False
                                if str(val0).lower() == "true":
                                    val = True
                                tree = parse_attribute(
                                    elem, parent,
                                    elem.get(xes_constants.KEY_KEY), val, tree)
                            except ValueError:
                                logging.info(
                                    "failed to parse boolean: " +
                                    str(elem.get(xes_constants.KEY_VALUE)))
                        continue

                    elif elem.tag.endswith(xes_constants.TAG_LIST):
                        if parent is not None:
                            # lists have no value, hence we put None as a value
                            tree = parse_attribute(
                                elem, parent, elem.get(xes_constants.KEY_KEY),
                                None, tree)
                        continue

                    elif elem.tag.endswith(xes_constants.TAG_ID):
                        if parent is not None:
                            tree = parse_attribute(
                                elem, parent, elem.get(xes_constants.KEY_KEY),
                                elem.get(xes_constants.KEY_VALUE), tree)
                        continue

            elif tree_event == _EVENT_END:
                if elem in tree:
                    del tree[elem]
                elem.clear()
                if elem.getprevious() is not None:
                    try:
                        del elem.getparent()[0]
                    except TypeError:
                        pass

                if elem.tag.endswith(xes_constants.TAG_EVENT):
                    self.reading_event = False
                    if self.acceptance_condition(self.event):
                        for attr in self.trace.attributes:
                            self.event[constants.CASE_ATTRIBUTE_PREFIX +
                                       attr] = self.trace.attributes[attr]
                        return self.event
                    continue

                elif elem.tag.endswith(xes_constants.TAG_TRACE):
                    self.reading_trace = False
                    continue

                elif elem.tag.endswith(xes_constants.TAG_LOG):
                    self.reading_log = False
                    break
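The method above is meant to be called repeatedly on a streaming XES importer object; a minimal consumption loop could look as follows (the importer instance itself is assumed to exist and to expose the reading_log flag used above):

def iter_events(streaming_importer):
    # pulls events one by one; when the closing </log> tag is reached,
    # read_event() returns None and reading_log is set to False
    while streaming_importer.reading_log:
        event = streaming_importer.read_event()
        if event is None:
            break
        yield event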
Example #5
def apply(dfg, start_activities, end_activities, parameters=None):
    """
    Applies the playout algorithm on a DFG, extracting the most likely traces according to the DFG

    Parameters
    ---------------
    dfg
        *Complete* DFG
    start_activities
        Start activities
    end_activities
        End activities
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY => the activity key of the simulated log
        - Parameters.TIMESTAMP_KEY => the timestamp key of the simulated log
        - Parameters.MAX_NO_VARIANTS => the maximum number of variants generated by the method (default: 3000)
        - Parameters.MIN_WEIGHTED_PROBABILITY => the minimum overall weighted probability that makes the method stop
                                                (default: 1)
        - Parameters.MAX_NO_OCC_PER_ACTIVITY => the maximum number of occurrences per activity in the traces of the log
                                                (default: 2)
        - Parameters.INTERRUPT_SIMULATION_WHEN_DFG_COMPLETE => interrupts the simulation when the DFG of the simulated
                                                    log has the same keys as the DFG of the original log
                                                    (all the behavior is contained) (default: False)
        - Parameters.ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG => adds a simulated trace to the simulated log only if it adds
                                                    elements to the simulated DFG, i.e., new behavior;
                                                    the insertion is skipped otherwise (default: False)
        - Parameters.RETURN_VARIANTS => returns the traces as variants, each with an estimated number of occurrences

    Returns
    ---------------
    simulated_log
        Simulated log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    max_no_variants = exec_utils.get_param_value(Parameters.MAX_NO_VARIANTS,
                                                 parameters, 3000)
    min_weighted_probability = exec_utils.get_param_value(
        Parameters.MIN_WEIGHTED_PROBABILITY, parameters, 1.0)
    interrupt_simulation_when_dfg_complete = exec_utils.get_param_value(
        Parameters.INTERRUPT_SIMULATION_WHEN_DFG_COMPLETE, parameters, False)
    add_trace_if_takes_new_els_to_dfg = exec_utils.get_param_value(
        Parameters.ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG, parameters, False)
    return_variants = exec_utils.get_param_value(Parameters.RETURN_VARIANTS,
                                                 parameters, False)

    # keep track of the DFG, start activities and end activities of the (ongoing) simulation
    simulated_traces_dfg = set()
    simulated_traces_sa = set()
    simulated_traces_ea = set()
    interrupt_break_condition = False
    overall_probability = 0.0

    final_traces = []

    for tr, p in get_traces(dfg,
                            start_activities,
                            end_activities,
                            parameters=parameters):
        if (interrupt_simulation_when_dfg_complete
                and interrupt_break_condition
            ) or not (len(final_traces) < max_no_variants
                      and overall_probability <= min_weighted_probability):
            break
        overall_probability += p
        diff_sa = {tr[0]}.difference(simulated_traces_sa)
        diff_ea = {tr[-1]}.difference(simulated_traces_ea)
        diff_dfg = {(tr[i], tr[i + 1])
                    for i in range(len(tr) - 1)
                    }.difference(simulated_traces_dfg)
        adds_something = len(diff_sa) > 0 or len(diff_ea) > 0 or len(
            diff_dfg) > 0
        if add_trace_if_takes_new_els_to_dfg and not adds_something:
            # interrupt the addition if the ADD_TRACE_IF_TAKES_NEW_ELS_TO_DFG is set to True,
            # and the trace does not really change the information on the DFG, start activities,
            # end activities
            continue
        # update the start activities, end activities, DFG of the original log
        simulated_traces_sa = simulated_traces_sa.union(diff_sa)
        simulated_traces_ea = simulated_traces_ea.union(diff_ea)
        simulated_traces_dfg = simulated_traces_dfg.union(diff_dfg)
        # memorize the difference between the original DFG and the DFG of the simulated log
        diff_original_sa = set(start_activities).difference(
            simulated_traces_sa)
        diff_original_ea = set(end_activities).difference(simulated_traces_ea)
        diff_original_dfg = set(dfg).difference(simulated_traces_dfg)
        interrupt_break_condition = len(diff_original_sa) == 0 and len(
            diff_original_ea) == 0 and len(diff_original_dfg) == 0
        final_traces.append((-p, tr))
        if interrupt_simulation_when_dfg_complete and interrupt_break_condition:
            break

    # make sure that the traces are strictly ordered by their probability
    # (generally, the order is already pretty good, since the states are visited in the queue based on their order,
    # but not always 100% consistent)
    final_traces = sorted(final_traces)

    if return_variants:
        # returns the variants instead of the log
        variants = []
        for p, tr in final_traces:
            variants.append({
                "variant": ",".join(tr),
                "count": math.ceil(-p * max_no_variants)
            })
        return variants
    else:
        event_log = EventLog()
        # assigns to each event an increasing timestamp, starting shortly after the epoch (1970)
        curr_timestamp = 10000000
        for index, tr in enumerate(final_traces):
            log_trace = Trace(
                attributes={
                    xes_constants.DEFAULT_TRACEID_KEY: str(index),
                    "probability": -tr[0]
                })
            for act in tr[1]:
                log_trace.append(
                    Event({
                        activity_key:
                        act,
                        timestamp_key:
                        datetime.datetime.fromtimestamp(curr_timestamp)
                    }))
                # increases by 1 second
                curr_timestamp += 1
            event_log.append(log_trace)
        return event_log
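A usage sketch on a hand-made DFG (the DFG, start and end activities below are illustrative; the counts act as weights for the playout probabilities):

# hypothetical usage: A is followed by B or C, both of which lead to D;
# the playout returns the most likely variants with estimated counts
dfg = {("A", "B"): 8, ("A", "C"): 2, ("B", "D"): 8, ("C", "D"): 2}
start_activities = {"A": 10}
end_activities = {"D": 10}
variants = apply(dfg, start_activities, end_activities,
                 parameters={Parameters.RETURN_VARIANTS: True,
                             Parameters.MAX_NO_VARIANTS: 10})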
Example #6
def to_lifecycle(log, parameters=None):
    """
    Converts a log from the interval format (i.e. each event has two timestamps)
    to the lifecycle format (each event has a single timestamp and a lifecycle transition)

    Parameters
    -------------
    log
        Log (expressed in the interval format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Lifecycle event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes.DEFAULT_START_TIMESTAMP_KEY)
    transition_key = exec_utils.get_param_value(Parameters.TRANSITION_KEY,
                                                parameters,
                                                xes.DEFAULT_TRANSITION_KEY)

    if log is not None and len(log) > 0:
        if "PM4PY_TYPE" in log.attributes and log.attributes[
                "PM4PY_TYPE"] == "lifecycle":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if transition_key in first_event:
                return log

        new_log = EventLog(attributes=copy(log.attributes),
                           extensions=copy(log.extensions),
                           classifiers=copy(log.classifiers),
                           omni_present=copy(log.omni_present),
                           properties=copy(log.properties))
        new_log.attributes["PM4PY_TYPE"] = "lifecycle"

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            list_events = []
            for index, event in enumerate(trace):
                new_event_start = Event()
                new_event_complete = Event()
                for attr in event:
                    if not attr == timestamp_key and not attr == start_timestamp_key:
                        new_event_start[attr] = event[attr]
                        new_event_complete[attr] = event[attr]
                new_event_start[timestamp_key] = event[start_timestamp_key]
                new_event_start[transition_key] = "start"
                new_event_start["@@custom_lif_id"] = 0
                new_event_start["@@origin_ev_idx"] = index
                new_event_complete[timestamp_key] = event[timestamp_key]
                new_event_complete[transition_key] = "complete"
                new_event_complete["@@custom_lif_id"] = 1
                new_event_complete["@@origin_ev_idx"] = index
                list_events.append(new_event_start)
                list_events.append(new_event_complete)
            list_events = sorted(
                list_events,
                key=lambda x:
                (x[timestamp_key], x["@@origin_ev_idx"], x["@@custom_lif_id"]))
            for ev in list_events:
                new_trace.append(ev)
            new_log.append(new_trace)
        return new_log
    return log
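A minimal usage sketch (interval_log is assumed to be an interval-format EventLog with both a start timestamp and a completion timestamp per event):

# hypothetical usage: each interval event is split into a "start" and a
# "complete" lifecycle event, ordered by timestamp within the trace
lifecycle_log = to_lifecycle(interval_log)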
Example #7
def to_interval(log, parameters=None):
    """
    Converts a log to the interval format (i.e. each event has two timestamps)
    from the lifecycle format (each event has a single timestamp and a lifecycle transition)

    Parameters
    -------------
    log
        Log (expressed in the lifecycle format)
    parameters
        Possible parameters of the method (activity, timestamp key, start timestamp key, transition ...)

    Returns
    -------------
    log
        Interval event log
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(
        Parameters.START_TIMESTAMP_KEY, parameters,
        xes.DEFAULT_START_TIMESTAMP_KEY)
    transition_key = exec_utils.get_param_value(Parameters.TRANSITION_KEY,
                                                parameters,
                                                xes.DEFAULT_TRANSITION_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    lifecycle_instance_key = exec_utils.get_param_value(
        Parameters.LIFECYCLE_INSTANCE_KEY, parameters,
        xes.DEFAULT_INSTANCE_KEY)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS,
                                                parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters,
                                            [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters,
                                          [6, 7])

    if log is not None and len(log) > 0:
        if "PM4PY_TYPE" in log.attributes and log.attributes[
                "PM4PY_TYPE"] == "interval":
            return log
        if log[0] is not None and len(log[0]) > 0:
            first_event = log[0][0]
            if start_timestamp_key in first_event:
                return log

        new_log = EventLog(attributes=copy(log.attributes),
                           extensions=copy(log.extensions),
                           classifiers=copy(log.classifiers),
                           omni_present=copy(log.omni_present),
                           properties=copy(log.properties))
        new_log.attributes["PM4PY_TYPE"] = "interval"
        new_log.properties[
            constants.
            PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = xes.DEFAULT_START_TIMESTAMP_KEY

        for trace in log:
            new_trace = Trace()
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            activities_start = {}
            for event in trace:
                activity = event[activity_key]
                instance = event[
                    lifecycle_instance_key] if lifecycle_instance_key in event else None
                activity = (activity, instance)
                transition = event[
                    transition_key] if transition_key in event else "complete"
                timestamp = event[timestamp_key]
                if transition.lower() == "start":
                    if activity not in activities_start:
                        activities_start[activity] = list()
                    activities_start[activity].append(event)
                elif transition.lower() == "complete":
                    start_event = None
                    start_timestamp = event[timestamp_key]
                    if activity in activities_start and len(
                            activities_start[activity]) > 0:
                        start_event = activities_start[activity].pop(0)
                        start_timestamp = start_event[timestamp_key]
                    new_event = Event()
                    for attr in event:
                        if not attr == timestamp_key and not attr == transition_key:
                            new_event[attr] = event[attr]
                    if start_event is not None:
                        for attr in start_event:
                            if not attr == timestamp_key and not attr == transition_key:
                                new_event["@@startevent_" +
                                          attr] = start_event[attr]
                    new_event[start_timestamp_key] = start_timestamp
                    new_event[timestamp_key] = timestamp
                    new_event["@@duration"] = (
                        timestamp - start_timestamp).total_seconds()

                    if business_hours:
                        bh = BusinessHours(
                            start_timestamp.replace(tzinfo=None),
                            timestamp.replace(tzinfo=None),
                            worktiming=worktiming,
                            weekends=weekends)
                        new_event["@@approx_bh_duration"] = bh.getseconds()

                    new_trace.append(new_event)
            new_trace = sorting.sort_timestamp_trace(new_trace,
                                                     start_timestamp_key)
            new_log.append(new_trace)
        return new_log

    return log
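A minimal usage sketch (lifecycle_log is assumed to be a lifecycle-format EventLog; the business-hours settings below are illustrative):

# hypothetical usage: pair start/complete events into single interval events
# and also compute an approximate business-hours duration (working hours 8-16,
# weekend on Saturday and Sunday)
interval_log = to_interval(lifecycle_log,
                           parameters={Parameters.BUSINESS_HOURS: True,
                                       Parameters.WORKTIMING: [8, 16],
                                       Parameters.WEEKENDS: [6, 7]})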
Example #8
def empty_sequence_accepted(pt: ProcessTree) -> bool:
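    """
    Checks whether the process tree accepts the empty trace, i.e. whether the
    optimal alignment of an empty trace against the tree costs less than a
    single log move.
    """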
    alignment = calculate_optimal_alignment(pt, Trace())
    return alignment["cost"] < STD_MODEL_LOG_MOVE_COST
Example #9
def concatenate_traces(t1: Trace, t2: Trace) -> Trace:
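    """
    Appends all events of t2 to t1 (modifying t1 in place) and returns t1.
    """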
    for e in t2:
        t1.append(e)
    return t1
Example #10
def apply(log, net, im, fm, parameters=None):
    """
    Performs a Monte Carlo simulation of an accepting Petri net without duplicate transitions, in which the preset of
    each transition is always distinct from its postset (FIFO variant: if waiting is needed, cases queue up on the place
    semaphores and the first to arrive is the first to acquire the semaphore)

    Parameters
    -------------
    log
        Event log
    net
        Accepting Petri net without duplicate transitions and where the preset is always distinct from the postset
    im
        Initial marking
    fm
        Final marking
    parameters
        Parameters of the algorithm:
            PARAM_NUM_SIMULATIONS => (default: 100)
            PARAM_FORCE_DISTRIBUTION => Force a particular stochastic distribution (e.g. normal) when the stochastic map
            is discovered from the log (default: None; no distribution is forced)
            PARAM_ENABLE_DIAGNOSTICS => Enable the printing of diagnostics (default: True)
            PARAM_DIAGN_INTERVAL => Time interval at which diagnostics of the simulation are printed (default: 32)
            PARAM_CASE_ARRIVAL_RATIO => Case arrival ratio, i.e. the time between the arrival of two consecutive cases (default: None; inferred from the log)
            PARAM_PROVIDED_SMAP => Stochastic map that is used in the simulation (default: None; inferred from the log)
            PARAM_MAP_RESOURCES_PER_PLACE => Specification of the number of resources available per place
            (default: None; each place gets the default number of resources)
            PARAM_DEFAULT_NUM_RESOURCES_PER_PLACE => Default number of resources per place when not specified
            (default: 1; each place gets 1 resource and has to wait for the resource to finish)
            PARAM_SMALL_SCALE_FACTOR => Scale factor for the sleeping time of the actual simulation
            (default: 864000.0, i.e. 10 days)
            PARAM_MAX_THREAD_EXECUTION_TIME => Maximum execution time per thread (default: 60.0, 1 minute)

    Returns
    ------------
    simulated_log
        Simulated event log
    simulation_result
        Result of the simulation:
            Outputs.OUTPUT_PLACES_INTERVAL_TREES => interval trees that associate to each place the time intervals in which it was occupied.
            Outputs.OUTPUT_TRANSITIONS_INTERVAL_TREES => interval trees that associate to each transition the intervals of time
            in which it could not fire because some token was in the output.
            Outputs.OUTPUT_CASES_EX_TIME => Throughput time of the cases included in the simulated log
            Outputs.OUTPUT_MEDIAN_CASES_EX_TIME => Median of the throughput times
            Outputs.OUTPUT_CASE_ARRIVAL_RATIO => Case arrival ratio that was specified in the simulation
            Outputs.OUTPUT_TOTAL_CASES_TIME => Total time occupied by cases of the simulated log
    """
    if parameters is None:
        parameters = {}

    from intervaltree import IntervalTree

    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    no_simulations = exec_utils.get_param_value(
        Parameters.PARAM_NUM_SIMULATIONS, parameters, 100)
    force_distribution = exec_utils.get_param_value(
        Parameters.PARAM_FORCE_DISTRIBUTION, parameters, None)
    enable_diagnostics = exec_utils.get_param_value(
        Parameters.PARAM_ENABLE_DIAGNOSTICS, parameters, True)
    diagn_interval = exec_utils.get_param_value(
        Parameters.PARAM_DIAGN_INTERVAL, parameters, 32.0)
    case_arrival_ratio = exec_utils.get_param_value(
        Parameters.PARAM_CASE_ARRIVAL_RATIO, parameters, None)
    smap = exec_utils.get_param_value(Parameters.PARAM_PROVIDED_SMAP,
                                      parameters, None)
    resources_per_places = exec_utils.get_param_value(
        Parameters.PARAM_MAP_RESOURCES_PER_PLACE, parameters, None)
    default_num_resources_per_places = exec_utils.get_param_value(
        Parameters.PARAM_DEFAULT_NUM_RESOURCES_PER_PLACE, parameters, 1)
    small_scale_factor = exec_utils.get_param_value(
        Parameters.PARAM_SMALL_SCALE_FACTOR, parameters, 864000)
    max_thread_exec_time = exec_utils.get_param_value(
        Parameters.PARAM_MAX_THREAD_EXECUTION_TIME, parameters, 60.0)

    if case_arrival_ratio is None:
        case_arrival_ratio = case_arrival.get_case_arrival_avg(
            log, parameters=parameters)
    if resources_per_places is None:
        resources_per_places = {}

    logging.basicConfig()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    places_interval_trees = {}
    transitions_interval_trees = {}
    cases_ex_time = []
    list_cases = {}

    for place in net.places:
        # assign a semaphore to each place.
        if place in resources_per_places:
            place.semaphore = Semaphore(resources_per_places[place])
        else:
            # if the user does not specify the number of resources per place,
            # the default number is used
            place.semaphore = Semaphore(default_num_resources_per_places)
        place.assigned_time = []
        places_interval_trees[place] = IntervalTree()
    for trans in net.transitions:
        transitions_interval_trees[trans] = IntervalTree()

    # when the user does not specify any map from transitions to random variables,
    # a replay operation is performed
    if smap is None:
        if enable_diagnostics:
            logger.info(str(time()) + " started the replay operation.")
        if force_distribution is not None:
            smap = replay.get_map_from_log_and_net(
                log,
                net,
                im,
                fm,
                force_distribution=force_distribution,
                parameters=parameters)
        else:
            smap = replay.get_map_from_log_and_net(log,
                                                   net,
                                                   im,
                                                   fm,
                                                   parameters=parameters)
        if enable_diagnostics:
            logger.info(str(time()) + " ended the replay operation.")

    # the start timestamp is set to 1000000 instead of 0 to avoid problems with 32 bit machines
    start_time = 1000000
    threads = []
    for i in range(no_simulations):
        list_cases[i] = Trace()
        t = SimulationThread(i, net, im, fm, smap, start_time,
                             places_interval_trees, transitions_interval_trees,
                             cases_ex_time, list_cases, enable_diagnostics,
                             diagn_interval, small_scale_factor,
                             max_thread_exec_time)
        t.start()
        threads.append(t)
        start_time = start_time + case_arrival_ratio
        # wait a (scaled-down) delay between starting one thread and the next
        sleep(case_arrival_ratio / small_scale_factor)

    for t in threads:
        t.join()

    i = 0
    while i < len(threads):
        if threads[i].terminated_correctly is False:
            del list_cases[threads[i].id]
            del threads[i]
            del cases_ex_time[i]
            continue
        i = i + 1

    if enable_diagnostics:
        logger.info(str(time()) + " ended the Monte carlo simulation.")

    log = EventLog(list(list_cases.values()))
    min_timestamp = log[0][0][timestamp_key].timestamp()
    max_timestamp = max(y[timestamp_key].timestamp() for x in log for y in x)

    transitions_interval_trees = {
        t.name: y
        for t, y in transitions_interval_trees.items()
    }

    return log, {
        Outputs.OUTPUT_PLACES_INTERVAL_TREES.value: places_interval_trees,
        Outputs.OUTPUT_TRANSITIONS_INTERVAL_TREES.value:
        transitions_interval_trees,
        Outputs.OUTPUT_CASES_EX_TIME.value: cases_ex_time,
        Outputs.OUTPUT_MEDIAN_CASES_EX_TIME.value: median(cases_ex_time),
        Outputs.OUTPUT_CASE_ARRIVAL_RATIO.value: case_arrival_ratio,
        Outputs.OUTPUT_TOTAL_CASES_TIME.value: max_timestamp - min_timestamp
    }
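A usage sketch (the Alpha Miner import path follows recent pm4py releases and, together with the already-loaded log, is an assumption for illustration):

# hypothetical usage: discover a model and run 20 simulated cases through it
from pm4py.algo.discovery.alpha import algorithm as alpha_miner

net, im, fm = alpha_miner.apply(log)
simulated_log, sim_result = apply(log, net, im, fm,
                                  parameters={Parameters.PARAM_NUM_SIMULATIONS: 20})
median_case_time = sim_result[Outputs.OUTPUT_MEDIAN_CASES_EX_TIME.value]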
Example #11
def filter_log_by_paths(log,
                        paths,
                        variants,
                        vc,
                        threshold,
                        attribute_key="concept:name"):
    """
    Keep only the paths whose number of occurrences is above the threshold (or that belong to the most frequent variant)

    Parameters
    ----------
    log
        Log
    paths
        Dictionary of paths associated with their count
    variants
        (If specified) Dictionary with variant as the key and the list of traces as the value
    vc
        List of variant names along with their count
    threshold
        Cutting threshold (paths whose number of occurrences is below the threshold are removed)
    attribute_key
        (If specified) Specify the attribute key to use (default concept:name)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    filtered_log = EventLog(list(),
                            attributes=log.attributes,
                            extensions=log.extensions,
                            classifiers=log.classifiers,
                            omni_present=log.omni_present,
                            properties=log.properties)
    fvft = variants[vc[0][0]][0]
    fvp = set()
    for i in range(0, len(fvft) - 1):
        path = fvft[i][attribute_key] + "," + fvft[i + 1][attribute_key]
        fvp.add(path)
    for trace in log:
        new_trace = Trace()
        jj = 0
        if len(trace) > 0:
            new_trace.append(trace[0])
            for j in range(1, len(trace) - 1):
                jj = j
                if j >= len(trace):
                    break
                if attribute_key in trace[j] and attribute_key in trace[j + 1]:
                    path = trace[j][attribute_key] + "," + trace[
                        j + 1][attribute_key]
                    if path in paths:
                        if path in fvp or paths[path] >= threshold:
                            new_trace.append(trace[j])
                            new_trace.append(trace[j + 1])
        if len(trace) > 1 and not jj == len(trace):
            new_trace.append(trace[-1])
        if len(new_trace) > 0:
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]
            filtered_log.append(new_trace)
    return filtered_log
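A usage sketch in which the paths/variants/vc inputs are built by hand to make their expected shapes explicit (in pm4py they would normally come from the dedicated statistics helpers; the already-loaded log is an assumption):

from collections import Counter

# paths maps a "a,b" directly-follows pair to its count, variants maps a
# variant string to its list of traces, vc is (variant, count) sorted by
# descending frequency
paths = Counter()
variants = {}
for trace in log:
    acts = [ev["concept:name"] for ev in trace]
    for a, b in zip(acts, acts[1:]):
        paths[a + "," + b] += 1
    variants.setdefault(",".join(acts), []).append(trace)
vc = sorted(((v, len(ts)) for v, ts in variants.items()),
            key=lambda item: item[1], reverse=True)

filtered_log = filter_log_by_paths(log, paths, variants, vc, threshold=2)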
Example #12
def preprocessing(log, parameters=None):
    """
    Preprocessing step for the Alpha+ algorithm: removes from the log all activities that are involved in a length-one loop.

    Parameters
    ------------
    log
        Event log
    parameters
        Parameters of the algorithm

    Returns
    -------------
    filtered_log
        Filtered log, without the activities involved in a length-one loop
    loop_one_list
        Loop one list
    A_filtered
        Dictionary: activity before the loop-length-one activity
    B_filtered
        Dictionary: activity after the loop-length-one activity
    loops_in_first_place
        Loops in source place
    loops_in_last_place
        Loops in sink place
    """
    loops_in_first_place = set()
    loops_in_last_place = set()

    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_util.DEFAULT_NAME_KEY)

    # List for values that have a loop of length one
    loop_one_list = []
    # Log without activities that have a loop of length one
    filtered_log = EventLog()
    # dictionary A: activity before the loop-length-one activity
    A = {}
    # dictionary B: activity after the loop-length-one activity
    B = {}
    A_filtered = {}
    B_filtered = {}
    # insert artificial start and end activities, since a loop at the source place is not allowed
    # (according to the paper)
    for trace in log:
        trace.insert(0, {activity_key: 'artificial_start'})
        trace.append({activity_key: 'artificial_end'})
    for trace in log:
        i = 0
        while i < len(trace) - 1:
            current = trace[i][activity_key]
            successor = trace[i + 1][activity_key]
            if current == successor:
                if current not in loop_one_list:
                    loop_one_list.append(current)
            i += 1
    for trace in log:
        i = 0
        filtered_trace = Trace()
        while i < len(trace) - 1:
            current = trace[i][activity_key]
            successor = trace[i + 1][activity_key]
            if current not in loop_one_list:
                filtered_trace.append(current)
            if successor in loop_one_list:
                if current not in loop_one_list:
                    if current in A:
                        A[successor].append(current)
                    else:
                        A[successor] = [current]
            if current in loop_one_list:
                if successor not in loop_one_list:
                    if current in B:
                        B[current].append(successor)
                    else:
                        B[current] = [successor]
            if i == len(trace) - 2:
                if successor not in loop_one_list:
                    filtered_trace.append(successor)
            i += 1
        filtered_log.append(filtered_trace)
    # Making sets instead of lists
    for key, value in A.items():
        A_filtered[key] = set(value)
    # Making sets instead of lists
    for key, value in B.items():
        B_filtered[key] = set(value)
    for trace in log:
        if trace[0] in loop_one_list:
            loops_in_first_place.add(trace[0])
        if trace[-1] in loop_one_list:
            loops_in_last_place.add(trace[-1])
    loops_in_first_place = list(loops_in_first_place)
    loops_in_last_place = list(loops_in_last_place)

    return (filtered_log, loop_one_list, A_filtered, B_filtered,
            loops_in_first_place, loops_in_last_place)
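A minimal usage sketch; note that preprocessing() inserts artificial start/end events into the traces of the log it receives, so a copy is passed here to keep the original log untouched (the already-loaded log is an assumption):

from copy import deepcopy

(filtered_log, loop_one_list, A_filtered, B_filtered,
 loops_in_first_place, loops_in_last_place) = preprocessing(deepcopy(log))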
Example #13
def __approximate_alignment_on_parallel(pt: ProcessTree,
                                        trace: Trace,
                                        a_sets: Dict[ProcessTree, Set[str]],
                                        sa_sets: Dict[ProcessTree, Set[str]],
                                        ea_sets: Dict[ProcessTree, Set[str]],
                                        tau_flags: Dict[ProcessTree, bool],
                                        tl: int,
                                        th: int,
                                        parameters=None):
    if parameters is None:
        parameters = {}

    from pulp import lpSum, LpVariable, LpProblem, LpMinimize

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, DEFAULT_NAME_KEY)

    assert pt.operator == Operator.PARALLEL
    assert len(pt.children) > 0
    assert len(trace) > 0

    ilp = LpProblem(sense=LpMinimize)

    # x_i_j = 1 <=> assigns activity i to subtree j
    x_variables: Dict[int, Dict[int, LpVariable]] = {}

    # s_i_j = 1 <=> activity i is a start activity in the current sub-trace assigned to subtree j
    s_variables: Dict[int, Dict[int, LpVariable]] = {}

    # e_i_j = 1 <=> activity i is an end activity in the current sub-trace assigned to subtree j
    e_variables: Dict[int, Dict[int, LpVariable]] = {}

    # auxiliary u_j <=> u_j=1 if an activity is assigned to subtree j
    u_variables: Dict[int, LpVariable] = {}

    # v_i_j = 1 <=> activity i is neither a start nor end-activity in the current sub-trace assigned to subtree j
    v_variables: Dict[int, Dict[int, LpVariable]] = {}

    s_costs = {}
    e_costs = {}
    u_costs = {}
    v_costs = {}

    for i, a in enumerate(trace):
        x_variables[i] = {}
        s_variables[i] = {}
        s_costs[i] = {}
        e_variables[i] = {}
        e_costs[i] = {}
        v_variables[i] = {}
        v_costs[i] = {}

        for j, subtree in enumerate(pt.children):
            x_variables[i][j] = LpVariable('x_' + str(i) + '_' + str(j),
                                           cat='Binary')

            s_variables[i][j] = LpVariable('s_' + str(i) + '_' + str(j),
                                           cat='Binary')
            s_costs[i][j] = 0 if a[activity_key] in sa_sets[subtree] else 1

            e_variables[i][j] = LpVariable('e_' + str(i) + '_' + str(j),
                                           cat='Binary')
            e_costs[i][j] = 0 if a[activity_key] in ea_sets[subtree] else 1

            v_variables[i][j] = LpVariable('v_' + str(i) + '_' + str(j),
                                           cat='Binary')
            v_costs[i][j] = 0 if a[activity_key] in a_sets[subtree] else 1

    for j in range(len(pt.children)):
        u_variables[j] = LpVariable('u_' + str(j), cat='Binary')
        # define costs to not assign anything to subtree j
        if tau_flags[pt.children[j]]:
            u_costs[j] = 0
        elif sa_sets[pt.children[j]] & ea_sets[pt.children[j]]:
            # intersection of start-activities and end-activities is not empty
            u_costs[j] = 1
        else:
            # intersection of start-activities and end-activities is empty
            u_costs[j] = 2

    # objective function
    ilp += lpSum([
        v_variables[i][j] * v_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        s_variables[i][j] * s_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        e_variables[i][j] * e_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [(1 - u_variables[j]) * u_costs[j]
         for j in range(len(pt.children))]), "objective_function"

    # constraints
    for i in range(len(trace)):
        # every activity is assigned to one subtree
        ilp += lpSum([x_variables[i][j] * 1
                      for j in range(len(pt.children))]) == 1

    for j in range(len(pt.children)):
        # first activity is a start activity
        ilp += x_variables[0][j] <= s_variables[0][j]
        # last activity is an end-activity
        ilp += x_variables[len(trace) - 1][j] <= e_variables[len(trace) - 1][j]

    # define s_i_j variables
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            ilp += s_variables[i][j] <= x_variables[i][j]
            for k in range(i):
                ilp += s_variables[i][j] <= 1 - x_variables[k][j]
        # activity can be only a start-activity for one subtree
        ilp += lpSum(s_variables[i][j] for j in range(len(pt.children))) <= 1

    # define e_i_j variables
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            ilp += e_variables[i][j] <= x_variables[i][j]
            for k in range(i + 1, len(trace)):
                ilp += e_variables[i][j] <= 1 - x_variables[k][j]
        # activity can be only an end-activity for one subtree
        ilp += lpSum(e_variables[i][j] for j in range(len(pt.children))) <= 1

    for j in range(len(pt.children)):
        for i in range(len(trace)):
            # define u_j variables
            ilp += u_variables[j] >= x_variables[i][j]
        # if u_j variable = 1 ==> a start activity must exist
        ilp += u_variables[j] <= lpSum(s_variables[i][j]
                                       for i in range(len(trace)))
        # if u_j variable = 1 ==> an end activity must exist
        ilp += u_variables[j] <= lpSum(e_variables[i][j]
                                       for i in range(len(trace)))

    # define v_i_j variables
    for i in range(len(trace)):
        for j in range(2):
            ilp += v_variables[i][j] >= 1 - s_variables[i][
                j] + 1 - e_variables[i][j] + x_variables[i][j] - 2
            ilp += v_variables[i][j] <= x_variables[i][j]
            ilp += v_variables[i][j] <= 1 - e_variables[i][j]
            ilp += v_variables[i][j] <= 1 - s_variables[i][j]

    status = ilp.solve()
    assert status == 1

    # trace_parts list contains trace parts mapped onto the determined subtree
    trace_parts: List[Tuple[ProcessTree, Trace]] = []
    last_subtree: ProcessTree = None
    for i in range(len(trace)):
        for j in range(len(pt.children)):
            subtree = pt.children[j]
            if x_variables[i][j].varValue == 1:
                if last_subtree and subtree == last_subtree:
                    trace_parts[-1][1].append(trace[i])
                else:
                    assert last_subtree is None or subtree != last_subtree
                    t = Trace()
                    t.append(trace[i])
                    trace_parts.append((subtree, t))
                    last_subtree = subtree
                break  # each position is assigned to exactly one subtree; no need to check the rest

    # calculate an alignment for each subtree
    alignments_per_subtree: Dict[ProcessTree, List] = {}
    for j in range(len(pt.children)):
        subtree: ProcessTree = pt.children[j]
        sub_trace = Trace()
        for trace_part in trace_parts:
            if subtree == trace_part[0]:
                sub_trace = concatenate_traces(sub_trace, trace_part[1])
        align_result = __approximate_alignment_for_trace(subtree,
                                                         a_sets,
                                                         sa_sets,
                                                         ea_sets,
                                                         tau_flags,
                                                         sub_trace,
                                                         tl,
                                                         th,
                                                         parameters=parameters)
        if align_result is None:
            # the alignment did not terminate correctly.
            return None
        alignments_per_subtree[subtree] = align_result
    # compose alignments from subtree alignments
    res = []
    for trace_part in trace_parts:
        activities_to_cover = trace_to_list_of_str(trace_part[1])
        activities_covered_so_far = []
        alignment = alignments_per_subtree[trace_part[0]]
        while activities_to_cover != activities_covered_so_far:
            move = alignment.pop(0)
            res.append(move)
            # if the alignment move is NOT a model move, add the activity to activities_covered_so_far
            if move[0] != SKIP:
                activities_covered_so_far.append(move[0])
    # add the remaining alignment moves to the resulting alignment; the order does not matter (parallel operator)
    for subtree in alignments_per_subtree:
        if len(alignments_per_subtree[subtree]) > 0:
            res.extend(alignments_per_subtree[subtree])
    return res
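
# A minimal, self-contained PuLP sketch (toy data, not part of the example above):
# the ILP builders in this section all follow the same pattern, namely declare
# binary LpVariables, set the objective via "+= lpSum(...), name", add
# constraints with "+=", call solve(), and read the assignment from varValue.
from pulp import LpProblem, LpMinimize, LpVariable, lpSum

toy = LpProblem(sense=LpMinimize)
y = {j: LpVariable('y_' + str(j), cat='Binary') for j in range(3)}
toy += lpSum(j * y[j] for j in range(3)), "objective_function"
toy += lpSum(y[j] for j in range(3)) == 1  # pick exactly one option
status = toy.solve()
assert status == 1
print([y[j].varValue for j in range(3)])  # expected: [1.0, 0.0, 0.0]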
Example no. 14
def __approximate_alignment_on_sequence(pt: ProcessTree,
                                        trace: Trace,
                                        a_sets: Dict[ProcessTree, Set[str]],
                                        sa_sets: Dict[ProcessTree, Set[str]],
                                        ea_sets: Dict[ProcessTree, Set[str]],
                                        tau_flags: Dict[ProcessTree, bool],
                                        tl: int,
                                        th: int,
                                        parameters=None):
    if parameters is None:
        parameters = {}

    from pulp import lpSum, LpVariable, LpProblem, LpMinimize

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, DEFAULT_NAME_KEY)

    assert pt.operator == Operator.SEQUENCE
    assert len(pt.children) > 0
    assert len(trace) > 0

    ilp = LpProblem(sense=LpMinimize)

    # x_i_j = 1 <=> assigns activity i to subtree j
    x_variables: Dict[int, Dict[int, LpVariable]] = {}

    # s_i_j = 1 <=> activity i is a start activity in the current sub-trace assigned to subtree j
    s_variables: Dict[int, Dict[int, LpVariable]] = {}

    # e_i_j = 1 <=> activity i is an end activity in the current sub-trace assigned to subtree j
    e_variables: Dict[int, Dict[int, LpVariable]] = {}

    # auxiliary variable u_j: u_j = 1 <=> at least one activity is assigned to subtree j
    u_variables: Dict[int, LpVariable] = {}

    # v_i_j = 1 <=> activity i is neither a start nor end-activity in the current sub-trace assigned to subtree j
    v_variables: Dict[int, Dict[int, LpVariable]] = {}

    s_costs = {}
    e_costs = {}
    u_costs = {}
    v_costs = {}

    # trace <a_0,...,a_n>
    for i, a in enumerate(trace):
        x_variables[i] = {}
        s_variables[i] = {}
        s_costs[i] = {}
        e_variables[i] = {}
        e_costs[i] = {}
        v_variables[i] = {}
        v_costs[i] = {}

        for j, subtree in enumerate(pt.children):
            x_variables[i][j] = LpVariable('x_' + str(i) + '_' + str(j),
                                           cat='Binary')

            s_variables[i][j] = LpVariable('s_' + str(i) + '_' + str(j),
                                           cat='Binary')
            s_costs[i][j] = 0 if a[activity_key] in sa_sets[subtree] else 1

            e_variables[i][j] = LpVariable('e_' + str(i) + '_' + str(j),
                                           cat='Binary')
            e_costs[i][j] = 0 if a[activity_key] in ea_sets[subtree] else 1

            v_variables[i][j] = LpVariable('v_' + str(i) + '_' + str(j),
                                           cat='Binary')
            v_costs[i][j] = 0 if a[activity_key] in a_sets[subtree] else 1

    for j in range(len(pt.children)):
        u_variables[j] = LpVariable('u_' + str(j), cat='Binary')
        # define costs to not assign anything to subtree j
        if tau_flags[pt.children[j]]:
            u_costs[j] = 0
        elif sa_sets[pt.children[j]] & ea_sets[pt.children[j]]:
            # intersection of start-activities and end-activities is not empty
            u_costs[j] = 1
        else:
            # intersection of start-activities and end-activities is empty
            u_costs[j] = 2

    # objective function
    ilp += lpSum([
        v_variables[i][j] * v_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        s_variables[i][j] * s_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        e_variables[i][j] * e_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [(1 - u_variables[j]) * u_costs[j]
         for j in range(len(pt.children))]), "objective_function"

    # constraints
    for i in range(len(trace)):
        # every activity is assigned to one subtree
        ilp += lpSum([x_variables[i][j] * 1
                      for j in range(len(pt.children))]) == 1

    for j in range(len(pt.children)):
        # first activity is start activity
        ilp += x_variables[0][j] <= s_variables[0][j]
        # last activity is end-activity
        ilp += x_variables[len(trace) - 1][j] <= e_variables[len(trace) - 1][j]

    # define s_i_j variables
    for i in range(1, len(trace)):
        for j in range(len(pt.children)):
            ilp += s_variables[i][j] >= x_variables[i][j] + 1 - x_variables[
                i - 1][j] - 1
            ilp += s_variables[i][j] <= x_variables[i][j]
            ilp += s_variables[i][j] <= 1 - x_variables[i - 1][j]
    for i in range(len(trace)):
        # an activity can be a start-activity for at most one subtree
        ilp += lpSum(s_variables[i][j] for j in range(len(pt.children))) <= 1

    # define e_i_j variables
    for i in range(len(trace) - 1):
        for j in range(len(pt.children)):
            ilp += e_variables[i][j] >= x_variables[i][j] + 1 - x_variables[
                i + 1][j] - 1
            ilp += e_variables[i][j] <= x_variables[i][j]
            ilp += e_variables[i][j] <= 1 - x_variables[i + 1][j]
    for i in range(len(trace)):
        # an activity can be an end-activity for at most one subtree
        ilp += lpSum(e_variables[i][j] for j in range(len(pt.children))) <= 1

    # constraint - preserving sequence when assigning activities to subtrees
    for i in range(len(trace) - 1):
        for j in range(len(pt.children)):
            ilp += lpSum(
                x_variables[i + 1][k]
                for k in range(j, len(pt.children))) >= x_variables[i][j]

    for j in range(len(pt.children)):
        for i in range(len(trace)):
            # define u_j variables
            ilp += u_variables[j] >= x_variables[i][j]

        # if u_j variable = 1 ==> a start activity must exist
        ilp += u_variables[j] <= lpSum(s_variables[i][j]
                                       for i in range(len(trace)))
        # if u_j variable = 1 ==> an end activity must exist
        ilp += u_variables[j] <= lpSum(e_variables[i][j]
                                       for i in range(len(trace)))

    # define v_i_j variables
    for i in range(len(trace)):
        for j in range(len(pt.children)):  # over all children, matching the objective above
            ilp += v_variables[i][j] >= 1 - s_variables[i][
                j] + 1 - e_variables[i][j] + x_variables[i][j] - 2
            ilp += v_variables[i][j] <= x_variables[i][j]
            ilp += v_variables[i][j] <= 1 - e_variables[i][j]
            ilp += v_variables[i][j] <= 1 - s_variables[i][j]

    status = ilp.solve()
    assert status == 1

    alignments_to_calculate: List[Tuple[ProcessTree, Trace]] = []
    for j in range(len(pt.children)):
        sub_trace = Trace()
        for i in range(len(trace)):
            if x_variables[i][j].varValue == 1:
                sub_trace.append(trace[i])
        alignments_to_calculate.append((pt.children[j], sub_trace))
    # calculate and compose alignments
    res = []
    for subtree, sub_trace in alignments_to_calculate:
        align_result = __approximate_alignment_for_trace(subtree,
                                                         a_sets,
                                                         sa_sets,
                                                         ea_sets,
                                                         tau_flags,
                                                         sub_trace,
                                                         tl,
                                                         th,
                                                         parameters=parameters)
        if align_result is None:
            # the alignment did not terminate correctly.
            return None
        res.extend(align_result)
    return res
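
# A minimal illustrative sketch (toy data, not pm4py code) of the key constraint
# of the sequence case above: if position i is assigned to subtree j, position
# i+1 may only be assigned to subtree j or a later one.
from pulp import LpProblem, LpMinimize, LpVariable, lpSum

trace_len, n_children = 4, 3
seq = LpProblem(sense=LpMinimize)
x = {i: {j: LpVariable('x_' + str(i) + '_' + str(j), cat='Binary')
         for j in range(n_children)} for i in range(trace_len)}
for i in range(trace_len):
    # every position is assigned to exactly one child
    seq += lpSum(x[i][j] for j in range(n_children)) == 1
for i in range(trace_len - 1):
    for j in range(n_children):
        # order preservation, exactly as in the constraint block above
        seq += lpSum(x[i + 1][k] for k in range(j, n_children)) >= x[i][j]
seq += x[1][1] == 1  # pin position 1 to the middle child so the effect is visible
seq += lpSum(j * x[i][j] for i in range(trace_len) for j in range(n_children))
assert seq.solve() == 1
print([[j for j in range(n_children) if x[i][j].varValue == 1][0]
       for i in range(trace_len)])  # -> [0, 1, 1, 1]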
Example no. 15
def __approximate_alignment_on_loop(pt: ProcessTree,
                                    trace: Trace,
                                    a_sets: Dict[ProcessTree, Set[str]],
                                    sa_sets: Dict[ProcessTree, Set[str]],
                                    ea_sets: Dict[ProcessTree, Set[str]],
                                    tau_flags: Dict[ProcessTree, bool],
                                    tl: int,
                                    th: int,
                                    parameters=None):
    if parameters is None:
        parameters = {}

    from pulp import lpSum, LpVariable, LpProblem, LpMinimize

    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, DEFAULT_NAME_KEY)

    assert pt.operator == Operator.LOOP
    assert len(pt.children) == 2
    assert len(trace) > 0

    ilp = LpProblem(sense=LpMinimize)

    # x_i_j = 1 <=> assigns activity i to subtree j
    x_variables: Dict[int, Dict[int, LpVariable]] = {}

    # t_i_j = 1 <=> inserts a tau at position i and assigns it to subtree j
    t_variables: Dict[int, Dict[int, LpVariable]] = {}

    # s_i_j = 1 <=> activity i is a start activity in the current sub-trace assigned to subtree j
    s_variables: Dict[int, Dict[int, LpVariable]] = {}

    # e_i_j = 1 <=> activity i is an end activity in the current sub-trace assigned to subtree j
    e_variables: Dict[int, Dict[int, LpVariable]] = {}

    # v_i_j = 1 <=> activity i is neither a start nor end-activity in the current sub-trace assigned to subtree j
    v_variables: Dict[int, Dict[int, LpVariable]] = {}

    # auxiliary variables
    # p_i_j = 1 <=> previous activity i-1 is assigned to the other subtree or t_i_(other subtree) = 1
    p_variables: Dict[int, Dict[int, LpVariable]] = {}

    # n_i_j = 1 <=> next activity i+1 is assigned to the other subtree or t_(i+1)_(other subtree) = 1
    n_variables: Dict[int, Dict[int, LpVariable]] = {}

    t_costs = {}
    s_costs = {}
    e_costs = {}
    v_costs = {}

    for i, a in enumerate(trace):
        x_variables[i] = {}
        s_variables[i] = {}
        s_costs[i] = {}
        e_variables[i] = {}
        e_costs[i] = {}
        v_variables[i] = {}
        v_costs[i] = {}
        p_variables[i] = {}
        n_variables[i] = {}
        for j, subtree in enumerate(pt.children):
            x_variables[i][j] = LpVariable('x_' + str(i) + '_' + str(j),
                                           cat='Binary')

            s_variables[i][j] = LpVariable('s_' + str(i) + '_' + str(j),
                                           cat='Binary')
            s_costs[i][j] = 0 if a[activity_key] in sa_sets[subtree] else 1

            e_variables[i][j] = LpVariable('e_' + str(i) + '_' + str(j),
                                           cat='Binary')
            e_costs[i][j] = 0 if a[activity_key] in ea_sets[subtree] else 1

            v_variables[i][j] = LpVariable('v_' + str(i) + '_' + str(j),
                                           cat='Binary')
            v_costs[i][j] = 0 if a[activity_key] in a_sets[subtree] else 1

            p_variables[i][j] = LpVariable('p_' + str(i) + '_' + str(j),
                                           cat='Binary')
            n_variables[i][j] = LpVariable('n_' + str(i) + '_' + str(j),
                                           cat='Binary')

    for i in range(len(trace) + 1):
        t_variables[i] = {}
        t_costs[i] = {}
        for j, subtree in enumerate(pt.children):
            t_variables[i][j] = LpVariable('t_' + str(i) + '_' + str(j),
                                           cat='Binary')
            if tau_flags[subtree]:
                # favour adding a cut, if possible, over not adding one
                t_costs[i][j] = -0.00001
            else:
                if len(sa_sets[subtree].intersection(ea_sets[subtree])) != 0:
                    t_costs[i][j] = 1
                else:
                    t_costs[i][j] = 2

    # objective function
    ilp += lpSum([
        s_variables[i][j] * s_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        e_variables[i][j] * e_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        v_variables[i][j] * v_costs[i][j] for i in range(len(trace))
        for j in range(len(pt.children))
    ] + [
        t_variables[i][j] * t_costs[i][j] for i in range(len(trace) + 1)
        for j in range(len(pt.children))
    ]), "objective_function"

    # constraints
    # universe j                        {0,1}
    # universe i for t_i_j variables    {0,...,len(trace)}
    # universe i else                   {0,...,len(trace)-1}

    # first tau can never be assigned to the 2nd subtree
    ilp += t_variables[0][1] == 0

    # last tau can never be assigned to the 2nd subtree
    ilp += t_variables[len(trace)][1] == 0

    # if first/last tau is not used --> first/last activity is assigned to 1st subtree
    ilp += 1 - t_variables[0][0] <= x_variables[0][0]
    ilp += 1 - t_variables[len(trace)][0] <= x_variables[len(trace) - 1][0]

    for i in range(len(trace)):
        # every activity is assigned to one subtree
        ilp += lpSum([x_variables[i][j] * 1
                      for j in range(len(pt.children))]) == 1

        # start/end/intermediate-activity at position i can only be assigned to one subtree
        ilp += lpSum([s_variables[i][j] * 1
                      for j in range(len(pt.children))]) <= 1
        ilp += lpSum([e_variables[i][j] * 1
                      for j in range(len(pt.children))]) <= 1
        ilp += lpSum([v_variables[i][j] * 1
                      for j in range(len(pt.children))]) <= 1

    for i in range(len(trace) + 1):
        # max one tau is used per index
        ilp += lpSum([t_variables[i][j] for j in range(2)]) <= 1

    # if tau is used and hence, assigned to a subtree, the surrounding activities are assigned to the other subtree
    for i in range(1, len(trace)):
        # if tau at position i is assigned to 1st subtree, the previous activity is assigned to 2nd subtree
        ilp += t_variables[i][0] <= x_variables[i - 1][1]
        # if tau at position i is assigned to the 2nd subtree, the previous activity is assigned to the 1st subtree
        ilp += t_variables[i][1] <= x_variables[i - 1][0]
    for i in range(len(trace)):
        # if tau at position i is assigned to 1st subtree, the next activity is assigned to 2nd subtree
        ilp += t_variables[i][0] <= x_variables[i][1]
        # if tau at position i is assigned to 2nd subtree, the next activity is assigned to 1st subtree
        ilp += t_variables[i][1] <= x_variables[i][0]
    # if last tau is used and assigned to 1st subtree (assigning it to the 2nd subtree is already forbidden by another
    # constraint) --> last activity must be assigned to 2nd subtree
    ilp += t_variables[len(trace)][0] <= x_variables[len(trace) - 1][1]

    # define auxiliary variables n: n_i_1 = 1 <=> next activity i+1 is assigned to 2nd subtree or t_i+1_2 = 1
    for i in range(len(trace) - 1):
        ilp += n_variables[i][0] <= x_variables[i + 1][1] + t_variables[i +
                                                                        1][1]
        ilp += n_variables[i][0] >= x_variables[i + 1][1]
        ilp += n_variables[i][0] >= t_variables[i + 1][1]

        ilp += n_variables[i][1] <= x_variables[i + 1][0] + t_variables[i +
                                                                        1][0]
        ilp += n_variables[i][1] >= x_variables[i + 1][0]
        ilp += n_variables[i][1] >= t_variables[i + 1][0]

    ilp += t_variables[len(trace)][1] <= n_variables[len(trace) - 1][0]
    ilp += t_variables[len(trace)][0] <= n_variables[len(trace) - 1][1]

    # define e_i_j variables
    for i in range(len(trace)):
        for j in range(2):
            ilp += e_variables[i][j] <= n_variables[i][j]
            ilp += e_variables[i][j] <= x_variables[i][j]
            ilp += e_variables[i][
                j] >= n_variables[i][j] + x_variables[i][j] - 1

    # define auxiliary variables p: p_i_1 = 1 <=> previous activity i-1 is assigned to 2nd subtree or t_i_2 = 1
    ilp += t_variables[0][1] <= p_variables[0][0]
    ilp += p_variables[0][1] <= t_variables[0][0]

    for i in range(1, len(trace)):
        ilp += p_variables[i][0] <= t_variables[i][1] + x_variables[i - 1][1]
        ilp += p_variables[i][0] >= t_variables[i][1]
        ilp += p_variables[i][0] >= x_variables[i - 1][1]

        ilp += p_variables[i][1] <= t_variables[i][0] + x_variables[i - 1][0]
        ilp += p_variables[i][1] >= t_variables[i][0]
        ilp += p_variables[i][1] >= x_variables[i - 1][0]

    # define s_i_j variables
    for i in range(len(trace)):
        for j in range(2):
            ilp += s_variables[i][j] >= p_variables[i][j] + x_variables[i][j] - 1
            ilp += s_variables[i][j] <= p_variables[i][j]
            # mirroring the e_i_j definition above, s_i_j also requires x_i_j = 1
            ilp += s_variables[i][j] <= x_variables[i][j]
    ilp += 1 - t_variables[0][0] <= s_variables[0][0]

    # define v_i_j variables
    for i in range(len(trace)):
        for j in range(2):
            ilp += v_variables[i][j] >= 1 - s_variables[i][
                j] + 1 - e_variables[i][j] + x_variables[i][j] - 2
            ilp += v_variables[i][j] <= x_variables[i][j]
            ilp += v_variables[i][j] <= 1 - e_variables[i][j]
            ilp += v_variables[i][j] <= 1 - s_variables[i][j]

    status = ilp.solve()
    assert status == 1

    alignments_to_calculate: List[Tuple[ProcessTree, Trace]] = []
    sub_trace = Trace()
    current_subtree_idx = 0
    for i in range(len(trace)):
        for j in range(2):
            if t_variables[i][j].varValue:
                if i == 0:
                    # first tau can be only assigned to first subtree
                    assert j == 0
                    alignments_to_calculate.append((pt.children[j], Trace()))
                    current_subtree_idx = 1
                else:
                    alignments_to_calculate.append(
                        (pt.children[current_subtree_idx], sub_trace))
                    alignments_to_calculate.append((pt.children[j], Trace()))
                    sub_trace = Trace()
        for j in range(2):
            if x_variables[i][j].varValue:
                if j == current_subtree_idx:
                    sub_trace.append(trace[i])
                else:
                    alignments_to_calculate.append(
                        (pt.children[current_subtree_idx], sub_trace))
                    sub_trace = Trace()
                    sub_trace.append(trace[i])
                    current_subtree_idx = j
    if len(sub_trace) > 0:
        alignments_to_calculate.append(
            (pt.children[current_subtree_idx], sub_trace))
    if t_variables[len(trace)][0].varValue:
        alignments_to_calculate.append((pt.children[0], Trace()))

    res = []
    for subtree, sub_trace in alignments_to_calculate:
        align_result = __approximate_alignment_for_trace(subtree,
                                                         a_sets,
                                                         sa_sets,
                                                         ea_sets,
                                                         tau_flags,
                                                         sub_trace,
                                                         tl,
                                                         th,
                                                         parameters=parameters)
        if align_result is None:
            # the alignment did not terminate correctly.
            return None
        res.extend(align_result)
    return res
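
# A minimal illustrative sketch (not pm4py code; the tau insertions are omitted)
# of the post-processing above: after solving, the trace is cut into sub-traces
# that alternate between the loop's two children, starting a new sub-trace
# whenever the x-assignment of consecutive positions switches child.
def split_by_assignment(labels, assignment):
    """labels: list of activity labels; assignment[i]: child index of position i."""
    parts = []
    for label, child in zip(labels, assignment):
        if parts and parts[-1][0] == child:
            parts[-1][1].append(label)      # same child: extend the current sub-trace
        else:
            parts.append((child, [label]))  # child switch: start a new sub-trace
    return parts

print(split_by_assignment(["a", "b", "b", "c"], [0, 1, 1, 0]))
# [(0, ['a']), (1, ['b', 'b']), (0, ['c'])]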
Example no. 16
def apply(
    log: EventLog,
    values: List[str],
    parameters: Optional[Dict[Union[str, Parameters],
                              Any]] = None) -> EventLog:
    """
    Filters the log, keeping (or removing) the traces that contain at least one event whose attribute value
    belongs to the provided list of values

    Parameters
    -----------
    log
        Event log
    values
        Allowed attribute values
    parameters
        Parameters of the algorithm, including:
            Parameters.ATTRIBUTE_KEY -> Attribute key used for the filtering
            Parameters.POSITIVE -> If True, keep the matching traces; if False, remove them

    Returns
    -----------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log,
                              variant=log_converter.Variants.TO_EVENT_LOG,
                              parameters=parameters)

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY,
                                               parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters,
                                          True)

    filtered_log = EventLog(list(),
                            attributes=log.attributes,
                            extensions=log.extensions,
                            classifiers=log.classifiers,
                            omni_present=log.omni_present,
                            properties=log.properties)
    for trace in log:
        new_trace = Trace()

        found = False
        for j in range(len(trace)):
            if attribute_key in trace[j]:
                attribute_value = trace[j][attribute_key]
                if attribute_value in values:
                    found = True

        if (found and positive) or (not found and not positive):
            new_trace = trace
        else:
            for attr in trace.attributes:
                new_trace.attributes[attr] = trace.attributes[attr]

        if len(new_trace) > 0:
            filtered_log.append(new_trace)
    return filtered_log
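
# A usage sketch for the filter above (the import path for the log objects is an
# assumption; it differs across pm4py versions).
from pm4py.objects.log.obj import EventLog, Trace, Event

log = EventLog()
t1 = Trace()
t1.append(Event({"concept:name": "register"}))
t1.append(Event({"concept:name": "approve"}))
t2 = Trace()
t2.append(Event({"concept:name": "register"}))
t2.append(Event({"concept:name": "reject"}))
log.append(t1)
log.append(t2)

# keep only traces containing at least one "approve" event; the default
# attribute key is "concept:name" and POSITIVE defaults to True
filtered = apply(log, ["approve"])
print(len(filtered))  # expected: 1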