def detect_cut_if(self, second_iteration=False, parameters=None): if parameters is None: parameters = {} activity_key = exec_utils.get_param_value( Parameters.ACTIVITY_KEY, parameters, pmutil.xes_constants.DEFAULT_NAME_KEY) # check base cases: empty_log = base_case.empty_log(self.log) single_activity = base_case.single_activity(self.log, activity_key) if empty_log: self.detected_cut = 'empty_log' elif single_activity: self.detected_cut = 'single_activity' # if no base cases are found, search for a cut: # use the cutting and splitting functions of im_plain: else: found_plain_cut, type_of_cut, cut = self.check_cut_im_plain() if found_plain_cut: self.apply_cut_im_plain(type_of_cut, cut, activity_key) # if im_plain does not find a cut, we filter on our threshold and then again apply the im_cut detection # but this time, we have to use different splitting functions: else: self.filter_dfg_on_threshold() found_plain_cut, type_of_cut, cut = self.check_cut_im_plain() if found_plain_cut: if type_of_cut == 'concurrent': logging.debug("concurrent_cut_if") self.detected_cut = 'concurrent' new_logs = splitting_infrequent.split_xor_infrequent( cut[1], self.log, activity_key) for l in new_logs: new_dfg = [(k, v) for k, v in dfg_inst.apply( l, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( l, activity_key) start_activities = list( start_activities_filter.get_start_activities( l, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( l, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( l, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self. initial_end_activities, parameters=parameters)) elif type_of_cut == 'sequential': logging.debug("sequential_if") new_logs = splitting_infrequent.split_sequence_infrequent( cut[1], self.log, activity_key) self.detected_cut = "sequential" for l in new_logs: new_dfg = [(k, v) for k, v in dfg_inst.apply( l, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( l, activity_key) start_activities = list( start_activities_filter.get_start_activities( l, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( l, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( l, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self. 
initial_end_activities, parameters=parameters)) elif type_of_cut == 'parallel': logging.debug("parallel_if") new_logs = split.split_parallel( cut[1], self.log, activity_key) self.detected_cut = "parallel" for l in new_logs: new_dfg = [(k, v) for k, v in dfg_inst.apply( l, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( l, activity_key) start_activities = list( start_activities_filter.get_start_activities( l, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( l, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( l, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self. initial_end_activities, parameters=parameters)) elif type_of_cut == 'loopCut': logging.debug("loopCut_if") new_logs = splitting_infrequent.split_loop_infrequent( cut[1], self.log, activity_key) self.detected_cut = "loopCut" for l in new_logs: new_dfg = [(k, v) for k, v in dfg_inst.apply( l, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( l, activity_key) start_activities = list( start_activities_filter.get_start_activities( l, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( l, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( l, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self. initial_end_activities, parameters=parameters)) else: self.apply_fall_through_infrequent(parameters)
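# A minimal, self-contained sketch of the sub-log DFG construction repeated above:
# directly-follows pairs are counted over toy traces and only positive entries are
# kept, mirroring the `[(k, v) for k, v in dfg_inst.apply(l, ...).items() if v > 0]`
# pattern. All names below are local to this example and not part of pm4py.
def _sketch_dfg_from_traces():
    from collections import Counter

    traces = [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "b", "c"]]
    counter = Counter()
    for trace in traces:
        for first, second in zip(trace, trace[1:]):
            counter[(first, second)] += 1
    # keep only the edges that actually occur, as done for `new_dfg` above
    return [(edge, freq) for edge, freq in counter.items() if freq > 0]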
def apply( parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> ProcessTree: """ Generate a process tree Parameters ------------ parameters Paramters of the algorithm, including: Parameters.REC_DEPTH -> current recursion depth Parameters.MIN_REC_DEPTH -> minimum recursion depth Parameters.MAX_REC_DEPTH -> maximum recursion depth Parameters.PROB_LEAF -> Probability to get a leaf Returns ------------ tree Process tree """ if parameters is None: parameters = {} rec_depth = exec_utils.get_param_value(Parameters.REC_DEPTH, parameters, 0) min_rec_depth = exec_utils.get_param_value(Parameters.MIN_REC_DEPTH, parameters, 1) max_rec_depth = exec_utils.get_param_value(Parameters.MAX_REC_DEPTH, parameters, 3) prob_leaf = exec_utils.get_param_value(Parameters.PROB_LEAF, parameters, 0.25) next_parameters = { Parameters.REC_DEPTH: rec_depth + 1, Parameters.MIN_REC_DEPTH: min_rec_depth, Parameters.MAX_REC_DEPTH: max_rec_depth, Parameters.PROB_LEAF: prob_leaf } is_leaf = False if min_rec_depth <= rec_depth <= max_rec_depth: r = random.random() if r < prob_leaf: is_leaf = True elif rec_depth > max_rec_depth: is_leaf = True if is_leaf: current_tree = ProcessTree(label=generate_random_string(6)) elif rec_depth == 0: current_tree = ProcessTree(operator=Operator.SEQUENCE) start = ProcessTree(label=generate_random_string(6), parent=current_tree) current_tree.children.append(start) node = apply(parameters=next_parameters) node.parent = current_tree current_tree.children.append(node) end = ProcessTree(label=generate_random_string(6)) end.parent = current_tree current_tree.children.append(end) else: o = get_random_operator() current_tree = ProcessTree(operator=o) if o == Operator.SEQUENCE: n_min = 2 n_max = 6 selected_n = random.randrange(n_min, n_max) for i in range(selected_n): child = apply(parameters=next_parameters) child.parent = current_tree current_tree.children.append(child) elif o == Operator.LOOP: do = apply(parameters=next_parameters) do.parent = current_tree current_tree.children.append(do) redo = apply(parameters=next_parameters) redo.parent = current_tree current_tree.children.append(redo) exit = ProcessTree(parent=current_tree) current_tree.children.append(exit) elif o == Operator.XOR: n_min = 2 n_max = 5 selected_n = random.randrange(n_min, n_max) for i in range(selected_n): child = apply(parameters=next_parameters) child.parent = current_tree current_tree.children.append(child) elif o == Operator.PARALLEL: n_min = 2 n_max = 4 selected_n = random.randrange(n_min, n_max) for i in range(selected_n): child = apply(parameters=next_parameters) child.parent = current_tree current_tree.children.append(child) return current_tree
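# Hedged usage sketch for the random tree generator above, using only the Parameters
# members the function reads (MIN_REC_DEPTH, MAX_REC_DEPTH, PROB_LEAF): a deeper tree
# with fewer leaves can be requested by lowering the leaf probability and raising the
# maximum recursion depth. The values are illustrative only.
def _sketch_generate_tree():
    tree = apply(parameters={
        Parameters.MIN_REC_DEPTH: 2,
        Parameters.MAX_REC_DEPTH: 4,
        Parameters.PROB_LEAF: 0.1,
    })
    # with the default REC_DEPTH of 0, the root is a SEQUENCE whose first and last
    # children are artificial leaf nodes (see the rec_depth == 0 branch above)
    return tree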
def preprocess_log(log, activities=None, activities_counter=None, parameters=None): """ Preprocess the log to get a grouped list of simplified traces (per activity) Parameters -------------- log Log object activities (if provided) activities of the log activities_counter (if provided) counter of the activities of the log parameters Parameters of the algorithm Returns -------------- traces_list List of simplified traces of the log trace_grouped_list Grouped list of simplified traces (per activity) activities Activities of the log activities_counter Activities counter """ if parameters is None: parameters = {} activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) start_timestamp_key = exec_utils.get_param_value( Parameters.START_TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) caseid_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME) index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY) if type(log) is pd.DataFrame: # keep only the two columns before conversion log = log[list( set([activity_key, timestamp_key, start_timestamp_key, caseid_key]))] log = converter.apply(log, variant=converter.Variants.TO_EVENT_LOG, parameters=parameters) traces_list = [] for trace in log: trace_stream = [{ activity_key: trace[i][activity_key], timestamp_key: trace[i][timestamp_key].timestamp(), start_timestamp_key: trace[i][start_timestamp_key].timestamp(), index_key: i } for i in range(len(trace))] trace_stream = sorted( trace_stream, key=lambda x: (x[start_timestamp_key], x[timestamp_key], x[index_key])) traces_list.append(trace_stream) if activities is None: activities = sorted( list(set(y[activity_key] for x in traces_list for y in x))) trace_grouped_list = [] for trace in traces_list: gr = [] for act in activities: act_gr = [x for x in trace if x[activity_key] == act] gr.append(act_gr) trace_grouped_list.append(gr) if activities_counter is None: activities_counter = Counter(y[activity_key] for x in traces_list for y in x) return traces_list, trace_grouped_list, activities, activities_counter
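# Illustrative sketch of the grouped structure returned above: for each simplified
# trace, the events are partitioned per activity, following the fixed order of the
# `activities` list; this is what `trace_grouped_list` contains. Toy data only, using
# the default "concept:name" activity key and an example index field.
def _sketch_grouped_trace():
    activities = ["a", "b"]
    trace = [{"concept:name": "a", "idx": 0},
             {"concept:name": "b", "idx": 1},
             {"concept:name": "a", "idx": 2}]
    grouped = [[event for event in trace if event["concept:name"] == act]
               for act in activities]
    # grouped[0] holds the two "a" events, grouped[1] the single "b" event
    return grouped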
def apply(log, parameters=None): """ Calculates the Subcontracting metric Parameters ------------ log Log parameters Possible parameters of the algorithm: Parameters.N -> n of the algorithm proposed in the Wil SNA paper Returns ----------- tuple Tuple containing the metric matrix and the resources list """ if parameters is None: parameters = {} import numpy from pm4py.statistics.traces.pandas import case_statistics resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY) n = exec_utils.get_param_value(Parameters.N, parameters, 2) parameters_variants = { case_statistics.Parameters.ACTIVITY_KEY: resource_key, case_statistics.Parameters.ATTRIBUTE_KEY: resource_key } variants_occ = { x["variant"]: x["case:concept:name"] for x in case_statistics.get_variant_statistics( log, parameters=parameters_variants) } variants_resources = list(variants_occ.keys()) resources = [x.split(",") for x in variants_resources] flat_list = sorted( list(set([item for sublist in resources for item in sublist]))) metric_matrix = numpy.zeros((len(flat_list), len(flat_list))) sum_i_to_j = {} for rv in resources: for i in range(len(rv) - n): res_i = flat_list.index(rv[i]) res_i_n = flat_list.index(rv[i + n]) if res_i == res_i_n: if res_i not in sum_i_to_j: sum_i_to_j[res_i] = {} for j in range(i + 1, i + n): res_j = flat_list.index(rv[j]) if res_j not in sum_i_to_j[res_i]: sum_i_to_j[res_i][res_j] = 0 sum_i_to_j[res_i][res_j] += variants_occ[",".join(rv)] dividend = 0 for rv in resources: dividend = dividend + variants_occ[",".join(rv)] * (len(rv) - 1) for key1 in sum_i_to_j: for key2 in sum_i_to_j[key1]: metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend return [metric_matrix, flat_list, True]
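# Self-contained sketch of the counting logic above, on hand-written resource
# sequences: an observation of subcontracting is counted when the same resource
# reappears at distance `n` with other resources in between, and the dividend is the
# total number of handovers, weighted by the variant frequencies. Toy data only,
# standing in for `variants_occ` and `resources`; no pandas involved.
def _sketch_subcontracting_counts(n=2):
    sequences = {("r1", "r2", "r1"): 3, ("r1", "r3", "r2"): 2}  # variant -> frequency
    numerator = {}
    dividend = 0
    for seq, freq in sequences.items():
        dividend += freq * (len(seq) - 1)
        for i in range(len(seq) - n):
            if seq[i] == seq[i + n]:
                for j in range(i + 1, i + n):
                    key = (seq[i], seq[j])
                    numerator[key] = numerator.get(key, 0) + freq
    # here: {("r1", "r2"): 0.3}
    return {key: value / dividend for key, value in numerator.items()}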
def get_decorations(log, net, initial_marking, final_marking, parameters=None, measure="frequency", ht_perf_method="last"): """ Calculate decorations in order to annotate the Petri net Parameters ----------- log Trace log net Petri net initial_marking Initial marking final_marking Final marking parameters Parameters associated to the algorithm measure Measure to represent on the process model (frequency/performance) ht_perf_method Method to use in order to annotate hidden transitions (performance value could be put on the last possible point (last) or in the first possible point (first) Returns ------------ decorations Decorations to put on the process model """ if parameters is None: parameters = {} aggregation_measure = exec_utils.get_param_value( Parameters.AGGREGATION_MEASURE, parameters, None) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) stat_locale = exec_utils.get_param_value(Parameters.STAT_LOCALE, parameters, {}) variants_idx = variants_get.get_variants_from_log_trace_idx( log, parameters=parameters) variants = variants_get.convert_variants_trace_idx_to_trace_obj( log, variants_idx) parameters_tr = { token_replay.Variants.TOKEN_REPLAY.value.Parameters.ACTIVITY_KEY: activity_key, token_replay.Variants.TOKEN_REPLAY.value.Parameters.VARIANTS: variants } # do the replay aligned_traces = token_replay.apply(log, net, initial_marking, final_marking, parameters=parameters_tr) # apply petri_reduction technique in order to simplify the Petri net # net = reduction.apply(net, parameters={"aligned_traces": aligned_traces}) element_statistics = performance_map.single_element_statistics( log, net, initial_marking, aligned_traces, variants_idx, activity_key=activity_key, timestamp_key=timestamp_key, ht_perf_method=ht_perf_method) aggregated_statistics = performance_map.aggregate_statistics( element_statistics, measure=measure, aggregation_measure=aggregation_measure, stat_locale=stat_locale) return aggregated_statistics
def __insert_start_from_previous_event( df: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], Any]] = None ) -> pd.DataFrame: """ Inserts the start timestamp of an event set to the completion of the previous event in the case Parameters --------------- df Dataframe Returns --------------- df Dataframe with the start timestamp for each event """ if parameters is None: parameters = {} timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) resource_key = exec_utils.get_param_value( Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY) case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) start_timestamp_key = exec_utils.get_param_value( Parameters.START_TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_START_TIMESTAMP_KEY) from pm4py.util import pandas_utils df = df[[timestamp_key, resource_key, case_id_key, activity_key]] df = pandas_utils.insert_index(df) df = df.sort_values( [case_id_key, timestamp_key, constants.DEFAULT_INDEX_KEY]) shifted_df = df[[case_id_key, timestamp_key]].shift(1) shifted_df.columns = [x + "_2" for x in shifted_df.columns] concat_df = pd.concat([df, shifted_df], axis=1) concat_df = concat_df[concat_df[case_id_key] == concat_df[ case_id_key + "_2"]][[constants.DEFAULT_INDEX_KEY, timestamp_key + "_2"]] del shifted_df concat_df = concat_df.to_dict("records") concat_df = { x[constants.DEFAULT_INDEX_KEY]: x[timestamp_key + "_2"] for x in concat_df } df[start_timestamp_key] = df[constants.DEFAULT_INDEX_KEY].map(concat_df) df[start_timestamp_key] = df[start_timestamp_key].fillna(df[timestamp_key]) df = df.sort_values( [start_timestamp_key, timestamp_key, constants.DEFAULT_INDEX_KEY]) return df
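# Equivalent-in-spirit sketch of the start-timestamp derivation above, expressed with
# a per-case shift: the start of an event is the completion time of the previous event
# in the same case, and the first event of a case falls back to its own completion
# time. Column names below are chosen for the example only.
def _sketch_start_from_previous():
    import pandas as pd

    df = pd.DataFrame({
        "case": ["c1", "c1", "c2"],
        "complete": pd.to_datetime(["2021-01-01 10:00", "2021-01-01 11:00",
                                    "2021-01-02 09:00"]),
    })
    df = df.sort_values(["case", "complete"])
    df["start"] = df.groupby("case")["complete"].shift(1)
    df["start"] = df["start"].fillna(df["complete"])
    return df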
def interaction_two_resources( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r1: str, r2: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float: """ The number of cases completed during a given time slot in which two given resources were involved. Metric RBI 5.1 in Pika, Anastasiia, et al. "Mining resource profiles from event logs." ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30. Parameters ----------------- df Dataframe t1 Left interval t2 Right interval r1 Resource 1 r2 Resource 2 Returns ---------------- metric Value of the metric """ if parameters is None: parameters = {} t1 = get_dt_from_string(t1) t2 = get_dt_from_string(t2) timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) resource_key = exec_utils.get_param_value( Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY) case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) df = df[[timestamp_key, resource_key, case_id_key, activity_key]] from pm4py.algo.filtering.pandas.attributes import attributes_filter parameters_filter = { attributes_filter.Parameters.ATTRIBUTE_KEY: resource_key } df = attributes_filter.apply(df, [r1], parameters=parameters_filter) df = attributes_filter.apply(df, [r2], parameters=parameters_filter) last_df = df.groupby(case_id_key).last().reset_index() last_df = last_df[last_df[timestamp_key] >= t1] last_df = last_df[last_df[timestamp_key] < t2] cases = set(last_df[case_id_key].unique()) df = df[df[case_id_key].isin(cases)] return df[case_id_key].nunique()
def __transform_model_to_mem_efficient_structure(net, im, fm, trace, parameters=None): """ Transform the Petri net model to a memory efficient structure Parameters -------------- net Petri net im Initial marking fm Final marking trace Trace parameters Parameters Returns -------------- model_struct Model data structure, including: PLACES_DICT: associates each place to a number INV_TRANS_DICT: associates a number to each transition LABELS_DICT: labels dictionary (a label to a number) TRANS_LABELS_DICT: associates each transition to the number corresponding to its label TRANS_PRE_DICT: preset of a transition, expressed as in this data structure TRANS_POST_DICT: postset of a transition, expressed as in this data structure TRANSF_IM: transformed initial marking TRANSF_FM: transformed final marking TRANSF_MODEL_COST_FUNCTION: transformed model cost function """ if parameters is None: parameters = {} activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY) labels = sorted(list(set(x[activity_key] for x in trace))) model_cost_function = exec_utils.get_param_value( Parameters.PARAM_MODEL_COST_FUNCTION, parameters, None) if model_cost_function is None: model_cost_function = {} for t in net.transitions: if t.label is not None: model_cost_function[t] = align_utils.STD_MODEL_LOG_MOVE_COST else: preset_t = Marking() for a in t.in_arcs: preset_t[a.source] = a.weight # optimization 12/08/2020 # # instead of giving undiscriminately weight 1 to # invisible transitions, assign weight 0 to the ones # for which no 'sync' transition is enabled in their # activation markings. # # this requires to modify the state of the alignment, keeping track # of the length of the alignment, to avoid loops. en_t = enabled_transitions(net, preset_t) vis_t_trace = [t for t in en_t if t.label in labels] if len(vis_t_trace) == 0: model_cost_function[t] = 0 else: model_cost_function[t] = align_utils.STD_TAU_COST places_dict = {place: index for index, place in enumerate(net.places)} trans_dict = {trans: index for index, trans in enumerate(net.transitions)} labels = sorted( list(set(t.label for t in net.transitions if t.label is not None))) labels_dict = {labels[i]: i for i in range(len(labels))} trans_labels_dict = {} for t in net.transitions: trans_labels_dict[trans_dict[t]] = labels_dict[ t.label] if t.label is not None else None trans_pre_dict = { trans_dict[t]: {places_dict[x.source]: x.weight for x in t.in_arcs} for t in net.transitions } trans_post_dict = { trans_dict[t]: {places_dict[x.target]: x.weight for x in t.out_arcs} for t in net.transitions } transf_im = {places_dict[p]: im[p] for p in im} transf_fm = {places_dict[p]: fm[p] for p in fm} transf_model_cost_function = { trans_dict[t]: model_cost_function[t] for t in net.transitions } inv_trans_dict = {y: x for x, y in trans_dict.items()} return { PLACES_DICT: places_dict, INV_TRANS_DICT: inv_trans_dict, LABELS_DICT: labels_dict, TRANS_LABELS_DICT: trans_labels_dict, TRANS_PRE_DICT: trans_pre_dict, TRANS_POST_DICT: trans_post_dict, TRANSF_IM: transf_im, TRANSF_FM: transf_fm, TRANSF_MODEL_COST_FUNCTION: transf_model_cost_function }
def apply_tree(log, parameters=None): """ Apply the IM algorithm to a log obtaining a process tree Parameters ---------- log Log parameters Parameters of the algorithm, including: Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name (default concept:name) Returns ---------- process_tree Process tree """ if parameters is None: parameters = {} if pkgutil.find_loader("pandas"): import pandas as pd from pm4py.statistics.variants.pandas import get as variants_get if type(log) is pd.DataFrame: vars = variants_get.get_variants_count(log, parameters=parameters) return apply_tree_variants(vars, parameters=parameters) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, pmutil.xes_constants.DEFAULT_NAME_KEY) log = converter.apply(log, parameters=parameters) # since basic IM is influenced once per variant, it makes sense to keep one trace per variant log = filtering_utils.keep_one_trace_per_variant(log, parameters=parameters) # keep only the activity attribute (since the others are not used) log = filtering_utils.keep_only_one_attribute_per_event(log, activity_key) dfg = [(k, v) for k, v in dfg_inst.apply(log, parameters=parameters).items() if v > 0] c = Counts() activities = attributes_get.get_attribute_values(log, activity_key) start_activities = list(start_activities_get.get_start_activities(log, parameters=parameters).keys()) end_activities = list(end_activities_get.get_end_activities(log, parameters=parameters).keys()) contains_empty_traces = False traces_length = [len(trace) for trace in log] if traces_length: contains_empty_traces = min([len(trace) for trace in log]) == 0 recursion_depth = 0 sub = subtree.make_tree(log, dfg, dfg, dfg, activities, c, recursion_depth, 0.0, start_activities, end_activities, start_activities, end_activities, parameters) process_tree = get_tree_repr_implain.get_repr(sub, 0, contains_empty_traces=contains_empty_traces) # Ensures consistency to the parent pointers in the process tree tree_consistency.fix_parent_pointers(process_tree) # Fixes a 1 child XOR that is added when single-activities flowers are found tree_consistency.fix_one_child_xor_flower(process_tree) # folds the process tree (to simplify it in case fallthroughs/filtering is applied) process_tree = util.fold(process_tree) return process_tree
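# Self-contained sketch of the "one trace per variant" reduction used above: as noted
# in the comment, the plain inductive miner is influenced once per variant, so
# duplicated traces can be dropped before cut detection. Toy traces as activity tuples.
def _sketch_one_trace_per_variant():
    traces = [("a", "b", "c"), ("a", "b", "c"), ("a", "c")]
    seen = set()
    reduced = []
    for trace in traces:
        if trace not in seen:
            seen.add(trace)
            reduced.append(trace)
    return reduced  # [("a", "b", "c"), ("a", "c")]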
def align_fake_log_stop_marking(fake_log, net, marking, final_marking, parameters=None): """ Align the 'fake' log with all the prefixes in order to get the markings in which the alignment stops Parameters ------------- fake_log Fake log net Petri net marking Marking final_marking Final marking parameters Parameters of the algorithm Returns ------------- alignment For each trace in the log, return the marking in which the alignment stops (expressed as place name with count) """ if parameters is None: parameters = {} show_progress_bar = exec_utils.get_param_value( Parameters.SHOW_PROGRESS_BAR, parameters, True) multiprocessing = exec_utils.get_param_value(Parameters.MULTIPROCESSING, parameters, False) progress = None if pkgutil.find_loader("tqdm") and show_progress_bar and len(fake_log) > 1: from tqdm.auto import tqdm progress = tqdm( total=len(fake_log), desc="computing precision with alignments, completed variants :: ") if multiprocessing: align_intermediate_result = __align_log_with_multiprocessing_stop_marking( fake_log, net, marking, final_marking, progress, parameters=parameters) else: align_intermediate_result = __align_log_wo_multiprocessing_stop_marking( fake_log, net, marking, final_marking, progress, parameters=parameters) align_result = [] for i in range(len(align_intermediate_result)): res = align_intermediate_result[i] if res is not None: align_result.append([]) for mark in res: res2 = {} for pl in mark: # transforms the markings for easier correspondence at the end # (distributed engine friendly!) res2[(pl.name[0], pl.name[1])] = mark[pl] align_result[-1].append(res2) else: # if there is no path from the initial marking # replaying the given prefix, then add None align_result.append(None) # gracefully close progress bar if progress is not None: progress.close() del progress return align_result
def apply(log, net, marking, final_marking, parameters=None): """ Get Align-ET Conformance precision Parameters ---------- log Trace log net Petri net marking Initial marking final_marking Final marking parameters Parameters of the algorithm, including: Parameters.ACTIVITY_KEY -> Activity key """ if parameters is None: parameters = {} debug_level = parameters[ "debug_level"] if "debug_level" in parameters else 0 activity_key = exec_utils.get_param_value( Parameters.ACTIVITY_KEY, parameters, log_lib.util.xes.DEFAULT_NAME_KEY) # default value for precision, when no activated transitions (not even by looking at the initial marking) are found precision = 1.0 sum_ee = 0 sum_at = 0 unfit = 0 if not check_soundness.check_easy_soundness_net_in_fin_marking( net, marking, final_marking): raise Exception( "trying to apply Align-ETConformance on a Petri net that is not a easy sound net!!" ) prefixes, prefix_count = precision_utils.get_log_prefixes( log, activity_key=activity_key) prefixes_keys = list(prefixes.keys()) fake_log = precision_utils.form_fake_log(prefixes_keys, activity_key=activity_key) align_stop_marking = align_fake_log_stop_marking(fake_log, net, marking, final_marking, parameters=parameters) all_markings = transform_markings_from_sync_to_original_net( align_stop_marking, net, parameters=parameters) for i in range(len(prefixes)): markings = all_markings[i] if markings is not None: log_transitions = set(prefixes[prefixes_keys[i]]) activated_transitions_labels = set() for m in markings: # add to the set of activated transitions in the model the activated transitions # for each prefix activated_transitions_labels = activated_transitions_labels.union( x.label for x in utils. get_visible_transitions_eventually_enabled_by_marking( net, m) if x.label is not None) escaping_edges = activated_transitions_labels.difference( log_transitions) sum_at += len(activated_transitions_labels) * prefix_count[ prefixes_keys[i]] sum_ee += len(escaping_edges) * prefix_count[prefixes_keys[i]] if debug_level > 1: print("") print("prefix=", prefixes_keys[i]) print("log_transitions=", log_transitions) print("activated_transitions=", activated_transitions_labels) print("escaping_edges=", escaping_edges) else: unfit += prefix_count[prefixes_keys[i]] if debug_level > 0: print("\n") print("overall unfit", unfit) print("overall activated transitions", sum_at) print("overall escaping edges", sum_ee) # fix: also the empty prefix should be counted! start_activities = set(get_start_activities(log, parameters=parameters)) trans_en_ini_marking = set([ x.label for x in get_visible_transitions_eventually_enabled_by_marking( net, marking) ]) diff = trans_en_ini_marking.difference(start_activities) sum_at += len(log) * len(trans_en_ini_marking) sum_ee += len(log) * len(diff) # end fix if sum_at > 0: precision = 1 - float(sum_ee) / float(sum_at) return precision
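# Minimal sketch of the precision computation performed above: escaping edges and
# activated transitions are accumulated per prefix (weighted by how often the prefix
# occurs in the log) and precision is 1 - escaping / activated. Toy numbers only.
def _sketch_precision_from_counts():
    prefixes = [  # (activated transitions in the model, escaping edges, prefix count)
        (4, 1, 10),
        (3, 0, 5),
    ]
    sum_at = sum(at * cnt for at, _, cnt in prefixes)
    sum_ee = sum(ee * cnt for _, ee, cnt in prefixes)
    return 1.0 - float(sum_ee) / float(sum_at) if sum_at > 0 else 1.0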
def apply(tree, parameters=None): """ Performs an extensive playout of the process tree Parameters ------------- tree Process tree parameters Possible parameters, including: - Parameters.MIN_TRACE_LENGTH => minimum length of a trace (default: 1) - Parameters.MAX_TRACE_LENGTH => maximum length of a trace (default: min_allowed_trace_length) - Parameters.MAX_LOOP_OCC => maximum number of occurrences for a loop (default: MAX_TRACE_LENGTH) - Parameters.ACTIVITY_KEY => activity key - Parameters.MAX_LIMIT_NUM_TRACES => maximum number to the limit of traces; the playout shall stop when the number is reached (default: 100000) Returns ------------- log Event log """ if parameters is None: parameters = {} activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) # to save memory in the returned log, allocate each activity once. to know the list of activities of the # process tree, use the footprints module fp_tree = fp_discovery.apply(tree, parameters=parameters) activities = fp_tree["activities"] activities = {act: Event({activity_key: act}) for act in activities} min_allowed_trace_length = bottomup_discovery.get_min_trace_length( tree, parameters=parameters) min_trace_length = exec_utils.get_param_value(Parameters.MIN_TRACE_LENGTH, parameters, 1) max_trace_length = exec_utils.get_param_value(Parameters.MAX_TRACE_LENGTH, parameters, min_allowed_trace_length) max_loop_occ = exec_utils.get_param_value(Parameters.MAX_LOOP_OCC, parameters, int(max_trace_length / 2)) max_limit_num_traces = exec_utils.get_param_value( Parameters.MAX_LIMIT_NUM_TRACES, parameters, 100000) return_set_strings = exec_utils.get_param_value( Parameters.RETURN_SET_STRINGS, parameters, False) bottomup = bottomup_discovery.get_bottomup_nodes(tree, parameters=parameters) min_rem_dict = bottomup_discovery.get_min_rem_dict(tree, parameters=parameters) max_rem_dict = bottomup_discovery.get_max_rem_dict(tree, parameters=parameters) playout_dictio = {} for i in range(len(bottomup)): get_playout(bottomup[i], playout_dictio, min_trace_length, max_trace_length, max_loop_occ, min_rem_dict, max_rem_dict, max_limit_num_traces) tree_playout_traces = playout_dictio[tree][TRACES] if return_set_strings: return tree_playout_traces log = EventLog() for tr0 in tree_playout_traces: trace = Trace() for act in tr0: trace.append(activities[act]) log.append(trace) return log
def apply( log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[Any]: """ Calculates the Subcontracting metric Parameters ------------ log Log parameters Possible parameters of the algorithm: Parameters.N -> n of the algorithm proposed in the Wil SNA paper Returns ----------- tuple Tuple containing the metric matrix and the resources list """ if parameters is None: parameters = {} resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY) n = exec_utils.get_param_value(Parameters.N, parameters, 2) parameters_variants = { variants_filter.Parameters.ACTIVITY_KEY: resource_key, variants_filter.Parameters.ATTRIBUTE_KEY: resource_key } variants_occ = { x: len(y) for x, y in variants_filter.get_variants( log, parameters=parameters_variants).items() } variants_resources = list(variants_occ.keys()) resources = [ variants_util.get_activities_from_variant(y) for y in variants_resources ] flat_list = sorted( list(set([item for sublist in resources for item in sublist]))) metric_matrix = numpy.zeros((len(flat_list), len(flat_list))) sum_i_to_j = {} dividend = 0 for idx, rv in enumerate(resources): rvj = variants_resources[idx] dividend += variants_occ[rvj] for i in range(len(rv) - n): res_i = flat_list.index(rv[i]) res_i_n = flat_list.index(rv[i + n]) if res_i == res_i_n: if res_i not in sum_i_to_j: sum_i_to_j[res_i] = {} for j in range(i + 1, i + n): res_j = flat_list.index(rv[j]) if res_j not in sum_i_to_j[res_i]: sum_i_to_j[res_i][res_j] = 0 sum_i_to_j[res_i][res_j] += variants_occ[rvj] for key1 in sum_i_to_j: for key2 in sum_i_to_j[key1]: metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend return [metric_matrix, flat_list, True]
def apply_fall_through_infrequent(self, parameters=None): if parameters is None: parameters = {} activity_key = exec_utils.get_param_value( Parameters.ACTIVITY_KEY, self.parameters, pmutil.xes_constants.DEFAULT_NAME_KEY) # set flags for fall_throughs, base case is True (enabled) use_empty_trace = (Parameters.EMPTY_TRACE_KEY not in parameters ) or parameters[Parameters.EMPTY_TRACE_KEY] use_act_once_per_trace = ( Parameters.ONCE_PER_TRACE_KEY not in parameters) or parameters[Parameters.ONCE_PER_TRACE_KEY] use_act_concurrent = (Parameters.CONCURRENT_KEY not in parameters ) or parameters[Parameters.CONCURRENT_KEY] use_strict_tau_loop = (Parameters.STRICT_TAU_LOOP_KEY not in parameters ) or parameters[Parameters.STRICT_TAU_LOOP_KEY] use_tau_loop = (Parameters.TAU_LOOP_KEY not in parameters ) or parameters[Parameters.TAU_LOOP_KEY] if use_empty_trace: empty_traces_present, enough_traces, new_log = fall_through_infrequent.empty_trace_filtering( self.log, self.f) self.log = new_log else: empty_traces_present = False enough_traces = False # if an empty trace is found, the empty trace fallthrough applies if empty_traces_present and enough_traces: logging.debug("empty_trace_if") self.detected_cut = 'empty_trace' new_dfg = [(k, v) for k, v in dfg_inst.apply( new_log, parameters=self.parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( new_log, activity_key) start_activities = list( start_activities_filter.get_start_activities( new_log, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( new_log, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( new_log, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self.initial_start_activities, initial_end_activities=self.initial_end_activities, parameters=parameters)) elif empty_traces_present and not enough_traces: # no node is added to the PT, instead we just use recursion on the log_skeleton without the empty traces self.detect_cut_if(parameters=parameters) else: if use_act_once_per_trace: activity_once, new_log, small_log = fall_through.act_once_per_trace( self.log, self.activities, activity_key) else: activity_once = False if activity_once: self.detected_cut = 'parallel' # create two new dfgs as we need them to append to self.children later new_dfg = [(k, v) for k, v in dfg_inst.apply( new_log, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( new_log, activity_key) small_dfg = [(k, v) for k, v in dfg_inst.apply( small_log, parameters=parameters).items() if v > 0] small_activities = attributes_filter.get_attribute_values( small_log, activity_key) start_activities = list( start_activities_filter.get_start_activities( new_log, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( new_log, parameters=parameters).keys()) # append the chosen activity as leaf: self.children.append( SubtreeInfrequent( small_log, small_dfg, self.master_dfg, self.initial_dfg, small_activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, initial_start_activities=self.initial_start_activities, initial_end_activities=self.initial_end_activities, parameters=parameters)) # continue with the recursion on the new log_skeleton self.children.append( SubtreeInfrequent( new_log, new_dfg, self.master_dfg, 
self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self.initial_start_activities, initial_end_activities=self.initial_end_activities, parameters=parameters)) else: if use_act_concurrent: activity_concurrent, new_log, small_log, key = fall_through.activity_concurrent( self, self.log, self.activities, activity_key, parameters=parameters) else: activity_concurrent = False if activity_concurrent: self.detected_cut = 'parallel' # create two new dfgs on to append later new_dfg = [(k, v) for k, v in dfg_inst.apply( new_log, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( new_log, activity_key) small_dfg = [(k, v) for k, v in dfg_inst.apply( small_log, parameters=parameters).items() if v > 0] small_activities = attributes_filter.get_attribute_values( small_log, activity_key) start_activities = list( start_activities_filter.get_start_activities( new_log, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( new_log, parameters=parameters).keys()) # append the concurrent activity as leaf: self.children.append( SubtreeInfrequent( small_log, small_dfg, self.master_dfg, self.initial_dfg, small_activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, initial_start_activities=self. initial_start_activities, initial_end_activities=self.initial_end_activities, parameters=parameters)) # continue with the recursion on the new log_skeleton: self.children.append( SubtreeInfrequent( new_log, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self.initial_end_activities, parameters=parameters)) else: if use_strict_tau_loop: strict_tau_loop, new_log = fall_through.strict_tau_loop( self.log, self.start_activities, self.end_activities, activity_key) else: strict_tau_loop = False if strict_tau_loop: self.detected_cut = 'strict_tau_loop' new_dfg = [(k, v) for k, v in dfg_inst.apply( new_log, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( new_log, activity_key) start_activities = list( start_activities_filter.get_start_activities( new_log, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( new_log, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( new_log, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self. 
initial_end_activities, parameters=parameters)) else: if use_tau_loop: tau_loop, new_log = fall_through.tau_loop( self.log, self.start_activities, activity_key) else: tau_loop = False if tau_loop: self.detected_cut = 'tau_loop' new_dfg = [(k, v) for k, v in dfg_inst.apply( new_log, parameters=parameters).items() if v > 0] activities = attributes_filter.get_attribute_values( new_log, activity_key) start_activities = list( start_activities_filter.get_start_activities( new_log, parameters=parameters).keys()) end_activities = list( end_activities_filter.get_end_activities( new_log, parameters=parameters).keys()) self.children.append( SubtreeInfrequent( new_log, new_dfg, self.master_dfg, self.initial_dfg, activities, self.counts, self.rec_depth + 1, self.f, noise_threshold=self.noise_threshold, start_activities=start_activities, end_activities=end_activities, initial_start_activities=self. initial_start_activities, initial_end_activities=self. initial_end_activities, parameters=parameters)) else: logging.debug("flower_if") self.detected_cut = 'flower'
def apply(dfg, parameters=None): """ Applies the DFG mining on a given object (if it is a Pandas dataframe or a log, the DFG is calculated) Parameters ------------- dfg Object (DFG) (if it is a Pandas dataframe or a log, the DFG is calculated) parameters Parameters """ if parameters is None: parameters = {} dfg = dfg start_activities = exec_utils.get_param_value( Parameters.START_ACTIVITIES, parameters, dfg_utils.infer_start_activities(dfg)) end_activities = exec_utils.get_param_value( Parameters.END_ACTIVITIES, parameters, dfg_utils.infer_end_activities(dfg)) activities = dfg_utils.get_activities_from_dfg(dfg) net = PetriNet("") im = Marking() fm = Marking() source = PetriNet.Place("source") net.places.add(source) im[source] = 1 sink = PetriNet.Place("sink") net.places.add(sink) fm[sink] = 1 places_corr = {} index = 0 for act in activities: places_corr[act] = PetriNet.Place(act) net.places.add(places_corr[act]) for act in start_activities: if act in places_corr: index = index + 1 trans = PetriNet.Transition(act + "_" + str(index), act) net.transitions.add(trans) pn_util.add_arc_from_to(source, trans, net) pn_util.add_arc_from_to(trans, places_corr[act], net) for act in end_activities: if act in places_corr: index = index + 1 inv_trans = PetriNet.Transition(act + "_" + str(index), None) net.transitions.add(inv_trans) pn_util.add_arc_from_to(places_corr[act], inv_trans, net) pn_util.add_arc_from_to(inv_trans, sink, net) for el in dfg.keys(): act1 = el[0] act2 = el[1] index = index + 1 trans = PetriNet.Transition(act2 + "_" + str(index), act2) net.transitions.add(trans) pn_util.add_arc_from_to(places_corr[act1], trans, net) pn_util.add_arc_from_to(trans, places_corr[act2], net) return net, im, fm
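# Self-contained sketch of the construction above: every activity gets a place, and
# every DFG edge (a, b) becomes a transition labelled `b` that consumes from the place
# of `a` and produces into the place of `b`. Plain strings stand in for the Petri net
# objects here; only the arc structure is illustrated.
def _sketch_dfg_edges_to_arcs():
    dfg = {("a", "b"): 3, ("b", "c"): 2}
    arcs = []
    for (src_act, tgt_act) in dfg:
        transition = "t_" + tgt_act
        arcs.append(("p_" + src_act, transition))
        arcs.append((transition, "p_" + tgt_act))
    return arcs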
def preprocessing(log, parameters=None): """ Preprocessing step for the Alpha+ algorithm. Removes all activities with a length-one loop from the log. Parameters ------------ log Event log parameters Parameters of the algorithm Returns ------------- filtered_log Log without the length-one-loop activities loop_one_list List of the activities that form a length-one loop A_filtered Dictionary: activity before the loop-length-one activity B_filtered Dictionary: activity after the loop-length-one activity loops_in_first_place Loops in source place loops_in_last_place Loops in sink place """ loops_in_first_place = set() loops_in_last_place = set() if parameters is None: parameters = {} activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_util.DEFAULT_NAME_KEY) # List for values that have a loop of length one loop_one_list = [] # Log without activities that have a loop of length one filtered_log = EventLog() # dictionary A: activity before the loop-length-one activity A = {} # dictionary B: activity after the loop-length-one activity B = {} A_filtered = {} B_filtered = {} # inserting artificial start and end activity, since it is not allowed to have a loop at the source place # (according to paper) for trace in log: trace.insert(0, {activity_key: 'artificial_start'}) trace.append({activity_key: 'artificial_end'}) for trace in log: i = 0 while i < len(trace) - 1: current = trace[i][activity_key] successor = trace[i + 1][activity_key] if current == successor: if current not in loop_one_list: loop_one_list.append(current) i += 1 for trace in log: i = 0 filtered_trace = Trace() while i < len(trace) - 1: current = trace[i][activity_key] successor = trace[i + 1][activity_key] if not current in loop_one_list: filtered_trace.append(current) if successor in loop_one_list: if not current in loop_one_list: if successor in A: A[successor].append(current) else: A[successor] = [current] if current in loop_one_list: if not successor in loop_one_list: if current in B: B[current].append(successor) else: B[current] = [successor] if i == len(trace) - 2: if not successor in loop_one_list: filtered_trace.append(successor) i += 1 filtered_log.append(filtered_trace) # Making sets instead of lists for key, value in A.items(): A_filtered[key] = set(value) # Making sets instead of lists for key, value in B.items(): B_filtered[key] = set(value) for trace in log: if trace.__getitem__(0) in loop_one_list: loops_in_first_place.add(trace.__getitem__(0)) if trace.__getitem__(len(trace) - 1) in loop_one_list: loops_in_last_place.add(trace.__getitem__(len(trace) - 1)) loops_in_first_place = list(loops_in_first_place) loops_in_last_place = list(loops_in_last_place) return (filtered_log, loop_one_list, A_filtered, B_filtered, loops_in_first_place, loops_in_last_place)
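# Self-contained sketch of the length-one-loop detection above: an activity is
# collected as soon as it directly follows itself in some trace. Toy traces as lists
# of activity names, local to the example.
def _sketch_length_one_loops():
    traces = [["a", "b", "b", "c"], ["a", "c"]]
    loop_one = []
    for trace in traces:
        for current, successor in zip(trace, trace[1:]):
            if current == successor and current not in loop_one:
                loop_one.append(current)
    return loop_one  # ["b"]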
def activity_frequency( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str, a: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float: """ Fraction of completions of a given activity a, by a given resource r, during a given time slot, [t1, t2), with respect to the total number of activity completions by resource r during [t1, t2) Metric RBI 1.3 in Pika, Anastasiia, et al. "Mining resource profiles from event logs." ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30. Parameters ----------------- df Dataframe t1 Left interval t2 Right interval r Resource a Activity Returns ---------------- metric Value of the metric """ if parameters is None: parameters = {} activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) resource_key = exec_utils.get_param_value( Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY) t1 = get_dt_from_string(t1) t2 = get_dt_from_string(t2) df = df[[activity_key, timestamp_key, resource_key]] df = df[df[resource_key] == r] df = df[df[timestamp_key] >= t1] df = df[df[timestamp_key] < t2] total = len(df) df = df[df[activity_key] == a] activity_a = len(df) return float(activity_a) / float(total) if total > 0 else 0.0
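# Self-contained sketch of RBI 1.3 as computed above: restrict the frame to the
# resource and the time slot [t1, t2), then divide the completions of activity `a` by
# all completions of that resource in the slot. Toy frame with the default XES column
# names, assumed here only for the example.
def _sketch_rbi_activity_frequency():
    import pandas as pd

    df = pd.DataFrame({
        "org:resource": ["r1", "r1", "r1", "r2"],
        "concept:name": ["a", "a", "b", "a"],
        "time:timestamp": pd.to_datetime(["2021-01-01", "2021-01-02",
                                          "2021-01-03", "2021-01-01"]),
    })
    t1, t2 = pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-04")
    sel = df[(df["org:resource"] == "r1")
             & (df["time:timestamp"] >= t1) & (df["time:timestamp"] < t2)]
    total = len(sel)
    # 2 completions of "a" out of 3 completions by r1 in the slot -> 0.666...
    return len(sel[sel["concept:name"] == "a"]) / total if total > 0 else 0.0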
def apply(log, net, im, fm, parameters=None): """ Performs a Monte Carlo simulation of an accepting Petri net without duplicate transitions and where the preset is always distinct from the postset (FIFO variant: requests pile up on the semaphores if waiting is needed, and the first to arrive is the first to acquire the semaphore) Parameters ------------- log Event log net Accepting Petri net without duplicate transitions and where the preset is always distinct from the postset im Initial marking fm Final marking parameters Parameters of the algorithm: PARAM_NUM_SIMULATIONS => Number of cases to simulate (default: 100) PARAM_FORCE_DISTRIBUTION => Force a particular stochastic distribution (e.g. normal) when the stochastic map is discovered from the log (default: None; no distribution is forced) PARAM_ENABLE_DIAGNOSTICS => Enable the printing of diagnostics (default: True) PARAM_DIAGN_INTERVAL => Interval of time in which diagnostics of the simulation are printed (default: 32) PARAM_CASE_ARRIVAL_RATIO => Case arrival ratio of new cases (default: None; inferred from the log) PARAM_PROVIDED_SMAP => Stochastic map that is used in the simulation (default: None; inferred from the log) PARAM_MAP_RESOURCES_PER_PLACE => Specification of the number of resources available per place (default: None; each place gets the default number of resources) PARAM_DEFAULT_NUM_RESOURCES_PER_PLACE => Default number of resources per place when not specified (default: 1; each place gets 1 resource and has to wait for the resource to finish) PARAM_SMALL_SCALE_FACTOR => Scale factor for the sleeping time of the actual simulation (default: 864000.0, i.e. 10 days) PARAM_MAX_THREAD_EXECUTION_TIME => Maximum execution time per thread (default: 60.0, i.e. 1 minute) Returns ------------ simulated_log Simulated event log simulation_result Result of the simulation: Outputs.OUTPUT_PLACES_INTERVAL_TREES => interval trees that associate to each place the times in which it was occupied. Outputs.OUTPUT_TRANSITIONS_INTERVAL_TREES => interval trees that associate to each transition the intervals of time in which it could not fire because some token was in the output. 
Outputs.OUTPUT_CASES_EX_TIME => Throughput time of the cases included in the simulated log Outputs.OUTPUT_MEDIAN_CASES_EX_TIME => Median of the throughput times Outputs.OUTPUT_CASE_ARRIVAL_RATIO => Case arrival ratio that was specified in the simulation Outputs.OUTPUT_TOTAL_CASES_TIME => Total time occupied by cases of the simulated log """ if parameters is None: parameters = {} from intervaltree import IntervalTree timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) no_simulations = exec_utils.get_param_value( Parameters.PARAM_NUM_SIMULATIONS, parameters, 100) force_distribution = exec_utils.get_param_value( Parameters.PARAM_FORCE_DISTRIBUTION, parameters, None) enable_diagnostics = exec_utils.get_param_value( Parameters.PARAM_ENABLE_DIAGNOSTICS, parameters, True) diagn_interval = exec_utils.get_param_value( Parameters.PARAM_DIAGN_INTERVAL, parameters, 32.0) case_arrival_ratio = exec_utils.get_param_value( Parameters.PARAM_CASE_ARRIVAL_RATIO, parameters, None) smap = exec_utils.get_param_value(Parameters.PARAM_PROVIDED_SMAP, parameters, None) resources_per_places = exec_utils.get_param_value( Parameters.PARAM_MAP_RESOURCES_PER_PLACE, parameters, None) default_num_resources_per_places = exec_utils.get_param_value( Parameters.PARAM_DEFAULT_NUM_RESOURCES_PER_PLACE, parameters, 1) small_scale_factor = exec_utils.get_param_value( Parameters.PARAM_SMALL_SCALE_FACTOR, parameters, 864000) max_thread_exec_time = exec_utils.get_param_value( Parameters.PARAM_MAX_THREAD_EXECUTION_TIME, parameters, 60.0) if case_arrival_ratio is None: case_arrival_ratio = case_arrival.get_case_arrival_avg( log, parameters=parameters) if resources_per_places is None: resources_per_places = {} logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) places_interval_trees = {} transitions_interval_trees = {} cases_ex_time = [] list_cases = {} for place in net.places: # assign a semaphore to each place. 
if place in resources_per_places: place.semaphore = Semaphore(resources_per_places[place]) else: # if the user does not specify the number of resources per place, # the default number is used place.semaphore = Semaphore(default_num_resources_per_places) place.assigned_time = [] places_interval_trees[place] = IntervalTree() for trans in net.transitions: transitions_interval_trees[trans] = IntervalTree() # when the user does not specify any map from transitions to random variables, # a replay operation is performed if smap is None: if enable_diagnostics: logger.info(str(time()) + " started the replay operation.") if force_distribution is not None: smap = replay.get_map_from_log_and_net( log, net, im, fm, force_distribution=force_distribution, parameters=parameters) else: smap = replay.get_map_from_log_and_net(log, net, im, fm, parameters=parameters) if enable_diagnostics: logger.info(str(time()) + " ended the replay operation.") # the start timestamp is set to 1000000 instead of 0 to avoid problems with 32 bit machines start_time = 1000000 threads = [] for i in range(no_simulations): list_cases[i] = Trace() t = SimulationThread(i, net, im, fm, smap, start_time, places_interval_trees, transitions_interval_trees, cases_ex_time, list_cases, enable_diagnostics, diagn_interval, small_scale_factor, max_thread_exec_time) t.start() threads.append(t) start_time = start_time + case_arrival_ratio # wait a factor before opening a thread and the next one sleep(case_arrival_ratio / small_scale_factor) for t in threads: t.join() i = 0 while i < len(threads): if threads[i].terminated_correctly is False: del list_cases[threads[i].id] del threads[i] del cases_ex_time[i] continue i = i + 1 if enable_diagnostics: logger.info(str(time()) + " ended the Monte carlo simulation.") log = EventLog(list(list_cases.values())) min_timestamp = log[0][0][timestamp_key].timestamp() max_timestamp = max(y[timestamp_key].timestamp() for x in log for y in x) transitions_interval_trees = { t.name: y for t, y in transitions_interval_trees.items() } return log, { Outputs.OUTPUT_PLACES_INTERVAL_TREES.value: places_interval_trees, Outputs.OUTPUT_TRANSITIONS_INTERVAL_TREES.value: transitions_interval_trees, Outputs.OUTPUT_CASES_EX_TIME.value: cases_ex_time, Outputs.OUTPUT_MEDIAN_CASES_EX_TIME.value: median(cases_ex_time), Outputs.OUTPUT_CASE_ARRIVAL_RATIO.value: case_arrival_ratio, Outputs.OUTPUT_TOTAL_CASES_TIME.value: max_timestamp - min_timestamp }
def average_duration_activity( df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str, a: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> float: """ The average duration of instances of a given activity completed during a given time slot by a given resource. Metric RBI 4.3 in Pika, Anastasiia, et al. "Mining resource profiles from event logs." ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30. Parameters ----------------- df Dataframe t1 Left interval t2 Right interval r Resource a Activity Returns ---------------- metric Value of the metric """ if parameters is None: parameters = {} t1 = get_dt_from_string(t1) t2 = get_dt_from_string(t2) timestamp_key = exec_utils.get_param_value( Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) resource_key = exec_utils.get_param_value( Parameters.RESOURCE_KEY, parameters, xes_constants.DEFAULT_RESOURCE_KEY) case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) start_timestamp_key = exec_utils.get_param_value( Parameters.START_TIMESTAMP_KEY, parameters, None) if start_timestamp_key is None: df = __insert_start_from_previous_event(df, parameters=parameters) start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY df = df[[ timestamp_key, resource_key, case_id_key, activity_key, start_timestamp_key ]] df = df[df[resource_key] == r] df = df[df[activity_key] == a] df = df[df[timestamp_key] >= t1] df = df[df[timestamp_key] < t2] return float((df[timestamp_key] - df[start_timestamp_key]).astype('timedelta64[s]').mean())
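# Self-contained sketch of the duration averaging above: the mean of
# (completion - start) in seconds over the selected events. Toy frame only, with
# example column names.
def _sketch_mean_duration_seconds():
    import pandas as pd

    df = pd.DataFrame({
        "start": pd.to_datetime(["2021-01-01 10:00", "2021-01-01 11:00"]),
        "complete": pd.to_datetime(["2021-01-01 10:30", "2021-01-01 11:10"]),
    })
    durations = (df["complete"] - df["start"]).dt.total_seconds()
    return durations.mean()  # 1200.0 seconds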
def get_map_from_log_and_net(log, net, initial_marking, final_marking, force_distribution=None, parameters=None): """ Get transition stochastic distribution map given the log and the Petri net Parameters ----------- log Event log net Petri net initial_marking Initial marking of the Petri net final_marking Final marking of the Petri net force_distribution If provided, distribution to force usage (e.g. EXPONENTIAL) parameters Parameters of the algorithm, including: Parameters.ACTIVITY_KEY -> activity name Parameters.TIMESTAMP_KEY -> timestamp key Returns ----------- stochastic_map Map that to each transition associates a random variable """ stochastic_map = {} if parameters is None: parameters = {} token_replay_variant = exec_utils.get_param_value(Parameters.TOKEN_REPLAY_VARIANT, parameters, executor.Variants.TOKEN_REPLAY) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) parameters_variants = {constants.PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key} variants_idx = variants_module.get_variants_from_log_trace_idx(log, parameters=parameters_variants) variants = variants_module.convert_variants_trace_idx_to_trace_obj(log, variants_idx) parameters_tr = {token_replay.Parameters.ACTIVITY_KEY: activity_key, token_replay.Parameters.VARIANTS: variants} # do the replay aligned_traces = executor.apply(log, net, initial_marking, final_marking, variant=token_replay_variant, parameters=parameters_tr) element_statistics = performance_map.single_element_statistics(log, net, initial_marking, aligned_traces, variants_idx, activity_key=activity_key, timestamp_key=timestamp_key, parameters={"business_hours": True}) for el in element_statistics: if type(el) is PetriNet.Transition and "performance" in element_statistics[el]: values = element_statistics[el]["performance"] rand = RandomVariable() rand.calculate_parameters(values, force_distribution=force_distribution) no_of_times_enabled = element_statistics[el]['no_of_times_enabled'] no_of_times_activated = element_statistics[el]['no_of_times_activated'] if no_of_times_enabled > 0: rand.set_weight(float(no_of_times_activated) / float(no_of_times_enabled)) else: rand.set_weight(0.0) stochastic_map[el] = rand return stochastic_map
def apply(c, Aub, bub, Aeq, beq, parameters=None): """ Gets the overall solution of the problem Parameters ------------ c c parameter of the algorithm Aub A_ub parameter of the algorithm bub b_ub parameter of the algorithm Aeq A_eq parameter of the algorithm beq b_eq parameter of the algorithm parameters Possible parameters of the algorithm Returns ------------- sol Solution of the LP problem by the given algorithm """ if parameters is None: parameters = {} require_ilp = exec_utils.get_param_value(Parameters.REQUIRE_ILP, parameters, False) solver = pywraplp.Solver('LinearProgrammingExample', pywraplp.Solver.GLOP_LINEAR_PROGRAMMING) solver.Clear() solver.SuppressOutput() x_list = [] for i in range(Aub.shape[1]): if require_ilp: x = solver.IntVar(-solver.infinity(), solver.infinity(), "x_" + str(i)) else: x = solver.NumVar(-solver.infinity(), solver.infinity(), "x_" + str(i)) x_list.append(x) objective = solver.Objective() for j in range(len(c)): if abs(c[j]) > MIN_THRESHOLD: objective.SetCoefficient(x_list[j], c[j]) for i in range(Aub.shape[0]): ok = False for j in range(Aub.shape[1]): if abs(Aub[i, j]) > MIN_THRESHOLD: ok = True break if ok: constraint = solver.Constraint(-solver.infinity(), bub[i]) for j in range(Aub.shape[1]): if abs(Aub[i, j]) > MIN_THRESHOLD: constraint.SetCoefficient(x_list[j], Aub[i, j]) if Aeq is not None and beq is not None: for i in range(Aeq.shape[0]): ok = False for j in range(Aeq.shape[1]): if abs(Aeq[i, j]) > MIN_THRESHOLD: ok = True break if ok: constraint = solver.Constraint(beq[i], beq[i]) for j in range(Aeq.shape[1]): if abs(Aeq[i, j]) > MIN_THRESHOLD: constraint.SetCoefficient(x_list[j], Aeq[i, j]) objective.SetMinimization() status = solver.Solve() if status == 0: sol_value = 0.0 for j in range(len(c)): if abs(c[j]) > MIN_THRESHOLD: sol_value = sol_value + c[j] * x_list[j].solution_value() points = [x.solution_value() for x in x_list] else: return None return {"c": c, "x_list": x_list, "sol_value": sol_value, "points": points}
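# Hedged usage sketch for the solver wrapper above: minimise x0 + x1 subject to
# -x0 <= -1 and -x1 <= -2 (i.e. x0 >= 1 and x1 >= 2), with no equality constraints.
# It assumes numpy and or-tools are available in the environment; the expected
# optimum for this toy problem is 3.0.
def _sketch_lp_call():
    import numpy as np

    c = [1.0, 1.0]
    Aub = np.array([[-1.0, 0.0], [0.0, -1.0]])
    bub = np.array([-1.0, -2.0])
    result = apply(c, Aub, bub, None, None)
    return result["sol_value"] if result is not None else None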
def apply(dataframe, list_activities, sample_size, parameters): """ Finds the performance spectrum provided a dataframe and a list of activities Parameters ------------- dataframe Dataframe list_activities List of activities interesting for the performance spectrum (at least two) sample_size Size of the sample parameters Parameters of the algorithm, including: - Parameters.ACTIVITY_KEY - Parameters.TIMESTAMP_KEY - Parameters.CASE_ID_KEY Returns ------------- points Points of the performance spectrum """ if parameters is None: parameters = {} import pandas as pd import numpy as np case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY) dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]] dataframe = dataframe[dataframe[activity_key].isin(list_activities)] dataframe = pandas_utils.insert_index(dataframe, constants.DEFAULT_EVENT_INDEX_KEY) dataframe = dataframe.sort_values( [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY]) dataframe[timestamp_key] = dataframe[timestamp_key].astype( np.int64) / 10**9 list_replicas = [] activity_names = [] filt_col_names = [] for i in range(len(list_activities)): if i > 0: dataframe = dataframe.shift(-1) activity_names.append("+'@@'+") ren = {x: x + "_" + str(i) for x in dataframe.columns} list_replicas.append(dataframe.rename(columns=ren)) filt_col_names.append(timestamp_key + "_" + str(i)) activity_names.append("dataframe[activity_key+'_" + str(i) + "']") dataframe = pd.concat(list_replicas, axis=1) for i in range(len(list_activities) - 1): dataframe = dataframe[dataframe[case_id_glue + "_" + str(i)] == dataframe[case_id_glue + "_" + str(i + 1)]] dataframe["@@merged_activity"] = eval("".join(activity_names)) desidered_act = "@@".join(list_activities) dataframe = dataframe[dataframe["@@merged_activity"] == desidered_act] dataframe = dataframe[filt_col_names] if len(dataframe) > sample_size: dataframe = dataframe.sample(n=sample_size) points = pandas_utils.to_dict_records(dataframe) points = [[p[tk] for tk in filt_col_names] for p in points] points = sorted(points, key=lambda x: x[0]) return points
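A hedged example of the dataframe-based performance spectrum above; the toy dataframe and its values are made up, while the column names are the pm4py defaults used by the function.

import pandas as pd

# toy log: two cases, each executing A and then B
df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2", "c2"],
    "concept:name": ["A", "B", "A", "B"],
    "time:timestamp": pd.to_datetime(["2021-01-01 08:00", "2021-01-01 09:00",
                                      "2021-01-01 08:30", "2021-01-01 10:00"]),
})

# each point is the list of epoch timestamps at which a case crossed A and then B
points = apply(df, ["A", "B"], sample_size=1000, parameters={})
print(points)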
def apply_trace(trace, list_nets, parameters=None): """ Align a trace against a decomposition Parameters -------------- trace Trace list_nets List of Petri nets (decomposed) parameters Parameters of the algorithm Returns -------------- alignment Alignment of the trace """ if parameters is None: parameters = {} max_align_time_trace = exec_utils.get_param_value( Parameters.PARAM_MAX_ALIGN_TIME_TRACE, parameters, sys.maxsize) threshold_border_agreement = exec_utils.get_param_value( Parameters.PARAM_THRESHOLD_BORDER_AGREEMENT, parameters, 100000000) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY) icache = exec_utils.get_param_value(Parameters.ICACHE, parameters, dict()) mcache = exec_utils.get_param_value(Parameters.MCACHE, parameters, dict()) cons_nets = copy(list_nets) acache = get_acache(cons_nets) cons_nets_result = [] cons_nets_alres = [] cons_nets_costs = [] max_val_alres = 0 start_time = time.time() i = 0 while i < len(cons_nets): this_time = time.time() if this_time - start_time > max_align_time_trace: # the alignment did not terminate in the provided time return None net, im, fm = cons_nets[i] proj = Trace([x for x in trace if x[activity_key] in net.lvis_labels]) if len(proj) > 0: acti = tuple(x[activity_key] for x in proj) tup = (cons_nets[i], acti) if tup not in icache: al, cf = align(proj, net, im, fm, parameters=parameters) alres = get_alres(al) icache[tup] = (al, cf, alres) al, cf, alres = icache[tup] cons_nets_result.append(al) cons_nets_alres.append(alres) cons_nets_costs.append(cf) if this_time - start_time > max_align_time_trace: # the alignment did not terminate in the provided time return None max_val_alres = max( max_val_alres, max(z for y in alres.values() for z in y) if alres else 0) border_disagreements = 0 if max_val_alres > 0: comp_to_merge = set() for act in [ x[activity_key] for x in trace if x[activity_key] in net.lvis_labels ]: for ind in acache[act]: if ind >= i: break if cons_nets_alres[ind] is None or cons_nets_alres[i] is None: # the alignment did not terminate in the provided time return None if cons_nets_alres[ind][act] != cons_nets_alres[i][act]: for ind2 in acache[act]: comp_to_merge.add(ind2) if comp_to_merge: comp_to_merge = sorted(list(comp_to_merge), reverse=True) border_disagreements += len(comp_to_merge) # if the number of border disagreements exceeds the specified threshold # then stop iterating on the trace if border_disagreements > threshold_border_agreement: return None comp_to_merge_ids = tuple( list(cons_nets[j][0].t_tuple for j in comp_to_merge)) if comp_to_merge_ids not in mcache: mcache[comp_to_merge_ids] = decomp_utils.merge_sublist_nets( [cons_nets[zz] for zz in comp_to_merge]) new_comp = mcache[comp_to_merge_ids] cons_nets.append(new_comp) j = 0 while j < len(comp_to_merge): z = comp_to_merge[j] if z < i: i = i - 1 if z <= i: del cons_nets_result[z] del cons_nets_alres[z] del cons_nets_costs[z] del cons_nets[z] j = j + 1 acache = get_acache(cons_nets) continue else: cons_nets_result.append(None) cons_nets_alres.append(None) cons_nets_costs.append(None) i = i + 1 if this_time - start_time > max_align_time_trace: # the alignment did not terminate in the provided time return None alignment = recompose_alignment( cons_nets, cons_nets_result, ) overall_cost_dict = {} for cf in cons_nets_costs: if cf is not None: for el in cf: overall_cost_dict[el] = cf[el] cost = 0 for el in alignment: cost = cost + overall_cost_dict[el] alignment = [x[1] for x in alignment] if this_time - start_time > max_align_time_trace: # the alignment did not terminate in the provided time return None res = {"cost": cost, "alignment": alignment} best_worst_cost = exec_utils.get_param_value(Parameters.BEST_WORST_COST, parameters, None) if best_worst_cost is not None and len(trace) > 0: cost1 = cost // utils.STD_MODEL_LOG_MOVE_COST fitness = 1.0 - cost1 / (best_worst_cost + len(trace)) res["fitness"] = fitness return res
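A hedged sketch of how apply_trace is typically driven: a previously discovered accepting Petri net is decomposed and every trace of the log is aligned against the list of fragments. The decompose import path is an assumption and differs between pm4py versions.

# hedged sketch; the decomposition import path is an assumption
from pm4py.objects.petri_net.utils.decomposition import decompose

list_nets = decompose(net, im, fm)  # net, im, fm: an accepting Petri net fitting the log
for trace in log:
    res = apply_trace(trace, list_nets, parameters={})
    if res is None:
        print("alignment aborted (time limit or too many border disagreements)")
    else:
        print(res["cost"], res["alignment"])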
def diagnose_from_trans_fitness(log, trans_fitness, parameters=None): """ Perform root cause analysis starting from transition fitness knowledge Parameters ------------- log Trace log object trans_fitness Transition fitness object parameters Possible parameters of the algorithm, including: string_attributes -> List of string event attributes to consider in building the decision tree numeric_attributes -> List of numeric event attributes to consider in building the decision tree Returns ----------- diagnostics For each problematic transition: - a decision tree comparing fit and unfit executions - feature names - classes """ from sklearn import tree if parameters is None: parameters = {} diagnostics = {} string_attributes = exec_utils.get_param_value( Parameters.STRING_ATTRIBUTES, parameters, []) numeric_attributes = exec_utils.get_param_value( Parameters.NUMERIC_ATTRIBUTES, parameters, []) enable_multiplier = exec_utils.get_param_value( Parameters.ENABLE_MULTIPLIER, parameters, False) for trans in trans_fitness: if len(trans_fitness[trans]["underfed_traces"]) > 0: fit_cases_repr = [] underfed_cases_repr = [] for trace in log: if trace in trans_fitness[trans]["underfed_traces"]: underfed_cases_repr.append( trans_fitness[trans]["underfed_traces"][trace][0]) elif trace in trans_fitness[trans]["fit_traces"]: fit_cases_repr.append( trans_fitness[trans]["fit_traces"][trace][0]) if fit_cases_repr and underfed_cases_repr: data, feature_names = form_representation_from_dictio_couple( fit_cases_repr, underfed_cases_repr, string_attributes, numeric_attributes, enable_multiplier=enable_multiplier) target = [] classes = [] if enable_multiplier: multiplier_first = int( max( float(len(underfed_cases_repr)) / float(len(fit_cases_repr)), 1)) multiplier_second = int( max( float(len(fit_cases_repr)) / float(len(underfed_cases_repr)), 1)) else: multiplier_first = 1 multiplier_second = 1 for j in range(multiplier_first): for i in range(len(fit_cases_repr)): target.append(0) classes.append("fit") for j in range(multiplier_second): for i in range(len(underfed_cases_repr)): target.append(1) classes.append("underfed") target = np.asarray(target) clf = tree.DecisionTreeClassifier(max_depth=7) clf.fit(data, target) diagn_dict = { "clf": clf, "data": data, "feature_names": feature_names, "target": target, "classes": classes } diagnostics[trans] = diagn_dict return diagnostics
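A hedged usage sketch for the diagnosis above; trans_fitness is assumed to come from token-based replay executed with per-transition fitness diagnostics enabled, and the attribute names are hypothetical.

# hedged sketch; "org:resource" and "amount" are hypothetical event attributes
diagnostics = diagnose_from_trans_fitness(
    log, trans_fitness,
    parameters={"string_attributes": ["org:resource"],
                "numeric_attributes": ["amount"]})

for trans, diag in diagnostics.items():
    # diag["clf"] is the fitted decision tree separating fit from underfed executions
    print(trans, diag["feature_names"], diag["classes"][:3])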
def apply(c, Aub, bub, Aeq, beq, parameters=None): """ Gets the overall solution of the problem Parameters ------------ c c parameter of the algorithm Aub A_ub parameter of the algorithm bub b_ub parameter of the algorithm Aeq A_eq parameter of the algorithm beq b_eq parameter of the algorithm parameters Possible parameters of the algorithm Returns ------------- sol Solution of the LP problem by the given algorithm """ if parameters is None: parameters = {} require_ilp = exec_utils.get_param_value(Parameters.REQUIRE_ILP, parameters, False) prob = LpProblem("", LpMinimize) x_list = [] for i in range(Aub.shape[1]): if require_ilp: x_list.append( LpVariable("x_" + get_terminal_part_name_num(i), cat='Integer')) else: x_list.append(LpVariable("x_" + get_terminal_part_name_num(i))) eval_str = "" expr_count = 0 for j in range(len(c)): if abs(c[j]) > MIN_THRESHOLD: if expr_count > 0: eval_str = eval_str + " + " eval_str = eval_str + str(c[j]) + "*x_list[" + str(j) + "]" expr_count = expr_count + 1 eval_str = eval_str + ", \"objective\"" prob += eval(eval_str) for i in range(Aub.shape[0]): expr_count = 0 eval_str = 0 eval_str = "" for j in range(Aub.shape[1]): if abs(Aub[i, j]) > MIN_THRESHOLD: if expr_count > 0: eval_str = eval_str + " + " eval_str = eval_str + str(Aub[i, j]) + "*x_list[" + str(j) + "]" expr_count = expr_count + 1 if eval_str: eval_str = eval_str + "<=" + str( bub[i]) + ", \"vinc_" + get_terminal_part_name_num(i) + "\"" prob += eval(eval_str) if Aeq is not None and beq is not None: for i in range(Aeq.shape[0]): expr_count = 0 eval_str = 0 eval_str = "" for j in range(Aeq.shape[1]): if abs(Aeq[i, j]) > MIN_THRESHOLD: if expr_count > 0: eval_str = eval_str + " + " eval_str = eval_str + str( Aeq[i, j]) + "*x_list[" + str(j) + "]" expr_count = expr_count + 1 if eval_str: eval_str = eval_str + "==" + str( beq[i]) + ", \"vinceq_" + get_terminal_part_name_num( i + 1 + Aub.shape[0]) + "\"" prob += eval(eval_str) filename = tempfile.NamedTemporaryFile(suffix='.lp').name prob.writeLP(filename) solver(prob) return prob
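The same toy LP as for the or-tools variant, here solved through the PuLP-based apply above; the returned object is the PuLP problem, so the solution is read from its variables.

import numpy as np

# minimize x0 + 2*x1 with x0 >= 1 and x1 >= 2, expressed as A_ub x <= b_ub
c = [1.0, 2.0]
Aub = np.array([[-1.0, 0.0], [0.0, -1.0]])
bub = np.array([-1.0, -2.0])

prob = apply(c, Aub, bub, None, None, parameters={})
print({v.name: v.varValue for v in prob.variables()})  # optimal values 1.0 and 2.0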
def diagnose_from_notexisting_activities(log, notexisting_activities_in_model, parameters=None): """ Perform root cause analysis related to activities that are not present in the model Parameters ------------- log Trace log object notexisting_activities_in_model Not existing activities in the model parameters Possible parameters of the algorithm, including: string_attributes -> List of string event attributes to consider in building the decision tree numeric_attributes -> List of numeric event attributes to consider in building the decision tree Returns ----------- diagnostics For each problematic transition: - a decision tree comparing fit and unfit executions - feature names - classes """ from sklearn import tree if parameters is None: parameters = {} diagnostics = {} string_attributes = exec_utils.get_param_value( Parameters.STRING_ATTRIBUTES, parameters, []) numeric_attributes = exec_utils.get_param_value( Parameters.NUMERIC_ATTRIBUTES, parameters, []) enable_multiplier = exec_utils.get_param_value( Parameters.ENABLE_MULTIPLIER, parameters, False) parameters_filtering = deepcopy(parameters) parameters_filtering["positive"] = False values = list(notexisting_activities_in_model.keys()) filtered_log = basic_filter.filter_log_traces_attr( log, values, parameters=parameters_filtering) for act in notexisting_activities_in_model: fit_cases_repr = [] containing_cases_repr = [] for trace in log: if trace in notexisting_activities_in_model[act]: containing_cases_repr.append( notexisting_activities_in_model[act][trace]) elif trace in filtered_log: fit_cases_repr.append(dict(trace[-1])) if fit_cases_repr and containing_cases_repr: data, feature_names = form_representation_from_dictio_couple( fit_cases_repr, containing_cases_repr, string_attributes, numeric_attributes, enable_multiplier=enable_multiplier) target = [] classes = [] if enable_multiplier: multiplier_first = int( max( float(len(containing_cases_repr)) / float(len(fit_cases_repr)), 1)) multiplier_second = int( max( float(len(fit_cases_repr)) / float(len(containing_cases_repr)), 1)) else: multiplier_first = 1 multiplier_second = 1 for j in range(multiplier_first): for i in range(len(fit_cases_repr)): target.append(0) classes.append("fit") for j in range(multiplier_second): for i in range(len(containing_cases_repr)): target.append(1) classes.append("containing") target = np.asarray(target) clf = tree.DecisionTreeClassifier(max_depth=7) clf.fit(data, target) diagn_dict = { "clf": clf, "data": data, "feature_names": feature_names, "target": target, "classes": classes } diagnostics[act] = diagn_dict return diagnostics
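A hedged sketch mirroring the previous diagnosis function; notexisting_activities_in_model is assumed to come from token-based replay diagnostics and maps each activity missing from the model to the traces that contain it.

# hedged sketch; "org:resource" is a hypothetical event attribute
diagnostics = diagnose_from_notexisting_activities(
    log, notexisting_activities_in_model,
    parameters={"string_attributes": ["org:resource"], "numeric_attributes": []})

for act, diag in diagnostics.items():
    # "fit" vs "containing" are the class labels used to train diag["clf"]
    print(act, diag["feature_names"][:5])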
def get_cases_description(log, parameters=None): """ Get a description of traces present in the log Parameters ----------- log Log parameters Parameters of the algorithm, including: Parameters.CASE_ID_KEY -> Trace attribute in which the case ID is contained Parameters.TIMESTAMP_KEY -> Column that identifies the timestamp Parameters.ENABLE_SORT -> Enable sorting of traces Parameters.SORT_BY_INDEX -> Sort the traces using this index: 0 -> case ID 1 -> start time 2 -> end time 3 -> difference Parameters.SORT_ASCENDING -> Set sort direction (boolean; if true then the sort direction is ascending, otherwise descending) Parameters.MAX_RET_CASES -> Set the maximum number of returned traces Returns ----------- ret Dictionary of traces associated to their start timestamp, their end timestamp and their duration """ if parameters is None: parameters = {} case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, DEFAULT_TRACEID_KEY) timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY) enable_sort = exec_utils.get_param_value(Parameters.ENABLE_SORT, parameters, True) sort_by_index = exec_utils.get_param_value(Parameters.SORT_BY_INDEX, parameters, 0) sort_ascending = exec_utils.get_param_value(Parameters.SORT_ASCENDING, parameters, True) max_ret_cases = exec_utils.get_param_value(Parameters.MAX_RET_CASES, parameters, None) business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False) worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17]) weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7]) statistics_list = [] for index, trace in enumerate(log): if trace: ci = trace.attributes[case_id_key] if case_id_key in trace.attributes else "EMPTY" + str(index) st = trace[0][timestamp_key] et = trace[-1][timestamp_key] if business_hours: bh = BusinessHours(st.replace(tzinfo=None), et.replace(tzinfo=None), worktiming=worktiming, weekends=weekends) diff = bh.getseconds() else: diff = et.timestamp() - st.timestamp() st = st.timestamp() et = et.timestamp() statistics_list.append([ci, st, et, diff]) if enable_sort: statistics_list = sorted(statistics_list, key=lambda x: x[sort_by_index], reverse=not sort_ascending) if max_ret_cases is not None: statistics_list = statistics_list[:min(len(statistics_list), max_ret_cases)] statistics_dict = {} for el in statistics_list: statistics_dict[str(el[0])] = { "startTime": el[1], "endTime": el[2], "caseDuration": el[3] } return statistics_dict
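A small hedged example of the case statistics above, assuming an event log has already been loaded (the file name is hypothetical).

import pm4py

log = pm4py.read_xes("running-example.xes")  # hypothetical file name
cases = get_cases_description(log, parameters={})
for case_id, stats in cases.items():
    print(case_id, stats["startTime"], stats["endTime"], stats["caseDuration"])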
def apply_log(log, petri_net, initial_marking, final_marking, parameters=None, variant=DEFAULT_VARIANT): """ apply alignments to a log Parameters ----------- log object of the form :class:`pm4py.log.log.EventLog` event log petri_net :class:`pm4py.objects.petri.petrinet.PetriNet` the model to use for the alignment initial_marking :class:`pm4py.objects.petri.petrinet.Marking` initial marking of the net final_marking :class:`pm4py.objects.petri.petrinet.Marking` final marking of the net variant selected variant of the algorithm, possible values: {\'Variants.VERSION_STATE_EQUATION_A_STAR, Variants.VERSION_DIJKSTRA_NO_HEURISTICS \'} parameters :class:`dict` parameters of the algorithm, Returns ----------- alignment :class:`list` of :class:`dict` with keys **alignment**, **cost**, **visited_states**, **queued_states** and **traversed_arcs** The alignment is a sequence of labels of the form (a,t), (a,>>), or (>>,t) representing synchronous/log/model-moves. """ if parameters is None: parameters = dict() if not check_soundness.check_relaxed_soundness_net_in_fin_marking(petri_net, initial_marking, final_marking): raise Exception("trying to apply alignments on a Petri net that is not a relaxed sound net!!") start_time = time.time() max_align_time = exec_utils.get_param_value(Parameters.PARAM_MAX_ALIGN_TIME, parameters, sys.maxsize) max_align_time_case = exec_utils.get_param_value(Parameters.PARAM_MAX_ALIGN_TIME_TRACE, parameters, sys.maxsize) parameters_best_worst = copy(parameters) best_worst_cost = exec_utils.get_variant(variant).get_best_worst_cost(petri_net, initial_marking, final_marking, parameters=parameters_best_worst) variants_idxs = exec_utils.get_param_value(Parameters.VARIANTS_IDX, parameters, None) if variants_idxs is None: variants_idxs = variants_module.get_variants_from_log_trace_idx(log, parameters=parameters) one_tr_per_var = [] variants_list = [] for index_variant, var in enumerate(variants_idxs): variants_list.append(var) for var in variants_list: one_tr_per_var.append(log[variants_idxs[var][0]]) all_alignments = [] for trace in one_tr_per_var: this_max_align_time = min(max_align_time_case, (max_align_time - (time.time() - start_time)) * 0.5) parameters[Parameters.PARAM_MAX_ALIGN_TIME_TRACE] = this_max_align_time all_alignments.append(apply_trace(trace, petri_net, initial_marking, final_marking, parameters=copy(parameters), variant=variant)) al_idx = {} for index_variant, variant in enumerate(variants_idxs): for trace_idx in variants_idxs[variant]: al_idx[trace_idx] = all_alignments[index_variant] alignments = [] for i in range(len(log)): alignments.append(al_idx[i]) # assign fitness to traces for index, align in enumerate(alignments): if align is not None: unfitness_upper_part = align['cost'] // align_utils.STD_MODEL_LOG_MOVE_COST if unfitness_upper_part == 0: align['fitness'] = 1 elif (len(log[index]) + best_worst_cost) > 0: align['fitness'] = 1 - ( (align['cost'] // align_utils.STD_MODEL_LOG_MOVE_COST) / (len(log[index]) + best_worst_cost)) else: align['fitness'] = 0 return alignments
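A hedged end-to-end sketch for the log-level alignment above; the discovery call and the file name are assumptions, and the model must be relaxed sound or the function raises an exception.

import pm4py

log = pm4py.read_xes("running-example.xes")            # hypothetical file name
net, im, fm = pm4py.discover_petri_net_inductive(log)  # sound by construction

alignments = apply_log(log, net, im, fm, parameters={})
for align in alignments:
    if align is not None:  # None can occur when the per-trace time budget is exhausted
        print(align["fitness"], align["cost"], align["alignment"])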
def A_eventually_B_eventually_C(df0, A, B, C, parameters=None): """ Applies the A eventually B eventually C rule Parameters ------------ df0 Dataframe A A Attribute value B B Attribute value C C Attribute value parameters Parameters of the algorithm, including the attribute key and the positive parameter: - If True, returns all the cases containing A, B and C and in which A was eventually followed by B and B was eventually followed by C - If False, returns all the cases not containing A or B or C, or in which an instance of A was not eventually followed by an instance of B or an instance of B was not eventually followed by C Returns ------------ filtered_df Filtered dataframe """ if parameters is None: parameters = {} case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME) attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY) positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True) enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False) timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, []) colset = [case_id_glue, attribute_key] if enable_timestamp: colset.append(timestamp_key) df = df0.copy() df = df[colset] df = pandas_utils.insert_index(df) df_A = df[df[attribute_key] == A].copy() df_B = df[df[attribute_key] == B].copy() df_C = df[df[attribute_key] == C].copy() df_B["@@conceptname"] = df_B[case_id_glue] df_B = df_B.groupby(case_id_glue).last().set_index("@@conceptname") df_C["@@conceptname"] = df_C[case_id_glue] df_C = df_C.groupby(case_id_glue).last().set_index("@@conceptname") df_join = df_A.join(df_B, on=case_id_glue, rsuffix="_2").dropna() df_join["@@diffindex"] = df_join[constants.DEFAULT_INDEX_KEY+"_2"] - df_join[constants.DEFAULT_INDEX_KEY] df_join = df_join[df_join["@@diffindex"] > 0] df_join = df_join.join(df_C, on=case_id_glue, rsuffix="_3").dropna() df_join["@@diffindex2"] = df_join[constants.DEFAULT_INDEX_KEY+"_3"] - df_join[constants.DEFAULT_INDEX_KEY+"_2"] df_join = df_join[df_join["@@diffindex2"] > 0] if enable_timestamp: df_join["@@difftimestamp"] = (df_join[timestamp_key + "_2"] - df_join[timestamp_key]).astype('timedelta64[s]') df_join["@@difftimestamp2"] = (df_join[timestamp_key + "_3"] - df_join[timestamp_key + "_2"]).astype( 'timedelta64[s]') if timestamp_diff_boundaries: df_join = df_join[df_join["@@difftimestamp"] >= timestamp_diff_boundaries[0][0]] df_join = df_join[df_join["@@difftimestamp"] <= timestamp_diff_boundaries[0][1]] df_join = df_join[df_join["@@difftimestamp2"] >= timestamp_diff_boundaries[1][0]] df_join = df_join[df_join["@@difftimestamp2"] <= timestamp_diff_boundaries[1][1]] i1 = df.set_index(case_id_glue).index i2 = df_join.set_index(case_id_glue).index if positive: return df0[i1.isin(i2)] else: return df0[~i1.isin(i2)]
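A hedged example of the eventually-follows filter above on a made-up dataframe; only case c1 contains A eventually followed by B eventually followed by C, so it is the only case retained with the default positive=True.

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c1", "c2", "c2"],
    "concept:name": ["A", "B", "C", "A", "C"],
    "time:timestamp": pd.to_datetime(["2021-01-01 08:00", "2021-01-01 09:00",
                                      "2021-01-01 10:00", "2021-01-02 08:00",
                                      "2021-01-02 09:00"]),
})

filtered = A_eventually_B_eventually_C(df, "A", "B", "C")
print(filtered["case:concept:name"].unique())  # expected: ['c1']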
def apply_actlist(trace, model, parameters=None): """ Apply log-skeleton based conformance checking given the list of activities of a trace and a log-skeleton model Parameters -------------- trace List of activities of a trace model Log-skeleton model parameters Parameters of the algorithm, including: - the activity key (pm4py:param:activity_key) - the list of considered constraints (considered_constraints) among: equivalence, always_after, always_before, never_together, directly_follows, activ_freq Returns -------------- aligned_trace Containing: - is_fit => boolean that tells if the trace is perfectly fit according to the model - dev_fitness => deviation based fitness (between 0 and 1; the more the trace is near to 1 the more fit is) - deviations => list of deviations in the model """ if parameters is None: parameters = {} consid_constraints = exec_utils.get_param_value( Parameters.CONSIDERED_CONSTRAINTS, parameters, Parameters.DEFAULT_CONSIDERED_CONSTRAINTS.value) trace_info = trace_skel.get_trace_info(trace) ret = {} ret[Outputs.DEVIATIONS.value] = [] dev_total = 0 conf_total = 0 default_considered_constraints = Parameters.DEFAULT_CONSIDERED_CONSTRAINTS.value i = 0 while i < len(default_considered_constraints): if default_considered_constraints[i] in consid_constraints: if default_considered_constraints[ i] == DiscoveryOutputs.ACTIV_FREQ.value: this_constraints = { x: y for x, y in model[ default_considered_constraints[i]].items() } conf_total += len( list(act for act in trace_info[i] if act in this_constraints)) + len( list(act for act in trace_info[i] if act not in this_constraints)) + len( list(act for act in this_constraints if min(this_constraints[act]) > 0 and not act in trace)) for act in trace_info[i]: if act in this_constraints: if trace_info[i][act] not in this_constraints[act]: dev_total += 1 ret[Outputs.DEVIATIONS.value].append( (default_considered_constraints[i], (act, trace_info[i][act]))) else: dev_total += 1 ret[Outputs.DEVIATIONS.value].append( (default_considered_constraints[i], (act, 0))) for act in this_constraints: if min(this_constraints[act]) > 0 and not act in trace: dev_total += 1 ret[Outputs.DEVIATIONS.value].append( (default_considered_constraints[i], (act, 0))) elif default_considered_constraints[ i] == DiscoveryOutputs.NEVER_TOGETHER.value: this_constraints = { x for x in model[default_considered_constraints[i]] if x[0] in trace } conf_total += len(this_constraints) setinte = this_constraints.intersection(trace_info[i]) dev_total += len(setinte) if len(setinte) > 0: ret[Outputs.DEVIATIONS.value].append( (default_considered_constraints[i], tuple(setinte))) else: this_constraints = { x for x in model[default_considered_constraints[i]] if x[0] in trace } conf_total += len(this_constraints) setdiff = this_constraints.difference(trace_info[i]) dev_total += len(setdiff) if len(setdiff) > 0: ret[Outputs.DEVIATIONS.value].append( (default_considered_constraints[i], tuple(setdiff))) i = i + 1 ret[Outputs.NO_DEV_TOTAL.value] = dev_total ret[Outputs.NO_CONSTR_TOTAL.value] = conf_total ret[Outputs.DEV_FITNESS.value] = 1.0 - float(dev_total) / float( conf_total) if conf_total > 0 else 1.0 ret[Outputs.DEVIATIONS.value] = sorted(ret[Outputs.DEVIATIONS.value], key=lambda x: (x[0], x[1])) ret[Outputs.IS_FIT.value] = len(ret[Outputs.DEVIATIONS.value]) == 0 return ret
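A hedged sketch of checking a single trace's activity list against a log skeleton; the discovery import path and the noise threshold key are assumptions that may differ between pm4py versions.

# hedged sketch; import path and parameter key are assumptions
from pm4py.algo.discovery.log_skeleton import algorithm as lsk_discovery

model = lsk_discovery.apply(log, parameters={"noise_threshold": 0.05})
trace_activities = [event["concept:name"] for event in log[0]]

result = apply_actlist(trace_activities, model)
print(result["is_fit"], result["dev_fitness"], result["deviations"])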