def sublog2varlist(log, freq_thres, num):
    '''
    Extract the variants of a sublog, keeping the frequent ones plus the
    top-ranked ones.

    Variants are ranked by descending case count. A variant is kept when its
    count reaches ``freq_thres``; a variant below the threshold is still kept
    when its 0-based rank is lower than ``num``.

    :param log: sublog containing the selected case attribute value
    :param freq_thres: (int) minimum count for a variant to be kept
    :param num: (int) rank below which a variant is kept even if its count
        is under ``freq_thres``
    :return: list with one entry per kept variant, each obtained through
        ``variants_util.get_activities_from_variant``
    '''
    ranked = sorted(
        case_statistics.get_variant_statistics(log),
        key=lambda entry: entry['count'],
        reverse=True,
    )

    frequent = []
    kept_by_rank = []
    for rank, entry in enumerate(ranked):
        if entry['count'] >= freq_thres:
            frequent.append(entry['variant'])
        elif rank < num:
            kept_by_rank.append(entry['variant'])

    # frequent variants come first, followed by those retained only by rank
    return [
        variants_util.get_activities_from_variant(variant)
        for variant in frequent + kept_by_rank
    ]
def get_language(log, parameters=None):
    """
    Gets the stochastic language of the log (from the variants)

    Parameters
    --------------
    log
        Event log
    parameters
        Parameters, forwarded to get_variants

    Returns
    --------------
    dictio
        Dictionary associating each variant (as returned by
        variants_util.get_activities_from_variant) to its number of traces
        divided by the total number of traces; the values sum to 1
    """
    occurrences = {}
    for variant, traces in get_variants(log, parameters=parameters).items():
        occurrences[variants_util.get_activities_from_variant(variant)] = len(traces)

    total = sum(occurrences.values())
    return {variant: count / total for variant, count in occurrences.items()}
def sublog_percent2actlist(log, upper_percent, parameters=None):
    '''
    Select the variants whose cumulative case share falls inside a band.

    Variants are ranked by descending count and the cumulative sum of the
    counts is normalised by its last value (so it ends at 1). The selected
    rows start after the variants whose cumulative share is <= the lower
    bound and stop after those whose cumulative share is <= ``upper_percent``.

    :param log: same as sublog2varlist()
    :param upper_percent: upper bound compared against the normalised
        cumulative share
    :param parameters: optional dictionary; Parameters.LOWER_PERCENT gives
        the lower bound (default 0)
    :return: dataframe of the selected variants with their counts, together
        with the corresponding var_list
    '''
    if parameters is None:
        parameters = {}
    lower_percent = exec_utils.get_param_value(Parameters.LOWER_PERCENT,
                                               parameters, 0)

    ranked = sorted(
        case_statistics.get_variant_statistics(log),
        key=lambda entry: entry['count'],
        reverse=True,
    )
    df = pd.DataFrame.from_dict(ranked)

    # normalised cumulative sum of the counts
    cumulative = np.array(df['count']).cumsum()
    share = cumulative / cumulative[-1]

    first_row = len(share[share <= lower_percent])
    last_row = len(share[share <= upper_percent])

    # rows between the two bounds
    df_w_count = df.iloc[first_row:last_row, :]

    str_var_list = [
        variants_util.get_activities_from_variant(variant)
        for variant in df_w_count['variant'].values.tolist()
    ]
    return df_w_count, str_var_list
def apply_tree_variants(variants, parameters=None):
    """
    Apply the IM algorithm to a dictionary of variants obtaining a process tree

    One trace is built per key of the dictionary (the values are not read),
    and the resulting log is passed to apply_tree.

    Parameters
    ----------
    variants
        Variants
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as
            activity name (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters,
                                              xes_constants.DEFAULT_NAME_KEY)

    log = EventLog()
    for variant in list(variants.keys()):
        trace = Trace()
        for activity in variants_util.get_activities_from_variant(variant):
            trace.append(Event({activity_key: activity}))
        log.append(trace)

    return apply_tree(log, parameters=parameters)
def get_language(
    log: EventLog,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Union[Dict[List[str], float], Dict[str, float]]:
    """
    Gets the stochastic language of the log (from the variants)

    Parameters
    --------------
    log
        Event log (converted to an event log object before use)
    parameters
        Parameters, forwarded to the converter and to get_variants

    Returns
    --------------
    dictio
        Dictionary associating each variant (as returned by
        variants_util.get_activities_from_variant) to its number of traces
        divided by the total number of traces; the values sum to 1
    """
    log = log_converter.apply(log,
                              variant=log_converter.Variants.TO_EVENT_LOG,
                              parameters=parameters)

    occurrences = {}
    for variant, traces in get_variants(log, parameters=parameters).items():
        occurrences[variants_util.get_activities_from_variant(variant)] = len(traces)

    total = sum(occurrences.values())
    return {variant: count / total for variant, count in occurrences.items()}
def apply(log, parameters=None):
    """
    Calculates the Working Together metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource

    Returns
    -----------
    tuple
        List containing the metric matrix and the (sorted) resources list.
        Moreover, last boolean indicates that the metric is not directed.
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes.DEFAULT_RESOURCE_KEY)

    # variants are computed on the resource attribute, so each "variant" is a
    # sequence of resources
    parameters_variants = {
        case_statistics.Parameters.ACTIVITY_KEY: resource_key,
        case_statistics.Parameters.ATTRIBUTE_KEY: resource_key,
    }
    variants_occ = {}
    for stat in case_statistics.get_variant_statistics(log, parameters=parameters_variants):
        variants_occ[stat["variant"]] = stat["case:concept:name"]

    variants_resources = list(variants_occ.keys())
    resources = [variants_util.get_activities_from_variant(key) for key in variants_resources]

    flat_list = sorted({res for sequence in resources for res in sequence})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    for rvj, rv in zip(variants_resources, resources):
        # matrix positions of the distinct resources of this variant
        positions = [flat_list.index(res) for res in sorted(set(rv))]
        for offset, res_i in enumerate(positions[:-1]):
            for res_j in positions[offset + 1:]:
                share = float(variants_occ[rvj]) / float(len(log))
                # every unordered pair is written in both directions
                metric_matrix[res_i, res_j] += share
                metric_matrix[res_j, res_i] += share

    return [metric_matrix, flat_list, False]
def get_prefix_matrix_from_variants_list(variants_list, activities, parameters=None):
    """
    Gets a numeric matrix where each row is associated to a different prefix
    of activities happening in the variants of the log, along with the count
    of the particular situation

    Parameters
    -------------
    variants_list
        List of variants contained in the log, along with their count
        (each item is indexed as item[0] = variant, item[1] = count)
    activities
        List of activities in the log
    parameters
        Parameters of the algorithm:
            SKIP_LAST (default: False) -> do not consider the prefix that
            coincides with the whole variant

    Returns
    -------------
    prefix_mat
        Prefix matrix of the log (duplicate rows removed)
    activities
        The list of activities received as input
    """
    if parameters is None:
        parameters = {}
    skip_last = parameters.get(SKIP_LAST, False)

    prefixes = {}
    for entry in variants_list:
        variant = variants_util.get_activities_from_variant(entry[0])
        count = entry[1]
        # when the last activity is skipped, the full variant is never a prefix
        considered = variant[:len(variant) - 1] if skip_last else variant
        prefix = []
        for act in considered:
            prefix.append(act)
            prefix_repr = get_prefix_repr(prefix, activities)
            prefixes[prefix_repr] = prefixes.get(prefix_repr, 0) + count

    # every component of a prefix representation is scaled by the prefix count
    prefix_mat = [
        [value * total for value in prefix_repr]
        for prefix_repr, total in prefixes.items()
    ]
    prefix_mat = np.asmatrix(prefix_mat)
    prefix_mat = np.unique(prefix_mat, axis=0)
    return prefix_mat, activities
def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[Any]:
    """
    Calculates the Working Together metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource

    Returns
    -----------
    tuple
        List containing the metric matrix and the (sorted) resources list.
        Moreover, last boolean indicates that the metric is not directed.
    """
    if parameters is None:
        parameters = {}

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes.DEFAULT_RESOURCE_KEY)

    # variants are computed on the resource attribute, so each "variant" is a
    # sequence of resources
    parameters_variants = {
        variants_filter.Parameters.ACTIVITY_KEY: resource_key,
        variants_filter.Parameters.ATTRIBUTE_KEY: resource_key,
    }
    variants_occ = {}
    for variant, traces in variants_filter.get_variants(log, parameters=parameters_variants).items():
        variants_occ[variant] = len(traces)

    variants_resources = list(variants_occ.keys())
    resources = [variants_util.get_activities_from_variant(key) for key in variants_resources]

    flat_list = sorted({res for sequence in resources for res in sequence})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    for rvj, rv in zip(variants_resources, resources):
        # matrix positions of the distinct resources of this variant
        positions = [flat_list.index(res) for res in sorted(set(rv))]
        for offset, res_i in enumerate(positions[:-1]):
            for res_j in positions[offset + 1:]:
                share = float(variants_occ[rvj]) / float(len(log))
                # every unordered pair is written in both directions
                metric_matrix[res_i, res_j] += share
                metric_matrix[res_j, res_i] += share

    return [metric_matrix, flat_list, False]
def check_is_fitting(*args, activity_key=xes_constants.DEFAULT_NAME_KEY):
    """
    Checks if a trace object is fit against a process model

    Parameters
    -----------------
    trace
        Trace object (trace / variant); first positional argument
    model
        Model (process tree, Petri net, BPMN, ...); all the remaining
        positional arguments
    activity_key
        Activity key (optional)

    Returns
    -----------------
    is_fit
        Boolean value (True if the trace fits; False if the trace does not).
        None is returned when the converted model is neither a process tree
        nor a tuple starting with a Petri net.
    """
    from pm4py.util import variants_util
    from pm4py.convert import convert_to_process_tree, convert_to_petri_net

    trace = args[0]
    model = args[1:]
    try:
        model = convert_to_process_tree(*model)
    except Exception:
        # the model cannot be expressed as a process tree, let's see if at
        # least it can be expressed as a Petri net. Only Exception is caught
        # so that KeyboardInterrupt / SystemExit are not swallowed.
        model = convert_to_petri_net(*model)

    if not isinstance(trace, Trace):
        # a variant was provided: build the corresponding trace
        activities = variants_util.get_activities_from_variant(trace)
        trace = Trace()
        for act in activities:
            trace.append(Event({activity_key: act}))

    if isinstance(model, ProcessTree):
        return __check_is_fit_process_tree(trace, model, activity_key=activity_key)
    elif isinstance(model, tuple) and isinstance(model[0], PetriNet):
        return __check_is_fit_petri_net(trace, model[0], model[1], model[2],
                                        activity_key=activity_key)

    # unsupported model type: same result as the original implicit fall-through
    return None
def get_variants_matrix_from_variants_list(variants_list, activities, parameters=None):
    """
    Gets a numeric matrix where each row is associated to a different set of
    activities happening in the (complete) variants of the log, along with
    the count of the particular situation

    Parameters
    -------------
    variants_list
        List of variants contained in the log, along with their count
        (each item is indexed as item[0] = variant, item[1] = count)
    activities
        List of activities in the log
    parameters
        Parameters of the algorithm:
            keep_unique (default: True) -> remove duplicate rows

    Returns
    -------------
    variants_matrix
        Variants matrix of the log
    activities
        The list of activities received as input
    """
    if parameters is None:
        parameters = {}
    keep_unique = parameters.get(KEEP_UNIQUE, True)

    rows = []
    for entry in variants_list:
        variant = variants_util.get_activities_from_variant(entry[0])
        count = entry[1]
        row = [0] * len(activities)
        # each occurrence of an activity in the variant adds the variant count
        for act in variant:
            row[activities.index(act)] += count
        rows.append(row)

    variants_mat = np.asmatrix(rows)
    if keep_unique:
        variants_mat = np.unique(variants_mat, axis=0)
    return variants_mat, activities
def get_dfg_sa_ea_act_from_variants(variants, parameters=None):
    """
    Gets the DFG, the start and end activities, and the activities from the
    dictionary/set/list of variants in the log

    The variants are first reduced to a set of activity sequences, so each
    distinct sequence contributes once to every count.

    Parameters
    ---------------
    variants
        Dictionary/set/list of variants
    parameters
        Parameters of the algorithm (not read directly in this function)

    Returns
    --------------
    dfg
        DFG (pair of consecutive activities -> count)
    list_act
        List of different activities
    start_activities
        Start activities (activity -> count)
    end_activities
        End activities (activity -> count)
    """
    if parameters is None:
        parameters = {}

    sequences = {variants_util.get_activities_from_variant(v) for v in variants}

    dfg = dict(
        Counter(pair for seq in sequences for pair in zip(seq, seq[1:])))
    list_act = list({act for seq in sequences for act in seq})
    # empty sequences have neither a start nor an end activity
    start_activities = dict(Counter(seq[0] for seq in sequences if seq))
    end_activities = dict(Counter(seq[-1] for seq in sequences if seq))

    return dfg, list_act, start_activities, end_activities
def apply(
    log: EventLog,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[Any]:
    """
    Calculates the HW metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            Parameters.BETA -> beta value as described in the Wil SNA paper
            (default 0: only direct successions are counted)

    Returns
    -----------
    tuple
        List containing the metric matrix and the (sorted) resources list.
        Moreover, last boolean indicates that the metric is directed.
    """
    if parameters is None:
        parameters = {}

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY,
                                              parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    beta = exec_utils.get_param_value(Parameters.BETA, parameters, 0)

    # variants are computed on the resource attribute, so each "variant" is a
    # sequence of resources
    parameters_variants = {
        variants_filter.Parameters.ACTIVITY_KEY: resource_key,
        variants_filter.Parameters.ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {}
    for variant, traces in variants_filter.get_variants(
            log, parameters=parameters_variants).items():
        variants_occ[variant] = len(traces)

    variants_resources = list(variants_occ.keys())
    resources = [
        variants_util.get_activities_from_variant(key)
        for key in variants_resources
    ]
    flat_list = sorted({res for sequence in resources for res in sequence})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    dividend = 0
    for rvj, rv in zip(variants_resources, resources):
        occurrences = variants_occ[rvj]
        for i in range(len(rv) - 1):
            row = sum_i_to_j.setdefault(flat_list.index(rv[i]), {})
            for j in range(i + 1, len(rv)):
                res_j = flat_list.index(rv[j])
                row.setdefault(res_j, 0)
                if beta == 0:
                    # only the direct successor is considered
                    row[res_j] += occurrences
                    dividend += occurrences
                    break
                # later successors are weighted by beta^(distance - 1)
                weight = occurrences * (beta**(j - i - 1))
                row[res_j] += weight
                dividend += weight

    # normalise by the total accumulated weight
    for res_i, row in sum_i_to_j.items():
        for res_j, total in row.items():
            metric_matrix[res_i][res_j] = total / dividend

    return [metric_matrix, flat_list, True]
def apply(log, parameters=None):
    """
    Calculates the HW metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            Parameters.BETA -> beta value as described in the Wil SNA paper
            (default 0: only direct successions are counted)

    Returns
    -----------
    tuple
        List containing the metric matrix and the (sorted) resources list.
        Moreover, last boolean indicates that the metric is directed.
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    beta = exec_utils.get_param_value(Parameters.BETA, parameters, 0)

    # variants are computed on the resource attribute, so each "variant" is a
    # sequence of resources
    parameters_variants = {
        case_statistics.Parameters.ACTIVITY_KEY: resource_key,
        case_statistics.Parameters.ATTRIBUTE_KEY: resource_key,
    }
    variants_occ = {
        x["variant"]: x["case:concept:name"]
        for x in case_statistics.get_variant_statistics(log, parameters=parameters_variants)
    }
    variants_resources = list(variants_occ.keys())
    resources = [variants_util.get_activities_from_variant(y) for y in variants_resources]

    flat_list = sorted({item for sublist in resources for item in sublist})
    # O(1) lookup of the matrix position of a resource
    position = {res: idx for idx, res in enumerate(flat_list)}
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    # The normaliser is the sum of the very same weights that are added to the
    # numerators. With beta == 0 this equals occurrences * (len(rv) - 1) per
    # variant (the previous value); with beta != 0 the previous code ignored
    # beta in the normaliser (both branches of its if/else were identical).
    dividend = 0
    for rvj, rv in zip(variants_resources, resources):
        occurrences = variants_occ[rvj]
        for i in range(len(rv) - 1):
            row = sum_i_to_j.setdefault(position[rv[i]], {})
            for j in range(i + 1, len(rv)):
                res_j = position[rv[j]]
                row.setdefault(res_j, 0)
                if beta == 0:
                    # only the direct successor is considered
                    row[res_j] += occurrences
                    dividend += occurrences
                    break
                # later successors are weighted by beta^(distance - 1)
                weight = occurrences * (beta ** (j - i - 1))
                row[res_j] += weight
                dividend += weight

    for key1, row in sum_i_to_j.items():
        for key2, total in row.items():
            metric_matrix[key1][key2] = total / dividend

    # the value must sit on the same statement as `return`
    return [metric_matrix, flat_list, True]
def apply(log, net, initial_marking, final_marking, parameters=None):
    """
    Method to apply token-based replay

    The replay is executed once per variant (through tr_vlist) and its
    result is shared by all the traces having that variant. As a side effect,
    every transition of the net receives the attributes in_marking and
    out_marking, built from the weights of its input/output arcs.

    Parameters
    -----------
    log
        Log
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    parameters
        Parameters of the algorithm

    Returns
    -----------
    ret
        List with one replay result per trace, in the order of the log
    """
    if parameters is None:
        parameters = {}

    # label -> visible transitions having that label
    tmap = {}
    for t in net.transitions:
        produced = Marking()
        for arc in t.out_arcs:
            produced[arc.target] = arc.weight
        t.out_marking = produced

        consumed = Marking()
        for arc in t.in_arcs:
            consumed[arc.source] = arc.weight
        t.in_marking = consumed

        if t.label is not None:
            tmap.setdefault(t.label, []).append(t)

    variants_idxs = variants_filter.get_variants_from_log_trace_idx(
        log, parameters=parameters)

    bmap = {}
    al_idx = {}
    for variant, trace_idxs in variants_idxs.items():
        vlist = variants_util.get_activities_from_variant(variant)
        result = tr_vlist(vlist,
                          net,
                          initial_marking,
                          final_marking,
                          tmap,
                          bmap,
                          parameters=parameters)
        # the same result object is attached to every trace of the variant
        for trace_idx in trace_idxs:
            al_idx[trace_idx] = result

    return [al_idx[i] for i in range(len(log))]
def apply(log, parameters=None):
    """
    Calculates the Subcontracting metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            Parameters.N -> n of the algorithm proposed in the Wil SNA paper
            (default 2)

    Returns
    -----------
    tuple
        List containing the metric matrix, the (sorted) resources list and
        the boolean True
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.generic.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY,
                                              parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    n = exec_utils.get_param_value(Parameters.N, parameters, 2)

    # variants are computed on the resource attribute, so each "variant" is a
    # sequence of resources
    parameters_variants = {
        case_statistics.Parameters.ACTIVITY_KEY: resource_key,
        case_statistics.Parameters.ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {}
    for stat in case_statistics.get_variant_statistics(
            log, parameters=parameters_variants):
        variants_occ[stat["variant"]] = stat["case:concept:name"]

    variants_resources = list(variants_occ.keys())
    resources = [
        variants_util.get_activities_from_variant(key)
        for key in variants_resources
    ]
    flat_list = sorted({res for sequence in resources for res in sequence})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    dividend = 0
    for rvj, rv in zip(variants_resources, resources):
        occurrences = variants_occ[rvj]
        dividend += occurrences * (len(rv) - 1)
        for i in range(len(rv) - n):
            res_i = flat_list.index(rv[i])
            # the work must come back to the same resource after n steps
            if res_i != flat_list.index(rv[i + n]):
                continue
            # NOTE(review): the inner loop runs for every matching window,
            # not only the first one seen for res_i - confirm against the
            # original layout
            row = sum_i_to_j.setdefault(res_i, {})
            for j in range(i + 1, i + n):
                res_j = flat_list.index(rv[j])
                row[res_j] = row.get(res_j, 0) + occurrences

    for res_i, row in sum_i_to_j.items():
        for res_j, total in row.items():
            metric_matrix[res_i][res_j] = total / dividend

    return [metric_matrix, flat_list, True]
def apply(
    log: EventLog,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[Any]:
    """
    Calculates the Subcontracting metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.RESOURCE_KEY -> attribute holding the resource
            Parameters.N -> n of the algorithm proposed in the Wil SNA paper
            (default 2)

    Returns
    -----------
    tuple
        List containing the metric matrix, the (sorted) resources list and
        the boolean True
    """
    if parameters is None:
        parameters = {}

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY,
                                              parameters,
                                              xes.DEFAULT_RESOURCE_KEY)
    n = exec_utils.get_param_value(Parameters.N, parameters, 2)

    # variants are computed on the resource attribute, so each "variant" is a
    # sequence of resources
    parameters_variants = {
        variants_filter.Parameters.ACTIVITY_KEY: resource_key,
        variants_filter.Parameters.ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {}
    for variant, traces in variants_filter.get_variants(
            log, parameters=parameters_variants).items():
        variants_occ[variant] = len(traces)

    variants_resources = list(variants_occ.keys())
    resources = [
        variants_util.get_activities_from_variant(key)
        for key in variants_resources
    ]
    flat_list = sorted({res for sequence in resources for res in sequence})
    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    dividend = 0
    for rvj, rv in zip(variants_resources, resources):
        occurrences = variants_occ[rvj]
        # the normaliser is the total number of occurrences of the variants
        dividend += occurrences
        for i in range(len(rv) - n):
            res_i = flat_list.index(rv[i])
            # the work must come back to the same resource after n steps
            if res_i != flat_list.index(rv[i + n]):
                continue
            # NOTE(review): the inner loop runs for every matching window,
            # not only the first one seen for res_i - confirm against the
            # original layout
            row = sum_i_to_j.setdefault(res_i, {})
            for j in range(i + 1, i + n):
                res_j = flat_list.index(rv[j])
                row[res_j] = row.get(res_j, 0) + occurrences

    for res_i, row in sum_i_to_j.items():
        for res_j, total in row.items():
            metric_matrix[res_i][res_j] = total / dividend

    return [metric_matrix, flat_list, True]