def create_dfg(self, parameters=None):
    """Build the directly-follows graph of ``self.log``.

    Returns a list of ``((source_activity, target_activity), count)``
    pairs, keeping only relations observed at least once.
    """
    if parameters is None:
        parameters = {}
    frequencies = dfg_inst.apply(self.log, parameters=parameters)
    return [(edge, count) for edge, count in frequencies.items() if count > 0]
def apply_tree(log, parameters=None):
    """
    Apply the IMDF algorithm to a log obtaining a process tree

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as
            activity name (default concept:name)

    Returns
    ----------
    tree
        Process tree
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)

    # get the DFG (only relations observed at least once)
    dfg = [(k, v) for k, v in dfg_inst.apply(log, parameters=parameters).items()
           if v > 0]

    # gets the start activities from the log
    start_activities = log_start_act_stats.get_start_activities(
        log, parameters=parameters)
    # gets the end activities from the log
    end_activities = log_end_act_stats.get_end_activities(
        log, parameters=parameters)
    # get the activities in the log
    activities = log_attributes_stats.get_attribute_values(log, activity_key)

    # check if the log contains empty traces; reuse the already-computed
    # lengths instead of scanning the log a second time (original code
    # rebuilt the length list inside the min() call)
    traces_length = [len(trace) for trace in log]
    contains_empty_traces = bool(traces_length) and min(traces_length) == 0

    return apply_tree_dfg(dfg,
                          parameters=parameters,
                          activities=activities,
                          contains_empty_traces=contains_empty_traces,
                          start_activities=start_activities,
                          end_activities=end_activities)
def apply(
        log: EventLog,
        parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Tuple[PetriNet, Marking, Marking]:
    """
    This method calls the \"classic\" alpha miner [1]_.

    Parameters
    ----------
    log: :class:`pm4py.log.log.EventLog`
        Event log to use in the alpha miner
    parameters:
        Parameters of the algorithm, including:
            activity_key : :class:`str`, optional
                Key to use within events to identify the underlying activity.
                By default, the value 'concept:name' is used.

    Returns
    -------
    net: :class:`pm4py.entities.petri.petrinet.PetriNet`
        A Petri net describing the event log that is provided as an input
    initial marking: :class:`pm4py.models.net.Marking`
        marking object representing the initial marking
    final marking: :class:`pm4py.models.net.Marking`
        marking object representing the final marking, not guaranteed that
        it is actually reachable!

    References
    ----------
    .. [1] Wil M. P. van der Aalst et al., "Workflow Mining: Discovering
        Process Models from Event Logs", IEEE Trans. Knowl. Data Eng., 16,
        1128-1142, 2004. `DOI <https://doi.org/10.1109/TKDE.2004.47>`_.
    """
    if parameters is None:
        parameters = {}

    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pm_util.xes_constants.DEFAULT_NAME_KEY)

    # keep only the directly-follows relations observed at least once
    edge_frequencies = dfg_inst.apply(log, parameters=parameters)
    dfg = {edge: freq for edge, freq in edge_frequencies.items() if freq > 0}

    start_activities = endpoints.derive_start_activities_from_log(
        log, activity_key)
    end_activities = endpoints.derive_end_activities_from_log(
        log, activity_key)

    return apply_dfg_sa_ea(dfg, start_activities, end_activities,
                           parameters=parameters)
def check_for_cut(self, test_log, deleted_activity=None, parameters=None):
    """Return True iff any IM cut (xor, sequence, parallel or loop) can be
    detected on ``test_log``.

    Side effects: optionally removes ``deleted_activity`` from
    ``self.activities`` and refreshes ``self.dfg``, ``self.outgoing``,
    ``self.ingoing`` and ``self.log`` from ``test_log``.
    Raises when networkx is not installed.
    """
    # networkx is mandatory for the strongly-connected-components step
    if not pkgutil.find_loader("networkx"):
        msg = "networkx is not available. inductive miner cannot be used!"
        logging.error(msg)
        raise Exception(msg)

    import networkx as nx

    if deleted_activity is not None:
        del self.activities[deleted_activity]
    if parameters is None:
        parameters = {}

    # rebuild the DFG and its adjacency views from the candidate log
    self.dfg = [(edge, freq)
                for edge, freq in dfg_inst.apply(
                    test_log, parameters=parameters).items()
                if freq > 0]
    self.outgoing = get_outgoing_edges(self.dfg)
    self.ingoing = get_ingoing_edges(self.dfg)
    self.log = test_log

    conn_components = detection_utils.get_connected_components(
        self.ingoing, self.outgoing, self.activities)
    nx_graph = transform_dfg_to_directed_nx_graph(
        self.dfg, activities=self.activities)
    scc = [list(comp) for comp in nx.strongly_connected_components(nx_graph)]

    # try each cut type in order; stop at the first that applies
    if self.detect_xor(conn_components)[0]:
        return True
    if cut_detection.detect_sequential_cut(self, self.dfg, scc)[0]:
        return True
    if self.detect_concurrent()[0]:
        return True
    if self.detect_loop()[0]:
        return True
    return False
def apply_fall_through(self, parameters=None):
    """Apply the IM fall-through rules, in priority order, to ``self.log``.

    Tried in order: empty trace, activity-once-per-trace, concurrent
    activity, strict tau loop, tau loop; if none applies, a flower model
    is used.  Each fall-through can be disabled via its key in
    ``parameters`` (enabled by default).  Matching rules append
    SubtreePlain children to ``self.children`` and record the applied
    rule in ``self.detected_cut``.
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)

    # set flags for fall_throughs, base case is True (enabled)
    use_empty_trace = (Parameters.EMPTY_TRACE_KEY not in parameters
                       ) or parameters[Parameters.EMPTY_TRACE_KEY]
    use_act_once_per_trace = (
        Parameters.ONCE_PER_TRACE_KEY
        not in parameters) or parameters[Parameters.ONCE_PER_TRACE_KEY]
    use_act_concurrent = (Parameters.CONCURRENT_KEY not in parameters
                          ) or parameters[Parameters.CONCURRENT_KEY]
    use_strict_tau_loop = (Parameters.STRICT_TAU_LOOP_KEY not in parameters
                           ) or parameters[Parameters.STRICT_TAU_LOOP_KEY]
    use_tau_loop = (Parameters.TAU_LOOP_KEY not in parameters
                    ) or parameters[Parameters.TAU_LOOP_KEY]

    if use_empty_trace:
        empty_trace, new_log = fall_through.empty_trace(self.log)
        # if an empty trace is found, the empty trace fallthrough applies
    else:
        empty_trace = False
    if empty_trace:
        logging.debug("empty_trace")
        # NOTE(review): activites_left is collected but never used in this
        # branch — appears to be dead code; confirm before removing
        activites_left = []
        for trace in new_log:
            for act in trace:
                if act[activity_key] not in activites_left:
                    activites_left.append(act[activity_key])
        self.detected_cut = 'empty_trace'
        new_dfg = [(k, v) for k, v in dfg_inst.apply(
            new_log, parameters=parameters).items() if v > 0]
        activities = attributes_filter.get_attribute_values(
            new_log, activity_key)
        start_activities = list(
            start_activities_filter.get_start_activities(
                new_log, parameters=self.parameters).keys())
        end_activities = list(
            end_activities_filter.get_end_activities(
                new_log, parameters=self.parameters).keys())
        self.children.append(
            SubtreePlain(
                new_log, new_dfg, self.master_dfg, self.initial_dfg,
                activities, self.counts, self.rec_depth + 1,
                noise_threshold=self.noise_threshold,
                start_activities=start_activities,
                end_activities=end_activities,
                initial_start_activities=self.initial_start_activities,
                initial_end_activities=self.initial_end_activities,
                parameters=parameters))
    else:
        if use_act_once_per_trace:
            activity_once, new_log, small_log = fall_through.act_once_per_trace(
                self.log, self.activities, activity_key)
            small_log = filtering_utils.keep_one_trace_per_variant(
                small_log, parameters=parameters)
        else:
            activity_once = False
        if use_act_once_per_trace and activity_once:
            self.detected_cut = 'parallel'
            # create two new dfgs as we need them to append to self.children later
            new_dfg = [(k, v) for k, v in dfg_inst.apply(
                new_log, parameters=parameters).items() if v > 0]
            activities = attributes_filter.get_attribute_values(
                new_log, activity_key)
            small_dfg = [(k, v) for k, v in dfg_inst.apply(
                small_log, parameters=parameters).items() if v > 0]
            small_activities = attributes_filter.get_attribute_values(
                small_log, activity_key)
            # append the once-per-trace activity as a leaf child
            self.children.append(
                SubtreePlain(
                    small_log, small_dfg, self.master_dfg, self.initial_dfg,
                    small_activities, self.counts, self.rec_depth + 1,
                    noise_threshold=self.noise_threshold,
                    initial_start_activities=self.initial_start_activities,
                    initial_end_activities=self.initial_end_activities,
                    parameters=parameters))
            # continue with the recursion on the new log_skeleton
            start_activities = list(
                start_activities_filter.get_start_activities(
                    new_log, parameters=self.parameters).keys())
            end_activities = list(
                end_activities_filter.get_end_activities(
                    new_log, parameters=self.parameters).keys())
            self.children.append(
                SubtreePlain(
                    new_log, new_dfg, self.master_dfg, self.initial_dfg,
                    activities, self.counts, self.rec_depth + 1,
                    noise_threshold=self.noise_threshold,
                    start_activities=start_activities,
                    end_activities=end_activities,
                    initial_start_activities=self.initial_start_activities,
                    initial_end_activities=self.initial_end_activities,
                    parameters=parameters))
        else:
            if use_act_concurrent:
                activity_concurrent, new_log, small_log, activity_left_out = fall_through.activity_concurrent(
                    self, self.log, self.activities, activity_key,
                    parameters=parameters)
                small_log = filtering_utils.keep_one_trace_per_variant(
                    small_log, parameters=parameters)
            else:
                activity_concurrent = False
            if use_act_concurrent and activity_concurrent:
                self.detected_cut = 'parallel'
                # create two new dfgs on to append later
                new_dfg = [(k, v) for k, v in dfg_inst.apply(
                    new_log, parameters=parameters).items() if v > 0]
                activities = attributes_filter.get_attribute_values(
                    new_log, activity_key)
                small_dfg = [(k, v) for k, v in dfg_inst.apply(
                    small_log, parameters=parameters).items() if v > 0]
                small_activities = attributes_filter.get_attribute_values(
                    small_log, activity_key)
                # append the concurrent activity as leaf:
                self.children.append(
                    SubtreePlain(
                        small_log, small_dfg, self.master_dfg,
                        self.initial_dfg, small_activities, self.counts,
                        self.rec_depth + 1,
                        noise_threshold=self.noise_threshold,
                        initial_start_activities=self.initial_start_activities,
                        initial_end_activities=self.initial_end_activities,
                        parameters=parameters))
                # continue with the recursion on the new log_skeleton:
                start_activities = list(
                    start_activities_filter.get_start_activities(
                        new_log, parameters=self.parameters).keys())
                end_activities = list(
                    end_activities_filter.get_end_activities(
                        new_log, parameters=self.parameters).keys())
                self.children.append(
                    SubtreePlain(
                        new_log, new_dfg, self.master_dfg, self.initial_dfg,
                        activities, self.counts, self.rec_depth + 1,
                        noise_threshold=self.noise_threshold,
                        start_activities=start_activities,
                        end_activities=end_activities,
                        initial_start_activities=self.initial_start_activities,
                        initial_end_activities=self.initial_end_activities,
                        parameters=parameters))
            else:
                if use_strict_tau_loop:
                    strict_tau_loop, new_log = fall_through.strict_tau_loop(
                        self.log, self.start_activities,
                        self.end_activities, activity_key)
                    new_log = filtering_utils.keep_one_trace_per_variant(
                        new_log, parameters=parameters)
                else:
                    strict_tau_loop = False
                if use_strict_tau_loop and strict_tau_loop:
                    # NOTE(review): activites_left is unused here as well
                    activites_left = []
                    for trace in new_log:
                        for act in trace:
                            if act[activity_key] not in activites_left:
                                activites_left.append(act[activity_key])
                    self.detected_cut = 'strict_tau_loop'
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(
                        new_log, parameters=parameters).items() if v > 0]
                    activities = attributes_filter.get_attribute_values(
                        new_log, activity_key)
                    start_activities = list(
                        start_activities_filter.get_start_activities(
                            new_log, parameters=self.parameters).keys())
                    end_activities = list(
                        end_activities_filter.get_end_activities(
                            new_log, parameters=self.parameters).keys())
                    self.children.append(
                        SubtreePlain(
                            new_log, new_dfg, self.master_dfg,
                            self.initial_dfg, activities, self.counts,
                            self.rec_depth + 1,
                            noise_threshold=self.noise_threshold,
                            start_activities=start_activities,
                            end_activities=end_activities,
                            initial_start_activities=self.initial_start_activities,
                            initial_end_activities=self.initial_end_activities,
                            parameters=parameters))
                else:
                    if use_tau_loop:
                        tau_loop, new_log = fall_through.tau_loop(
                            self.log, self.start_activities, activity_key)
                        new_log = filtering_utils.keep_one_trace_per_variant(
                            new_log, parameters=parameters)
                    else:
                        tau_loop = False
                    if use_tau_loop and tau_loop:
                        # NOTE(review): activites_left is unused here as well
                        activites_left = []
                        for trace in new_log:
                            for act in trace:
                                if act[activity_key] not in activites_left:
                                    activites_left.append(
                                        act[activity_key])
                        self.detected_cut = 'tau_loop'
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            new_log, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            new_log, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                new_log, parameters=self.parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                new_log, parameters=self.parameters).keys())
                        self.children.append(
                            SubtreePlain(
                                new_log, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
                    else:
                        # no fall-through applies: flower model over the
                        # remaining activities
                        logging.debug("flower model")
                        activites_left = []
                        for trace in self.log:
                            for act in trace:
                                if act[activity_key] not in activites_left:
                                    activites_left.append(
                                        act[activity_key])
                        self.detected_cut = 'flower'
def detect_cut(self, second_iteration=False, parameters=None):
    """Detect and apply an IM cut (xor, sequence, parallel, loop) on self.log.

    Base cases (empty log, single activity) are handled first.  Otherwise
    the cut types are tried in order; when one is found, the log is split,
    each sub-log becomes a SubtreePlain child (recursing), and
    ``self.detected_cut`` records the cut type.  When no cut is found,
    ``apply_fall_through`` is invoked.  Raises if networkx is unavailable.
    """
    if pkgutil.find_loader("networkx"):
        import networkx as nx

        if parameters is None:
            parameters = {}
        activity_key = exec_utils.get_param_value(
            Parameters.ACTIVITY_KEY, parameters,
            pmutil.xes_constants.DEFAULT_NAME_KEY)

        # check base cases:
        empty_log = base_case.empty_log(self.log)
        single_activity = base_case.single_activity(self.log, activity_key)
        if empty_log:
            self.detected_cut = 'empty_log'
        elif single_activity:
            self.detected_cut = 'single_activity'
        # if no base cases are found, search for a cut:
        else:
            conn_components = detection_utils.get_connected_components(
                self.ingoing, self.outgoing, self.activities)
            this_nx_graph = transform_dfg_to_directed_nx_graph(
                self.dfg, activities=self.activities)
            strongly_connected_components = [
                list(x)
                for x in nx.strongly_connected_components(this_nx_graph)
            ]
            xor_cut = self.detect_xor(conn_components)
            # the following part searches for a cut in the current log_skeleton
            # if a cut is found, the log_skeleton is split according to the cut,
            # the resulting logs are saved in new_logs
            # recursion is used on all the logs in new_logs
            if xor_cut[0]:
                logging.debug("xor_cut")
                self.detected_cut = 'concurrent'
                new_logs = split.split_xor(xor_cut[1], self.log, activity_key)
                for i in range(len(new_logs)):
                    new_logs[
                        i] = filtering_utils.keep_one_trace_per_variant(
                            new_logs[i], parameters=parameters)
                for l in new_logs:
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(
                        l, parameters=parameters).items() if v > 0]
                    activities = attributes_filter.get_attribute_values(
                        l, activity_key)
                    start_activities = list(
                        start_activities_filter.get_start_activities(
                            l, parameters=parameters).keys())
                    end_activities = list(
                        end_activities_filter.get_end_activities(
                            l, parameters=parameters).keys())
                    self.children.append(
                        SubtreePlain(
                            l, new_dfg, self.master_dfg, self.initial_dfg,
                            activities, self.counts, self.rec_depth + 1,
                            noise_threshold=self.noise_threshold,
                            start_activities=start_activities,
                            end_activities=end_activities,
                            initial_start_activities=self.initial_start_activities,
                            initial_end_activities=self.initial_end_activities,
                            parameters=parameters))
            else:
                sequence_cut = cut_detection.detect_sequential_cut(
                    self, self.dfg, strongly_connected_components)
                if sequence_cut[0]:
                    logging.debug("sequence_cut")
                    new_logs = split.split_sequence(
                        sequence_cut[1], self.log, activity_key)
                    for i in range(len(new_logs)):
                        new_logs[
                            i] = filtering_utils.keep_one_trace_per_variant(
                                new_logs[i], parameters=parameters)
                    self.detected_cut = "sequential"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            l, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            l, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                l, parameters=parameters).keys())
                        self.children.append(
                            SubtreePlain(
                                l, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
                else:
                    parallel_cut = self.detect_concurrent()
                    if parallel_cut[0]:
                        logging.debug("parallel_cut")
                        new_logs = split.split_parallel(
                            parallel_cut[1], self.log, activity_key)
                        for i in range(len(new_logs)):
                            new_logs[
                                i] = filtering_utils.keep_one_trace_per_variant(
                                    new_logs[i], parameters=parameters)
                        self.detected_cut = "parallel"
                        for l in new_logs:
                            new_dfg = [(k, v) for k, v in dfg_inst.apply(
                                l, parameters=parameters).items() if v > 0]
                            activities = attributes_filter.get_attribute_values(
                                l, activity_key)
                            start_activities = list(
                                start_activities_filter.get_start_activities(
                                    l, parameters=parameters).keys())
                            end_activities = list(
                                end_activities_filter.get_end_activities(
                                    l, parameters=parameters).keys())
                            self.children.append(
                                SubtreePlain(
                                    l, new_dfg, self.master_dfg,
                                    self.initial_dfg, activities,
                                    self.counts, self.rec_depth + 1,
                                    noise_threshold=self.noise_threshold,
                                    start_activities=start_activities,
                                    end_activities=end_activities,
                                    initial_start_activities=self.initial_start_activities,
                                    initial_end_activities=self.initial_end_activities,
                                    parameters=parameters))
                    else:
                        loop_cut = self.detect_loop()
                        if loop_cut[0]:
                            logging.debug("loop_cut")
                            new_logs = split.split_loop(
                                loop_cut[1], self.log, activity_key)
                            for i in range(len(new_logs)):
                                new_logs[
                                    i] = filtering_utils.keep_one_trace_per_variant(
                                        new_logs[i], parameters=parameters)
                            self.detected_cut = "loopCut"
                            for l in new_logs:
                                new_dfg = [
                                    (k, v) for k, v in dfg_inst.apply(
                                        l, parameters=parameters).items()
                                    if v > 0
                                ]
                                activities = attributes_filter.get_attribute_values(
                                    l, activity_key)
                                start_activities = list(
                                    start_activities_filter.
                                    get_start_activities(
                                        l, parameters=parameters).keys())
                                end_activities = list(
                                    end_activities_filter.
                                    get_end_activities(
                                        l, parameters=parameters).keys())
                                self.children.append(
                                    SubtreePlain(
                                        l, new_dfg, self.master_dfg,
                                        self.initial_dfg, activities,
                                        self.counts, self.rec_depth + 1,
                                        noise_threshold=self.noise_threshold,
                                        start_activities=start_activities,
                                        end_activities=end_activities,
                                        initial_start_activities=self.initial_start_activities,
                                        initial_end_activities=self.initial_end_activities,
                                        parameters=parameters))
                        # if the code gets to this point, there is no base_case
                        # and no cut found in the log_skeleton
                        # therefore, we now apply fall through:
                        else:
                            self.apply_fall_through(parameters)
    else:
        msg = "networkx is not available. inductive miner cannot be used!"
        logging.error(msg)
        raise Exception(msg)
def apply_tree(log, parameters=None):
    """
    Apply the IM_FF algorithm to a log obtaining a process tree

    Parameters
    ----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> attribute of the log to use as
            activity name (default concept:name)

    Returns
    ----------
    process_tree
        Process tree
    """
    # Fix: `parameters` previously had no default value although the body
    # checked `parameters is None`; calling apply_tree(log) raised TypeError.
    if parameters is None:
        parameters = {}

    if pkgutil.find_loader("pandas"):
        import pandas as pd
        from pm4py.statistics.variants.pandas import get as variants_get

        # dataframe input: discover from the variants directly
        if type(log) is pd.DataFrame:
            variants = vars = variants_get.get_variants_count(
                log, parameters=parameters)
            return apply_tree_variants(variants, parameters=parameters)

    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)

    log = converter.apply(log, parameters=parameters)
    # keep only the activity attribute (since the others are not used)
    log = filtering_utils.keep_only_one_attribute_per_event(log, activity_key)

    noise_threshold = exec_utils.get_param_value(
        Parameters.NOISE_THRESHOLD, parameters,
        shared_constants.NOISE_THRESHOLD_IMF)

    dfg = [(k, v) for k, v in dfg_inst.apply(log, parameters=parameters).items()
           if v > 0]
    c = Counts()
    activities = attributes_get.get_attribute_values(log, activity_key)
    start_activities = list(
        start_activities_get.get_start_activities(
            log, parameters=parameters).keys())
    end_activities = list(
        end_activities_get.get_end_activities(
            log, parameters=parameters).keys())

    # check for empty traces; reuse the computed lengths instead of
    # scanning the log a second time inside min()
    traces_length = [len(trace) for trace in log]
    contains_empty_traces = bool(traces_length) and min(traces_length) == 0

    # set the threshold parameter based on f and the max value in the dfg:
    max_value = max((v for _, v in dfg), default=0)
    threshold = noise_threshold * max_value

    recursion_depth = 0
    sub = subtree.make_tree(log, dfg, dfg, dfg, activities, c,
                            recursion_depth, noise_threshold, threshold,
                            start_activities, end_activities,
                            start_activities, end_activities,
                            parameters=parameters)

    process_tree = get_tree_repr_implain.get_repr(
        sub, 0, contains_empty_traces=contains_empty_traces)
    # Ensures consistency to the parent pointers in the process tree
    tree_consistency.fix_parent_pointers(process_tree)
    # Fixes a 1 child XOR that is added when single-activities flowers are found
    tree_consistency.fix_one_child_xor_flower(process_tree)
    # folds the process tree (to simplify it in case fallthroughs/filtering
    # is applied)
    process_tree = util.fold(process_tree)

    return process_tree
def apply_fall_through_infrequent(self, parameters=None):
    """Apply the IM-infrequent fall-through rules, in priority order.

    Tried in order: empty-trace filtering (with the noise factor ``self.f``),
    activity-once-per-trace, concurrent activity, strict tau loop, tau loop;
    otherwise a flower model is used.  Each rule can be disabled through its
    key in ``parameters`` (enabled by default).  Matching rules append
    SubtreeInfrequent children and record the rule in ``self.detected_cut``.
    """
    if parameters is None:
        parameters = {}
    # NOTE(review): activity_key is read from self.parameters while the
    # enable-flags below are read from the `parameters` argument — confirm
    # this asymmetry is intentional
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, self.parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)

    # set flags for fall_throughs, base case is True (enabled)
    use_empty_trace = (Parameters.EMPTY_TRACE_KEY not in parameters
                       ) or parameters[Parameters.EMPTY_TRACE_KEY]
    use_act_once_per_trace = (
        Parameters.ONCE_PER_TRACE_KEY
        not in parameters) or parameters[Parameters.ONCE_PER_TRACE_KEY]
    use_act_concurrent = (Parameters.CONCURRENT_KEY not in parameters
                          ) or parameters[Parameters.CONCURRENT_KEY]
    use_strict_tau_loop = (Parameters.STRICT_TAU_LOOP_KEY not in parameters
                           ) or parameters[Parameters.STRICT_TAU_LOOP_KEY]
    use_tau_loop = (Parameters.TAU_LOOP_KEY not in parameters
                    ) or parameters[Parameters.TAU_LOOP_KEY]

    if use_empty_trace:
        empty_traces_present, enough_traces, new_log = fall_through_infrequent.empty_trace_filtering(
            self.log, self.f)
        self.log = new_log
    else:
        empty_traces_present = False
        enough_traces = False

    # if an empty trace is found, the empty trace fallthrough applies
    if empty_traces_present and enough_traces:
        logging.debug("empty_trace_if")
        self.detected_cut = 'empty_trace'
        new_dfg = [(k, v) for k, v in dfg_inst.apply(
            new_log, parameters=self.parameters).items() if v > 0]
        activities = attributes_filter.get_attribute_values(
            new_log, activity_key)
        start_activities = list(
            start_activities_filter.get_start_activities(
                new_log, parameters=parameters).keys())
        end_activities = list(
            end_activities_filter.get_end_activities(
                new_log, parameters=parameters).keys())
        self.children.append(
            SubtreeInfrequent(
                new_log, new_dfg, self.master_dfg, self.initial_dfg,
                activities, self.counts, self.rec_depth + 1, self.f,
                noise_threshold=self.noise_threshold,
                start_activities=start_activities,
                end_activities=end_activities,
                initial_start_activities=self.initial_start_activities,
                initial_end_activities=self.initial_end_activities,
                parameters=parameters))
    elif empty_traces_present and not enough_traces:
        # no node is added to the PT, instead we just use recursion on the
        # log without the empty traces
        self.detect_cut_if(parameters=parameters)
    else:
        if use_act_once_per_trace:
            activity_once, new_log, small_log = fall_through.act_once_per_trace(
                self.log, self.activities, activity_key)
        else:
            activity_once = False
        if activity_once:
            self.detected_cut = 'parallel'
            # create two new dfgs as we need them to append to self.children later
            new_dfg = [(k, v) for k, v in dfg_inst.apply(
                new_log, parameters=parameters).items() if v > 0]
            activities = attributes_filter.get_attribute_values(
                new_log, activity_key)
            small_dfg = [(k, v) for k, v in dfg_inst.apply(
                small_log, parameters=parameters).items() if v > 0]
            small_activities = attributes_filter.get_attribute_values(
                small_log, activity_key)
            start_activities = list(
                start_activities_filter.get_start_activities(
                    new_log, parameters=parameters).keys())
            end_activities = list(
                end_activities_filter.get_end_activities(
                    new_log, parameters=parameters).keys())
            # append the chosen activity as leaf:
            self.children.append(
                SubtreeInfrequent(
                    small_log, small_dfg, self.master_dfg, self.initial_dfg,
                    small_activities, self.counts, self.rec_depth + 1,
                    self.f,
                    noise_threshold=self.noise_threshold,
                    initial_start_activities=self.initial_start_activities,
                    initial_end_activities=self.initial_end_activities,
                    parameters=parameters))
            # continue with the recursion on the new log
            self.children.append(
                SubtreeInfrequent(
                    new_log, new_dfg, self.master_dfg, self.initial_dfg,
                    activities, self.counts, self.rec_depth + 1, self.f,
                    noise_threshold=self.noise_threshold,
                    start_activities=start_activities,
                    end_activities=end_activities,
                    initial_start_activities=self.initial_start_activities,
                    initial_end_activities=self.initial_end_activities,
                    parameters=parameters))
        else:
            if use_act_concurrent:
                activity_concurrent, new_log, small_log, key = fall_through.activity_concurrent(
                    self, self.log, self.activities, activity_key,
                    parameters=parameters)
            else:
                activity_concurrent = False
            if activity_concurrent:
                self.detected_cut = 'parallel'
                # create two new dfgs on to append later
                new_dfg = [(k, v) for k, v in dfg_inst.apply(
                    new_log, parameters=parameters).items() if v > 0]
                activities = attributes_filter.get_attribute_values(
                    new_log, activity_key)
                small_dfg = [(k, v) for k, v in dfg_inst.apply(
                    small_log, parameters=parameters).items() if v > 0]
                small_activities = attributes_filter.get_attribute_values(
                    small_log, activity_key)
                start_activities = list(
                    start_activities_filter.get_start_activities(
                        new_log, parameters=parameters).keys())
                end_activities = list(
                    end_activities_filter.get_end_activities(
                        new_log, parameters=parameters).keys())
                # append the concurrent activity as leaf:
                self.children.append(
                    SubtreeInfrequent(
                        small_log, small_dfg, self.master_dfg,
                        self.initial_dfg, small_activities, self.counts,
                        self.rec_depth + 1, self.f,
                        noise_threshold=self.noise_threshold,
                        initial_start_activities=self.initial_start_activities,
                        initial_end_activities=self.initial_end_activities,
                        parameters=parameters))
                # continue with the recursion on the new log:
                self.children.append(
                    SubtreeInfrequent(
                        new_log, new_dfg, self.master_dfg, self.initial_dfg,
                        activities, self.counts, self.rec_depth + 1, self.f,
                        noise_threshold=self.noise_threshold,
                        start_activities=start_activities,
                        end_activities=end_activities,
                        initial_start_activities=self.initial_start_activities,
                        initial_end_activities=self.initial_end_activities,
                        parameters=parameters))
            else:
                if use_strict_tau_loop:
                    strict_tau_loop, new_log = fall_through.strict_tau_loop(
                        self.log, self.start_activities,
                        self.end_activities, activity_key)
                else:
                    strict_tau_loop = False
                if strict_tau_loop:
                    self.detected_cut = 'strict_tau_loop'
                    new_dfg = [(k, v) for k, v in dfg_inst.apply(
                        new_log, parameters=parameters).items() if v > 0]
                    activities = attributes_filter.get_attribute_values(
                        new_log, activity_key)
                    start_activities = list(
                        start_activities_filter.get_start_activities(
                            new_log, parameters=parameters).keys())
                    end_activities = list(
                        end_activities_filter.get_end_activities(
                            new_log, parameters=parameters).keys())
                    self.children.append(
                        SubtreeInfrequent(
                            new_log, new_dfg, self.master_dfg,
                            self.initial_dfg, activities, self.counts,
                            self.rec_depth + 1, self.f,
                            noise_threshold=self.noise_threshold,
                            start_activities=start_activities,
                            end_activities=end_activities,
                            initial_start_activities=self.initial_start_activities,
                            initial_end_activities=self.initial_end_activities,
                            parameters=parameters))
                else:
                    if use_tau_loop:
                        tau_loop, new_log = fall_through.tau_loop(
                            self.log, self.start_activities, activity_key)
                    else:
                        tau_loop = False
                    if tau_loop:
                        self.detected_cut = 'tau_loop'
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            new_log, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            new_log, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                new_log, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                new_log, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(
                                new_log, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1, self.f,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
                    else:
                        # no fall-through applies: flower model
                        logging.debug("flower_if")
                        self.detected_cut = 'flower'
def detect_cut_if(self, second_iteration=False, parameters=None):
    """Detect and apply a cut for the IM-infrequent variant.

    First handles the base cases (empty log, single activity).  Otherwise
    the plain IM cut detection is tried; if it fails, the DFG is filtered
    on the noise threshold and detection is retried, this time using the
    infrequent splitting functions.  If still no cut is found, the
    infrequent fall-throughs are applied.
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(
        Parameters.ACTIVITY_KEY, parameters,
        pmutil.xes_constants.DEFAULT_NAME_KEY)

    # check base cases:
    empty_log = base_case.empty_log(self.log)
    single_activity = base_case.single_activity(self.log, activity_key)
    if empty_log:
        self.detected_cut = 'empty_log'
    elif single_activity:
        self.detected_cut = 'single_activity'
    # if no base cases are found, search for a cut:
    # use the cutting and splitting functions of im_plain:
    else:
        found_plain_cut, type_of_cut, cut = self.check_cut_im_plain()
        if found_plain_cut:
            self.apply_cut_im_plain(type_of_cut, cut, activity_key)
        # if im_plain does not find a cut, we filter on our threshold and
        # then again apply the im_cut detection
        # but this time, we have to use different splitting functions:
        else:
            self.filter_dfg_on_threshold()
            found_plain_cut, type_of_cut, cut = self.check_cut_im_plain()
            if found_plain_cut:
                if type_of_cut == 'concurrent':
                    logging.debug("concurrent_cut_if")
                    self.detected_cut = 'concurrent'
                    new_logs = splitting_infrequent.split_xor_infrequent(
                        cut[1], self.log, activity_key)
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            l, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            l, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(
                                l, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1, self.f,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
                elif type_of_cut == 'sequential':
                    logging.debug("sequential_if")
                    new_logs = splitting_infrequent.split_sequence_infrequent(
                        cut[1], self.log, activity_key)
                    self.detected_cut = "sequential"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            l, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            l, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(
                                l, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1, self.f,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
                elif type_of_cut == 'parallel':
                    logging.debug("parallel_if")
                    new_logs = split.split_parallel(
                        cut[1], self.log, activity_key)
                    self.detected_cut = "parallel"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            l, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            l, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(
                                l, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1, self.f,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
                elif type_of_cut == 'loopCut':
                    logging.debug("loopCut_if")
                    new_logs = splitting_infrequent.split_loop_infrequent(
                        cut[1], self.log, activity_key)
                    self.detected_cut = "loopCut"
                    for l in new_logs:
                        new_dfg = [(k, v) for k, v in dfg_inst.apply(
                            l, parameters=parameters).items() if v > 0]
                        activities = attributes_filter.get_attribute_values(
                            l, activity_key)
                        start_activities = list(
                            start_activities_filter.get_start_activities(
                                l, parameters=parameters).keys())
                        end_activities = list(
                            end_activities_filter.get_end_activities(
                                l, parameters=parameters).keys())
                        self.children.append(
                            SubtreeInfrequent(
                                l, new_dfg, self.master_dfg,
                                self.initial_dfg, activities, self.counts,
                                self.rec_depth + 1, self.f,
                                noise_threshold=self.noise_threshold,
                                start_activities=start_activities,
                                end_activities=end_activities,
                                initial_start_activities=self.initial_start_activities,
                                initial_end_activities=self.initial_end_activities,
                                parameters=parameters))
            else:
                self.apply_fall_through_infrequent(parameters)
def apply_cut_im_plain(self, type_of_cut, cut, activity_key):
    """
    Apply a detected cut (IM "plain" variant, no infrequent-behavior
    filtering) to the current subtree.

    The log of this subtree is split according to the cut type, and one
    SubtreeInfrequent child is created per resulting sub-log, recomputing
    the DFG, the activity set and the start/end activities on each sub-log.

    Parameters
    ----------
    type_of_cut
        One of 'concurrent', 'sequential', 'parallel', 'loopCut'.
        Any other value leaves the subtree untouched (the plain variant
        has no fall-through branch here).
    cut
        Cut description; cut[1] holds the activity partition used to
        split the log.
    activity_key
        Attribute of the events to use as activity name.
    """
    # The four cut types share identical child-construction logic and
    # differ only in the splitting routine and the detected-cut label
    # (which equals the cut type string) -- dispatch instead of
    # duplicating the ~20-line body four times.
    split_functions = {
        'concurrent': split.split_xor,
        'sequential': split.split_sequence,
        'parallel': split.split_parallel,
        'loopCut': split.split_loop,
    }
    if type_of_cut not in split_functions:
        return
    self.detected_cut = type_of_cut
    new_logs = split_functions[type_of_cut](cut[1], self.log, activity_key)
    for l in new_logs:
        # recompute DFG and endpoint statistics on the sub-log
        new_dfg = [(k, v) for k, v in dfg_inst.apply(
            l, parameters=self.parameters).items() if v > 0]
        activities = attributes_filter.get_attribute_values(
            l, activity_key)
        start_activities = list(
            start_activities_filter.get_start_activities(
                l, parameters=self.parameters).keys())
        end_activities = list(
            end_activities_filter.get_end_activities(
                l, parameters=self.parameters).keys())
        self.children.append(
            SubtreeInfrequent(
                l,
                new_dfg,
                self.master_dfg,
                self.initial_dfg,
                activities,
                self.counts,
                self.rec_depth + 1,
                self.f,
                noise_threshold=self.noise_threshold,
                start_activities=start_activities,
                end_activities=end_activities,
                initial_start_activities=self.initial_start_activities,
                initial_end_activities=self.initial_end_activities,
                parameters=self.parameters))
def slice_dist_suc(log_1, log_2, unit):
    """
    Compute a frequency-weighted distance between two logs based on the
    cosine distance of DFG (directly-follows) activity-occurrence vectors
    of their variant slices.

    Both logs are sliced into variant sublogs (top `unit` fraction per
    slice, via filter_subsets.logslice_percent). Each variant of the
    larger slice list is matched against its closest variant in the
    smaller list; unmatched variants of the smaller list are then matched
    back against the larger one, and the matched distances are averaged
    weighted by variant-frequency products.

    Parameters
    ----------
    log_1, log_2
        Event logs (project type; consumed by logslice_percent / native.apply)
    unit
        Slice percentage passed to logslice_percent -- presumably in (0, 1];
        TODO confirm against logslice_percent's contract

    Returns
    -------
    dist
        Weighted average cosine distance; 0 when both variant lists are
        identical (a warning is printed in that case).
    """
    (log1_list, freq1_list) = filter_subsets.logslice_percent(log_1, unit)
    (log2_list, freq2_list) = filter_subsets.logslice_percent(log_2, unit)
    # Orient the comparison so the outer loop runs over the longer
    # variant list ("max") and the inner over the shorter one ("min").
    if len(freq1_list) >= len(freq2_list):
        max_len = len(freq1_list)
        min_len = len(freq2_list)
        max_log = log1_list
        min_log = log2_list
        var_count_max = freq1_list
        var_count_min = freq2_list
    else:
        max_len = len(freq2_list)
        min_len = len(freq1_list)
        max_log = log2_list
        min_log = log1_list
        var_count_max = freq2_list
        var_count_min = freq1_list
    dist_matrix = np.zeros((max_len, min_len))
    max_per_var = np.zeros(max_len)
    max_freq = np.zeros(max_len)
    min_freq = np.zeros(min_len)
    min_per_var = np.zeros(min_len)
    # indices of min-side variants not yet covered by a (near-)exact match
    index_rec = set(list(range(min_len)))
    if log1_list == log2_list:
        # identical variant lists: the measure is not meaningful here
        print("Please give different variant lists!")
        dist = 0
    else:
        for i in range(max_len):
            dist_vec = np.zeros(min_len)
            # DFG of the i-th max-side variant, turned into an
            # activity-pair occurrence table (columns: var, freq)
            dfg1 = native.apply(max_log[i])
            df1_dfg = act_dist_calc.occu_var_act(dfg1)
            for j in range(min_len):
                dfg2 = native.apply(min_log[j])
                df2_dfg = act_dist_calc.occu_var_act(dfg2)
                # outer-join on the DFG edge key so both frequency vectors
                # cover the union of edges; missing edges count as 0
                df_dfg = pd.merge(df1_dfg, df2_dfg, how='outer',
                                  on='var').fillna(0)
                dist_vec[j] = pdist(
                    np.array(
                        [df_dfg['freq_x'].values,
                         df_dfg['freq_y'].values]),
                    'cosine')[0]
                dist_matrix[i][j] = dist_vec[j]
                if j == (min_len - 1):
                    # whole row computed: pick the closest min-side variant
                    max_loc_col = np.argmin(dist_vec)
                    if abs(dist_vec[max_loc_col]) <= 1e-8:
                        # (near-)exact match: mark the min-side variant as
                        # covered and double its weight -- presumably so it
                        # is not re-counted in the back-matching pass below;
                        # TODO confirm intent of the *2 factors
                        index_rec.discard(max_loc_col)
                        max_freq[i] = var_count_max[i] * var_count_min[
                            max_loc_col] * 2
                        max_per_var[
                            i] = dist_vec[max_loc_col] * max_freq[i] * 2
                    else:
                        # weight the best distance by the product of the
                        # two variants' frequencies
                        max_freq[
                            i] = var_count_max[i] * var_count_min[max_loc_col]
                        max_per_var[i] = dist_vec[max_loc_col] * max_freq[i]
        if (len(index_rec) != 0):
            # back-matching pass: min-side variants that never achieved a
            # near-exact match are matched to their closest max-side variant
            for i in list(index_rec):
                min_loc_row = np.argmin(dist_matrix[:, i])
                min_freq[i] = var_count_max[min_loc_row] * var_count_min[i]
                min_per_var[i] = dist_matrix[min_loc_row, i] * min_freq[i]
        # frequency-weighted average over both matching directions
        dist = (np.sum(max_per_var) + np.sum(min_per_var)) / (np.sum(max_freq) +
                                                              np.sum(min_freq))
    return dist