def get_cycle_time(log: Union[EventLog, pd.DataFrame]) -> float:
    """
    Calculates the cycle time of the event log.

    The definition that has been followed is the one proposed in:
    https://www.presentationeze.com/presentations/lean-manufacturing-just-in-time/lean-manufacturing-just-in-time-full-details/process-cycle-time-analysis/calculate-cycle-time/#:~:text=Cycle%20time%20%3D%20Average%20time%20between,is%2024%20minutes%20on%20average.

    So:
    Cycle time = Average time between completion of units.

    Example taken from the website: consider a manufacturing facility which is producing 100 units of product per 40-hour week.
    The average throughput rate is 1 unit per 0.4 hours, which is one unit every 24 minutes.
    Therefore the cycle time is 24 minutes on average.

    Parameters
    -----------------
    log
        Log object

    Returns
    -----------------
    cycle_time
        Cycle time (calculated with the aforementioned formula).
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.traces.cycle_time.pandas import get as cycle_time
        return cycle_time.apply(log, parameters=get_properties(log))
    else:
        from pm4py.statistics.traces.cycle_time.log import get as cycle_time
        return cycle_time.apply(log, parameters=get_properties(log))
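# Usage sketch (illustrative, not part of the module): assumes the simplified interface
# is re-exported at the package root (as in pm4py) and that an XES log is available on
# disk; "example.xes" is a placeholder path.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   cycle_time = pm4py.get_cycle_time(log)
#   print(cycle_time)  # average time between completed cases, in the log's time unit (typically seconds)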
def get_rework_cases_per_activity(log: Union[EventLog, pd.DataFrame]) -> Dict[str, int]:
    """
    Finds the activities of the log for which rework occurs (i.e., the activity appears more
    than once within a trace). The output is a dictionary associating to each of these
    activities the number of cases in which the rework occurred.

    Parameters
    ------------------
    log
        Log object

    Returns
    ------------------
    rework_dictionary
        Dictionary associating to each of the aforementioned activities the number of cases
        for which the rework occurred.
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.rework.pandas import get as rework_get
        return rework_get.apply(log, parameters=get_properties(log))
    else:
        from pm4py.statistics.rework.log import get as rework_get
        return rework_get.apply(log, parameters=get_properties(log))
def get_variants_as_tuples(log: Union[EventLog, pd.DataFrame]) -> Dict[Tuple[str], List[Trace]]:
    """
    Gets the variants from the log (where the keys are tuples and not strings)

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    variants
        Dictionary of variants along with their count
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    import pm4py
    # the behavior of PM4Py is changed to allow this to work
    pm4py.util.variants_util.VARIANT_SPECIFICATION = pm4py.util.variants_util.VariantsSpecifications.LIST
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.variants.pandas import get
        return get.get_variants_count(log, parameters=get_properties(log))
    else:
        from pm4py.statistics.variants.log import get
        return get.get_variants(log, parameters=get_properties(log))
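# Usage sketch (illustrative): iterate over the variants keyed by their sequence of
# activity names. Per the annotations above, the value is a count for dataframes and a
# list of traces for EventLog objects. "example.xes" is a placeholder path.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   variants = pm4py.get_variants_as_tuples(log)
#   for variant, value in variants.items():
#       print(variant, value)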
def get_variants(log: Union[EventLog, pd.DataFrame]) -> Dict[str, List[Trace]]:
    """
    Gets the variants from the log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    variants
        Dictionary of variants along with their count
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    import pm4py
    if pm4py.util.variants_util.VARIANT_SPECIFICATION == pm4py.util.variants_util.VariantsSpecifications.STRING:
        import warnings
        warnings.warn('pm4py.get_variants is deprecated. Please use pm4py.get_variants_as_tuples instead.')
    if pm4py.util.variants_util.VARIANT_SPECIFICATION == pm4py.util.variants_util.VariantsSpecifications.LIST:
        raise Exception('Please use pm4py.get_variants_as_tuples')

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.variants.pandas import get
        return get.get_variants_count(log, parameters=get_properties(log))
    else:
        from pm4py.statistics.variants.log import get
        return get.get_variants(log, parameters=get_properties(log))
def get_all_case_durations(log: Union[EventLog, pd.DataFrame], business_hours: bool = False,
                           worktiming: List[int] = [7, 17], weekends: List[int] = [6, 7]) -> List[float]:
    """
    Gets the durations of the cases in the event log

    Parameters
    ---------------
    log
        Event log
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log are working (default: 7 to 17)
    weekends
        (If the business hours are enabled) The weekend days (default: Saturday (6), Sunday (7))

    Returns
    ---------------
    durations
        Case durations (as list)
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    properties = copy(get_properties(log))
    properties["business_hours"] = business_hours
    properties["worktiming"] = worktiming
    properties["weekends"] = weekends

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.traces.generic.pandas import case_statistics
        cd = case_statistics.get_cases_description(log, parameters=properties)
        return sorted([x["caseDuration"] for x in cd.values()])
    else:
        from pm4py.statistics.traces.generic.log import case_statistics
        return case_statistics.get_all_case_durations(log, parameters=properties)
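# Usage sketch (illustrative): case durations with the business-hours computation enabled;
# the worktiming/weekends values below simply restate the defaults. "example.xes" is a
# placeholder path.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   durations = pm4py.get_all_case_durations(log, business_hours=True,
#                                            worktiming=[7, 17], weekends=[6, 7])
#   print(min(durations), max(durations))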
def discover_organizational_roles(log: Union[EventLog, pd.DataFrame]):
    """
    Mines the organizational roles

    Parameters
    ---------------
    log
        Event log or Pandas dataframe

    Returns
    ---------------
    roles
        Organizational roles. List where each role is a sublist with two elements:
        - The first element of the sublist is the list of activities belonging to a role.
          Each activity belongs to a single role.
        - The second element of the sublist is a dictionary containing the resources of the role
          and the number of times they executed activities belonging to the role.
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    from pm4py.algo.organizational_mining.roles import algorithm as roles
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        return roles.apply(log, variant=roles.Variants.PANDAS, parameters=get_properties(log))
    else:
        return roles.apply(log, variant=roles.Variants.LOG, parameters=get_properties(log))
def filter_variants_top_k(log: Union[EventLog, pd.DataFrame], k: int) -> Union[EventLog, pd.DataFrame]:
    """
    Keeps the top-k variants of the log

    Parameters
    -------------
    log
        Event log
    k
        Number of variants that should be kept

    Returns
    -------------
    filtered_log
        Filtered log
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.variants import variants_filter
        return variants_filter.filter_variants_top_k(log, k, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
        return variants_filter.filter_variants_top_k(log, k, parameters=parameters)
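# Usage sketch (illustrative): keep only the 5 most frequent variants of a log previously
# loaded from a placeholder path.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   filtered_log = pm4py.filter_variants_top_k(log, 5)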
def filter_time_range(log: Union[EventLog, pd.DataFrame], dt1: str, dt2: str, mode="events") -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on a time interval

    Parameters
    ----------------
    log
        Log object
    dt1
        Left extreme of the interval
    dt2
        Right extreme of the interval
    mode
        Modality of filtering (events, traces_contained, traces_intersecting):
        events: any event that fits the time frame is retained
        traces_contained: any trace completely contained in the timeframe is retained
        traces_intersecting: any trace intersecting with the time-frame is retained

    Returns
    ----------------
    filtered_log
        Filtered log
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        from pm4py.algo.filtering.pandas.timestamp import timestamp_filter
        if mode == "events":
            return timestamp_filter.apply_events(log, dt1, dt2, parameters=get_properties(log))
        elif mode == "traces_contained":
            return timestamp_filter.filter_traces_contained(log, dt1, dt2, parameters=get_properties(log))
        elif mode == "traces_intersecting":
            return timestamp_filter.filter_traces_intersecting(log, dt1, dt2, parameters=get_properties(log))
        else:
            warnings.warn('mode provided: ' + mode + ' is not recognized; original log returned!')
            return log
    else:
        from pm4py.algo.filtering.log.timestamp import timestamp_filter
        if mode == "events":
            return timestamp_filter.apply_events(log, dt1, dt2, parameters=get_properties(log))
        elif mode == "traces_contained":
            return timestamp_filter.filter_traces_contained(log, dt1, dt2, parameters=get_properties(log))
        elif mode == "traces_intersecting":
            return timestamp_filter.filter_traces_intersecting(log, dt1, dt2, parameters=get_properties(log))
        else:
            warnings.warn('mode provided: ' + mode + ' is not recognized; original log returned!')
            return log
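# Usage sketch (illustrative): the three filtering modes applied to the same interval.
# The dates are placeholders, passed as "YYYY-mm-dd HH:MM:SS" strings.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   only_events = pm4py.filter_time_range(log, "2011-01-01 00:00:00", "2011-02-01 00:00:00", mode="events")
#   contained = pm4py.filter_time_range(log, "2011-01-01 00:00:00", "2011-02-01 00:00:00", mode="traces_contained")
#   intersecting = pm4py.filter_time_range(log, "2011-01-01 00:00:00", "2011-02-01 00:00:00", mode="traces_intersecting")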
def get_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str, count_once_per_case=False) -> Dict[str, int]:
    """
    Returns the values for a specified attribute

    Parameters
    ---------------
    log
        Log object
    attribute
        Attribute
    count_once_per_case
        If True, count at most one occurrence of a given attribute value per case
        (even if several events of the case share the same attribute value)

    Returns
    ---------------
    attribute_values
        Dictionary of values along with their count
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    parameters["keep_once_per_case"] = count_once_per_case
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get
        return get.get_attribute_values(log, attribute, parameters=parameters)
    else:
        from pm4py.statistics.attributes.log import get
        return get.get_attribute_values(log, attribute, parameters=parameters)
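# Usage sketch (illustrative): count, once per case, the values of the standard XES
# resource attribute "org:resource" (assumed to be present in the log).
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   resources = pm4py.get_event_attribute_values(log, "org:resource", count_once_per_case=True)
#   print(resources)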
def filter_variants_percentage(log: Union[EventLog, pd.DataFrame], threshold: float = 0.8) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on the percentage of variants

    Parameters
    ---------------
    log
        Event log
    threshold
        Percentage of admitted variants (expressed on a 0-1 scale; default 0.8)

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        raise Exception(
            "filtering variants percentage on Pandas dataframe is currently not available! please convert the dataframe to event log with the method: log = pm4py.convert_to_event_log(df)")
    else:
        from pm4py.algo.filtering.log.variants import variants_filter
        return variants_filter.filter_log_variants_percentage(log, percentage=threshold, parameters=get_properties(log))
def filter_directly_follows_relation(log: Union[EventLog, pd.DataFrame], relations: List[str], retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Retains (or removes, if retain=False) the traces that contain any of the specified
    'directly follows' relations.
    For example, if relations == [('a','b'),('a','c')] and the log is [<a,b,c>,<a,c,b>,<a,d,b>],
    the resulting log will contain the traces [<a,b,c>,<a,c,b>].

    Parameters
    ---------------
    log
        Log object
    relations
        List of activity name pairs, which are allowed/forbidden paths
    retain
        Parameter that says whether the paths should be kept/removed

    Returns
    ----------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    if check_is_pandas_dataframe(log):
        from pm4py.algo.filtering.pandas.paths import paths_filter
        parameters[paths_filter.Parameters.POSITIVE] = retain
        return paths_filter.apply(log, relations, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.paths import paths_filter
        parameters[paths_filter.Parameters.POSITIVE] = retain
        return paths_filter.apply(log, relations, parameters=parameters)
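# Usage sketch (illustrative): keep the traces containing at least one of two hypothetical
# directly-follows paths ("A" -> "B" or "A" -> "C"); the activity names are placeholders.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   filtered_log = pm4py.filter_directly_follows_relation(log, [("A", "B"), ("A", "C")], retain=True)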
def view_events_per_time_graph(log: Union[EventLog, pd.DataFrame], format: str = "png"):
    """
    Visualizes the events per time graph

    Parameters
    -----------------
    log
        Log object
    format
        Format of the visualization (png, svg, ...)
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get as attributes_get
        graph = attributes_get.get_kde_date_attribute(log, parameters=get_properties(log))
    else:
        from pm4py.statistics.attributes.log import get as attributes_get
        graph = attributes_get.get_kde_date_attribute(log, parameters=get_properties(log))

    from pm4py.visualization.graphs import visualizer as graphs_visualizer
    graph_vis = graphs_visualizer.apply(graph[0], graph[1], variant=graphs_visualizer.Variants.DATES,
                                        parameters={"format": format})
    graphs_visualizer.view(graph_vis)
def get_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str) -> Dict[str, int]:
    """
    Returns the values for a specified trace attribute

    Parameters
    ---------------
    log
        Log object
    attribute
        Attribute

    Returns
    ---------------
    attribute_values
        Dictionary of values along with their count
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get
        return get.get_attribute_values(log, attribute)
    else:
        from pm4py.statistics.attributes.log import get
        return get.get_trace_attribute_values(log, attribute)
def save_vis_events_per_time_graph(log: Union[EventLog, pd.DataFrame], file_path: str):
    """
    Saves the events per time graph in the specified path

    Parameters
    ----------------
    log
        Log object
    file_path
        Destination path
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get as attributes_get
        graph = attributes_get.get_kde_date_attribute(log, parameters=get_properties(log))
    else:
        from pm4py.statistics.attributes.log import get as attributes_get
        graph = attributes_get.get_kde_date_attribute(log, parameters=get_properties(log))

    format = os.path.splitext(file_path)[1][1:]

    from pm4py.visualization.graphs import visualizer as graphs_visualizer
    graph_vis = graphs_visualizer.apply(graph[0], graph[1], variant=graphs_visualizer.Variants.DATES,
                                        parameters={"format": format})
    graphs_visualizer.save(graph_vis, file_path)
def discover_handover_of_work_network(log: Union[EventLog, pd.DataFrame], beta=0):
    """
    Calculates the handover of work network of the event log.
    The handover of work network is essentially the DFG of the event log, however, using the
    resource as a node of the graph, instead of the activity.
    As such, to use this, resource information should be present in the event log.

    Parameters
    ---------------
    log
        Event log or Pandas dataframe
    beta
        beta parameter for the Handover metric

    Returns
    ---------------
    metric_values
        Values of the metric
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    from pm4py.algo.organizational_mining.sna import algorithm as sna
    parameters = get_properties(log)
    parameters["beta"] = beta
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        return sna.apply(log, variant=sna.Variants.HANDOVER_PANDAS, parameters=parameters)
    else:
        return sna.apply(log, variant=sna.Variants.HANDOVER_LOG, parameters=parameters)
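# Usage sketch (illustrative): compute the handover-of-work values on a log that is
# assumed to carry resource information (org:resource). "example.xes" is a placeholder.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   hw_values = pm4py.discover_handover_of_work_network(log, beta=0)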
def filter_activities_rework(log: Union[EventLog, pd.DataFrame], activity: str, min_occurrences: int = 2) -> Union[EventLog, pd.DataFrame]:
    """
    Filters the event log, keeping the cases where the specified activity occurs at least
    min_occurrences times.

    Parameters
    -----------------
    log
        Event log / Pandas dataframe
    activity
        Activity
    min_occurrences
        Minimum desired number of occurrences

    Returns
    -----------------
    filtered_log
        Log with cases having at least min_occurrences occurrences of the given activity
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    parameters["min_occurrences"] = min_occurrences
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.rework import rework_filter
        return rework_filter.apply(log, activity, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.rework import rework_filter
        return rework_filter.apply(log, activity, parameters=parameters)
def discover_subcontracting_network(log: Union[EventLog, pd.DataFrame], n=2):
    """
    Calculates the subcontracting network of the process.

    Parameters
    ---------------
    log
        Event log or Pandas dataframe
    n
        n parameter for the Subcontracting metric

    Returns
    ---------------
    metric_values
        Values of the metric
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    from pm4py.algo.organizational_mining.sna import algorithm as sna
    parameters = get_properties(log)
    parameters["n"] = n
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        return sna.apply(log, variant=sna.Variants.SUBCONTRACTING_PANDAS, parameters=parameters)
    else:
        return sna.apply(log, variant=sna.Variants.SUBCONTRACTING_LOG, parameters=parameters)
def insert_artificial_start_end(log: Union[EventLog, pd.DataFrame]) -> Union[EventLog, pd.DataFrame]:
    """
    Inserts the artificial start/end activities in an event log / Pandas dataframe

    Parameters
    ------------------
    log
        Event log / Pandas dataframe

    Returns
    ------------------
    log
        Event log / Pandas dataframe with artificial start / end activities
    """
    properties = get_properties(log)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.objects.log.util import dataframe_utils
        return dataframe_utils.insert_artificial_start_end(log, parameters=properties)
    else:
        from pm4py.objects.log.util import artificial
        return artificial.insert_artificial_start_end(log, parameters=properties)
def filter_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]],
                                  level: str = "case", retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log object on the values of some event attribute

    Parameters
    --------------
    log
        Log object
    attribute_key
        Attribute to filter
    values
        Admitted (or forbidden) values
    level
        Specifies how the filter should be applied ('case' keeps the cases where at least one
        matching event occurs, 'event' filters the events, possibly trimming the cases)
    retain
        Specifies whether the matching values should be kept or removed

    Returns
    --------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        if level == "event":
            parameters[attributes_filter.Parameters.POSITIVE] = retain
            return attributes_filter.apply_events(log, values, parameters=parameters)
        elif level == "case":
            parameters[attributes_filter.Parameters.POSITIVE] = retain
            return attributes_filter.apply(log, values, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter
        if level == "event":
            parameters[attributes_filter.Parameters.POSITIVE] = retain
            return attributes_filter.apply_events(log, values, parameters=parameters)
        elif level == "case":
            parameters[attributes_filter.Parameters.POSITIVE] = retain
            return attributes_filter.apply(log, values, parameters=parameters)
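# Usage sketch (illustrative): the attribute filter at case level (keep the cases with at
# least one matching event) and at event level (keep only the matching events).
# "org:resource" and the value "Sara" are placeholders.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   cases_of_res = pm4py.filter_event_attribute_values(log, "org:resource", {"Sara"}, level="case", retain=True)
#   events_of_res = pm4py.filter_event_attribute_values(log, "org:resource", {"Sara"}, level="event", retain=True)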
def project_on_event_attribute(log: Union[EventLog, pd.DataFrame], attribute_key=xes_constants.DEFAULT_NAME_KEY) -> List[List[str]]:
    """
    Project the event log on a specified event attribute. The result is a list containing a list
    for each case: every case is transformed into the list of its values for the specified attribute.

    Parameters
    --------------------
    log
        Event log / Pandas dataframe
    attribute_key
        The attribute to be used

    Returns
    --------------------
    projected_cases
        Projection on the given attribute (a list containing, for each case, a list of its values
        for the specified attribute).

        Example:

        pm4py.project_on_event_attribute(log, "concept:name")

        [['register request', 'examine casually', 'check ticket', 'decide', 'reinitiate request', 'examine thoroughly', 'check ticket', 'decide', 'pay compensation'],
        ['register request', 'check ticket', 'examine casually', 'decide', 'pay compensation'],
        ['register request', 'examine thoroughly', 'check ticket', 'decide', 'reject request'],
        ['register request', 'examine casually', 'check ticket', 'decide', 'pay compensation'],
        ['register request', 'examine casually', 'check ticket', 'decide', 'reinitiate request', 'check ticket', 'examine casually', 'decide', 'reinitiate request', 'examine casually', 'check ticket', 'decide', 'reject request'],
        ['register request', 'check ticket', 'examine thoroughly', 'decide', 'reject request']]
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    output = []
    if pandas_utils.check_is_pandas_dataframe(log):
        pandas_utils.check_pandas_dataframe_columns(log)
        from pm4py.streaming.conversion import from_pandas
        # the dataframe is streamed trace-by-trace; the chosen attribute is mapped onto the
        # default activity key (concept:name) of the streamed events
        it = from_pandas.apply(log, parameters={from_pandas.Parameters.ACTIVITY_KEY: attribute_key})
        for trace in it:
            output.append([x[xes_constants.DEFAULT_NAME_KEY] if attribute_key is not None else None for x in trace])
    else:
        for trace in log:
            output.append([x[attribute_key] if attribute_key is not None else None for x in trace])
    return output
def discover_performance_dfg(log: Union[EventLog, pd.DataFrame], business_hours: bool = False,
                             worktiming: List[int] = [7, 17], weekends: List[int] = [6, 7],
                             workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR) -> Tuple[dict, dict, dict]:
    """
    Discovers a performance directly-follows graph from an event log

    Parameters
    ---------------
    log
        Event log
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log are working (default: 7 to 17)
    weekends
        (If the business hours are enabled) The weekend days (default: Saturday (6), Sunday (7))
    workcalendar
        (If the business hours are enabled) The work calendar to be used (default: constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)

    Returns
    ---------------
    performance_dfg
        Performance DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_glue=case_id_key,
                            measure="performance", perf_aggregation_key="all",
                            business_hours=business_hours, worktiming=worktiming, weekends=weekends,
                            workcalendar=workcalendar)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg.variants import performance as dfg_discovery
        properties = get_properties(log)
        properties[dfg_discovery.Parameters.AGGREGATION_MEASURE] = "all"
        properties[dfg_discovery.Parameters.BUSINESS_HOURS] = business_hours
        properties[dfg_discovery.Parameters.WORKTIMING] = worktiming
        properties[dfg_discovery.Parameters.WEEKENDS] = weekends
        dfg = dfg_discovery.apply(log, parameters=properties)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    return dfg, start_activities, end_activities
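# Usage sketch (illustrative): discover the performance DFG and visualize it. The call to
# pm4py.view_performance_dfg is an assumption about the rest of the simplified interface;
# "example.xes" is a placeholder path.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   perf_dfg, start_acts, end_acts = pm4py.discover_performance_dfg(log)
#   pm4py.view_performance_dfg(perf_dfg, start_acts, end_acts)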
def filter_between(log: Union[EventLog, pd.DataFrame], act1: str, act2: str) -> Union[EventLog, pd.DataFrame]:
    """
    Finds all the sub-cases leading from an event with activity "act1" to an event with activity "act2"
    in the log, and returns a log containing only them.

    Example:

    Log
        A B C D E F
        A B E F C
        A B F C B C B E F C
    act1 = B
    act2 = C

    Returned sub-cases:
        B C (from the first case)
        B E F C (from the second case)
        B F C (from the third case)
        B C (from the third case)
        B E F C (from the third case)

    Parameters
    -----------------
    log
        Event log / Pandas dataframe
    act1
        Source activity
    act2
        Target activity

    Returns
    -----------------
    filtered_log
        Log containing all the subcases
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.between import between_filter
        return between_filter.apply(log, act1, act2, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.between import between_filter
        return between_filter.apply(log, act1, act2, parameters=parameters)
def filter_suffixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=True, first_or_last="first"):
    """
    Filters the log, keeping the suffixes from a given activity. E.g., for a log with traces:

    A,B,C,D
    A,B,Z,A,B,C,D
    A,B,C,D,C,E,C,F

    The suffixes from "C" are respectively:

    D
    D
    D,C,E,C,F

    Parameters
    ------------------
    log
        Event log / Pandas dataframe
    activity
        Target activity of the filter
    strict
        Applies the filter strictly (cuts the occurrences of the selected activity).
    first_or_last
        Decides if the first or last occurrence of an activity should be selected as baseline for the filter.

    Returns
    ------------------
    filtered_log
        Filtered log / dataframe
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    parameters["strict"] = strict
    parameters["first_or_last"] = first_or_last
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.suffixes import suffix_filter
        return suffix_filter.apply(log, activity, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.suffixes import suffix_filter
        return suffix_filter.apply(log, activity, parameters=parameters)
def __builds_events_distribution_graph(log: Union[EventLog, pd.DataFrame], distr_type: str = "days_week"):
    """
    Internal method to build the events distribution graph
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if distr_type == "days_month":
        title = "Distribution of the Events over the Days of a Month"
        x_axis = "Day of month"
        y_axis = "Number of Events"
    elif distr_type == "months":
        title = "Distribution of the Events over the Months"
        x_axis = "Month"
        y_axis = "Number of Events"
    elif distr_type == "years":
        title = "Distribution of the Events over the Years"
        x_axis = "Year"
        y_axis = "Number of Events"
    elif distr_type == "hours":
        title = "Distribution of the Events over the Hours"
        x_axis = "Hour (of day)"
        y_axis = "Number of Events"
    elif distr_type == "days_week":
        title = "Distribution of the Events over the Days of a Week"
        x_axis = "Day of the Week"
        y_axis = "Number of Events"
    elif distr_type == "weeks":
        title = "Distribution of the Events over the Weeks of a Year"
        x_axis = "Week of the Year"
        y_axis = "Number of Events"
    else:
        raise Exception("unsupported distribution specified.")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get as attributes_get
        x, y = attributes_get.get_events_distribution(log, distr_type=distr_type, parameters=get_properties(log))
    else:
        from pm4py.statistics.attributes.log import get as attributes_get
        x, y = attributes_get.get_events_distribution(log, distr_type=distr_type, parameters=get_properties(log))

    return title, x_axis, y_axis, x, y
def filter_log_relative_occurrence_event_attribute(log: Union[EventLog, pd.DataFrame], min_relative_stake: float,
                                                   attribute_key: str = xes_constants.DEFAULT_NAME_KEY,
                                                   level="cases") -> Union[EventLog, pd.DataFrame]:
    """
    Filters the event log, keeping only the events having an attribute value which occurs:
    - in at least the specified (min_relative_stake) percentage of events, when level="events"
    - in at least the specified (min_relative_stake) percentage of cases, when level="cases"

    Parameters
    -------------------
    log
        Event log / Pandas dataframe
    min_relative_stake
        Minimum percentage of cases (or events, depending on the level) in which the attribute value
        should occur, expressed as a number between 0 and 1
    attribute_key
        The attribute to filter
    level
        The level of the filter (if level="events", then events / if level="cases", then cases)

    Returns
    ------------------
    filtered_log
        Filtered event log
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        parameters[attributes_filter.Parameters.ATTRIBUTE_KEY] = attribute_key
        parameters[attributes_filter.Parameters.KEEP_ONCE_PER_CASE] = True if level == "cases" else False
        return attributes_filter.filter_df_relative_occurrence_event_attribute(log, min_relative_stake, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter
        parameters[attributes_filter.Parameters.ATTRIBUTE_KEY] = attribute_key
        parameters[attributes_filter.Parameters.KEEP_ONCE_PER_CASE] = True if level == "cases" else False
        return attributes_filter.filter_log_relative_occurrence_event_attribute(log, min_relative_stake, parameters=parameters)
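# Usage sketch (illustrative): keep only the events whose resource appears in at least 5%
# of the cases; "org:resource" is the standard XES resource attribute and is assumed to be
# present in the log.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   filtered_log = pm4py.filter_log_relative_occurrence_event_attribute(
#       log, 0.05, attribute_key="org:resource", level="cases")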
def filter_paths_performance(log: Union[EventLog, pd.DataFrame], path: Tuple[str, str], min_performance: float,
                             max_performance: float, keep=True) -> Union[EventLog, pd.DataFrame]:
    """
    Filters the event log, either:
    - (keep=True) keeping the cases having the specified path (tuple of 2 activities) with a duration
      included between min_performance and max_performance
    - (keep=False) discarding the cases having the specified path with a duration included between
      min_performance and max_performance

    Parameters
    ----------------
    log
        Event log
    path
        Tuple of two activities (source_activity, target_activity)
    min_performance
        Minimum allowed performance (of the path)
    max_performance
        Maximum allowed performance (of the path)
    keep
        Keep/discard the cases having the specified path with a duration included between
        min_performance and max_performance

    Returns
    ----------------
    filtered_log
        Filtered log with the desired behavior
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    parameters["positive"] = keep
    parameters["min_performance"] = min_performance
    parameters["max_performance"] = max_performance
    path = tuple(path)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.paths import paths_filter
        return paths_filter.apply_performance(log, path, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.paths import paths_filter
        return paths_filter.apply_performance(log, path, parameters=parameters)
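# Usage sketch (illustrative): keep the cases in which the hypothetical path
# "request sent" -> "request accepted" took between 2 and 10 days (performance values
# are expressed here in seconds).
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   filtered_log = pm4py.filter_paths_performance(
#       log, ("request sent", "request accepted"), 2 * 86400, 10 * 86400, keep=True)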
def discover_dfg(log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a DFG from a log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_glue=case_id_key)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        dfg = dfg_discovery.apply(log, parameters=get_properties(log))
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=get_properties(log))
        end_activities = end_activities_module.get_end_activities(log, parameters=get_properties(log))
    return dfg, start_activities, end_activities
def filter_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]],
                                  retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter a log on the values of a trace attribute

    Parameters
    --------------
    log
        Event log
    attribute_key
        Attribute to filter
    values
        Values to filter (list of)
    retain
        Boolean value (keep/discard matching traces)

    Returns
    --------------
    filtered_log
        Filtered event log
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = attribute_key
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.attributes import attributes_filter
        parameters[attributes_filter.Parameters.POSITIVE] = retain
        return attributes_filter.apply(log, values, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.attributes import attributes_filter
        parameters[attributes_filter.Parameters.POSITIVE] = retain
        return attributes_filter.apply_trace_attribute(log, values, parameters=parameters)
def get_activity_position_summary(log: Union[EventLog, pd.DataFrame], activity: str) -> Dict[int, int]:
    """
    Given an event log, returns a dictionary that summarizes the positions of the given activity
    in the different cases of the event log.
    E.g., if an activity happens 1000 times in position 1 (the second event of a case), and 500 times
    in position 2 (the third event of a case), then the returned dictionary would be: {1: 1000, 2: 500}

    Parameters
    -----------------
    log
        Event log object / Pandas dataframe
    activity
        Activity to consider

    Returns
    -----------------
    pos_dict_summary
        Summary of the positions of the activity in the trace (e.g. {1: 1000, 2: 500})
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    properties = get_properties(log)
    activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
    case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME

    if check_is_pandas_dataframe(log):
        log = insert_ev_in_tr_index(log, case_id_key, "@@index_in_trace")
        ret = log[log[activity_key] == activity]["@@index_in_trace"].value_counts().to_dict()
        return ret
    else:
        ret = Counter()
        for trace in log:
            for i in range(len(trace)):
                this_act = trace[i][activity_key]
                if this_act == activity:
                    ret[i] += 1
        return dict(ret)
def filter_end_activities(log: Union[EventLog, pd.DataFrame], activities: Union[Set[str], List[str]],
                          retain: bool = True) -> Union[EventLog, pd.DataFrame]:
    """
    Filter cases having an end activity in the provided list

    Parameters
    ---------------
    log
        Log object
    activities
        List of admitted end activities
    retain
        If True, retain the traces ending with one of the given activities; if False, drop them

    Returns
    ---------------
    filtered_log
        Filtered log object
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    parameters = get_properties(log)
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.algo.filtering.pandas.end_activities import end_activities_filter
        parameters[end_activities_filter.Parameters.POSITIVE] = retain
        return end_activities_filter.apply(log, activities, parameters=parameters)
    else:
        from pm4py.algo.filtering.log.end_activities import end_activities_filter
        parameters[end_activities_filter.Parameters.POSITIVE] = retain
        return end_activities_filter.apply(log, activities, parameters=parameters)
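# Usage sketch (illustrative): keep only the cases ending with a hypothetical "payment"
# activity; the activity name and the log path are placeholders.
#
#   import pm4py
#   log = pm4py.read_xes("example.xes")
#   filtered_log = pm4py.filter_end_activities(log, {"payment"}, retain=True)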