def apply(df, min_freq=0):
    if min_freq > 0:
        # one perspective per column that is not an event attribute
        persps = [x for x in df.columns if not x.startswith("event_")]
        collation = []
        for persp in persps:
            red_df = df.dropna(subset=[persp])
            # NOTE: the original seeded prevlen with len(df); len(red_df) is used here
            # so the fixpoint check compares against the frame actually being filtered
            prevlen = len(red_df)
            while True:
                dfg = df_statistics.get_dfg_graph(red_df, activity_key="event_activity",
                                                  timestamp_key="event_timestamp",
                                                  case_id_glue=persp)
                dfg = [x for x in dfg if dfg[x] >= min_freq]
                param = {}
                param[PARAMETER_CONSTANT_CASEID_KEY] = persp
                param[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = "event_activity"
                red_df = filter_paths(red_df, dfg, parameters=param)
                thislen = len(red_df)
                dfg = df_statistics.get_dfg_graph(red_df, activity_key="event_activity",
                                                  timestamp_key="event_timestamp",
                                                  case_id_glue=persp)
                # stop when no edges remain, all edges satisfy the threshold,
                # or the filtering reached a fixpoint
                if len(dfg) == 0 or min(dfg.values()) >= min_freq or prevlen == thislen:
                    collation.append(red_df)
                    break
                prevlen = thislen
        return pd.concat(collation)
    return df
def execute_script():
    try:
        from pm4py.objects.log.importer.parquet import factory as parquet_importer
        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
        log_path = os.path.join("..", "tests", "input_data", log_name)
        time1 = time.time()
        dataframe = parquet_importer.apply(log_path, parameters={"columns": allowed_columns})
        time2 = time.time()
        print(dataframe.columns)
        print("time elapsed importing " + log_name + " on columns " + str(allowed_columns) + ": ",
              (time2 - time1))
        dfg1 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
        time3 = time.time()
        print("time elapsed calculating the DFG on columns " + str(allowed_columns) + ": ",
              (time3 - time2))
        del dataframe
        time4 = time.time()
        dataframe = parquet_importer.apply(log_path)
        print(dataframe.columns)
        time5 = time.time()
        print("time elapsed importing " + log_name + " (all columns): ", (time5 - time4))
        dfg2 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
        time6 = time.time()
        print("time elapsed calculating the DFG on all columns: ", (time6 - time5))
    except Exception:
        traceback.print_exc()
def apply(df, activity, parameters=None):
    """
    Gets the time passed from each preceding activity

    Parameters
    -------------
    df
        Dataframe
    activity
        Activity that we are considering
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    dictio
        Dictionary containing a 'pre' key with the list of aggregated times
        from each preceding activity to the given activity
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    dfg_frequency, dfg_performance = pandas.get_dfg_graph(df, measure="both",
                                                          activity_key=activity_key,
                                                          case_id_glue=case_id_glue,
                                                          timestamp_key=timestamp_key,
                                                          start_timestamp_key=start_timestamp_key)
    pre = []
    sum_perf_pre = 0.0
    sum_acti_pre = 0.0
    for entry in dfg_performance.keys():
        if entry[1] == activity:
            pre.append([entry[0], float(dfg_performance[entry]), int(dfg_frequency[entry])])
            sum_perf_pre += float(dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_pre += float(dfg_frequency[entry])
    perf_acti_pre = 0.0
    if sum_acti_pre > 0:
        perf_acti_pre = sum_perf_pre / sum_acti_pre
    return {"pre": pre, "pre_avg_perf": perf_acti_pre}
def apply(log, parameters=None, variant=DEFAULT_VARIANT):
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        # on dataframes, compute the DFG directly and apply the DFG-based variant
        dfg = df_statistics.get_dfg_graph(
            log,
            case_id_glue=parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
            activity_key=parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        return VERSIONS_DFG[variant](dfg, parameters=parameters)
    return VERSIONS[variant](log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG),
                             parameters)
def transient_analysis_from_dataframe(df, delay, parameters=None):
    """
    Gets the transient analysis from a dataframe and a delay

    Parameters
    -------------
    df
        Pandas dataframe
    delay
        Time delay
    parameters
        Parameters of the algorithm

    Returns
    -------------
    transient_result
        Transient analysis result
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    case_id_glue = parameters[
        constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    log = log_conv_factory.apply(df, variant=log_conv_factory.DF_TO_EVENT_LOG_1V,
                                 parameters=parameters)
    # gets the simple Petri net through the simple miner
    net, im, fm = simple_factory.apply(log, parameters=parameters, classic_output=True)
    activities_count = dict(df.groupby(activity_key).size())
    dfg_performance = df_statistics.get_dfg_graph(df, measure="performance",
                                                  perf_aggregation_key="mean",
                                                  case_id_glue=case_id_glue,
                                                  activity_key=activity_key,
                                                  timestamp_key=timestamp_key)
    spaths = get_shortest_paths(net)
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(net, dfg_performance, spaths,
                                                                      activities_count,
                                                                      variant="performance")
    # gets the stochastic map out of the dataframe and the Petri net
    s_map = smap_builder.get_map_exponential_from_aggstatistics(aggregated_statistics,
                                                                parameters=parameters)
    return transient_analysis_from_petri_net_and_smap(net, im, s_map, delay, parameters=parameters)
def calculate_process_schema_from_df(dataframe, path_frequency, path_performance):
    activities_count = attributes_filter.get_attribute_values(dataframe, attribute_key=ACTIVITY_KEY)
    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(dataframe, measure="both",
                                                                 perf_aggregation_key="median",
                                                                 case_id_glue=CASEID_GLUE,
                                                                 activity_key=ACTIVITY_KEY,
                                                                 timestamp_key=TIMEST_KEY,
                                                                 sort_caseid_required=False)
    net, initial_marking, final_marking = inductive_factory.apply_dfg(dfg_frequency)
    spaths = vis_trans_shortest_paths.get_shortest_paths(net)
    # frequency-decorated visualization
    aggregated_statistics = vis_trans_shortest_paths.get_decorations_from_dfg_spaths_acticount(
        net, dfg_frequency, spaths, activities_count, variant="frequency")
    parameters_viz = {"format": "svg"}
    gviz = pn_vis_factory.apply(net, initial_marking, final_marking, variant="frequency",
                                aggregated_statistics=aggregated_statistics,
                                parameters=parameters_viz)
    pn_vis_factory.save(gviz, path_frequency)
    # performance-decorated visualization
    aggregated_statistics = vis_trans_shortest_paths.get_decorations_from_dfg_spaths_acticount(
        net, dfg_performance, spaths, activities_count, variant="performance")
    parameters_viz = {"format": "svg"}
    gviz = pn_vis_factory.apply(net, initial_marking, final_marking, variant="performance",
                                aggregated_statistics=aggregated_statistics,
                                parameters=parameters_viz)
    pn_vis_factory.save(gviz, path_performance)
def discover_dfg(log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a DFG from a log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if check_is_dataframe(log):
        check_dataframe_columns(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        dfg = get_dfg_graph(log)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log)
        end_activities = end_activities_module.get_end_activities(log)
    else:
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        dfg = dfg_discovery.apply(log)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log)
        end_activities = end_activities_module.get_end_activities(log)
    return dfg, start_activities, end_activities
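# --- Usage sketch (added): a minimal, hypothetical invocation of the simplified
# discover_dfg interface above. The CSV path and the column names passed to
# format_dataframe are illustrative assumptions, not part of the original code.
import pandas as pd
import pm4py

df_example = pd.read_csv("running-example.csv")
df_example = pm4py.format_dataframe(df_example, case_id="case:concept:name",
                                    activity_key="concept:name",
                                    timestamp_key="time:timestamp")
dfg, start_activities, end_activities = pm4py.discover_dfg(df_example)
print(len(dfg), start_activities, end_activities)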
def discover_abstraction_dataframe(df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Tuple[
        Any, Any, Any, Any, Any, Any, Any]:
    """
    Discovers an abstraction from a dataframe that is useful for the Heuristics Miner ++ algorithm

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.ACTIVITY_KEY
        - Parameters.START_TIMESTAMP_KEY
        - Parameters.TIMESTAMP_KEY
        - Parameters.CASE_ID_KEY

    Returns
    --------------
    start_activities
        Start activities
    end_activities
        End activities
    activities_occurrences
        Activities along with their number of occurrences
    dfg
        Directly-follows graph
    performance_dfg
        (Performance) Directly-follows graph
    sojourn_time
        Sojourn time for each activity
    concurrent_activities
        Concurrent activities
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    if start_timestamp_key is None:
        start_timestamp_key = xes.DEFAULT_START_TIMESTAMP_KEY
        parameters = copy(parameters)
        parameters[Parameters.START_TIMESTAMP_KEY] = start_timestamp_key
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    start_activities = pd_sa.get_start_activities(df, parameters=parameters)
    end_activities = pd_ea.get_end_activities(df, parameters=parameters)
    activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
    efg_parameters = copy(parameters)
    efg_parameters[pd_efg.Parameters.KEEP_FIRST_FOLLOWING] = True
    dfg = pd_efg.apply(df, parameters=efg_parameters)
    performance_dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                  activity_key=activity_key,
                                                  timestamp_key=timestamp_key,
                                                  start_timestamp_key=start_timestamp_key,
                                                  measure="performance")
    sojourn_time = pd_soj_time.apply(df, parameters=parameters)
    concurrent_activities = pd_conc_act.apply(df, parameters=parameters)
    return (start_activities, end_activities, activities_occurrences, dfg, performance_dfg,
            sojourn_time, concurrent_activities)
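# --- Usage sketch (added): unpacking the seven components returned by the
# abstraction above, in the order documented in its docstring. `df` is assumed
# to be an interval-style dataframe with both start and complete timestamps;
# the variable names are illustrative.
(start_acts, end_acts, act_occurrences, dfg_abstr, perf_dfg_abstr,
 sojourn, concurrent) = discover_abstraction_dataframe(df)
print(sorted(concurrent))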
def apply(df, parameters=None):
    """
    Discovers a footprints object from a dataframe
    (the footprints of the dataframe are returned)

    Parameters
    --------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    --------------
    footprints_obj
        Footprints object
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY)
    caseid_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
    # NOTE: the original fetched this value with Parameters.TIMESTAMP_KEY, a copy-paste bug
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY)
    sort_required = exec_utils.get_param_value(Parameters.SORT_REQUIRED, parameters, DEFAULT_SORT_REQUIRED)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, DEFAULT_INDEX_KEY)
    # keep the start timestamp column, when provided, so that sorting on it works
    columns = [caseid_key, activity_key, timestamp_key]
    if start_timestamp_key is not None:
        columns.append(start_timestamp_key)
    df = df[columns]
    if sort_required:
        df = pandas_utils.insert_index(df, index_key)
        if start_timestamp_key is not None:
            df = df.sort_values([caseid_key, start_timestamp_key, timestamp_key, index_key])
        else:
            df = df.sort_values([caseid_key, timestamp_key, index_key])
    grouped_df = df.groupby(caseid_key)
    dfg = df_statistics.get_dfg_graph(df, measure="frequency", activity_key=activity_key,
                                      case_id_glue=caseid_key, timestamp_key=timestamp_key,
                                      sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False,
                                      start_timestamp_key=start_timestamp_key)
    activities = set(df[activity_key].unique())
    start_activities = set(grouped_df.first()[activity_key].unique())
    end_activities = set(grouped_df.last()[activity_key].unique())
    parallel = {(x, y) for (x, y) in dfg if (y, x) in dfg}
    sequence = set(causal_discovery.apply(dfg, causal_discovery.Variants.CAUSAL_ALPHA))
    ret = {}
    ret[Outputs.DFG.value] = dfg
    ret[Outputs.SEQUENCE.value] = sequence
    ret[Outputs.PARALLEL.value] = parallel
    ret[Outputs.ACTIVITIES.value] = activities
    ret[Outputs.START_ACTIVITIES.value] = start_activities
    ret[Outputs.END_ACTIVITIES.value] = end_activities
    ret[Outputs.MIN_TRACE_LENGTH.value] = int(grouped_df.size().min())
    return ret
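# --- Usage sketch (added): invoking the footprints discovery through pm4py's
# dispatcher on a dataframe; the import path mirrors pm4py's layout, and the
# printed keys correspond to the Outputs enum values used above ("dfg",
# "sequence", "parallel"). `df` is assumed to be an event-log dataframe.
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery

fp = footprints_discovery.apply(df)
print(fp["sequence"])  # causal (sequence) relations found by causal discovery
print(fp["parallel"])  # pairs occurring in both directions in the DFG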
def apply(log, parameters=None, variant=DFG_NATIVE):
    """
    Calculates DFG graph (frequency or performance) starting from a log

    Parameters
    ----------
    log
        Log
    parameters
        Possible parameters passed to the algorithms:
        aggregationMeasure -> performance aggregation measure (min, max, mean, median)
        activity_key -> Attribute to use as activity
        timestamp_key -> Attribute to use as timestamp
    variant
        Variant of the algorithm to use, possible values:
        native, frequency, performance, frequency_greedy, performance_greedy

    Returns
    -------
    dfg
        DFG graph
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = pmutil.constants.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        log = csv_import_adapter.convert_timestamp_columns_in_df(
            log, timest_columns=[parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY]])
        dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
            log,
            measure="both",
            activity_key=parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY],
            case_id_glue=parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY])
        if 'native' in variant or 'frequency' in variant:
            return dfg_frequency
        else:
            return dfg_performance
    return VERSIONS[variant](log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG),
                             parameters=parameters)
def apply(df, activity, parameters=None):
    """
    Gets the time passed from each preceding activity

    Parameters
    -------------
    df
        Dataframe
    activity
        Activity that we are considering
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    dictio
        Dictionary containing a 'pre' key with the list of aggregated times
        from each preceding activity to the given activity
    """
    if parameters is None:
        parameters = {}
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(df, measure="both",
                                                                 activity_key=activity_key,
                                                                 case_id_glue=case_id_glue,
                                                                 timestamp_key=timestamp_key)
    pre = []
    sum_perf_pre = 0.0
    sum_acti_pre = 0.0
    for entry in dfg_performance.keys():
        if entry[1] == activity:
            pre.append([entry[0], float(dfg_performance[entry]), int(dfg_frequency[entry])])
            sum_perf_pre += float(dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_pre += float(dfg_frequency[entry])
    perf_acti_pre = 0.0
    if sum_acti_pre > 0:
        perf_acti_pre = sum_perf_pre / sum_acti_pre
    return {"pre": pre, "pre_avg_perf": perf_acti_pre}
def execute_script():
    log_path = os.path.join("..", "tests", "input_data", log_name)
    time1 = time.time()
    dataframe = parquet_importer.apply(log_path, parameters={"columns": allowed_columns})
    time2 = time.time()
    print(dataframe.columns)
    print("time elapsed importing " + log_name + " on columns " + str(allowed_columns) + ": ",
          (time2 - time1))
    dfg1 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
    time3 = time.time()
    print("time elapsed calculating the DFG on columns " + str(allowed_columns) + ": ",
          (time3 - time2))
    del dataframe
    time4 = time.time()
    dataframe = parquet_importer.apply(log_path)
    print(dataframe.columns)
    time5 = time.time()
    print("time elapsed importing " + log_name + " (all columns): ", (time5 - time4))
    dfg2 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
    time6 = time.time()
    print("time elapsed calculating the DFG on all columns: ", (time6 - time5))
def apply(log, parameters=None, variant=DEFAULT_VARIANT):
    """
    Apply the Alpha Miner on top of a log

    Parameters
    -----------
    log
        Log
    variant
        Variant of the algorithm to use:
        - Variants.ALPHA_VERSION_CLASSIC
        - Variants.ALPHA_VERSION_PLUS
    parameters
        Possible parameters of the algorithm, including:
        Parameters.ACTIVITY_KEY -> Name of the attribute that contains the activity

    Returns
    -----------
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                              pmutil.constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_util.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_util.DEFAULT_TIMESTAMP_KEY)
    if pkgutil.find_loader("pandas"):
        import pandas
        if isinstance(log, pandas.core.frame.DataFrame) and variant == ALPHA_VERSION_CLASSIC:
            dfg = df_statistics.get_dfg_graph(log, case_id_glue=case_id_glue,
                                              activity_key=activity_key,
                                              timestamp_key=timestamp_key,
                                              start_timestamp_key=start_timestamp_key)
            return exec_utils.get_variant(variant).apply_dfg(dfg, parameters=parameters)
    return exec_utils.get_variant(variant).apply(
        log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG), parameters)
def apply(log, parameters=None, variant=DEFAULT_VARIANT):
    """
    Calculates DFG graph (frequency or performance) starting from a log

    Parameters
    ----------
    log
        Log
    parameters
        Possible parameters passed to the algorithms:
        Parameters.AGGREGATION_MEASURE -> performance aggregation measure (min, max, mean, median)
        Parameters.ACTIVITY_KEY -> Attribute to use as activity
        Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp
    variant
        Variant of the algorithm to use, possible values:
        - Variants.NATIVE
        - Variants.FREQUENCY
        - Variants.FREQUENCY_GREEDY
        - Variants.PERFORMANCE
        - Variants.PERFORMANCE_GREEDY
        - Variants.FREQ_TRIPLES

    Returns
    -------
    dfg
        DFG graph
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_util.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_util.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, pmutil.constants.CASE_CONCEPT_NAME)
    if pkgutil.find_loader("pandas"):
        import pandas
        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics
        from pm4py.objects.log.util import dataframe_utils
        if isinstance(log, pandas.core.frame.DataFrame) and not variant == Variants.FREQ_TRIPLES:
            log = dataframe_utils.convert_timestamp_columns_in_df(log, timest_columns=[timestamp_key])
            dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
                log, measure="both", activity_key=activity_key, timestamp_key=timestamp_key,
                case_id_glue=case_id_glue, start_timestamp_key=start_timestamp_key)
            if variant in [Variants.PERFORMANCE, Variants.PERFORMANCE_GREEDY]:
                return dfg_performance
            else:
                return dfg_frequency
    return exec_utils.get_variant(variant).apply(
        log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG), parameters=parameters)
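# --- Usage sketch (added): selecting a variant of the DFG discovery algorithm
# above; `log` is assumed to be an already-loaded EventLog or dataframe. With a
# dataframe, both measures are computed once and the requested one is returned,
# as in the code above.
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery

freq_dfg = dfg_discovery.apply(log, variant=dfg_discovery.Variants.FREQUENCY)
perf_dfg = dfg_discovery.apply(log, variant=dfg_discovery.Variants.PERFORMANCE)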
def discover_performance_dfg(log: Union[EventLog, pd.DataFrame], business_hours: bool = False,
                             worktiming: List[int] = [7, 17], weekends: List[int] = [6, 7],
                             workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR) -> Tuple[dict, dict, dict]:
    """
    Discovers a performance directly-follows graph from an event log

    Parameters
    ---------------
    log
        Event log
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log
        are working (default: 7 to 17)
    weekends
        (If the business hours are enabled) The weekend days (default: Saturday (6), Sunday (7))
    workcalendar
        (If the business hours are enabled) The work calendar to use

    Returns
    ---------------
    performance_dfg
        Performance DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        activity_key = properties[
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[
            constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[
            constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key, timestamp_key=timestamp_key,
                            case_id_glue=case_id_key, measure="performance",
                            perf_aggregation_key="all", business_hours=business_hours,
                            worktiming=worktiming, weekends=weekends, workcalendar=workcalendar)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg.variants import performance as dfg_discovery
        properties = get_properties(log)
        properties[dfg_discovery.Parameters.AGGREGATION_MEASURE] = "all"
        properties[dfg_discovery.Parameters.BUSINESS_HOURS] = business_hours
        properties[dfg_discovery.Parameters.WORKTIMING] = worktiming
        properties[dfg_discovery.Parameters.WEEKENDS] = weekends
        dfg = dfg_discovery.apply(log, parameters=properties)
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    return dfg, start_activities, end_activities
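# --- Usage sketch (added): an illustrative call to discover_performance_dfg
# above with business hours enabled (working hours 8-16, Saturday/Sunday as
# weekend); the numbers are assumptions for the example, not function defaults.
perf_dfg, sa, ea = discover_performance_dfg(log, business_hours=True,
                                            worktiming=[8, 16], weekends=[6, 7])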
def apply(log, parameters=None):
    """
    Apply the IMDF algorithm to a log, obtaining a Petri net along with an initial and final marking

    Parameters
    -----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
        Parameters.ACTIVITY_KEY -> attribute of the log to use as activity name (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                              pmutil.constants.CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              pmutil.xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               pmutil.xes_constants.DEFAULT_TIMESTAMP_KEY)
    if isinstance(log, pandas.core.frame.DataFrame):
        dfg = df_statistics.get_dfg_graph(log, case_id_glue=case_id_glue,
                                          activity_key=activity_key, timestamp_key=timestamp_key)
        start_activities = pd_start_act_stats.get_start_activities(log, parameters=parameters)
        end_activities = pd_end_act_stats.get_end_activities(log, parameters=parameters)
        activities = pd_attributes_stats.get_attribute_values(log, activity_key, parameters=parameters)
        return apply_dfg(dfg, activities=activities, start_activities=start_activities,
                         end_activities=end_activities, parameters=parameters)
    log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    tree = apply_tree(log, parameters=parameters)
    net, initial_marking, final_marking = tree_to_petri.apply(tree)
    return net, initial_marking, final_marking
def calculate_dfg(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}
    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY],
                                    use_transition=use_transition)
    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        # swap the activity key with the provided attribute key
        activity_key, parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)
    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_dfg = Counter()
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition,
                                      parameters=parameters)
            dfg = Counter(df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                      sort_timestamp_along_case_id=False,
                                                      sort_caseid_required=False))
            overall_dfg = overall_dfg + dfg
            if count >= no_samples:
                break
    returned_dict = {}
    for el in overall_dfg:
        returned_dict[el[0] + "@@" + el[1]] = overall_dfg[el]
    return returned_dict
def apply(log, parameters=None, variant=ALPHA_VERSION_CLASSIC):
    """
    Apply the Alpha Miner on top of a log

    Parameters
    -----------
    log
        Log
    variant
        Variant of the algorithm to use (classic)
    parameters
        Possible parameters of the algorithm, including:
        activity key -> Name of the attribute that contains the activity

    Returns
    -----------
    net
        Petri net
    marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame) and variant == ALPHA_VERSION_CLASSIC:
        dfg = df_statistics.get_dfg_graph(
            log,
            case_id_glue=parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
            activity_key=parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        return VERSIONS_DFG[variant](dfg, parameters=parameters)
    return VERSIONS[variant](log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG),
                             parameters)
def calculate_performance_dfg(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}
    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else PARAMETER_PM4PYWS_CLASSIFIER
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    columns = get_columns_to_import(filters,
                                    [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)
    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        activity_key, parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)
    parquet_list = parquet_importer.get_list_parquet(folder)
    frequency_dfg = Counter()
    performance_dfg = Counter()
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition,
                                      parameters=parameters)
            f_dfg, p_dfg = df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                       sort_timestamp_along_case_id=False,
                                                       sort_caseid_required=False, measure="both")
            f_dfg = Counter(f_dfg)
            # frequency-weighted incremental average of the performance values
            for k in p_dfg:
                if k not in performance_dfg:
                    performance_dfg[k] = p_dfg[k]
                else:
                    performance_dfg[k] = (frequency_dfg[k] * performance_dfg[k] + f_dfg[k] * p_dfg[k]) / (
                            frequency_dfg[k] + f_dfg[k])
            frequency_dfg = frequency_dfg + f_dfg
            if count >= no_samples:
                break
    returned_dict = {}
    for el in performance_dfg:
        returned_dict[el[0] + "@@" + el[1]] = performance_dfg[el]
    return returned_dict
def execute_script():
    log_path = os.path.join("..", "tests", "input_data", "interval_event_log.csv")
    dataframe = pm4py.read_csv(log_path)
    log_path = os.path.join("..", "tests", "input_data", "reviewing.xes")
    log = pm4py.read_xes(log_path)
    dataframe = pm4py.convert_to_dataframe(log)
    parameters = {}
    # parameters[constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = "start_timestamp"
    parameters[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = "time:timestamp"
    parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = "concept:name"
    parameters[constants.PARAMETER_CONSTANT_CASEID_KEY] = "case:concept:name"
    parameters["strict"] = True
    parameters["format"] = "svg"
    start_activities = sa_get.get_start_activities(dataframe, parameters=parameters)
    end_activities = ea_get.get_end_activities(dataframe, parameters=parameters)
    att_count = att_get.get_attribute_values(dataframe, "concept:name", parameters=parameters)
    parameters["start_activities"] = start_activities
    parameters["end_activities"] = end_activities
    soj_time = soj_time_get.apply(dataframe, parameters=parameters)
    print("soj_time")
    print(soj_time)
    conc_act = conc_act_get.apply(dataframe, parameters=parameters)
    print("conc_act")
    print(conc_act)
    efg = efg_get.apply(dataframe, parameters=parameters)
    print("efg")
    print(efg)
    dfg_freq, dfg_perf = df_statistics.get_dfg_graph(dataframe, measure="both",
                                                     start_timestamp_key="start_timestamp")
    dfg_gv_freq = dfg_vis_fact.apply(dfg_freq, activities_count=att_count,
                                     variant=dfg_vis_fact.Variants.FREQUENCY,
                                     soj_time=soj_time, parameters=parameters)
    dfg_vis_fact.view(dfg_gv_freq)
    dfg_gv_perf = dfg_vis_fact.apply(dfg_perf, activities_count=att_count,
                                     variant=dfg_vis_fact.Variants.PERFORMANCE,
                                     soj_time=soj_time, parameters=parameters)
    dfg_vis_fact.view(dfg_gv_perf)
    net, im, fm = dfg_conv.apply(dfg_freq)
    gviz = pn_vis.apply(net, im, fm, parameters=parameters)
    pn_vis.view(gviz)
def get(df):
    try:
        if df.type == "succint":
            df = succint_mdl_to_exploded_mdl.apply(df)
    except Exception:
        pass
    activ = dict(df.groupby("event_id").first()["event_activity"].value_counts())
    max_activ_freq = max(activ.values()) if len(activ.values()) > 0 else 0
    # NOTE: 'red_df' and 'persp' were undefined in the original snippet; here the
    # edge frequency is computed per perspective column (the columns not starting
    # with "event_"), taking the maximum over all perspectives
    max_edge_freq = 0
    persps = [x for x in df.columns if not x.startswith("event_")]
    for persp in persps:
        red_df = df.dropna(subset=[persp])
        dfg = df_statistics.get_dfg_graph(red_df, activity_key="event_activity",
                                          timestamp_key="event_timestamp",
                                          case_id_glue=persp)
        if len(dfg) > 0:
            max_edge_freq = max(max_edge_freq, max(dfg.values()))
    return {"max_activ_freq": max_activ_freq, "max_edge_freq": max_edge_freq}
def discover_dfg(log: Union[EventLog, pd.DataFrame]) -> Tuple[dict, dict, dict]:
    """
    Discovers a DFG from a log

    Parameters
    --------------
    log
        Event log

    Returns
    --------------
    dfg
        DFG
    start_activities
        Start activities
    end_activities
        End activities
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.util import constants
        properties = get_properties(log)
        from pm4py.algo.discovery.dfg.adapters.pandas.df_statistics import get_dfg_graph
        activity_key = properties[
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties else xes_constants.DEFAULT_NAME_KEY
        timestamp_key = properties[
            constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in properties else xes_constants.DEFAULT_TIMESTAMP_KEY
        case_id_key = properties[
            constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in properties else constants.CASE_CONCEPT_NAME
        dfg = get_dfg_graph(log, activity_key=activity_key, timestamp_key=timestamp_key,
                            case_id_glue=case_id_key)
        from pm4py.statistics.start_activities.pandas import get as start_activities_module
        from pm4py.statistics.end_activities.pandas import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=properties)
        end_activities = end_activities_module.get_end_activities(log, parameters=properties)
    else:
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        dfg = dfg_discovery.apply(log, parameters=get_properties(log))
        from pm4py.statistics.start_activities.log import get as start_activities_module
        from pm4py.statistics.end_activities.log import get as end_activities_module
        start_activities = start_activities_module.get_start_activities(log, parameters=get_properties(log))
        end_activities = end_activities_module.get_end_activities(log, parameters=get_properties(log))
    return dfg, start_activities, end_activities
def apply(log, parameters=None):
    """
    Apply the IMDF algorithm to a log, obtaining a Petri net along with an initial and final marking

    Parameters
    -----------
    log
        Log
    parameters
        Parameters of the algorithm, including:
        pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY -> attribute of the log to use as
        activity name (default concept:name)

    Returns
    -----------
    net
        Petri net
    initial_marking
        Initial marking
    final_marking
        Final marking
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = pmutil.constants.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        dfg = df_statistics.get_dfg_graph(
            log,
            case_id_glue=parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY],
            activity_key=parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY])
        start_activities = pd_start_act_stats.get_start_activities(log, parameters=parameters)
        end_activities = pd_end_act_stats.get_end_activities(log, parameters=parameters)
        activities = pd_attributes_stats.get_attribute_values(
            log, parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY], parameters=parameters)
        return apply_dfg(dfg, activities=activities, start_activities=start_activities,
                         end_activities=end_activities, parameters=parameters)
    log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
    tree = apply_tree(log, parameters=parameters)
    net, initial_marking, final_marking = tree_to_petri.apply(tree)
    return net, initial_marking, final_marking
def execute_script():
    log_path = os.path.join("..", "tests", "input_data", "running-example.csv")
    dataframe = pandas_df_imp.import_dataframe_from_path(log_path)
    activities_count = dict(dataframe.groupby("concept:name").size())
    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(dataframe, measure="both",
                                                                 perf_aggregation_key="mean")
    net, initial_marking, final_marking = alpha_miner.apply_dfg(dfg_frequency)
    spaths = get_shortest_paths(net)
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(net, dfg_performance, spaths,
                                                                      activities_count,
                                                                      variant="performance")
    # obtain stochastic information for transitions in the model
    s_map = stochastic_map.get_map_exponential_from_aggstatistics(aggregated_statistics)
    # gets the reachability graph from the Petri net
    reachab_graph = construct_reachability_graph(net, initial_marking)
    # get the tangible reachability graph from the reachability graph and the stochastic map
    tang_reach_graph = tangible_reachability.get_tangible_reachability_from_reachability(
        reachab_graph, s_map)
    # visualize the tangible reachability graph on the screen
    viz = ts_vis_factory.apply(tang_reach_graph, parameters={"format": "svg", "show_labels": True,
                                                             "show_names": True})
    ts_vis_factory.view(viz)
    # gets the Q matrix assuming exponential distributions
    q_matrix = ctmc.get_q_matrix_from_tangible_exponential(tang_reach_graph, s_map)
    # pick a state to start from
    states = sorted(list(tang_reach_graph.states), key=lambda x: x.name)
    state = states[0]
    print("\n\nstarting from state = ", state.name)
    # do transient analysis after 1 day
    transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(
        tang_reach_graph, q_matrix, state, 86400)
    print("\nprobability for each state after 1 day = ", transient_result)
    # do transient analysis after 10 days
    transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(
        tang_reach_graph, q_matrix, state, 864000)
    print("\nprobability for each state after 10 days = ", transient_result)
    # do transient analysis after 100 days
    transient_result = ctmc.transient_analysis_from_tangible_q_matrix_and_single_state(
        tang_reach_graph, q_matrix, state, 8640000)
    print("\nprobability for each state after 100 days = ", transient_result)
    steady_state = ctmc.steadystate_analysis_from_tangible_q_matrix(tang_reach_graph, q_matrix)
    print("\nsteady state = ", steady_state)
def get_paths(self, attribute_key, parameters=None):
    """
    Gets the paths (directly-follows couples) from the log

    Parameters
    -------------
    attribute_key
        Attribute key

    Returns
    -------------
    paths
        Dictionary associating each path (couple of attribute values) to its number of occurrences
    """
    if parameters is None:
        parameters = {}
    dfg = df_statistics.get_dfg_graph(self.dataframe, activity_key=attribute_key,
                                      timestamp_key=DEFAULT_TIMESTAMP_KEY,
                                      case_id_glue=CASE_CONCEPT_NAME,
                                      sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False)
    return dfg
def apply(df: pd.DataFrame, activity: str, parameters: Optional[Dict[Any, Any]] = None) -> Dict[str, Any]:
    """
    Gets the time passed from each preceding activity and to each succeeding activity

    Parameters
    -------------
    df
        Dataframe
    activity
        Activity that we are considering
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    dictio
        Dictionary containing a 'pre' key with the list of aggregated times from each preceding
        activity to the given activity, and a 'post' key with the list of aggregated times from
        the given activity to each succeeding activity
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])
    workcalendar = exec_utils.get_param_value(Parameters.WORKCALENDAR, parameters,
                                              constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)
    dfg_frequency, dfg_performance = pandas.get_dfg_graph(df, measure="both",
                                                          activity_key=activity_key,
                                                          case_id_glue=case_id_glue,
                                                          timestamp_key=timestamp_key,
                                                          start_timestamp_key=start_timestamp_key,
                                                          business_hours=business_hours,
                                                          worktiming=worktiming,
                                                          weekends=weekends,
                                                          workcalendar=workcalendar)
    pre = []
    post = []
    sum_perf_pre = 0.0
    sum_acti_pre = 0.0
    sum_perf_post = 0.0
    sum_acti_post = 0.0
    for entry in dfg_performance.keys():
        if entry[1] == activity:
            pre.append([entry[0], float(dfg_performance[entry]), int(dfg_frequency[entry])])
            sum_perf_pre += float(dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_pre += float(dfg_frequency[entry])
        if entry[0] == activity:
            post.append([entry[1], float(dfg_performance[entry]), int(dfg_frequency[entry])])
            sum_perf_post += float(dfg_performance[entry]) * float(dfg_frequency[entry])
            sum_acti_post += float(dfg_frequency[entry])
    perf_acti_pre = 0.0
    if sum_acti_pre > 0:
        perf_acti_pre = sum_perf_pre / sum_acti_pre
    perf_acti_post = 0.0
    if sum_acti_post > 0:
        perf_acti_post = sum_perf_post / sum_acti_post
    return {"pre": pre, "post": post, "post_avg_perf": perf_acti_post, "pre_avg_perf": perf_acti_pre}
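# --- Usage sketch (added): an illustrative call to the function above,
# aggregating transition times around a hypothetical activity "decide". Each
# 'pre'/'post' entry is [other_activity, aggregated_time, frequency], per the
# loop above; `df` is an assumed event-log dataframe.
stats = apply(df, "decide")
for source, agg_time, freq in stats["pre"]:
    print(source, "->", "decide", agg_time, freq)
print("weighted average time from predecessors:", stats["pre_avg_perf"])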
def apply(dataframe, parameters=None):
    """
    Gets the performance DFG

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    base64
        Base64 of an SVG representing the model
    model
        Text representation of the model
    format
        Format of the model
    """
    if parameters is None:
        parameters = {}
    decreasingFactor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else constants.DEFAULT_DEC_FACTOR
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY] if pm4_constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe, activity_key=activity_key, max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())
    dfg, dfg_perf = df_statistics.get_dfg_graph(dataframe, activity_key=activity_key,
                                                timestamp_key=timestamp_key,
                                                case_id_glue=case_id_glue,
                                                sort_caseid_required=False,
                                                sort_timestamp_along_case_id=False, measure="both")
    activities_count = attributes_filter.get_attribute_values(dataframe, activity_key,
                                                              parameters=parameters)
    activities = list(activities_count.keys())
    dfg = clean_dfg_based_on_noise_thresh(
        dfg, activities, decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    dfg_perf = {x: y for x, y in dfg_perf.items() if x in dfg}
    start_activities = list(
        start_activities_filter.get_start_activities(dataframe, parameters=parameters).keys())
    gviz = dfg_vis_factory.apply(dfg_perf, activities_count=activities_count, variant="performance",
                                 parameters={"format": "svg",
                                             "start_activities": start_activities,
                                             "end_activities": end_activities})
    gviz_base64 = base64.b64encode(str(gviz).encode('utf-8'))
    ret_graph = get_graph.get_graph_from_dfg(dfg, start_activities, end_activities)
    net, im, fm = dfg_conv_factory.apply(dfg, parameters={"start_activities": start_activities,
                                                          "end_activities": end_activities})
    return get_base64_from_gviz(gviz), export_petri_as_string(net, im, fm), ".pnml", "parquet", \
           activities, start_activities, end_activities, gviz_base64, ret_graph, "dfg", "perf", \
           None, "", activity_key
def calculate_process_schema_composite_object(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}
    performance_required = parameters[
        "performance_required"] if "performance_required" in parameters else False
    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[
        PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    if performance_required:
        columns = get_columns_to_import(filters,
                                        [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
                                        use_transition=use_transition)
    else:
        columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY],
                                        use_transition=use_transition)
    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        activity_key, parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)
    parquet_list = parquet_importer.get_list_parquet(folder)
    frequency_dfg = Counter()
    performance_dfg = Counter()
    overall_ea = Counter()
    overall_sa = Counter()
    values = Counter({})
    events = 0
    cases = 0
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition,
                                      parameters=parameters)
            if performance_required:
                f_dfg, p_dfg = df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                           sort_timestamp_along_case_id=False,
                                                           sort_caseid_required=False,
                                                           measure="both")
            else:
                f_dfg = df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                    sort_timestamp_along_case_id=False,
                                                    sort_caseid_required=False)
            f_dfg = Counter(f_dfg)
            if performance_required:
                # frequency-weighted incremental average of the performance values
                for k in p_dfg:
                    if k not in performance_dfg:
                        performance_dfg[k] = p_dfg[k]
                    else:
                        performance_dfg[k] = (frequency_dfg[k] * performance_dfg[k] +
                                              f_dfg[k] * p_dfg[k]) / (frequency_dfg[k] + f_dfg[k])
            frequency_dfg = frequency_dfg + f_dfg
            ea = Counter(end_activities_filter.get_end_activities(df, parameters=parameters))
            overall_ea = overall_ea + ea
            sa = Counter(start_activities_filter.get_start_activities(df, parameters=parameters))
            overall_sa = overall_sa + sa
            values = values + Counter(dict(df[activity_key].value_counts()))
            events = events + len(df)
            cases = cases + df[CASE_CONCEPT_NAME].nunique()
            if count >= no_samples:
                break
    returned_dict = {}
    returned_dict["events"] = events
    returned_dict["cases"] = cases
    values = dict(values)
    for el in values:
        values[el] = int(values[el])
    returned_dict["activities"] = values
    overall_sa = dict(overall_sa)
    for el in overall_sa:
        overall_sa[el] = int(overall_sa[el])
    returned_dict["start_activities"] = overall_sa
    overall_ea = dict(overall_ea)
    for el in overall_ea:
        overall_ea[el] = int(overall_ea[el])
    returned_dict["end_activities"] = overall_ea
    returned_dict_freq = {}
    for el in frequency_dfg:
        returned_dict_freq[el[0] + "@@" + el[1]] = int(frequency_dfg[el])
    returned_dict["frequency_dfg"] = returned_dict_freq
    if performance_required:
        returned_dict_perf = {}
        for el in performance_dfg:
            returned_dict_perf[el[0] + "@@" + el[1]] = float(performance_dfg[el])
        returned_dict["performance_dfg"] = returned_dict_perf
    return returned_dict
def apply(dataframe, parameters=None):
    """
    Gets the Petri net through Inductive Miner, decorated by performance metric

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    base64
        Base64 of an SVG representing the model
    model
        Text representation of the model
    format
        Format of the model
    """
    if parameters is None:
        parameters = {}
    decreasingFactor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else constants.DEFAULT_DEC_FACTOR
    activity_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY] if pm4_constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe, activity_key=activity_key, max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())
    activities_count = attributes_filter.get_attribute_values(dataframe, activity_key,
                                                              parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(
        start_activities_filter.get_start_activities(dataframe, parameters=parameters).keys())
    dfg, dfg_perf = df_statistics.get_dfg_graph(dataframe, activity_key=activity_key,
                                                timestamp_key=timestamp_key,
                                                case_id_glue=case_id_glue,
                                                sort_caseid_required=False,
                                                sort_timestamp_along_case_id=False, measure="both")
    dfg = clean_dfg_based_on_noise_thresh(
        dfg, activities, decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    dfg_perf = {x: y for x, y in dfg_perf.items() if x in dfg}
    net, im, fm = inductive_miner.apply_dfg(dfg, parameters, activities=activities,
                                            start_activities=start_activities,
                                            end_activities=end_activities)
    spaths = get_shortest_paths(net)
    bpmn_graph, el_corr, inv_el_corr, el_corr_keys_map = petri_to_bpmn.apply(net, im, fm)
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(net, dfg_perf, spaths,
                                                                      activities_count,
                                                                      variant="performance")
    bpmn_aggreg_statistics = convert_performance_map.convert_performance_map_to_bpmn(
        aggregated_statistics, inv_el_corr)
    # bpmn_graph = bpmn_embedding.embed_info_into_bpmn(bpmn_graph, bpmn_aggreg_statistics, "performance")
    bpmn_graph = bpmn_diagram_layouter.apply(bpmn_graph)
    bpmn_string = bpmn_exporter.get_string_from_bpmn(bpmn_graph)
    gviz = bpmn_vis_factory.apply_petri(net, im, fm, aggregated_statistics=aggregated_statistics,
                                        variant="performance", parameters={"format": "svg"})
    gviz2 = bpmn_vis_factory.apply_petri(net, im, fm, aggregated_statistics=aggregated_statistics,
                                         variant="performance", parameters={"format": "dot"})
    gviz_base64 = get_base64_from_file(gviz2.name)
    ret_graph = get_graph.get_graph_from_petri(net, im, fm)
    return get_base64_from_file(gviz.name), export_petri_as_string(net, im, fm), ".pnml", "parquet", \
           activities, start_activities, end_activities, gviz_base64, ret_graph, "indbpmn", "perf", \
           bpmn_string, ".bpmn", activity_key
def apply_pandas(df, parameters=None):
    """
    Discovers a Petri net using Heuristics Miner

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including:
        activity_key, case_id_glue, timestamp_key, dependency_thresh, and_measure_thresh,
        min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh, loops_length_two_thresh

    Returns
    ------------
    net
        Petri net
    im
        Initial marking
    fm
        Final marking
    """
    if parameters is None:
        parameters = {}
    if pkgutil.find_loader("pandas"):
        activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
        case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
        start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY, parameters, None)
        timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes.DEFAULT_TIMESTAMP_KEY)
        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics, freq_triples as get_freq_triples
        from pm4py.statistics.attributes.pandas import get as pd_attributes
        from pm4py.statistics.start_activities.pandas import get as pd_sa_filter
        from pm4py.statistics.end_activities.pandas import get as pd_ea_filter
        start_activities = pd_sa_filter.get_start_activities(df, parameters=parameters)
        end_activities = pd_ea_filter.get_end_activities(df, parameters=parameters)
        activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
        activities = list(activities_occurrences.keys())
        heu_net_decoration = exec_utils.get_param_value(Parameters.HEU_NET_DECORATION, parameters, "frequency")
        if timestamp_key in df:
            dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                              activity_key=activity_key,
                                              timestamp_key=timestamp_key,
                                              start_timestamp_key=start_timestamp_key)
            dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                       activity_key=activity_key,
                                                       timestamp_key=timestamp_key, window=2,
                                                       start_timestamp_key=start_timestamp_key)
            frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                                  activity_key=activity_key,
                                                                  timestamp_key=timestamp_key)
        else:
            dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                              activity_key=activity_key,
                                              sort_timestamp_along_case_id=False)
            dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                       activity_key=activity_key,
                                                       sort_timestamp_along_case_id=False, window=2)
            frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                                  activity_key=activity_key,
                                                                  timestamp_key=timestamp_key,
                                                                  sort_timestamp_along_case_id=False)
        performance_dfg = None
        if heu_net_decoration == "performance":
            performance_dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue,
                                                          activity_key=activity_key,
                                                          timestamp_key=timestamp_key,
                                                          start_timestamp_key=start_timestamp_key,
                                                          measure="performance")
        heu_net = apply_heu_dfg(dfg, activities=activities,
                                activities_occurrences=activities_occurrences,
                                start_activities=start_activities, end_activities=end_activities,
                                dfg_window_2=dfg_window_2, freq_triples=frequency_triples,
                                performance_dfg=performance_dfg, parameters=parameters)
        net, im, fm = hn_conv_alg.apply(heu_net, parameters=parameters)
        return net, im, fm
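# --- Usage sketch (added): a minimal call to the pandas Heuristics Miner entry
# point above; the threshold values are illustrative, with keys matching the
# parameter names listed in the docstring. `df` is an assumed event-log
# dataframe with case, activity and timestamp columns.
hm_parameters = {"dependency_thresh": 0.6, "min_act_count": 1, "min_dfg_occurrences": 1}
net, im, fm = apply_pandas(df, parameters=hm_parameters)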