def get_end_activities(path, log_name, managed_logs, parameters=None):
    """
    Gets the end activities of a parquet log, aggregated over the
    parquet parts that appear in the list of managed logs

    Parameters
    -------------
    path
        Path where the logs are stored
    log_name
        Name of the log
    managed_logs
        List of parquet part files to consider (matched by basename)
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    end_activities_dict
        Dictionary of end activities along with their count
    """
    if parameters is None:
        parameters = {}
    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else PARAMETER_PM4PYWS_CLASSIFIER
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY], use_transition=use_transition)
    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_ea = Counter()
    count = 0
    for pq in parquet_list:
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)
            # sum the end activities of this part into the overall counter
            ea = Counter(end_activities_filter.get_end_activities(df, parameters=parameters))
            overall_ea = overall_ea + ea
            if count >= no_samples:
                break
    # cast the counts to plain ints so the result is JSON-serializable
    for el in overall_ea:
        overall_ea[el] = int(overall_ea[el])
    return dict(overall_ea)
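
# Usage sketch (folder and file names are placeholders): aggregate the end
# activities over the parquet parts of one log, sampling at most two parts.
ea = get_end_activities(
    "./event_logs", "receipt",
    managed_logs=["receipt_0.parquet", "receipt_1.parquet", "receipt_2.parquet"],
    parameters={PARAMETER_NO_SAMPLES: 2})
print(ea)
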
def get_end_activities(self, parameters=None):
    """
    Gets the end activities from the log

    Returns
    -------------
    end_activities_dict
        Dictionary of end activities
    """
    if parameters is None:
        parameters = {}
    parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = self.activity_key
    parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = self.activity_key
    if self.reduced_grouped_dataframe is not None:
        parameters[constants.GROUPED_DATAFRAME] = self.reduced_grouped_dataframe
    return end_activities_filter.get_end_activities(self.get_reduced_dataframe(), parameters=parameters)
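
# Minimal sketch of the call the handler above wraps: applying the pandas
# end-activities filter directly to a dataframe with XES-style column names.
# The toy data is made up; the import path is assumed (pm4py 1.x layout).
import pandas as pd
from pm4py.algo.filtering.pandas.end_activities import end_activities_filter as pd_end_activities

toy_df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2"],
    "concept:name": ["register", "pay", "register"],
    "time:timestamp": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"]),
})
print(pd_end_activities.get_end_activities(toy_df))  # e.g. {'pay': 1, 'register': 1}
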
def execute_script():
    """
    Imports a CSV log into a dataframe and exercises the main dataframe
    filters (case performance, attribute values, start/end activities),
    generating a frequency and a performance process schema at each step
    """
    aa = time.time()
    # import the CSV and prepare the dataframe (string case ids, parsed
    # timestamps, events sorted by case id and timestamp)
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(dataframe, timest_format=TIMEST_FORMAT,
                                                                   timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(dataframe, activity_key=ACTIVITY_KEY,
                                                                       max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))

    # case statistics, sorted by descending case duration
    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(dataframe, parameters=parameters_cde)
    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))

    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg", "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print("saving initial Inductive Miner process schema along with frequency metrics=", (cc - bb2))

    # keep only the cases whose throughput time lies in the given interval
    dataframe_cp = case_filter.filter_on_case_performance(dataframe, case_id_glue=CASEID_GLUE,
                                                          timestamp_key=TIMEST_KEY,
                                                          min_case_performance=100000,
                                                          max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(dataframe_cp, activity_key=ACTIVITY_KEY,
                                                                          max_no_activities=MAX_NO_ACTIVITIES)
    dataframe_cp = None
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa, "FILTER_CP_FREQUENCY.svg", "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=", (dd - cc))

    if ENABLE_ATTRIBUTE_FILTER:
        # keep only the cases containing the given attribute values
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe, ATTRIBUTE_VALUES_TO_FILTER, parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print("all the activities in the log", attributes_filter.get_attribute_values(dataframe_att, ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(dataframe_att,
                                                                               activity_key=ACTIVITY_KEY,
                                                                               max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa, "FILTER_ATT_FREQUENCY.svg", "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
        ee = time.time()
        print("filtering on attribute values and generating process schema=", (ee - dd))

    ee = time.time()
    parameters_sa = {constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
                     constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY}
    parameters_ea = {constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
                     constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY}
    start_act = start_activities_filter.get_start_activities(dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))

    if ENABLE_STARTACT_FILTER:
        # keep only the cases starting with one of the given activities
        dataframe_sa = start_activities_filter.apply(dataframe, STARTACT_TO_FILTER, parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(dataframe_sa,
                                                                              activity_key=ACTIVITY_KEY,
                                                                              max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa, "FILTER_SA_FREQUENCY.svg", "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))

    if ENABLE_ENDACT_FILTER:
        # keep only the cases ending with one of the given activities
        dataframe_ea = end_activities_filter.apply(dataframe, ENDACT_TO_FILTER, parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(dataframe_ea,
                                                                              activity_key=ACTIVITY_KEY,
                                                                              max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa, "FILTER_EA_FREQUENCY.svg", "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))

    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
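
# Hypothetical module-level configuration the script above assumes. The names
# are taken from the function body; every value below is a placeholder, and
# calculate_process_schema_from_df is defined elsewhere in the same script.
inputLog = "receipt.csv"
CASEID_GLUE = "case:concept:name"
ACTIVITY_KEY = "concept:name"
TIMEST_KEY = "time:timestamp"
TIMEST_FORMAT = None
TIMEST_COLUMNS = None
MAX_NO_ACTIVITIES = 25
ENABLE_ATTRIBUTE_FILTER = False
ATTRIBUTE_TO_FILTER = "org:group"
ATTRIBUTE_VALUES_TO_FILTER = ["Group 1"]
ENABLE_STARTACT_FILTER = True
STARTACT_TO_FILTER = ["Confirmation of receipt"]
ENABLE_ENDACT_FILTER = True
ENDACT_TO_FILTER = ["Payment handled"]
DELETE_VARIABLES = False
REMOVE_GENERATED_IMAGES = True
GENERATED_IMAGES = []

# execute_script()
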
def calculate_process_schema_composite_object(path, log_name, managed_logs, parameters=None):
    """
    Calculates a composite object (frequency/performance DFG, start and end
    activities, activity counts, number of events and cases) from the
    parquet parts of a log

    Parameters
    -------------
    path
        Path where the logs are stored
    log_name
        Name of the log
    managed_logs
        List of parquet part files to consider (matched by basename)
    parameters
        Possible parameters of the algorithm, including performance_required

    Returns
    -------------
    returned_dict
        Dictionary with keys: events, cases, activities, start_activities,
        end_activities, frequency_dfg and (if required) performance_dfg
    """
    if parameters is None:
        parameters = {}
    performance_required = parameters["performance_required"] if "performance_required" in parameters else False
    no_samples = parameters[PARAMETER_NO_SAMPLES] if PARAMETER_NO_SAMPLES in parameters else DEFAULT_MAX_NO_SAMPLES
    use_transition = parameters[PARAMETER_USE_TRANSITION] if PARAMETER_USE_TRANSITION in parameters else DEFAULT_USE_TRANSITION
    activity_key = DEFAULT_NAME_KEY if not use_transition else "@@classifier"
    filters = parameters[FILTERS] if FILTERS in parameters else []
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    # the timestamp column is only needed when performance metrics are required
    if performance_required:
        columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
                                        use_transition=use_transition)
    else:
        columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY],
                                        use_transition=use_transition)
    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        # swap, so that the local activity_key points at the requested attribute
        activity_key, parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)
    parquet_list = parquet_importer.get_list_parquet(folder)
    frequency_dfg = Counter()
    performance_dfg = Counter()
    overall_ea = Counter()
    overall_sa = Counter()
    values = Counter()
    events = 0
    cases = 0
    count = 0
    for pq in parquet_list:
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)
            if performance_required:
                f_dfg, p_dfg = df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                           sort_timestamp_along_case_id=False,
                                                           sort_caseid_required=False, measure="both")
            else:
                f_dfg = df_statistics.get_dfg_graph(df, activity_key=activity_key,
                                                    sort_timestamp_along_case_id=False,
                                                    sort_caseid_required=False)
            f_dfg = Counter(f_dfg)
            if performance_required:
                # merge the performance values by weighted average: the mean
                # accumulated so far is weighted by its accumulated frequency,
                # the new mean by the frequency of the current part
                for k in p_dfg:
                    if k not in performance_dfg:
                        performance_dfg[k] = p_dfg[k]
                    else:
                        performance_dfg[k] = (frequency_dfg[k] * performance_dfg[k] + f_dfg[k] * p_dfg[k]) / (
                                frequency_dfg[k] + f_dfg[k])
            frequency_dfg = frequency_dfg + f_dfg
            ea = Counter(end_activities_filter.get_end_activities(df, parameters=parameters))
            overall_ea = overall_ea + ea
            sa = Counter(start_activities_filter.get_start_activities(df, parameters=parameters))
            overall_sa = overall_sa + sa
            values = values + Counter(dict(df[activity_key].value_counts()))
            events = events + len(df)
            cases = cases + df[CASE_CONCEPT_NAME].nunique()
            if count >= no_samples:
                break
    returned_dict = {}
    returned_dict["events"] = events
    returned_dict["cases"] = cases
    # cast the counts to plain ints/floats so the result is JSON-serializable
    values = dict(values)
    for el in values:
        values[el] = int(values[el])
    returned_dict["activities"] = values
    overall_sa = dict(overall_sa)
    for el in overall_sa:
        overall_sa[el] = int(overall_sa[el])
    returned_dict["start_activities"] = overall_sa
    overall_ea = dict(overall_ea)
    for el in overall_ea:
        overall_ea[el] = int(overall_ea[el])
    returned_dict["end_activities"] = overall_ea
    # DFG edges (a, b) are flattened to "a@@b" string keys
    returned_dict_freq = {}
    for el in frequency_dfg:
        returned_dict_freq[el[0] + "@@" + el[1]] = int(frequency_dfg[el])
    returned_dict["frequency_dfg"] = returned_dict_freq
    if performance_required:
        returned_dict_perf = {}
        for el in performance_dfg:
            returned_dict_perf[el[0] + "@@" + el[1]] = float(performance_dfg[el])
        returned_dict["performance_dfg"] = returned_dict_perf
    return returned_dict
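
# Usage sketch (folder and file names are placeholders): build the composite
# object, including the performance DFG, and read a few of its entries.
obj = calculate_process_schema_composite_object(
    "./event_logs", "receipt", ["receipt_0.parquet"],
    parameters={"performance_required": True})
print(obj["events"], obj["cases"])
print(list(obj["frequency_dfg"].items())[:3])  # edges are encoded as "A@@B"
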
def apply_pandas(df, parameters=None):
    """
    Discovers a Petri net using Heuristics Miner

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including: activity_key, case_id_glue, timestamp_key,
        dependency_thresh, and_measure_thresh, min_act_count, min_dfg_occurrences,
        dfg_pre_cleaning_noise_thresh, loops_length_two_thresh

    Returns
    ------------
    net
        Petri net
    im
        Initial marking
    fm
        Final marking
    """
    if parameters is None:
        parameters = {}
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    case_id_glue = parameters[
        constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    start_activities = pd_sa_filter.get_start_activities(df, parameters=parameters)
    end_activities = pd_ea_filter.get_end_activities(df, parameters=parameters)
    activities_occurrences = pd_attributes.get_attribute_values(df, activity_key, parameters=parameters)
    activities = list(activities_occurrences.keys())
    if timestamp_key in df:
        dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue, activity_key=activity_key,
                                          timestamp_key=timestamp_key)
        dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue, activity_key=activity_key,
                                                   timestamp_key=timestamp_key, window=2)
        frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                              activity_key=activity_key,
                                                              timestamp_key=timestamp_key)
    else:
        dfg = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue, activity_key=activity_key,
                                          sort_timestamp_along_case_id=False)
        dfg_window_2 = df_statistics.get_dfg_graph(df, case_id_glue=case_id_glue, activity_key=activity_key,
                                                   sort_timestamp_along_case_id=False, window=2)
        frequency_triples = get_freq_triples.get_freq_triples(df, case_id_glue=case_id_glue,
                                                              activity_key=activity_key,
                                                              timestamp_key=timestamp_key,
                                                              sort_timestamp_along_case_id=False)
    heu_net = apply_heu_dfg(dfg, activities=activities, activities_occurrences=activities_occurrences,
                            start_activities=start_activities, end_activities=end_activities,
                            dfg_window_2=dfg_window_2, freq_triples=frequency_triples, parameters=parameters)
    net, im, fm = hn_conv_factory.apply(heu_net, parameters=parameters)
    return net, im, fm
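
# Usage sketch: discovering a Petri net with the Heuristics Miner from a CSV
# loaded into pandas. Assumes the CSV already uses the default XES column
# names referenced above ("case:concept:name", "concept:name",
# "time:timestamp"); the file name is a placeholder.
import pandas as pd

csv_df = pd.read_csv("running-example.csv")
csv_df["time:timestamp"] = pd.to_datetime(csv_df["time:timestamp"])
net, im, fm = apply_pandas(csv_df, parameters={"dependency_thresh": 0.5})
print(len(net.places), len(net.transitions))
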
def apply(df, parameters=None):
    """
    Discover a StarStar model from an ad-hoc built dataframe

    Parameters
    -------------
    df
        Dataframe
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    perspectives_heu
        Dictionary of perspectives associated to Heuristics Net
    """
    if parameters is None:
        parameters = {}
    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})
    dependency_thresh = parameters[DEPENDENCY_THRESH] if DEPENDENCY_THRESH in parameters else DEFAULT_DEPENDENCY_THRESH
    and_measure_thresh = parameters[AND_MEASURE_THRESH] if AND_MEASURE_THRESH in parameters else DEFAULT_AND_MEASURE_THRESH
    min_act_count = parameters[MIN_ACT_COUNT] if MIN_ACT_COUNT in parameters else DEFAULT_MIN_ACT_COUNT
    min_dfg_occurrences = parameters[MIN_DFG_OCCURRENCES] if MIN_DFG_OCCURRENCES in parameters else DEFAULT_MIN_DFG_OCCURRENCES
    dfg_pre_cleaning_noise_thresh = parameters[
        DFG_PRE_CLEANING_NOISE_THRESH] if DFG_PRE_CLEANING_NOISE_THRESH in parameters else DEFAULT_DFG_PRE_CLEANING_NOISE_THRESH
    decreasing_factor_sa_ea = parameters[DECREASING_FACTOR] if DECREASING_FACTOR in parameters else 0.5
    performance = parameters[PERFORMANCE] if PERFORMANCE in parameters else False
    perspectives = parameters[PERSPECTIVES] if PERSPECTIVES in parameters else None
    use_timestamp = parameters[USE_TIMESTAMP] if USE_TIMESTAMP in parameters else True
    sort_caseid_required = parameters[SORT_CASEID_REQUIRED] if SORT_CASEID_REQUIRED in parameters else True
    sort_timestamp_required = parameters[SORT_TIMESTAMP_REQUIRED] if SORT_TIMESTAMP_REQUIRED in parameters else True
    perspectives_heu = {}
    r = lambda: random.randint(0, 255)
    if perspectives is None:
        # every column outside the "event_*" schema acts as a perspective
        perspectives = sorted(x for x in df.columns if not x.startswith("event"))
    activities_occurrences = attributes_filter.get_attribute_values(
        df.groupby("event_id").first().reset_index(), "event_activity")
    activities_occurrences = {x: y for x, y in activities_occurrences.items() if y >= min_act_count}
    for p_ind, p in enumerate(perspectives):
        # project the dataframe on the events where the perspective is defined
        has_timestamp = False
        if "event_timestamp" in df.columns and use_timestamp:
            proj_df = df[["event_id", "event_activity", "event_timestamp", p]].dropna(subset=[p])
            has_timestamp = True
        else:
            proj_df = df[["event_id", "event_activity", p]].dropna(subset=[p])
        proj_df = proj_df[proj_df["event_activity"].isin(activities_occurrences)]
        proj_df_first = proj_df.groupby("event_id").first().reset_index()  # not used below
        if performance:
            dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
                proj_df, activity_key="event_activity", case_id_glue=p, timestamp_key="event_timestamp",
                measure="both", sort_caseid_required=sort_caseid_required,
                sort_timestamp_along_case_id=sort_timestamp_required)
        else:
            if has_timestamp:
                dfg_frequency = df_statistics.get_dfg_graph(
                    proj_df, activity_key="event_activity", case_id_glue=p, timestamp_key="event_timestamp",
                    sort_caseid_required=sort_caseid_required,
                    sort_timestamp_along_case_id=sort_timestamp_required)
            else:
                dfg_frequency = df_statistics.get_dfg_graph(
                    proj_df, activity_key="event_activity", case_id_glue=p,
                    sort_timestamp_along_case_id=False, sort_caseid_required=sort_caseid_required)
        if len(dfg_frequency) > 0:
            this_color = COLORS[p_ind] if p_ind < len(COLORS) else '#%02X%02X%02X' % (r(), r(), r())
            parameters_sa_ea = copy(parameters)
            parameters_sa_ea[constants.PARAMETER_CONSTANT_CASEID_KEY] = p
            parameters_sa_ea[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = "event_activity"
            parameters_sa_ea[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = "event_activity"
            start_activities = start_activities_filter.get_start_activities(proj_df, parameters=parameters_sa_ea)
            end_activities = end_activities_filter.get_end_activities(proj_df, parameters=parameters_sa_ea)
            start_activities = clean_sa_ea(start_activities, decreasing_factor_sa_ea)
            end_activities = clean_sa_ea(end_activities, decreasing_factor_sa_ea)
            # for each activity, remember its most frequent incoming and
            # outgoing edge, so that they survive the frequency filtering
            max_entry = dict()
            max_exit = dict()
            for x in dfg_frequency:
                a1 = x[0]
                a2 = x[1]
                y = dfg_frequency[x]
                if not a1 == a2:
                    if a2 not in max_entry or max_entry[a2][1] < y:
                        max_entry[a2] = [x, y]
                    if a1 not in max_exit or max_exit[a1][1] < y:
                        max_exit[a1] = [x, y]
            max_entry = set(y[0] for y in max_entry.values())
            max_exit = set(y[0] for y in max_exit.values())
            dfg_frequency = {x: y for x, y in dfg_frequency.items() if
                             y >= min_dfg_occurrences or x in max_entry or x in max_exit}
            activities = list(activities_occurrences.keys())
            if performance:
                dfg_performance = {x: y for x, y in dfg_performance.items() if x in dfg_frequency}
                heu_net = HeuristicsNet(dfg_frequency, start_activities=start_activities,
                                        end_activities=end_activities, default_edges_color=this_color,
                                        net_name=p, activities=activities,
                                        activities_occurrences=activities_occurrences,
                                        performance_dfg=dfg_performance)
            else:
                heu_net = HeuristicsNet(dfg_frequency, start_activities=start_activities,
                                        end_activities=end_activities, default_edges_color=this_color,
                                        net_name=p, activities=activities,
                                        activities_occurrences=activities_occurrences)
            # activities and DFG were already filtered above, hence the
            # thresholds of 1 passed to the calculation
            heu_net.calculate(dependency_thresh=dependency_thresh, and_measure_thresh=and_measure_thresh,
                              min_act_count=1, min_dfg_occurrences=1,
                              dfg_pre_cleaning_noise_thresh=dfg_pre_cleaning_noise_thresh)
            if len(heu_net.nodes) > 0:
                perspectives_heu[p] = heu_net
    return perspectives_heu
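
# Usage sketch: a tiny ad-hoc dataframe in the "event_*" format the function
# expects, with one non-event column ("order") acting as a perspective. The
# data is made up for illustration; the thresholds are lowered so the toy log
# is not filtered away.
import pandas as pd

events = pd.DataFrame({
    "event_id": ["e1", "e2", "e3", "e4"],
    "event_activity": ["create order", "pick item", "pick item", "ship order"],
    "event_timestamp": pd.to_datetime(
        ["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"]),
    "order": ["o1", "o1", "o1", "o1"],
})
perspectives_heu = apply(events, parameters={MIN_ACT_COUNT: 1, MIN_DFG_OCCURRENCES: 1})
print(list(perspectives_heu))  # e.g. ['order']
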