def test_case_statistics(self):
    """Smoke-test the pandas case_statistics helpers on the sample dataframe."""
    from pm4py.statistics.traces.pandas import case_statistics as stats
    dataframe = self.get_dataframe()
    stats.get_cases_description(dataframe)
    stats.get_variants_df(dataframe)
    stats.get_variant_statistics(dataframe)
    # stats.get_variant_statistics_with_case_duration(dataframe)
    stats.get_events(dataframe, "N77802")
    stats.get_variants_df_with_case_duration(dataframe)
    stats.get_variants_df_and_list(dataframe)
    stats.get_kde_caseduration(dataframe)
def get_variants_count(df, parameters=None):
    """
    Gets the dictionary of variants from the current dataframe

    Parameters
    --------------
    df
        Dataframe
    parameters
        Possible parameters of the algorithm, including:
            Parameters.ACTIVITY_KEY -> Column that contains the activity

    Returns
    --------------
    variants_set
        Dictionary of variants in the log (variant -> number of cases)
    """
    if parameters is None:
        parameters = {}
    var_stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    if var_stats:
        # each stats record carries "variant" plus exactly one count field
        # (named after the case id column); find it without materializing a list
        count_key = next(k for k in var_stats[0] if k != "variant")
        return {x["variant"]: x[count_key] for x in var_stats}
    return {}
def get_variants_list(log, parameters=None):
    """
    Gets the list of variants (along with their count) from the particular log_skeleton type

    Parameters
    ------------
    log
        Log (either a pandas DataFrame or an event log object)
    parameters
        Parameters of the algorithm

    Returns
    -------------
    variants_list
        List of variants of the log_skeleton (along with their count)
    """
    from pm4py.statistics.traces.pandas import case_statistics as pd_case_statistics
    from pm4py.statistics.traces.log import case_statistics as log_case_statistics

    # both back-ends return records with the same shape, so only the
    # retrieval differs; a single extraction loop serves both cases
    # (the original duplicated the loop body verbatim in each branch)
    if type(log) is pd.DataFrame:
        variants = pd_case_statistics.get_variant_statistics(log, parameters=parameters)
    else:
        variants = log_case_statistics.get_variant_statistics(log, parameters=parameters)

    variants_list = []
    for var in variants:
        # besides "variant", each record carries exactly one count field
        count_key = next(k for k in var if k != "variant")
        variants_list.append((var["variant"], var[count_key]))
    return variants_list
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only
    control-flow perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    parameters = parameters if parameters is not None else {}
    stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    event_log = EventLog()
    for record in stats:
        trace = Trace()
        # rebuild the control flow of the variant, one event per activity
        for activity in record["variant"].split(","):
            event = Event()
            event[xes.DEFAULT_NAME_KEY] = activity
            trace.append(event)
        event_log.append(trace)
    return event_log
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing 1 case per variant (only
    control-flow perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.pandas import case_statistics
    if parameters is None:
        parameters = {}
    activity_key = parameters.get(pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)
    stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    event_log = EventLog()
    for record in stats:
        trace = Trace()
        # rebuild the control flow of the variant, one event per activity
        for activity in record["variant"].split(","):
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        event_log.append(trace)
    return event_log
def apply(log, parameters=None):
    """
    Calculates the Working Together metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list. Moreover, last boolean indicates that the metric
        is not directed.
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)

    # variants are computed over the resource column, so each "variant" is the
    # comma-joined sequence of resources of a case
    parameters_variants = {
        case_statistics.Parameters.ACTIVITY_KEY: resource_key,
        case_statistics.Parameters.ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {
        x["variant"]: x["case:concept:name"]
        for x in case_statistics.get_variant_statistics(log, parameters=parameters_variants)
    }
    resources = [x.split(",") for x in variants_occ]
    flat_list = sorted(set(item for sublist in resources for item in sublist))
    # resolve each resource's matrix index once instead of repeated O(n) .index() scans
    res_idx = {res: i for i, res in enumerate(flat_list)}
    total_cases = float(len(log))

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    for rv in resources:
        # weight of this variant: fraction of cases following it (hoisted out of the loops)
        occ = float(variants_occ[",".join(rv)]) / total_cases
        ord_res_list = sorted(set(rv))
        for i in range(len(ord_res_list) - 1):
            res_i = res_idx[ord_res_list[i]]
            for j in range(i + 1, len(ord_res_list)):
                res_j = res_idx[ord_res_list[j]]
                # working together is undirected: update both directions
                metric_matrix[res_i, res_j] += occ
                metric_matrix[res_j, res_i] += occ

    return [metric_matrix, flat_list, False]
def test_25(self):
    """Compute and sort variant statistics for the running-example dataframe."""
    from pm4py.statistics.traces.pandas import case_statistics
    df = self.load_running_example_df()
    params = {
        case_statistics.Parameters.CASE_ID_KEY: "case:concept:name",
        case_statistics.Parameters.ACTIVITY_KEY: "concept:name",
        case_statistics.Parameters.TIMESTAMP_KEY: "time:timestamp"
    }
    variants_count = case_statistics.get_variant_statistics(df, parameters=params)
    variants_count = sorted(variants_count,
                            key=lambda record: record["case:concept:name"],
                            reverse=True)
def get_variants(path, log_name, managed_logs, parameters=None):
    """
    Aggregates the variants (with their counts) across the managed parquet
    partitions of a log, returning a window of the most frequent variants
    together with the number of events and cases scanned.
    """
    if parameters is None:
        parameters = {}
    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION, DEFAULT_USE_TRANSITION)
    window_size = parameters.get(PARAMETER_NUM_RET_ITEMS, DEFAULT_WINDOW_SIZE)
    start = parameters.get(PARAMETER_START, 0)
    activity_key = PARAMETER_PM4PYWS_CLASSIFIER if use_transition else DEFAULT_NAME_KEY
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters,
                                    [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
                                    use_transition=use_transition)
    parquet_list = parquet_importer.get_list_parquet(folder)

    dictio_variants = {}
    events = 0
    cases = 0
    count = 0
    for pq in parquet_list:
        # skip partitions that are not part of the managed set
        if Path(pq).name not in managed_logs:
            continue
        count += 1
        df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition,
                                  parameters=parameters, force_classifier_insertion=True)
        events += len(df)
        cases += df[CASE_CONCEPT_NAME].nunique()
        # dictio = dictio + Counter(dict(df[attribute_key].value_counts()))
        d_variants = {x["variant"]: x for x in case_statistics.get_variant_statistics(df)}
        for variant, stat in d_variants.items():
            # normalize the record: the case count comes back under the case id column
            stat["count"] = stat.pop("case:concept:name")
            if variant not in dictio_variants:
                dictio_variants[variant] = stat
            else:
                dictio_variants[variant]["count"] += stat["count"]
        # trim to the requested window of currently most frequent variants
        ranked = sorted(dictio_variants.values(), key=lambda x: x["count"], reverse=True)
        ranked = ranked[start:min(len(ranked), window_size)]
        dictio_variants = {x["variant"]: x for x in ranked}
        if count >= no_samples:
            break

    list_variants = sorted(dictio_variants.values(), key=lambda x: x["count"], reverse=True)
    return {"variants": list_variants, "events": events, "cases": cases}
def test_filtering_variants(self):
    """Filter the running-example CSV on its first reported variant."""
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    log_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    frame = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(log_path, sep=',')
    all_variants = case_statistics.get_variant_statistics(frame)
    first_variant = [all_variants[0]["variant"]]
    frame = variants_filter.apply(frame, first_variant)
    del frame
def test_filtering_variants(self):
    """Filter the running-example CSV (read via pandas) on its first variant."""
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    log_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
    frame = pd.read_csv(log_path)
    frame = dataframe_utils.convert_timestamp_columns_in_df(frame)
    all_variants = case_statistics.get_variant_statistics(frame)
    first_variant = [all_variants[0]["variant"]]
    frame = variants_filter.apply(frame, first_variant)
    del frame
def apply(df, parameters=None):
    """
    Convert a dataframe into a log containing N case per variant (only
    control-flow perspective is considered)

    Parameters
    -------------
    df
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    -------------
    log
        Event log
    """
    from pm4py.statistics.traces.pandas import case_statistics
    if parameters is None:
        parameters = {}
    return_variants = parameters.get(RETURN_VARIANTS, False)
    case_glue = parameters.get(pm4_constants.PARAMETER_CONSTANT_CASEID_KEY,
                               pm4_constants.CASE_CONCEPT_NAME)
    activity_key = parameters.get(pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY,
                                  xes.DEFAULT_NAME_KEY)

    variant_stats = case_statistics.get_variant_statistics(df, parameters=parameters)

    log = EventLog()
    all_variants_log = {}
    for vd in variant_stats:
        variant_str = vd["variant"]
        variant_count = vd[case_glue]
        # one Trace object per variant; the same object is appended once per
        # occurrence, so repeated cases share the trace
        trace = Trace()
        for activity in variant_str.split(","):
            event = Event()
            event[activity_key] = activity
            trace.append(event)
        all_variants_log[variant_str] = []
        for _ in range(variant_count):
            log.append(trace)
            all_variants_log[variant_str].append(len(log) - 1)

    if return_variants:
        return log, all_variants_log
    return log
def apply_auto_filter(df, parameters=None):
    """
    Apply an automatic filter on variants

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            variants_df -> If provided, avoid recalculation of the variants dataframe
            Parameters.DECREASING_FACTOR -> Decreasing factor that should be passed to the algorithm

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    decreasing_factor = exec_utils.get_param_value(Parameters.DECREASING_FACTOR, parameters,
                                                   filtering_constants.DECREASING_FACTOR)

    # cache the variants dataframe so downstream calls do not recompute it
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variants = case_statistics.get_variant_statistics(df, parameters=parameters)

    # admit variants while each count stays within decreasing_factor of the
    # previously admitted variant's count
    # NOTE(review): relies on the variants being ordered by decreasing count — confirm
    admitted_variants = []
    if variants:
        previous_count = variants[0][case_id_glue]
        for variant in variants:
            count = variant[case_id_glue]
            if count < decreasing_factor * previous_count:
                break
            admitted_variants.append(variant["variant"])
            previous_count = count

    return apply(df, admitted_variants, parameters=parameters)
def apply_auto_filter(df, parameters=None):
    """
    Apply an automatic filter on variants

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            case_id_glue -> Column that contains the Case ID
            activity_key -> Column that contains the activity
            variants_df -> If provided, avoid recalculation of the variants dataframe
            decreasingFactor -> Decreasing factor that should be passed to the algorithm

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = parameters.get(PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)

    # cache the variants dataframe so downstream calls do not recompute it
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variants = case_statistics.get_variant_statistics(df, parameters=parameters)

    decreasing_factor = parameters.get("decreasingFactor", filtering_constants.DECREASING_FACTOR)

    # admit variants while each count stays within decreasing_factor of the
    # previously admitted variant's count
    admitted_variants = []
    if len(variants) > 0:
        previous_count = variants[0][case_id_glue]
        for variant in variants:
            if variant[case_id_glue] < decreasing_factor * previous_count:
                break
            admitted_variants.append(variant["variant"])
            previous_count = variant[case_id_glue]

    return apply(df, admitted_variants, parameters=parameters)
def get_variants_set(df, parameters=None):
    """
    Gets the set of variants from the current dataframe

    Parameters
    --------------
    df
        Dataframe
    parameters
        Possible parameters of the algorithm, including:
            activity_key -> Column that contains the activity

    Returns
    --------------
    variants_set
        Set of variants in the log
    """
    parameters = {} if parameters is None else parameters
    stats = case_statistics.get_variant_statistics(df, parameters=parameters)
    return {record["variant"] for record in stats}
def apply(log, parameters=None):
    """
    Calculates the Subcontracting metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.N -> n of the algorithm proposed in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list
    """
    if parameters is None:
        parameters = {}

    import numpy
    from pm4py.statistics.traces.pandas import case_statistics

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)
    n = exec_utils.get_param_value(Parameters.N, parameters, 2)

    # variants are computed over the resource column, so each "variant" is the
    # comma-joined sequence of resources of a case
    parameters_variants = {
        case_statistics.Parameters.ACTIVITY_KEY: resource_key,
        case_statistics.Parameters.ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {
        x["variant"]: x["case:concept:name"]
        for x in case_statistics.get_variant_statistics(log, parameters=parameters_variants)
    }
    resources = [x.split(",") for x in variants_occ]
    flat_list = sorted(set(item for sublist in resources for item in sublist))
    # resolve each resource's matrix index once instead of repeated O(n) .index() scans
    res_idx = {res: i for i, res in enumerate(flat_list)}

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    for rv in resources:
        occ = variants_occ[",".join(rv)]
        for i in range(len(rv) - n):
            res_i = res_idx[rv[i]]
            # subcontracting: the same resource occurs at positions i and i + n
            if res_i == res_idx[rv[i + n]]:
                row = sum_i_to_j.setdefault(res_i, {})
                for j in range(i + 1, i + n):
                    res_j = res_idx[rv[j]]
                    row[res_j] = row.get(res_j, 0) + occ

    # normalization: total number of handovers across all cases
    dividend = sum(variants_occ[",".join(rv)] * (len(rv) - 1) for rv in resources)

    for key1 in sum_i_to_j:
        for key2 in sum_i_to_j[key1]:
            metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend

    return [metric_matrix, flat_list, True]
def test_24(self):
    """Compute and sort variant statistics for the running-example event log."""
    from pm4py.statistics.traces.log import case_statistics
    event_log = self.load_running_example_xes()
    variants_count = case_statistics.get_variant_statistics(event_log)
    variants_count = sorted(variants_count,
                            key=lambda record: record["count"],
                            reverse=True)
def apply(df, parameters=None):
    """
    Returns a Pandas dataframe from which a sound workflow net could be extracted taking into account
    a discovery algorithm returning models only with visible transitions

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including:
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    # fill in the default column keys for any that the caller did not supply
    if PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_CASEID_KEY] = CASE_CONCEPT_NAME
    if PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ACTIVITY_KEY] = DEFAULT_NAME_KEY
    if PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY] = DEFAULT_TIMESTAMP_KEY
    if PARAMETER_CONSTANT_ATTRIBUTE_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            PARAMETER_CONSTANT_ACTIVITY_KEY]
    caseid_glue = parameters[PARAMETER_CONSTANT_CASEID_KEY]
    activity_key = parameters[PARAMETER_CONSTANT_ACTIVITY_KEY]
    timest_key = parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY]
    max_no_variants = parameters[
        "max_no_variants"] if "max_no_variants" in parameters else 20
    # cache the variants dataframe so the statistics call below reuses it
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variant_stats = case_statistics.get_variant_statistics(
        df, parameters=parameters)
    # rank variants by (count, variant string), most frequent first
    all_variants_list = []
    for var in variant_stats:
        all_variants_list.append([var["variant"], var[caseid_glue]])
    all_variants_list = sorted(all_variants_list,
                               key=lambda x: (x[1], x[0]),
                               reverse=True)
    considered_variants = []
    considered_traces = []
    i = 0
    # greedily admit variants: each candidate is kept only if the model mined
    # from the admitted set stays a sound workflow net with (near-)perfect fitness
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]
        considered_variants.append(variant)
        filtered_df = variants_filter.apply(df,
                                            considered_variants,
                                            parameters=parameters)
        dfg_frequency = dfg_util.get_dfg_graph(filtered_df,
                                               measure="frequency",
                                               perf_aggregation_key="median",
                                               case_id_glue=caseid_glue,
                                               activity_key=activity_key,
                                               timestamp_key=timest_key)
        net, initial_marking, final_marking = alpha_miner.apply_dfg(
            dfg_frequency, parameters=parameters)
        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            # adding this variant broke soundness: roll it back
            del considered_variants[-1]
        else:
            # take one representative trace (the first group) for this variant
            traces_of_this_variant = variants_filter.apply(
                df, [variant], parameters=parameters).groupby(caseid_glue)
            traces_of_this_variant_keys = list(
                traces_of_this_variant.groups.keys())
            trace_of_this_variant = traces_of_this_variant.get_group(
                traces_of_this_variant_keys[0])
            this_trace = transform.transform_event_log_to_trace_log(
                pandas_df_imp.convert_dataframe_to_event_log(
                    trace_of_this_variant),
                case_glue=caseid_glue)[0]
            # alignments read the default name key, so mirror the activity
            # values into it when a custom activity key is in use
            if not activity_key == DEFAULT_NAME_KEY:
                for j in range(len(this_trace)):
                    this_trace[j][DEFAULT_NAME_KEY] = this_trace[j][
                        activity_key]
            considered_traces.append(this_trace)
            filtered_log = TraceLog(considered_traces)
            try:
                alignments = alignment_factory.apply(filtered_log, net,
                                                     initial_marking,
                                                     final_marking)
                del alignments
                fitness = replay_fitness_factory.apply(filtered_log, net,
                                                       initial_marking,
                                                       final_marking,
                                                       parameters=parameters)
                # roll back the variant if replay fitness is not (near) perfect
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                # alignment/replay failed on this model: roll back the variant
                del considered_variants[-1]
                del considered_traces[-1]
        i = i + 1
    return variants_filter.apply(df,
                                 considered_variants,
                                 parameters=parameters)
def apply(log, parameters=None):
    """
    Calculates the HW metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            Parameters.BETA -> beta value as described in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list. Moreover, last boolean indicates that the metric
        is directed.
    """
    if parameters is None:
        parameters = {}

    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)
    beta = exec_utils.get_param_value(Parameters.BETA, parameters, 0)

    # variants are computed over the resource column, so each "variant" is the
    # comma-joined sequence of resources of a case
    parameters_variants = {case_statistics.Parameters.ACTIVITY_KEY: resource_key,
                           case_statistics.Parameters.ATTRIBUTE_KEY: resource_key}
    variants_occ = {x["variant"]: x["case:concept:name"] for x in
                    case_statistics.get_variant_statistics(log, parameters=parameters_variants)}
    resources = [x.split(",") for x in variants_occ]
    flat_list = sorted(set(item for sublist in resources for item in sublist))
    # resolve each resource's matrix index once instead of repeated O(n) .index() scans
    res_idx = {res: i for i, res in enumerate(flat_list)}

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    for rv in resources:
        occ = variants_occ[",".join(rv)]
        for i in range(len(rv) - 1):
            res_i = res_idx[rv[i]]
            row = sum_i_to_j.setdefault(res_i, {})
            for j in range(i + 1, len(rv)):
                res_j = res_idx[rv[j]]
                if res_j not in row:
                    row[res_j] = 0
                if beta == 0:
                    # beta == 0: only the direct successor counts
                    row[res_j] += occ
                    break
                # beta > 0: contribution geometrically discounted by distance
                row[res_j] += occ * (beta ** (j - i - 1))

    # the dividend is beta-independent (the original's beta branches here were
    # byte-identical): total number of handovers across all cases
    dividend = sum(variants_occ[",".join(rv)] * (len(rv) - 1) for rv in resources)

    for key1 in sum_i_to_j:
        for key2 in sum_i_to_j[key1]:
            metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend

    return [metric_matrix, flat_list, True]
def apply(log, parameters=None):
    """
    Calculates the Subcontracting metric

    Parameters
    ------------
    log
        Log
    parameters
        Possible parameters of the algorithm:
            n -> n of the algorithm proposed in the Wil SNA paper

    Returns
    -----------
    tuple
        Tuple containing the metric matrix and the resources list
    """
    if parameters is None:
        parameters = {}

    if constants.PARAMETER_CONSTANT_RESOURCE_KEY in parameters:
        resource_key = parameters[constants.PARAMETER_CONSTANT_RESOURCE_KEY]
    else:
        resource_key = xes.DEFAULT_RESOURCE_KEY
    n = parameters[N] if N in parameters else 2

    # variants over the resource column: each "variant" is the comma-joined
    # sequence of resources of a case
    variant_params = {
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: resource_key,
        constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: resource_key
    }
    variants_occ = {}
    for record in get_variant_statistics(log, parameters=variant_params):
        variants_occ[record["variant"]] = record["case:concept:name"]

    resources = [variant.split(",") for variant in variants_occ]
    flat_list = sorted({item for sublist in resources for item in sublist})

    metric_matrix = numpy.zeros((len(flat_list), len(flat_list)))

    sum_i_to_j = {}
    for rv in resources:
        occ = variants_occ[",".join(rv)]
        for i in range(len(rv) - n):
            res_i = flat_list.index(rv[i])
            # subcontracting requires the same resource at positions i and i + n
            if res_i != flat_list.index(rv[i + n]):
                continue
            row = sum_i_to_j.setdefault(res_i, {})
            for j in range(i + 1, i + n):
                res_j = flat_list.index(rv[j])
                row[res_j] = row.get(res_j, 0) + occ

    dividend = 0
    for rv in resources:
        dividend += variants_occ[",".join(rv)] * (len(rv) - 1)

    for key1 in sum_i_to_j:
        for key2 in sum_i_to_j[key1]:
            metric_matrix[key1][key2] = sum_i_to_j[key1][key2] / dividend

    return (metric_matrix, flat_list, True)