def get_events(df, case_id, parameters=None):
    """
    Get the list of events belonging to the given case.

    Parameters
    -----------
    df
        Pandas dataframe
    case_id
        Required case ID
    parameters
        Possible parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column in which the case ID is contained

    Returns
    ----------
    list_eve
        List of events belonging to the case
    """
    parameters = {} if parameters is None else parameters
    case_column = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    # keep only the rows whose case identifier matches the requested one
    case_rows = df[df[case_column] == case_id]
    return pandas_utils.to_dict_records(case_rows)
def apply(log, parameters=None):
    """
    Converts the event log to an event stream.

    Parameters
    ----------
    log: :class:`pm4py.log.log.EventLog`
        An event log (or a pandas DataFrame, which is converted directly)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.STREAM_POST_PROCESSING -> Enables stream post-processing (default: False)
            Parameters.CASE_ATTRIBUTE_PREFIX -> Prefix used for case attributes (default: 'case:')
            Parameters.DEEP_COPY -> Enables deepcopy (avoids references between input
                and output objects; default: True)
            Parameters.INCLUDE_CASE_ATTRIBUTES -> Includes case attributes in the
                events of the stream (default: True)
            Parameters.COMPRESS -> Enables compression of the resulting stream (default: True)

    Returns
    -------
    log : :class:`pm4py.log.log.EventStream`
        An event stream
    """
    if parameters is None:
        parameters = {}
    stream_post_processing = exec_utils.get_param_value(
        Parameters.STREAM_POST_PROCESSING, parameters, False)
    case_pref = exec_utils.get_param_value(Parameters.CASE_ATTRIBUTE_PREFIX,
                                           parameters, 'case:')
    enable_deepcopy = exec_utils.get_param_value(Parameters.DEEP_COPY,
                                                 parameters, True)
    include_case_attributes = exec_utils.get_param_value(
        Parameters.INCLUDE_CASE_ATTRIBUTES, parameters, True)
    compress = exec_utils.get_param_value(Parameters.COMPRESS, parameters, True)
    # Only attempt the DataFrame path when pandas is actually installed.
    if pkgutil.find_loader("pandas"):
        import pandas
        if isinstance(log, pandas.DataFrame):
            # Record XES extensions present in the dataframe before conversion,
            # so they can be re-attached to the resulting stream below.
            extensions = __detect_extensions(log)
            list_events = pandas_utils.to_dict_records(log)
            if stream_post_processing:
                list_events = __postprocess_stream(list_events)
            if compress:
                list_events = __compress(list_events)
            # Wrap each plain dict into an Event object, in place.
            for i in range(len(list_events)):
                list_events[i] = Event(list_events[i])
            log = log_instance.EventStream(list_events,
                                           attributes={'origin': 'csv'})
            # Carry over the detected extensions onto the new stream.
            for ex in extensions:
                log.extensions[ex.name] = {
                    xes_constants.KEY_PREFIX: ex.prefix,
                    xes_constants.KEY_URI: ex.uri
                }
    if isinstance(log, EventLog):
        return __transform_event_log_to_event_stream(
            log, include_case_attributes=include_case_attributes,
            case_attribute_prefix=case_pref, enable_deepcopy=enable_deepcopy)
    # Already an event stream (or was converted from a DataFrame above).
    return log
def get_variant_statistics(
    df: pd.DataFrame,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Union[List[Dict[str, int]], List[Dict[List[str], int]]]:
    """
    Get variants from a Pandas dataframe.

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            Parameters.MAX_VARIANTS_TO_RETURN -> Maximum number of variants to return
            variants_df -> If provided, avoid recalculation of the variants dataframe

    Returns
    -----------
    variants_list
        List of variants inside the Pandas dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    max_variants_to_return = exec_utils.get_param_value(
        Parameters.MAX_VARIANTS_TO_RETURN, parameters, None)
    # Compute the variants dataframe lazily: the previous implementation passed
    # get_variants_df(...) as the *default* argument, which evaluated it even
    # when a precomputed variants_df was supplied via the parameters.
    variants_df = exec_utils.get_param_value(Parameters.VARIANTS_DF,
                                             parameters, None)
    if variants_df is None:
        variants_df = get_variants_df(df, parameters=parameters)
    variants_df = variants_df.reset_index()
    # Count the number of cases per variant; the case-ID column holds the count
    # after the groupby/agg.
    variants_list = pandas_utils.to_dict_records(
        variants_df.groupby("variant").agg("count").reset_index())
    variants_list = sorted(variants_list,
                           key=lambda x: (x[case_id_glue], x["variant"]),
                           reverse=True)
    if max_variants_to_return:
        # Slicing already clamps to the list length; no min() needed.
        variants_list = variants_list[:max_variants_to_return]
    return variants_list
def apply(dataframe, list_activities, sample_size, parameters=None):
    """
    Finds the performance spectrum provided a dataframe
    and a list of activities.

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum
        (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY
        (now defaults to None, consistent with the body's None-guard)

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    import pandas as pd
    import numpy as np

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)
    # Keep only the columns/rows relevant to the requested activities.
    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    dataframe = dataframe.sort_values(
        [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    # Convert timestamps to seconds since the epoch (float).
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10 ** 9

    # Build one shifted replica of the dataframe per activity position, so that
    # row r of replica i holds event r+i; concatenating them side by side lines
    # up consecutive events of the same case on a single row.
    list_replicas = []
    filt_col_names = []
    for i in range(len(list_activities)):
        if i > 0:
            dataframe = dataframe.shift(-1)
        ren = {x: x + "_" + str(i) for x in dataframe.columns}
        list_replicas.append(dataframe.rename(columns=ren))
        filt_col_names.append(timestamp_key + "_" + str(i))
    dataframe = pd.concat(list_replicas, axis=1)

    # Keep only rows where all aligned events belong to the same case.
    for i in range(len(list_activities) - 1):
        dataframe = dataframe[dataframe[case_id_glue + "_" + str(i)] ==
                              dataframe[case_id_glue + "_" + str(i + 1)]]

    # Concatenate the aligned activity columns directly (previous version built
    # a Python expression string and ran it through eval(), which is both
    # slower and an unnecessary code-injection surface).
    merged_activity = dataframe[activity_key + "_0"]
    for i in range(1, len(list_activities)):
        merged_activity = merged_activity + "@@" + dataframe[
            activity_key + "_" + str(i)]
    dataframe["@@merged_activity"] = merged_activity

    desidered_act = "@@".join(list_activities)
    dataframe = dataframe[dataframe["@@merged_activity"] == desidered_act]
    dataframe = dataframe[filt_col_names]
    if len(dataframe) > sample_size:
        dataframe = dataframe.sample(n=sample_size)
    points = pandas_utils.to_dict_records(dataframe)
    # Each point is the ordered list of timestamps of the matched activities.
    points = [[p[tk] for tk in filt_col_names] for p in points]
    points = sorted(points, key=lambda x: x[0])
    return points