Example #1
def get_events(df, case_id, parameters=None):
    """
    Get events belonging to the specified case

    Parameters
    -----------
    df
        Pandas dataframe
    case_id
        Required case ID
    parameters
        Possible parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column in which the case ID is contained

    Returns
    ----------
    list_eve
        List of events belonging to the case
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)

    return pandas_utils.to_dict_records(df[df[case_id_glue] == case_id])
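A minimal usage sketch for get_events, assuming it is called in a context where the Parameters enum and the helpers used above (exec_utils, pandas_utils) are importable; the dataframe contents are purely illustrative:

import pandas as pd

# Illustrative event data: two cases, three events
df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2"],
    "concept:name": ["register", "pay", "register"],
})

# With no parameters, the default case ID column ("case:concept:name") is used
events_case_1 = get_events(df, "1")

# The case ID column can be overridden through the Parameters enum
events_case_1 = get_events(
    df, "1", parameters={Parameters.CASE_ID_KEY: "case:concept:name"})

print(events_case_1)  # list of dicts, one per event of case "1"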
Example #2
def apply(log, parameters=None):
    """
      Converts the event log to an event stream

      Parameters
      ----------
      log: :class:`pm4py.log.log.EventLog`
          An Event log
      include_case_attributes:
          Default is True
      case_attribute_prefix:
          Default is 'case:'
      enable_deepcopy
          Enables deepcopy (avoid references between input and output objects)

      Returns
          -------
      log : :class:`pm4py.log.log.EventLog`
          An Event stream
      """
    if parameters is None:
        parameters = {}

    stream_post_processing = exec_utils.get_param_value(
        Parameters.STREAM_POST_PROCESSING, parameters, False)
    case_pref = exec_utils.get_param_value(Parameters.CASE_ATTRIBUTE_PREFIX,
                                           parameters, 'case:')
    enable_deepcopy = exec_utils.get_param_value(Parameters.DEEP_COPY,
                                                 parameters, True)
    include_case_attributes = exec_utils.get_param_value(
        Parameters.INCLUDE_CASE_ATTRIBUTES, parameters, True)
    compress = exec_utils.get_param_value(Parameters.COMPRESS, parameters,
                                          True)

    if pkgutil.find_loader("pandas"):
        import pandas
        if isinstance(log, pandas.DataFrame):
            extensions = __detect_extensions(log)
            list_events = pandas_utils.to_dict_records(log)
            if stream_post_processing:
                list_events = __postprocess_stream(list_events)
            if compress:
                list_events = __compress(list_events)
            for i in range(len(list_events)):
                list_events[i] = Event(list_events[i])
            log = log_instance.EventStream(list_events,
                                           attributes={'origin': 'csv'})
            for ex in extensions:
                log.extensions[ex.name] = {
                    xes_constants.KEY_PREFIX: ex.prefix,
                    xes_constants.KEY_URI: ex.uri
                }
    if isinstance(log, EventLog):
        return __transform_event_log_to_event_stream(
            log,
            include_case_attributes=include_case_attributes,
            case_attribute_prefix=case_pref,
            enable_deepcopy=enable_deepcopy)
    return log
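A minimal usage sketch for the DataFrame branch of the apply converter above; the column names follow pm4py's XES-style defaults and the data is purely illustrative:

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["1", "1"],
    "concept:name": ["register", "pay"],
    "time:timestamp": pd.to_datetime(["2024-01-01", "2024-01-02"]),
})

# A DataFrame input takes the pandas branch and yields an EventStream
stream = apply(df)

# The same call, spelling out two of the parameter keys read in the body
stream = apply(df, parameters={Parameters.STREAM_POST_PROCESSING: False,
                               Parameters.COMPRESS: True})

print(len(stream))  # the stream behaves like a list of Event objects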
Example #3
def get_variant_statistics(
    df: pd.DataFrame,
    parameters: Optional[Dict[Union[str, Parameters], Any]] = None
) -> Union[List[Dict[str, int]], List[Dict[List[str], int]]]:
    """
    Get variants from a Pandas dataframe

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            Parameters.MAX_VARIANTS_TO_RETURN -> Maximum number of variants to return
            variants_df -> If provided, avoid recalculation of the variants dataframe

    Returns
    -----------
    variants_list
        List of variants inside the Pandas dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)

    max_variants_to_return = exec_utils.get_param_value(
        Parameters.MAX_VARIANTS_TO_RETURN, parameters, None)
    # Compute the variants dataframe only when it is not supplied, so a
    # provided Parameters.VARIANTS_DF really avoids recalculation
    variants_df = exec_utils.get_param_value(Parameters.VARIANTS_DF,
                                             parameters, None)
    if variants_df is None:
        variants_df = get_variants_df(df, parameters=parameters)

    variants_df = variants_df.reset_index()
    variants_list = pandas_utils.to_dict_records(
        variants_df.groupby("variant").agg("count").reset_index())
    variants_list = sorted(variants_list,
                           key=lambda x: (x[case_id_glue], x["variant"]),
                           reverse=True)
    if max_variants_to_return:
        variants_list = variants_list[:min(len(variants_list),
                                           max_variants_to_return)]
    return variants_list
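A minimal usage sketch for get_variant_statistics; the event data is illustrative, and the exact variant representation (e.g. a comma-separated activity string) depends on the underlying get_variants_df implementation:

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["register", "pay", "register", "reject"],
    "time:timestamp": pd.to_datetime(
        ["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"]),
})

# Each entry pairs a variant with the number of cases following it,
# counted in the case ID column (here "case:concept:name")
variants = get_variant_statistics(df)

# Keep only the most frequent variant (sorting is by count, descending)
top = get_variant_statistics(
    df, parameters={Parameters.MAX_VARIANTS_TO_RETURN: 1})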
Example #4
def apply(dataframe, list_activities, sample_size, parameters=None):
    """
    Finds the performance spectrum given a dataframe
    and a list of activities

    Parameters
    -------------
    dataframe
        Dataframe
    list_activities
        List of activities interesting for the performance spectrum (at least two)
    sample_size
        Size of the sample
    parameters
        Parameters of the algorithm, including:
            - Parameters.ACTIVITY_KEY
            - Parameters.TIMESTAMP_KEY
            - Parameters.CASE_ID_KEY

    Returns
    -------------
    points
        Points of the performance spectrum
    """
    if parameters is None:
        parameters = {}

    import pandas as pd
    import numpy as np

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY,
                                              parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               xes.DEFAULT_TIMESTAMP_KEY)

    dataframe = dataframe[[case_id_glue, activity_key, timestamp_key]]
    dataframe = dataframe[dataframe[activity_key].isin(list_activities)]
    dataframe = pandas_utils.insert_index(dataframe,
                                          constants.DEFAULT_EVENT_INDEX_KEY)
    dataframe = dataframe.sort_values(
        [case_id_glue, timestamp_key, constants.DEFAULT_EVENT_INDEX_KEY])
    dataframe[timestamp_key] = dataframe[timestamp_key].astype(
        np.int64) / 10**9
    # Build one shifted copy ("replica") of the dataframe per activity, so
    # that row k of the concatenated frame pairs event k with events
    # k+1, ..., k+n-1; the columns of replica i get the suffix "_<i>"
    list_replicas = []
    activity_names = []
    filt_col_names = []
    for i in range(len(list_activities)):
        if i > 0:
            dataframe = dataframe.shift(-1)
            activity_names.append("+'@@'+")
        ren = {x: x + "_" + str(i) for x in dataframe.columns}
        list_replicas.append(dataframe.rename(columns=ren))
        filt_col_names.append(timestamp_key + "_" + str(i))

        # Piece of the expression evaluated below, concatenating the activity
        # columns of all replicas separated by '@@'
        activity_names.append("dataframe[activity_key+'_" + str(i) + "']")

    dataframe = pd.concat(list_replicas, axis=1)
    # Keep only rows where all paired events belong to the same case
    for i in range(len(list_activities) - 1):
        dataframe = dataframe[dataframe[case_id_glue + "_" +
                                        str(i)] == dataframe[case_id_glue +
                                                             "_" + str(i + 1)]]
    # Keep only rows whose activity sequence matches the requested one
    dataframe["@@merged_activity"] = eval("".join(activity_names))
    desired_act = "@@".join(list_activities)
    dataframe = dataframe[dataframe["@@merged_activity"] == desired_act]
    dataframe = dataframe[filt_col_names]

    if len(dataframe) > sample_size:
        dataframe = dataframe.sample(n=sample_size)

    points = pandas_utils.to_dict_records(dataframe)
    points = [[p[tk] for tk in filt_col_names] for p in points]
    points = sorted(points, key=lambda x: x[0])

    return points
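A minimal usage sketch for the performance-spectrum apply above; the data is illustrative, and each returned point is a list of epoch-second timestamps, one per activity in list_activities:

import pandas as pd

df = pd.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["register", "pay", "register", "pay"],
    "time:timestamp": pd.to_datetime(
        ["2024-01-01 08:00", "2024-01-01 09:30",
         "2024-01-02 08:00", "2024-01-02 10:00"]),
})

# One point per directly-following occurrence of register -> pay inside the
# same case: [t_register, t_pay], both as epoch seconds
points = apply(df, ["register", "pay"], sample_size=1000, parameters={})
print(points)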