Example #1
def apply(log, parameters=None, variant=DFG_NATIVE):
    """
    Calculates the DFG graph (frequency or performance) starting from a log

    Parameters
    ----------
    log
        Log
    parameters
        Possible parameters passed to the algorithms:
            aggregationMeasure -> performance aggregation measure (min, max, mean, median)
            activity_key -> Attribute to use as activity
            timestamp_key -> Attribute to use as timestamp
    variant
        Variant of the algorithm to use, possible values:
            native, frequency, performance, frequency_greedy, performance_greedy

    Returns
    -------
    dfg
        DFG graph
    """
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.
                   PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_CASEID_KEY] = pmutil.constants.CASE_ATTRIBUTE_GLUE
    if isinstance(log, pandas.core.frame.DataFrame):
        log = csv_import_adapter.convert_timestamp_columns_in_df(
            log,
            timest_columns=[
                parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY]
            ])
        dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
            log,
            measure="both",
            activity_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
            timestamp_key=parameters[
                pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY],
            case_id_glue=parameters[
                pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY])
        if 'native' in variant or 'frequency' in variant:
            return dfg_frequency
        else:
            return dfg_performance
    return VERSIONS[variant](log_conversion.apply(log, parameters,
                                                  log_conversion.TO_EVENT_LOG),
                             parameters=parameters)
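
A minimal usage sketch for the factory above, assuming the module's own imports (pandas, pmutil, xes_util, csv_import_adapter, df_statistics) are in place; the DataFrame below is made up and uses the XES default column names that apply() falls back to when no parameters are given:

import pandas

# hypothetical two-case event log; column names match the defaults
# (case:concept:name, concept:name, time:timestamp) filled in by apply()
df = pandas.DataFrame({
    "case:concept:name": ["1", "1", "2", "2"],
    "concept:name": ["register", "decide", "register", "decide"],
    "time:timestamp": ["2020-01-01 09:00:00", "2020-01-01 10:00:00",
                       "2020-01-02 09:00:00", "2020-01-02 11:30:00"],
})

dfg = apply(df, variant="frequency")  # roughly {('register', 'decide'): 2}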
Example #2
    def test_prefiltering_dataframe(self):
        # assign a dummy instance attribute to avoid static-method warnings,
        # which the structure of the unittest package would otherwise trigger
        self.dummy_variable = "dummy_value"
        input_log = os.path.join(INPUT_DATA_DIR, "running-example.csv")
        dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(input_log, sep=',')
        dataframe = attributes_filter.filter_df_keeping_spno_activities(dataframe, activity_key="concept:name")
        dataframe = case_filter.filter_on_ncases(dataframe, case_id_glue="case:concept:name")
        dataframe = csv_import_adapter.convert_timestamp_columns_in_df(dataframe)
        dataframe = dataframe.sort_values('time:timestamp')
        event_log = log_conv_fact.apply(dataframe, variant=log_conv_fact.TO_EVENT_STREAM)
        log = log_conv_fact.apply(event_log)
        del log
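
The test assumes module-level imports along these lines; the paths match the pm4py 1.x layout these snippets appear to come from, and INPUT_DATA_DIR is a hypothetical test-data directory:

import os

from pm4py.objects.log.adapters.pandas import csv_import_adapter
from pm4py.algo.filtering.pandas.attributes import attributes_filter
from pm4py.algo.filtering.pandas.cases import case_filter
from pm4py.objects.conversion.log import factory as log_conv_fact

INPUT_DATA_DIR = "input_data"  # hypothetical location of running-example.csv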
Example #3
def convert_df_pm_format(
    df,
    sort=False,
    sort_field="time:timestamp",
    timest_format=None,
    timest_columns=None,
):
    df = convert_timestamp_columns_in_df(df,
                                         timest_format=timest_format,
                                         timest_columns=timest_columns)
    if sort and sort_field:
        df = df.sort_values(sort_field)
    return df
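
A hedged usage sketch; pandas and convert_timestamp_columns_in_df are assumed to be in scope, as in the snippet, and the DataFrame is made up:

df = pandas.DataFrame({
    "case:concept:name": ["1", "1"],
    "concept:name": ["a", "b"],
    "time:timestamp": ["2020-01-01 09:00:00", "2020-01-01 08:00:00"],
})
df = convert_df_pm_format(df, sort=True)  # timestamps parsed, rows ordered by time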
Example #4
    def __init__(
        self,
        df,
        timestamp_field="timestamp",
        case_id_field="case_id",
        activity_field="activity",
    ):

        super(TraceComparison, self).__init__()

        self.timestamp_field = timestamp_field
        self.case_id_field = case_id_field
        self.activity_field = activity_field

        self.event_log = csv_import_adapter.convert_timestamp_columns_in_df(
            df, timest_columns=[timestamp_field])
        self.observe(self.on_trace_change, names="trace")
        self.activities = self.event_log[self.activity_field].unique().tolist()
        self.on_trace_change()
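
The constructor stores the column names, parses the timestamp column, and wires a trace observer; a hypothetical instantiation matching its defaults (the class appears to be traitlets-based, given the self.observe call):

df = pandas.DataFrame({
    "case_id": ["1", "1", "2"],
    "activity": ["register", "decide", "register"],
    "timestamp": ["2020-01-01 09:00", "2020-01-01 10:00", "2020-01-02 09:00"],
})
widget = TraceComparison(df)  # parses "timestamp", collects unique activities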
Example #5
    def __init__(
        self,
        df,
        timestamp_field="timestamp",
        case_id_field="case_id",
        activity_field="activity",
    ):

        super(TraceExplorer, self).__init__()
        self.df = csv_import_adapter.convert_timestamp_columns_in_df(
            df, timest_columns=[timestamp_field])
        events = []
        for index, row in self.df.iterrows():
            events.append({
                "ENTITY_ID": row[case_id_field],
                "TS": row[timestamp_field],
                "EVENT_NAME": row[activity_field],
            })
        self.value = events
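
The iterrows() loop above is correct but slow on large frames; an equivalent vectorized sketch, under the same column-name assumptions as the constructor defaults:

events = (self.df
          .rename(columns={case_id_field: "ENTITY_ID",
                           timestamp_field: "TS",
                           activity_field: "EVENT_NAME"})
          [["ENTITY_ID", "TS", "EVENT_NAME"]]
          .to_dict(orient="records"))  # one dict per event, as in the loop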
Example #6
def apply(net,
          initial_marking=None,
          final_marking=None,
          log=None,
          aggregated_statistics=None,
          parameters=None,
          variant="wo_decoration"):
    if parameters is None:
        parameters = {}
    if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[pmutil.constants.
                   PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
    if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
    if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[
            pmutil.constants.
            PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE
    if log is not None:
        if isinstance(log, pandas.core.frame.DataFrame):
            log = csv_import_adapter.convert_timestamp_columns_in_df(
                log,
                timest_columns=[
                    parameters[
                        pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY]
                ])
        log = log_conversion.apply(log, parameters,
                                   log_conversion.TO_EVENT_LOG)
    return VERSIONS[variant](net,
                             initial_marking,
                             final_marking,
                             log=log,
                             aggregated_statistics=aggregated_statistics,
                             parameters=parameters)
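
A hedged call sketch, assuming this is the pm4py 1.x Petri-net visualization factory and that an event log is already loaded; the inductive_miner import is from that assumed API:

from pm4py.algo.discovery.inductive import factory as inductive_miner

net, im, fm = inductive_miner.apply(event_log)  # event_log assumed to exist
gviz = apply(net, im, fm, log=event_log, variant="frequency")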
Example #7
    def __init__(
        self,
        df,
        timestamp_field="timestamp",
        case_id_field="case_id",
        activity_field="activity",
    ):

        super(ProcessMap, self).__init__()

        self.timestamp_field = timestamp_field
        self.case_id_field = case_id_field
        self.activity_field = activity_field
        self.df = csv_import_adapter.convert_timestamp_columns_in_df(
            df, timest_columns=[timestamp_field])
        self.cases = self.df.groupby(case_id_field)
        self.activities = self.cases.agg({
            activity_field: lambda x: "".join(x)
        }).groupby(activity_field)
        self.sorted_activities = self.activities.size().sort_values(
            ascending=False)

        self.observe(self.on_filter_change, names="filter")
        self.on_filter_change()
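
The agg/groupby chain above effectively counts trace variants: joining each case's activities into one string yields its variant, and sorted_activities holds the variant frequencies in descending order. The same result, sketched more directly with the hypothetical default column names:

variants = (df.groupby("case_id")["activity"]
              .agg("".join)      # one activity string per case = its variant
              .value_counts())   # variant frequencies, descending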
Example #8
def execute_script():
    aa = time.time()
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))

    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)

    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))

    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe,
        case_id_glue=CASEID_GLUE,
        timestamp_key=TIMEST_KEY,
        min_case_performance=100000,
        max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp,
        activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa,
                                     "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))

    if ENABLE_ATTRIBUTE_FILTER:
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att,
                                                   ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
        ee = time.time()
        print("filtering on attribute values and generating process schema=",
              (ee - dd))

    ee = time.time()
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))

    if ENABLE_STARTACT_FILTER:
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))

    if ENABLE_ENDACT_FILTER:
        dataframe_ea = end_activities_filter.apply(dataframe,
                                                   ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))

    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
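
The script leans on module-level constants and a calculate_process_schema_from_df helper defined elsewhere; a plausible configuration sketch (values are illustrative, not from the source):

import os
import time

inputLog = os.path.join("input_data", "receipt.csv")  # hypothetical path
CASEID_GLUE = "case:concept:name"
ACTIVITY_KEY = "concept:name"
TIMEST_KEY = "time:timestamp"
TIMEST_FORMAT = None     # let pandas infer the timestamp format
TIMEST_COLUMNS = None    # convert every date-like column
MAX_NO_ACTIVITIES = 25
GENERATED_IMAGES = []
DELETE_VARIABLES = False
REMOVE_GENERATED_IMAGES = True
ENABLE_ATTRIBUTE_FILTER = False
ENABLE_STARTACT_FILTER = False
ENABLE_ENDACT_FILTER = False
# ATTRIBUTE_TO_FILTER, ATTRIBUTE_VALUES_TO_FILTER, STARTACT_TO_FILTER and
# ENDACT_TO_FILTER would also be needed if the corresponding flags were enabled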
Example #9
def execute_script():
    time1 = time.time()
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=SEP, quotechar=QUOTECHAR)
    time2 = time.time()
    print("time2 - time1: " + str(time2 - time1))
    parameters_filtering = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    if enable_auto_filter:
        dataframe = auto_filter.apply_auto_filter(
            dataframe, parameters=parameters_filtering)
    else:
        dataframe = attributes_filter.apply_auto_filter(
            dataframe, parameters=parameters_filtering)
    time3 = time.time()
    print("time3 - time2: " + str(time3 - time2))
    if enable_filtering_on_cases:
        dataframe = case_filter.filter_on_ncases(dataframe,
                                                 case_id_glue=CASEID_GLUE,
                                                 max_no_cases=max_no_cases)
    time4 = time.time()
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_columns=TIMEST_COLUMNS, timest_format=TIMEST_FORMAT)
    time6 = time.time()
    print("time6 - time4: " + str(time6 - time4))
    # dataframe = dataframe.sort_values('time:timestamp')
    time7 = time.time()
    print("time7 - time6: " + str(time7 - time6))

    # show the filtered dataframe on the screen
    activities_count = attributes_filter.get_attribute_values(
        dataframe, attribute_key=ACTIVITY_KEY)
    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
        dataframe,
        measure="both",
        perf_aggregation_key="median",
        case_id_glue=CASEID_GLUE,
        activity_key=ACTIVITY_KEY,
        timestamp_key=TIMEST_KEY)
    if enable_filtering_df:
        print("len dfg_frequency 0=", len(dfg_frequency))
        dfg_frequency = dfg_filtering.apply(
            dfg_frequency, {"noiseThreshold": filtering_df_noise})
        print("len dfg_frequency 1=", len(dfg_frequency))
    time8 = time.time()
    print("time8 - time7: " + str(time8 - time7))
    gviz = dfg_vis_factory.apply(dfg_frequency,
                                 activities_count=activities_count,
                                 parameters={"format": "svg"})
    dfg_vis_factory.view(gviz)
    net, initial_marking, final_marking = inductive_factory.apply_dfg(
        dfg_frequency)
    # net, initial_marking, final_marking = alpha_factory.apply_dfg(dfg_frequency)
    spaths = get_shortest_paths(net)
    time9 = time.time()
    print("time9 - time8: " + str(time9 - time8))
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(
        net, dfg_performance, spaths, activities_count, variant="performance")
    gviz = pn_vis_factory.apply(net,
                                initial_marking,
                                final_marking,
                                variant="performance",
                                aggregated_statistics=aggregated_statistics,
                                parameters={"format": "svg"})
    time10 = time.time()
    print("time10 - time9: " + str(time10 - time9))
    print("time10 - time1: " + str(time10 - time1))
    pn_vis_factory.view(gviz)
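
As in Example #8, this script assumes module-level configuration; the flags specific to this one might look like the following (illustrative values only):

SEP = ","
QUOTECHAR = "\""
enable_auto_filter = True
enable_filtering_on_cases = False
max_no_cases = 1000
enable_filtering_df = True
filtering_df_noise = 0.05  # noiseThreshold passed to dfg_filtering.apply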