Example #1
def get_start_activities(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION, DEFAULT_USE_TRANSITION)
    activity_key = PARAMETER_PM4PYWS_CLASSIFIER if use_transition else DEFAULT_NAME_KEY
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key

    folder = os.path.join(path, log_name)
    columns = get_columns_to_import(filters, [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY], use_transition=use_transition)

    parquet_list = parquet_importer.get_list_parquet(folder)
    overall_sa = Counter()
    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq, columns, filters, use_transition=use_transition, parameters=parameters)

            ea = Counter(start_activities_filter.get_start_activities(df, parameters=parameters))
            overall_sa = overall_sa + ea
            if count >= no_samples:
                break

    # cast Counter values to plain Python ints (they may be numpy scalars)
    for el in overall_sa:
        overall_sa[el] = int(overall_sa[el])

    return dict(overall_sa)
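
A hedged usage sketch of the function above; the folder, log name, and managed-log set are placeholder values, not taken from the source:

# Hypothetical invocation; the path, log name, and partition basename are
# placeholders. `managed_logs` holds the basenames of admissible partitions.
sa = get_start_activities("/data/event_logs", "receipt",
                          {"receipt.parquet"},
                          parameters={PARAMETER_NO_SAMPLES: 5})
print(sa)  # dict mapping each start activity to an integer count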
Example #2
    def get_start_activities(self, parameters=None):
        """
        Gets the start activities from the log

        Returns
        ------------
        start_activities_dict
            Dictionary of start activities
        """
        if parameters is None:
            parameters = {}
        parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = self.activity_key
        parameters[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = self.activity_key
        if self.reduced_grouped_dataframe is not None:
            parameters[constants.GROUPED_DATAFRAME] = self.reduced_grouped_dataframe

        return start_activities_filter.get_start_activities(self.get_reduced_dataframe(), parameters=parameters)
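
This method injects the handler's activity key into the parameters and, when a reduced grouped dataframe has been cached, passes it along so pm4py can skip re-grouping. A minimal call-site sketch, assuming `handler` is an already-initialized instance of the surrounding class:

# Hypothetical call site; `handler` is an assumed, fully loaded instance.
start_acts = handler.get_start_activities()
print(sorted(start_acts, key=start_acts.get, reverse=True))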
Example #3
def execute_script():
    aa = time.time()
    dataframe = csv_import_adapter.import_dataframe_from_path_wo_timeconversion(
        inputLog, sep=',')
    dataframe = csv_import_adapter.convert_caseid_column_to_str(
        dataframe, case_id_glue=CASEID_GLUE)
    dataframe = csv_import_adapter.convert_timestamp_columns_in_df(
        dataframe, timest_format=TIMEST_FORMAT, timest_columns=TIMEST_COLUMNS)
    dataframe = dataframe.sort_values([CASEID_GLUE, TIMEST_KEY])
    dataframe_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    bb = time.time()
    print("importing log time=", (bb - aa))

    parameters_cde = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
        "sort_by_column": "caseDuration",
        "sort_ascending": False,
        "max_ret_cases": 1000
    }
    cases_desc = case_statistics.get_cases_description(
        dataframe, parameters=parameters_cde)

    print(cases_desc)
    bb2 = time.time()
    print("calculating and printing cases_desc = ", (bb2 - bb))
    calculate_process_schema_from_df(dataframe_fa, "NOFILTERS_FREQUENCY.svg",
                                     "NOFILTERS_PERFORMANCE.svg")
    GENERATED_IMAGES.append("NOFILTERS_FREQUENCY.svg")
    GENERATED_IMAGES.append("NOFILTERS_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_fa
    cc = time.time()
    print(
        "saving initial Inductive Miner process schema along with frequency metrics=",
        (cc - bb2))

    dataframe_cp = case_filter.filter_on_case_performance(
        dataframe,
        case_id_glue=CASEID_GLUE,
        timestamp_key=TIMEST_KEY,
        min_case_performance=100000,
        max_case_performance=10000000)
    dataframe_cp_fa = attributes_filter.filter_df_keeping_spno_activities(
        dataframe_cp,
        activity_key=ACTIVITY_KEY,
        max_no_activities=MAX_NO_ACTIVITIES)
    if DELETE_VARIABLES:
        del dataframe_cp
    calculate_process_schema_from_df(dataframe_cp_fa,
                                     "FILTER_CP_FREQUENCY.svg",
                                     "FILTER_CP_PERFORMANCE.svg")
    GENERATED_IMAGES.append("FILTER_CP_FREQUENCY.svg")
    GENERATED_IMAGES.append("FILTER_CP_PERFORMANCE.svg")
    if DELETE_VARIABLES:
        del dataframe_cp_fa
    dd = time.time()
    print("filtering on case performance and generating process schema=",
          (dd - cc))

    if ENABLE_ATTRIBUTE_FILTER:
        parameters_att = {
            constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
            constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: ATTRIBUTE_TO_FILTER,
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ATTRIBUTE_TO_FILTER,
            "positive": True
        }
        dataframe_att = attributes_filter.apply(dataframe,
                                                ATTRIBUTE_VALUES_TO_FILTER,
                                                parameters=parameters_att)
        # dataframe_att = attributes_filter.apply_auto_filter(dataframe, parameters=parameters_att)
        print(
            "all the activities in the log",
            attributes_filter.get_attribute_values(dataframe_att,
                                                   ACTIVITY_KEY))
        dataframe_att_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_att,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_att
        calculate_process_schema_from_df(dataframe_att_fa,
                                         "FILTER_ATT_FREQUENCY.svg",
                                         "FILTER_ATT_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_ATT_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_ATT_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_att_fa
        ee = time.time()
        print("filtering on attribute values and generating process schema=",
              (ee - dd))

    ee = time.time()
    parameters_sa = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    parameters_ea = {
        constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY
    }
    start_act = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters_sa)
    print("start activities in the log = ", start_act)
    end_act = end_activities_filter.get_end_activities(
        dataframe, parameters=parameters_ea)
    print("end activities in the log = ", end_act)
    ff = time.time()
    print("finding start and end activities along with their count", (ff - ee))

    if ENABLE_STARTACT_FILTER:
        dataframe_sa = start_activities_filter.apply(dataframe,
                                                     STARTACT_TO_FILTER,
                                                     parameters=parameters_sa)
        # dataframe_sa = start_activities_filter.apply_auto_filter(dataframe, parameters=parameters_sa)
        start_act = start_activities_filter.get_start_activities(
            dataframe_sa, parameters=parameters_sa)
        print("start activities in the filtered log = ", start_act)
        dataframe_sa_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_sa,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_sa
        calculate_process_schema_from_df(dataframe_sa_fa,
                                         "FILTER_SA_FREQUENCY.svg",
                                         "FILTER_SA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_SA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_SA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_sa_fa
    gg = time.time()
    if ENABLE_STARTACT_FILTER:
        print("filtering start activities time=", (gg - ff))

    if ENABLE_ENDACT_FILTER:
        dataframe_ea = end_activities_filter.apply(dataframe,
                                                   ENDACT_TO_FILTER,
                                                   parameters=parameters_ea)
        # dataframe_ea = end_activities_filter.apply_auto_filter(dataframe, parameters=parameters_ea)
        end_act = end_activities_filter.get_end_activities(
            dataframe_ea, parameters=parameters_ea)
        print("end activities in the filtered log = ", end_act)
        dataframe_ea_fa = attributes_filter.filter_df_keeping_spno_activities(
            dataframe_ea,
            activity_key=ACTIVITY_KEY,
            max_no_activities=MAX_NO_ACTIVITIES)
        if DELETE_VARIABLES:
            del dataframe_ea
        calculate_process_schema_from_df(dataframe_ea_fa,
                                         "FILTER_EA_FREQUENCY.svg",
                                         "FILTER_EA_PERFORMANCE.svg")
        GENERATED_IMAGES.append("FILTER_EA_FREQUENCY.svg")
        GENERATED_IMAGES.append("FILTER_EA_PERFORMANCE.svg")
        if DELETE_VARIABLES:
            del dataframe_ea_fa
    hh = time.time()
    if ENABLE_ENDACT_FILTER:
        print("filtering end activities time=", (hh - gg))

    if REMOVE_GENERATED_IMAGES:
        for image in GENERATED_IMAGES:
            os.remove(image)
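
The script depends on module-level constants that are not shown; the sketch below gives plausible definitions, where every value is an assumption for illustration only:

# Illustrative constants the script above expects; all values are assumptions.
inputLog = "running-example.csv"            # placeholder path
CASEID_GLUE = "case:concept:name"
ACTIVITY_KEY = "concept:name"
TIMEST_KEY = "time:timestamp"
TIMEST_FORMAT = None                        # let pandas infer the format
TIMEST_COLUMNS = None
MAX_NO_ACTIVITIES = 25
ATTRIBUTE_TO_FILTER = "concept:name"        # placeholder attribute
ATTRIBUTE_VALUES_TO_FILTER = ["register request"]
STARTACT_TO_FILTER = ["register request"]
ENDACT_TO_FILTER = ["pay compensation"]
ENABLE_ATTRIBUTE_FILTER = False
ENABLE_STARTACT_FILTER = True
ENABLE_ENDACT_FILTER = True
DELETE_VARIABLES = False
REMOVE_GENERATED_IMAGES = True
GENERATED_IMAGES = []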
Example #4
def apply(dataframe, parameters=None):
    """
    Gets the performance DFG

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    tuple
        Tuple whose first elements are the base64 of an SVG representing
        the model, a text representation of the model, and the format of
        the model, followed by the activities, start/end activities, and
        further metadata
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters.get("decreasingFactor", constants.DEFAULT_DEC_FACTOR)

    activity_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())
    dfg, dfg_perf = df_statistics.get_dfg_graph(dataframe,
                                                activity_key=activity_key,
                                                timestamp_key=timestamp_key,
                                                case_id_glue=case_id_glue,
                                                sort_caseid_required=False,
                                                sort_timestamp_along_case_id=False,
                                                measure="both")
    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    activities = list(activities_count.keys())
    dfg = clean_dfg_based_on_noise_thresh(
        dfg,
        activities,
        decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    dfg_perf = {x: y for x, y in dfg_perf.items() if x in dfg}
    start_activities = list(
        start_activities_filter.get_start_activities(
            dataframe, parameters=parameters).keys())
    gviz = dfg_vis_factory.apply(dfg_perf,
                                 activities_count=activities_count,
                                 variant="performance",
                                 parameters={
                                     "format": "svg",
                                     "start_activities": start_activities,
                                     "end_activities": end_activities
                                 })

    gviz_base64 = base64.b64encode(str(gviz).encode('utf-8'))

    ret_graph = get_graph.get_graph_from_dfg(dfg, start_activities,
                                             end_activities)

    net, im, fm = dfg_conv_factory.apply(dfg,
                                         parameters={
                                             "start_activities":
                                             start_activities,
                                             "end_activities": end_activities
                                         })

    return get_base64_from_gviz(gviz), export_petri_as_string(
        net, im, fm
    ), ".pnml", "parquet", activities, start_activities, end_activities, gviz_base64, ret_graph, "dfg", "perf", None, "", activity_key
Example #5
def calculate_process_schema_composite_object(path,
                                              log_name,
                                              managed_logs,
                                              parameters=None):
    if parameters is None:
        parameters = {}

    performance_required = parameters.get("performance_required", False)
    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION, DEFAULT_USE_TRANSITION)
    activity_key = "@@classifier" if use_transition else DEFAULT_NAME_KEY
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    if performance_required:
        columns = get_columns_to_import(
            filters,
            [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY],
            use_transition=use_transition)
    else:
        columns = get_columns_to_import(filters,
                                        [CASE_CONCEPT_NAME, DEFAULT_NAME_KEY],
                                        use_transition=use_transition)

    if pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters:
        columns.append(
            parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY])
        # swap: the requested attribute becomes the activity key for the DFG,
        # while the original activity key is kept as the attribute key
        activity_key, parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
                pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY], activity_key
    else:
        parameters[
            pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key
    folder = os.path.join(path, log_name)

    parquet_list = parquet_importer.get_list_parquet(folder)
    frequency_dfg = Counter()
    performance_dfg = Counter()
    overall_ea = Counter()
    overall_sa = Counter()
    values = Counter()
    events = 0
    cases = 0

    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1
            df = get_filtered_parquet(pq,
                                      columns,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)

            if performance_required:
                f_dfg, p_dfg = df_statistics.get_dfg_graph(
                    df,
                    activity_key=activity_key,
                    sort_timestamp_along_case_id=False,
                    sort_caseid_required=False,
                    measure="both")
            else:
                f_dfg = df_statistics.get_dfg_graph(
                    df,
                    activity_key=activity_key,
                    sort_timestamp_along_case_id=False,
                    sort_caseid_required=False)

            f_dfg = Counter(f_dfg)

            if performance_required:
                # merge mean edge durations, weighted by the frequencies seen
                # so far and the frequencies in the current partition
                for k in p_dfg:
                    if k not in performance_dfg:
                        performance_dfg[k] = p_dfg[k]
                    else:
                        performance_dfg[k] = (
                            frequency_dfg[k] * performance_dfg[k] + f_dfg[k] *
                            p_dfg[k]) / (frequency_dfg[k] + f_dfg[k])

            frequency_dfg = frequency_dfg + f_dfg
            ea = Counter(
                end_activities_filter.get_end_activities(
                    df, parameters=parameters))
            overall_ea = overall_ea + ea
            sa = Counter(
                start_activities_filter.get_start_activities(
                    df, parameters=parameters))
            overall_sa = overall_sa + sa
            values = values + Counter(dict(df[activity_key].value_counts()))
            events = events + len(df)
            cases = cases + df[CASE_CONCEPT_NAME].nunique()

            if count >= no_samples:
                break

    # cast Counter values to plain Python types before returning
    returned_dict = {}
    returned_dict["events"] = events
    returned_dict["cases"] = cases
    values = dict(values)
    for el in values:
        values[el] = int(values[el])
    returned_dict["activities"] = values
    overall_sa = dict(overall_sa)
    for el in overall_sa:
        overall_sa[el] = int(overall_sa[el])
    returned_dict["start_activities"] = overall_sa
    overall_ea = dict(overall_ea)
    for el in overall_ea:
        overall_ea[el] = int(overall_ea[el])
    returned_dict["end_activities"] = overall_ea
    returned_dict_freq = {}
    for el in frequency_dfg:
        returned_dict_freq[el[0] + "@@" + el[1]] = int(frequency_dfg[el])
    returned_dict["frequency_dfg"] = returned_dict_freq
    if performance_required:
        returned_dict_perf = {}
        for el in performance_dfg:
            returned_dict_perf[el[0] + "@@" + el[1]] = float(
                performance_dfg[el])
        returned_dict["performance_dfg"] = returned_dict_perf

    return returned_dict
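
Edges of the returned DFGs are flattened into string keys by joining source and target with "@@". A short decoding sketch (it assumes activity names themselves contain no "@@"):

# Hypothetical post-processing of the dictionary returned above.
result = calculate_process_schema_composite_object(path, log_name, managed_logs)
freq_dfg = {tuple(key.split("@@")): count
            for key, count in result["frequency_dfg"].items()}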
Example #6
def apply(dataframe, parameters=None):
    """
    Gets the Petri net through Inductive Miner, decorated by performance metric

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    tuple
        Tuple whose first elements are the base64 of an SVG representing
        the model, a text representation of the model, and the format of
        the model, followed by the activities, start/end activities, the
        BPMN export, and further metadata
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters.get("decreasingFactor", constants.DEFAULT_DEC_FACTOR)

    activity_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())

    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(
        start_activities_filter.get_start_activities(
            dataframe, parameters=parameters).keys())

    dfg, dfg_perf = df_statistics.get_dfg_graph(dataframe,
                                                activity_key=activity_key,
                                                timestamp_key=timestamp_key,
                                                case_id_glue=case_id_glue,
                                                sort_caseid_required=False,
                                                sort_timestamp_along_case_id=False,
                                                measure="both")
    dfg = clean_dfg_based_on_noise_thresh(
        dfg,
        activities,
        decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    dfg_perf = {x: y for x, y in dfg_perf.items() if x in dfg}

    net, im, fm = inductive_miner.apply_dfg(dfg,
                                            parameters,
                                            activities=activities,
                                            start_activities=start_activities,
                                            end_activities=end_activities)
    spaths = get_shortest_paths(net)

    bpmn_graph, el_corr, inv_el_corr, el_corr_keys_map = petri_to_bpmn.apply(
        net, im, fm)

    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(
        net, dfg_perf, spaths, activities_count, variant="performance")

    bpmn_aggreg_statistics = convert_performance_map.convert_performance_map_to_bpmn(
        aggregated_statistics, inv_el_corr)
    #bpmn_graph = bpmn_embedding.embed_info_into_bpmn(bpmn_graph, bpmn_aggreg_statistics, "performance")
    bpmn_graph = bpmn_diagram_layouter.apply(bpmn_graph)
    bpmn_string = bpmn_exporter.get_string_from_bpmn(bpmn_graph)

    gviz = bpmn_vis_factory.apply_petri(
        net,
        im,
        fm,
        aggregated_statistics=aggregated_statistics,
        variant="performance",
        parameters={"format": "svg"})
    gviz2 = bpmn_vis_factory.apply_petri(
        net,
        im,
        fm,
        aggregated_statistics=aggregated_statistics,
        variant="performance",
        parameters={"format": "dot"})

    gviz_base64 = get_base64_from_file(gviz2.name)

    ret_graph = get_graph.get_graph_from_petri(net, im, fm)

    return get_base64_from_file(gviz.name), export_petri_as_string(
        net, im, fm
    ), ".pnml", "parquet", activities, start_activities, end_activities, gviz_base64, ret_graph, "indbpmn", "perf", bpmn_string, ".bpmn", activity_key
Example #7
def apply(dataframe, parameters=None):
    """
    Gets the Petri net through Inductive Miner, decorated by frequency metric

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    tuple
        Tuple whose first elements are the base64 of an SVG representing
        the model, a text representation of the model, and the format of
        the model, followed by the activities, start/end activities, and
        further metadata
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters.get("decreasingFactor", constants.DEFAULT_DEC_FACTOR)

    activity_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())
    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(
        start_activities_filter.get_start_activities(
            dataframe, parameters=parameters).keys())

    dfg = df_statistics.get_dfg_graph(dataframe,
                                      activity_key=activity_key,
                                      timestamp_key=timestamp_key,
                                      case_id_glue=case_id_glue,
                                      sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False)
    dfg = clean_dfg_based_on_noise_thresh(
        dfg,
        activities,
        decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)

    net, im, fm = inductive_miner.apply_dfg(dfg,
                                            parameters,
                                            activities=activities,
                                            start_activities=start_activities,
                                            end_activities=end_activities)
    spaths = get_shortest_paths(net)
    aggregated_statistics = get_decorations_from_dfg_spaths_acticount(
        net, dfg, spaths, activities_count, variant="frequency")
    gviz = pn_vis_factory.apply(net,
                                im,
                                fm,
                                parameters={"format": "svg"},
                                variant="frequency",
                                aggregated_statistics=aggregated_statistics)

    gviz_base64 = base64.b64encode(str(gviz).encode('utf-8'))

    ret_graph = get_graph.get_graph_from_petri(net, im, fm)

    return get_base64_from_gviz(gviz), export_petri_as_string(
        net, im, fm
    ), ".pnml", "parquet", activities, start_activities, end_activities, gviz_base64, ret_graph, "inductive", "freq", None, "", activity_key
Example #8
def apply_pandas(df, parameters=None):
    """
    Discovers a Petri net using Heuristics Miner

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm,
        including: activity_key, case_id_glue, timestamp_key,
        dependency_thresh, and_measure_thresh, min_act_count, min_dfg_occurrences, dfg_pre_cleaning_noise_thresh,
        loops_length_two_thresh

    Returns
    ------------
    net
        Petri net
    im
        Initial marking
    fm
        Final marking
    """
    if parameters is None:
        parameters = {}

    activity_key = parameters.get(
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    case_id_glue = parameters.get(
        constants.PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)
    timestamp_key = parameters.get(
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)

    start_activities = pd_sa_filter.get_start_activities(df,
                                                         parameters=parameters)
    end_activities = pd_ea_filter.get_end_activities(df, parameters=parameters)
    activities_occurrences = pd_attributes.get_attribute_values(
        df, activity_key, parameters=parameters)
    activities = list(activities_occurrences.keys())
    if timestamp_key in df:
        dfg = df_statistics.get_dfg_graph(df,
                                          case_id_glue=case_id_glue,
                                          activity_key=activity_key,
                                          timestamp_key=timestamp_key)
        dfg_window_2 = df_statistics.get_dfg_graph(df,
                                                   case_id_glue=case_id_glue,
                                                   activity_key=activity_key,
                                                   timestamp_key=timestamp_key,
                                                   window=2)
        frequency_triples = get_freq_triples.get_freq_triples(
            df,
            case_id_glue=case_id_glue,
            activity_key=activity_key,
            timestamp_key=timestamp_key)

    else:
        dfg = df_statistics.get_dfg_graph(df,
                                          case_id_glue=case_id_glue,
                                          activity_key=activity_key,
                                          sort_timestamp_along_case_id=False)
        dfg_window_2 = df_statistics.get_dfg_graph(
            df,
            case_id_glue=case_id_glue,
            activity_key=activity_key,
            sort_timestamp_along_case_id=False,
            window=2)
        frequency_triples = get_freq_triples.get_freq_triples(
            df,
            case_id_glue=case_id_glue,
            activity_key=activity_key,
            timestamp_key=timestamp_key,
            sort_timestamp_along_case_id=False)

    heu_net = apply_heu_dfg(dfg,
                            activities=activities,
                            activities_occurrences=activities_occurrences,
                            start_activities=start_activities,
                            end_activities=end_activities,
                            dfg_window_2=dfg_window_2,
                            freq_triples=frequency_triples,
                            parameters=parameters)
    net, im, fm = hn_conv_factory.apply(heu_net, parameters=parameters)

    return net, im, fm
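
A hedged usage sketch; `df` is an assumed event dataframe with the default pm4py column names, and the plain-string parameter keys are assumptions based on the docstring above:

# Hypothetical usage of the Heuristics Miner entry point above; the
# parameter key strings are assumptions.
net, im, fm = apply_pandas(df, parameters={"dependency_thresh": 0.6,
                                           "min_act_count": 2})
print(len(net.places), len(net.transitions))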
Example #9
def apply(dataframe, parameters=None):
    """
    Gets the performance HNet

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    tuple
        Tuple whose first elements are the base64 of an SVG representing
        the model, a text representation of the model (None for this
        variant), and the format of the model, followed by the activities,
        start/end activities, and further metadata
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters.get("decreasingFactor", ws_constants.DEFAULT_DEC_FACTOR)

    activity_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=ws_constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities_count = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)

    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    start_activities_count = start_activities_filter.get_start_activities(
        dataframe, parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(start_activities_count.keys())
    end_activities = list(end_activities_count.keys())

    dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
        dataframe,
        case_id_glue=case_id_glue,
        activity_key=activity_key,
        timestamp_key=timestamp_key,
        measure="both",
        sort_caseid_required=False,
        sort_timestamp_along_case_id=False)
    heu_net = HeuristicsNet(dfg_frequency,
                            performance_dfg=dfg_performance,
                            activities=activities,
                            start_activities=start_activities,
                            end_activities=end_activities,
                            activities_occurrences=activities_count)
    heu_net.calculate(
        dfg_pre_cleaning_noise_thresh=ws_constants.DEFAULT_DFG_CLEAN_MULTIPLIER * decreasingFactor)

    vis = heu_vis_factory.apply(heu_net, parameters={"format": "svg"})
    vis2 = heu_vis_factory.apply(heu_net, parameters={"format": "dot"})

    gviz_base64 = get_base64_from_file(vis2.name)

    return get_base64_from_file(vis.name), None, "", "parquet", activities, start_activities, end_activities, gviz_base64, [], "heuristics", "perf", None, "", activity_key
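
Note that the noise threshold handed to heu_net.calculate is DEFAULT_DFG_CLEAN_MULTIPLIER scaled by decreasingFactor, so a smaller factor cleans the DFG less aggressively. A hedged tuning sketch:

# Hypothetical call: a lower "decreasingFactor" keeps more DFG edges before
# the Heuristics Net is computed; `df` is an assumed event dataframe.
result = apply(df, parameters={"decreasingFactor": 0.5})
svg_b64 = result[0]   # base64 of the rendered SVG, per the return above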
Example #10
def apply(dataframe, parameters=None):
    """
    Gets the process tree using Inductive Miner Directly-Follows

    Parameters
    ------------
    dataframe
        Dataframe
    parameters
        Parameters of the algorithm

    Returns
    ------------
    tuple
        Tuple whose first elements are the base64 of an SVG representing
        the model, a text representation of the model (None for this
        variant), and the format of the model, followed by the activities,
        start/end activities, and further metadata
    """
    if parameters is None:
        parameters = {}

    decreasingFactor = parameters.get("decreasingFactor", constants.DEFAULT_DEC_FACTOR)

    activity_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)
    case_id_glue = parameters.get(
        pm4_constants.PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)

    parameters[pm4_constants.RETURN_EA_COUNT_DICT_AUTOFILTER] = True
    dataframe = attributes_filter.filter_df_keeping_spno_activities(
        dataframe,
        activity_key=activity_key,
        max_no_activities=constants.MAX_NO_ACTIVITIES)
    dataframe, end_activities = auto_filter.apply_auto_filter(
        dataframe, parameters=parameters)
    end_activities = list(end_activities.keys())

    activities_count = attributes_filter.get_attribute_values(
        dataframe, activity_key, parameters=parameters)
    activities = list(activities_count.keys())
    start_activities = list(
        start_activities_filter.get_start_activities(
            dataframe, parameters=parameters).keys())

    dfg = df_statistics.get_dfg_graph(dataframe,
                                      activity_key=activity_key,
                                      timestamp_key=timestamp_key,
                                      case_id_glue=case_id_glue,
                                      sort_caseid_required=False,
                                      sort_timestamp_along_case_id=False)
    dfg = clean_dfg_based_on_noise_thresh(
        dfg,
        activities,
        decreasingFactor * constants.DEFAULT_DFG_CLEAN_MULTIPLIER,
        parameters=parameters)
    tree = inductive_miner.apply_tree_dfg(dfg,
                                          parameters,
                                          activities=activities,
                                          start_activities=start_activities,
                                          end_activities=end_activities)
    gviz = pt_vis_factory.apply(tree, parameters={"format": "svg"})

    gviz_base64 = base64.b64encode(str(gviz).encode('utf-8'))

    return get_base64_from_gviz(gviz), None, "", "parquet", activities, start_activities, end_activities, gviz_base64, [], "tree", "freq", None, "", activity_key
Example #11
def apply(df, parameters=None):
    """
    Discover a StarStar model from an ad-hoc built dataframe

    Parameters
    -------------
    df
        Dataframe
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    perspectives_heu
        Dictionary of perspectives associated to Heuristics Net
    """

    if parameters is None:
        parameters = {}

    if len(df) == 0:
        df = pd.DataFrame({"event_id": [], "event_activity": []})

    dependency_thresh = parameters.get(DEPENDENCY_THRESH, DEFAULT_DEPENDENCY_THRESH)
    and_measure_thresh = parameters.get(AND_MEASURE_THRESH, DEFAULT_AND_MEASURE_THRESH)
    min_act_count = parameters.get(MIN_ACT_COUNT, DEFAULT_MIN_ACT_COUNT)
    min_dfg_occurrences = parameters.get(MIN_DFG_OCCURRENCES, DEFAULT_MIN_DFG_OCCURRENCES)
    dfg_pre_cleaning_noise_thresh = parameters.get(
        DFG_PRE_CLEANING_NOISE_THRESH, DEFAULT_DFG_PRE_CLEANING_NOISE_THRESH)
    decreasing_factor_sa_ea = parameters.get(DECREASING_FACTOR, 0.5)
    performance = parameters.get(PERFORMANCE, False)
    perspectives = parameters.get(PERSPECTIVES, None)
    use_timestamp = parameters.get(USE_TIMESTAMP, True)
    sort_caseid_required = parameters.get(SORT_CASEID_REQUIRED, True)
    sort_timestamp_required = parameters.get(SORT_TIMESTAMP_REQUIRED, True)

    perspectives_heu = {}

    r = lambda: random.randint(0, 255)
    if perspectives is None:
        perspectives = list(x for x in df.columns if not x.startswith("event"))

        # del perspectives[perspectives.index("event_id")]
        # del perspectives[perspectives.index("event_activity")]
        # if "event_timestamp" in perspectives:
        #    del perspectives[perspectives.index("event_timestamp")]
        perspectives = sorted(perspectives)
    activities_occurrences = attributes_filter.get_attribute_values(
        df.groupby("event_id").first().reset_index(), "event_activity")
    activities_occurrences = {
        x: y
        for x, y in activities_occurrences.items() if y >= min_act_count
    }
    for p_ind, p in enumerate(perspectives):
        has_timestamp = False
        if "event_timestamp" in df.columns and use_timestamp:
            proj_df = df[["event_id", "event_activity", "event_timestamp",
                          p]].dropna(subset=[p])
            has_timestamp = True
        else:
            proj_df = df[["event_id", "event_activity", p]].dropna(subset=[p])
        """
        proj_df_activities = Counter(list(proj_df["event_activity"]))
        proj_df_activities = set(x for x in proj_df_activities if proj_df_activities[x] >= min_act_count)
        proj_df = proj_df[proj_df["event_activity"].isin(proj_df_activities)]
        print(proj_df_activities)"""

        # proj_df = proj_df.groupby(["event_id", "event_activity", p]).first().reset_index()
        # print('sii')

        proj_df = proj_df[proj_df["event_activity"].isin(
            activities_occurrences)]
        proj_df_first = proj_df.groupby("event_id").first().reset_index()

        if performance:
            dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(
                proj_df,
                activity_key="event_activity",
                case_id_glue=p,
                timestamp_key="event_timestamp",
                measure="both",
                sort_caseid_required=sort_caseid_required,
                sort_timestamp_along_case_id=sort_timestamp_required)
        else:
            if has_timestamp:
                dfg_frequency = df_statistics.get_dfg_graph(
                    proj_df,
                    activity_key="event_activity",
                    case_id_glue=p,
                    timestamp_key="event_timestamp",
                    sort_caseid_required=sort_caseid_required,
                    sort_timestamp_along_case_id=sort_timestamp_required)
            else:
                dfg_frequency = df_statistics.get_dfg_graph(
                    proj_df,
                    activity_key="event_activity",
                    case_id_glue=p,
                    sort_timestamp_along_case_id=False,
                    sort_caseid_required=sort_caseid_required)
        if len(dfg_frequency) > 0:
            this_color = (COLORS[p_ind] if p_ind < len(COLORS)
                          else '#%02X%02X%02X' % (r(), r(), r()))
            parameters_sa_ea = copy(parameters)
            parameters_sa_ea[constants.PARAMETER_CONSTANT_CASEID_KEY] = p
            parameters_sa_ea[
                constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = "event_activity"
            parameters_sa_ea[
                constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = "event_activity"
            start_activities = start_activities_filter.get_start_activities(
                proj_df, parameters=parameters_sa_ea)
            end_activities = end_activities_filter.get_end_activities(
                proj_df, parameters=parameters_sa_ea)
            start_activities = clean_sa_ea(start_activities,
                                           decreasing_factor_sa_ea)
            end_activities = clean_sa_ea(end_activities,
                                         decreasing_factor_sa_ea)
            # for every activity, remember its strongest incoming and
            # outgoing edge so that they survive the frequency filter below
            max_entry = dict()
            max_exit = dict()
            for x in dfg_frequency:
                a1 = x[0]
                a2 = x[1]
                y = dfg_frequency[x]
                if a1 != a2:
                    if a2 not in max_entry or max_entry[a2][1] < y:
                        max_entry[a2] = [x, y]
                    if a1 not in max_exit or max_exit[a1][1] < y:
                        max_exit[a1] = [x, y]
            max_entry = set(y[0] for y in max_entry.values())
            max_exit = set(y[0] for y in max_exit.values())
            #max_entry = {}
            #max_exit = {}
            dfg_frequency = {
                x: y
                for x, y in dfg_frequency.items()
                if y >= min_dfg_occurrences or x in max_entry or x in max_exit
            }
            activities = list(activities_occurrences.keys())

            if performance:
                dfg_performance = {
                    x: y
                    for x, y in dfg_performance.items() if x in dfg_frequency
                }
                heu_net = HeuristicsNet(
                    dfg_frequency,
                    start_activities=start_activities,
                    end_activities=end_activities,
                    default_edges_color=this_color,
                    net_name=p,
                    activities=activities,
                    activities_occurrences=activities_occurrences,
                    performance_dfg=dfg_performance)
            else:
                heu_net = HeuristicsNet(
                    dfg_frequency,
                    start_activities=start_activities,
                    end_activities=end_activities,
                    default_edges_color=this_color,
                    net_name=p,
                    activities=activities,
                    activities_occurrences=activities_occurrences)
            heu_net.calculate(
                dependency_thresh=dependency_thresh,
                and_measure_thresh=and_measure_thresh,
                min_act_count=1,
                min_dfg_occurrences=1,
                dfg_pre_cleaning_noise_thresh=dfg_pre_cleaning_noise_thresh)
            if len(heu_net.nodes) > 0:
                perspectives_heu[p] = heu_net

    return perspectives_heu
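
A hedged usage sketch of the StarStar discovery above. The dataframe layout ("event_id", "event_activity", optionally "event_timestamp", plus one column per perspective) follows the code; the plain-string parameter key is an assumption:

# Hypothetical usage: one Heuristics Net per non-"event" column of `df`;
# the "min_act_count" key string is an assumption.
perspectives = apply(df, parameters={"min_act_count": 2})
for name, heu_net in perspectives.items():
    print(name, "->", len(heu_net.nodes), "nodes")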