示例#1
0
def apply_auto_filter(df, parameters=None):
    """
    Apply an automatic filter on variants

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            variants_df -> If provided, avoid recalculation of the variants dataframe
            Parameters.DECREASING_FACTOR -> Decreasing factor that should be passed to the algorithm

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    decreasing_factor = exec_utils.get_param_value(
        Parameters.DECREASING_FACTOR, parameters,
        filtering_constants.DECREASING_FACTOR)
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variants = case_statistics.get_variant_statistics(df,
                                                      parameters=parameters)

    admitted_variants = []
    if len(variants) > 0:
        current_variant_count = variants[0][case_id_glue]

        for i in range(len(variants)):
            if variants[i][
                    case_id_glue] >= decreasing_factor * current_variant_count:
                admitted_variants.append(variants[i]["variant"])
            else:
                break
            current_variant_count = variants[i][case_id_glue]

    return apply(df, admitted_variants, parameters=parameters)
示例#2
0
def apply_auto_filter(df, parameters=None):
    """
    Apply an automatic filter on variants

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            case_id_glue -> Column that contains the Case ID
            activity_key -> Column that contains the activity
            variants_df -> If provided, avoid recalculation of the variants dataframe
            decreasingFactor -> Decreasing factor that should be passed to the algorithm

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df
    variants = case_statistics.get_variant_statistics(df,
                                                      parameters=parameters)
    decreasing_factor = parameters[
        "decreasingFactor"] if "decreasingFactor" in parameters else filtering_constants.DECREASING_FACTOR

    admitted_variants = []
    if len(variants) > 0:
        current_variant_count = variants[0][case_id_glue]

        for i in range(len(variants)):
            if variants[i][
                    case_id_glue] >= decreasing_factor * current_variant_count:
                admitted_variants.append(variants[i]["variant"])
            else:
                break
            current_variant_count = variants[i][case_id_glue]

    return apply(df, admitted_variants, parameters=parameters)
示例#3
0
def apply(df, admitted_variants, parameters=None):
    """
    Apply a filter on variants

    Parameters
    -----------
    df
        Dataframe
    admitted_variants
        List of admitted variants (to include/exclude)
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            Parameters.POSITIVE -> Specifies if the filter should be applied including traces (positive=True)
            or excluding traces (positive=False)
            variants_df -> If provided, avoid recalculation of the variants dataframe

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters,
                                          True)
    variants_df = parameters[
        "variants_df"] if "variants_df" in parameters else get_variants_df(
            df, parameters=parameters)
    variants_df = variants_df[variants_df["variant"].isin(admitted_variants)]
    i1 = df.set_index(case_id_glue).index
    i2 = variants_df.index
    if positive:
        ret = df[i1.isin(i2)]
    else:
        ret = df[~i1.isin(i2)]
    return ret
示例#4
0
def apply(df, admitted_variants, parameters=None):
    """
    Apply a filter on variants

    Parameters
    -----------
    df
        Dataframe
    admitted_variants
        List of admitted variants (to include/exclude)
    parameters
        Parameters of the algorithm, including:
            case_id_glue -> Column that contains the Case ID
            activity_key -> Column that contains the activity
            positive -> Specifies if the filter should be applied including traces (positive=True)
            or excluding traces (positive=False)
            variants_df -> If provided, avoid recalculation of the variants dataframe

    Returns
    -----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    positive = parameters["positive"] if "positive" in parameters else True
    variants_df = parameters[
        "variants_df"] if "variants_df" in parameters else get_variants_df(
            df, parameters=parameters)
    variants_df = variants_df[variants_df["variant"].isin(admitted_variants)]
    i1 = df.set_index(case_id_glue).index
    i2 = variants_df.index
    if positive:
        ret = df[i1.isin(i2)]
    else:
        ret = df[~i1.isin(i2)]
    return ret
def apply(df, parameters=None):
    """
    Returns a Pandas dataframe from which a sound workflow net could be extracted taking into account
    a discovery algorithm returning models only with visible transitions

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm, including:
            max_no_variants -> Maximum number of variants to consider to return a Petri net

    Returns
    ------------
    filtered_df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    if PARAMETER_CONSTANT_CASEID_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_CASEID_KEY] = CASE_CONCEPT_NAME
    if PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ACTIVITY_KEY] = DEFAULT_NAME_KEY
    if PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY] = DEFAULT_TIMESTAMP_KEY
    if PARAMETER_CONSTANT_ATTRIBUTE_KEY not in parameters:
        parameters[PARAMETER_CONSTANT_ATTRIBUTE_KEY] = parameters[
            PARAMETER_CONSTANT_ACTIVITY_KEY]

    caseid_glue = parameters[PARAMETER_CONSTANT_CASEID_KEY]
    activity_key = parameters[PARAMETER_CONSTANT_ACTIVITY_KEY]
    timest_key = parameters[PARAMETER_CONSTANT_TIMESTAMP_KEY]

    max_no_variants = parameters[
        "max_no_variants"] if "max_no_variants" in parameters else 20

    variants_df = case_statistics.get_variants_df(df, parameters=parameters)
    parameters["variants_df"] = variants_df

    variant_stats = case_statistics.get_variant_statistics(
        df, parameters=parameters)

    all_variants_list = []
    for var in variant_stats:
        all_variants_list.append([var["variant"], var[caseid_glue]])

    all_variants_list = sorted(all_variants_list,
                               key=lambda x: (x[1], x[0]),
                               reverse=True)

    considered_variants = []
    considered_traces = []

    i = 0
    while i < min(len(all_variants_list), max_no_variants):
        variant = all_variants_list[i][0]

        considered_variants.append(variant)

        filtered_df = variants_filter.apply(df,
                                            considered_variants,
                                            parameters=parameters)

        dfg_frequency = dfg_util.get_dfg_graph(filtered_df,
                                               measure="frequency",
                                               perf_aggregation_key="median",
                                               case_id_glue=caseid_glue,
                                               activity_key=activity_key,
                                               timestamp_key=timest_key)

        net, initial_marking, final_marking = alpha_miner.apply_dfg(
            dfg_frequency, parameters=parameters)

        is_sound = check_soundness.check_petri_wfnet_and_soundness(net)
        if not is_sound:
            del considered_variants[-1]
        else:
            traces_of_this_variant = variants_filter.apply(
                df, [variant], parameters=parameters).groupby(caseid_glue)
            traces_of_this_variant_keys = list(
                traces_of_this_variant.groups.keys())
            trace_of_this_variant = traces_of_this_variant.get_group(
                traces_of_this_variant_keys[0])

            this_trace = transform.transform_event_log_to_trace_log(
                pandas_df_imp.convert_dataframe_to_event_log(
                    trace_of_this_variant),
                case_glue=caseid_glue)[0]
            if not activity_key == DEFAULT_NAME_KEY:
                for j in range(len(this_trace)):
                    this_trace[j][DEFAULT_NAME_KEY] = this_trace[j][
                        activity_key]
            considered_traces.append(this_trace)
            filtered_log = TraceLog(considered_traces)

            try:
                alignments = alignment_factory.apply(filtered_log, net,
                                                     initial_marking,
                                                     final_marking)
                del alignments
                fitness = replay_fitness_factory.apply(filtered_log,
                                                       net,
                                                       initial_marking,
                                                       final_marking,
                                                       parameters=parameters)
                if fitness["log_fitness"] < 0.99999:
                    del considered_variants[-1]
                    del considered_traces[-1]
            except TypeError:
                del considered_variants[-1]
                del considered_traces[-1]

        i = i + 1

    return variants_filter.apply(df,
                                 considered_variants,
                                 parameters=parameters)
示例#6
0
    if ENABLE_TESTS:
        # TEST 6: discover DFG from Pandas
        t0 = time.time()
        pd_dfg_discovery.get_dfg_graph(roadtraffic_df)
        t1 = time.time()
        T6[0] = (t1 - t0)
        T6[2] = math.ceil(T6[1] / (T6[0] + 0.00000001) * 1000.0)
        print(
            "TEST 6 - Discovering DFG from Pandas - %.5f s (test score: %d)" %
            (T6[0], T6[2]))

    if ENABLE_TESTS:
        # TEST 7: discover variants from dataframe
        t0 = time.time()
        pd_case_statistics.get_variants_df(roadtraffic_df)
        t1 = time.time()
        T7[0] = (t1 - t0)
        T7[2] = math.ceil(T7[1] / (T7[0] + 0.00000001) * 1000.0)
        print(
            "TEST 7 - Discover variants from Pandas dataframe - %.5f s (test score: %d)"
            % (T7[0], T7[2]))

    if ENABLE_TESTS:
        # TEST 8: discover timeframe KDE rom dataframe
        t0 = time.time()
        pd_attributes_get.get_kde_date_attribute(roadtraffic_df,
                                                 "time:timestamp")
        t1 = time.time()
        T8[0] = (t1 - t0)
        T8[2] = math.ceil(T8[1] / (T8[0] + 0.00000001) * 1000.0)