def filter_log_by_variants_percentage(log, variants, variants_percentage=0.0):
    """
    Filter the log by variants percentage

    Variants are visited in the order returned by get_variants_sorted_by_count
    (expected: most frequent first). A variant is kept while the traces kept so
    far cover strictly less than variants_percentage of the log; the first
    variant is kept unconditionally.

    Parameters
    ----------
    log
        Log
    variants
        Dictionary with variant as the key and the list of traces as the value
    variants_percentage
        Percentage of variants that should be kept (the most common variant is always kept)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    if isinstance(log, EventLog):
        # Carry over the log-level metadata instead of silently dropping it,
        # consistently with filter_variants_variants_percentage.
        filtered_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions,
                                classifiers=log.classifiers, omni_present=log.omni_present,
                                properties=log.properties)
    else:
        # Any other sized container of traces: no metadata to copy.
        filtered_log = EventLog()

    no_of_traces = len(log)
    already_added_sum = 0

    for entry in get_variants_sorted_by_count(variants):
        variant, varcount = entry[0], entry[1]
        # The coverage ratio is only needed (and only computed) once something
        # has already been added.
        if already_added_sum == 0 or already_added_sum / no_of_traces < variants_percentage:
            for trace in variants[variant]:
                filtered_log.append(trace)
            already_added_sum += varcount
        else:
            # The running sum no longer changes once the threshold is reached,
            # so no later variant could be accepted either.
            break

    return filtered_log
def filter_variants_top_k(log, k, parameters=None):
    """
    Keeps the top-k variants of the log

    The variants are computed from the log, ranked through
    get_variants_sorted_by_count, and the first k of that ranking are
    handed to apply() together with the log.

    Parameters
    -------------
    log
        Event log
    k
        Number of variants that should be kept
    parameters
        Parameters (forwarded to get_variants and apply; defaults to an empty dictionary)

    Returns
    -------------
    filtered_log
        Filtered log
    """
    params = {} if parameters is None else parameters

    ranked = get_variants_sorted_by_count(get_variants(log, parameters=params))
    # Never ask for more entries than the ranking contains.
    limit = min(k, len(ranked))
    kept_variants = [entry[0] for entry in ranked[:limit]]

    return apply(log, kept_variants, parameters=params)
def filter_variants_variants_percentage(log, variants, variants_percentage=0.0):
    """
    Filter the log by variants percentage

    Variants are added in the order returned by get_variants_sorted_by_count
    until the kept traces cover at least variants_percentage of the log.
    After that point, further variants are still added as long as their count
    is not lower than the count of the variant that reached the threshold.
    The returned log carries the same attributes, extensions, classifiers,
    omni_present and properties as the input log.

    Parameters
    ----------
    log
        Log
    variants
        Dictionary with variant as the key and the list of traces as the value
    variants_percentage
        Percentage of variants that should be kept (the most common variant is always kept)

    Returns
    ----------
    filtered_log
        Filtered log
    """
    result = EventLog([], attributes=log.attributes, extensions=log.extensions,
                      classifiers=log.classifiers, omni_present=log.omni_present,
                      properties=log.properties)
    total_traces = len(log)
    ranked = get_variants_sorted_by_count(variants)

    kept_traces = 0
    # Minimum count a variant must have to still be accepted; -1 means the
    # threshold has not been reached yet, so nothing is rejected.
    min_count_to_keep = -1

    for entry in ranked:
        variant, occurrences = entry[0], entry[1]
        if occurrences < min_count_to_keep:
            break

        for trace in variants[variant]:
            result.append(trace)
        kept_traces = kept_traces + occurrences

        if kept_traces / total_traces >= variants_percentage:
            min_count_to_keep = occurrences

    return result
def find_auto_threshold(log, variants, decreasing_factor):
    """
    Find automatically variants filtering threshold based on specified decreasing factor

    Variants are scanned in the order returned by get_variants_sorted_by_count.
    The scan stops at the first variant whose count is not greater than
    decreasing_factor times the count of the previous variant; the traces of
    the variants accepted before that point, divided by the size of the log,
    give the threshold.

    Parameters
    ----------
    log
        Log
    variants
        Dictionary with variant as the key and the list of traces as the value
    decreasing_factor
        Decreasing factor (stops the algorithm when the next variant by occurrence is below
        this factor in comparison to previous)

    Returns
    ----------
    variantsPercentage
        Percentage of variants to keep in the log (0.0 for an empty log)
    """
    no_of_traces = len(log)
    if no_of_traces == 0:
        # An empty log has nothing to keep; returning here also avoids the
        # division by zero below.
        return 0.0

    already_added_sum = 0
    prev_var_count = -1

    for entry in get_variants_sorted_by_count(variants):
        varcount = entry[1]
        # The first variant is always accepted; later ones only while the
        # count does not drop below the decreasing factor.
        if already_added_sum == 0 or varcount > decreasing_factor * prev_var_count:
            already_added_sum += varcount
            prev_var_count = varcount
        else:
            break

    return already_added_sum / no_of_traces