def test_importExportCSVtoCSV(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    df = pd.read_csv(os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    df = dataframe_utils.convert_timestamp_columns_in_df(df)
    event_log = log_conversion.apply(df, variant=log_conversion.TO_EVENT_STREAM)
    event_log = sorting.sort_timestamp(event_log)
    event_log = sampling.sample(event_log)
    event_log = index_attribute.insert_event_index_as_event_attribute(event_log)
    log = log_conversion.apply(event_log)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    event_log_transformed = log_conversion.apply(
        log, variant=log_conversion.TO_EVENT_STREAM)
    df = log_conversion.apply(event_log_transformed,
                              variant=log_conversion.TO_DATA_FRAME)
    df.to_csv(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
    df = pd.read_csv(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
    df = dataframe_utils.convert_timestamp_columns_in_df(df)
    event_log_imported_after_export = log_conversion.apply(
        df, variant=log_conversion.TO_EVENT_STREAM)
    log_imported_after_export = log_conversion.apply(event_log_imported_after_export)
    self.assertEqual(len(log), len(log_imported_after_export))
    os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
def test_applyAlphaMinerToCSV(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    # calculate and compare Petri nets obtained on the same log to verify that instances
    # are working correctly
    log1, net1, marking1, fmarking1 = self.obtainPetriNetThroughAlphaMiner(
        os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    log2, net2, marking2, fmarking2 = self.obtainPetriNetThroughAlphaMiner(
        os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    log1 = sorting.sort_timestamp(log1)
    log1 = sampling.sample(log1)
    log1 = index_attribute.insert_trace_index_as_event_attribute(log1)
    log2 = sorting.sort_timestamp(log2)
    log2 = sampling.sample(log2)
    log2 = index_attribute.insert_trace_index_as_event_attribute(log2)
    petri_exporter.export_net(
        net1, marking1, os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
    os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
    self.assertEqual(len(net1.places), len(net2.places))
    self.assertEqual(len(net1.transitions), len(net2.transitions))
    self.assertEqual(len(net1.arcs), len(net2.arcs))
    final_marking = petri.petrinet.Marking()
    for p in net1.places:
        if not p.out_arcs:
            final_marking[p] = 1
    aligned_traces = token_replay.apply_log(log1, net1, marking1, final_marking)
    self.assertEqual(aligned_traces, aligned_traces)
def test_importExportCSVtoCSV(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    event_log = csv_importer.import_event_stream(
        os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    event_log = sorting.sort_timestamp(event_log)
    event_log = sampling.sample(event_log)
    event_log = index_attribute.insert_event_index_as_event_attribute(event_log)
    log = log_conv_fact.apply(event_log)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    event_log_transformed = log_conv_fact.apply(
        log, variant=log_conv_fact.TO_EVENT_STREAM)
    csv_exporter.export(
        event_log_transformed,
        os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
    event_log_imported_after_export = csv_importer.import_event_stream(
        os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
    log_imported_after_export = log_conv_fact.apply(event_log_imported_after_export)
    self.assertEqual(len(log), len(log_imported_after_export))
    os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.csv"))
def __dotted_attribute_selection(log: Union[EventLog, pd.DataFrame], attributes):
    """
    Default attribute selection for the dotted chart

    Parameters
    -----------------
    log
        Event log
    attributes
        List of attributes (if None, a default selection is computed)

    Returns
    -----------------
    log
        (Sorted) event log
    attributes
        List of attributes
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")
    if attributes is None:
        from pm4py.util import xes_constants
        from pm4py.objects.log.util import sorting
        from pm4py.objects.conversion.log import converter
        log = converter.apply(log, variant=converter.Variants.TO_EVENT_LOG)
        log = sorting.sort_timestamp(log, xes_constants.DEFAULT_TIMESTAMP_KEY)
        for index, trace in enumerate(log):
            trace.attributes["@@index"] = index
        attributes = ["time:timestamp", "case:@@index", "concept:name"]
    return log, attributes
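# Hedged usage sketch (not part of the original module): demonstrates the
# fallback of __dotted_attribute_selection when attributes is None; the XES
# path is a placeholder assumption.
def _example_dotted_attribute_selection():
    from pm4py.objects.log.importer.xes import importer as xes_importer
    log = xes_importer.apply("running-example.xes")  # placeholder path
    # with attributes=None, the helper sorts the log by timestamp, numbers the
    # traces, and falls back to (timestamp, trace index, activity)
    log, attributes = __dotted_attribute_selection(log, None)
    assert attributes == ["time:timestamp", "case:@@index", "concept:name"]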
def apply(con, ref_type="Invoice", keep_first=True, min_extr_date="2020-01-01 00:00:00",
          gjahr="2020", enable_changes=True, enable_payments=True,
          allowed_act_doc_types=None, allowed_act_changes=None, mandt="800"):
    dataframe = o2c_1d_dataframe_extractor.apply(
        con, ref_type=ref_type, keep_first=keep_first, min_extr_date=min_extr_date,
        gjahr=gjahr, enable_changes=enable_changes, enable_payments=enable_payments,
        allowed_act_doc_types=allowed_act_doc_types,
        allowed_act_changes=allowed_act_changes, mandt=mandt)
    log = log_converter.apply(dataframe, parameters={"stream_postprocessing": True})
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
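# Hedged usage sketch (not part of the original module): the O2C extractor
# above expects a live database connection to an SAP instance; the pyodbc DSN
# below is purely a placeholder.
def _example_o2c_extraction():
    import pyodbc
    con = pyodbc.connect("DSN=sap_hana;UID=user;PWD=secret")  # placeholder DSN
    # the returned log is already converted and sorted by "time:timestamp"
    log = apply(con, ref_type="Invoice", gjahr="2020")
    print(len(log), "cases extracted")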
def test_importExportCSVtoXES(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    event_log = csv_importer.import_event_stream(
        os.path.join(INPUT_DATA_DIR, "running-example.csv"))
    event_log = sorting.sort_timestamp(event_log)
    event_log = sampling.sample(event_log)
    event_log = index_attribute.insert_event_index_as_event_attribute(event_log)
    log = log_transform.transform_event_stream_to_event_log(event_log)
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    xes_exporter.export_log(log, os.path.join(OUTPUT_DATA_DIR,
                                              "running-example-exported.xes"))
    log_imported_after_export = xes_importer.import_log(
        os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
    self.assertEqual(len(log), len(log_imported_after_export))
    os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example-exported.xes"))
def testCSVConversion(self):
    dirname = os.path.dirname(__file__)
    rootDir = os.path.join(dirname, 'TestFiles', 'ImportExport')
    csvPath = os.path.join(dirname, 'Ressources/example.csv')
    fileCreator = FileUtility(rootDir)
    eventLog = fileCreator.getEventLogFromFile(csvPath)
    eventLog = sorting.sort_timestamp(eventLog)
    self.assertEqual(len(eventLog), 6)
def apply(con, gjahr="1997", mandt="800", bukrs="1000", **ext_arg):
    dataframe = single_doc_transactions_dataframe.apply(con, gjahr=gjahr,
                                                        bukrs=bukrs, mandt=mandt)
    log = log_converter.apply(dataframe, parameters={"stream_postprocessing": True})
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
def execute_script():
    log = xes_importer.apply(os.path.join("..", "tests", "input_data", "receipt.xes"))
    log = sorting.sort_timestamp(log)
    net, im, fm = inductive_miner.apply(log)
    log1 = EventLog(log[:500])
    log2 = EventLog(log[len(log) - 500:])
    statistics = element_usage_comparison.compare_element_usage_two_logs(
        net, im, fm, log1, log2)
    gviz = pn_vis.apply(net, im, fm, variant=pn_vis.Variants.FREQUENCY,
                        aggregated_statistics=statistics,
                        parameters={pn_vis.Variants.FREQUENCY.value.Parameters.FORMAT: "svg"})
    pn_vis.view(gviz)
def uselog(loginput):
    log = xes_import_factory.apply(loginput)
    log = sorting.sort_timestamp(log)
    # print(log)
    dfg = dfg_factory.apply(log)
    dfg_gv = dfg_vis_fact.apply(dfg, log, parameters={"format": "svg"})
    this_data = dfg_to_g6.dfg_to_g6(dfg)
    # dfg_vis_fact.view(dfg_gv)
    return this_data

# grouplist = get_groups(log)
def get_log_obj_type(self, objtype):
    columns = [x for x in self.exploded_dataframe.columns
               if x.startswith("event_")] + [objtype]
    dataframe = self.exploded_dataframe[columns].dropna(how="any", subset=[objtype])
    dataframe = succint_mdl_to_exploded_mdl.apply(dataframe)
    dataframe = dataframe.rename(columns={"event_activity": "concept:name",
                                          "event_timestamp": "time:timestamp",
                                          objtype: "case:concept:name"})
    stream = EventStream(dataframe.to_dict("records"))
    log = log_conv_factory.apply(stream)
    log = sorting.sort_timestamp(log, "time:timestamp")
    exported_log = base64.b64encode(
        xes_exporter.export_log_as_string(log)).decode("utf-8")
    return self.name + "_" + objtype, "xes", exported_log
def apply(con, gjahr="1997", mandt="800", bukrs="1000", ref_type="Goods receipt"):
    dataframe = doc_flow_transactions_dataframe.apply(con, gjahr=gjahr, bukrs=bukrs,
                                                      mandt=mandt, ref_type=ref_type)
    log = log_converter.apply(dataframe, parameters={"stream_postprocessing": True})
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
def apply(log: EventLog, parameters: Optional[Dict[str, Any]] = None) -> Tuple[List[datetime], np.ndarray]:
    """
    Analyse the evolution of the features over the time using a locally linear embedding.

    Parameters
    -----------------
    log
        Event log
    parameters
        Variant-specific parameters, including:
        - Parameters.ACTIVITY_KEY => the activity key
        - Parameters.TIMESTAMP_KEY => the timestamp key
        - Parameters.CASE_ID_KEY => the case ID key

    Returns
    ----------------
    x
        Date attributes (starting points of the cases)
    y
        Deviation from the standard behavior (higher absolute values of y signal
        a higher deviation from the standard behavior)
    """
    if parameters is None:
        parameters = {}
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    if type(log) is pd.DataFrame:
        # keep only the needed columns
        case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
                                                 constants.CASE_CONCEPT_NAME)
        log = log[[case_id_key, activity_key, timestamp_key]]
    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG,
                              parameters=parameters)
    log = sorting.sort_timestamp(log, timestamp_key)
    x = [trace[0][timestamp_key] for trace in log]
    data, feature_names = log_to_features.apply(log, parameters={
        "str_ev_attr": [activity_key], "str_evsucc_attr": [activity_key]})
    lle = LocallyLinearEmbedding(n_components=1)
    data = lle.fit_transform(data)
    data = np.ndarray.flatten(data)
    y = data
    smooth_amount = 1 + math.floor(math.sqrt(len(y)))
    y = smooth(y, smooth_amount)
    return x, y
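# Hedged usage sketch (not part of the original module): runs the feature
# evolution analysis above and plots deviation over time with matplotlib; the
# XES path is a placeholder.
def _example_feature_evolution():
    import matplotlib.pyplot as plt
    from pm4py.objects.log.importer.xes import importer as xes_importer
    log = xes_importer.apply("receipt.xes")  # placeholder path
    x, y = apply(log)
    # x holds the case start timestamps, y the smoothed 1-D embedding;
    # large |y| marks cases that deviate from the mainstream behavior
    plt.plot(x, y)
    plt.show()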
def execute_script():
    log = xes_importer.apply(
        os.path.join("..", "tests", "input_data", "receipt.xes"))
    log = sorting.sort_timestamp(log)
    net, im, fm = inductive_miner.apply(log)
    log1 = EventLog(log[:500])
    log2 = EventLog(log[len(log) - 500:])
    statistics = element_usage_comparison.compare_element_usage_two_logs(
        net, im, fm, log1, log2)
    gviz = pn_vis_factory.apply(net, im, fm, variant="frequency",
                                aggregated_statistics=statistics,
                                parameters={"format": "svg"})
    pn_vis_factory.view(gviz)
def apply(con, ref_type="EKKO", gjahr="2014", min_extr_date="2014-01-01 00:00:00",
          mandt="800", bukrs="1000", extra_els_query=None):
    dataframe = p2p_1d_dataframe.apply(con, gjahr=gjahr, ref_type=ref_type,
                                       min_extr_date=min_extr_date, mandt=mandt,
                                       bukrs=bukrs, extra_els_query=extra_els_query)
    log = log_converter.apply(dataframe, parameters={"stream_postprocessing": True})
    print("converted dataframe")
    log = sorting.sort_timestamp(log, "time:timestamp")
    return log
def test_alphaMinerVisualizationFromXES(self):
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"
    log, net, marking, fmarking = self.obtainPetriNetThroughAlphaMiner(
        os.path.join(INPUT_DATA_DIR, "running-example.xes"))
    log = sorting.sort_timestamp(log)
    log = sampling.sample(log)
    log = index_attribute.insert_trace_index_as_event_attribute(log)
    petri_exporter.apply(net, marking,
                         os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
    os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.pnml"))
    gviz = pn_viz.graphviz_visualization(net)
    self.assertEqual(gviz, gviz)
    final_marking = petri.petrinet.Marking()
    for p in net.places:
        if not p.out_arcs:
            final_marking[p] = 1
    aligned_traces = token_replay.apply(log, net, marking, fmarking)
    self.assertEqual(aligned_traces, aligned_traces)
def __dotted_attribute_selection(log, attributes):
    """
    Default attribute selection for the dotted chart

    Parameters
    -----------------
    log
        Event log
    attributes
        List of attributes (if None, a default selection is computed)

    Returns
    -----------------
    log
        (Sorted) event log
    attributes
        List of attributes
    """
    if attributes is None:
        from pm4py.util import xes_constants
        from pm4py.objects.log.util import sorting
        from pm4py.convert import convert_to_event_log
        log = convert_to_event_log(log)
        log = sorting.sort_timestamp(log, xes_constants.DEFAULT_TIMESTAMP_KEY)
        for index, trace in enumerate(log):
            trace.attributes["@@index"] = index
        attributes = ["time:timestamp", "case:@@index", "concept:name"]
    return log, attributes
def import_log_from_file_object(f, encoding, file_size=sys.maxsize, parameters=None):
    """
    Import a log object from a (XML) file object

    Parameters
    -----------
    f
        file object
    encoding
        Encoding
    file_size
        Size of the file (measured on disk)
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the
                log (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKIP_BYTES -> Number of bytes to skip
            Parameters.SET_ATTRIBUTES_TO_READ -> Names of the attributes that should be parsed.
                If not specified, then all the attributes are parsed.

    Returns
    -----------
    log
        Event log
    """
    values_dict = {}
    date_parser = dt_parser.get()

    set_attributes_to_read = exec_utils.get_param_value(
        Parameters.SET_ATTRIBUTES_TO_READ, parameters, None)
    max_no_traces_to_import = exec_utils.get_param_value(
        Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    skip_bytes = exec_utils.get_param_value(Parameters.SKIP_BYTES, parameters, 0)
    max_bytes_to_read = exec_utils.get_param_value(Parameters.MAX_BYTES, parameters,
                                                   sys.maxsize)

    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f.seek(skip_bytes)

    for line in f:
        content = line.decode(encoding).split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        key, value = read_attribute_key_value(
                            tag, content, date_parser, values_dict,
                            set_attributes_to_read)
                        if value is not None:
                            event[key] = value
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    key, value = read_attribute_key_value(
                        tag, content, date_parser, values_dict,
                        set_attributes_to_read)
                    if value is not None:
                        trace.attributes[key] = value
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount > max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
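# Hedged usage sketch (not part of the original module): the line-by-line
# importer above iterates over raw bytes and decodes each line itself, so the
# file has to be opened in binary mode; the XES path is a placeholder.
def _example_line_by_line_import():
    import os
    path = "running-example.xes"  # placeholder path
    size = os.stat(path).st_size
    with open(path, "rb") as f:
        log = import_log_from_file_object(
            f, "utf-8", file_size=size,
            parameters={Parameters.TIMESTAMP_SORT: True})
    print(len(log), "traces imported")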
def train(log, parameters=None):
    """
    Train the prediction model

    Parameters
    -----------
    log
        Event log
    parameters
        Possible parameters of the algorithm

    Returns
    ------------
    model
        Trained model
    """
    if parameters is None:
        parameters = {}

    parameters["enable_sort"] = False
    activity_key = parameters[
        constants.PARAMETER_CONSTANT_ACTIVITY_KEY] if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else xes.DEFAULT_NAME_KEY
    timestamp_key = parameters[
        constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] if constants.PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else xes.DEFAULT_TIMESTAMP_KEY
    business_hours = parameters["business_hours"] if "business_hours" in parameters else False
    worktiming = parameters["worktiming"] if "worktiming" in parameters else [7, 17]
    weekends = parameters["weekends"] if "weekends" in parameters else [6, 7]
    y_orig = parameters["y_orig"] if "y_orig" in parameters else None

    log = sorting.sort_timestamp(log, timestamp_key)

    str_evsucc_attr = [activity_key]
    if "str_ev_attr" in parameters:
        str_tr_attr = parameters["str_tr_attr"] if "str_tr_attr" in parameters else []
        str_ev_attr = parameters["str_ev_attr"] if "str_ev_attr" in parameters else []
        num_tr_attr = parameters["num_tr_attr"] if "num_tr_attr" in parameters else []
        num_ev_attr = parameters["num_ev_attr"] if "num_ev_attr" in parameters else []
    else:
        str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = \
            attributes_filter.select_attributes_from_log_for_tree(log)
        if activity_key not in str_ev_attr:
            str_ev_attr.append(activity_key)

    max_trace_length = max(len(x) for x in log)

    if max_trace_length == 1:
        # traces contain a single event: no prefixes need to be generated
        data, feature_names = get_log_representation.get_representation(
            log, str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr,
            str_evsucc_attr=str_evsucc_attr)
        ext_log = log
    else:
        ext_log, change_indexes = get_log_with_log_prefixes(log)
        data, feature_names = get_log_representation.get_representation(
            ext_log, str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr,
            str_evsucc_attr=str_evsucc_attr)

    if y_orig is not None:
        remaining_time = [y for x in y_orig for y in x]
    else:
        if business_hours:
            remaining_time = []
            for trace in ext_log:
                if trace:
                    timestamp_et = trace[-1][timestamp_key]
                    timestamp_st = trace[0][timestamp_key]
                    bh = BusinessHours(timestamp_st.replace(tzinfo=None),
                                       timestamp_et.replace(tzinfo=None),
                                       worktiming=worktiming, weekends=weekends)
                    remaining_time.append(bh.getseconds())
                else:
                    remaining_time.append(0)
        else:
            remaining_time = []
            for trace in ext_log:
                if trace:
                    remaining_time.append(
                        (trace[-1][timestamp_key] - trace[0][timestamp_key]).total_seconds())
                else:
                    remaining_time.append(0)

    regr = ElasticNet(max_iter=10000, l1_ratio=0.7)
    regr.fit(data, remaining_time)

    return {"str_tr_attr": str_tr_attr, "str_ev_attr": str_ev_attr,
            "num_tr_attr": num_tr_attr, "num_ev_attr": num_ev_attr,
            "str_evsucc_attr": str_evsucc_attr, "feature_names": feature_names,
            "remaining_time": remaining_time, "regr": regr,
            "variant": "elasticnet"}
def import_log(filename, parameters=None):
    """
    Import a log object from a XML file containing the traces, the events and
    the simple attributes of them

    Parameters
    -----------
    filename
        XES file to parse
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.INSERT_TRACE_INDICES -> Specify if trace indexes should be added as
                event attribute for each event
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the
                log (read in order in the XML file)
            Parameters.MAX_BYTES -> Maximum number of bytes to read
            Parameters.SKYP_BYTES -> Number of bytes to skip

    Returns
    -----------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    date_parser = dt_parser.get()

    timestamp_sort = param_util.fetch(Parameters.TIMESTAMP_SORT, parameters)
    timestamp_key = param_util.fetch(Parameters.TIMESTAMP_KEY, parameters)
    reverse_sort = param_util.fetch(Parameters.REVERSE_SORT, parameters)
    insert_trace_indexes = param_util.fetch(Parameters.INSERT_TRACE_INDICES, parameters)
    max_no_traces_to_import = param_util.fetch(Parameters.MAX_TRACES, parameters)
    skip_bytes = param_util.fetch(Parameters.SKYP_BYTES, parameters)
    max_bytes_to_read = param_util.fetch(Parameters.MAX_BYTES, parameters)

    file_size = os.stat(filename).st_size
    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = EventLog()
    tracecount = 0
    trace = None
    event = None

    f = open(filename, "r")
    f.seek(skip_bytes)

    for line in f:
        content = line.split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        if tag.startswith("string"):
                            event[content[1]] = content[3]
                        elif tag.startswith("date"):
                            event[content[1]] = date_parser.apply(content[3])
                        elif tag.startswith("int"):
                            event[content[1]] = int(content[3])
                        elif tag.startswith("float"):
                            event[content[1]] = float(content[3])
                        else:
                            event[content[1]] = content[3]
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = Event()
                elif len(content) == 5:
                    if tag.startswith("string"):
                        trace.attributes[content[1]] = content[3]
                    elif tag.startswith("date"):
                        trace.attributes[content[1]] = date_parser.apply(content[3])
                    elif tag.startswith("int"):
                        trace.attributes[content[1]] = int(content[3])
                    elif tag.startswith("float"):
                        trace.attributes[content[1]] = float(content[3])
                    else:
                        trace.attributes[content[1]] = content[3]
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount > max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = Trace()
    f.close()

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
def import_log(filename, parameters=None):
    """
    Import a log object from a XML file containing the traces, the events and
    the simple attributes of them

    Parameters
    -----------
    filename
        XES file to parse
    parameters
        Parameters of the algorithm, including
            timestamp_sort -> Specify if we should sort log by timestamp
            timestamp_key -> If sort is enabled, then sort the log by using this key
            reverse_sort -> Specify in which direction the log should be sorted
            insert_trace_indexes -> Specify if trace indexes should be added as event
                attribute for each event
            max_no_traces_to_import -> Specify the maximum number of traces to import
                from the log (read in order in the XML file)

    Returns
    -----------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    timestamp_sort = False
    timestamp_key = "time:timestamp"
    reverse_sort = False
    insert_trace_indexes = False
    max_no_traces_to_import = 1000000000
    skip_bytes = 0
    max_bytes_to_read = 100000000000

    if "timestamp_sort" in parameters:
        timestamp_sort = parameters["timestamp_sort"]
    if "timestamp_key" in parameters:
        timestamp_key = parameters["timestamp_key"]
    if "reverse_sort" in parameters:
        reverse_sort = parameters["reverse_sort"]
    if "insert_trace_indexes" in parameters:
        insert_trace_indexes = parameters["insert_trace_indexes"]
    if "max_no_traces_to_import" in parameters:
        max_no_traces_to_import = parameters["max_no_traces_to_import"]
    if "max_bytes_to_read" in parameters:
        max_bytes_to_read = parameters["max_bytes_to_read"]

    file_size = os.stat(filename).st_size
    if file_size > max_bytes_to_read:
        skip_bytes = file_size - max_bytes_to_read

    log = log_lib.log.EventLog()
    tracecount = 0
    trace = None
    event = None

    f = open(filename, "r")
    f.seek(skip_bytes)

    for line in f:
        content = line.split("\"")
        if len(content) > 0:
            tag = content[0].split("<")[-1]
            if trace is not None:
                if event is not None:
                    if len(content) == 5:
                        if tag.startswith("string"):
                            event[content[1]] = content[3]
                        elif tag.startswith("date"):
                            event[content[1]] = ciso8601.parse_datetime(content[3])
                        elif tag.startswith("int"):
                            event[content[1]] = int(content[3])
                        elif tag.startswith("float"):
                            event[content[1]] = float(content[3])
                        else:
                            event[content[1]] = content[3]
                    elif tag.startswith("/event"):
                        trace.append(event)
                        event = None
                elif tag.startswith("event"):
                    event = log_lib.log.Event()
                elif len(content) == 5:
                    if tag.startswith("string"):
                        trace.attributes[content[1]] = content[3]
                    elif tag.startswith("date"):
                        trace.attributes[content[1]] = ciso8601.parse_datetime(content[3])
                    elif tag.startswith("int"):
                        trace.attributes[content[1]] = int(content[3])
                    elif tag.startswith("float"):
                        trace.attributes[content[1]] = float(content[3])
                    else:
                        trace.attributes[content[1]] = content[3]
                elif tag.startswith("/trace"):
                    log.append(trace)
                    tracecount += 1
                    if tracecount > max_no_traces_to_import:
                        break
                    trace = None
            elif tag.startswith("trace"):
                trace = log_lib.log.Trace()
    f.close()

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
def calc_FCB_anonymity(self, log_name1, log_name2, event_attributes, life_cycle,
                       all_life_cycle, sensitive, time_accuracy, n, bk_length,
                       result_log_name="", results_dir="", from_time_days=0,
                       to_time_days=0, multiprocess=True):
    log1 = xes_importer_factory.apply(log_name1)
    log2 = xes_importer_factory.apply(log_name2)
    log1 = sorting.sort_timestamp(log1)
    log2 = sorting.sort_timestamp(log2)
    utils = Utils()
    simple_log, traces, sensitive_values, df = utils.create_simple_log_adv(
        log1, event_attributes, life_cycle, all_life_cycle, sensitive,
        time_accuracy, from_time_days, to_time_days)
    # new_event_log = utils.createEventLog(log, simple_log, event_attributes,
    #                                      life_cycle, all_life_cycle, sensitive, time_accuracy)
    # xes_exporter.export_log(new_event_log, "EL1.xes")
    simple_log2, traces2, sensitive_values2, df2 = utils.create_simple_log_adv(
        log2, event_attributes, life_cycle, all_life_cycle, sensitive,
        time_accuracy, from_time_days, to_time_days)
    # new_event_log = utils.createEventLog(log, simple_log2, event_attributes,
    #                                      life_cycle, all_life_cycle, sensitive, time_accuracy)
    # xes_exporter.export_log(new_event_log, "EL2.xes")
    activities1 = utils.get_unique_act(traces)
    activities2 = utils.get_unique_act(traces2)
    uniq_activities = activities2.union(activities1)
    map_dict_act_chr, map_dict_chr_act, uniq_char = utils.map_act_char(uniq_activities)
    simple_log_char = utils.convert_simple_log_act_to_char(simple_log, map_dict_act_chr)
    df_char = utils.convert_lof_dataframe_act_to_char(df, map_dict_act_chr)
    simple_log_char2 = utils.convert_simple_log_act_to_char(simple_log2, map_dict_act_chr)
    df_char2 = utils.convert_lof_dataframe_act_to_char(df2, map_dict_act_chr)
    # nan values are treated as a sensitive attribute value of their own
    df_char.replace(np.nan, '--', inplace=True)
    df_char2.replace(np.nan, '--', inplace=True)
    # utils.add_fake_activities(uniq_char, map_dict_act_chr, map_dict_chr_act, bk_length)
    bk_candidate_iter = itertools.product(uniq_char, repeat=bk_length)
    bk_candidate = list(bk_candidate_iter)
    result_file = ""
    if results_dir != "" and result_log_name != "":
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        file_name = "Result_" + result_log_name + "_bk_length_" + str(bk_length) + \
                    "_n_" + str(n) + ".xlsx"
        result_file = os.path.join(results_dir, file_name)
    columns = ['bk', 'R1-K', 'R2-K', 'FA', 'CA', 'BA']
    df_result = pd.DataFrame(columns=columns)
    FA_list = []
    BA_list = []
    CA_list = []
    R1_KA_list = []
    R2_KA_list = []
    results = []
    if multiprocess:
        pool = mp.Pool()
        workers = []
        workers_number = os.cpu_count()
        data_chunks = self.chunkIt(bk_candidate, workers_number)
        for worker in range(workers_number):
            print("In worker %d out of %d" % (worker + 1, workers_number))
            workers.append(pool.apply_async(
                self.FCB_anonymity_worker,
                args=(data_chunks[worker], df_char, df_char2, n, sensitive,
                      map_dict_act_chr, map_dict_chr_act)))
        for work in workers:
            results.append(work.get())
        pool.close()
        pool.join()
    else:
        result = self.FCB_anonymity(bk_candidate, df_char, df_char2, n, sensitive,
                                    map_dict_act_chr, map_dict_chr_act)
        results.append(result)
    for result in results:
        for key, value in result.items():
            if key == 'df_result':
                df_result = pd.concat([df_result, value], sort=False)
            elif key == 'FA':
                FA_list.append(value)
            elif key == 'CA':
                CA_list.append(value)
            elif key == 'BA':
                BA_list.append(value)
            elif key == 'R1_KA':
                R1_KA_list.append(value)
            elif key == 'R2_KA':
                R2_KA_list.append(value)
    FA = min(FA_list)
    CA = min(CA_list)
    BA = min(BA_list)
    R1_KA = min(R1_KA_list)
    R2_KA = min(R2_KA_list)
    df_result_last_row = {'bk': "Event Log", 'R1-K': R1_KA, 'R2-K': R2_KA,
                          'FA': FA, 'CA': CA, 'BA': BA}
    df_result = pd.concat([df_result, pd.DataFrame([df_result_last_row])],
                          ignore_index=True)
    if result_file != "":
        with ExcelWriter(result_file) as writer:
            df_result.to_excel(writer, 'bk_length_' + str(bk_length) + "-n_" + str(n))
    last_line = "Result for Event Log, R1-KA:%d, R2-KA:%d, FA:%d, CA:%d, BA:%d" % (
        R1_KA, R2_KA, FA, CA, BA)
    print(last_line)
    return R1_KA, R2_KA, FA, CA, BA
def import_from_context(context, num_traces, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES,
                                                         parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR,
                                                   parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None
    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                        xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'") if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = \
                            classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)
                if progress is not None:
                    progress.update()
                trace = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    # sets the activity key as default classifier in the log's properties
    log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_constants.DEFAULT_NAME_KEY
    log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = xes_constants.DEFAULT_NAME_KEY
    # sets the default timestamp key
    log.properties[constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_constants.DEFAULT_TIMESTAMP_KEY
    # sets the default resource key
    log.properties[constants.PARAMETER_CONSTANT_RESOURCE_KEY] = xes_constants.DEFAULT_RESOURCE_KEY
    # sets the default transition key
    log.properties[constants.PARAMETER_CONSTANT_TRANSITION_KEY] = xes_constants.DEFAULT_TRANSITION_KEY
    # sets the default group key
    log.properties[constants.PARAMETER_CONSTANT_GROUP_KEY] = xes_constants.DEFAULT_GROUP_KEY

    return log
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.INSERT_TRACE_INDICES -> Specify if trace indexes should be added as
                event attribute for each event
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the
                log (read in order in the XML file)

    Returns
    -------
    log : :class:`pm4py.log.log.EventLog`
        A log
    """
    parameters = dict() if parameters is None else parameters

    insert_trace_indexes = param_util.fetch(Parameters.INSERT_TRACE_INDICES, parameters)
    max_no_traces_to_import = param_util.fetch(Parameters.MAX_TRACES, parameters)
    date_parser = dt_parser.get()

    context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])

    # check to see if the log has a namespace before looking for traces
    # (but this might be more effort than it is worth); you could just assume
    # that logs use the standard namespace described in XES.
    # to only find elements that start a trace use tag="{http://www.xes-standard.org}trace"
    # or just use the {*} syntax to match all namespaces with a trace element

    # count number of traces and setup progress bar
    no_trace = sum([1 for trace in etree.iterparse(filename, events=[_EVENT_START],
                                                   tag="{*}trace")])

    # make tqdm facultative
    progress = None
    if pkgutil.find_loader("tqdm"):
        from tqdm.auto import tqdm
        progress = tqdm(total=no_trace, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None
    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                        xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'") if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = \
                            classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)
                # update progress bar as we have a completed trace
                if progress is not None:
                    progress.update()
                trace = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if Parameters.TIMESTAMP_SORT in parameters and parameters[Parameters.TIMESTAMP_SORT]:
        log = sorting.sort_timestamp(
            log,
            timestamp_key=param_util.fetch(Parameters.TIMESTAMP_KEY, parameters),
            reverse_sort=param_util.fetch(Parameters.REVERSE_SORT, parameters))
    if insert_trace_indexes:
        log = index_attribute.insert_event_index_as_event_attribute(log)

    return log
def import_data(directory, file_name, separator=";", quote=None, case_id="concept:name",
                activity="activity", time_stamp="time:timestamp", target="label",
                num_cases=None):
    """
    Loads data from a file and returns an XLog/pm4py log object.
    Expects an xes file with standard attributes and the target variable named "event: Label".
    Expects a csv file with attributes "case_id", "activity", "timestamp" and "label".

    :param directory: name of path [str].
    :param file_name: name of file [str].
    :param separator: separator for csv file [str].
    :param quote: quote character for csv file [str or None].
    :param case_id: identifier for cases [str].
    :param activity: identifier for activities [str].
    :param time_stamp: identifier for time stamps [str].
    :param target: identifier for target [str].
    :param num_cases: maximum number of cases to keep [int or None].
    :return: event log [EventLog].
    """
    extension = os.path.splitext(file_name)[1]
    print(os.getcwd())

    if extension == '.csv':
        data_dir = os.path.join(directory, file_name)
        # Specify column names
        CASEID_GLUE = case_id
        ACTIVITY_KEY = activity
        TIMEST_KEY = time_stamp
        parameters = {constants.PARAMETER_CONSTANT_CASEID_KEY: CASEID_GLUE,
                      constants.PARAMETER_CONSTANT_ACTIVITY_KEY: ACTIVITY_KEY,
                      constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: TIMEST_KEY,
                      'sep': separator,
                      'quotechar': quote,
                      'timest_columns': TIMEST_KEY}
        # Load pm4py event stream
        event_stream = csv_importer.import_event_stream(data_dir, parameters=parameters)
        # Transform event stream to log object
        log = conversion_factory.apply(event_stream, parameters=parameters)
        # Sort log by time_stamp
        log = sorting.sort_timestamp(log, timestamp_key=TIMEST_KEY)
        # Rename to xes standard
        for trace in log:
            for event in trace:
                event["caseid"] = event[case_id]
                event["concept:name"] = event[activity]
                event["time:timestamp"] = event[time_stamp]
                event["label"] = event[target]
    elif extension == '.xes':
        data_dir = os.path.join(directory, file_name)
        log = xes_import_factory.apply(data_dir)
        print(log)
        for trace in log:
            for event in trace:
                # assumption: the target is copied to the trace level via
                # trace.attributes, since pm4py traces only support integer
                # item access (a plain trace[...] assignment would fail)
                trace.attributes["label"] = event[target]
    else:
        raise TypeError('File type not supported.')

    # Filter out cases where label is not set (i.e. is nan); limits number of cases
    # in event log if set
    # util.apply(log)
    if num_cases is not None:
        log = log[:num_cases]

    print("Event log loaded")
    return log
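# Hedged usage sketch (not part of the original module): loads a labeled CSV
# with the column names the loader above expects; the directory and file name
# are placeholders.
def _example_import_data():
    log = import_data("data", "log.csv", separator=";", case_id="case_id",
                      activity="activity", time_stamp="timestamp",
                      target="label", num_cases=100)
    print(len(log), "cases loaded")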
def average_duration_activity(log: EventLog, t1: Union[datetime, str],
                              t2: Union[datetime, str], r: str, a: str,
                              parameters: Optional[Dict[str, Any]] = None) -> float:
    """
    The average duration of instances of a given activity completed during a given
    time slot by a given resource.

    Metric RBI 4.3 in Pika, Anastasiia, et al. "Mining resource profiles from event logs."
    ACM Transactions on Management Information Systems (TMIS) 8.1 (2017): 1-30.

    Parameters
    -----------------
    log
        Event log
    t1
        Left interval
    t2
        Right interval
    r
        Resource
    a
        Activity

    Returns
    ----------------
    metric
        Value of the metric
    """
    if parameters is None:
        parameters = {}

    t1 = get_dt_from_string(t1)
    t2 = get_dt_from_string(t2)

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes_constants.DEFAULT_RESOURCE_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY,
                                                     parameters, None)

    from pm4py.objects.log.util import sorting
    log = sorting.sort_timestamp(log, timestamp_key)

    from pm4py.objects.log.util import interval_lifecycle
    log = interval_lifecycle.to_interval(log, parameters=parameters)

    if start_timestamp_key is None:
        log = __insert_start_from_previous_event(log, parameters=parameters)
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY

    log = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM)
    log = [x for x in log if x[resource_key] == r and x[activity_key] == a
           and x[timestamp_key] >= t1 and x[timestamp_key] < t2]

    return float(mean(x[timestamp_key].timestamp() - x[start_timestamp_key].timestamp()
                      for x in log))
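# Hedged usage sketch (not part of the original module): computes metric
# RBI 4.3 for a placeholder resource/activity pair over January 2021; the log
# path and identifiers are illustrative, and the event timestamps are assumed
# to be timezone-naive so they compare cleanly against the parsed bounds.
def _example_average_duration_activity():
    from pm4py.objects.log.importer.xes import importer as xes_importer
    log = xes_importer.apply("receipt.xes")  # placeholder path
    val = average_duration_activity(log, "2021-01-01 00:00:00",
                                    "2021-02-01 00:00:00",
                                    "Sara", "confirmation of receipt")
    print("average duration:", val, "seconds")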
def __compute_workload(log: EventLog, resource: Optional[str] = None,
                       activity: Optional[str] = None,
                       parameters: Optional[Dict[str, Any]] = None) -> Dict[Tuple, int]:
    """
    Computes the workload of resources/activities, corresponding to each event a number
    (number of concurring events)

    Parameters
    ---------------
    log
        event log
    resource
        (if provided) Resource on which we want to compute the workload
    activity
        (if provided) Activity on which we want to compute the workload

    Returns
    ---------------
    workload_dict
        Dictionary associating to each event the number of concurring events
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters,
                                              xes_constants.DEFAULT_RESOURCE_KEY)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters,
                                              xes_constants.DEFAULT_NAME_KEY)
    start_timestamp_key = exec_utils.get_param_value(Parameters.START_TIMESTAMP_KEY,
                                                     parameters, None)

    from pm4py.objects.log.util import sorting
    log = sorting.sort_timestamp(log, timestamp_key)

    from pm4py.objects.log.util import interval_lifecycle
    log = interval_lifecycle.to_interval(log, parameters=parameters)

    if start_timestamp_key is None:
        log = __insert_start_from_previous_event(log, parameters=parameters)
        start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY

    events = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM)
    if resource is not None:
        events = [x for x in events if x[resource_key] == resource]
    if activity is not None:
        events = [x for x in events if x[activity_key] == activity]

    events = [(x[start_timestamp_key].timestamp(), x[timestamp_key].timestamp(),
               x[resource_key], x[activity_key]) for x in events]
    events = sorted(events)

    from intervaltree import IntervalTree, Interval
    tree = IntervalTree()
    ev_map = {}
    k = 0.000001
    for ev in events:
        tree.add(Interval(ev[0], ev[1] + k))
    for ev in events:
        ev_map[ev] = len(tree[ev[0]:ev[1] + k])
    return ev_map
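# Hedged usage sketch (not part of the original module): the workload map
# built above keys each (start, end, resource, activity) tuple to the number
# of events overlapping it in time; "Sara" and the path are placeholders.
def _example_compute_workload():
    from pm4py.objects.log.importer.xes import importer as xes_importer
    log = xes_importer.apply("receipt.xes")  # placeholder path
    workload = __compute_workload(log, resource="Sara")
    busiest = max(workload.values()) if workload else 0
    print("maximum number of concurrent events:", busiest)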
def import_log(filename, parameters=None):
    """
    Imports an XES file into a log object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including
            Parameters.TIMESTAMP_SORT -> Specify if we should sort log by timestamp
            Parameters.TIMESTAMP_KEY -> If sort is enabled, then sort the log by using this key
            Parameters.REVERSE_SORT -> Specify in which direction the log should be sorted
            Parameters.MAX_TRACES -> Specify the maximum number of traces to import from the
                log (read in order in the XML file)

    Returns
    -------
    log : :class:`pm4py.log.log.EventLog`
        A log
    """
    from lxml import etree

    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES,
                                                         parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)

    date_parser = dt_parser.get()

    # count number of traces and setup progress bar
    no_trace = count_traces(filename)

    context = etree.iterparse(filename, events=[_EVENT_START, _EVENT_END])

    # make tqdm facultative
    progress = None
    if pkgutil.find_loader("tqdm"):
        from tqdm.auto import tqdm
        progress = tqdm(total=no_trace, desc="parsing log, completed traces :: ")

    log = None
    trace = None
    event = None
    tree = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY), dt, tree)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent,
                                                 elem.get(xes_constants.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                        xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)}
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'") if x.strip()]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = \
                            classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)
                # update progress bar as we have a completed trace
                if progress is not None:
                    progress.update()
                trace = None
                continue
            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    return log
# Assumed module-level imports for this snippet (the original file header is
# not shown; paths follow the pm4py 1.x layout). __parse_attribute is a
# module-private helper defined elsewhere in the same source file.
import logging

import ciso8601
from lxml import etree

from pm4py.objects import log as log_lib
from pm4py.objects.log.util import sorting

EVENT_START = 'start'
EVENT_END = 'end'


def import_log(filename, parameters=None):
    """
    Imports an XES file into a log object

    Parameters
    ----------
    filename:
        Absolute filename
    parameters
        Parameters of the algorithm, including:
            timestamp_sort -> Specify if we should sort log by timestamp
            timestamp_key -> If sort is enabled, then sort the log by using this key
            reverse_sort -> Specify in which direction the log should be sorted
            insert_trace_indexes -> Specify if trace indexes should be added as event attribute for each event
            max_no_traces_to_import -> Specify the maximum number of traces to import from the log
                                       (read in order in the XML file)

    Returns
    -------
    log : :class:`pm4py.log.log.EventLog`
        A log
    """
    if parameters is None:
        parameters = {}

    timestamp_sort = False
    timestamp_key = "time:timestamp"
    reverse_sort = False
    insert_trace_indexes = False
    max_no_traces_to_import = 1000000000

    if "timestamp_sort" in parameters:
        timestamp_sort = parameters["timestamp_sort"]
    if "timestamp_key" in parameters:
        timestamp_key = parameters["timestamp_key"]
    if "reverse_sort" in parameters:
        reverse_sort = parameters["reverse_sort"]
    if "insert_trace_indexes" in parameters:
        insert_trace_indexes = parameters["insert_trace_indexes"]
    if "max_no_traces_to_import" in parameters:
        max_no_traces_to_import = parameters["max_no_traces_to_import"]

    # stream over the XML instead of loading the whole document into memory
    context = etree.iterparse(filename, events=['start', 'end'])

    log = None
    trace = None
    event = None

    # maps an open XML element to the object its children should be attached to
    tree = {}

    for tree_event, elem in context:
        if tree_event == EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(log_lib.util.xes.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                                             elem.get(log_lib.util.xes.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_DATE):
                try:
                    dt = ciso8601.parse_datetime(elem.get(log_lib.util.xes.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY), dt, tree)
                except (TypeError, ValueError):
                    logging.info("failed to parse date: " + str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = log_lib.log.Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = log_lib.log.Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(log_lib.util.xes.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(log_lib.util.xes.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(log_lib.util.xes.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY), val, tree)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(log_lib.util.xes.KEY_VALUE)))
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY), None, tree)
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(log_lib.util.xes.KEY_KEY),
                                             elem.get(log_lib.util.xes.KEY_VALUE), tree)
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_EXTENSION):
                if log is None:
                    raise SyntaxError('extension found outside of <log> tag')
                if elem.get(log_lib.util.xes.KEY_NAME) is not None and elem.get(
                        log_lib.util.xes.KEY_PREFIX) is not None and elem.get(
                        log_lib.util.xes.KEY_URI) is not None:
                    log.extensions[elem.get(log_lib.util.xes.KEY_NAME)] = {
                        log_lib.util.xes.KEY_PREFIX: elem.get(log_lib.util.xes.KEY_PREFIX),
                        log_lib.util.xes.KEY_URI: elem.get(log_lib.util.xes.KEY_URI)}
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_GLOBAL):
                if log is None:
                    raise SyntaxError('global found outside of <log> tag')
                if elem.get(log_lib.util.xes.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(log_lib.util.xes.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(log_lib.util.xes.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_CLASSIFIER):
                if log is None:
                    raise SyntaxError('classifier found outside of <log> tag')
                if elem.get(log_lib.util.xes.KEY_KEYS) is not None:
                    classifier_value = elem.get(log_lib.util.xes.KEY_KEYS)
                    if "'" in classifier_value:
                        # quoted keys may contain spaces, so split on the quotes
                        log.classifiers[elem.get(log_lib.util.xes.KEY_NAME)] = [
                            x for x in classifier_value.split("'") if x.strip()]
                    else:
                        log.classifiers[elem.get(log_lib.util.xes.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_LOG):
                if log is not None:
                    raise SyntaxError('file contains > 1 <log> tags')
                log = log_lib.log.EventLog()
                tree[elem] = log.attributes
                continue

        elif tree_event == EVENT_END:
            if elem in tree:
                del tree[elem]
            # free the processed element to keep memory usage flat
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(log_lib.util.xes.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_TRACE):
                log.append(trace)
                trace = None
                continue

            elif elem.tag.endswith(log_lib.util.xes.TAG_LOG):
                continue

    del context

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)
    if insert_trace_indexes:
        log.insert_trace_index_as_event_attribute()

    return log
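
# --- Hypothetical usage sketch (not part of the original source) -----------
# Shows how import_log's parameters fit together; "example.xes" is a
# placeholder path and the module-level imports above are assumed in scope.
def _demo_import_log():
    log = import_log("example.xes", parameters={
        "timestamp_sort": True,             # sort traces by timestamp after import
        "timestamp_key": "time:timestamp",  # attribute used for sorting
        "reverse_sort": False,              # ascending order
        "max_no_traces_to_import": 100,     # stop parsing after 100 traces
    })
    print("imported %d traces" % len(log))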
# Assumed imports for this snippet (the original module header is not shown;
# pm4py paths follow the 1.x layout; general, clean_frequency and
# clean_arc_frequency are project-local preprocessing helpers imported
# elsewhere in the original source).
from collections import Counter

from pm4py.algo.discovery.alpha import factory as alpha_miner
from pm4py.objects.conversion.log import factory as log_conv_factory
from pm4py.objects.log.log import EventStream
from pm4py.objects.log.util import sorting


def apply(df0, classifier_function=None, parameters=None):
    """
    Discovers one Petri net per object type from an object-centric dataframe:
    columns prefixed with "event_" carry event data, every other column is
    treated as an object type.
    """
    if parameters is None:
        parameters = {}

    if classifier_function is None:
        classifier_function = lambda x: x["event_activity"]

    min_acti_freq = parameters["min_acti_freq"] if "min_acti_freq" in parameters else 0
    min_edge_freq = parameters["min_edge_freq"] if "min_edge_freq" in parameters else 0

    df = df0.copy()
    df = general.preprocess(df, parameters=parameters)
    df = clean_frequency.apply(df, min_acti_freq=min_acti_freq)
    df = clean_arc_frequency.apply(df, min_freq=min_edge_freq)

    models = {}

    # every column not prefixed with "event_" identifies an object type
    obj_types = [x for x in df.columns if not x.startswith("event_")]

    activities_repeated = Counter()
    activities = set()
    edges = Counter()

    start_activities = dict()
    end_activities = dict()
    acti_spec = Counter()

    for ot in obj_types:
        start_activities[ot] = set()
        end_activities[ot] = set()

        # flatten the log onto this object type: each object becomes a case
        new_df = df[["event_id", "event_activity", "event_timestamp", ot]].dropna(subset=[ot])
        new_df = new_df.sort_values("event_timestamp")
        new_df = new_df.rename(columns={ot: "case:concept:name", "event_timestamp": "time:timestamp"})

        # "records" spelled out: the "r" abbreviation was removed in newer pandas
        log = new_df.to_dict("records")
        for ev in log:
            ev["event_objtype"] = ot
            ev["concept:name"] = classifier_function(ev)
            del ev["event_objtype"]
            del ev["event_activity"]
            activities.add((ev["event_id"], ev["concept:name"]))

        log = EventStream(log)
        # count in how many object types each activity occurs
        this_activities = set(x["concept:name"] for x in log)
        for act in this_activities:
            activities_repeated[act] += 1

        log = log_conv_factory.apply(log, variant=log_conv_factory.TO_EVENT_LOG)
        log = sorting.sort_timestamp(log, "time:timestamp")

        for trace in log:
            if trace:
                start_activities[ot].add(trace[0]["concept:name"])
                end_activities[ot].add(trace[-1]["concept:name"])
                # record the directly-follows edges and per-event statistics
                for i in range(len(trace) - 1):
                    ev0 = trace[i]
                    ev1 = trace[i + 1]
                    edges[(ot, ev0["concept:name"], ev1["concept:name"], ev0["event_id"], ev1["event_id"],
                           trace.attributes["concept:name"], ev0["time:timestamp"], ev1["time:timestamp"])] += 1
                    acti_spec[(ot, trace[i]["concept:name"], trace[i]["event_id"],
                               trace.attributes["concept:name"], trace[i]["time:timestamp"])] += 1
                if len(trace) > 0:
                    acti_spec[(ot, trace[-1]["concept:name"], trace[-1]["event_id"],
                               trace.attributes["concept:name"], trace[-1]["time:timestamp"])] += 1

        models[ot] = alpha_miner.apply(log, parameters=parameters)

    # keep only activities shared by more than one object type
    activities_repeated = set(x for x in activities_repeated if activities_repeated[x] > 1)
    # number of events per activity, over unique (event_id, activity) pairs
    activities = dict(Counter(list(x[1] for x in activities)))

    return {"type": "petri", "models": models, "activities": activities,
            "activities_repeated": activities_repeated, "edges": edges, "acti_spec": acti_spec}
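
# --- Hypothetical usage sketch (not part of the original source) -----------
# Illustrates the input shape apply() expects: "event_"-prefixed columns hold
# event data, every other column is read as an object type. The data below is
# made up, and running this end-to-end assumes the project-local helpers
# (general, clean_frequency, clean_arc_frequency) are importable.
import pandas as pd


def _demo_apply():
    df0 = pd.DataFrame({
        "event_id": ["e1", "e2", "e3"],
        "event_activity": ["create order", "pay order", "ship order"],
        "event_timestamp": pd.to_datetime(
            ["2020-01-01", "2020-01-02", "2020-01-03"]),
        "order": ["o1", "o1", "o1"],  # one object type with a single object
    })
    result = apply(df0)
    print(result["type"])        # "petri"
    print(result["activities"])  # activity -> number of events with that label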