def test_61(self):
    """
    Train a decision tree on the road-traffic sample log and render it

    The log is converted into a feature matrix, every trace is labelled
    through get_class_representation_by_trace_duration (threshold
    2 * 8640000), a scikit-learn DecisionTreeClassifier is fitted on the
    result and the fitted tree is handed to the decision-tree visualizer.
    """
    import os
    from pm4py.objects.log.importer.xes import importer as xes_importer

    event_log = xes_importer.apply(os.path.join("input_data", "roadtraffic50traces.xes"))

    from pm4py.objects.log.util import get_log_representation

    # Representation with an explicit attribute selection; its result is
    # superseded right below by the default representation.
    features, columns = get_log_representation.get_representation(
        event_log, [], ["concept:name"], [], ["amount"])
    features, columns = get_log_representation.get_default_representation(event_log)

    from pm4py.objects.log.util import get_class_representation

    labels, label_names = get_class_representation.get_class_representation_by_trace_duration(
        event_log, 2 * 8640000)

    from sklearn import tree

    classifier = tree.DecisionTreeClassifier()
    classifier.fit(features, labels)

    from pm4py.visualization.decisiontree import visualizer as dectree_visualizer

    # Only the rendering call itself is exercised; the graph is not inspected.
    dectree_visualizer.apply(classifier, columns, label_names)
def form_representation_from_dictio_couple(first_cases_repr, second_cases_repr, string_attributes,
                                           numeric_attributes, enable_multiplier=False):
    """
    Builds the feature matrix used to train a decision tree (root cause
    analysis) starting from two dictionaries of cases

    The two dictionaries are first merged into a log through
    form_log_from_dictio_couple; the log is then encoded considering only
    the provided event-level string and numeric attributes (no trace-level
    attribute is used).

    Parameters
    ------------
    first_cases_repr
        First cases representation
    second_cases_repr
        Second cases representation
    string_attributes
        String attributes to include in the representation
    numeric_attributes
        Numeric attributes to include in the representation
    enable_multiplier
        Enable balancing of classes (forwarded to form_log_from_dictio_couple)

    Returns
    ------------
    data
        Matrix representation of the log
    feature_names
        Array of feature names
    """
    from pm4py.objects.log.util import get_log_representation

    merged_log = form_log_from_dictio_couple(first_cases_repr, second_cases_repr,
                                             enable_multiplier=enable_multiplier)

    no_trace_attributes = []
    data, feature_names = get_log_representation.get_representation(
        merged_log,
        no_trace_attributes,
        string_attributes,
        [],
        numeric_attributes)

    return data, feature_names
def apply(log, parameters=None):
    """
    Apply PCA + DBSCAN clustering after creating a representation of the log
    containing the wanted attributes and the wanted succession of attributes

    Parameters
    -----------
    log
        Trace log
    parameters
        Parameters of the algorithm, including:
            pca_components -> Number of the components for the PCA (default: 3)
            dbscan_eps -> EPS value for the DBScan clustering (default: 0.3)
            str_tr_attr -> String trace attributes to consider in feature representation (default: [])
            str_ev_attr -> String event attributes to consider in feature representation
                           (default: ['concept:name'])
            num_tr_attr -> Numeric trace attributes to consider in feature representation (default: [])
            num_ev_attr -> Numeric event attributes to consider in feature representation (default: [])
            str_evsucc_attr -> Succession between event attributes to consider in feature
                               representation (default: [])

    Returns
    -----------
    log_list
        A list containing, for each cluster, a different log. Logs are ordered
        by the first appearance of their DBSCAN label while scanning the input
        log; every distinct label gets its own log.
    """
    if parameters is None:
        parameters = {}

    pca_components = parameters.get("pca_components", 3)
    dbscan_eps = parameters.get("dbscan_eps", 0.3)

    # The attribute selections are documented parameters; the defaults are the
    # values that used to be hard-coded, so callers not passing them get the
    # same representation as before.
    str_tr_attr = parameters.get("str_tr_attr", [])
    str_ev_attr = parameters.get("str_ev_attr", ['concept:name'])
    num_tr_attr = parameters.get("num_tr_attr", [])
    num_ev_attr = parameters.get("num_ev_attr", [])
    str_evsucc_attr = parameters.get("str_evsucc_attr", [])

    data, feature_names = get_.get_representation(log, str_ev_attr=str_ev_attr, str_tr_attr=str_tr_attr,
                                                  num_ev_attr=num_ev_attr, num_tr_attr=num_tr_attr,
                                                  str_evsucc_attr=str_evsucc_attr)

    pca = PCA(n_components=pca_components)
    pca.fit(data)
    reduced_data = pca.transform(data)

    labels = DBSCAN(eps=dbscan_eps).fit(reduced_data).labels_

    log_list = []
    # maps each label to the position of its log inside log_list
    label_to_position = {}
    for i, original_trace in enumerate(log):
        label = labels[i]
        if label not in label_to_position:
            label_to_position[label] = len(label_to_position)
            log_list.append(EventLog())
        # new Trace built from the original one; trace-level attributes are
        # copied over explicitly
        trace = Trace(original_trace)
        for attribute in original_trace.attributes:
            trace.attributes[attribute] = original_trace.attributes[attribute]
        log_list[label_to_position[label]].append(trace)

    return log_list
def test_decisiontree_traceduration(self):
    """
    Fit a depth-limited decision tree that classifies traces by duration and
    render it as SVG
    """
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"

    event_log = xes_importer.apply(os.path.join("input_data", "roadtraffic50traces.xes"))

    # features: activity names (string event attribute) and "amount" (numeric event attribute)
    features, columns = get_log_representation.get_representation(
        event_log, [], ["concept:name"], [], ["amount"])
    labels, label_names = get_class_representation.get_class_representation_by_trace_duration(
        event_log, 2 * 8640000)

    classifier = tree.DecisionTreeClassifier(max_depth=7)
    classifier.fit(features, labels)

    vis_parameters = {dt_vis.Variants.CLASSIC.value.Parameters.FORMAT: "svg"}
    # the rendered graph is not inspected, only the call is exercised
    dt_vis.apply(classifier, columns, label_names, parameters=vis_parameters)
def test(model, obj, parameters=None):
    """
    Test the prediction model

    Parameters
    ------------
    model
        Prediction model: a dictionary providing the keys "str_tr_attr",
        "str_ev_attr", "num_tr_attr", "num_ev_attr", "str_evsucc_attr",
        "feature_names" and "regr" (the fitted regressor)
    obj
        Object to predict (Trace / EventLog)
    parameters
        Possible parameters of the algorithm (currently not used)

    Returns
    ------------
    pred
        Result of the prediction: a single value when exactly one case was
        predicted, otherwise the whole list of predictions
    """
    str_tr_attr = model["str_tr_attr"]
    str_ev_attr = model["str_ev_attr"]
    num_tr_attr = model["num_tr_attr"]
    num_ev_attr = model["num_ev_attr"]
    str_evsucc_attr = model["str_evsucc_attr"]
    feature_names = model["feature_names"]
    regr = model["regr"]

    # isinstance (rather than an exact type comparison) so that an instance of
    # an EventLog subclass is used as a log instead of being wrapped as if it
    # were a single trace
    log = obj if isinstance(obj, EventLog) else EventLog([obj])

    # the feature names stored at training time are passed back so that the
    # representation is built against the same features
    data, feature_names = get_log_representation.get_representation(
        log,
        str_tr_attr,
        str_ev_attr,
        num_tr_attr,
        num_ev_attr,
        str_evsucc_attr=str_evsucc_attr,
        feature_names=feature_names)

    pred = regr.predict(data)

    if len(pred) == 1:
        # prediction on a single case
        return pred[0]
    return pred
def test_decisiontree_evattrvalue(self):
    """
    Fit a depth-limited decision tree whose classes come from the value of a
    string event attribute and render it as SVG
    """
    # to avoid static method warnings in tests,
    # that by construction of the unittest package have to be expressed in such way
    self.dummy_variable = "dummy_value"

    event_log = xes_importer.import_log(os.path.join("input_data", "roadtraffic50traces.xes"))

    # features: activity names (string event attribute) and "amount" (numeric event attribute)
    features, columns = get_log_representation.get_representation(
        event_log, [], ["concept:name"], [], ["amount"])
    labels, label_names = get_class_representation.get_class_representation_by_str_ev_attr_value_value(
        event_log, "concept:name")

    classifier = tree.DecisionTreeClassifier(max_depth=7)
    classifier.fit(features, labels)

    vis_parameters = {"format": "svg"}
    # the rendered graph is not inspected, only the call is exercised
    dt_vis_factory.apply(classifier, columns, label_names, parameters=vis_parameters)
def find_anonmalies_with_isolation_forest(log, original_features, original_log_df, result_path):
    """
    Scores the traces of a log with an isolation forest trained on another
    (reference) feature table and pickles a summary of the scores

    Parameters
    ------------
    log
        Log whose traces are scored
    original_features
        Feature names of the reference data (presumably the columns of
        original_log_df - verify against the caller)
    original_log_df
        Dataframe with the reference data the isolation forest is fitted on
    result_path
        Path of the file the pickled result dictionary is written to

    Returns
    ------------
    None. The dictionary with keys "avg" (mean decision-function score) and
    "anonmaly_relative_frequency" is printed and pickled to result_path.
    """
    log_features, feature_names_log = get_log_representation.get_representation(
        log,
        str_ev_attr=["concept:name"],
        str_tr_attr=[],
        num_ev_attr=[],
        num_tr_attr=[],
        str_evsucc_attr=["concept:name"])
    log_df = pd.DataFrame(log_features, columns=feature_names_log)

    # all the features known to either the reference data or the scored log
    features = np.union1d(original_features, feature_names_log)

    # features that only the scored log has: added to the training data as
    # zero-filled columns (pd.concat replaces DataFrame.append, which is no
    # longer available in pandas 2.x)
    new_features_train = np.setxor1d(original_features, features)
    train_df = pd.concat([original_log_df, pd.DataFrame(columns=new_features_train)], sort=False)
    train_df = train_df.fillna(0)

    model = IsolationForest()
    model.fit(train_df)

    # features that only the reference data has: added to the scored data
    new_features_test = np.setxor1d(feature_names_log, features)
    test_df = pd.concat([log_df, pd.DataFrame(columns=new_features_test)], sort=False)
    test_df = test_df.fillna(0)
    # The two frames were assembled in different column orders; the model has
    # to be fed the columns exactly as they were laid out during fit.
    test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

    log_df["scores"] = model.decision_function(test_df)

    results = dict()
    results["avg"] = log_df["scores"].mean()
    # NOTE(review): the denominator is the number of scores plus one;
    # presumably a guard against dividing by zero on an empty log - confirm
    count_traces = log_df["scores"].count() + 1
    # traces with a non-positive decision-function score are counted as anomalies
    anonmalies = log_df[log_df.scores <= 0].shape[0]
    results["anonmaly_relative_frequency"] = anonmalies / count_traces

    print(results)
    with open(result_path, 'wb') as file:
        pickle.dump(results, file)
def train(log, parameters=None):
    """
    Train the model (LSTM network built with Keras)

    Parameters
    -------------
    log
        Log
    parameters
        Possible parameters of the algorithm, including:
            default_epochs -> number of training epochs (default: 50)
            y_orig -> precomputed remaining times (computed from the log when missing)
            str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr -> attributes to
            use in the representation (selected automatically from the log when
            str_ev_attr is not provided)
        The dictionary is modified in place: "enable_sort" is set to False.

    Returns
    -------------
    model
        Dictionary describing the trained model (selected attributes, feature
        names, the network under "regr", maximum trace length, the value
        returned by the normalization under "log_max_value", and the
        "variant" name)
    """
    if parameters is None:
        parameters = {}

    default_epochs = parameters.get("default_epochs", 50)
    parameters["enable_sort"] = False
    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)

    # log = sorting.sort_timestamp(log, timestamp_key)
    max_len_trace = max(len(trace) for trace in log)

    # the remaining times are computed only when the caller did not supply them
    if "y_orig" in parameters:
        y_orig = parameters["y_orig"]
    else:
        y_orig = get_remaining_time_from_log(log, max_len_trace=max_len_trace, parameters=parameters)
    y, log_max_value = normalize_remaining_time(y_orig)
    y = np.array(y)

    str_evsucc_attr = [activity_key]
    if "str_ev_attr" in parameters:
        # explicit selection by the caller; missing selections default to empty
        str_tr_attr = parameters.get("str_tr_attr", [])
        str_ev_attr = parameters["str_ev_attr"]
        num_tr_attr = parameters.get("num_tr_attr", [])
        num_ev_attr = parameters.get("num_ev_attr", [])
    else:
        selection = attributes_filter.select_attributes_from_log_for_tree(log)
        str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = selection
        # the activity is always part of an automatically selected representation
        if activity_key not in str_ev_attr:
            str_ev_attr.append(activity_key)

    data, feature_names = get_log_representation.get_representation(
        log, str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr,
        str_evsucc_attr=str_evsucc_attr)

    X = get_X_from_log(log, feature_names, max_len_trace)

    steps, width = X.shape[1], X.shape[2]
    in_out_neurons = width
    hidden_neurons = min(int(in_out_neurons * 7.5), 50)

    network = Sequential()
    network.add(LSTM(hidden_neurons, return_sequences=False, input_shape=(steps, width)))
    network.add(Dense(in_out_neurons))
    network.add(Activation("linear"))
    network.compile(loss="mean_squared_error", optimizer="rmsprop")
    network.fit(X, y, batch_size=steps, nb_epoch=default_epochs, validation_split=0.2)

    trained = {
        "str_tr_attr": str_tr_attr,
        "str_ev_attr": str_ev_attr,
        "num_tr_attr": num_tr_attr,
        "num_ev_attr": num_ev_attr,
        "str_evsucc_attr": str_evsucc_attr,
        "feature_names": feature_names,
        "regr": network,
        "max_len_trace": max_len_trace,
        "log_max_value": log_max_value,
        "variant": "keras_rnn",
    }
    return trained
# Evaluation fragment: precision, F-score, generalization and an
# isolation-forest based anomaly frequency for `original_log`.
# `original_log`, `model`, `initial_marking`, `final_marking` and `fitness`
# are bound before this fragment (outside the visible part).
precision = precision_evaluator.apply(
    original_log, model, initial_marking, final_marking,
    variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
print(str(precision))
# 2 * P * F / (P + F); raises ZeroDivisionError when precision + fitness is 0
fscore = 2 * precision * fitness / (precision + fitness)
print("Fscore of: " + str(fscore))
generalization = generalization_evaluator.apply(original_log, model, initial_marking, final_marking)
print("Generalization of: " + str(generalization))
# Feature table of the log: activity names plus successions of activity names
log_features, feature_names_log = get_log_representation.get_representation(
    original_log, str_ev_attr=["concept:name"], str_tr_attr=[], num_ev_attr=[], num_tr_attr=[],
    str_evsucc_attr=["concept:name"])
log_df = pd.DataFrame(log_features, columns=feature_names_log)
# NOTE: `model` is rebound here; from this point on it is the isolation
# forest, no longer the object passed to the evaluators above
model = IsolationForest()
model.fit(log_df)
log_df["scores"] = model.decision_function(log_df)
# NOTE(review): the denominator is the number of scores plus one; presumably
# a guard against dividing by zero - confirm
count_traces = log_df["scores"].count() + 1
# traces with a non-positive decision-function score are counted as anomalies
anonmalies = log_df[log_df.scores <= 0].shape[0]
anonmaly_relative_frequency = anonmalies / count_traces
print("Relative frequency anonmalies: " + str(anonmaly_relative_frequency))
# Render and show the process tree bound to `tree` before this fragment
# (outside the visible part) as a PNG.
from pm4py.visualization.process_tree import visualizer as pt_visualizer
gviz = pt_visualizer.apply(tree, parameters= {pt_visualizer.Variants.WO_DECORATION.value.Parameters.FORMAT: "png"})
pt_visualizer.view(gviz)

# Decision Tree
import os
from pm4py.objects.log.importer.xes import importer as xes_importer
log = xes_importer.apply(os.path.join("tests", "input_data", "roadtraffic50traces.xes"))
from pm4py.objects.log.util import get_log_representation
str_trace_attributes = []
str_event_attributes = ["concept:name"]
num_trace_attributes = []
num_event_attributes = ["amount"]
data, feature_names = get_log_representation.get_representation(log, str_trace_attributes, str_event_attributes, num_trace_attributes, num_event_attributes)
# NOTE(review): the original carried an "error" marker at this point; it is
# unclear whether it refers to the call above or to the one below - confirm
# The default representation replaces the one computed above.
data, feature_names = get_log_representation.get_default_representation(log)
import pandas as pd
dataframe = pd.DataFrame(data, columns=feature_names)
# bare expression: has no effect in a script (presumably a notebook cell
# output - verify)
dataframe
dataframe.to_csv("features.csv", index=False)
# NOTE: this import rebinds `tree`, which referred to the process tree above
from sklearn import tree
clf = tree.DecisionTreeClassifier()
# NOTE(review): `target` (and `classes` below) are not bound anywhere in the
# visible fragment - they must come from code outside this view
clf.fit(data, target)
from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
gviz = dectree_visualizer.apply(clf, feature_names, classes)
def train(log, parameters=None):
    """
    Train the prediction model (ElasticNet regression)

    Parameters
    -----------
    log
        Event log
    parameters
        Possible parameters of the algorithm, including:
            business_hours -> measure the times through BusinessHours (default: False)
            worktiming -> work schedule used with business hours (default: [7, 17])
            weekends -> weekend days used with business hours (default: [6, 7])
            y_orig -> precomputed target values, one sequence per trace; they are
                      flattened and used instead of the times measured on the log
            str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr -> attributes to
            use in the representation (selected automatically from the log when
            str_ev_attr is not provided)
        The dictionary is modified in place: "enable_sort" is set to False.

    Returns
    ------------
    model
        Trained model: dictionary with the selected attributes, the feature
        names, the target values ("remaining_time"), the fitted regressor
        ("regr") and the "variant" name
    """
    if parameters is None:
        parameters = {}

    parameters["enable_sort"] = False
    activity_key = parameters.get(constants.PARAMETER_CONSTANT_ACTIVITY_KEY, xes.DEFAULT_NAME_KEY)
    timestamp_key = parameters.get(constants.PARAMETER_CONSTANT_TIMESTAMP_KEY, xes.DEFAULT_TIMESTAMP_KEY)
    business_hours = parameters.get("business_hours", False)
    worktiming = parameters.get("worktiming", [7, 17])
    weekends = parameters.get("weekends", [6, 7])
    y_orig = parameters.get("y_orig")

    log = sorting.sort_timestamp(log, timestamp_key)

    str_evsucc_attr = [activity_key]
    if "str_ev_attr" in parameters:
        # explicit selection by the caller; missing selections default to empty
        str_tr_attr = parameters.get("str_tr_attr", [])
        str_ev_attr = parameters["str_ev_attr"]
        num_tr_attr = parameters.get("num_tr_attr", [])
        num_ev_attr = parameters.get("num_ev_attr", [])
    else:
        str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr = \
            attributes_filter.select_attributes_from_log_for_tree(log)
        # the activity is always part of an automatically selected representation
        if activity_key not in str_ev_attr:
            str_ev_attr.append(activity_key)

    max_trace_length = max(len(x) for x in log)
    if max_trace_length == 1:
        # single-event traces: the log is used as it is
        ext_log = log
    else:
        # second returned value (the change indexes) is not needed here
        ext_log, _ = get_log_with_log_prefixes(log)

    data, feature_names = get_log_representation.get_representation(
        ext_log, str_tr_attr, str_ev_attr, num_tr_attr, num_ev_attr,
        str_evsucc_attr=str_evsucc_attr)

    if y_orig is not None:
        # flatten the per-trace sequences into a single list of targets
        remaining_time = [value for per_trace in y_orig for value in per_trace]
    else:
        remaining_time = []
        for trace in ext_log:
            if not trace:
                # nothing to measure on an empty trace
                remaining_time.append(0)
            elif business_hours:
                bh = BusinessHours(trace[0][timestamp_key].replace(tzinfo=None),
                                   trace[-1][timestamp_key].replace(tzinfo=None),
                                   worktiming=worktiming, weekends=weekends)
                remaining_time.append(bh.getseconds())
            else:
                # seconds between the first and the last event of the trace
                remaining_time.append(
                    (trace[-1][timestamp_key] - trace[0][timestamp_key]).total_seconds())

    regr = ElasticNet(max_iter=10000, l1_ratio=0.7)
    regr.fit(data, remaining_time)

    return {
        "str_tr_attr": str_tr_attr,
        "str_ev_attr": str_ev_attr,
        "num_tr_attr": num_tr_attr,
        "num_ev_attr": num_ev_attr,
        "str_evsucc_attr": str_evsucc_attr,
        "feature_names": feature_names,
        "remaining_time": remaining_time,
        "regr": regr,
        "variant": "elasticnet"
    }