spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) spamwriter.writerow(["Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE"]) for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to): print(prefix_size) for line, times, times2, times3 in zip(lines, lines_t, lines_t2, lines_t3): prediction_end_reached = False times.append(0) cropped_line = ''.join(line[:prefix_size]) cropped_times = times[:prefix_size] cropped_times3 = times3[:prefix_size] if len(times2)<prefix_size: continue # make no prediction for this case, since this case has ended already # initialize root of the tree for beam search total_predicted_time_initialization = 0 search_tree_root = MultileafTree(beam_size, encode(cropped_line, cropped_times, cropped_times3,maxlen,chars, char_indices, divisor, divisor2), cropped_line, total_predicted_time_initialization) prediction_end_reached = False ground_truth = ''.join(line[prefix_size:prefix_size+predict_size]) ground_truth_t = times2[prefix_size-1] case_end_time = times2[len(times2)-1] ground_truth_t = case_end_time-ground_truth_t predicted = '' for i in range(predict_size): #here we will take data from the node in the tree used to prun
def run_experiments(log_identificator, formula_type):
    """Predict suffixes and remaining times with a backtracking beam search.

    Settings are read with ``activateSettings(log_identificator,
    formula_type)``, the test traces with ``prepare_testing_data`` and the
    model with ``load_model``.  For every prefix size in
    ``range(prefix_size_pred_from, prefix_size_pred_to)`` the traces returned
    by ``selectFormulaVerifiedTraces`` are processed one by one:

    * a ``MultileafTree`` root is built from the encoded prefix;
    * for at most ``predict_size`` steps the model is queried on the current
      node, up to ``beam_size`` ranked symbols from ``getSymbolAmpl`` are
      attached as descendants, and ``choose_next_top_descendant`` selects
      the node to continue from;
    * the search stops early when the end-of-case symbol ``'!'`` is proposed
      and ``verify_formula_as_compliant`` accepts the current line, or when
      ``choose_next_top_descendant`` returns ``None``.

    One CSV row per trace with a non-empty ground-truth suffix is written to
    ``output_files/results/<formula_type>/suffix_and_remaining_time2_<eventlog>``.
    The "RMSE" column is always written as an empty string.

    Side effects: inserts the parent of this file's directory at the front of
    ``sys.path``, appends ``0`` to every processed ``times`` list, prints
    progress to stdout and writes the CSV file.

    :param log_identificator: forwarded to ``activateSettings``.
    :param formula_type: forwarded to ``activateSettings``; also used as the
        sub-directory name of the result file.
    :return: None
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(
            log_identificator, formula_type)

    # Put the parent of this file's directory at the front of sys.path.
    current_path = os.path.abspath(getsourcefile(lambda: 0))
    current_dir = os.path.dirname(current_path)
    parent_dir = current_dir[:current_dir.rfind(os.path.sep)]
    sys.path.insert(0, parent_dir)

    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, \
        divisor3, predict_size, target_indices_char, target_char_indices = \
        prepare_testing_data(eventlog)

    # Filled whenever an end of case is accepted; not read again in this function.
    one_ahead_gt = []
    one_ahead_pred = []

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    # make predictions
    with open('output_files/results/' + formula_type +
              '/suffix_and_remaining_time2_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            # here we checkout the prefixes with formulas verified only on the suffix phase
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) +
                  " out of : " + str(len(lines)))
            for line, times, times2, times3 in zip(lines_s, lines_t_s,
                                                   lines_t2_s, lines_t3_s):
                # NOTE(review): this mutates the caller-visible list on every
                # pass; kept because later slices may rely on it -- confirm.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    # make no prediction for this case, since this case has ended already
                    continue

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_tree_root = MultileafTree(
                    beam_size,
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, total_predicted_time_initialization)

                ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
                # remaining time = time at end of case - time at end of prefix
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''

                for _ in range(predict_size):
                    # here we will take data from the node in the tree used to prun
                    enc = search_tree_root.data
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]
                    stop_symbol_probability_amplifier_current, \
                        start_of_the_cycle_symbol = amplify(search_tree_root.cropped_line)

                    if y_t < 0:
                        y_t = 0
                    # TODO: not normalizing here seems like a bug
                    cropped_times.append(y_t)

                    # First pass over the beam: stop as soon as an end of case
                    # is proposed for a line that satisfies the formula.
                    end_of_case_accepted = False
                    for beam_rank in range(beam_size):
                        prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, beam_rank)
                        # end of case was just predicted, therefore, stop
                        # predicting further into the future
                        if prediction == '!' and verify_formula_as_compliant(
                                search_tree_root.cropped_line, formula, prefix_size):
                            one_ahead_pred.append(search_tree_root.total_predicted_time)
                            one_ahead_gt.append(ground_truth_t)
                            print('! predicted, end case')
                            end_of_case_accepted = True
                            break
                    if end_of_case_accepted:
                        break

                    # Second pass: attach every non-terminal alternative as a
                    # descendant; choose_next_top_descendant then picks the
                    # node to continue from (or backtracks).
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                    for beam_rank in range(beam_size):
                        # getSymbolAmpl is deliberately called again instead of
                        # reusing the first-pass result, exactly as before.
                        temp_prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, beam_rank)
                        if temp_prediction == '!':
                            continue
                        temp_cropped_line = search_tree_root.cropped_line + temp_prediction
                        temp_total_predicted_time = search_tree_root.total_predicted_time + y_t
                        temp_state_data = encode(temp_cropped_line, cropped_times,
                                                 cropped_times3, maxlen, chars,
                                                 char_indices, divisor, divisor2)
                        search_tree_root.descendants[beam_rank] = MultileafTree(
                            beam_size, temp_state_data, temp_cropped_line,
                            temp_total_predicted_time, search_tree_root)

                    search_tree_root = search_tree_root.choose_next_top_descendant()
                    if search_tree_root is None:
                        print("Cannot find any trace that is compliant with formula given current beam size")
                        break

                output = []
                if search_tree_root is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = search_tree_root.cropped_line[prefix_size:]
                    total_predicted_time = search_tree_root.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                    # float() forces true division even when the module does not
                    # enable `from __future__ import division`; plain int / int
                    # would floor the ratio to 0 or 1 on Python 2.
                    dls = 1 - (float(damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth))) /
                        max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        # we encountered problems with Damerau-Levenshtein Similarity
                        # on some linux machines where the default character encoding
                        # of the operating system caused it to be negative, this
                        # should never be the case
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # "RMSE" column is intentionally left empty
                    output.append(metrics.mean_absolute_error(
                        [ground_truth_t], [total_predicted_time]))
                    output.append(metrics.median_absolute_error(
                        [ground_truth_t], [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
# lines_t2 = lines_t2[13:] # lines_t3 = lines_t3[13:] for line, times, times2, times3 in izip(lines, lines_t, lines_t2, lines_t3): times.append(0) cropped_line = ''.join(line[:prefix_size]) cropped_times = times[:prefix_size] cropped_times3 = times3[:prefix_size] if len(times2) < prefix_size: continue # make no prediction for this case, since this case has ended already # initialize root of the tree for beam search total_predicted_time_initialization = 0 search_node_root = NodePrediction( encode(cropped_line, cropped_times, cropped_times3, maxlen, chars, char_indices, divisor, divisor2), cropped_line, total_predicted_time_initialization) ground_truth = ''.join(line[prefix_size:prefix_size + predict_size]) ground_truth_t = times2[prefix_size - 1] case_end_time = times2[len(times2) - 1] ground_truth_t = case_end_time - ground_truth_t predicted = '' queue_next_steps = PriorityQueue() queue_next_steps.put( (-search_node_root.probability_of, search_node_root)) queue_next_steps_future = PriorityQueue()
def runExperiments(logIdentificator, formulaType):
    """Predict suffixes and remaining times with a priority-queue beam search.

    Settings are read with ``activateSettings(logIdentificator,
    formulaType)``, the test traces with ``prepare_testing_data`` and the
    model with ``load_model``.  For every prefix size in
    ``range(prefix_size_pred_from, prefix_size_pred_to)`` the traces returned
    by ``selectFormulaVerifiedTraces`` are processed one by one:

    * the encoded prefix becomes the first ``NodePrediction`` in a
      ``PriorityQueue`` ordered by negated accumulated log-probability;
    * for at most ``predict_size`` steps, up to ``current_beam_size`` nodes
      are taken from the queue, the model is queried on each, and up to
      ``current_beam_size`` ranked symbols from ``getSymbolAmpl`` are turned
      into successor nodes for the next step;
    * once a dequeued node satisfies ``verify_formula_as_compliant`` the beam
      size is reduced to 1 and the pending successors are discarded;
    * when ``'!'`` is proposed for a compliant line, the current queue is
      emptied.

    One CSV row per trace with a non-empty ground-truth suffix is written to
    ``output_files/results/<formulaType>/suffix_and_remaining_time3_<eventlog>``.
    The "RMSE" column is always written as an empty string.

    Side effects: appends ``0`` to every processed ``times`` list, prints
    progress to stdout and writes the CSV file.

    :param logIdentificator: forwarded to ``activateSettings``.
    :param formulaType: forwarded to ``activateSettings``; also used as the
        sub-directory name of the result file.
    :return: None
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(
            logIdentificator, formulaType)

    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, \
        divisor3, predict_size, target_indices_char, target_char_indices = \
        prepare_testing_data(eventlog)

    # Filled whenever an end of case is accepted; not read again in this function.
    one_ahead_gt = []
    one_ahead_pred = []

    # find cycles and modify the probability functionality goes here
    # NOTE(review): this value is only reset to 1 when an end of case is
    # accepted, so at step 0 of a trace it may still hold the value left by
    # the previous trace -- confirm that is intended.
    stop_symbol_probability_amplifier_current = 1

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    class NodePrediction():
        """One search state: encoded input, predicted line, time and score."""

        def __init__(self, data, cropped_line, total_predicted_time, probability_of=0):
            self.data = data
            self.cropped_line = cropped_line
            self.total_predicted_time = total_predicted_time
            # accumulated log-probability of the predicted symbols
            self.probability_of = probability_of

    # make predictions
    with open('output_files/results/' + formulaType +
              '/suffix_and_remaining_time3_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            print(prefix_size)

            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) +
                  " out of : " + str(len(lines)))
            counterr = 0
            for line, times, times2, times3 in izip(lines_s, lines_t_s,
                                                    lines_t2_s, lines_t3_s):
                # NOTE(review): this mutates the caller-visible list on every
                # pass; kept because later slices may rely on it -- confirm.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    # make no prediction for this case, since this case has ended already
                    continue

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_node_root = NodePrediction(
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, total_predicted_time_initialization)

                ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
                # remaining time = time at end of case - time at end of prefix
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''

                # Queue entries are (priority, insertion order, node).  The
                # insertion counter breaks priority ties so the queue never has
                # to compare NodePrediction objects (arbitrary order on
                # Python 2, TypeError on Python 3).
                insertion_order = 0
                queue_next_steps = PriorityQueue()
                queue_next_steps.put(
                    (-search_node_root.probability_of, insertion_order, search_node_root))
                insertion_order += 1

                queue_next_steps_future = PriorityQueue()
                start_of_the_cycle_symbol = " "
                found_sattisfying_constraint = False

                current_beam_size = beam_size
                for i in range(predict_size):
                    for k in range(current_beam_size):
                        if queue_next_steps.empty():
                            break

                        _, _, current_prediction_premis = queue_next_steps.get()

                        if not found_sattisfying_constraint:
                            if verify_formula_as_compliant(
                                    current_prediction_premis.cropped_line,
                                    formula, prefix_size):
                                # the formula verified and we can just finish the predictions
                                # beam size is 1 because predict only sequence of events
                                current_beam_size = 1
                                # overwrite new queue
                                queue_next_steps_future = PriorityQueue()
                                found_sattisfying_constraint = True

                        enc = current_prediction_premis.data
                        temp_cropped_line = current_prediction_premis.cropped_line
                        y = model.predict(enc, verbose=0)  # make predictions
                        # split predictions into seperate activity and time predictions
                        y_char = y[0][0]
                        y_t = y[1][0][0]

                        if y_t < 0:
                            y_t = 0
                        cropped_times.append(y_t)

                        if not i == 0:
                            stop_symbol_probability_amplifier_current, \
                                start_of_the_cycle_symbol = amplify(temp_cropped_line)

                        y_t = y_t * divisor3
                        cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))

                        for j in range(current_beam_size):
                            temp_prediction = getSymbolAmpl(
                                y_char, target_indices_char, target_char_indices,
                                start_of_the_cycle_symbol,
                                stop_symbol_probability_amplifier_current, j)

                            if temp_prediction == '!':
                                # end of case was just predicted, therefore, stop
                                # predicting further into the future
                                if verify_formula_as_compliant(
                                        temp_cropped_line, formula, prefix_size):
                                    one_ahead_pred.append(
                                        current_prediction_premis.total_predicted_time)
                                    one_ahead_gt.append(ground_truth_t)
                                    stop_symbol_probability_amplifier_current = 1
                                    print('! predicted, end case')
                                    queue_next_steps = PriorityQueue()
                                    break
                                else:
                                    continue

                            temp_cropped_line = current_prediction_premis.cropped_line + temp_prediction
                            temp_total_predicted_time = \
                                current_prediction_premis.total_predicted_time + y_t
                            temp_state_data = encode(temp_cropped_line, cropped_times,
                                                     cropped_times3, maxlen, chars,
                                                     char_indices, divisor, divisor2)
                            # probability of the j-th most likely symbol
                            probability_this = np.sort(y_char)[len(y_char) - 1 - j]

                            temp = NodePrediction(
                                temp_state_data, temp_cropped_line,
                                temp_total_predicted_time,
                                current_prediction_premis.probability_of +
                                np.log(probability_this))
                            queue_next_steps_future.put(
                                (-temp.probability_of, insertion_order, temp))
                            insertion_order += 1

                    # successors collected in this step become the next step's beam
                    queue_next_steps = queue_next_steps_future
                    queue_next_steps_future = PriorityQueue()

                counterr += 1

                # NOTE(review): nothing in this function assigns None to
                # current_prediction_premis, so this guard does not look
                # reachable -- confirm the intended behaviour.
                if current_prediction_premis is None:
                    print("Cannot find any trace that is compliant with formula given current beam size")
                    break

                output = []
                if current_prediction_premis is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = current_prediction_premis.cropped_line[prefix_size:]
                    total_predicted_time = current_prediction_premis.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                    # float() forces true division even when the module does not
                    # enable `from __future__ import division`; plain int / int
                    # would floor the ratio to 0 or 1 on Python 2.
                    dls = 1 - (float(damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth))) /
                        max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        # we encountered problems with Damerau-Levenshtein Similarity
                        # on some linux machines where the default character encoding
                        # of the operating system caused it to be negative, this
                        # should never be the case
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # "RMSE" column is intentionally left empty
                    output.append(metrics.mean_absolute_error(
                        [ground_truth_t], [total_predicted_time]))
                    output.append(metrics.median_absolute_error(
                        [ground_truth_t], [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))