def run_experiments(log_identificator, formula_type):
    """Predict formula-compliant suffixes and remaining time using a beam search.

    Settings (event log name, model path, beam size, prefix-size range and
    formula) come from ``activateSettings``. For every prefix size in
    ``range(prefix_size_pred_from, prefix_size_pred_to)`` the traces returned
    by ``selectFormulaVerifiedTraces`` are cut to that prefix and the loaded
    model is queried step by step. Candidate continuations are stored in a
    ``MultileafTree``; a prediction run stops as soon as the end symbol
    ``'!'`` is predicted for a trace accepted by
    ``verify_formula_as_compliant``, or when the tree has no node left to
    continue from. One CSV row per evaluated trace is written to
    ``output_files/results/<formula_type>/suffix_and_remaining_time2_<eventlog>``.

    Args:
        log_identificator: passed unchanged to ``activateSettings``.
        formula_type: passed to ``activateSettings``; also names the results
            sub-directory.

    Returns:
        None. Results go to the CSV file; progress is printed.
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(log_identificator, formula_type)

    # Make the parent of this file's directory importable. os.path.dirname
    # replaces manual slicing at the last separator, which produced '' (or a
    # bare drive letter) when this file's directory sits directly under the
    # filesystem root.
    current_path = os.path.abspath(getsourcefile(lambda: 0))
    parent_dir = os.path.dirname(os.path.dirname(current_path))
    sys.path.insert(0, parent_dir)

    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, \
        divisor3, predict_size, target_indices_char, target_char_indices = prepare_testing_data(eventlog)

    # Filled whenever an accepted end-of-case is predicted; they are not read
    # again inside this function.
    one_ahead_gt = []
    one_ahead_pred = []

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    output_path = ('output_files/results/' + formula_type +
                   '/suffix_and_remaining_time2_%s' % eventlog)
    # 'wb' is kept from the original: the file targets Python 2 (it uses
    # ``unicode``), where the csv module expects a binary-mode file.
    with open(output_path, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            # here we checkout the prefixes with formulas verified only on the suffix phase
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            for line, times, times2, times3 in zip(lines_s, lines_t_s,
                                                   lines_t2_s, lines_t3_s):
                # NOTE(review): this appends to the list object handed out by
                # selectFormulaVerifiedTraces on every prefix size; kept as in
                # the original because it is unclear whether other code
                # relies on it -- confirm and drop if not.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already

                # root of the search tree: the encoded prefix, with zero
                # predicted time accumulated so far
                search_tree_root = MultileafTree(
                    beam_size,
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, 0)

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                # remaining time = last entry of times2 minus the entry at
                # the end of the prefix
                time_at_prefix = times2[prefix_size - 1]
                ground_truth_t = times2[-1] - time_at_prefix

                for _ in range(predict_size):
                    # the encoded state is taken from the current tree node
                    enc = search_tree_root.data
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into separate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]

                    stop_symbol_probability_amplifier_current, \
                        start_of_the_cycle_symbol = amplify(search_tree_root.cropped_line)

                    if y_t < 0:
                        y_t = 0
                    # TODO (from the original author): not normalizing here seems like a bug
                    cropped_times.append(y_t)

                    # First pass over the beam: stop if any of the ranked
                    # predictions is the end symbol and the trace built so
                    # far is compliant with the formula.
                    end_accepted = False
                    for rank in range(beam_size):
                        prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, rank)
                        if prediction == '!' and verify_formula_as_compliant(
                                search_tree_root.cropped_line, formula,
                                prefix_size):
                            one_ahead_pred.append(
                                search_tree_root.total_predicted_time)
                            one_ahead_gt.append(ground_truth_t)
                            print('! predicted, end case')
                            end_accepted = True
                            break
                    if end_accepted:
                        break

                    # No accepted end yet: expand the current node. According
                    # to the original comments, choose_next_top_descendant
                    # then either continues the search or backtracks.
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))

                    # Second pass over the beam: one child node per ranked
                    # prediction that is not the end symbol.
                    for rank in range(beam_size):
                        temp_prediction = getSymbolAmpl(
                            y_char, target_indices_char,
                            target_char_indices, start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, rank)
                        if temp_prediction == '!':
                            continue
                        temp_cropped_line = search_tree_root.cropped_line + temp_prediction
                        temp_total_predicted_time = search_tree_root.total_predicted_time + y_t

                        temp_state_data = encode(temp_cropped_line,
                                                 cropped_times,
                                                 cropped_times3, maxlen,
                                                 chars, char_indices,
                                                 divisor, divisor2)
                        search_tree_root.descendants[rank] = MultileafTree(
                            beam_size, temp_state_data, temp_cropped_line,
                            temp_total_predicted_time, search_tree_root)

                    search_tree_root = search_tree_root.choose_next_top_descendant()
                    if search_tree_root is None:
                        print(
                            "Cannot find any trace that is compliant with formula given current beam size"
                        )
                        break

                if search_tree_root is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = search_tree_root.cropped_line[prefix_size:]
                    total_predicted_time = search_tree_root.total_predicted_time

                # rows are only written for traces that have a suffix to compare with
                if len(ground_truth) > 0:
                    ground_truth_utf8 = unicode(ground_truth).encode("utf-8")
                    predicted_utf8 = unicode(predicted).encode("utf-8")
                    levenshtein_similarity = 1 - distance.nlevenshtein(
                        predicted, ground_truth)
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the
                    # default character encoding of the operating system caused it to be negative, this should never
                    # be the case
                    if dls < 0:
                        dls = 0
                    jaccard_similarity = 1 - distance.jaccard(predicted,
                                                              ground_truth)
                    spamwriter.writerow([
                        prefix_size,
                        ground_truth_utf8,
                        predicted_utf8,
                        levenshtein_similarity,
                        dls,
                        jaccard_similarity,
                        ground_truth_t,
                        total_predicted_time,
                        '',  # the "RMSE" column is left empty
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]),
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]),
                    ])
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
Пример #2
0
def runExperiments(logIdentificator, formulaType):
    """Predict suffixes and remaining time for the last fold of an event log.

    Settings come from ``activateSettings``. The event log is read from
    ``../data/<eventlog>`` (column 0: case id, column 1: activity, column 2:
    timestamp in ``%Y-%m-%d %H:%M:%S`` format) and grouped into one trace per
    run of consecutive rows sharing a case id. The character vocabulary and
    ``maxlen`` are derived from the first ``2 * elems_per_fold`` traces;
    predictions are made for the remaining traces that
    ``selectFormulaVerifiedTraces`` returns, one symbol at a time, until the
    end symbol ``'!'`` is predicted or ``maxlen`` steps were taken. One CSV
    row per evaluated trace is written to
    ``output_files/results/<formulaType>/suffix_and_remaining_time1_<eventlog>``.

    Args:
        logIdentificator: passed unchanged to ``activateSettings``.
        formulaType: passed to ``activateSettings``; also names the results
            sub-directory.

    Returns:
        None. Results go to the CSV file; progress is printed.
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(logIdentificator, formulaType)
    start_time = time.time()

    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    timeseqs = []  # relative time since previous event
    timeseqs2 = []  # relative time since case start
    timeseqs3 = []  # absolute time of previous event
    times = []
    times2 = []
    times3 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    # The input file is now closed as soon as it has been read; the original
    # kept the handle open for the whole run.
    with open('../data/%s' % eventlog, 'r') as logfile:
        spamreader = csv.reader(logfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers

        for row in spamreader:
            # converted once per row instead of once per use
            event_time = datetime.fromtimestamp(
                time.mktime(time.strptime(row[2], "%Y-%m-%d %H:%M:%S")))
            # ``firstLine or`` makes the very first row always start a case,
            # even when its case id equals the initial '' placeholder.
            if firstLine or row[0] != lastcase:
                casestarttime = event_time
                lasteventtime = event_time
                lastcase = row[0]
                if not firstLine:
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                line = ''
                times = []
                times2 = []
                times3 = []
                numlines += 1
            line += getUnicode_fromInt(row[1])
            timesincelastevent = event_time - lasteventtime
            timesincecasestart = event_time - casestarttime
            times.append(86400 * timesincelastevent.days + timesincelastevent.seconds)
            times2.append(86400 * timesincecasestart.days + timesincecasestart.seconds)
            times3.append(event_time)
            lasteventtime = event_time
            firstLine = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    # NOTE(review): numlines is incremented once per case start and once more
    # here, so it ends up one larger than the number of cases. Left unchanged
    # because the fold boundaries below presumably have to match the ones
    # used when the model was trained -- confirm before changing.
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))
    # mean over cases of the mean difference between a case's last times2
    # entry and each of its times2 entries
    divisor3 = np.mean([np.mean([seq[-1] - value for value in seq])
                        for seq in timeseqs2])
    print('divisor3: {}'.format(divisor3))

    elems_per_fold = int(round(numlines/3))

    # vocabulary and maximum length come from the first two folds, with the
    # end symbol '!' appended to every trace
    fold1and2lines = [trace + '!' for trace in lines[:2*elems_per_fold]]
    maxlen = max([len(trace) for trace in fold1and2lines])

    chars = sorted(set().union(*fold1and2lines))
    target_chars = list(chars)
    chars.remove('!')  # '!' can only be predicted, it is never an input symbol
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = {c: i for i, c in enumerate(chars)}
    indices_char = {i: c for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}
    target_indices_char = {i: c for i, c in enumerate(target_chars)}
    print(indices_char)

    # we only need the third fold, because first two were used for training
    lines_t = timeseqs[2*elems_per_fold:]
    lines_t2 = timeseqs2[2*elems_per_fold:]
    lines_t3 = timeseqs3[2*elems_per_fold:]
    lines = lines[2*elems_per_fold:]

    # set parameters
    predict_size = maxlen

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    def encode(sentence, times, times3, maxlen=maxlen):
        """Return a ``(1, maxlen, len(chars) + 5)`` float32 array for one prefix.

        Rows are left-padded with zeros. Each event row holds a one-hot
        activity followed by five extra features: 1-based position,
        ``times[t] / divisor``, cumulative ``times`` / ``divisor2``, seconds
        since midnight / 86400 and weekday / 7 (both taken from
        ``times3[t]``).
        """
        num_features = len(chars)+5
        X = np.zeros((1, maxlen, num_features), dtype=np.float32)
        leftpad = maxlen-len(sentence)
        times2 = np.cumsum(times)
        for t, char in enumerate(sentence):
            midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
            timesincemidnight = times3[t]-midnight
            row = t+leftpad
            # symbols outside the vocabulary leave the one-hot part all zero
            if char in char_indices:
                X[0, row, char_indices[char]] = 1
            X[0, row, len(chars)] = t+1
            X[0, row, len(chars)+1] = times[t]/divisor
            X[0, row, len(chars)+2] = times2[t]/divisor2
            X[0, row, len(chars)+3] = timesincemidnight.seconds/86400
            X[0, row, len(chars)+4] = times3[t].weekday()/7
        return X

    # Cycle-amplification state passed to getSymbolAmpl.
    # NOTE(review): it is updated after every predicted symbol and is not
    # reset between traces, so the first prediction of a trace uses the state
    # left by the previous trace. Kept as in the original -- confirm intent.
    stop_symbol_probability_amplifier_current = 1
    start_of_the_cycle_symbol = " "

    # Filled whenever the end of a case is predicted; they are not read again
    # inside this function.
    one_ahead_gt = []
    one_ahead_pred = []

    # 'wb' is kept from the original: the file targets Python 2 (it uses
    # ``unicode`` and ``izip``), where the csv module expects binary mode.
    with open('output_files/results/'+formulaType+'/suffix_and_remaining_time1_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE"])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):

            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(lines, lines_t, lines_t2, lines_t3,
                                                                                     formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) + " out of : " + str(len(lines)))

            for line, times, times2, times3 in izip(lines_s, lines_t_s, lines_t2_s, lines_t3_s):
                # NOTE(review): this appends to the list object handed out by
                # selectFormulaVerifiedTraces on every prefix size; kept as in
                # the original -- confirm whether anything relies on it.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size+predict_size])
                # remaining time = time since case start at the last event
                # minus time since case start at the end of the prefix
                time_at_prefix = times2[prefix_size-1]
                ground_truth_t = times2[-1] - time_at_prefix
                predicted = ''
                total_predicted_time = 0
                for _ in range(predict_size):
                    enc = encode(cropped_line, cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into separate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]
                    prediction = getSymbolAmpl(y_char, target_indices_char, target_char_indices,
                                               start_of_the_cycle_symbol,
                                               stop_symbol_probability_amplifier_current)  # undo one-hot encoding
                    cropped_line += prediction

                    stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = amplify(cropped_line)

                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                        one_ahead_pred.append(total_predicted_time)
                        one_ahead_gt.append(ground_truth_t)
                        print('! predicted, end case')
                        break
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                    total_predicted_time = total_predicted_time + y_t
                    predicted += prediction

                # rows are only written for traces that have a suffix to compare with
                if len(ground_truth) > 0:
                    ground_truth_utf8 = unicode(ground_truth).encode("utf-8")
                    predicted_utf8 = unicode(predicted).encode("utf-8")
                    levenshtein_similarity = 1 - distance.nlevenshtein(predicted, ground_truth)
                    dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted), len(ground_truth)))
                    # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the
                    # default character encoding of the operating system caused it to be negative, this should never
                    # be the case
                    if dls < 0:
                        dls = 0
                    jaccard_similarity = 1 - distance.jaccard(predicted, ground_truth)
                    spamwriter.writerow([
                        prefix_size,
                        ground_truth_utf8,
                        predicted_utf8,
                        levenshtein_similarity,
                        dls,
                        jaccard_similarity,
                        ground_truth_t,
                        total_predicted_time,
                        '',  # the "RMSE" column is left empty
                        metrics.mean_absolute_error([ground_truth_t], [total_predicted_time]),
                        metrics.median_absolute_error([ground_truth_t], [total_predicted_time]),
                    ])
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
            ground_truth = ''.join(line[prefix_size:prefix_size +
                                        predict_size])
            ground_truth_t = times2[prefix_size - 1]
            case_end_time = times2[len(times2) - 1]
            ground_truth_t = case_end_time - ground_truth_t
            predicted = ''
            total_predicted_time = 0
            for i in range(predict_size):
                enc = encode(cropped_line, cropped_times, cropped_times3)
                y = model.predict(enc, verbose=0)  # make predictions
                # split predictions into seperate activity and time predictions
                y_char = y[0][0]
                y_t = y[1][0][0]
                prediction = getSymbolAmpl(
                    y_char, target_indices_char, target_char_indices,
                    start_of_the_cycle_symbol,
                    stop_symbol_probability_amplifier_current
                )  # undo one-hot encoding
                cropped_line += prediction

                stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = amplify(
                    cropped_line)

                if y_t < 0:
                    y_t = 0
                cropped_times.append(y_t)
                if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                    one_ahead_pred.append(total_predicted_time)
                    one_ahead_gt.append(ground_truth_t)
                    print('! predicted, end case')
                    break
Пример #4
0
                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)

                    if not i == 0:
                        stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = amplify(
                            temp_cropped_line)

                    #in not reached, function :choose_next_top_descendant: will backtrack
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))

                    for j in range(current_beam_size):
                        temp_prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, j)

                        if temp_prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                            if verify_formula_as_compliant(temp_cropped_line):
                                one_ahead_pred.append(
                                    current_prediction_premis.
                                    total_predicted_time)
                                one_ahead_gt.append(ground_truth_t)
                                stop_symbol_probability_amplifier_current = 1
                                print('! predicted, end case')
                                queue_next_steps = PriorityQueue()
                                break
                            else:
                                continue