示例#1
0
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """Return the encoded training and test DataFrames for a job.

    When ``use_cache`` is true the lookup goes through two cache layers:

    1. If a ``LabelledLog`` exists for the job's split, encoding and
       labelling, the encoded DataFrames are read via ``get_labelled_logs``.
    2. Otherwise, if the split has both a train and a test log and a
       ``LoadedLog`` exists for their paths, the logs are read via
       ``get_loaded_logs``; if not, they are built with ``prepare_logs`` and
       stored with ``put_loaded_logs``. The logs are then encoded and the
       result is stored with ``put_labelled_logs``.

    When the logs had to be prepared from a single split, the split is
    duplicated and turned into a double split that points at the freshly
    created train/test logs; the job is re-pointed at that new split.

    When ``use_cache`` is false the logs are always prepared and encoded from
    scratch and nothing is stored.

    :param job: job configuration (its split, encoding and labelling are used)
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')

    # No caching requested: build everything and return without storing.
    if not use_cache:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
        return training_df, test_df

    # First cache layer: already encoded and labelled DataFrames.
    if LabelledLog.objects.filter(split=job.split,
                                  encoding=job.encoding,
                                  labelling=job.labelling).exists():
        training_df, test_df = get_labelled_logs(job)
        return training_df, test_df

    # Second cache layer: already loaded (but not yet encoded) logs.
    if job.split.train_log is not None and \
            job.split.test_log is not None and \
            LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                     test_log=job.split.test_log.path).exists():
        training_log, test_log, additional_columns = get_loaded_logs(
            job.split)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        if job.split.type == SplitTypes.SPLIT_SINGLE.value:
            job.split = duplicate_orm_row(job.split)
            job.split.type = SplitTypes.SPLIT_DOUBLE.value

            # round() instead of int(): float error can leave the value just
            # below the intended integer (a test_size of 0.55 gives
            # 44.99999999999999), which int() would truncate to 44.
            split_point = str(round(100 - (job.split.test_size * 100)))
            train_name = '0-' + split_point
            test_name = split_point + '-100'

            job.split.train_log = create_log(EventLog(training_log),
                                             train_name + '.xes')
            job.split.test_log = create_log(EventLog(test_log),
                                            test_name + '.xes')
            job.split.additional_columns = str(
                train_name +
                test_name)  # TODO: find better naming policy

            # Saving the job only stores the reference to the split; the
            # fields changed on the split itself need their own save().
            job.split.save()
            job.save()

        put_loaded_logs(job.split, training_log, test_log,
                        additional_columns)

    training_df, test_df = encode_label_logs(
        training_log,
        test_log,
        job,
        additional_columns=additional_columns)
    put_labelled_logs(job, training_df, test_df)
    return training_df, test_df
示例#2
0
    def test_random(self):
        """Two SPLIT_RANDOM preparations of one split must differ.

        The same split object is prepared twice and the ``trace_names`` of
        the two resulting training logs are expected not to be equal.
        """
        random_split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_RANDOM.value)

        first_training, _, _ = prepare_logs(random_split)
        second_training, _, _ = prepare_logs(random_split)

        first_names = trace_names(first_training)
        second_names = trace_names(second_training)
        self.assertNotEqual(first_names, second_names)
示例#3
0
    def test_sequential(self):
        """A SPLIT_SEQUENTIAL single split yields the expected names in order.

        Both the training and the test log are compared, order included,
        against fixed lists of ``trace_names``.
        """
        expected_training = ['3', '2', '1', '6']
        expected_test = ['5', '4']

        sequential_split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_SEQUENTIAL.value)
        train_part, test_part, _ = prepare_logs(sequential_split)

        observed_training = trace_names(train_part)
        observed_test = trace_names(test_part)

        self.assertListEqual(expected_training, observed_training)
        self.assertListEqual(expected_test, observed_test)
示例#4
0
    def test_strict_temporal(self):
        """A SPLIT_STRICT_TEMPORAL single split yields the expected names.

        Names are compared after sorting, so only membership is checked,
        not the order in which the traces come back.
        """
        strict_split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_STRICT_TEMPORAL.value)
        train_part, test_part, _ = prepare_logs(strict_split)

        observed_training = trace_names(train_part)
        observed_test = trace_names(test_part)

        # Modified log to have only one trace here
        expected_training = ['1']
        expected_test = sorted(['6', '4'])
        self.assertListEqual(expected_training, sorted(observed_training))
        self.assertListEqual(expected_test, sorted(observed_test))
示例#5
0
    def test_temporal(self):
        """A SPLIT_TEMPORAL single split yields the expected names.

        Names are compared after sorting, so only membership is checked,
        not the order in which the traces come back.
        """
        temporal_split = split_single(
            split_ordering=SplitOrderingMethods.SPLIT_TEMPORAL.value)
        train_part, test_part, _ = prepare_logs(temporal_split)

        observed_training = trace_names(train_part)
        observed_test = trace_names(test_part)

        expected_training = sorted(['1', '2', '3', '5'])
        expected_test = sorted(['6', '4'])
        self.assertListEqual(expected_training, sorted(observed_training))
        self.assertListEqual(expected_test, sorted(observed_test))
示例#6
0
 def test_size(self):
     """A single split with test_size=0.5 gives two logs of length 3."""
     half_split = split_single(test_size=0.5)
     train_part, test_part, _ = prepare_logs(half_split)

     # Training log is checked first, then the test log.
     for prepared_log in (train_part, test_part):
         self.assertEqual(3, len(prepared_log))
示例#7
0
 def test_split_double(self):
     """A double split gives a training log of 4 and a test log of 2."""
     double_split = split_double()
     train_part, test_part, _ = prepare_logs(double_split)

     # Training log is checked first, then the test log.
     for expected_length, prepared_log in ((4, train_part), (2, test_part)):
         self.assertEqual(expected_length, len(prepared_log))