Example #1
def explain(retrain_exp: Explanation, training_df_old, test_df_old,
            explanation_target, prefix_target):
    initial_job_obj = retrain_exp.job
    # todo: return performances
    initial_result = dict(
        initial_job_obj.evaluation.classificationmetrics.to_dict())  # TODO future bug

    train_df, test_df = randomise_features(training_df_old.copy(),
                                           test_df_old.copy(),
                                           explanation_target)
    assert not train_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    new_split = save_randomised_set(initial_job_obj.split)

    prediction_job = create_prediction_job(
        initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.evaluation = None
    prediction_job.save()
    # assert prediction_job.split.id != initial_job_obj.split.id

    put_labelled_logs(prediction_job, train_df, test_df)

    # todo: build model
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    return {
        "Initial result": initial_result,
        "Retrain result": prediction_job.evaluation.classificationmetrics.to_dict()
    }
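A hypothetical driver for explain, to make the inputs and outputs concrete. The primary key, the shape of explanation_target, and the prefix_target value are illustrative assumptions (note that prefix_target is not used by the function body above):

# Sketch: load an Explanation row, fetch its cached datasets, and compare the
# metrics before and after retraining on the randomised features.
retrain_exp = Explanation.objects.get(pk=1)  # hypothetical primary key
training_df_old, test_df_old = get_encoded_logs(retrain_exp.job)
results = explain(retrain_exp, training_df_old, test_df_old,
                  explanation_target=('prefix_1', 2),  # assumed (column, value) pair, cf. Example #4
                  prefix_target='prefix_1')  # assumed; unused in the function body
print(results['Initial result'])
print(results['Retrain result'])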
Example #2
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)

        else:
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                         test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)

            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    test_name = str(int(100 -
                                        (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name +
                        test_name)  # TODO: find better naming policy
                    job.split.save()  # persist the modified split row as well
                    job.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
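A minimal usage sketch for get_encoded_logs, assuming a persisted Job row; the primary key is a placeholder borrowed from Example #4:

job = Job.objects.get(pk=439)  # placeholder primary key
training_df, test_df = get_encoded_logs(job, use_cache=True)
print(training_df.shape, test_df.shape)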
Example #3
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')
    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError: pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError: pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(
                        EventLog(training_log),
                        train_name + '.xes'
                    )
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(
                        EventLog(test_log),
                        test_name + '.xes'
                    )
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.split.save()

                put_loaded_logs(job.split, training_log, test_log, additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
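The try/except blocks above implement file-backed cache invalidation: the database may still hold a row for a cached artefact whose file has been deleted, so the stale row is dropped and the call retries from scratch. A self-contained sketch of the same pattern, using hypothetical names (CACHE_PATH, rebuild) that are not from the project:

import os

CACHE_PATH = 'cache/labelled.txt'  # hypothetical cache location

def load_or_rebuild(rebuild):
    try:
        with open(CACHE_PATH) as f:  # may raise FileNotFoundError
            return f.read()
    except FileNotFoundError:  # cache invalidation, as above
        data = rebuild()  # recompute the artefact
        os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
        with open(CACHE_PATH, 'w') as f:
            f.write(data)
        return data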
Example #4
    def handle(self, *args, **kwargs):
        TARGET_JOB = 439
        initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

        # todo: return performances
        print('Initial Job:', initial_job_obj.evaluation.classificationmetrics)  # TODO future bug

        training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
        training_df = training_df_old.copy()
        test_df = test_df_old.copy()

        # todo: what should I randomise?
        TARGETS = [
            [('prefix_1', 2)],  # <- simple pattern
            [('prefix_2', 3)],  # <- simple pattern
            [
                ('prefix_3', 2),
                ('prefix_4', 3),
            ]  # <- complex pattern
        ]
        for target in TARGETS:
            if len(target) == 1:
                target = target[0]
                for df in [training_df, test_df]:
                    m_col = df[target[0]]
                    del df[target[0]]
                    target_values1 = list(set(m_col.values))
                    # replace the targeted value with a random draw from the
                    # column's observed values
                    df[target[0]] = m_col.apply(
                        lambda x: x if x != target[1]
                        else random.choice(target_values1))
            elif len(target) > 1:
                for df in [training_df, test_df]:
                    m_col = df[[column for column, _ in target]]
                    possible_values = {}
                    for column, _ in target:
                        possible_values[column] = list(set(df[column]))
                        del df[column]
                    # only rows matching the whole pattern are randomised; any
                    # mismatching column leaves the row untouched
                    df[[column for column, _ in target]] = m_col.apply(
                        lambda x: x
                        if any(x[column] != value for column, value in target)
                        else Series({column: random.choice(possible_values[column])
                                     for column, value in target}),
                        axis=1)
            else:
                raise Exception('target list with unexpected value')

        assert not training_df.equals(training_df_old)
        assert not test_df.equals(test_df_old)

        # todo: save new dataset in memory and create split to use it
        initial_split_obj = initial_job_obj.split
        new_split = duplicate_orm_row(initial_split_obj)
        train_log = duplicate_orm_row(new_split.train_log)
        test_log = duplicate_orm_row(new_split.test_log)

        # TODO future bug creates shadows
        train_log.name = 'RETRAIN' + train_log.name
        train_log.path = 'cache/log_cache/' + train_log.name
        train_log.properties = {}
        test_log.name = 'RETRAIN' + test_log.name
        test_log.path = 'cache/log_cache/' + test_log.name
        test_log.properties = {}

        new_split.train_log = train_log
        new_split.test_log = test_log
        new_split.additional_columns = None
        new_split.save()

        prediction_job = create_prediction_job(
            initial_job_obj, initial_job_obj.encoding.prefix_length)
        prediction_job.split = new_split
        prediction_job.split.save()
        prediction_job.save()

        put_labelled_logs(prediction_job, training_df, test_df)

        # todo: build model
        prediction_task(prediction_job.id, do_publish_result=False)
        prediction_job.refresh_from_db()

        # todo: return performances
        print('Retrain Job:', prediction_job.evaluation.classificationmetrics)

        print('Done, cheers!')
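The feature-randomisation step in the loop above can be reproduced in isolation with plain pandas. A self-contained sketch of the simple-pattern case on toy data (not from the source); as in the original code, the random draw may happen to return the very value being replaced:

import random
import pandas as pd

# Wherever the column holds the targeted value (here 2), replace it with a
# random draw from the column's observed domain.
df = pd.DataFrame({'prefix_1': [2, 3, 2, 5]})
domain = list(set(df['prefix_1'].values))
df['prefix_1'] = df['prefix_1'].apply(
    lambda x: x if x != 2 else random.choice(domain))
print(df)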