Пример #1
0
def explain(retrain_exp: Explanation, training_df_old, test_df_old,
            explanation_target, prefix_target):
    """Measure the impact of a feature by randomising it and retraining.

    Randomises ``explanation_target`` in copies of the training/test
    frames, persists them under a fresh split, trains a new prediction
    job on them, and returns the before/after classification metrics.

    :param retrain_exp: Explanation whose job provides the baseline model
    :param training_df_old: original encoded training DataFrame
    :param test_df_old: original encoded test DataFrame
    :param explanation_target: feature to randomise
    :param prefix_target: unused here; kept for interface compatibility
    :return: dict with the initial and retrained metric dictionaries
    """
    original_job = retrain_exp.job
    # todo: return performances
    baseline_metrics = dict(
        original_job.evaluation.classificationmetrics.to_dict())  # TODO future bug

    train_df, test_df = randomise_features(
        training_df_old.copy(), test_df_old.copy(), explanation_target)
    # randomisation must actually have changed something
    assert not train_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    randomised_split = save_randomised_set(original_job.split)

    retrain_job = create_prediction_job(
        original_job, original_job.encoding.prefix_length)
    retrain_job.split = randomised_split
    retrain_job.split.save()
    retrain_job.evaluation = None
    retrain_job.save()
    # assert retrain_job.split.id != original_job.split.id

    put_labelled_logs(retrain_job, train_df, test_df)

    # todo: build model
    prediction_task(retrain_job.id, do_publish_result=False)
    retrain_job.refresh_from_db()

    # todo: return performances
    return {
        "Initial result": baseline_metrics,
        "Retrain result":
            retrain_job.evaluation.classificationmetrics.to_dict()
    }
Пример #2
0
 def test_hyperopt(self):
     """Hyperopt optimisation changes the stored random-forest config.

     Builds a classification job with a HYPEROPT optimiser (2 evals),
     runs the prediction task, and asserts the persisted random-forest
     configuration differs from the library defaults.
     """
     job = Job.objects.create(
         split=create_test_split(split_type=SplitTypes.SPLIT_SINGLE.value,
                                 original_log=create_test_log(
                                     log_name=general_example_filename,
                                     log_path=general_example_filepath)),
         encoding=create_test_encoding(
             value_encoding=ValueEncodings.SIMPLE_INDEX.value,
             prefix_length=3,
             padding=False),
         labelling=create_test_labelling(
             label_type=LabelTypes.REMAINING_TIME.value),
         clustering=create_test_clustering(
             clustering_type=ClusteringMethods.KMEANS.value),
         predictive_model=create_test_predictive_model(
             predictive_model=PredictiveModels.CLASSIFICATION.value,
             prediction_method=ClassificationMethods.RANDOM_FOREST.value),
         hyperparameter_optimizer=create_test_hyperparameter_optimizer(
             hyperoptim_type=HyperparameterOptimizationMethods.HYPEROPT.value,
             performance_metric=HyperOptLosses.ACC.value,
             max_evals=2))
     prediction_task(job.pk)
     # Bug fix: reload the job that was actually created instead of
     # fetching pk=1, which only works when the table happens to be empty.
     job.refresh_from_db()
     # Idiom: getattr + assertNotEqual instead of __getattribute__ and
     # assertFalse(a == b) — same comparison, clearer failure message.
     self.assertNotEqual(
         classification_random_forest(),
         getattr(job.predictive_model.classification,
                 ClassificationMethods.RANDOM_FOREST.value.lower()).to_dict())
Пример #3
0
    def test_prediction_task(self):
        """A default test job runs to completion and stores an evaluation."""
        test_job = create_test_job()
        prediction_task(test_job.id)
        test_job.refresh_from_db()

        self.assertEqual('completed', test_job.status)
        self.assertNotEqual({}, test_job.evaluation)
Пример #4
0
    def test_prediction_task(self):
        """Running the task against job pk=1 completes it with an evaluation."""
        prediction_task(1)
        completed_job = Job.objects.get(pk=1)

        self.assertEqual('completed', completed_job.status)
        self.assertNotEqual({}, completed_job.evaluation)
def replay_prediction_task(replay_prediction_job, training_initial_job, log):
    """Run a replay prediction over *log*, retraining first when needed.

    If the job's encoding prefix length does not match the longest trace
    in the log, a new prediction job is trained at that prefix length and
    this task recurses on a duplicate of it; otherwise the replay result
    is computed and stored on the job.

    :param replay_prediction_job: Job to execute; mutated and saved
    :param training_initial_job: Job whose configuration seeds retraining
    :param log: iterable of traces to replay
    """
    logger.info("Start replay_prediction task ID {}".format(
        replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        # the longest trace dictates the prefix length the model must support
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            # retrain a model at the matching prefix length, then retry
            prediction_job = create_prediction_job(training_initial_job,
                                                   max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            # reuse the original job's split on the duplicate
            new_replay_prediction_job.split = Split.objects.filter(
                pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job,
                                   log)
            return
        result = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = {'result': str(result)}
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        # persist the final status and notify listeners even on failure
        replay_prediction_job.save()
        publish(replay_prediction_job)
Пример #6
0
    def test_explain(self):
        """SHAP explanation and temporal stability both return dicts."""
        double_split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            test_size=0.2,
            original_log=None,
            train_log=create_test_log(
                log_name='train_explainability.xes',
                log_path='cache/log_cache/test_logs/train_explainability.xes'),
            test_log=create_test_log(
                log_name='test_explainability.xes',
                log_path='cache/log_cache/test_logs/test_explainability.xes'))

        tree_model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.DECISION_TREE.value)

        job = create_test_job(
            split=double_split,
            encoding=create_test_encoding(
                prefix_length=4,
                padding=True,
                value_encoding=ValueEncodings.SIMPLE_INDEX.value),
            labelling=create_test_labelling(
                label_type=LabelTypes.ATTRIBUTE_STRING.value,
                attribute_name='label'),
            clustering=None,
            create_models=True,
            predictive_model=tree_model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None)

        prediction_task(job.id, do_publish_result=False)
        job.refresh_from_db()

        shap_exp, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.SHAP.value,
            split=double_split,
            predictive_model=tree_model,
            job=job,
            results={})

        explanation_target = '2_101'
        prefix_target = 'prefix_1'

        train_df, test_df = get_encoded_logs(job)
        explanation = explain(shap_exp, train_df, test_df,
                              explanation_target, prefix_target)

        train_df, test_df = get_encoded_logs(job)
        explanation_temp = shap_temporal_stability(shap_exp, train_df,
                                                   test_df,
                                                   explanation_target)

        self.assertTrue(type(explanation) is dict)
        self.assertTrue(type(explanation_temp) is dict)
Пример #7
0
    def test_create_models_config_missing(self):
        """The task must tolerate a job missing its create_models field."""
        broken_job = create_test_job()
        del broken_job.create_models  # TODO fixme should we add this field?
        broken_job.save()

        prediction_task(broken_job.id)
        broken_job.refresh_from_db()

        self.assertEqual('completed', broken_job.status)
        self.assertNotEqual({}, broken_job.evaluation)
 def test_prediction_task_save_model_clustering(self):
     """With clustering enabled, both model paths get persisted."""
     clustered_job = create_test_job(
         create_models=True,
         clustering=create_test_clustering(
             clustering_type=ClusteringMethods.KMEANS.value))

     prediction_task(clustered_job.id)
     clustered_job.refresh_from_db()

     self.assertEqual('completed', clustered_job.status)
     self.assertIsNotNone(clustered_job.predictive_model.model_path)
     self.assertIsNotNone(clustered_job.clustering.model_path)
Пример #9
0
    def test_create_models_config_missing(self):
        """Job pk=1 without create_models still finishes successfully."""
        stale_job = Job.objects.get(pk=1)
        del stale_job.create_models  # TODO fixme should we add this field?
        stale_job.save()
        prediction_task(1)

        refreshed = Job.objects.get(pk=1)
        self.assertEqual('completed', refreshed.status)
        self.assertNotEqual({}, refreshed.evaluation)
Пример #10
0
    def test_replay_prediction(self):
        """Replay prediction runs end to end on a trained test job."""
        trained_job = create_test_job(create_models=True)
        runtime_log = create_test_log(
            log_name='runtime_example.xes',
            log_path='cache/log_cache/test_logs/runtime_test.xes')
        replay_log = get_log(runtime_log)

        prediction_task(trained_job.id)
        trained_job.refresh_from_db()
        replay_prediction_task(trained_job, trained_job, replay_log)
Пример #11
0
    def test_runtime(self):
        """Runtime task works after swapping in a double split of the runtime log."""
        trained_job = create_test_job(create_models=True)
        runtime_log = create_test_log(
            log_name='runtime_example.xes',
            log_path='cache/log_cache/test_logs/runtime_test.xes')

        prediction_task(trained_job.id)
        trained_job.refresh_from_db()

        trained_job.split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            train_log=runtime_log,
            test_log=runtime_log)
        runtime_task(trained_job)
Пример #12
0
    def test_update(self):
        """Incremental update: clone a trained job and feed it to update().

        Trains a base job, duplicates it as an UPDATE job whose
        incremental_train points back at the original, then asks update()
        to generate the follow-up job from an explicit payload.
        """
        job = create_test_job()
        prediction_task(job.id)

        # clone the trained job; the clone becomes the UPDATE job that
        # incrementally trains on top of the original
        job2 = duplicate_orm_row(job)
        job.refresh_from_db()
        job2.incremental_train = job
        job2.type = JobTypes.UPDATE.value
        job2.save()

        initial_job = job2  #.to_dict()

        # request the update run with a full configuration payload
        # NOTE(review): 'split_id': 1 assumes this is the first split in the
        # test DB — confirm against the test fixture setup
        generated_job = update(split=job.split,
                               payload={
                                   'type': 'classification',
                                   'split_id': 1,
                                   'config': {
                                       'clusterings': ['noCluster'],
                                       'encodings': ['simpleIndex'],
                                       'encoding': {
                                           'padding': False,
                                           'prefix_length': 1,
                                           'generation_type': 'only',
                                           'add_remaining_time': False,
                                           'add_elapsed_time': False,
                                           'add_executed_events': False,
                                           'add_resources_used': False,
                                           'add_new_traces': False,
                                           'features': [],
                                       },
                                       'create_models': False,
                                       'methods': ['randomForest'],
                                       'kmeans': {},
                                       'incremental_train': [job.id],
                                       'hyperparameter_optimizer': {
                                           'algorithm_type': 'tpe',
                                           'max_evaluations': 10,
                                           'performance_metric': 'rmse',
                                           'type': 'none',
                                       },
                                       'labelling': {
                                           'type': 'next_activity',
                                           'attribute_name': '',
                                           'threshold_type': 'threshold_mean',
                                           'threshold': 0,
                                       }
                                   }
                               })[0]  #.to_dict()
Пример #13
0
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """Run a replay prediction over *log*, retraining first when the model's
    prefix length does not match the longest trace.

        :param replay_prediction_job: Job to execute; mutated and saved
        :param training_initial_job: Job whose configuration seeds retraining
        :param log: event log whose traces are replayed
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        # the longest trace dictates the prefix length the model must support
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            # retrain a model at the matching prefix length, then retry
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        # persist the final status and notify listeners even on failure
        replay_prediction_job.save()
        publish(replay_prediction_job)
 def test_prediction_task_save_model(self):
     """When create_models is set, the trained model path is stored."""
     model_job = create_test_job(create_models=True)
     prediction_task(model_job.id)
     model_job.refresh_from_db()

     self.assertEqual('completed', model_job.status)
     self.assertIsNotNone(model_job.predictive_model.model_path)
Пример #15
0
def replay_prediction_task(replay_prediction_job: Job,
                           training_initial_job: Job, log: Log):
    """Run a replay prediction over *log*, retraining first when the model's
    prefix length does not match the longest trace.

        :param replay_prediction_job: Job to execute; mutated and saved
        :param training_initial_job: Job whose configuration seeds retraining
        :param log: event log whose traces are replayed
    """
    logger.info("Start replay_prediction task ID {}".format(
        replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        # the longest trace dictates the prefix length the model must support
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            # retrain a model at the matching prefix length, then retry
            prediction_job = create_prediction_job(training_initial_job,
                                                   max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            # new_replay_prediction_job = duplicate_orm_row(prediction_job)  #todo: replace with simple CREATE
            # field-by-field copy of the trained job (manual duplicate)
            new_replay_prediction_job = Job.objects.create(
                created_date=prediction_job.created_date,
                modified_date=prediction_job.modified_date,
                error=prediction_job.error,
                status=prediction_job.status,
                type=prediction_job.type,
                create_models=prediction_job.create_models,
                case_id=prediction_job.case_id,
                event_number=prediction_job.event_number,
                gold_value=prediction_job.gold_value,
                results=prediction_job.results,
                parent_job=prediction_job.parent_job,
                split=prediction_job.split,
                encoding=prediction_job.encoding,
                labelling=prediction_job.labelling,
                clustering=prediction_job.clustering,
                predictive_model=prediction_job.predictive_model,
                evaluation=prediction_job.evaluation,
                hyperparameter_optimizer=prediction_job.
                hyperparameter_optimizer,
                incremental_train=prediction_job.incremental_train)
            # repoint the copy at the original job's split and lineage
            new_replay_prediction_job.split = Split.objects.filter(
                pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job,
                                   log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(
            replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        # persist the final status and notify listeners even on failure
        replay_prediction_job.save()
        publish(replay_prediction_job)
Пример #16
0
    def test_explain(self):
        """ICE explanation for a prefix column matches the expected table."""
        double_split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            test_size=0.2,
            original_log=None,
            train_log=create_test_log(
                log_name='train_explainability.xes',
                log_path='cache/log_cache/test_logs/train_explainability.xes'),
            test_log=create_test_log(
                log_name='test_explainability.xes',
                log_path='cache/log_cache/test_logs/test_explainability.xes'))

        tree_model = create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.DECISION_TREE.value)

        job = create_test_job(
            split=double_split,
            encoding=create_test_encoding(
                prefix_length=4,
                padding=True,
                value_encoding=ValueEncodings.SIMPLE_INDEX.value),
            labelling=create_test_labelling(
                label_type=LabelTypes.ATTRIBUTE_STRING.value,
                attribute_name='label'),
            clustering=None,
            create_models=True,
            predictive_model=tree_model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None)

        prediction_task(job.id, do_publish_result=False)
        job.refresh_from_db()

        ice_exp, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.ICE.value,
            split=double_split,
            predictive_model=tree_model,
            job=job,
            results={})

        train_df, test_df = get_encoded_logs(job)

        explanation = explain(ice_exp,
                              train_df,
                              test_df,
                              'prefix_2',
                              prefix_target=None)

        expected = [{
            'value': 'Contact Hospital',
            'label': 1.2962962962962963,
            'count': 351
        }, {
            'value': 'Create Questionnaire',
            'label': 1.5526992287917738,
            'count': 1167
        }, {
            'value': 'High Insurance Check',
            'label': 1.2667660208643816,
            'count': 671
        }]

        self.assertEqual(expected, explanation)
Пример #17
0
    def test_update(self):
        """Incremental update with a manually duplicated job.

        Trains a base job, copies it field-by-field into a new UPDATE job
        whose incremental_train points back at the original, then asks
        update() to generate the follow-up job from an explicit payload.
        """
        job = create_test_job()
        prediction_task(job.id)

        # job2 = duplicate_orm_row(job) #todo: replace with simple CREATE
        # field-by-field copy of the trained job (manual duplicate)
        job2 = Job.objects.create(
            created_date=job.created_date,
            modified_date=job.modified_date,
            error=job.error,
            status=job.status,
            type=job.type,
            create_models=job.create_models,
            case_id=job.case_id,
            event_number=job.event_number,
            gold_value=job.gold_value,
            results=job.results,
            parent_job=job.parent_job,
            split=job.split,
            encoding=job.encoding,
            labelling=job.labelling,
            clustering=job.clustering,
            predictive_model=job.predictive_model,
            evaluation=job.evaluation,
            hyperparameter_optimizer=job.hyperparameter_optimizer,
            incremental_train=job.incremental_train)
        job.refresh_from_db()
        job2.incremental_train = job
        job2.type = JobTypes.UPDATE.value
        job2.save()

        initial_job = job2  #.to_dict()

        # request the update run with a full configuration payload
        # NOTE(review): 'split_id': 1 assumes this is the first split in the
        # test DB — confirm against the test fixture setup
        generated_job = update(split=job.split,
                               payload={
                                   'type': 'classification',
                                   'split_id': 1,
                                   'config': {
                                       'clusterings': ['noCluster'],
                                       'encodings': ['simpleIndex'],
                                       'encoding': {
                                           'padding': False,
                                           'prefix_length': 1,
                                           'generation_type': 'only',
                                           'add_remaining_time': False,
                                           'add_elapsed_time': False,
                                           'add_executed_events': False,
                                           'add_resources_used': False,
                                           'add_new_traces': False,
                                           'features': [],
                                       },
                                       'create_models': False,
                                       'methods': ['randomForest'],
                                       'kmeans': {},
                                       'incremental_train': [job.id],
                                       'hyperparameter_optimizer': {
                                           'algorithm_type': 'tpe',
                                           'max_evaluations': 10,
                                           'performance_metric': 'rmse',
                                           'type': 'none',
                                       },
                                       'labelling': {
                                           'type': 'next_activity',
                                           'attribute_name': '',
                                           'threshold_type': 'threshold_mean',
                                           'threshold': 0,
                                       }
                                   }
                               })[0]  #.to_dict()
Пример #18
0
    def handle(self, *args, **kwargs):
        """Management-command entry point: randomise selected prefix columns
        of job 439's encoded logs, retrain on the randomised data, and print
        before/after classification metrics.
        """
        # hard-coded job under study; assumes it exists and has an evaluation
        TARGET_JOB = 439
        initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

        # todo: return performances
        print('Initial Job:', initial_job_obj.evaluation.classificationmetrics
              )  # TODO future bug

        training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
        training_df = training_df_old.copy()
        test_df = test_df_old.copy()

        # todo: what should I randomise?
        # each entry is a list of (column, value) pairs: rows where ALL the
        # pairs match get those columns re-drawn from the column's value set
        TARGETS = [
            [('prefix_1', 2)],  # <- simple pattern
            [('prefix_2', 3)],  # <- simple pattern
            [
                ('prefix_3', 2),
                ('prefix_4', 3),
            ]  # <- complex pattern
        ]
        for target in TARGETS:
            if len(target) == 1:
                # single-column pattern: replace matching cells with a random
                # choice from the column's observed values
                target = target[0]
                for df in [training_df, test_df]:
                    m_col = df[target[0]]
                    del df[target[0]]
                    target_values1 = list(set(m_col.values))
                    df[target[0]] = m_col.apply(lambda x: x if (x != target[
                        1]) else random.choice(target_values1))
            elif len(target) > 1:
                # multi-column pattern: only rows matching EVERY (column,
                # value) pair are re-drawn, one random value per column
                for df in [training_df, test_df]:
                    m_col = df[[column for column, _ in target]]
                    possible_values = {}
                    for column, _ in target:
                        possible_values[column] = list(set(df[column]))
                        del df[column]
                    df[[column for column, _ in target
                        ]] = m_col.apply(lambda x: x if any(
                            [x[column] != value
                             for column, value in target]) else Series({
                                 column: random.choice(possible_values[column])
                                 for column, value in target
                             }),
                                         axis=1)
            else:
                raise Exception('target list with unexpected value')

        # randomisation must actually have changed something
        assert not training_df.equals(training_df_old)
        assert not test_df.equals(test_df_old)

        # todo: save new dataset in memory and create split to use it
        initial_split_obj = initial_job_obj.split
        new_split = duplicate_orm_row(initial_split_obj)
        train_log = duplicate_orm_row(new_split.train_log)
        test_log = duplicate_orm_row(new_split.test_log)

        # TODO future bug creates shadows
        # rename the duplicated logs so they don't collide with the originals
        train_log.name = 'RETRAIN' + train_log.name
        train_log.path = 'cache/log_cache/' + train_log.name
        train_log.properties = {}
        test_log.name = 'RETRAIN' + test_log.name
        test_log.path = 'cache/log_cache/' + test_log.name
        test_log.properties = {}

        new_split.train_log = train_log
        new_split.test_log = test_log
        new_split.additional_columns = None
        new_split.save()

        # train a fresh job of the same shape on the randomised split
        prediction_job = create_prediction_job(
            initial_job_obj, initial_job_obj.encoding.prefix_length)
        prediction_job.split = new_split
        prediction_job.split.save()
        prediction_job.save()

        put_labelled_logs(prediction_job, training_df, test_df)

        # todo: build model
        prediction_task(prediction_job.id, do_publish_result=False)
        prediction_job.refresh_from_db()

        # todo: return performances
        print('Retrain Job:', prediction_job.evaluation.classificationmetrics)

        print('Done, cheers!')