def explain(retrain_exp: Explanation, training_df_old, test_df_old, explanation_target, prefix_target):
    """Retrain the explanation's model on feature-randomised data and compare metrics.

    :param retrain_exp: Explanation whose ``job`` provides the trained baseline model
    :param training_df_old: original encoded training DataFrame (left untouched)
    :param test_df_old: original encoded test DataFrame (left untouched)
    :param explanation_target: feature/column to randomise before retraining
    :param prefix_target: unused here; kept for interface parity with other explainers
    :return: dict mapping "Initial result"/"Retrain result" to classification-metric dicts
    """
    initial_job_obj = retrain_exp.job

    # todo: return performances
    # Baseline metrics of the already-trained job. (fixed typo: inital_result)
    initial_result = dict(initial_job_obj.evaluation.classificationmetrics.to_dict())  # TODO future bug

    # Randomise the target feature on copies so the caller's frames stay intact.
    train_df, test_df = randomise_features(training_df_old.copy(), test_df_old.copy(), explanation_target)
    assert not train_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    # Persist the randomised data under a fresh split and retrain from scratch.
    new_split = save_randomised_set(initial_job_obj.split)
    prediction_job = create_prediction_job(initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.evaluation = None  # force re-evaluation on the new data
    prediction_job.save()
    # assert prediction_job.split.id != initial_job_obj.split.id

    put_labelled_logs(prediction_job, train_df, test_df)

    # todo: build model
    # Train synchronously; suppress result publishing for this internal run.
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    return {
        "Initial result": initial_result,
        "Retrain result": prediction_job.evaluation.classificationmetrics.to_dict()
    }
def test_hyperopt(self):
    """Hyperopt tuning should move the classifier config away from the defaults."""
    job = Job.objects.create(
        split=create_test_split(
            split_type=SplitTypes.SPLIT_SINGLE.value,
            original_log=create_test_log(
                log_name=general_example_filename,
                log_path=general_example_filepath)),
        encoding=create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            prefix_length=3,
            padding=False),
        labelling=create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value),
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value),
        hyperparameter_optimizer=create_test_hyperparameter_optimizer(
            hyperoptim_type=HyperparameterOptimizationMethods.HYPEROPT.value,
            performance_metric=HyperOptLosses.ACC.value,
            max_evals=2))
    prediction_task(job.pk)
    # Re-read the row we actually created; the old Job.objects.get(pk=1)
    # breaks whenever the new job is not assigned primary key 1.
    job.refresh_from_db()
    self.assertFalse(
        classification_random_forest() ==
        job.predictive_model.classification.__getattribute__(
            ClassificationMethods.RANDOM_FOREST.value.lower()).to_dict())
def test_prediction_task(self):
    """Running the prediction task completes the job and fills its evaluation."""
    created_job = create_test_job()
    prediction_task(created_job.id)
    created_job.refresh_from_db()
    self.assertNotEqual({}, created_job.evaluation)
    self.assertEqual('completed', created_job.status)
def test_prediction_task(self):
    """The fixture job (pk=1) completes and produces a non-empty evaluation."""
    # Fetch once and reuse the pk instead of hard-coding `1` in two places;
    # refresh_from_db re-reads the same row after the task ran.
    job = Job.objects.get(pk=1)
    prediction_task(job.pk)
    job.refresh_from_db()
    self.assertEqual('completed', job.status)
    self.assertNotEqual({}, job.evaluation)
def replay_prediction_task(replay_prediction_job, training_initial_job, log):
    """Run a replay prediction for `log` using the model behind `training_initial_job`.

    If the replay job's prefix length is shorter than the longest trace in the
    log, a new prediction job is first trained at the required length and the
    replay is retried recursively on a duplicate of that trained job.

    :param replay_prediction_job: Job to run the replay on (status/results are
        written back and the job is published in all cases)
    :param training_initial_job: Job providing the training configuration
    :param log: event log; iterated for traces — presumably a pm4py/OPyenXes
        log object, confirm against callers
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        # Longest trace determines the prefix length the model must support.
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            # Model does not cover the longest trace: train a new prediction
            # job at max_len, then replay on a duplicate pointing at the
            # original split. Note: the duplicate's type/status are overwritten
            # but parent_job is NOT carried over here (other variants do set it
            # — NOTE(review): confirm whether that omission is intentional).
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result = replay_prediction_calculate(replay_prediction_job, log)
        # Results are stored stringified under a single 'result' key.
        replay_prediction_job.results = {'result': str(result)}
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        # Always persist the final status and notify listeners, even on error.
        replay_prediction_job.save()
        publish(replay_prediction_job)
def test_explain(self):
    """SHAP explain and its temporal-stability variant both return dicts."""
    train_log = create_test_log(
        log_name='train_explainability.xes',
        log_path='cache/log_cache/test_logs/train_explainability.xes')
    test_log = create_test_log(
        log_name='test_explainability.xes',
        log_path='cache/log_cache/test_logs/test_explainability.xes')
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=train_log,
        test_log=test_log)

    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)

    encoding = create_test_encoding(
        prefix_length=4,
        padding=True,
        value_encoding=ValueEncodings.SIMPLE_INDEX.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label')
    job = create_test_job(
        split=split,
        encoding=encoding,
        labelling=labelling,
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    shap_exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.SHAP.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})[0]

    explanation_target = '2_101'
    prefix_target = 'prefix_1'

    training_df, test_df = get_encoded_logs(job)
    explanation_result = explain(shap_exp, training_df, test_df, explanation_target, prefix_target)

    training_df, test_df = get_encoded_logs(job)
    temporal_result = shap_temporal_stability(shap_exp, training_df, test_df, explanation_target)

    self.assertTrue(type(explanation_result) is dict)
    self.assertTrue(type(temporal_result) is dict)
def test_create_models_config_missing(self):
    """Prediction still completes when the create_models flag is removed."""
    fresh_job = create_test_job()
    # TODO fixme should we add this field?
    del fresh_job.create_models
    fresh_job.save()
    prediction_task(fresh_job.id)
    fresh_job.refresh_from_db()
    self.assertNotEqual({}, fresh_job.evaluation)
    self.assertEqual('completed', fresh_job.status)
def test_prediction_task_save_model_clustering(self):
    """With create_models and KMeans clustering, both model paths get persisted."""
    clustering = create_test_clustering(clustering_type=ClusteringMethods.KMEANS.value)
    clustered_job = create_test_job(create_models=True, clustering=clustering)
    prediction_task(clustered_job.id)
    clustered_job.refresh_from_db()
    self.assertEqual('completed', clustered_job.status)
    self.assertIsNotNone(clustered_job.clustering.model_path)
    self.assertIsNotNone(clustered_job.predictive_model.model_path)
def test_create_models_config_missing(self):
    """Fixture job (pk=1) still completes when the create_models flag is removed."""
    job = Job.objects.get(pk=1)
    # TODO fixme should we add this field?
    del job.create_models
    job.save()
    # Use the fetched job's pk instead of hard-coding `1` again, and
    # refresh_from_db rather than a second Job.objects.get(pk=1) lookup.
    prediction_task(job.pk)
    job.refresh_from_db()
    self.assertEqual('completed', job.status)
    self.assertNotEqual({}, job.evaluation)
def test_replay_prediction(self):
    """Replay prediction runs against a freshly trained job without raising."""
    trained_job = create_test_job(create_models=True)
    log_model = create_test_log(
        log_name='runtime_example.xes',
        log_path='cache/log_cache/test_logs/runtime_test.xes')
    event_log = get_log(log_model)
    prediction_task(trained_job.id)
    trained_job.refresh_from_db()
    # Same job serves as both the replay job and the training job.
    replay_prediction_task(trained_job, trained_job, event_log)
def test_runtime(self):
    """Runtime task runs on a trained job re-pointed at a runtime log split."""
    trained_job = create_test_job(create_models=True)
    log_model = create_test_log(
        log_name='runtime_example.xes',
        log_path='cache/log_cache/test_logs/runtime_test.xes')
    prediction_task(trained_job.id)
    trained_job.refresh_from_db()
    # Swap the split so both halves point at the runtime log.
    trained_job.split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        train_log=log_model,
        test_log=log_model)
    runtime_task(trained_job)
def test_update(self):
    """Incremental-update flow: train a job, clone it as an UPDATE job,
    then generate the update job from a request payload."""
    job = create_test_job()
    prediction_task(job.id)
    # Clone the trained job row; the clone becomes the UPDATE job that
    # incrementally trains on top of the original.
    job2 = duplicate_orm_row(job)
    job.refresh_from_db()
    job2.incremental_train = job
    job2.type = JobTypes.UPDATE.value
    job2.save()
    initial_job = job2  #.to_dict()
    # Build the comparison job from a raw API-style payload.
    # NOTE(review): 'split_id': 1 assumes the split got pk 1 — confirm fixture.
    generated_job = update(split=job.split, payload={
        'type': 'classification',
        'split_id': 1,
        'config': {
            'clusterings': ['noCluster'],
            'encodings': ['simpleIndex'],
            'encoding': {
                'padding': False,
                'prefix_length': 1,
                'generation_type': 'only',
                'add_remaining_time': False,
                'add_elapsed_time': False,
                'add_executed_events': False,
                'add_resources_used': False,
                'add_new_traces': False,
                'features': [],
            },
            'create_models': False,
            'methods': ['randomForest'],
            'kmeans': {},
            'incremental_train': [job.id],
            'hyperparameter_optimizer': {
                'algorithm_type': 'tpe',
                'max_evaluations': 10,
                'performance_metric': 'rmse',
                'type': 'none',
            },
            'labelling': {
                'type': 'next_activity',
                'attribute_name': '',
                'threshold_type': 'threshold_mean',
                'threshold': 0,
            }
        }
    })[0]  #.to_dict()
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """Run a replay prediction for a portion of a trace against the server's model.

    If the replay job's prefix length does not match the longest trace in the
    log, a new prediction job is trained at that length and the replay is
    retried recursively on a duplicate of the trained job.

    :param replay_prediction_job: Job to run the replay on; status, results and
        event_number are written back and the job is always published
    :param training_initial_job: Job supplying the training configuration
    :param log: event log whose traces are replayed
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        # The longest trace dictates the prefix length the model must support.
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            # Train a fresh prediction job at the required prefix length, then
            # replay on a duplicate re-pointed at the original split.
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        # Persist the final status and notify listeners even on failure.
        replay_prediction_job.save()
        publish(replay_prediction_job)
def test_prediction_task_save_model(self):
    """When create_models is on, a completed job persists its model path."""
    trained_job = create_test_job(create_models=True)
    prediction_task(trained_job.id)
    trained_job.refresh_from_db()
    self.assertIsNotNone(trained_job.predictive_model.model_path)
    self.assertEqual('completed', trained_job.status)
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """Run a replay prediction for a portion of a trace against the server's model.

    If the replay job's prefix length does not match the longest trace in the
    log, a new prediction job is trained at that length and the replay is
    retried recursively on an explicit field-by-field copy of the trained job.

    :param replay_prediction_job: Job to run the replay on; status, results and
        event_number are written back and the job is always published
    :param training_initial_job: Job supplying the training configuration
    :param log: event log whose traces are replayed
    """
    logger.info("Start replay_prediction task ID {}".format(
        replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        # The longest trace dictates the prefix length the model must support.
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            # Train a fresh prediction job at the required prefix length.
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            # new_replay_prediction_job = duplicate_orm_row(prediction_job)  #todo: replace with simple CREATE
            # Explicit copy of every Job field, replacing duplicate_orm_row.
            # NOTE(review): created_date/modified_date are copied verbatim —
            # confirm auto_now* fields don't override these on save.
            new_replay_prediction_job = Job.objects.create(
                created_date=prediction_job.created_date,
                modified_date=prediction_job.modified_date,
                error=prediction_job.error,
                status=prediction_job.status,
                type=prediction_job.type,
                create_models=prediction_job.create_models,
                case_id=prediction_job.case_id,
                event_number=prediction_job.event_number,
                gold_value=prediction_job.gold_value,
                results=prediction_job.results,
                parent_job=prediction_job.parent_job,
                split=prediction_job.split,
                encoding=prediction_job.encoding,
                labelling=prediction_job.labelling,
                clustering=prediction_job.clustering,
                predictive_model=prediction_job.predictive_model,
                evaluation=prediction_job.evaluation,
                hyperparameter_optimizer=prediction_job.hyperparameter_optimizer,
                incremental_train=prediction_job.incremental_train)
            # Re-point the copy at the original replay split and retry.
            new_replay_prediction_job.split = Split.objects.filter(
                pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(
            replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        # Persist the final status and notify listeners even on failure.
        replay_prediction_job.save()
        publish(replay_prediction_job)
def test_explain(self):
    """ICE explanation on a decision tree returns the known value/label/count triples."""
    train_log = create_test_log(
        log_name='train_explainability.xes',
        log_path='cache/log_cache/test_logs/train_explainability.xes')
    test_log = create_test_log(
        log_name='test_explainability.xes',
        log_path='cache/log_cache/test_logs/test_explainability.xes')
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=train_log,
        test_log=test_log)

    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)

    job = create_test_job(
        split=split,
        encoding=create_test_encoding(
            prefix_length=4,
            padding=True,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value),
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label'),
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    ice_exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.ICE.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})[0]

    train_df, test_df = get_encoded_logs(job)
    result = explain(ice_exp, train_df, test_df, 'prefix_2', prefix_target=None)

    expected = [
        {'value': 'Contact Hospital', 'label': 1.2962962962962963, 'count': 351},
        {'value': 'Create Questionnaire', 'label': 1.5526992287917738, 'count': 1167},
        {'value': 'High Insurance Check', 'label': 1.2667660208643816, 'count': 671},
    ]
    self.assertEqual(expected, result)
def test_update(self):
    """Incremental-update flow: train a job, copy it field-by-field as an
    UPDATE job, then generate the update job from a request payload."""
    job = create_test_job()
    prediction_task(job.id)
    # job2 = duplicate_orm_row(job)  #todo: replace with simple CREATE
    # Explicit copy of every Job field, replacing duplicate_orm_row.
    # NOTE(review): created_date/modified_date are copied verbatim — confirm
    # auto_now* fields don't override these on save.
    job2 = Job.objects.create(
        created_date=job.created_date,
        modified_date=job.modified_date,
        error=job.error,
        status=job.status,
        type=job.type,
        create_models=job.create_models,
        case_id=job.case_id,
        event_number=job.event_number,
        gold_value=job.gold_value,
        results=job.results,
        parent_job=job.parent_job,
        split=job.split,
        encoding=job.encoding,
        labelling=job.labelling,
        clustering=job.clustering,
        predictive_model=job.predictive_model,
        evaluation=job.evaluation,
        hyperparameter_optimizer=job.hyperparameter_optimizer,
        incremental_train=job.incremental_train)
    job.refresh_from_db()
    # The copy becomes the UPDATE job incrementally trained on the original.
    job2.incremental_train = job
    job2.type = JobTypes.UPDATE.value
    job2.save()
    initial_job = job2  #.to_dict()
    # Build the comparison job from a raw API-style payload.
    # NOTE(review): 'split_id': 1 assumes the split got pk 1 — confirm fixture.
    generated_job = update(split=job.split, payload={
        'type': 'classification',
        'split_id': 1,
        'config': {
            'clusterings': ['noCluster'],
            'encodings': ['simpleIndex'],
            'encoding': {
                'padding': False,
                'prefix_length': 1,
                'generation_type': 'only',
                'add_remaining_time': False,
                'add_elapsed_time': False,
                'add_executed_events': False,
                'add_resources_used': False,
                'add_new_traces': False,
                'features': [],
            },
            'create_models': False,
            'methods': ['randomForest'],
            'kmeans': {},
            'incremental_train': [job.id],
            'hyperparameter_optimizer': {
                'algorithm_type': 'tpe',
                'max_evaluations': 10,
                'performance_metric': 'rmse',
                'type': 'none',
            },
            'labelling': {
                'type': 'next_activity',
                'attribute_name': '',
                'threshold_type': 'threshold_mean',
                'threshold': 0,
            }
        }
    })[0]  #.to_dict()
def handle(self, *args, **kwargs):
    """Management command: randomise chosen prefix features of job 439's data,
    retrain on the randomised copy, and print both jobs' classification metrics."""
    TARGET_JOB = 439
    initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

    # todo: return performances
    print('Initial Job:', initial_job_obj.evaluation.classificationmetrics)  # TODO future bug

    training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
    # Work on copies; the originals are kept for the equality assertions below.
    training_df = training_df_old.copy()
    test_df = test_df_old.copy()

    # todo: what should I randomise?
    # Each entry is a list of (column, value) pairs: rows matching the pattern
    # get those columns replaced with random values from the column's domain.
    TARGETS = [
        [('prefix_1', 2)],  # <- simple pattern
        [('prefix_2', 3)],  # <- simple pattern
        [
            ('prefix_3', 2),
            ('prefix_4', 3),
        ]  # <- complex pattern
    ]
    for target in TARGETS:
        if len(target) == 1:
            # Single-column pattern: rebind `target` to the (column, value) pair.
            target = target[0]
            for df in [training_df, test_df]:
                m_col = df[target[0]]
                del df[target[0]]
                target_values1 = list(set(m_col.values))
                # Replace only cells equal to the target value with a random
                # draw from the column's observed values (may redraw the same value).
                df[target[0]] = m_col.apply(lambda x: x if (x != target[1]) else random.choice(target_values1))
        elif len(target) > 1:
            # Multi-column pattern: a row is randomised only when ALL columns
            # match their target values (any mismatch keeps the row as-is).
            for df in [training_df, test_df]:
                m_col = df[[column for column, _ in target]]
                possible_values = {}
                for column, _ in target:
                    possible_values[column] = list(set(df[column]))
                    del df[column]
                df[[column for column, _ in target]] = m_col.apply(
                    lambda x: x if any([x[column] != value for column, value in target])
                    else Series({column: random.choice(possible_values[column]) for column, value in target}),
                    axis=1)
        else:
            raise Exception('target list with unexpected value')
    # Randomisation must have actually changed something.
    assert not training_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    # todo: save new dataset in memory and create split to use it
    # Duplicate the split and its logs so the retrain run has its own rows.
    initial_split_obj = initial_job_obj.split
    new_split = duplicate_orm_row(initial_split_obj)
    train_log = duplicate_orm_row(new_split.train_log)
    test_log = duplicate_orm_row(new_split.test_log)  # TODO future bug creates shadows
    train_log.name = 'RETRAIN' + train_log.name
    train_log.path = 'cache/log_cache/' + train_log.name
    train_log.properties = {}
    test_log.name = 'RETRAIN' + test_log.name
    test_log.path = 'cache/log_cache/' + test_log.name
    test_log.properties = {}
    new_split.train_log = train_log
    new_split.test_log = test_log
    new_split.additional_columns = None
    new_split.save()

    prediction_job = create_prediction_job(
        initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.save()

    put_labelled_logs(prediction_job, training_df, test_df)

    # todo: build model
    # Retrain synchronously; suppress result publishing for this offline run.
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    print('Retrain Job:', prediction_job.evaluation.classificationmetrics)
    print('Done, cheers!')