def test_split_avoid_duplication(self):
    """Two identical jobs over the same split must reuse a single split row."""
    split = create_test_split(
        split_type=SplitTypes.SPLIT_SINGLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=self.log)

    def build_job():
        # Identical configuration on purpose: the split must not be duplicated.
        return create_test_job(
            split=split,
            encoding=self.encoding,
            labelling=self.labelling,
            clustering=None,
            create_models=False,
            predictive_model=self.predictive_model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None)

    first_job = build_job()
    get_encoded_logs(first_job)
    first_split_id = first_job.split.id

    second_job = build_job()
    get_encoded_logs(second_job)
    second_split_id = second_job.split.id

    self.assertEqual(first_split_id, second_split_id)
def test_get_encoded_logs_Loaded_cache(self):
    """Evicting either cached pickle must be transparent to get_encoded_logs."""
    job = create_test_job()
    baseline = get_encoded_logs(job, True)

    cache_entry = LoadedLog.objects.filter(split=job.split)[0]
    # Remove the train-side pickle first, then the test-side one, checking
    # after each eviction that the logs are rebuilt identically.
    for cached_path in (cache_entry.train_log_path, cache_entry.test_log_path):
        os.remove('cache/loaded_log_cache/' + get_digested(cached_path) + '.pickle')
        reloaded = get_encoded_logs(job, True)
        assert_frame_equal(baseline[0], reloaded[0])
        assert_frame_equal(baseline[1], reloaded[1])
def test_explain(self):
    """SHAP explain() and shap_temporal_stability() both return dicts."""
    train_log = create_test_log(
        log_name='train_explainability.xes',
        log_path='cache/log_cache/test_logs/train_explainability.xes')
    test_log = create_test_log(
        log_name='test_explainability.xes',
        log_path='cache/log_cache/test_logs/test_explainability.xes')
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=train_log,
        test_log=test_log)
    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)
    encoding = create_test_encoding(
        prefix_length=4,
        padding=True,
        value_encoding=ValueEncodings.SIMPLE_INDEX.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label')
    job = create_test_job(
        split=split,
        encoding=encoding,
        labelling=labelling,
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    # Train the model synchronously so the explainers have something to probe.
    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.SHAP.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})[0]

    explanation_target = '2_101'
    prefix_target = 'prefix_1'

    training_df_old, test_df_old = get_encoded_logs(job)
    explanation = explain(exp, training_df_old, test_df_old,
                          explanation_target, prefix_target)

    # Re-fetch the logs before the second explainer — presumably explain()
    # may alter the frames it was handed; TODO confirm.
    training_df_old, test_df_old = get_encoded_logs(job)
    explanation_temp = shap_temporal_stability(exp, training_df_old, test_df_old,
                                               explanation_target)

    self.assertTrue(type(explanation) is dict)
    self.assertTrue(type(explanation_temp) is dict)
def test_get_encoded_logs_cache(self):
    """Cached, uncached and re-cached encoded logs must all match exactly."""
    job = create_test_job()

    with_cache = get_encoded_logs(job, True)
    without_cache = get_encoded_logs(job, False)
    assert_frame_equal(with_cache[0], without_cache[0])
    assert_frame_equal(with_cache[1], without_cache[1])

    # A second cached read must return the same frames again.
    reloaded = get_encoded_logs(job, True)
    assert_frame_equal(with_cache[0], reloaded[0])
    assert_frame_equal(with_cache[1], reloaded[1])
def test_get_labelled_logs(self):
    """get_labelled_logs must agree with the freshly encoded logs."""
    job = create_test_job()
    encoded = get_encoded_logs(job)
    labelled = get_labelled_logs(job)
    # Compare train and test frames pairwise.
    for fresh_df, cached_df in zip(encoded, labelled):
        assert_frame_equal(fresh_df, cached_df)
def get_decoded_df(request, pk):
    """Return the first 100 rows of the job's training set, decoded back
    to human-readable values.

    :param request: incoming HTTP request (unused beyond routing)
    :param pk: primary key of the Job whose training log is decoded
    :return: Response containing the decoded training DataFrame
    """
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)

    # Cap the payload before decoding; trace_id is not an encoded feature.
    training_df = training_df[:100]
    # fixed: drop(['trace_id'], 1) used the positional axis argument,
    # which is removed in pandas 2.x — use the explicit `columns` keyword.
    training_df = training_df.drop(columns=['trace_id'])

    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    return Response(training_df, status=200)
def calculate(job: Job) -> (dict, dict):
    # TODO: add a filter for 'valid' configurations
    """Main entry point for calculations.

    Encodes the logs based on the given configuration and runs the
    selected task on them.

    :param job: job configuration
    :return: results and predictive_model split
    """
    logger.info("Start job {} with {}".format(job.type, get_run(job)))
    training_df, test_df = get_encoded_logs(job)
    return run_by_type(training_df, test_df, job)
def runtime_calculate(job: Job) -> dict:
    """Compute predictions for traces in the uncompleted logs.

    :param job: job configuration
    :return: runtime results
    """
    training_df, test_df = get_encoded_logs(job)
    # Predict over the union of the train and test frames.
    data_df = pd.concat([training_df, test_df])
    predict = MODEL[job.predictive_model.predictive_model][ModelActions.PREDICT.value]
    results = predict(job, data_df)
    logger.info("End {} job {}, {} . Results {}".format(
        'runtime', job.predictive_model.predictive_model, get_run(job), results))
    return results
def explanation_temporal_stability(exp_id: int, explanation_target: str = None):
    """Run the temporal-stability explainer for the given Explanation id.

    :param exp_id: primary key of the Explanation to run
    :param explanation_target: optional target passed through to the explainer
    :return: tuple of the literal string 'False' and the explainer result
    """
    exp = Explanation.objects.filter(pk=exp_id)[0]
    # Load the encoded logs of the job that owns this explanation.
    training_df, test_df = get_encoded_logs(exp.job)
    handler = EXPLANATION[exp.type][TEMPORAL_STABILITY]
    result = handler(exp, training_df, test_df, explanation_target)
    return 'False', result
def explanation(exp_id: int, explanation_target: str = None, prefix_target: str = None):
    """Run the configured explainer for the given Explanation id.

    :param exp_id: primary key of the Explanation to run
    :param explanation_target: optional target passed through to the explainer
    :param prefix_target: optional prefix passed through to the explainer
    :return: tuple of the literal string 'False' and the explainer result
    """
    exp = Explanation.objects.filter(pk=exp_id)[0]
    # Load the encoded logs of the job that owns this explanation.
    training_df, test_df = get_encoded_logs(exp.job)
    handler = EXPLANATION[exp.type][EXPLAIN]
    result = handler(exp, training_df, test_df, explanation_target, prefix_target)
    return 'False', result
def get_unique_values(request, pk):
    """Return, per encoded feature, a mapping from decoded (human-readable)
    values to their encoded counterparts for the given job.

    :param request: incoming HTTP request (unused beyond routing)
    :param pk: primary key of the Job whose logs are inspected
    :return: Response with {column: {decoded_value: encoded_value}}
    """
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)

    decoded_training_df = training_df.copy()
    decoded_testing_df = test_df.copy()

    # trace_id/label are not encoded features, so exclude them from the result.
    # fixed: positional axis argument to drop() is removed in pandas 2.x.
    training_df = training_df.drop(columns=['trace_id', 'label'])

    encoder = retrieve_proper_encoder(job)
    encoder.decode(df=decoded_training_df, encoding=job.encoding)
    encoder.decode(df=decoded_testing_df, encoding=job.encoding)

    result_df = {}
    for key in training_df.keys():
        # fixed: the previous implementation built two *independent* sets
        # (encoded values and decoded values) and paired them by index —
        # set iteration order is arbitrary, so the decoded→encoded pairing
        # was unreliable. Pair the values positionally instead: the decoded
        # frames are row-aligned copies of the encoded ones, so zipping the
        # concatenated columns yields correct pairs, and the dict naturally
        # de-duplicates repeated decoded values.
        encoded_values = list(training_df[key]) + list(test_df[key])
        decoded_values = list(decoded_training_df[key]) + list(decoded_testing_df[key])
        result_df[key] = dict(zip(decoded_values, encoded_values))
    return Response(result_df, status=200)
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """Main entry method for hyperopt calculations.

    Runs the hyperparameter search, persists the best predictive model and
    its evaluation, and returns the outcome of the best trial.

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split
             from the search
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric)  # TODO: why must the hyperopt config be fetched via getattr?
    )

    # _retrieve_train_validate_test and the hyperopt objective read these globals.
    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(
        train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)
    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: why must the hyperopt config be fetched via getattr?
    trials = Trials()
    # fixed: was a duplicated `algorithm = algorithm = ...` assignment.
    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).algorithm_type]

    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    # Persist the winning model on the job.
    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    # Re-test the best candidate on the held-out test set.
    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type, job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time

    job.evaluation = Evaluation.init(
        job.predictive_model.predictive_model,
        results,
        len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))

    # fixed: the return statement was commented out, so callers always
    # received None despite the annotated (dict, dict, dict) return type.
    return results, best_candidate['config'], best_candidate['model_split']
def test_explain(self):
    """ICE explain() returns the expected per-value aggregation."""
    train_log = create_test_log(
        log_name='train_explainability.xes',
        log_path='cache/log_cache/test_logs/train_explainability.xes')
    test_log = create_test_log(
        log_name='test_explainability.xes',
        log_path='cache/log_cache/test_logs/test_explainability.xes')
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=train_log,
        test_log=test_log)
    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)
    encoding = create_test_encoding(
        prefix_length=4,
        padding=True,
        value_encoding=ValueEncodings.SIMPLE_INDEX.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label')
    job = create_test_job(
        split=split,
        encoding=encoding,
        labelling=labelling,
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    # Train the model synchronously so the explainer has something to probe.
    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.ICE.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})[0]

    training_df_old, test_df_old = get_encoded_logs(job)
    explanation_target = 'prefix_2'
    explanation = explain(exp, training_df_old, test_df_old,
                          explanation_target, prefix_target=None)

    expected = [
        {'value': 'Contact Hospital',
         'label': 1.2962962962962963,
         'count': 351},
        {'value': 'Create Questionnaire',
         'label': 1.5526992287917738,
         'count': 1167},
        {'value': 'High Insurance Check',
         'label': 1.2667660208643816,
         'count': 671},
    ]
    self.assertEqual(expected, explanation)