def setUp(self):
    """Populate the database with three fixture jobs.

    A default test job, a test job whose ``job_type`` is ``'asdf'``, and a
    prediction job created directly through the ORM without encoding or
    labelling.
    """
    create_test_job()
    create_test_job(job_type='asdf')

    prediction_job_fields = {
        'type': JobTypes.PREDICTION.value,
        'split': create_test_split(),
        'encoding': None,
        'labelling': None,
    }
    Job.objects.create(**prediction_job_fields)
def test_split_avoid_duplication(self):
    """Two jobs built on one split must report the same split id after encoding."""
    shared_split = create_test_split(
        split_type=SplitTypes.SPLIT_SINGLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=self.log)

    def build_job():
        # Both jobs are created with exactly the same configuration.
        return create_test_job(
            split=shared_split,
            encoding=self.encoding,
            labelling=self.labelling,
            clustering=None,
            create_models=False,
            predictive_model=self.predictive_model,
            job_type=JobTypes.PREDICTION.value,
            hyperparameter_optimizer=None,
            incremental_train=None)

    observed_split_ids = []
    for _ in range(2):
        job = build_job()
        # The encoded frames themselves are not inspected; only the split
        # id read from the job after encoding matters here.
        _training_df, _test_df = get_encoded_logs(job)
        observed_split_ids.append(job.split.id)

    first_split_id, second_split_id = observed_split_ids
    self.assertEqual(first_split_id, second_split_id)
def test_attribute_number(self):
    """Boolean encoding, prefix length 2, ATTRIBUTE_NUMBER label on 'AMOUNT'."""
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=2,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
        attribute_name='AMOUNT')
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=classifier)

    # Only the second frame returned by encode_label_logs is checked.
    _, df = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 9))
    expected_rows = (
        ('5', ['5', True, True, False, False, False, False, False, False]),
        ('4', ['4', True, False, True, False, False, False, False, True]),
    )
    for trace_id, expected in expected_rows:
        actual = df[df.trace_id == trace_id].iloc[0].values.tolist()
        self.assertListEqual(actual, expected)
def test_next_activity_zero_padding_elapsed_time(self):
    """Boolean encoding with elapsed time and a NEXT_ACTIVITY label, prefix length 3.

    Checks the shape of the second encoded frame, the presence of the
    ``elapsed_time`` column, and the full rows for traces '5' and '4'.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=3)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))

    _, df = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 10))
    # assertIn reports the actual column list on failure, unlike
    # assertTrue(x in y) which only says "False is not true".
    self.assertIn('elapsed_time', df.columns.values.tolist())

    trace_5 = df[df.trace_id == '5'].iloc[0].values.tolist()
    self.assertListEqual(trace_5, [
        '5', True, True, True, False, False, False, False, 181200.0,
        'decide'
    ])
    trace_4 = df[df.trace_id == '4'].iloc[0].values.tolist()
    self.assertListEqual(trace_4, [
        '4', True, False, True, False, False, False, True, 171660.0,
        'decide'
    ])
def test_next_activity_kmeans(self):
    """Random-forest classification with KMEANS clustering on the repair example.

    Compares every metric returned by ``calculate`` (except
    ``elapsed_time``, which is removed first) with fixed reference values.
    """
    # unittest reads the camelCase attribute ``maxDiff``; the previous
    # ``self.max_diff`` assignment had no effect on diff truncation.
    self.maxDiff = None

    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=8, padding=True),
        labelling=create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)
    # elapsed_time is dropped before the comparison.
    del result['elapsed_time']

    self.assertDictEqual(
        result, {
            'f1score': 0.54239884582595577,
            'acc': 0.80995475113122173,
            'true_positive': '--',
            'true_negative': '--',
            'false_negative': '--',
            'false_positive': '--',
            'precision': 0.62344720496894401,
            'recall': 0.5224945442336747,
            'auc': 0.4730604801339352
        })
def test_class_no_cluster(self):
    """Random-forest classification with NO_CLUSTER on the repair example.

    Compares every metric returned by ``calculate`` (except
    ``elapsed_time``, which is removed first) with fixed reference values.
    """
    # unittest reads the camelCase attribute ``maxDiff``; the previous
    # ``self.max_diff`` assignment had no effect on diff truncation.
    self.maxDiff = None

    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        split=repair_example(),
        encoding=create_test_encoding(
            prefix_length=5, padding=True, add_elapsed_time=True),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)
    # elapsed_time is dropped before the comparison.
    del result['elapsed_time']

    self.assertDictEqual(
        result, {
            'f1score': 1.0,
            'acc': 1.0,
            'true_positive': '--',
            'true_negative': '--',
            'false_negative': '--',
            'false_positive': '--',
            'precision': 1.0,
            'recall': 1.0,
            'auc': 0.0
        })
def test_prediction_task(self):
    """Running prediction_task on a default job marks it completed with a non-empty evaluation."""
    default_job = create_test_job()

    prediction_task(default_job.id)

    # Re-read the row so the assertions see what the task persisted.
    default_job.refresh_from_db()
    self.assertEqual('completed', default_job.status)
    self.assertNotEqual({}, default_job.evaluation)
def test_no_exceptions(self):
    """Run ``calculate`` for every encoding/padding/method/label combination.

    The combinations are the cartesian product of all ``ValueEncodings``,
    both padding flags, all ``ClassificationMethods`` and all ``LabelTypes``.
    The test passes when no combination raises.
    """
    value_encodings = [enum.value for enum in ValueEncodings]
    padding_options = [True, False]
    classification_methods = [enum.value for enum in ClassificationMethods]
    label_types = [enum.value for enum in LabelTypes]

    # Iterate the product directly; there is no need to build a list first.
    combinations = itertools.product(
        value_encodings, padding_options, classification_methods,
        label_types)

    for encoding, padding, method, label in combinations:
        print(encoding, padding, method, label)

        # NOTE(review): the previous version had a guard here,
        #   if method == 'nn' and (padding == False
        #                          or label == LabelTypes.ATTRIBUTE_STRING.value): pass
        # whose body was a no-op, so every combination was (and still is)
        # executed. Presumably it was meant to skip these combinations
        # with ``continue`` -- confirm before changing what is covered.
        job = create_test_job(
            predictive_model=create_test_predictive_model(
                prediction_method=method),
            encoding=create_test_encoding(
                value_encoding=encoding, padding=padding),
            labelling=create_test_labelling(label_type=label))
        calculate(job)
def test_next_activity_zero_padding_elapsed_time(self):
    """Simple-index encoding with padding, elapsed time and a NEXT_ACTIVITY label.

    Uses prefix length 10 and inspects the first frame returned by
    ``encode_label_logs``: its shape, the presence of ``elapsed_time``,
    and the rows for traces '5' and '4'.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))

    df, _ = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 13))
    # assertIn reports the actual column list on failure, unlike
    # assertTrue(x in y) which only says "False is not true".
    self.assertIn('elapsed_time', df.columns.values.tolist())

    trace_5 = df[df.trace_id == '5'].iloc[0].values.tolist()
    # NOTE(review): this expected row has 12 entries while the shape
    # asserted above has 13 columns, and its prefix values look unlike
    # the ones expected for trace '4' -- the reference data may be stale;
    # verify against the encoder output before relying on this check.
    self.assertListEqual(trace_5,
                         ['5', 1, 3, 2, 2, 2, 0, 0, 0, 0, 1296240.0, 2])

    trace_4 = df[df.trace_id == '4'].iloc[0].values.tolist()
    self.assertListEqual(trace_4, [
        '4', 52903968, 32171502, 17803069, 1149821, 72523760, 0, 0, 0, 0,
        0, 520920.0, 0
    ])
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    """Simple-index encoding, REMAINING_TIME label with a custom threshold of 40000.

    Inspects the first frame returned by ``encode_label_logs``: shape,
    exact column names, and the rows for traces '5' and '4'.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=2)
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=classifier)

    df, _ = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 5))
    expected_columns = [
        'trace_id', 'prefix_1', 'prefix_2', 'elapsed_time', 'label'
    ]
    self.assertListEqual(df.columns.values.tolist(), expected_columns)

    expected_rows = (
        ('5', ['5', 1, 2, 0, 0]),
        ('4', ['4', 1, 1, 0, 0]),
    )
    for trace_id, expected in expected_rows:
        actual = df[df.trace_id == trace_id].iloc[0].values.tolist()
        self.assertListEqual(actual, expected)
def test_get_encoded_logs_Loaded_cache(self):
    """Encoded logs stay identical after each loaded-log pickle is deleted.

    The train pickle is removed first, then the test pickle; after each
    removal ``get_encoded_logs`` is called again and both frames are
    compared with the frames from the initial call.
    """
    job = create_test_job()
    reference_train, reference_test = None, None
    reference = get_encoded_logs(job, True)
    reference_train, reference_test = reference[0], reference[1]

    loaded_log = LoadedLog.objects.filter(split=job.split)[0]
    # Both paths are read up front, before any cache file is removed.
    pickled_paths = (loaded_log.train_log_path, loaded_log.test_log_path)

    for pickled_path in pickled_paths:
        os.remove('cache/loaded_log_cache/' + get_digested(pickled_path) +
                  '.pickle')
        reloaded = get_encoded_logs(job, True)
        assert_frame_equal(reference_train, reloaded[0])
        assert_frame_equal(reference_test, reloaded[1])
def test_default(self):
    """A freshly created test job is 'created', has both dates set and no evaluation."""
    fresh_job = create_test_job()

    self.assertEqual('created', fresh_job.status)
    for date_field in ('created_date', 'modified_date'):
        self.assertIsNotNone(getattr(fresh_job, date_field))
    self.assertIsNone(fresh_job.evaluation)
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    """Boolean encoding, REMAINING_TIME label with a custom threshold of 40000.

    Inspects the second frame returned by ``encode_label_logs``: shape and
    the rows for traces '5' and '4'.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=3,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=classifier)

    _, df = encode_label_logs(self.test_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 10))
    expected_rows = (
        ('5', ['5', True, True, True, False, False, False, False,
               181200.0, False]),
        ('4', ['4', True, False, True, False, False, False, True,
               171660.0, False]),
    )
    for trace_id, expected in expected_rows:
        actual = df[df.trace_id == trace_id].iloc[0].values.tolist()
        self.assertListEqual(actual, expected)
def test_get_labelled_logs(self):
    """get_labelled_logs returns the same two frames as get_encoded_logs for one job."""
    job = create_test_job()

    encoded = get_encoded_logs(job)
    labelled = get_labelled_logs(job)

    # Compare the first and the second frame of each result pairwise.
    for position in (0, 1):
        assert_frame_equal(encoded[position], labelled[position])
def test_update_nb(self):
    """Train a HOEFFDING_TREE job, then a second job with ``incremental_train`` set to the first.

    Both results (with ``elapsed_time`` removed) are compared with the
    same fixed reference metrics.
    """
    base_job = create_test_job(
        predictive_model=create_test_predictive_model(
            prediction_method=ClassificationMethods.HOEFFDING_TREE.value),
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='concept:name'),
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        create_models=True)
    base_result, _ = calculate(base_job)

    # The follow-up job reuses the first job's encoding and clustering
    # and points at it through incremental_train.
    incremental_job = create_test_job(
        predictive_model=create_test_predictive_model(
            prediction_method=ClassificationMethods.HOEFFDING_TREE.value),
        encoding=base_job.encoding,
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='concept:name'),
        clustering=base_job.clustering,
        incremental_train=base_job)
    incremental_result, _ = calculate(incremental_job)

    del base_result['elapsed_time']
    del incremental_result['elapsed_time']

    expected_metrics = {
        'f1score': 0.0,
        'acc': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'true_positive': 0,
        'true_negative': 0,
        'false_negative': 2,
        'false_positive': 0,
        'auc': 0.0
    }
    self.assertDictEqual(base_result, expected_metrics)
    self.assertDictEqual(incremental_result, expected_metrics)
def test_explain(self):
    """Run a decision-tree prediction job, then check both explanation calls return a dict.

    ``explain`` and ``shap_temporal_stability`` are each fed freshly
    encoded logs for the same job and a SHAP ``Explanation`` record.
    """
    train_log = create_test_log(
        log_name='train_explainability.xes',
        log_path='cache/log_cache/test_logs/train_explainability.xes')
    test_log = create_test_log(
        log_name='test_explainability.xes',
        log_path='cache/log_cache/test_logs/test_explainability.xes')
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=train_log,
        test_log=test_log)

    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)

    encoding = create_test_encoding(
        prefix_length=4,
        padding=True,
        value_encoding=ValueEncodings.SIMPLE_INDEX.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label')
    job = create_test_job(
        split=split,
        encoding=encoding,
        labelling=labelling,
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    # get_or_create returns (object, created); only the object is needed.
    exp, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.SHAP.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})

    explanation_target = '2_101'
    prefix_target = 'prefix_1'

    training_df_old, test_df_old = get_encoded_logs(job)
    explanation = explain(exp, training_df_old, test_df_old,
                          explanation_target, prefix_target)

    # The logs are encoded again before the temporal-stability call.
    training_df_old, test_df_old = get_encoded_logs(job)
    explanation_temp = shap_temporal_stability(
        exp, training_df_old, test_df_old, explanation_target)

    self.assertIs(type(explanation), dict)
    self.assertIs(type(explanation_temp), dict)
def test_str(self):
    """The string form of a random-forest job has the same length as a reference rendering."""
    forest_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.RANDOM_FOREST.value)
    job = create_test_job(predictive_model=forest_model)

    # Only the lengths are compared, not the characters of the two strings.
    reference_rendering = "{created_date: 2019-10-01 09:38:35.245361+00:00, modified_date: 2019-10-01 09:38:35.245655+00:00, error: , status: created, type: prediction, create_models: False, split: {'id': 1, 'type': 'single', 'test_size': 0.2, 'splitting_method': 'sequential', 'original_log_path': 'cache/log_cache/test_logs/general_example.xes'}, encoding: {'data_encoding': 'label_encoder', 'value_encoding': 'simpleIndex', 'add_elapsed_time': False, 'add_remaining_time': False, 'add_executed_events': False, 'add_resources_used': False, 'add_new_traces': False, 'features': {}, 'prefix_length': 1, 'padding': False, 'task_generation_type': 'only'}, labelling: {'type': 'next_activity', 'attribute_name': None, 'threshold_type': 'threshold_mean', 'threshold': 0.0, 'results': {}}, clustering: {'clustering_method': 'noCluster'}, predictive_model: {'n_estimators': 10, 'max_depth': None, 'max_features': 'auto'}, evaluation: [None], hyperparameter_optimizer: [None], incremental_train: [None]}"

    actual_length = len(job.__str__())
    expected_length = len(reference_rendering)
    self.assertEqual(actual_length, expected_length)
def test_create_models_config_missing(self):
    """prediction_task still completes after ``create_models`` is deleted from the job instance."""
    job_without_flag = create_test_job()
    # TODO fixme should we add this field?
    del job_without_flag.create_models
    job_without_flag.save()

    prediction_task(job_without_flag.id)

    # Re-read the row so the assertions see what the task persisted.
    job_without_flag.refresh_from_db()
    self.assertEqual('completed', job_without_flag.status)
    self.assertNotEqual({}, job_without_flag.evaluation)
def test_shape_training(self):
    """The encoded training frame is 4x4 and the encoded test frame is 2x4."""
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=self.labelling,
        predictive_model=classifier)

    training_df, test_df = encode_label_logs(
        self.training_log, self.test_log, job)

    self.assert_shape(training_df, (4, 4))
    self.assert_shape(test_df, (2, 4))
def test_prediction_task_save_model_clustering(self):
    """With ``create_models=True`` and KMEANS clustering, both model paths are set after the task."""
    kmeans = create_test_clustering(
        clustering_type=ClusteringMethods.KMEANS.value)
    job = create_test_job(create_models=True, clustering=kmeans)

    prediction_task(job.id)

    # Re-read the row so the assertions see what the task persisted.
    job.refresh_from_db()
    self.assertEqual('completed', job.status)
    self.assertIsNotNone(job.predictive_model.model_path)
    self.assertIsNotNone(job.clustering.model_path)
def test_next_activity_DecisionTree(self):
    """Decision-tree job with a NEXT_ACTIVITY label and no clustering matches ``self.results3()``."""
    tree_model = create_test_predictive_model(
        prediction_method=ClassificationMethods.DECISION_TREE.value)
    next_activity = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    no_cluster = create_test_clustering(
        clustering_type=ClusteringMethods.NO_CLUSTER.value)
    job = create_test_job(
        predictive_model=tree_model,
        labelling=next_activity,
        clustering=no_cluster)

    result, _ = calculate(job)

    expected = self.results3()
    self.assertDictEqual(result, expected)
def get_classification_job(predictive_model: str, prediction_method: str, metric: HyperOptLosses = HyperOptLosses.ACC.value):
    """Build a test job for hyperparameter optimisation of a classifier.

    The job uses a padded encoding with prefix length 8, a predictive model
    built from ``predictive_model`` and ``prediction_method``, and a
    hyperparameter optimizer whose performance metric is ``metric``.
    """
    # The three components are created in this order before the job itself.
    padded_encoding = create_test_encoding(prefix_length=8, padding=True)
    model = create_test_predictive_model(
        predictive_model=predictive_model,
        prediction_method=prediction_method)
    optimizer = create_test_hyperparameter_optimizer(
        performance_metric=metric)

    return create_test_job(
        predictive_model=model,
        encoding=padded_encoding,
        hyperparameter_optimizer=optimizer)
def test_replay_prediction(self):
    """Train a job with ``create_models=True``, then replay predictions on the runtime test log.

    There are no assertions; the test passes when nothing raises.
    """
    trained_job = create_test_job(create_models=True)
    runtime_log_record = create_test_log(
        log_name='runtime_example.xes',
        log_path='cache/log_cache/test_logs/runtime_test.xes')
    runtime_events = get_log(runtime_log_record)

    prediction_task(trained_job.id)
    trained_job.refresh_from_db()

    # The same job is passed for both job arguments of the replay task.
    replay_prediction_task(trained_job, trained_job, runtime_events)
def test_class_randomForest(self):
    """Random-forest job with an ATTRIBUTE_STRING label on 'label' and no clustering matches ``self.results2()``."""
    forest_model = create_test_predictive_model(
        prediction_method=ClassificationMethods.RANDOM_FOREST.value)
    string_label = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label')
    no_cluster = create_test_clustering(
        clustering_type=ClusteringMethods.NO_CLUSTER.value)
    job = create_test_job(
        predictive_model=forest_model,
        labelling=string_label,
        clustering=no_cluster)

    result, _ = calculate(job)

    expected = self.results2()
    self.assertDictEqual(result, expected)
def test_create_runtime(self):
    """POSTing a job id and split id to '/runtime/prediction/' answers with 201."""
    job = create_test_job()
    split = create_test_split()
    api_client = APIClient()

    payload = {
        'jobId': job.id,
        'splitId': split.id,
    }
    response = api_client.post('/runtime/prediction/', payload, format='json')

    self.assertEqual(201, response.status_code)
def test_no_label(self):
    """With a NO_LABEL labelling the second encoded frame has shape (2, 9)."""
    no_label = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=no_label,
        predictive_model=classifier)

    # Only the second frame returned by encode_label_logs is checked.
    _, df = encode_label_logs(self.train_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 9))
def test_row_test(self):
    """Trace '4' in the encoded test frame has prefix_1 == 1, elapsed_time == 0 and label == 0."""
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=self.labelling,
        predictive_model=classifier)

    training_df, test_df = encode_label_logs(
        self.training_log, self.test_log, job)

    trace_4_rows = test_df[(test_df.trace_id == '4')]
    first_row = trace_4_rows.iloc[0]
    self.assertEqual(1, first_row.prefix_1)
    self.assertEqual(0, first_row.elapsed_time)
    self.assertEqual(0, first_row.label)
def test_remaining_time(self):
    """With a REMAINING_TIME labelling the second encoded frame has shape (2, 11)."""
    remaining_time = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value)
    classifier = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=remaining_time,
        predictive_model=classifier)

    # Only the second frame returned by encode_label_logs is checked.
    _, df = encode_label_logs(self.train_log, self.test_log, job)

    self.assertEqual(df.shape, (2, 11))
def test_get_encoded_logs_cache(self):
    """get_encoded_logs yields equal frames with the flag True, with it False, and with it True again."""
    job = create_test_job()

    first_cached = get_encoded_logs(job, True)

    # Calls are made in the same order as before: flag False, then flag
    # True a second time; each result is compared with the first call.
    for use_cache in (False, True):
        other = get_encoded_logs(job, use_cache)
        assert_frame_equal(first_cached[0], other[0])
        assert_frame_equal(first_cached[1], other[1])
def get_regression_job(predictive_model: str, prediction_method: str, metric: HyperOptLosses = HyperOptLosses.ACC.value):
    """Build a test job for hyperparameter optimisation of a regressor.

    The job uses a padded encoding with prefix length 8, a predictive model
    built from ``predictive_model`` and ``prediction_method``, a
    REMAINING_TIME labelling with threshold type NONE, and a hyperparameter
    optimizer whose performance metric is ``metric``.
    """
    # Components are created in this order before the job itself.
    padded_encoding = create_test_encoding(prefix_length=8, padding=True)
    model = create_test_predictive_model(
        predictive_model=predictive_model,
        prediction_method=prediction_method)
    optimizer = create_test_hyperparameter_optimizer(
        performance_metric=metric)
    remaining_time = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.NONE.value)

    return create_test_job(
        predictive_model=model,
        encoding=padded_encoding,
        labelling=remaining_time,
        hyperparameter_optimizer=optimizer)