def test_class_no_cluster(self):
    """Check random-forest classification metrics when no clustering is used.

    Builds a job on the split returned by ``repair_example()`` with
    prefix length 5, padding and elapsed time enabled, runs ``calculate``
    and compares every metric except ``elapsed_time`` with fixed values.
    """
    # unittest only honours the camelCase ``maxDiff`` attribute; the former
    # ``max_diff`` assignment was silently ignored and dict diffs stayed
    # truncated on failure.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        split=repair_example(),
        encoding=create_test_encoding(
            prefix_length=5, padding=True, add_elapsed_time=True),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)
    # The wall-clock duration differs between runs, so it is not compared.
    del result['elapsed_time']

    expected = {
        'f1score': 1.0,
        'acc': 1.0,
        'true_positive': '--',
        'true_negative': '--',
        'false_negative': '--',
        'false_positive': '--',
        'precision': 1.0,
        'recall': 1.0,
        'auc': 0.0,
    }
    self.assertDictEqual(result, expected)
def test_hyperopt(self):
    """Check that hyperopt changes the stored random-forest configuration.

    Creates a classification job with a hyperopt optimizer (2 evaluations),
    runs ``prediction_task`` on it, reloads the job and asserts that the
    persisted random-forest parameters differ from the defaults returned by
    ``classification_random_forest()``.
    """
    job = Job.objects.create(
        split=create_test_split(
            split_type=SplitTypes.SPLIT_SINGLE.value,
            original_log=create_test_log(
                log_name=general_example_filename,
                log_path=general_example_filepath)),
        encoding=create_test_encoding(
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            prefix_length=3,
            padding=False),
        labelling=create_test_labelling(
            label_type=LabelTypes.REMAINING_TIME.value),
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value),
        hyperparameter_optimizer=create_test_hyperparameter_optimizer(
            hyperoptim_type=HyperparameterOptimizationMethods.HYPEROPT.value,
            performance_metric=HyperOptLosses.ACC.value,
            max_evals=2))

    prediction_task(job.pk)

    # Reload the job that was just processed. The previous hard-coded
    # ``pk=1`` only worked when this happened to be the first Job row.
    job = Job.objects.get(pk=job.pk)

    default_config = classification_random_forest()
    stored_config = getattr(
        job.predictive_model.classification,
        ClassificationMethods.RANDOM_FOREST.value.lower()).to_dict()
    self.assertNotEqual(default_config, stored_config)
def test_next_activity_zero_padding_elapsed_time(self):
    """Check simple-index encoding with padding, elapsed time and next-activity labels.

    Encodes ``self.test_log`` (used as both training and test log) with
    prefix length 10 and verifies the frame shape, the presence of the
    ``elapsed_time`` column and the encoded rows of traces '5' and '4'.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)

    df, _ = encode_label_logs(
        self.test_log,
        self.test_log,
        create_test_job(
            encoding=encoding,
            labelling=labelling,
            predictive_model=create_test_predictive_model(
                predictive_model=PredictiveModels.CLASSIFICATION.value)))

    self.assertEqual(df.shape, (2, 13))
    # assertIn reports the actual column list on failure, unlike
    # assertTrue(... in ...), which only says "False is not true".
    self.assertIn('elapsed_time', df.columns.values.tolist())

    trace_5 = df[df.trace_id == '5'].iloc[0].values.tolist()
    self.assertListEqual(
        trace_5, ['5', 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    trace_4 = df[df.trace_id == '4'].iloc[0].values.tolist()
    self.assertListEqual(
        trace_4, ['4', 1, 1, 2, 1, 2, 0, 0, 0, 0, 0, 2, 0])
def test_attribute_number(self):
    """Verify boolean encoding labelled by the numeric 'AMOUNT' attribute.

    ``self.test_log`` is passed as both training and test log; the second
    returned frame is checked for its shape and for the encoded rows of
    traces '5' and '4'.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=2,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
        attribute_name='AMOUNT')
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=model)

    _, df = encode_label_logs(self.test_log, self.test_log, job)

    def first_row(trace_id):
        # Values of the first encoded row belonging to the given trace.
        return df[df.trace_id == trace_id].iloc[0].values.tolist()

    self.assertEqual(df.shape, (2, 9))
    self.assertListEqual(
        first_row('5'),
        ['5', True, True, False, False, False, False, False, False])
    self.assertListEqual(
        first_row('4'),
        ['4', True, False, True, False, False, False, False, True])
def test_next_activity_zero_padding_elapsed_time(self):
    """Check boolean encoding with elapsed time and next-activity labels.

    Encodes ``self.test_log`` (used as both training and test log) with
    prefix length 3 and verifies, on the second returned frame, the shape,
    the presence of ``elapsed_time`` and the rows of traces '5' and '4'.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=3)

    _, df = encode_label_logs(
        self.test_log,
        self.test_log,
        create_test_job(
            encoding=encoding,
            labelling=labelling,
            predictive_model=create_test_predictive_model(
                predictive_model=PredictiveModels.CLASSIFICATION.value)))

    self.assertEqual(df.shape, (2, 10))
    # assertIn reports the actual column list on failure, unlike
    # assertTrue(... in ...), which only says "False is not true".
    self.assertIn('elapsed_time', df.columns.values.tolist())

    trace_5 = df[df.trace_id == '5'].iloc[0].values.tolist()
    self.assertListEqual(trace_5, [
        '5', True, True, True, False, False, False, False, 181200.0,
        'decide'
    ])

    trace_4 = df[df.trace_id == '4'].iloc[0].values.tolist()
    self.assertListEqual(trace_4, [
        '4', True, False, True, False, False, False, True, 171660.0,
        'decide'
    ])
def test_next_activity_kmeans(self):
    """Check next-activity random-forest metrics with k-means clustering.

    Builds a job on the split returned by ``repair_example()`` with
    prefix length 8 and padding, runs ``calculate`` and compares every
    metric except ``elapsed_time`` with fixed values.
    """
    # unittest only honours the camelCase ``maxDiff`` attribute; the former
    # ``max_diff`` assignment was silently ignored and dict diffs stayed
    # truncated on failure.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=8, padding=True),
        labelling=create_test_labelling(
            label_type=LabelTypes.NEXT_ACTIVITY.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)
    # The wall-clock duration differs between runs, so it is not compared.
    del result['elapsed_time']

    expected = {
        'f1score': 0.54239884582595577,
        'acc': 0.80995475113122173,
        'true_positive': '--',
        'true_negative': '--',
        'false_negative': '--',
        'false_positive': '--',
        'precision': 0.62344720496894401,
        'recall': 0.5224945442336747,
        'auc': 0.4730604801339352,
    }
    self.assertDictEqual(result, expected)
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    """Verify boolean encoding with a custom remaining-time threshold of 40000.

    Elapsed and remaining time are both requested in the encoding;
    ``self.test_log`` serves as training and test log, and the second
    returned frame is checked for shape and for traces '5' and '4'.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        prefix_length=3,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=model)

    _, df = encode_label_logs(self.test_log, self.test_log, job)

    def first_row(trace_id):
        # Values of the first encoded row belonging to the given trace.
        return df[df.trace_id == trace_id].iloc[0].values.tolist()

    self.assertEqual(df.shape, (2, 10))
    self.assertListEqual(
        first_row('5'),
        ['5', True, True, True, False, False, False, False, 181200.0,
         False])
    self.assertListEqual(
        first_row('4'),
        ['4', True, False, True, False, False, False, True, 171660.0,
         False])
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    """Verify simple-index encoding with a custom remaining-time threshold of 40000.

    Elapsed and remaining time are both requested in the encoding with
    prefix length 2; the first returned frame is checked for shape,
    column names and the rows of traces '5' and '4'.
    """
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.SIMPLE_INDEX.value,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=2)
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=encoding, labelling=labelling, predictive_model=model)

    df, _ = encode_label_logs(self.test_log, self.test_log, job)

    def first_row(trace_id):
        # Values of the first encoded row belonging to the given trace.
        return df[df.trace_id == trace_id].iloc[0].values.tolist()

    self.assertEqual(df.shape, (2, 5))
    expected_columns = [
        'trace_id', 'prefix_1', 'prefix_2', 'elapsed_time', 'label'
    ]
    self.assertListEqual(df.columns.values.tolist(), expected_columns)
    self.assertListEqual(first_row('5'), ['5', 1, 2, 2, 1])
    self.assertListEqual(first_row('4'), ['4', 1, 1, 1, 1])
def test_no_exceptions(self):
    """Run ``calculate`` for every encoding/padding/method/label combination.

    The test passes when no combination raises. Each combination is
    printed before it runs so a failure can be attributed to it.
    """
    encodings = [enum.value for enum in ValueEncodings]
    paddings = [True, False]
    methods = [enum.value for enum in ClassificationMethods]
    labels = [enum.value for enum in LabelTypes]

    for encoding, padding, method, label in itertools.product(
            encodings, paddings, methods, labels):
        print(encoding, padding, method, label)

        # NOTE(review): this guard used to end in ``pass`` and therefore had
        # no effect. It is assumed the intent was to skip 'nn' combinations
        # without padding or with string-attribute labels -- confirm.
        if method == 'nn' and (
                not padding or label == LabelTypes.ATTRIBUTE_STRING.value):
            continue

        job = create_test_job(
            predictive_model=create_test_predictive_model(
                prediction_method=method),
            encoding=create_test_encoding(
                value_encoding=encoding, padding=padding),
            labelling=create_test_labelling(label_type=label))
        calculate(job)
def test_update_nb(self):
    """Train a Hoeffding-tree job, then update it incrementally.

    The second job reuses the first job's encoding and clustering and
    names the first job as ``incremental_train``. Both runs are expected
    to report the same metrics (``elapsed_time`` excluded).
    """
    base_job = create_test_job(
        predictive_model=create_test_predictive_model(
            prediction_method=ClassificationMethods.HOEFFDING_TREE.value),
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='concept:name'),
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        create_models=True)
    initial_metrics, _ = calculate(base_job)

    incremental_job = create_test_job(
        predictive_model=create_test_predictive_model(
            prediction_method=ClassificationMethods.HOEFFDING_TREE.value),
        encoding=base_job.encoding,
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='concept:name'),
        clustering=base_job.clustering,
        incremental_train=base_job)
    updated_metrics, _ = calculate(incremental_job)

    # Timing is not part of the comparison.
    for metrics in (initial_metrics, updated_metrics):
        del metrics['elapsed_time']

    expected = {
        'f1score': 0.0,
        'acc': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'true_positive': 0,
        'true_negative': 0,
        'false_negative': 2,
        'false_positive': 0,
        'auc': 0.0,
    }
    self.assertDictEqual(initial_metrics, expected)
    self.assertDictEqual(updated_metrics, expected)
def test_explain(self):
    """Check that SHAP explanation and temporal stability both return dicts.

    Trains a decision-tree classification job on the explainability
    train/test logs via ``prediction_task``, creates (or fetches) a SHAP
    ``Explanation`` for it and calls ``explain`` and
    ``shap_temporal_stability`` for target trace '2_101'.
    """
    split = create_test_split(
        split_type=SplitTypes.SPLIT_DOUBLE.value,
        split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
        test_size=0.2,
        original_log=None,
        train_log=create_test_log(
            log_name='train_explainability.xes',
            log_path='cache/log_cache/test_logs/train_explainability.xes'),
        test_log=create_test_log(
            log_name='test_explainability.xes',
            log_path='cache/log_cache/test_logs/test_explainability.xes'))

    predictive_model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value,
        prediction_method=ClassificationMethods.DECISION_TREE.value)

    job = create_test_job(
        split=split,
        encoding=create_test_encoding(
            prefix_length=4,
            padding=True,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value),
        labelling=create_test_labelling(
            label_type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label'),
        clustering=None,
        create_models=True,
        predictive_model=predictive_model,
        job_type=JobTypes.PREDICTION.value,
        hyperparameter_optimizer=None,
        incremental_train=None)

    prediction_task(job.id, do_publish_result=False)
    job.refresh_from_db()

    # get_or_create returns (object, created); only the object is needed.
    exp, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.SHAP.value,
        split=split,
        predictive_model=predictive_model,
        job=job,
        results={})

    explanation_target = '2_101'
    prefix_target = 'prefix_1'

    training_df_old, test_df_old = get_encoded_logs(job)
    explanation = explain(
        exp, training_df_old, test_df_old, explanation_target,
        prefix_target)

    # The encoded logs are fetched again before the second call --
    # presumably because explain() may modify the frames; confirm.
    training_df_old, test_df_old = get_encoded_logs(job)
    explanation_temp = shap_temporal_stability(
        exp, training_df_old, test_df_old, explanation_target)

    # assertIsInstance gives a descriptive failure message and is the
    # idiomatic type check (it also accepts dict subclasses).
    self.assertIsInstance(explanation, dict)
    self.assertIsInstance(explanation_temp, dict)
def test_shape_training(self):
    """Verify the shapes of the encoded training and test frames."""
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=self.labelling,
        predictive_model=model)

    encoded_train, encoded_test = encode_label_logs(
        self.training_log, self.test_log, job)

    self.assert_shape(encoded_train, (4, 4))
    self.assert_shape(encoded_test, (2, 4))
def test_str(self):
    """Verify that the job's string form has the expected length.

    Only lengths are compared: the reference text holds sample timestamps
    and ids, so its content is not matched against the live job.
    """
    job = create_test_job(
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value,
            prediction_method=ClassificationMethods.RANDOM_FOREST.value))

    reference_text = "{created_date: 2019-10-01 09:38:35.245361+00:00, modified_date: 2019-10-01 09:38:35.245655+00:00, error: , status: created, type: prediction, create_models: False, split: {'id': 1, 'type': 'single', 'test_size': 0.2, 'splitting_method': 'sequential', 'original_log_path': 'cache/log_cache/test_logs/general_example.xes'}, encoding: {'data_encoding': 'label_encoder', 'value_encoding': 'simpleIndex', 'add_elapsed_time': False, 'add_remaining_time': False, 'add_executed_events': False, 'add_resources_used': False, 'add_new_traces': False, 'features': {}, 'prefix_length': 1, 'padding': False, 'task_generation_type': 'only'}, labelling: {'type': 'next_activity', 'attribute_name': None, 'threshold_type': 'threshold_mean', 'threshold': 0.0, 'results': {}}, clustering: {'clustering_method': 'noCluster'}, predictive_model: {'n_estimators': 10, 'max_depth': None, 'max_features': 'auto'}, evaluation: [None], hyperparameter_optimizer: [None], incremental_train: [None]}"

    actual_length = len(str(job))
    self.assertEqual(actual_length, len(reference_text))
def test_next_activity_DecisionTree(self):
    """Verify decision-tree next-activity results against ``self.results3()``."""
    model = create_test_predictive_model(
        prediction_method=ClassificationMethods.DECISION_TREE.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    clustering = create_test_clustering(
        clustering_type=ClusteringMethods.NO_CLUSTER.value)
    job = create_test_job(
        predictive_model=model, labelling=labelling, clustering=clustering)

    metrics, _ = calculate(job)

    self.assertDictEqual(metrics, self.results3())
def get_classification_job(predictive_model: str, prediction_method: str, metric: HyperOptLosses = HyperOptLosses.ACC.value):
    """Build a test job for the given model type, method and optimizer metric.

    The job uses an encoding with prefix length 8 and padding, and a
    hyperparameter optimizer configured with ``metric`` as its
    performance metric.
    """
    job_encoding = create_test_encoding(prefix_length=8, padding=True)
    model = create_test_predictive_model(
        predictive_model=predictive_model,
        prediction_method=prediction_method)
    optimizer = create_test_hyperparameter_optimizer(
        performance_metric=metric)
    return create_test_job(
        predictive_model=model,
        encoding=job_encoding,
        hyperparameter_optimizer=optimizer)
def test_class_randomForest(self):
    """Verify random-forest results on the 'label' attribute against ``self.results2()``."""
    model = create_test_predictive_model(
        prediction_method=ClassificationMethods.RANDOM_FOREST.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_STRING.value,
        attribute_name='label')
    clustering = create_test_clustering(
        clustering_type=ClusteringMethods.NO_CLUSTER.value)
    job = create_test_job(
        predictive_model=model, labelling=labelling, clustering=clustering)

    metrics, _ = calculate(job)

    self.assertDictEqual(metrics, self.results2())
def test_no_label(self):
    """Verify the encoded test frame shape when no label is requested."""
    labelling = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding, labelling=labelling, predictive_model=model)

    _, encoded_test = encode_label_logs(self.train_log, self.test_log, job)

    self.assertEqual(encoded_test.shape, (2, 9))
def test_remaining_time(self):
    """Verify the encoded test frame shape for remaining-time labelling."""
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding, labelling=labelling, predictive_model=model)

    _, encoded_test = encode_label_logs(self.train_log, self.test_log, job)

    self.assertEqual(encoded_test.shape, (2, 11))
def test_row_test(self):
    """Verify the encoded values of trace '4' in the test frame."""
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=self.labelling,
        predictive_model=model)

    _train_frame, encoded_test = encode_label_logs(
        self.training_log, self.test_log, job)

    is_trace_4 = encoded_test.trace_id == '4'
    trace_row = encoded_test[is_trace_4].iloc[0]
    self.assertEqual(1, trace_row.prefix_1)
    self.assertEqual(0, trace_row.elapsed_time)
    self.assertEqual(0, trace_row.label)
def test_tsp_gru(self):
    """Verify the time-series-prediction result of an RNN configured as GRU.

    After dropping ``elapsed_time`` the only remaining metric must be
    ``nlevenshtein`` with value 0.6.
    """
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.TIME_SERIES_PREDICTION.value,
        prediction_method=TimeSeriesPredictionMethods.RNN.value,
        configuration={'rnn_type': 'gru'})
    labelling = create_test_labelling()
    encoding = create_test_encoding(prefix_length=2, padding=True)
    clustering = create_test_clustering(
        clustering_type=ClusteringMethods.NO_CLUSTER.value)
    job = create_test_job(
        predictive_model=model,
        labelling=labelling,
        encoding=encoding,
        clustering=clustering)

    metrics, _ = calculate(job)
    # Timing is not part of the comparison.
    del metrics['elapsed_time']

    self.assertDictEqual(metrics, {'nlevenshtein': 0.6})
def test_next_activity(self):
    """Verify the encoded test frame shape for next-activity labelling.

    The additional columns derived from ``self.train_log`` are passed to
    the encoder as its fourth argument.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.NEXT_ACTIVITY.value)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding, labelling=labelling, predictive_model=model)
    extra_columns = get_additional_columns(self.train_log)

    _, encoded_test = encode_label_logs(
        self.train_log, self.test_log, job, extra_columns)

    self.assertEqual(encoded_test.shape, (2, 14))
def get_regression_job(predictive_model: str, prediction_method: str, metric: HyperOptLosses = HyperOptLosses.ACC.value):
    """Build a remaining-time test job for the given model type and method.

    The job uses an encoding with prefix length 8 and padding, a
    remaining-time labelling with threshold type ``NONE`` and a
    hyperparameter optimizer configured with ``metric``.
    """
    job_encoding = create_test_encoding(prefix_length=8, padding=True)
    model = create_test_predictive_model(
        predictive_model=predictive_model,
        prediction_method=prediction_method)
    optimizer = create_test_hyperparameter_optimizer(
        performance_metric=metric)
    remaining_time_labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.NONE.value)
    return create_test_job(
        predictive_model=model,
        encoding=job_encoding,
        labelling=remaining_time_labelling,
        hyperparameter_optimizer=optimizer)
def test_prefix0(self):
    """Verify that encoding with ``prefix_length=0`` raises ``ValueError``."""
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.FREQUENCY.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=0)
    # The job is built outside the assertRaises block so that only the
    # encoder call itself is expected to raise.
    job = create_test_job(
        encoding=encoding,
        labelling=self.labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    training_log, test_log = self.training_log, self.test_log

    with self.assertRaises(ValueError):
        encode_label_logs(training_log, test_log, job)
def test_regression_nn(self):
    """Verify ``mae`` and ``mape`` of a neural-network remaining-time regression."""
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.REGRESSION.value,
        prediction_method=RegressionMethods.NN.value)
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value)
    clustering = create_test_clustering(
        clustering_type=ClusteringMethods.NO_CLUSTER.value)
    job = create_test_job(
        predictive_model=model, labelling=labelling, clustering=clustering)

    metrics, _ = calculate(job)
    del metrics['elapsed_time']
    print(metrics)

    expected_values = (('mae', 0.0001388888888888889), ('mape', -1))
    for metric_name, expected in expected_values:
        self.assertAlmostEqual(metrics[metric_name], expected)
def test_attribute_number(self):
    """Verify the encoded frame shape when labelling by the numeric 'AMOUNT' attribute.

    ``self.test_log`` is used as both training and test log, and its
    additional columns are passed to the encoder as the fourth argument.
    """
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
        attribute_name='AMOUNT')
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(
        encoding=self.encoding, labelling=labelling, predictive_model=model)
    extra_columns = get_additional_columns(self.test_log)

    _, encoded_test = encode_label_logs(
        self.test_log, self.test_log, job, extra_columns)

    self.assertEqual(encoded_test.shape, (2, 15))
def test_regression_no_cluster(self):
    """Check random-forest regression metrics when no clustering is used.

    Builds a job on the split returned by ``repair_example()`` with
    prefix length 5 and padding, runs ``calculate`` and compares
    ``rmse``, ``mae``, ``rscore`` and ``mape`` with fixed values.
    """
    # unittest only honours the camelCase ``maxDiff`` attribute; the former
    # ``max_diff`` assignment was silently ignored.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=5, padding=True),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.REGRESSION.value,
            prediction_method=RegressionMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)

    self.assertAlmostEqual(result['rmse'], 0.03263757)
    self.assertAlmostEqual(result['mae'], 0.00011685)
    self.assertAlmostEqual(result['rscore'], 0.13776124)
    # assertAlmostEqual passes on exact equality, so inf compares with inf.
    self.assertAlmostEqual(result['mape'], float('inf'))
def test_regression_kmeans(self):
    """Check random-forest duration regression metrics with k-means clustering.

    Builds a job on the split returned by ``repair_example()`` with
    prefix length 5, padding and duration labelling, runs ``calculate``
    and compares ``rmse``, ``mae``, ``rscore`` and ``mape`` with fixed
    values.
    """
    # unittest only honours the camelCase ``maxDiff`` attribute; the former
    # ``max_diff`` assignment was silently ignored.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=5, padding=True),
        labelling=create_test_labelling(
            label_type=LabelTypes.DURATION.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.REGRESSION.value,
            prediction_method=RegressionMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)

    self.assertAlmostEqual(result['rmse'], 0.48841552839653984)
    self.assertAlmostEqual(result['mae'], 0.44282462605873457)
    self.assertAlmostEqual(result['rscore'], 0.015130407121517586)
    self.assertAlmostEqual(result['mape'], -1)
def test_regression_no_cluster(self):
    """Check random-forest duration regression metrics when no clustering is used.

    Builds a job on the split returned by ``repair_example()`` with
    prefix length 5, padding and duration labelling, runs ``calculate``
    and compares ``rmse``, ``mae``, ``rscore`` and ``mape`` with fixed
    values.
    """
    # unittest only honours the camelCase ``maxDiff`` attribute; the former
    # ``max_diff`` assignment was silently ignored.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.NO_CLUSTER.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=5, padding=True),
        labelling=create_test_labelling(
            label_type=LabelTypes.DURATION.value),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.REGRESSION.value,
            prediction_method=RegressionMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)

    self.assertAlmostEqual(result['rmse'], 0.4868515876868242)
    self.assertAlmostEqual(result['mae'], 0.44340838774645464)
    self.assertAlmostEqual(result['rscore'], 0.02142755175443678)
    self.assertAlmostEqual(result['mape'], -1)
def setUp(self):
    """Encode the general-example train/test logs for the tests of this class.

    Stores the boolean-encoded frames (with elapsed time) on
    ``self.training_df`` and ``self.test_df``.
    """
    test_log_record = create_test_log(
        log_name=general_example_test_filename,
        log_path=general_example_test_filepath)
    loaded_test_log = get_log(test_log_record)

    train_log_record = create_test_log(
        log_name=general_example_train_filename,
        log_path=general_example_train_filepath)
    loaded_training_log = get_log(train_log_record)

    boolean_encoding = create_test_encoding(
        value_encoding=ValueEncodings.BOOLEAN.value,
        add_elapsed_time=True)
    model = create_test_predictive_model(
        predictive_model=PredictiveModels.CLASSIFICATION.value)
    job = create_test_job(encoding=boolean_encoding, predictive_model=model)

    self.training_df, self.test_df = encode_label_logs(
        loaded_training_log, loaded_test_log, job)
def test_regression_kmeans(self):
    """Check random-forest regression metrics with k-means clustering.

    Builds a job on the split returned by ``repair_example()`` with
    prefix length 5 and padding, runs ``calculate`` and compares
    ``rmse``, ``mae``, ``rscore`` and ``mape`` with fixed values.
    """
    # unittest only honours the camelCase ``maxDiff`` attribute; the former
    # ``max_diff`` assignment was silently ignored.
    self.maxDiff = None
    job = create_test_job(
        clustering=create_test_clustering(
            clustering_type=ClusteringMethods.KMEANS.value),
        split=repair_example(),
        encoding=create_test_encoding(prefix_length=5, padding=True),
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.REGRESSION.value,
            prediction_method=RegressionMethods.RANDOM_FOREST.value))

    result, _ = calculate(job)

    self.assertAlmostEqual(result['rmse'], 0.0325738)
    self.assertAlmostEqual(result['mae'], 0.00014269)
    self.assertAlmostEqual(result['rscore'], -0.11336870)
    # assertAlmostEqual passes on exact equality, so inf compares with inf.
    self.assertAlmostEqual(result['mape'], float('inf'))