def prepare_logs(split: Split):
    """Load and return (training_log, test_log, additional_columns) for *split*.

    A single split divides the original log in two; a double split loads
    the stored train and test logs separately.
    """
    if split.type == SplitTypes.SPLIT_SINGLE.value:
        whole_log = get_log(split.original_log)
        additional_columns = get_additional_columns(whole_log)
        training_log, test_log = _split_single_log(split)
        logger.info("\t\tLoaded single log from {}".format(split.original_log.path))
    else:
        # Have to use sklearn to convert some internal data types
        training_log = get_log(split.train_log)
        additional_columns = get_additional_columns(training_log)
        if split.additional_columns is None:
            split.additional_columns = split.train_log.name + split.test_log.name + '_ac.xes'
            split.save()
        training_log, _ = train_test_split(training_log, test_size=0, shuffle=False)
        test_log, _ = train_test_split(get_log(split.test_log), test_size=0, shuffle=False)
        logger.info("\t\tLoaded double logs from {} and {}.".format(split.train_log.path, split.test_log.path))
    if len(training_log) == 0:
        raise TypeError("Training log is empty. Create a new Split with better parameters")
    return training_log, test_log, additional_columns
def setUp(self):
    # Load the shared train/test fixture logs and their derived metadata.
    train_model = create_test_log(log_name=general_example_train_filename,
                                  log_path=general_example_train_filepath)
    self.train_log = get_log(train_model)
    self.train_event_names = unique_events(self.train_log)
    self.train_add_col = get_additional_columns(self.train_log)

    test_model = create_test_log(log_name=general_example_test_filename,
                                 log_path=general_example_test_filepath)
    self.test_log = get_log(test_model)
    self.test_event_names = unique_events(self.test_log)
    self.test_add_col = get_additional_columns(self.test_log)
def get_train_test_log(split: Split):
    """Return (training_log, test_log, additional_columns) for *split*.

    A deterministic single split is materialized into a cached double split
    on first use and reused afterwards; random splits are recomputed every
    time. Double splits load their stored train/test logs directly.

    :param split: the Split model describing how to obtain the two logs
    :raises TypeError: if the resulting training log contains no traces
    """
    # The same queryset was previously repeated four times inline; build it
    # once (Django querysets are lazy, so this issues no query by itself).
    matching_double_splits = Split.objects.filter(
        type=SplitTypes.SPLIT_DOUBLE.value,
        original_log=split.original_log,
        test_size=split.test_size,
        splitting_method=split.splitting_method)
    is_random = split.splitting_method == SplitOrderingMethods.SPLIT_RANDOM.value

    if split.type == SplitTypes.SPLIT_SINGLE.value and matching_double_splits.exists() and not is_random:
        # A cached deterministic double split already exists: reuse it.
        return get_train_test_log(matching_double_splits[0])
    elif split.original_log is not None and (not matching_double_splits.exists() or is_random):
        training_log, test_log = _split_single_log(split)
        additional_columns = get_additional_columns(get_log(split.original_log))
        if not is_random:
            # Cache the deterministic split so future calls can reuse it.
            _ = Split.objects.get_or_create(
                type=SplitTypes.SPLIT_DOUBLE.value,
                original_log=split.original_log,
                test_size=split.test_size,
                splitting_method=split.splitting_method,
                train_log=create_log(EventLog(training_log),
                                     '0-' + str(100 - int(split.test_size * 100)) + '.xes'),
                test_log=create_log(EventLog(test_log),
                                    str(100 - int(split.test_size * 100)) + '-100.xes'),
                additional_columns=split.additional_columns
            )[0]
        logger.info("\t\tLoaded single log from {}".format(split.original_log.path))
    else:
        # Have to use sklearn to convert some internal data types
        training_log = get_log(split.train_log)
        additional_columns = get_additional_columns(training_log)
        if split.additional_columns is None:
            split.additional_columns = split.train_log.name + split.test_log.name + '_ac.xes'
            split.save()
        # test_size=0 keeps all traces; the call is only for the conversion.
        training_log, _ = train_test_split(training_log, test_size=0, shuffle=False)
        test_log, _ = train_test_split(get_log(split.test_log), test_size=0, shuffle=False)
        logger.info("\t\tLoaded double logs from {} and {}.".format(split.train_log.path, split.test_log.path))
    if len(training_log) == 0:
        raise TypeError("Training log is empty. Create a new Split with better parameters")
    return training_log, test_log, additional_columns
def test_global_event_attributes(self):
    # The XES example log declares exactly these event-level attributes.
    log_model = create_test_log(log_name=general_example_test_filename,
                                log_path=general_example_test_filepath_xes)
    attributes = get_additional_columns(get_log(log_model))
    expected = ['Activity', 'Costs', 'Resource', 'org:resource']
    self.assertListEqual(attributes['event_attributes'], expected)
def setUp(self):
    # Fixture: the example test log plus a last-payload encoding of prefix 1.
    log_model = create_test_log(log_name=general_example_test_filename,
                                log_path=general_example_test_filepath)
    self.log = get_log(log_model)
    self.event_names = unique_events(self.log)
    self.labelling = create_test_labelling(label_type=LabelTypes.REMAINING_TIME.value)
    self.add_col = get_additional_columns(self.log)
    self.encoding = create_test_encoding(
        value_encoding=ValueEncodings.LAST_PAYLOAD.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=1)
def do_test(self, encoding, log):
    """Encode *log* with *encoding* and print the resulting shape and timing."""
    started = time.time()
    container = EncodingContainer(encoding, prefix_length=20, padding=ZERO_PADDING)
    encoded = encode_label_log(
        log, container, PredictiveModels.REGRESSION.value, self.label,
        event_names=unique_events(log),
        additional_columns=get_additional_columns(log))
    print(encoded.shape)
    print("Total for %s %s seconds" % (container.method, time.time() - started))
def test_no_label(self):
    # NO_LABEL leaves the encoded frame without label columns.
    labelling = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.train_log))
    self.assertEqual((2, 12), df.shape)
def test_next_activity(self):
    labelling = create_test_labelling(label_type=LabelTypes.NEXT_ACTIVITY.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.train_log))
    self.assertEqual(df.shape, (2, 14))
def test_remaining_time(self):
    labelling = create_test_labelling(label_type=LabelTypes.REMAINING_TIME.value)
    job = create_test_job(
        encoding=self.encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.train_log))
    self.assertEqual(df.shape, (2, 14))
def replay_prediction_calculate(job: Job, log) -> dict:
    """Compute predictions for a log coming from the replayers.

    :param job: job dictionary
    :param log: log model
    :return: runtime results
    """
    # train_test_split with test_size=0 keeps every trace; the call is used
    # only for sklearn's internal data-type conversion.
    converted_log, _ = train_test_split(log, test_size=0, shuffle=False)
    encoded_df, _ = encode_label_logs(converted_log, EventLog(), job,
                                      get_additional_columns(log))
    predict = MODEL[job.predictive_model.predictive_model][ModelActions.PREDICT.value]
    results = predict(job, encoded_df)
    logger.info("End {} job {}, {} . Results {}".format(
        'runtime', job.predictive_model.predictive_model, get_run(job), results))
    return results
def test_attribute_number(self):
    labelling = create_test_labelling(
        label_type=LabelTypes.ATTRIBUTE_NUMBER.value,
        attribute_name='AMOUNT')
    job = create_test_job(
        encoding=self.encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    # NOTE: the test log is used for both train and test positions here.
    _, df = encode_label_logs(self.test_log, self.test_log, job,
                              get_additional_columns(self.test_log))
    self.assertEqual(df.shape, (2, 15))
def setUp(self):
    self.train_log = get_log(create_test_log(
        log_name=general_example_train_filename,
        log_path=general_example_train_filepath))
    self.test_log = get_log(create_test_log(
        log_name=general_example_test_filename,
        log_path=general_example_test_filepath))
    self.add_col = get_additional_columns(self.train_log)
    # Two complex encodings sharing the same base settings: prefix 2 without
    # padding and prefix 10 with padding.
    shared = dict(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value)
    self.encoding = create_test_encoding(prefix_length=2, **shared)
    self.encodingPadding = create_test_encoding(prefix_length=10, padding=True, **shared)
def test_no_label_zero_padding(self):
    # With NO_LABEL the add_elapsed_time / add_remaining_time flags have no effect.
    labelling = create_test_labelling(label_type=LabelTypes.NO_LABEL.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.train_log))
    self.assertEqual(df.shape, (2, 52))
def test_add_new_traces(self):
    labelling = create_test_labelling(label_type=LabelTypes.REMAINING_TIME.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        prefix_length=2,
        add_new_traces=True,
        add_elapsed_time=True)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.train_log))
    self.assertEqual(df.shape, (2, 15))
    # add_new_traces contributes a 'new_traces' column, zero for known traces.
    self.assertIn('new_traces', df.columns.values.tolist())
    self.assertListEqual(df['new_traces'].tolist(), [0, 0])
def test_next_activity_zero_padding_elapsed_time(self):
    labelling = create_test_labelling(label_type=LabelTypes.NEXT_ACTIVITY.value)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.train_log))
    self.assertEqual(df.shape, (2, 55))
    self.assertIn('elapsed_time', df.columns.values.tolist())
def test_label_remaining_time_with_elapsed_time_custom_threshold(self):
    labelling = create_test_labelling(
        label_type=LabelTypes.REMAINING_TIME.value,
        threshold_type=ThresholdTypes.THRESHOLD_CUSTOM.value,
        threshold=40000)
    encoding = create_test_encoding(
        value_encoding=ValueEncodings.COMPLEX.value,
        add_elapsed_time=True,
        add_remaining_time=True,
        task_generation_type=TaskGenerationTypes.ONLY_THIS.value,
        prefix_length=10,
        padding=True)
    job = create_test_job(
        encoding=encoding,
        labelling=labelling,
        predictive_model=create_test_predictive_model(
            predictive_model=PredictiveModels.CLASSIFICATION.value))
    # NOTE(review): sibling tests pass the train log's additional columns
    # here — confirm that using the test log's columns is intentional.
    _, df = encode_label_logs(self.train_log, self.test_log, job,
                              get_additional_columns(self.test_log))
    self.assertEqual(df.shape, (2, 55))
def setUp(self):
    # Fixture: the repair-example log with no labelling applied.
    self.log = get_log("cache/log_cache/repairExample.xes")
    self.label = LabelContainer(LabelTypes.NO_LABEL.value)
    self.add_col = get_additional_columns(self.log)