def explain(retrain_exp: Explanation, training_df_old, test_df_old, explanation_target, prefix_target):
    initial_job_obj = retrain_exp.job

    # todo: return performances
    initial_result = dict(initial_job_obj.evaluation.classificationmetrics.to_dict())  # TODO future bug

    train_df, test_df = randomise_features(training_df_old.copy(), test_df_old.copy(), explanation_target)
    assert not train_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    new_split = save_randomised_set(initial_job_obj.split)

    prediction_job = create_prediction_job(initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.evaluation = None
    prediction_job.save()
    # assert prediction_job.split.id != initial_job_obj.split.id

    put_labelled_logs(prediction_job, train_df, test_df)

    # todo: build model
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    return {
        "Initial result": initial_result,
        "Retrain result": prediction_job.evaluation.classificationmetrics.to_dict()
    }
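# NOTE: randomise_features is called above but not defined in this excerpt.
# A minimal sketch of what it could look like, assuming explanation_target is
# a (column, value) pair; it mirrors the single-column "simple pattern"
# randomisation used in the handle command further below.
import random

def randomise_features(train_df, test_df, explanation_target):
    column, value = explanation_target  # assumed shape of explanation_target
    for df in [train_df, test_df]:
        observed = list(set(df[column].values))
        # replace cells matching the target value with a random observed value
        df[column] = df[column].apply(
            lambda x: x if x != value else random.choice(observed))
    return train_df, test_df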
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job
    configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame
    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                        test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(job.split)
            else:
                training_log, test_log, additional_columns = prepare_logs(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                     additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                 additional_columns=additional_columns)
    return training_df, test_df
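# Hypothetical usage of get_encoded_logs: fetch (or build and cache) the
# encoded datasets for an existing job. Assumes Django is configured; the
# primary key is illustrative.
job = Job.objects.get(pk=1)
training_df, test_df = get_encoded_logs(job, use_cache=True)
print(training_df.shape, test_df.shape)  # training and test set dimensions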
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job
    configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame
    """
    logger.info('\tGetting Dataset')
    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.split.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                     additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                 additional_columns=additional_columns)
    return training_df, test_df
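# The key change in this revision is the try/except around each cache read:
# a FileNotFoundError means the file backing a database record was deleted
# out of band, so the stale record is dropped and the lookup retried.
# Distilled sketch of that pattern; CacheRecord, read_from_disk,
# expensive_compute and write_to_disk are illustrative stand-ins, not names
# from this codebase.
def load_with_cache(key):
    record = CacheRecord.objects.filter(key=key).first()
    if record is not None:
        try:
            return read_from_disk(record.path)
        except FileNotFoundError:        # cache invalidation
            record.delete()              # stale entry: backing file is gone
            return load_with_cache(key)  # retry; falls through to recompute
    value = expensive_compute(key)
    CacheRecord.objects.create(key=key, path=write_to_disk(value))
    return value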
def handle(self, *args, **kwargs):
    TARGET_JOB = 439
    initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

    # todo: return performances
    print('Initial Job:', initial_job_obj.evaluation.classificationmetrics)  # TODO future bug

    training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
    training_df = training_df_old.copy()
    test_df = test_df_old.copy()

    # todo: what should I randomise?
    TARGETS = [
        [('prefix_1', 2)],  # <- simple pattern
        [('prefix_2', 3)],  # <- simple pattern
        [
            ('prefix_3', 2),
            ('prefix_4', 3),
        ]  # <- complex pattern
    ]
    for target in TARGETS:
        if len(target) == 1:
            target = target[0]
            for df in [training_df, test_df]:
                m_col = df[target[0]]
                del df[target[0]]
                target_values1 = list(set(m_col.values))
                df[target[0]] = m_col.apply(
                    lambda x: x if (x != target[1]) else random.choice(target_values1))
        elif len(target) > 1:
            for df in [training_df, test_df]:
                m_col = df[[column for column, _ in target]]
                possible_values = {}
                for column, _ in target:
                    possible_values[column] = list(set(df[column]))
                    del df[column]
                df[[column for column, _ in target]] = m_col.apply(
                    lambda x: x if any([x[column] != value for column, value in target])
                    else Series({column: random.choice(possible_values[column])
                                 for column, value in target}),
                    axis=1)
        else:
            raise Exception('target list with unexpected value')

    assert not training_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    # todo: save new dataset in memory and create split to use it
    initial_split_obj = initial_job_obj.split
    new_split = duplicate_orm_row(initial_split_obj)
    train_log = duplicate_orm_row(new_split.train_log)
    test_log = duplicate_orm_row(new_split.test_log)  # TODO future bug creates shadows
    train_log.name = 'RETRAIN' + train_log.name
    train_log.path = 'cache/log_cache/' + train_log.name
    train_log.properties = {}
    test_log.name = 'RETRAIN' + test_log.name
    test_log.path = 'cache/log_cache/' + test_log.name
    test_log.properties = {}
    new_split.train_log = train_log
    new_split.test_log = test_log
    new_split.additional_columns = None
    new_split.save()

    prediction_job = create_prediction_job(initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.save()

    put_labelled_logs(prediction_job, training_df, test_df)

    # todo: build model
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    print('Retrain Job:', prediction_job.evaluation.classificationmetrics)
    print('Done, cheers!')
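# Standalone toy run of the "complex pattern" branch above (not part of the
# codebase): rows where prefix_3 == 2 AND prefix_4 == 3 get both cells
# re-drawn from the column's observed values; all other rows pass through
# unchanged. Note random.choice may re-draw the original value, which is why
# the branch only *may* change a matching row.
import random
from pandas import DataFrame, Series

df = DataFrame({'prefix_3': [2, 2, 1], 'prefix_4': [3, 1, 3]})
target = [('prefix_3', 2), ('prefix_4', 3)]
possible_values = {column: list(set(df[column])) for column, _ in target}

df[[column for column, _ in target]] = df[[column for column, _ in target]].apply(
    lambda row: row if any(row[column] != value for column, value in target)
    else Series({column: random.choice(possible_values[column])
                 for column, _ in target}),
    axis=1)
print(df)  # only row 0 matches both conditions, so only row 0 may change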