def classification(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = duplicate_orm_row(
        job.encoding
    )  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(
        model_split,
        test_data,
        evaluation=True,
        is_binary_classifier=_check_is_binary_classifier(job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split


def regression(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """main regression entry point

    trains and tests the regressor using the provided data

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data, test_data = _prep_data(training_df, test_df)

    job.encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.pk)[0]
    )  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split


def update_and_test(training_df: DataFrame, test_df: DataFrame, job: Job):
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = job.incremental_train.encoding
    job.encoding.save()
    job.save()

    if list(train_data.columns.values) != job.incremental_train.encoding.features:
        # TODO: how do I align the two feature vectors?
        train_data, _ = train_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        train_data = train_data.fillna(0)

        test_data, _ = test_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        test_data = test_data.fillna(0)

    # TODO: UPDATE if incremental, otherwise just test
    model_split = _update(job, train_data)
    results_df, auc = _test(
        model_split,
        test_data,
        evaluation=True,
        is_binary_classifier=_check_is_binary_classifier(job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split


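# A minimal, self-contained sketch of what the DataFrame.align calls above do
# (the frame and column names here are made up for illustration): align with
# axis=1 and join='right' keeps exactly the reference columns, dropping any
# extras and inserting NaN for missing ones, which fillna(0) then zero-fills.
def _align_example():
    import pandas as pd
    frame = pd.DataFrame({'a': [1], 'b': [2]})
    reference = pd.DataFrame(columns=['a', 'c'])
    aligned, _ = frame.align(reference, axis=1, join='right')
    return aligned.fillna(0)  # 'b' is dropped, 'c' appears as 0

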
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job
    configuration, loading from cache if possible

    :param job: job configuration
    :param use_cache: load saved datasets from cache if available
    :return: training and testing DataFrame
    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                        test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(job.split)
            else:
                training_log, test_log, additional_columns = prepare_logs(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find a better naming policy
                    job.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log,
            test_log,
            job,
            additional_columns=additional_columns)
    return training_df, test_df


def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations

    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric))  # TODO: WHY DO I NEED TO GET HYPEROPT?

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")

    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.save()

    logger.info("End hyperopt job {}, {} . Results {}".format(
        job.type, get_run(job), current_best['results']))
    return current_best['results'], current_best['config'], current_best['model_split']


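# Note: the manual minimum-loss scan in calculate_hyperopt above can also be
# expressed with hyperopt's built-in accessor, as a later revision of this
# function does; Trials.best_trial returns the trial with the lowest 'loss'
# among the trials that finished with STATUS_OK:
#
#     best_candidate = trials.best_trial['result']

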
def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split
    """
    model_split = None

    # TODO fixme: this needs to be fixed in the interface
    # if job['incremental_train']['base_model'] is not None:
    #     job['type'] = JobTypes.UPDATE.value

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrate me
    if job.type != JobTypes.LABELLING.value:
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results,
                len(model_split[job.predictive_model.predictive_model][0].classes_) <= 2)
    job.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))

    return results, model_split


def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split
    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = _init_clusterer(job.clustering, training_df)
        if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            results, model_split = classification(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            results, model_split = regression(training_df, test_df, clusterer, job)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            results, model_split = time_series_prediction(training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = update_and_test(training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrate me
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(seconds=time.time() - start_time)  # TODO: find a better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results,
                len(model_split[ModelType.CLASSIFIER.value][0].classes_) <= 2)
        job.evaluation.save()
    elif job.type == JobTypes.LABELLING.value:
        job.labelling.results = results
        job.labelling.save()

    if job.type == PredictiveModels.CLASSIFICATION.value:  # TODO: this is an old workaround that should be removed
        save_result(results, job, start_time)

    print("End job {}, {} .".format(job.type, get_run(job)))
    print("\tResults {} .".format(results))

    return results, model_split


def replay_task(replay_job: Job, training_initial_job: Job) -> list:
    """creates a replay task asking the server to simulate the arrival of events

    :param replay_job: replay job configuration
    :param training_initial_job: training job configuration
    :return: list of requests
    """
    logger.info("Start replay task ID {}".format(replay_job.id))
    requests = list()
    try:
        replay_job.status = JobStatuses.RUNNING.value
        replay_job.error = ''
        replay_job.save()
        requests = replay_core(replay_job, training_initial_job)
        replay_job.status = JobStatuses.COMPLETED.value
        for r in requests:
            if r.status_code != status.HTTP_201_CREATED:
                replay_job.error += str(r)  # error is a text field, so accumulate failed responses as strings
    except Exception as e:
        logger.error(e)
        replay_job.status = JobStatuses.ERROR.value
        replay_job.error += str(e.__repr__())
        raise e
    finally:
        replay_job.save()
        publish(replay_job)
    return requests


def cross_validated_regression(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job, cv=2) -> (dict, dict):
    """main regression entry point

    trains and tests the regressor using the provided data, with cross-validation

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :param cv: number of cross-validation folds (currently unused; _train is called with do_cv=True)
    :return: predictive_model scores and split
    """
    train_data, test_data = _prep_data(training_df, test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer, do_cv=True)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split


def replay_prediction(replay_job: Job, training_initial_job: Job, trace_id) -> list:
    """collects the timestamps of the events of one trace, then creates a list of
    requests simulating the log as time passes

    :param trace_id: index of the trace to replay
    :param replay_job: replay job configuration
    :param training_initial_job: training job configuration
    :return: list of requests
    """
    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()

    eventlog = EventLog()
    trace = log[int(trace_id)]
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for index in range(len(trace)):
        new_trace = Trace(trace[0:index])  # note: builds prefixes of length 0..len(trace)-1, never the full trace
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)

    replay_job.case_id = trace_id
    replay_job.event_number = len(trace)
    replay_job.save()
    try:
        logger.info("Sending request for replay_prediction task.")
        r = requests.post(
            url="http://127.0.0.1:8000/runtime/replay_prediction/",
            data=export_log_as_string(eventlog),
            params={'jobId': replay_job.id, 'training_job': training_initial_job.id},
            headers={'Content-Type': 'text/plain', 'charset': 'UTF-8'})
        requests_list.append(str(r))
    except Exception as e:
        requests_list.append(str(e))
        logger.warning(str(e))

    return requests_list


def classification(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(
        model_split,
        test_data,
        evaluation=True,
        is_binary_classifier=_check_is_binary_classifier(job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split


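# Sketch addressing the get_or_create TODO above (an assumption, not the
# project's current behaviour): Django's get_or_create would reuse an
# identical Encoding row instead of always creating a duplicate. Field names
# mirror the Encoding.objects.create call above.
def _get_or_create_encoding(job: Job) -> Encoding:
    encoding, _ = Encoding.objects.get_or_create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    return encoding

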
def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            # job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])  # TODO: replace with a simple CREATE
            job.predictive_model = PredictiveModel.init(
                job.predictive_model.get_full_dict()  # TODO: double-check that get_full_dict returns everything needed
            )  # TODO: future bug if the object changes
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: why am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()


def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            job.predictive_model = duplicate_orm_row(
                PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: why am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()


def save_models(models: dict, job: Job):
    logger.info("\tStart saving models of JOB {}".format(job.id))
    if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
        clusterer_filename = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
            job.id, job.split.id, job.type)
        joblib.dump(models[ModelType.CLUSTERER.value], clusterer_filename)
        job.clustering.model_path = clusterer_filename
        job.clustering.save()
        job.save()

    if job.type == JobTypes.UPDATE.value:
        job.type = JobTypes.PREDICTION.value  # TODO: why am I doing this?
        predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
            job.id, job.split.id, job.type, str(time.time()))
    else:
        predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
            job.id, job.split.id, job.type)
    joblib.dump(models[job.predictive_model.predictive_model], predictive_model_filename)
    job.predictive_model.model_path = predictive_model_filename
    job.predictive_model.save()
    job.save()


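# Illustrative counterpart to save_models (a sketch under the assumption that
# models are restored the same way they are dumped above): joblib.load rebuilds
# the estimators from the .sav paths stored on the job's model objects.
def _load_saved_models(job: Job):
    predictive_model = joblib.load(job.predictive_model.model_path)
    clusterer = None
    if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
        clusterer = joblib.load(job.clustering.model_path)
    return predictive_model, clusterer

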
def runtime_task(job: Job):
    """creates a runtime task asking the server for a single prediction

    :param job: job configuration
    """
    logger.info("Start runtime task ID {}".format(job.id))
    try:
        job.status = JobStatuses.RUNNING.value
        job.save()
        result = runtime_calculate(job)
        job.results = {'result': str(result)}
        job.status = JobStatuses.COMPLETED.value
        job.error = ''
    except Exception as e:
        logger.error(e)
        job.status = JobStatuses.ERROR.value
        job.error = str(e.__repr__())
        raise e
    finally:
        job.save()
        publish(job)


def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job
    configuration, loading from cache if possible

    :param job: job configuration
    :param use_cache: load saved datasets from cache if available
    :return: training and testing DataFrame
    """
    logger.info('\tGetting Dataset')
    if use_cache and \
       (job.predictive_model is not None and
            job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError: pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError: pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    search_for_already_existing_split = Split.objects.filter(
                        type=SplitTypes.SPLIT_DOUBLE.value,
                        original_log=job.split.original_log,
                        test_size=job.split.test_size,
                        splitting_method=job.split.splitting_method)
                    if len(search_for_already_existing_split) >= 1:
                        job.split = search_for_already_existing_split[0]
                        job.split.save()
                        job.save()
                        return get_encoded_logs(job, use_cache=use_cache)
                    else:
                        # job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])  # TODO: replace with a simple CREATE
                        job.split = Split.objects.create(
                            type=job.split.type,
                            original_log=job.split.original_log,
                            test_size=job.split.test_size,
                            splitting_method=job.split.splitting_method,
                            train_log=job.split.train_log,
                            test_log=job.split.test_log,
                            additional_columns=job.split.additional_columns)  # TODO: future bug if the object changes
                        job.split.type = SplitTypes.SPLIT_DOUBLE.value
                        train_name = 'SPLITTED_' + job.split.original_log.name.split('.')[0] + \
                                     '_0-' + str(int(100 - (job.split.test_size * 100)))
                        job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                        test_name = 'SPLITTED_' + job.split.original_log.name.split('.')[0] + \
                                    '_' + str(int(100 - (job.split.test_size * 100))) + '-100'
                        job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                        job.split.additional_columns = str(train_name + test_name)  # TODO: find a better naming policy
                        job.split.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(
            training_log,
            test_log,
            job,
            additional_columns=additional_columns)
    return training_df, test_df


def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations

    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric))  # TODO: WHY DO I NEED TO GET HYPEROPT?

    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).algorithm_type]

    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)  # TODO: find a better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type, job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time

    job.evaluation = Evaluation.init(job.predictive_model.predictive_model,
                                     results,
                                     len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))
    return results, best_candidate['config'], best_candidate['model_split']


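# Hypothetical sketch of _retrieve_train_validate_test, which is not defined in
# this module: based on the inline holdout logic of the other revision of
# calculate_hyperopt, the original test set is kept as the validation set and
# the tail 20% of the training set becomes the internal test set.
def _retrieve_train_validate_test_sketch(train_df: DataFrame, test_df: DataFrame):
    validation_df = test_df
    holdout_df = train_df.tail(int(len(train_df) * 20 / 100))
    remaining_train_df = train_df.drop(holdout_df.index)
    return remaining_train_df, validation_df, holdout_df

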
def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict, dict):
    """runs the specified training/evaluation run

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: results and predictive_model split
    """
    model_split = None

    start_time = time.time()
    if job.type == JobTypes.PREDICTION.value:
        clusterer = init_clusterer(job.clustering, training_df)
        results, model_split = MODEL[job.predictive_model.predictive_model][
            ModelActions.BUILD_MODEL_AND_TEST.value](training_df, test_df, clusterer, job)
    elif job.type == JobTypes.LABELLING.value:
        results = _label_task(training_df)
    elif job.type == JobTypes.UPDATE.value:
        results, model_split = MODEL[job.predictive_model.predictive_model][
            ModelActions.UPDATE_AND_TEST.value](training_df, test_df, job)
    else:
        raise ValueError("Type {} not supported".format(job.type))

    # TODO: integrate me
    if job.type != JobTypes.LABELLING.value:
        results['elapsed_time'] = timedelta(seconds=time.time() - start_time)  # TODO: find a better place for this
        if job.predictive_model.predictive_model == PredictiveModels.REGRESSION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        elif job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model,
                results,
                len(set(test_df['label'])) <= 2)
        elif job.predictive_model.predictive_model == PredictiveModels.TIME_SERIES_PREDICTION.value:
            job.evaluation = Evaluation.init(
                job.predictive_model.predictive_model, results)
        job.evaluation.save()
        job.save()
    elif job.type == JobTypes.LABELLING.value:
        # job.labelling = duplicate_orm_row(job.labelling)  # TODO: replace with a simple CREATE
        job.labelling = Labelling.objects.create(
            type=job.labelling.type,
            attribute_name=job.labelling.attribute_name,
            threshold_type=job.labelling.threshold_type,
            threshold=job.labelling.threshold)  # TODO: future bug if the object changes
        job.labelling.results = results
        job.labelling.save()
        job.save()

    # if job.type == PredictiveModels.CLASSIFICATION.value:  # TODO: this is an old workaround that should be removed
    #     save_result(results, job, start_time)

    logger.info("End job {}, {} .".format(job.type, get_run(job)))
    logger.info("\tResults {} .".format(results))

    return results, model_split


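# Hypothetical shape of the MODEL dispatch table used above (inferred from the
# branches it replaces in the earlier revisions of run_by_type; the actual
# registry lives elsewhere in the project):
#
#     MODEL = {
#         PredictiveModels.CLASSIFICATION.value: {
#             ModelActions.BUILD_MODEL_AND_TEST.value: classification,
#             ModelActions.UPDATE_AND_TEST.value: update_and_test,
#         },
#         PredictiveModels.REGRESSION.value: {
#             ModelActions.BUILD_MODEL_AND_TEST.value: regression,
#         },
#         PredictiveModels.TIME_SERIES_PREDICTION.value: {
#             ModelActions.BUILD_MODEL_AND_TEST.value: time_series_prediction,
#         },
#     }

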
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """creates a replay prediction task asking the server for a single prediction on a portion of a trace

    :param replay_prediction_job: replay prediction job configuration
    :param training_initial_job: training job configuration
    :param log: log to replay
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            # new_replay_prediction_job = duplicate_orm_row(prediction_job)  # TODO: replace with a simple CREATE
            new_replay_prediction_job = Job.objects.create(
                created_date=prediction_job.created_date,
                modified_date=prediction_job.modified_date,
                error=prediction_job.error,
                status=prediction_job.status,
                type=prediction_job.type,
                create_models=prediction_job.create_models,
                case_id=prediction_job.case_id,
                event_number=prediction_job.event_number,
                gold_value=prediction_job.gold_value,
                results=prediction_job.results,
                parent_job=prediction_job.parent_job,
                split=prediction_job.split,
                encoding=prediction_job.encoding,
                labelling=prediction_job.labelling,
                clustering=prediction_job.clustering,
                predictive_model=prediction_job.predictive_model,
                evaluation=prediction_job.evaluation,
                hyperparameter_optimizer=prediction_job.hyperparameter_optimizer,
                incremental_train=prediction_job.incremental_train)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)


def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations

    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric))  # TODO: WHY DO I NEED TO GET HYPEROPT?

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    # TODO: evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))  # the last 20% of the training set becomes the internal test set
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")

    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)  # TODO: find a better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    # TODO: evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'],
            validation_df.drop(['trace_id'], axis=1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type))
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2)
        job.evaluation.save()
        job.save()

    if holdout:
        logger.info(
            "End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}."
            .format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(
            job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']


def check_predictive_model_not_overwrite(job: Job) -> None:
    if job.hyperparameter_optimizer.optimization_method != HyperparameterOptimizationMethods.NONE.value:
        job.predictive_model = duplicate_orm_row(
            PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
        job.predictive_model.save()
        job.save()


def replay_core(replay_job: Job, training_initial_job: Job) -> list:
    """collects the timestamps of the log's events, then creates a list of
    requests simulating the log as time passes

    :param replay_job: replay job configuration
    :param training_initial_job: training job configuration
    :return: list of requests
    """
    split = replay_job.split
    log = get_log(split.train_log)
    requests_list = list()

    eventlog = EventLog()
    for key in log.attributes.keys():
        eventlog.attributes[key] = log.attributes[key]
    for trace in log:
        new_trace = Trace(trace)
        for key in trace.attributes:
            new_trace.attributes[key] = trace.attributes[key]
        eventlog.append(new_trace)

    times = sorted(set([event['time:timestamp'] for trace in eventlog for event in trace]))

    for t in times[2::int((len(times) - 2) / 5)]:
        filtered_eventlog = timestamp_filter.apply_events(
            eventlog, times[0].replace(tzinfo=None), t.replace(tzinfo=None))
        trace_list = list()
        event_number = dict()
        for trace in filtered_eventlog:
            trace_list.append(trace.attributes['concept:name'])
            event_number[trace.attributes['concept:name']] = len(trace)
        replay_job.case_id = trace_list
        replay_job.event_number = event_number
        replay_job.save()
        try:  # TODO: check logger usage
            logger.info("Sending request for replay_prediction task.")
            r = requests.post(
                url="http://server:8000/runtime/replay_prediction/",
                data=export_log_as_string(filtered_eventlog),
                params={'jobId': replay_job.id, 'training_job': training_initial_job.id},
                headers={'Content-Type': 'text/plain', 'charset': 'UTF-8'})
            requests_list.append(str(r))
        except Exception as e:
            requests_list.append(str(e))
            logger.warning(str(e))

    training_log, test_log, additional_columns = get_train_test_log(replay_job.split)
    training_df, _ = encode_label_logs(training_log, test_log, replay_job,
                                       additional_columns=additional_columns)

    gold_values = dict(zip(training_df['trace_id'], training_df['label']))
    parent_id = replay_job.id
    # final_job = duplicate_orm_row(replay_job)  # TODO: replace with a simple CREATE
    final_job = Job.objects.create(
        created_date=replay_job.created_date,
        modified_date=replay_job.modified_date,
        error=replay_job.error,
        status=replay_job.status,
        type=replay_job.type,
        create_models=replay_job.create_models,
        case_id=replay_job.case_id,
        event_number=replay_job.event_number,
        gold_value=replay_job.gold_value,
        results=replay_job.results,
        parent_job=replay_job.parent_job,
        split=replay_job.split,
        encoding=replay_job.encoding,
        labelling=replay_job.labelling,
        clustering=replay_job.clustering,
        predictive_model=replay_job.predictive_model,
        evaluation=replay_job.evaluation,
        hyperparameter_optimizer=replay_job.hyperparameter_optimizer,
        incremental_train=replay_job.incremental_train)
    final_job.parent_job = Job.objects.filter(pk=parent_id)[0]
    final_job.gold_value = gold_values
    final_job.type = JobTypes.REPLAY_PREDICT.value
    final_job.save()
    return requests_list


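# Note on the snapshot stride in replay_core: times[2::int((len(times) - 2) / 5)]
# skips the first two timestamps and then samples the rest at a fixed step, so a
# handful of replay snapshots are sent regardless of log size. For example:
#
#     times = list(range(100))
#     times[2::int((100 - 2) / 5)]  # -> [2, 21, 40, 59, 78, 97]

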
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """creates a replay prediction task asking the server for a single prediction on a portion of a trace

    :param replay_prediction_job: replay prediction job configuration
    :param training_initial_job: training job configuration
    :param log: log to replay
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)