def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """Returns the encoded logs

    Returns the training and test DataFrames encoded using the given job configuration,
    loading from cache if possible.
    :param job: job configuration
    :param use_cache: whether to load saved datasets from the cache
    :return: training and testing DataFrame
    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            # encoded DataFrames are already cached for this configuration
            training_df, test_df = get_labelled_logs(job)
        else:
            if job.split.train_log is not None and \
                    job.split.test_log is not None and \
                    LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                             test_log=job.split.test_log.path).exists():
                # parsed event logs are already cached for this split
                training_log, test_log, additional_columns = get_loaded_logs(job.split)
            else:
                training_log, test_log, additional_columns = prepare_logs(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    # materialise the single split as a double split and persist both partitions
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(
                training_log, test_log, job, additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
def upload_multiple(request):
    """Uploads a training and a test event log and creates a double Split from them."""
    print('Double upload request received.')
    test_log = create_log(request.FILES['testSet'], request.FILES['testSet'].name)
    train_log = create_log(request.FILES['trainingSet'], request.FILES['trainingSet'].name)
    item = Split.objects.create(type='double', train_log=train_log, test_log=test_log)
    serializer = SplitSerializer(item)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
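# Minimal client-side sketch of the double upload above; the endpoint path
# '/splits/multiple' and the localhost host are assumptions, adjust them to the
# project's actual URL configuration.
import requests

with open('train.xes', 'rb') as training_set, open('test.xes', 'rb') as test_set:
    response = requests.post(
        'http://localhost:8000/splits/multiple',  # hypothetical route
        files={'trainingSet': training_set, 'testSet': test_set})
print(response.status_code, response.json())  # expect 201 and the serialized Split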
def get_train_test_log(split: Split):
    """Returns the training log, the test log and the additional columns for the given split"""
    if split.type == SplitTypes.SPLIT_SINGLE.value and Split.objects.filter(
        type=SplitTypes.SPLIT_DOUBLE.value,
        original_log=split.original_log,
        test_size=split.test_size,
        splitting_method=split.splitting_method
    ).exists() and split.splitting_method != SplitOrderingMethods.SPLIT_RANDOM.value:
        # a double split derived from the same single log already exists: reuse it
        return get_train_test_log(Split.objects.filter(
            type=SplitTypes.SPLIT_DOUBLE.value,
            original_log=split.original_log,
            test_size=split.test_size,
            splitting_method=split.splitting_method
        )[0])
    elif split.original_log is not None and (not Split.objects.filter(
        type=SplitTypes.SPLIT_DOUBLE.value,
        original_log=split.original_log,
        test_size=split.test_size,
        splitting_method=split.splitting_method
    ).exists() or split.splitting_method == SplitOrderingMethods.SPLIT_RANDOM.value):
        # split the single log and, for deterministic orderings, persist the result as a double split
        training_log, test_log = _split_single_log(split)
        additional_columns = get_additional_columns(get_log(split.original_log))
        if split.splitting_method != SplitOrderingMethods.SPLIT_RANDOM.value:
            _ = Split.objects.get_or_create(
                type=SplitTypes.SPLIT_DOUBLE.value,
                original_log=split.original_log,
                test_size=split.test_size,
                splitting_method=split.splitting_method,
                train_log=create_log(EventLog(training_log), '0-' + str(100 - int(split.test_size * 100)) + '.xes'),
                test_log=create_log(EventLog(test_log), str(100 - int(split.test_size * 100)) + '-100.xes'),
                additional_columns=split.additional_columns
            )[0]
        logger.info("\t\tLoaded single log from {}".format(split.original_log.path))
    else:
        # Have to use sklearn to convert some internal data types
        training_log = get_log(split.train_log)
        additional_columns = get_additional_columns(training_log)
        if split.additional_columns is None:
            split.additional_columns = split.train_log.name + split.test_log.name + '_ac.xes'
            split.save()
        training_log, train_log_to_append = train_test_split(training_log, test_size=0, shuffle=False)
        test_log, test_log_to_append = train_test_split(get_log(split.test_log), test_size=0, shuffle=False)
        logger.info("\t\tLoaded double logs from {} and {}.".format(split.train_log.path, split.test_log.path))
    if len(training_log) == 0:
        raise TypeError("Training log is empty. Create a new Split with better parameters")
    return training_log, test_log, additional_columns
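# Usage sketch for get_train_test_log, assuming a Split row already exists in the
# database; the primary key below is hypothetical.
split = Split.objects.get(pk=1)  # hypothetical id
training_log, test_log, additional_columns = get_train_test_log(split)
print(len(training_log), len(test_log))  # number of traces in each partition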
def progetto_padova():
    JOB = Job.objects.get_or_create(
        status=JobStatuses.CREATED.value,
        type=JobTypes.PREDICTION.value,
        split=Split.objects.get_or_create(  # this creates the split of the log
            type=SplitTypes.SPLIT_DOUBLE.value,
            train_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_TRAIN_PATH),
                RELATIVE_TRAIN_PATH,
                BASE_DIR,
                import_in_cache=False),
            test_log=create_log(  # this imports the log
                import_log(BASE_DIR + RELATIVE_VALIDATION_PATH),
                RELATIVE_VALIDATION_PATH,
                BASE_DIR,
                import_in_cache=False))[0],
        encoding=Encoding.objects.get_or_create(  # this defines the encoding method
            data_encoding=DataEncodings.LABEL_ENCODER.value,
            value_encoding=ValueEncodings.SIMPLE_INDEX.value,
            add_elapsed_time=False,
            add_remaining_time=False,
            add_executed_events=False,
            add_resources_used=False,
            add_new_traces=False,
            prefix_length=5,
            padding=True,
            task_generation_type=TaskGenerationTypes.ALL_IN_ONE.value,
            features=[])[0],
        labelling=Labelling.objects.get_or_create(  # this defines the label
            type=LabelTypes.ATTRIBUTE_STRING.value,
            attribute_name='label',
            threshold_type=None,
            threshold=None)[0],
        clustering=Clustering.init(ClusteringMethods.NO_CLUSTER.value, configuration={}),
        predictive_model=PredictiveModel.init(  # this defines the predictive model
            get_prediction_method_config(
                PredictiveModels.CLASSIFICATION.value,
                ClassificationMethods.DECISION_TREE.value,
                payload={
                    'max_depth': 2,
                    'min_samples_split': 2,
                    'min_samples_leaf': 2
                })),
        hyperparameter_optimizer=HyperparameterOptimization.init({  # this defines the hyperparameter optimisation procedure
            'type': HyperparameterOptimizationMethods.HYPEROPT.value,
            'max_evaluations': 10,
            'performance_metric': HyperOptLosses.AUC.value,
            'algorithm_type': HyperOptAlgorithms.TPE.value
        }),
        create_models=True)[0]

    # load log
    train_log, test_log, additional_columns = get_train_test_log(JOB.split)

    # encode
    train_df, test_df = encode_label_logs(train_log, test_log, JOB)

    # train + evaluate
    results, model_split = MODEL[JOB.predictive_model.predictive_model][ModelActions.BUILD_MODEL_AND_TEST.value](
        train_df, test_df, _init_clusterer(JOB.clustering, train_df), JOB)

    if JOB.create_models:
        save_models(model_split, JOB)

    # predict
    data_df = pd.concat([train_df, test_df])
    results = MODEL[JOB.predictive_model.predictive_model][ModelActions.PREDICT.value](JOB, data_df)
    results = MODEL[JOB.predictive_model.predictive_model][ModelActions.PREDICT_PROBA.value](JOB, data_df)

    # lime
    exp = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=JOB.split,  # this defines the analysed log, you can use a different one from the training one
        predictive_model=JOB.predictive_model,
        job=JOB)[0]
    error, result = explanation(exp.id, int(EXPLANATION_TARGET))
def post(self, request):
    """Uploads a single event log and returns its serialized representation."""
    log = create_log(self.request.FILES['single'], self.request.FILES['single'].name)
    serializer = LogSerializer(log)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """Returns the encoded logs

    Returns the training and test DataFrames encoded using the given job configuration,
    loading from cache if possible.
    :param job: job configuration
    :param use_cache: whether to load saved datasets from the cache
    :return: training and testing DataFrame
    """
    logger.info('\tGetting Dataset')
    if use_cache and \
            (job.predictive_model is not None and
             job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError:  # cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError: pre-labelled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
                    job.split.test_log is not None and \
                    LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError: pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    # materialise the single split as a double split and persist both partitions
                    job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log), train_name + '.xes')
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log), test_name + '.xes')
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.split.save()
                put_loaded_logs(job.split, training_log, test_log, additional_columns)
            training_df, test_df = encode_label_logs(
                training_log, test_log, job, additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job,
                                                 additional_columns=additional_columns)
    return training_df, test_df
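# Usage sketch for get_encoded_logs, assuming a fully configured Job (split, encoding,
# labelling) already exists in the database; the primary key below is hypothetical.
# Passing use_cache=False bypasses the LabelledLog/LoadedLog caches and re-splits and
# re-encodes the logs from scratch.
job = Job.objects.get(pk=1)  # hypothetical id
train_df, test_df = get_encoded_logs(job, use_cache=True)
print(train_df.shape, test_df.shape)  # encoded feature matrices
train_df, test_df = get_encoded_logs(job, use_cache=False)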