def create_prediction_job(job: Job, max_len: int) -> Job:
    new_job = duplicate_orm_row(job)
    new_job.type = JobTypes.PREDICTION.value
    new_job.status = JobStatuses.CREATED.value
    new_encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.id)[0])
    new_encoding.prefix_length = max_len
    new_encoding.save()
    new_job.encoding = new_encoding
    new_job.create_models = True
    new_job.save()
    return new_job


def replay_prediction_task(replay_prediction_job, training_initial_job, log):
    logger.info("Start replay_prediction task ID {}".format(
        replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            prediction_job = create_prediction_job(training_initial_job,
                                                   max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(
                pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job,
                                   log)
            return
        result = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = {'result': str(result)}
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)
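All of these snippets clone Django model rows through a duplicate_orm_row helper. The helper itself is not shown on this page, but a minimal sketch, assuming the common Django clone-by-clearing-the-primary-key pattern (the project's actual implementation may differ), could look like this:

def duplicate_orm_row(instance):
    """Hypothetical sketch: insert a copy of a Django model instance.

    Clearing pk/id and saving writes a new row. This is a shallow copy:
    related objects are shared, not cloned, and the passed-in instance
    itself now points at the new row.
    """
    instance.pk = None
    instance.id = None
    instance.save()
    return instance

That in-place behaviour would also explain why several examples re-fetch the original row by primary key right after duplicating it instead of reusing the variable they passed in.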
Example #3
def regression(training_df: DataFrame, test_df: DataFrame,
               clusterer: Clustering, job: Job) -> (dict, dict):
    """main regression entry point

    train and tests the regressor using the provided data

    :param clusterer:
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data, test_data = _prep_data(training_df, test_df)

    job.encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.pk)[0]
    )  # TODO: maybe here would be better an intelligent get_or_create...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
Example #4
def classification(training_df: DataFrame, test_df: DataFrame,
                   clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    train and tests the classifier using the provided data

    :param clusterer:
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split

    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = duplicate_orm_row(
        job.encoding
    )  #TODO: maybe here would be better an intelligent get_or_create...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(model_split,
                            test_data,
                            evaluation=True,
                            is_binary_classifier=_check_is_binary_classifier(
                                job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
Example #5
def set_model_name(job: Job) -> None:
    if job.create_models:
        if job.predictive_model.model_path != '':
            job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
            job.predictive_model.save()
            job.save()

        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id,
                job.split.id,
                job.type)
            job.clustering.save()

        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value #TODO: Y am I doing this?
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id,
                job.split.id,
                job.type,
                str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id,
                job.split.id,
                job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
Example #6
def post_replay(request):
    """ Post request to start a demo of a log arriving to server

        :param request: json
        :return: Response
    """
    jobs = []
    data = request.data
    split_id = int(data['splitId'])
    job_id = int(data['jobId'])

    split = Split.objects.get(pk=split_id)

    try:
        training_initial_job = Job.objects.get(pk=job_id)
        new_job = duplicate_orm_row(training_initial_job)
        new_job.type = JobTypes.REPLAY.value
        new_job.status = JobStatuses.CREATED.value
        new_job.split = split
        new_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    django_rq.enqueue(replay_task, new_job, training_initial_job)
    serializer = JobSerializer(jobs, many=True)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
Example #7
def post_replay_prediction(request):
    """ Post request to have a single prediction during the replay of a log

        :param request: json
        :return: Response
    """
    jobs = []
    job_id = int(request.query_params['jobId'])
    training_initial_job_id = int(request.query_params['training_job'])
    logger.info("Creating replay_prediction task")

    try:
        training_initial_job = Job.objects.get(pk=training_initial_job_id)
        replay_job = Job.objects.filter(pk=job_id)[0]
        replay_prediction_job = duplicate_orm_row(replay_job)
        replay_prediction_job.parent_job = Job.objects.filter(pk=job_id)[0]
        replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
        replay_prediction_job.status = JobStatuses.CREATED.value
        replay_prediction_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    logger.info("Enqueuing replay_prediction task ID {}".format(
        replay_prediction_job.id))
    log = import_log_from_string(request.data.decode('utf-8'))
    django_rq.enqueue(replay_prediction_task, replay_prediction_job,
                      training_initial_job, log)
    serializer = JobSerializer(jobs, many=True)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
Example #8
def post_prediction(request):
    """ Post request to have a single static prediction

        :param request: json
        :return: Response
    """
    jobs = []
    data = request.data
    job_id = int(data['jobId'])
    split_id = int(data['splitId'])
    split = Split.objects.get(pk=split_id)

    try:
        job = Job.objects.get(pk=job_id)
        new_job = duplicate_orm_row(job)
        new_job.type = JobTypes.RUNTIME.value
        new_job.status = JobStatuses.CREATED.value
        new_job.split = split
        new_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(job_id) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)

    django_rq.enqueue(runtime_task, new_job)
    serializer = JobSerializer(jobs, many=True)
    return Response(serializer.data, status=status.HTTP_201_CREATED)
Example #9
def create_prediction_job(job: Job, max_len: int) -> Job:
    """ The function create a new prediction job to create a model when it isn't in the database

        :param job: job dictionary
        :param max_len: job dictionary
        :return: Job
    """
    new_job = duplicate_orm_row(job)
    new_job.type = JobTypes.PREDICTION.value
    new_job.status = JobStatuses.CREATED.value
    new_encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.id)[0])
    new_encoding.prefix_length = max_len
    new_encoding.save()
    new_job.encoding = new_encoding
    new_job.create_models = True
    new_job.save()
    return new_job
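A short usage sketch, mirroring how replay_prediction_task calls this function above (the log and training_initial_job variables are assumed to be in scope):

# Hypothetical usage: size the encoding to the longest trace in a loaded log,
# then train a model for that prefix length.
max_len = max(len(trace) for trace in log)
prediction_job = create_prediction_job(training_initial_job, max_len)
prediction_task(prediction_job.id)
prediction_job.refresh_from_db()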
Example #10
def get_encoded_logs(job: Job,
                     use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    print('\tGetting Dataset')
    if use_cache:
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            training_df, test_df = get_labelled_logs(job)

        else:
            if job.split.train_log is not None and \
                job.split.test_log is not None and \
                LoadedLog.objects.filter(train_log=job.split.train_log.path,
                                         test_log=job.split.test_log.path).exists():
                training_log, test_log, additional_columns = get_loaded_logs(
                    job.split)

            else:
                training_log, test_log, additional_columns = prepare_logs(
                    job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(job.split)
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(
                        int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(EventLog(training_log),
                                                     train_name + '.xes')
                    test_name = str(int(100 -
                                        (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(EventLog(test_log),
                                                    test_name + '.xes')
                    job.split.additional_columns = str(
                        train_name +
                        test_name)  # TODO: find better naming policy
                    job.split.save()
                    job.save()

                put_loaded_logs(job.split, training_log, test_log,
                                additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = prepare_logs(job.split)
        training_df, test_df = encode_label_logs(
            training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
Example #11
def _calculate_and_evaluate(args) -> dict:
    global trial_number
    if trial_number % 20 == 0:
        print("Trial {}".format(trial_number))
    trial_number += 1
    local_job = global_job

    predictive_model = local_job.predictive_model.predictive_model
    prediction_method = local_job.predictive_model.prediction_method

    model_config = {
        'predictive_model': predictive_model,
        'prediction_method': prediction_method,
        **args
    }

    new_predictive_model = PredictiveModel.init(model_config)
    local_job.predictive_model = duplicate_orm_row(new_predictive_model)
    local_job = duplicate_orm_row(local_job)

    performance_metric = local_job.hyperparameter_optimizer.__getattribute__(
        local_job.hyperparameter_optimizer.optimization_method.lower(
        )).performance_metric
    multiplier = _get_metric_multiplier(performance_metric)

    try:
        results, model_split = run_by_type(training_df.copy(), test_df.copy(),
                                           local_job)
        return {
            'loss': -results[performance_metric] * multiplier,
            'status': STATUS_OK,
            'results': results,
            'model_split': model_split,
            'config': model_config
        }
    except Exception:
        return {
            'loss': 100,
            'status': STATUS_FAIL,
            'results': {},
            'config': {}
        }
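This objective follows hyperopt's convention of returning a dict with 'loss' and 'status'. A hedged sketch of how such an objective is typically driven with hyperopt's fmin (the search space and optimizer settings here are illustrative, not the project's actual wiring):

from hyperopt import Trials, fmin, hp, tpe

# Illustrative search space; the real project derives it from the job's
# predictive model and prediction method.
space = {
    'n_estimators': hp.choice('n_estimators', [10, 50, 100]),
    'max_depth': hp.choice('max_depth', [3, 5, None]),
}

trials = Trials()
best = fmin(
    fn=_calculate_and_evaluate,  # returns {'loss': ..., 'status': ..., ...}
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials)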
Example #12
    def test_update(self):
        job = create_test_job()
        prediction_task(job.id)

        job2 = duplicate_orm_row(job)
        job.refresh_from_db()
        job2.incremental_train = job
        job2.type = JobTypes.UPDATE.value
        job2.save()

        initial_job = job2  #.to_dict()

        generated_job = update(split=job.split,
                               payload={
                                   'type': 'classification',
                                   'split_id': 1,
                                   'config': {
                                       'clusterings': ['noCluster'],
                                       'encodings': ['simpleIndex'],
                                       'encoding': {
                                           'padding': False,
                                           'prefix_length': 1,
                                           'generation_type': 'only',
                                           'add_remaining_time': False,
                                           'add_elapsed_time': False,
                                           'add_executed_events': False,
                                           'add_resources_used': False,
                                           'add_new_traces': False,
                                           'features': [],
                                       },
                                       'create_models': False,
                                       'methods': ['randomForest'],
                                       'kmeans': {},
                                       'incremental_train': [job.id],
                                       'hyperparameter_optimizer': {
                                           'algorithm_type': 'tpe',
                                           'max_evaluations': 10,
                                           'performance_metric': 'rmse',
                                           'type': 'none',
                                       },
                                       'labelling': {
                                           'type': 'next_activity',
                                           'attribute_name': '',
                                           'threshold_type': 'threshold_mean',
                                           'threshold': 0,
                                       }
                                   }
                               })[0]  #.to_dict()
Example #13
    def test_replay(self):

        job = create_test_job()
        runtime_job = duplicate_orm_row(job)

        runtime_log = create_test_log(
            log_name='runtime_example.xes',
            log_path='cache/log_cache/test_logs/runtime_test.xes')
        runtime_job.split = create_test_split(
            split_type=SplitTypes.SPLIT_DOUBLE.value,
            split_ordering_method=SplitOrderingMethods.SPLIT_SEQUENTIAL.value,
            train_log=runtime_log,
            test_log=runtime_log)

        requests = replay_task(runtime_job, job)
        self.assertEqual(len(requests), 2)
Example #14
def save_randomised_set(initial_split_obj):
    # todo: save new dataset in memory and create split to use it
    new_split = duplicate_orm_row(initial_split_obj)

    # TODO future bug creates shadows,
    train_log = Log.objects.get_or_create(
        name='RETRAIN' + new_split.train_log.name,
        path='cache/log_cache/' + 'RETRAIN' + new_split.train_log.name,
        properties={})[0]
    test_log = Log.objects.get_or_create(
        name='RETRAIN' + new_split.test_log.name,
        path='cache/log_cache/' + 'RETRAIN' + new_split.test_log.name,
        properties={})[0]

    new_split.train_log = train_log
    new_split.test_log = test_log
    new_split.additional_columns = None
    new_split.save()
    return new_split
Example #15
def get_prediction(request, pk, explanation_target):
    """ Post request to start a demo of a log arriving to server

        :param pk:
        :param explanation_target:
        :param request: json
        :return: Response
    """
    try:
        training_initial_job = Job.objects.get(pk=pk)
        new_job = duplicate_orm_row(training_initial_job)
        new_job.type = JobTypes.REPLAY.value
        new_job.status = JobStatuses.CREATED.value
        new_job.save()
    except Job.DoesNotExist:
        return Response({'error': 'Job ' + str(pk) + ' not in database'},
                        status=status.HTTP_404_NOT_FOUND)
    return Response(replay_predictions(new_job, Job.objects.get(pk=pk),
                                       explanation_target),
                    status=status.HTTP_200_OK)
Example #16
def replay_prediction_task(replay_prediction_job: Job, training_initial_job: Job, log: Log):
    """ The function create a replat prediction task to ask a single prediction to the server for a portion of a trace

        :param replay_prediction_job: job dictionary
        :param training_initial_job: job dictionary
        :param log: job dictionary
    """
    logger.info("Start replay_prediction task ID {}".format(replay_prediction_job.id))
    try:
        replay_prediction_job.status = JobStatuses.RUNNING.value
        replay_prediction_job.save()
        max_len = max(len(trace) for trace in log)
        if replay_prediction_job.encoding.prefix_length != max_len:
            prediction_job = create_prediction_job(training_initial_job, max_len)
            prediction_task(prediction_job.id)
            prediction_job.refresh_from_db()
            new_replay_prediction_job = duplicate_orm_row(prediction_job)
            new_replay_prediction_job.split = Split.objects.filter(pk=replay_prediction_job.split.id)[0]
            new_replay_prediction_job.type = JobTypes.REPLAY_PREDICT.value
            new_replay_prediction_job.parent_job = replay_prediction_job.parent_job
            new_replay_prediction_job.status = JobStatuses.CREATED.value
            replay_prediction_task(new_replay_prediction_job, prediction_job, log)
            return
        result_dict, events_for_trace = replay_prediction_calculate(replay_prediction_job, log)
        replay_prediction_job.results = dict(result_dict)
        replay_prediction_job.event_number = dict(events_for_trace)
        replay_prediction_job.status = JobStatuses.COMPLETED.value
        replay_prediction_job.error = ''
    except Exception as e:
        logger.error(e)
        replay_prediction_job.status = JobStatuses.ERROR.value
        replay_prediction_job.error = str(e.__repr__())
        raise e
    finally:
        replay_prediction_job.save()
        publish(replay_prediction_job)
Example #17
def get_encoded_logs(job: Job, use_cache: bool = True) -> (DataFrame, DataFrame):
    """returns the encoded logs

    returns the training and test DataFrames encoded using the given job configuration, loading from cache if possible
    :param job: job configuration
    :param use_cache: load or not saved datasets from cache
    :return: training and testing DataFrame

    """
    logger.info('\tGetting Dataset')
    if use_cache and \
        (job.predictive_model is not None and
         job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value):
        if LabelledLog.objects.filter(split=job.split,
                                      encoding=job.encoding,
                                      labelling=job.labelling).exists():
            try:
                training_df, test_df = get_labelled_logs(job)
            except FileNotFoundError: #cache invalidation
                LabelledLog.objects.filter(split=job.split,
                                           encoding=job.encoding,
                                           labelling=job.labelling).delete()
                logger.info('\t\tError pre-labeled cache invalidated!')
                return get_encoded_logs(job, use_cache)
        else:
            if job.split.train_log is not None and \
               job.split.test_log is not None and \
               LoadedLog.objects.filter(split=job.split).exists():
                try:
                    training_log, test_log, additional_columns = get_loaded_logs(job.split)
                except FileNotFoundError:  # cache invalidation
                    LoadedLog.objects.filter(split=job.split).delete()
                    logger.info('\t\tError pre-loaded cache invalidated!')
                    return get_encoded_logs(job, use_cache)
            else:
                training_log, test_log, additional_columns = get_train_test_log(job.split)
                if job.split.type == SplitTypes.SPLIT_SINGLE.value:
                    job.split = duplicate_orm_row(Split.objects.filter(pk=job.split.pk)[0])
                    job.split.type = SplitTypes.SPLIT_DOUBLE.value
                    train_name = '0-' + str(int(100 - (job.split.test_size * 100)))
                    job.split.train_log = create_log(
                        EventLog(training_log),
                        train_name + '.xes'
                    )
                    test_name = str(int(100 - (job.split.test_size * 100))) + '-100'
                    job.split.test_log = create_log(
                        EventLog(test_log),
                        test_name + '.xes'
                    )
                    job.split.additional_columns = str(train_name + test_name)  # TODO: find better naming policy
                    job.split.save()

                put_loaded_logs(job.split, training_log, test_log, additional_columns)

            training_df, test_df = encode_label_logs(
                training_log,
                test_log,
                job,
                additional_columns=additional_columns)
            put_labelled_logs(job, training_df, test_df)
    else:
        training_log, test_log, additional_columns = get_train_test_log(job.split)
        training_df, test_df = encode_label_logs(training_log, test_log, job, additional_columns=additional_columns)
    return training_df, test_df
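A hedged usage sketch (the job lookup and primary key are illustrative):

# Hypothetical usage: load an existing Job and build its encoded train/test
# DataFrames, reusing the LabelledLog/LoadedLog caches when available.
job = Job.objects.get(pk=42)  # pk is illustrative
training_df, test_df = get_encoded_logs(job, use_cache=True)
print(training_df.shape, test_df.shape)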
Example #18
def check_predictive_model_not_overwrite(job: Job) -> None:
    if job.hyperparameter_optimizer.optimization_method != HyperparameterOptimizationMethods.NONE.value:
        job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
        job.predictive_model.save()
        job.save()
Example #19
    def handle(self, *args, **kwargs):
        TARGET_JOB = 439
        initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

        # todo: return performances
        print('Initial Job:', initial_job_obj.evaluation.classificationmetrics
              )  # TODO future bug

        training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
        training_df = training_df_old.copy()
        test_df = test_df_old.copy()

        # todo: what should I randomise?
        TARGETS = [
            [('prefix_1', 2)],  # <- simple pattern
            [('prefix_2', 3)],  # <- simple pattern
            [
                ('prefix_3', 2),
                ('prefix_4', 3),
            ]  # <- complex pattern
        ]
        for target in TARGETS:
            if len(target) == 1:
                target = target[0]
                for df in [training_df, test_df]:
                    m_col = df[target[0]]
                    del df[target[0]]
                    target_values1 = list(set(m_col.values))
                    df[target[0]] = m_col.apply(
                        lambda x: x if x != target[1]
                        else random.choice(target_values1))
            elif len(target) > 1:
                for df in [training_df, test_df]:
                    m_col = df[[column for column, _ in target]]
                    possible_values = {}
                    for column, _ in target:
                        possible_values[column] = list(set(df[column]))
                        del df[column]
                    df[[column for column, _ in target]] = m_col.apply(
                        lambda x: x if any(x[column] != value
                                           for column, value in target)
                        else Series({
                            column: random.choice(possible_values[column])
                            for column, value in target
                        }),
                        axis=1)
            else:
                raise Exception('target list with unexpected value')

        assert not training_df.equals(training_df_old)
        assert not test_df.equals(test_df_old)

        # todo: save new dataset in memory and create split to use it
        initial_split_obj = initial_job_obj.split
        new_split = duplicate_orm_row(initial_split_obj)
        train_log = duplicate_orm_row(new_split.train_log)
        test_log = duplicate_orm_row(new_split.test_log)

        # TODO future bug creates shadows
        train_log.name = 'RETRAIN' + train_log.name
        train_log.path = 'cache/log_cache/' + train_log.name
        train_log.properties = {}
        test_log.name = 'RETRAIN' + test_log.name
        test_log.path = 'cache/log_cache/' + test_log.name
        test_log.properties = {}

        new_split.train_log = train_log
        new_split.test_log = test_log
        new_split.additional_columns = None
        new_split.save()

        prediction_job = create_prediction_job(
            initial_job_obj, initial_job_obj.encoding.prefix_length)
        prediction_job.split = new_split
        prediction_job.split.save()
        prediction_job.save()

        put_labelled_logs(prediction_job, training_df, test_df)

        # todo: build model
        prediction_task(prediction_job.id, do_publish_result=False)
        prediction_job.refresh_from_db()

        # todo: return performances
        print('Retrain Job:', prediction_job.evaluation.classificationmetrics)

        print('Done, cheers!')