def update_and_test(training_df: DataFrame, test_df: DataFrame, job: Job):
    """incremental training entry point

    updates the previously trained model with the new training data and tests it

    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = job.incremental_train.encoding
    job.encoding.save()
    job.save()

    if list(train_data.columns.values) != job.incremental_train.encoding.features:
        # TODO: how do I align the two feature vectors? (see the reindex sketch below)
        train_data, _ = train_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        train_data = train_data.fillna(0)

        test_data, _ = test_data.align(
            pd.DataFrame(columns=job.incremental_train.encoding.features),
            axis=1,
            join='right')
        test_data = test_data.fillna(0)

    # TODO: UPDATE if incremental, otherwise just test
    model_split = _update(job, train_data)

    results_df, auc = _test(
        model_split, test_data, evaluation=True,
        is_binary_classifier=_check_is_binary_classifier(job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
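# The alignment TODO above could possibly be answered with pandas' DataFrame.reindex,
# which projects a frame onto a known column list in a single call: columns missing
# from the frame are added and zero-filled via fill_value=0, while columns not in the
# list are dropped. This is only a sketch of an alternative to the align/fillna pair;
# the helper name _align_to_features is hypothetical and not part of this module.
def _align_to_features(data: DataFrame, features: list) -> DataFrame:
    """Reorder the columns of data to match features, zero-filling columns it lacks."""
    return data.reindex(columns=features, fill_value=0)
# With such a helper, the alignment branch above would reduce to e.g.:
#     train_data = _align_to_features(train_data, job.incremental_train.encoding.features)
#     test_data = _align_to_features(test_data, job.incremental_train.encoding.features)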
def regression(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """main regression entry point

    trains and tests the regressor using the provided data

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data, test_data = _prep_data(training_df, test_df)

    job.encoding = duplicate_orm_row(
        Encoding.objects.filter(pk=job.encoding.pk)[0]
    )  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
def classification(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    job.encoding = duplicate_orm_row(
        job.encoding
    )  # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(
        model_split, test_data, evaluation=True,
        is_binary_classifier=_check_is_binary_classifier(job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
def cross_validated_regression(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job, cv=2) -> (dict, dict):
    """cross-validated regression entry point

    trains and tests the regressor using the provided data, with cross validation

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :param cv: cross validation amount
    :return: predictive_model scores and split
    """
    train_data, test_data = _prep_data(training_df, test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])
    # TODO: maybe an intelligent get_or_create would be better here...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_regressor(job), clusterer, do_cv=True)
    results_df = _test(model_split, test_data)

    results = calculate_results_regression(results_df, job.labelling)

    return results, model_split
def classification(training_df: DataFrame, test_df: DataFrame, clusterer: Clustering, job: Job) -> (dict, dict):
    """main classification entry point

    trains and tests the classifier using the provided data

    :param clusterer: clustering model
    :param training_df: training DataFrame
    :param test_df: testing DataFrame
    :param job: job configuration
    :return: predictive_model scores and split
    """
    train_data = _drop_columns(training_df)
    test_data = _drop_columns(test_df)

    # job.encoding = duplicate_orm_row(Encoding.objects.filter(pk=job.encoding.pk)[0])
    # TODO: maybe an intelligent get_or_create would be better here (see the sketch below)...
    job.encoding = Encoding.objects.create(
        data_encoding=job.encoding.data_encoding,
        value_encoding=job.encoding.value_encoding,
        add_elapsed_time=job.encoding.add_elapsed_time,
        add_remaining_time=job.encoding.add_remaining_time,
        add_executed_events=job.encoding.add_executed_events,
        add_resources_used=job.encoding.add_resources_used,
        add_new_traces=job.encoding.add_new_traces,
        features=job.encoding.features,
        prefix_length=job.encoding.prefix_length,
        padding=job.encoding.padding,
        task_generation_type=job.encoding.task_generation_type)
    job.encoding.features = list(train_data.columns.values)
    job.encoding.save()
    job.save()

    model_split = _train(train_data, _choose_classifier(job), clusterer)
    results_df, auc = _test(
        model_split, test_data, evaluation=True,
        is_binary_classifier=_check_is_binary_classifier(job.labelling.type))

    results = _prepare_results(results_df, auc)

    return results, model_split
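# One possible shape for the "intelligent get_or_create" mentioned in the TODOs above:
# Django's Model.objects.get_or_create can look up an existing Encoding row with the
# requested configuration and only create a new one when no match exists, instead of
# always duplicating the row. This is only a sketch: the helper name
# _get_or_create_encoding is hypothetical, and whether `features` can serve as a
# lookup key depends on the field type the Encoding model uses for it.
def _get_or_create_encoding(encoding: Encoding, features: list) -> Encoding:
    """Return an Encoding matching `encoding` but with the given features, creating it only if needed."""
    new_encoding, _ = Encoding.objects.get_or_create(
        data_encoding=encoding.data_encoding,
        value_encoding=encoding.value_encoding,
        add_elapsed_time=encoding.add_elapsed_time,
        add_remaining_time=encoding.add_remaining_time,
        add_executed_events=encoding.add_executed_events,
        add_resources_used=encoding.add_resources_used,
        add_new_traces=encoding.add_new_traces,
        features=features,
        prefix_length=encoding.prefix_length,
        padding=encoding.padding,
        task_generation_type=encoding.task_generation_type)
    return new_encoding
# The duplication blocks above could then become e.g.:
#     job.encoding = _get_or_create_encoding(job.encoding, list(train_data.columns.values))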