def set_model_name(job: Job) -> None:
    """Assign model-cache file paths to the job's predictive model (and clusterer).

    Only acts when ``job.create_models`` is set.  If the predictive model already
    has a ``model_path``, the PredictiveModel row is duplicated first — presumably
    so the previously serialized model record is not overwritten by this run
    (NOTE(review): confirm duplicate_orm_row semantics against its definition).

    :param job: job configuration; its related model rows are updated and saved
    """
    if job.create_models:
        if job.predictive_model.model_path != '':
            # A model file path already exists: continue on a duplicated
            # PredictiveModel row so the original record keeps its path.
            job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0])
            job.predictive_model.save()
            job.save()
        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            # Clusterer gets its own cache file; version suffix fixed at v0.
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()
        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: Y am I doing this?
            # Update jobs get a wall-clock-timestamped version so successive
            # updates do not collide on the same file name.
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
def set_model_name(job: Job) -> None:
    """Assign model-cache file paths to the job's predictive model (and clusterer).

    Only acts when ``job.create_models`` is set.  If the predictive model already
    has a ``model_path``, a fresh PredictiveModel row is created from the current
    one's full dict so this run does not overwrite the existing record.

    :param job: job configuration; its related model rows are updated and saved
    """
    if job.create_models:
        if job.predictive_model.model_path != '':
            # Replaces the old duplicate_orm_row approach with a plain create:
            # job.predictive_model = duplicate_orm_row(PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0]) #todo: replace with simple CREATE
            job.predictive_model = PredictiveModel.init(
                job.predictive_model.get_full_dict()  # todo: doublecheck me, are you sure get_full_dict is returning everything needed?
            )  # todo: futurebug if object changes
            job.predictive_model.save()
            job.save()
        if job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
            # Clusterer gets its own cache file; version suffix fixed at v0.
            job.clustering.model_path = 'cache/model_cache/job_{}-split_{}-clusterer-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
            job.clustering.save()
        if job.type == JobTypes.UPDATE.value:
            job.type = JobTypes.PREDICTION.value  # TODO: Y am I doing this?
            # Update jobs get a wall-clock-timestamped version so successive
            # updates do not collide on the same file name.
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v{}.sav'.format(
                job.id, job.split.id, job.type, str(time.time()))
        else:
            predictive_model_filename = 'cache/model_cache/job_{}-split_{}-predictive_model-{}-v0.sav'.format(
                job.id, job.split.id, job.type)
        job.predictive_model.model_path = predictive_model_filename
        job.predictive_model.save()
        job.save()
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """Main entry method for hyperopt calculations.

    Runs ``fmin`` over the job's search space, scans all trials for the lowest
    loss, attaches that trial's predictive model to the job, and returns the
    winning trial's data.

    :param job: job configuration
    :return: tuple (results, config, model_split) of the best trial
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  # Todo: WHY DO I NEED TO GET HYPEROPT?
    )
    # The per-trial objective (_calculate_and_evaluate) reads these module-level
    # frames and the job, so they must be published as globals before fmin runs.
    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)
    space = _get_space(job)
    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()
    algorithm = _choose_algorithm(job)
    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        # hyperopt raises ValueError when no trial produced a usable result.
        raise ValueError("All jobs failed, cannot find best configuration")
    # Seed with a sentinel loss of 100 (losses are assumed below this — TODO confirm),
    # then keep the trial with the smallest loss.
    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a
    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.save()
    logger.info("End hyperopt job {}, {} . Results {}".format(
        job.type, get_run(job), current_best['results']))
    return current_best['results'], current_best['config'], current_best[
        'model_split']
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """Main entry method for hyperopt calculations.

    Splits the encoded log into train/validation/test frames, runs the
    configured optimisation algorithm over the search space, persists the best
    trial's predictive model and elapsed-time evaluation on the job, then
    re-evaluates the winning candidate on the held-out test split.

    :param job: job configuration
    :return: tuple (results on test, config, model_split) of the best trial
    :raises ValueError: propagated from _run_hyperoptimisation when all trials fail
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  # Todo: WHY DO I NEED TO GET HYPEROPT?
    )
    # The per-trial objective reads these module-level frames and the job, so
    # they must be published as globals before the optimisation starts.
    global train_df, validation_df, test_df, global_job
    global_job = job
    train_df, test_df = get_encoded_logs(job)
    train_df, validation_df, test_df = _retrieve_train_validate_test(
        train_df, test_df)

    train_start_time = time.time()

    space = _get_space(job)
    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()
    # FIX: original had a redundant double assignment ("algorithm = algorithm = ...").
    algorithm = OPTIMISATION_ALGORITHM[
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).algorithm_type]

    _run_hyperoptimisation(space, algorithm.suggest, max_evaluations, trials)

    best_candidate = trials.best_trial['result']

    job.predictive_model = PredictiveModel.objects.filter(
        pk=best_candidate['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    best_candidate['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = best_candidate['results']['elapsed_time']
    job.evaluation.save()

    # Re-evaluate the winning candidate on the untouched test split.
    results_df, auc = _test_best_candidate(
        best_candidate, job.labelling.type, job.predictive_model.predictive_model)
    if job.predictive_model.predictive_model == PredictiveModels.CLASSIFICATION.value:
        results = classification_prepare_results(results_df, auc)
    else:
        results = regression_prepare_results(results_df, job.labelling)
    results['elapsed_time'] = job.evaluation.elapsed_time

    # NOTE(review): binary-vs-multiclass is decided from the validation labels —
    # confirm that is intended rather than the test labels.
    job.evaluation = Evaluation.init(job.predictive_model.predictive_model,
                                     results,
                                     len(set(validation_df['label'])) <= 2)
    job.evaluation.save()
    job.save()

    logger.info(
        "End hyperopt job {}, {}. \n\tResults on validation {}. \n\tResults on test {}."
        .format(job.type, get_run(job), best_candidate['results'], results))

    # FIX: the return was commented out, so the function returned None despite
    # its declared (dict, dict, dict) contract and the sibling implementations.
    return results, best_candidate['config'], best_candidate['model_split']
def check_predictive_model_not_overwrite(job: Job) -> None:
    """Protect an existing predictive model from being overwritten.

    When the job uses any hyperparameter-optimisation method (i.e. not NONE),
    the job is repointed at a duplicated PredictiveModel row, and both the new
    row and the job are saved.

    :param job: job configuration; mutated and saved in place
    """
    if job.hyperparameter_optimizer.optimization_method == HyperparameterOptimizationMethods.NONE.value:
        return
    original_row = PredictiveModel.objects.filter(pk=job.predictive_model.pk)[0]
    job.predictive_model = duplicate_orm_row(original_row)
    job.predictive_model.save()
    job.save()
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """Main entry method for hyperopt calculations.

    Runs ``fmin`` over the job's search space and selects the lowest-loss
    trial.  When the module-level ``holdout`` flag is set (defined elsewhere in
    this module — TODO confirm), the original test frame becomes a validation
    frame, a 20% tail of the training frame becomes the new test frame, and the
    best candidate is re-evaluated on the validation frame.

    :param job: job configuration
    :return: tuple (results, config, model_split); results come from the
        validation re-evaluation when ``holdout`` is set, otherwise from the
        best trial itself
    """
    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric)  # Todo: WHY DO I NEED TO GET HYPEROPT?
    )
    # The per-trial objective (_calculate_and_evaluate) reads these module-level
    # frames and the job, so they must be published as globals before fmin runs.
    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    # TODO evaluate on validation set
    if holdout:
        # Keep the original test frame aside as a validation set and carve a
        # new 20% test split off the tail of the training frame.
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)
    max_evaluations = job.hyperparameter_optimizer.__getattribute__(
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # Todo: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()
    algorithm = _choose_algorithm(job)
    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        # hyperopt raises ValueError when no trial produced a usable result.
        raise ValueError("All jobs failed, cannot find best configuration")
    # Seed with a sentinel loss of 100 (losses are assumed below this — TODO confirm),
    # then keep the trial with the smallest loss.
    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {}, 'model_split': {}, 'config': {}}
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    # TODO evaluate on validation set
    if holdout:
        # Re-run the winning split against the held-out validation frame.
        results_df, auc = _test(
            current_best['model_split'],
            validation_df.drop(['trace_id'], 1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        # NOTE(review): binary-vs-multiclass is decided from the (new) test
        # labels while results come from validation — confirm this is intended.
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()
    if holdout:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']