def start_automated_run(id):
    """Starts an automated run using the passed-in source code for configuration"""
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)

        if not base_learner_origin.final:
            raise exceptions.UserError('Base learner origin {} is not final'.format(id))

        # Check for any syntax errors
        module = functions.import_string_code_as_module(req_body['source'])
        del module

        automated_run = models.AutomatedRun(req_body['source'],
                                            'queued',
                                            base_learner_origin)

        session.add(automated_run)
        session.commit()

        with Connection(get_redis_connection()):
            rqtasks.start_automated_run.delay(path, automated_run.id)

        return jsonify(automated_run.serialize)
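
# A hedged usage sketch for the endpoint above. The route, port, and origin id
# are assumptions (the Flask route decorator is not shown here); only the
# 'source' key and the 'path' query parameter come from the handler itself.
#
#     import requests
#     requests.post(
#         'http://localhost:1994/ensemble/base-learner-origins/1/automated-runs/',
#         params={'path': '/path/to/project'},   # read by get_path_from_query_string
#         json={'source': automated_run_source}  # configuration module code as a string
#     )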
def test_source(self):
    module = functions.import_string_code_as_module(cvsetting.leave_one_out['source'])
    assert hasattr(module, 'return_splits_iterable')
    list(module.return_splits_iterable(self.X, self.y))
    del module
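
# A minimal sketch, assuming leave-one-out semantics, of the kind of
# cross-validation `source` module this test imports: it must define
# return_splits_iterable(X, y) yielding (train_indices, test_indices) pairs.
from sklearn.model_selection import LeaveOneOut

def return_splits_iterable(X, y):
    # Delegate to scikit-learn's LeaveOneOut splitter
    return LeaveOneOut().split(X, y)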
def test_source(self):
    module = functions.import_string_code_as_module(metricsetting.f1_score_from_preds['source'])
    assert np.round(module.metric_generator(binary_y, binary_preds), 2) == 0.96
    assert np.round(module.metric_generator(multiclass_y, multiclass_preds), 2) == 0.95
    del module
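
# A minimal sketch of the kind of metric `source` module this test imports:
# it must define metric_generator(y_true, y_preds) returning a scalar. The
# 'weighted' averaging here is an assumption chosen to cover both the binary
# and multiclass cases exercised above.
from sklearn.metrics import f1_score

def metric_generator(y_true, y_preds):
    # Weighted F1 works for binary and multiclass targets alike
    return f1_score(y_true, y_preds, average='weighted')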
def test_source(self):
    module = functions.import_string_code_as_module(cvsetting.group_k_fold['source'])
    assert hasattr(module, 'return_splits_iterable')
    generator = module.return_splits_iterable(self.X, self.y)
    self.assertRaises(ValueError, list, generator)
    del module
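
# Why the test above expects a ValueError: scikit-learn's GroupKFold requires
# a `groups` argument, and the error only surfaces when the split generator is
# consumed (hence assertRaises on list(generator), not on the call itself).
# A sketch of a source module that triggers it:
from sklearn.model_selection import GroupKFold

def return_splits_iterable(X, y):
    # No groups are passed, so iterating the result raises ValueError
    return GroupKFold(n_splits=3).split(X, y)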
def test_learner_settings(self):
    for key in self.learner_settings:  # iterate over the learner settings, not the transformer settings
        setting = getattr(learnersetting, key)
        module = functions.import_string_code_as_module(setting['source'])
        assert hasattr(module.base_learner, 'get_params')
        assert hasattr(module.base_learner, 'set_params')
        assert hasattr(module.base_learner, 'fit')
        assert hasattr(module.base_learner, setting['meta_feature_generator'])
        module.base_learner.fit(self.X, self.y)
        del module
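
# A minimal sketch of a learner setting `source` module of the kind this test
# exercises: it must expose `base_learner`, a scikit-learn style estimator with
# get_params/set_params/fit plus the configured meta_feature_generator method
# (e.g. 'predict_proba'). The choice of RandomForestClassifier is an assumption.
from sklearn.ensemble import RandomForestClassifier

base_learner = RandomForestClassifier(random_state=8)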
def get_automated_runs():
    """Return all automated runs (GET) or create and queue a new automated run (POST)"""
    path = functions.get_path_from_query_string(request)

    if request.method == 'GET':
        with functions.DBContextManager(path) as session:
            automated_runs = session.query(models.AutomatedRun).all()
            return jsonify(list(map(lambda x: x.serialize, automated_runs)))

    if request.method == 'POST':
        req_body = request.get_json()
        with functions.DBContextManager(path) as session:
            base_learner_origin = None
            if req_body['category'] == 'bayes' or req_body['category'] == 'greedy_ensemble_search':
                base_learner_origin = session.query(models.BaseLearnerOrigin).\
                    filter_by(id=req_body['base_learner_origin_id']).first()
                if base_learner_origin is None:
                    raise exceptions.UserError('Base learner origin {} not found'.format(
                        req_body['base_learner_origin_id']), 404)
                if not base_learner_origin.final:
                    raise exceptions.UserError('Base learner origin {} is not final'.format(
                        req_body['base_learner_origin_id']))
            elif req_body['category'] == 'tpot':
                pass
            else:
                raise exceptions.UserError('Automated run category'
                                           ' {} not recognized'.format(req_body['category']))

            # Check for any syntax errors
            module = functions.import_string_code_as_module(req_body['source'])
            del module

            automated_run = models.AutomatedRun(req_body['source'],
                                                'queued',
                                                req_body['category'],
                                                base_learner_origin)

            session.add(automated_run)
            session.commit()

            with Connection(get_redis_connection()):
                rqtasks.start_automated_run.delay(path, automated_run.id)

            return jsonify(automated_run.serialize)
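
# Illustrative POST bodies for the handler above, keyed exactly as the code
# reads them; the id value and source strings are placeholder assumptions.
example_tpot_run = {
    'category': 'tpot',
    'source': 'from tpot import TPOTClassifier\n'
              'tpot_learner = TPOTClassifier(generations=5)',
}
example_bayes_run = {
    'category': 'bayes',
    'base_learner_origin_id': 1,            # must reference a finalized base learner origin
    'source': 'metric_to_optimize = ...',   # see the Bayesian run sketch further below
}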
def start_tpot(automated_run, session, path):
    """Starts a TPOT automated run that exports directly to base learner setup

    Args:
        automated_run (xcessiv.models.AutomatedRun): Automated run object

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder
    """
    module = functions.import_string_code_as_module(automated_run.source)
    extraction = session.query(models.Extraction).first()
    X, y = extraction.return_train_dataset()

    tpot_learner = module.tpot_learner

    tpot_learner.fit(X, y)

    temp_filename = os.path.join(path, 'tpot-temp-export-{}'.format(os.getpid()))
    tpot_learner.export(temp_filename)

    with open(temp_filename) as f:
        base_learner_source = f.read()

    base_learner_source = constants.tpot_learner_docstring + base_learner_source

    try:
        os.remove(temp_filename)
    except OSError:
        pass

    blo = models.BaseLearnerOrigin(source=base_learner_source,
                                   name='TPOT Learner',
                                   meta_feature_generator='predict')

    automated_run.job_status = 'finished'

    session.add(blo)
    session.add(automated_run)
    session.commit()
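
# A minimal sketch of the `source` module start_tpot expects: it must expose
# `tpot_learner`, an object providing fit() and export() as called above. The
# TPOTClassifier arguments here are illustrative assumptions.
from tpot import TPOTClassifier

tpot_learner = TPOTClassifier(
    generations=5,        # number of optimization generations to run
    population_size=20,   # pipelines retained per generation
    verbosity=2
)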
def start_automated_run(path, automated_run_id):
    """Starts automated run. This will automatically create
    base learners until the run finishes or errors out.

    Args:
        path (str): Path to Xcessiv notebook

        automated_run_id (str): Automated Run ID
    """
    with functions.DBContextManager(path) as session:
        automated_run = session.query(models.AutomatedRun).filter_by(id=automated_run_id).first()
        if not automated_run:
            raise exceptions.UserError('Automated run {} '
                                       'does not exist'.format(automated_run_id))
        automated_run.job_id = get_current_job().id
        automated_run.job_status = 'started'
        session.add(automated_run)
        session.commit()

        try:
            module = functions.import_string_code_as_module(automated_run.source)
            random_state = 8 if not hasattr(module, 'random_state') else module.random_state
            assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

            # get non-searchable parameters
            base_estimator = automated_run.base_learner_origin.return_estimator()
            base_estimator.set_params(**module.default_params)
            default_params = functions.make_serializable(base_estimator.get_params())
            non_searchable_params = dict((key, val) for key, val in iteritems(default_params)
                                         if key not in module.pbounds)

            # get already calculated base learners in search space
            existing_base_learners = []
            for base_learner in automated_run.base_learner_origin.base_learners:
                if not base_learner.job_status == 'finished':
                    continue
                in_search_space = True
                for key, val in iteritems(non_searchable_params):
                    if base_learner.hyperparameters[key] != val:
                        in_search_space = False
                        break  # If no match, move on to the next base learner
                if in_search_space:
                    existing_base_learners.append(base_learner)

            # build initialize dictionary
            target = []
            initialization_dict = dict((key, list()) for key in module.pbounds.keys())
            for base_learner in existing_base_learners:
                # check if base learner's searchable hyperparameters are all numerical
                all_numerical = True
                for key in module.pbounds.keys():
                    if not isinstance(base_learner.hyperparameters[key], numbers.Number):
                        all_numerical = False
                        break
                if not all_numerical:
                    continue  # if there is a non-numerical hyperparameter, skip this.

                for key in module.pbounds.keys():
                    initialization_dict[key].append(base_learner.hyperparameters[key])
                target.append(base_learner.individual_score[module.metric_to_optimize])
            initialization_dict['target'] = target if not module.invert_metric \
                else list(map(lambda x: -x, target))
            print('{} existing in initialization dictionary'.
                  format(len(initialization_dict['target'])))

            # Create function to be optimized
            func_to_optimize = return_func_to_optimize(
                path, session, automated_run.base_learner_origin, module.default_params,
                module.metric_to_optimize, module.invert_metric, set(module.integers)
            )

            # Create Bayes object
            bo = BayesianOptimization(func_to_optimize, module.pbounds)

            bo.initialize(initialization_dict)

            np.random.seed(random_state)

            bo.maximize(**module.maximize_config)

            automated_run.job_status = 'finished'
            session.add(automated_run)
            session.commit()

        except:
            session.rollback()
            automated_run.job_status = 'errored'
            automated_run.description['error_type'] = repr(sys.exc_info()[0])
            automated_run.description['error_value'] = repr(sys.exc_info()[1])
            automated_run.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(automated_run)
            session.commit()
            raise
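
# A sketch of the `source` configuration module read above (the same
# attributes are consumed by start_naive_bayes below). The names match the
# attribute reads in the code; all values are illustrative assumptions for a
# random-forest-like base learner.
metric_to_optimize = 'Accuracy'  # must be a key of the origin's metric_generators
invert_metric = False            # set True for metrics where lower is better
random_state = 8                 # optional; the code falls back to 8

default_params = {'n_estimators': 100}               # fixed, non-searched hyperparameters
pbounds = {'max_depth': (2, 20),                     # search bounds per hyperparameter
           'min_samples_leaf': (1, 10)}
integers = ['max_depth', 'min_samples_leaf']         # hyperparameters to cast to int
maximize_config = {'init_points': 2, 'n_iter': 10}   # forwarded to bo.maximize()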
def start_naive_bayes(automated_run, session, path):
    """Starts a naive Bayesian optimization automated run

    Args:
        automated_run (xcessiv.models.AutomatedRun): Automated run object

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder
    """
    module = functions.import_string_code_as_module(automated_run.source)
    random_state = 8 if not hasattr(module, 'random_state') else module.random_state
    assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

    # get non-searchable parameters
    base_estimator = automated_run.base_learner_origin.return_estimator()
    base_estimator.set_params(**module.default_params)
    default_params = functions.make_serializable(base_estimator.get_params())
    non_searchable_params = dict((key, val) for key, val in iteritems(default_params)
                                 if key not in module.pbounds)

    # get already calculated base learners in search space
    existing_base_learners = []
    for base_learner in automated_run.base_learner_origin.base_learners:
        if not base_learner.job_status == 'finished':
            continue
        in_search_space = True
        for key, val in iteritems(non_searchable_params):
            if base_learner.hyperparameters[key] != val:
                in_search_space = False
                break  # If no match, move on to the next base learner
        if in_search_space:
            existing_base_learners.append(base_learner)

    # build initialize dictionary
    target = []
    initialization_dict = dict((key, list()) for key in module.pbounds.keys())
    for base_learner in existing_base_learners:
        # check if base learner's searchable hyperparameters are all numerical
        all_numerical = True
        for key in module.pbounds.keys():
            if not isinstance(base_learner.hyperparameters[key], numbers.Number):
                all_numerical = False
                break
        if not all_numerical:
            continue  # if there is a non-numerical hyperparameter, skip this.

        for key in module.pbounds.keys():
            initialization_dict[key].append(base_learner.hyperparameters[key])
        target.append(base_learner.individual_score[module.metric_to_optimize])
    initialization_dict['target'] = target if not module.invert_metric \
        else list(map(lambda x: -x, target))
    print('{} existing in initialization dictionary'.format(
        len(initialization_dict['target'])))

    # Create function to be optimized
    func_to_optimize = return_func_to_optimize(
        path, session, automated_run.base_learner_origin, module.default_params,
        module.metric_to_optimize, module.invert_metric, set(module.integers))

    # Create Bayes object
    bo = BayesianOptimization(func_to_optimize, module.pbounds)

    bo.initialize(initialization_dict)

    np.random.seed(random_state)

    bo.maximize(**module.maximize_config)

    automated_run.job_status = 'finished'

    session.add(automated_run)
    session.commit()
def test_source(self):
    module = functions.import_string_code_as_module(metricsetting.explained_variance_score['source'])
    assert np.round(module.metric_generator(regression_y, regression_preds), 2) == -0.89
    del module
def test_source(self):
    module = functions.import_string_code_as_module(metricsetting.median_absolute_error['source'])
    assert np.round(module.metric_generator(regression_y, regression_preds), 2) == 3.72
    del module
def test_source(self):
    module = functions.import_string_code_as_module(metricsetting.mse['source'])
    assert np.round(module.metric_generator(regression_y, regression_preds), 2) == 168.09
    del module
def start_greedy_ensemble_search(automated_run, session, path):
    """Starts an automated ensemble search using greedy forward model selection.

    The steps for this search are adapted from "Ensemble Selection from
    Libraries of Models" by Caruana.

    1. Start with the empty ensemble

    2. Add to the ensemble the model in the library that maximizes the
       ensemble's performance on the error metric.

    3. Repeat step 2 for a fixed number of iterations or until all models
       have been used.

    Args:
        automated_run (xcessiv.models.AutomatedRun): Automated run object

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder
    """
    module = functions.import_string_code_as_module(automated_run.source)
    assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

    best_ensemble = []  # Base learners of the best performing ensemble from the last round

    secondary_learner = automated_run.base_learner_origin.return_estimator()
    secondary_learner.set_params(**module.secondary_learner_hyperparameters)

    for i in range(module.max_num_base_learners):
        best_score = -float('inf')  # Best metric for this round (not in total!)
        current_ensemble = best_ensemble[:]  # Shallow copy of best ensemble
        for base_learner in session.query(models.BaseLearner).filter_by(job_status='finished').all():
            if base_learner in current_ensemble:  # Don't append when learner is already in
                continue
            current_ensemble.append(base_learner)

            # Check if our "best ensemble" already exists
            existing_ensemble = session.query(models.StackedEnsemble).\
                filter_by(base_learner_origin_id=automated_run.base_learner_origin.id,
                          secondary_learner_hyperparameters=secondary_learner.get_params(),
                          base_learner_ids=sorted([bl.id for bl in current_ensemble])).first()

            if existing_ensemble and existing_ensemble.job_status == 'finished':
                score = existing_ensemble.individual_score[module.metric_to_optimize]
            elif existing_ensemble and existing_ensemble.job_status != 'finished':
                eval_stacked_ensemble(existing_ensemble, session, path)
                score = existing_ensemble.individual_score[module.metric_to_optimize]
            else:
                stacked_ensemble = models.StackedEnsemble(
                    secondary_learner_hyperparameters=secondary_learner.get_params(),
                    base_learners=current_ensemble,
                    base_learner_origin=automated_run.base_learner_origin,
                    job_status='started'
                )
                session.add(stacked_ensemble)
                session.commit()
                eval_stacked_ensemble(stacked_ensemble, session, path)
                score = stacked_ensemble.individual_score[module.metric_to_optimize]

            score = -score if module.invert_metric else score

            if best_score < score:
                best_score = score
                best_ensemble = current_ensemble[:]

            current_ensemble.pop()
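
# A sketch of the `source` module start_greedy_ensemble_search reads. The
# names match the attribute reads above; the values are illustrative
# assumptions.
metric_to_optimize = 'Accuracy'          # must be a key of the origin's metric_generators
invert_metric = False                    # set True for metrics where lower is better
secondary_learner_hyperparameters = {}   # forwarded to set_params() on the secondary learner
max_num_base_learners = 5                # number of greedy forward-selection rounds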