def specific_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(base_learner_origin.serialize)

        if request.method == 'PATCH':
            if base_learner_origin.final:
                raise exceptions.UserError('Cannot modify a final base learner origin')

            req_body = request.get_json()
            modifiable_attr = ('meta_feature_generator', 'name', 'source', 'metric_generators')
            for attr in modifiable_attr:
                if attr in req_body:
                    setattr(base_learner_origin, attr, req_body[attr])

            session.add(base_learner_origin)
            session.commit()
            return jsonify(base_learner_origin.serialize)

        if request.method == 'DELETE':
            base_learner_origin.cleanup(path)
            session.delete(base_learner_origin)
            session.commit()
            return jsonify(message='Deleted base learner origin')
def verify_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)

        if request.method == 'POST':
            req_body = request.get_json()
            if base_learner_origin.final:
                raise exceptions.UserError('Base learner origin {} '
                                           'is already final'.format(id))

            base_learner = base_learner_origin.return_estimator()
            validation_results, hyperparameters = functions.verify_estimator_class(
                base_learner,
                base_learner_origin.meta_feature_generator,
                base_learner_origin.metric_generators,
                req_body['dataset'])
            base_learner_origin.validation_results = {
                req_body['dataset']: validation_results
            }
            base_learner_origin.hyperparameters = hyperparameters
            session.add(base_learner_origin)
            session.commit()
            return jsonify(base_learner_origin.serialize)
def confirm_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)

        if request.method == 'GET':
            if base_learner_origin.final:
                raise exceptions.UserError('Base learner origin {} '
                                           'is already final'.format(id))

            if not base_learner_origin.validation_results:
                raise exceptions.UserError('Base learner origin {} has not yet been '
                                           'verified on a dataset'.format(id))

            base_learner = base_learner_origin.return_estimator()
            validation_results, hyperparameters = functions.verify_estimator_class(
                base_learner,
                base_learner_origin.meta_feature_generator,
                base_learner_origin.metric_generators,
                base_learner_origin.validation_results['dataset']
            )
            base_learner_origin.validation_results = {
                'dataset': base_learner_origin.validation_results['dataset'],
                'metrics': validation_results
            }
            base_learner_origin.hyperparameters = hyperparameters
            base_learner_origin.final = True
            session.add(base_learner_origin)
            session.commit()
            return jsonify(base_learner_origin.serialize)
def verify_dataset(X, y):
    """Verifies that a dataset is valid for use i.e. in scikit-learn format

    Used to verify a dataset by returning the shapes of the passed-in data.
    This also serves as a quick and dirty check of the host machine's
    capability to process the data.

    Args:
        X (array-like): Features array

        y (array-like): Label array

    Returns:
        dict: Contains `features_shape` (2-tuple of int, shape of X) and
            `labels_shape` (1-tuple of int, shape of y)

    Raises:
        exceptions.UserError: If `X` is not 2-dimensional, `y` is not
            1-dimensional, or `X` does not have the same number of samples
            as `y` i.e. X_shape[0] != y_shape[0]
    """
    X_shape, y_shape = np.array(X).shape, np.array(y).shape
    if len(X_shape) != 2:
        raise exceptions.UserError("X must be 2-dimensional array")
    if len(y_shape) != 1:
        raise exceptions.UserError("y must be 1-dimensional array")
    if X_shape[0] != y_shape[0]:
        raise exceptions.UserError("X must have same number of elements as y")
    return dict(features_shape=X_shape, labels_shape=y_shape)
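# Illustrative usage sketch for verify_dataset (not part of the original
# module): a scikit-learn-style dataset passes, and anything mis-shaped
# raises exceptions.UserError.
def _example_verify_dataset():
    X = [[0, 1], [2, 3], [4, 5]]  # 2-D features, shape (3, 2)
    y = [0, 1, 0]                 # 1-D labels, shape (3,)
    stats = verify_dataset(X, y)
    assert stats == {'features_shape': (3, 2), 'labels_shape': (3,)}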
def start_automated_run(id):
    """This starts an automated run using the passed in source code for configuration"""
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()
    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)
        if not base_learner_origin.final:
            raise exceptions.UserError('Base learner origin {} is not final'.format(id))

        # Check for any syntax errors
        module = functions.import_string_code_as_module(req_body['source'])
        del module

        automated_run = models.AutomatedRun(req_body['source'],
                                            'queued',
                                            base_learner_origin)

        session.add(automated_run)
        session.commit()
        with Connection(get_redis_connection()):
            rqtasks.start_automated_run.delay(path, automated_run.id)
        return jsonify(automated_run.serialize)
def search_base_learner(id):
    """Creates a set of base learners from base learner origin using grid search
    and queues them up
    """
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()
    if req_body['method'] == 'grid':
        param_grid = functions.import_object_from_string_code(
            req_body['source'], 'param_grid')
        iterator = ParameterGrid(param_grid)
    elif req_body['method'] == 'random':
        param_distributions = functions.import_object_from_string_code(
            req_body['source'], 'param_distributions')
        iterator = ParameterSampler(param_distributions,
                                    n_iter=req_body['n_iter'])
    else:
        raise exceptions.UserError('{} not a valid search method'.format(
            req_body['method']))

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)
        if not base_learner_origin.final:
            raise exceptions.UserError(
                'Base learner origin {} is not final'.format(id))

        learners = []
        for params in iterator:
            est = base_learner_origin.return_estimator()
            try:
                est.set_params(**params)
            except Exception as e:
                print(repr(e))
                continue

            hyperparameters = functions.make_serializable(est.get_params())

            base_learners = session.query(models.BaseLearner).\
                filter_by(base_learner_origin_id=id,
                          hyperparameters=hyperparameters).all()
            if base_learners:  # already exists
                continue

            base_learner = models.BaseLearner(hyperparameters,
                                              'queued',
                                              base_learner_origin)

            session.add(base_learner)
            session.commit()
            with Connection(get_redis_connection()):
                rqtasks.generate_meta_features.delay(path, base_learner.id)
            learners.append(base_learner)
        # `map` returns an iterator in Python 3, so materialize it before serializing
        return jsonify(list(map(lambda x: x.serialize, learners)))
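# Hedged example of the request `source` for a 'grid' search: it must define
# `param_grid` in a form accepted by sklearn's ParameterGrid (a 'random'
# search would define `param_distributions` instead). The hyperparameter
# names here are illustrative assumptions.
example_grid_search_source = """
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, None]
}
"""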
def create_new_stacked_ensemble():
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()

    with functions.DBContextManager(path) as session:
        if request.method == 'GET':
            return jsonify(
                list(
                    map(lambda x: x.serialize,
                        session.query(models.StackedEnsemble).all())))

        if request.method == 'POST':
            base_learners = session.query(models.BaseLearner).\
                filter(models.BaseLearner.id.in_(req_body['base_learner_ids'])).all()
            if len(base_learners) != len(req_body['base_learner_ids']):
                raise exceptions.UserError('Not all base learners found')
            for learner in base_learners:
                if learner.job_status != 'finished':
                    raise exceptions.UserError(
                        'Not all base learners have finished')

            base_learner_origin = session.query(models.BaseLearnerOrigin).\
                filter_by(id=req_body['base_learner_origin_id']).first()
            if base_learner_origin is None:
                raise exceptions.UserError(
                    'Base learner origin {} not '
                    'found'.format(req_body['base_learner_origin_id']), 404)

            # Retrieve full hyperparameters
            est = base_learner_origin.return_estimator()
            params = functions.import_object_from_string_code(
                req_body['secondary_learner_hyperparameters_source'], 'params')
            est.set_params(**params)
            hyperparameters = functions.make_serializable(est.get_params())

            stacked_ensembles = session.query(models.StackedEnsemble).\
                filter_by(base_learner_origin_id=req_body['base_learner_origin_id'],
                          secondary_learner_hyperparameters=hyperparameters,
                          base_learner_ids=sorted([bl.id for bl in base_learners])).all()
            if stacked_ensembles:
                raise exceptions.UserError('Stacked ensemble exists')

            stacked_ensemble = models.StackedEnsemble(
                secondary_learner_hyperparameters=hyperparameters,
                base_learners=base_learners,
                base_learner_origin=base_learner_origin,
                job_status='queued')

            session.add(stacked_ensemble)
            session.commit()
            with Connection(get_redis_connection()):
                rqtasks.evaluate_stacked_ensemble.delay(
                    path, stacked_ensemble.id)
            return jsonify(stacked_ensemble.serialize)
def get_automated_runs():
    """Return all automated runs"""
    path = functions.get_path_from_query_string(request)

    if request.method == 'GET':
        with functions.DBContextManager(path) as session:
            automated_runs = session.query(models.AutomatedRun).all()
            return jsonify(list(map(lambda x: x.serialize, automated_runs)))

    if request.method == 'POST':
        req_body = request.get_json()
        with functions.DBContextManager(path) as session:
            base_learner_origin = None
            if req_body['category'] == 'bayes' or \
                    req_body['category'] == 'greedy_ensemble_search':
                base_learner_origin = session.query(models.BaseLearnerOrigin).\
                    filter_by(id=req_body['base_learner_origin_id']).first()
                if base_learner_origin is None:
                    raise exceptions.UserError(
                        'Base learner origin {} not found'.format(
                            req_body['base_learner_origin_id']), 404)
                if not base_learner_origin.final:
                    raise exceptions.UserError(
                        'Base learner origin {} is not final'.format(
                            req_body['base_learner_origin_id']))
            elif req_body['category'] == 'tpot':
                pass
            else:
                raise exceptions.UserError('Automated run category'
                                           ' {} not recognized'.format(
                                               req_body['category']))

            # Check for any syntax errors
            module = functions.import_string_code_as_module(req_body['source'])
            del module

            automated_run = models.AutomatedRun(req_body['source'],
                                                'queued',
                                                req_body['category'],
                                                base_learner_origin)

            session.add(automated_run)
            session.commit()
            with Connection(get_redis_connection()):
                rqtasks.start_automated_run.delay(path, automated_run.id)
            return jsonify(automated_run.serialize)
def create_base_learner(id):
    """This creates a single base learner from a base learner origin and queues it up"""
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)
        if not base_learner_origin.final:
            raise exceptions.UserError(
                'Base learner origin {} is not final'.format(id))

        req_body = request.get_json()

        # Retrieve full hyperparameters
        est = base_learner_origin.return_estimator()
        hyperparameters = functions.import_object_from_string_code(
            req_body['source'], 'params')
        est.set_params(**hyperparameters)
        hyperparameters = functions.make_serializable(est.get_params())

        base_learners = session.query(models.BaseLearner).\
            filter_by(base_learner_origin_id=id,
                      hyperparameters=hyperparameters).all()
        if base_learners:
            raise exceptions.UserError(
                'Base learner exists with given hyperparameters')

        base_learner = models.BaseLearner(hyperparameters,
                                          'queued',
                                          base_learner_origin)

        if 'single_searches' not in base_learner_origin.description:
            base_learner_origin.description['single_searches'] = []
        base_learner_origin.description['single_searches'] += ([req_body['source']])

        session.add(base_learner)
        session.add(base_learner_origin)
        session.commit()
        with Connection(get_redis_connection()):
            rqtasks.generate_meta_features.delay(path, base_learner.id)
        return jsonify(base_learner.serialize)
def return_test_dataset(self):
    """Returns test data set

    Returns:
        X (numpy.ndarray): Features

        y (numpy.ndarray): Labels
    """
    if self.test_dataset['method'] == 'split_from_main':
        X, y = self.return_main_dataset()
        X, X_test, y, y_test = train_test_split(
            X,
            y,
            test_size=self.test_dataset['split_ratio'],
            random_state=self.test_dataset['split_seed'],
            stratify=y)
        return X_test, y_test

    if self.test_dataset['method'] == 'source':
        if 'source' not in self.test_dataset or not self.test_dataset['source']:
            raise exceptions.UserError('Source is empty')

        extraction_code = self.test_dataset["source"]
        extraction_function = functions.import_object_from_string_code(
            extraction_code, "extract_test_dataset")
        X_test, y_test = extraction_function()

        return np.array(X_test), np.array(y_test)
def export_stacked_ensemble_as_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(
            models.StackedEnsemble).filter_by(id=id).first()
        if stacked_ensemble is None:
            raise exceptions.UserError(
                'Stacked ensemble {} not found'.format(id), 404)

        extraction = session.query(models.Extraction).first()

        if request.method == 'POST':
            source = stacked_ensemble.export_as_code(
                extraction.meta_feature_generation['source'])

            new_base_learner_origin = models.BaseLearnerOrigin(
                source=source,
                name='Xcessiv Ensemble',
                meta_feature_generator=stacked_ensemble.base_learner_origin.meta_feature_generator)

            session.add(new_base_learner_origin)
            session.commit()
            return jsonify(new_base_learner_origin.serialize)
def export_stacked_ensemble(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(
            models.StackedEnsemble).filter_by(id=id).first()
        if stacked_ensemble is None:
            raise exceptions.UserError(
                'Stacked ensemble {} not found'.format(id), 404)

        extraction = session.query(models.Extraction).first()

        if request.method == 'POST':
            req_body = request.get_json()
            if req_body['type'] == 'package':
                stacked_ensemble.export_as_package(
                    os.path.join(path, req_body['name']),
                    extraction.meta_feature_generation['source'])
            elif req_body['type'] == 'file':
                if not req_body['name'].endswith('.py'):
                    req_body['name'] += '.py'
                stacked_ensemble.export_as_file(
                    os.path.join(path, req_body['name']),
                    extraction.meta_feature_generation['source'])
            return jsonify(message='Stacked ensemble successfully '
                                   'exported as {} in {}'.format(req_body['name'], path))
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2,
                                                     random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2,
                                                 random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2,
                                                 random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2,
                                                 random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(
            dataset_properties['type']))

    return X, y, splits
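# Illustrative call sketch (not part of the original module): everything in
# dataset_properties besides 'type' is forwarded as keyword arguments, so for
# 'multiclass' the extra keys below are passed straight to sklearn's
# make_classification.
def _example_get_sample_dataset():
    X, y, splits = get_sample_dataset({'type': 'multiclass',
                                       'n_classes': 3,
                                       'n_informative': 3})
    for train_idx, test_idx in splits:
        assert len(set(train_idx) & set(test_idx)) == 0  # folds are disjoint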
def __enter__(self):
    if not os.path.exists(self.path):
        raise exceptions.UserError('{} does not exist'.format(self.path))
    sqlite_url = 'sqlite:///{}'.format(self.path)
    engine = create_engine(sqlite_url)
    self.session = Session(bind=engine)
    return self.session
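# The matching __exit__ is not shown in this section; a minimal sketch
# consistent with how DBContextManager is used above (callers add/commit and
# roll back themselves) might look like the following. The rollback-on-error
# behavior is an assumption.
def __exit__(self, exc_type, exc_value, exc_traceback):
    if self.session is not None:
        if exc_type is not None:
            self.session.rollback()  # assumption: discard uncommitted state on error
        self.session.close()
    return False  # propagate any exception raised inside the block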
def verify_extraction_meta_feature_generation():
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()

        if extraction.meta_feature_generation['method'] == 'cv':
            raise exceptions.UserError('Xcessiv will use cross-validation to'
                                       ' generate meta-features')

        X_holdout, y_holdout = extraction.return_holdout_dataset()
        return jsonify(functions.verify_dataset(X_holdout, y_holdout))
def start_automated_run(path, automated_run_id):
    """Starts automated run. This will automatically create
    base learners until the run finishes or errors out.

    Args:
        path (str): Path to Xcessiv notebook

        automated_run_id (str): Automated Run ID
    """
    with functions.DBContextManager(path) as session:
        automated_run = session.query(
            models.AutomatedRun).filter_by(id=automated_run_id).first()
        if not automated_run:
            raise exceptions.UserError('Automated run {} '
                                       'does not exist'.format(automated_run_id))
        automated_run.job_id = get_current_job().id
        automated_run.job_status = 'started'

        session.add(automated_run)
        session.commit()

        try:
            if automated_run.category == 'bayes':
                automatedruns.start_naive_bayes(automated_run, session, path)
            elif automated_run.category == 'tpot':
                automatedruns.start_tpot(automated_run, session, path)
            elif automated_run.category == 'greedy_ensemble_search':
                automatedruns.start_greedy_ensemble_search(
                    automated_run, session, path)
            else:
                raise Exception(
                    'Something went wrong. Invalid category for automated run')

            automated_run.job_status = 'finished'
            session.add(automated_run)
            session.commit()

        except:
            session.rollback()
            automated_run.job_status = 'errored'
            automated_run.description['error_type'] = repr(sys.exc_info()[0])
            automated_run.description['error_value'] = repr(sys.exc_info()[1])
            automated_run.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(automated_run)
            session.commit()
            raise
def return_main_dataset(self):
    """Returns main data set from self

    Returns:
        X (numpy.ndarray): Features

        y (numpy.ndarray): Labels
    """
    if not self.main_dataset['source']:
        raise exceptions.UserError('Source is empty')

    extraction_code = self.main_dataset["source"]
    extraction_function = functions.import_object_from_string_code(
        extraction_code, "extract_main_dataset")

    try:
        X, y = extraction_function()
    except Exception as e:
        raise exceptions.UserError('User code exception',
                                   exception_message=str(e))

    X, y = np.array(X), np.array(y)

    return X, y
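# Hedged example of a main_dataset['source'] string accepted by
# return_main_dataset: it must define extract_main_dataset() returning
# (X, y). The sklearn loader body is just one possible choice.
example_main_dataset_source = """
from sklearn.datasets import load_iris

def extract_main_dataset():
    return load_iris(return_X_y=True)
"""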
def import_object_from_string_code(code, object):
    """Used to import an object from arbitrary, user-passed code.

    Passed-in code is treated as a module and is imported and added
    to `sys.modules` with its SHA256 hash as key.

    Args:
        code (string): Python code to import as module

        object (string): Name of object to extract from imported module
    """
    sha256 = hashlib.sha256(code.encode('UTF-8')).hexdigest()
    module = imp.new_module(sha256)
    try:
        exec_(code, module.__dict__)
    except Exception as e:
        raise exceptions.UserError('User code exception',
                                   exception_message=str(e))
    sys.modules[sha256] = module
    try:
        return getattr(module, object)
    except AttributeError:
        raise exceptions.UserError("{} not found in code".format(object))
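# Illustrative usage sketch (not part of the original module): extracting a
# named object from an arbitrary source string.
def _example_import_object():
    code = "param_grid = {'max_depth': [3, 5]}"
    param_grid = import_object_from_string_code(code, 'param_grid')
    assert param_grid == {'max_depth': [3, 5]}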
def specific_automated_run(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        automated_run = session.query(models.AutomatedRun).filter_by(id=id).first()
        if automated_run is None:
            raise exceptions.UserError('Automated run {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(automated_run.serialize)

        if request.method == 'DELETE':
            session.delete(automated_run)
            session.commit()
            return jsonify(message='Deleted automated run')
def specific_stacked_ensemble(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(models.StackedEnsemble).filter_by(id=id).first()
        if stacked_ensemble is None:
            raise exceptions.UserError('Stacked ensemble {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(stacked_ensemble.serialize)

        if request.method == 'DELETE':
            session.delete(stacked_ensemble)
            session.commit()
            return jsonify(message='Deleted stacked ensemble')
def get_path_from_query_string(req):
    """Gets path from query string

    Args:
        req (flask.request): Request object from Flask

    Returns:
        path (str): Value of "path" parameter from query string

    Raises:
        exceptions.UserError: If "path" is not found in query string
    """
    if req.args.get('path') is None:
        raise exceptions.UserError('Path not found in query string')
    return req.args.get('path')
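# Illustrative usage sketch (assumes a Flask app object, as in the
# surrounding views): the "path" query parameter is required.
def _example_get_path():
    from flask import Flask, request
    app = Flask(__name__)
    with app.test_request_context('/?path=/tmp/notebook.xcnb'):
        assert get_path_from_query_string(request) == '/tmp/notebook.xcnb'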
def specific_base_learner(id):
    path = functions.get_path_from_query_string(request)
    with functions.DBContextManager(path) as session:
        base_learner = session.query(models.BaseLearner).filter_by(id=id).first()
        if base_learner is None:
            raise exceptions.UserError('Base learner {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(base_learner.serialize)

        if request.method == 'DELETE':
            base_learner.cleanup(path)
            session.delete(base_learner)
            session.commit()
            return jsonify(message='Deleted base learner')
def export_as_file(self, file_path, cv_source):
    """Export the ensemble as a single Python file and saves it to `file_path`.

    This is EXPERIMENTAL as putting different modules together would probably
    wreak havoc especially on modules that make heavy use of global variables.

    Args:
        file_path (str, unicode): Absolute/local path of place to save file in

        cv_source (str, unicode): String containing actual code for base learner
            cross-validation used to generate secondary meta-features.
    """
    if os.path.exists(file_path):
        raise exceptions.UserError('{} already exists'.format(file_path))

    with open(file_path, 'wb') as f:
        f.write(self.export_as_code(cv_source).encode('utf8'))
def import_string_code_as_module(code):
    """Used to run arbitrary passed code as a module

    Args:
        code (string): Python code to import as module

    Returns:
        module: Python module
    """
    sha256 = hashlib.sha256(code.encode('UTF-8')).hexdigest()
    module = imp.new_module(sha256)
    try:
        exec_(code, module.__dict__)
    except Exception as e:
        raise exceptions.UserError('User code exception',
                                   exception_message=str(e))
    sys.modules[sha256] = module
    return module
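# Illustrative sketch mirroring how the views above use this function as a
# syntax check: only a successful import is needed, so the resulting module
# is discarded immediately.
def _example_syntax_check(source):
    module = import_string_code_as_module(source)  # raises UserError on bad code
    del module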
def generate_meta_features(path, base_learner_id):
    """Generates meta-features for specified base learner

    After generation of meta-features, the file is saved into the meta-features folder

    Args:
        path (str): Path to Xcessiv notebook

        base_learner_id (str): Base learner ID
    """
    with functions.DBContextManager(path) as session:
        base_learner = session.query(models.BaseLearner).filter_by(id=base_learner_id).first()
        if not base_learner:
            raise exceptions.UserError('Base learner {} '
                                       'does not exist'.format(base_learner_id))

        base_learner.job_id = get_current_job().id
        base_learner.job_status = 'started'

        session.add(base_learner)
        session.commit()

        try:
            est = base_learner.return_estimator()
            extraction = session.query(models.Extraction).first()
            X, y = extraction.return_train_dataset()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )

            meta_features_list = []
            trues_list = []
            for train_index, test_index in return_splits_iterable(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                meta_features_list.append(
                    getattr(est, base_learner.base_learner_origin.meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            meta_features = np.concatenate(meta_features_list, axis=0)
            y_true = np.concatenate(trues_list)

            for key in base_learner.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    base_learner.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                base_learner.individual_score[key] = metric_generator(y_true, meta_features)

            meta_features_path = base_learner.meta_features_path(path)

            if not os.path.exists(os.path.dirname(meta_features_path)):
                os.makedirs(os.path.dirname(meta_features_path))

            np.save(meta_features_path, meta_features, allow_pickle=False)
            base_learner.job_status = 'finished'
            base_learner.meta_features_exists = True
            session.add(base_learner)
            session.commit()

        except:
            session.rollback()
            base_learner.job_status = 'errored'
            base_learner.description['error_type'] = repr(sys.exc_info()[0])
            base_learner.description['error_value'] = repr(sys.exc_info()[1])
            base_learner.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(base_learner)
            session.commit()
            raise
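# Hedged example of a metric_generators entry read above: each value is a
# source string defining metric_generator(y_true, meta_features). The
# accuracy-from-probabilities body assumes a predict_proba-style
# meta-feature generator.
example_metric_generator_source = """
import numpy as np
from sklearn.metrics import accuracy_score

def metric_generator(y_true, probas):
    return accuracy_score(y_true, np.argmax(probas, axis=1))
"""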
def evaluate_stacked_ensemble(path, ensemble_id):
    """Evaluates the ensemble and updates the database when finished.

    Args:
        path (str): Path to Xcessiv notebook

        ensemble_id (str): Ensemble ID
    """
    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(models.StackedEnsemble).filter_by(
            id=ensemble_id).first()
        if not stacked_ensemble:
            raise exceptions.UserError('Stacked ensemble {} '
                                       'does not exist'.format(ensemble_id))

        stacked_ensemble.job_id = get_current_job().id
        stacked_ensemble.job_status = 'started'

        session.add(stacked_ensemble)
        session.commit()

        try:
            meta_features_list = []
            for base_learner in stacked_ensemble.base_learners:
                mf = np.load(base_learner.meta_features_path(path))
                if len(mf.shape) == 1:
                    mf = mf.reshape(-1, 1)
                meta_features_list.append(mf)

            secondary_features = np.concatenate(meta_features_list, axis=1)

            # Get data
            extraction = session.query(models.Extraction).first()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )
            X, y = extraction.return_train_dataset()

            # We need to retrieve original order of meta-features
            indices_list = [test_index for train_index, test_index
                            in return_splits_iterable(X, y)]
            indices = np.concatenate(indices_list)
            X, y = X[indices], y[indices]

            est = stacked_ensemble.return_secondary_learner()

            return_splits_iterable_stacked_ensemble = functions.import_object_from_string_code(
                extraction.stacked_ensemble_cv['source'],
                'return_splits_iterable'
            )
            preds = []
            trues_list = []
            for train_index, test_index in return_splits_iterable_stacked_ensemble(
                    secondary_features, y):
                X_train, X_test = secondary_features[train_index], secondary_features[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                preds.append(
                    getattr(est,
                            stacked_ensemble.base_learner_origin.meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            preds = np.concatenate(preds, axis=0)
            y_true = np.concatenate(trues_list)

            for key in stacked_ensemble.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    stacked_ensemble.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                stacked_ensemble.individual_score[key] = metric_generator(y_true, preds)

            stacked_ensemble.job_status = 'finished'
            session.add(stacked_ensemble)
            session.commit()

        except:
            session.rollback()
            stacked_ensemble.job_status = 'errored'
            stacked_ensemble.description['error_type'] = repr(sys.exc_info()[0])
            stacked_ensemble.description['error_value'] = repr(sys.exc_info()[1])
            stacked_ensemble.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(stacked_ensemble)
            session.commit()
            raise
def start_automated_run(path, automated_run_id):
    """Starts automated run. This will automatically create
    base learners until the run finishes or errors out.

    Args:
        path (str): Path to Xcessiv notebook

        automated_run_id (str): Automated Run ID
    """
    with functions.DBContextManager(path) as session:
        automated_run = session.query(models.AutomatedRun).filter_by(id=automated_run_id).first()
        if not automated_run:
            raise exceptions.UserError('Automated run {} '
                                       'does not exist'.format(automated_run_id))
        automated_run.job_id = get_current_job().id
        automated_run.job_status = 'started'

        session.add(automated_run)
        session.commit()

        try:
            module = functions.import_string_code_as_module(automated_run.source)
            random_state = 8 if not hasattr(module, 'random_state') else module.random_state
            assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

            # get non-searchable parameters
            base_estimator = automated_run.base_learner_origin.return_estimator()
            base_estimator.set_params(**module.default_params)
            default_params = functions.make_serializable(base_estimator.get_params())
            non_searchable_params = dict((key, val) for key, val in iteritems(default_params)
                                         if key not in module.pbounds)

            # get already calculated base learners in search space
            existing_base_learners = []
            for base_learner in automated_run.base_learner_origin.base_learners:
                if not base_learner.job_status == 'finished':
                    continue
                in_search_space = True
                for key, val in iteritems(non_searchable_params):
                    if base_learner.hyperparameters[key] != val:
                        in_search_space = False
                        break  # If no match, move on to the next base learner
                if in_search_space:
                    existing_base_learners.append(base_learner)

            # build initialize dictionary
            target = []
            initialization_dict = dict((key, list()) for key in module.pbounds.keys())
            for base_learner in existing_base_learners:
                # check if base learner's searchable hyperparameters are all numerical
                all_numerical = True
                for key in module.pbounds.keys():
                    if not isinstance(base_learner.hyperparameters[key], numbers.Number):
                        all_numerical = False
                        break
                if not all_numerical:
                    continue  # if there is a non-numerical hyperparameter, skip this.

                for key in module.pbounds.keys():
                    initialization_dict[key].append(base_learner.hyperparameters[key])
                target.append(base_learner.individual_score[module.metric_to_optimize])
            initialization_dict['target'] = target if not module.invert_metric \
                else list(map(lambda x: -x, target))
            print('{} existing in initialization dictionary'.
                  format(len(initialization_dict['target'])))

            # Create function to be optimized
            func_to_optimize = return_func_to_optimize(
                path, session, automated_run.base_learner_origin, module.default_params,
                module.metric_to_optimize, module.invert_metric, set(module.integers)
            )

            # Create Bayes object
            bo = BayesianOptimization(func_to_optimize, module.pbounds)

            bo.initialize(initialization_dict)

            np.random.seed(random_state)

            bo.maximize(**module.maximize_config)

            automated_run.job_status = 'finished'
            session.add(automated_run)
            session.commit()

        except:
            session.rollback()
            automated_run.job_status = 'errored'
            automated_run.description['error_type'] = repr(sys.exc_info()[0])
            automated_run.description['error_value'] = repr(sys.exc_info()[1])
            automated_run.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(automated_run)
            session.commit()
            raise
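# Hedged sketch of the `automated_run.source` module this function expects.
# The attribute names are exactly the ones read above; the estimator
# hyperparameters and their bounds are illustrative assumptions.
example_bayes_run_source = """
metric_to_optimize = 'Accuracy'   # must be a key in the origin's metric_generators
invert_metric = False             # True if lower is better for this metric
default_params = {'random_state': 8}
pbounds = {'max_depth': (2, 12), 'learning_rate': (0.01, 0.3)}
integers = ['max_depth']          # hyperparameters to treat as integers
maximize_config = {'init_points': 2, 'n_iter': 10}
random_state = 8                  # optional; defaults to 8 when absent
"""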
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.test_dataset["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        # test base learner cross-validation
        extraction_code = extraction.meta_feature_generation['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )

        number_of_splits = 0
        test_indices = []
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits += 1
                test_indices.append(test_idx)
        except Exception as e:
            raise exceptions.UserError('User code exception',
                                       exception_message=str(e))

        # preparation before testing stacked ensemble cross-validation
        test_indices = np.concatenate(test_indices)
        X, y = X[test_indices], y[test_indices]

        # test stacked ensemble cross-validation
        extraction_code = extraction.stacked_ensemble_cv['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )

        number_of_splits_stacked_cv = 0
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits_stacked_cv += 1
        except Exception as e:
            raise exceptions.UserError('User code exception',
                                       exception_message=str(e))

        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        data_stats['holdout_data_stats'] = {'number_of_splits': number_of_splits}
        data_stats['stacked_ensemble_cv_stats'] = {
            'number_of_splits': number_of_splits_stacked_cv
        }

        extraction.data_statistics = data_stats

        session.add(extraction)
        session.commit()
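# Hedged example of the `source` strings consumed above: both
# meta_feature_generation['source'] and stacked_ensemble_cv['source'] are
# expected to define return_splits_iterable(X, y) yielding
# (train_index, test_index) pairs. The StratifiedKFold body is one possible
# choice.
example_cv_source = """
from sklearn.model_selection import StratifiedKFold

def return_splits_iterable(X, y):
    return StratifiedKFold(n_splits=5, shuffle=True, random_state=8).split(X, y)
"""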
def export_as_package(self, package_path, cv_source):
    """Exports the ensemble as a Python package and saves it to `package_path`.

    Args:
        package_path (str, unicode): Absolute/local path of place to save package in

        cv_source (str, unicode): String containing actual code for base learner
            cross-validation used to generate secondary meta-features.

    Raises:
        exceptions.UserError: If os.path.join(path, name) already exists.
    """
    if os.path.exists(package_path):
        raise exceptions.UserError('{} already exists'.format(package_path))

    package_name = os.path.basename(os.path.normpath(package_path))

    os.makedirs(package_path)

    # Write __init__.py
    with open(os.path.join(package_path, '__init__.py'), 'wb') as f:
        f.write('from {}.builder import xcessiv_ensemble'.format(package_name).encode('utf8'))

    # Create package baselearners with each base learner having its own module
    os.makedirs(os.path.join(package_path, 'baselearners'))
    open(os.path.join(package_path, 'baselearners', '__init__.py'), 'a').close()
    for idx, base_learner in enumerate(self.base_learners):
        base_learner.export_as_file(os.path.join(package_path,
                                                 'baselearners',
                                                 'baselearner' + str(idx)))

    # Create metalearner.py containing secondary learner
    self.base_learner_origin.export_as_file(
        os.path.join(package_path, 'metalearner'),
        self.secondary_learner_hyperparameters
    )

    # Create cv.py containing CV method for getting meta-features
    with open(os.path.join(package_path, 'cv.py'), 'wb') as f:
        f.write(cv_source.encode('utf8'))

    # Create stacker.py containing class for Xcessiv ensemble
    ensemble_source = ''
    stacker_file_loc = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'stacker.py')
    with open(stacker_file_loc) as f:
        ensemble_source += f.read()

    ensemble_source += '\n\n' \
                       '    def {}(self, X):\n' \
                       '        return self._process_using_' \
                       'meta_feature_generator(X, "{}")\n\n'\
        .format(self.base_learner_origin.meta_feature_generator,
                self.base_learner_origin.meta_feature_generator)

    with open(os.path.join(package_path, 'stacker.py'), 'wb') as f:
        f.write(ensemble_source.encode('utf8'))

    # Create builder.py containing file where `xcessiv_ensemble` is instantiated for import
    builder_source = ''

    for idx, base_learner in enumerate(self.base_learners):
        builder_source += 'from {}.baselearners import baselearner{}\n'.format(package_name, idx)

    builder_source += 'from {}.cv import return_splits_iterable\n'.format(package_name)
    builder_source += 'from {} import metalearner\n'.format(package_name)
    builder_source += 'from {}.stacker import XcessivStackedEnsemble\n'.format(package_name)

    builder_source += '\nbase_learners = [\n'
    for idx, base_learner in enumerate(self.base_learners):
        builder_source += '    baselearner{}.base_learner,\n'.format(idx)
    builder_source += ']\n'

    builder_source += '\nmeta_feature_generators = [\n'
    for idx, base_learner in enumerate(self.base_learners):
        builder_source += '    baselearner{}.meta_feature_generator,\n'.format(idx)
    builder_source += ']\n'

    builder_source += '\nxcessiv_ensemble = XcessivStackedEnsemble(base_learners=base_learners,' \
                      ' meta_feature_generators=meta_feature_generators,' \
                      ' secondary_learner=metalearner.base_learner,' \
                      ' cv_function=return_splits_iterable,' \
                      ' append_original={})\n'.format(self.append_original)

    with open(os.path.join(package_path, 'builder.py'), 'wb') as f:
        f.write(builder_source.encode('utf8'))
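# Illustrative usage sketch for a package exported above (the package name
# and the scikit-learn-style fit interface of the generated stacker are
# assumptions): __init__.py re-exports `xcessiv_ensemble` from builder.py,
# and stacker.py gains a method named after the origin's
# meta_feature_generator, e.g. predict_proba.
def _example_use_exported_package(X_train, y_train, X_new):
    from my_ensemble import xcessiv_ensemble  # assumed package name
    xcessiv_ensemble.fit(X_train, y_train)
    return xcessiv_ensemble.predict_proba(X_new)  # name mirrors meta_feature_generator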
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.test_dataset["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        if extraction.meta_feature_generation['method'] == 'holdout_split':
            X, X_holdout, y, y_holdout = train_test_split(
                X,
                y,
                test_size=extraction.meta_feature_generation['split_ratio'],
                random_state=extraction.meta_feature_generation['seed'],
                stratify=y
            )
        elif extraction.meta_feature_generation['method'] == 'holdout_source':
            if 'source' not in extraction.meta_feature_generation or \
                    not extraction.meta_feature_generation['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.meta_feature_generation["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_holdout_dataset")
            X_holdout, y_holdout = extraction_function()
        else:
            X_holdout, y_holdout = None, None

        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        if X_holdout is not None:
            data_stats['holdout_data_stats'] = functions.verify_dataset(X_holdout, y_holdout)
        else:
            data_stats['holdout_data_stats'] = None

        extraction.data_statistics = data_stats

        session.add(extraction)
        session.commit()