import pandas as pd


def bad_subset(forest, X_test, y_test):
    # define mask to select a large subset with poor performance
    # currently the mask selects the entire set (placeholder: replace with a
    # condition that picks out a large, poorly performing subgroup)
    mask = pd.Series(True, index=X_test.index)

    X_subgroup = X_test[mask]
    y_subgroup = y_test[mask]
    subgroup_size = len(X_subgroup)

    # predicted probability of the positive class for the subgroup
    y_subgroup_preds = forest.predict_proba(X_subgroup)[:, 1]
    performance = cindex(y_subgroup.values, y_subgroup_preds)

    return performance, subgroup_size
def bad_subset(forest, X_test, y_test):
    # define mask to select a large subset with poor performance
    # currently the mask selects the entire set

    ### START CODE HERE (REPLACE the code after 'mask =' with your code) ###
    mask = X_test['BMI'] < 50
    ### END CODE HERE ###

    X_subgroup = X_test[mask]
    y_subgroup = y_test[mask]
    subgroup_size = len(X_subgroup)

    # predicted probability of the positive class for the subgroup
    y_subgroup_preds = forest.predict_proba(X_subgroup)[:, 1]
    performance = cindex(y_subgroup.values, y_subgroup_preds)

    return performance, subgroup_size
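# A minimal usage sketch for bad_subset. The RandomForestClassifier settings and
# the synthetic dataframe below are illustrative assumptions, not part of the
# assignment, and the notebook's own cindex helper is assumed to already be defined.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X_demo = pd.DataFrame({'Age': rng.randint(20, 80, size=200),
                       'BMI': rng.uniform(15, 45, size=200)})
y_demo = (X_demo['Age'] + rng.normal(0, 10, size=200) > 50).astype(int)

# fit a forest on the synthetic data, then evaluate the BMI-based subgroup
rf_demo = RandomForestClassifier(n_estimators=50, random_state=0)
rf_demo.fit(X_demo, y_demo)

performance, subgroup_size = bad_subset(rf_demo, X_demo, y_demo)
print(f'Subgroup size: {subgroup_size}, C-Index: {performance:.4f}')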
import itertools


def holdout_grid_search(clf, X_train_hp, y_train_hp, X_val_hp, y_val_hp,
                        hyperparams, fixed_hyperparams={}):
    '''
    Conduct hyperparameter grid search using a hold-out validation set.
    Hyperparameters are provided as a dictionary mapping each hyperparameter
    name to the range of values to iterate over.
    The cindex function is used as the evaluation metric.

    Input:
        clf: sklearn classifier
        X_train_hp (dataframe): dataframe for training set input variables
        y_train_hp (dataframe): dataframe for training set targets
        X_val_hp (dataframe): dataframe for validation set input variables
        y_val_hp (dataframe): dataframe for validation set targets
        hyperparams (dict): hyperparameter dictionary mapping hyperparameter
                            names to ranges of values for grid search
        fixed_hyperparams (dict): dictionary of fixed hyperparameters that
                                  are not included in the grid search

    Output:
        best_estimator (sklearn classifier): fitted sklearn classifier with
                                             best performance on validation set
        best_hyperparams (dict): hyperparameter dictionary mapping
                                 hyperparameter names to values in best_estimator
    '''
    best_estimator = None
    best_hyperparams = {}

    # hold best running score
    best_score = 0.0

    # get list of param values
    lists = hyperparams.values()

    # get all param combinations
    param_combinations = list(itertools.product(*lists))
    total_param_combinations = len(param_combinations)

    # iterate through param combinations
    for i, params in enumerate(param_combinations, 1):
        # fill param dict with params
        param_dict = {}
        for param_index, param_name in enumerate(hyperparams):
            param_dict[param_name] = params[param_index]

        # create estimator with specified params
        estimator = clf(**param_dict, **fixed_hyperparams)

        # fit estimator
        estimator.fit(X_train_hp, y_train_hp)

        # get predictions on validation set
        preds = estimator.predict_proba(X_val_hp)

        # compute cindex for predictions
        estimator_score = cindex(y_val_hp, preds[:, 1])

        print(f'[{i}/{total_param_combinations}] {param_dict}')
        print(f'Val C-Index: {estimator_score}\n')

        # if new high score, update high score, best estimator
        # and best params
        if estimator_score >= best_score:
            best_score = estimator_score
            best_estimator = estimator
            best_hyperparams = param_dict

    # add fixed hyperparameters to the best combination of variable hyperparameters
    best_hyperparams.update(fixed_hyperparams)

    return best_estimator, best_hyperparams
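# A minimal usage sketch for holdout_grid_search. The classifier choice
# (RandomForestClassifier), the hyperparameter grid, and the synthetic split below
# are illustrative assumptions; the notebook's cindex helper is assumed to already
# be defined and in scope.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_demo = pd.DataFrame({'Age': rng.randint(20, 80, size=400),
                       'BMI': rng.uniform(15, 45, size=400)})
y_demo = (X_demo['Age'] + rng.normal(0, 10, size=400) > 50).astype(int)

# carve out a hold-out validation set from the synthetic data
X_tr, X_val, y_tr, y_val = train_test_split(X_demo, y_demo, test_size=0.25,
                                            random_state=0)

# small illustrative grid; each key maps to the values the search iterates over
hyperparams = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'min_samples_leaf': [1, 5],
}
fixed_hyperparams = {'random_state': 10}

best_rf, best_hyperparams = holdout_grid_search(RandomForestClassifier,
                                                X_tr, y_tr, X_val, y_val,
                                                hyperparams, fixed_hyperparams)
print(f'Best hyperparameters:\n{best_hyperparams}')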