import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from learntools.core import *


class WhichEffectLargerRange(EqualityCheckProblem):
    _var = 'feature_with_bigger_range_of_effects'
    _expected = 'diag_1_428'
    _solution = CS("""
# The range of diag_1_428 is wider, largely due to the few points on the far right.
feature_with_bigger_range_of_effects = 'diag_1_428'
""")

class ImportanceWithAbsFeatures(CodingProblem):
    _vars = ['perm2']
    _solution = CS("""
data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)

features_2 = ['pickup_longitude',
              'pickup_latitude',
              'dropoff_longitude',
              'dropoff_latitude',
              'abs_lat_change',
              'abs_lon_change']

X = data[features_2]
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(X, y, random_state=1)
second_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(new_train_X, new_train_y)

# Create a PermutationImportance object on second_model and fit it to new_val_X and new_val_y
perm2 = PermutationImportance(second_model, random_state=1).fit(new_val_X, new_val_y)

# Show the weights for the permutation importance you just calculated
eli5.show_weights(perm2, feature_names=features_2)
""")

    def check(self, perm_obj):
        assert np.allclose(perm_obj.feature_importances_,
                           np.array([0.06128774, 0.08575455, 0.07350467,
                                     0.07330853, 0.57827417, 0.44671882]),
                           rtol=0.1)

class ExerciseFormatTutorial(EqualityCheckProblem):
    _var = 'color'
    _expected = 'blue'
    _hint = "Your favorite color rhymes with *glue*."
    _solution = CS('color = "blue"')

    @property
    def correct_message(self):
        history = self._view.interactions
        if history['hint'] == 0 and history['solution'] == 0:
            return ("What?! You got it right without needing a hint or anything?"
                    " Drats. Well hey, you should still continue to the next step"
                    " to get some practice asking for a hint and checking solutions."
                    " (Even though you obviously don't need any help here.)")
        return ''

    def failure_message(self, var, actual, expected):
        if (any(actual.endswith(suff) for suff in ['oo', 'ue', 'ew'])
                and actual.strip().lower() != 'blue'):
            return "Ha ha, very funny."
        elif actual.strip(' .!').lower() == 'ni':
            return "Please! Please! No more! We will find you a shrubbery."
        return ("{} is not your favorite color!"
                " Well, maybe it is, but we're writing the rules. The point"
                " of this question is to force you to get some practice asking"
                " for a hint. Go ahead and uncomment the call to `q0.hint()`"
                " in the code cell below, for a hint at what your favorite color"
                " *really* is.").format(actual)

class CircleArea(EqualityCheckProblem):
    _vars = ['radius', 'area']
    _expected = [3/2, (3/2)**2 * 3.14159]
    _hint = "The syntax to raise a to the b'th power is `a ** b`"
    _solution = CS('radius = diameter / 2', 'area = pi * radius ** 2')

class DesignDatasetUShapedPdp(CodingProblem):
    _var = 'pdp_dist'
    _hint = "Consider explicitly using terms that include mathematical expressions like `(X1 < -1)`"
    _solution = CS("""
# There are many possible solutions.
# One example expression for y is:
y = -2 * X1 * (X1 < -1) + X1 - 2 * X1 * (X1 > 1) - X2
# You don't need any more changes
""")

    def check(self, pdp_result):
        segment_1_end = np.argmin(pdp_result.feature_grids < -1)
        segment_3_start = np.argmax(pdp_result.feature_grids > 1)
        segment_2_start = segment_1_end + 1
        segment_2_end = segment_3_start - 1

        segment_1_slopes_down = pdp_result.pdp[0] > pdp_result.pdp[segment_1_end]
        segment_2_slopes_up = pdp_result.pdp[segment_2_start] < pdp_result.pdp[segment_2_end]
        segment_3_slopes_down = pdp_result.pdp[segment_3_start] > pdp_result.pdp[-1]

        assert segment_1_slopes_down, \
            "The partial dependence plot does not slope down for values below -1."
        assert segment_2_slopes_up, \
            "The partial dependence plot does not slope up for values between -1 and 1."
        assert segment_3_slopes_down, \
            "The partial dependence plot does not slope down for values above 1."

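# The solution above only shows the line for `y`; here is a sketch of the scaffold
# the exercise presumably builds around it (an illustrative helper, not referenced
# by any checker; `n_samples` and the model settings are assumptions, not from this file).
def _sketch_u_shaped_pdp(n_samples=20000):
    import pandas as pd
    from numpy.random import rand
    from sklearn.ensemble import RandomForestRegressor
    from pdpbox import pdp

    # Two independent features on [-2, 2]
    X1 = 4 * rand(n_samples) - 2
    X2 = 4 * rand(n_samples) - 2
    # Slopes down below -1, up between -1 and 1, and down above 1
    y = -2 * X1 * (X1 < -1) + X1 - 2 * X1 * (X1 > 1) - X2

    df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})
    model = RandomForestRegressor(n_estimators=30, random_state=1).fit(df[['X1', 'X2']], df.y)
    # pdp_dist is the object DesignDatasetUShapedPdp.check inspects
    pdp_dist = pdp.pdp_isolate(model=model, dataset=df,
                               model_features=['X1', 'X2'], feature='X1')
    return pdp_dist
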
class SplitData(CodingProblem):
    # Tests are on train_X and val_y. If these are right, the others will be right too.
    _vars = ["train_X", "val_X", "train_y", "val_y", "X", "y"]
    _hint = ("The function you need to import is part of sklearn. When calling "
             "the function, the arguments are X and y. Ensure you set the random_state to 1.")
    _solution = CS("""from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)""")

    def check(self, train_X, val_X, train_y, val_y, X, y):
        true_train_X, _, _, true_val_y = train_test_split(X, y, random_state=1)
        assert train_X.shape == true_train_X.shape, (
            "Expected train_X to have shape {}. "
            "Your code produced train_X with shape {}.").format(
                true_train_X.shape, train_X.shape)
        assert val_y.shape == true_val_y.shape, (
            "Expected val_y to have shape {}. "
            "Your code produced val_y with shape {}.").format(
                true_val_y.shape, val_y.shape)
        # Verify they set the seed correctly, to help with later steps
        assert all(train_X.index == true_train_X.index), \
            "The training data had different rows than expected"

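# A side note grounding the sizes used in later checks (a property of sklearn's
# defaults, not code from this file): train_test_split holds out 25% of the rows
# by default, so the 1460-row Iowa data yields 1095 training rows and 365
# validation rows, the 365 that ValPreds.check relies on below.
#
#     train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
#     val_X.shape[0]  # 365
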
class MAE(EqualityCheckProblem):
    _var = 'val_mae'
    _expected = 29652.931506849316
    _hint = ("The order of arguments to mean_absolute_error doesn't matter. "
             "Make sure you fit to only the training data in step 2.")
    _solution = CS("""val_mae = mean_absolute_error(val_predictions, val_y)""")

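# A runnable aside backing up the hint's claim that argument order doesn't matter:
# mean absolute error averages |a - b|, which is symmetric. (Illustrative values
# only; this helper is not referenced by any checker.)
def _mae_symmetry_demo():
    from sklearn.metrics import mean_absolute_error
    a, b = [1, 2], [3, 5]
    # (|1 - 3| + |2 - 5|) / 2 == 2.5 either way around
    assert mean_absolute_error(a, b) == mean_absolute_error(b, a) == 2.5
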
class BestTreeSize(EqualityCheckProblem):
    _var = 'best_tree_size'
    _expected = 100
    _hint = ("You will call get_mae in the loop. You'll need to map "
             "the names of your data structure to the names in get_mae")
    _solution = CS("""# Here is a short solution with a dict comprehension.
# The lesson gives an example of how to do this with an explicit loop.
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y)
          for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)
""")

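# The solution above mentions an explicit-loop alternative from the lesson; this is
# a sketch of what it presumably looks like (an illustrative helper, not referenced
# by any checker; the exercise-context names are passed in as parameters).
def _best_tree_size_explicit_loop(candidate_max_leaf_nodes, train_X, val_X, train_y, val_y, get_mae):
    best_tree_size = None
    best_mae = None
    for leaf_size in candidate_max_leaf_nodes:
        mae = get_mae(leaf_size, train_X, val_X, train_y, val_y)
        # Keep the leaf count with the lowest validation MAE seen so far
        if best_mae is None or mae < best_mae:
            best_mae = mae
            best_tree_size = leaf_size
    return best_tree_size
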
class CandySplitting(EqualityCheckProblem):
    _var = 'to_smash'
    _expected = (121 + 77 + 109) % 3
    _default_values = [-1]
    _hints = [
        "You'll probably want to use the modulo operator, `%`.",
        "`j % k` is the remainder after dividing `j` by `k`",
    ]
    _solution = CS("(alice_candies + bob_candies + carol_candies) % 3")

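# Worked example of the modulo hints with the candy counts in _expected:
# 121 + 77 + 109 = 307, and 307 = 3 * 102 + 1, so (121 + 77 + 109) % 3
# evaluates to 1: one candy left over to smash.
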
class FromPermImportanceToMarginalEffect(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
data = ["John", "Doe", 53.44]
result = f"Hello {data[0]} {data[1]}. Your current balance is ${data[2]}."
""")

    def check(self, result_obj):
        assert result_obj == "Hello John Doe. Your current balance is $53.44."

class HomeDescription(EqualityCheckProblem):
    _vars = ['avg_lot_size', 'newest_home_age']
    max_year_built = 2010
    min_home_age = datetime.datetime.now().year - max_year_built
    _expected = [10517, min_home_age]
    _hint = ('Run the describe command. Lot size is in the column called LotArea. '
             'Also look at YearBuilt')
    _solution = CS("""# using data read from home_data.describe()
avg_lot_size = 10517
# the newest home was built in 2010, so its age grows with the current year
newest_home_age = datetime.datetime.now().year - 2010
""")

class SliceAlternateElements(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9]
result = numbers[1::2]
""")

    def check(self, result_obj):
        assert result_obj == [2, 4, 6, 8]

class RawActualsInsteadOfPDP(ThoughtExperiment):
    _hint = "This requires a groupby (from pandas) on the raw data, rather than using a model"
    _solution = CS("""
# A simple pandas groupby showing the average readmission rate for each time_in_hospital.

# Use concat to keep validation data separate, rather than using all the original data
all_train = pd.concat([train_X, train_y], axis=1)

all_train.groupby(['time_in_hospital']).mean().readmitted.plot()
plt.show()
""")

class LoadHomeData(EqualityCheckProblem):
    _var = 'home_data'
    _hint = "Use the `pd.read_csv` function"
    _solution = CS('home_data = pd.read_csv(iowa_file_path)')

    def check(self, df):
        assert isinstance(df, pd.DataFrame), \
            ("`home_data` should be a DataFrame, not `{}`").format(type(df))
        expected_shape = (1460, 81)
        assert df.shape == expected_shape, \
            ("Expected {} rows and {} columns, but got shape {}").format(
                expected_shape[0], expected_shape[1], df.shape)

class ValPreds(CodingProblem):
    _vars = ['val_predictions', 'iowa_model', 'val_X']
    _hint = 'Run predict on the right validation data object.'
    _solution = CS("""val_predictions = iowa_model.predict(val_X)""")

    def check(self, val_predictions, iowa_model, val_X):
        assert val_predictions.size == 365, \
            "`val_predictions` is the wrong size. Did you predict with the wrong data?"
        comparison_val_preds = iowa_model.predict(val_X)
        assert all(comparison_val_preds == val_predictions), (
            "Predictions do not match expectations. "
            "Did you supply the right data?")

class SummarizeModel(ThoughtExperiment):
    _solution = CS("""
# Use permutation importance as a succinct model summary
# A measure of model performance on validation data would be useful here too

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
""")

class WhyLatitude(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
result = 0
numbers = [10, 20, 30, 40]
for n in numbers:
    result = result + n
""")

    def check(self, result_obj):
        assert result_obj == 100

class FitModelWithAllData(CodingProblem):
    _vars = ['final_model', 'X', 'y']
    _hint = ('Fit with the ideal value of max_leaf_nodes. In the fit step, '
             'use all of the data in the dataset')
    _solution = CS("""# Fit the model with best_tree_size. Fill in the argument to make the size optimal
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model
final_model.fit(X, y)""")

    def check(self, final_model, X, y):
        assert final_model.max_leaf_nodes == 100, \
            "Didn't set max_leaf_nodes to the right value when building the tree"
        # Model has in-sample R^2 of 0.92 when run on all data, independent of seed.
        # score(X, y) is 0.88 if the model was trained on train_X and train_y
        assert final_model.score(X, y) > 0.9, \
            "Your model isn't quite as accurate as expected. Did you fit it on all the data?"

class NumbersToStrings(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
numbers = [1, 2, 3]
new_list = []
result = 0

new_list.append(str(numbers[0]))
new_list.append(str(numbers[1]))
new_list.append(str(numbers[2]))

result = new_list[1]
""")

    def check(self, result_obj):
        assert result_obj == "2"

class EffectNumInpatient(ThoughtExperiment):
    _solution = CS("""
# PDP for number_inpatient feature

from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

feature_name = 'number_inpatient'
# Create the data that we will plot
my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X,
                         model_features=val_X.columns, feature=feature_name)

# plot it
pdp.pdp_plot(my_pdp, feature_name)
plt.show()
""")

class MakePredictions(CodingProblem):
    _vars = ['predictions', 'iowa_model', 'X']
    _hint = "Use `iowa_model.predict` with an argument holding the data to predict with."
    _solution = CS('iowa_model.predict(X)')

    def check(self, predictions, iowa_model, X):
        # This step just checks that they can make predictions.
        # If we want to check the model is correct, do it in the fitting step.
        ground_truth = iowa_model.predict(X)
        assert ground_truth.shape == predictions.shape, (
            "Your predictions are shape {}. Expected shape {}").format(
                predictions.shape, ground_truth.shape)
        assert all(predictions == ground_truth), (
            "Expected {} but got predictions {}").format(ground_truth, predictions)

class UseShap(ThoughtExperiment):
    _hint = "Here's the time to use SHAP values"
    _solution = CS("""
# Use SHAP values to show the effect of each feature for a given patient

import shap  # package used to calculate Shap values

sample_data_for_prediction = val_X.iloc[0].astype(float)  # to test the function

def patient_risk_factors(model, patient_data):
    # Create object that can calculate shap values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient_data)
    shap.initjs()
    return shap.force_plot(explainer.expected_value[1], shap_values[1], patient_data)
""")

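# Presumed usage of the function defined in the solution above (the call itself is
# not shown in this file; `my_model` is the model name the other solutions here use):
#
#     patient_risk_factors(my_model, sample_data_for_prediction)
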
class SumStringsWithNumbers(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
result = str(1) + " is the first number." + '\\n' + "And " + str(3.2) + " is a decimal"
""")

    def check(self, result_obj):
        assert result_obj == (str(1) + " is the first number." + '\n' +
                              "And " + str(3.2) + " is a decimal")

class ScaleUpFeatureMagnitude(CodingProblem):
    _var = 'result'
    _hint = ''
    _solution = CS("""
x_list = [1, 2, 3]
y_list = [3, "hello", 65]  # Replace these values

first_term = sum(x_list)
y_list.reverse()  # remember that the `reverse()` operation does not have a result!
second_term = y_list[0]
result = first_term + second_term
""")

    def check(self, result_obj):
        assert result_obj == 71

class SetTarget(CodingProblem):
    _var = 'y'
    _hint = ("Use `print(home_data.columns)`. The column you want is at the end "
             "of the list. Use the dot notation to pull out this column from the DataFrame")
    _solution = CS('y = home_data.SalePrice')

    def check(self, targ):
        assert isinstance(targ, pd.Series), \
            ("`y` should be a Pandas Series with the actual data. "
             "Your current answer is a `{}`").format(type(targ))
        true_mean = 180921.19589041095
        assert int(targ.mean()) == int(true_mean), "You've selected the wrong data."

class FirstPermImportance(CodingProblem):
    _var = 'perm'
    _hint = ('The only thing you need to change is the first argument to '
             '`PermutationImportance()`. Find the right model name in the code above')
    _solution = CS("""
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=base_features)
""")

    def check(self, perm_obj):
        assert np.allclose(perm_obj.feature_importances_,
                           np.array([0.62288714, 0.8266946, 0.53837499,
                                     0.84735854, -0.00291397]),
                           rtol=0.1)

class CreateModel(CodingProblem):
    _var = 'iowa_model'
    _hint = "Include `random_state` when specifying the model. Data is specified when fitting it."
    _solution = CS("""from sklearn.tree import DecisionTreeRegressor
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(X, y)""")

    def check(self, dtree):
        # Not checking what they fit, because likely mistakes cause exceptions
        assert type(dtree) == type(DecisionTreeRegressor()), \
            ("Expected `dtree` to be of type DecisionTreeRegressor but got an "
             "object of type `{}`").format(type(dtree))
        assert dtree.random_state is not None, "You forgot to set the random_state."
        assert getattr(dtree, 'tree_', None) is not None, "You have not fit the model."

class SelectPredictionData(CodingProblem):
    _var = 'X'
    _hint = ("Capitalization and spelling are important when specifying variable names. "
             "Use the brackets notation when specifying data for X.")
    _solution = CS("""feature_names = ["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF",
                 "FullBath", "BedroomAbvGr", "TotRmsAbvGrd"]

X = home_data[feature_names]""")

    def check(self, df):
        assert isinstance(df, pd.DataFrame), \
            ("`X` should be a DataFrame, not `{}`").format(type(df))
        expected_shape = (1460, 7)
        assert df.shape == expected_shape, \
            ("Expected {} rows and {} columns, but got shape {}").format(
                expected_shape[0], expected_shape[1], df.shape)

class FitModelWithTrain(CodingProblem):
    _vars = ['iowa_model', 'train_X', 'train_y', 'val_X']
    _hint = 'Remember, you fit with training data. You will test with validation data soon'
    _solution = CS("""iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)""")

    def check(self, iowa_model, train_X, train_y, val_X):
        assert getattr(iowa_model, 'tree_', None) is not None, "You have not fit your model yet."
        assert iowa_model.random_state == 1, "Ensure you created your model with random_state=1"
        # Fitting this model is cheap, so we do it in check
        correct_model = DecisionTreeRegressor(random_state=1)
        correct_model.fit(train_X, train_y)
        expected_pred = correct_model.predict(val_X.head(10))
        actual_pred = iowa_model.predict(val_X.head(10))
        assert all(actual_pred == expected_pred), (
            "The model was tested by predicting on the first rows of the validation data. "
            "Expected predictions of {} but the model actually predicted {}. "
            "Did you set the random_state and pass the right data?").format(
                expected_pred, actual_pred)

class DesignFlatPDPWithHighImportance(CodingProblem):
    _vars = ['perm', 'pdp_dist']
    _hint = ("You need X1 to affect the prediction in order for it to affect permutation "
             "importance. But the average effect needs to be 0 to satisfy the PDP requirement. "
             "Achieve this by creating an interaction, so the effect of X1 depends on the "
             "value of X2 and vice versa.")
    _solution = CS("""
# Create array holding predictive feature
X1 = 4 * rand(n_samples) - 2
X2 = 4 * rand(n_samples) - 2

# Create y. You should have X1 and X2 in the expression for y
y = X1 * X2

# Aside from these lines, use the code provided
""")

    def check(self, importance, pdpResult):
        X1_imp = importance.feature_importances_[0]
        pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp)
        assert X1_imp > 0.5, \
            ("Tested that X1 has an importance > 0.5. "
             "Actual importance was {}").format(X1_imp)
        assert pdpRange < 0.5, \
            ("Tested that the highest point on the Partial Dependence Plot is within 0.5 "
             "of the lowest point. Actual difference was {}").format(pdpRange)

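# A sketch of how `perm` and `pdp_dist` are presumably produced around the lines in
# the solution above (an illustrative helper, not referenced by any checker; the
# model and wiring here are assumptions, not from this file).
def _sketch_flat_pdp_high_importance(n_samples=20000):
    import pandas as pd
    from numpy.random import rand
    from sklearn.ensemble import RandomForestRegressor
    from eli5.sklearn import PermutationImportance
    from pdpbox import pdp

    X1 = 4 * rand(n_samples) - 2
    X2 = 4 * rand(n_samples) - 2
    # Pure interaction: X1 strongly affects predictions, but averaged over X2
    # its partial dependence is flat around 0
    y = X1 * X2

    df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})
    model = RandomForestRegressor(n_estimators=30, random_state=1).fit(df[['X1', 'X2']], df.y)
    perm = PermutationImportance(model, random_state=1).fit(df[['X1', 'X2']], df.y)
    pdp_dist = pdp.pdp_isolate(model=model, dataset=df,
                               model_features=['X1', 'X2'], feature='X1')
    return perm, pdp_dist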