def compute_fi_gain(self):
    """Return the normalized gain-based feature importance of the predictor.

    The raw scores come from ``self.predictor.get_score(importance_type='gain')``
    and are passed through ``self.group_fi``. Every column listed in
    ``self.x_train_cols`` that is absent from the grouped scores is then added
    with an importance of 0, after which the scores are grouped once more and
    handed to ``normalize_series``.
    """
    # TODO: fix it
    raw_scores = self.predictor.get_score(importance_type='gain')
    importance = Series(self.group_fi(raw_scores))

    # xgboost doesn't always return all the columns it got in fit.
    # https://www.kaggle.com/c/homesite-quote-conversion/discussion/18669
    for name in self.x_train_cols:
        if name in importance.index:
            continue
        importance[name] = 0

    regrouped = Series(self.group_fi(importance))
    return normalize_series(regrouped)
def run_experiment(
        model_name: str, get_data: callable, compute_permutation: bool,
        save_results: bool, model, exp_results_path):
    """Fit ``model`` on a binned train/test split and print its test log-loss.

    Steps: load the data with ``get_data()``, seed the RNG, split with
    ``train_test_split`` (test fraction ``VAL_RATIO``), run the module-level
    ``preprocessing_pipeline``, bin the numeric columns with ``BinMapper``,
    fit the model and evaluate it on the test part.

    Args:
        model_name: Label stored under ``results['model']``.
        get_data: Zero-argument callable returning ``(X, y)``.
        compute_permutation: When true, permutation importances are computed
            on the train and the test split via ``model.compute_fi_permutation``.
        save_results: Currently unused; persisting results is disabled below.
        model: Object exposing ``fit``, ``predict``, ``trees``,
            ``compute_fi_permutation`` and ``compute_feature_importance``.
        exp_results_path: Currently unused (see ``save_results``).

    Returns:
        None. The log-loss (``nan`` when ``y`` does not have exactly two
        distinct values) is printed.
    """
    X, y = get_data()
    seed(7)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=VAL_RATIO)

    preprocessing_pipeline.fit(X_train)
    X_train = preprocessing_pipeline.transform(X_train)
    X_test = preprocessing_pipeline.transform(X_test)

    print("binning data")
    num_cols = get_num_cols(X_train.dtypes)
    bin_mapper = BinMapper(max_bins=256, random_state=42)
    X_train.loc[:, num_cols] = bin_mapper.fit_transform(
        X_train.loc[:, num_cols].values)
    X_test.loc[:, num_cols] = bin_mapper.transform(X_test.loc[:, num_cols].values)

    original_dtypes = X_train.dtypes
    model.fit(X_train, y_train)
    test_prediction = model.predict(X_test)

    # NOTE(review): the permutation importances are computed but not stored in
    # `results` in this version -- confirm whether they should be.
    if compute_permutation:
        permutation_train = model.compute_fi_permutation(X_train, y_train).to_dict()
        permutation_test = model.compute_fi_permutation(X_test, y_test).to_dict()
    else:
        # Key the NaN placeholders by column name. Iterating the dtypes Series
        # itself yields the dtype objects, not the column names.
        permutation_train = {col: nan for col in original_dtypes.index}
        permutation_test = dict(permutation_train)

    is_classification = len(unique(y)) == 2
    if is_classification:
        df = DataFrame()
        # Reuse the prediction computed above instead of predicting again.
        df['p'] = test_prediction
        # Pair labels with predictions by position: assigning a Series here
        # would align on index labels, which do not match after the split.
        df['y'] = Series(y_test).values
        # Rows without a prediction are excluded from the log-loss.
        df = df[df.p.notna()]
        logloss = log_loss(df['y'], df['p'])
    else:
        logloss = nan

    fi = Series(model.compute_feature_importance(method='gain'))
    fi = normalize_series(fi).to_dict()

    leaves = [tree.n_leaves for tree in model.trees]
    results = dict(model=f"{model_name}",
                   ntrees=len(model.trees),
                   leaves=leaves,
                   nleaves=sum(leaves),
                   logloss=logloss,
                   gain=fi)
    # Persisting the results is disabled in this version:
    # if save_results:
    #     DataFrame(Series(results)).T.to_csv(exp_results_path)
    print(logloss)
def compute_fi_permutation(self, X, y):
    """Return normalized permutation feature importance for every column of X.

    For each column, ``permute_col`` is applied ``N_PERMUTATIONS`` times to one
    working copy of ``X`` (the copy is not reset between repetitions) and
    ``self.compute_error`` is evaluated after each application. A column's
    score is the mean of those errors minus the error on the untouched ``X``.
    The per-column scores go through ``self.group_fi`` and ``normalize_series``.
    """
    baseline_error = self.compute_error(X, y)
    error_increase = {}

    for feature in X.columns:
        shuffled = X.copy()
        shuffled_errors = []
        for _ in range(N_PERMUTATIONS):
            # Called for its effect on `shuffled`; the return value is ignored.
            permute_col(shuffled, feature)
            shuffled_errors.append(self.compute_error(shuffled, y))
        error_increase[feature] = mean(array(shuffled_errors)) - baseline_error

    grouped = self.group_fi(error_increase)
    return normalize_series(Series(grouped))
def get_feature_importance(exp, path, loo):
    """Load the feature-importance table of one experiment.

    Args:
        exp: Experiment name. Names starting with ``'Ours'`` use only the gain
            and permutation columns; all others also use the SHAP columns.
        path: Location passed on to ``get_fi`` (and ``get_loo_fi``).
        loo: When truthy, a normalized ``'loo'`` column is added from
            ``get_loo_fi``.

    Returns:
        The frame produced by ``get_fi``, optionally extended with ``'loo'``.
    """
    base_cols = ['gain', 'permutation_train', 'permutation_test']
    if exp.startswith('Ours'):
        selected = base_cols
    else:
        selected = base_cols + ['shap_train', 'shap_test']

    # Every selected column is parsed with literal_eval.
    converters = dict.fromkeys(selected, literal_eval)
    importance = get_fi(path, selected, converters)

    if not loo:
        return importance

    importance['loo'] = get_loo_fi(path, importance.index.tolist())
    importance['loo'] = normalize_series(importance['loo'])
    return importance
def compute_fi_permutation(self, X, y):
    """Return normalized permutation feature importance based on MSE.

    For each column of ``X`` the column is re-shuffled ``N_PERMUTATIONS`` times
    on one working copy of ``X``; after each shuffle the mean squared error of
    ``self.predictor`` against ``y`` is recorded. A column's score is the mean
    of those errors minus the MSE on the untouched ``X``. The scores are
    passed to ``normalize_series``.
    """
    def _mse(features):
        # Mean squared error of the predictor on `features` against y.
        return mean(square(y - self.predictor.predict(features)))

    baseline_mse = _mse(X)
    mse_increase = {}

    for feature in X.columns:
        shuffled = X.copy()
        shuffled_mse = []
        for _ in range(N_PERMUTATIONS):
            shuffled[feature] = permutation(shuffled[feature])
            shuffled_mse.append(_mse(shuffled))
        mse_increase[feature] = mean(array(shuffled_mse)) - baseline_mse

    return normalize_series(Series(mse_increase))
def run_experiment(
        model_name: str, get_data: callable, compute_permutation: bool,
        save_results: bool, model, exp_results_path):
    """Fit ``model`` on a train/test split, report log-loss and plot its trees.

    Steps: load the data with ``get_data()``, seed the RNG, split with
    ``train_test_split`` (test fraction ``VAL_RATIO``), run the module-level
    ``preprocessing_pipeline``, fit the model, evaluate it on the test part,
    optionally write a one-row CSV of results, then plot each tree.

    Args:
        model_name: Label stored under ``results['model']``.
        get_data: Zero-argument callable returning ``(X, y)``.
        compute_permutation: When true, permutation importances are computed
            on the train and the test split via ``model.compute_fi_permutation``.
        save_results: When true, the results row is written to
            ``exp_results_path`` as CSV.
        model: Object exposing ``fit``, ``predict``, ``trees``,
            ``compute_fi_permutation`` and ``compute_feature_importance``.
        exp_results_path: Destination of the results CSV.

    Returns:
        None. The log-loss (``nan`` when ``y`` does not have exactly two
        distinct values) is printed.
    """
    X, y = get_data()
    seed(7)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=VAL_RATIO)

    preprocessing_pipeline.fit(X_train)
    X_train = preprocessing_pipeline.transform(X_train)
    X_test = preprocessing_pipeline.transform(X_test)

    original_dtypes = X_train.dtypes
    model.fit(X_train, y_train)
    test_prediction = model.predict(X_test)

    # NOTE(review): the permutation importances are computed but not stored in
    # `results` in this version -- confirm whether they should be.
    if compute_permutation:
        permutation_train = model.compute_fi_permutation(X_train, y_train).to_dict()
        permutation_test = model.compute_fi_permutation(X_test, y_test).to_dict()
    else:
        # Key the NaN placeholders by column name. Iterating the dtypes Series
        # itself yields the dtype objects, not the column names.
        permutation_train = {col: nan for col in original_dtypes.index}
        permutation_test = dict(permutation_train)

    is_classification = len(unique(y)) == 2
    if is_classification:
        # Reuse the prediction computed above instead of predicting again.
        logloss = log_loss(y_test, test_prediction)
    else:
        logloss = nan

    fi = Series(model.compute_feature_importance(method='gain'))
    fi = normalize_series(fi).to_dict()

    leaves = [tree.n_leaves for tree in model.trees]
    results = dict(model=f"{model_name}",
                   ntrees=len(model.trees),
                   leaves=leaves,
                   nleaves=sum(leaves),
                   logloss=logloss,
                   gain=fi)
    if save_results:
        DataFrame(Series(results)).T.to_csv(exp_results_path)
    print(logloss)

    # Plotting is best effort: the first failure ends the function quietly.
    # Only `Exception` is caught so Ctrl-C / SystemExit still propagate.
    for tree in model.trees:
        try:
            tree_vis = TreeVisualizer()
            tree_vis.plot(tree)
        except Exception:
            return
def get_feature_importance(exp, path, loo):
    """Read the feature-importance columns of an experiment's results CSV.

    Each selected column of the CSV is parsed with ``literal_eval``; the value
    in the first row (label 0) of each column becomes one column of the
    returned frame, indexed by feature and sorted by index. Columns after the
    first are aligned to the index established by the first one.

    Args:
        exp: Experiment name. Names starting with ``'Ours'`` use only the gain
            and permutation columns; all others also use the SHAP columns.
        path: Path of the results CSV (also passed to ``get_loo_fi``).
        loo: When truthy, a normalized ``'loo'`` column is added from
            ``get_loo_fi``.

    Returns:
        DataFrame with one column per importance type.
    """
    base_cols = ['gain', 'permutation_train', 'permutation_test']
    if exp.startswith('Ours'):
        selected = base_cols
    else:
        selected = base_cols + ['shap_train', 'shap_test']

    raw = pd.read_csv(path, converters=dict.fromkeys(selected, literal_eval))

    table = pd.DataFrame()
    for name in selected:
        first_row_value = raw.loc[0, name]
        table[name] = pd.Series(first_row_value)
    table = table.sort_index()

    if not loo:
        return table

    table['loo'] = get_loo_fi(path, table.index.tolist())
    table['loo'] = normalize_series(table['loo'])
    return table
def compute_fi_permutation(self, X, y):
    """Return normalized permutation feature importance, computed in parallel.

    Every column in ``self.x_train_cols`` starts with a score of 0. Only the
    features whose gain importance (``self.compute_fi_gain()``) is positive
    are evaluated: for each of them ``N_PERMUTATIONS`` tasks
    ``worker(X, y, col, self.compute_error)`` are run on a pool of 4
    processes, and the score is the mean of the returned values minus the
    error on the untouched ``X``. The time taken per feature is printed.

    Args:
        X: Feature table handed to ``self.compute_error`` and ``worker``.
        y: Targets handed to ``self.compute_error`` and ``worker``.

    Returns:
        The result of ``normalize_series`` applied to the per-column scores.
    """
    results = {col: 0 for col in self.x_train_cols}
    true_error = self.compute_error(X, y)

    # get only features that got positive fi_gain
    gain_fi = self.compute_fi_gain()
    positive_fi_gain = gain_fi[gain_fi > 0].index.tolist()

    if positive_fi_gain:
        # One pool serves all features; creating and tearing down a pool per
        # feature only adds process start-up cost.
        with multiprocessing.Pool(4) as process_pool:
            for col in positive_fi_gain:
                start = time()
                args = [(X, y, col, self.compute_error)
                        for _ in range(N_PERMUTATIONS)]
                prm_results = process_pool.starmap(worker, args)
                results[col] = mean(array(prm_results)) - true_error
                end = time()
                print(f"{col} run took {end - start}")

    fi = Series(results)
    return normalize_series(fi)
def compute_fi_gain(self):
    """Return the predictor's gain-based feature importance, normalized.

    The scores come from
    ``self.predictor.compute_feature_importance(method='gain')`` and are
    wrapped in a Series before being passed to ``normalize_series``.
    """
    gain_scores = self.predictor.compute_feature_importance(method='gain')
    return normalize_series(Series(gain_scores))
def compute_fi_gain(self):
    """Return the normalized feature importance reported by the predictor.

    ``self.predictor.feature_importances_`` is paired positionally with
    ``self.x_train_cols``, passed through ``self.group_fi`` and then through
    ``normalize_series``.
    """
    # TODO: fix it
    importance_by_col = {
        name: score
        for name, score in zip(self.x_train_cols,
                               self.predictor.feature_importances_)
    }
    grouped = self.group_fi(importance_by_col)
    return normalize_series(Series(grouped))
def compute_fi_gain(self):
    """Return the normalized feature importance reported by the predictor.

    Builds a Series from ``self.predictor.feature_importances_`` indexed by
    ``self.predictor.feature_names_`` and passes it to ``normalize_series``.
    """
    # TODO: fix it
    scores = self.predictor.feature_importances_
    names = self.predictor.feature_names_
    importance = Series(scores, index=names)
    return normalize_series(importance)