# Save data into files
do_save = input('Do you want to save trained model for final submission? (y/n)')
if do_save == 'y':  # only save when the user explicitly answers 'y'
    dump(X_feat_list, os.path.join(dir_out, "featlist.joblib"))
    dump(scaler, os.path.join(dir_out, "scaler.joblib"))
    dump(clf_final, os.path.join(dir_out, "filename.joblib"))
    dump(best_th_pr, os.path.join(dir_out, "bestTHR.joblib"))
    dump(imputer, os.path.join(dir_out, "imputer.joblib"))

#%%
do_pi = input('Do you want to compute Permutation Importance? (y/n)')
if do_pi == 'y':
    from eli5.sklearn import PermutationImportance

    perm = PermutationImportance(clf_final, random_state=1).fit(X_new, y)
    results_0 = perm.results_[0]
    results_mean = np.zeros(results_0.shape)
    results_std = np.zeros(results_0.shape)
    perm_means = np.mean(perm.results_, axis=0)
    perm_stds = np.std(perm.results_, axis=0)
    results_0_copy = np.copy(perm_means)
    variable_to_show = 15
    importances_normalized = results_0_copy
    indices_sorted = np.argsort(importances_normalized)[::-1]
def permutation_importance(self, model, X_val, y_val):
    '''check Feature importance on the Validation data for the fitted model'''
    perm = PermutationImportance(model, random_state=1).fit(X_val, y_val)
    return eli5.show_weights(perm, feature_names=X_val.columns.tolist())
def sort_by_imp(train_cols, importance):
    return sorted(zip(train_cols, importance), key=lambda x: x[1], reverse=True)


# Importances from models
logreg_imp = sort_by_imp(train_cols, logreg.coef_[0])
rf_imp = sort_by_imp(train_cols, rf.feature_importances_)

# Try permutation importance
perm_imps = {}
for label, model in [('rf', rf), ('logreg', logreg)]:
    perm = PermutationImportance(model, n_iter=10, cv='prefit').fit(X_valid, y_valid)
    perm_imp = sort_by_imp(train_cols, perm.feature_importances_)
    perm_imps[label] = perm_imp

    fig, ax = plt.subplots()
    ax = sns.boxplot(data=np.array(perm.results_), ax=ax)
    _ = ax.set_xticklabels(train_cols, rotation=90)
    _ = ax.set_ylabel('Improvement in log_loss')
    _ = ax.set_title(label)
    fig.tight_layout()
    plt.show(block=False)

"""
train_cols = [x[0] for x in perm_imp if x[1] > 0]
"""

# Try out SHAP
'''
A heatmap makes it easy to identify which features are most related to the
target variable, so we will plot a heatmap of correlated features using the
seaborn library.
'''
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("D://Blogs//train.csv")
X = data.iloc[:, 0:20]  # independent columns
y = data.iloc[:, -1]    # target column, i.e. price range

# get correlations of each feature in the dataset
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# plot heat map
g = sns.heatmap(data[top_corr_features].corr(), annot=True, cmap="RdYlGn")

'''
Permutation Importance: in permutation importance we ask the following question:
if I randomly shuffle a single column of the validation data, leaving the target
and all other columns in place, how would that affect the accuracy of predictions
on that now-shuffled data?
'''
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(train_X, train_y)

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
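# The question above can also be answered without eli5. The block below is a
# minimal, self-contained illustration of the shuffling idea on synthetic data;
# the helper name `manual_permutation_importance` and all data names are
# assumptions, not part of the original snippet. It shuffles one validation
# column at a time, re-scores the already-fitted model, and reports the drop
# in accuracy.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def manual_permutation_importance(model, X_val, y_val, n_rounds=5, seed=0):
    rng = np.random.RandomState(seed)
    baseline = model.score(X_val, y_val)
    drops = {}
    for col in X_val.columns:
        scores = []
        for _ in range(n_rounds):
            X_shuffled = X_val.copy()
            # shuffle only this column, leaving the target and other columns in place
            X_shuffled[col] = rng.permutation(X_shuffled[col].values)
            scores.append(model.score(X_shuffled, y_val))
        drops[col] = baseline - np.mean(scores)
    return pd.Series(drops).sort_values(ascending=False)


X_demo, y_demo = make_classification(n_samples=500, n_features=6, n_informative=3,
                                     random_state=0)
X_demo = pd.DataFrame(X_demo, columns=[f'f{i}' for i in range(6)])
X_tr, X_va, y_tr, y_va = train_test_split(X_demo, y_demo, random_state=1)
demo_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)
print(manual_permutation_importance(demo_model, X_va, y_va))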
# plt.title('True label:' + str(N_test[i,-2]) + ' likelihood of label ' + str(N_test[i,-2]) + ': ' + str(softmax1_cnn[i][int(y_test[i])]))
plt.title('True label:' + str(y_test[i]) + ' likelihood of label ' + str(y_test[i]) + ': ' + str(softmax1_cnn[i][int(y_test[i])]))
plt.clim(0.003, 0.010)
plt.colorbar()
plt.show()

# permutation feature weights
import eli5
from eli5 import format_as_image
from eli5.sklearn import PermutationImportance
from sklearn.neural_network import MLPClassifier

NNMLP_clf = MLPClassifier(random_state=48, max_iter=50)
NNMLP_clf.fit(new_last_conv1, y_test1[:])

perm_all = PermutationImportance(NNMLP_clf).fit(new_last_conv1, y_test1)
print('CNN results')
exp = eli5.explain_weights_df(perm_all, feature_names=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

perm_corr = PermutationImportance(NNMLP_clf).fit(new_last_conv1[correct_cnn[:]],
                                                 y_test1[correct_cnn[:]])
print('CNN Correct results')
exp_corr = eli5.explain_weights_df(perm_corr, feature_names=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

perm_mis = PermutationImportance(NNMLP_clf).fit(new_last_conv1[misclass_cnn[:]],
                                                y_test1[misclass_cnn[:]])
print('CNN Misclass results')
exp_mis = eli5.explain_weights_df(perm_mis, feature_names=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

from sklearn.preprocessing import normalize

n0 = normalize(final_last_conv1[correct_cnn[:]])
n1 = normalize(final_last_conv1[misclass_cnn[:]])
def QC(self, cleaned_Data_frm, cleaned_Data_frm1, y, cursor, conn): # try: print('Models Building') float_cols = self.float_col result = pd.concat( [cleaned_Data_frm, cleaned_Data_frm1, y, float_cols], axis=1) self.data_sorted1 = result.sort_values(self.i) self.data_sorted = self.data_sorted1.loc[:, ~self.data_sorted1.columns. duplicated()] print(self.data_sorted.shape) new_list = [ list(set(self.x.columns).difference(self.data_sorted.columns)) ] uploaded_cols = [] for self.col in list( self.data_sorted.select_dtypes(include=[np.float64])): if not self.col in uploaded_cols: print(self.col) X = self.data_sorted.drop([self.col], axis=1) y = self.data_sorted[self.col] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=self.test_size, random_state=42) X_train, X_test = train_test(X_train, X_test) print(X_train.shape) Modles_reuslts = [] Names = [] target = self.col print('Models Building') models = ['Random Forest', 'KNN', 'XGB', 'SVR'] l = 0 features = [] for Regressor, params, model in zip(self.Regressor, self.Regressor_grids, models): print(model) gd = RandomizedSearchCV(Regressor, params, cv=5, n_jobs=-1, verbose=True) gd.fit(X_train, y_train) y_pred = gd.predict(X_test) random_best = gd.best_estimator_.predict(X_test) errors = abs(random_best - y_test) mape = np.mean(100 * (errors / y_test)) Accuracy = 100 - mape grid = gd.best_params_ estimator = gd.best_estimator_ l = +1 if model == 'KNN': perm = PermutationImportance(gd, random_state=1).fit( X_train, y_train) importances = perm.feature_importances_ DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, None, target, model) elif model == 'SVR': weights = gd.best_estimator_.coef_ test = ' '.join(str(weights).split()) #replace double whitespace with comma test = test.replace(" ", ",") #str to json loads lists = json.loads(test) importances = lists[0] DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, None, target, model) else: importances = gd.best_estimator_.feature_importances_.tolist( ) #._final_estimator print(Accuracy) features.append(importances) DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, None, target, model) def Reg_model(): model = Sequential() model.add( Dense(500, input_dim=X_train.shape[1], activation="relu")) model.add(Dense(100, activation="relu")) model.add(Dense(50, activation="relu")) model.add(Dense(1)) model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"]) return model model = KerasClassifier(build_fn=Reg_model, verbose=0) # define the grid search parameters batch_size = [10, 20, 40, 60, 80, 100] epochs = [10, 50, 100] param_grid = dict(batch_size=batch_size, epochs=epochs) grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3) grid_result = grid.fit(X_train, y_train) grid = grid.best_params_ model = 'DNN' print("DNN", features) DB_upload(Accuracy, X_train, X_test, y_test, y_pred, features[0], grid, grid, l, None, target, model) print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
import eli5
from eli5.sklearn import PermutationImportance

n_samples = 20000

# Create array holding predictive feature
X1 = 4 * rand(n_samples) - 2
X2 = 4 * rand(n_samples) - 2
# Create y. you should have X1 and X2 in the expression for y
y = X1 * X2

# create dataframe because pdp_isolate expects a DataFrame as an argument
my_df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})
predictors_df = my_df.drop(['y'], axis=1)

my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)

pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df,
                           model_features=['X1', 'X2'], feature='X1')
pdp.pdp_plot(pdp_dist, 'X1')
plt.show()

perm = PermutationImportance(my_model).fit(predictors_df, my_df.y)

# Check your answer
q_7.check()

# show the weights for the permutation importance you just calculated
eli5.show_weights(perm, feature_names=['X1', 'X2'])
def optimize_pipeline(self, seq, X, y): """ Constructs and optimizes a pipeline according to the steps passed through `seq` which is a tuple of estimators and transformers. :param seq: the tuple of steps of the pipeline to be optimized :param X: numpy array of training features :param y: numpy array of training values :return: the optimized pipeline and its score """ from .structsearch import SurrogateRandomCV if self.couldBfirst == []: from sklearn.pipeline import Pipeline else: from imblearn.pipeline import Pipeline OPTIM = None n = len(seq) idx = 0 ent_idx = 0 steps = [] config = {} task_name = self.check_point + '_'.join(seq) while ent_idx < n: est = seq[ent_idx] clss = self._get_class(est) pre = 'stp_%d' % idx if self.config_types[est] in ['regressor', 'classifier'] and ent_idx < n - 1: mdl = clss() steps.append((pre, StackingEstimator(mdl, res=self.stack_res, probs=self.stack_probs, decision=self.stack_decision))) ent_idx += 1 elif est == 'sklearn.pipeline.FeatureUnion': self.config[est] = dict() int_idx = 1 int_steps = [] next_est = seq[ent_idx + int_idx] while ((self.config_types[next_est] in ['regressor', 'classifier']) or ( next_est in self.known_feature_selectors)) and (ent_idx + int_idx < n - 1): int_pre = "int_%d" % int_idx if next_est in self.known_feature_selectors: int_mdl = self._get_class(next_est)() # set the parameter's dictionary for kw in self.config[next_est]: self.config[est][int_pre + '__' + kw] = self.config[next_est][kw] else: from eli5.sklearn import PermutationImportance from sklearn.feature_selection import SelectFromModel from numpy import inf int_est = self._get_class(next_est)() int_mdl = SelectFromModel(PermutationImportance(int_est, cv=3), threshold=-inf) self.config[est][int_pre + '__' + 'max_features'] = Integer(1, self.num_features) for kw in self.config[next_est]: self.config[est][int_pre + '__' + 'estimator__estimator__' + kw] = \ self.config[next_est][kw] int_steps.append((int_pre, int_mdl)) int_idx += 1 next_est = seq[ent_idx + int_idx] if int_steps != []: mdl = clss(int_steps) steps.append((pre, mdl)) ent_idx += int_idx else: mdl = clss() steps.append((pre, mdl)) ent_idx += 1 for kw in self.config[est]: config[pre + '__' + kw] = self.config[est][kw] idx += 1 ppln = Pipeline(steps) if self.verbose > 0: print("=" * 90) print(seq) print("-" * 90) for srgt in self.surrogates: OPTIM = SurrogateRandomCV(ppln, params=config, max_iter=srgt[1], min_evals=self.min_random_evals, scoring=self.scoring, cv=self.cv, verbose=max(self.verbose - 1, 0), sampling=srgt[2], regressor=srgt[0], scipy_solver=srgt[3], task_name=task_name, Continue=True, warm_start=True) OPTIM.fit(X, y) return OPTIM.best_estimator_, OPTIM.best_estimator_score
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude', 'passenger_count'
]
X = data[base_features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
first_model = RandomForestRegressor(n_estimators=50, random_state=1).fit(train_X, train_y)

# show data
print("Data sample:")
print(data.head())

# Show permutation importance
perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=val_X.columns.tolist())))

############
### Creating new features
############

data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)

features_2 = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude', 'abs_lat_change', 'abs_lon_change'
def main(): data_path = './' filename = 'task_data.csv' data = pd.read_csv(data_path+filename, index_col='sample index') y_tot1 = data.pop('class_label') y_tot = y_tot1.replace(-1, 0) feat_cols = data.columns X_train, X_test, y_train, y_test = train_test_split(data, y_tot, test_size = 0.33, random_state=42) model = xgboost.XGBClassifier(objective='binary:logistic') model.fit(X_train,y_train) pred = model.predict_proba(X_test)[:, 1] predictionsT = model.predict(X_train) predictions = model.predict(X_test) print( classification_report(y_test.values, pred.round())) auc = roc_auc_score(y_test.values, pred) print( "Area under ROC curve: %.4f"%(auc)) print('test') print(confusion_matrix(y_test, predictions)) plot_confusion_matrix(model, X_test, y_test) plt.savefig("Plots/conf_m.png", transparent=True) # drop column m1 = xgboost.XGBClassifier(objective='binary:logistic') im,co = dropcol_importances(m1 ,X_train,y_train,X_test,y_test) impdrop = perf_out(im,co,"Drop Column") print(impdrop) # Permutation importance result = permutation_importance(model, X_test, y_test, n_repeats=100, random_state=41) impper = perf_out(result.importances_mean,X_test.columns,"Permutation Importance") print(impper) # Permutation importance v2 perm = PermutationImportance(model, random_state=41).fit(X_test,y_test) imppereli5 = perf_out(perm.feature_importances_,X_test.columns,"Permutation Importance ELI5") print(imppereli5) df_tot = pd.concat([impdrop,impper,imppereli5],axis=1,sort=False) if debug==True: plot_importance(model) plt.savefig("Plots/xgb_importance.png", transparent=True) imp_types = ["weight","gain","cover","total_gain","total_cover"] for i in range(len(imp_types)): imp_vals = model.get_booster().get_score(importance_type=imp_types[i]) imp_vals = sorted(imp_vals.items(), key=lambda x: x[1], reverse=True) dftype = pd.DataFrame(imp_vals,columns=['Feature',imp_types[i]]) cur_feats = dftype['Feature'].values diff = set(feat_cols)-set(cur_feats) if len(diff)!=0: null_imp = [0]*len(diff) miss_features = zip(diff, null_imp) dftype = dftype.append(pd.DataFrame(miss_features,columns=['Feature',imp_types[i]]), ignore_index=True) dftype = dftype.set_index('Feature') df_tot = pd.concat([df_tot,dftype],axis=1,sort=False) print(dftype) if imp_types[i]=="total_gain": dftype.to_csv("rank_"+imp_types[i]+".csv") if debug == True: dftype.plot.barh(y=imp_types[i], label=imp_types[i]).invert_yaxis() plt.savefig("Plots/rank_"+imp_types[i]+".png", transparent=True) df_tot = df_tot.sort_values('total_gain', ascending=False) df_tot = df_tot/df_tot.max() print(df_tot) if debug == True: df_tot.plot.barh().invert_yaxis() plt.savefig("Plots/ranks.png", transparent=True)
def module4(): config = configparser.ConfigParser() config.read('./ml_box.ini') runtime_settings = config['RUNTIME'] labelField = runtime_settings['label_field'] labelField = stripNonAlphanumeric([labelField])[0] #clean labelField dash_data_path = runtime_settings['dash_data_path'] ingest_settings = config['INGEST'] data_type = ingest_settings['datatype'] if data_type == 'sql': data_source = ingest_settings['TABLE_NAME'] else: data_source = ingest_settings['file_name'] #Load best model def pickle_load(name): PIK = str(name) + ".pickle" with open(PIK, "rb") as f: temp_item = pickle.load(f) return temp_item best_model_path = dash_data_path + 'best_model_automl' best_model = pickle_load(best_model_path) X_train_path = dash_data_path + 'X_train' X_train = pickle_load(X_train_path) X_test_path = dash_data_path + 'X_test' X_test = pickle_load(X_test_path) Y_train_path = dash_data_path + 'Y_train' Y_train = pickle_load(Y_train_path) Y_test_path = dash_data_path + 'Y_test' Y_test = pickle_load(Y_test_path) def pickle_save(name, item): PIK = str(name) + ".pickle" with open(PIK, "wb") as f: pickle.dump(item, f) Y_pred = best_model.predict(X_test) #ROC curve #calculate the fpr and tpr for all thresholds of the classification probs = best_model.predict_proba(X_test) preds = probs[:, 1] fpr, tpr, threshold = metrics.roc_curve(Y_test, preds) roc_auc = metrics.auc(fpr, tpr) #feature metrics for sensitivity analysis # featureMeans = X_train.mean() featureMins = X_train.min() featureMaxs = X_train.max() featureMetrics = pd.concat([featureMins, featureMaxs], axis=1) featureMetrics.columns = ['min', 'max'] #confusion matrix conf = confusion_matrix(y_true=Y_test, y_pred=Y_pred) # ## Market Basket mb_ignore = runtime_settings['mb_ignore'] if mb_ignore == 'false': clean_df_path = dash_data_path + 'clean_df' norm_df_clean = pickle_load(clean_df_path) #drop non-categorical columns norm_df = norm_df_clean.select_dtypes( include=['int64', 'uint8', 'float64']) #one more filter - get rid of int columns that are not categorical (tenure) #get rid of columns that are never 'negative' (always 1 or greater) continuous_int_cols = list( norm_df.loc[:, (norm_df <= 0).sum() == 0].columns) print("Market basket analysis - removed columns that are never 0:", continuous_int_cols) norm_df.drop(continuous_int_cols, axis=1, inplace=True) #drop target variable norm_df.drop(labelField, axis=1, inplace=True) # drop collinear columns (ex. 
dropping _No phone services attributes, since PhoneService_No covers these) # NOTE - gives preference for columns appearing first in the data cols_to_drop = [] for x in norm_df.columns: for y in norm_df.columns: corr = norm_df[x].corr(norm_df[y]) if x != y and (corr >= 0.99): if y not in cols_to_drop and x not in cols_to_drop: cols_to_drop.append(y) norm_df.drop(labels=cols_to_drop, axis=1, inplace=True) def encode_units(x): if x < 1: return 0 if x >= 1: return 1 basket_sets = norm_df.applymap(encode_units) #If support is too high, lower to allow values in try: frequent_itemsets = apriori(basket_sets, min_support=0.1, use_colnames=True, max_len=3) rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1) except ValueError: frequent_itemsets = apriori(basket_sets, min_support=0.001, use_colnames=True, max_len=3) rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1) max_lift = rules[(rules['lift'] >= 1) & (rules['conviction'] != np.inf)] market_basket = max_lift.sort_values(by='lift', ascending=False).head( 100) #limit to top 100 market_basket['antecedents'] = market_basket['antecedents'].map( lambda x: list(x)) market_basket['consequents'] = market_basket['consequents'].map( lambda x: list(x)) #Drop baskets containing the same items but reversed market_basket['unique'] = ( market_basket['antecedents'] + market_basket['consequents']).apply(lambda x: ' '.join(sorted(x))) market_basket.drop_duplicates(subset=['unique'], keep='first', inplace=True) #drop a<->c baskets #Calculate top Lift and Support cells, set dummy value in new column for later market basket conditional formatting lift_top10_thresh = market_basket['lift'].quantile( q=0.9) #90th percentile threshold # market_basket['lift_highlight'] = np.where( market_basket['lift'] >= lift_top10_thresh, 1, 0) support_top10_thresh = market_basket['support'].quantile( q=0.9) #90th percentile threshold # market_basket['support_highlight'] = np.where( market_basket['support'] >= support_top10_thresh, 1, 0) # market_basket['lift'] = market_basket['lift'].map(lambda x: '{:.2f}x'.format(x)) # market_basket['support'] = market_basket['support'].map(lambda x: '{:.0%}'.format(x)) market_basket['Basket'] = ( market_basket['antecedents'] + market_basket['consequents']).apply(lambda x: ', '.join(x)) market_basket = market_basket[[ 'Basket', 'support', 'lift', 'lift_highlight', 'support_highlight' ]] market_basket.columns = [ 'Basket', 'Support', 'Lift', 'lift_highlight', 'support_highlight' ] #Remove any baskets whose items are a subset of a larger basket (preference for basket specificity) all_baskets = list(market_basket['Basket'].unique()) all_baskets = [set(x.split(', ')) for x in all_baskets] print("Removing basket subsets...") remove_baskets = [] for index, row in market_basket.iterrows(): b = row['Basket'] b_set = set(b.split(', ')) for a in all_baskets: if b_set != a and b_set.issubset( a): #if basket has superset in all_baskets, remove remove_baskets.append(b) market_basket = market_basket[~market_basket['Basket']. 
isin(remove_baskets)] market_basket.reset_index(inplace=True, drop=True) # ### Get averages and sums for all other columns by these market baskets calc_dict = {} calc_dict_cols = [] for b in market_basket.Basket.values: b_parsed = b.split(', ') basket_filtered_df = norm_df_clean.copy( ) #create df where all items in basket will be true for item in b_parsed: basket_filtered_df = basket_filtered_df[ basket_filtered_df[item] >= 1] for c in basket_filtered_df.columns: #find agg measures of each column basket_filtered_column = basket_filtered_df[ c] #in basket universe m = basket_filtered_column.mean() s = basket_filtered_column.sum() cnt = basket_filtered_column.count() population_column = norm_df_clean[c] #in total universe p_m = population_column.mean() p_s = population_column.sum() p_cnt = population_column.count() basket_col_name = c + '_basket' pop_col_name = c + '_pop' if b in calc_dict.keys(): # data calc_dict[b] += [s, m, cnt, p_s, p_m, p_cnt] else: calc_dict[b] = [s, m, cnt, p_s, p_m, p_cnt] # columns for name_of_column in [ basket_col_name + '_sum', basket_col_name + '_mean', basket_col_name + '_count', pop_col_name + '_sum', pop_col_name + '_mean', pop_col_name + '_count' ]: if name_of_column not in calc_dict_cols: calc_dict_cols.append(name_of_column) market_basket_calcs = pd.DataFrame.from_dict(calc_dict, orient='index', columns=calc_dict_cols) #Format calc columns # calc_format = [col for col in market_basket_calcs if col.endswith('sum') or col.endswith('count') or col.endswith('mean')] # market_basket_calcs[calc_format] = market_basket_calcs[calc_format].applymap(lambda x: '{:.2f}'.format(x) # if len(str(round(x))) <= 1 # else '{:.0f}'.format(x)) market_basket_calcs['Basket'] = market_basket_calcs.index #join calcs to market basket market_basket = market_basket.merge(market_basket_calcs, on='Basket', how='left') # Find top 10th percentile for each calculated field calc_cols = [ col for col in market_basket if col.endswith('sum') or col.endswith('count') or col.endswith('mean') ] for col in calc_cols: top10_thresh = market_basket[col].quantile( q=0.9) #90th percentile threshold market_basket[col + '_highlight'] = np.where( market_basket[col] >= top10_thresh, 1, 0) #Replace "_" with " = " for readability # market_basket['Basket'] = market_basket['Basket'].str.replace('_', ' = ') market_basket_csv_path = dash_data_path + 'market_basket.csv' market_basket.to_csv(market_basket_csv_path, index=False) # ### Data summary norm_df_summary = generateDf() data_summary = norm_df_summary.head(1) data_summary_csv_path = dash_data_path + 'data_summary.csv' data_summary.to_csv(data_summary_csv_path, index=False) clean_df_path = dash_data_path + 'clean_df' data_post_transform = pickle_load(clean_df_path) data_summary_post_transform = data_post_transform.head(1) data_summary_post_transform_csv_path = dash_data_path + 'data_summary_post_transform.csv' data_summary_post_transform.to_csv(data_summary_post_transform_csv_path, index=False) data_post_transform_csv_path = dash_data_path + 'data_post_transform.csv' data_post_transform.to_csv(data_post_transform_csv_path, index=False) # ## Permutation-based feature importance feature_names = list(X_test.columns.values) perm = PermutationImportance(best_model).fit(X_test, Y_test) ex = eli5.explain_weights(perm, feature_names=feature_names) perm_feature_wt = eli5.formatters.as_dataframe.format_as_dataframe(ex) perm_feature_wt = dict(perm_feature_wt[['feature', 'weight']]) #create perm_feature_wt for pre-dummified data (for single instance prediction tab) 
dummy_memory_path = dash_data_path + 'dummy_memory' dummy_memory = pickle_load(dummy_memory_path) dummy_memory = {x[:-3]: y for x, y in dummy_memory.items() } #remove ' = ' separator from parent def returnPreDummyCol( postDummyCol): #returns pre-dummification column/parent for dummy_parent, dummy_children in dummy_memory.items(): if postDummyCol in dummy_children: return dummy_parent return postDummyCol perm_feature_wt_predummy = perm_feature_wt.copy() perm_feature_wt_predummy = pd.DataFrame(perm_feature_wt_predummy) perm_feature_wt_predummy['feature'] = perm_feature_wt_predummy[ 'feature'].map(returnPreDummyCol) perm_feature_wt_predummy = perm_feature_wt_predummy[['feature']] perm_feature_wt_predummy.drop_duplicates(inplace=True) perm_feature_wt_predummy.reset_index(inplace=True, drop=True) #Create pre-dummified features_metrics for use in single instance prediction tab featureMetricsPreDummy = featureMetrics.copy() featureMetricsPreDummy['preDummy'] = featureMetricsPreDummy.index.map( returnPreDummyCol) featureMetricsPreDummy.drop_duplicates(subset=['preDummy'], inplace=True) #erase calculations for predummy variables # feature_metrics['mean'] = np.where(feature_metrics.index != feature_metrics['preDummy'], float('nan'), feature_metrics['mean']) featureMetricsPreDummy['min'] = np.where( featureMetricsPreDummy.index != featureMetricsPreDummy['preDummy'], float('nan'), featureMetricsPreDummy['min']) featureMetricsPreDummy['max'] = np.where( featureMetricsPreDummy.index != featureMetricsPreDummy['preDummy'], float('nan'), featureMetricsPreDummy['max']) clf = tree.DecisionTreeClassifier() clf.fit(X_train, Y_train) rf = RandomForestClassifier() rf.fit(X_train, Y_train) rf_feature_importances = pd.DataFrame(rf.feature_importances_, index=X_train.columns, columns=['importance']).sort_values( 'importance', ascending=False) #export pre-dummified data sample for use in single instance prediction sample_size = 200 if X_test.shape[0] < sample_size: sample_size = X_test.shape[0] rand_prediction = X_test.sample(n=sample_size) #Given set of observations, return probability of 1 using best model def get_pos_proba_from_x(observation): best_model_automl_path = dash_data_path + 'best_model_automl' best_model_pipeline = pickle_load(best_model_automl_path) probabilities = best_model_pipeline.predict_proba([observation]) return probabilities[0][1] rand_prediction['pos_proba'] = rand_prediction.apply(get_pos_proba_from_x, axis=1) #create index sorted by pos_proba for eventual slider rand_prediction.sort_values(by='pos_proba', inplace=True) #get pre-dummified data for same set of indices sample_indices = rand_prediction.index pre_dummified_clean_df_path = dash_data_path + 'pre_dummified_clean_df' pre_dummy_data = pickle_load(pre_dummified_clean_df_path) rand_prediction_pre_dummy = pre_dummy_data.loc[sample_indices] rand_prediction_pre_dummy['pos_proba'] = rand_prediction['pos_proba'] rand_prediction_pre_dummy.reset_index(inplace=True, drop=True) rf_feature_importances = rf_feature_importances[0:6] feature_names = list(X_test.columns.values) clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, Y_train) ex = eli5.explain_weights(clf, feature_names=feature_names) reg_feature_weight = eli5.formatters.as_dataframe.format_as_dataframe(ex) top_3 = reg_feature_weight.head(3) bottom_3 = reg_feature_weight.tail(3) analysis_lines = [] first_feature_pos = top_3.iloc[[0]]['feature'].item() second_feature_pos = top_3.iloc[[1]]['feature'].item() third_feature_pos = 
top_3.iloc[[2]]['feature'].item() second_feature_ratio = top_3.iloc[[1]]['weight'].item() / top_3.iloc[[ 0 ]]['weight'].item() third_feature_ratio = top_3.iloc[[2]]['weight'].item() / top_3.iloc[[ 0 ]]['weight'].item() analysis_lines.append( 'For predicting %s classes of %s, the strongest predictors are: ' % (runtime_settings['pos_label'], runtime_settings['label_field'])) analysis_lines.append('The strongest predictor is %s' % (first_feature_pos)) analysis_lines.append( 'The second strongest predictor is %s with a weight %0.2f%% of %s' % (second_feature_pos, second_feature_ratio, first_feature_pos)) analysis_lines.append( 'The third strongest predictor is %s with a weight %0.2f%% of %s' % (third_feature_pos, third_feature_ratio, first_feature_pos)) first_feature_neg = bottom_3.iloc[[2]]['feature'].item() second_feature_neg = bottom_3.iloc[[1]]['feature'].item() third_feature_neg = bottom_3.iloc[[0]]['feature'].item() second_feature_ratio = bottom_3.iloc[[ 0 ]]['weight'].item() / bottom_3.iloc[[1]]['weight'].item() third_feature_ratio = bottom_3.iloc[[0]]['weight'].item() / bottom_3.iloc[[ 2 ]]['weight'].item() # analysis_lines.append('For predicting %s classes of %s, the strongest predictors are: ' % (runtime_settings['neg_label'], runtime_settings['label_field'])) analysis_lines.append('The strongest predictor is %s' % (first_feature_neg)) analysis_lines.append( 'The second strongest predictor is %s with a weight %0.2f%% of %s' % (second_feature_neg, second_feature_ratio, first_feature_neg)) analysis_lines.append( 'The third strongest predictor is %s with a weight %0.2f%% of %s' % (third_feature_neg, third_feature_ratio, first_feature_neg)) # Time Series Analysis # load cleansed data with time variables clean_df_time_path = dash_data_path + 'clean_df_time' clean_df_time = pickle_load(clean_df_time_path) # datetime columns date_cols_path = dash_data_path + 'date_cols' date_cols = pickle_load(date_cols_path) all_date_cols_path = dash_data_path + 'all_date_cols' all_date_cols = pickle_load(all_date_cols_path) # all other columns non_date_cols_path = dash_data_path + 'non_date_cols' non_date_cols = pickle_load(non_date_cols_path) all_non_date_cols_path = dash_data_path + 'all_non_date_cols' all_non_date_cols = pickle_load(all_non_date_cols_path) # If ts_ignore = true, skip time series analysis. if runtime_settings['ts_ignore'] == 'true': print('Ignore Time Series = TRUE. Skipping time series analysis.') ts_acf_path = dash_data_path + 'ts_acf' pickle_save(ts_acf_path, None) ts_runs_path = dash_data_path + 'ts_runs' pickle_save(ts_runs_path, None) ts_trends_path = dash_data_path + 'ts_trends' pickle_save(ts_trends_path, None) ts_forecast_path = dash_data_path + 'ts_forecast' pickle_save(ts_forecast_path, None) ts_best_model_path = dash_data_path + 'ts_best_model' pickle_save(ts_best_model_path, None) # If no datetime data, there is no analysis to be done. Continue on. elif len(date_cols) == 0: print('No datetime data. 
Skipping time series analysis.') ts_acf_path = dash_data_path + 'ts_acf' pickle_save(ts_acf_path, None) ts_runs_path = dash_data_path + 'ts_runs' pickle_save(ts_runs_path, None) ts_trends_path = dash_data_path + 'ts_trends' pickle_save(ts_trends_path, None) ts_forecast_path = dash_data_path + 'ts_forecast' pickle_save(ts_forecast_path, None) ts_best_model_path = dash_data_path + 'ts_best_model' pickle_save(ts_best_model_path, None) else: # resample and get process control stats for all time-feature pairs for tc in date_cols: for cc in non_date_cols: control_stats_here = timeSeriesPreProcessing( clean_df_time, tc, cc) if (non_date_cols.index(cc) == 0) & (date_cols.index(tc) == 0): ts_control_stats = control_stats_here else: ts_control_stats = ts_control_stats.append( control_stats_here) # use process control stats to select which variables to run decreasing = ts_control_stats.loc[tc][( ts_control_stats.loc[(tc), 'decr'] > 7)]['decr'].sort_values( ascending=False) increasing = ts_control_stats.loc[tc][( ts_control_stats.loc[(tc), 'incr'] > 7)]['incr'].sort_values( ascending=False) below_mn = ts_control_stats.loc[tc][(ts_control_stats.loc[( tc), 'blw_mn'] > 7)]['blw_mn'].sort_values(ascending=False) above_mn = ts_control_stats.loc[tc][(ts_control_stats.loc[( tc), 'abv_mn'] > 7)]['abv_mn'].sort_values(ascending=False) below_lcl = ts_control_stats.loc[tc][(ts_control_stats.loc[( tc), 'blw_lcl'] > 0)]['blw_lcl'].sort_values(ascending=False) above_ucl = ts_control_stats.loc[tc][(ts_control_stats.loc[( tc), 'abv_ucl'] > 0)]['abv_ucl'].sort_values(ascending=False) top_n = 2 # we will choose top_n features from each category to run against each time variable to_run_list_here = list( itertools.product([tc], decreasing[:top_n].index) ) + list(itertools.product([tc], increasing[:top_n].index)) + list( itertools.product([tc], below_mn[:top_n].index) ) + list(itertools.product([tc], above_mn[:top_n].index)) + list( itertools.product([tc], below_lcl[:top_n].index)) + list( itertools.product([tc], above_ucl[:top_n].index)) to_run_list_here = list(set(to_run_list_here)) if date_cols.index(tc) == 0: to_run_list = to_run_list_here else: to_run_list = to_run_list + to_run_list_here # remove duplicates - hopefully this step is unnecessary to_run_list = list(set(to_run_list)) print('to run:', to_run_list) # save TS process control stats ts_control_stats_path = dash_data_path + 'ts_control_stats' pickle_save(ts_control_stats_path, ts_control_stats) # top 7 most informative features perm_feature_wt_df = pd.DataFrame(perm_feature_wt) num_features = min((perm_feature_wt_df.shape[0] - 1), 6) top_10 = perm_feature_wt_df.sort_values( 'weight', ascending=False).reset_index( drop=True).loc[:num_features, 'feature'].values.tolist() top_10.append(labelField) #for cc in top_10: # for tc in date_cols: for tuple in to_run_list: tc = tuple[0] cc = tuple[1] # optimal drop and resample parameters resample_period = ts_control_stats.loc[(tc, cc), 'period'] drop_first = int(ts_control_stats.loc[(tc, cc), 'drop_first']) drop_last = int(ts_control_stats.loc[(tc, cc), 'drop_last']) acf_here, trends_here, forecast_here, best_model_here = runTimeSeries( clean_df_time, resample_period, drop_first, drop_last, tc, cc) # append results to the larger multi-index dfs #if (date_cols.index(tc) == 0) & (top_10.index(cc) == 0): if to_run_list.index(tuple) == 0: ts_acf = acf_here ts_trends = trends_here ts_forecast = forecast_here ts_best_model = best_model_here ts_runs = pd.DataFrame(data={ 'time_var': [tc], 'feature': [cc] }) else: ts_acf = 
ts_acf.append(acf_here) ts_trends = ts_trends.append(trends_here) ts_forecast = ts_forecast.append(forecast_here) ts_best_model = ts_best_model.append(best_model_here) ts_runs = ts_runs.append({ 'time_var': tc, 'feature': cc }, ignore_index=True) ts_acf_path = dash_data_path + 'ts_acf' pickle_save(ts_acf_path, ts_acf) ts_runs_path = dash_data_path + 'ts_runs' pickle_save(ts_runs_path, ts_runs) ts_trends_path = dash_data_path + 'ts_trends' pickle_save(ts_trends_path, ts_trends) ts_forecast_path = dash_data_path + 'ts_forecast' pickle_save(ts_forecast_path, ts_forecast) ts_best_model_path = dash_data_path + 'ts_best_model' pickle_save(ts_best_model_path, ts_best_model) # ## Prepare for export x_100_test = X_test[:100] # print(x_100_test.shape) #Time to classify 100 new samples start = time.clock() best_model.predict(x_100_test) elapsed = time.clock() - start # print(elapsed) #Prepare various metrics for export model_type = (' ').join(list(best_model.named_steps.keys())) timestamp = datetime.now() params_json = best_model.get_params() precision = precision_score(Y_test, Y_pred) recall = recall_score(Y_test, Y_pred) f1 = f1_score(Y_test, Y_pred) accuracy = accuracy_score(Y_test, Y_pred) log_loss_score = log_loss(Y_test, Y_pred) # mae = mean_absolute_error(Y_test, Y_pred) # mse = mean_squared_error(Y_test, Y_pred) # roc_auc = #roc_auc already defined above test_item_count = Y_test.count() run_time = elapsed #More metrics file_name = data_source row_count = norm_df_summary.shape[0] col_count = norm_df_summary.shape[1] target_variable = labelField model_metrics = [ model_type, timestamp, params_json, precision, recall, f1, accuracy, log_loss_score, # mae, # mse, roc_auc, test_item_count, run_time, rf_feature_importances, file_name, row_count, col_count, target_variable, analysis_lines ] # In[447]: model_metrics_columns = [ 'model_type', 'timestamp', 'params_json', 'precision', 'recall', 'f1', 'accuracy', 'log_loss_score', # 'mae', # 'mse', 'roc_auc', 'test_item_count', 'run_time', 'rf_feature_importances', 'file_name', 'row_count', 'col_count', 'target_variable', 'analysis_lines' ] metrics_df = dict(zip(model_metrics_columns, model_metrics)) metrics_df_path = dash_data_path + 'metrics_df' pickle_save(metrics_df_path, metrics_df) perm_feature_wt_path = dash_data_path + 'perm_feature_wt' pickle_save(perm_feature_wt_path, perm_feature_wt) rand_prediction_path = dash_data_path + 'rand_prediction' pickle_save(rand_prediction_path, rand_prediction_pre_dummy) perm_feature_wt_predummy_path = dash_data_path + 'perm_feature_wt_predummy' pickle_save(perm_feature_wt_predummy_path, perm_feature_wt_predummy) featureMetricsPreDummy_path = dash_data_path + 'featureMetricsPreDummy' pickle_save(featureMetricsPreDummy_path, featureMetricsPreDummy) fpr_path = dash_data_path + 'fpr' pickle_save(fpr_path, list(fpr)) tpr_path = dash_data_path + 'tpr' pickle_save(tpr_path, list(tpr)) # rf_explanation_example_path = dash_data_path + 'rf_explanation_example' # pickle_save(rf_explanation_example_path, rf_explanation_example) featureMetrics_path = dash_data_path + 'featureMetrics' pickle_save(featureMetrics_path, featureMetrics) conf_matrix_path = dash_data_path + 'conf_matrix' pickle_save(conf_matrix_path, conf)
X_train_transformed = transformers.fit_transform(X_train)
# use transform (not fit_transform) so the validation set is encoded with the
# transformers fitted on the training data
X_val_transformed = transformers.transform(X_val)

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_transformed, y_train)

# Permutation Importance
import eli5
from eli5.sklearn import PermutationImportance

# 1. Calculate permutation importances
permuter = PermutationImportance(
    model,
    scoring='accuracy',
    n_iter=5,
    random_state=42
)
permuter.fit(X_val_transformed, y_val)

feature_names = X_val.columns.tolist()
pd.Series(permuter.feature_importances_, feature_names).sort_values(ascending=False)

# Display permutation importances
eli5.show_weights(
    permuter,
    top=None,  # shows all features
    feature_names=feature_names
)
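# A common follow-up (a sketch under assumptions, not part of the original code):
# keep only the features whose mean permutation importance is positive on the
# validation set. The synthetic data and the names `perm_sel`, `selected`,
# `clf_small` below are illustrative placeholders.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from eli5.sklearn import PermutationImportance

Xd, yd = make_classification(n_samples=400, n_features=8, n_informative=3, random_state=0)
Xd = pd.DataFrame(Xd, columns=[f'col{i}' for i in range(8)])
Xtr, Xva, ytr, yva = train_test_split(Xd, yd, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(Xtr, ytr)
perm_sel = PermutationImportance(clf, scoring='accuracy', n_iter=5,
                                 random_state=42).fit(Xva, yva)

importances = pd.Series(perm_sel.feature_importances_, Xva.columns)
selected = importances[importances > 0].index.tolist()  # features that actually help on validation
print(selected)
clf_small = RandomForestClassifier(n_estimators=100, random_state=42).fit(Xtr[selected], ytr)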
early_stopping_rounds=100, random_state=42, scale_pos_weight=15, learning_rate=.005, reg_lambda=.01, verbosity=1) print('fitting...') model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', verbose=True) y_pred_proba = model.predict_proba(X_val)[:, 1] print(f'Validation ROC AUC score: {roc_auc_score(y_val, y_pred_proba)}') print('permuting...') permuter = PermutationImportance(model, cv='prefit', n_iter=5, scoring='roc_auc', random_state=42) permuter.fit(X_val, y_val) features_of_import = pd.Series(permuter.feature_importances_, val.columns).sort_values(ascending=True) print('importance', features_of_import) print('plotting...') fig1 = go.Figure() fig1.add_trace(go.Bar(x=features_of_import, y=val.columns)) py.iplot(fig1, filename='features1') mask = features_of_import > 0 trimmed_columns = train.columns[mask] train_trimmed = train[trimmed_columns]
df.drop(['Name'], axis=1, inplace=True, errors='ignore')
df.drop(['Cabin'], axis=1, inplace=True, errors='ignore')

df['Fare'].fillna(value=df['Fare'].mean(), inplace=True)
fare = np.array(df['Fare'])
df['Fare'] = normalize([fare]).T
df.drop(['Fare'], axis=1, inplace=True, errors='ignore')

train_features = train_dataset.drop("Survived", axis=1)
train_labels = train_dataset["Survived"]
test_features = test_dataset

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_features, train_labels)

perm = PermutationImportance(random_forest, random_state=1).fit(train_features, train_labels)
eli5.show_weights(perm, feature_names=train_features.columns.tolist())

Y_pred = random_forest.predict(test_features)

# confusion_matrix expects true and predicted labels, so compare the training
# labels against the model's predictions on the training features
c_matrix = confusion_matrix(train_labels, random_forest.predict(train_features))

out_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
out_df['PassengerId'] = test_data_ID
out_df['Survived'] = Y_pred
submission_filepath = 'sample_submission.csv'
out_df.to_csv(submission_filepath, index=False)

sns.heatmap(c_matrix.T, square=True, annot=True, fmt='g', cbar=True)
plt.xlabel('true labels')
plt.ylabel('predicted labels')
(RobustScaler(), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']), (SelectKBest(selection_score_func, k=1), ['<NAME3>']), (SelectKBest(selection_score_func, k=2), ['<NAME2>', '<NAME3>']), (FeatureUnion([('k', SelectKBest(selection_score_func, k=2)), ('p', SelectPercentile(selection_score_func, 30)) ]), ['k:<NAME2>', 'k:<NAME3>', 'p:<NAME3>']), (VarianceThreshold(0.0), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']), (VarianceThreshold(1.0), ['<NAME2>']), (GenericUnivariateSelect(), ['<NAME2>']), (GenericUnivariateSelect(mode='k_best', param=2), ['<NAME2>', '<NAME3>']), (SelectFromModel(LogisticRegression( 'l1', C=0.01, random_state=42)), ['<NAME0>', '<NAME2>']), (SelectFromModel( PermutationImportance( LogisticRegression(random_state=42), cv=5, random_state=42, refit=False, ), threshold=0.1, ), ['<NAME2>', '<NAME3>']), (RFE(LogisticRegression(random_state=42), 2), ['<NAME1>', '<NAME3>']), (RFECV(LogisticRegression(random_state=42)), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']), ] + _additional_test_cases) def test_transform_feature_names_iris(transformer, expected, iris_train): X, y, _, _ = iris_train transformer.fit(X, y) # Test in_names being provided res = transform_feature_names(transformer, ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']) assert res == expected
auc(svm_fpr, svm_tpr)
auc(rf_fpr, rf_tpr)

# Permutation importance
diease_data_train_X_scaled_df = pd.DataFrame(diease_data_train_X_scaled)
temp = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal'
]
diease_data_train_X_scaled_df.columns = temp

import eli5  # for permutation importance
from eli5.sklearn import PermutationImportance

sgd_perm = PermutationImportance(sgd_predictor, random_state=1).fit(
    diease_data_train_X_scaled, diease_data_train_y)
sgd_importance = eli5.explain_weights(
    sgd_perm, feature_names=diease_data_train_X_scaled_df.columns.tolist())

svm_perm = PermutationImportance(svm_predictor, random_state=1).fit(
    diease_data_train_X_scaled, diease_data_train_y)
svm_importance = eli5.explain_weights(
    svm_perm, feature_names=diease_data_train_X_scaled_df.columns.tolist())

rf_perm = PermutationImportance(rf_predictor, random_state=1).fit(
    diease_data_train_X_scaled, diease_data_train_y)
rf_importance = eli5.explain_weights(
    rf_perm, feature_names=diease_data_train_X_scaled_df.columns.tolist())

# plot the features in descending order of relative importance
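# The plotting code referred to by the comment above is not included in this
# snippet. Below is a minimal, self-contained sketch of one way to draw features
# in descending order of permutation importance; the synthetic data and names
# such as `demo_rf` and `importance_series` are placeholders, not the original code.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance

Xp, yp = make_classification(n_samples=300, n_features=6, n_informative=3, random_state=1)
Xp = pd.DataFrame(Xp, columns=[f'f{i}' for i in range(6)])
demo_rf = RandomForestClassifier(n_estimators=100, random_state=1).fit(Xp, yp)

demo_perm = PermutationImportance(demo_rf, random_state=1).fit(Xp, yp)
importance_series = pd.Series(demo_perm.feature_importances_, Xp.columns).sort_values()

importance_series.plot.barh()  # largest importance ends up at the top of the chart
plt.xlabel('mean decrease in score after shuffling')
plt.tight_layout()
plt.show()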
sns.distplot(train_df.revenue, ax=ax[0])
ax[0].set_title("Train Set Revenue Histogram")
sns.distplot(predictions_extra_trees_tuned_test, ax=ax[1])
ax[1].set_title("Test Set Revenue Prediction Histogram")
f.tight_layout()

# ## Feature Selection

# ### Feature Selection with Eli5 for xgboost

# In[ ]:

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(clf_stra_xgb, random_state=42).fit(xtrain, ytrain)

# In[ ]:

eli5.show_weights(perm, feature_names=xvalid.columns.tolist(), top=100)

# In[ ]:

from sklearn.feature_selection import SelectFromModel

max_selected_features = 10
sel = SelectFromModel(perm,
                      max_features=max_selected_features,
                      threshold=0.005,
                      prefit=True)
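# A possible next step (not in the original notebook; it reuses the `sel`,
# `xtrain` and `xvalid` objects from the cell above): apply the prefit selector
# to keep only the columns whose permutation importance cleared the 0.005 threshold.
selected_mask = sel.get_support()
selected_columns = xtrain.columns[selected_mask]
xtrain_selected = sel.transform(xtrain)
xvalid_selected = sel.transform(xvalid)
print(list(selected_columns))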
import eli5
from eli5.sklearn import PermutationImportance

encoder = GDB_pipeline.named_steps.ordinalencoder
X_train_encoded = encoder.fit_transform(X_train_cut)
X_val_encoded = encoder.transform(X_val_cut)

imputer = GDB_pipeline.named_steps.iterativeimputer
X_train_imputed = imputer.fit_transform(X_train_encoded)
# transform (rather than fit_transform) keeps the imputer fitted on the training data only
X_val_imputed = imputer.transform(X_val_encoded)

model = GDB_pipeline.named_steps.gradientboostingclassifier
# model.fit(X_train_imputed, y_train)

permuter = PermutationImportance(model, scoring='accuracy', n_iter=2)
permuter.fit(X_val_imputed, y_val)

feature_names = X_val_encoded.columns.tolist()
eli5.show_weights(permuter, top=None, feature_names=feature_names)

# In[78]:

from pdpbox import pdp

plt.style.use('seaborn-dark-palette')

feature = 'down'
model = GDB_pipeline.named_steps['gradientboostingclassifier']
model_features = X_train_cut.columns

X_train_imputed = pd.DataFrame(X_train_imputed)
X_train_imputed.columns = X_train_cut.columns

pdp_dist = pdp.pdp_isolate(model=model,
def classification(self, cleaned_Data_frm1, cleaned_Data_frm, y, cursor, conn): # try: Modles_reuslts = [] Names = [] print("Model building") float_cols = self.float_col result = pd.concat( [cleaned_Data_frm1, cleaned_Data_frm, y, float_cols], axis=1) self.data_sorted1 = result.loc[:, ~result.columns.duplicated()] self.data_sorted = self.data_sorted1.sort_values(self.i) new_list = [ list(set(self.data_sorted.columns).difference(self.x.columns)) ] X = self.data_sorted.drop([self.i], axis=1) y = self.data_sorted[self.i] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42) X_train, X_test = train_test(X_train, X_test) # List of pipelines for ease of iteration l = 0 access_key_id = self.access_key_id secret_access_key = self.secret_access_key models = ['Random Forest', 'KNN', 'XGB', 'SVC'] for classifier, params, model in zip(self.Classifier, self.Classifiers_grids, models): print(classifier) l += 1 gd = RandomizedSearchCV(classifier, params, cv=5, n_jobs=-1, verbose=True, refit=True) gd.fit(X_train, y_train) grid = gd.best_params_ estimator = gd.best_estimator_ y_pred = gd.predict(X_test) cm = confusion_matrix(y_test, y_pred) target = self.i Accuracy = metrics.accuracy_score(y_test, y_pred) print("Accuracy:", metrics.accuracy_score(y_test, y_pred)) if model == 'KNN': perm = PermutationImportance(gd, random_state=1).fit( X_train, y_train) importances = perm.feature_importances_ DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model) elif model == 'SVC': importances = gd.best_estimator_.coef_ imp = importances.tolist() importances = imp[0] DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model) else: importances = gd.best_estimator_.feature_importances_.tolist() DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model) # encoded_classes = list(self.cleaned_Data_frm) # model architecture if self.types == 'Classification_problem': def DNN(): model = Sequential() model.add( Dense(512, input_dim=X_train.shape[1], init='normal', activation='relu')) model.add(BatchNormalization()) model.add(Dropout(0.5)) model.add(Dense(32, init='normal', activation='relu')) model.add(BatchNormalization()) model.add(Dropout(0.5)) model.add(Dense(1, init='normal', activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy']) return model classifier = KerasClassifier(build_fn=DNN, verbose=1) batch_size = [10, 20, 40, 60, 80, 100] epochs = [10, 50, 100] param_grid = dict(batch_size=batch_size, epochs=epochs) grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3) grid_result = grid.fit(X_train, y_train) estimator = grid.best_estimator_ Accuracy = grid_result.best_score_ print("%s" % (estimator)) perm = PermutationImportance(grid, scoring='accuracy', random_state=1).fit(X_train, y_train) print(perm.feature_importances_) DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model) # summarize results print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) else: a = np.unique(self.y) a.sort() b = a[-1] b += 1 def DNN(dropout_rate=0.0, weight_constraint=0): # create model model = Sequential() model.add( Dense(42, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu', kernel_constraint=maxnorm(weight_constraint))) model.add(Dropout(dropout_rate)) model.add( Dense(20, 
kernel_initializer='uniform', activation='relu')) model.add(Dense(b, activation='softmax')) model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy']) return model classifier = KerasClassifier(build_fn=DNN, epochs=50, batch_size=10, verbose=1) weight_constraint = [1, 2, 3, 4, 5] dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint) grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3) grid_result = grid.fit(X_train, y_train) estimator = grid.best_estimator_ Accuracy = grid_result.best_score_ print(Accuracy) DB_upload(Accuracy, X_train, X_test, y_test, y_pred, importances, grid, estimator, l, cm, target, model) print("%s" % (estimator))
max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=4, min_samples_split=2, min_weight_fraction_leaf=0, n_estimators=14, n_jobs=None, oob_score=False, random_state=0, verbose=0, warm_start=False) model1.fit(X_train_transformed, y_train) # Get permutation importances ! pip install eli5 from eli5.sklearn import PermutationImportance import eli5 permuter = PermutationImportance( model1, scoring='r2', n_iter=2, random_state=42 ) permuter.fit(X_val_transformed, y_val) feature_names = X_val.columns.tolist() eli5.show_weights( permuter, top=None, # show permutation importances for all features feature_names=feature_names ) from sklearn.metrics import mean_squared_error, r2_score # Coefficient of determination r2 for the training set
if is_labeled_data: feature_partial = variables.get("FEATURE_PARTIAL_PLOTS") feature_partial_plots = [x.strip() for x in feature_partial.split(',')] features_to_plot = variables.get("FEATURE_PARTIAL2D_PLOTS") features_to_plot2d = [x.strip() for x in features_to_plot.split(',')] shap_row_to_show = int(variables.get("SHAP_ROW_SHOW")) columns = [LABEL_COLUMN] dataframe_test = dataframe.drop(columns, axis=1, inplace=False) dataframe_label = dataframe.filter(columns, axis=1) feature_names = dataframe_test.columns.values # PERMUTATION IMPORTANCE perm = PermutationImportance(loaded_model, random_state=1).fit( dataframe_test.values, dataframe_label.values.ravel()) html_table = eli5.show_weights( perm, feature_names=dataframe_test.columns.tolist(), top=50) # PARTIAL DEPENDENCE PLOTS partial_feature_find = [ i for i in feature_partial_plots if i in feature_names ] html_partial_plot = '' for i in partial_feature_find: pdp_feature = pdp.pdp_isolate(model=loaded_model, dataset=dataframe_test, model_features=feature_names, feature=i) # preg pdp_plot_feature = pdp.pdp_plot(
test_size=0.2, random_state=times) x_train = np.array(x_train) y_train = np.array(y_train) x_test = np.array(x_test) total_predict = np.zeros(len(y_test)) for i in range(len(MLA)): skf = StratifiedKFold(n_splits=5, random_state=times) clf = copy.deepcopy(MLA[i]) clf.random_state = times sel = SelectFromModel( PermutationImportance(clf, cv=skf, random_state=times)).fit(x_train, y_train) x_train_trans = sel.transform(x_train) x_test_trans = sel.transform(x_test) vali_auc = np.mean( cross_val_score(clf, x_train_trans, y_train, cv=skf, scoring='roc_auc')) clf.fit(x_train_trans, y_train) predict_result = clf.predict_proba(x_test_trans)[:, 1] total_predict += predict_result test_auc = roc_auc_score(y_test, predict_result)
    print('{}----------'.format(j))
    print()
    print(multilabel_confusion_matrix(y_true=y_test, y_pred=target_y_pred[i]))
    print(classification_report(y_true=y_test, y_pred=target_y_pred[i], digits=2))

# %%
# Computing feature importance
print('Best MLP estimator: {}'.format(target_clf[0]))
print()
print('Best results')
print(multilabel_confusion_matrix(y_true=y_test, y_pred=target_y_pred[0]))
print(classification_report(y_true=y_test, y_pred=target_y_pred[0], digits=2))

perm = PermutationImportance(estimator=target_clf[0], n_iter=100,
                             random_state=42).fit(X_test, y_test)

# Create a dataframe of the variables and feature importances
feature_importances_df = pd.DataFrame({
    'Variable': X.columns,
    'Feature_Importances': perm.feature_importances_
})

# Print out the top 3 positive variables
feature_importances_df_sorted = feature_importances_df.sort_values(
    by='Feature_Importances', axis=0, ascending=False)
print()
print(feature_importances_df_sorted)
ax.set_ylim([0.0, 1.0])
plt.show()

# 7.10 AUC
auc(fpr, tpr)    # 88.71%

# 7.11 Find feature importance of any BLACK Box model
# Refer: https://eli5.readthedocs.io/en/latest/blackbox/permutation_importance.html
# See at the end: How PermutationImportance works?

# 7.11.1 Instantiate the importance object
perm = PermutationImportance(
    clf,
    random_state=1
)

# 7.11.2 fit data & learn
# Takes some time
start = time.time()
perm.fit(X_test, y_test)
end = time.time()
(end - start) / 60

# 7.11.3 Conclude: Get feature weights
"""
# If you are using jupyter notebook, use:
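# Outside a notebook, eli5.show_weights has nothing to render into. A hedged,
# self-contained alternative (synthetic data; `weights_df`, `perm_demo` and
# `clf_demo` are illustrative names, not from the script above) is to pull the
# permutation results into a DataFrame instead:
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

Xs, ys = make_classification(n_samples=300, n_features=5, n_informative=2, random_state=1)
Xs = pd.DataFrame(Xs, columns=[f'f{i}' for i in range(5)])
clf_demo = RandomForestClassifier(n_estimators=50, random_state=1).fit(Xs, ys)

perm_demo = PermutationImportance(clf_demo, random_state=1).fit(Xs, ys)
weights_df = eli5.explain_weights_df(perm_demo, feature_names=Xs.columns.tolist())
print(weights_df)  # columns include the feature name, mean weight and std of the score drops

# The same numbers are also available directly on the fitted object:
print(perm_demo.feature_importances_)
print(perm_demo.feature_importances_std_)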
    return model_fe

estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=1, batch_size=1)
# estimator.fit(X, y)
# prediction1 = estimator.predict(X_test_enc)
# accuracy_score(Y_test_enc, prediction1)

keras_callbacks = [
    EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=8)
]
estimator.fit(X_train_enc, y_train_enc, verbose=2, validation_split=0.10,
              callbacks=keras_callbacks)

# perm = PermutationImportance(estimator, random_state=1).fit(X_train_enc, y_train_enc)
# eli5.show_weights(perm, feature_names = X_train_enc.columns.tolist())

perm = PermutationImportance(estimator, random_state=1).fit(X_train_enc, y_train_enc)
# this takes a very long time -- it seems eli5 is generally slow when there are many features!
#
from google.colab import drive
drive.mount('/content/gdrive')

import pickle

with open("/content/gdrive/My Drive/perm.pkl", 'wb') as output:
    pickle.dump(perm, output, pickle.HIGHEST_PROTOCOL)

with open("/content/gdrive/My Drive/estimator.pkl", 'wb') as output:
    pickle.dump(estimator, output, pickle.HIGHEST_PROTOCOL)

eli5.show_weights(perm, feature_names=X_train_enc.columns.tolist())
# instantiate the tuned random forest
booster_grid_search = GridSearchCV(booster, param_grid, cv=3, n_jobs=-1)

# train the tuned random forest
booster_grid_search.fit(X_train, y_train)

# print best estimator parameters found during the grid search
print(booster_grid_search.best_params_)

best_random = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
                                    max_features='sqrt', max_leaf_nodes=None,
                                    min_impurity_decrease=0.0, min_impurity_split=None,
                                    min_samples_leaf=1, min_samples_split=5,
                                    min_weight_fraction_leaf=0.0, n_estimators=1400,
                                    n_jobs=None, oob_score=False, random_state=42,
                                    verbose=0, warm_start=False)

from eli5.sklearn import PermutationImportance

perm = PermutationImportance(best_random.fit(X_train, y_train),
                             random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=list(X_df.columns))
print2(features_weight.head())

# visualize the top 10 important features affecting prices
top10 = features_weight[:10].sort_values(by="coefficients")
plt.barh(top10.features, top10.coefficients)
plt.xticks(rotation=45)
plt.axvline(x=0.05, color='red', linestyle='-')
plt.gcf().subplots_adjust(left=0.15)
plt.show()

# construct the data analysis pipeline
xgbpipe = Pipeline([("scaler", StandardScaler()),
                    ("XGBRegressor", best_model.best_estimator_.get_params()['model'])])
xgbpipe.fit(X_train, y_train)

# visualize the top 10 important features affecting prices using eli5 permutation importance
perm = PermutationImportance(xgbpipe).fit(X_test, y_test)
_imp_eli5 = dfform(perm.feature_importances_)
eli5_top10 = _imp_eli5.head(10).sort_values(by="coefficients")
plt.barh(eli5_top10.features, eli5_top10.coefficients)
plt.axvline(x=0.05, color='red', linestyle='-')
plt.xlabel("Importance Weight")
plt.title("Airbnb Listing in New York")
plt.gcf().subplots_adjust(left=0.15)
plt.savefig("plot.jpeg")
plt.show()
def create_model(): model = Sequential() model.add(Dense(100, input_dim=features_number, activation='relu')) model.add(Dense(25, activation='relu')) if use_binary: model.add(Dense(1, activation='sigmoid')) else: model.add(Dense(Y.shape[1], activation='sigmoid')) model.add(Dense(Y.shape[1], activation=activations.softmax)) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) return model create_model() head = open('/home/nader/workspace/robo/cyrus/script/DataAnalysisPath/feature_import/head').readlines()[0][:-2].split(',')[:features_number] # create model train_epoch_number = 20 model = KerasClassifier(build_fn=create_model, epochs=train_epoch_number, batch_size=10, verbose=1) # if use_new_model: model.fit(X, Y) perm = PermutationImportance(model, random_state=1).fit(X,Y) a = eli5.explain_weights(perm, feature_names=head, top=features_number) fi = a.feature_importances.importances nnfeatures = open('feature_import/nn_out_desc.csv', 'w') for f in fi: nnfeatures.write(f.feature + ',' + str(f.weight) + '\n') print(f.feature, f.weight, f.std, f.value) print(len(fi))
# OK, so it's working well.

# <a id='section4'></a>
# # The Explanation
#
# Now let's see what the model gives us from the ML explainability tools.
#
# **Permutation importance** is the first tool for understanding a machine-learning model. It involves shuffling individual variables in the validation data (after a model has been fit) and seeing the effect on accuracy. Learn more [here](https://www.kaggle.com/dansbecker/permutation-importance).
#
# Let's take a look.

# In[ ]:

perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())

# So, it looks like the most important factor in terms of permutation importance is a thalassemia result of 'reversable defect'. The high importance of 'max heart rate achieved' also makes sense, as this is the immediate, subjective state of the patient at the time of examination (as opposed to, say, age, which is a much more general factor).
#
# Let's take a closer look at the number of major vessels using a **Partial Dependence Plot** (learn more [here](https://www.kaggle.com/dansbecker/partial-plots)). These plots vary a single variable in a single row across a range of values and see what effect it has on the outcome. It does this for several rows and plots the average effect. Let's take a look at the 'num_major_vessels' variable, which was at the top of the permutation importance list.

# In[ ]:

base_features = dt.columns.values.tolist()
base_features.remove('target')

feat_name = 'num_major_vessels'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=base_features,
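# Aside (not part of the notebook above): the partial-dependence idea described
# in the markdown can be sketched by hand -- fix one feature to each value on a
# grid for every row, predict, and average. Everything below is a minimal,
# self-contained illustration on synthetic data; names such as
# `manual_partial_dependence` and `clf_pd` are placeholders.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


def manual_partial_dependence(model, X, feature, grid_points=20):
    grid = np.linspace(X[feature].min(), X[feature].max(), grid_points)
    averages = []
    for value in grid:
        X_mod = X.copy()
        X_mod[feature] = value                      # same value for every row
        averages.append(model.predict_proba(X_mod)[:, 1].mean())
    return grid, np.array(averages)


Xd, yd = make_classification(n_samples=400, n_features=5, n_informative=3, random_state=0)
Xd = pd.DataFrame(Xd, columns=[f'f{i}' for i in range(5)])
clf_pd = RandomForestClassifier(n_estimators=100, random_state=0).fit(Xd, yd)

grid, avg = manual_partial_dependence(clf_pd, Xd, 'f0')
plt.plot(grid, avg)
plt.xlabel('f0')
plt.ylabel('average predicted probability of class 1')
plt.show()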
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}
grid_search = GridSearchCV(SVC(), params, cv=5)
grid_search.fit(train, target)
grid_search.best_params_

import eli5
from eli5.sklearn import PermutationImportance

importance_model = SVC(C=10, gamma=0.1, probability=True)
importance_model.fit(train, target)
perm = PermutationImportance(importance_model, random_state=42).fit(test_data, test_target)
eli5.show_weights(perm, feature_names=test_data.columns.tolist())

import shap

data_for_prediction = test_data.iloc[0]
k_explainer = shap.KernelExplainer(importance_model.predict_proba, train_data)
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.initjs()
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)

test_target.iloc[0], target_string.iloc[0]

model = SVC(C=10, gamma=0.1)
model.fit(train, target_string)
predictions = model.predict(test)
predictions[:10]