def plot_cumulative_features_importance(features, threshold=0.90, plot_size=(12, 8), feature_type='features',
                                        return_threshold=False):
    """
    Plot the cumulative coefficient frequency curve and mark where it first exceeds ``threshold``.

    The curve is drawn against the 1-based number of rows, so the red dashed
    line sits exactly on the point where the curve first exceeds the threshold.

    NOTE(review): this file contains a second, later definition with the same
    name and a shorter signature; at import time the later one replaces this one.

    :param features: table indexable by column name that holds a
        ``'cumulative_coefficient_frequency'`` column (presumably a pandas
        DataFrame sorted by decreasing importance -- confirm against callers).
    :param threshold: cumulative value that must be strictly exceeded.
    :param plot_size: figure size forwarded to ``plt.figure`` as ``figsize``.
    :param feature_type: noun used in the legend, the axis labels and the title.
    :param return_threshold: when true, return the number of required rows.
    :return: the number of rows needed to exceed ``threshold`` when
        ``return_threshold`` is true, otherwise ``None``.
    :raises ValueError: if no cumulative value exceeds ``threshold``
        (this includes an empty ``features``).
    """
    cumulative = features['cumulative_coefficient_frequency']

    # Positions (0-based) of every row whose cumulative value exceeds the threshold.
    # Checked before any figure is created so a bad threshold leaves no empty figure behind.
    above = np.flatnonzero(np.asarray(cumulative) > threshold)
    if above.size == 0:
        raise ValueError(
            'No cumulative_coefficient_frequency value exceeds threshold {!r}; '
            'choose a lower threshold.'.format(threshold)
        )

    # First crossing position converted to a 1-based count of rows.
    required_features = int(above[0]) + 1
    thr_percentage = 100 * threshold
    legend_label = '{} {} required for \n{:.0f}% cumulative importance.'.format(
        required_features, feature_type, thr_percentage)
    type_label = feature_type.capitalize()

    plt.figure(figsize=plot_size)

    # Cumulative importance curve; x is the number of rows included (1..n).
    plt.plot(range(1, len(features) + 1), cumulative, 'b-', label=legend_label)
    plt.xlabel('Number of {}'.format(type_label), fontsize=12, labelpad=20)
    plt.ylabel('Cumulative {} frequency'.format(type_label), fontsize=12, labelpad=20)
    plt.title('Cumulative {} Importance'.format(type_label), fontsize=12, pad=20)

    # Vertical marker at the number of rows required to exceed the threshold.
    plt.vlines(required_features, ymin=0, ymax=1.05, linestyles='--', colors='red')
    plt.legend(loc='lower right', fontsize=10)
    plt.tight_layout()
    plt.show()

    if return_threshold:
        return required_features
    return None
def subplot_feature_importance(features, threshold=0.90, plot_size=(12, 8), return_data=False):
    """
    Draw a two-panel figure: the cumulative coefficient frequency curve and the top features.

    The left panel plots ``'cumulative_coefficient_frequency'`` against the
    1-based number of rows and marks, with a red dashed line, the number of rows
    needed to strictly exceed ``threshold``. The right panel is a horizontal bar
    chart of the ``'coefficient'`` column, labelled by the ``'feature'`` column,
    for that many leading rows.

    :param features: pandas DataFrame with ``'cumulative_coefficient_frequency'``,
        ``'feature'`` and ``'coefficient'`` columns (presumably sorted by
        decreasing importance -- confirm against callers). Its ``index.name``,
        when set, is used as a prefix in the right panel title.
    :param threshold: cumulative value that must be strictly exceeded.
    :param plot_size: figure size forwarded as ``figsize``.
    :param return_data: when true, return the selected rows instead of the figure.
    :return: ``features.head(n)`` for the ``n`` required rows when ``return_data``
        is true, otherwise the matplotlib figure.
    :raises ValueError: if no cumulative value exceeds ``threshold``
        (this includes an empty ``features``).
    """
    cumulative = features['cumulative_coefficient_frequency']

    # Positions (0-based) of every row whose cumulative value exceeds the threshold.
    # Checked before the figure is created so a bad threshold leaves no empty figure behind.
    above = np.flatnonzero(np.asarray(cumulative) > threshold)
    if above.size == 0:
        raise ValueError(
            'No cumulative_coefficient_frequency value exceeds threshold {!r}; '
            'choose a lower threshold.'.format(threshold)
        )

    # First crossing position converted to a 1-based count of rows.
    n_required = int(above[0]) + 1
    thr_percentage = 100 * threshold
    legend_label = '{} features required for \n{:.0f}% cumulative importance.'.format(n_required, thr_percentage)
    top_features = features.head(n_required)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=plot_size)

    # First plot: cumulative importance curve; x is the number of rows included (1..n).
    ax1.plot(range(1, len(features) + 1), cumulative, 'b-', label=legend_label)
    ax1.set_xlabel('Number of Features', fontsize=12, labelpad=20)
    ax1.set_ylabel('Cumulative Coefficient frequency', fontsize=12, labelpad=20)
    ax1.set_title('Cumulative Feature Importance', fontsize=14, pad=20)

    # Vertical marker at the required count (the original added 1 twice here).
    ax1.vlines(n_required, ymin=0, ymax=1.05, linestyles='--', colors='red')
    ax1.legend(loc='lower right', fontsize=10)

    # Second plot: coefficients of the selected leading rows.
    top_features.plot(x='feature', y='coefficient', kind='barh', fontsize=12, figsize=plot_size, ax=ax2)
    # Target the bar-chart axes explicitly rather than whatever the figure's current axes is.
    ax2.invert_yaxis()

    # Skip the prefix when the index is unnamed, so the title never reads "None Top ...".
    model_label = features.index.name
    if model_label is None:
        panel_title = f'Top {n_required} Features'
    else:
        panel_title = f'{model_label} Top {n_required} Features'
    ax2.set_title(panel_title, fontsize=14, pad=20)
    ax2.set_xlabel('Coefficients', fontsize=12, labelpad=20)
    ax2.set_ylabel('Features labels', fontsize=12, labelpad=20)

    fig.tight_layout()

    if return_data:
        return top_features
    return fig
def plot_cumulative_features_importance(features, threshold=0.90, plot_size=(12, 8)):
    """
    Plot the cumulative coefficient frequency curve and mark where it first exceeds ``threshold``.

    The curve is drawn against the 1-based number of rows, so the red dashed
    line sits exactly on the point where the curve first exceeds the threshold.

    NOTE(review): an earlier definition with the same name and two extra
    keyword parameters exists in this file; this later definition is the one
    left bound to the name at import time.

    :param features: a dataframe which contains features data; it must hold a
        ``'cumulative_coefficient_frequency'`` column (presumably sorted by
        decreasing importance -- confirm against callers).
    :param threshold: cumulative value that must be strictly exceeded.
    :param plot_size: figure size forwarded to ``plt.figure`` as ``figsize``.
    :return: None; the figure is displayed with ``plt.show()``.
    :raises ValueError: if no cumulative value exceeds ``threshold``
        (this includes an empty ``features``).
    """
    cumulative = features['cumulative_coefficient_frequency']

    # Positions (0-based) of every row whose cumulative value exceeds the threshold.
    # Checked before any figure is created so a bad threshold leaves no empty figure behind.
    above = np.flatnonzero(np.asarray(cumulative) > threshold)
    if above.size == 0:
        raise ValueError(
            'No cumulative_coefficient_frequency value exceeds threshold {!r}; '
            'choose a lower threshold.'.format(threshold)
        )

    # First crossing position converted to a 1-based count of rows.
    n_required = int(above[0]) + 1
    thr_percentage = 100 * threshold
    legend_label = '{} features required for {:.0f}% of cumulative importance.'.format(n_required, thr_percentage)

    plt.figure(figsize=plot_size)

    # Cumulative importance curve; x is the number of rows included (1..n).
    plt.plot(range(1, len(features) + 1), cumulative, 'b-', label=legend_label)
    plt.xlabel('Number of Features', fontsize=12, labelpad=20)
    plt.ylabel('Cumulative Coefficient frequency', fontsize=12, labelpad=20)
    plt.title('Cumulative Feature Importance', fontsize=14, pad=20)

    # Vertical marker at the number of rows required to exceed the threshold.
    plt.vlines(n_required, ymin=0, ymax=1.05, linestyles='--', colors='red')
    plt.legend(loc='lower right', fontsize=12)
    plt.tight_layout()
    plt.show()