def features_correlation(df, cols, target, fig_size=(6, 6), path=None):
    """
    Correlation of variables in the dataframe with respect to the target

    Parameters
    ----------
    df : pd.Dataframe
        dataframe with the data to calculate the correlation
    cols : array
        columns to be correlated with the target
    target : str
        target name
    fig_size : tuple
        figure size
    path : str
        directory where ``features_correlation.png`` will be saved;
        nothing is written when it is None

    Returns
    -------
    None
    """
    f, ax = plt.subplots(1, figsize=fig_size)
    ax.set_xlabel("Feature Correlation")

    # Bind the visualizer to the axes created above instead of relying on
    # whichever axes happen to be "current" in pyplot.
    visualizer = FeatureCorrelation(ax=ax, labels=list(cols))
    visualizer.fit(df[cols], df[target])

    f.tight_layout()

    # Identity test: None is a singleton, and `!= None` would dispatch to a
    # custom __ne__ if a non-string object were ever passed as path.
    if path is not None:
        f.savefig(path + '/features_correlation.png')
def test_feature_correlation_labels(self):
    """
    Labels given to the constructor must come back as ``features_``
    """
    visualizer = FeatureCorrelation(labels=self.labels)
    visualizer.fit(self.X, self.y)

    fitted_features = visualizer.features_
    npt.assert_array_equal(fitted_features, self.labels)
def test_feature_correlation_select_feature_by_index_out_of_range(self):
    """
    An out-of-range feature index must raise YellowbrickValueError
    """
    expected_msg = 'Feature index is out of range'
    bad_index = [0, 2, 10]

    # Construction stays inside the context manager so the error is accepted
    # whether it is raised by the constructor or by fit.
    with pytest.raises(YellowbrickValueError, match=expected_msg):
        visualizer = FeatureCorrelation(feature_index=bad_index)
        visualizer.fit(self.X, self.y)
def test_feature_correlation_select_feature_by_index(self):
    """
    Selecting three features by index must yield three scores
    """
    visualizer = FeatureCorrelation(feature_index=[0, 2, 3])
    visualizer.fit(self.X, self.y)

    n_scores = visualizer.scores_.shape[0]
    assert n_scores == 3
def test_feature_correlation_labels_from_index(self):
    """
    Without labels, ``features_`` must be the column positions of X
    """
    visualizer = FeatureCorrelation()
    visualizer.fit(self.X, self.y)

    n_columns = self.X.shape[1]
    expected = np.arange(n_columns)
    npt.assert_array_equal(visualizer.features_, expected)
def test_feature_correlation_sort(self):
    """
    With ``sort=True`` the fitted scores must be in non-decreasing order
    """
    visualizer = FeatureCorrelation(sort=True)
    visualizer.fit(self.X, self.y)

    # Compare every score with its successor.
    scores = visualizer.scores_
    is_ascending = scores[:-1] <= scores[1:]
    assert np.all(is_ascending)
def test_feature_correlation_select_feature_by_index_out_of_range(self):
    """
    Check that a feature index beyond the data raises the expected error
    """
    message = "Feature index is out of range"

    with pytest.raises(YellowbrickValueError, match=message):
        # Index 10 is the out-of-range entry this test relies on.
        oz = FeatureCorrelation(feature_index=[0, 2, 10])
        oz.fit(self.X, self.y)
def test_feature_correlation_integrated_mutual_info_regression(self):
    """
    Test FeatureCorrelation visualizer with mutual information regression
    """
    viz = FeatureCorrelation(method='mutual_info-regression')
    # Fixed seed so the rendered image is comparable across runs.
    viz.fit(self.X, self.y, random_state=23456)

    # finalize() completes the figure without going through the deprecated
    # poof() entry point, which also tries to display the plot.
    viz.finalize()

    self.assert_images_similar(viz)
def feature_correlation_pearson(
        path="images/feature_correlation_pearson.png"):
    """
    Render the feature-correlation plot for the diabetes dataset.

    Uses the visualizer's default correlation method (presumably Pearson,
    going by this function's name) and writes the figure to ``path``.
    """
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    oz = FeatureCorrelation(labels=names)
    oz.fit(features, target)
    oz.poof(outpath=path, clear_figure=True)
def test_feature_correlation_select_feature_by_name(self):
    """
    Features selected by name must be reported back in ``features_``
    """
    wanted = ["age", "sex", "bp", "s5"]

    visualizer = FeatureCorrelation(labels=self.labels, feature_names=wanted)
    visualizer.fit(self.X, self.y)

    npt.assert_array_equal(visualizer.features_, wanted)
def test_feature_correlation_select_feature_by_name_no_labels(self):
    """
    Selecting by name without supplying labels must raise an error
    """
    expected_msg = "age not in labels"
    wanted = ["age"]

    with pytest.raises(YellowbrickValueError, match=expected_msg):
        visualizer = FeatureCorrelation(feature_names=wanted)
        visualizer.fit(self.X, self.y)
def test_feature_correlation_labels_from_dataframe(self):
    """
    When X is a DataFrame, ``features_`` must match its column names
    """
    frame = pd.DataFrame(self.X, columns=self.labels)

    visualizer = FeatureCorrelation()
    visualizer.fit(frame, self.y)

    npt.assert_array_equal(visualizer.features_, self.labels)
def test_feature_correlation_integrated_pearson(self):
    """
    Test FeatureCorrelation visualizer with pearson correlation
    coefficient
    """
    viz = FeatureCorrelation()
    viz.fit(self.X, self.y)

    # finalize() completes the figure without going through the deprecated
    # poof() entry point, which also tries to display the plot.
    viz.finalize()

    self.assert_images_similar(viz)
def test_feature_correlation_select_feature_by_name_no_labels(self):
    """
    Check the error raised when feature names are requested but no labels
    were supplied
    """
    message = 'age not in labels'

    with pytest.raises(YellowbrickValueError, match=message):
        oz = FeatureCorrelation(feature_names=['age'])
        oz.fit(self.X, self.y)
def test_feature_correlation_select_feature_by_name(self):
    """
    Check that the names passed as ``feature_names`` end up in
    ``features_`` after fitting
    """
    selected_names = ['age', 'sex', 'bp', 's5']
    options = {'labels': self.labels, 'feature_names': selected_names}

    oz = FeatureCorrelation(**options)
    oz.fit(self.X, self.y)

    npt.assert_array_equal(oz.features_, selected_names)
def target_visualizer(self, classes=None, params=None):
    """
    Fit the target-oriented visualizers and save each one as a PNG.

    Runs BalancedBinningReference, ClassBalance and FeatureCorrelation in
    turn. Each figure is written to ``<cwd>/visualizer/<ClassName>.png``.
    A failure in one visualizer is logged and does not stop the others.

    Parameters
    ----------
    classes : any, optional
        Not used by this method; kept so existing callers keep working.
    params : dict, optional
        Maps a visualizer class name to the keyword arguments used to
        construct it. When a class name is present, those arguments
        replace this method's defaults for that visualizer entirely.
        Defaults to ``{'BalancedBinningReference': {'bins': 5}}``.

    Returns
    -------
    None
    """
    # Build the default per call: a dict literal in the signature would be
    # a single object shared by every invocation.
    if params is None:
        params = {'BalancedBinningReference': {'bins': 5}}

    LOGGER.info('Initializing target visualizer')

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(os.path.join(os.getcwd(), 'visualizer/'), exist_ok=True)

    y = self.y.squeeze()

    def _render(start_msg, error_msg, visualizer_cls, default_kwargs,
                fit_args):
        """Build, fit and save one visualizer; log a failure, never raise."""
        try:
            LOGGER.info(start_msg)
            name = visualizer_cls.__name__
            # default_kwargs and fit_args are callables so anything they
            # touch (e.g. self.X) is evaluated inside this try block.
            kwargs = params[name] if name in params else default_kwargs()
            visualizer = visualizer_cls(**kwargs)
            visualizer.fit(*fit_args())
            visualizer.show(outpath=os.path.join(
                os.getcwd(), f"visualizer/{name}.png"))
            plt.cla()
        except Exception:
            # Best effort by design. Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate; exc_info records the
            # cause. Assumes LOGGER is a standard logging.Logger.
            LOGGER.warning(error_msg, exc_info=True)

    _render('Visualizer BalancedBinningReference', 'ERROR BalancedBinning',
            BalancedBinningReference, dict, lambda: (y,))

    _render('Visualizer CLassBalance', 'ERROR ClassBalance',
            ClassBalance, dict, lambda: (y,))

    _render('Visualizer Feature Correlation', 'ERROR FeatureCorrelation',
            FeatureCorrelation,
            lambda: {'method': 'mutual_info-classification',
                     'feature_names': self.X.columns.tolist(),
                     'sort': True},
            lambda: (self.X, y))
def test_feature_correlation_integrated_mutual_info_classification(self):
    """
    Test FeatureCorrelation visualizer with mutual information on wine
    dataset (classification)
    """
    data = datasets.load_wine()
    X, y = data['data'], data['target']

    viz = FeatureCorrelation(method='mutual_info-classification')
    # Fixed seed so the rendered image is comparable across runs.
    viz.fit(X, y, random_state=12345)

    # finalize() completes the figure without going through the deprecated
    # poof() entry point, which also tries to display the plot.
    viz.finalize()

    self.assert_images_similar(viz)
def feature_correlation_mutual_info_regression(
        path="images/feature_correlation_mutual_info_regression.png"):
    """
    Render the mutual-information (regression) feature-correlation plot
    for the diabetes dataset and write it to ``path``.
    """
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    # Only the column at position 1 is flagged as discrete.
    discrete_mask = [False] * len(names)
    discrete_mask[1] = True

    oz = FeatureCorrelation(method='mutual_info-regression',
                            labels=names, sort=True)
    oz.fit(features, target,
           discrete_features=discrete_mask, random_state=0)
    oz.poof(outpath=path, clear_figure=True)
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    """
    Render the mutual-information (classification) feature-correlation
    plot for five wine-dataset features and write it to ``path``.
    """
    wine = datasets.load_wine()
    target = wine['target']

    # A DataFrame is used so features can be selected by column name.
    columns = np.array(wine['feature_names'])
    frame = pd.DataFrame(wine['data'], columns=columns)

    selected = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']
    oz = FeatureCorrelation(method='mutual_info-classification',
                            feature_names=selected)
    oz.fit(frame, target, random_state=0)
    oz.poof(outpath=path, clear_figure=True)
def test_feature_correlation_select_feature_by_index_and_name(self):
    """
    Supplying both an index and names must raise YellowbrickWarning
    """
    expected_msg = ('Both feature_index and feature_names are specified. '
                    'feature_names is ignored')
    selection = {'feature_index': [0, 2, 3], 'feature_names': ['age']}

    with pytest.raises(YellowbrickWarning, match=expected_msg):
        visualizer = FeatureCorrelation(**selection)
        visualizer.fit(self.X, self.y)
        # NOTE(review): this line only runs if neither statement above
        # raises; confirm whether the assertion is ever meant to execute.
        assert visualizer.scores_.shape[0] == 3
def test_feature_correlation_method_not_implemented(self):
    """
    An unknown method name must be rejected at construction time
    """
    unknown_method = "foo"
    # Passed to pytest.raises as a regular expression.
    pattern = "Method foo not implement; choose from *"

    with pytest.raises(YellowbrickValueError, match=pattern):
        FeatureCorrelation(method=unknown_method)
def showCorrelation(
        gripperjack_nr, part,
        csv_path='C:\\Users\\Lukassen\\PycharmProjects\\GelredomeVeldErrorVoorspellen\\Recources\\Volledige_Gelredome_Data_CSV.csv'):
    """
    Show the correlation of every feature with ``to_be_predicted``.

    The scores are also appended to ``values``, which is not defined in
    this function and is presumably a module-level list.

    Parameters
    ----------
    gripperjack_nr : any
        Forwarded unchanged to ``DynamicCsvConverter``.
    part : any
        Forwarded to ``DynamicCsvConverter`` and used as the plot title.
    csv_path : str, optional
        CSV file to load. Defaults to the location that used to be
        hard-coded here, so existing calls behave as before.

    Returns
    -------
    None
    """
    raw = pd.read_csv(csv_path, index_col=False)
    data = DynamicCsvConverter(gripperjack_nr, part, '5min', 'max', raw)
    data = data.make_file()

    # 'to_be_predicted' must stay in the frame at this point: only
    # 'Timestamp' is dropped, and the target is popped off afterwards.
    data = data.drop(columns=['Timestamp'])
    data = data.dropna()
    y = data.pop('to_be_predicted')
    X = data

    # Feature names are the columns that remain once the target is removed.
    features = np.array(data.columns)

    visualizer = FeatureCorrelation(labels=features, size=(1200, 700))
    visualizer.title = part
    visualizer.fit(X, y)
    values.append(visualizer.scores_)
    visualizer.show()
def create_correlation_matrix(data):
    """
    Build a table of each feature's association with the ``class`` column.

    The table has one row per column of the encoded data other than
    ``class``, and one column per measure: kendall, pearson, spearman and
    the mutual-information scores computed by FeatureCorrelation.
    """
    encoded = encode_data(data)

    # One single-column frame per method, named after the method.
    per_method = [
        encoded.corr(method=name)['class'].to_frame(name=name)
        for name in ('kendall', 'pearson', 'spearman')
    ]
    correlation_matrix = pd.concat(per_method, axis=1, sort=False)

    X = encoded.drop(columns=['class'])
    y = encoded['class']
    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    labels=X.columns)
    visualizer.fit(X, y)

    # Drop the target's own row so the rows line up with the scores, which
    # are assigned by position.
    correlation_matrix = correlation_matrix.drop('class', axis=0)
    scores = visualizer.scores_.tolist()
    correlation_matrix['mutual_info-classification'] = scores
    return correlation_matrix
def feature_correlation_pearson(path="images/feature_correlation_pearson.png"):
    """
    Plot how each diabetes-dataset feature correlates with the target and
    save the figure to ``path``.

    No method is passed, so the visualizer's default is used (presumably
    Pearson, going by this function's name).
    """
    bunch = datasets.load_diabetes()
    labels = np.array(bunch['feature_names'])

    plot = FeatureCorrelation(labels=labels)
    plot.fit(bunch['data'], bunch['target'])
    plot.poof(outpath=path, clear_figure=True)
def test_feature_correlation_integrated_mutual_info_regression(self):
    """
    Image-comparison test for the mutual information regression method
    """
    visualizer = FeatureCorrelation(method="mutual_info-regression")

    # Fixed seed so the rendered image is comparable across runs.
    fit_options = {"random_state": 23456}
    visualizer.fit(self.X, self.y, **fit_options)
    visualizer.finalize()

    self.assert_images_similar(visualizer)
def test_feature_correlation_integrated_pearson(self):
    """
    Image-comparison test for the visualizer built with default arguments
    """
    visualizer = FeatureCorrelation()
    visualizer.fit(self.X, self.y)
    visualizer.finalize()

    self.assert_images_similar(visualizer)
def mutual_info_classification(classes, feature_names, X, y):
    """
    Plot the mutual-information (classification) score of the selected
    features against the target, sorted by score.

    Parameters
    ----------
    classes : any
        Not used by this function; kept so existing callers keep working.
    feature_names : list
        Passed to FeatureCorrelation as ``feature_names``.
    X, y : array-like
        Data and target passed to ``fit``.
    """
    # Imported locally, as in the original. The unused
    # `from sklearn import datasets` has been dropped.
    from yellowbrick.target import FeatureCorrelation

    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=feature_names, sort=True)
    # Fixed seed keeps the mutual-information estimate repeatable.
    visualizer.fit(X, y, random_state=0)
    visualizer.poof()
def pearson_correlation(classes, fetures, X, Y):
    """
    Plot the correlation of each feature with the target.

    No method is passed to FeatureCorrelation, so its default applies
    (presumably Pearson, going by this function's name).

    Parameters
    ----------
    classes : any
        Not used by this function; kept so existing callers keep working.
    fetures : array-like
        Feature labels for the plot. The misspelled name is part of the
        signature and is left as is for keyword callers.
    X, Y : array-like
        Data and target passed to ``fit``.
    """
    # Imported locally, as in the original. The unused
    # `from sklearn import datasets` and the commented-out diabetes
    # loading code have been dropped.
    from yellowbrick.target import FeatureCorrelation

    visualizer = FeatureCorrelation(labels=fetures)
    visualizer.fit(X, Y)
    visualizer.poof()
def report(self, pipeline: AbstractPipeline):
    """
    Save a feature-correlation plot for the pipeline's training data.

    The image is written to the package cache directory
    ``cache/<cache path>/<self.sub_folder>/`` as
    ``<dataset_tag>_model_feature_correlation_report.png``.

    Parameters
    ----------
    pipeline : AbstractPipeline
        Provides ``train``, ``train_y`` and ``dataset_tag``.
    """
    folder = get_cache_path()
    path = pkg_resources.resource_filename(
        'crcdal', 'cache/' + folder + '/' + self.sub_folder + '/')
    pkg_resources.ensure_directory(path)

    feature_names = list(pipeline.train.columns())
    visualizer = FeatureCorrelation(labels=feature_names)
    visualizer.fit(pipeline.train, pipeline.train_y)

    # The output is a figure, and matplotlib picks the image format from
    # the file extension; '.csv' is not a format it can write, so the
    # report is saved as PNG.
    outpath = (path + pipeline.dataset_tag
               + '_model_feature_correlation_report.png')
    visualizer.poof(outpath=outpath)
def mutual_info_regress(classes, feature_names, X, y):
    """
    Plot the mutual-information (regression) score of each feature
    against the target.

    Parameters
    ----------
    classes : any
        Not used by this function; kept so existing callers keep working.
    feature_names : sequence
        Feature labels for the plot; its length sizes the discrete mask.
    X, y : array-like
        Data and target passed to ``fit``.
    """
    # Imported locally, as in the original. The unused
    # `from sklearn import datasets` has been dropped.
    from yellowbrick.target import FeatureCorrelation

    # Only the feature at position 1 is flagged as discrete.
    # NOTE(review): the index is hard-coded and raises IndexError when
    # fewer than two features are given - confirm it suits every caller.
    discrete_features = [False] * len(feature_names)
    discrete_features[1] = True

    visualizer = FeatureCorrelation(method='mutual_info-regression',
                                    labels=feature_names)
    visualizer.fit(X, y, discrete_features=discrete_features,
                   random_state=0)
    visualizer.poof()
def test_feature_correlation_integrated_mutual_info_classification(self):
    """
    Image-comparison test for the mutual information classification
    method, run on the wine dataset
    """
    wine = datasets.load_wine()
    features = wine["data"]
    target = wine["target"]

    visualizer = FeatureCorrelation(method="mutual_info-classification")
    # Fixed seed so the rendered image is comparable across runs.
    visualizer.fit(features, target, random_state=12345)
    visualizer.finalize()

    self.assert_images_similar(visualizer)
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.target import FeatureCorrelation

dataset = pd.read_csv('house_prices.csv')

# Remove the attributes that will not be used in the analysis.
dataset.drop(labels=['id', 'date', 'sqft_living15', 'sqft_lot15'],
             axis=1, inplace=True)
print(dataset.columns)

# Column 0 is the target and every remaining column is a feature. Labels
# and feature matrix use the same open-ended slice so they always have the
# same length; the previous fixed `1:16` bound could leave out columns
# that the labels still named.
grafico = FeatureCorrelation(labels=dataset.columns[1:])
grafico.fit(dataset.iloc[:, 1:].values, dataset.iloc[:, 0].values)
plt.show()
# Finish the visualizer created in an earlier cell (not visible here).
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
# Rank1D: fit, then transform, then render.
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
# Rank2D: fitted on X only, no target passed.
visualizer = Rank2D()
visualizer.fit_transform(X)
visualizer.poof()

# %%
# Feature correlation with the target, default method.
visualizer = FeatureCorrelation()
visualizer.fit(X, y)
visualizer.poof()

# %%
# Feature correlation again, scored with mutual information.
# NOTE(review): no random_state is passed, so scores may differ between
# runs - confirm whether that matters here.
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
# RadViz using the class names defined elsewhere in the notebook.
visualizer = RadViz(classes=class_names)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
# NOTE(review): tail of logic that starts before this excerpt; its
# original nesting level cannot be seen from here.
correlated_features.add(colname)

## yellowbrick
from sklearn import datasets
from yellowbrick.target import FeatureCorrelation

# Load the regression dataset and split it into data and target.
data = datasets.load_diabetes()
X = data['data']
y = data['target']

# Feature names become the bar labels of the plot.
features = np.array(data['feature_names'])

# Build the visualizer, fit it to the data, then display it.
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show()

## PCA - Principal Component Analysis https://www.kaggle.com/ryanholbrook/principal-component-analysis
from sklearn.decomposition import PCA

# Create principal components from the scaled data defined elsewhere.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Convert to a dataframe with one "PC<k>" column per component, k from 1.
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)
def featcorr():
    """
    Fit FeatureCorrelation on the diabetes dataset, drawing on a new
    figure, and save it under the name "feature_correlation".
    """
    diabetes = load_diabetes()

    visualizer = FeatureCorrelation(ax=newfig())
    visualizer.fit(diabetes.data, diabetes.target)

    savefig(visualizer, "feature_correlation")
from matplotlib import rcParams as rc
from yellowbrick.target import FeatureCorrelation

# Global matplotlib styling for every figure produced below.
rc['xtick.labelsize'] = 15.0
rc['ytick.labelsize'] = 15.0
rc['xtick.direction'] = 'out'
rc['axes.labelsize'] = 15.0
rc['axes.titlesize'] = 18.0
rc['savefig.format'] = 'png'
rc['savefig.dpi'] = 600
rc['legend.fontsize'] = 15

# Features are every column except the target.
x = df.drop('Death_Event', axis=1)
y = df['Death_Event']

# Correlation of each feature with the target.
fig = plt.figure(figsize=(8, 6))
# The keyword is `labels`; the previous `label=` did not match it.
corr = FeatureCorrelation(method='pearson', labels=x.columns,
                          sort=True).fit(x, y)
# Finalize before saving so the saved file carries the title and axis
# label that show() would otherwise only add afterwards.
corr.finalize()
plt.savefig('../../outputs/visuals/correlations')
corr.show()

# Pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True, square=False, ax=ax)
ax.set_title('Correlations between features')
plt.savefig('../../outputs/visuals/correlations_all')
plt.show()

# Age distribution of Patients
fig, ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(df['Age'], legend=False, shade=True, ax=ax)
# Title is set before saving (as for the heatmap above); it used to be set
# afterwards, so the saved image had no title.
ax.set_title('Age Distribution of Patients')
plt.savefig('../../outputs/visuals/age_distribution')
cv=cv) clf = scores['estimator'][np.argmax(scores['test_score'])] print(np.max(scores['test_score'])) # %% explainer = shap.TreeExplainer(clf) shap_values = explainer.shap_values(Xv) # %% shap.summary_plot(shap_values, Xv, plot_type="bar") # %% feat = feature_names[feat][np.mean(abs(shap_values), axis=0) > 0.55] print(feat) X = X[feat] # %% visualizer = FeatureCorrelation(method='mutual_info-classification') visualizer.fit(X, y) visualizer.poof() # %% # This step doesn't always produce the same result, idk why. feat = visualizer.features_[visualizer.scores_ > 0.04] X = X[feat] # %% # Our final 10 features: # [263, 268, 287, 288, 300, 302, 307, 308, 313, 315] print(feat)