def showCorrelation(gripperjack_nr, part, csv_path=None):
    """
    Plot the correlation of every feature with the ``to_be_predicted`` column.

    Parameters
    ----------
    gripperjack_nr :
        Forwarded unchanged to ``DynamicCsvConverter``.
    part :
        Forwarded to ``DynamicCsvConverter`` and also used as the plot title.
    csv_path : str, optional
        CSV file to load. When omitted, the historical hard-coded location
        is used so existing callers keep working.

    Returns
    -------
    None
        The computed scores are appended to the module-level ``values``
        collection (defined outside this block) and the plot is shown.
    """
    if csv_path is None:
        # Historical default kept for backward compatibility.
        csv_path = ('C:\\Users\\Lukassen\\PycharmProjects\\'
                    'GelredomeVeldErrorVoorspellen\\Recources\\'
                    'Volledige_Gelredome_Data_CSV.csv')

    raw = pd.read_csv(csv_path, index_col=False)
    data = DynamicCsvConverter(gripperjack_nr, part, '5min', 'max', raw)
    data = data.make_file()

    # The timestamp is not a feature; rows with missing values are dropped
    # before fitting.
    data = data.drop(columns=['Timestamp'])
    data = data.dropna()

    # Popping the target leaves only the feature columns in ``data``.
    y = data.pop('to_be_predicted')
    X = data

    # Feature names are taken after the pop so the target is not among them.
    features = np.array(data.columns)

    visualizer = FeatureCorrelation(labels=features, size=(1200, 700))
    visualizer.title = part
    visualizer.fit(X, y)
    values.append(visualizer.scores_)
    visualizer.show()
def features_correlation(df, cols, target, fig_size=(6, 6), path=None):
    """
    Correlation of variables in the dataframe with respect to the target

    Parameters
    ----------
    df : pd.Dataframe
        dataframe with the data to calculate the correlation
    cols : array
        columns to be correlated with the target
    target : str
        target name
    fig_size : tuple
        figure size
    path : str
        directory where the graphic will be saved as
        ``features_correlation.png``; nothing is saved when ``None``

    Returns
    -------
    None
    """
    f, ax = plt.subplots(1, figsize=fig_size)
    ax.set_xlabel("Feature Correlation")

    # Bind the visualizer to the axes created above instead of relying on
    # whichever axes happens to be current.
    visualizer = FeatureCorrelation(labels=list(cols), ax=ax)
    visualizer.fit(df[cols], df[target])
    f.tight_layout()

    # Identity comparison is the correct way to test for "no path given".
    if path is not None:
        f.savefig(path + '/features_correlation.png')
def test_feature_correlation_sort(self):
    """
    With sort=True the fitted scores must be in non-decreasing order
    """
    visualizer = FeatureCorrelation(sort=True)
    visualizer.fit(self.X, self.y)

    # Compare every score with its right-hand neighbour.
    scores = visualizer.scores_
    assert np.all(scores[:-1] <= scores[1:])
def feature_correlation_pearson(path="images/feature_correlation_pearson.png"):
    """
    Fit FeatureCorrelation (default method) on the diabetes dataset and
    write the resulting figure to ``path``.
    """
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    viz = FeatureCorrelation(labels=names)
    viz.fit(features, target)
    viz.poof(outpath=path, clear_figure=True)
def test_feature_correlation_select_feature_by_index_out_of_range(self):
    """
    An out-of-range entry in feature_index must raise YellowbrickValueError
    """
    expected_message = "Feature index is out of range"
    bad_index = [0, 2, 10]

    with pytest.raises(YellowbrickValueError, match=expected_message):
        visualizer = FeatureCorrelation(feature_index=bad_index)
        visualizer.fit(self.X, self.y)
def test_feature_correlation_labels(self):
    """
    Labels passed to the constructor must end up in features_
    """
    visualizer = FeatureCorrelation(labels=self.labels)
    visualizer.fit(self.X, self.y)

    npt.assert_array_equal(visualizer.features_, self.labels)
def test_feature_correlation_labels_from_index(self):
    """
    Without labels, features_ must be the column indices of X
    """
    visualizer = FeatureCorrelation()
    visualizer.fit(self.X, self.y)

    n_columns = self.X.shape[1]
    npt.assert_array_equal(visualizer.features_, np.arange(n_columns))
def test_feature_correlation_select_feature_by_index_out_of_range(self):
    """
    Selecting by index must fail when one of the indices is out of range
    """
    message = 'Feature index is out of range'
    requested = [0, 2, 10]

    with pytest.raises(YellowbrickValueError, match=message):
        visualizer = FeatureCorrelation(feature_index=requested)
        visualizer.fit(self.X, self.y)
def test_feature_correlation_select_feature_by_index(self):
    """
    Selecting three features by index must yield exactly three scores
    """
    visualizer = FeatureCorrelation(feature_index=[0, 2, 3])
    visualizer.fit(self.X, self.y)

    n_scores = visualizer.scores_.shape[0]
    assert n_scores == 3
def test_feature_correlation_integrated_mutual_info_regression(self):
    """
    Image comparison for the mutual information (regression) method
    """
    visualizer = FeatureCorrelation(method="mutual_info-regression")

    # A fixed seed is passed to fit for the image comparison.
    visualizer.fit(self.X, self.y, random_state=23456)
    visualizer.finalize()

    self.assert_images_similar(visualizer)
def test_feature_correlation_integrated_mutual_info_regression(self):
    """
    Test FeatureCorrelation visualizer with mutual information regression
    """
    viz = FeatureCorrelation(method='mutual_info-regression')
    viz.fit(self.X, self.y, random_state=23456)

    # finalize() completes the figure without going through the deprecated
    # poof() entry point, which would additionally try to display the plot.
    viz.finalize()

    self.assert_images_similar(viz)
def feature_correlation_pearson(
        path="images/feature_correlation_pearson.png"):
    """
    Produce the FeatureCorrelation figure (default method) for the diabetes
    dataset and save it at ``path``.
    """
    bunch = datasets.load_diabetes()
    labels = np.array(bunch['feature_names'])

    chart = FeatureCorrelation(labels=labels)
    chart.fit(bunch['data'], bunch['target'])
    chart.poof(outpath=path, clear_figure=True)
def test_feature_correlation_integrated_pearson(self):
    """
    Image comparison for the default (pearson) correlation method
    """
    visualizer = FeatureCorrelation()
    visualizer.fit(self.X, self.y)
    visualizer.finalize()

    self.assert_images_similar(visualizer)
def test_feature_correlation_select_feature_by_name_no_labels(self):
    """
    Selecting by name without supplying labels must raise
    """
    wanted = ["age"]
    expected_message = "age not in labels"

    with pytest.raises(YellowbrickValueError, match=expected_message):
        visualizer = FeatureCorrelation(feature_names=wanted)
        visualizer.fit(self.X, self.y)
def test_feature_correlation_labels_from_dataframe(self):
    """
    When X is a DataFrame, features_ must match its column names
    """
    frame = pd.DataFrame(self.X, columns=self.labels)

    visualizer = FeatureCorrelation()
    visualizer.fit(frame, self.y)

    npt.assert_array_equal(visualizer.features_, self.labels)
def test_feature_correlation_select_feature_by_name_no_labels(self):
    """
    feature_names without labels must be rejected with a clear message
    """
    message = 'age not in labels'
    requested_names = ['age']

    with pytest.raises(YellowbrickValueError, match=message):
        visualizer = FeatureCorrelation(feature_names=requested_names)
        visualizer.fit(self.X, self.y)
def test_feature_correlation_select_feature_by_name(self):
    """
    Selecting by name must leave exactly the requested names in features_
    """
    wanted = ["age", "sex", "bp", "s5"]

    visualizer = FeatureCorrelation(labels=self.labels, feature_names=wanted)
    visualizer.fit(self.X, self.y)

    npt.assert_array_equal(visualizer.features_, wanted)
def test_feature_correlation_integrated_pearson(self):
    """
    Test FeatureCorrelation visualizer with pearson correlation coefficient
    """
    viz = FeatureCorrelation()
    viz.fit(self.X, self.y)

    # finalize() completes the figure without going through the deprecated
    # poof() entry point, which would additionally try to display the plot.
    viz.finalize()

    self.assert_images_similar(viz)
def test_feature_correlation_select_feature_by_name(self):
    """
    features_ must equal the names requested through feature_names
    """
    requested = ['age', 'sex', 'bp', 's5']

    visualizer = FeatureCorrelation(
        labels=self.labels,
        feature_names=requested,
    )
    visualizer.fit(self.X, self.y)

    npt.assert_array_equal(visualizer.features_, requested)
def mutual_info_classification(classes, feature_names, X, y):
    """
    Plot sorted mutual-information (classification) scores of X against y.

    Parameters
    ----------
    classes :
        Unused; kept so existing callers do not break.
    feature_names :
        Forwarded as ``FeatureCorrelation(feature_names=...)``.
    X, y :
        Feature data and target passed to ``fit``.
    """
    from yellowbrick.target import FeatureCorrelation

    # NOTE(review): sibling helpers pass names via ``labels=``; with
    # ``feature_names=`` alone X presumably needs to carry its own column
    # names (e.g. a DataFrame) — confirm against callers.
    visualizer = FeatureCorrelation(
        method='mutual_info-classification',
        feature_names=feature_names,
        sort=True,
    )
    visualizer.fit(X, y, random_state=0)
    visualizer.poof()
def pearson_correlation(classes, fetures, X, Y):
    """
    Plot the FeatureCorrelation chart (default method) of X against Y.

    Parameters
    ----------
    classes :
        Unused; kept so existing callers do not break.
    fetures :
        Feature labels shown on the chart (parameter name kept as-is for
        backward compatibility).
    X, Y :
        Feature data and target passed to ``fit``.
    """
    from yellowbrick.target import FeatureCorrelation

    visualizer = FeatureCorrelation(labels=fetures)
    visualizer.fit(X, Y)
    visualizer.poof()
def report(self, pipeline: AbstractPipeline):
    """
    Fit a FeatureCorrelation visualizer on the pipeline's training data and
    write it into this report's cache sub-folder.
    """
    # Resolve (and create if needed) the output directory inside the
    # 'crcdal' package resources.
    relative_dir = 'cache/' + get_cache_path() + '/' + self.sub_folder + '/'
    out_dir = pkg_resources.resource_filename('crcdal', relative_dir)
    pkg_resources.ensure_directory(out_dir)

    labels = list(pipeline.train.columns())
    chart = FeatureCorrelation(labels=labels)
    chart.fit(pipeline.train, pipeline.train_y)

    # NOTE(review): the output name ends in '.csv' although a figure is
    # being written — confirm the intended extension.
    chart.poof(
        outpath=out_dir + pipeline.dataset_tag
        + '_model_feature_correlation_report.csv')
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    """
    Plot mutual-information (classification) scores for a hand-picked subset
    of the wine dataset's features and save the figure to ``path``.
    """
    wine = datasets.load_wine()
    target = wine['target']

    # A DataFrame is used so features can be selected by column name.
    column_names = np.array(wine['feature_names'])
    frame = pd.DataFrame(wine['data'], columns=column_names)

    selected = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']
    chart = FeatureCorrelation(
        method='mutual_info-classification',
        feature_names=selected,
    )
    chart.fit(frame, target, random_state=0)
    chart.poof(outpath=path, clear_figure=True)
def test_feature_correlation_integrated_mutual_info_classification(self):
    """
    Image comparison for mutual information (classification) on the
    wine dataset
    """
    wine = datasets.load_wine()
    features = wine["data"]
    target = wine["target"]

    visualizer = FeatureCorrelation(method="mutual_info-classification")
    visualizer.fit(features, target, random_state=12345)
    visualizer.finalize()

    self.assert_images_similar(visualizer)
def test_feature_correlation_integrated_mutual_info_classification(self):
    """
    Test FeatureCorrelation visualizer with mutual information
    on wine dataset (classification)
    """
    data = datasets.load_wine()
    X, y = data['data'], data['target']

    viz = FeatureCorrelation(method='mutual_info-classification')
    viz.fit(X, y, random_state=12345)

    # finalize() completes the figure without going through the deprecated
    # poof() entry point, which would additionally try to display the plot.
    viz.finalize()

    self.assert_images_similar(viz)
def mutual_info_regress(classes, feature_names, X, y):
    """
    Plot mutual-information (regression) scores of X against y.

    Parameters
    ----------
    classes :
        Unused; kept so existing callers do not break.
    feature_names :
        Labels for the features; its length also sizes the
        ``discrete_features`` mask.
    X, y :
        Feature data and target passed to ``fit``.
    """
    from yellowbrick.target import FeatureCorrelation

    # Only the feature at position 1 is flagged as discrete.
    # NOTE(review): this index is hard-coded — confirm it matches the
    # datasets this helper is called with.
    discrete_features = [False] * len(feature_names)
    discrete_features[1] = True

    visualizer = FeatureCorrelation(method='mutual_info-regression',
                                    labels=feature_names)
    visualizer.fit(X, y, discrete_features=discrete_features, random_state=0)
    visualizer.poof()
def feature_correlation_mutual_info_regression(
        path="images/feature_correlation_mutual_info_regression.png"):
    """
    Plot sorted mutual-information (regression) scores for the diabetes
    dataset and save the figure to ``path``.
    """
    diabetes = datasets.load_diabetes()
    names = np.array(diabetes['feature_names'])

    # Only the feature at position 1 is flagged as discrete.
    is_discrete = [False] * len(names)
    is_discrete[1] = True

    chart = FeatureCorrelation(
        method='mutual_info-regression',
        labels=names,
        sort=True,
    )
    chart.fit(
        diabetes['data'],
        diabetes['target'],
        discrete_features=is_discrete,
        random_state=0,
    )
    chart.poof(outpath=path, clear_figure=True)
def test_feature_correlation_select_feature_by_index_and_name(self):
    """
    Test selecting feature warning when both index and names are provided
    """
    feature_index = [0, 2, 3]
    feature_names = ['age']

    e = ('Both feature_index and feature_names are specified. '
         'feature_names is ignored')

    # pytest.raises only passes when a statement in the block raises, so
    # nothing placed after the raising call would ever run; the former
    # trailing assertion on scores_ was dead code and has been dropped.
    with pytest.raises(YellowbrickWarning, match=e):
        viz = FeatureCorrelation(feature_index=feature_index,
                                 feature_names=feature_names)
        viz.fit(self.X, self.y)
def create_correlation_matrix(data):
    """
    Build a table of each feature's association with the ``class`` column.

    The returned frame has one row per feature and the columns ``kendall``,
    ``pearson``, ``spearman`` and ``mutual_info-classification``.
    """
    encoded = encode_data(data)

    # One single-column frame per correlation method, in a fixed order.
    per_method = []
    for method_name in ('kendall', 'pearson', 'spearman'):
        column = encoded.corr(method=method_name)['class'].to_frame()
        column.columns = [method_name]
        per_method.append(column)
    result = pd.concat(per_method, axis=1, sort=False)

    # Mutual information is computed on the features only.
    features = encoded.drop(columns=['class'])
    target = encoded['class']
    mi = FeatureCorrelation(method='mutual_info-classification',
                            labels=features.columns)
    mi.fit(features, target)

    # Drop the target's own row before attaching the per-feature scores.
    result = result.drop('class', axis=0)
    result['mutual_info-classification'] = mi.scores_.tolist()
    return result
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.target import FeatureCorrelation

dataset = pd.read_csv('house_prices.csv')

# Drop the attributes that will not be used in the analysis.
dataset.drop(labels=['id', 'date', 'sqft_living15', 'sqft_lot15'], axis=1, inplace=True)
print(dataset.columns)

# Column 0 is used as the target and every remaining column as a feature.
# The feature slice is open-ended so it always lines up with the labels
# (columns[1:]); a fixed upper bound could leave the two out of step.
grafico = FeatureCorrelation(labels=dataset.columns[1:])
grafico.fit(dataset.iloc[:, 1:].values, dataset.iloc[:, 0].values)
plt.show()
plt.xticks(fontsize=14) plt.yticks(fontsize=12) locationFileNameRFC = os.path.join('/home/ak/Documents/Research/Papers/figures',str(symbols[symbolIdx]) \ +'_label_'+str(labelsIdx)+ '_idx_'+str(idx)+str('Date')+str(dateIdx)+'_RandForImp.png') vizRFC.fit(X, y) vizRFC.show(outpath=locationFileNameRFC) plt.show() plt.figure() # Instantiate the visualizer visualizerFC = FeatureCorrelation(labels=features, color="rebeccapurple", title=' ') visualizerFC.fit(X, y) locationFileNameFC = os.path.join('/home/ak/Documents/Research/Papers/figures',str(symbols[symbolIdx])+'_idx_'+str(idx) \ +'_label_'+str(labelName)+'_date_'+str(dateIdx)+'_label_'+str(labelsIdx)+'_FeatureCorrelation_w_depn_var.png') plt.xlabel('', fontsize=11) plt.xticks(fontsize=14) plt.yticks(fontsize=12) visualizerFC.show(outpath=locationFileNameFC) plt.show() # # # Instantiate the visualizer set_palette('yellowbrick') plt.figure() classes = np.array([0, 1.]) plt.xticks(fontsize=9) visualizerRadViz = RadViz(classes=classes,
def featcorr():
    """
    Fit FeatureCorrelation on the diabetes data, drawing on a fresh figure,
    and save it under the name "feature_correlation".
    """
    diabetes = load_diabetes()

    visualizer = FeatureCorrelation(ax=newfig())
    visualizer.fit(diabetes.data, diabetes.target)

    savefig(visualizer, "feature_correlation")
## yellowbrick
from sklearn import datasets
from yellowbrick.target import FeatureCorrelation

# Load the diabetes regression dataset
data = datasets.load_diabetes()
X = data['data']
y = data['target']

# Feature names, used as labels on the chart
features = np.array(data['feature_names'])

# Build the visualizer, fit the data to it, then display it
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show()

## PCA - Principal Component Analysis
## https://www.kaggle.com/ryanholbrook/principal-component-analysis
from sklearn.decomposition import PCA

# Create principal components
# NOTE(review): X_scaled is not defined in the visible code — presumably a
# standardized copy of X created elsewhere; confirm before running.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Convert to a dataframe with columns PC1, PC2, ...
component_names = [f"PC{k}" for k in range(1, X_pca.shape[1] + 1)]
X_pca = pd.DataFrame(X_pca, columns=component_names)

## Target Encoding
cv=cv) clf = scores['estimator'][np.argmax(scores['test_score'])] print(np.max(scores['test_score'])) # %% explainer = shap.TreeExplainer(clf) shap_values = explainer.shap_values(Xv) # %% shap.summary_plot(shap_values, Xv, plot_type="bar") # %% feat = feature_names[feat][np.mean(abs(shap_values), axis=0) > 0.55] print(feat) X = X[feat] # %% visualizer = FeatureCorrelation(method='mutual_info-classification') visualizer.fit(X, y) visualizer.poof() # %% # This step doesn't always produce the same result, idk why. feat = visualizer.features_[visualizer.scores_ > 0.04] X = X[feat] # %% # Our final 10 features: # [263, 268, 287, 288, 300, 302, 307, 308, 313, 315] print(feat)
def draw_feature_correlation(self):
    """
    Show sorted mutual-information (classification) scores of the training
    data against the training labels.
    """
    chart = FeatureCorrelation(
        method='mutual_info-classification',
        labels=self.get_feature_labels(),
        sort=True,
    )
    chart.fit(self.training_data, self.training_labels)
    chart.poof()