def plot_feature_relationships(self, df: pd.DataFrame, cols=None, postfix='') -> None:
    """Save a scatterplot matrix of the feature columns of ``df`` as a PNG.

    The ``date`` column is dropped before plotting and the current figure is
    written to ``feature_relationships{postfix}.png``.

    Args:
        df: Frame holding the features; it must contain a ``date`` column.
        cols: Names of the columns to plot. When ``None`` or empty, every
            column except ``date`` is plotted.
        postfix: Text inserted in the output file name just before ``.png``.

    Raises:
        KeyError: If ``df`` has no ``date`` column, or if ``cols`` names a
            column that is not present after ``date`` is dropped.
    """
    no_date_df = df.drop(columns=['date'])
    if cols is None or len(cols) == 0:
        cols = no_date_df.columns
    # A plain list makes pandas treat the value as a set of column labels.
    cols = list(cols)
    # Plot exactly the columns named in ``cols`` so that the plotted data and
    # the axis labels always describe the same features.
    scatterplotmatrix(no_date_df[cols].values, names=cols, alpha=0.7)
    plt.savefig(f'feature_relationships{postfix}.png')
import warnings

# The filter is installed before the remaining imports run, so it is already
# active while they are loaded.
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mlxtend.plotting import heatmap, scatterplotmatrix

# Read the CSV (first row is the header) and replace the header with short labels.
diabetes = pd.read_csv('diabetes.csv', header=0)
diabetes.columns = [
    'PREG', 'GLU', 'BP', 'SKIN', 'INSU', 'BMI', 'DPF', 'AGE', 'OUT',
]

# Every column except the 'OUT' column is used as a feature.
features = [label for label in diabetes.columns if label != 'OUT']
X = diabetes[features].values
y = diabetes['OUT'].T

# Exploratory plots: correlation heatmap followed by a scatterplot matrix,
# both drawn over all nine columns.
cm = np.corrcoef(diabetes.values.T)
hm = heatmap(
    cm,
    row_names=diabetes.columns,
    column_names=diabetes.columns,
)
scatterplotmatrix(
    diabetes.values,
    figsize=(10, 8),
    names=diabetes.columns,
    alpha=0.4,
)
plt.show()
import matplotlib.pyplot as plt


# In[6]:


from mlxtend.plotting import scatterplotmatrix


# In[7]:


# Columns used by the plots below.
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']


# In[ ]:




# In[8]:


# Scatterplot matrix of the selected columns (`df` is defined outside this excerpt).
scatterplotmatrix(
    df[cols].values,
    figsize=(10, 8),
    names=cols,
    alpha=0.5,
)
plt.tight_layout()
plt.show()


# In[9]:


from mlxtend.plotting import heatmap


# In[10]:


import numpy as np


# In[11]:


# Transpose so that each selected column becomes one row passed to corrcoef.
cm = np.corrcoef(np.transpose(df[cols].values))
# Main program
if __name__ == '__main__':

    # Read the complete data set from the Excel workbook
    fileName = 'heights_weights.xlsx'
    dataFull = loadData(fileName, '.xlsx')
    print('\n')

    # Keep the labels of the second and third columns only
    columnNames = [dataFull.columns[position] for position in (1, 2)]

    # Scatterplot matrix of the two selected columns
    scatterplotmatrix(
        dataFull[columnNames].values,
        figsize=(10, 9),
        names=columnNames,
        alpha=0.5,
    )
    plt.tight_layout()
    plt.show()

    # Feature (first selected column, height per the original note) and
    # target (second selected column, weight per the original note)
    X, y = (dataFull[label].values for label in columnNames)

    # Hold out 20% of the rows for testing; fixed seed for repeatability
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Append a trailing axis of length one to each of the four arrays
    X_train, y_train, X_test, y_test = (
        part[:, np.newaxis] for part in (X_train, y_train, X_test, y_test)
    )
# Avg. Area Number of Bedrooms 0 # Area Population 0 # Price 0 # Address 0 print( colored( '==================================VISUALIZATION=========================================================', 'white')) print(colored('SCATTERPLOT OF VARIABLES', 'red')) cols = [ 'Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price' ] scatterplotmatrix(df1[cols].values, figsize=(40, 32), names=cols, alpha=0.5) plt.title('Scatterplot of 6 Variables in a Housing File') plt.tight_layout() plt.savefig('Scatterplot_of_6_housing_variables.png') # Display the plot. By the way, do we need to show the large plot, or is it better to just save it? #plt.show() print(colored('PRINTING DATA DESCRIPTION', 'red')) # Generate summary statistics about the numeric data (6 columns out of 7): count, mean (50th percentile), standard deviation, minimum, 25th percentile, 75th percentile, maximum. # I want to print the column LABELS on top of the column descriptions. # Column Names: 'Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price'. print(df1.describe()) print( colored(
def scatter_plot(self, data, cols):
    """Draw and display a scatterplot matrix of the ``cols`` columns of ``data``.

    The markers "1", "2" and "3" are printed before drawing, before showing
    and after showing the figure, respectively.
    """
    print("1")
    selected_values = data[cols].values
    scatterplotmatrix(
        selected_values,
        figsize=(10, 10),
        names=cols,
        alpha=0.5,
    )
    print("2")
    plt.show()
    print('3')