'Proanthocyanins', \ 'Color intensity', \ 'Hue', \ 'OD280', \ 'Proline']) # True labels label = wine['Label'] del wine['Label'] # Data description wine.dtypes wine.describe() # Scatter plot sm(wine, alpha=0.7, figsize=(18, 18)) plt.show() # Correlation Heatmap correlation = wine.corr() plt.subplots(figsize=(9, 9)) sns.heatmap(correlation.round(2), annot=True, vmax=1, square=True, cmap='RdYlGn_r') plt.show() # regression sns.jointplot(x=wine.columns[5], y=wine.columns[6], data=wine, kind="reg") plt.show()
# Exploratory summary and plots of `data`, followed by a train/validation
# split of its values.

# statistical summary
print(data.describe())

# class distribution: number of rows per value of the 'class' column
print(data.groupby('class').size())

# univariate plots: box-and-whisker plot, one subplot per column in a 2x2 grid.
# `subplots` must be the boolean True. The string 'True' is not a bool, so
# recent pandas versions interpret it as an iterable of column groups and
# reject it for kind="box".
data.plot(
    kind="box",
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)

# histogram of the variable
data.hist()
# pl.show()

# multivariate plots: pairwise scatter matrix
sm(data)
# pl.show()

# creating a validation dataset
# splitting dataset: the first four columns are the features, the fifth
# column is the target.
array = data.values
X = array[:, 0:4]
y = array[:, 4]

# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

# logistic regression
# linear discriminant analysis
# KNN
# classification and regression trees
# gaussian naive bayes
# support vector machine
# 1.2 hist: histogram of column 'a'
df['a'].plot(kind='hist', title='histogram')

# 1.3 kde: probability density curve of column 'a'
df['a'].plot(kind='kde', title='kde')

# KDE over all columns (every random variable) of the frame
df.plot(kind='kde', title='Kernel density plot')

# 2. box plot
df.plot(kind='box', title='box plot')
# Bare expression: the summary is only echoed when run interactively.
df.describe()

# 3. scatter matrix
from pandas.plotting import scatter_matrix as sm

sm(df)

import os

# Raw string: the backslashes in this Windows path are literal separators.
# Without the r-prefix, sequences such as "\C", "\P", "\i", "\p" and "\d" are
# invalid escape sequences (SyntaxWarning on Python 3.12+). The resulting
# path value is unchanged.
os.chdir(r'E:\Code\Python\itwill\python-2\data')

# 'iris.csv' is resolved relative to the working directory set above.
iris = pd.read_csv('iris.csv')

cols = list(iris.columns)
cols  # interactive inspection of the column names

# Keep only the first four columns for the scatter matrix.
x = iris[cols[:4]]
x.shape  # interactive inspection of the selected frame's shape

# Scatter matrix with kernel density estimates on the diagonal.
sm(x, diagonal='kde')
Assignment1 """ # Data Loading import pandas as pd data = pd.read_csv( r"C:\Users\김성제\Google Drive\공부\4학년 1학기\Business Analytics\Data set\Week2\\kc_house_data.csv" ) # for scatter matrix from pandas.plotting import scatter_matrix as sm sm(data[['price', 'grade']]) # Histogram of 'yr_built' and 'yr_renovated' (not in report) data['yr_built'].plot.hist(bins=40) data['yr_renovated'].plot.hist(bins=40) # KDE of each column data['yr_built'].plot.kde() data['yr_renovated'].plot.kde() # Simple statistic information of yr_built data['yr_built'].describe() # Convert to new data set ---- yr_built yr_built_new = [] for year in data['yr_built']: