示例#1
0
                            'Proanthocyanins', \
                            'Color intensity', \
                            'Hue', \
                            'OD280', \
                            'Proline'])
# Split the true labels off the frame: pop() returns the 'Label' column
# and removes it from `wine` in a single step.
label = wine.pop('Label')

# Data description: column dtypes, then summary statistics
wine.dtypes

wine.describe()

# Scatter-plot matrix of the remaining columns
sm(wine, figsize=(18, 18), alpha=0.7)
plt.show()

# Correlation heatmap, annotated with coefficients rounded to two decimals
correlation = wine.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(9, 9))
sns.heatmap(
    correlation.round(2),
    ax=heatmap_ax,
    cmap='RdYlGn_r',
    square=True,
    vmax=1,
    annot=True,
)
plt.show()

# Joint plot with a regression fit between the columns at positions 5 and 6
sns.jointplot(data=wine, x=wine.columns[5], y=wine.columns[6], kind="reg")
plt.show()
示例#2
0
# Statistical summary of the dataset
print(data.describe())

# Class distribution: number of rows per value of the 'class' column
print(data.groupby('class').size())

# Univariate plots: one box-and-whisker plot per column on a 2x2 grid.
# `subplots` must be the boolean True. The string 'True' is not a documented
# value: older pandas only accepted it because a non-empty string is truthy,
# and newer pandas treats a non-bool value as an iterable of column groups
# and rejects it for box plots.
data.plot(kind="box", subplots=True, layout=(2, 2), sharex=False, sharey=False)

# Histogram of each variable
data.hist()
# pl.show()

# Multivariate plots: scatter-plot matrix
sm(data)
# pl.show()

# Create a validation dataset by splitting the data:
# columns 0-3 are taken as the inputs and column 4 as the target.
array = data.values
X = array[:, 0:4]
y = array[:, 4]
# Hold out 20% of the rows; the fixed random_state makes the shuffle repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=True)

# logistic regression
# linear discriminant analysis
# KNN
# classification and regression trees
# gaussian naive bayes
# support vector machine
# 1.2 hist: histogram of column 'a'
df['a'].plot(kind='hist', title='histogram')

# 1.3 kde: probability density curve of column 'a'
df['a'].plot(kind='kde', title='kde')

# KDE for every column of the frame
df.plot(kind='kde', title='Kernel density plot')

# 2. box plot
df.plot(kind='box', title='box plot')

df.describe()

# 3. scatter matrix
from pandas.plotting import scatter_matrix as sm

sm(df)

import os
# Raw string: the Windows path contains backslash sequences (\C, \P, \i, \p, \d)
# that are not valid escapes. In a normal literal they trigger an
# invalid-escape warning on current Python versions; the raw literal has
# exactly the same value without relying on that fallback.
os.chdir(r'E:\Code\Python\itwill\python-2\data')

# Load the iris data from the working directory set above
iris = pd.read_csv('iris.csv')

cols = list(iris.columns)
cols
# Keep only the first four columns for the scatter matrix
x = iris[cols[:4]]
x.shape

# Scatter matrix with KDE curves on the diagonal
sm(x, diagonal='kde')
示例#4
0
Assignment1

"""

# Data loading (plus the scatter-matrix helper used further down)
import pandas as pd
from pandas.plotting import scatter_matrix as sm

data = pd.read_csv(
    r"C:\Users\김성제\Google Drive\공부\4학년 1학기\Business Analytics\Data set\Week2\\kc_house_data.csv"
)

# Scatter matrix restricted to the 'price' and 'grade' columns
sm(data.loc[:, ['price', 'grade']])

# Histograms (40 bins each) of 'yr_built' and 'yr_renovated' (not in report)
data['yr_built'].plot(kind='hist', bins=40)
data['yr_renovated'].plot(kind='hist', bins=40)

# Kernel density estimate of the same two columns
data['yr_built'].plot(kind='kde')
data['yr_renovated'].plot(kind='kde')

# Summary statistics of 'yr_built'
data['yr_built'].describe()

# Convert to new data set ---- yr_built
yr_built_new = []
for year in data['yr_built']: