def graph():
    # Per-feature histograms of the breast cancer dataset, split by class
    fig, axes = plt.subplots(15, 2, figsize=(10, 20))
    malignant = cancer.data[cancer.target == 0]
    benign = cancer.data[cancer.target == 1]

    ax = axes.ravel()

    for i in range(30):
        _, bins = np.histogram(cancer.data[:, i], bins=50)
        ax[i].hist(malignant[:, i], bins=bins, color=mglearn.cm3(0), alpha=.5)
        ax[i].hist(benign[:, i], bins=bins, color=mglearn.cm3(2), alpha=.5)
        ax[i].set_title(cancer.feature_names[i])
        ax[i].set_yticks(())
    ax[0].set_xlabel("Feature magnitude")
    ax[0].set_ylabel("Frequency")
    ax[0].legend(["malignant", "benign"], loc='best')
    fig.tight_layout()
    plt.show()
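# graph() relies on module-level names (plt, np, mglearn, cancer); a minimal
# setup sketch with the imports and dataset load it assumes, then the call:
import matplotlib.pyplot as plt
import numpy as np
import mglearn
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
graph()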

# Commented-out block: the histogram plot again, then scaling ahead of PCA
'''
mglearn.plots.plot_pca_illustration()

## Visualizing the breast cancer dataset with PCA
import matplotlib.pyplot as plt
import numpy as np
cancer = load_breast_cancer()
fig, axes = plt.subplots(5, 6, figsize=(10, 20))
malignant = cancer.data[cancer.target == 0]
benign = cancer.data[cancer.target == 1]
ax = axes.ravel()
for i in range(30):
    _, bins = np.histogram(cancer.data[:, i], bins=50)
    ax[i].hist(malignant[:, i], bins=bins, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(benign[:, i], bins=bins, color=mglearn.cm3(2), alpha=.5)
    ax[i].set_title(cancer.feature_names[i])
    ax[i].set_yticks(())
ax[0].set_xlabel("Feature magnitude")
ax[0].set_ylabel("Frequency")
ax[0].legend(["malignant", "benign"], loc='best')
fig.tight_layout()

## 2-D scatter plot of the breast cancer dataset using the first two principal components
cancer = load_breast_cancer()
standard_scaler = StandardScaler()
standard_scaler.fit(cancer.data)
x_scaled = standard_scaler.transform(cancer.data)
Create a PCA object -> call the fit method to find the principal components.
'''
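# A minimal live sketch of the workflow the comment above describes: create a
# PCA object, fit it to find the components, then transform to rotate/reduce
# the data (assumes cancer from above; variable names are illustrative).
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

x_scaled = StandardScaler().fit_transform(cancer.data)
pca = PCA(n_components=2).fit(x_scaled)  # find the principal components
x_pca = pca.transform(x_scaled)          # rotate and reduce: (569, 30) -> (569, 2)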
# Example #3: voice dataset (male vs. female)----------------------------------
# mydata (the voice-features DataFrame) is assumed loaded earlier; see Example #5
male = mydata.loc[mydata['label'] == 'male']
female = mydata.loc[mydata['label'] == 'female']
# Commented-out plotly alternative to the matplotlib histograms below:
'''
for i in range(20):
    trace1 = go.Histogram(x=male.iloc[:, i])
    trace2 = go.Histogram(x=female.iloc[:, i])
    data = [trace1, trace2]
    layout = go.Layout(barmode='overlay')
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='hist')
'''

fig, axes = plt.subplots(10, 2, figsize=(10, 20))
ax = axes.ravel()
for i in range(20):
    ax[i].hist(male.iloc[:, i], bins=20, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(female.iloc[:, i], bins=20, color=mglearn.cm3(2), alpha=.5)
    ax[i].set_title(list(male)[i])
    ax[i].set_yticks(())

ax[0].set_xlabel("Feature magnitude")
ax[0].set_ylabel("Frequency")
ax[0].legend(["male", "female"], loc="best")
fig.tight_layout()

# Prepare data for modeling: encode the label column numerically
# (a single .loc call avoids pandas chained assignment, which may silently fail)
mydata.loc[mydata['label'] == 'male', 'label'] = 0
mydata.loc[mydata['label'] == 'female', 'label'] = 1

#print (mydata.head(1))
# (assumption: the split arguments mirror Example #6 below)
mydata_train, mydata_test = train_test_split(mydata,
                                             random_state=0,
                                             test_size=.2)

# Example #4: wine dataset------------------------------------------------------
print()

# In[13]:

mglearn.plots.plot_pca_illustration()

# In[14]:

fig, axes = plt.subplots(13, 1, figsize=(10, 20))
Class_0 = wine.data[wine.target == 0]
Class_1 = wine.data[wine.target == 1]
Class_2 = wine.data[wine.target == 2]
ax = axes.ravel()
for i in range(13):
    _, bins = np.histogram(wine.data[:, i], bins=50)
    ax[i].hist(Class_0[:, i], bins=bins, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(Class_1[:, i], bins=bins, color=mglearn.cm3(2), alpha=.5)
    ax[i].hist(Class_2[:, i], bins=bins, color=mglearn.cm3(1), alpha=.5)
    ax[i].set_title(wine.feature_names[i])
    ax[i].set_yticks(())
ax[0].set_xlabel("Feature magnitude")
ax[0].set_ylabel("Frequency")
ax[0].legend(["Class_0", "Class_1", "Class_2"], loc="best")
fig.tight_layout()

# In[15]:

print("Scale Dataset")
scaler = StandardScaler()
scaler.fit(wine.data)
X_scaled = scaler.transform(wine.data)
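# A possible next step (assumption, mirroring the PCA workflow above): inspect
# how much of the scaled wine data's variance the two leading components capture.
from sklearn.decomposition import PCA

wine_pca = PCA(n_components=2).fit(X_scaled)
print("Explained variance ratio:", wine_pca.explained_variance_ratio_)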
# Example #5: voice dataset, step by step----------------------------------------
    "C:/Users/Кирилл/Desktop/ds/voice.csv")  # считываем данные с таблицы voice
# Предварительный просмотр
mydata.head()  # вызывает первые 5 строк
print(
    mydata.shape
)  #определяем какой формы массив,определяем число элементов вдоль каждой оси массива(3168,21)
# Построение диаграмм
male = mydata.loc[mydata['label'] ==
                  'male']  #используем метки для доступа к данным
female = mydata.loc[mydata['label'] ==
                    'female']  #используем метки для доступа к данным
fig, axes = plt.subplots(
    10, 2, figsize=(10, 20))  #построение 20 графиков nrows=10 ,ncols=2
ax = axes.ravel()  # делает массив плоским,
for i in range(20):
    ax[i].hist(male.iloc[:, i], bins=20, color=mglearn.cm3(0),
               alpha=.5)  #вычисляем гистограмму набора данных
    ax[i].hist(female.iloc[:, i], bins=20, color=mglearn.cm3(2),
               alpha=.5)  #вычисляем гистограмму набора данных
    ax[i].set_title(list(male)[i])  # добавляем заголовок
    ax[i].set_yticks(())  # устанавливаются метки

ax[0].set_xlabel("Feature magnitude")  #подпись оси х
ax[0].set_ylabel("Frequency")  #подпись оси y
ax[0].legend(
    ["male", "female"],
    loc="best")  # табличка с подписью, что относится к male, что к female
fig.tight_layout()  # чтобы не было наложений
# Подготавливаем данные для моделирования
mydata.loc[:, 'label'][mydata['label'] ==
                       "male"] = 0  #используем метки для доступа к данным
# Example #6: 'diag' dataset (narr vs. dial)-------------------------------------
#Plot the histograms
narr0 = mydata.loc[mydata['diag'] == 'narr']
dial = mydata.loc[mydata['diag'] == 'dial']

narr = narr0[:len(dial)]  # truncate 'narr' so both classes are the same size
frames = [narr, dial]
mydata = pd.concat(frames)  # rebuild mydata as a balanced dataset

fig, axes = plt.subplots(9, 2, figsize=(9, 17))

ax = axes.ravel()

print(ax.shape)

for i in range(1, 16):
    ax[i].hist(narr.iloc[:, i], bins='auto', color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(dial.iloc[:, i], bins='auto', color=mglearn.cm3(3), alpha=.5)
    ax[i].set_title(list(narr)[i])
    ax[i].set_yticks(())

ax[1].set_xlabel("Feature magnitude")
ax[1].set_ylabel("Frequency")
ax[1].legend(["narr", "dial"], loc="best")  # ax[0] holds no plots (loop starts at 1)
fig.tight_layout()

#Prepare data for modeling
mydata.loc[mydata['diag'] == 'dial', 'label'] = 0
mydata.loc[mydata['diag'] == 'narr', 'label'] = 1
mydata_train, mydata_test = train_test_split(mydata,
                                             random_state=0,
                                             test_size=.2)
data, and finding a representation that is more informative for further processing.
One of the simplest and most widely used algorithms for all of these is principal component
analysis. We’ll also look at two other algorithms: non-negative matrix factorization
(NMF), which is commonly used for feature extraction, and t-SNE, which is
commonly used for visualization using two-dimensional scatter plots.
"""
# Principal Component Analysis (PCA)-------------------------------------------
mglearn.plots.plot_pca_illustration()

fig, axes = plt.subplots(15, 2, figsize=(10, 20))
malignant = cancer.data[cancer.target == 0]
benign = cancer.data[cancer.target == 1]
ax = axes.ravel()
for i in range(30):
    _, bins = np.histogram(cancer.data[:, i], bins=50)
    ax[i].hist(malignant[:, i], bins=bins, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(benign[:, i], bins=bins, color=mglearn.cm3(2), alpha=.5)
    ax[i].set_title(cancer.feature_names[i])
    ax[i].set_yticks(())
ax[0].set_xlabel("Feature magnitude")
ax[0].set_ylabel("Frequency")
ax[0].legend(["malignant", "benign"], loc="best")
fig.tight_layout()
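# A hedged sketch of the two-component scatter the passage below describes:
# scale to unit variance, project onto the first two principal components,
# then plot (assumes cancer, mglearn, plt from above; names are illustrative).
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(cancer.data)
X_scaled_cancer = scaler.transform(cancer.data)  # unit variance per feature

pca = PCA(n_components=2)                   # keep the first two components
X_pca = pca.fit_transform(X_scaled_cancer)  # (569, 30) -> (569, 2)

plt.figure(figsize=(8, 8))
mglearn.discrete_scatter(X_pca[:, 0], X_pca[:, 1], cancer.target)
plt.legend(cancer.target_names, loc="best")
plt.xlabel("First principal component")
plt.ylabel("Second principal component")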

"""
However, this plot doesn’t show us anything about the interactions between variables
and how these relate to the classes. Using PCA, we can capture the main interactions
and get a slightly more complete picture. We can find the first two principal components,
and visualize the data in this new two-dimensional space with a single scatter
plot.
Before we apply PCA, we scale our data so that each feature has unit variance
using StandardScaler.