Exemplo n.º 1
0
def plot_data(dataset):
    """Show histograms of ``dataset``, then its scatter matrix.

    Each of the two figures is displayed with its own ``plt.show()`` call.
    """
    renderers = (
        lambda: dataset.hist(),
        lambda: scatter_matrix(dataset),
    )
    for render in renderers:
        render()
        plt.show()
Exemplo n.º 2
0
def showGraph(dataset):
    """Display five exploratory figures for ``dataset``, one at a time.

    The figures appear in this order, each followed by its own
    ``pyplot.show()`` call: histograms, density plots, box plots, a scatter
    matrix, and a correlation-matrix image.

    The correlation image uses a fixed colour range of -1..1, gets 14 tick
    positions per axis, and takes its tick labels from the module-level
    ``names`` sequence.
    """
    # Histograms
    hist_style = {'sharex': False, 'sharey': False, 'xlabelsize': 1, 'ylabelsize': 1}
    dataset.hist(**hist_style)
    pyplot.show()

    # Density plots
    dataset.plot(
        kind='density',
        subplots=True,
        layout=(4, 4),
        sharex=False,
        fontsize=1,
    )
    pyplot.show()

    # Box plots
    dataset.plot(
        kind='box',
        subplots=True,
        layout=(4, 4),
        sharex=False,
        sharey=False,
        fontsize=8,
    )
    pyplot.show()

    # Scatter matrix
    scatter_matrix(dataset)
    pyplot.show()

    # Correlation matrix
    figure = pyplot.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(dataset.corr(), vmin=-1, vmax=1, interpolation='none')
    figure.colorbar(heatmap)
    tick_positions = np.arange(14)
    axes.set_xticks(tick_positions)
    axes.set_yticks(tick_positions)
    axes.set_xticklabels(names)
    axes.set_yticklabels(names)
    pyplot.show()
Exemplo n.º 3
0
 def plot_data(self):
     """Show histograms of ``self.dataset`` followed by its scatter matrix.

     Both figures are displayed through separate ``plt.show()`` calls.
     """
     for draw in (lambda: self.dataset.hist(),
                  lambda: scatter_matrix(self.dataset)):
         draw()
         plt.show()
Exemplo n.º 4
0
def visualizeData(inputDF):
    """Save four exploratory plots of ``inputDF`` as PNGs and print correlations.

    In order, the function:

    1. saves a longitude/latitude scatter plot (marker size from population,
       colour from ``median_house_value``) to ``plot-scatter.png``;
    2. saves a scatter matrix of four raw attributes to
       ``plot-correlations.png`` and prints each column's correlation with
       ``median_house_value``, highest first;
    3. saves a ``median_income`` vs. ``median_house_value`` scatter plot to
       ``plot-medianIncome.png``;
    4. adds three ratio columns to a copy of the frame, prints the
       correlations again, and saves a second scatter matrix to
       ``plot-correlations-02.png``.

    ``inputDF`` is only read; the ratio columns are added to a copy.
    Always returns ``None``.
    """
    png_options = dict(bbox_inches='tight', pad_inches=0.2)
    corr_heading = '\ncorrMatrix["median_house_value"].sort_values(ascending=False)'

    # 1. Geographic scatter plot.
    inputDF.plot(
        kind='scatter',
        x='longitude',
        y='latitude',
        label='population',
        s=inputDF["population"] / 100,
        c='median_house_value',
        cmap=plt.get_cmap("jet"),
        colorbar=True,
        alpha=0.1,
    )
    plt.savefig('plot-scatter.png', **png_options)

    # 2. Scatter matrix of raw attributes, then the correlation listing.
    raw_corr = inputDF.corr()
    raw_columns = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    scatter_matrix(frame=inputDF[raw_columns], figsize=(12, 8))
    plt.savefig('plot-correlations.png', **png_options)

    print(corr_heading)
    print(raw_corr["median_house_value"].sort_values(ascending=False))

    # 3. Income against house value.
    inputDF.plot(
        kind='scatter',
        x="median_income",
        y="median_house_value",
        alpha=0.1,
        figsize=(12, 8),
    )
    plt.savefig('plot-medianIncome.png', **png_options)

    # 4. Derived ratio columns, built on a copy so the caller's frame is untouched.
    enriched = inputDF.copy()
    ratio_specs = (
        ("roomsPerHousehold", "total_rooms", "households"),
        ("populationPerHousehold", "population", "households"),
        ("bedroomsPerRoom", "total_bedrooms", "total_rooms"),
    )
    for new_column, numerator, denominator in ratio_specs:
        enriched[new_column] = enriched[numerator] / enriched[denominator]

    enriched_corr = enriched.corr()
    print(corr_heading)
    print(enriched_corr["median_house_value"].sort_values(ascending=False))

    derived_columns = ["median_house_value", "median_income", "roomsPerHousehold", "bedroomsPerRoom"]
    scatter_matrix(frame=enriched[derived_columns], figsize=(12, 8))
    plt.savefig('plot-correlations-02.png', **png_options)

    return None
Exemplo n.º 5
0
def showGraph(dataset):
    """Show box plots, histograms and a scatter matrix of ``dataset`` in turn.

    Each of the three figures is displayed with its own ``pyplot.show()``.
    """
    # Box plots
    box_options = dict(kind='box', subplots=True, layout=(2, 2),
                       sharex=False, sharey=False)
    dataset.plot(**box_options)
    pyplot.show()

    # Histograms, then the scatter matrix
    for draw in (lambda: dataset.hist(), lambda: scatter_matrix(dataset)):
        draw()
        pyplot.show()
def hist_scatter(unique_track):
    """Plot per-feature histograms and a scatter matrix, and report correlations.

    For each audio feature in the fixed list below, a new figure is opened
    holding a histogram of that column.  A pairwise scatter matrix of the
    same columns is then drawn, and their correlation matrix is written to
    ``correlation.csv`` and printed.

    No figure is shown or saved here; that is left to the caller.

    Args:
        unique_track: Table indexable by column name and by a list of column
            names (presumably a pandas DataFrame; it must provide
            ``.corr()``), containing every column in ``vars_of_interest``.
    """
    vars_of_interest = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
                        'speechiness', 'tempo', 'time_signature', 'valence']

    # One histogram figure per feature; the y axis shows counts.
    for var in vars_of_interest:
        plt.figure()
        plt.hist(unique_track[var])
        plt.ylabel('Counts')
        plt.xlabel(var)
        plt.title('Histogram of '+var)

    # Pairwise scatter plot.  scatter_matrix opens its own figure when no
    # ``ax`` is passed, so no plt.figure() call is made here (an extra call
    # would only leave an empty figure behind).
    features = unique_track[vars_of_interest]
    scatter_matrix(features)

    # Compute the correlation matrix once and reuse it for file and console.
    correlations = features.corr()
    correlations.to_csv('correlation.csv')
    print('\nCorrelation Matrix')
    print(correlations)
Exemplo n.º 7
0
# Explore a copy, so the columns added below never touch strat_train_set.
housing = strat_train_set.copy()
# Visualization of features: longitude vs. latitude scatter, marker size from
# population / 100, colour from median_house_value.
housing.plot(kind="scatter",
             x="longitude",
             y="latitude",
             alpha=0.4,
             s=housing["population"] / 100,
             label="population",
             figsize=(10, 7),
             c="median_house_value",
             cmap=plt.get_cmap("jet"),
             colorbar=True)
# Correlations among features
corr_matrix = housing.corr()
# NOTE(review): the sorted Series below is neither assigned nor printed, so it
# is only visible when run interactively (e.g. as a notebook cell output).
corr_matrix["median_house_value"].sort_values(ascending=False)
scatter_matrix(housing[["median_house_value", "median_income", "total_rooms"]],
               figsize=(12, 8))
# Combination of attributes
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
# NOTE(review): named "bedrooms_per_household" but computed as
# total_bedrooms / total_rooms, i.e. bedrooms per room.
housing["bedrooms_per_household"] = housing["total_bedrooms"] / housing[
    "total_rooms"]
# NOTE(review): "population_per_threshold" holds population / households; the
# name looks like a typo for "per_household".
housing[
    "population_per_threshold"] = housing["population"] / housing["households"]
corr_matrix = housing.corr()
# Start over from the training set: predictors without the target column, and
# the target on its own.  Rebinding `housing` discards the ratio columns above.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# Data cleaning (getting rid of the corresponding districts)
# The result is not assigned, so `housing` is left unchanged.
housing.dropna(subset=["total_bedrooms"])
# Data cleaning (getting rid of the whole attribute)
# Also not assigned: `housing` keeps "total_bedrooms", which the median
# computation below relies on.
housing.drop("total_bedrooms", axis=1)
# Data cleaning (setting values to some value)
median = housing["total_bedrooms"].median()
Exemplo n.º 8
0
def scatter_plot(data):
    """Draw a scatter matrix of ``data`` with small, rotated axis labels.

    Args:
        data: Frame passed straight to ``scatter_matrix``.

    Returns:
        The array of axes returned by ``scatter_matrix``, after every axis
        has had its x label set to font size 7 / 45 degrees and its y label
        to font size 7 / 90 degrees.
    """
    # Plot the argument itself; the previous version ignored the parameter
    # and read a module-level variable instead.
    axes_grid = scatter_matrix(data, figsize=(20, 20))
    for ax in axes_grid.ravel():
        ax.set_xlabel(ax.get_xlabel(), fontsize=7, rotation=45)
        ax.set_ylabel(ax.get_ylabel(), fontsize=7, rotation=90)
    return axes_grid
Exemplo n.º 9
0
# Log-X
df.plot.scatter(x='CRIM', y='PRICE', logx=True)
plt.title('Scatter plot of Price vs. log(Crime)')
plt.show()
"""
Scatter Plot Matrix (산점도 행렬)
"""
# Import LIbrary
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

df = pd.read_csv('iris.csv')

# Scatter Plot Matrix with Histogram
scatter_matrix(df, alpha=0.5)
plt.show()

# Scatter Plot Matrix with Kernel Density Estimation
scatter_matrix(df, alpha=0.5, diagonal='kde')
plt.show()
"""
Heatmap (히트맵)
"""
## Using Pandas hexbin
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('iris.csv')
df.plot.hexbin(x='sepal-length', y='sepal-width', gridsize=25)
plt.show()
Exemplo n.º 10
0
# Check Correlation: correlation of every column with "Survived",
# strongest positive correlation first.
Correlation = data.corr()
correlation_Y = pd.DataFrame(Correlation["Survived"])
# sort_values returns a new frame, so the result has to be kept; previously
# it was thrown away and the unsorted frame was printed.
correlation_Y = correlation_Y.sort_values(by="Survived", ascending=False)
print(correlation_Y)

# data Visualization
# histogram
# The size is passed to the plotting call itself: a plt.figure(figsize=...)
# issued after plotting only opens an additional blank figure.
data.hist(figsize=(10.8, 7.6))
plt.show()

# Multimodal Data Visualizations
scatter_matrix(data, figsize=(21.6, 15.2))
plt.show()

# correlation matrix
# matshow: Plot a matrix or array as an image
fig = plt.figure(figsize=(21.6, 15.2))
ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation="none")
fig.colorbar(cax)
ticks = np.arange(0, 20, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# set names
names = [
    "Survived", "Pclass", "Age", "SibSp", "Parch", "Fare", "Female", "Male",
from __future__ import print_function
# https://stackoverflow.com/questions/29433824/unable-to-import-matplotlib-pyplot-as-plt-in-virtualenv
import matplotlib
matplotlib.use('TkAgg') 
from pandas import read_csv
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
data = read_csv(url, names=names)

# the distribution data 
description = data.describe()
print(description)

# the dimensions data 
print('The dimensions of data: ', data.shape)
scatter_matrix(data)
plt.show()
from pandas import to_datetime
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

# Load data
DIR = '../Logsn/ind_and_selBcol/v140/'
FILE = 'HPmth023.csv'
filename = DIR + FILE

# Use the following labels
names = ['BatteryStateOfCharge_Percent','BatteryVoltage_V','A_mean','min',
    'Wh_sum','DV','fD_all','fD_sel','cyc','TemperatureEnvironment_C','t_1','SOH']
dataset = read_csv(filename, usecols=names)
data = dataset.values

# No split-out of validation dataset to test and validation sets
test_size = 0.4
train_size = None
#print(train_size, type(train_size))

# IMPORTANT: keep time series order by shuffle=False
X_train, X_test = train_test_split(data, test_size=test_size, train_size=train_size, shuffle=False)
#print(X_train)

# convert to dataframe
dfX = DataFrame(X_train)

scatter_matrix(dfX)
pyplot.show()
#Datatypes of each attribute
print("printing Datatypes")
print(dataset.dtypes)

#Describe Dataset
print("printing Desctiption of ")
print(dataset.describe())

#Correlations
print("Printing Data Correlation")
print(dataset.corr())

#Histogram
# hist must be called; a bare `dataset.hist` only looks the method up and
# draws nothing.
dataset.hist()
scatter_matrix(dataset)
allData = plt.subplot(441)
allData.set_title('All Data Together')
#plt.show(4,4,0)

setosaData = pd.read_csv('iris_setosa.csv')
setosaData.hist()
scatter_matrix(setosaData)
setosa = plt.subplot(442)
setosa.set_title('Setosa Data')
#plt.show(4,4,1)

versicolorData = pd.read_csv('iris_versicolor.csv')
versicolorData.hist()
scatter_matrix(versicolorData)
versiColor = plt.subplot(443)
training_data.plot(kind='density', subplots=True, sharex=False, figsize=(10,10))
plt.show()


# In[ ]:


training_data.corr()


# In[ ]:


from pandas.plotting import scatter_matrix
scatter_matrix(training_data, figsize=(10,10))
plt.show()


# # Working with Rows

# In[ ]:


for idx, row in test_data.iterrows():
    print(row['Name'], row['Pclass'])


# # Making Some Predictions

# In[ ]:
Exemplo n.º 15
0
 def make_scatter_matrix(self):
     """Save a scatter matrix of the input columns as a PNG, then close it.

     Plots ``self.load.data[self.load.inputs]`` with kernel-density plots on
     the diagonal and writes the figure to
     ``Data/Visual/<self.load>_scatter_matrix.png``, where ``self.load`` is
     formatted into the file name by the f-string below.
     """
     # scatter_matrix opens its own figure when no ``ax`` is passed, and that
     # figure becomes the current one.  Grab it afterwards so the figure that
     # is saved is also the one that is closed; creating a figure up front
     # with plt.subplots() left an empty figure to close and leaked the real
     # one.
     scatter_matrix(self.load.data[self.load.inputs], diagonal='kde')
     fig = plt.gcf()
     fig.savefig(f"Data/Visual/{self.load}_scatter_matrix.png",
                 bbox_inches='tight')
     plt.close(fig)
Exemplo n.º 16
0
dataset.hist()
plt.show()

# 绘制密度图--是一种表现与数据值对应的边界或域对象的图形表示方法,一般用于呈现连续变量
dataset.plot(kind ='density',subplots = True,sharex= False)
plt.show()

# 绘制箱线图--盒须图,是一种非常好的用于显示数据分布状况的手段。中位数,上四分位数,下四分位数,上边缘,下边缘,边缘之外的异常值
dataset.plot(kind = 'box',subplots = True,sharex = False)
plt.show()
'''

# Correlation matrix plot: Pearson correlations printed, then drawn as an
# image with a fixed colour range of -1..1.
correlations = dataset.corr(method='pearson')
print(correlations)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per column of `dataset`.
ticks = np.arange(len(dataset.columns))
ax.set_xticks(ticks)  # choose which tick positions are shown on the axis
ax.set_yticks(ticks)
ax.set_xticklabels(dataset.columns, rotation=90)  # set the tick labels (rotation, fontsize)
ax.set_yticklabels(dataset.columns)
plt.show()

# Scatter matrix plot
from pandas.plotting import scatter_matrix

scatter_matrix(dataset)
# plt.show()
Exemplo n.º 17
0
# writer=pd.ExcelWriter("C:\\Users\\harish647\\Desktop\\iris.xlsx",engine='xlsxwriter')
# df1.to_excel(writer,sheet_name='Sheet1')
# df2.to_excel(writer,sheet_name='Sheet2')

# #print(df1.boxplot(by='sepal_length',column=['sepal_width'],grid=True))
# #df1.hist()#histogram plot
# #plt.show()#univarient plot

# tips=sns.load_dataset('iris')
# print(tips.head())

# sns.set_style("whitegrid")
# #sns.boxplot(x='sepal_length',y='sepal_width',hue='species',data=tips,palette='deep')#boxplot
# sns.despine()
# sns.set_context('poster',font_scale=2)#setFont

# sns.lmplot(x='sepal_length',y='sepal_width',size=2,data=tips)#regression plot

##multivarient plot

data.plot(kind='box', subplots=True, layout=(2, 2), sharex=False,
          sharey=False)  #whisker plot and box which is mainly for univarient
scatter_matrix(data)  #scatter matrix plot how one effetced by other

plt.show()

#writer.save()
# wb=openpyxl.load_workbook("C:\\Users\\harish647\\Desktop\\iris.xlsx")
# print(wb.sheetnames())

#print(df)
Exemplo n.º 18
0
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

iris = load_iris()
print(iris)
print(iris.data)
print(iris.feature_names)
print(iris.target)
print(iris.target_names)

X = iris.data
y = iris.target

df = pd.DataFrame(X, columns=iris.feature_names)
df['class'] = y
print(df)
print(df.describe())

scatter_matrix(df)
plt.show()
Exemplo n.º 19
0
                      s=strat_train_copy['population'] / 100,
                      label='population',
                      figsize=(10, 7),
                      c='median_house_value',
                      cmap=plt.get_cmap("jet"),
                      colorbar=True)
plt.legend()
plt.show()

#looking the correlations between attributes
corr_matrix = strat_train_copy.corr()
print(corr_matrix['median_house_value'].sort_values(ascending=False))
attributes = [
    'median_house_value', 'median_income', 'total_rooms', 'housing_median_age'
]
scatter_matrix(strat_train_copy[attributes], figsize=(12, 8))
plt.show()

#creating new attributes

strat_train_copy['rooms_per_household'] = strat_train_copy[
    'total_rooms'] / strat_train_copy['households']
strat_train_copy['bedrooms_per_room'] = strat_train_copy[
    'total_bedrooms'] / strat_train_copy['total_rooms']
strat_train_copy['population_per_household'] = strat_train_copy[
    'population'] / strat_train_copy['households']
corr_matrix = strat_train_copy.corr()
print(corr_matrix['median_house_value'].sort_values(ascending=False))
housing = strat_train.drop("median_house_value", axis=1)
housing_labels = strat_train["median_house_value"].copy()
Exemplo n.º 20
0
 def scat(**kwds):
     """Draw a scatter matrix of ``df`` from the surrounding scope.

     All keyword arguments are forwarded to ``plotting.scatter_matrix``; its
     return value is handed back unchanged.
     """
     matrix_axes = plotting.scatter_matrix(df, **kwds)
     return matrix_axes
Exemplo n.º 21
0
            axis=1).plot(kind='box',
                         subplots=True,
                         layout=(2, 2),
                         sharex=False,
                         sharey=False,
                         figsize=(9, 9),
                         title='Box Plot for each input variable')
plt.savefig('fruits_boxplot')
plt.show()

fruits.drop('fruit_label', axis=1).hist(bins=30, figsize=(9, 9))
pl.suptitle("Histogram for each numeric input variable")
plt.savefig('fruits_hist')
plt.show()

scatter_matrix(fruits.drop('fruit_label', axis=1), figsize=(10, 5))
plt.show()

clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))

clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
    clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
    clf2.score(X_test, y_test)))
Exemplo n.º 22
0
df["ind"] = pd.Series(df.index).apply(lambda i: i % 50)
df.pivot("ind", "species")[col].plot(kind="box")
plt.show()
plt.close()

df.plot(kind="scatter", x="sepal length (cm)", y="sepal width (cm)")
plt.title("길이 대 너비")
plt.show()
plt.close()

colors = ["r", "g", "b"]
markers = [".", "*", "^"]
fig, ax = plt.subplots(1, 1)
for i, spec in enumerate(df["species"].unique()):
    ddf = df[df["species"] == spec]
    ddf.plot(kind="scatter",
             x="sepal width (cm)",
             y="sepal length (cm)",
             alpha=0.5,
             s=10 * (i + 1),
             ax=ax,
             color=colors[i],
             marker=markers[i],
             label=spec)
plt.legend()
plt.show()

scatter_matrix(df)
plt.show()
plt.close()
# Scatter Matrix Plot for Multivariate Data
from matplotlib import pyplot as plt

from pandas import read_csv
#import pandas as pd

from pandas.plotting import scatter_matrix

import warnings

warnings.filterwarnings(action="ignore")

hNames = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]

dataframe = read_csv("indians-diabetes.data.csv", names=hNames)

scatter_matrix(dataframe)

plt.show()
Exemplo n.º 24
0
# Basic overview of the data set: shape, column info and summary statistics.
print("Dimensões da base:", df.shape)
print()

# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line.
df.info()
print()

print(df.describe())
print()

# Histogram of every column.
df.hist(figsize=[10, 10])
plt.show()

# Scatter matrix with one colour per point, chosen from the 'classe' column
# (0 -> green, 1 -> red; any other value raises KeyError).
paleta_cores = {0: 'green', 1: 'red'}
cores = [paleta_cores[c] for c in df['classe']]

scatter_matrix(df[atributos], figsize=[11, 11], c=cores)
plt.show()

#%%
# *********************
# *** Preprocessing ***
# *********************

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

minMaxScaler = MinMaxScaler(feature_range=(0, 1))
standardScaler = StandardScaler()  # mean 0 and standard deviation 1

Xescalonado = minMaxScaler.fit_transform(X)
Xpadronizado = standardScaler.fit_transform(X)
Exemplo n.º 25
0
x = frame2['No of Siblings or Spouses on Board']
y = frame2['Name']
fig, ax = plt.subplots()
ax.bar(y, x)
plt.show()

women_frame = frame2.loc[frame2['Sex'] == 'Female']
y = women_frame['Age']
x = women_frame['Passenger Class']
plt.bar(x, y)
plt.xticks(x)
plt.show()

men_frame = frame2.loc[frame2['Sex'] == 'Male']
y = men_frame['Age']
x = men_frame['Passenger Class']
plt.bar(x, y)
plt.xticks(x)
plt.show()

y = frame2['Age']
x = frame2['Passenger Class']
plt.scatter(y, x)
plt.show()

frame3 = pd.read_csv('airquality.csv',
                     sep=',',
                     usecols=["Ozone", "Solar.R", "Wind", "Temp"])
scatter_matrix(frame3)
plt.show()
Exemplo n.º 26
0
extended_pheno_data_males[extended_pheno_data_males['Diagnosis'] == 'TD'][[
    'FIQ', 'VIQ'
]].mean()

#Or
#for general descriptives
extended_pheno_data_males.describe()

#Groupby for better implementation
#cleaner aesthitics
#groupby spits/is an object

ASD_TD_pheno_datamales = extended_pheno_data_males.groupby('Diagnosis')
ASD_TD_mean = ASD_TD_pheno_datamales.mean()
ASD_TD_max = ASD_TD_pheno_datamales.max()
'''
#Plot some of them (?)
plotting.scatter_matrix(extended_pheno_data_males[['FIQ', 'Parcel_64','Parcel_148']])
plt.show() #shows
plt.close() #terminates figure?
plotting.scatter_matrix(extended_pheno_data_males[['FIQ', 'Parcel_1','Parcel_2', 'Parcel_3', 'Parcel_4', 'Parcel_5', 'Parcel_6', 'Parcel_7', 'Parcel_8', 'Parcel_9', 'Parcel_10', 'Parcel_11', 'Parcel_12', 'Parcel_13', 'Parcel_14', 'Parcel_15']])
plt.show()  #looking for bimodal plots as if there are 2 populations
'''
#STATS - R like formulas

#Regression
#Simple
model = ols("Parcel_48 ~ FIQ", extended_pheno_data_males).fit()
print(model.summary())

model = ols("Parcel_48 ~ Diagnosis + 1", extended_pheno_data_males).fit(
y = np.random.randint(0, 50, 1000)

np.corrcoef(x, y)

# In[4]: Correlation Matrix
import pandas as pd

df = pd.DataFrame({'a': np.random.randint(0, 50, 1000)})
df['b'] = df['a'] + np.random.normal(0, 10, 1000) # positively correlated with 'a'
df['c'] = 100 - df['a'] + np.random.normal(0, 5, 1000) # negatively correlated with 'a'
df['d'] = np.random.randint(0, 50, 1000) # not correlated with 'a'

from pandas.plotting import scatter_matrix

df.corr()
scatter_matrix(df, figsize=(6, 6))
plt.show()

# In[5]
# http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-25-chi.html
import numpy as np
import pandas as pd
import scipy.stats as stats

np.random.seed(10)

# Sample data randomly at fixed probabilities
voter_race = np.random.choice(a= ["C1","C2","C3","C4","C5"],
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)
X=data[choix] # Select the input variables listed in `choix` (the original note names Humidity3pm and RainToday)
y=data['RainTomorrow'] # Select the output variable RainTomorrow

# From here on X and y are the underlying arrays, not pandas objects.
X=X.values
y=y.values

scaler = StandardScaler().fit(X) # standardise the values (mean 0 and standard deviation 1)
# Overwrite X in place with the standardised values.
# NOTE(review): the original comment said this keeps the DataFrame type, but X
# was already replaced by `.values` above. Being an in-place write, it keeps
# X's existing dtype and may also write through to `data` if `.values`
# returned a view -- confirm.
X[:] = scaler.transform(X)

for k in data:
    print("Calcul du coefficient de corrélation de la colonne ",k ," avec RainTomorrow")
    print(data['RainTomorrow'].corr(data[k])) # correlation coefficient of each column with the output variable "RainTomorrow"
# Original note: "we select the columns RISK_MM, RainToday and Humidity3pm".
# NOTE(review): the list below actually holds Humidity3pm, RainToday and
# RainTomorrow.

params=['Humidity3pm','RainToday','RainTomorrow'] # list of the variables of interest
scatter_matrix(data[params], alpha=0.2, figsize=(12,10),diagonal='kde') # draw the matrix of plots
plt.show() 


X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.2) # build the test set (20% of the initial data) and the training set (80%)


from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression() #
logisticRegr.fit(X_train, y_train) # train the model on the training set (fit the coefficients)

y_pred = logisticRegr.predict(X_test)  # predictions for the inputs of the test set


for k in range (19):
Exemplo n.º 29
0
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

# 1. Import and parse the dataset.
data = pd.read_csv('diabetes.csv')
print(data.head())

# 2. Print out summary stats.
print(data.describe())

# 3. Create (i) histogram plots for the data in 
# each column, and (ii) scatter plots showing 
# the correlation between columns.
scatter_matrix(data, alpha=0.2, figsize=(10, 10))
plt.show()

# 4. Split the data into training, test, and 
# holdout data sets
mask = np.random.rand(len(data)) < 0.8
training = data[mask]
test = data[~mask]

mask = np.random.rand(len(training)) < 0.8
holdout = training[~mask]
training = training[mask]

# save these data sets
training.to_csv('training.csv', index=False)
test.to_csv('test.csv', index=False)
from pandas.plotting import andrews_curves

# Andrews curves of `dataset`, grouped by the 'class' column, on figure 6.
plt.figure(6)
andrews_curves(dataset, 'class')
# plt.show()
from pandas.plotting import parallel_coordinates

# Parallel-coordinates plot, grouped by the 'class' column, on figure 7.
plt.figure(7)
parallel_coordinates(dataset, 'class')
# plt.show()
# Scatter-plot matrix: helps reveal structured relationships between
# variables; each scatter plot reflects how strongly two variables correlate.
# If the points tend to lie along a diagonal, the two variables are highly
# correlated.
from pandas.plotting import scatter_matrix

# NOTE(review): scatter_matrix is not given an `ax`, so it presumably draws on
# a figure of its own and figure 8 stays empty -- confirm.
plt.figure(8)
scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde')
plt.show()

# 3. Linear regression analysis of the iris data
# This part mainly uses linear regression to analyse the iris feature data,
# predicting the linear relationships among the four features: petal length,
# petal width, sepal length and sepal width.
from sklearn.datasets import load_iris

hua = load_iris()
# Take columns 0 and 1 of every sample.  The original comment called these the
# petal length and width.
# NOTE(review): in scikit-learn's iris feature order columns 0 and 1 look like
# sepal length and sepal width -- confirm.
x = [n[0] for n in hua.data]
y = [n[1] for n in hua.data]
import numpy as np  # convert to arrays

# Reshape both into column vectors of shape (n_samples, 1).
x = np.array(x).reshape(len(x), 1)
y = np.array(y).reshape(len(y), 1)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import seaborn as sb
from sklearn.naive_bayes import GaussianNB
import os
os.chdir("C:\\Directory")
df = pd.read_csv("Annexure_1_result.csv")
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.info()
df.describe()
scatter_matrix(df, figsize=(15, 10))
plt.show()
df.head()
predict_df = df.drop([
    "Text_id", "Sampled_date", "T_site", "T_plant", "Sampling_point",
    "Condition"
],
                     axis=1)
X = predict_df.drop(["Fault"], axis=1)
y = predict_df["Fault"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)
model = []
model.append(('LR', LogisticRegression()))
# Drop the highly correlated features from our training data 
one_hot_df6 = one_hot_df6.drop(to_drop, axis=1)

#Check columns after drop 
print('\r\n*********After: Dropping Highly Correlated Fields**************************************')
one_hot_df6.info(verbose=False)

onehots_stats = one_hot_df6.describe()

#join one hot with df

mergedf = pd.merge(one_hot_df6, df5, left_index=True, right_index=True)

#scatter plot of all the numerics
from pandas.plotting import scatter_matrix
ax = scatter_matrix(df5,figsize=(10, 10))

df_grouped = df5.groupby(by=['vendor_name'])
print (df_grouped.describe())



# this python magics will allow plot to be embedded into the notebook
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline

# lets look at the boxplots separately
vars_to_plot_separate = [['state_bottle_cost', 'state_bottle_retail'],
                        ['sale_dollars'],
Exemplo n.º 33
0
# partition the data into two classes
y_train_1 = y_train == 1  # apple in True class, others in False class
y_test_1 = y_test == 1  # apple in True class, others in False class
# 2 - True == 1 and 2 - False == 2, hence: apple = 1; others = 2
y_train = 2 - y_train_1
y_test = 2 - y_test_1

seeData = True
if seeData:
    import matplotlib.pyplot as plt
    from matplotlib import cm  # no longer used here; kept in case code beyond this excerpt relies on it
    from mpl_toolkits.mplot3d import axes3d  # must keep
    from pandas.plotting import scatter_matrix

    # plotting a scatter matrix
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap has been removed
    # from recent Matplotlib releases; both return the same colormap.
    cmap = plt.get_cmap('gnuplot')
    scatter = scatter_matrix(X_train,
                             c=y_train,
                             marker='o',
                             s=40,
                             hist_kwds={'bins': 15},
                             figsize=(9, 9),
                             cmap=cmap)

    # plotting a 3D scatter plot of width, height and color_score,
    # coloured by class
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_train['width'],
               X_train['height'],
               X_train['color_score'],
               c=y_train,
               marker='o',
               s=100)
                            colorbar=True
                            )
plt.legend()

#%%
# Lets check correlation coef's for Median House Values
corr_matrix = housing.corr()

corr_matrix['median_house_value'].sort_values(ascending=False)

#%%
from pandas.plotting import scatter_matrix

attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']

scatter_matrix(housing[attributes], figsize=(12,8))

#%%
# Focus on median_income and median_house_vale
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)

#%%
housing['rooms_per_household'] = housing['total_rooms']/ housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household'] = housing['population']/housing['households']

corr_matrix = housing.corr()

corr_matrix['median_house_value'].sort_values(ascending=False)

#%%
Exemplo n.º 35
0
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

#analysing the data
corr_matrix = data.corr()
from pandas.plotting import scatter_matrix

attributes = [
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
    'Y house price of unit area'
]
scatter_matrix(data[attributes])
#from the scatter matrix, we can see that there doesn't exist a strong
#correlation between any 2 variables

#selecting a suitable model for the data
# multiple linear regression
from sklearn.linear_model import LinearRegression

regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)

#evaluating the selected model
# multiple linear regression
from sklearn.metrics import r2_score

r2 = r2_score(y_test, regressor1.predict(X_test))
Exemplo n.º 36
0
def plot(input_ts='-',
         columns=None,
         start_date=None,
         end_date=None,
         clean=False,
         skiprows=None,
         index_type='datetime',
         names=None,
         ofilename='plot.png',
         type='time',
         xtitle='',
         ytitle='',
         title='',
         figsize='10,6.0',
         legend=None,
         legend_names=None,
         subplots=False,
         sharex=True,
         sharey=False,
         colors='auto',
         linestyles='auto',
         markerstyles=' ',
         style='auto',
         logx=False,
         logy=False,
         xaxis='arithmetic',
         yaxis='arithmetic',
         xlim=None,
         ylim=None,
         secondary_y=False,
         mark_right=True,
         scatter_matrix_diagonal='kde',
         bootstrap_size=50,
         bootstrap_samples=500,
         norm_xaxis=False,
         norm_yaxis=False,
         lognorm_xaxis=False,
         lognorm_yaxis=False,
         xy_match_line='',
         grid=False,
         label_rotation=None,
         label_skip=1,
         force_freq=None,
         drawstyle='default',
         por=False,
         invert_xaxis=False,
         invert_yaxis=False,
         round_index=None,
         plotting_position='weibull',
         source_units=None,
         target_units=None,
         lag_plot_lag=1):
    r"""Plot data.

    Reads the series named by ``input_ts`` through ``tsutils``, draws the
    plot selected by ``type`` using matplotlib's non-interactive 'Agg'
    backend, and saves the figure to ``ofilename``.

    Parameters
    ----------
    input_ts
        Data source handed to ``tsutils.read_iso_ts``.
    columns
        Forwarded to ``tsutils.common_kwds`` as ``pick``.
    start_date, end_date, round_index, source_units, target_units, clean
        Forwarded unchanged to ``tsutils.common_kwds``.
    skiprows, index_type, names
        Forwarded unchanged to ``tsutils.read_iso_ts``.
    ofilename
        File the figure is saved to.  If ``None`` nothing is saved and the
        ``matplotlib.pyplot`` module is returned instead.
    type
        Plot type.  Handled values: 'time', 'xy', 'double_mass',
        'norm_xaxis', 'norm_yaxis', 'lognorm_xaxis', 'lognorm_yaxis',
        'weibull_xaxis', 'weibull_yaxis', 'kde', 'probability_density',
        'kde_time', 'boxplot', 'scatter_matrix', 'lag_plot',
        'autocorrelation', 'bootstrap', 'heatmap', 'bar', 'bar_stacked',
        'barh', 'barh_stacked', 'histogram', 'taylor' and 'target'.
        (The name shadows the builtin but is part of the public interface.)
    xtitle, ytitle, title
        Axis labels and figure title.  Several plot types supply a default
        label when the corresponding title is empty.
    figsize
        Figure size; passed through ``tsutils.make_list`` before use.
    legend
        ``''``, ``'True'``, ``None`` and ``True`` enable the legend; any
        other value disables it.
    legend_names
        Replacement names for the columns.  Names must be unique and their
        count must match the columns (or ``x,y1,y2,...`` for 'xy').
    subplots, sharex, sharey, secondary_y, mark_right
        Forwarded to ``DataFrame.plot`` by the plot types that use it.
    colors, linestyles, markerstyles
        ``'auto'`` selects the module-level ``color_list`` / ``line_list``
        / ``marker_list``; anything else goes through
        ``tsutils.make_list``.
    style
        ``'auto'`` or one style string per series (colour character,
        optional marker, then line style).  When given it replaces
        ``colors``, ``linestyles`` and ``markerstyles``.
    logx, logy, norm_xaxis, norm_yaxis, lognorm_xaxis, lognorm_yaxis
        Deprecated switches; a warning is issued when any is ``True``.
    xaxis, yaxis
        ``'log'`` selects a logarithmic axis.
    xlim, ylim
        Axis limits, normalised by ``_know_your_limits``.
    scatter_matrix_diagonal
        Diagonal plot for 'scatter_matrix'; 'probability_density' is
        treated as 'kde'.
    bootstrap_size, bootstrap_samples
        ``size`` and ``samples`` for the 'bootstrap' plot.
    xy_match_line
        When non-empty a min-to-max 1:1 line is drawn; a string value is
        used as the matplotlib format string, otherwise ``'g--'``.
    grid
        Passed to ``plt.grid`` (forced to ``False`` for 'heatmap').
    label_rotation, label_skip, force_freq
        Tick-label handling for the bar plot types.
    drawstyle
        Matplotlib draw style for line based plots.
    por
        When ``True`` the data is re-read with ``dropna='no'``.
    invert_xaxis, invert_yaxis
        Invert the corresponding axis of the current axes.
    plotting_position
        Forwarded to ``tsutils.set_plotting_position`` by the probability
        plot types.
    lag_plot_lag
        ``lag`` for the 'lag_plot' plot.

    Returns
    -------
    The ``matplotlib.pyplot`` module when ``ofilename`` is ``None``,
    otherwise ``None``.

    Raises
    ------
    ValueError
        Wrong number of series for single-series plot types, duplicate or
        mismatched ``legend_names``, mismatched ``style`` count,
        non-daily data for 'heatmap', or an unsupported ``type``.
    AttributeError
        An odd number of columns for 'xy' / 'double_mass'.
    """
    # Need to work around some old option defaults with the implementation of
    # mando.  A real boolean True is accepted as well so that Python callers
    # asking for a legend actually get one.
    legend = bool(legend == '' or
                  legend == 'True' or
                  legend is None or
                  legend is True)

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FixedLocator
    # ``pd.np`` was removed in pandas 2.0, so use numpy directly.
    import numpy as np

    tsd = tsutils.common_kwds(tsutils.read_iso_ts(input_ts,
                                                  skiprows=skiprows,
                                                  names=names,
                                                  index_type=index_type),
                              start_date=start_date,
                              end_date=end_date,
                              pick=columns,
                              round_index=round_index,
                              dropna='all',
                              source_units=source_units,
                              target_units=target_units,
                              clean=clean)

    if type in ['bootstrap',
                'heatmap',
                'autocorrelation',
                'lag_plot']:
        if len(tsd.columns) != 1:
            raise ValueError("""
*
*   The '{1}' plot can only work with 1 time-series in the DataFrame.
*   The DataFrame that you supplied has {0} time-series.
*
""".format(len(tsd.columns), type))

    if por is True:
        tsd = tsutils.common_kwds(tsutils.read_iso_ts(tsd),
                                  start_date=start_date,
                                  end_date=end_date,
                                  round_index=round_index,
                                  dropna='no')

    # This is to help pretty print the frequency
    try:
        try:
            pltfreq = str(tsd.index.freq, 'utf-8').lower()
        except TypeError:
            pltfreq = str(tsd.index.freq).lower()
        if pltfreq.split(' ')[0][1:] == '1':
            beginstr = 3
        else:
            beginstr = 1
        if pltfreq == 'none':
            short_freq = ''
        else:
            # short freq string (day) OR (2 day)
            short_freq = '({0})'.format(pltfreq[beginstr:-1])
    except AttributeError:
        short_freq = ''

    if legend_names:
        lnames = tsutils.make_list(legend_names)
        if len(lnames) != len(set(lnames)):
            raise ValueError("""
*
*   Each name in legend_names must be unique.
*
""")
        if len(tsd.columns) == len(lnames):
            renamedict = dict(list(zip(tsd.columns, lnames)))
        elif type == 'xy' and len(tsd.columns) // 2 == len(lnames):
            renamedict = dict(list(zip(tsd.columns[2::2], lnames[1:])))
            renamedict[tsd.columns[1]] = lnames[0]
        else:
            raise ValueError("""
*
*   For 'legend_names' you must have the same number of comma
*   separated names as columns in the input data.  The input
*   data has {0} where the number of 'legend_names' is {1}.
*
*   If 'xy' type you need to have legend names as x,y1,y2,y3,...
*
""".format(len(tsd.columns), len(lnames)))
        tsd.rename(columns=renamedict, inplace=True)
    else:
        lnames = tsd.columns

    if colors == 'auto':
        colors = color_list
    else:
        colors = tsutils.make_list(colors)

    if linestyles == 'auto':
        linestyles = line_list
    else:
        linestyles = tsutils.make_list(linestyles)

    if markerstyles == 'auto':
        markerstyles = marker_list
    else:
        markerstyles = tsutils.make_list(markerstyles)
        if markerstyles is None:
            markerstyles = ' '

    if style != 'auto':

        nstyle = tsutils.make_list(style)
        if len(nstyle) != len(tsd.columns):
            raise ValueError("""
*
*   You have to have the same number of style strings as time-series to plot.
*   You supplied '{0}' for style which has {1} style strings,
*   but you have {2} time-series.
*
""".format(style, len(nstyle), len(tsd.columns)))
        # Split every style string into colour, marker and line style.
        colors = []
        markerstyles = []
        linestyles = []
        for st in nstyle:
            colors.append(st[0])
            if len(st) == 1:
                markerstyles.append(' ')
                linestyles.append('-')
                continue
            if st[1] in marker_list:
                markerstyles.append(st[1])
                # Slicing never raises IndexError; a two character style
                # simply yields an empty line style here.
                linestyles.append(st[2:])
            else:
                markerstyles.append(' ')
                linestyles.append(st[1:])
    if linestyles is None:
        linestyles = [' ']
    else:
        linestyles = [' ' if i == '  ' else i for i in linestyles]
    markerstyles = [' ' if i is None else i for i in markerstyles]

    icolors = itertools.cycle(colors)
    imarkerstyles = itertools.cycle(markerstyles)
    ilinestyles = itertools.cycle(linestyles)

    # One combined style string (colour + marker + line style) per column.
    style = ['{0}{1}{2}'.format(next(icolors),
                                next(imarkerstyles),
                                next(ilinestyles))
             for _ in tsd.columns]

    # reset to beginning of iterator
    icolors = itertools.cycle(colors)
    imarkerstyles = itertools.cycle(markerstyles)
    ilinestyles = itertools.cycle(linestyles)

    if (logx is True or
            logy is True or
            norm_xaxis is True or
            norm_yaxis is True or
            lognorm_xaxis is True or
            lognorm_yaxis is True):
        warnings.warn("""
*
*   The --logx, --logy, --norm_xaxis, --norm_yaxis, --lognorm_xaxis, and
*   --lognorm_yaxis options are deprecated.
*
*   For --logx use --xaxis="log"
*   For --logy use --yaxis="log"
*   For --norm_xaxis use --type="norm_xaxis"
*   For --norm_yaxis use --type="norm_yaxis"
*   For --lognorm_xaxis use --type="lognorm_xaxis"
*   For --lognorm_yaxis use --type="lognorm_yaxis"
*
""")

    if xaxis == 'log':
        logx = True
    if yaxis == 'log':
        logy = True

    if type in ['norm_xaxis',
                'lognorm_xaxis',
                'weibull_xaxis']:
        xaxis = 'normal'
        if logx is True:
            logx = False
            # Report the setting that is being ignored ('log'), not the
            # value ``xaxis`` was just overwritten with.
            warnings.warn("""
*
*   The --type={1} cannot also have the xaxis set to {0}.
*   The {0} setting for xaxis is ignored.
*
""".format('log', type))

    if type in ['norm_yaxis',
                'lognorm_yaxis',
                'weibull_yaxis']:
        yaxis = 'normal'
        if logy is True:
            logy = False
            # Same as above: name the ignored 'log' setting in the message.
            warnings.warn("""
*
*   The --type={1} cannot also have the yaxis set to {0}.
*   The {0} setting for yaxis is ignored.
*
""".format('log', type))

    xlim = _know_your_limits(xlim, axis=xaxis)
    ylim = _know_your_limits(ylim, axis=yaxis)

    figsize = tsutils.make_list(figsize)

    # For a non-datetime index the index itself becomes the first column.
    if not isinstance(tsd.index, pd.DatetimeIndex):
        tsd.insert(0, tsd.index.name, tsd.index)

    if type in ['xy',
                'double_mass']:
        if tsd.shape[1] % 2 != 0:
            raise AttributeError("""
*
*   The 'xy' and 'double_mass' types must have an even number of columns
*   arranged as x,y pairs.  You supplied {0} columns.
*
""".format(tsd.shape[1]))
        colcnt = tsd.shape[1] // 2
    elif type in ['norm_xaxis',
                  'norm_yaxis',
                  'lognorm_xaxis',
                  'lognorm_yaxis',
                  'weibull_xaxis',
                  'weibull_yaxis']:
        colcnt = tsd.shape[1]

    if type in ['xy',
                'double_mass',
                'norm_xaxis',
                'norm_yaxis',
                'lognorm_xaxis',
                'lognorm_yaxis',
                'weibull_xaxis',
                'weibull_yaxis',
                'heatmap']:
        _, ax = plt.subplots(figsize=figsize)
        # Plotting method keyed by the (logx, logy) combination.
        plotdict = {(False, True): ax.semilogy,
                    (True, False): ax.semilogx,
                    (True, True): ax.loglog,
                    (False, False): ax.plot}

    if type == 'time':
        ax = tsd.plot(legend=legend, subplots=subplots, sharex=sharex,
                      sharey=sharey, style=None, logx=logx, logy=logy,
                      xlim=xlim, ylim=ylim, secondary_y=secondary_y,
                      mark_right=mark_right, figsize=figsize,
                      drawstyle=drawstyle)
        for index, line in enumerate(ax.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        xtitle = xtitle or 'Time'
        if legend is True:
            plt.legend(loc='best')
    elif type in ['taylor']:
        from .. skill_metrics import centered_rms_dev
        from .. skill_metrics import taylor_diagram
        # The first column is the reference every other column is
        # compared against.
        ref = tsd.iloc[:, 0]
        std = [np.std(ref)]
        ccoef = [1.0]
        crmsd = [0.0]
        for col in range(1, len(tsd.columns)):
            std.append(np.std(tsd.iloc[:, col]))
            ccoef.append(np.corrcoef(tsd.iloc[:, col],
                                     ref)[0][1])
            crmsd.append(centered_rms_dev(tsd.iloc[:, col].values,
                                          ref.values))
        taylor_diagram(np.array(std),
                       np.array(crmsd),
                       np.array(ccoef))
    elif type in ['target']:
        from .. skill_metrics import centered_rms_dev
        from .. skill_metrics import rmsd
        from .. skill_metrics import bias
        from .. skill_metrics import target_diagram
        biases = []
        rmsds = []
        crmsds = []
        ref = tsd.iloc[:, 0].values
        for col in range(1, len(tsd.columns)):
            biases.append(bias(tsd.iloc[:, col].values, ref))
            crmsds.append(centered_rms_dev(tsd.iloc[:, col].values,
                                           ref))
            rmsds.append(rmsd(tsd.iloc[:, col].values,
                              ref))
        target_diagram(np.array(biases),
                       np.array(crmsds),
                       np.array(rmsds))
    elif type in ['xy',
                  'double_mass']:
        # PANDAS was not doing the right thing with xy plots
        # if you wanted lines between markers.
        # Fell back to using raw matplotlib.
        # Boy I do not like matplotlib.

        for colindex in range(colcnt):
            ndf = tsd.iloc[:, colindex*2:colindex*2 + 2]
            if type == 'double_mass':
                ndf = ndf.dropna().cumsum()
            oxdata = np.array(ndf.iloc[:, 0])
            oydata = np.array(ndf.iloc[:, 1])

            plotdict[(logx, logy)](oxdata,
                                   oydata,
                                   linestyle=next(ilinestyles),
                                   color=next(icolors),
                                   marker=next(imarkerstyles),
                                   label=lnames[colindex],
                                   drawstyle=drawstyle)

        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        if legend is True:
            ax.legend(loc='best')

        if type == 'double_mass':
            xtitle = xtitle or 'Cumulative {0}'.format(tsd.columns[0])
            ytitle = ytitle or 'Cumulative {0}'.format(tsd.columns[1])

    elif type in ['norm_xaxis',
                  'norm_yaxis',
                  'lognorm_xaxis',
                  'lognorm_yaxis',
                  'weibull_xaxis',
                  'weibull_yaxis']:
        ppf = tsutils.set_ppf(type.split('_')[0])
        ys = tsd.iloc[:, :]

        for colindex in range(colcnt):
            # Values sorted in descending order against their plotting
            # positions transformed by ``ppf``.
            oydata = np.array(ys.iloc[:, colindex].dropna())
            oydata = np.sort(oydata)[::-1]
            n = len(oydata)
            norm_axis = ax.xaxis
            oxdata = ppf(tsutils.set_plotting_position(n,
                                                       plotting_position))

            if type in ['norm_yaxis',
                        'lognorm_yaxis',
                        'weibull_yaxis']:
                oxdata, oydata = oydata, oxdata
                norm_axis = ax.yaxis

            plotdict[(logx, logy)](oxdata,
                                   oydata,
                                   linestyle=next(ilinestyles),
                                   color=next(icolors),
                                   marker=next(imarkerstyles),
                                   label=lnames[colindex],
                                   drawstyle=drawstyle)

        # Make it pretty
        xtmaj = np.array([0.01, 0.1, 0.5, 0.9, 0.99])
        xtmaj_str = ['1', '10', '50', '90', '99']
        xtmin = np.concatenate([np.linspace(0.001, 0.01, 10),
                                np.linspace(0.01, 0.1, 10),
                                np.linspace(0.1, 0.9, 9),
                                np.linspace(0.9, 0.99, 10),
                                np.linspace(0.99, 0.999, 10)])
        xtmaj = ppf(xtmaj)
        xtmin = ppf(xtmin)

        norm_axis.set_major_locator(FixedLocator(xtmaj))
        norm_axis.set_minor_locator(FixedLocator(xtmin))

        if type in ['norm_xaxis',
                    'lognorm_xaxis',
                    'weibull_xaxis']:
            ax.set_xticklabels(xtmaj_str)
            ax.set_ylim(ylim)
            ax.set_xlim(ppf(xlim))

        elif type in ['norm_yaxis',
                      'lognorm_yaxis',
                      'weibull_yaxis']:
            ax.set_yticklabels(xtmaj_str)
            ax.set_xlim(xlim)
            ax.set_ylim(ppf(ylim))

        if type in ['norm_xaxis',
                    'norm_yaxis']:
            xtitle = xtitle or 'Normal Distribution'
            ytitle = ytitle or tsd.columns[0]
        elif type in ['lognorm_xaxis',
                      'lognorm_yaxis']:
            xtitle = xtitle or 'Log Normal Distribution'
            ytitle = ytitle or tsd.columns[0]
        elif type in ['weibull_xaxis',
                      'weibull_yaxis']:
            xtitle = xtitle or 'Weibull Distribution'
            ytitle = ytitle or tsd.columns[0]

        # The titles were chosen for the *_xaxis layout; swap them when the
        # distribution is on the y axis.
        if type in ['norm_yaxis',
                    'lognorm_yaxis',
                    'weibull_yaxis']:
            xtitle, ytitle = ytitle, xtitle

        if legend is True:
            ax.legend(loc='best')

    elif type in ['kde',
                  'probability_density']:
        ax = tsd.plot(kind='kde', legend=legend, subplots=subplots,
                      sharex=sharex, sharey=sharey, style=None, logx=logx,
                      logy=logy, xlim=xlim, ylim=ylim, secondary_y=secondary_y,
                      figsize=figsize)
        for index, line in enumerate(ax.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        ytitle = ytitle or 'Density'
        if legend is True:
            plt.legend(loc='best')
    elif type == 'kde_time':
        # ``scipy.stats.kde`` is a deprecated private path; the public
        # location works on old and new SciPy alike.
        from scipy.stats import gaussian_kde
        _, (ax0, ax1) = plt.subplots(nrows=1,
                                     ncols=2,
                                     sharey=True,
                                     figsize=figsize,
                                     gridspec_kw={'width_ratios': [1, 4]})
        tsd.plot(legend=legend, subplots=subplots, sharex=sharex,
                 sharey=sharey, style=None, logx=logx, logy=logy, xlim=xlim,
                 ylim=ylim, secondary_y=secondary_y, mark_right=mark_right,
                 figsize=figsize, drawstyle=drawstyle, ax=ax1)
        for index, line in enumerate(ax1.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        xtitle = xtitle or 'Time'
        # Evaluate each density over the y range of the time plot so the
        # two panels share the same vertical axis.
        ylimits = ax1.get_ylim()
        ny = np.linspace(ylimits[0], ylimits[1], 1000)
        for col in range(len(tsd.columns)):
            xvals = tsd.iloc[:, col].dropna().values
            pdf = gaussian_kde(xvals)
            ax0.plot(pdf(ny),
                     ny,
                     linestyle=style[col][2:],
                     color=style[col][0],
                     marker=style[col][1],
                     label=tsd.columns[col],
                     drawstyle=drawstyle)
        ax0.set(xlabel='Probability Density', ylabel=ytitle)
    elif type == 'boxplot':
        tsd.boxplot(figsize=figsize)
    elif type == 'scatter_matrix':
        from pandas.plotting import scatter_matrix
        # Accept the correct spelling as well as the historical misspelling
        # that earlier versions of this check required.
        if scatter_matrix_diagonal in ('probability_density',
                                       'probablity_density'):
            scatter_matrix_diagonal = 'kde'
        scatter_matrix(tsd,
                       diagonal=scatter_matrix_diagonal,
                       figsize=figsize)
    elif type == 'lag_plot':
        from pandas.plotting import lag_plot
        lag_plot(tsd,
                 lag=lag_plot_lag)
        xtitle = xtitle or 'y(t)'
        ytitle = ytitle or 'y(t+{0})'.format(short_freq or 1)
    elif type == 'autocorrelation':
        from pandas.plotting import autocorrelation_plot
        autocorrelation_plot(tsd)
        xtitle = xtitle or 'Time Lag {0}'.format(short_freq)
    elif type == 'bootstrap':
        from pandas.plotting import bootstrap_plot
        bootstrap_plot(tsd,
                       size=bootstrap_size,
                       samples=bootstrap_samples,
                       color='gray')
    elif type == 'heatmap':
        # Find beginning and end years
        byear = tsd.index[0].year
        eyear = tsd.index[-1].year
        tsd = tsutils.asbestfreq(tsd)
        if tsd.index.freqstr != 'D':
            raise ValueError("""
*
*  The "heatmap" plot type can only work with daily time series.
*
""")
        dr = pd.date_range('{0}-01-01'.format(byear),
                           '{0}-12-31'.format(eyear),
                           freq='D')
        ntsd = tsd.reindex(index=dr)
        # ``pd.TimeGrouper`` was removed in pandas 1.0; ``pd.Grouper`` is
        # the supported equivalent.
        groups = ntsd.iloc[:, 0].groupby(pd.Grouper(freq='A'))
        years = pd.DataFrame()
        for name, group in groups:
            ngroup = group.values
            # Pad 365-day years with a NaN so every row has 366 entries.
            if len(group.values) == 365:
                ngroup = np.append(group.values, [np.nan])
            years[name.year] = ngroup
        years = years.T
        plt.imshow(years,
                   interpolation=None,
                   aspect='auto')
        plt.colorbar()
        yticks = list(range(byear, eyear + 1))
        skip = len(yticks)//20 + 1
        plt.yticks(range(0, len(yticks), skip), yticks[::skip])
        mnths = [0, 30, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]
        mnths_labels = ['Jan',
                        'Feb',
                        'Mar',
                        'Apr',
                        'May',
                        'Jun',
                        'Jul',
                        'Aug',
                        'Sep',
                        'Oct',
                        'Nov',
                        'Dec']
        plt.xticks(mnths, mnths_labels)
        grid = False
    elif (type == 'bar' or
          type == 'bar_stacked' or
          type == 'barh' or
          type == 'barh_stacked'):
        stacked = False
        if type[-7:] == 'stacked':
            stacked = True
        kind = 'bar'
        if type[:4] == 'barh':
            kind = 'barh'
        ax = tsd.plot(kind=kind, legend=legend, stacked=stacked,
                      style=style, logx=logx, logy=logy, xlim=xlim,
                      ylim=ylim, figsize=figsize)
        for index, line in enumerate(ax.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        freq = tsutils.asbestfreq(tsd, force_freq=force_freq).index.freqstr
        if freq is not None:
            # Number of leading characters of each tick label to keep,
            # chosen from the frequency string.
            if 'A' in freq:
                endchar = 4
            elif 'M' in freq:
                endchar = 7
            elif 'D' in freq:
                endchar = 10
            elif 'H' in freq:
                endchar = 13
            else:
                endchar = None
            nticklabels = []
            if kind == 'bar':
                taxis = ax.xaxis
            else:
                taxis = ax.yaxis
            for index, i in enumerate(taxis.get_majorticklabels()):
                # Only every ``label_skip``-th label is shown.
                if index % label_skip:
                    nticklabels.append(' ')
                else:
                    nticklabels.append(i.get_text()[:endchar])
            taxis.set_ticklabels(nticklabels)
            plt.setp(taxis.get_majorticklabels(), rotation=label_rotation)
        if legend is True:
            plt.legend(loc='best')
    elif type == 'histogram':
        tsd.hist(figsize=figsize)
    else:
        raise ValueError("""
*
*   Plot 'type' {0} is not supported.
*
""".format(type))

    if xy_match_line:
        if isinstance(xy_match_line, str):
            xymsty = xy_match_line
        else:
            xymsty = 'g--'
        # Draw the 1:1 line, then restore the limits it may have changed.
        nxlim = ax.get_xlim()
        nylim = ax.get_ylim()
        maxt = max(nxlim[1], nylim[1])
        mint = min(nxlim[0], nylim[0])
        ax.plot([mint, maxt], [mint, maxt], xymsty, zorder=1)
        ax.set_ylim(nylim)
        ax.set_xlim(nxlim)

    plt.xlabel(xtitle)
    plt.ylabel(ytitle)

    if invert_xaxis is True:
        plt.gca().invert_xaxis()
    if invert_yaxis is True:
        plt.gca().invert_yaxis()

    plt.grid(grid)

    plt.title(title)
    plt.tight_layout()
    if ofilename is None:
        return plt
    plt.savefig(ofilename)
Exemplo n.º 37
0
    # Debug aid: check that the dataset loaded correctly
    # print(dataset.shape)

    ######## 3 #########

    # Show an informational message to the user
    st.info("Here is a description of your dataset")

    # Summary statistics of the dataset.
    # Talking point: st.write decides by itself how to render what it is
    # given (text, a dataset, or anything else).
    st.write(dataset.describe())

    st.info("Here is a plot to see distributions and correlations")

    # Draw the scatter matrix with histograms on the diagonal
    scatter_matrix(dataset, diagonal="hist")

    # NOTE(review): presumably silences the warning about calling
    # st.pyplot() without an explicit figure - confirm against the docs
    # of `st` (not defined in this fragment).
    st.set_option('deprecation.showPyplotGlobalUse', False)

    # Display the plot (no figure argument is passed)
    st.pyplot()

    ######## 4 #########

    # Train the model

    # Select the feature(s) x and the target y; only "Temperature" is
    # currently used as a feature.
    # x = dataset.iloc[:,[ 0, 1, 2, 3]].values
    x = dataset.loc[:, ["Temperature"]]
    # x = dataset.iloc[:,[ 0]].values
    # y = dataset.iloc[:, -1].values
Exemplo n.º 38
0
def class_wise_scatter(data_frame):
    """Show a scatter matrix of ``data_frame``.

    The matrix is drawn half-transparent on a 6x6 figure with kernel
    density estimates on the diagonal, then displayed with ``plt.show()``.
    """
    matrix_options = {'alpha': 0.5, 'figsize': (6, 6), 'diagonal': 'kde'}
    scatter_matrix(data_frame, **matrix_options)
    plt.show()
Exemplo n.º 39
0
# %%
import pandas as pd
from pandas.plotting import scatter_matrix

# Load the data set into a DataFrame
df = pd.read_csv('auto-mpg.csv')

# Show which columns are available
print('Column names are: ', list(df.columns))

scatter_matrix(df, alpha=0.4, figsize=(7, 7))

# The target (y) is the mpg column; pop() also removes it from df
y = df.pop('mpg')

# The feature matrix X holds displacement, cylinders, weight,
# acceleration and model year
feature_columns = [
    'displacement',
    'cylinders',
    'weight',
    'acceleration',
    'model year',
]
X = df[feature_columns]

#%%
# Import the necessary helper from scikit-learn
from sklearn.model_selection import train_test_split

# Split the data: one third for testing, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Import the module
from sklearn.linear_model import LinearRegression
Exemplo n.º 40
0
# Summary statistics
print(dataset.describe())

# Number of rows per class
print(dataset.groupby('class').size())

# Box and whisker plots, one subplot per column on a 2x2 grid
dataset.plot(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)
plt.show()

# Histograms
dataset.hist()
plt.show()

# Scatter plot matrix
scatter_matrix(dataset)
plt.show()

# Split out a validation dataset: the first four columns are the inputs,
# the fifth column is the output
array = dataset.values
X, Y = array[:, 0:4], array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
Exemplo n.º 41
0
import pickle

dataset = pd.read_csv('data_banknote.csv')

print(dataset.shape)

# Colour lookup for the scatter plot: index 0 -> red, index 1 -> green
colors = ['red', 'green']

# One colour per row, looked up from that row's Class value
point_colors = dataset.Class.apply(lambda x: colors[x])

# Scatter matrix of the whole dataset, coloured by Class
scatter_matrix(dataset, figsize=[20, 20], marker='.', c=point_colors)
plotting.show()

# Correlation matrix drawn as a heatmap
corrmat = dataset.corr()
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()

# Create the validation dataset: the first four columns are the inputs,
# the fifth column is the output
array = dataset.values
X, y = array[:, 0:4], array[:, 4]
validation_size = 0.20
seed = 7
Exemplo n.º 42
0
def main():
    """Run the housing workflow end to end: load, split, explore, prepare, fit.

    Steps, in order:
      1. Load the data with ``load_housing_data()``, call ``train_test_func``
         (its result is not used further here) and build a stratified 80/20
         split on an income category derived from ``median_income``.
      2. Draw exploratory plots (scatter matrix, income vs. house value) and
         compute correlation matrices over the numeric columns.
      3. Build the preprocessing pipeline: median imputation, extra ratio
         features, standard scaling, one-hot encoding of ``ocean_proximity``.
      4. Fit a ``LinearRegression`` and a ``DecisionTreeRegressor`` on the
         prepared training set and print their errors on that same set.

    Side effects: prints predictions and error metrics to stdout, creates
    matplotlib figures (they are neither shown nor saved here), and sets the
    module-level names ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix``.
    """
    ### Fetching data

    # fetch_housing_data()
    housing = load_housing_data()

    ### Exploring data to gain insights
    # print(housing.head())
    # print(housing.info())
    # print(housing['ocean_proximity'].value_counts())
    # print(housing.describe())
    # plot_hist(housing)

    ### First train/test split (presumably random sampling -- see
    ### train_test_func); the stratified split below is what is used afterwards.
    train_set, test_set = train_test_func(housing)

    ### Exploring test_set
    # print(test_set.head())
    # housing['median_income'].hist()
    # plt.show()

    ### To limit the number of income categories, divide by 1.5 and round up
    housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)

    ### Cap the category: every value that is not below 5 becomes 5.0.
    # Assign the result back instead of calling where(..., inplace=True) on
    # housing['income_cat']: the chained in-place form acts on a temporary
    # under pandas Copy-on-Write and would leave `housing` unchanged.
    housing['income_cat'] = housing['income_cat'].where(
        housing['income_cat'] < 5, 5.0)

    # housing['income_cat'].hist()
    # print(housing['income_cat'].value_counts())
    # plt.show()

    ### Create train and test set from data using stratified sampling
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing['income_cat']):
        # split() yields positional row indices, so select by position; this
        # equals label lookup only when the index is the default 0..n-1 range.
        strat_train_set = housing.iloc[train_index]
        strat_test_set = housing.iloc[test_index]

    # print(strat_test_set['income_cat'].value_counts() / len(strat_test_set))
    # print(housing["income_cat"].value_counts() / len(housing))

    # The helper column was only needed for stratification
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    ### Discover and visualize the data to gain insights
    # housing = strat_train_set.copy()
    # housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
    # save_fig("better_visual_plot")

    # housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    #    s=housing["population"]/100, label="population", figsize=(10,7),
    #    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    #    sharex=False)
    # plt.legend()
    # save_fig("housing_prices_scatterplot")

    # Restrict to numeric columns explicitly: `housing` still contains the
    # string column 'ocean_proximity', and DataFrame.corr() raises on
    # non-numeric data from pandas 2.0 onwards.
    corr_matrix = housing.select_dtypes(include=[np.number]).corr()
    # print(corr_matrix["median_house_value"].sort_values(ascending=False))

    attribs = [
        "median_house_value", "median_income", "total_rooms",
        "housing_median_age"
    ]

    scatter_matrix(housing[attribs], figsize=(12, 8))
    # save_fig("scatter_matrix_plot")

    housing.plot(kind="scatter",
                 x="median_income",
                 y="median_house_value",
                 alpha=0.1)
    plt.axis([0, 16, 0, 550000])
    # save_fig("income_vs_house_value_scatterplot")

    # Ratio features, added here only to inspect their correlations
    housing["rooms_per_household"] = (housing["total_rooms"] /
                                      housing["households"])
    housing["bedrooms_per_room"] = (housing["total_bedrooms"] /
                                    housing["total_rooms"])
    housing["population_per_household"] = (housing["population"] /
                                           housing["households"])

    corr_matrix = housing.select_dtypes(include=[np.number]).corr()
    # print(corr_matrix["median_house_value"].sort_values(ascending=False))

    ### Prepare data for machine learning algo

    housing = strat_train_set.drop("median_house_value",
                                   axis=1)  # drop labels for training set
    housing_labels = strat_train_set["median_house_value"].copy()

    # First few rows that contain at least one missing value
    sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
    # print(sample_incomplete_rows)
    # print(sample_incomplete_rows.dropna(subset=["total_bedrooms"]))  # drop rows with na
    # print(sample_incomplete_rows.drop("total_bedrooms", axis=1))  # drop column with na

    ### Impute the missing values
    try:
        from sklearn.impute import SimpleImputer  # Scikit-Learn 0.20+
    except ImportError:
        from sklearn.preprocessing import Imputer as SimpleImputer

    imputer = SimpleImputer(strategy="median")

    ### Remove the categorical column before fitting the imputer
    housing_num = housing.drop('ocean_proximity', axis=1)
    imputer.fit(housing_num)
    # print(imputer.statistics_)

    ### Transform the training set
    X = imputer.transform(housing_num)
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing.index)

    housing_cat = housing[['ocean_proximity']]

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import LabelBinarizer

    encoder_LB = LabelBinarizer()
    housing_cat_LB_1hot = encoder_LB.fit_transform(housing_cat)
    # print(housing_cat_LB_1hot)
    # print(housing.columns)

    # Column positions used by add_extra_features below. They are published as
    # module-level globals, as in the original code.
    global rooms_ix, bedrooms_ix, population_ix, household_ix
    rooms_ix, bedrooms_ix, population_ix, household_ix = [
        list(housing.columns).index(col)
        for col in ("total_rooms", "total_bedrooms", "population",
                    "households")
    ]

    from sklearn.preprocessing import FunctionTransformer

    def add_extra_features(X, add_bedrooms_per_room=True):
        """Append per-household ratio columns to the 2-D array ``X``.

        Adds rooms-per-household and population-per-household, plus
        bedrooms-per-room when ``add_bedrooms_per_room`` is true. Columns are
        located through the ``*_ix`` indices computed above.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

    attr_adder = FunctionTransformer(add_extra_features,
                                     validate=False,
                                     kw_args={"add_bedrooms_per_room": False})
    housing_extra_attribs = attr_adder.fit_transform(housing.values)

    housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing.columns) +
        ["rooms_per_household", "population_per_household"],
        index=housing.index)
    # print(housing_extra_attribs.head(10))

    # Numeric pipeline: impute -> add ratio features -> standardize
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder',
         FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)
    # print(housing_num_tr)

    from sklearn.compose import ColumnTransformer

    # Numeric columns go through num_pipeline; the categorical one is one-hot
    # encoded.
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)
    # print(housing_prepared)
    # print(housing_prepared.shape)
    # print(housing_labels.shape)

    from sklearn.linear_model import LinearRegression

    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # print(housing.iloc[:5])
    # print(housing_prepared.shape)

    # Spot-check the linear model on the first five training rows
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    # print(some_data.shape)

    some_data_prepared = full_pipeline.transform(some_data)
    print('prediction:', lin_reg.predict(some_data_prepared))
    print('Actual:', list(some_labels))

    from sklearn.metrics import mean_squared_error

    # Errors below are measured on the training data itself
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    from sklearn.metrics import mean_absolute_error

    lin_mae = mean_absolute_error(housing_labels, housing_predictions)
    print(lin_mae)

    from sklearn.tree import DecisionTreeRegressor

    tree_reg = DecisionTreeRegressor(random_state=42)
    tree_reg.fit(housing_prepared, housing_labels)

    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)
Exemplo n.º 43
0
    ## Ax4 
    gl.plot(days_keys,ACCDIST, ax = ax4, labels = ["","","ACCDIST"],AxesStyle = "Normal", 
            alpha = alpha_stem,  color = "k", legend = ["ACCDIST"], fill = 0)
    
    # Set final properties and save figure
    gl.subplots_adjust(left=.09, bottom=.10, right=.90, top=.95, wspace=.05, hspace=0.05)
    gl.set_fontSizes(ax = [ax1,ax2,ax3,ax4], title = 20, xlabel = 20, ylabel = 20, 
                      legend = 15, xticks = 12, yticks = 12)
    gl.savefig(folder_images + image_name, 
           dpi = 100, sizeInches = [20, 7])

# %%
if plotting_variables:
    from pandas.plotting import scatter_matrix

    # Standardize the regression target in place: subtract its mean and divide
    # by its standard deviation.
    data_df["Target_reg"] = (data_df["Target_reg"] - np.mean(data_df["Target_reg"]))/np.std(data_df["Target_reg"])

    scatter_matrix(data_df[["Target_reg","day_1","week_1","Target_1","Daily_gap_1","HMA_1"]])
    # Size and save the figure BEFORE plt.show(): with a blocking backend the
    # figure is destroyed once its window is closed, so a later plt.gcf()
    # would return a new empty figure and the saved image would be blank.
    plt.gcf().set_size_inches( 10, 10 )
    plt.savefig(folder_images +'variables_1.png', dpi = 100) ## Variables
    plt.show()

    scatter_matrix(data_df[["Target_reg","week_1","Target_1","Target_2","Target_3","RSI_1","MACD_1","ACCDIST_1"]])
    # Same ordering as above: resize, save, then show.
    plt.gcf().set_size_inches( 10, 10 )
    plt.savefig(folder_images +'variables_2.png', dpi = 100) ## Variables
    plt.show()