Example #1

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8),
                xycoords=ax.transAxes,
                size=20)
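# (corr_func is defined here but never mapped in this snippet; presumably it was
# meant to be used as grid.map_lower(corr_func), as in the later examples below)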


# Create the pairgrid object
grid = sns.PairGrid(data=plot_data,
                    height=3,  # 'size' was renamed to 'height' in seaborn 0.9
                    diag_sharey=False,
                    hue='TARGET',
                    vars=[x for x in list(plot_data.columns) if x != 'TARGET'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal is a KDE plot
grid.map_diag(sns.kdeplot)

# Lower triangle is a 2-D density plot
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r)

plt.suptitle('Ext Source and Age Features Pairs Plot', size=32, y=1.05)

# Plot the Age distribution
# (assumed: a plain FacetGrid; the grid creation line was missing from the snippet)
g = sns.FacetGrid(titanic_train)
g.map(sns.distplot, "Age")

g = sns.FacetGrid(titanic_train, row="Sex")
g.map(sns.distplot, "Fare")

g = sns.FacetGrid(titanic_train, row="Pclass", col="Sex")
g.map(sns.distplot, "Age")
g.map(sns.kdeplot, "Age")

g = sns.FacetGrid(titanic_train, col="Survived")
g.map(plt.scatter, "Parch", "SibSp")

g = sns.FacetGrid(titanic_train, row="Pclass", col="Sex", hue="Survived")
g.map(sns.kdeplot, "Age")

plt.xlim(0, 250)
plt.ylim(0, 60)

# Interactions across each pair of features (pair plot and grid)
pairable_features = ["SibSp", "Parch", "Fare", "Age", "Survived"]
g = sns.PairGrid(titanic_train[pairable_features])
g.map_upper(sns.regplot)
g.map_lower(sns.residplot)
g.map_diag(sns.kdeplot)
g.add_legend()

# Joint distributions (joint plot and joint grid)
g = sns.JointGrid(x="SibSp", y="Parch", data=titanic_train)
g.plot_joint(sns.regplot, order=2)
g.plot_marginals(sns.distplot)
Example #3
def pair_corr(data,
              hue=None,
              hue_order=None,
              palette=None,
              vars=None,
              top: int = None,
              x_vars=None,
              y_vars=None,
              diag_kind='hist',
              height=2.5,
              aspect=1,
              dropna=True,
              size=None,
              group=None,
              corr_method='pearson',
              log_base=2,
              log_additive=1,
              prefix='PairCorr',
              fig_format='png'):
    if isinstance(data, str):
        data = pd.read_csv(data,
                           header=0,
                           index_col=0,
                           sep=None,
                           engine='python')

    group_dict = dict()
    if isinstance(group, str):
        tmp_dict = dict(x.strip().split()[:2] for x in open(group))
        for k, v in tmp_dict.items():
            group_dict.setdefault(v, list())
            group_dict[v].append(k)
    elif isinstance(group, dict):
        group_dict = group
    elif group is None:
        pass
    else:
        raise Exception(
            'value of group should be a group file or a python dict!')

    if not group_dict:
        group_dict = {'all': data.columns}

    for group_name, samples in group_dict.items():
        tdata = data[samples]
        # Keep only rows with a positive value in every sample
        tdata = tdata[
            tdata.apply(lambda x: sum(x > 0), axis=1) >= len(samples)]
        tdata.to_csv(f'{group_name}.comm.data.xls', sep='\t')
        if top is not None:
            mean_expr = tdata.mean(axis=1).sort_values(ascending=False)
            tdata = tdata.loc[mean_expr.index[:top]]
        if log_base == 2:
            tdata = np.log2(tdata + log_additive)
        elif log_base == 10:
            tdata = np.log10(tdata + log_additive)
        elif log_base in [0, 1]:
            print('no log transformation')
        else:
            raise Exception(f'log base {log_base} is not supported!')
        if size is not None:
            height = size  # 'size' was renamed to 'height' in seaborn 0.9
        gridplot = sns.PairGrid(tdata,
                                hue=hue,
                                hue_order=hue_order,
                                palette=palette,
                                vars=vars,
                                x_vars=x_vars,
                                y_vars=y_vars,
                                height=height,
                                aspect=aspect,
                                dropna=dropna)
        if diag_kind == 'hist':
            gridplot.map_diag(plt.hist)
        elif diag_kind == 'scatter':
            gridplot.map_diag(sns.scatterplot)
        elif diag_kind == 'kde':
            # map_diag, not map: map() would draw KDEs in every cell, not just the diagonal
            gridplot.map_diag(sns.kdeplot)
        else:
            gridplot.map_diag(sns.scatterplot)
        for ax, col in zip(np.diag(gridplot.axes), tdata.columns):
            ax.set_xlabel(col)
        # lower
        gridplot.map_lower(sns.scatterplot)
        gridplot.map_lower(sns.regplot, scatter=False)
        gridplot.map_lower(corr_annotate, method=corr_method)

        # upper
        gridplot.map_upper(corr_annotate,
                           method=corr_method,
                           x_pos=0.15,
                           y_pos=0.6,
                           font_size=14)

        plt.savefig(prefix +
                    f'.{group_name}.common{tdata.shape[0]}genes.{fig_format}',
                    dpi=300)
        plt.close()
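The corr_annotate helper mapped above is not defined in this snippet. A minimal sketch of what it plausibly does, with the method, x_pos, y_pos, and font_size parameters inferred from the calls above (the exact behavior is an assumption):

def corr_annotate(x, y, method='pearson', x_pos=0.2, y_pos=0.8, font_size=11, **kwargs):
    # Assumed behavior: compute the pairwise correlation and write it into
    # the current axes in axes-fraction coordinates.
    r = pd.Series(x).corr(pd.Series(y), method=method)
    ax = plt.gca()
    ax.annotate(f'{method[:1]}={r:.2f}', xy=(x_pos, y_pos),
                xycoords=ax.transAxes, size=font_size)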
Example #4
    def pairanalyzer(self, df, hue=None, palette=None, vars=None,
             lowerkind="boxscatter", diag_kind="kde", markers=None,
             height=2.5, aspect=1, dropna=True,
             lower_kws={}, diag_kws={}, grid_kws={}, size=None):
        # Store member variables
        if diag_kind == "hist":  # For histogram display, exclude bool columns when loading the data
            self.df = df.select_dtypes(exclude=bool)
        else:  # For KDE display, load the data including bool columns
            self.df = df
        self.hue = hue
        self.corr_mat = df.corr(method="pearson")
        # Adjust the font size
        sns.set_context("notebook")

        # Create the PairGrid instance (the 'size' kwarg was removed in newer seaborn)
        plt.figure()
        diag_sharey = diag_kind == "hist"
        g = sns.PairGrid(self.df, hue=self.hue,
                 palette=palette, vars=vars, diag_sharey=diag_sharey,
                 height=height, aspect=aspect, dropna=dropna, **grid_kws)

        # Set the markers
        if markers is not None:
            if g.hue_names is None:
                n_markers = 1
            else:
                n_markers = len(g.hue_names)
            if not isinstance(markers, list):
                markers = [markers] * n_markers
            if len(markers) != n_markers:
                raise ValueError(("markers must be a singleton or a list of "
                                "markers for each level of the hue variable"))
            g.hue_kws = {"marker": markers}

        # Plot a histogram or KDE on the diagonal
        if diag_kind == "hist":
            g.map_diag(plt.hist, **diag_kws)
        elif diag_kind == "kde":
            diag_kws.setdefault("shade", True)
            diag_kws["legend"] = False
            g.map_diag(sns.kdeplot, **diag_kws)

        # Count the unique values of each variable
        nuniques = []
        for col_name in g.x_vars:
            col_data = self.df[col_name]
            nuniques.append(len(col_data.dropna().unique()))

        # Plot scatter plots etc. in the lower triangle
        if lowerkind == "boxscatter":
            if min(nuniques) <= 2:  # Use box plots only when some variable has at most 2 unique values
                self._boxscatter_lower(g, **lower_kws)
            else:  # Otherwise scatter plots (_boxscatter_lower would make the legend markers disappear)
                g.map_lower(sns.scatterplot, **lower_kws)
        elif lowerkind == "scatter":
            g.map_lower(sns.scatterplot, **lower_kws)
        else:
            g.map_lower(sns.regplot, **lower_kws)

        # Plot and show correlation coefficients, branching on whether hue coloring is used
        # Without hue
        if self.hue is None:
            # Show correlation coefficients in the upper triangle
            self.hue_names = None
            self._corrall_upper(g)
        # With hue
        else:
            # Show correlation coefficients in the upper triangle (colored per hue level; overall coefficient in black)
            self.hue_names = g.hue_names
            g.map_upper(self._corrfunc)
            self._corrall_upper(g)
            g.add_legend()
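The _boxscatter_lower, _corrfunc, and _corrall_upper helpers are not included in this snippet. A rough sketch of the two correlation annotators, assuming the annotate-in-axes-fraction pattern used elsewhere in these examples (names and placement are assumptions):

    def _corrfunc(self, x, y, **kws):
        # Assumed: annotate the per-hue correlation in the current axes
        r = np.corrcoef(x, y)[0][1]
        ax = plt.gca()
        ax.annotate("r = {:.2f}".format(r), xy=(.1, .85), xycoords=ax.transAxes)

    def _corrall_upper(self, g):
        # Assumed: write the overall correlation from self.corr_mat into each upper-triangle cell
        for i, yv in enumerate(g.y_vars):
            for j, xv in enumerate(g.x_vars):
                if i < j:
                    ax = g.axes[i, j]
                    r = self.corr_mat.loc[xv, yv]
                    ax.annotate("r = {:.2f}".format(r), xy=(.1, .95), xycoords=ax.transAxes)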
Example #5
    df_test = pd.DataFrame()

    # Add columns
    df_test['u-g'] = X_test[:num_data, 0]
    df_test['g-r'] = X_test[:num_data, 1]
    df_test['r-i'] = X_test[:num_data, 2]
    df_test['i-z'] = X_test[:num_data, 3]
    df_test['mag(i)'] = X_test[:num_data, 4]
    df_test['redshift'] = y_test[:num_data]

    df_all = pd.concat(
        [df_train.assign(dataset='train'),
         df_test.assign(dataset='test')])

    #### plotting ####
    g = sns.PairGrid(df_all, hue='dataset')
    g = g.map_upper(sns.scatterplot, alpha=0.8)
    g = g.map_lower(sns.kdeplot,
                    n_levels=6,
                    shade=True,
                    shade_lowest=False,
                    alpha=0.7)
    g = g.map_diag(sns.kdeplot, lw=2, shade=True)
    # g = g.map_diag(sns.distplot, hist=True, kde=False)
    g = g.add_legend()

    g.savefig("pairplot_best.png")

    plt.clf()  # Clear the pairplot figure from sns
    Image(filename='pairplot_best.png')  # Show pairplot as an image (requires: from IPython.display import Image)
correlations = features.corr()["Global"].dropna().sort_values()

# Extract the columns to plot
plot_data = features[["Global", "Critic_Score", "User_Score",
                      "Critic_Count", "User_Count"]]

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size=20)
    
# Create the pairgrid object
grid = sns.PairGrid(data=plot_data, height=3)  # 'size' was renamed to 'height' in seaborn 0.9

# Upper is a scatter plot
grid.map_upper(plt.scatter, color = "lightskyblue", alpha = 0.6, marker=".", s=10)

# Diagonal is a histogram
grid.map_diag(plt.hist, color = "lightskyblue", edgecolor="black")

# Bottom is correlation and density plot
grid.map_lower(corr_func)
grid.map_lower(sns.kdeplot, cmap = plt.cm.Blues)

# Title for entire plot
plt.suptitle("Pairs Plot of Game Scores", size = 36, y = 1.02);

Example #7
#%%
# Generate correlation heatmap
colormap = sns.diverging_palette(220, 10, as_cmap=True)  # defined but unused: "coolwarm" is passed below
f, ax = plt.subplots(figsize=(10, 10))
corr = df_census.corr()

sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f",
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

#%%
# Pairplot matrix.  
#%%
g = sns.PairGrid(df_census,
                 vars=['age', 'fnlwgt', 'capital_gain',
                       'capital_loss', 'hours_per_week'],
                 hue='income_bracket', palette='muted')
g.map(plt.scatter, alpha=0.8)
g.add_legend();

#%%
df_age = df_census.loc[:,['gender', 'age', 'income_bracket']]
conditions = [
    (df_age['age'] < 20),
    (df_age['age'] < 30),
    (df_age['age'] < 40),
    (df_age['age'] < 50),
    (df_age['age'] < 60),
    (df_age['age'] < 70),
    (df_age['age'] < 110)]
choices = ['10-20', '20-30', '30-40','40-50','50-60','60-70','70-110']
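# Assumed continuation (cut off in the snippet): apply the binning with np.select
df_age['age_group'] = np.select(conditions, choices)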
Example #8
    if found:
        if 'DREAM' in algs[i]:
            if 'c' in AE:
                samp_lat = vec2img(samp)
                width = tuple(np.mod(i, 2) for i in samp_lat.shape[1:])
                samp_lat = chop(samp_lat, width)[None, :, :, None] if autoencoder.activations['latent'] is None else samp_lat.flatten()[None, :]
                samp = (pad(np.squeeze(autoencoder.decode(samp_lat)), width)).reshape((samp.shape[0], -1))
            else:
                samp = autoencoder.decode(samp)
        samps[i] = samp
#         samp=pd.DataFrame(samp,columns=['$u_{}$'.format(i) for i in np.arange(1,samp.shape[1]+1)])
#         g=sns.PairGrid(samp,diag_sharey=False)
#         g.map_upper(plt.scatter)
#         g.map_lower(sns.kdeplot)
#         g.map_diag(sns.kdeplot,lw=2,legend=False)
# #         g.fig.suptitle(algs[i])
# #         g.fig.subplots_adjust(top=0.95)
#         plt.savefig(os.path.join(folder,algs[i]+'_dist.png'),bbox_inches='tight')

# form a big data frame
alg_array = np.hstack([[alg_names[i]] * num_samp for i in range(num_algs)])
df_samps = pd.DataFrame(samps.reshape((-1, lin.input_dim)),
                        columns=['$u_{}$'.format(i) for i in np.arange(1, lin.input_dim + 1)])
df_samps['algorithm'] = alg_array.flatten()
g = sns.PairGrid(df_samps, hue='algorithm', diag_sharey=False)
g.map_upper(plt.scatter, s=1, alpha=0.5)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=2)
g.add_legend()
# g.fig.suptitle('MCMC')
# g.fig.subplots_adjust(top=0.95)
plt.savefig(os.path.join(folder, 'allmcmc_dist.png'), bbox_inches='tight')
Example #9
def show_beautiful_bivariate_groupings(dat, color_by=None):
    # hue must be None or a column name; a default of False would make PairGrid look up a column named False
    g = sns.PairGrid(dat, hue=color_by)
    g.map_diag(plt.hist)
    g.map_offdiag(plt.scatter)
    g.add_legend()
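A minimal usage sketch (the iris data and the 'species' grouping column are assumptions):

# Hypothetical usage:
iris = sns.load_dataset('iris')
show_beautiful_bivariate_groupings(iris, color_by='species')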
Example #10
# In[97]:


sns.jointplot(x='AAPL', y='MSFT', data=roi_tech_comps, kind='scatter', color='seagreen')


# In[98]:


sns.pairplot(roi_tech_comps.dropna())


# In[103]:


tech_rets_fig = sns.PairGrid(roi_tech_comps.dropna())
tech_rets_fig.map_upper(plt.scatter, marker='o', color='purple')
tech_rets_fig.map_diag(plt.hist, bins=40)
tech_rets_fig.map_lower(sns.kdeplot, cmap='cool_d')


# In[132]:


rets = roi_tech_comps.dropna()

area = np.pi*20

plt.scatter(rets.mean(), rets.std(), s=area)

plt.xlabel('Expected Return')
def plot_variable_pairs(df):
    graph = sns.PairGrid(df)
    graph.map_diag(plt.hist)
    graph.map_offdiag(sns.regplot)
    plt.show()
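A usage sketch for plot_variable_pairs (the tips dataset and column choice are assumptions; any mostly numeric dataframe works):

# Hypothetical usage:
tips = sns.load_dataset('tips')
plot_variable_pairs(tips[['total_bill', 'tip', 'size']])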
Example #12
def plot_fig(df, save=False):
    fig = sns.PairGrid(df, diag_sharey=False)
    plt.subplots_adjust(top=0.9)
    fig.fig.suptitle("Distribution")
    fig.map_lower(sns.kdeplot, cmap="Blues_d")
    fig.map_upper(plt.scatter)
    fig.map_diag(sns.distplot)
    if save == True: fig.savefig("pairplot.png")

    # plot kde jointplot
    fig = plt.figure()
    g = sns.jointplot(x=df.columns.values[0],
                      y=df.columns.values[1],
                      kind="kde",
                      data=df)
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle("Kernel Distribution Estimation")
    if save == True: fig.savefig("joint_kde_plot.png")

    #calculate acf, pcf
    acf, pcf = calc_pacf(df)
    ma = calc_ma(df)

    # calculate return
    ret = calc_return(df)

    for i in range(len(df.columns)):
        #plot distribution
        fig = plt.figure()
        sns.distplot(df.iloc[:, i])
        plt.title("Histgram - " + df.columns[i])
        if save == True: fig.savefig("dist_plot_" + df.columns[i] + ".png")

        #plot series
        fig = plt.figure()
        df.iloc[:, i].plot()
        #plot moving average
        ma[i].plot(c='r')
        plt.title("Row data & Moving average - " + df.columns[i])
        if save == True: fig.savefig("series_plot_" + df.columns[i] + ".png")

        #plot acf
        fig = plt.figure()
        plt.bar(range(len(acf[i])), acf[i], width=0.3)
        plt.title("Auto Correlation Function - " + df.columns[i])
        if save == True: fig.savefig("acf_plot_" + df.columns[i] + ".png")

        #plot pcf
        fig = plt.figure()
        plt.bar(range(len(pcf[i])), pcf[i], width=0.3)
        plt.title("Partial Auto Correlation Function - " + df.columns[i])
        if save == True: fig.savefig("pcf_plot_" + df.columns[i] + ".png")

        #plot fft
        fig = plt.figure()
        f = calc_fft(df.iloc[:, i])
        plt.plot(f[1], f[0])
        plt.title("Fast Fourier Transform - " + df.columns[i])
        if save == True: fig.savefig("fft_plot_" + df.columns[i] + ".png")

        #plot return value_i - value_i2
        fig = plt.figure()
        ret[i].plot()
        plt.title("Return - " + df.columns[i])
        if save == True: fig.savefig("return_plot_" + df.columns[i] + ".png")
Example #13
# ## Scaling the data to chart it and allow better predictive power

# In[5]:

bcs = pd.DataFrame(preprocessing.scale(bc.iloc[:, 2:32]))
bcs.columns = list(bc.iloc[:, 2:32].columns)
bcs['diagnosis'] = bc['diagnosis']

# ## Checking for correlation between variables and diagnosis

# In[7]:

from pandas.plotting import scatter_matrix  # imported but not used below

p = sns.PairGrid(bcs.iloc[:, 20:32], hue='diagnosis', palette='Reds')
p.map_upper(plt.scatter, s=20, edgecolor='w')
p.map_diag(plt.hist)
p.map_lower(sns.kdeplot, cmap='GnBu_d')
p.add_legend()

p.fig.set_size_inches(30, 30)  # set the size on the underlying figure; assigning p.figsize has no effect

# ## Let's see how each variable breaks down by diagnosis

# In[8]:

mbc = pd.melt(bcs, "diagnosis", var_name="measurement")
fig, ax = plt.subplots(figsize=(10, 5))
p = sns.violinplot(ax=ax,
                   x="measurement",
Example #14
def pcaDistribution(pca, X_pca, savePath, ncomp=4):
    # Visualise how the PCs compare to each other
    lim = 6
    rot2rad = -49 * np.pi / 180
    X_pca = rotFlip(X_pca, rot2rad, flipAx=2, rotAxes=[2, 3])
    df_pca = pd.DataFrame(data=X_pca[:, :ncomp])
    sns.set_context("paper", rc={"font.size": 18, "axes.labelsize": 18})
    g = sns.PairGrid(df_pca, corner=True, diag_sharey=False)
    g.fig.set_size_inches(20, 20)
    g.map_diag(sns.kdeplot, color="k")
    cbar_ax = g.fig.add_axes([0.91, 0.325, 0.015, 0.4])
    for i in range(g.axes.shape[0]):
        for j in range(g.axes.shape[1]):
            if g.axes[i, j] is not None and i != j:
                if i == ncomp - 1 and j == ncomp - 2:
                    cbax = cbar_ax
                    cbFlag = True
                else:
                    cbax = None
                    cbFlag = False
                g.axes[i, j] = sns.kdeplot(
                    g.data.iloc[:, j],
                    g.data.iloc[:, i],
                    ax=g.axes[i, j],
                    shade=True,
                    shade_lowest=False,
                    legend=False,
                    vmin=0,
                    vmax=0.12,
                    n_levels=100,
                    cbar=cbFlag,
                    cbar_ax=cbax,
                    cmap="cubehelix_r",
                    cbar_kws={"label": "Density"},
                )
                if cbFlag:
                    cax = plt.gcf().axes[-1]
                    cax.tick_params(labelsize=18)
                for col in g.axes[i, j].collections:
                    col.set_edgecolor("face")
                g.axes[i, j].set_xlim((-lim, lim))
                g.axes[i, j].set_ylim((-lim, lim))
                g.axes[i, j].tick_params(axis="both", which="major", labelsize=18)
            elif g.axes[i, j] is not None and i == j:
                g.axes[i, j].tick_params(axis="both", which="major", labelsize=18)
                evr = str(round(pca.explained_variance_ratio_[i], 2))
                evrS = str(round(np.sum(pca.explained_variance_ratio_[: i + 1]), 2))
                g.axes[i, j].annotate(
                    "EVR:  " + evr, (0.6, 0.9), xycoords="axes fraction"
                )
                g.axes[i, j].annotate(
                    "CEVR: " + evrS, (0.6, 0.8), xycoords="axes fraction"
                )
                g.diag_axes[i].set_axis_on()
                sns.utils.despine(ax=g.diag_axes[i], left=True, right=False)
                g.diag_axes[i].tick_params(
                    axis="y", which="major", labelsize=18, right=True
                )
                g.diag_axes[i].set_ylim((0, 0.4))
                g.diag_axes[i].set_ylabel("Density")

            if i == g.axes.shape[0] - 1 and j == g.axes.shape[1] - 1:
                g.axes[i, j].set_xlim((-lim, lim))
            if i == g.axes.shape[1] - 1:
                g.axes[i, j].set_xlabel("PC " + str(j + 1))
            if j == 0:
                g.axes[i, j].set_ylabel("PC " + str(i + 1))

    plt.savefig(savePath + "/pairgrid.pdf", bbox_inches="tight")
    plt.show()
Example #15
# using grids

import seaborn as sns
import matplotlib.pyplot as plt

iris = sns.load_dataset('iris')
print(iris.head())

print(iris['species'].unique())

# pair grid
# have to specify/map your own plots

g = sns.PairGrid(iris)

g.map_diag(sns.distplot)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
#plt.show()

# facet grid **

tips = sns.load_dataset('tips')

# separate by col, row
# map plot with a column of data

g = sns.FacetGrid(data=tips, col='time', row='smoker')
#plt.figure()

g.map(sns.distplot, 'total_bill')
Example #16
iris.head()

iris.info()

iris.describe()

iris.shape

iris['Species'].value_counts()

iris.drop(['Id'], axis=1, inplace=True)

iris.head()

g = sns.PairGrid(iris, hue='Species')
g.map_diag(sns.distplot)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)

species = pd.get_dummies(iris['Species'], drop_first=True)

species.head()

iris = pd.concat([iris, species], axis=1)

iris.head()

iris.drop(['Species'], axis=1, inplace=True)

iris.head()
Example #17
from graspy.embed import select_dimension

select_dimension(all_hop_hist.T, n_elbows=5)
#%%
from graspy.embed import selectSVD
from graspy.plot import pairplot

n_elbows = 3
U, S, V = selectSVD(all_hop_hist.T, n_elbows=n_elbows)

plot_df = pd.DataFrame(data=U)
plot_df["label"] = meta["merge_class"].values

pg = sns.PairGrid(plot_df,
                  hue="label",
                  palette=CLASS_COLOR_DICT,
                  vars=np.arange(U.shape[1]),
                  height=4)

# pg._legend.remove()
# pg.map_diag(plt.hist)
pg.map_offdiag(sns.scatterplot, s=15, linewidth=0, alpha=0.7)


def tweak(x, y, **kws):
    ax = plt.gca()
    if len(x) > 0:
        xmax = np.nanmax(x)
        xtop = ax.get_xlim()[-1]
        if xmax > xtop:
            ax.set_xlim([-1, xmax + 1])
Example #18
    'pelvic incidence', 'pelvic tilt', 'lumbar lordosis angle', 'sacral slope',
    'pelvic radius', 'grade of spondylolisthesis', 'class'
]
df['class'] = df['class'].map({'AB': 1, 'NO': 0})  #Abnormal=1, Normal=0
df.head(10)  #Displaying a part of the data

# # (b) Pre-Processing and Exploratory Data Analysis

# ## i. Scatterplots of the independent variables

sns.set(font_scale=2)
g = sns.PairGrid(df,
                 height=5,
                 hue="class",
                 hue_kws={"marker": ["o", "+"]},
                 palette=["#FF0000", "#0C41CE"],
                 vars=[
                     'pelvic incidence', 'pelvic tilt',
                     'lumbar lordosis angle', 'sacral slope', 'pelvic radius',
                     'grade of spondylolisthesis'
                 ])
g = g.map(plt.scatter)
g = g.add_legend()

# ## ii. Boxplots for each of the independent variables

plt.figure(figsize=(20, 20))
plt.subplots_adjust(wspace=1)
for i in range(len(df.columns) - 1):
    plt.subplot(2, 3, i + 1)
    sns.boxplot(x='class', y=df.columns[i], data=df)
plt.xticks(rotation=90)
# As an alternative to the violin plot, a box plot can be used
# box plots are also useful for spotting outliers
# I do not visualize all features with box plots
# To demonstrate, let's look at one example of a box plot
# If you want, you can visualize other features as well.
plt.figure(figsize=(10, 10))
sns.boxplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
sns.jointplot(x.loc[:, 'concavity_worst'],
              x.loc[:, 'concave points_worst'],
              kind="reg",  # "regg" in the original only worked via old seaborn's prefix matching
              color="#ce1414")
sns.set(style="white")
df = x.loc[:, ['radius_worst', 'perimeter_worst', 'area_worst']]
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)
sns.set(style="whitegrid", palette="muted")
data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())  # standardization
data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data = pd.melt(data,
               id_vars="diagnosis",
               var_name="features",
               value_name='value')
plt.figure(figsize=(10, 10))
tic = time.time()
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
Example #20
mean, in this case it will also not ensure that data is C-contiguous which may cause a significant slowdown.

n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel.

None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.

algorithm : “auto”, “full” or “elkan”, default=”auto”
K-means algorithm to use. The classical EM-style algorithm is “full”. The “elkan” variation is more efficient by using 
the triangle inequality, but currently doesn’t support sparse data. “auto” chooses “elkan” for dense data and “full” for
sparse data.
"""

g = sns.PairGrid(
    data_train,
    hue="CLASS_LABEL",
    palette=sns.color_palette("cubehelix", 3),
    vars=['AGE', 'CAPITAL_GAIN', 'CAPITAL_LOSS', 'HOURS_WORKED_PER_WEEK'])
g.map(plt.scatter)
plt.savefig('charts/income-pairgrid.png')

model = KMeans(
    random_state=0,
    n_jobs=-1,  # note: n_jobs was deprecated in scikit-learn 0.23 and later removed from KMeans
)

# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))

visualizer.fit(X_train)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure
def chapter_3():
    """
    Notes for Linear Regression
    
    - coefficients -> give average change in Y with a one-unit increase in X
    
    - confidence interval -> B1_hat +- 2 * SE(B1_hat)
        - 95% chance the interval contains true value of B
        - SE(B1_hat) -> var(e) / SSE
    
    - t-statistic
        - t = (B1_hat - 0)/ SE(B1_hat)
        
    -  test for synergy (additive assumption)
        - effect of each predictor on response is independent of other predictors
        - include interaction term -> x1 * x2 
            - if interaction term has small p value, then not additive (synergy exists)
            - if results in substantial increase in r2, then not additive (synergy exists)

    - relationship exists
        - p value < 0.0005 or < 0.0001
        - F statistic greater than 1

    - strength of relationship
        - RSE -> estimates standard deviation of response from regression line
        - R squared -> % variability in response explained by predictors
        - percent error -> 100 * residual_standard_error / ys.mean()  
        
    - accuracy of prediction
        - prediction interval (individual response)
        - confidence interval (average response)
        
    - non-linearity
        - residual plots (fitted values vs. studentized/standardized residuals)
        - if residual plots are not random, transform with log(x), sqrt(x), or x2
        
    - correlation of error terms
        - will underestimate p value and narrow confidence/prediction intervals
        
    - heteroscedasticity (funnel shape of residual plot)
        - non-constant variances in the errors
        - if exists, transform the response with log(y) or sqrt(y)
    
    - co-linearity of features
        - (VIF) variance inflation factor -> 1 / (1 - r2)
        - correlation matrix
        - reduces t-statistic and increases standard error
        
    - outliers
        - leverage -> high impact on RSE and/or regression line
        - look at studentized residuals (observations > 3 are outliers)
        - influence (leverage) plots
    """
    #3.8 -> Simple Linear Regression on Auto data set
    dat = pd.read_csv("Auto.csv")
    dat = dat.replace("?", np.nan).dropna()

    # add constant to x values to ensure mean of residuals = 0
    xs = sm.add_constant(dat["horsepower"].astype(float))
    ys = dat["mpg"].astype(float)

    model = sm.OLS(ys, xs).fit()

    intercept, slope = model.params
    r2 = model.rsquared

    # variance inflation factor -> test for co-linearity
    # min(VIF) = 1.0, if VIF > 5 or 10, features are most likely correlated
    vif = 1 / (1 - r2)
    f_stat = model.fvalue
    p_value = model.pvalues[1]

    # create new line with the coefficients
    fit = [slope * x + intercept for x in xs["horsepower"]]
    print("Simple OLS: %s" % model.summary())

    prediction = model.predict()
    residuals = ys.astype(float) - prediction
    standardized_residuals = (residuals - residuals.mean()) / \
                             (residuals.max() - residuals.min())
    #residual_standard_error = results.rmse
    #percent_error = 100 * residual_standard_error / ys.mean()
    """
    Plot
    """
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)
    ax.scatter(xs["horsepower"],
               ys,
               label="r2=%f; f=%f; p=%f" % (r2, f_stat, p_value))
    ax.plot(xs["horsepower"],
            fit,
            color="r",
            label="f(x) = %f * x + %f" % (slope, intercept))

    # plot fitted values vs residuals to check for non-linearity
    ax2.scatter(model.fittedvalues, residuals, color="r")
    ax2.axhline(0, color="k")
    ax2.set_xlabel("fitted values")
    ax2.set_ylabel("residuals")

    # show leverage to identity observations that may have
    # more effect on the regression than other observations
    sm.graphics.influence_plot(model, ax=ax3)

    # show fitted values vs studentized residuals
    outlier_influence = outliers_influence.OLSInfluence(model).summary_frame()
    ax4.scatter(model.fittedvalues, outlier_influence["student_resid"])
    ax4.axhline(0, color="k")
    ax4.set_xlabel("fitted values")
    ax4.set_ylabel("studentized residuals")

    for _ax in [ax, ax2, ax3, ax4]:
        _ax.legend(loc="best")
    plt.show()

    #3.9 -> Multiple Linear Regression on Auto data set
    xs = dat[[
        "cylinders", "displacement", "horsepower", "weight", "acceleration",
        "year", "origin"
    ]].astype(float)

    # plot correlation matrix to check co-linearity
    # co-linearity reduces the t-statistic (power) of the test
    # and also increases standard error
    print("Correlations: %s" % xs.corr())
    grid = sns.PairGrid(xs)
    grid = grid.map(plt.scatter)
    plt.show()

    # NOTE: pd.ols was removed in pandas 0.20; sm.OLS below is the modern equivalent
    results = pd.ols(y=ys, x=xs)
    model = sm.OLS(ys, xs).fit()
    print("Multiple OLS: %s" % results)

    # compute variance inflation factor (VIF) to check for co-linearity
    # (note: this applies 1/(1-x) to the fitted coefficients, not to the
    # per-predictor R^2 values that the textbook VIF uses)
    vif = list(map(lambda x: 1 / (1 - x), model.params))
    print("VIFs: %s" % vif)
    """
    Looking at the p-values associated with each predictor’s t-statistic, 
    we see that displacement, weight, year, and origin 
    have a statistically significant relationship, 
    while cylinders, horsepower, and acceleration do not.
    """
    print("Coefficients: %s" % results.beta)
    """
    The regression coefficient for year, 0.7508, 
    suggests that for every one year, mpg increases by the coefficient. 
    In other words, cars become more fuel efficient every year by almost 1 mpg / year.
    """
    residuals = results.resid
    standardized_residuals = (residuals - residuals.mean()) / \
                             (residuals.max() - residuals.min())
    residual_standard_error = results.rmse
    percent_error = 100 * residual_standard_error / ys.mean()
    """
    Plot
    """
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)

    ax.scatter(results.y_fitted, residuals)
    ax.axhline(0, color="k")
    ax.set_xlabel("y fitted values")
    ax.set_ylabel("residuals")

    ax2.scatter(results.y_fitted,
                standardized_residuals,
                label='percent error=%f' % percent_error)
    ax2.axhline(0, color="k")
    ax2.set_xlabel("y fitted values")
    ax2.set_ylabel("standardized residuals")

    sm.graphics.influence_plot(model, ax=ax3)

    for _ax in [ax, ax2, ax3, ax4]:
        _ax.legend(loc="best")
    plt.show()
Example #22
plt.title("Boxplots for dependents vs card")

#     scatterplot= is a plot that shows the data as a collection of points
#                  the position of a poinyt depends on its two dimensional value   





sns.scatterplot(x="age", y="reports", hue="card", data=data)
plt.title("Scatter plot between reports and age on basis of card")

sns.lmplot(x='age', y='income', data=data, hue='card', fit_reg=True)
plt.title("Scatter plot with regression line for income and age")

g=sns.PairGrid(data,hue="card")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
plt.legend()

#### Organising Data

# replacing the values of the column of card (yes,no) with (credit_yes,credit_no)
# replacing the values of the column owner (yes,no) with (owner_yes,owner_no)
# replacing the values of the column selfemp (yes,no) with (self_emp_yes,self_emp_no)

data.card.replace(['yes', 'no'], ['credit_yes', 'credit_no'], inplace=True)
data.owner.replace(['yes', 'no'], ['owner_yes', 'owner_no'], inplace=True)
data.selfemp.replace(['yes', 'no'], ['self_emp_yes', 'self_emp_no'], inplace=True)
data.head()
Example #23
plt.tight_layout()
plt.savefig(os.path.join(WD_CLUST, "cor_rois_raw.pdf"))
"""
           xbeta_pos  xbeta_neg  xavg_pos  xavg_neg      xavg
xbeta_pos   1.000000   0.438476  0.933818 -0.401512  0.285163
xbeta_neg   0.438476   1.000000  0.477728 -0.976621 -0.301655
xavg_pos    0.933818   0.477728  1.000000 -0.442388  0.353100
xavg_neg   -0.401512  -0.976621 -0.442388  1.000000  0.342208
xavg        0.285163  -0.301655  0.353100  0.342208  1.000000
"""
scores["DX"] = y_sczCo

#plt.scatter(scores[], scores[], color=scores["y"])

fig = plt.figure(figsize=(20, 20))
g = sns.PairGrid(scores, hue="DX")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(WD_CLUST, "rois_scatterplot.pdf"))

from sklearn.decomposition import PCA
# PCA with scikit-learn
pca = PCA(n_components=2)
pca.fit(Xscores[:, [0, 2]])
print(pca.explained_variance_ratio_)

scores["pc1_xbeta_pos_neg"] = pca.transform(Xscores[:, [0, 2]])[:, 0]
scores["pc2_xbeta_pos_neg"] = pca.transform(Xscores[:, [0, 2]])[:, 1]
Example #24
# Pairs plot of Winrate

plot_data = features[['Vpip%', 'Winrate', 'WMSD_WSF%', 'Pfr/Vpip']]

plot_data = plot_data.dropna()

def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size=20)

grid = sns.PairGrid(data=plot_data, height=3)

# Upper is a scatter plot
grid.map_upper(plt.scatter, color='red', alpha=0.6)

# Diagonal is a histogram
grid.map_diag(plt.hist, color='red', edgecolor='black')

# Bottom is correlation and density plot
grid.map_lower(corr_func)
grid.map_lower(sns.kdeplot, cmap=plt.cm.Reds)

# Title for entire plot
plt.suptitle('Pairs Plot of Winrate Features', size=36, y=1.02);

Example #25
sns.factorplot(x='Pclass',
               hue='Survived',
               col='Sex',
               data=data_train,
               kind='count')

# In[14]:

sns.pointplot(x='Pclass', y='Survived', hue='Sex', data=data_train)

# If we look at each of these categorical variable independently, we observe the following.

# In[15]:

g = sns.PairGrid(data=data_train,
                 y_vars="Survived",
                 x_vars=["Pclass", "Sex", "SibSp", "Parch"])
g.map(sns.pointplot)

# `Pclass` and `Sex` factors seem to be good predictors of survival as such. In addition, there is an interaction between the two, as seen on the factorplot above.
#
# `SibSp` and `Parch` factors show something different: level `0` seems to exhibit a different behaviour, as if it were confounded with another effect. We can propose the following hypotheses for later data enrichment.
# * 0 means that a traveler had no relatives onboard the ship. These people apparently tended to behave differently from people traveling with relatives, which did not necessarily serve their best interests.
# * Non-0 values mean that people traveled with relatives. These people tended to behave differently from lone travelers, which marks a possible group behaviour. It seems that the bigger the family group, the lower the odds of survival: smaller family groups (2 to 3 people) tend to have better survival chances than average, whereas bigger groups (more than 3 people) tend, on the contrary, to have lower chances of survival.

# Let's conclude this categorical features exploration with `Embarked`. There are 3 possible values, respectively `S` (for Southampton, England), `C` (for Cherbourg, Normandy) and `Q` (for Queenstown now Cobh, Ireland). There are also 2 missing values.

# In[16]:

sns.factorplot(x="Embarked",
               hue="Survived",
Example #26
order_days = tips.day.value_counts().index
print(order_days)
order_days = pd.Categorical(["Thur", "Fri", "Sat", "Sun"])  # assumes pandas imported as pd
g = sns.FacetGrid(tips, col="day", col_order=order_days)
g.map(sns.boxplot, "total_bill")
plt.show()
plt.close()

pal = dict(Lunch="seagreen", Dinner="gray")
g = sns.FacetGrid(tips, hue="time", col="sex", row="smoker", palette=pal,
                  height=5, hue_kws={"marker": ["<", ">"]})  # 'size' renamed to 'height' in seaborn 0.9
g.map(plt.scatter, "total_bill", "tip", s=50, alpha=0.7, linewidth=0.6, edgecolor="white")
g.add_legend()
g.fig.subplots_adjust(wspace=0.2, hspace=0.2)  # spacing between subplots
#g.set_axis_labels("1","2")
#g.set(xticks=[10,30,50])
plt.show()

g = sns.PairGrid(tips, hue="smoker", vars=["total_bill", "tip"], palette="GnBu_d")  # vars picks the variables to compare
g.map_diag(plt.hist)  # diagonal
g.map_offdiag(plt.scatter)  # off-diagonal
g.add_legend()
#g.map(plt.scatter)
plt.show()

flights = flight.pivot("month", "year", "passengers")
ax = sns.heatmap(flights, annot=True, fmt="d", linewidths=0.5, cmap="YlGnBu")
plt.show()

tip_corr = tips.corr()
sns.heatmap(tip_corr, annot=True, fmt="f", linewidths=0.5, cmap="YlGnBu")
plt.show()

closing_dataframe = DataReader(['AAPL', 'GOOG', 'MSFT', 'AMZN'], 'yahoo',
                               startTime, endTime)['Adj Close']

closing_dataframe.head()

stock_returns = closing_dataframe.pct_change()

sns.jointplot('AMZN', 'GOOG', stock_returns, kind='scatter')

sns.pairplot(stock_returns.dropna())

fig = sns.PairGrid(closing_dataframe)
fig.map_upper(plt.scatter, color='purple')
fig.map_lower(sns.kdeplot, cmap='cool_d')
fig.map_diag(plt.hist, bins=30)

fig = sns.PairGrid(stock_returns.dropna())
fig.map_upper(plt.scatter, color='blue')
fig.map_lower(sns.kdeplot, cmap='cool_d')
fig.map_diag(plt.hist, bins=50)

# plot the correlation table as a heatmap for analysis
corr = stock_returns.corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
Example #28
sns.pairplot(corr[[
    'ground_floor_type_467b', 'roof_type_67f9', 'foundation_type_467b',
    'has_superstructure_mud_mortar_stone', 'foundation_type_337f',
    'damage_grade'
]],
             hue="damage_grade",
             diag_kind='kde',
             markers='+')

corr.dtypes

g = sns.PairGrid(corr,
                 x_vars=[
                     'ground_floor_type_467b', 'roof_type_67f9',
                     'foundation_type_467b',
                     'has_superstructure_mud_mortar_stone',
                     'foundation_type_337f', 'damage_grade'
                 ],
                 y_vars='damage_grade',
                 aspect=.75,
                 height=3.5)  # 'size' was renamed to 'height' in seaborn 0.9
g.map(sns.swarmplot, palette="pastel")

df

# Some data transformation

df['superstructure'] = df[[
    'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
            ]
        else:
            df.loc[i] = [
                name, difficulty, horizon, channels, accetability,
                accetabilityPercent, accetabilePercent, loss, lossPercent,
                time, iterations
            ]

df[columns[1:4]] = df[columns[1:4]].astype(str)
df[columns[4:]] = df[columns[4:]].apply(pd.to_numeric)
# df = df.apply(pd.to_numeric)
df.sort_values(by=['horizon', 'name'], inplace=True, ascending=[False, True])
outliers.sort_values(by=['horizon', 'name'],
                     inplace=True,
                     ascending=[False, True])
df.to_csv(directory + 'plottedSolutionsData.csv', index=None, header=True)
outliers[['loss [x1000]', 'loss % improvement',
          'iterations [x100 000]']] = outliers[[
              'loss [x1000]', 'loss % improvement', 'iterations [x100 000]'
          ]].round(3)
outliers.to_csv(directory + 'outliersSolutionsData.csv',
                index=None,
                header=True)
print(df)
g = sns.PairGrid(df, x_vars=columns[1:4], y_vars=columns[4:])
g = g.map(sns.barplot)
plt.subplots_adjust(top=0.95, bottom=0.05)
g.fig.suptitle("Simulated annealing results categorized by the instances")
# plt.tight_layout()
plt.show()
Example #30
#print range, mean, and std
print(a.loc[['mean', 'std', 'range']])

# part d

df2 = df.drop(index=df.index[10:85])

b = df2.describe()
b.loc['range'] = b.loc['max'] - b.loc['min']

print('New mean, std, and range:')
print(b.loc[['mean', 'std', 'range']])

# part e
g = sns.PairGrid(df, height=1.0)
g.map_upper(plt.scatter, s=3)
g.map_diag(plt.hist)
g.map_lower(plt.scatter, s=3)
plt.show()
'''
Acceleration seems normally distributed; weight and horsepower have a strong linear relationship.
mpg has a strong non-linear relationship with displacement, weight, and horsepower.
'''

# part f
'''
It appears that weight, displacement, and horsepower all have a strong
non-linear negative relationship with mpg; year also appears to have a positive relationship with mpg.
All of these variables could be good predictors of mpg