# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient of x and y.

    Extra keyword arguments are accepted (so the function can be used with
    ``PairGrid.map_*``) and ignored.
    """
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8),
                xycoords=ax.transAxes,
                size=20)


# Create the pairgrid object.
# `height` is the current name of the per-facet size keyword; the older
# `size` spelling was renamed by seaborn and is rejected by recent releases.
grid = sns.PairGrid(data=plot_data,
                    height=3,
                    diag_sharey=False,
                    hue='TARGET',
                    vars=[x for x in list(plot_data.columns) if x != 'TARGET'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha=0.2)

# Diagonal is a kernel density estimate
grid.map_diag(sns.kdeplot)

# Bottom is density plot
grid.map_lower(sns.kdeplot, cmap=plt.cm.OrRd_r)

plt.suptitle('Ext Source and Age Features Pairs Plot', size=32, y=1.05)

# Make a new dataframe for polynomial features
# NOTE(review): `g` and `titanic_train` are created before this excerpt.
g.map(sns.distplot, "Age")

# Fare distribution, one facet row per value of "Sex".
g = sns.FacetGrid(titanic_train, row="Sex")
g.map(sns.distplot, "Fare")

# Age distribution faceted by passenger class (rows) and sex (columns);
# both a distplot and a KDE are drawn on the same facets.
g = sns.FacetGrid(titanic_train, row="Pclass", col="Sex")
g.map(sns.distplot, "Age")
g.map(sns.kdeplot, "Age")

# Parch vs. SibSp scatter, one facet column per value of "Survived".
g = sns.FacetGrid(titanic_train, col="Survived")
g.map(plt.scatter, "Parch", "SibSp")

# Age KDE per class/sex facet, coloured by survival.
g = sns.FacetGrid(titanic_train, row="Pclass", col="Sex", hue="Survived")
g.map(sns.kdeplot, "Age")
# pyplot-level limits act on the current axes only.
plt.xlim(0, 250)
plt.ylim(0, 60)

# interaction across each pair of features (pair plot and grid)
pairable_features = ["SibSp", "Parch", "Fare", "Age", "Survived"]
g = sns.PairGrid(titanic_train[pairable_features])
g.map_upper(sns.regplot)
g.map_lower(sns.residplot)
g.map_diag(sns.kdeplot)
g.add_legend()

# joint distributions (joint plot and joint grid)
g = sns.JointGrid(x="SibSp", y="Parch", data=titanic_train)
g.plot_joint(sns.regplot, order=2)
g.plot_marginals(sns.distplot)
def pair_corr(data, hue=None, hue_order=None, palette=None, vars=None,
              top: int = None, x_vars=None, y_vars=None, diag_kind='hist',
              height=2.5, aspect=1, dropna=True, size=None, group=None,
              corr_method='pearson', log_base=2, log_additive=1,
              prefix='PairCorr', fig_format='png'):
    """Draw one pairwise-correlation grid per sample group and save it.

    For every group the rows of ``data`` that are positive in all of the
    group's columns are kept, written to ``<group>.comm.data.xls``,
    optionally reduced to the ``top`` rows by mean value, optionally
    log-transformed, and plotted with ``seaborn.PairGrid``. The figure is
    saved as ``<prefix>.<group>.common<N>genes.<fig_format>``.

    Parameters
    ----------
    data : DataFrame or str
        Table to plot, or a path readable by ``pandas.read_csv`` (first
        column is used as the index, the separator is sniffed).
    hue, hue_order, palette, vars, x_vars, y_vars, aspect, dropna
        Passed through to ``seaborn.PairGrid``.
    top : int, optional
        Keep only the ``top`` rows with the highest mean.
    diag_kind : {'hist', 'kde', 'scatter'}
        Plot drawn on the diagonal; unknown values fall back to a scatter
        plot.
    height : float
        Facet height handed to ``PairGrid``.
    size : float, optional
        Legacy alias of ``height``; when given it takes precedence.
    group : str or dict, optional
        Either a whitespace-separated two-column file (sample, group) or a
        mapping ``{group_name: [samples]}``. ``None`` puts every column in
        a single group called ``'all'``.
    corr_method : str
        Forwarded to ``corr_annotate`` as ``method``.
    log_base : {2, 10, 0, 1}
        Log transform applied to ``tdata + log_additive``; 0 or 1 disables
        the transform.
    log_additive : number
        Pseudo-count added before taking the log.
    prefix, fig_format : str
        Pieces of the output file name.

    Raises
    ------
    Exception
        If ``group`` is neither a path, a dict nor ``None``, or if
        ``log_base`` is not supported.
    """
    if isinstance(data, str):
        data = pd.read_csv(data, header=0, index_col=0, sep=None,
                           engine='python')

    # `size` is the pre-0.9 seaborn name of `height`; resolve it here instead
    # of forwarding a keyword that recent seaborn releases no longer accept.
    if size is not None:
        height = size

    group_dict = dict()
    if isinstance(group, str):
        # Close the group file deterministically and tolerate blank lines.
        with open(group) as handle:
            sample_to_group = dict(
                line.split()[:2] for line in handle if line.strip())
        for sample, name in sample_to_group.items():
            group_dict.setdefault(name, list()).append(sample)
    elif isinstance(group, dict):
        group_dict = group
    elif group is not None:
        raise Exception(
            'value of group should be a group file or a python dict!')
    if not group_dict:
        group_dict = {'all': data.columns}

    for group_name, samples in group_dict.items():
        tdata = data[samples]
        # Keep rows whose value is > 0 in every sample of the group.
        tdata = tdata[(tdata > 0).sum(axis=1) >= len(samples)]
        tdata.to_csv(f'{group_name}.comm.data.xls', sep='\t')
        if top is not None:
            mean_expr = tdata.mean(axis=1).sort_values(ascending=False)
            tdata = tdata.loc[mean_expr.index[:top]]

        if log_base == 2:
            tdata = np.log2(tdata + log_additive)
        elif log_base == 10:
            tdata = np.log10(tdata + log_additive)
        elif log_base in [0, 1]:
            print('no log transformation')
        else:
            raise Exception(f'log base {log_base} is not supported!')

        gridplot = sns.PairGrid(tdata, hue=hue, hue_order=hue_order,
                                palette=palette, vars=vars, x_vars=x_vars,
                                y_vars=y_vars, height=height, aspect=aspect,
                                dropna=dropna)

        # Diagonal. The 'kde' branch previously used `map`, which drew a
        # bivariate KDE on every cell instead of only the diagonal.
        if diag_kind == 'hist':
            gridplot.map_diag(plt.hist)
        elif diag_kind == 'kde':
            gridplot.map_diag(sns.kdeplot)
        else:
            # 'scatter' and any unrecognised value
            gridplot.map_diag(sns.scatterplot)
        for ax, col in zip(np.diag(gridplot.axes), tdata.columns):
            ax.set_xlabel(col)

        # lower
        gridplot.map_lower(sns.scatterplot)
        gridplot.map_lower(sns.regplot, scatter=False)
        gridplot.map_lower(corr_annotate, method=corr_method)
        # upper
        gridplot.map_upper(corr_annotate, method=corr_method,
                           x_pos=0.15, y_pos=0.6, font_size=14)

        plt.savefig(
            prefix + f'.{group_name}.common{tdata.shape[0]}genes.{fig_format}',
            dpi=300)
        plt.close()
def pairanalyzer(self, df, hue=None, palette=None, vars=None,
                 lowerkind="boxscatter", diag_kind="kde", markers=None,
                 height=2.5, aspect=1, dropna=True,
                 lower_kws=None, diag_kws=None, grid_kws=None, size=None):
    """Draw a pair grid with distributions, scatter/box plots and correlations.

    The diagonal shows a histogram or KDE, the lower triangle a scatter,
    box+scatter or regression plot, and the upper triangle correlation
    coefficients (drawn by ``self._corrall_upper`` / ``self._corrfunc``).

    Parameters
    ----------
    df : DataFrame
        Data to plot. With ``diag_kind="hist"`` boolean columns are dropped.
    hue, palette, vars, height, aspect, dropna
        Passed through to ``seaborn.PairGrid``.
    lowerkind : {"boxscatter", "scatter", other}
        Lower-triangle plot; any other value draws ``sns.regplot``.
    diag_kind : {"kde", "hist"}
        Diagonal plot.
    markers : marker or list of markers, optional
        One marker, or one per hue level.
    lower_kws, diag_kws, grid_kws : dict, optional
        Extra keyword arguments for the lower plots, the diagonal plots and
        ``PairGrid``. ``None`` means no extra arguments; the dicts passed in
        are never modified.
    size : ignored
        Kept for signature compatibility; it was never used by this method.

    Raises
    ------
    ValueError
        If ``markers`` is a list whose length differs from the number of
        hue levels.
    """
    # Work on private copies: the previous `{}` defaults were shared between
    # calls and `diag_kws` is mutated below.
    lower_kws = dict(lower_kws) if lower_kws else {}
    diag_kws = dict(diag_kws) if diag_kws else {}
    grid_kws = dict(grid_kws) if grid_kws else {}

    # Store inputs on the instance
    if diag_kind == "hist":
        # Histograms: load the data without bool columns
        self.df = df.select_dtypes(exclude=bool)
    else:
        # KDE: load the data including bool columns
        self.df = df
    self.hue = hue
    self.corr_mat = df.corr(method="pearson")

    # Adjust font size
    sns.set_context("notebook")

    # Create the PairGrid instance. The legacy `size` keyword is not
    # forwarded: it was always None here and newer seaborn rejects it.
    plt.figure()
    diag_sharey = diag_kind == "hist"
    g = sns.PairGrid(self.df, hue=self.hue, palette=palette, vars=vars,
                     diag_sharey=diag_sharey, height=height, aspect=aspect,
                     dropna=dropna, **grid_kws)

    # Configure markers
    if markers is not None:
        if g.hue_names is None:
            n_markers = 1
        else:
            n_markers = len(g.hue_names)
        if not isinstance(markers, list):
            markers = [markers] * n_markers
        if len(markers) != n_markers:
            raise ValueError(("markers must be a singleton or a list of "
                              "markers for each level of the hue variable"))
        g.hue_kws = {"marker": markers}

    # Diagonal: histogram or KDE
    if diag_kind == "hist":
        g.map_diag(plt.hist, **diag_kws)
    elif diag_kind == "kde":
        diag_kws.setdefault("shade", True)
        diag_kws["legend"] = False
        g.map_diag(sns.kdeplot, **diag_kws)

    # Number of distinct non-null values per plotted variable
    nuniques = [self.df[col_name].dropna().nunique() for col_name in g.x_vars]

    # Lower triangle: scatter plot etc.
    if lowerkind == "boxscatter":
        if min(nuniques) <= 2:
            # Box plots only when some variable has at most two distinct values
            self._boxscatter_lower(g, **lower_kws)
        else:
            # Otherwise a plain scatter plot (running _boxscatter_lower here
            # would make the legend markers disappear)
            g.map_lower(sns.scatterplot, **lower_kws)
    elif lowerkind == "scatter":
        g.map_lower(sns.scatterplot, **lower_kws)
    else:
        g.map_lower(sns.regplot, **lower_kws)

    # Upper triangle: correlation coefficients, depending on whether hue is set
    if self.hue is None:
        # No hue: overall correlation only
        self.hue_names = None
        self._corrall_upper(g)
    else:
        # With hue: per-level coefficients in the level colour plus the
        # overall coefficient in black
        self.hue_names = g.hue_names
        g.map_upper(self._corrfunc)
        self._corrall_upper(g)
        g.add_legend()
df_test = pd.DataFrame() # Add columns df_test['u-g'] = X_test[:num_data, 0] df_test['g-r'] = X_test[:num_data, 1] df_test['r-i'] = X_test[:num_data, 2] df_test['i-z'] = X_test[:num_data, 3] df_test['mag(i)'] = X_test[:num_data, 4] df_test['redshift'] = y_test[:num_data] df_all = pd.concat( [df_train.assign(dataset='train'), df_test.assign(dataset='test')]) #### plotting #### g = sns.PairGrid(df_all, hue='dataset') g = g.map_upper(sns.scatterplot, alpha=0.8) g = g.map_lower(sns.kdeplot, n_levels=6, shade=True, shade_lowest=False, alpha=0.7) g = g.map_diag(sns.kdeplot, lw=2, shade=True) # g = g.map_diag(sns.distplot, hist=True, kde=False) g = g.add_legend() g.savefig("pairplot_best.png") plt.clf() # Clean parirplot figure from sns Image(filename='pairplot_best.png') # Show pairplot as image
# Correlation of every feature with the "Global" column, sorted ascending
correlations = features.corr()["Global"].dropna().sort_values()

# Extract the columns to plot
plot_data = features[["Global", "Critic_Score", "User_Score",
                      "Critic_Count", "User_Count"]]


# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient of x and y.

    Extra keyword arguments are accepted (so the function can be used with
    ``PairGrid.map_*``) and ignored.
    """
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size=20)


# Create the pairgrid object.
# `height` is the current name of the per-facet size keyword; the older
# `size` spelling was renamed by seaborn and is rejected by recent releases.
grid = sns.PairGrid(data=plot_data, height=3)

# Upper is a scatter plot
grid.map_upper(plt.scatter, color="lightskyblue", alpha=0.6, marker=".", s=10)

# Diagonal is a histogram
grid.map_diag(plt.hist, color="lightskyblue", edgecolor="black")

# Bottom is correlation and density plot
grid.map_lower(corr_func)
grid.map_lower(sns.kdeplot, cmap=plt.cm.Blues)

# Title for entire plot
plt.suptitle("Pairs Plot of Game Scores", size=36, y=1.02);
#%% #Generate Correlation HeatMap colormap = sns.diverging_palette(220, 10, as_cmap=True) f, ax = plt.subplots(figsize=(10, 10)) corr = df_census.corr() sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f", xticklabels=corr.columns.values, yticklabels=corr.columns.values) #%% # Pairplot matrix. #%% g = sns.PairGrid(df_census,vars=['age','fnlwgt', 'capital_gain','capital_loss', 'hours_per_week'], hue='income_bracket',palette = 'muted') g.map(plt.scatter, alpha=0.8) g.add_legend(); #%% df_age = df_census.loc[:,['gender', 'age', 'income_bracket']] conditions = [ (df_age['age'] < 20), (df_age['age'] < 30), (df_age['age'] < 40), (df_age['age'] < 50), (df_age['age'] < 60), (df_age['age'] < 70), (df_age['age'] < 110)] choices = ['10-20', '20-30', '30-40','40-50','50-60','60-70','70-110']
if found: if 'DREAM' in algs[i]: if 'c' in AE: samp_lat=vec2img(samp) width=tuple(np.mod(i,2) for i in samp_lat.shape[1:]) samp_lat=chop(samp_lat,width)[None,:,:,None] if autoencoder.activations['latent'] is None else samp_lat.flatten()[None,:] samp=(pad(np.squeeze(autoencoder.decode(samp_lat)),width)).reshape((samp.shape[0],-1)) else: samp=autoencoder.decode(samp) samps[i]=samp # samp=pd.DataFrame(samp,columns=['$u_{}$'.format(i) for i in np.arange(1,samp.shape[1]+1)]) # g=sns.PairGrid(samp,diag_sharey=False) # g.map_upper(plt.scatter) # g.map_lower(sns.kdeplot) # g.map_diag(sns.kdeplot,lw=2,legend=False) # # g.fig.suptitle(algs[i]) # # g.fig.subplots_adjust(top=0.95) # plt.savefig(os.path.join(folder,algs[i]+'_dist.png'),bbox_inches='tight') # form a big data frame alg_array=np.hstack([[alg_names[i]]*num_samp for i in range(num_algs)]) df_samps=pd.DataFrame(samps.reshape((-1,lin.input_dim)),columns=['$u_{}$'.format(i) for i in np.arange(1,lin.input_dim+1)]) df_samps['algorithm']=alg_array.flatten() g=sns.PairGrid(df_samps,hue='algorithm',diag_sharey=False) g.map_upper(plt.scatter,s=1,alpha=0.5) g.map_lower(sns.kdeplot) g.map_diag(sns.kdeplot,lw=2) g.add_legend() # g.fig.suptitle('MCMC') # g.fig.subplots_adjust(top=0.95) plt.savefig(os.path.join(folder,'allmcmc_dist.png'),bbox_inches='tight')
def show_beautiful_bivariate_groupings(dat, color_by=False):
    """Draw a pair grid of ``dat``: histograms on the diagonal, scatter elsewhere.

    Parameters
    ----------
    dat : DataFrame
        Data to plot.
    color_by : column label, optional
        Column used to colour the points. ``False`` (the default) or ``None``
        disables colouring.
    """
    # PairGrid treats any non-None hue as a column label, so the `False`
    # default must be translated to None or it is looked up as a column.
    hue = None if color_by is False or color_by is None else color_by
    g = sns.PairGrid(dat, hue=hue)
    g.map_diag(plt.hist)
    g.map_offdiag(plt.scatter)
    if hue is not None:
        # A legend only has entries when a hue variable is in use.
        g.add_legend()
# In[97]:

# x, y and data are passed by keyword: recent seaborn releases make them
# keyword-only, and the keyword form works on older releases too.
sns.jointplot(x='AAPL', y='MSFT', data=roi_tech_comps,
              kind='scatter', color='seagreen')

# In[98]:

sns.pairplot(roi_tech_comps.dropna())

# In[103]:

# Custom pair grid: scatter above, histogram on the diagonal, KDE below.
tech_rets_fig = sns.PairGrid(roi_tech_comps.dropna())
tech_rets_fig.map_upper(plt.scatter, marker='o', color='purple')
tech_rets_fig.map_diag(plt.hist, bins=40)
tech_rets_fig.map_lower(sns.kdeplot, cmap='cool_d')

# In[132]:

# Mean vs. standard deviation of each column, one marker per column.
rets = roi_tech_comps.dropna()
area = np.pi*20
plt.scatter(rets.mean(), rets.std(), s=area)
plt.xlabel('Expected Return')
def plot_variable_pairs(df):
    """Show a pair grid of ``df`` with histograms on the diagonal and
    regression plots on every off-diagonal cell."""
    pair_grid = sns.PairGrid(df)
    layers = (
        (pair_grid.map_diag, plt.hist),
        (pair_grid.map_offdiag, sns.regplot),
    )
    for attach, plotter in layers:
        attach(plotter)
    plt.show()
def plot_fig(df, save=False):
    """Draw a set of exploratory plots for every column of ``df``.

    Produces a pair grid, a joint KDE plot of the first two columns and, per
    column, a distribution plot, the series with its moving average, the
    ACF, the PACF, the FFT and the return series. The statistics come from
    ``calc_pacf``, ``calc_ma``, ``calc_return`` and ``calc_fft``, which are
    defined outside this function.

    Parameters
    ----------
    df : DataFrame
        Data to plot; needs at least two columns for the joint plot.
    save : bool
        When true every figure is also written to a PNG in the working
        directory.
    """

    def _save(figure, filename):
        # Write `figure` to disk only when saving was requested.
        if save:
            figure.savefig(filename)

    # pair grid
    pair_grid = sns.PairGrid(df, diag_sharey=False)
    plt.subplots_adjust(top=0.9)
    pair_grid.fig.suptitle("Distribution")
    pair_grid.map_lower(sns.kdeplot, cmap="Blues_d")
    pair_grid.map_upper(plt.scatter)
    pair_grid.map_diag(sns.distplot)
    _save(pair_grid, "pairplot.png")

    # plot kde jointplot
    # jointplot creates its own figure, so the grid it returns is what gets
    # saved; previously an unrelated empty figure was written instead.
    joint = sns.jointplot(x=df.columns.values[0],
                          y=df.columns.values[1],
                          kind="kde",
                          data=df)
    plt.subplots_adjust(top=0.9)
    joint.fig.suptitle("Kernel Distribution Estimation")
    _save(joint, "joint_kde_plot.png")

    # calculate acf, pcf
    acf, pcf = calc_pacf(df)
    ma = calc_ma(df)
    # calculate return
    ret = calc_return(df)

    for i, name in enumerate(df.columns):
        # plot distribution
        fig = plt.figure()
        sns.distplot(df.iloc[:, i])
        plt.title("Histgram - " + name)
        _save(fig, "dist_plot_" + name + ".png")

        # plot series
        fig = plt.figure()
        df.iloc[:, i].plot()
        # plot moving average
        ma[i].plot(c='r')
        plt.title("Row data & Moving average - " + name)
        _save(fig, "series_plot_" + name + ".png")

        # plot acf
        fig = plt.figure()
        plt.bar(range(len(acf[i])), acf[i], width=0.3)
        plt.title("Auto Correlation Function - " + name)
        _save(fig, "acf_plot_" + name + ".png")

        # plot pcf
        fig = plt.figure()
        plt.bar(range(len(pcf[i])), pcf[i], width=0.3)
        plt.title("Partial Auto Correlation Function - " + name)
        _save(fig, "pcf_plot_" + name + ".png")

        # plot fft
        fig = plt.figure()
        f = calc_fft(df.iloc[:, i])
        plt.plot(f[1], f[0])
        plt.title("Fast Fourier Transform - " + name)
        _save(fig, "fft_plot_" + name + ".png")

        # plot return value_i - value_i2
        fig = plt.figure()
        ret[i].plot()
        plt.title("Return - " + name)
        _save(fig, "return_plot_" + name + ".png")
# ## Scaling the data to chart it and allow better predictive power # In[5]: bcs = pd.DataFrame(preprocessing.scale(bc.iloc[:, 2:32])) bcs.columns = list(bc.iloc[:, 2:32].columns) bcs['diagnosis'] = bc['diagnosis'] # ## Checking for correlation between variables and diagnosis # In[7]: from pandas.plotting import scatter_matrix p = sns.PairGrid(bcs.iloc[:, 20:32], hue='diagnosis', palette='Reds') p.map_upper(plt.scatter, s=20, edgecolor='w') p.map_diag(plt.hist) p.map_lower(sns.kdeplot, cmap='GnBu_d') p.add_legend() p.figsize = (30, 30) # ## Lets see how each variable breaks down by diagnosis # In[8]: mbc = pd.melt(bcs, "diagnosis", var_name="measurement") fig, ax = plt.subplots(figsize=(10, 5)) p = sns.violinplot(ax=ax, x="measurement",
def pcaDistribution(pca, X_pca, savePath, ncomp=4):
    """Plot pairwise KDEs of the first ``ncomp`` principal components.

    The lower triangle shows filled 2-D KDEs sharing one colour bar, the
    diagonal shows 1-D KDEs annotated with the explained variance ratio
    (EVR) and its cumulative sum (CEVR). The figure is written to
    ``<savePath>/pairgrid.pdf`` and then shown.

    Parameters
    ----------
    pca : fitted PCA-like object
        Only ``explained_variance_ratio_`` is read.
    X_pca : array
        Scores; transformed by ``rotFlip`` (defined elsewhere) before the
        first ``ncomp`` columns are plotted.
    savePath : str
        Directory the PDF is written to.
    ncomp : int
        Number of components to plot.
    """
    # Visualise how the PCs compare to each other
    lim = 6
    rot2rad = -49 * np.pi / 180
    X_pca = rotFlip(X_pca, rot2rad, flipAx=2, rotAxes=[2, 3])
    df_pca = pd.DataFrame(data=X_pca[:, :ncomp])

    sns.set_context("paper", rc={"font.size": 18, "axes.labelsize": 18})
    g = sns.PairGrid(df_pca, corner=True, diag_sharey=False)
    g.fig.set_size_inches(20, 20)
    g.map_diag(sns.kdeplot, color="k")
    # Dedicated axes for the shared colour bar
    cbar_ax = g.fig.add_axes([0.91, 0.325, 0.015, 0.4])

    n_rows, n_cols = g.axes.shape
    for i in range(n_rows):
        for j in range(n_cols):
            # With corner=True the upper-triangle entries are None.
            # Integer indices are compared by value (`!=`), not identity.
            if g.axes[i, j] is not None and i != j:
                # Only one off-diagonal panel draws the colour bar.
                if i == ncomp - 1 and j == ncomp - 2:
                    cbax = cbar_ax
                    cbFlag = True
                else:
                    cbax = None
                    cbFlag = False
                g.axes[i, j] = sns.kdeplot(
                    g.data.iloc[:, j],
                    g.data.iloc[:, i],
                    ax=g.axes[i, j],
                    shade=True,
                    shade_lowest=False,
                    legend=False,
                    vmin=0,
                    vmax=0.12,
                    n_levels=100,
                    cbar=cbFlag,
                    cbar_ax=cbax,
                    cmap="cubehelix_r",
                    cbar_kws={"label": "Density"},
                )
                if cbFlag:
                    cax = plt.gcf().axes[-1]
                    cax.tick_params(labelsize=18)
                # Colour contour edges like their faces
                for col in g.axes[i, j].collections:
                    col.set_edgecolor("face")
                g.axes[i, j].set_xlim((-lim, lim))
                g.axes[i, j].set_ylim((-lim, lim))
                g.axes[i, j].tick_params(axis="both", which="major", labelsize=18)
            elif g.axes[i, j] is not None and i == j:
                g.axes[i, j].tick_params(axis="both", which="major", labelsize=18)
                evr = str(round(pca.explained_variance_ratio_[i], 2))
                evrS = str(round(np.sum(pca.explained_variance_ratio_[: i + 1]), 2))
                g.axes[i, j].annotate(
                    "EVR: " + evr, (0.6, 0.9), xycoords="axes fraction"
                )
                g.axes[i, j].annotate(
                    "CEVR: " + evrS, (0.6, 0.8), xycoords="axes fraction"
                )
                # Show a density axis on the right-hand side of the diagonal
                g.diag_axes[i].set_axis_on()
                sns.utils.despine(ax=g.diag_axes[i], left=True, right=False)
                g.diag_axes[i].tick_params(
                    axis="y", which="major", labelsize=18, right=True
                )
                g.diag_axes[i].set_ylim((0, 0.4))
                g.diag_axes[i].set_ylabel("Density")

            # The bottom-right diagonal panel shares no x axis with an
            # off-diagonal panel, so its limits are set explicitly.
            if i == n_rows - 1 and j == n_cols - 1:
                g.axes[i, j].set_xlim((-lim, lim))
            # Bottom row and first column are never None with corner=True.
            if i == n_cols - 1:
                g.axes[i, j].set_xlabel("PC " + str(j + 1))
            if j == 0:
                g.axes[i, j].set_ylabel("PC " + str(i + 1))

    plt.savefig(savePath + "/pairgrid.pdf", bbox_inches="tight")
    plt.show()
# using grids import seaborn as sns import matplotlib.pyplot as plt iris = sns.load_dataset('iris') print(iris.head()) print(iris['species'].unique()) # pair grid # have to specify/map your own plots g = sns.PairGrid(iris) g.map_diag(sns.distplot) g.map_upper(plt.scatter) g.map_lower(sns.kdeplot) #plt.show() # facet grid ** tips = sns.load_dataset('tips') # seperate by col,row # map plot with a column of data g = sns.FacetGrid(data=tips, col='time', row='smoker') #plt.figure() g.map(sns.distplot, 'total_bill')
# Quick inspection of the iris frame (notebook-style expressions whose
# values are only displayed interactively).
iris.head()
iris.info()
iris.describe()
iris.shape
iris['Species'].value_counts()

# Drop the 'Id' column in place.
iris.drop(['Id'], axis=1, inplace=True)
iris.head()

# Pair grid coloured by species.
g = sns.PairGrid(iris, hue='Species')
g.map_diag(sns.distplot)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)

# One-hot encode the species; drop_first leaves out the first level.
species = pd.get_dummies(iris['Species'], drop_first=True)
species.head()
iris = pd.concat([iris, species], axis=1)
iris.head()

# The original label column is removed once the dummies are attached.
iris.drop(['Species'], axis=1, inplace=True)
iris.head()
from graspy.embed import select_dimension select_dimension(all_hop_hist.T, n_elbows=5) #%% from graspy.embed import selectSVD from graspy.plot import pairplot n_elbows = 3 U, S, V = selectSVD(all_hop_hist.T, n_elbows=n_elbows) plot_df = pd.DataFrame(data=U) plot_df["label"] = meta["merge_class"].values pg = sns.PairGrid(plot_df, hue="label", palette=CLASS_COLOR_DICT, vars=np.arange(U.shape[1]), height=4) # pg._legend.remove() # pg.map_diag(plt.hist) pg.map_offdiag(sns.scatterplot, s=15, linewidth=0, alpha=0.7) def tweak(x, y, **kws): ax = plt.gca() if len(x) > 0: xmax = np.nanmax(x) xtop = ax.get_xlim()[-1] if xmax > xtop: ax.set_xlim([-1, xmax + 1])
'pelvic incidence', 'pelvic tilt', 'lumbar lordosis angle', 'sacral slope', 'pelvic radius', 'grade of spondylolisthesis', 'class' ] df['class'] = df['class'].map({'AB': 1, 'NO': 0}) #Abnormal=1, Normal=0 df.head(10) #Displaying a part of the data # # (b) Pre-Processing and Exploratory Data Analysis # ## i. Scatterplots of the independent variables sns.set(font_scale=2) g = sns.PairGrid(df, height=5, hue="class", hue_kws={"marker": ["o", "+"]}, palette=["#FF0000", "#0C41CE"], vars=[ 'pelvic incidence', 'pelvic tilt', 'lumbar lordosis angle', 'sacral slope', 'pelvic radius', 'grade of spondylolisthesis' ]) g = g.map(plt.scatter) g = g.add_legend() # ## ii. Boxplots for each of the independent variables plt.figure(figsize=(20, 20)) plt.subplots_adjust(wspace=1) for i in range(len(df.columns) - 1): plt.subplot(2, 3, i + 1) sns.boxplot(x='class', y=df.columns[i], data=df)
# NOTE(review): acts on the figure created before this excerpt.
plt.xticks(rotation=90)

# As an alternative of violin plot, box plot can be used
# box plots are also useful in terms of seeing outliers
# I do not visualize all features with box plot
# In order to show you lets have an example of box plot
# If you want, you can visualize other features as well.
plt.figure(figsize=(10, 10))
sns.boxplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)

# Regression joint plot. The kind is spelled "reg" ("regg" is rejected by
# current seaborn) and x/y are passed by keyword because they are
# keyword-only in recent releases.
sns.jointplot(x=x.loc[:, 'concavity_worst'],
              y=x.loc[:, 'concave points_worst'],
              kind="reg",
              color="#ce1414")

sns.set(style="white")
df = x.loc[:, ['radius_worst', 'perimeter_worst', 'area_worst']]
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)

sns.set(style="whitegrid", palette="muted")
data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())  # standardization
# Long-format frame of the first ten standardized features plus diagnosis
data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data = pd.melt(data,
               id_vars="diagnosis",
               var_name="features",
               value_name='value')
plt.figure(figsize=(10, 10))
tic = time.time()
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
mean, in this case it will also not ensure that data is C-contiguous which may cause a significant slowdown. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details. algorithm : “auto”, “full” or “elkan”, default=”auto” K-means algorithm to use. The classical EM-style algorithm is “full”. The “elkan” variation is more efficient by using the triangle inequality, but currently doesn’t support sparse data. “auto” chooses “elkan” for dense data and “full” for sparse data. """ g = sns.PairGrid( data_train, hue="CLASS_LABEL", palette=sns.color_palette("cubehelix", 3), vars=['AGE', 'CAPITAL_GAIN', 'CAPITAL_LOSS', 'HOURS_WORKED_PER_WEEK']) g.map(plt.scatter) plt.savefig('charts/income-pairgrid.png') model = KMeans( random_state=0, n_jobs=-1, ) # https://www.scikit-yb.org/en/latest/api/cluster/elbow.html visualizer = KElbowVisualizer(model, k=(1, 20)) visualizer.fit(X_train) # Fit the data to the visualizer # Finalize and render the figure
def chapter_3():
    """
    Run exercises 3.8 (simple) and 3.9 (multiple) linear regression on the
    Auto data set, read from "Auto.csv" in the working directory, printing
    the model summaries and showing diagnostic plots.

    Notes for Linear Regression
    - coefficients -> give average change in Y with a one-unit increase in X
    - confidence interval -> B1_hat +- 2 * SE(B1_hat)
        - 95% chance the interval contains true value of B
        - SE(B1_hat) -> var(e) / SSE
    - t-statistic
        - t = (B1_hat - 0)/ SE(B1_hat)
    - test for synergy (additive assumption)
        - effect of each predictor on response is independent of other
          predictors
        - include interaction term -> x1 * x2
        - if interaction term has small p value, then not additive
          (synergy exists)
        - if results in substantial increase in r2, then not additive
          (synergy exists)
    - relationship exists
        - p value < 0.0005 or < 0.0001
        - F statistic greater than 1
    - strength of relationship
        - RSE -> estimates standard deviation of response from regression
          line
        - R squared -> % variability in response explained by predictors
        - percent error -> 100 * residual_standard_error / ys.mean()
    - accuracy of prediction
        - prediction interval (individual response)
        - confidence interval (average response)
    - non-linearity
        - residual plots (fitted values vs. studentized/standardized
          residuals)
        - if residual plots are not random, transform with log(x), sqrt(x),
          or x2
    - correlation of error terms
        - will underestimate p value and narrow confidence/prediction
          intervals
    - heteroscedasticity (funnel shape of residual plot)
        - non-constant variances in the errors
        - if exists, transform the response with log(y) or sqrt(y)
    - co-linearity of features
        - (VIF) variance inflation factor -> 1 / (1 - r2)
        - correlation matrix
        - reduces t-statistic and increases standard error
    - outliers
        - leverage -> high impact on RSE and/or regression line
        - look at studentized residuals (observations > 3 are outliers)
        - influence (leverage) plots
    """

    def _legend_if_labelled(axes):
        # Only draw a legend where at least one artist carries a label.
        for _ax in axes:
            if _ax.get_legend_handles_labels()[0]:
                _ax.legend(loc="best")

    # 3.8 -> Simple Linear Regression on Auto data set
    dat = pd.read_csv("Auto.csv")
    dat = dat.replace("?", np.nan).dropna()
    # add constant to x values to ensure mean of residuals = 0
    xs = sm.add_constant(dat["horsepower"].astype(float))
    ys = dat["mpg"].astype(float)
    model = sm.OLS(ys, xs).fit()
    intercept, slope = model.params
    r2 = model.rsquared
    f_stat = model.fvalue
    # positional access: pvalues is a Series labelled by parameter name
    p_value = model.pvalues.iloc[1]
    # create new line with the coefficients
    fit = [slope * x + intercept for x in xs["horsepower"]]
    print("Simple OLS: %s" % model.summary())
    prediction = model.predict()
    residuals = ys.astype(float) - prediction

    # Plot
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)
    ax.scatter(xs["horsepower"], ys,
               label="r2=%f; f=%f; p=%f" % (r2, f_stat, p_value))
    ax.plot(xs["horsepower"], fit, color="r",
            label="f(x) = %f * x + %f" % (slope, intercept))
    # plot fitted values vs residuals to check for non-linearity
    ax2.scatter(model.fittedvalues, residuals, color="r")
    ax2.axhline(0, color="k")
    ax2.set_xlabel("fitted values")
    ax2.set_ylabel("residuals")
    # show leverage to identity observations that may have
    # more effect on the regression than other observations
    sm.graphics.influence_plot(model, ax=ax3)
    # show fitted values vs studentized residuals
    outlier_influence = outliers_influence.OLSInfluence(model).summary_frame()
    ax4.scatter(model.fittedvalues, outlier_influence["student_resid"])
    ax4.axhline(0, color="k")
    ax4.set_xlabel("fitted values")
    ax4.set_ylabel("studentized residuals")
    _legend_if_labelled([ax, ax2, ax3, ax4])
    plt.show()

    # 3.9 -> Multiple Linear Regression on Auto data set
    xs = dat[[
        "cylinders", "displacement", "horsepower", "weight", "acceleration",
        "year", "origin"
    ]].astype(float)
    # plot correlation matrix to check co-linearity
    # co-linearity reduces the t-statistic (power) of the test
    # and also increases standard error
    print("Correlations: %s" % xs.corr())
    grid = sns.PairGrid(xs)
    grid = grid.map(plt.scatter)
    plt.show()

    # `pd.ols` no longer exists in pandas; fit with statsmodels instead and
    # add the intercept explicitly (pd.ols added one by default).
    exog = sm.add_constant(xs)
    model = sm.OLS(ys, exog).fit()
    print("Multiple OLS: %s" % model.summary())

    # compute variance inflation factor (VIF) to check for co-linearity:
    # each feature is regressed on the others (column 0 is the constant and
    # is skipped). min(VIF) = 1.0; VIF > 5 or 10 suggests correlated features.
    vif = [
        outliers_influence.variance_inflation_factor(exog.values, k)
        for k in range(1, exog.shape[1])
    ]
    print("VIFs: %s" % vif)

    # Looking at the p-values associated with each predictor's t-statistic,
    # we see that displacement, weight, year, and origin have a statistically
    # significant relationship, while cylinders, horsepower, and acceleration
    # do not.
    print("Coefficients: %s" % model.params)

    # The regression coefficient for year, 0.7508, suggests that for every
    # one year, mpg increases by the coefficient. In other words, cars become
    # more fuel efficient every year by almost 1 mpg / year.
    residuals = model.resid
    # NOTE(review): this scales by the residual range, not by the standard
    # deviation, so it is a range normalisation rather than a z-score.
    standardized_residuals = (residuals - residuals.mean()) / \
        (residuals.max() - residuals.min())
    # root mean squared error of the residuals (uses residual dof)
    residual_standard_error = np.sqrt(model.mse_resid)
    percent_error = 100 * residual_standard_error / ys.mean()

    # Plot
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)  # kept to preserve the 2x2 layout; left empty
    ax.scatter(model.fittedvalues, residuals)
    ax.axhline(0, color="k")
    ax.set_xlabel("y fitted values")
    ax.set_ylabel("residuals")
    ax2.scatter(model.fittedvalues, standardized_residuals,
                label='percent error=%f' % percent_error)
    ax2.axhline(0, color="k")
    ax2.set_xlabel("y fitted values")
    ax2.set_ylabel("standardized residuals")
    sm.graphics.influence_plot(model, ax=ax3)
    _legend_if_labelled([ax, ax2, ax3, ax4])
    plt.show()
# NOTE(review): titles the plot created before this excerpt.
plt.title("Boxplots for dependents vs card")

# scatterplot = a plot that shows the data as a collection of points;
# the position of a point depends on its two-dimensional value
sns.scatterplot(x="age", y="reports", hue="card", data=data)
plt.title("Scatter plot between reports and age on basis of card")

# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, and the keyword form works on older releases too.
sns.lmplot(x='age', y='income', data=data, hue='card', fit_reg=True)
plt.title("scatter plot with regresson line for income and age")

g = sns.PairGrid(data, hue="card")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
plt.legend()

#### Organising Data
# replacing the values of the column card (yes,no) with (credit_yes,credit_no)
# replacing the values of the column owner (yes,no) with (owner_yes,owner_no)
# replacing the values of the column selfemp (yes,no) with (self_emp_yes,self_emp_no)
# Assign the result back to the column: `data.col.replace(..., inplace=True)`
# acts on an intermediate Series and is not guaranteed to update `data`.
data["card"] = data["card"].replace(['yes', 'no'],
                                    ['credit_yes', 'credit_no'])
data["owner"] = data["owner"].replace(['yes', 'no'],
                                      ['owner_yes', 'owner_no'])
data["selfemp"] = data["selfemp"].replace(['yes', 'no'],
                                          ['self_emp_yes', 'self_emp_no'])
data.head()
# NOTE(review): finalises and saves the figure built before this excerpt.
plt.tight_layout()
plt.savefig(os.path.join(WD_CLUST, "cor_rois_raw.pdf"))

# Bare string below: a pasted correlation table kept for reference only.
""" xbeta_pos xbeta_neg xavg_pos xavg_neg xavg xbeta_pos 1.000000 0.438476 0.933818 -0.401512 0.285163 xbeta_neg 0.438476 1.000000 0.477728 -0.976621 -0.301655 xavg_pos 0.933818 0.477728 1.000000 -0.442388 0.353100 xavg_neg -0.401512 -0.976621 -0.442388 1.000000 0.342208 xavg 0.285163 -0.301655 0.353100 0.342208 1.000000 """

# Attach the DX labels so they can be used as the hue variable.
scores["DX"] = y_sczCo
#plt.scatter(scores[], scores[], color=scores["y"])
# NOTE(review): PairGrid creates its own figure, so this 20x20 figure
# probably stays empty and does not size the grid - confirm.
fig = plt.figure(figsize=(20, 20))
g = sns.PairGrid(scores, hue="DX")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
# pyplot-level call: affects the current axes only.
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(WD_CLUST, "rois_scatterplot.pdf"))

from sklearn.decomposition import PCA
# PCA with scikit-learn, fitted on columns 0 and 2 of Xscores
pca = PCA(n_components=2)
pca.fit(Xscores[:, [0, 2]])
print(pca.explained_variance_ratio_)
# Store both component scores as new columns.
scores["pc1_xbeta_pos_neg"] = pca.transform(Xscores[:, [0, 2]])[:, 0]
scores["pc2_xbeta_pos_neg"] = pca.transform(Xscores[:, [0, 2]])[:, 1]
#pairs plot of Winrate plot_data = features[['Vpip%', 'Winrate', 'WMSD_WSF%', 'Pfr/Vpip']] plot_data = plot_data.dropna() def corr_func(x, y, **kwargs): r = np.corrcoef(x, y)[0][1] ax = plt.gca() ax.annotate("r = {:.2f}".format(r), xy = (.2, .8), xycoords = ax.transAxes, size = 20) grid = sns.PairGrid(data = plot_data, height = 3) # Upper is a scatter plot grid.map_upper(plt.scatter, color = 'red', alpha = 0.6) # Diagonal is a histogram grid.map_diag(plt.hist, color = 'red', edgecolor = 'black') # Bottom is correlation and density plot grid.map_lower(corr_func); grid.map_lower(sns.kdeplot, cmap = plt.cm.Reds) # Title for entire plot plt.suptitle('Pairs Plot of Energy Data', size = 36, y = 1.02);
sns.factorplot(x='Pclass', hue='Survived', col='Sex', data=data_train, kind='count') # In[14]: sns.pointplot(x='Pclass', y='Survived', hue='Sex', data=data_train) # If we look at each of these categorical variable independently, we observe the following. # In[15]: g = sns.PairGrid(data=data_train, y_vars="Survived", x_vars=["Pclass", "Sex", "SibSp", "Parch"]) g.map(sns.pointplot) # `Pclass` and `Sex` factors seem to be good predictors of the survival as such. In addition, there is an interaction between the two as seen on the factorplot above. # # `SibSp` and `Parch` factors show something different: level `0` seems to exhibit a different behaviour as if there were a confusion with another effect. We can propose the following hypothesis for later data enrichment. # * 0 means that a traveler has no relatives onboard the ship. These people apparently tended to behave differently from people traveling with relatives, which was not necessarily supporting their best interests. # * non-0 values means that people traveled with relatives. These people tended to behave differently from lonely travelers which marks a possible group behaviour. It seems like the bigger the family group is, the lesser the odds of survival are. Smaller families groups of (2 to 3 people) tends to have better survival chances than the average, whereas bigger groups (more than 3 people) tends to on the contrary have lower chances of survival. # Let's conclude this categorical features exploration with `Embarked`. There are 3 possible values, respectively `S` (for Southampton, England), `C` (for Cherbourg, Normandy) and `Q` (for Queenstown now Cobh, Ireland). There are also 2 missing values. # In[16]: sns.factorplot(x="Embarked", hue="Survived",
order_days = tips.day.value_counts().index
print(order_days)
# Explicit weekday order for the facet columns
order_days = Categorical(["Thur", "Fri", "Sat", "Sun"])
g = sns.FacetGrid(tips, col="day", col_order=order_days)
g.map(sns.boxplot, "total_bill")
plt.show()
plt.close()

pal = dict(Lunch="seagreen", Dinner="gray")
# `height` is the current name of the facet-size keyword (formerly `size`).
g = sns.FacetGrid(tips, hue="time", col="sex", row="smoker", palette=pal,
                  height=5, hue_kws={"marker": ["<", ">"]})
g.map(plt.scatter, "total_bill", "tip", s=50, alpha=0.7, linewidth=0.6,
      edgecolor="white")
g.add_legend()
g.fig.subplots_adjust(wspace=0.2, hspace=0.2)  # spacing between subplots
#g.set_axis_labels("1","2")
#g.set(xticks=[10,30,50])
plt.show()

# `vars` selects the variables to compare
g = sns.PairGrid(tips, hue="smoker", vars=["total_bill", "tip"],
                 palette="GnBu_d")
g.map_diag(plt.hist)  # diagonal
g.map_offdiag(plt.scatter)  # off-diagonal
g.add_legend()
#g.map(plt.scatter)
plt.show()

# pivot arguments are keyword-only in pandas >= 2.0
flights = flight.pivot(index="month", columns="year", values="passengers")
# linewidths is numeric; it was previously passed as the string "0.5"
ax = sns.heatmap(flights, annot=True, fmt="d", linewidths=0.5, cmap="YlGnBu")
plt.show()

# Correlate numeric columns only: the frame also holds categorical columns,
# which recent pandas no longer drops silently in `corr()`.
tip_corr = tips.select_dtypes(include="number").corr()
sns.heatmap(tip_corr, annot=True, fmt="f", linewidths=0.5, cmap="YlGnBu")
plt.show()
legend=True, linestyle='--', marker='o') closing_dataframe = DataReader(['AAPL', 'GOOG', 'MSFT', 'AMZN'], 'yahoo', startTime, endTime)['Adj Close'] closing_dataframe.head() stock_returns = closing_dataframe.pct_change() sns.jointplot('AMZN', 'GOOG', stock_returns, kind='scatter') sns.pairplot(stock_returns.dropna()) fig = sns.PairGrid(closing_dataframe) fig.map_upper(plt.scatter, color='purple') fig.map_lower(sns.kdeplot, cmap='cool_d') fig.map_diag(plt.hist, bins=30) fig = sns.PairGrid(stock_returns.dropna()) fig.map_upper(plt.scatter, color='blue') fig.map_lower(sns.kdeplot, cmap='cool_d') fig.map_diag(plt.hist, bins=50) #plotting the correlation table using heatmap for analysis corr = stock_returns.corr() sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
sns.pairplot(corr[[ 'ground_floor_type_467b', 'roof_type_67f9', 'foundation_type_467b', 'has_superstructure_mud_mortar_stone', 'foundation_type_337f', 'damage_grade' ]], hue="damage_grade", diag_kind='kde', markers='+') corr.dtypes g = sns.PairGrid(corr, x_vars=[ 'ground_floor_type_467b', 'roof_type_67f9', 'foundation_type_467b', 'has_superstructure_mud_mortar_stone', 'foundation_type_337f', 'damage_grade' ], y_vars='damage_grade', aspect=.75, size=3.5) g.map(sns.swarmplot, palette="pastel") df #Some sata transformation df['superstructure'] = df[[ 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
] else: df.loc[i] = [ name, difficulty, horizon, channels, accetability, accetabilityPercent, accetabilePercent, loss, lossPercent, time, iterations ] df[columns[1:4]] = df[columns[1:4]].astype(str) df[columns[4:]] = df[columns[4:]].apply(pd.to_numeric) # df = df.apply(pd.to_numeric) df.sort_values(by=['horizon', 'name'], inplace=True, ascending=[False, True]) outliers.sort_values(by=['horizon', 'name'], inplace=True, ascending=[False, True]) df.to_csv(directory + 'plottedSolutionsData.csv', index=None, header=True) outliers[['loss [x1000]', 'loss % improvement', 'iterations [x100 000]']] = outliers[[ 'loss [x1000]', 'loss % improvement', 'iterations [x100 000]' ]].round(3) outliers.to_csv(directory + 'outliersSolutionsData.csv', index=None, header=True) print(df) g = sns.PairGrid(df, x_vars=columns[1:4], y_vars=columns[4:]) g = g.map(sns.barplot) plt.subplots_adjust(top=0.95, bottom=0.05) g.fig.suptitle("Simulated annealing results categorized by the instances") # plt.tight_layout() plt.show()
#print range, mean, and std print(a.loc[['mean', 'std', 'range']]) # part d df2 = df.drop(index=df.index[10:85]) b = df2.describe() b.loc['range'] = b.loc['max'] - b.loc['min'] print('New mean, std, and range:') print(b.loc[['mean', 'std', 'range']]) # part e g = sns.PairGrid(df, height=1.0) g.map_upper(plt.scatter, s=3) g.map_diag(plt.hist) g.map_lower(plt.scatter, s=3) plt.show() ''' Acceleration seems normally distributed, weight and horsepower have a strong linear relationship mpg has a strong non-linear relationship with displacement, weight, and horsepower ''' # part f ''' It apears that weight, displacement, and horsepower all have a strong non-linear negative relationship with with mpg, also year apears to have a positive relationship with mpg All if these variables could be good predictors of mpg