def make_plot(X_train, y_train, X, y, test_data, model, model_name, features, response):
    """Fit ``model`` and save three diagnostic figures for it.

    Figures are written relative to the current working directory, with
    ``model_name`` as the file-name prefix:

    1. A 2x2 panel for the fifth column of ``X``: regression scatter, box
       plot, squared-residual plot and, for ``'linear'`` / ``'logistic'``
       models, an interaction plot of the fourth and fifth columns.
    2. A correlation matrix of ``test_data``.
    3. A coefficient plot for the formula ``diagnosis ~ <features>``.

    Args:
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training response passed to ``model.fit``.
        X: DataFrame; the columns at positions 3 and 4 are plotted.
        y: Response values plotted against the columns of ``X``.
        test_data: Data used for the regression, correlation and
            coefficient plots.
        model: Object exposing ``fit`` and ``predict``.
        model_name: ``'linear'`` and ``'logistic'`` select an interaction
            plot; any other value leaves the fourth panel empty.
        features: Column names joined with ``+`` to form the right-hand
            side of the coefficient-plot formula.
        response: Unused.  NOTE(review): the formula hard-codes
            ``diagnosis`` as the response; confirm whether this argument
            was meant to replace it.

    Note:
        ``sns.interactplot``, ``sns.corrplot`` and ``sns.coefplot`` were
        removed from later seaborn releases, so this function needs an old
        seaborn version.
    """
    feature = X.columns
    main = feature[4]
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=False)
    sns.regplot(X[main], y, test_data, ax=ax1)
    sns.boxplot(X[main], y, color="Blues_r", ax=ax2)
    model.fit(X_train, y_train)
    sns.residplot(X[main], (model.predict(X) - y) ** 2,
                  color="indianred", lowess=True, ax=ax3)

    # Compare by value: identity comparison against a string literal only
    # works when the interpreter happens to intern both strings.
    if model_name == 'linear':
        sns.interactplot(X[feature[3]], X[main], y, ax=ax4, filled=True,
                         scatter_kws={"color": "dimgray"},
                         contour_kws={"alpha": .5})
    elif model_name == 'logistic':
        pal = sns.blend_palette(["#4169E1", "#DFAAEF", "#E16941"], as_cmap=True)
        levels = np.linspace(0, 1, 11)
        # Draw explicitly into the fourth panel rather than relying on it
        # being the current axes.
        sns.interactplot(X[feature[3]], X[main], y, ax=ax4, levels=levels,
                         cmap=pal, logistic=True)

    ax1.set_title('Regression')
    ax2.set_title(main + ' Value')
    ax3.set_title(main + ' Residuals')
    ax4.set_title('Two-value Interaction')
    f.tight_layout()
    plt.savefig(model_name + '_' + main, bbox_inches='tight')

    # Multi-variable correlation significance level
    f, ax = plt.subplots(figsize=(10, 10))
    cmap = sns.blend_palette(
        ["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"],
        as_cmap=True)
    sns.corrplot(test_data, annot=False, diag_names=False, cmap=cmap)
    ax.grid(False)
    ax.set_title('Multi-variable correlation significance level')
    plt.savefig(model_name + '_multi-variable_correlation', bbox_inches='tight')

    # Complete coefficient plot - believed to be meaningful only for
    # linear regression.
    sns.coefplot("diagnosis ~ " + ' + '.join(features), test_data, intercept=True)
    plt.xticks(rotation='vertical')
    plt.savefig(model_name + '_coefficient_effects', bbox_inches='tight')
def volcano_plot(tissue_array, type=None, out=None):
    """Generate a tissue-specific volcano plot and save it as a PDF.

    The x axis is the log2 fold change (mean of the last two columns over
    the mean of the first three columns) and the y axis is -log10 of the
    p-value from an independent two-sample t-test (``scipy.stats.ttest_ind``)
    between the last two and the first two columns.  Rows containing any
    zero, and points with infinite coordinates, are dropped.

    NOTE(review): the fold change averages the first *three* columns while
    the t-test uses the first *two*; confirm which group size is intended.

    Args:
        tissue_array (array): Tissue specific 2-D array.
        type (str): Tissue the data set was derived from; used in the plot
            title and the output file name.
        out (str): Suffix of the output file, which is named
            ``'<type>-<out>'``.  Required.

    Returns:
        Nothing. Saves the volcano plot for the given array.

    Raises:
        TypeError: If ``out`` is not supplied.
    """
    # `out` carries a default only because it follows the defaulted `type`
    # parameter; it is still mandatory.
    if out is None:
        raise TypeError("volcano_plot() missing required argument: 'out'")

    plt.clf()
    sns.set(font_scale=1.4)
    from scipy.stats import ttest_ind

    filt = tissue_array[np.all(tissue_array != 0, axis=1)]
    x = log2(filt[:, -2:].mean(axis=1)) - log2(filt[:, :3].mean(axis=1))
    # Element 1 of the t-test result is the p-value.
    y = -log10(ttest_ind(filt[:, -2:], filt[:, :2], axis=1)[1])
    xy = column_stack((x, y))
    xy = xy[~isinf(xy).any(axis=1)]

    sns.regplot(x=xy[:, 0], y=xy[:, 1], fit_reg=False, color='k',
                scatter_kws={'alpha': 0.9, 's': 2.0, 'rasterized': False,
                             'zorder': 1}).set_ylim(0)

    # Differentially expressed points: |log2 fold change| > 1 and
    # -log10(p) > 2.
    de = xy[abs(xy[:, 0]) > 1, :]
    de = de[de[:, 1] > 2, :]
    up = sum(de[:, 0] > 0)
    down = sum(de[:, 0] < 0)
    sns.regplot(x=de[:, 0], y=de[:, 1], fit_reg=False, color='r',
                scatter_kws={'alpha': 0.9, 's': 2.5, 'rasterized': False,
                             'zorder': 1})

    plt.axhline(y=2.0, linewidth=.8, color='red', linestyle='dashed')
    plt.axvline(x=1.0, linewidth=.8, color='red', linestyle='dashed')
    plt.axvline(x=-1.0, linewidth=.8, color='red', linestyle='dashed')
    plt.xlabel(r'$\log_2$(KO/WT)')
    plt.ylabel(r'-$\log_{10}$ p-value')
    plt.suptitle('%s: Downregulated genes: %s Upregulated genes: %s '
                 % (type, down, up))
    # Format both parts together; `'%s-' + out % type` applied `%` to `out`
    # alone because `%` binds tighter than `+`.
    plt.savefig('%s-%s' % (type, out), format='pdf')
def moran_plot(IM):
    """Show a scatter plot of a variable against its spatial lag.

    Both ``IM.y`` and its spatial lag (``pysal.lag_spatial(IM.w, IM.y)``)
    are passed through ``normalize`` (defined elsewhere in this module)
    before plotting.  A regression line is drawn in red, zero lines in
    gray, and both axes share the same symmetric limits.

    Args:
        IM: Object exposing ``y`` (the values) and ``w`` (the spatial
            weights handed to ``pysal.lag_spatial``).
    """
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import numpy as np
    import pysal as ps

    y_norm = normalize(IM.y)
    y_lag = ps.lag_spatial(IM.w, IM.y)
    y_lag_norm = normalize(y_lag)
    dados = pd.DataFrame({'y': IM.y,
                          'y_norm': y_norm,
                          'y_lag': y_lag,
                          'y_lag_norm': y_lag_norm})

    f, ax = plt.subplots(1, figsize=(7, 5))
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(x='y_norm', y='y_lag_norm', data=dados, ci=None,
                color='black', line_kws={'color': 'red'}, ax=ax)
    plt.axvline(0, c='gray', alpha=0.7)
    plt.axhline(0, c='gray', alpha=0.7)

    # Largest absolute value on either axis, so the plot is centred on 0.
    limits = np.array([y_norm.min(), y_norm.max(),
                       y_lag_norm.min(), y_lag_norm.max()])
    limits = np.abs(limits).max()
    border = 0.02
    ax.set_xlim(-limits - border, limits + border)
    ax.set_ylim(-limits - border, limits + border)
    plt.show()
def aligned_residuals(pca):
    """Plot error components of a rotated PCA data set in two cross-sections.

    The rotated data ``pca.rotated()`` is drawn as two stacked regression
    scatter plots sharing the x axis: column 2 against column 0 (long
    cross-section) and column 2 against column 1 (short cross-section).
    All axes spines are hidden.

    Args:
        pca: Object whose ``rotated()`` method returns a 2-D array with at
            least three columns.

    Returns:
        The figure containing both panels.
    """
    A = pca.rotated()
    fig, axes = P.subplots(2, 1, sharex=True, frameon=False)
    fig.subplots_adjust(hspace=0, wspace=0.1)

    titles = (
        "Long cross-section (axis 3 vs. axis 1)",
        "Short cross-section (axis 3 vs. axis 2)")

    for title, ax, (a, b) in zip(titles, axes, [(0, 2), (1, 2)]):
        # x/y must be keywords: newer seaborn accepts only `data`
        # positionally.
        seaborn.regplot(x=A[:, a], y=A[:, b], ax=ax)
        ax.text(0, 1, title, verticalalignment='top',
                transform=ax.transAxes)
        ax.autoscale(tight=True)
        # `.values()` works on Python 2 and 3; `.itervalues()` is
        # Python 2 only.
        for spine in ax.spines.values():
            spine.set_visible(False)
    # Only the bottom panel carries the x label; the x axis is shared.
    ax.set_xlabel("Meters")
    return fig
def explore(wine_set):
    """Relate wine density to residual sugar within three quality tiers.

    The wines are split into `low` (quality <= 5), `medium` (quality 6 or
    7) and `high` (quality > 7).  The Pearson correlation between
    ``density`` and ``residual_sugar`` is printed for every tier first,
    then one regression scatter plot per tier is shown.

    Args:
        wine_set: DataFrame with ``quality``, ``density`` and
            ``residual_sugar`` columns.
    """
    quality = wine_set['quality']
    # (subset, text printed before the correlation, plot title)
    tiers = (
        (wine_set[quality <= 5],
         'association between wine`s density and residual sugar for wines \nof `low` quality',
         "Association between wine's density and residual sugar for wines of `low` quality"),
        (wine_set[(quality == 6) | (quality == 7)],
         '\nof `medium` quality',
         "Association between wine's density and residual sugar for wines of `medium` quality"),
        (wine_set[quality > 7],
         '\nof `high` quality',
         "Association between wine's density and residual sugar for wines of `high` quality"),
    )

    for subset, banner, _ in tiers:
        print(banner)
        print(scipy.stats.pearsonr(subset['density'], subset["residual_sugar"]))

    for subset, _, heading in tiers:
        seaborn.regplot(data=subset, x="density", y="residual_sugar", fit_reg=True)
        plt.xlabel("Density of wine")
        plt.ylabel("Residual sugar in wine, gram")
        plt.title(heading)
        plt.show()
def plot_building_temp():
    """Scatter gas EUI against mean temperature for one building/station.

    Reads every ``csv_FY/testWeather/AZ0000FF*.csv`` file under the current
    working directory, joins the ``eui_gas`` readings with the ``KTUS``
    column of ``csv_FY/weather/weatherData_meanTemp.csv`` on year and
    month, writes the joined table to ``csv_FY/testWeather/test_temp.csv``
    and saves two scatter plots under ``csv_FY/testWeather/plot/``: one
    faceted by year and one restricted to years strictly between 2012 and
    2015.
    """
    sns.set_context("paper", font_scale=1.5)
    b = "AZ0000FF"
    s = "KTUS"
    filelist = glob.glob(os.getcwd() + "/csv_FY/testWeather/{0}*.csv".format(b))
    dfs = [pd.read_csv(csv) for csv in filelist]
    col = "eui_gas"
    dfs2 = [df[[col, "month", "year"]] for df in dfs]
    df3 = pd.concat(dfs2)

    temp = pd.read_csv(os.getcwd() + "/csv_FY/weather/weatherData_meanTemp.csv")
    # The first (unnamed) column is sliced as text: characters 0-3 give the
    # year and 5-6 the month (presumably "YYYY-MM..." stamps; verify).
    temp["year"] = temp["Unnamed: 0"].map(lambda x: float(x[:4]))
    temp["month"] = temp["Unnamed: 0"].map(lambda x: float(x[5:7]))
    temp.set_index(pd.DatetimeIndex(temp["Unnamed: 0"]), inplace=True)
    temp = temp[[s, "month", "year"]]

    joint2 = pd.merge(df3, temp, on=["year", "month"], how="inner")
    joint2.to_csv(os.getcwd() + "/csv_FY/testWeather/test_temp.csv", index=False)

    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.lmplot(x=s, y=col, data=joint2, col="year", fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + "/csv_FY/testWeather/plot/scatter_temp_byyear.png", dpi=150)
    plt.close()

    joint2 = joint2[(2012 < joint2["year"]) & (joint2["year"] < 2015)]
    sns.regplot(x=s, y=col, data=joint2, fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + "/csv_FY/testWeather/plot/scatter_temp_1314.png", dpi=150)
    plt.close()
def plot_energy_temp(df_energy, df_temp, theme, b, s):
    """Save a temperature-vs-energy scatter plot for one building/station.

    Args:
        df_energy: Data indexed so that ``df_energy[theme]`` yields the
            energy values.
        df_temp: Data indexed so that ``df_temp[s]`` yields the
            temperature values.
        theme: Key into ``df_energy``; also the output sub-directory and
            part of the title.
        b: Label used in the file name and title.
        s: Key into ``df_temp``; also used in the file name and title.

    The figure is written to
    ``<cwd>/plot_FY_weather/<theme>/<b>_<s>.png`` and then closed.
    """
    df = pd.DataFrame({'energy': df_energy[theme], 'temp': df_temp[s]})
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(x='temp', y='energy', data=df, fit_reg=False)
    # The title has to be set before saving, otherwise the written image
    # does not contain it.
    plt.title('Temperature-{0} plot: {1}, {2}'.format(theme, b, s))
    P.savefig(os.getcwd() + '/plot_FY_weather/{2}/{0}_{1}.png'.format(b, s, theme), dpi=150)
    plt.close()
    return
def plot_energy_temp(df_energy, df_temp, theme, b, s):
    """Save a temperature-vs-energy scatter plot for one building/station.

    Args:
        df_energy: Data indexed so that ``df_energy[theme]`` yields the
            energy values.
        df_temp: Data indexed so that ``df_temp[s]`` yields the
            temperature values.
        theme: Key into ``df_energy``; also the output sub-directory and
            part of the title.
        b: Label used in the file name and title.
        s: Key into ``df_temp``; also used in the file name and title.

    The figure is written to
    ``<cwd>/plot_FY_weather/<theme>/<b>_<s>.png`` and then closed.
    """
    df = pd.DataFrame({"energy": df_energy[theme], "temp": df_temp[s]})
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(x="temp", y="energy", data=df, fit_reg=False)
    # The title has to be set before saving, otherwise the written image
    # does not contain it.
    plt.title("Temperature-{0} plot: {1}, {2}".format(theme, b, s))
    P.savefig(os.getcwd() + "/plot_FY_weather/{2}/{0}_{1}.png".format(b, s, theme), dpi=150)
    plt.close()
    return
def seaborn_plot_rolling(df_name=df_default_name):
    """Show ``mean_mod_50`` against ``age`` with a fitted regression line.

    Args:
        df_name: Name handed to ``load_df`` to obtain the data frame.
    """
    frame = load_df(df_name)

    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.regplot(data=frame, x="age", y="mean_mod_50")
    plt.show()
def seaborn_plot(df_name=df_default_name):
    """Show ``mod_acquire`` against ``age`` with a fitted regression line.

    Args:
        df_name: Name handed to ``load_df`` to obtain the data frame.
    """
    frame = load_df(df_name)

    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.regplot(data=frame, x="age", y="mod_acquire")
    plt.show()
def plot_offense_vs_defense_spacing(spacing_data):
    """
    Plot offensive vs. defensive spacing for games and save the figure.

    Away offense is plotted against home defense (with a regression line)
    and home offense against away defense (points only), in the same color.

    Args:
        spacing_data (pd.DataFrame): Dataframe with columns of spacing data
            ['home_offense_areas', 'home_defense_areas',
            'away_offense_areas', 'away_defense_areas']

    Returns None
        The plot is written to ``temp/OffenseVsDefense.png`` and closed;
        it is not shown.
    """
    color = sns.color_palette()[0]
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(
        x=spacing_data.away_offense_areas,
        y=spacing_data.home_defense_areas,
        fit_reg=True,
        color=color,
        ci=None)
    sns.regplot(
        x=spacing_data.home_offense_areas,
        y=spacing_data.away_defense_areas,
        fit_reg=False,
        color=color,
        ci=None)
    plt.xlabel('Average Offensive Spacing (sq ft)', fontsize=16)
    plt.ylabel('Average Defensive Spacing (sq ft)', fontsize=16)
    plt.title('Offensive spacing robustly induces defensive spacing', fontsize=16)
    plt.savefig('temp/OffenseVsDefense.png')
    plt.close()
    return None
def report_model_results(input_data, fit_model, name, filename):
    """Print fit statistics for a fitted model and save a fit-vs-data plot.

    Prints the model order reported by ``r_forecast.arimaorder``, the
    intercept with its standard error (when available), the summed squared
    residuals, the squared error of a constant-mean baseline and the
    resulting R^2.  The first ``sum(order)`` observations are skipped
    throughout.  Finally, fitted values (data minus residuals) are plotted
    against the data and saved to ``filename``.

    Args:
        input_data: Array of observations the model was fitted to.
        fit_model: Fitted model object understood by ``r_forecast`` and
            ``r_stats``.
        name: Label used in the printed report.
        filename: Path the scatter plot is saved to.
    """
    order = r_forecast.arimaorder(fit_model)
    skip = sum(order)
    print(name + " ({},{},{})".format(*order))
    print()

    try:
        intercept = r_stats.coef(fit_model).rx2("intercept")[0]
        intercept_se = numpy.sqrt(
            r_stats.vcov(fit_model).rx2("intercept", "intercept")[0])
        print("Intercept:", intercept, "+-", intercept_se)
    except Exception:
        # Best effort: models without an intercept end up here.  The exact
        # exception type raised by the R bridge is not visible from this
        # code, so `Exception` is caught -- but no longer
        # KeyboardInterrupt/SystemExit, as the bare `except:` did.
        print("No intercept")
    print()

    residuals = numpy.array(r_stats.residuals(fit_model)[skip:])
    residuals_mask = ~numpy.isnan(residuals)
    fit_model_err = numpy.nansum(residuals ** 2)
    # Baseline: predict the overall mean, scored on the same observations
    # that have a non-NaN residual.
    baseline_err = numpy.sum(
        (numpy.nanmean(input_data) - input_data[skip:][residuals_mask]) ** 2)

    print(name + " model squared error: ", fit_model_err,
          "({} NA residuals)".format(numpy.count_nonzero(numpy.isnan(residuals))))
    print("No model squared error:" + " " * len(name), baseline_err)
    print("-> R^2:", 1 - fit_model_err / baseline_err)
    print()
    print()

    plt.figure()
    plt.axis('equal')
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    seaborn.regplot(x=input_data[skip:], y=input_data[skip:] - residuals)
    plt.savefig(filename)
    plt.close()
def plot_returns_cmp(self, only_show_returns=False, only_info=False):
    """Log capital-aware strategy metrics and visualise returns and capital.

    Logs the trade count, win rate, mean gain/loss expectation, win/loss
    ratio, period and annualised returns, buy-deal rate, cash utilisation
    and the number of trading days.  Unless cut short by the flags below,
    it then plots the cumulative returns, a regression of the cumulative
    returns over their position index, and the distribution of the capital
    balance.  No benchmark is involved.

    :param only_show_returns: stop after the cumulative-returns plot
    :param only_info: stop after logging, without plotting anything
    """
    log = self.log_func
    log('买入后卖出的交易数量:{}'.format(self.order_has_ret.shape[0]))
    log('胜率:{:.4f}%'.format(self.win_rate * 100))
    log('平均获利期望:{:.4f}%'.format(self.gains_mean * 100))
    log('平均亏损期望:{:.4f}%'.format(self.losses_mean * 100))
    log('盈亏比:{:.4f}'.format(self.win_loss_profit_rate))
    log('策略收益: {:.4f}%'.format(self.algorithm_period_returns * 100))
    log('策略年化收益: {:.4f}%'.format(self.algorithm_annualized_returns * 100))
    log('策略买入成交比例:{:.4f}%'.format(self.buy_deal_rate * 100))
    log('策略资金利用率比例:{:.4f}%'.format(self.cash_utilization * 100))
    log('策略共执行{}个交易日'.format(self.num_trading_days))

    if only_info:
        return

    self.algorithm_cum_returns.plot()
    plt.legend(['algorithm returns'], loc='best')
    plt.show()

    if only_show_returns:
        return

    # Regress cumulative returns on their position index.
    positions = np.arange(len(self.algorithm_cum_returns))
    sns.regplot(x=positions, y=self.algorithm_cum_returns.values)
    plt.show()

    capital_balance = self.capital.capital_pd['capital_blance']
    sns.distplot(capital_balance,
                 kde_kws={"lw": 3, "label": "capital blance kde"})
    plt.show()
def versus(data, x, y, xlabel="ratio", ylabels=("growth1", "growth2"), outfn="versus"):
    """Save a three-panel figure comparing two growth series.

    ``reconfigure_data(data, x, y)`` supplies a frame with ``x``, ``y1``
    and ``y2`` columns.  Panels one and two are regression plots of ``y1``
    and ``y2`` against ``x`` annotated via ``shared.add_stats``; panel
    three is drawn by ``swarm``.  The figure is written to
    ``ParA_inheritance/<outfn>.pdf``.

    Args:
        data: Input data handed to ``reconfigure_data``.
        x: Handed to ``reconfigure_data``.
        y: Handed to ``reconfigure_data``.
        xlabel: Unused.
        ylabels: Pair of labels passed to ``swarm``.
        outfn: Output file stem.
    """
    plt.figure(figsize=(12, 6))
    re_data = reconfigure_data(data, x, y)

    for position, column in ((131, "y1"), (132, "y2")):
        plt.subplot(position)
        sns.regplot(x="x", y=column, data=re_data)
        shared.add_stats(re_data, "x", column)
        sns.despine()

    ax = plt.subplot(133)
    # Raw string: the LaTeX macros contain backslash sequences that are
    # not valid Python escapes.
    swarm(ax, re_data, ylabels[0], ylabels[1], r"growth rate (\si{\per\hour})")

    plt.tight_layout()
    plt.savefig("ParA_inheritance/{0}.pdf".format(outfn))
def plot_building_temp():
    """Scatter gas EUI against mean temperature for one building/station.

    Reads every ``csv_FY/testWeather/AZ0000FF*.csv`` file under the current
    working directory, joins the ``eui_gas`` readings with the ``KTUS``
    column of ``csv_FY/weather/weatherData_meanTemp.csv`` on year and
    month, writes the joined table to ``csv_FY/testWeather/test_temp.csv``
    and saves two scatter plots under ``csv_FY/testWeather/plot/``: one
    faceted by year and one restricted to years strictly between 2012 and
    2015.
    """
    sns.set_context("paper", font_scale=1.5)
    b = 'AZ0000FF'
    s = 'KTUS'
    filelist = glob.glob(os.getcwd() + '/csv_FY/testWeather/{0}*.csv'.format(b))
    dfs = [pd.read_csv(csv) for csv in filelist]
    col = 'eui_gas'
    dfs2 = [df[[col, 'month', 'year']] for df in dfs]
    df3 = pd.concat(dfs2)

    temp = pd.read_csv(os.getcwd() + '/csv_FY/weather/weatherData_meanTemp.csv')
    # The first (unnamed) column is sliced as text: characters 0-3 give the
    # year and 5-6 the month (presumably "YYYY-MM..." stamps; verify).
    temp['year'] = temp['Unnamed: 0'].map(lambda x: float(x[:4]))
    temp['month'] = temp['Unnamed: 0'].map(lambda x: float(x[5:7]))
    temp.set_index(pd.DatetimeIndex(temp['Unnamed: 0']), inplace=True)
    temp = temp[[s, 'month', 'year']]

    joint2 = pd.merge(df3, temp, on=['year', 'month'], how='inner')
    joint2.to_csv(os.getcwd() + '/csv_FY/testWeather/test_temp.csv', index=False)

    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.lmplot(x=s, y=col, data=joint2, col='year', fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + '/csv_FY/testWeather/plot/scatter_temp_byyear.png', dpi=150)
    plt.close()

    joint2 = joint2[(2012 < joint2['year']) & (joint2['year'] < 2015)]
    sns.regplot(x=s, y=col, data=joint2, fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + '/csv_FY/testWeather/plot/scatter_temp_1314.png', dpi=150)
    plt.close()
def show_friction_line(self, sns_context="talk"):
    """
    Show the results of the dF/dN friction test.

    Calls ``self.mean_friction_frame()`` first, then draws ``F`` against
    ``N`` (binned regression plots) side by side for rows with
    ``direction == 0``, titled "(+)", and ``direction == 1``, titled "(-)".

    :param sns_context: seaborn plotting context name
    :return: None
    """
    self.mean_friction_frame()
    frame = self.mean_fric_frame
    # One bin per load index.
    n_bins = frame['load_index'].max() + 1
    panels = (
        ("(+)", frame[frame['direction'] == 0]),
        ("(-)", frame[frame['direction'] == 1]),
    )

    sns.set_context(sns_context)
    _, axes = plt.subplots(nrows=1, ncols=2)
    for ax, (label, subset) in zip(axes, panels):
        sns.regplot(data=subset, x='N', y='F', x_bins=n_bins, ax=ax)
        ax.set_title(label)
    plt.tight_layout()
    plt.show()
def view_correlations(corr_df, dftouse, y=False, filter=None, col_num=2):
    """Show one regression plot per variable listed in ``corr_df``.

    Args:
        corr_df: DataFrame whose index labels name columns of ``dftouse``;
            one subplot is reserved per row.
        dftouse: DataFrame holding the data to plot.
        y: Column name of the dependent variable.  When omitted, the first
            column of ``dftouse`` is used.
        filter: Optional value; only index labels ``x`` for which
            ``filter in x`` holds are plotted (their slot stays empty
            otherwise).
        col_num: Number of subplot columns.
    """
    if y is None or y is False:
        # Use the dependent variable from the first column of the data
        # frame.  Keep the column *name* -- it is used as a key below.
        y = dftouse.columns.values[0]

    # Ceiling division, so every variable gets an axes and the row count
    # stays an integer on Python 3.
    axes_num = len(corr_df)
    row_num = max(1, -(-axes_num // col_num))
    print(row_num, col_num)

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so `.ravel()` always works.
    f, axes = plt.subplots(row_num, col_num, figsize=(30, 30), squeeze=False)

    # Generate plots for each independent variable.
    for ax, (x, _corr) in zip(axes.ravel(), corr_df.iterrows()):
        if filter is None or filter in x:
            sns.regplot(x=dftouse[x], y=dftouse[y], ax=ax)
    plt.show()
def createMatrixInteract(self, event):
    """Ask for variables and show a matrix of interaction plots.

    A ``RegressDialog`` collects the response ``y`` and the explanatory
    variables ``xs``, plus a "Logistic Fit?" checkbox.  For an accepted
    dialog a ``len(xs)`` x ``len(xs)`` grid is drawn: the diagonal holds
    regression plots of each variable against ``y`` and the cells below
    the diagonal hold ``sns.interactplot`` panels for each variable pair.
    Cells above the diagonal stay empty.

    Args:
        event: Triggering event (unused).

    Note:
        ``sns.interactplot`` was removed from later seaborn releases.
    """
    dlg = RegressDialog(self.parent, "Matrix Interaction Plot")
    log = wx.CheckBox(dlg, label="Logistic Fit?")
    dlg.Add(log)
    if dlg.ShowModal() != wx.ID_OK:
        return

    y, xs = dlg.GetValue()
    logistic = log.GetValue()
    data = self.parent.data[list(xs) + [y]]
    df = data[list(xs)]
    # squeeze=False keeps `axes` two-dimensional when only one variable
    # was selected, so `axes[j, i]` is always valid.
    fig, axes = plt.subplots(nrows=len(xs), ncols=len(xs), squeeze=False)
    last_row = len(xs) - 1

    for i, l1 in enumerate(df):
        for j, l2 in enumerate(df):
            ax = axes[j, i]
            ax.grid(False)
            plt.subplot(ax)
            if i == j:
                # Would like to do a logistic plot, but it takes too long.
                sns.regplot(x=data[l1], y=data[y], ax=ax)
            elif i < j:
                sns.interactplot(l1, l2, y, data, ax=ax, logistic=logistic,
                                 cmap=settings["cmap"])
            # NOTE(review): y axes stay visible in the whole first row as
            # well as the first column; confirm this is intended.
            if i != 0 and j != 0:
                ax.yaxis.set_visible(False)
            if j != last_row:
                ax.xaxis.set_visible(False)
    plt.show()
def fig_compareposteriors(posterior_a, posterior_b):
    """Scatter paired posterior samples of two layouts against each other.

    Pairs where A exceeds B are drawn in ``cola`` and pairs where B
    exceeds A in ``colb`` (ties are not drawn).  The fraction of pairs in
    which B is larger is annotated as "B better", its complement as
    "A better".  The figure is saved to ``img/compareposteriors.png``.

    Args:
        posterior_a: Sequence of posterior samples for layout A.
        posterior_b: Sequence of posterior samples for layout B, paired
            element-wise with ``posterior_a``.
    """
    a = np.array(posterior_a)
    b = np.array(posterior_b)
    pbbetter = (b > a).mean()
    a_wins = a > b
    b_wins = b > a

    fig, ax = plt.subplots(figsize=(6.5, 6.5))
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    ax = sns.regplot(x=a[a_wins], y=b[a_wins], fit_reg=False,
                     color=cola, marker='.', ax=ax)
    ax = sns.regplot(x=a[b_wins], y=b[b_wins], fit_reg=False,
                     color=colb, marker='.', ax=ax)
    ax.plot(0.04, 0.05, color='#222222', marker='X')

    lim1, lim2 = 0, 0.12
    ax.set_xlim(lim1, lim2)
    ax.set_ylim(lim1, lim2)
    # Diagonal separating the "A better" and "B better" regions.
    ax.plot([lim1, lim2], [lim1, lim2], color=colb)
    ax.text(0.07, 0.10, 'B better', color=colb)
    ax.text(0.07, 0.095, '{:.1f}%'.format(100 * pbbetter), color=colb)
    ax.text(0.09, 0.07, 'A better', color=cola)
    ax.text(0.09, 0.065, '{:.1f}%'.format(100 - 100 * pbbetter), color=cola)
    ax.set_xlabel('Conversion fraction layout A')
    ax.set_ylabel('Conversion fraction layout B')
    fig.savefig('img/compareposteriors.png', bbox_inches='tight')
def fig_linplot(data, x, y, aly_title, fig_save=True):
    """ Plot correlations

    Draws a regression plot of ``y`` against ``x`` with both axes starting
    at zero, optionally saves it, then shows and closes it.

    Parameters
    ----------

    data : pd.DataFrame
    x,y : str
        X and Y axis for the plot, valid column names of data
    aly_title : str
        Prefix of the plot title; ``'for <x> vs <y>'`` is appended
        directly, without a separating space.
    fig_save : bool, optional
        False if the figure should not be saved

    """
    ff = file_folder_specs()
    title = aly_title + 'for {} vs {}'.format(x, y)
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(x=x, y=y, data=data)
    plt.title(title)
    plt.xlim(0)
    plt.ylim(0)
    if fig_save:
        _save_fig(title, ff['fig'])
    plt.show()
    plt.close()
def timePlotLine(data):
    """Show one value-over-time regression plot per gene.

    Prompts on standard input for whether to normalise the y axis and for
    the y-axis label.  The data is pivoted to ``Sample`` rows by
    (``Gene``, ``Time``) columns; for each gene, time points with a
    missing sample are dropped and, if requested, values are divided by
    the gene's maximum.

    Args:
        data: DataFrame with ``Gene``, ``Time``, ``Sample`` and ``Values``
            columns.
    """
    normalize = input("Would you like to normalize the y-axis? (y/n): ")

    # Unique gene names in order of first appearance, collected without a
    # row-by-row Python loop.
    geneList = list(dict.fromkeys(data['Gene']))

    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Time'])
    ylabel = input("What should the y-axis label be?: ")

    for counter, key in enumerate(geneList, 1):
        plt.figure(counter)
        # Rows become time points, columns samples; drop samples with any
        # missing time point.
        tempTable = data[key].T.dropna(axis=1, how='any')
        if normalize == 'y':
            tempTable = tempTable / np.amax(tempTable.values)
        tempTable['Time'] = tempTable.index
        tempTable = pd.melt(tempTable, id_vars='Time')[['Time', 'value']]
        sns.regplot(x='Time', y='value', data=tempTable, scatter=True)
        plt.title(key)
        plt.ylabel(ylabel)
        plt.xlabel('Time(min)')
    plt.show()
def plot_against_y(self, function=None, y_margin=0.1, lim=10, context="talk"):
    """Plot the target ``self.y`` against each variable of interest.

    The variables are classified by ``cat_cont_time`` into categorical,
    continuous and time columns.  Each gets one panel in a single row
    sharing the y axis: a regression plot for continuous columns, a violin
    plot for categorical ones and a line plot otherwise.

    Args:
        function: Unused.
        y_margin: Fraction of the target's range added below and above
            the y limits.
        lim: Maximum number of columns plotted.
        context: seaborn plotting context name.

    Returns:
        The figure holding all panels.
    """
    cat, cont, time = cat_cont_time(self.df[self.vars_of_interest])
    cols = (cat + cont + time)[:lim]

    sns.set_context(context)
    # squeeze=False keeps `axs` an array even when only one column is
    # plotted, so `.flat` always works.
    fig, axs = plt.subplots(nrows=1, ncols=len(cols), sharey=True,
                            squeeze=False)
    for ax, col in zip(axs.flat, cols):
        if col in cont:
            sns.regplot(x=col, y=self.y, data=self.df, ax=ax)
        elif col in cat:
            sns.violinplot(x=col, y=self.y, data=self.df, ax=ax)
        else:
            # Plot timeseries.  Select the two columns by indexing (the
            # frame is not callable) and draw into this panel instead of
            # a new figure.
            self.df[[self.y, col]].plot(ax=ax)

    y_min, y_max = self.df[self.y].min(), self.df[self.y].max()
    y_range = y_max - y_min
    plt.ylim(y_min - y_margin * y_range, y_max + y_margin * y_range)
    return fig
def plot_crbl_fit(model_df, rbh_df, hits_df, model_plot_fn, show=False, figsize=(10,10)):
    """Plot query hits together with the CRBL fit.

    A random sample of at most 10000 rows of ``hits_df`` is drawn as a
    regression plot of ``E_s`` against ``s_aln_len``; the model
    (``fit`` against ``center``) is overlaid as jittered points.

    Args:
        model_df: DataFrame with ``center`` and ``fit`` columns.
        rbh_df: Unused.
        hits_df: DataFrame with ``s_aln_len``, ``E_s`` and ``E`` columns.
        model_plot_fn: Passed to ``FigureManager`` as its first argument.
        show: Passed to ``FigureManager``.
        figsize: Passed to ``FigureManager``.
    """
    plt.style.use('seaborn-ticks')

    with FigureManager(model_plot_fn, show=show, figsize=figsize) as (fig, ax):
        hits_color = sns.xkcd_rgb['ruby']
        hits_scatter_kws = {'s': 10, 'alpha': 0.7,
                            'c': hits_color, 'marker': 'o'}
        line_kws = {'c': sns.xkcd_rgb['red wine'],
                    'label': 'Query Hits Regression'}
        sample_size = min(len(hits_df), 10000)
        # x/y/data must be keywords for newer seaborn releases.
        sns.regplot(x='s_aln_len', y='E_s',
                    data=hits_df.sample(sample_size), order=1,
                    label='Query Hits', scatter_kws=hits_scatter_kws,
                    line_kws=line_kws, color=hits_color, ax=ax)

        fit_scatter_kws = {'s': 10, 'alpha': 0.7,
                           'c': sns.xkcd_rgb['twilight blue'], 'marker': 's'}
        sns.regplot(x='center', y='fit', data=model_df, fit_reg=False,
                    x_jitter=True, y_jitter=True, ax=ax, label='CRBL Fit',
                    scatter_kws=fit_scatter_kws, line_kws=line_kws)

        leg = ax.legend(fontsize='medium', scatterpoints=3, frameon=True)
        leg.get_frame().set_linewidth(1.0)

        ax.set_xlim(model_df['center'].min(), model_df['center'].max())
        # NOTE(review): the limit uses column `E` while the points above
        # plot `E_s`; confirm both columns are intended.
        ax.set_ylim(0, max(model_df['fit'].max(), hits_df['E'].max()) + 50)
        ax.set_title('CRBL Fit')
def plotExp(self, exp, myData):
    """Save a regression plot for one experiment and advance the counter.

    The plot is written to ``plots/static/<settings.count>.png``, after
    which ``settings.count`` is incremented.

    Args:
        exp: Indexable whose element 1 maps ``'xaxis'`` and ``'yaxis'`` to
            the variables to plot.
        myData: Data passed to ``sns.regplot``.
    """
    plt.figure()
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(x=exp[1]['xaxis'], y=exp[1]['yaxis'], data=myData)
    plt.savefig("plots/static/%d.png" % settings.count)
    plt.clf()
    plt.close()
    settings.count = settings.count + 1
def sim_regression(show=True):
    """Simulate a data set with one regressor (age) on both d' and c.

    For 100 subjects with random ages, true d' and c values are generated
    from a linear function of age plus normal noise, trials are simulated
    with ``simulate`` and maximum-likelihood estimates obtained with
    ``est_sdt``.  The resulting table is always written to
    ``02.OneContinuousAndOneDichotomousPredictor.csv``.

    Args:
        show: When exactly ``True``, print the table and save the figure
            to ``fig1.png`` (returning ``None``); otherwise return the
            table.

    Returns:
        The simulated DataFrame when ``show`` is not ``True``.
    """
    intercepts = {'d': 1, 'c': 0}
    betas = {'d': -0.005, 'c': 0.001}
    errors = {'d': 0.05, 'c': 0.005}

    def reg(p, y):
        """Return the true regression value of parameter ``p`` at age ``y``."""
        return intercepts[p] + betas[p] * y

    nsubjects = 100
    y = np.linspace(10, 90, 1000)
    ages = np.random.randint(18, 80, size=nsubjects)
    data = {
        'age': ages,
        'N': [50] * nsubjects,
        'S': [50] * nsubjects
    }

    plt.figure(figsize=(15, 7.5))
    for i, p in enumerate(['d', 'c'], 1):
        plt.subplot(1, 2, i)
        true_p = reg(p, ages) + norm.rvs(0, errors[p], nsubjects)
        # x/y must be keywords: newer seaborn accepts only `data`
        # positionally.
        sb.regplot(x=ages, y=true_p, fit_reg=False,
                   label='True values (regression + some error)')
        plt.plot(y, reg(p, y), linewidth=2, label='True regression line')
        plt.ylabel(p)
        data['true_%s' % p] = true_p

    simulations = np.asarray(
        simulate(data['true_d'], data['true_c'], data['N'], data['S'])
    )
    data['F'], data['H'], data['M'], data['R'] = zip(*simulations)
    data['mle_d'], data['mle_c'] = zip(*est_sdt(*zip(*simulations)))
    df = pd.DataFrame(data)
    df['subj'] = ['subj_%i' % i for i in df.index]
    df.to_csv('02.OneContinuousAndOneDichotomousPredictor.csv')

    # Raw string: `\prime` is LaTeX, not a Python escape sequence.
    ylabels = {'d': r"$d^\prime$", 'c': "$c$"}
    for i, p in enumerate(['d', 'c'], 1):
        plt.subplot(1, 2, i)
        sb.regplot(
            x='age', y='mle_%s' % p, data=df,
            label='MLEs based on simulated data (%i trials per subject)'
                  % (data['N'][0] + data['S'][0])
        )
        plt.ylabel(ylabels[p])
        plt.xlabel('Age (years)')
        plt.xlim(10, 90)

    if show is True:
        print(df)
        plt.legend()
        plt.tight_layout(pad=0)
        plt.savefig('fig1.png')
    else:
        return df
def scatter_plot(df, dep_var, indep_var, units):
    """Save a scatter plot of ``dep_var`` against ``indep_var``.

    The figure is written to
    ``<wd>Scatterplot_<dep_var>_vs_<indep_var>.png`` (``wd`` is a
    module-level prefix) and then closed.

    Args:
        df: DataFrame containing both columns.
        dep_var: Name of the column on the y axis.
        indep_var: Name of the column on the x axis.
        units: Unit text appended to the y-axis label.
    """
    seaborn.regplot(x=indep_var, y=dep_var, data=df, fit_reg=False)
    # Would be great to figure out how to remove '_cat'.
    plt.xlabel(indep_var)
    plt.ylabel(dep_var + ", " + units)
    plt.title("Scatterplot of " + dep_var + " versus " + indep_var)
    plt.savefig(wd + "Scatterplot_" + dep_var + "_vs_" + indep_var + '.png')
    # Must be *called*: a bare `plt.close` leaves the figure open, so
    # later plots pile onto it.
    plt.close()
def plot_prediction_error(name, clf, X, y):
    """Plot cross-validated predictions against the actual values.

    Predictions come from shuffled K-fold cross-validation of ``clf``; the
    mean squared error is printed and the first 1000 actual/predicted
    pairs are saved as a regression plot to
    ``plot_validation_<name>.png``.

    Args:
        name: Suffix of the output file name.
        clf: Estimator passed to ``cross_val_predict``.
        X: Feature matrix; ``X.shape[0]`` is the number of samples.
        y: Target values.
    """
    plt.figure()
    # NOTE(review): this positional `KFold(n, n_folds)` form matches the
    # legacy `sklearn.cross_validation` API; confirm which module `KFold`
    # is imported from before upgrading scikit-learn.
    cv = KFold(X.shape[0], 5, shuffle=True)
    predicted = cross_val_predict(clf, X, y, cv=cv)
    print("%.3f = mean squared error" % mean_squared_error(y, predicted))

    ax = sns.regplot(x=y[:1000], y=predicted[:1000])
    # `sns.axlabel` was removed from seaborn; label the axes directly.
    ax.set_xlabel("actual")
    ax.set_ylabel("predicted")
    plt.savefig("plot_validation_" + name + ".png")
def make_scatter_plot(explain, response):
    """Show a regression plot of two ``df_combined`` columns.

    After the plot window is shown, the Pearson correlation result for the
    two columns is printed.

    Args:
        explain: Column of ``df_combined`` for the x axis.
        response: Column of ``df_combined`` for the y axis.
    """
    x_name = str(explain)
    y_name = str(response)

    seaborn.regplot(data=df_combined, x=explain, y=response, fit_reg=True)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('Association between ' + x_name + ' and ' + y_name)
    plt.show()

    correlation = scipy.stats.pearsonr(df_combined[explain], df_combined[response])
    print(correlation)
def makeCorrPlot(truthCol, predCol, df, outBase, method, measure, logged=True):
    """Save a scatter plot of predicted against true abundances.

    The Spearman correlation of the two columns is written onto the
    figure, which is saved to ``<outBase>_tpm_corr.pdf`` or
    ``<outBase>_num_reads_corr.pdf`` depending on ``measure``.

    Args:
        truthCol: Name of the column with the true values.
        predCol: Name of the column with the predicted values.
        df: DataFrame containing both columns.
        outBase: Prefix of the output file name.
        method: Name of the prediction method, used in the y-axis label.
        measure: ``"tpm"`` or ``"num_reads"``.
        logged: Plot log10 of both columns when true; axis limits are
            then derived from the strictly positive values only.

    Raises:
        ValueError: If ``measure`` is not one of the supported values.
    """
    # (output file template, axis-label name) per supported measure.
    measure_info = {
        "tpm": ("{}_tpm_corr.pdf", "TPM"),
        "num_reads": ("{}_num_reads_corr.pdf", "# fragments"),
    }
    # Fail before drawing anything; previously an unknown measure surfaced
    # only at save time as an undefined output file name.
    if measure not in measure_info:
        raise ValueError(
            "measure must be 'tpm' or 'num_reads', got {!r}".format(measure))
    template, measureName = measure_info[measure]
    outFile = template.format(outBase)

    plt.cla()
    plt.clf()

    rv, pv = stats.spearmanr(df[truthCol], df[predCol])
    corrText = "Spearman r = {0:.2f}".format(rv)

    ax = plt.axes()
    point_color = [0.7, 0.7, 0.7, 0.2]
    if logged:
        positive_truth = df.loc[df[truthCol] > 0, truthCol]
        positive_pred = df.loc[df[predCol] > 0, predCol]
        minVal = min(np.log10(positive_truth.min()),
                     np.log10(positive_pred.min())) - 0.5
        maxVal = max(np.log10(positive_truth.max()),
                     np.log10(positive_pred.max())) + 0.5
        sns.regplot(x=np.log10(df[truthCol]), y=np.log10(df[predCol]),
                    data=df, fit_reg=False, dropna=True,
                    color=point_color, ax=ax)
        ax.set_xlabel("log(True {})".format(measureName))
        ax.set_ylabel("log({} {})".format(method, measureName))
    else:
        minVal = min(df[truthCol].min(), df[predCol].min())
        maxVal = max(df[truthCol].max(), df[predCol].max())
        sns.regplot(x=truthCol, y=predCol, data=df, fit_reg=False,
                    color=point_color, ax=ax)
        ax.set_xlabel("True {}".format(measureName))
        ax.set_ylabel("{} {}".format(method, measureName))

    # Same limits on both axes keep the identity line at 45 degrees.
    ax.set_xlim(minVal, maxVal)
    ax.set_ylim(minVal, maxVal)
    plt.figtext(0.15, 0.85, corrText)

    # Get rid of axis spines
    sns.despine()
    plt.savefig(outFile)
def plot_time_effects(measure_DVs, melted_DVs, title=None):
    """Show how each measure varies with time of day.

    The left panel overlays one lowess-smoothed scatter per measure
    against ``hour``; the right panel shows box plots of ``value`` per
    ``split_time`` grouped by ``variable``.

    Args:
        measure_DVs: DataFrame with an ``hour`` column; every column
            except the last two is plotted as a measure.
        melted_DVs: Long-format DataFrame with ``split_time``, ``value``
            and ``variable`` columns.
        title: Optional figure title.
    """
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    for name in measure_DVs.columns[:-2]:
        # x/y must be keywords: newer seaborn accepts only `data`
        # positionally.
        sns.regplot(x='hour', y=name, data=measure_DVs, lowess=True,
                    label=name, ax=ax1,
                    scatter_kws={'s': 100, 'alpha': .4})
    ax1.legend()
    # Use the `melted_DVs` argument; the former `melted` was never defined
    # in this function.
    sns.boxplot(x='split_time', y='value', hue='variable',
                data=melted_DVs, ax=ax2)
    if title:
        plt.suptitle(title, fontsize=18)
    plt.show()
df = df[df['date'] >= '2020-03-01'].reset_index() df.drop('index', axis=1, inplace=True) print(df.head()) plt.figure(figsize=(20, 10)) plt.bar(df.date, df['new_cases'], color='salmon') ax = plt.gca() for i, label in enumerate(ax.get_xaxis().get_ticklabels()): if i % 7 != 0: label.set_visible(False) sns.regplot(x=df.index, y='rolling_avg_week', data=df, order=16, ci=None, color='red') plt.title('California COVID-19 New Cases By Day') plt.xlabel('Date') plt.ylabel('New Cases') plt.show() last_week = df.tail(7)['rolling_avg_week'] m = (last_week.iloc[6] - last_week.iloc[0]) / 6 b = last_week.iloc[6] five_hundred = ceil((500 - b) / m) print('Based on current weekly trends, it would take ' + str(five_hundred) + ' days until the average number of new daily cases in '
# Bar plots of a small random frame.
# NOTE(review): the column Index is wrapped in a list here; confirm that a
# plain `pd.Index([...], name='Genus')` was not intended.
df = pd.DataFrame(np.random.rand(6, 4),
                  index=['one', 'two', 'three', 'four', 'five', 'six'],
                  columns=[pd.Index(['A', 'B', 'C', 'D'], name='Genus')])
df.plot.bar()
df.plot.barh(stacked=True, alpha=0.5)
df

tips = pd.read_csv('C:/Users/barak/OneDrive/Documents/python/pydata-book-2nd-edition/examples/tips.csv')

###### Seaborn
import seaborn as sns

# Tip as a fraction of the bill excluding the tip.
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])
tips
sns.barplot(x='tip_pct', y='day', data=tips, orient='h')
sns.barplot(x='tip_pct', y='day', data=tips, orient='h', hue='time')
sns.set(style='whitegrid')
tips['tip_pct'].plot.density()

macrodata = pd.read_csv('C:/Users/barak/OneDrive/Documents/python/pydata-book-2nd-edition/examples/macrodata.csv')

###### Seaborn
data = macrodata[['cpi', 'm1', 'tbilrate', 'unemp']]
# Period-over-period change of the log of each series.
trans_data = np.log(data).diff().dropna()
trans_data[-5:]
# x/y must be keywords: newer seaborn accepts only `data` positionally.
sns.regplot(x='m1', y='unemp', data=trans_data)
plt.title('Change in log %s Versus log %s' % ('m1', 'unemp'))
sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})

######## adding things
# Distribution of the categorical variables, one count plot per variable
# (a bar chart or pie chart both work for this).
fig = plt.figure(figsize=(15, 8))
for i, c in enumerate(categoricalVariables):
    ax = plt.subplot(3, 3, i + 1)
    sns.countplot(x=train[c], ax=ax)
fig.tight_layout()
plt.show()

train[continuousVariables].describe()

# Distribution of the continuous variables (missing values dropped).
fig = plt.figure(figsize=(15, 5))
for i, c in enumerate(continuousVariables):
    ax = plt.subplot(2, 2, i + 1)
    sns.distplot(train[c].dropna(), ax=ax)
fig.tight_layout()
plt.show()

# Categorical variables against the target "count".
fig = plt.figure(figsize=(15, 10))
for i, c in enumerate(categoricalVariables):
    ax = plt.subplot(3, 3, i + 1)
    sns.boxplot(y=train["count"], x=train[c], ax=ax)
fig.tight_layout()

# Continuous variables against the target "count".
fig = plt.figure(figsize=(15, 10))
for i, c in enumerate(continuousVariables):
    ax = plt.subplot(2, 2, i + 1)
    sns.regplot(y=train["count"], x=train[c], ax=ax)
fig.tight_layout()
def regplot(data: pd.DataFrame, featX: str, featY: str) -> None:
    """Overlay a logistic and a lowess regression plot of two columns.

    Args:
        data: DataFrame containing both columns.
        featX: Name of the column on the x axis.
        featY: Name of the column on the y axis.
    """
    fig, ax = plt.subplots()
    # x/y must be keywords: newer seaborn accepts only `data` positionally.
    sns.regplot(x=data[featX], y=data[featY], logistic=True, ax=ax)
    sns.regplot(x=data[featX], y=data[featY], lowess=True, ax=ax)
    fig.show()
def mantel(output_dir: str, dm1: skbio.DistanceMatrix,
           dm2: skbio.DistanceMatrix, method: str = 'spearman',
           permutations: int = 999, intersect_ids: bool = False,
           label1: str = 'Distance Matrix 1',
           label2: str = 'Distance Matrix 2') -> None:
    """Run a two-sided Mantel test and render the result into ``output_dir``.

    Writes ``mantel-scatter.svg`` (pairwise distances of ``dm1`` against
    those of ``dm2``) and renders the ``mantel_assets/index.html``
    template with the result table, the sample size and the IDs that were
    not shared by both matrices.

    Raises:
        ValueError: If the ID sets differ and ``intersect_ids`` is false.
    """
    test_statistics = {'spearman': 'rho', 'pearson': 'r'}
    alt_hypothesis = 'two-sided'

    # skbio's mantel function could deal with mismatched IDs by itself
    # (raising with `strict=True`, intersecting with `strict=False`), but
    # the matching is done explicitly here because:
    #   * the error message should say *which* IDs are mismatched, which
    #     scikit-bio's message does not;
    #   * the mismatched IDs are shown as a warning in the visualization
    #     when `intersect_ids=True`;
    #   * the filtered matrices are reused further down (scatter data).
    ids1 = set(dm1.ids)
    ids2 = set(dm2.ids)
    mismatched_ids = ids1.symmetric_difference(ids2)

    if mismatched_ids:
        if not intersect_ids:
            raise ValueError(
                'The following ID(s) are not contained in both distance matrices. '
                'This sometimes occurs when mismatched files are passed. If this '
                'is not the case, you can use `intersect_ids` to discard these '
                'mismatches and apply the Mantel test to only those IDs that are '
                'found in both distance matrices.\n\n%s'
                % ', '.join(sorted(mismatched_ids)))
        # `strict` filtering is safe: every matched ID exists in both
        # matrices.
        matched_ids = ids1.intersection(ids2)
        dm1 = dm1.filter(matched_ids, strict=True)
        dm2 = dm2.filter(matched_ids, strict=True)

    # All IDs match by now, so `strict` mode cannot fail on mismatches.
    r, p, sample_size = skbio.stats.distance.mantel(
        dm1, dm2, method=method, permutations=permutations,
        alternative=alt_hypothesis, strict=True)

    method_title = method.title()
    statistic_label = '%s %s' % (method_title, test_statistics[method])
    result = pd.Series(
        [method_title, sample_size, permutations, alt_hypothesis, r, p],
        index=['Method', 'Sample size', 'Permutations',
               'Alternative hypothesis', statistic_label, 'p-value'],
        name='Mantel test results')
    table_html = q2templates.df_to_html(result.to_frame())

    # Both matrices share one ID set at this point, so the ID pairs can be
    # generated from either of them.
    pairs = [(dm1[id1, id2], dm2[id1, id2])
             for id1, id2 in itertools.combinations(dm1.ids, 2)]

    plt.figure()
    x = 'Pairwise Distance (%s)' % label1
    y = 'Pairwise Distance (%s)' % label2
    scatter_data = pd.DataFrame(pairs, columns=[x, y])
    sns.regplot(data=scatter_data, x=x, y=y, fit_reg=False)
    plt.savefig(os.path.join(output_dir, 'mantel-scatter.svg'))

    context = {
        'table': table_html,
        'sample_size': sample_size,
        'mismatched_ids': mismatched_ids
    }
    index = os.path.join(TEMPLATES, 'mantel_assets', 'index.html')
    q2templates.render(index, output_dir, context=context)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats import linregress
import seaborn as sns

# NOTE(review): `pd` is not imported in this part of the file; presumably
# it is imported earlier.
allData = pd.read_csv(r'C:\Users\nuran\Desktop\Senior_Project\All_data.csv',
                      skiprows=0, delimiter=',')
x = allData['Independence_00']
y = allData['Response_Rate_00']

plt.figure(figsize=(10, 10))
# x/y must be keywords: newer seaborn accepts only `data` positionally.
sns.regplot(x=x, y=y)
# Plot the series directly: wrapping them in lists made matplotlib create
# one single-point line per observation.
plt.plot(x, y, 'o', label='Indepedence', color='yellow', markersize=3)
plt.title('Linear Regression Analysis: Response Rate & Independence for 2000')
plt.xlabel('Independence', color='#1C2833')
plt.ylabel('Response Rate')
plt.show()
# Number of rows with q < 0.05.
len(diff_hESC_endo_ncRNA[diff_hESC_endo_ncRNA["qval_hESC_endo"] < 0.05])


# In[103]:


# Number of distinct gene names among the rows with q < 0.05.
len(diff_hESC_endo_ncRNA[diff_hESC_endo_ncRNA["qval_hESC_endo"] < 0.05]["gene_name"].unique())


# In[104]:


# Volcano plot: log2 fold change against -log10 q value, with a dashed
# line at q = 0.05.
fig = plt.figure(figsize=(1.5, 1.75))
g = sns.regplot(
    data=diff_hESC_endo_ncRNA,
    x="endo_hESC_log2fc",
    y="qval_log10_hESC_endo",
    fit_reg=False,
    color="firebrick",
    scatter_kws={"s": 8, "edgecolors": "white", "linewidths": 0.5},
)
g.set_xlabel("log2(endoderm/hESC)")
g.set_ylabel("negative log10 q value")
g.set_ylim((-0.1, 4))
g.set_xlim((-8.5, 8.5))
g.axhline(y=-np.log10(0.05), linestyle="dashed", color="black", linewidth=1)
#plt.title("volcano plot for ncRNAs in endoderm vs. hESCs\n(n=%s)" % (len(diff_hESC_endo_ncRNA)))
plt.savefig("Fig2E_1.pdf", bbox_inches="tight", dpi="figure")


# In[105]:
print(lm.summary())


# In[18]:


# Regression statistics: global sales explained by rest-of-world sales.
lm = smf.ols(data=df, formula="Global_Sales ~ Other_Sales").fit()
print(lm.summary())


# In[22]:


# Regression plot: North American vs global sales.
sns.regplot(data=df, x="NA_Sales", y="Global_Sales")


# In[23]:


# Regression plot: European vs global sales.
sns.regplot(data=df, x="EU_Sales", y="Global_Sales")


# In[24]:


# Regression plot: Japanese vs global sales.
sns.regplot(data=df, x="JP_Sales", y="Global_Sales")
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from scipy import stats

sns.set_context("poster")

df = pd.read_csv('data.csv')

# Figure-level overview plot; lmplot creates and owns its own figure.
sns.lmplot(x='frame', y='x_position', data=df, fit_reg=True)

# Coefficients of the linear fit, used only to build the legend label.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    df['frame'], df['x_position'])

# Draw the labelled regression on its own axes so it does not land on top of
# the lmplot figure, which would otherwise be the "current" axes.
fig, ax = plt.subplots()
sns.regplot(
    x="frame",
    y="x_position",
    data=df,
    ax=ax,
    # '+' sign flag keeps the label readable for negative intercepts
    # ("y=2.0x-3.0" instead of "y=2.0x+-3.0").
    line_kws={'label': "y={0:.1f}x{1:+.1f}".format(slope, intercept)})
ax.legend()
sns.despine(ax=ax)

# Save before show(): once the window is closed the figure may be torn down
# and a later savefig would write an empty canvas.
fig.savefig("output.png")
plt.show()
count = count + 1 plt.show() # Plot high correlation attributes - PLOTS #3 - GOOD fig = plt.figure(figsize=(14,60)) col = 3 row = int(len(df_corr.Attributes)/col) count = 1 for i, j in zip(df_plot.Attributes,df_plot.Correlation): fig.add_subplot(row, col, count) plt.title('Salinity vs {} (corr = {:.4f})\nnormalized distribution'.format(i,j)) #plt.xlim(-4,4) sns.regplot(x=df_sample[i],y="Salnty",data=df_sample,order=2, scatter_kws={'alpha':0.25},color='green'); count = count + 1 plt.show() #=============================================== #=============================================== # PLOT ALL COLUMNS IN CORR - not good #=============================================== # Plot high correlation attributes - PLOTS #1 - CRAP fig = plt.figure(figsize=(12,60)) plotNum = 1 # initialize plot number #for i in df_high.columns.drop(['Salnty','R_SALINITY']): for i, j in zip(df_high.Attributes,df_high.Correlation):
import seaborn as sns
from pydataset import data
import pandas as pd
import matplotlib.pyplot as plt
from env import host, password, user

iris = data('iris')

# Axes-level seaborn functions draw on the current axes, so each one gets a
# fresh figure; without this the plots are painted on top of each other when
# the file runs as a script.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot /
# displot is the documented replacement.
plt.figure()
sns.distplot(iris['Petal.Length'])

plt.figure()
sns.regplot(x='Petal.Length', y='Petal.Width', data=iris)
# Yes

# relplot / pairplot are figure-level and create their own figures.
sns.relplot(x='Sepal.Length', y='Sepal.Width', data=iris, hue='Species')
# Probably? Many edge cases between versicolor and virginica

sns.pairplot(iris)
# It looks like setosa is easy to identify regardless of feature; I think
# petal width/length look like the best pair of features by which to
# distinguish versicolor and virginica, but it's still an imperfect metric.

anscombe = sns.load_dataset('anscombe')
anscombe.groupby('dataset').describe()
sns.relplot(x='x', y='y', data=anscombe, col='dataset')

insectsprays = data('InsectSprays')
# New figure so the boxplot is not drawn into the last relplot panel.
plt.figure()
sns.boxplot(data=insectsprays, y='count', x='spray')

swiss = data('swiss')
swiss['is_catholic'] = swiss['Catholic'] > 80
# Given two clock times (hh:mm:ss)_1 and (hh:mm:ss)_2, with each of the six
# components entered on its own input line, print time_2 - time_1 in seconds.
def _read_hms_as_seconds():
    """Read hours, minutes and seconds from stdin and return them as seconds."""
    hours = int(input())
    minutes = int(input())
    seconds = int(input())
    return 3600 * hours + 60 * minutes + seconds


_first_time = _read_hms_as_seconds()
_second_time = _read_hms_as_seconds()
print(_second_time - _first_time)

import seaborn as sns

# Scatter with a fitted regression line, drawn with green '+' markers.
ax = sns.regplot(data=df, x='input', y='output', marker='+', color='green')
def pairgrid_plots(stats, dims, statop, season=None, scenario=None, period=None):
    """Plot a grid of precipitation-vs-temperature regressions.

    One subplot is drawn per (experiment, season) pair: rows are the selected
    experiments, columns the selected seasons.

    Args:
        stats: Nested container indexed as
            ``stats[season_idx][exp_idx][op_idx][var_idx]``.
        dims: Mapping with the keys ``'exps'`` and ``'seasons'`` (label
            sequences) and ``'inv'``, whose entries 0, 2 and 3 map a season
            name, a statistic name and a variable name (``'tas'``/``'pr'``)
            to their index in ``stats``.
        statop: Name of the statistic to plot; looked up in ``dims['inv'][2]``.
        season: Season name to restrict the plot to; ``None`` plots all.
        scenario: Keep only experiments whose label contains this value.
        period: Keep only experiments whose label contains this value.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    season_ren = {
        'full': 'annual', 's1': 'spring', 's2': 'summer',
        's3': 'autumn', 's4': 'winter',
        'FULL': 'annual', 'MAM': 'spring', 'JJA': 'summer',
        'SON': 'autumn', 'DJF': 'winter',
    }

    inv = dims['inv']
    op = inv[2][statop]
    tas_idx = inv[3]['tas']
    pr_idx = inv[3]['pr']
    exp_labels = dims['exps']

    # An experiment is kept when its label contains every filter that was given.
    exps = [
        i for i, label in enumerate(exp_labels)
        if (period is None or period in label)
        and (scenario is None or scenario in label)
    ]
    if season is None:
        seasons = list(range(len(dims['seasons'])))
    else:
        seasons = [inv[0][season]]

    n_rows, n_cols = len(exps), len(seasons)
    print('exps', [exp_labels[i] for i in exps], [n_rows, n_cols, 0])
    if not exps or not seasons:
        # Nothing matched the filters; do not pop up an empty figure.
        return

    fig = plt.figure(figsize=(18, 9))
    n = 0
    for e in exps:
        for s in seasons:
            n += 1
            ax = fig.add_subplot(n_rows, n_cols, n)
            tas = stats[s][e][op][tas_idx]
            # Scaled by the number of seconds in a day; presumably converts a
            # per-second rate to a per-day amount -- TODO confirm units.
            pr = stats[s][e][op][pr_idx] * (24 * 60 * 60)
            exp_label = exp_labels[e]
            season_label = season_ren[dims['seasons'][s]]
            tas_name = '%s %s %s' % ('tas', season_label, exp_label)
            pr_name = '%s %s %s' % ('pr', season_label, exp_label)
            print(tas_name)
            df = pd.DataFrame({tas_name: tas, pr_name: pr})
            sns.regplot(data=df, x=tas_name, y=pr_name, fit_reg=True, ax=ax)
    plt.show()
filewriter.add_summary(summ_buf, i) else: train_loss, _ = sess.run([loss, train_op], feed_dict=feed_dict) if min_loss is None or train_loss < min_loss: min_loss = train_loss table.add_row([path, min_loss, w.eval(), b.eval()]) # As we can see, 'ds2.csv' has the lowest loss, and thus contains the most linear data, 'ds3.csv' coming a distant second. The values of w and b are shown in the table. # In[6]: print(table) # As we can see, the plots show that `ds2.csv` is the most linear. # In[8]: import seaborn as sb import matplotlib.pyplot as plt for path in paths: d = pd.read_csv(path, names=['x', 'y']) m1 = np.max(d['x']) x = np.array(d['x']) y = np.array(d['y']) m2 = np.max(d['y']) sb.regplot((x / m1), (y / m2)) plt.show() # In[ ]:
import matplotlib.pyplot as plt
import seaborn as sns

titanic = sns.load_dataset('titanic')

# Style theme (five available: darkgrid, whitegrid, dark, white, ticks).
sns.set_style('darkgrid')

# One figure holding two side-by-side subplots.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: scatter with the fitted regression line (fit_reg defaults to True).
sns.regplot(data=titanic, x='age', y='fare', ax=ax1)

# Right panel: same scatter with the regression line switched off.
sns.regplot(data=titanic, x='age', y='fare', fit_reg=False, ax=ax2)

plt.show()
def _plot_codon_trends(df, codons, cmap, ax):
    """Draw one LOWESS trend of probability vs. GC per codon on *ax* and label it."""
    if not df.empty:
        gc = df["GC"].astype(float).values
        for c in sorted(codons):
            seaborn.regplot(
                x=gc,
                y=df[c].astype(float).values,
                label=c,
                lowess=True,
                scatter_kws={"s": 5, "alpha": 0.1},
                ax=ax,
                # With no colour map, let seaborn pick from its colour cycle.
                color=None if cmap is None else cmap[c],
            )
    ax.set_ylim([-0.05, 1.05])
    ax.set_ylabel("Probability")
    ax.set_xlabel("GC")


def _add_opaque_legend(ax):
    """Add a legend to *ax* whose handles are fully opaque."""
    leg = ax.legend()
    # `legendHandles` was renamed to `legend_handles` in newer matplotlib.
    handles = getattr(leg, "legend_handles", None)
    if handles is None:
        handles = leg.legendHandles
    # The scatter points are nearly transparent; make the legend readable.
    for lh in handles:
        lh.set_alpha(1)


def plot_candidate_codons(env, df, codons, cmap=None):
    """Show codon probability as a function of GC content in three figures.

    The first figure uses all rows, the second splits by ``Type``
    (Bacteria / Archaea) and the third by ``GENOME_TYPE`` (A-D). Each figure
    is displayed with ``plt.show()``.

    Args:
        env: Unused by this function.
        df: DataFrame with a ``GC`` column, one column per codon, and the
            ``Type`` and ``GENOME_TYPE`` columns.
        codons: Names of the codon columns to plot.
        cmap: Optional mapping from codon to colour. When ``None`` seaborn's
            default colour cycle is used.

    Returns:
        None.
    """
    # All genomes.
    fig, ax = plt.subplots()
    _plot_codon_trends(df, codons, cmap, ax)
    _add_opaque_legend(ax)
    plt.show()

    # Bacteria vs archaea, then genome groups.
    # NOTE(review): the legend is attached to the last panel of each grid;
    # confirm this matches the intended layout.
    panels = (
        ("Type", ["Bacteria", "Archaea"], (1, 2)),
        ("GENOME_TYPE", list("ABCD"), (2, 2)),
    )
    for column, values, (n_rows, n_cols) in panels:
        fig, axes = plt.subplots(n_rows, n_cols, sharex="all", sharey="all")
        for t, ax in zip(values, axes.ravel()):
            _plot_codon_trends(df[df[column] == t], codons, cmap, ax)
            ax.set_title(t)
        _add_opaque_legend(ax)
        plt.show()
# RMes students with fewer than 120 ECs are excluded from the analysis.
RMes_filtered = RMes[RMes['EC taken'] >= 120]
print('{} RMes students removed with <120ECs'.format(
    RMes.shape[0] - RMes_filtered.shape[0]))

# 3a) Checking normality
# kstest(..., 'norm') with no args compares against the *standard* normal
# N(0, 1), which raw grades / EC counts can never match. Compare against a
# normal with the sample's own mean and standard deviation instead.
# NOTE(review): estimating the parameters from the same sample makes the KS
# p-value optimistic; a Lilliefors or Shapiro-Wilk test may be preferable.
for _col in ('thesis_grades', 'EC taken'):
    _vals = RMes_filtered[_col]
    print(_col, stats.kstest(_vals, 'norm', args=(_vals.mean(), _vals.std())))

# 3b) Spearmans
rs_RMes, p_RMes = stats.spearmanr(RMes_filtered['thesis_grades'],
                                  RMes_filtered['EC taken'])

# The style must be set before the axes are created to have any effect.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(45, 24))
# x/y by keyword: positional vectors are rejected by seaborn >= 0.12.
sns.regplot(x=RMes_filtered['EC taken'],
            y=RMes_filtered['thesis_grades'],
            color='black',
            y_jitter=0.05,
            ax=ax)
ax.set_xlabel('ECs taken', fontsize=35)
ax.set_ylabel('Thesis Grade', fontsize=35)
ax.set_title('ECs against thesis grade (RMes)', fontsize=35)
ax.tick_params(axis='both', labelsize=25)
ax.set_ylim(5, 10.1)
ax.set_xlim(119, 156)
# Annotation is built from the computed statistics so it cannot drift from
# the data (previously hard-coded as 'rs = 0.0572, p = 0.520, n = 129').
ax.text(145, 5.2,
        'rs = {:.4f}, p = {:.3f}, n = {}'.format(rs_RMes, p_RMes,
                                                 len(RMes_filtered)),
        fontsize=50)
plt.show()

# 4a) MSc all spec
for _col in ('thesis_grades', 'EC taken'):
    _vals = MSc_filtered[_col]
    print(_col, stats.kstest(_vals, 'norm', args=(_vals.mean(), _vals.std())))
data2 = pd.DataFrame({ 'Standard Reading Level': [mean_a+bwo], 'CDF': [heights[0]] }) data1 = pd.DataFrame({ 'Standard Reading Level': x_sub_set, 'CDF': std_plot_ind }) legend_properties = {'weight':'bold','size':8} ax = sns.regplot(data=benchmarks, x="benchmarks", y="CDF", fit_reg=False, marker="o", color="green") #bbox_props = dict(boxstyle="rarrow", fc=(0.8,0.9,0.9), ec="b", lw=2) #t = ax.text(0, 0, "Direction", ha="center", va="center", rotation=90, # size=15, # bbox=bbox_props) #import pdb; pdb.set_trace() #bmark_heights.reverse() for i in bmark_heights: print(i) #import pdb #pdb.set_trace() cnt=0 for i,j,k in zip(bmark_stats_items[0:-1],bmark_heights[0:-1],categories):
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Share of each non-null Status value.
df[df['Status'].notnull()]['Status'].value_counts().plot(kind='pie',
                                                         autopct='%1.1f%%')

# Mean accuracy per Status.
print(df[['Status', 'accuracy']][df.Status.notnull()].groupby('Status').mean())

df[df['Status'] != 'None'].boxplot(column=['accuracy'], by=['Status'])
plt.title('')
plt.show()

ax = sns.boxplot(x="Status", y="accuracy", hue="Status", data=df,
                 linewidth=2.5)
plt.show()

X = np.array(d1)
# Replace the Status labels with numeric codes (this overwrites the column).
df['Status'] = df['Status'].map({
    'completed': 1,
    'parsed with error': 2,
    'drop case': 0
})
y = df['Status']
# x/y by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.regplot(x=X, y=y, data=df, fit_reg=False)
plt.show()

# Frequency of each distinct accuracy value.
# NOTE(review): the title below says "Carriers" but the data is `accuracy`.
carrier_count = df['accuracy'].value_counts()
sns.set(style="darkgrid")
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of Carriers')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('accuracy', fontsize=12)
# factorplot has been removed from seaborn; catplot(kind="point") is its
# direct replacement (point plots were factorplot's default).
prop_label = "Proportion who chooses\neach career group"
sns.catplot(kind="point",
            x="SES",
            hue="Gender",
            y=prop_label,
            data=spanish_data.rename(columns={"value": prop_label}),
            col="Field",
            legend_out=False,
            order=["Low", "High"],
            col_wrap=4)

# %%
plt.close()
target = "loggdppc"
# assign() returns a new frame; keep it, otherwise "Engistem" never exists.
merged_all = merged_all.assign(Engistem=lambda x: x.hy_f_engi / x.hy_f_STEM)

# Per-country means (numeric columns only; newer pandas raises on text columns).
country_means = merged_all.groupby("Country").mean(numeric_only=True)
country_means.plot.scatter(target,
                           "Engistem",
                           s=np.sqrt(country_means["pop"]),
                           c=country_means["Nrrent"],
                           cmap="viridis")
# LOWESS trend over the larger countries only, drawn on the scatter's axes.
sns.regplot(x=target,
            y="Engistem",
            data=country_means.query("pop>3000"),
            lowess=True,
            scatter=False,
            ax=plt.gca())
plt.ylabel("% female Engineering / % female STEM")
# NOTE(review): the x variable is `loggdppc`, but the label describes SIGI.
plt.xlabel("SIGI (0=less legal discrimination)")

# %%
pa = "Reds"
# `k` must exist before the palette is sized from it.
k = merged_all.query("loggdppc<9.5")
time_groups = k.groupby("TIME")
# One colour per TIME group, which is what the loop below indexes by.
pal = sns.palettes.color_palette(pa, len(time_groups))
plt.close()
sns.set_palette("deep")
plt.figure()
jp.joyplot(k, column="hy_f_STEM", by="TIME")
# Mark each group's mean; enumerate advances the colour index per group.
for i, (name, group) in enumerate(time_groups):
    plt.axvline(group.hy_f_STEM.mean(), c=pal[i], ymax=0.5)
plt.ylabel('Number of Super Bowls')
plt.show()

# Display the closest game(s) and biggest blowouts
print(super_bowls[super_bowls['difference_pts'] == 1])
print(super_bowls[super_bowls['difference_pts'] >= 35])

## Do blowouts translate to lost viewers?
# Join game and TV data, filtering out SB I because it was split over two networks
games_tv = pd.merge(tv[tv['super_bowl'] > 1], super_bowls, on='super_bowl')

# Import seaborn
import seaborn as sns

# Create a scatter plot with a linear regression model fit
sns.regplot(x="difference_pts", y="share_household", data=games_tv)
# Finish this figure before starting the next one; otherwise the subplots
# below are created on the same figure as the regression plot.
plt.show()

## Viewership and the ad industry over time
# Fresh figure holding a 3x1 grid; activate the top subplot
plt.figure()
plt.subplot(3, 1, 1)
plt.plot(games_tv.super_bowl, games_tv.avg_us_viewers, color='#648FFF')
plt.title('Average Number of US Viewers')

# Activate the middle subplot
plt.subplot(3, 1, 2)
plt.plot(games_tv.super_bowl, games_tv.rating_household, color='#DC267F')
plt.title('Household Rating')

# Activate the bottom subplot
plt.subplot(3, 1, 3)
plt.plot(games_tv.super_bowl, games_tv.ad_cost, color='#FFB000')

# Keep the stacked titles from overlapping the neighbouring axes.
plt.tight_layout()
plt.show()
sns.distplot(colValues, bins=7, kde=False, color='b') plt.title(colName) plt.ylabel(colName) plt.xlabel('Bins') plt.show() # scatterplots # plot Sscatterplot print('\n*** Scatterplot ***') colNames = df.columns.tolist() colNames.remove(depVars) print(colName) for colName in colNames: colValues = df[colName].values plt.figure() sns.regplot(data=df, x=depVars, y=colName, color= 'b', scatter_kws={"s": 5}) plt.title(depVars + ' v/s ' + colName) plt.show() # class count plot # change as required colNames = ["CHAS","RAD"] print("\n*** Distribution Plot ***") for colName in colNames: plt.figure() sns.countplot(df[colName],label="Count") plt.title(colName) plt.show() ################################
def _timestamped_figpath(stem):
    """Return ``figures/<stem>_<HHMMSS-YYYYMMDD>.png``, creating the folder if needed."""
    os.makedirs("figures", exist_ok=True)
    timestr = time.strftime("%H%M%S-%Y%m%d")
    return os.path.join("figures", stem + '_' + timestr + '.png')


def _save_distribution(values, axlabel, stem, color=None, align=None):
    """Plot the distribution of *values* on a 12x12 in figure and save it as PNG."""
    hist_kws = {"ec": 'black'}
    if align is not None:
        hist_kws["align"] = align
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.distplot(values, color=color, hist_kws=hist_kws, axlabel=axlabel, ax=ax)
    fig.savefig(_timestamped_figpath(stem), bbox_inches='tight', dpi=300)
    plt.close(fig)


def plot_mev_miv(df_radiomics):
    """Save distribution plots of the ablation volumes and a regression of the
    EAV:PAV ratio against ablation irregularity (MEV-MIV).

    Written to ``figures/`` with a timestamp in each file name:
    distributions of Energy, MEV-MIV, MEV, MIV, EAV and PAV, followed by a
    scatter/regression of R(EAV:PAV) on MEV-MIV annotated with Pearson's r.

    :param df_radiomics: DataFrame with the columns
        'Predicted_Ablation_Volume', 'Ablation Volume [ml]', 'Energy [kj]',
        'Device_name', 'Inner Ellipsoid Volume' and 'Outer Ellipsoid Volume'.
    :return: None
    :raises ValueError: if no row survives the MEV / MEV-MIV filtering.
    """
    # NOTE: changes the global matplotlib font settings.
    font = {'family': 'DejaVu Sans', 'size': 18}
    matplotlib.rc('font', **font)

    df = pd.DataFrame()
    df['PAV'] = df_radiomics['Predicted_Ablation_Volume']
    df['EAV'] = df_radiomics['Ablation Volume [ml]']
    df['Energy (kJ)'] = df_radiomics['Energy [kj]']
    df['MWA Systems'] = df_radiomics['Device_name']
    df['MIV'] = df_radiomics['Inner Ellipsoid Volume']
    df['MEV'] = df_radiomics['Outer Ellipsoid Volume']
    df['MEV-MIV'] = df['MEV'] - df['MIV']
    df['R(EAV:PAV)'] = df['EAV'] / df['PAV']

    # Energy is plotted on the unfiltered data.
    _save_distribution(df['Energy (kJ)'], 'Energy', 'Energy_distribution',
                       align="mid")

    # drop outer volumes larger than 150 because they are probably erroneous
    df = df[df['MEV'] < 150]
    # drop the rows where MIV > MEV, since the minimum inscribed ellipsoid
    # (MIV) should always be smaller than the maximum enclosing one (MEV)
    df = df[df['MEV-MIV'] >= 0]
    if df.empty:
        raise ValueError('no samples left after filtering on MEV and MEV-MIV')

    min_val = int(min(df['MEV-MIV']))
    max_val = int(max(df['MEV-MIV']))
    print('Min Val Mev-Miv:', min_val)
    print('Max Val Mev-Miv:', max_val)
    print('nr of samples for mev-miv:', len(df))

    reddish = sns.xkcd_rgb["reddish"]
    _save_distribution(
        df['MEV-MIV'],
        'Distribution of Ablation Volume Irregularity (MEV-MIV) (mL)',
        'MEV-MIV_distribution', color=reddish, align="mid")
    for column in ('MEV', 'MIV', 'EAV', 'PAV'):
        _save_distribution(df[column], column, column + '_distribution',
                           color=reddish)

    # R(EAV:PAV) on the y-axis and MEV-MIV on the x-axis. Rows where the ratio
    # is undefined (PAV missing or zero) would turn the fit into NaN.
    reg = df[['MEV-MIV', 'R(EAV:PAV)']].replace(
        [float('inf'), float('-inf')], float('nan')).dropna()
    # The second value returned by linregress is Pearson's r, not r squared.
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        reg['MEV-MIV'], reg['R(EAV:PAV)'])
    print('p-val mev miv energy:', p_value)
    print()

    fig, ax = plt.subplots(figsize=(12, 12))
    sns.regplot(y="R(EAV:PAV)",
                x="MEV-MIV",
                data=reg,
                scatter_kws={"s": 100, "alpha": 0.5},
                color=reddish,
                line_kws={'label': r'$r = {0:.2f}$'.format(r_value)},
                ax=ax)
    ax.set_xlabel('MEV-MIV (mL)')
    ax.legend()
    fig.savefig(_timestamped_figpath('Ratio_EAV-PAV_MEV-MIV_difference'),
                dpi=300, bbox_inches='tight')
    plt.close(fig)
weighted_ratio = [item[3] for item in data_city]
categories = [item[4] for item in data_city]
category_count = [item[5] for item in data_city]

# Rebuild `data_city` as a DataFrame from the per-column lists.
data_city = {
    "city": city,
    "review_count": review_count,
    "stars": stars,
    "weighted_ratio": weighted_ratio,
    "categories": categories,
    "category_count": category_count
}
data_city = pd.DataFrame(data_city)

# Total reviews and mean stars per city, busiest cities first.
city_business_reviews = data_city[['city', 'review_count', 'stars']].groupby(
    ['city']).agg({
        'review_count': 'sum',
        'stars': 'mean'
    }).sort_values(by='review_count', ascending=False)
city_business_reviews['review_count'].head(20).plot(kind='bar',
                                                    stacked=False,
                                                    figsize=[10, 10],
                                                    colormap='winter')
plt.title('Top 20 cities by reviews')

# Summed weighted ratio per city, highest first.
city_weighted_ratio = data_city[['city', 'weighted_ratio']].groupby(
    ['city']).agg({
        'weighted_ratio': 'sum'
    }).sort_values(by='weighted_ratio', ascending=False)
city_weighted_ratio['weighted_ratio'].head(20).plot(kind='bar',
                                                    stacked=False,
                                                    figsize=[10, 10],
                                                    colormap='summer')
plt.title('Top 20 cities by Weighted Rating')

import seaborn as sns

# regplot draws on the current axes; start a new figure so the scatter is not
# painted on top of the bar chart above.
plt.figure()
sns.regplot(x=data_city["stars"], y=data_city["category_count"], fit_reg=False)

# Load the categories table with Spark and cast the numeric columns to double.
categories_data = spark.read.csv("categories.csv", header=True)
categories = categories_data.withColumn(
    "stars", categories_data["stars"].cast(DoubleType()))
categories = categories.withColumn(
    "review_count", categories_data["review_count"].cast(DoubleType()))

# Pull the selected columns to the driver and rebuild them as a pandas frame.
# NOTE: `categories` and `category` are rebound below from Spark objects to
# plain Python containers, as in the original script.
category = categories.select("city", "categories", "state", "stars",
                             "review_count").collect()
city = [item[0] for item in category]
categories = [item[1] for item in category]
state = [item[2] for item in category]
stars = [item[3] for item in category]
review_count = [item[4] for item in category]
category = {
    "city": city,
    "categories": categories,
    "state": state,
    "stars": stars,
    "review_count": review_count
}
df = pd.DataFrame(category)
ax1 = plt.subplot(221) ax1 = sns.violinplot(x="state", y="pledge_log", data=df_kick, palette="hls") ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45) ax1.set_title("Understanding the Pledged values by state", fontsize=15) ax1.set_xlabel("State Description", fontsize=12) ax1.set_ylabel("Pledged Values(log)", fontsize=12) ax2 = plt.subplot(222) ax2 = sns.violinplot(x="state", y="goal_log", data=df_kick) ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45) ax2.set_title("Understanding the Goal values by state", fontsize=15) ax2.set_xlabel("State Description", fontsize=12) ax2.set_ylabel("Goal Values(log)", fontsize=12) ax0 = plt.subplot(212) ax0 = sns.regplot(x="goal_log", y="pledge_log", data=df_kick, x_jitter=False) ax0.set_title("Better view of Goal x Pledged values", fontsize=15) ax0.set_xlabel("Goal Values(log)") ax0.set_ylabel("Pledged Values(log)") ax0.set_xticklabels(ax0.get_xticklabels(), rotation=90) plt.show() # <h2>Analysing further the CAaegorys: </h2> # - Sucessful category's frequency # - failed category's frequency # - General Goal Distribuition by Category # In[10]: main_cats = df_kick["main_category"].value_counts() main_cats_failed = df_kick[df_kick["state"] ==
X2_test = sc_X.transform(X_test)
# NOTE(review): the model below is fitted and evaluated on the unscaled
# X_train / X_test; X2_test is not used in this section -- confirm intent.

# Fit a plain linear regression and report its hold-out performance.
lm1 = LinearRegression()
lm1.fit(X_train, y_train)
lm1_pred = lm1.predict(X_test)

print('Linear Regression Performance:')
print('MAE:', metrics.mean_absolute_error(y_test, lm1_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, lm1_pred)))
print('R2_Score: ', metrics.r2_score(y_test, lm1_pred))

# Predicted vs. actual values with a fitted trend line.
fig = plt.figure(figsize=(8, 5))
# x/y by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.regplot(x=y_test, y=lm1_pred, color='g')
plt.xlabel('COA')
plt.ylabel('Predictions')
plt.title('LinearRegression Prediction Performance ')
plt.grid()
plt.show()
def _format_month_axis(ax, interval):
    """Label an x-axis holding matplotlib date numbers with readable month ticks."""
    loc = mdates.MonthLocator(interval=interval)
    ax.xaxis.set_major_locator(loc)
    ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(loc))


def split_timeseries_figures(in_frames,
                             names,
                             split_at=PRISM_DATE,
                             same_plot=True,
                             **kwargs):
    """
    Takes the input data and creates a plot similar to figure 4a in the paper:
    each series gets one regression before and one after ``split_at``.

    :param in_frames: A pandas data frame, or an iterable of them, with the
        dates as index (a PeriodIndex is converted to timestamps) and the
        plotted values in the column named by ``keyword``
    :param names: Names for the incoming data frames (needed for the legend)
    :param split_at: The date at which the data shall be split
    :param same_plot: If set false, a subplot will be created for each
        dataframe in in_frames. If true, they will be plotted on the same axis.
    :param kwargs: Optional settings: ``title``, ``figsize``, ``keyword``
        (column to plot, default 'views'), ``sharey``, ``legend``, ``colors``
        (mapping name -> color) and ``cmap`` (used when ``colors`` is absent)
    :return: Plots the figure
    """
    marker_label = 'Prism Disclosure, 6/6/2013'

    # Default values that can be changed
    title = kwargs.get('title', '')
    figsize = kwargs.get('figsize', [18, 6])
    keyword = kwargs.get('keyword', 'views')
    sharey = kwargs.get('sharey', True)
    show_legend = kwargs.get('legend', True) and same_plot

    # Accept a single frame / name as well as iterables of them.
    if isinstance(in_frames, pd.DataFrame):
        in_frames = [in_frames]
    if isinstance(names, str):
        names = [names]
    assert len(names) == len(
        in_frames
    ), "{} dataframes but {} names specified. This should be equal".format(
        len(in_frames), len(names))

    nr_subplots = len(in_frames)
    # 'Separate plots' is meaningless if there is only one data frame.
    same_plot = True if nr_subplots == 1 else same_plot

    # Seaborn has issues handling datetimes, so the dates are converted to
    # numbers for plotting and formatted back to dates on the axis later
    # (https://stackoverflow.com/questions/52112979).
    dfs_to_plot = []
    for in_frame in in_frames:
        dataframe = in_frame.copy()
        if isinstance(dataframe.index, pd.PeriodIndex):
            dataframe.index = dataframe.index.to_timestamp()
        dataframe['date_ordinal'] = mdates.date2num(dataframe.index)
        dfs_to_plot.append(dataframe)

    # Colors for the series and for the split marker.
    if 'colors' in kwargs:
        # Work on a copy so the caller's mapping is not modified.
        colors = dict(kwargs['colors'])
    else:
        # plt.get_cmap works across matplotlib versions (cm.get_cmap is gone
        # in recent releases).
        cmap = plt.get_cmap(kwargs.get('cmap', 'Set1'))
        cmap_colors = getattr(cmap, 'colors', None)
        if cmap_colors is None:
            # Continuous colormaps have no color list; sample one per series.
            cmap_colors = [
                cmap(j / max(len(names) - 1, 1)) for j in range(len(names))
            ]
        colors = {
            name: cmap_colors[j % len(cmap_colors)]
            for j, name in enumerate(names)
        }
    # Guarantee a color for the split marker without overriding a given one.
    colors.setdefault(marker_label, 'red')

    # Starting to build the actual plot
    if same_plot:
        fig, ax = plt.subplots(figsize=figsize)
        show_every_nth_month = 1  # enough space to display every month
    else:
        COLS = math.ceil(math.sqrt(nr_subplots))
        ROWS = math.ceil(nr_subplots / COLS)
        # squeeze=False always returns a 2-D array of axes, so indexing does
        # not depend on the grid shape.
        fig, axs = plt.subplots(ncols=COLS,
                                nrows=ROWS,
                                sharex=True,
                                sharey=sharey,
                                figsize=figsize,
                                squeeze=False)
        show_every_nth_month = COLS  # it gets tight with every month shown

    for i, dataframe in enumerate(dfs_to_plot):
        if not same_plot:
            ROW, COL = divmod(i, COLS)
            ax = axs[ROW][COL]

        color = colors[names[i]]
        halves = (dataframe.loc[dataframe.index < split_at],
                  dataframe.loc[dataframe.index >= split_at])
        for part in halves:
            if part.empty:
                # Nothing on this side of the split to regress on.
                continue
            sns.regplot(x='date_ordinal',
                        y=keyword,
                        data=part,
                        ax=ax,
                        color=color,
                        scatter_kws={'color': color, 's': 30},
                        line_kws={'color': color})

        # Per default, we do not want a label on every subplot
        ax.set(xlabel='', ylabel='')
        if not show_legend:
            ax.set_title(names[i])  # some minimum information should be there

        # Tune the visuals
        ax.set_xlim(dataframe['date_ordinal'].min() - 15,
                    dataframe['date_ordinal'].max() + 15)  # 15 days offset
        # Full-height vertical line at the split date.
        ax.vlines(mdates.date2num(split_at),
                  0,
                  1,
                  color=colors[marker_label],
                  transform=ax.get_xaxis_transform(),
                  label=split_at)
        if (not same_plot) and (COL == 0) and (not sharey):
            # Assumes there are no negative values
            ax.set_ylim(dataframe[keyword].min() * 0.9,
                        dataframe[keyword].max() * 1.1)
            ax.set_ylabel(keyword)
        if (not same_plot) and (ROW + 1 == ROWS):
            _format_month_axis(ax, show_every_nth_month)

    # Formatting and Optics
    fig.patch.set_facecolor('lightgrey')
    fig.suptitle(title)

    # Axis settings
    if same_plot:
        ax.set_xlabel('Month / Year')
        ax.set_ylabel(keyword)
        _format_month_axis(ax, show_every_nth_month)
    fig.autofmt_xdate(rotation=60)

    if show_legend:
        # Add a custom legend with one patch per color entry
        legend_patches = [
            mpatches.Patch(color=c, label=entry) for entry, c in colors.items()
        ]
        font = FontProperties()
        font.set_size('large')
        plt.legend(handles=legend_patches,
                   title='Legend',
                   bbox_to_anchor=(0, 0),
                   loc='lower left',
                   prop=font,
                   ncol=math.ceil(math.sqrt(nr_subplots)),
                   fancybox=True,
                   shadow=True)
mySession.loc[mySession.stim == stim, 'istim'] = istim mySession.loc[mySession.perc == stim, 'iperc'] = istim mySession.loc[:, 'istim'] = mySession.loc[:, 'istim'].astype(int) mySession.loc[:, 'iperc'] = mySession.loc[:, 'iperc'].astype(int) rho = pd.DataFrame(index=np.arange(nTrials), columns=np.sort(mySession.istim.drop_duplicates())) rho.loc[0, :] = 1 W = pd.DataFrame(index=np.arange(tbf.shape[0]), columns=rho.columns) W.loc[:, :] = 1 #%% sns.regplot(x='stim', y='isChoiceLeft', data=mySession, logistic=True, ci=None, y_jitter=0.01) plt.show() #%% INITIAL CONDITIONS # mySession.loc[0,'waitingTime'] = np.random.choice(np.arange(tbf.shape[1]), 1, p=pnorm((tbf.T @ W).values[:,mySession.iperc[0]])).item() # # mySession.loc[0,'feedbackTime'] = truncExp(1.5, .5, 8) #%% # hf[ipair], ha[ipair] = plt.subplots(1, 3, figsize=(10, 3)) hf, ha = plt.subplots(1, 3) # %%
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_theme()

tem = pd.read_csv("CSV_files/first/tempYearly.csv")
rai = pd.read_csv("CSV_files/first/rainYearly.csv")
# Rainfall is attached by row index.
# NOTE(review): assumes both files list the same years in the same order.
tem["Rainfall"] = rai["Rainfall"]

# Keep only rows inside the plotted ranges: 0 <= rainfall < 10 and
# 0 <= temperature < 50.
in_range = ((tem['Rainfall'] >= 0.0) & (tem['Rainfall'] < 10.0) &
            (tem['Temperature'] >= 0.0) & (tem['Temperature'] < 50.0))

sns.regplot(x="Rainfall", y="Temperature", data=tem[in_range])
# Explicit module prefix instead of `from matplotlib.pyplot import *`, which
# floods the namespace and can shadow other names.
plt.show()
df_join['TMB-Foundation-Value'])[0], 3)) + "/" + str( round( st.pearsonr(df_join['TMB-Total-Variants'], df_join['TMB-Foundation-Value'])[0], 3)) + "/" + str(round(r2_score(y, y_slr), 3))) plt.savefig("Fig_4_TMB_HMH_Foundation_Variants_Score_ScatterPlot_main.png") plt.close() ### caris vs Foundation x1 = np.reshape( df_join[df_join['Source'] == "Caris"]['TMB-Total-Variants'].values, (-1, 1)) y1 = df_join[df_join['Source'] == "Caris"]['TMB-Foundation-Value'].values y1_slr = fitData(x1, y1, "simple-lr") sns.regplot(x=x1, y=y1, ci=None) plt.title("TMB Score Comparison-Caris, S/P/R^2= " + str(round(st.spearmanr(x1, y1)[0], 3)) + "/" + str( round( st.pearsonr( df_join[df_join['Source'] == "Caris"] ['TMB-Total-Variants'].values, y1)[0], 3)) + "/" + str(round(r2_score(y1, y1_slr), 3))) plt.savefig("TMB_HMH_Variants_Foundation_Score_ScatterPlot_Caris.png") plt.close() x2 = np.reshape( df_join[df_join['Source'] == "Foundation"]['TMB-Total-Variants'].values, (-1, 1)) y2 = df_join[df_join['Source'] == "Foundation"]['TMB-Foundation-Value'].values y2_slr = fitData(x2, y2, "simple-lr")