def gen_entropy_hist(good, bad):
    g = gen_entropy_data(good)
    b = gen_entropy_data(bad)
    p1 = sns.distplot(g, color='g')
    p2 = sns.distplot(b, color='r')
    p1.set(xlim=(0, 1))
    p2.set(xlim=(0, 1))
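# A note on the API used throughout these examples: sns.distplot is deprecated
# as of seaborn 0.11 in favor of histplot/displot/kdeplot. Below is a minimal
# sketch of the same good/bad overlay with the newer API; g and b are
# placeholder arrays standing in for gen_entropy_data() output.
import numpy as np
import seaborn as sns

g = np.random.beta(2, 5, 200)  # placeholder "good" entropies for illustration
b = np.random.beta(5, 2, 200)  # placeholder "bad" entropies
ax = sns.histplot(g, color='g', kde=True, stat='density')
sns.histplot(b, color='r', kde=True, stat='density', ax=ax)
ax.set(xlim=(0, 1))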
def cycle_time_histogram(cycle_data, bins=30, percentiles=[0.3, 0.5, 0.75, 0.85, 0.95], title=None, ax=None):
    histogram_df = cycle_data[['cycle_time']].dropna(subset=['cycle_time'])
    ct_days = histogram_df['cycle_time'].dt.days

    if len(ct_days.index) < 2:
        raise UnchartableData("Need at least 2 completed items to draw histogram")

    if ax is None:
        fig, ax = plt.subplots()

    sns.distplot(ct_days, bins=bins, ax=ax, axlabel="Cycle time (days)")

    if title is not None:
        ax.set_title(title)

    left, right = ax.get_xlim()
    ax.set_xlim(0, right)

    # Add percentiles
    bottom, top = ax.get_ylim()
    for percentile, value in ct_days.quantile(percentiles).items():
        ax.vlines(value, bottom, top - 0.001, linestyles='--', linewidths=1)
        ax.annotate("%.0f%% (%.0f days)" % ((percentile * 100), value,),
                    xy=(value, top),
                    xytext=(value, top - 0.001),
                    rotation="vertical",
                    fontsize="small",
                    ha="right")

    return ax
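# A hedged usage sketch for cycle_time_histogram: the frame only needs a
# timedelta-typed 'cycle_time' column, so synthetic data is enough here.
# UnchartableData is assumed defined in the same module as the function.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

cycle_data = pd.DataFrame({
    'cycle_time': pd.to_timedelta(np.random.randint(1, 30, size=50), unit='D')
})
ax = cycle_time_histogram(cycle_data, bins=15, title="Cycle times")
plt.show()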
def plot_frame_displacement(realignment_parameters_file, mean_FD_distribution=None, figsize=(11.7, 8.3)):
    FD_power = calc_frame_dispalcement(realignment_parameters_file)  # name matches the helper defined elsewhere

    fig = Figure(figsize=figsize)
    FigureCanvas(fig)

    if mean_FD_distribution:
        grid = GridSpec(2, 4)
    else:
        grid = GridSpec(1, 4)

    ax = fig.add_subplot(grid[0, :-1])
    ax.plot(FD_power)
    ax.set_xlim((0, len(FD_power)))
    ax.set_ylabel("Frame Displacement [mm]")
    ax.set_xlabel("Frame number")
    ylim = ax.get_ylim()

    ax = fig.add_subplot(grid[0, -1])
    sns.distplot(FD_power, vertical=True, ax=ax)
    ax.set_ylim(ylim)

    if mean_FD_distribution:
        ax = fig.add_subplot(grid[1, :])
        sns.distplot(mean_FD_distribution, ax=ax)
        ax.set_xlabel("Mean Frame Displacement (over all subjects) [mm]")

        MeanFD = FD_power.mean()
        label = "MeanFD = %g" % MeanFD
        plot_vline(MeanFD, label, ax=ax)

    return fig
def rysuj_histogram(df, opis):  # "draw histogram"; opis is a description used in the file name
    plt.clf()
    global TRESC, MENU, global_tytul  # TRESC = HTML body text, global_tytul = page title
    fig, ax = plt.subplots(figsize=(11, 5))
    plt.subplots_adjust(bottom=0.18, top=0.85)
    #ax.get_xaxis().tick_bottom()
    #ax.get_yaxis().tick_left()
    ax1 = sns.distplot(df.czas_netto_s, rug=True, bins=bins, kde=False)
    ax1.xaxis.set_major_formatter(FuncFormatter(time_ticks))
    ax1.xaxis.set_major_locator(MultipleLocator(dT))
    plt.xticks(rotation='vertical')
    ax1.set_xlabel(u"Czas netto")  # "Net time"
    plt.ylabel(u"Zawodników")      # "Runners"
    ax2 = ax1.twiny()
    # This second distplot call redraws the same histogram after the twin axis is created.
    ax1 = sns.distplot(df.czas_netto_s, rug=True, bins=bins, kde=False)
    ax2.xaxis.set_major_formatter(FuncFormatter(pace_ticks))
    ax2.xaxis.set_major_locator(MultipleLocator(dT))
    plt.xticks(rotation='vertical')
    ax2.set_xlabel(u"Tempo, min/km")  # "Pace, min/km"
    plt.ylabel(u"Zawodników")
    outFileName = "hist-%s.png" % (opis)
    plt.savefig(outputdir + outputdir_rel + outFileName, dpi=dpi)
    TRESC += u"<p><img src='%s' alt='%s' /></p>\n" % (outFileName, global_tytul)
    plt.clf()
def plot_volume_per_day_hist(transactions, ax=None, **kwargs):
    """
    Plots a histogram of trading volume per day.

    Parameters
    ----------
    transactions : pd.DataFrame
        A strategy's transactions. See pos.make_transaction_frame(transactions).
    ax : matplotlib.Axes, optional
        Axes upon which to plot.
    **kwargs, optional
        Passed to seaborn plotting function.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    if ax is None:
        ax = plt.gca()

    sns.distplot(transactions.txn_volume, ax=ax, **kwargs)
    ax.set_title('Distribution of Daily Trading Volume')
    ax.set_xlabel('Volume')
    return ax
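# A hedged usage sketch: the function only reads a txn_volume column, so a
# minimal synthetic frame is enough for illustration.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

transactions = pd.DataFrame({'txn_volume': np.random.lognormal(10, 0.5, 250)})
ax = plot_volume_per_day_hist(transactions)
plt.show()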
def plot_dfgbrv_dist(self, **kwargs):
    """
    Plot four distribution plots for the deltafactor, deltafactor prime and the
    relative errors for the GBRV fcc, bcc structures. Return: `matplotlib` figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    fig, ax_list = plt.subplots(nrows=2, ncols=2, squeeze=True)
    ax_list = ax_list.ravel()

    frame = self.get_dfgbrv_dataframe()

    import seaborn as sns
    for ax, col in zip(ax_list.ravel(), ["deltafactor", "gbrv_fcc", "df_prime", "gbrv_bcc"]):
        values = frame[col].dropna()
        sns.distplot(values, ax=ax, rug=True, hist=True, kde=False, label=col,
                     bins=kwargs.pop("bins", 50))

        # Add text with Mean or (MARE/RMSRE)
        text = []; app = text.append
        if col in ("deltafactor", "df_prime"):
            app("Mean = %.2f" % values.mean())
        else:
            app("MARE = %.2f" % values.abs().mean())
            app("RMSRE = %.2f" % np.sqrt((values**2).mean()))

        ax.text(0.8, 0.8, "\n".join(text), transform=ax.transAxes)

    return fig
def explore(self):
    gs1 = gs.GridSpec(2, 2)
    fig = plt.figure(figsize=(15, 6))

    # histogram: report duration
    ax1 = fig.add_subplot(gs1[0:1, 0])
    sns.distplot(self.df['report.duration'], bins=15, ax=ax1)
    ax1.set_title("patient report maximum delay reception")
    ax1.set_xlabel("Days")

    # bar horizontal: duplicates
    ax2 = fig.add_subplot(gs1[0:1, 1])
    duplicates = self.df.duplicate.isnull().value_counts()
    duplicates.index = ["UnReported", "Duplicates"]
    duplicates.plot(kind='barh', ax=ax2)
    ax2.set_title("Number of Duplicates Reported")

    # bar horizontal: country occurrence
    ax3 = fig.add_subplot(gs1[1:2, 0])
    countries = self.df["occurcountry"].value_counts(sort=True, ascending=True)
    countries.plot(kind='barh', ax=ax3)
    ax3.set_title("Countries where reported event occurred")

    # bar horizontal: reporting types
    ax4 = fig.add_subplot(gs1[1:2, 1])
    reportings = self.df["primarysource.qualification"]
    reportings = reportings[reportings.notnull()].astype('int').value_counts(sort=True, ascending=True)
    labels = {"1": "Physician", "2": "Pharmacist", "3": "Professional",
              "4": "Lawyer", "5": "Consumer"}
    reportings.index = reportings.index.map(lambda x: labels[str(x)])
    reportings.plot(kind='barh', ax=ax4)
    ax4.set_title("Distribution of Reporting Types")

    plt.tight_layout()
def sb_distplots(plotargs, return_key='close_return', update_type='Revisions'):
    "Plots conditional underpricing distributions. Run set_data(df) first."
    f, ax = plt.subplots(1, 1, figsize=(16, 5), sharex=True)
    for arg in plotargs:
        df, c, l, h = arg
        sb.distplot(df[return_key], ax=ax,
                    kde_kws={"label": l + " Obs={N}".format(N=len(df)), "color": c},
                    hist_kws={"histtype": "stepfilled", "color": c})
        r = df[return_key]
        m, s, y, med = r.mean(), r.std(), r.skew(), r.median()
        ax.annotate(u'μ={:.2f}%, σ={:.2f}, γ={:.2f}'.format(m, s, y),
                    xy=(med + 2, h),
                    xytext=(med + 6, h + 0.01),
                    arrowprops=dict(facecolor=cl.rgb2hex(c), width=1.5,
                                    headwidth=5, shrink=0.1))

    H, prob = kruskalwallis(*[x[0][return_key] for x in plotargs])
    ax.annotate("Kruskal-Wallis: (H={H:.2f}, prob={p:.3f})".format(H=H, p=prob), xy=(66, 0.01))
    plt.title("Conditional Underpricing Distributions %s" % update_type)
    plt.ylabel("Density")
    plt.xlim(xmin=-40, xmax=100)
    plt.xlabel("1st Day Returns (%)")
    plt.ylim((0, 0.12))
def DUPLICATE_remove(data, distance_threshold, graph=False):
    sizeFormat = pixelFormat * pixelSize
    poly = overlapping_grid(nrow, ncol, overlap_region, [sizeFormat, sizeFormat])
    result = np.zeros(data.shape[0], dtype=bool)
    for i in range(data.shape[0]):
        result[i] = poly.contains(Point(data[i, 2], data[i, 3]))
    data_to_evaluate = data[result.ravel(), :]
    output = data[np.invert(result.ravel()), :]

    # Calculate Nearest Neighbors distance
    distance, indexes = DISTANCE_spatial(data_to_evaluate, data_to_evaluate, neighbors=2)
    if graph:
        plt.grid(False)
        sns.distplot(distance)
        plt.title("%d spots to evaluate" % data_to_evaluate.shape[0])
        sns.despine()
        plt.show()

    # Find which spots are below the distance_threshold
    mask_to_keep = distance > distance_threshold
    to_keep = []
    to_exclude = []
    for source in range(len(indexes)):
        target = indexes[source]
        if indexes[target] == source and distance[source] < distance_threshold and source not in to_exclude:
            to_keep.append(source)
            to_exclude.append(target)
    mask_to_keep[to_keep] = True
    print(str(len(to_exclude)) + ' spots discarded')
    data_to_evaluate = data_to_evaluate[mask_to_keep, :]
    output = np.row_stack((output, data_to_evaluate))
    return output
def plot_daily_turnover_hist(transactions, positions, ax=None, **kwargs):
    """Plots a histogram of daily turnover rates.

    Parameters
    ----------
    transactions : pd.DataFrame
        Prices and amounts of executed trades. One row per trade.
        - See full explanation in tears.create_full_tear_sheet.
    positions : pd.DataFrame
        Daily net position values.
        - See full explanation in tears.create_full_tear_sheet.
    ax : matplotlib.Axes, optional
        Axes upon which to plot.
    **kwargs, optional
        Passed to seaborn plotting function.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    if ax is None:
        ax = plt.gca()
    turnover = txn.get_turnover(positions, transactions, period=None)
    sns.distplot(turnover, ax=ax, **kwargs)
    ax.set_title('Distribution of Daily Turnover Rates')
    ax.set_xlabel('Turnover Rate')
    return ax
def plot_returns_cmp(self, only_show_returns=False, only_info=False):
    """Capital-aware metrics: compare returns against the benchmark, visualize the
    return trend and capital changes, and log other metrics (benchmark not involved here)."""
    self.log_func('Trades closed (bought then sold): {}'.format(self.order_has_ret.shape[0]))
    self.log_func('Win rate: {:.4f}%'.format(self.win_rate * 100))
    self.log_func('Average expected gain: {:.4f}%'.format(self.gains_mean * 100))
    self.log_func('Average expected loss: {:.4f}%'.format(self.losses_mean * 100))
    self.log_func('Profit/loss ratio: {:.4f}'.format(self.win_loss_profit_rate))
    self.log_func('Strategy return: {:.4f}%'.format(self.algorithm_period_returns * 100))
    self.log_func('Strategy annualized return: {:.4f}%'.format(self.algorithm_annualized_returns * 100))
    self.log_func('Buy order fill rate: {:.4f}%'.format(self.buy_deal_rate * 100))
    self.log_func('Cash utilization rate: {:.4f}%'.format(self.cash_utilization * 100))
    self.log_func('Strategy ran for {} trading days'.format(self.num_trading_days))

    if only_info:
        return

    self.algorithm_cum_returns.plot()
    plt.legend(['algorithm returns'], loc='best')
    plt.show()

    if only_show_returns:
        return

    sns.regplot(x=np.arange(0, len(self.algorithm_cum_returns)),
                y=self.algorithm_cum_returns.values)
    plt.show()

    # 'capital_blance' is the column name as stored by the data source
    sns.distplot(self.capital.capital_pd['capital_blance'],
                 kde_kws={"lw": 3, "label": "capital balance kde"})
    plt.show()
def main_fraction_under_figure(mirna2tar, mirna2age, target2age):
    perc_younger_lst = []
    for mirna in mirna2tar:
        if mirna not in mirna2age:
            continue
        age_set = [target2age[alpha] for alpha in mirna2tar[mirna] if alpha in target2age]
        perc_younger_lst.append(float(sum(i < mirna2age[mirna] for i in age_set)) / float(len(age_set)))
    print(len(perc_younger_lst))
    sns.distplot(perc_younger_lst)
    plt.gca().set_xlim([0, .6])
    plt.ylabel('Number of miRNAs')
    plt.xlabel('Fraction of Protein Coding Targets Younger than miRNA')
    plt.subplots_adjust(bottom=0.20)
    plt.savefig('figures/mirna_age_fraction.pdf', bbox_inches='tight')
    plt.close()
def count_vote_dist():
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    delta = 2
    x_list = []
    y_list = []
    xx = []
    for i in range(1000):
        x_list.append((i * delta, (i + 1) * delta))
    for tu in x_list:
        try:
            # y_list.append(math.log(db_inst.find({"total_vote": {"$gt": tu[0], "$lt": tu[1]}}).count(), 10))
            y_list.append(db_inst.find({"total_vote": {"$gte": tu[0], "$lt": tu[1]}}).count())
            xx.append(tu[0])
            print(y_list[-1])
        except Exception:
            xx.append(tu[0])
            y_list.append(0)
    # y_list.append(math.log(db_inst.find({"total_vote": {"$gt": x_list[-1][1]}}).count(), 10))
    y_list.append(db_inst.find({"total_vote": {"$gt": x_list[-1][1]}}).count())
    xx.append(xx[-1] + 1)
    res = {"x": x_list, 'y': y_list}
    open('%s/data/amazon_data/%s' % (PROJECT_PATH, 'vote_counts.json'), 'w').write(json.dumps(res))
    # plt.plot(xx, y_list)
    # plt.grid()
    # plt.show()
    sns.distplot(y_list)
    plt.show()
def plot_hist(self, struct_type, ax=None, errtxt=True, **kwargs):
    """
    Histogram plot.
    """
    #if codes is None: codes = ["ae"]
    ax, fig, plt = get_ax_fig_plt(ax)
    import seaborn as sns
    codes = ["this", "gbrv_paw"]  #, "gbrv_uspp", "pslib", "vasp"]
    new = self[self["struct_type"] == struct_type].copy()

    ypos = 0.8
    for i, code in enumerate(codes):
        values = (100 * (new[code] - new["ae"]) / new["ae"]).dropna()
        sns.distplot(values, ax=ax, rug=True, hist=False, label=code)

        # Add text with Mean or (MARE/RMSRE)
        if errtxt:
            text = []; app = text.append
            #app("%s MARE = %.2f" % (code, values.abs().mean()))
            app("%s RMSRE = %.2f" % (code, np.sqrt((values**2).mean())))
            ax.text(0.6, ypos, "\n".join(text), transform=ax.transAxes)
            ypos -= 0.1

    ax.grid(True)
    ax.set_xlabel("relative error %")
    ax.set_xlim(-0.8, 0.8)
    return fig
def log2_oulierfilter(df_by_cell, plot=False):
    log2_df = np.log2(df_by_cell + 1)
    top_log2 = find_top_common_genes(log2_df)
    if top_log2.empty:
        print("no common genes found")
        return log2_df, log2_df.transpose()
    log2_df2 = pd.DataFrame(pd.to_numeric(log2_df, errors='coerce'))
    log_mean = top_log2.mean(axis=0).sort_values(ascending=False)
    # sort columns by descending mean (reindex replaces the removed reindex_axis)
    log2_sorted = top_log2.reindex(top_log2.mean(axis=0).sort_values(ascending=False).index, axis=1)
    xticks = []
    keep_col = []
    log2_cutoff = np.average(log2_sorted) - np.std(log2_sorted)
    avg_cutoff = np.average(log2_cutoff)
    for col, m in zip(log2_sorted.columns.tolist(), log2_sorted.mean()):
        if m > avg_cutoff:
            keep_col.append(col)
            xticks.append(col + ' ' + str("%.2f" % m))
    filtered_df_by_cell = df_by_cell[keep_col]
    filtered_df_by_gene = filtered_df_by_cell.transpose()
    filtered_log2 = np.log2(filtered_df_by_cell[filtered_df_by_cell > 0])
    if plot:
        ax = sns.boxplot(data=filtered_log2, whis=.75, notch=True)
        ax = sns.stripplot(x=filtered_log2.columns.values, y=filtered_log2.mean(axis=0),
                           size=4, jitter=True, edgecolor="gray")
        xtickNames = plt.setp(ax, xticklabels=xticks)
        plt.setp(xtickNames, rotation=90, fontsize=9)
        plt.show()
        plt.clf()
        sns.distplot(filtered_log2.mean())
        plt.show()
    log2_expdf_cell = np.log2(filtered_df_by_cell + 1)
    log2_expdf_gene = log2_expdf_cell.transpose()
    return log2_expdf_cell, log2_expdf_gene
def plot_traces_rts(p, all_traces, rts, names=['A', 'B', 'C', 'D'], tb=1000):
    tr = np.mean(p['tr']) * 1e3
    rtkeys = np.sort(list(rts.keys()))
    rt_dists = [np.asarray(rts[k]) * 1e3 - tr for k in rtkeys]
    tb = np.ceil(np.max([np.max(rti) if len(rti) > 0 else 0 for rti in rt_dists])) + 50
    sns.set(style='white', font_scale=1.5)
    f, axes = build_multi_axis(p, tb=tb)
    clrs = ['#3572C6', '#c44e52', '#8172b2', '#83a83b']
    for i in range(len(all_traces)):
        for ii, ax in enumerate(axes.flatten()):
            x = np.arange(len(all_traces[i][ii]))
            ax.plot(x, all_traces[i][ii], color=clrs[ii], alpha=.3, lw=.75)
    for i, ax in enumerate(axes.flatten()):
        divider = make_axes_locatable(ax)
        axx = divider.append_axes("top", size=.7, pad=0.01, sharex=ax)
        for spine in ['top', 'left', 'bottom', 'right']:
            axx.spines[spine].set_visible(False)
        axx.set_xticklabels([])
        axx.set_yticklabels([])
        if len(rt_dists[i]) <= 1:
            continue
        sns.distplot(rt_dists[i], ax=axx, label=str(rtkeys[i]),  # label each RT distribution by its key
                     kde=True, hist=True, color=clrs[i], bins=20)
        text_str = r'$\mu_{%s}=%.fms$' % (names[i], tr + np.mean(rt_dists[i]))
        ax.text(x[0] - 50, np.mean(p['a']) - .1 * np.mean(p['a']), text_str, fontsize=21)
def histogram(self, x=None, y=None, l=None, t=None, **kwargs):
    """
    this is a short-cut for creating many possible histograms,
    at a specified beamline location l, or specified time t.

    - if x and y are not input, then it creates a full joint-scatterplot
      for each pair of variables (7 variables total: x, y, z, vx, vy, vz, t)
    - if x is input, it creates a 1d histogram with respect to that parameter
    - if x and y are input, creates a 2d histogram with respect to those parameters
    """
    table = self.to_dataframe(l=l, t=t, latex=True)

    if x is None and y is None:
        g = sns.pairplot(table, **kwargs)
        for ax in g.axes.flat:
            _ = plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
        return

    if x is not None and y is None:
        x = self._reformat_label(x)
        sns.distplot(table[x], **kwargs)
        plt.xlabel(x)
        return

    if x is not None and y is not None:
        x = self._reformat_label(x)
        y = self._reformat_label(y)
        sns.jointplot(x=x, y=y, data=table, **kwargs)
        return
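# A hedged illustration of the three dispatch modes above, using plain seaborn
# calls on a synthetic frame (the class instance and its to_dataframe /
# _reformat_label helpers are not reproduced here).
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

table = pd.DataFrame(np.random.randn(300, 3), columns=['x', 'y', 'z'])
sns.pairplot(table)                       # x is None and y is None: all pairs
sns.distplot(table['x'])                  # x only: 1d histogram
sns.jointplot(x='x', y='y', data=table)   # x and y: 2d histogram
plt.show()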
def make_return_dist_fig(sim_lookup, predictions, pick_K=100, n_bins=200, n_boots=5000):
    sim_net = sim_lookup['net_ret'].values
    sim_weights = sim_lookup['weights'].values
    bin_locs = np.linspace(0, 100, n_bins)[::-1]
    bins = np.percentile(sim_lookup['pred'].values, bin_locs)
    sim_samps_per_bin = len(sim_lookup) / float(n_bins)
    pred_bins = np.digitize(predictions['returns'] / 100., bins)  # find bins of first max_K points in prediction
    sim_returns = np.zeros(n_boots)
    boot_samps = sim_samps_per_bin * pred_bins[:pick_K] + \
        np.random.randint(0, int(sim_samps_per_bin), size=(n_boots, pick_K))
    boot_samps = boot_samps.astype(int)
    sim_returns = np.sum(sim_net[boot_samps], axis=1) / np.sum(sim_weights[boot_samps], axis=1)
    sim_returns = LCM.annualize_returns(sim_returns)

    fig, ax = plt.subplots(figsize=(5.0, 4.0))
    sns.distplot(sim_returns, bins=100, hist=False, rug=False, ax=ax,
                 kde_kws={'color': 'k', 'lw': 3})
    plt.xlabel('Annual returns (%)', fontsize=14)
    plt.ylabel('Probability', fontsize=14)
    plt.title('Estimated portfolio returns', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.margins(.01, .01)
    plt.tight_layout()
    return fig
def plot_rt_dists(simdf, axes=None):
    targets = ['A', 'B', 'C', 'D']
    targetColors = dict(zip(targets, ['#3572C6', '#c44e52', '#8172b2', '#83a83b']))
    sns.set(style='white')
    if axes is None:
        f, axes = plt.subplots(2, 2, figsize=(9, 6), sharex=True)
        axes = axes.flatten()
    for i, ax in enumerate(axes):
        target = targets[i]
        rts = simdf[simdf.choice == target].rt.values
        sns.distplot(rts, kde=False, hist_kws={'alpha': .9}, norm_hist=True,
                     bins=10, ax=ax, color=targetColors[target])
        top = ax.get_ylim()[1] * .75
        ax.text(750, top, target, color=targetColors[target], fontsize=19)
    x = np.array([0, 300, 600, 900])
    axes = np.asarray(plt.gcf().axes)  # avoids referencing f, which is unbound when axes was passed in
    axes[0].set_ylabel('Probability Mass', fontsize=17)
    axes[2].set_ylabel('Probability Mass', fontsize=17)
    axes[2].set_xlabel('Time (ms)', fontsize=17)
    axes[3].set_xlabel('Time (ms)', fontsize=17)
    for ax in axes.flatten():
        ax.set_title('')
        ax.set_xticks(x)
        ax.set_yticklabels('')
        ax.set_xlim(0, 900)
    axes[2].set_xticklabels(x, fontsize=12)
    axes[3].set_xticklabels(x, fontsize=12)
    sns.despine()
def hist_boxplot(x='', category='', df=pandas.DataFrame(), colors={}, xlim=[], bins=[],
                 alpha=0.9, box_step=0.15, ax=None):
    category_values = df[category].drop_duplicates()
    if isinstance(colors, dict):
        category_values = list(colors.keys())
    box_position = 1 + (box_step * len(category_values))
    yticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    x_values = dict()
    x_nums = dict()
    bins = numpy.arange(xlim[0] - ((xlim[1] - xlim[0]) / 50),
                        xlim[1] + ((xlim[1] - xlim[0]) / 50),
                        (xlim[1] - xlim[0]) / 100)
    for cv in category_values:
        label = cv
        if isinstance(colors, dict):
            color = colors[cv]
        elif isinstance(colors, list):
            color = colors.pop()
        df_tmp = df.loc[(df[category] == cv), :]
        x_values[cv] = df_tmp[x].dropna()
        x_nums[cv] = df_tmp[x].dropna().shape[0]
        hist_kws = {'cumulative': True, 'histtype': 'step', 'lw': 1, 'alpha': alpha}
        seaborn.distplot(x_values[cv], color=color, kde=False, bins=bins, ax=ax,
                         hist_kws=hist_kws, norm_hist=True, label=label)
        box = ax.boxplot(x_values[cv].tolist(), positions=[box_position, ],
                         vert=False, showfliers=False, widths=[0.1, ])
        for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
            matplotlib.pyplot.setp(box[element], color=color, linestyle='solid')
        yticks.append(box_position)
        box_position = box_position - box_step
    ax.set_xlabel(x)
    ax.set_ylabel('Cumulative frequency')
    ax.set_xlim(numpy.mean([xlim[0], min(bins)]), numpy.mean([xlim[1], max(bins)]))
    ax.set_ylim(-0.02, 1.1 + (box_step * len(category_values)))
    ax.set_yticks(yticks)
    yticklabels = [y for y in yticks if y <= 1] + category_values
    ax.set_yticklabels(yticklabels)
    return ax
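# A hedged usage sketch for hist_boxplot; the function relies on module-style
# imports (pandas, numpy, seaborn, matplotlib), so the example follows the same
# convention. The two-group frame here is synthetic.
import numpy
import pandas
import seaborn
import matplotlib
import matplotlib.pyplot

df = pandas.DataFrame({
    'value': numpy.concatenate([numpy.random.normal(0, 1, 200),
                                numpy.random.normal(2, 1, 200)]),
    'group': ['a'] * 200 + ['b'] * 200,
})
fig, ax = matplotlib.pyplot.subplots()
hist_boxplot(x='value', category='group', df=df,
             colors={'a': 'steelblue', 'b': 'darkorange'},
             xlim=[-4, 6], ax=ax)
matplotlib.pyplot.show()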
def plot_flux_distributions(plt, old_mag, new_mag, old_weighted_rms, new_weighted_rms,
                            faint, bright, old_PA1, new_PA1,
                            name='', outdir='.plots'):
    """Plot various distributions of fluxes and magnitudes.

    Parameters
    ----------
    plt : matplotlib.pyplot instance
        pyplot instance to plot with
    old_mag : np.array
        old magnitudes
    new_mag : np.array
        new magnitudes
    old_weighted_rms : np.array
        old rms weighted by the mean (rms(data)/mean(data))
    new_weighted_rms : np.array
        new rms weighted by the mean (rms(data)/mean(data))
    faint : float
        Faint end of range that PA1 was computed from.
    bright : float
        Bright end of range that PA1 was computed from.
    old_PA1 : float
        Old value of PA1, to plot as horizontal line.
    new_PA1 : float
        New value of PA1, to plot as horizontal line.
    name : str
        Name to include in plot titles and save files.
    outdir : str, optional
        Directory to write the saved plots to.
    """
    import seaborn
    seaborn.set_style('whitegrid')
    import scipy.stats

    old_color = 'blue'
    new_color = 'red'
    plt.figure()
    plt.plot(old_mag, old_weighted_rms, '.', color=old_color, label='old')
    plt.plot(new_mag, new_weighted_rms, '.', color=new_color, label='new')
    plt.axvline(faint, ls=':', color=old_color)
    plt.axvline(bright, ls=':', color=old_color)
    plt.axhline(old_PA1, ls='--', color=old_color)
    plt.axhline(new_PA1, ls='--', color=new_color)
    plt.legend(loc='upper left')
    plt.title('Where is the systematic flux rms limit?')
    plt.xlabel('magnitude')
    plt.ylabel('rms/mean per source')
    filename = os.path.join(outdir, '{}-photometry-PA1.pdf')
    plt.savefig(filename.format(name))

    plt.figure()
    seaborn.distplot(old_weighted_rms, fit=scipy.stats.lognorm, kde=False, label="old", color=old_color)
    seaborn.distplot(new_weighted_rms, fit=scipy.stats.lognorm, kde=False, label="new", color=new_color)
    plt.title('Source RMS pre/post-jointcal')
    plt.xlabel('rms(flux)/mean(flux)')
    plt.ylabel('number')
    plt.legend(loc='upper right')
    filename = os.path.join(outdir, '{}-photometry-rms.pdf')
    plt.savefig(filename.format(name))
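# A hedged usage sketch with synthetic photometry; the magnitudes, rms values,
# and PA1 numbers below are placeholders, not measured values.
import os
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
old_mag = rng.uniform(16, 22, 500)
new_mag = rng.uniform(16, 22, 500)
old_rms = rng.lognormal(-4, 0.5, 500)
new_rms = rng.lognormal(-4.2, 0.5, 500)
os.makedirs('.plots', exist_ok=True)
plot_flux_distributions(plt, old_mag, new_mag, old_rms, new_rms,
                        faint=21.0, bright=17.0, old_PA1=0.02, new_PA1=0.015,
                        name='example')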
def skew_plot(app_train, features, filename):
    """Visualize the skewness of the given features in the DataFrame."""
    fcols = 2
    frows = len(features)
    plt.figure(figsize=(4 * fcols, 6 * frows))
    i = 0
    for col in features:
        dat = app_train[[col, 'TARGET']].dropna()

        i += 1
        plt.subplot(frows, fcols, i)
        sns.distplot(dat[col], fit=stats.norm)
        plt.title(col + ' Original')
        plt.xlabel('')

        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(dat[col], plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
        plt.xlabel('')
        plt.ylabel('')
    plt.tight_layout(h_pad=2.5)
    plt.savefig(filename)
    plt.show()
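# A hedged usage sketch; the frame only needs the listed feature columns plus a
# TARGET column, mirroring the Kaggle-style layout the function expects.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

app_train = pd.DataFrame({
    'AMT_INCOME': np.random.lognormal(10, 0.6, 500),  # right-skewed on purpose
    'AGE': np.random.normal(40, 10, 500),
    'TARGET': np.random.randint(0, 2, 500),
})
skew_plot(app_train, ['AMT_INCOME', 'AGE'], 'skew_plot_example.png')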
def plot_epi_T1_corregistration(mean_epi_file, wm_file, subject_id,
                                similarity_distribution=None, figsize=(11.7, 8.3)):
    fig = plt.figure(figsize=figsize)

    if similarity_distribution:
        ax = plt.subplot(2, 1, 1)
        sns.distplot(list(similarity_distribution.values()), ax=ax)
        ax.set_xlabel("EPI-T1 mincost function (over all subjects)")
        cur_similarity = similarity_distribution[subject_id]
        label = "mincost function = %g" % cur_similarity
        plot_vline(cur_similarity, label, ax=ax)
        ax = plt.subplot(2, 1, 2)
    else:
        ax = plt.subplot(1, 1, 1)  # subplot indices are 1-based

    func = nb.load(mean_epi_file).get_data()
    func_affine = nb.load(mean_epi_file).get_affine()
    wm_data = nb.load(wm_file).get_data()
    wm_affine = nb.load(wm_file).get_affine()

    slicer = viz.plot_anat(np.asarray(func), np.asarray(func_affine), black_bg=True,
                           cmap=cm.Greys_r,  # @UndefinedVariable
                           figure=fig,
                           axes=ax,
                           draw_cross=False)
    slicer.contour_map(np.asarray(wm_data), np.asarray(wm_affine),
                       linewidths=[0.1], colors=['r', ])

    fig.suptitle('coregistration', fontsize='14')
    return fig
def histogramPlot(cls, data, bins, fileLocation, label=''):
    from matplotlib import pyplot as plt
    import seaborn as sns
    histogram, axes = plt.subplots()
    sns.distplot(data, bins=bins, kde=False, rug=False, axlabel=label)
    cls.saveFigure(histogram, fileLocation)
def plot_DVARS(title, DVARS_file, mean_DVARS_distribution=None, figsize=(11.7, 8.3)):
    DVARS = np.loadtxt(DVARS_file)

    fig = Figure(figsize=figsize)
    FigureCanvas(fig)

    if mean_DVARS_distribution:
        grid = GridSpec(2, 4)
    else:
        grid = GridSpec(1, 4)

    ax = fig.add_subplot(grid[0, :-1])
    ax.plot(DVARS)
    ax.set_xlim((0, len(DVARS)))
    ax.set_ylabel("DVARS")
    ax.set_xlabel("Frame number")
    ylim = ax.get_ylim()

    ax = fig.add_subplot(grid[0, -1])
    sns.distplot(DVARS, vertical=True, ax=ax)
    ax.set_ylim(ylim)

    if mean_DVARS_distribution:
        ax = fig.add_subplot(grid[1, :])
        sns.distplot(mean_DVARS_distribution, ax=ax)
        ax.set_xlabel("Mean DVARS (over all subjects) [std]")

        mean_DVARS = DVARS.mean()
        label = "Mean DVARS = %g" % mean_DVARS
        plot_vline(mean_DVARS, label, ax=ax)

    fig.suptitle(title, fontsize='14')
    return fig
def plotResults(tr, resultKey='resultInputPsf', doRates=False, title='', asHist=False,
                doPrint=True, actuallyPlot=True):
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.style.use('ggplot')
    import seaborn as sns
    sns.set(style="whitegrid", palette="pastel", color_codes=True)

    methods = ['ALstack', 'ZOGY', 'SZOGY', 'ALstack_decorr']
    tr = [t for t in tr if t is not None and t[resultKey]]
    FN = pd.DataFrame({key: np.array([t[resultKey][key]['FN'] for t in tr]) for key in methods})
    FP = pd.DataFrame({key: np.array([t[resultKey][key]['FP'] for t in tr]) for key in methods})
    TP = pd.DataFrame({key: np.array([t[resultKey][key]['TP'] for t in tr]) for key in methods})
    title_suffix = 's'
    if doRates:
        FN /= (FN + TP)
        FP /= (FN + TP)
        TP /= (FN + TP)
        title_suffix = ' rate'
    if doPrint:
        print('FN:', '\n', FN.mean())
        print('FP:', '\n', FP.mean())
        print('TP:', '\n', TP.mean())

    if not actuallyPlot:
        return TP, FP, FN

    matplotlib.rcParams['figure.figsize'] = (18.0, 6.0)
    fig, axes = plt.subplots(nrows=1, ncols=2)

    if not asHist:
        sns.violinplot(data=TP, cut=True, linewidth=0.3, bw=0.25, scale='width', alpha=0.5, ax=axes[0])
        if TP.shape[0] < 500:
            sns.swarmplot(data=TP, color='black', size=3, alpha=0.3, ax=axes[0])
        sns.boxplot(data=TP, saturation=0.5, boxprops={'facecolor': 'None'},
                    whiskerprops={'linewidth': 0}, showfliers=False, ax=axes[0])
        plt.setp(axes[0], alpha=0.3)
        axes[0].set_ylabel('True positive' + title_suffix)
        axes[0].set_title(title)
        sns.violinplot(data=FP, cut=True, linewidth=0.3, bw=0.5, scale='width', ax=axes[1])
        if FP.shape[0] < 500:
            sns.swarmplot(data=FP, color='black', size=3, alpha=0.3, ax=axes[1])
        sns.boxplot(data=FP, saturation=0.5, boxprops={'facecolor': 'None'},
                    whiskerprops={'linewidth': 0}, showfliers=False, ax=axes[1])
        plt.setp(axes[1], alpha=0.3)
        axes[1].set_ylabel('False positive' + title_suffix)
        axes[1].set_title(title)
    else:
        for t in TP:
            sns.distplot(TP[t], label=t, norm_hist=False, ax=axes[0])
        axes[0].set_xlabel('True positive' + title_suffix)
        axes[0].set_title(title)
        legend = axes[0].legend(loc='upper left', shadow=True)
        for t in FP:
            sns.distplot(FP[t], label=t, norm_hist=False, ax=axes[1])
        axes[1].set_xlabel('False positive' + title_suffix)
        axes[1].set_title(title)
        legend = axes[1].legend(loc='upper left', shadow=True)

    return TP, FP, FN
def Array_Grapher(array):
    """
    Simply creates a histogram from the array when running from a shell
    :param array: an array from a numpy function
    :return: a histogram of the array
    """
    seaborn.distplot(array)
def plot_volume_per_day_hist(transactions, ax=None, **kwargs):
    """Plots a histogram of trading volume per day.

    Parameters
    ----------
    transactions : pd.DataFrame
        Daily transaction volume and dollar amount.
        - See full explanation in tears.create_full_tear_sheet.
    ax : matplotlib.Axes, optional
        Axes upon which to plot.
    **kwargs, optional
        Passed to seaborn plotting function.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    if ax is None:
        ax = plt.gca()

    sns.distplot(transactions.txn_volume, ax=ax, **kwargs)
    ax.set_title('Distribution of Daily Trading Volume')
    ax.set_xlabel('Volume')
    return ax
def rolling_success_diff(answers, last_count=4, filters=None, only_last=True):
    if filters is None:
        filters = [None]
    data = []
    for filter in filters:
        df = filter_users(answers, min_answer_count=filter)
        for df in df.groupby('user'):
            df = df[1]
            mean = df['correct'].mean()
            if len(df) < last_count:
                continue
            for x in df['correct'].rolling(last_count, last_count).mean():
                if np.isnan(x):
                    continue
                if not only_last:
                    data.append([np.round(x - mean, 1), filter, 0])
            if not only_last:
                data[-1][-1] = 1
            else:
                data.append([x - mean, filter, 1])

    df = pd.DataFrame(data, columns=['rolling_success_diff', 'min_answers', 'leave'])
    if not only_last:
        sns.pointplot(data=df, x='rolling_success_diff', y='leave', hue='min_answers').set(ylim=(0, 0.2))
    else:
        for filter in filters:
            sns.distplot(df.loc[df['min_answers'] == filter, 'rolling_success_diff'], label=str(filter))
        plt.legend(loc=1)
    return df
def plot_vlast_density(self, nsim=100, nobs=100, param=None):
    """Plot the marginal density of ARG process."""
    plt.figure(figsize=(8, 4))
    vol = self.vsim_last(nsim=int(nsim), nobs=int(nobs), param=param)
    sns.distplot(vol, rug=True, hist=False)
    plt.show()
# Visualize frequency distribution of income variable
f, ax = plt.subplots(1, 2, figsize=(18, 8))
ax[0] = dataset[' income'].value_counts().plot.pie(explode=[0, 0], autopct='%1.1f%%',
                                                   ax=ax[0], shadow=True)
ax[0].set_title('Income Share')
#f, ax = plt.subplots(figsize=(6, 8))
ax[1] = sns.countplot(x=" income", data=dataset, palette="Set1")
ax[1].set_title("Frequency distribution of income variable")
plt.show()

# Distribution of age variable
f, ax = plt.subplots(figsize=(10, 8))
x = dataset['age']
ax = sns.distplot(x, bins=10, color='blue')
ax.set_title("Distribution of age variable")
plt.show()

# Detect outliers in age variable with boxplot
f, ax = plt.subplots(figsize=(10, 8))
x = dataset['age']
ax = sns.boxplot(x)
ax.set_title("Visualize outliers in age variable")
plt.show()

# Visualize income with respect to age variable
f, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(x=" income", y="age", data=dataset)
ax.set_title("Visualize income with respect to age variable")
plt.show()
dataset = pd.read_csv(
    '/Users/kalharaperera/Desktop/Projects/Data Sets/us-weather-history/KCLT.csv'
)

# print(dataset.shape)
print(dataset.describe())

dataset.plot(x='actual_min_temp', y='actual_max_temp', style='o')
plt.title('Min temp Vs Max temp')
plt.xlabel('Min temp')
plt.ylabel('Max temp')
plt.show()

plt.figure(figsize=(15, 10))
# plt.tight_layout()
seaborninstance.distplot(dataset['actual_max_temp'])
plt.show()

# data splicing: basically splitting your data into training data and testing data
X = dataset['actual_min_temp'].values.reshape(-1, 1)
Y = dataset['actual_max_temp'].values.reshape(-1, 1)

# assigning 20% of the data to test data and the rest to training data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

regressor = LinearRegression()

# training the algorithm
regressor.fit(X_train, Y_train)
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

# In[5]:

ames_train = pd.read_csv('train.csv')
ames_train.columns

# In[15]:

sns.set(style="white", palette="muted", color_codes=True)
# histogram plot
sns.distplot(ames_train['SalePrice'], color='r')

# In[16]:

## rug plot
sns.distplot(ames_train['SalePrice'], hist=False, rug=True, color='m')

# In[19]:

print("Std: %f" % ames_train['SalePrice'].std())
print("Skewness: %f" % ames_train['SalePrice'].skew())
print("Kurtosis: %f" % ames_train['SalePrice'].kurt())

# In[27]:

## build a correlation heatmap
@author: arunramji
"""

# Let's import required libraries
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/Users/arunramji/Downloads/Sourcefiles/Kaggle_Housing_Price/train.csv')

fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(df['SalePrice'])
plt.show()

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 16.0
fig_size[1] = 4.0

x = df['SalePrice']
plt.hist(x, density=True, bins=400)
plt.ylabel('SalePrice')

df_1 = df[df['SalePrice'] < 400000]

# Missing values
Null_Cols = pd.DataFrame(df.select_dtypes(include='object').isnull().sum(), columns=['Null_count'])
Null_Cols[Null_Cols.Null_count > 0]
AAPL[column_name] = pd.Series(AAPL['Adj Close']).rolling(window=ma).mean()

# %%
AAPL[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(subplots=False, figsize=(14, 7))

# %%
# If we get a moving average for more days at a time, we get a smoother line, and it's not gonna rely
# much on the daily fluctuation changes.

# %%
# Now retrieving the daily returns for Apple
# What that means is: for any given day, what is your percent return on your money?
AAPL['Daily Return'] = AAPL['Adj Close'].pct_change()
AAPL['Daily Return'].plot(figsize=(14, 7), legend=True, linestyle='--', marker='o')

# %%
# This is a histogram of the daily returns for the past year.
sns.distplot(AAPL['Daily Return'].dropna(), bins=100, color='purple')

# %%
# It looks like the above histogram is skewed a little more negatively, but we need to do some more
# analysis to check that out.
# The following graph is just another way to see it.
AAPL['Daily Return'].hist(bins=50)
plt.gcf().set_size_inches(15, 8)

# %%
# Now building up another DataFrame with all the adjusted close columns for each of the stocks'
# DataFrames in order to analyse the return of all the stocks in our data list.
closing_df = DataReader(tech_list, 'yahoo', start, end)['Adj Close']

# %%
closing_df.head()

# %%
X_soph = np.concatenate([X_soph, encoded_conf_soph], axis=1)

smote = SMOTE(sampling_strategy='minority', k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

#%% Plot distributions ################################################
zeros = features.loc[features['eval'] == 0]
ones = features.loc[features['eval'] == 1]
numeric_cols = features.columns[1:-1]
nplot = 1
for col in numeric_cols:
    plt.subplot(int(np.ceil(len(numeric_cols) / 3)), 3, nplot)
    sns.distplot(zeros[col], hist=False, label='Misses')
    sns.distplot(ones[col], hist=False, label='Hits')
    nplot += 1
    plt.legend()
plt.tight_layout()
plt.show()

# %% Modelling ###############################################
estimator = XGBClassifier()
# estimator = LogisticRegression()
param_grid = {
tips.head()
tips['tip_pct'] = tips.tip / (tips.total_bill - tips.tip)
sns.barplot(x='tip_pct', y='day', data=tips, orient='h')
# length of the bar is the average tip_pct on each day
# black line is the 95% confidence interval
sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')
sns.set(style='whitegrid')

tips.tip_pct.plot.hist(bins=50)
tips.tip_pct.hist(bins=50)
tips.tip_pct.plot.density()
tips.tip_pct.plot.kde()

comp1 = np.random.normal(0, 1, size=200)
comp1
comp2 = np.random.normal(10, 2, size=200)
values = pd.Series(np.concatenate([comp1, comp2]))
values
# distplot plots both a histogram and a continuous density estimate of the simulated values
sns.distplot(values, bins=100, color='k')

macro = pd.read_csv('../examples/macrodata.csv')
macro.head()
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
data.head()
trans_data = np.log(data).diff().dropna()
trans_data[-5:]
sns.regplot('m1', 'unemp', data=trans_data)
plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))
sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})

sns.factorplot(x='day', y='tip_pct', hue='time', col='smoker', kind='bar', data=tips[tips.tip_pct < 1])
sns.factorplot(x='day', y='tip_pct', row='time', col='smoker', kind='bar', data=tips[tips.tip_pct < 1])
sns.factorplot(x='tip_pct', y='day', kind='box', data=tips[tips.tip_pct < 1])
    'price', 'std_score', 'max_score', 'min_score', 'avg_score', 'count'
]).fillna(0)

corr_p = stats.pearsonr(df_analyze['price'], df_analyze['avg_score'])
print('STATS | PEARSON R')
print(corr_p)
corr_s = stats.spearmanr(df_analyze['price'], df_analyze['avg_score'])
print('STATS | SPEARMAN')
print(corr_s)

df_analyze = SharedUtils.normalize(df_analyze)

sns.distplot(df_analyze['avg_score'], hist=True, kde=True, color='blue',
             hist_kws={'edgecolor': 'black'})
# Add labels
plt.title('Histogram Sentiment Score (n=169)')
plt.xlabel('Score')
plt.ylabel('No. Weeks')
plt.show()

sns.distplot(df_analyze['price'], hist=True, kde=True, color='blue',
             hist_kws={'edgecolor': 'black'})
# Add labels
plt.title('Histogram Price (n=169)')
def visualiseMargins(self, df):
    df_won = df[df['Result'] == 1]
    df_w_run = df_won[df_won['Margin'].astype(str).str.contains("runs")]
    df_w_run['Margin'].replace(to_replace=r'[\s]+.*', value="", regex=True, inplace=True)
    df_w_run = df_w_run.astype({"Margin": int})
    categorical = ['Toss', 'Bat', 'Opposition', 'Ground']
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    for variable, subplot in zip(categorical, ax.flatten()):
        sns.scatterplot(x="Result", y="Margin", hue=df_w_run[variable], data=df_w_run, ax=subplot)
    plt.show()

    plt.figure(2)
    sns.distplot(df_w_run['Margin'], hist=True, kde=True, bins=int(5), color='darkblue',
                 hist_kws={'edgecolor': 'black'}, kde_kws={'linewidth': 4})
    plt.show()

    df_w_wickets = df_won[df_won['Margin'].astype(str).str.contains("wickets")]
    df_w_wickets['Margin'].replace(to_replace=r'[\s]+.*', value="", regex=True, inplace=True)
    df_w_wickets = df_w_wickets.astype({"Margin": int})
    plt.figure(3)
    sns.distplot(df_w_wickets['Margin'], hist=True, kde=True, bins=int(10), color='darkblue',
                 hist_kws={'edgecolor': 'black'}, kde_kws={'linewidth': 4})
    plt.show()

    df_lost = df[df['Result'] == 0]
    df_l_run = df_lost[df_lost['Margin'].astype(str).str.contains("runs")]
    df_l_run['Margin'].replace(to_replace=r'[\s]+.*', value="", regex=True, inplace=True)
    df_l_run = df_l_run.astype({"Margin": int})
    categorical = ['Toss', 'Bat', 'Opposition', 'Ground']
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    for variable, subplot in zip(categorical, ax.flatten()):
        sns.scatterplot(x="Result", y="Margin", hue=df_l_run[variable], data=df_l_run, ax=subplot)
    plt.show()

    plt.figure(5)
    sns.distplot(df_l_run['Margin'], hist=True, kde=True, bins=int(5), color='darkblue',
                 hist_kws={'edgecolor': 'black'}, kde_kws={'linewidth': 4})
    plt.show()

    df_l_wicket = df_lost[df_lost['Margin'].astype(str).str.contains("wickets")]
    df_l_wicket['Margin'].replace(to_replace=r'[\s]+.*', value="", regex=True, inplace=True)
    df_l_wicket = df_l_wicket.astype({"Margin": int})
    plt.figure(6)
    sns.distplot(df_l_wicket['Margin'], hist=True, kde=True, bins=int(10), color='darkblue',
                 hist_kws={'edgecolor': 'black'}, kde_kws={'linewidth': 4})
    plt.show()
b = closure(a)
b = multiplicative_replacement(a)
pd.DataFrame(b)
pd.DataFrame(b).sum(axis=1)

###########################################################
# PLOT histogram - investigate error in A549_D aitchison ##
###########################################################
import seaborn as sns

df_part = df_int.iloc[:, df_int.columns.get_level_values("batch") == ("_".join(["A549", "D", "Rep2"]))]

for i in range(np.shape(df_part)[1]):
    #plt.hist(np.log2((df_part.iloc[:,i]).replace(0,np.nan)), bins = 1000,
    #         histtype="step", label = df_part.iloc[:,i].name[-1])
    sns.distplot(np.log2((df_part.iloc[:, i]).replace(0, np.nan)), bins=1000,
                 label=df_part.iloc[:, i].name[-1], kde=True, hist=False)
plt.legend()

ax = np.log2(df_part.replace(0, np.nan)).plot.hist(bins=1000, alpha=0.5)

df_part = df_part[(df_part.T != 0).any()]
df_ait_part = aitchison_transform(df_part)

for i in range(np.shape(df_ait_part)[1]):
    # plt.hist((df_ait_part.iloc[:,i]).replace(0,np.nan), bins = 1000,
    #          histtype="step", label = df_part.iloc[:,i].name[-1])
    sns.distplot(df_ait_part.iloc[:, i], bins=1000,
                 label=df_part.iloc[:, i].name[-1], kde=True, hist=False)
plt.legend()
mean = accident_df[column_name].mean()
std = accident_df[column_name].std()
min_ = accident_df[column_name].min()
max_ = accident_df[column_name].max()
kurt = accident_df[column_name].kurt()
skew = accident_df[column_name].skew()
print(column_name, ',min =', min_, ',max =', max_, ',avg =', mean, ',std =', std,
      ',skewness =', skew, ',kurtosis =', kurt, end='\n')
print()

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
sns.distplot(accident_df[accident_df['Precipitation(in)'].isnull() == False]['Precipitation(in)'], ax=axs[0])
sns.distplot(accident_df[accident_df['Temperature(F)'].isnull() == False]['Temperature(F)'], ax=axs[1])
def plot_distplot(df, cell_model, matching):
    sns.distplot(df.loc[df.arch == "complex_core", "syn_len"], label="core")
    sns.distplot(df.loc[df.arch == "complex_derived", "syn_len"], label="der")
    plt.legend()
    outf = f"{RE}{cell_model}_ernst_active_bases_dist_{matching}.pdf"
    plt.savefig(outf, bbox_inches="tight")
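# A hedged usage sketch; RE is a module-level output-path prefix in the original
# code, so a placeholder is defined here, and the cell_model/matching arguments
# are hypothetical.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

RE = "./"  # placeholder output prefix
df = pd.DataFrame({
    'arch': ['complex_core'] * 100 + ['complex_derived'] * 100,
    'syn_len': np.concatenate([np.random.exponential(200, 100),
                               np.random.exponential(400, 100)]),
})
plot_distplot(df, cell_model='HEPG2', matching='matched')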
# fill HOLIDAY with normal
df['HOLIDAY'] = df['HOLIDAY'].fillna("NORMAL")

# drop features
drop_feats = [
    'WW_GRS', 'PERCENT', 'NM_0.5W_T', 'NM_0.5W_M24', 'NM_0.5W_M26',
    'NM_0.5W_F24', 'NM_0.5W_F26', 'GENRE2'
]
df.drop(drop_feats, axis=1, inplace=True)

# check OBO
df['OBO'].describe()

# original data
sns.distplot(df['OBO'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df['OBO'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequency')
plt.title('distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(df['OBO'], plot=plt)
plt.show()
def plotlabeldist(self):
    labels = [self.masks[i]['labels'] for i in range(len(self.masks))]
    return sns.distplot(labels)
dataset = pd.read_csv(r'C:\Users\Hp\Desktop\Nasa\Dataset\Asteroid_data_final.csv')
dataset.shape
dataset.describe()
dataset.isnull().any()
dataset = dataset.fillna(method='ffill')

X = dataset[['As_diam_km', 'As_dist_km', 'As_velocity_kmh', 'As_velocity_angle',
             'As_dist_relative_flag', 'Coordinate_flag', 'Sc_diam_m',
             'Sc_velocity_kmh', 'Relative_velocity', 'Estimated_time',
             'New_theta']].values
y = dataset['Final_Angle'].values

plt.figure(figsize=(15, 10))
plt.tight_layout()
seabornInstance.distplot(dataset['Final_Angle'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
#coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
#coeff_df

y_pred = regressor.predict(X_test)

df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1 = df.head(25)
df1
def plot_dist(result, out_dir, name, model_flavors, metric, cumu):
    if metric == "size":
        kwd = "set_sizes"
    elif metric == "prop":
        kwd = "set_props"

    sns.set(style="whitegrid", font="Roboto")

    # if "full" in model_flavors:
    #     set_sizes_full = result["{0}_full".format(kwd)]
    #     sns.distplot(
    #         set_sizes_full,
    #         hist=False,
    #         kde=True,
    #         kde_kws={"linewidth": 2, "shade": True, "cumulative": cumu},
    #         label="Full"
    #     )

    if "full" in model_flavors:
        set_sizes_full = result["{0}_full".format(kwd)]
        try:
            sns.distplot(
                set_sizes_full,
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "shade": False, "cumulative": cumu},
                label="PLASMA-JC"
            )
        except Exception:
            pass

    if "indep" in model_flavors:
        set_sizes_indep = result["{0}_indep".format(kwd)]
        try:
            sns.distplot(
                set_sizes_indep,
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "shade": False, "cumulative": cumu},
                label="PLASMA-JI"
            )
        except Exception:
            pass

    if "ase" in model_flavors:
        set_sizes_ase = result["{0}_ase".format(kwd)]
        try:
            sns.distplot(
                set_sizes_ase,
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "shade": False, "cumulative": cumu},
                label="PLASMA-AS"
            )
        except Exception:
            pass

    if "acav" in model_flavors:
        set_sizes_caviar_ase = result["{0}_caviar_ase".format(kwd)]
        try:
            sns.distplot(
                set_sizes_caviar_ase,
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "shade": False, "cumulative": cumu},
                label="CAVIAR-ASE"
            )
        except Exception:
            pass

    if "eqtl" in model_flavors:
        set_sizes_eqtl = result["{0}_eqtl".format(kwd)]
        try:
            sns.distplot(
                set_sizes_eqtl,
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "shade": False, "cumulative": cumu},
                label="QTL-Only"
            )
        except Exception:
            pass

    if metric == "prop":
        plt.xlim(0, 1)
    elif metric == "size":
        plt.xlim(0, 1000)

    plt.legend(title="Model")
    if cumu:
        cumu_kwd = "Cumulative "
        cumu_fname = "_cumu"
        yax = "Proportion of Markers"
    else:
        cumu_kwd = ""
        cumu_fname = ""
        yax = "Density"
    if metric == "size":
        plt.xlabel("Set Size")
        plt.ylabel(yax)
        plt.title("{0}Distribution of Credible Set Sizes: {1}".format(cumu_kwd, name))
        plt.savefig(os.path.join(out_dir, "set_size_distribution{0}.svg".format(cumu_fname)))
    elif metric == "prop":
        plt.xlabel("Set Size (Proportion of Total Markers)")
        plt.ylabel(yax)
        plt.title("{0}Distribution of Credible Set Sizes: {1}".format(cumu_kwd, name))
        plt.savefig(os.path.join(out_dir, "set_prop_distribution{0}.svg".format(cumu_fname)))
    plt.clf()
plt.scatter(dat[:, 1], dat[:, 3])
plt.grid()
plt.title('Petal Width by Sepal Width')
plt.ylabel('Petal width(cm)')
plt.xlabel('Sepal width(cm)')

dat1 = pd.DataFrame(data=dat, columns=iris["feature_names"])
dat1["target"] = iris["target"]
dat1['target'] = dat1['target'].replace([0, 1, 2], iris["target_names"])

sns.set(style="whitegrid")
sns.countplot(x="target", data=dat1)
plt.title("Number of Examples per Species")
plt.xlabel("species")

sns.distplot(dat1['sepal width (cm)'], hist=True, kde=False)
plt.title("Histogram of Sepal Width")

sns.barplot(x='target', y='sepal length (cm)', data=dat1, estimator=np.mean)
plt.title("Avg. Sepal Length by Species")
plt.ylabel("mean(sepal length (cm))")
plt.xlabel("species")

sns.boxplot(x='target', y='sepal width (cm)', data=dat1)
plt.title("Boxplot of Sepal Width by Species")
plt.xlabel("species")

sns.violinplot(x='target', y='sepal width (cm)', data=dat1)
plt.title("Violinplot of Sepal Width by Species")
plt.xlabel("species")
# Now drop the 'Id' column since it's unnecessary for the prediction process.
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
print("Train data size after dropping Id feature is : {}".format(train.shape))
print("Test data size after dropping Id feature is : {}".format(test.shape))
print(train.head())
print(test.head())

# Getting Description
print(train['SalePrice'].describe())

# Plot Histogram
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

print('Skewness: %f' % train['SalePrice'].skew())
from genetic_algorithm import generate_population

sns.set_style("whitegrid")


def lcs(X, Y):
    # find the length of the strings
    m = len(X)
    n = len(Y)

    # declaring the array for storing the dp values
    L = [[None] * (n + 1) for i in range(m + 1)]

    """Following steps build L[m + 1][n + 1] in bottom up fashion
    Note: L[i][j] contains length of LCS of X[0..i-1] and Y[0..j-1]"""
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    return L[m][n]


population = generate_population(10000)
lcs_vec = [lcs(individual[1], ['H', 'E', 'L', 'L', 'O', 'W', 'O', 'R', 'L', 'D'])
           for individual in population]
gene_dist = sns.distplot(lcs_vec, kde=False, norm_hist=False, color='black', bins=10)
plt.show()
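# A quick worked check of the lcs routine above: the classic example pair
# "ABCBDAB" / "BDCABA" has longest common subsequence "BCBA" (length 4), and a
# string is trivially an LCS of any string that contains it as a subsequence.
assert lcs("ABCBDAB", "BDCABA") == 4
assert lcs(list("HELLO"), list("HELLOWORLD")) == 5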
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train = pd.read_csv('Bike_Train.csv')
test = pd.read_csv('Bike_Test.csv')
print("original train data:", train.shape)
print("original test data:", test.shape)

# Figure 1: distribution of count
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(6, 5)
sns.distplot(train['count'])
ax.set(xlabel='count', title='distribution')
fig.savefig('001 count distribution', dpi=200)

# Drop the long tail: keep rows within 3 standard deviations of the mean
train_drop_tail = train[np.abs(train['count'] - train['count'].mean()) <= (3 * train['count'].std())]
print('----------------------')
print("train_drop_tail:", train_drop_tail.shape)

# Figure 2: comparison of the two distributions
fig = plt.figure()
fig.set_size_inches(12, 5)
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
sns.distplot(train['count'], ax=ax1)
sns.distplot(train_drop_tail['count'], ax=ax2)
    return reg


# def q_walk_uni_line(state, size, steps):
#     H = np.array([[1,1], [1, -1]]) / np.sqrt(2)
#     plus = H * np.array([1,0])
#     states = np.append(np.array()
#     walker = np.kron(state, plus)

samples = 100
r_dist = np.array([disc_line_random_walk(51, 101, 40) for i in range(samples)])

fig, axis = plt.subplots()
sbn.distplot(r_dist, ax=axis, kde=True, hist=True)
plt.title("Distribuição empírica para passeio aleatório com {} amostras.".format(samples))  # "Empirical distribution for a random walk with {} samples."
plt.xlabel("Nó de parada")         # "Stopping node"
plt.ylabel("Frequência relativa")  # "Relative frequency"
plt.grid(True)
plt.savefig("rd_walk{}.png".format(samples))
plt.clf()

n_qbits = 8
graph_size = 101
steps = 40
eng = MainEngine()
data = np.zeros(samples, dtype=int)
ax2.hist(data.Amount[data.Class == 0], bins=30)
ax2.set_title('Normal')
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
plt.yscale('log')
plt.show()

# In[ ]:

plt.figure(figsize=(12, 28 * 4))
gs = gridspec.GridSpec(28, 1)
for i in range(1, 29):
    ax = plt.subplot(gs[i - 1])
    sns.distplot(data['V' + str(i)][data.Class == 1], bins=50)
    sns.distplot(data['V' + str(i)][data.Class == 0], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + 'V' + str(i))
plt.show()
plt.tight_layout()

# In[ ]:

# Based on observation of data overlap above, try out a second dataset with redundancies removed
clean_data = data.drop(['V28', 'V27', 'V23', 'V8'], axis=1)
# Later - can re-run everything after running the following line
#data = clean_data
train_df["coverage"] = train_df.masks.map(np.sum) / pow(img_size_ori, 2) def cov_to_class(val): for i in range(0, 11): if val * 10 <= i : return i train_df["coverage_class"] = train_df.coverage.map(cov_to_class) # In[8]: fig, axs = plt.subplots(1, 2, figsize=(15,5)) sns.distplot(train_df.coverage, kde=False, ax=axs[0]) sns.distplot(train_df.coverage_class, bins=10, kde=False, ax=axs[1]) plt.suptitle("Salt coverage") axs[0].set_xlabel("Coverage") axs[1].set_xlabel("Coverage class") # In[9]: #Plotting the depth distributions¶ sns.distplot(train_df.z, label="Train") sns.distplot(test_df.z, label="Test") plt.legend() plt.title("Depth distribution")
# Load data into a pandas DataFrame. Note: 1st column is ID
home_data = pd.read_csv(file_path, index_col=0)
home_data.tail()
# home_data.head()
home_data.shape

# List of numerical attributes
home_data.select_dtypes(exclude=['object']).columns
len(home_data.select_dtypes(exclude='object').columns)
home_data.select_dtypes(exclude=['object']).describe().round(decimals=2)

home_data.select_dtypes(include=['object']).columns
len(home_data.select_dtypes(include='object').columns)
home_data.select_dtypes(include=['object']).describe()

target = home_data.SalePrice
plt.figure()
sns.distplot(target)
plt.title('Distribution of SalePrice')
plt.show()

sns.distplot(np.log(target))
plt.title('Distribution of Log-transformed SalePrice')
plt.xlabel('log(SalePrice)')
plt.show()

print('SalePrice has a skew of ' + str(target.skew().round(decimals=2)) +
      ' while the log-transformed SalePrice improves the skew to ' +
      str(np.log(target).skew().round(decimals=2)))

num_attributes = home_data.select_dtypes(exclude='object').drop('SalePrice', axis=1).copy()
fig = plt.figure(figsize=(12, 18))
for i in range(len(num_attributes.columns)):
    fig.add_subplot(9, 4, i + 1)
    sns.distplot(num_attributes.iloc[:, i].dropna())
traindata.isnull().sum().sum()
sns.boxplot(traindata['y'])
traindata.shape

# fill missing values with the mean
traindata['y'].fillna(traindata['y'].mean(), inplace=True)

sns.relplot(x="x", y="y", data=traindata)
sns.relplot(x=traindata['x'], y=traindata['y'], data=traindata)
sns.regplot(x=traindata['x'], y=traindata['y'], data=traindata)
sns.distplot(traindata['x'], bins=10)
sns.boxplot(traindata['x'], orient='v')
sns.boxplot(traindata['y'], orient='v')

Q1 = traindata.quantile(.25)
Q3 = traindata.quantile(.75)
IQR = traindata.apply(stats.iqr)
upper = (Q3 + 1.5 * IQR)
lower = (Q1 - 1.5 * IQR)

# Number of outliers for each column
(traindata > (Q3 + 1.5 * IQR)).sum()
(traindata < (Q1 - 1.5 * IQR)).sum()
plt.figure(figsize=(16, 8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

# Get summary statistics of rating
ratings['rating'].describe()

# Import seaborn library
import seaborn as sns
sns.set_style('whitegrid')
sns.set(font_scale=1.5)
%matplotlib inline

# Display distribution of rating
sns.distplot(ratings['rating'].fillna(ratings['rating'].median()))

# Join all 3 files into one dataframe
dataset = pd.merge(pd.merge(movies, ratings), users)

# Display 20 movies with highest ratings
dataset[['title', 'genres', 'rating']].sort_values('rating', ascending=False).head(20)

# Make a census of the genre keywords
genre_labels = set()
for s in movies['genres'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))


# Function that counts the number of times each of the genre keywords appear
def count_word(dataset, ref_col, census):
    keyword_count = dict()
    for s in census:
def distribution_compare_pretty(_df1, _df2, col, figsize=None, date_flag=False):
    """
    Draw pretty distribution graph for data compare

    Parameters
    ----------
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    col: string
        name of column to check
    figsize: tuple, default=None
        figure size
    date_flag: bool, default=False
        whether it is checking date features
    """
    # color values for graph
    TABLE1_DARK = "#4BACC6"
    TABLE2_DARK = "#F79646"

    df1, df2 = _df1.copy(), _df2.copy()

    if date_flag:
        numeric_col = '%s_numeric' % (col)
        if numeric_col not in df1.columns.values:
            snapshot_date_now = str(datetime.datetime.now().date())
            df1[numeric_col] = (pd.to_datetime(snapshot_date_now) -
                                pd.to_datetime(df1[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
        if numeric_col not in df2.columns.values:
            snapshot_date_now = str(datetime.datetime.now().date())
            df2[numeric_col] = (pd.to_datetime(snapshot_date_now) -
                                pd.to_datetime(df2[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
    else:
        numeric_col = col

    value_mins = [df1[numeric_col].min(), df2[numeric_col].min()]
    value_means = [df1[numeric_col].mean(), df2[numeric_col].mean()]
    value_medians = [df1[numeric_col].median(), df2[numeric_col].median()]
    value_maxs = [df1[numeric_col].max(), df2[numeric_col].max()]

    if date_flag:
        date_mins = [pd.to_datetime(df1[col], errors='coerce').min(),
                     pd.to_datetime(df2[col], errors='coerce').min()]
        date_maxs = [pd.to_datetime(df1[col], errors='coerce').max(),
                     pd.to_datetime(df2[col], errors='coerce').max()]

    both_value_max = np.max([abs(v) for v in value_maxs] + [abs(v) for v in value_mins])

    # get clean values
    df1_sample_dropna_values = df1[numeric_col].dropna().values
    df2_sample_dropna_values = df2[numeric_col].dropna().values

    # get distribution
    scale_flg = 0
    df1_draw_values = df1_sample_dropna_values
    df1_draw_value_4 = [value_mins[0], value_means[0], value_medians[0], value_maxs[0]]
    df2_draw_values = df2_sample_dropna_values
    df2_draw_value_4 = [value_mins[1], value_means[1], value_medians[1], value_maxs[1]]

    if both_value_max >= pow(10, 6):
        scale_flg = 1
        df1_draw_values, df1_draw_value_4 = _get_scale_draw_values(df1_draw_values, df1_draw_value_4)
        df2_draw_values, df2_draw_value_4 = _get_scale_draw_values(df2_draw_values, df2_draw_value_4)

    # draw the graph
    plt.clf()
    if figsize is not None:
        plt.figure(figsize=figsize)
    else:
        plt.figure(figsize=(10, 5))

    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if unique level is less than 10, draw countplot instead
    both_num_uni = np.max([df1[col].dropna().nunique(), df2[col].dropna().nunique()])
    if both_num_uni <= 10:
        df1_temp = pd.DataFrame(df1_sample_dropna_values, columns=['value'])
        df1_temp['type'] = 'table1'
        df2_temp = pd.DataFrame(df2_sample_dropna_values, columns=['value'])
        df2_temp['type'] = 'table2'
        full_temp = pd.concat([df1_temp, df2_temp], axis=0)
        sns.countplot(full_temp['value'], hue=full_temp['type'],
                      palette=sns.color_palette([TABLE1_DARK, TABLE2_DARK]))
        if both_num_uni > 5:
            plt.xticks(rotation=90)
        plt.legend(loc=1)
    else:
        ax1 = sns.distplot(df1_draw_values, color=TABLE1_DARK, hist=False, label='table1')
        ax2 = sns.distplot(df2_draw_values, color=TABLE2_DARK, hist=False, label='table2')
        y_low_1, y_up_1 = ax1.get_ylim()
        y_low_2, y_up_2 = ax2.get_ylim()
        y_low, y_up = np.min([y_low_1, y_low_2]), np.max([y_up_1, y_up_2])
        plt.ylim((y_low, y_up))

        if date_flag:
            _draw_texts(text_values=[date_mins[0], date_maxs[0]], draw_value_4=df1_draw_value_4,
                        mark=1, y_low=y_low, y_up=y_up, date_flag=True)
            _draw_texts(text_values=[date_mins[1], date_maxs[1]], draw_value_4=df2_draw_value_4,
                        mark=2, y_low=y_low, y_up=y_up, date_flag=True)
        else:
            _draw_texts(text_values=[value_mins[0], value_means[0], value_medians[0], value_maxs[0]],
                        draw_value_4=df1_draw_value_4, mark=1, y_low=y_low, y_up=y_up)
            _draw_texts(text_values=[value_mins[1], value_means[1], value_medians[1], value_maxs[1]],
                        draw_value_4=df2_draw_value_4, mark=2, y_low=y_low, y_up=y_up)

    plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

# Load the datasets the exercises below refer to (assumed sources: the column
# names used below match seaborn's iris and anscombe, and pydataset's InsectSprays)
iris = sns.load_dataset('iris')
anscombe = sns.load_dataset('anscombe')
IS = data('InsectSprays')

# 1. What does the distribution of petal lengths look like?
sns.distplot(iris.petal_length)

# 2. Is there a correlation between petal length and petal width?
sns.relplot(data=iris, x='petal_length', y='petal_width')

# 3. Would it be reasonable to predict species based on sepal width and sepal length?
sns.jointplot(data=iris, x='petal_length', y='petal_width')

# anscombe 1. Using the lesson as an example, use seaborn's load_dataset function to load the
# anscombe data set. Use pandas to group the data by the dataset column, and calculate summary
# statistics for each dataset. What do you notice?
anscombe.groupby('dataset').describe()
sns.relplot(x='x', y='y', data=anscombe)

# Load the InsectSprays dataset and read its documentation. Create a boxplot that shows the
# effectiveness of the different insect sprays.
sns.boxplot(data=IS, x='count', y='spray')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: prod_time1 / prod_time1_scaled are plotted below but were never loaded
# in the original excerpt; the two filenames here are assumptions following the
# pattern of the other files (label '30' -> '..._30_').
prod_time1 = np.genfromtxt('gauss_prod_time_50_0_1_30_.txt')
prod_time1_scaled = np.genfromtxt('gauss_prod_time_50_0_1_30_scaled.txt')

prod_time2 = np.genfromtxt('gauss_prod_time_50_0_1_26_.txt')
prod_time2_scaled = np.genfromtxt('gauss_prod_time_50_0_1_26_scaled.txt')
prod_time3 = np.genfromtxt('gauss_prod_time_50_0_1_24_.txt')
prod_time3_scaled = np.genfromtxt('gauss_prod_time_50_0_1_24_scaled.txt')
prod_time4 = np.genfromtxt('gauss_prod_time_50_0_1_23_5.txt')
prod_time4_scaled = np.genfromtxt('gauss_prod_time_50_0_1_23_5_scaled.txt')

nbins = 60  # note: 'bins' is ignored by distplot when hist=False

# the original wrapped this in a pointless 'if 1==1:' block and repeated the
# same distplot call eight times; a loop is equivalent and easier to maintain
fig, ax = plt.subplots()
for values, label in [(prod_time1, '30'), (prod_time2, '26'),
                      (prod_time3, '24'), (prod_time4, '23.5'),
                      (prod_time1_scaled, '30 scaled'), (prod_time2_scaled, '26 scaled'),
                      (prod_time3_scaled, '24 scaled'), (prod_time4_scaled, '23.5 scaled')]:
    sns.distplot(np.log10(values[:, 2]), hist=False, kde=True, bins=nbins,
                 kde_kws={'shade': True, 'linewidth': 3}, label=label, ax=ax)
ax.legend()  # added: show the labels passed above (not in the original excerpt)
def _compare_numeric(col, _df1, _df2, img_dir, date_flag=False):
    """
    Compare two numeric type values

    Parameters
    ----------
    col: string
        name of column to check
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    img_dir: string
        root directory for the generated images
    date_flag: bool, default=False
        whether the column is date type

    Returns
    -------
    Dictionary containing the output result
    """

    # sampling
    df1_sample = _df1.copy()
    df2_sample = _df2.copy()

    stat_output = _simple_stats(col, df1_sample, df2_sample, 'numeric')

    nan_rate1, nan_rate2 = stat_output['nan_rate']
    if (nan_rate1 == 1) or (nan_rate2 == 1):
        if (nan_rate1 == 1) and (nan_rate2 == 1):
            error_msg = 'all nan in both table'
        elif nan_rate1 == 1:
            error_msg = 'all nan in table1'
        else:
            error_msg = 'all nan in table2'
        return {'column': col, 'error_msg': error_msg}

    # generate the output
    output = [
        {'feature': 'column', 'value': col, 'graph': 'Distribution'},
        {'feature': 'sample_value',
         'value': '\n'.join([str(v) for v in stat_output['sample_value']])},
        {'feature': 'nan_rate',
         'value': '\n'.join([str(round(v, 3)) for v in stat_output['nan_rate']])},
        {'feature': 'num_uni',
         'value': '%s/%s\n%s/%s' % (str(stat_output['num_uni'][0]),
                                    str(df1_sample.dropna().shape[0]),
                                    str(stat_output['num_uni'][1]),
                                    str(df2_sample.dropna().shape[0]))},
        {'feature': 'value_min',
         'value': '\n'.join([str(round(v, 3)) for v in stat_output['value_min']])},
        {'feature': 'value_mean',
         'value': '\n'.join([str(round(v, 3)) for v in stat_output['value_mean']])},
        {'feature': 'value_median',
         'value': '\n'.join([str(round(v, 3)) for v in stat_output['value_median']])},
        {'feature': 'value_max',
         'value': '\n'.join([str(round(v, 3)) for v in stat_output['value_max']])},
    ]

    both_value_max = np.max([abs(v) for v in stat_output['value_max']] +
                            [abs(v) for v in stat_output['value_min']])

    # get clean values
    df1_sample_dropna_values = df1_sample[col].dropna().values
    df2_sample_dropna_values = df2_sample[col].dropna().values

    if date_flag:
        dt1 = pd.to_datetime(df1_sample[col.replace('_numeric', '')], errors='coerce')
        dt2 = pd.to_datetime(df2_sample[col.replace('_numeric', '')], errors='coerce')
        date_min1, date_max1 = dt1.min(), dt1.max()
        date_min2, date_max2 = dt2.min(), dt2.max()

    # get distribution
    scale_flg = 0
    df1_draw_values = df1_sample_dropna_values
    df1_draw_value_4 = [stat_output['value_min'][0], stat_output['value_mean'][0],
                        stat_output['value_median'][0], stat_output['value_max'][0]]

    df2_draw_values = df2_sample_dropna_values
    df2_draw_value_4 = [stat_output['value_min'][1], stat_output['value_mean'][1],
                        stat_output['value_median'][1], stat_output['value_max'][1]]

    if both_value_max >= pow(10, 6):
        scale_flg = 1
        df1_draw_values, df1_draw_value_4 = _get_scale_draw_values(df1_draw_values, df1_draw_value_4)
        df2_draw_values, df2_draw_value_4 = _get_scale_draw_values(df2_draw_values, df2_draw_value_4)

    # calculate correlation between two distributions
    if np.max(stat_output['num_uni']) <= 100:
        vc1, vc2 = _value_counts_df(df1_draw_values), _value_counts_df(df2_draw_values)
        vc = vc1.merge(vc2, on='value', how='outer').fillna(0)
        obs1 = vc['count_x'].values * 1.0 / vc['count_x'].sum()
        obs2 = vc['count_y'].values * 1.0 / vc['count_y'].sum()
    else:
        both_min = np.min([np.min(df1_draw_values), np.min(df2_draw_values)])
        both_max = np.max([np.max(df1_draw_values), np.max(df2_draw_values)])
        # bug fix: np.histogram raises when both 'normed' and 'density' are
        # passed, and 'normed' was removed in recent numpy; keep only 'density'
        hist1 = np.histogram(df1_draw_values, bins=100, range=(both_min, both_max), density=False)
        hist2 = np.histogram(df2_draw_values, bins=100, range=(both_min, both_max), density=False)
        obs1 = hist1[0] / (np.sum(hist1[0]) * 1.0)
        obs2 = hist2[0] / (np.sum(hist2[0]) * 1.0)

    if len(obs1) == 1:
        corr = np.min([1. - nan_rate1, 1. - nan_rate2]) * 1.0 / np.max([1. - nan_rate1, 1. - nan_rate2])
    elif list(obs1) == list(obs2):
        corr = 1.0
    else:
        corr = spearmanr(obs1, obs2)[0]

    # draw and save distribution graph
    dpi = 72
    if date_flag:
        plt.figure(figsize=(635. / dpi, 635. / (9. / 8.) / dpi), dpi=dpi)
    else:
        plt.figure(figsize=(635. / dpi, 635. / (9. / 6.) / dpi), dpi=dpi)
    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if unique level is less than 10, draw countplot instead
    both_num_uni = np.max(stat_output['num_uni'])
    if both_num_uni <= 10:
        df1_temp = pd.DataFrame(df1_sample_dropna_values, columns=['value'])
        df1_temp['type'] = 'table1'
        df2_temp = pd.DataFrame(df2_sample_dropna_values, columns=['value'])
        df2_temp['type'] = 'table2'
        full_temp = pd.concat([df1_temp, df2_temp], axis=0)
        sns.countplot(full_temp['value'], hue=full_temp['type'],
                      palette=sns.color_palette([TABLE1_DARK, TABLE2_DARK]))
        if both_num_uni > 5:
            plt.xticks(rotation=90)
        plt.legend(loc=1)
    else:
        ax1 = sns.distplot(df1_draw_values, color=TABLE1_DARK, hist=False, label='table1')
        ax2 = sns.distplot(df2_draw_values, color=TABLE2_DARK, hist=False, label='table2')
        y_low_1, y_up_1 = ax1.get_ylim()
        y_low_2, y_up_2 = ax2.get_ylim()
        y_low, y_up = np.min([y_low_1, y_low_2]), np.max([y_up_1, y_up_2])
        plt.ylim((y_low, y_up))

        if date_flag:
            _draw_texts(text_values=[date_min1, date_max1], draw_value_4=df1_draw_value_4,
                        mark=1, y_low=y_low, y_up=y_up, date_flag=True)
            _draw_texts(text_values=[date_min2, date_max2], draw_value_4=df2_draw_value_4,
                        mark=2, y_low=y_low, y_up=y_up, date_flag=True)
        else:
            _draw_texts(text_values=[stat_output['value_min'][0], stat_output['value_mean'][0],
                                     stat_output['value_median'][0], stat_output['value_max'][0]],
                        draw_value_4=df1_draw_value_4, mark=1, y_low=y_low, y_up=y_up)
            _draw_texts(text_values=[stat_output['value_min'][1], stat_output['value_mean'][1],
                                     stat_output['value_median'][1], stat_output['value_max'][1]],
                        draw_value_4=df2_draw_value_4, mark=2, y_low=y_low, y_up=y_up)

    # save the graph; strip '/' from the column name so it is a valid file name
    graph_name = col
    if '/' in graph_name:
        graph_name = graph_name.replace('/', '')
    plt.savefig(os.path.join(img_dir, graph_name + '.png'), transparent=True, dpi=dpi)

    if date_flag:
        output.append({'feature': 'date_min', 'value': '%s\n%s' % (date_min1, date_min2)})
        output.append({'feature': 'date_max', 'value': '%s\n%s' % (date_max1, date_max2)})
    output.append({'feature': 'corr', 'value': round(corr, 3)})

    return {
        'column': col,
        'result_df': pd.DataFrame(output),
        'corr': {'column': col, 'corr': round(corr, 3)}
    }
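A hypothetical invocation of _compare_numeric (not from the original source): it assumes this module's imports (numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, os, scipy.stats.spearmanr) and its helpers (_simple_stats, _value_counts_df, _get_scale_draw_values, _draw_texts, TABLE1_DARK, TABLE2_DARK) are in scope; the two frames below are made-up data.

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
table1 = pd.DataFrame({'amount': rng.exponential(50, size=500)})
table2 = pd.DataFrame({'amount': rng.exponential(55, size=500)})

# Saves 'amount.png' under img_dir and returns the comparison summary.
result = _compare_numeric('amount', table1, table2, img_dir='.')
print(result['corr'])       # e.g. {'column': 'amount', 'corr': 0.9...}
print(result['result_df'])  # per-feature comparison table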