def plotTradeVsNews(tickName):
    """Plot log trading volume against news buzz and log news volume.

    The frame for ``tickName`` is loaded with ``getNewsNTradingVol`` from the
    ``resultsMKII`` directory.  For each predictor (``NewsBuz`` as-is and the
    natural log of ``NewsVol``) a hexbin joint plot and a KDE joint plot
    against the natural log of ``tradingVol`` are drawn, then all figures
    are shown.

    :param tickName: ticker identifier forwarded to ``getNewsNTradingVol``.
    """
    # pyplot is imported directly: the ``sns.plt`` alias is not available in
    # later seaborn releases.
    import matplotlib.pyplot as plt

    path2 = "resultsMKII"
    # Fix: the original passed the undefined name ``tick_Name`` here.
    frame = getNewsNTradingVol(tickName, path2)

    # Vectorised replacements for the element-by-element append loops.
    news_buzz = np.asarray(frame['NewsBuz'].values)
    log_trading_vol = np.log(frame['tradingVol'].values)
    log_news_vol = np.log(frame['NewsVol'].values)

    sns.set(style="ticks")
    # NOTE(review): ``stat_func`` and ``size`` are legacy seaborn keywords --
    # confirm the seaborn version this project pins before upgrading.
    predictors = (("News Buz", news_buzz), ("News Volume", log_news_vol))
    for xlabel, x in predictors:
        hex_grid = sns.jointplot(x, log_trading_vol, kind="hex",
                                 stat_func=kendalltau, color="#4CB391")
        hex_grid.set_axis_labels(xlabel=xlabel, ylabel="Trading Volume")
        kde_grid = sns.jointplot(x, log_trading_vol, kind="kde", size=7, space=0)
        kde_grid.set_axis_labels(xlabel=xlabel, ylabel="Trading Volume")

    plt.show()
def doplot(self, name):
    """Load a pickled trace and show corner, scatter and KDE plots.

    The loaded object is stored on ``self.trace``.  Three figures are shown
    in turn: a corner plot of the ``muCB``/``sdCB`` columns, a scatter plot
    of a random subsample of the ``CB`` draws, and a KDE joint plot of the
    same subsample.

    :param name: path of the pickle file holding the trace.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trace
    # files that come from a trusted source.
    with open(name, "rb") as handle:  # the original never closed the file
        self.trace = pickle.load(handle)

    mu = self.trace['muCB']
    sd = self.trace['sdCB']
    var = np.vstack([mu[:, 0], mu[:, 1], sd[:, 0], sd[:, 1]]).T
    # Raw strings: '\m' and '\s' are invalid escape sequences otherwise.
    corner.corner(var,
                  labels=[r'$\mu_C$', r'$\mu_B$', r'$\sigma_C$', r'$\sigma_B$'],
                  show_titles=True)
    pl.show()

    # Take ONE random subset of rows and apply it to both components.
    # Shuffling C and B independently (as before) paired values from
    # different rows, and shuffled the trace arrays in place.
    cb = np.asarray(self.trace['CB'])
    rows = np.random.permutation(cb.shape[0])[:200]
    C_slice = cb[rows, :, 0].flatten()
    B_slice = cb[rows, :, 1].flatten()

    # First option: heavily transparent scatter
    pl.plot(B_slice, C_slice, '.', alpha=0.002)
    pl.show()

    # KDE joint plot
    sns.jointplot(C_slice, B_slice, kind='kde')
    pl.show()
def seaborn_join():
    """Show a hexbin joint plot of 2000 correlated bivariate-normal draws."""
    mean = [0, 0]
    covariance = [[5, 2], [2, 2]]
    samples = np.random.multivariate_normal(mean, covariance, size=2000)
    frame = pd.DataFrame(samples, columns=['x', 'y'])

    with sns.axes_style('white'):
        sns.jointplot("x", "y", frame, kind='hex')
    plt.show()
def histogram(self, x=None, y=None, l=None, t=None, **kwargs):
    """Shortcut for several histogram-style plots of the particle table.

    The table comes from ``self.to_dataframe(l=l, t=t, latex=True)``.

    - neither ``x`` nor ``y``: a pair plot over every column of the table,
      with the x tick labels rotated by 90 degrees;
    - only ``x``: a 1-D distribution plot of that column;
    - both ``x`` and ``y``: a joint plot of the two columns.

    Passing ``y`` without ``x`` draws nothing.  Extra keyword arguments are
    forwarded to the seaborn call.  Always returns ``None``.
    """
    table = self.to_dataframe(l=l, t=t, latex=True)

    if x is None:
        if y is None:
            grid = sns.pairplot(table, **kwargs)
            for axis in grid.axes.flat:
                plt.setp(axis.xaxis.get_majorticklabels(), rotation=90)
        return

    x_column = self._reformat_label(x)
    if y is None:
        sns.distplot(table[x_column], **kwargs)
        plt.xlabel(x_column)
        return

    y_column = self._reformat_label(y)
    sns.jointplot(x=x_column, y=y_column, data=table, **kwargs)
def make_scatter_plot(frame, name, **kwargs):
    """Make joint scatter plots of column ``name`` against the other columns.

    Columns ``hypo``, ``llh`` and ``mctrue`` are skipped.  ``deltam31``
    values are multiplied by 100 and any y column whose name contains
    ``theta`` is converted from radians to degrees before plotting.

    :param frame: DataFrame holding the columns to plot; it is not modified.
    :param name: column used for the x axis of every plot.
    :param kwargs: forwarded to ``sns.jointplot``.
    :return: list with one matplotlib figure per plotted column.
    """
    column_x = frame[name]
    if name == 'deltam31':
        # Build a new Series: the original ``*=`` rescaled the caller's
        # frame in place, compounding on every call.
        column_x = column_x * 100.0

    exclude = {'hypo', 'llh', 'mctrue'}
    params = set(frame.columns).difference(exclude)

    figs = []
    # Plot correlation scatter plot for all other systematics
    for p in params:
        if p == name:
            continue
        column_y = frame[p]
        if p == 'deltam31':
            column_y = column_y * 100.0
        if 'theta' in p:
            column_y = np.rad2deg(column_y)
        with sns.axes_style("whitegrid"):
            sns.jointplot(column_x, column_y, size=8, color='b', **kwargs)
        plt.tight_layout()
        figs.append(plt.gcf())
    return figs
def plotBonusvsSalary(df):
    """Draw a bonus-vs-salary joint plot, save it as a PNG and show it.

    :param df: DataFrame with ``bonus`` and ``salary`` columns.
    """
    width_in, height_in = 18.5, 10.5

    sns.jointplot(x="bonus", y="salary", data=df)
    figure = plt.gcf()
    figure.set_size_inches(width_in, height_in)
    figure.savefig('bonusVSsalary.png', dpi=100)
    plt.show()
def plot_seaborn(self):
    """Show a KDE joint plot of two columns of ``movement.csv``.

    Only rows whose first column equals 0 and whose selected x/y values lie
    strictly inside ``(-limit, limit)`` are plotted.
    """
    # pyplot is imported directly: the ``sns.plt`` alias is not available in
    # later seaborn releases.
    import matplotlib.pyplot as plt

    # https://stanford.edu/~mwaskom/software/seaborn/tutorial/distributions.html
    # ``.values`` replaces ``.as_matrix()``, which newer pandas no longer has.
    data = pd.read_csv('movement.csv').values

    # candidate column pairs: 1/2 3/4 5/6 7/8
    x_column = 3
    y_column = 4
    limit = 100

    keep = (
        (data[:, 0] == 0)
        & (data[:, x_column] > -limit)
        & (data[:, x_column] < limit)
        & (data[:, y_column] > -limit)
        & (data[:, y_column] < limit)
    )
    data = data[keep]
    x = data[:, x_column]
    y = data[:, y_column]

    with sns.axes_style('white'):
        # other kinds: scatter, reg, resid, hex
        sns.jointplot(x=x, y=y, kind='kde')
    plt.show()
def skill_vs_speed(prediction_mode, time_model, data):
    """Draw a KDE joint plot of student skill against student speed.

    A ``TimeCombiner`` of both models is evaluated first (with
    ``force_run=True``); its report is discarded.  The plotted values are
    the ``get_skills`` results of each model for ``data.get_students()``.
    """
    combined = TimeCombiner(prediction_mode, time_model)
    Evaluator(data, combined).get_report(force_run=True)

    students = data.get_students()
    skills = prediction_mode.get_skills(students)
    fastness = time_model.get_skills(students)

    grid = sns.jointplot(pd.Series(skills), pd.Series(fastness),
                         kind='kde', space=0)
    grid.set_axis_labels("skill", "speed")
def show_graph(data):
    """Show a time/height joint plot of the given data.

    Each item of ``data`` contributes one row: its first element as the time
    and ``height(item[1:])`` as the height.  Rows are sorted by time.
    """
    rows = [[point[0], height(point[1:])] for point in data]
    rows.sort(key=lambda row: row[0])

    df = pd.DataFrame(rows)
    df.columns = ["time", "height"]
    seaborn.jointplot('time', 'height', data=df)
    plt.show()
def sbratio(sampler):
    """Draw a KDE joint plot of ``theta`` against ``phi`` from a sampler chain.

    :param sampler: object whose ``flatchain`` has six columns, labelled here
        ``theta, phi, scatter, badfrac, badsig, badmn``.
    """
    chain = sampler.flatchain
    # Columns 2 and 4 are replaced by their absolute values.
    # NOTE(review): this writes into the array returned by ``flatchain`` --
    # confirm whether callers rely on (or are hurt by) that in-place change.
    for column in (2, 4):
        chain[:, column] = np.abs(chain[:, column])

    dd = pd.DataFrame(data=chain,
                      columns=['theta', 'phi', 'scatter',
                               'badfrac', 'badsig', 'badmn'])
    with sns.axes_style("white"):
        # Fix: plot the frame built above; the original passed the
        # undefined name ``data``.
        sns.jointplot("theta", "phi", dd, kind="kde")
def plot(self, samples, columns=None):
    """Draw a joint plot of two-column ``samples``.

    :param samples: data accepted by ``pd.DataFrame`` with two columns.
    :param columns: optional sequence whose first two items name the x and y
        columns; defaults to ``"x"`` and ``"y"``.
    """
    if columns is None:
        x_name, y_name = "x", "y"
    else:
        x_name, y_name = columns[0], columns[1]

    df = pd.DataFrame(samples, columns=[x_name, y_name])
    sns.jointplot(x=x_name, y=y_name, data=df)
def plot_scatter_hist_sns(x, y):
    """Save a hexbin joint plot of ``x`` against ``y`` to ``plot4.pdf``.

    :param x: values for the horizontal axis (labelled phi).
    :param y: values for the vertical axis (labelled theta).
    """
    # pyplot is imported directly: the ``sns.plt`` alias is not available in
    # later seaborn releases.
    import matplotlib.pyplot as plt

    sns.set(style="ticks")
    grid = sns.jointplot(np.array(x), np.array(y), kind="hex", size=4,
                         stat_func=None)
    # Raw strings: "\p" is an invalid escape sequence in a normal literal.
    grid.set_axis_labels(r"$\phi$", r"$\theta$")

    with PdfPages('plot4.pdf') as pdf:
        pdf.savefig()  # saves the current (joint plot) figure
    plt.close()
def plot(data, total, title, width=800.0, unit='', dosort=True,
         target=None, target2=None):
    """A HTML bar plot given a dictionary and max value.

    With more than 30 items and a ``target`` series, a seaborn figure is
    rendered instead and returned as an inline base64 PNG ``<img>``.
    Otherwise an HTML bar chart is built, one bar per key of ``data``.

    :param data: mapping of key -> numeric value.
    :param total: value corresponding to the full ``width``.
    :param title: heading of the plot; also the y column of the figure.
    :param width: width in pixels of a bar whose value equals ``total``.
    :param unit: suffix printed after each value.
    :param dosort: sort the bars by descending value.
    :param target: optional pandas Series indexed by the keys of ``data``;
        the x variable of the figure, or the bar order of the HTML chart.
    :param target2: optional second Series, used as hue in the figure.
    :return: an HTML fragment as a string.
    """
    def prefix(key):
        # Bars are coloured by the part before '_' or by the first character.
        return key.split('_')[0] if '_' in key else key[0]

    if len(data) > 30 and target is not None:
        df = pandas.DataFrame(index=data)
        df[title] = pandas.Series(data, index=df.index)
        # ``reindex`` replaces ``.ix``, which newer pandas no longer has.
        df[target.name] = target.reindex(df.index)
        if target2 is not None:
            df[target2.name] = target2.reindex(df.index)
        # ``dtype == numpy.number`` is not a reliable numeric test; check the
        # dtype kind instead (int, unsigned, float, complex).
        if getattr(target.dtype, 'kind', 'O') in 'iufc':
            if target2 is None:
                seaborn.jointplot(target.name, title, data=df, kind='reg')
            else:
                seaborn.lmplot(target.name, title, data=df,
                               hue=target2.name)
        else:  # X-axis is categorical
            df.sort_values(by=target.name, inplace=True)
            if target2 is None:
                seaborn.barplot(target.name, title, data=df)
            else:
                seaborn.barplot(target.name, title, data=df,
                                hue=target2.name)
            # NOTE(review): assumed to belong to the categorical branch
            # (rotates the category labels) -- confirm against the
            # original layout.
            fig = plt.gcf()
            fig.autofmt_xdate()
        # Convert to PNG
        figfile = io.BytesIO()
        plt.savefig(figfile, format='png')
        result = '<div><img src="data:image/png;base64, %s"/></div>' % (
            base64.b64encode(figfile.getvalue()).decode('utf8'))
        plt.clf()
        return result

    # NOTE(review): ``title`` is inserted unescaped while keys are escaped --
    # confirm that titles never carry user-controlled markup.
    result = ['<div class=barplot>',
              ('<text style="font-family: sans-serif; font-size: 16px; ">'
               '%s</text>' % title)]
    if target is not None:
        data = OrderedDict([(key, data[key])
                            for key in target.sort_values().index
                            if key in data])
    prefixes = {prefix(key) for key in data}
    color = {}
    if len(prefixes) <= 5:
        color.update(zip(prefixes, range(1, 6)))
    keys = list(data)
    if dosort:
        keys.sort(key=data.get, reverse=True)
    for key in keys:
        value = data[key]
        # Zero-valued entries get width 0 and CSS class b0.
        bar_width = int(round(width * value / total)) if value else 0
        css_class = color.get(prefix(key), 1) if value else 0
        result.append('<br><div style="width:%dpx;" class=b%d></div>'
                      '<span>%s: %g %s</span>' % (
                          bar_width, css_class, htmlescape(key), value, unit))
    result.append('</div>\n')
    return '\n'.join(result)
def make_JointPlot(plot, region, data, backgrounds):
    """Draw and save a seaborn joint plot for the sample named by ``plot``.

    The sample is ``data`` if its name matches ``plot.sample``, otherwise
    the single background with that name.  The two plotted branches are read
    from the sample's tree with the region cut, the event weight and the
    sample scale factor applied.  The figure is written to
    ``<plot.name>.eps`` and moved to ``indir + "/plots/" + outdir``.

    Exits the process with a non-zero status if the sample cannot be
    resolved uniquely or ``plot.cmap`` is not supported.
    """
    if data.name == plot.sample:
        candidates = [data]
    else:
        candidates = [bk for bk in backgrounds if bk.name == plot.sample]
    if len(candidates) != 1:
        msg('ERROR make_JointPlot received %d samples to plot for plot with name %s'
            % (len(candidates), plot.name))
        # Fix: a bare sys.exit() reported success (status 0) on error.
        sys.exit(1)

    # turn this tree into an array :)
    sample_to_plot = candidates[0]
    selection_ = ('(' + region.tcut + ') * eventweight * '
                  + str(sample_to_plot.scale_factor))
    tree_array = tree2rec(sample_to_plot.tree,
                          branches=[plot.x_var, plot.y_var],
                          selection=selection_)
    tree_array.dtype.names = (plot.x_var, plot.y_var)
    x_arr = tree_array[plot.x_var]
    y_arr = tree_array[plot.y_var]

    sns.set(style="white")

    # Statistic annotated on the plot: Kendall's tau on request, Pearson's r
    # when nothing is configured, none for any other setting.
    stat_func_ = None
    if plot.stat_func == "kendalltau":
        from scipy.stats import kendalltau
        stat_func_ = kendalltau
    elif plot.stat_func is None:
        from scipy.stats import pearsonr
        stat_func_ = pearsonr

    # Keyword arguments shared by every colour-map variant.
    common = dict(kind=plot.kind, stat_func=stat_func_,
                  ylim=[plot.y_range_min, plot.y_range_max],
                  xlim=[plot.x_range_min, plot.x_range_max])

    if plot.cmap is None or plot.cmap == "default":
        j_plot_grid = sns.jointplot(x_arr, y_arr, color=plot.color,
                                    linewidth=plot.line_width, **common)
    elif plot.cmap == "cubehelix":
        cmap_ = sns.cubehelix_palette(as_cmap=True, dark=0, light=1,
                                      reverse=True)
        j_plot_grid = sns.jointplot(
            x_arr, y_arr, linewidth=plot.line_width,
            joint_kws={"cmap": cmap_, "n_levels": plot.n_levels,
                       "shade": True},
            **common)
    elif plot.cmap == "blues":
        j_plot_grid = sns.jointplot(
            x_arr, y_arr, linewidth=1.0,
            joint_kws={"cmap": "Blues", "n_levels": plot.n_levels,
                       "shade": True, "shade_lowest": False},
            **common)
    else:
        msg("cmap attribute of joint plot not yet added")
        sys.exit(1)

    j_plot_grid.fig.suptitle(plot.title)
    j_plot_grid.fig.subplots_adjust(top=0.935)  # leave room for the title
    j_plot_grid.set_axis_labels(plot.x_label, plot.y_label)

    # save the plot to file
    outname = plot.name + ".eps"
    j_plot_grid.savefig(outname)
    out = indir + "/plots/" + outdir
    utils.mv_file_to_dir(outname, out, True)
    fullname = out + "/" + outname
    msg("%s saved to : %s" % (outname, os.path.abspath(fullname)))
def plot_approx_posterior(cov, means, index):
    """Draw a KDE joint plot of samples from an approximate posterior.

    ``means[index]`` with ``cov`` is combined with a zero-mean, identity
    covariance Gaussian through ``util.product_gaussians``; 200 samples of
    the result are plotted with both axes limited to three standard
    deviations around the mean.

    :param cov: 2x2 covariance shared by all means.
    :param means: indexable collection of 2-D mean vectors.
    :param index: which mean to use.
    """
    mean = means[index]
    # Function-call form prints the same on Python 2 and runs on Python 3.
    print(mean.shape)

    mean, cov = util.product_gaussians(mean, np.zeros(2), cov, np.identity(2))
    data = np.random.multivariate_normal(mean, cov, 200)
    df = pd.DataFrame(data, columns=["x", "y"])

    spread_x = 3 * np.sqrt(cov[0][0])
    spread_y = 3 * np.sqrt(cov[1][1])
    xlim = (mean[0] - spread_x, mean[0] + spread_x)
    ylim = (mean[1] - spread_y, mean[1] + spread_y)

    sns.jointplot(x="x", y="y", data=df, kind="kde", stat_func=None,
                  xlim=xlim, ylim=ylim)
def plot_var(times, pitches, ends, var_n):
    """Show a time/height joint plot of variation ``var_n``.

    Points are kept when ``ends[var_n] < time <= ends[var_n + 1]``.

    :param times: sequence of time stamps.
    :param pitches: sequence of pitch values, parallel to ``times``.
    :param ends: sequence of segment boundaries indexed by variation.
    :param var_n: variation index, 0 to 30 (0: Aria).
    """
    lower, upper = ends[var_n], ends[var_n + 1]
    # A list rather than ``filter(...)``: on Python 3 ``filter`` is a lazy
    # iterator, which not every pandas release accepts as DataFrame input.
    n_data = [(time, pitch) for time, pitch in zip(times, pitches)
              if lower < time <= upper]

    # seaborn
    df = pd.DataFrame(n_data)
    df.columns = ["time", "height"]
    seaborn.jointplot('time', 'height', data=df)
    plt.show()
def show(self):
    """Show a joint plot of ``self._lams`` against the flattened ``self._pr``.

    The title carries the sum of the element-wise products of both, printed
    as the mean.
    """
    # pyplot is imported directly: the ``sns.plt`` alias is not available in
    # later seaborn releases.
    import matplotlib.pyplot as plt

    Y = np.reshape(self._pr, (1, -1)).tolist()[0]
    X = self._lams
    df = pd.DataFrame({'x': X, 'y': Y})
    sns.jointplot(x='x', y='y', data=df)

    mean = (np.asarray(X) * np.asarray(Y)).sum()
    plt.title('mean %f' % mean)
    plt.show()
def hist_2d(distribution, nsamples, **kwargs):
    """Plot a 2-D KDE joint plot of MarkovJumpHMC samples of a distribution.

    :param distribution: callable accepting ``ndims`` and returning an object
        with ``Xinit``, ``E`` and ``dEdX``.
    :param nsamples: number of samples requested from the sampler.
    :param kwargs: forwarded to ``MarkovJumpHMC``.
    """
    target = distribution(ndims=2)
    sampler = MarkovJumpHMC(target.Xinit, target.E, target.dEdX, **kwargs)
    draws = sampler.sample(nsamples)
    first_dim, second_dim = draws[0], draws[1]

    with sns.axes_style("white"):
        sns.jointplot(first_dim, second_dim, kind="kde", stat_func=None)
def pairwise_joint_plots(df, cols):
    """Save a hexbin joint plot for every pair of the given columns.

    Pairs are taken in sorted order (``colA < colB``) and each figure is
    written to ``joint_<colA>_<colB>.png`` in the working directory.

    :param df: DataFrame holding the columns.
    :param cols: iterable of column names.
    """
    logging.debug('Plotting pairwise joint distributions')
    cols = sorted(cols)
    for colA, colB in [(a, b) for a in cols for b in cols if a < b]:
        filename = 'joint_{}_{}.png'.format(colA, colB)
        logging.debug('joint plot: %s', filename)
        # jointplot creates its own figure; the extra ``plt.figure()`` the
        # original opened first was never closed and leaked once per pair.
        grid = sns.jointplot(df[colA], df[colB], kind='hex')
        grid.fig.savefig(filename)
        plt.close(grid.fig)
def AnalyzeAllElectrodes(path='/Users/ryszardcetnarski/Desktop/Nencki/Badanie_NFB/Dane/wszystkie_elektrody_jacek.csv'):
    """Plot and model before/after band values for all electrodes (from Jacek).

    For each band an ``<band>_po`` ("after") column is added as
    ``<band>_przed`` ("before") plus ``<band>_roznica`` ("difference").
    Two regression joint plots are drawn (before vs after and before vs
    difference), and ``GeneralModel`` is called on the before/after columns
    with a single ``mixed_conditions`` condition for every row.

    :param path: CSV file to read; defaults to the original hard-coded
        location so existing callers are unaffected.
    :return: the DataFrame including the added ``_po`` columns.
    """
    db = pd.read_csv(path)
    for band in ['theta', 'alpha', 'smr', 'beta1', 'beta2']:
        before = band + '_przed'
        after = band + '_po'
        difference = band + '_roznica'

        db[after] = db[before] + db[difference]

        sns.jointplot(before, after, data=db, kind="reg")
        sns.jointplot(before, difference, data=db, kind="reg")

        conditions_str = ['mixed_conditions'] * len(db)
        conditions = [0] * len(db)
        GeneralModel(db[before], db[after], band, conditions, conditions_str)
    return db
def plotCorrelation(frame):
    """Show joint plots of ``bedrooms`` and ``size`` against ``price``.

    Blocks until the figures are closed.

    :param frame: DataFrame with ``bedrooms``, ``size`` and ``price`` columns.
    """
    # One figure per explanatory variable, each against the price.
    for feature in ("bedrooms", "size"):
        sns.jointplot(feature, "price", frame, size=8)
        plt.tight_layout()

    print("PAUSED...close figures to continue...")
    plt.show()
def gauss_2d(nsamples=1000):
    """Plot samples of a 1-D test Gaussian from two samplers against each other.

    The first sampled dimension of ``Control`` goes on the x axis and that
    of ``ContinuousTimeHMC`` on the y axis of a hexbin joint plot.

    :param nsamples: number of samples requested from each sampler.
    """
    gaussian = misc.distributions.TestGaussian(ndims=1)
    sampler_args = (gaussian.Xinit, gaussian.E, gaussian.dEdX)
    control = Control(*sampler_args)
    experimental = ContinuousTimeHMC(*sampler_args)

    with sns.axes_style("white"):
        control_draws = control.sample(nsamples)[0]
        experimental_draws = experimental.sample(nsamples)[0]
        sns.jointplot(control_draws, experimental_draws,
                      kind="hex", stat_func=None)
def drawJointPlot(self, se1, se2):
    """
    Draw a linear-regression joint plot showing how series 1 and series 2
    are correlated.

    :param self: the instance itself
    :param se1: series 1
    :param se2: series 2
    """
    line_color = self.linecolors[0]
    sns.jointplot(se1, se2, kind='reg', color=line_color)
    plt.legend()
    plt.show()
def fixed_effects(data, labels):
    """Report and plot the Pearson correlation between two columns.

    Prints the coefficient and two-tailed p value, then shows a hexbin joint
    plot and a regression joint plot that reuses the hexbin axis limits.

    :param data: DataFrame (or mapping) holding both columns.
    :param labels: sequence whose first two items are the column names.
    :return: ``(corcoeff, p_val)`` as returned by ``pearsonr``.
    """
    x_label, y_label = labels[0], labels[1]
    corcoeff, p_val = pearsonr(data[x_label], data[y_label])
    # Function-call form prints the same on Python 2 and runs on Python 3.
    print("Pearson correlation between %s and %s across all donors is %g (two tailed p value = %g)"
          % (x_label, y_label, corcoeff, p_val))

    grid = sns.jointplot(x_label, y_label, data, kind="hex")
    sns.jointplot(x_label, y_label, data, kind="reg",
                  xlim=grid.ax_joint.get_xlim(),
                  ylim=grid.ax_joint.get_ylim())
    plt.show()
    return corcoeff, p_val
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    # The docstring above doubles as the command-line usage text.
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int",
                 help="Max contig size")
    # Fix: the help text was copy-pasted from --maxsize.
    p.add_option("--maxcov", default=100, type="int",
                 help="Max contig coverage")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind", default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    maxsize, maxcov = opts.maxsize, opts.maxcov

    # Keep contigs within both limits; contigs missing from covfile count
    # as coverage 0.
    data = []
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize or c > maxcov:
            continue
        data.append((size, c))

    if not data:
        # ``zip(*data)`` would fail below with an opaque unpacking error.
        logging.error("No contig passes --maxsize/--maxcov, nothing to plot")
        sys.exit(1)

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab, ylab, kind=opts.kind, data=df,
                  xlim=(0, maxsize), ylim=(0, maxcov),
                  stat_func=None, edgecolor="w", color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def main():
    """Explore the movie metadata CSV and plot profit relationships.

    Prints summaries of the raw and NaN-free data, adds a ``profit`` column
    (percentage of gross left after budget) to the NaN-free data, prints its
    correlation matrix and shows a correlation heat map plus two joint plots
    of profit against release year and IMDB score.
    """
    # All prints use the function-call form so the code runs on Python 3
    # and prints the same on Python 2.
    movie_raw_data = pd.read_csv('../input/movie_metadata.csv')
    print(movie_raw_data.head(3))
    print(movie_raw_data.isnull().sum())
    print(movie_raw_data.shape)

    # Explicit copy: a column is assigned into this frame further down.
    movie_raw_data_dropna = movie_raw_data.dropna().copy()
    print(movie_raw_data_dropna.shape)
    print(movie_raw_data.dtypes)

    movie_filterd_imdbscore_first = movie_raw_data.loc[
        movie_raw_data['imdb_score'] > 5]
    movie_filterd_imdbscore_from_raw = movie_raw_data.loc[
        movie_raw_data['imdb_score'] < 8]
    print(movie_filterd_imdbscore_first.shape)
    # Mask taken from the frame being filtered rather than the raw frame.
    movie_filterd_imdbscore_second = movie_filterd_imdbscore_first.loc[
        movie_filterd_imdbscore_first['imdb_score'] < 8]
    print(movie_filterd_imdbscore_second.shape)
    print(movie_filterd_imdbscore_from_raw.shape)
    print('*********************************')
    print(movie_raw_data_dropna.head(3))

    gross = movie_raw_data_dropna['gross'].values
    budget = movie_raw_data_dropna['budget'].values
    profit = ((gross - budget) / gross) * 100
    print(profit)
    movie_raw_data_dropna.loc[:, 'profit'] = pd.Series(
        profit, movie_raw_data_dropna.index)
    print(movie_raw_data_dropna.shape)
    print(movie_raw_data_dropna.head(3))

    # NOTE(review): recent pandas may need ``numeric_only=True`` here when
    # non-numeric columns are present -- confirm the targeted version.
    corr = movie_raw_data_dropna.corr()
    print(corr)

    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, cmap=cmap, vmax=1, square=True, linewidths=.5,
                cbar_kws={"shrink": .5}, ax=ax)

    g = sns.jointplot(x="title_year", y="profit", kind='scatter', size=10,
                      ylim=[0, 110], xlim=[1980, 2020],
                      data=movie_raw_data_dropna)
    h = sns.jointplot(x="imdb_score", y="profit", kind='reg', size=10,
                      ylim=[0, 110], data=movie_raw_data_dropna)
    plt.show()
def sample_54_1():
    """
    5.4 Visualising data with seaborn.

    Shows, one after another, a distribution plot of ``p_change``, a box
    plot of ``p_change`` per ``date_week`` and a joint plot of ``high``
    against ``low`` from the module-level ``tsla_df``.

    :return: None
    """
    frame = tsla_df

    sns.distplot(frame['p_change'], bins=80)
    plt.show()

    sns.boxplot(x='date_week', y='p_change', data=frame)
    plt.show()

    sns.jointplot(frame['high'], frame['low'])
    plt.show()
def occupationAnalysis():
    """Draw a KDE joint plot of the grid cells visited by the three robots.

    Every line of the green, orange and blue position logs is split on
    commas; its first field is printed and fields 1 and 2 are scaled by the
    module-level ``nbcols`` / ``nbrows`` and truncated to integer cell
    coordinates.

    The original passed a 2-D count array as ``data`` together with the
    column names ``"x"``/``"y"``, which ``jointplot`` cannot resolve; the
    cell coordinates are now passed directly.
    """
    log_paths = (
        "./csv/windfield_game1_green_withindex_position.csv",
        "./csv/windfield_game1_orange_withindex_position.csv",
        "./csv/windfield_game1_blue_withindex_position.csv",
    )

    cell_x = []
    cell_y = []
    for log_path in log_paths:
        for robotp in readLog(log_path):
            fields = robotp.split(',')
            print(fields[0])
            cell_x.append(int(float(fields[1]) * nbcols))
            cell_y.append(int(float(fields[2]) * nbrows))

    # NOTE(review): integer cell coordinates are plotted to match the
    # original binning -- confirm whether the raw positions were intended.
    sns.jointplot(x=np.array(cell_x), y=np.array(cell_y), kind="kde")
def performance_vs_coverage(db, output=None, max_values=250, **kwargs):
    """Plot performance against coverage for every row of ``param_stats``.

    :param db: database handle exposing ``execute``.
    :param output: forwarded to ``viz.finalise``.
    :param max_values: accepted but not used by this function.
    :param kwargs: forwarded to ``viz.finalise``.
    """
    query = (
        "SELECT "
        "  performance AS performance, "
        "  coverage "
        "FROM param_stats"
    )
    rows = list(db.execute(query))

    # NOTE(review): the ``coverage`` column is labelled "Legality" on the
    # plot -- confirm that this naming is intended.
    frame = pandas.DataFrame(rows, columns=("Performance", "Legality"))
    sns.jointplot("Legality", "Performance", data=frame,
                  xlim=(0, 1), ylim=(0, 1))
    viz.finalise(output, **kwargs)
def show(self):
    """Print the most probable hypotheses and plot probability by index.

    The 20 hypotheses with the largest values in the first row of
    ``self.pr`` are printed in ascending order, followed by the single best
    one.  A joint plot of hypothesis index against probability is then
    shown.
    """
    # pyplot is imported directly: the ``sns.plt`` alias is not available in
    # later seaborn releases.
    import matplotlib.pyplot as plt

    top = np.argsort(self.pr)[0][-20:]
    for k in top:
        # %-formatting keeps the Python 2 "a b" output and works on Python 3.
        print('%s %s' % (self.hypos[k], self.pr[0, k]))
    best = np.argmax(self.pr)
    print('max %s pr= %s' % (self.hypos[best], self.pr[0, best]))

    X = list(range(len(self.hypos)))
    Y = self.pr.tolist()[0]
    df = pd.DataFrame({'x': X, 'y': Y})
    sns.jointplot(x='x', y='y', data=df)
    plt.show()
# In[25]:

# Compare men and women: how many of each have target zero and how many do not.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(df['target'], hue=df['sex'], ax=ax)
plt.xlabel('target')
plt.ylabel('sex')
plt.xticks(rotation=50)
plt.show()  # fix: the original referenced plt.show without calling it

# In[26]:

# NOTE(review): 'trestbps' is listed twice and 'target' is plotted against
# itself -- confirm the intended column list.
nums = ['age', 'sex', 'trestbps', 'chol', 'trestbps', 'target']
for i in nums:
    # jointplot creates its own figure, so no separate plt.figure() is
    # opened (the original one stayed empty).  Labels and the grid are set
    # on the joint axes explicitly instead of on pyplot's current axes.
    joint = sns.jointplot(x=df[i], y=df['target'], kind='reg')
    joint.set_axis_labels(i, 'response')
    joint.ax_joint.grid()
    plt.show()

# In[8]:

plt.bar(df['target'], df['age'], alpha=.5, width=0.8, label='chart')
plt.show()

# In[62]:

sns.catplot('sex', 'target', data=df, kind='box', hue='fbs')

# In[53]:
# Summary statistics of the house cost.
for template, value in (
        ("Minimum Cost: ${}", _min_cost),
        ("Maximum Cost: ${}", _max_cost),
        ("Mean Cost: ${}", _mean_cost),
        ("Median Cost ${}", _median_cost),
        ("Standard deviation of Cost: ${}", _stddev_cost)):
    print(template.format(value))

# Number of houses per bedroom count.
_housedata['bedrooms'].value_counts().plot(kind='bar')
plt.title('Total number of Bedroom')
plt.xlabel('Bedrooms')
plt.ylabel('Count of Bedrooms')
plt.show()

# Latitude against longitude.  jointplot creates its own figure, so the
# separate plt.figure() call (which stayed empty) is gone, and the labels
# are set on the joint axes instead of on pyplot's current axes.
location_grid = sns.jointplot(x=_housedata.lat.values,
                              y=_housedata.long.values, size=10)
location_grid.set_axis_labels('Latitude of House', 'Longitude of House',
                              fontsize=12)
plt.show()

plt.scatter(_housedata.price, _housedata.sqft_living)
plt.title("Price of House vs Square Feet of House")
plt.show()

plt.scatter(_housedata.price, _housedata.long)
plt.title("Price of House vs Location of the house area")
plt.show()

plt.scatter(_housedata.price, _housedata.lat)
names = ['variance','skewness','curtosis','entropy','class']) data.head(3) data.describe() data.shape data.isna().any() data.dtypes data['class'].unique() sns.countplot(x='class', data= data) sns.violinplot( y=data['curtosis']) sns.violinplot( y=data['entropy']) sns.violinplot( y=data['variance']) sns.violinplot( y=data['skewness']) p1=sns.kdeplot(data['curtosis'], shade=True, color="r") p1=sns.kdeplot(data['variance'], shade=True, color="b") sns.jointplot(x=data['curtosis'], y=data['entropy'], kind='hex', linewidth = 2) sns.jointplot(x=data['skewness'], y=data['variance'], kind='hex', color = 'skyblue', linewidth = 2) sns.jointplot(x=data['curtosis'], y=data['variance'], kind='hex', linewidth = 2) X = data[['variance', 'skewness' ,'curtosis', 'entropy']] y = data[['class']] from sklearn.model_selection import train_test_split # Support Vector Machine X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42) from sklearn.svm import SVC SVC() svc = SVC() # Creating a dictionary of parameters parameters = {
def kde(self, n=0):
    """Draw a KDE joint plot of the two coordinates stored at index ``n``.

    :param n: index along the second axis of ``self.samples``.
    """
    selected = self.samples[:, n]
    sns.jointplot(x=selected[:, 0], y=selected[:, 1], kind="kde")
def generate_plots(plot_type=""):
    r"""
    Generate plots studying the distribution of graphs in different splits
    with respect to the graph size (|V| and |E|).

    Per-split and overall min/mean/max statistics are printed and the
    figures are written into the ``plots`` directory, which is created when
    missing.

    :param plot_type: type of plot in
        {"histograms", "marginal_E", "marginal_V", "joint"}
    """
    import os

    assert plot_type in {"histograms", "marginal_E", "marginal_V", "joint"}

    # Every branch saves below ``plots/``; saving fails when it is absent.
    os.makedirs("plots", exist_ok=True)

    split_names = ["test", "valid", "train"]
    tot_n_nodes = []
    tot_n_edges = []

    for split_name in split_names:
        d = ToulouseRoadNetworkDataset(split=split_name, step=0.001,
                                       max_prev_node=8)
        dataloader = DataLoader(d, batch_size=1, shuffle=False,
                                collate_fn=custom_collate_fn)
        n_nodes = []
        n_edges = []
        for datapoint in dataloader:
            # Only the third and sixth items of each datapoint are used.
            _, _, this_y_adj, _, _, this_seq_len, _ = datapoint
            n_edges.append(int(this_y_adj.view(-1).sum().item()))
            n_nodes.append(int(this_seq_len[0] - 2))

        tot_n_edges += n_edges
        tot_n_nodes += n_nodes
        n_nodes = np.array(n_nodes)
        n_edges = np.array(n_edges)
        print(f"{split_name} min/mean/max len nodes",
              np.min(n_nodes), np.mean(n_nodes), np.max(n_nodes))
        print(f"{split_name} min/mean/max len edges",
              np.min(n_edges), np.mean(n_edges), np.max(n_edges))

        if plot_type == "histograms":
            # one bin per integer value between min and max
            plt.hist(n_nodes, bins=np.max(n_nodes) - np.min(n_nodes) + 1)
            plt.title(f"Histogram of |V| for {split_name}")
            plt.savefig(f"plots/histogram_|V|_{split_name}.png")
            plt.clf()
            plt.hist(n_edges, bins=np.max(n_edges) - np.min(n_edges) + 1)
            plt.title(f"Histogram of |E| for {split_name}")
            plt.savefig(f"plots/histogram_|E|_{split_name}.png")
            plt.clf()
        elif plot_type == "marginal_V":
            a = sns.kdeplot(n_nodes, bw=.5, shade=True, label=split_name)
        elif plot_type == "marginal_E":
            b = sns.kdeplot(n_edges, bw=.5, shade=True, label=split_name)
        else:
            sns_plot = sns.jointplot(
                np.log10(n_nodes), np.log10(n_edges),
                marginal_kws=dict(kernel="gau", bw=.02),
                kind="kde", bw=.05)
            sns_plot.ax_joint.set_xlabel("log10 |V|", fontsize=15)
            sns_plot.ax_joint.set_ylabel("log10 |E|", fontsize=15)
            sns_plot.ax_marg_x.set_title(split_name, fontsize=20)
            sns_plot.ax_joint.set_xlim(0.6, 1.2)
            sns_plot.ax_joint.set_ylim(0.4, 1.2)
            sns_plot.savefig(f"plots/joint_{split_name}.png")
            # Release the figure once saved; one is created per split.
            plt.close(sns_plot.fig)

    tot_n_nodes = np.array(tot_n_nodes)
    tot_n_edges = np.array(tot_n_edges)
    print("min/mean/max len nodes",
          np.min(tot_n_nodes), np.mean(tot_n_nodes), np.max(tot_n_nodes))
    print("min/mean/max len edges\n",
          np.min(tot_n_edges), np.mean(tot_n_edges), np.max(tot_n_edges))

    # The marginal plots accumulate one curve per split on shared axes and
    # are finalised once all splits have been drawn.
    if plot_type == "marginal_V":
        a.set_xlabel("|V|")
        a.set_ylabel("p(x)")
        a.set_title("Distributions of |V|")
        a.legend()
        a.figure.savefig("plots/marginal_|V|.png")
        a.figure.clf()
    if plot_type == "marginal_E":
        b.set_xlabel("|E|")
        b.set_ylabel("p(x)")
        b.set_title("Distributions of |E|")
        b.legend()
        b.figure.savefig("plots/marginal_|E|.png")
        b.figure.clf()
    print("Done!")
# Scatter figures: raw values first, then ranks.
createFigure(
    figure_data_without_zynex, 'EY_ROC', EARNINGS_YIELD,
    'Return On Capital (%)', 'Earnings Yield (%)',
    'ey_roc.png', 'lower right',
    vscaling=1.2, hscaling=2)
createFigure(
    figure_data, 'total_rank', 'EY_rank',
    'Rank Return On Capital', 'Rank Earnings Yield',
    'ey_roc_rank.png', 'upper right',
    number_format='%d', vscaling=1.2, hscaling=2)

# Drop outliers: keep the 5%-95% quantile band of earnings yield, then the
# 5%-95% band of ROC within what remains.
ey_low = df[EARNINGS_YIELD].quantile(0.05)
ey_high = df[EARNINGS_YIELD].quantile(0.95)
df_capped = df[df[EARNINGS_YIELD].between(ey_low, ey_high)]
roc_low = df_capped['ROC'].quantile(0.05)
roc_high = df_capped['ROC'].quantile(0.95)
df_capped = df_capped[df_capped['ROC'].between(roc_low, roc_high)]

# Save density plot
ax = sb.jointplot(EARNINGS_YIELD, 'ROC', data=df_capped, kind='kde',
                  color="g")
ax.set_axis_labels('Earnings Yield (%)', 'Return On Capital (%)')
plt.tight_layout()
plt.savefig('density_plot.png', format='png')
plt.clf()

# Create industry histogram
ax = sb.countplot(x=SECTOR, data=figure_data, palette='Blues_d')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Amount')
ax.set_xlabel('Industry')
plt.tight_layout()
plt.savefig('industry_histogram.png', format='png')
#%%
# Histogram
seaborn.distplot(ordis.price)

#%%
# Box plot
seaborn.factorplot("price", data=ordis, kind="box")

#%%
# Violin plot
seaborn.factorplot("price", data=ordis, kind="violin")

#%%
# Relationship between price and the quantitative variables (speed, hd)
seaborn.factorplot("speed", "price", data=ordis)
seaborn.jointplot("hd", "price", data=ordis, kind="reg")

#%%
# Relationship between price and the qualitative variables
# (ram, cd, premium, screen)
for qualitative in ("ram", "cd", "premium", "screen"):
    seaborn.factorplot(qualitative, "price", data=ordis, kind="box")

#%%
# price ~ speed and hd: mean price per (hd bin, speed) cell
hd_bins = pandas.cut(ordis.hd, 6, precision=0)
t = pandas.crosstab(hd_bins, ordis.speed,
                    values=ordis.price, aggfunc=numpy.mean)
seaborn.heatmap(t, cmap="Blues", cbar_kws={'label': 'mean price'})
# In[18]:

sns.pairplot(sub_task_summary_Output, hue='EV', palette='Set1')

# In[20]:

# SIMPLE LINE PLOT
sub_task_summary_Output['EV'].plot(figsize=(20, 12))

# In[26]:

# In[65]:

# Each jointplot creates its own figure, so the plt.figure(figsize=(12, 8))
# that used to precede them (and stayed empty) has been removed.
for metric, colour in (('SPI', 'hotpink'), ('CPI', 'red'), ('EAC', 'blue')):
    sns.jointplot(x=metric, y='EV', data=sub_task_summary_Output,
                  color=colour)

# In[41]:

# In[55]:

# In[56]:

# In[66]:

# In[67]:
def heatscatter_sns(x, y, figsize=(8, 8)):
    """Draw a KDE joint plot of ``x`` against ``y`` at the requested size.

    :param x: values for the horizontal axis.
    :param y: values for the vertical axis.
    :param figsize: ``(width, height)`` of the figure in inches.
    """
    # A single ``sns.set`` call: a second call could reset the rc values
    # passed to the first.
    sns.set(style="white", color_codes=True, rc={'figure.figsize': figsize})
    grid = sns.jointplot(x=x, y=y, kind='kde', color="skyblue")
    # jointplot sizes its own figure, so the requested size is applied
    # explicitly; previously ``figsize`` had no visible effect.
    grid.fig.set_size_inches(*figsize)
plt.figure(figsize=(10, 25)) sns.countplot(y='country', data=dataset, alpha=alpha) plt.title('Data by country') plt.show() # Between Genders Male vs Female plt.figure(figsize=(7, 7)) sex = sns.countplot(x='sex', data=dataset) # Corelation between the Data plt.figure(figsize=(16, 7)) cor = sns.heatmap(dataset.corr(), annot=True) g = sns.jointplot(dataset.year, dataset.suicides_no, kind="kde", color="#bfa9e0", size=7) plt.savefig('graph.png') # Visualizing which age of people Suicide the most plt.figure(figsize=(16, 7)) bar_age = sns.barplot(x='sex', y='suicides_no', hue='age', data=dataset) # Visualizing which Generation of people Suicide the most plt.figure(figsize=(16, 7)) bar_gen = sns.barplot(x='sex', y='suicides_no', hue='generation', data=dataset) cat_accord_year = sns.catplot('sex', 'suicides_no', hue='age',
df = DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
print(df)

# Data visualisation
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

# Visualising a distribution: histogram and density function.
# distplot() draws both the histogram and the density estimate by default.
sns.distplot(df['petal length (cm)'], bins=15)

# jointplot() draws the scatter plot together with the marginal histograms.
sns.jointplot(x='sepal length (cm)', y='sepal width (cm)', data=df, size=8)

# Grouped scatter plot:
# seaborn.FacetGrid colours the points by class.
sns.FacetGrid(df, hue='target', size=8).map(
    plt.scatter, 'sepal length (cm)', 'sepal width (cm)').add_legend()

# Hexbin plot.
# Fix: a bare ``sns.axes_style('white')`` only returns the style settings and
# changes nothing; use it as a context manager so the style is applied.
with sns.axes_style('white'):
    sns.jointplot(x='sepal length (cm)', y='sepal width (cm)', data=df,
                  kind='hex', color='r')

# Two-dimensional kernel density estimate
g = sns.jointplot(x='sepal length (cm)', y='sepal width (cm)', data=df,
                  kind='kde', color='m')
# Overlay the observations as a scatter plot
g.plot_joint(plt.scatter, c='w', s=30, linewidth=1, marker='+')
# Histogram only, normalised, 10 bins
sns.distplot(bd['age'], kde=False, norm_hist=True, bins=10)

# Density curve only
sns.distplot(bd['age'], hist=False)
# Fix: keep the returned axes so the figure saved below is this plot; the
# original called ``myplot.get_figure()`` without assigning ``myplot`` here.
myplot = sns.distplot(bd['age'], hist=False)
myimg = myplot.get_figure()
myimg.savefig('distplot.png')

sns.kdeplot(bd['age'])  # other distribution plot, less used
sns.kdeplot(bd['age'], shade=True)  # shade area
sns.kdeplot(bd['pdays'], shade=True)

myplot = sns.boxplot(y='age', data=bd)
myimg = myplot.get_figure()
myimg.savefig('boxplot.png')

myplot = sns.jointplot(x='age', y='balance', data=bd.iloc[:500, :])
# Fix: jointplot returns a grid object without ``get_figure()``; its figure
# is available as ``.fig``.
myimg = myplot.fig
myimg.savefig('jointplot.png')

# hex: lighter colour means lower density
myplot = sns.jointplot(x='age', y='balance', data=bd.iloc[:100, :],
                       kind='hex', size=10)
help(sns.jointplot)
sns.jointplot(x='age', y='duration', data=bd.iloc[:100, :], kind='kde',
              size=10)

myplot = sns.lmplot(x='age', y='balance', data=bd.iloc[1:10, :])
# Marginal distributions of both columns
for col in ('x', 'y'):
    sns.distplot(data[col])

# In[9]:

for col in ('x', 'y'):
    sns.kdeplot(data[col], shade=True)

# In[10]:

# Joint density estimate of the whole table
sns.kdeplot(data)

# In[12]:

with sns.axes_style('white'):
    sns.jointplot("x", "y", data, kind='kde')

# In[13]:

with sns.axes_style('white'):
    sns.jointplot("x", "y", data, kind='hex')

# In[14]:

sns.pairplot(data)

# In[20]:

import plotly.graph_objs as go
import numpy as np

x = np.random.randn(2000)
5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 1, 11, 10, 10, 10, 10, 10, 10, 10, 8, 3, 7, 3, 2, 2, 2, 11, 7, 7, 11, 11, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 12, 11, 11, 11, 9, 9, 9, 9, 9, 11, 11, 10, 1, 12, 12, 12, 3, 2, 12, 11, 11, 11, 11, 11, 11, 11, 10, 3, 11, 11, 2, 2, 1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 6, 6 ] y = [ 30, 29, 29, 24, 19, 11, 9, 8, 7, 3, 57, 54, 52, 34, 30, 29, 8, 1, 49, 44, 33, 31, 29, 29, 28, 27, 2, 6, 5, 52, 41, 36, 18, 27, 26, 46, 32, 35, 33, 15, 14, 10, 0, 51, 49, 44, 43, 28, 27, 26, 19, 16, 56, 21, 19, 16, 49, 43, 39, 25, 23, 22, 21, 13, 23, 1, 13, 17, 59, 55, 54, 10, 59, 1, 59, 57, 27, 25, 22, 21, 4, 49, 59, 31, 30, 5, 0, 8, 6, 0, 39, 37, 35, 31, 27, 25, 18, 11, 9 ] # print rs # x = rs.gamma(12, size=60) # y = 2 + rs.gamma(60,size=60) # x = rs.gamma(2, size=1000) # print 'y = '+ str(y) graph = sns.jointplot(x, y, kind="hex", stat_func=kendalltau, color="#4CB391") # x = np.random.normal(size=100) # print 'x = '+ str(x) # graph = sns.distplot(x); sns.plt.savefig(__main__.__file__ + ".png") # graph.pyplot.show() sns.plt.show()
print("Kurtosis:") print(data_set['T_MAX'].kurtosis()) ## Graph T MAX / CO & O3 df = data_set.sort_values(['T_MAX', 'CO'], ascending=True) plt.plot(df['T_MAX'], df['CO']) plt.title("La concentración de CO frente a la temperatura máxima") plt.show() df = data_set.sort_values(['T_MAX', 'O3'], ascending=True) plt.plot(df['T_MAX'], df['O3']) plt.title("La concentración de Ozono frente a la temperatura máxima") plt.show() ## Pairplot sns.jointplot(data_set['T_MAX'], data_set['CO'], kind="reg") plt.show() plt.close() sns.jointplot(data_set['T_MAX'], data_set['O3'], kind="reg") plt.show() plt.close() ## Correlation Matrix data_set_corr = data_set data_set_corr['Mes'] = data_set_corr['Mes'].map({ 'ENE': 1, 'FEB': 2, 'MAR': 3, 'ABR': 4, 'MAY': 5, 'JUN': 6,
# Marginal axes: ticks point inwards and the labels that would face the
# scatter axes are hidden.
ax_histx = plt.axes(rect_histx)
ax_histx.tick_params(direction='in', labelbottom=False)
ax_histy = plt.axes(rect_histy)
ax_histy.tick_params(direction='in', labelleft=False)

# the scatter plot:
ax_scatter.scatter(x, y)

# now determine nice limits by hand: round the largest absolute value up to
# a multiple of the bin width and use it symmetrically on both axes.
binwidth = 0.25
lim = np.ceil(np.abs([x, y]).max() / binwidth) * binwidth
symmetric_limits = (-lim, lim)
ax_scatter.set_xlim(symmetric_limits)
ax_scatter.set_ylim(symmetric_limits)

bins = np.arange(-lim, lim + binwidth, binwidth)
ax_histx.hist(x, bins=bins)
ax_histy.hist(y, bins=bins, orientation='horizontal')

# Keep the marginal histograms aligned with the scatter axes.
ax_histx.set_xlim(ax_scatter.get_xlim())
ax_histy.set_ylim(ax_scatter.get_ylim())

plt.show()

# Seaborn version
import numpy as np
import seaborn as sns

sns.jointplot(x, y)
sns.jointplot(x, y, kind="hex", color="#4CB391")
#Visulization matplotlib.rcdefaults() plt.show(df.plot(kind = 'box')) pd.options.display.mpl_style = 'default' # Sets the plotting display theme to ggplot2 df.plot(kind = 'box') sns.boxplot(data=df,width=0.5) sns.violinplot(df,width=3.5) plt.show(sns.distplot(df.ix[:,2], rug = True, bins = 15)) with sns.axes_style("white"): plt.show(sns.jointplot(df.ix[:,1],df.ix[:,2], kind = "kde")) plt.show(sns.lmplot("Benguet","Ifugao",df)) #Creating custom function def add_2int(x,y): return x+y print(add_2int(2,2)) # an algorithm example def case(n=10,mu=3,sigma=np.sqrt(5),p=0.025,rep=100): m=np.zeros((rep,4)) for i in range(rep): norm = np.random.normal(loc = mu, scale = sigma, size = n) xbar = np.mean(norm)
df.head()

import matplotlib.pyplot as plt
import seaborn as sns

# Ratings grouped by movie title; reused for the mean and the count below.
rating_by_title = df.groupby('title')['rating']
rating_by_title.mean().sort_values(ascending=False).head()
rating_by_title.count().sort_values(ascending=False).head()

# One row per title: mean rating plus the number of ratings received.
ratings = rating_by_title.mean().to_frame()
ratings['numRatings'] = rating_by_title.count()
ratings.head()

ratings['numRatings'].hist(bins=100, figsize=(10, 6))
ratings['rating'].hist(bins=100, figsize=(10, 6))
# Author's observation: as the number of ratings goes up, so does the
# average rating.
sns.jointplot(data=ratings, x='rating', y='numRatings', alpha=0.6)

# User x title matrix of ratings.
moviemat = df.pivot_table(values='rating', index='user_id', columns='title')
moviemat.head()
ratings.sort_values('numRatings', ascending=False).head(10)

starwars_user_ratings = moviemat['Star Wars (1977)']
liarliar_user_ratings = moviemat['Liar Liar (1997)']

# Correlate every title's ratings with the ratings given to each reference
# movie, i.e. how people who rated Star Wars / Liar Liar rate other movies.
similar_to_starwars = moviemat.corrwith(starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(liarliar_user_ratings)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the automobile data set from the working directory.
auto = pd.read_csv('auto.csv')

# Joint plot of horsepower ('hp') against fuel efficiency ('mpg').
sns.jointplot(data=auto, x='hp', y='mpg')

# Display the plot
plt.show()
# Figure 2: observed irradiance and clear-sky GHI plotted against day.
axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes1.scatter(j_day, dw_solar_everyday, label='Observed dw_solar', color='red')
axes1.scatter(j_day, ghi_everyday, label='Clear Sky GHI', color='green')
axes1.set_xlabel('Days')
axes1.set_ylabel('Solar Irradiance (Watts /m^2)')
# NOTE(review): the title hard-codes 2009 while the output path uses
# test_year -- confirm they cannot disagree.
axes1.set_title('Solar Irradiance - Test Year 2009')
axes1.legend(loc='best')
fig.savefig('RNN Paper Results/Exp2_1/' + test_location + '/' + test_year + 'Figure 2.jpg', bbox_inches='tight')


# In[525]:


# Figure 3: regression joint plot of observed irradiance against clear-sky GHI.
sns.jointplot(x=dw_solar_everyday, y=ghi_everyday, kind='reg')
plt.xlabel('Observed global downwelling solar (Watts/m^2)')
plt.ylabel('Clear Sky GHI (Watts/m^2)')
# NOTE(review): unlike Figure 2 this file name has no extension, so the
# image format is whatever matplotlib defaults to -- confirm this is intended.
plt.savefig('RNN Paper Results/Exp2_1/' + test_location + '/' + test_year + 'Figure 3', bbox_inches='tight')


# ### Build the Kt column (clear sky index at time t), after first removing
# ### rows with ghi == 0 so the division below never divides by zero

# In[526]:


if run_train:
    # TRAIN dataset
    df_train = df_train[df_train['ghi'] != 0]
    # Kt = observed irradiance / clear-sky irradiance
    df_train['Kt'] = df_train['dw_solar'] / df_train['ghi']
    # In-place reset without drop=True: the old index is kept as a column.
    df_train.reset_index(inplace=True)
#mu = np.array([-0.5, -2.5]) size = 1000000 # at 10 million my RAM is overloaded ### If a vector X is normally distributed, then exp(X) is lognormally distributed with the same mean and variance log_data = np.random.multivariate_normal(mu,cov, size=size) level_data = np.exp(log_data) k = level_data[:,1] z = level_data[:,0] lnk = log_data[:,1] lnz = log_data[:,0] ### Plotting the joint density functions for levels and for logs ## First levels sns.jointplot(k,z,kind="hex").set_axis_labels("Capital", "Productivity") plt.show() sns.jointplot(lnk,lnz,kind="hex").set_axis_labels("Log Capital", "Log Productivity") plt.show() ''' ## Plotting the raw joint density of lognormal variables does not make much sense as in 10,000,000 observations there will be massive outliers ### I atempt to get rid of these outliers for plotting purposes meank = np.mean(k) sdk = np.std(k) final_k = [x for x in k if (x > meank - 2 * sdk)] final_k = [x for x in final_k if (x < meank + 2 * sdk)] meanz = np.mean(z)
def viz_cont_cont(df, features, target):
    """Draw one seaborn joint plot per feature against ``target``.

    :param df: data passed to ``sns.jointplot`` as ``data``
    :param features: iterable of names, each used as the x variable
    :param target: name used as the y variable of every plot
    :return: None
    """
    for column in features:
        sns.jointplot(data=df, x=column, y=target)
merged_df.popularity.plot.hist(bins=50, color='green')

# explore the vote_average distribution
# (appears to be close to a normal distribution)
merged_df.vote_average.plot.hist(bins=50, color='red')

# to fix popularity, drop rows with vote_count under 10 to prevent bias
merged_df = merged_df[~(merged_df.vote_count < 10)]

# replot -- the popularity distribution appears to be better
merged_df.popularity.plot.hist(bins=50, color='blue', alpha=0.5)

# the gross columns are plotted on a log10 scale below, so rows with a
# zero gross have to be removed first
merged_df = merged_df[merged_df.domestic_gross != 0]
merged_df = merged_df[merged_df.worldwide_gross != 0]
merged_df.to_pickle('budget_popularity.pkl')

# regression joint plots (annotated via hf.r2) of each metric against the
# log10 of each gross column
for metric, gross in [
    ('popularity', 'domestic_gross'),
    ('popularity', 'worldwide_gross'),
    ('vote_average', 'domestic_gross'),
    ('vote_average', 'worldwide_gross'),
]:
    sns.jointplot(merged_df[metric], np.log10(merged_df[gross]),
                  kind="reg", stat_func=hf.r2)

# the R2 for popularity is 0.3 while for vote_average it is 0.051, so
# popularity is used as the metric to estimate gross income
# popularity is also used to estimate how well genres perform using the tmdb data frame
# For every (feature, plottable) pair write one image under /data/:
# a bar chart of value counts for object-typed features, otherwise a hexbin
# joint plot, falling back to a plain histogram if the joint plot fails.
for a, b in product(features, plottables):
    msg('Making %s %s' % (a, b))
    x = with_elo[a]
    y = with_elo[b]
    msg('type = %s' % x.dtype)
    if x.dtype == 'object':
        plt.figure()
        x.value_counts().plot(kind='bar')
        plt.savefig('/data/' + a + '_hist.png')
        plt.close('all')
    else:
        try:
            # Clip both axes to the 1st-99th percentile range.
            xlim = tuple(np.percentile(x, [1, 99]))
            ylim = tuple(np.percentile(y, [1, 99]))
            with sns.axes_style("white"):
                # Keyword x/y: newer seaborn rejects positional x/y, which
                # would otherwise silently land in the fallback below.
                sns.jointplot(x=x, y=y, kind="hex", xlim=xlim, ylim=ylim)
            plt.savefig('/data/scatter_' + a + '_' + b + '.png')
            plt.close('all')
        except Exception as exc:
            # Best-effort fallback, but report why the joint plot failed and
            # let KeyboardInterrupt / SystemExit propagate.
            msg('Hex plot failed for %s %s (%s); falling back to histogram' % (a, b, exc))
            plt.figure()
            x.plot(kind='hist')
            plt.savefig('/data/' + a + '_hist.png')
            plt.close('all')

do_indivs = True
if do_indivs:
    for a, b in product(features, plottables):
        msg('Making %s %s' % (a, b))
# Output directory for the figures (absolute, machine-specific path).
file_out_figures = 'C:/Users/lalc/Documents/Old Documents folder/PhD/Meetings/July 2020/'
# File-name prefix for each bin in `limits`.
file = ['U','UN','N','SN']
# NOTE: this five-bin definition is immediately overwritten by the four-bin
# one on the next line.
limits = [[-np.inf,-.1], [-.1,-.01], [-.01,.01], [.01,.21], [.21,np.inf]]
limits = [[-np.inf,-.1], [-.1,-.01], [-.01,.01], [.01,.21]]
# Keep only rows whose relscan value exceeds 0.25.
relind = L30min1.relscan>.25
# Column of Ri1 used for binning (second to last).
j = -2
for i,l in enumerate(limits):
    # Rows whose Ri1 value lies strictly inside the current bin.
    stabind = ((Ri1[:,j]>l[0]) & (Ri1[:,j]<l[1]))
    # Rename the first six columns to LaTeX labels, keeping the rest unchanged.
    cols = np.r_[['$L_{u_1,x_1}$', '$L_{u_1,x_2}$','$L_{v_1,x_1}$', '$L_{v_1,x_2}$','$L_{h,x_1}$', '$L_{h,x_2}$'], L30min1.columns [6:]]
    L30min1.columns = cols
    xlim = 5*200
    ylim = 5*200
    # KDE joint plot of the two L_h columns for the selected rows
    # (ind1 is defined outside this fragment).
    g = sns.jointplot(x ='$L_{h,x_1}$', y = '$L_{h,x_2}$', data=L30min1.loc[relind & stabind & ind1], height = 8, kind="kde", cmap="jet", xlim = (0,xlim), ylim = (0,ylim), color='k')#,cbar=True, cbar_kws={"format": formatter, "label": '$Density$'})
    g.set_axis_labels('$L_{h,x_1}$', '$L_{h,x_2}$', fontsize = 24)
    # Dashed 1:1 reference line.
    g.ax_joint.plot([0,xlim],[0,ylim],'--k', linewidth = 2)
    # Overlay the individual samples as semi-transparent points.
    g.ax_joint.plot(L30min1.loc[relind & stabind & ind1]['$L_{h,x_1}$'].values,L30min1.loc[relind & stabind & ind1]['$L_{h,x_2}$'].values,'o', color = 'k', alpha=.2)
    # Annotate the plot with the bounds of the current bin.
    g.ax_joint.text(100, 800,'$'+'%.2f' % l[0] +'<Ri_f<'+'%.2f' % l[1] +'$',fontsize=30,color='r')
    plt.tight_layout()
    plt.savefig(file_out_figures+file[i]+'_phase_1.png')
# Second data set (L30min2 / Ri2).
# NOTE(review): five prefixes here but `limits` has four bins, so 'VS' is
# never reached by enumerate(limits) -- confirm.
file = ['U','UN','N','SN','VS']
relind = L30min2.relscan>.25
for i,l in enumerate(limits):
    stabind = ((Ri2[:,-2]>l[0]) & (Ri2[:,-2]<l[1]))
    cols = np.r_[['$L_{u_1,x_1}$', '$L_{u_1,x_2}$','$L_{v_1,x_1}$', '$L_{v_1,x_2}$','$L_{h,x_1}$', '$L_{h,x_2}$'], L30min2.columns [6:]]
    L30min2.columns = cols
# que pasa por los valores, lo desactivamos asi sns.distplot(tips['total_bill'],kde=False) plt.show() # podemos modificar la cantidad de bins que son la barras, # con el parametro bins solo pasando un int, hay que tener # cuidado con el tamaño del bin sns.distplot(tips['total_bill'],kde=False,bins=40) plt.show() # tenemos un metodos que nos compara dos columnas dentro de # un dataset sns.jointplot(x='total_bill',y='tip',data=tips) plt.show() # podemos graficar esto de varias maneras con el parametro # kind usando: hex, reg, kde # este otro metodo nos muestra una serie de graficas comparando # todas las columnas con todas, cuando se compara con si mismo, # muestra un histogram, y cuando es con otro, es un jointplot() sns.pairplot(tips) plt.show() # si queremos dividir la informacion de cada grafica por otras # columnas por ejemplo por sexo usamos el parametro hue, se le # pasa una columa categorial, no que tenga un valor por eso
Next compare the distributions of the positive and negative examples over a few features. Good questions to ask yourself at this point are: * Do these distributions make sense? + Yes. You've normalized the input and these are mostly concentrated in the +/- 2 range. * Can you see the difference between the ditributions? + Yes the positive examples contain a much higher rate of extreme values. ----------------------------------------------------------------------------------------- ''' pos_df = pd.DataFrame(train_features[ bool_train_labels], columns = train_df.columns) neg_df = pd.DataFrame(train_features[~bool_train_labels], columns = train_df.columns) sns.jointplot( pos_df['V5'], pos_df['V6'], kind='hex', xlim = (-5,5), ylim = (-5,5) ) plt.suptitle("Positive distribution") sns.jointplot( neg_df['V5'], neg_df['V6'], kind='hex', xlim = (-5,5), ylim = (-5,5) ) _ = plt.suptitle("Negative distribution")
# Histogram sns.distplot(a = iris_data['Petal Length (cm)'], kde=False) # Kernel Density Estimate (kde) # This is the smoothed histogram # kde plot sns.kdeplot(data=iris_data['Petal Length (cm)'], shade=True) # We can create two-dimensional kde plot sns.jointplot(x=iris_data['Petal Length (cm)'], y=iris_data['Sepal Width (cm)'], kind='kde') # Let split the data to understand difference btw species iris_set_data = pd.read_csv('data/iris_setosa.csv', index_col="Id") iris_ver_data = pd.read_csv('data/iris_versicolor.csv', index_col="Id") iris_vir_data = pd.read_csv('data/iris_virginica.csv', index_col="Id")
def explore_global_plot(data, label='label', n_feats=50, id=None, task='classification'):
    '''
    Show a series of overview plots for a data set: a 2-D projection of the
    rows (classification only), a correlation heat map, printed outliers of
    the numeric features, and missing-value patterns.

    :param data: DataFrame; it is not modified by this function
    :param label: label column name in the data
    :param n_feats: the number of features be used to analysis.
    :param id: optional name of an identifier column; it is checked for
        duplicates and then excluded from the analysis
    :param task: regression or classification
    :return: None
    '''
    columns = data.columns.tolist()
    columns.remove(label)
    if id is not None:
        # The duplicate check has to look at the column values; ``columns``
        # only holds the column names.
        if data[id].duplicated().any():
            print('{} is duplicated !!!'.format(id))
        columns.remove(id)
        # Drop on a copy so the caller's DataFrame keeps its id column.
        data = data.drop(id, axis=1)

    # Integer and float columns are numeric; everything else is categorical.
    numeric_names = [
        name for name, dtype in zip(columns, data[columns].dtypes)
        if ptypes.is_integer_dtype(dtype) or ptypes.is_float_dtype(dtype)
    ]
    numeric_set = set(numeric_names)
    category_names = [name for name in columns if name not in numeric_set]

    if task == 'classification':
        # data distribution for each class, projected on two components
        new_data = data.dropna(axis=0)
        if len(category_names):
            # mixed numeric/categorical features: factor analysis (FAMD)
            famd = prince.FAMD(
                n_components=2,
                n_iter=3,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=42
            )
            famd = famd.fit(new_data[columns])
            famd.plot_row_coordinates(
                new_data,
                ax=None,
                x_component=0,
                y_component=1,
                labels=new_data.index,
                color_labels=['{}'.format(t) for t in new_data[label]],
                ellipse_outline=False,
                ellipse_fill=True,
                show_points=True
            )
            plt.show()
        else:
            # numeric features only: plain PCA
            pca = PCA(n_components=2, random_state=seed)
            X_pca = pca.fit_transform(new_data[columns])
            sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=label, data=new_data)
            plt.show()

    # sort features for correlation plot: with many numeric features, group
    # them by k-means cluster so related features sit next to each other
    sorted_feat_name = numeric_names
    if len(numeric_names) > 6:
        n_clusters = 3
        new_data = data[[label] + numeric_names].dropna(axis=0)
        new_data_stand = StandardScaler().fit_transform(new_data[numeric_names])
        kmean_init = KMeans(n_clusters=n_clusters, random_state=seed)
        # Cluster the features, not the rows: transpose so each feature is
        # one sample. A reshape would mix values of different features.
        kmean_init.fit(new_data_stand.T)
        sorted_feat = sorted(zip(numeric_names, kmean_init.labels_), key=lambda x: x[1])
        sorted_feat_name = [i[0] for i in sorted_feat]

    # correlation plot for all features; only numeric/bool columns can be
    # correlated (older pandas dropped the others silently, pandas 2.x raises)
    corr_input = data[[label] + sorted_feat_name + category_names]
    sns.heatmap(corr_input.select_dtypes(include=['number', 'bool']).corr())
    plt.show()

    # outlier detection just for numeric features
    outlier = data[numeric_names].apply(mad_based_outlier)
    for i, column in enumerate(outlier.columns):
        print('outlier:\n {}'.format(data[[column]][outlier.iloc[:, i]]))

    # missing value pattern plot for all features
    msno.matrix(data[columns[:n_feats]])
    plt.show()
    msno.bar(data[columns[:n_feats]])
    plt.show()

    # number of missing attributes per row, plotted in ascending order
    miss_data = data[columns[:n_feats]].isnull().sum(axis=1)
    miss_data = miss_data.to_frame('number_of_missing_attributes')
    miss_data.sort_values('number_of_missing_attributes', inplace=True)
    miss_data['index'] = list(range(len(miss_data)))
    sns.jointplot(x="index", y="number_of_missing_attributes", data=miss_data)
    plt.show()
def analyze_zN(z, outdir, vg, skip_umap=False, num_pcs=2, num_ksamples=20):
    """Run PCA, k-means and (optionally) UMAP on ``z`` and write the results.

    Volumes are generated through ``vg.gen_volumes`` along the first
    ``num_pcs`` principal components and at the ``num_ksamples`` k-means
    centers; cluster labels, centers and plots are saved under ``outdir``.
    UMAP is only run when ``z`` has more than two columns and ``skip_umap``
    is false.
    """
    zdim = z.shape[1]
    use_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)
    log('Generating volumes...')
    for pc_num in range(1, num_pcs + 1):
        # trajectory between the 5th and 95th percentile of this component
        lo, hi = np.percentile(pc[:, pc_num - 1], (5, 95))
        traj = analysis.get_pc_traj(pca, zdim, 10, pc_num, lo, hi)
        vg.gen_volumes(f'{outdir}/pc{pc_num}', traj)

    # kmeans clustering
    log('K-means clustering...')
    K = num_ksamples
    kmeans_dir = f'{outdir}/kmeans{K}'
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    if not os.path.exists(kmeans_dir):
        os.mkdir(kmeans_dir)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')
    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)

    # (embedding, axis-label prefix, output file stem) for each 2-D view
    views = [(pc, 'PC', 'z_pca')]

    # UMAP -- slow step
    if use_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')
        views.append((umap_emb, 'UMAP', 'umap'))

    # Make some plots
    log('Generating plots...')

    # scatter and hexbin joint plots of each embedding
    fig_num = 0
    for emb, prefix, stem in views:
        fig_num += 1
        plt.figure(fig_num)
        g = sns.jointplot(x=emb[:, 0], y=emb[:, 1], alpha=.1, s=2)
        g.set_axis_labels(f'{prefix}1', f'{prefix}2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/{stem}.png')

        fig_num += 1
        plt.figure(fig_num)
        g = sns.jointplot(x=emb[:, 0], y=emb[:, 1], kind='hex')
        g.set_axis_labels(f'{prefix}1', f'{prefix}2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/{stem}_hexbin.png')

    # the same embeddings drawn with the annotate helpers, given centers_ind
    for emb, prefix, stem in views:
        analysis.scatter_annotate(emb[:, 0], emb[:, 1], centers_ind=centers_ind, annotate=True)
        plt.xlabel(f'{prefix}1')
        plt.ylabel(f'{prefix}2')
        plt.savefig(f'{kmeans_dir}/{stem}.png')

        g = analysis.scatter_annotate_hex(emb[:, 0], emb[:, 1], centers_ind=centers_ind, annotate=True)
        g.set_axis_labels(f'{prefix}1', f'{prefix}2')
        plt.tight_layout()
        plt.savefig(f'{kmeans_dir}/{stem}_hex.png')

    # UMAP embedding colored by each principal component
    if use_umap:
        for i in range(num_pcs):
            analysis.scatter_color(umap_emb[:, 0], umap_emb[:, 1], pc[:, i], label=f'PC{i+1}')
            plt.xlabel('UMAP1')
            plt.ylabel('UMAP2')
            plt.tight_layout()
            plt.savefig(f'{outdir}/pc{i+1}/umap.png')