def length_dist(self, pat_out="genes_lengths.png"):
    """Plot the distribution of sequence lengths.

    Collects the length of every record in ``self.num``, wraps the lengths
    in a one-column DataFrame, renders a ggplot density curve, and saves
    the figure to *pat_out*.
    """
    lengths = [len(record) for record in self.num]
    frame = pd.DataFrame({"record_length": np.array(lengths)})
    density_plot = ggplot(frame, aes(x="record_length")) + geom_density()
    density_plot.save(pat_out)
def _post_density_plot(self, func=None, x_name='', plot_title='', include_doses=None, boot_samps=1000):
    """Bootstrap posterior samples per dose and return a ggplot density plot.

    For each dose index in *include_doses* (default: all doses), applies
    *func* (default: the utility metric of toxicity/efficacy probabilities)
    to the posterior sample, draws *boot_samps* weighted bootstrap values,
    and plots one density per dose.
    """
    from ggplot import aes, ggplot, geom_density, ggtitle
    import pandas as pd

    if include_doses is None:
        include_doses = range(1, self.num_doses + 1)

    def default_metric(dose_x, posterior):
        # Columns of the posterior sample: mu_T, beta_T, mu_E, beta1_E, beta2_E.
        tox_probs = _pi_T(dose_x, mu=posterior[:, 0], beta=posterior[:, 1])
        eff_probs = _pi_E(dose_x, mu=posterior[:, 2], beta1=posterior[:, 3],
                          beta2=posterior[:, 4])
        return self.metric(eff_probs, tox_probs)

    if func is None:
        func = default_metric

    posterior_sample = self.pds._samp
    weights = self.pds._probs
    weights /= weights.sum()  # normalise in place, as the original did

    boot_values = []
    boot_doses = []
    for i, dose_x in enumerate(self.scaled_doses()):
        dose_index = i + 1
        if dose_index not in include_doses:
            continue
        draws = func(dose_x, posterior_sample)
        boot_values.extend(np.random.choice(draws, size=boot_samps,
                                            replace=True, p=weights))
        boot_doses.extend(np.repeat(dose_index, boot_samps))

    frame = pd.DataFrame({x_name: boot_values, 'Dose': boot_doses})
    return ggplot(aes(x=x_name, fill='Dose'), data=frame) \
        + geom_density(alpha=0.6) + ggtitle(plot_title)
def density_plot(by='dpsi_zscore', categorical=True):
    """Kernel-density plot of *by* for variants affecting poly-AAA sequences.

    When *categorical*, the variants are grouped into the three fixed
    increase/decrease/constant buckets; otherwise one series per AAA-length
    change (series with fewer than two points are dropped).
    """
    if categorical:
        data_dict = {
            'muts increasing AAA':
                np.array([v[by] for v in variants['increase']]),
            'muts decreasing AAA':
                np.array([v[by] for v in variants['decrease']]),
            'muts not changing AAA length':
                np.array([v[by] for v in variants['constant']]),
        }
    else:
        data_dict = OrderedDict()
        for change in aaa_changes:
            values = [v[by] for v in variants['all'] if v['change'] == change]
            # Density estimation needs at least two observations.
            if len(values) > 1:
                data_dict[change] = np.array(values)

    return (
        ggplot(aes(x='value', colour='variable', fill='variable'),
               data=prepare_data_frame(data_dict))
        + ggtitle('Impact of variants affecting poly AAA sequences on %s' % by)
        + xlab(by)
        + ylab('Kernel density estimate')
        + geom_density(alpha=0.6)
    )
def density_chart(self, conn, column, table_chosen, title):
    """Render a density plot for *column* of *table_chosen* and print timing info."""
    frame = dfile.single_selector(conn=conn, table=table_chosen, column=column)
    chart = (ggplot(aes(x=column), data=frame)
             + geom_density() + theme_gray() + labs(title=title))
    finished = datetime.datetime.now()
    print(finished)
    # NOTE(review): `a` must be a module-level start time defined elsewhere.
    print(finished - a)
    print(chart)
def build_list_of_plots(self, var):
    """Build one density plot per table for variable *var*.

    For each table, combines the variable's data from the original table
    with the data from the intersection table, draws a ggplot density
    coloured by table, and collects the rendered plots.

    Returns:
        list: the ggplot objects, one per table in ``self.tables``.
    """
    plots = []
    for tbl in self.tables:
        df_original = pd.DataFrame(
            self.get_variable_data_from_original_table(tbl, var))
        df_intersection = pd.DataFrame(
            self.get_variable_data_from_intersection_table(var))
        # BUG FIX: the original called DataFrame.append and discarded the
        # result (append is not in-place), appending the intersection frame
        # to itself while `df_original` went unused. Combine the two frames
        # explicitly instead.
        combined = pd.concat([df_original, df_intersection],
                             ignore_index=True)
        g = ggplot(combined, aes(x='var', color='table')) + geom_density()
        g.make()
        # BUG FIX: the original never collected the plots it built.
        plots.append(g)
    return plots
def render(data, bin_width, plot_density=False):
    """Print a ggplot chart of message datetimes.

    Draws a per-conversation density curve when *plot_density* is set,
    otherwise a stacked histogram with *bin_width*-wide bins.
    """
    if plot_density:
        chart = (ggplot.ggplot(data, ggplot.aes(x='datetime',
                                                color='conversationWithName'))
                 + ggplot.geom_density()
                 + ggplot.scale_x_date(labels='%b %Y')
                 + ggplot.ggtitle('Conversation Densities')
                 + ggplot.ylab('Density')
                 + ggplot.xlab('Date'))
    else:
        chart = (ggplot.ggplot(data, ggplot.aes(x='datetime',
                                                fill='conversationWithName'))
                 + ggplot.geom_histogram(alpha=0.6, binwidth=bin_width)
                 + ggplot.scale_x_date(labels='%b %Y', breaks='6 months')
                 + ggplot.ggtitle('Message Breakdown')
                 + ggplot.ylab('Number of Messages')
                 + ggplot.xlab('Date'))
    print(chart)
from ggplot import aes, diamonds, geom_density, ggplot
import matplotlib.pyplot as plt
from bokeh import mpl
from bokeh.plotting import output_file, show

# Build a ggplot density of diamond price per cut, draw it onto the current
# matplotlib figure, then convert that figure to an interactive Bokeh page.
price_density = ggplot(diamonds, aes(x='price', color='cut')) + geom_density()
price_density.draw()
plt.title("Density ggplot-based plot in Bokeh.")
output_file("density.html", title="density.py example")
show(mpl.to_bokeh())
print(subjects_words_count.describe())

#%%
import ggplot as gg

# Frequency distribution of subject word counts, with mean (red) and
# median (blue) reference lines plus a density overlay.
df = pd.DataFrame(subjects_words_count, columns=["count"])
hist = gg.ggplot(df, gg.aes(x="count"))
hist += (gg.xlab("# of words")
         + gg.ylab("Frequency")
         + gg.ggtitle("Frequency of words"))
hist += gg.geom_vline(x=df.mean(), color="red")
hist += gg.geom_vline(x=df.median(), color="blue")
hist += gg.geom_density(color="green")
hist += gg.geom_histogram(binwidth=1, color="grey")
hist

#%%
# 1st attempt to classify subjects per tag
X_raw_train = subjects_train
X_raw_test = subjects_test
Y_train = raw_data_train.target
Y_test = raw_data_test.target
target_names = raw_data_train.target_names
# Per-word statistics over the training vocabulary:
#   wordfreq  — number of occurrences of each word,
#   wordscore — sum of document outcomes over occurrences,
#   wordprobs — mean outcome per occurrence.
wordfreq = {key: 0 for key in worduniq}
wordscore = {key: 0 for key in worduniq}
wordprobs = {key: 0 for key in worduniq}
for w in wordbag:
    wordfreq[w] += 1
for i in range(nrow):
    for j in words[i]:
        wordscore[j] += outcomes[i]
for w in wordprobs:
    wordprobs[w] = float(wordscore[w]) / wordfreq[w]

# Document score = mean word probability (0 for empty documents).
b1 = pandas.Series([np.mean([wordprobs[w] for w in l]) if len(l) else 0
                    for l in words])
d1 = pandas.concat((b1, outcomes), axis=1)
d1.columns = ['prob', 'out']
g1 = ggplot(d1, aes(x='prob', color='out')) + geom_density()

cutoff = 0.3
bpreds = b1 > cutoff
print(float(sum(bpreds == outcomes)) / nrow)  # 0.922524752475

# classify test data
words = test[textVar].apply(lambda x: prepare_document(x)).tolist()
# IDIOM FIX: dict.get with a default replaces the double-lookup
# `wordprobs.get(w) if w in wordprobs else 0`.
b2 = [np.mean([wordprobs.get(w, 0) for w in l]) if len(l) else 0
      for l in words]

# BUG FIX: the file was opened in binary mode ('wb') but written with str
# payloads, which raises TypeError on Python 3; open in text mode instead.
with open('submission.csv', 'w') as f:
    f.write(idVar + ',' + outVar + '\n')
    for i in range(len(b2)):
        row = test[idVar][i] + ',' + ('1' if b2[i] > cutoff else '0')
        print(row)
        f.write(row + '\n')
# NOTE(review): this chunk begins mid-statement — the opening
# `turnstile_rain = turnstile_weather[[` lies before the visible region.
"rain", "ENTRIESn_hourly", "EXITSn_hourly"
]]
# Label each row for grouping: rain == 1 -> "raining", else "not raining".
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
turnstile_rain.groupby("rain2").describe()
turnstile_rain = turnstile_weather[[
    "rain", "ENTRIESn_hourly", "EXITSn_hourly"
]]
# log10(x + 1) keeps zero-entry rows finite.
turnstile_rain["ENTRIESn_hourly_log10"] = np.log10(
    turnstile_rain["ENTRIESn_hourly"] + 1)
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
# NOTE(review): turnstile_rain is a slice of turnstile_weather; these column
# assignments may trigger pandas SettingWithCopyWarning — consider .copy().
set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors
# Density of log-entries per hour, faceted by raining / not raining.
plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \
    gg.geom_density() + \
    gg.facet_wrap("rain2", scales="fixed") + \
    gg.scale_colour_manual(values=set1) + \
    gg.xlab("log10(entries per hour)") + \
    gg.ylab("Number of turnstiles") + \
    gg.ggtitle("Entries per hour whilst raining and not raining")
plot
# Sanity-check histogram of synthetic normal data (fixed seed for repeatability).
np.random.seed(42)
data = pd.Series(np.random.normal(loc=180, scale=40, size=600))
data.hist()
p = turnstile_weather["ENTRIESn_hourly"].hist()
pylab.suptitle("Entries per hour across all stations")
pylab.xlabel("Entries per hour")
pylab.ylabel("Number of occurrences")
def density(self, inp1, inp2, inp3):
    """Faceted density plot of *inp1*, coloured and filled by *inp2*.

    Facets the chart with a grid on *inp3*; titles and axis labels are
    fixed to the fare-by-sex-and-survival presentation.
    """
    base = gg.ggplot(self.data, gg.aes(x=inp1, color=inp2, fill=inp2))
    return (base
            + gg.geom_density(alpha=0.5, size=5)
            + gg.facet_grid(inp3)
            + gg.ggtitle('Density of Fare by Sex and Survival Status')
            + gg.ylab('Survival Status'))
# Finish styling the hourly entries/exits axes.
# NOTE(review): `ax` is created earlier, outside this chunk.
ax.set_title("Total entries/exits per hour by hour across all stations")
ax.legend(["Entries", "Exits"])
ax.set_ylabel("Entries/exits per hour (1e6 is a million)")
ax.set_xlabel("Hour (0 is midnight, 12 is noon, 23 is 11pm)")
ax.set_xlim(0, 23)
# Compare entries/exits while raining vs not raining.
turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]]
# NOTE(review): turnstile_rain is a slice of turnstile_weather; the column
# assignments below may trigger pandas SettingWithCopyWarning — consider .copy().
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
turnstile_rain.groupby("rain2").describe()
turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]]
# log10(x + 1) keeps zero-entry rows finite.
turnstile_rain["ENTRIESn_hourly_log10"] = np.log10(turnstile_rain["ENTRIESn_hourly"] + 1)
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors
# Density of log-entries per hour, faceted by raining / not raining.
plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \
    gg.geom_density() + \
    gg.facet_wrap("rain2", scales="fixed") + \
    gg.scale_colour_manual(values=set1) + \
    gg.xlab("log10(entries per hour)") + \
    gg.ylab("Number of turnstiles") + \
    gg.ggtitle("Entries per hour whilst raining and not raining")
plot
# Sanity-check histogram of synthetic normal data (fixed seed for repeatability).
np.random.seed(42)
data = pd.Series(np.random.normal(loc=180, scale=40, size=600))
data.hist()
p = turnstile_weather["ENTRIESn_hourly"].hist()
pylab.suptitle("Entries per hour across all stations")
pylab.xlabel("Entries per hour")
pylab.ylabel("Number of occurrences")