import pandas as pd
import ggplot
from ggplot import aes, date_format

def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    x_lbl = ['Observed Volume'] * len(x)
    xt_lbl = ['Overall Trend'] * len(x_trend)
    xs_lbl = ['Repeat Sending Trend'] * len(season)
    col3 = pd.DataFrame(x_lbl + xt_lbl + xs_lbl)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()
    return p
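# A minimal synthetic driver for plot_trend_season, assuming the layout the
# function implies: the three series (observed, trend, season) are stacked
# row-wise in ndf_domain, with `dates` repeated once per series. All data and
# the 'example.com' domain name here are made up for illustration.
import numpy as np

weeks = pd.date_range('2015-01-05', periods=52, freq='W-MON')
x = np.random.poisson(1000, 52).astype(float)   # observed weekly volume
x_trend = np.linspace(900.0, 1100.0, 52)        # smoothed overall trend
season = x - x_trend                            # seasonal remainder
volume = np.concatenate([x, x_trend, season])
dates = np.tile(weeks.values, 3)                # one copy of the dates per series
ndf_domain = np.column_stack([dates, volume])   # (Date, Volume) pairs, stacked row-wise
p = plot_trend_season(dates, ndf_domain, x, x_trend, season, 'example.com')
print(p)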
def plot(self, what='cumulative_payouts', include_ci=True):
    import ggplot as gg  # this is hacky ... need to DRY out the imports

    if what == 'cumulative_payouts':
        plt = self._plot_cumulative_payouts(include_ci=include_ci)
    elif what == 'avg_accuracy':
        plt = self._plot_avg_accuracy(include_ci=include_ci)
    elif what == 'all':
        summary = self.summary()
        p1 = self._plot_cumulative_payouts(include_ci=include_ci, summary=summary)
        p2 = self._plot_avg_accuracy(include_ci=include_ci, summary=summary)
        d1 = p1.data
        d2 = p2.data
        d1['Outcome'] = d1['AverageCumulativePayout']
        d2['Outcome'] = d2['AverageAccuracy']
        d1['Plot'] = 'Cumulative Payouts'
        d2['Plot'] = 'Average Accuracy'
        df = d1.append(d2, ignore_index=True)

        if include_ci:
            plt = gg.ggplot(gg.aes(x='Round', y='Outcome',
                                   ymin='ymin', ymax='ymax'), data=df) + \
                gg.geom_area(alpha=0.5)
        else:
            plt = gg.ggplot(gg.aes(x='Round', y='Outcome'), data=df)

        plt += gg.facet_grid('Plot', scales='free')
    else:
        raise ValueError('%s is not a valid option' % what)

    return plt + gg.geom_line()
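# A self-contained sketch of the trick the 'all' branch uses: stack two
# per-round series into one frame under a shared 'Outcome' column, tag each
# with a 'Plot' label, then facet on that label with free scales. All data
# below is synthetic; only the ggplot calls mirror the method above.
import ggplot as gg
import numpy as np
import pandas as pd

rounds = np.arange(1, 101)
d1 = pd.DataFrame({'Round': rounds, 'Outcome': 0.4 * rounds,
                   'Plot': 'Cumulative Payouts'})
d2 = pd.DataFrame({'Round': rounds, 'Outcome': 0.5 + 0.1 * np.random.randn(100),
                   'Plot': 'Average Accuracy'})
df = pd.concat([d1, d2], ignore_index=True)
p = gg.ggplot(gg.aes(x='Round', y='Outcome'), data=df) + \
    gg.geom_line() + \
    gg.facet_grid('Plot', scales='free')
print(p)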
def graph1(score_data):
    """
    Average score as time goes on; creates and returns graph 1, a line graph.
    """
    date_column = score_data[0][find_time_stamp(score_data)]
    data = DataFrame(score_data[1:], columns=score_data[0])

    # Get all columns that are numerical questions so we know what to graph
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    new_data = pd.melt(data, id_vars=date_column, value_vars=num_questions,
                       var_name="Question", value_name="Score")

    # Convert date string into an actual date type
    new_data[date_column] = pd.to_datetime(new_data[date_column],
                                           format="%m/%d/%Y")

    # Group all rows with same date and question, and then take the average.
    new_data = new_data.groupby([date_column, 'Question']).mean().reset_index()
    new_data['All'] = "Individual Questions"

    new_data2 = new_data.groupby(date_column).mean().reset_index()
    new_data2['Question'] = "All Questions"
    new_data2['All'] = "Average of All Questions"

    new_data = pd.concat([new_data, new_data2])
    new_data[date_column] = new_data[date_column].astype('int64')

    # Create time graph with separate lines for each question
    ret = ggplot.ggplot(ggplot.aes(x=date_column, y="Score", colour="Question"),
                        new_data) + \
        ggplot.geom_point() + \
        ggplot.geom_line() + \
        ggplot.facet_grid("All") + \
        ggplot.scale_x_continuous(labels=[""], breaks=0) + \
        ggplot.labs(x="Time", y="Average Question Score") + \
        ggplot.ggtitle("Question Scores Over Time")
    return ret
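# Hypothetical input for graph1, assuming score_data is a list of rows whose
# first row holds the column names, that the external helper find_time_stamp
# (not shown here) returns the index of the timestamp column (0 below), and
# that pandas/ggplot are imported as the function expects. Dates must match
# the "%m/%d/%Y" format parsed above.
score_data = [
    ['Timestamp', 'Q1', 'Q2'],
    ['01/06/2020', 4, 5],
    ['01/13/2020', 3, 4],
    ['01/20/2020', 5, 5],
]
p = graph1(score_data)
print(p)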
from math import ceil

def _make_grid(grid):
    columns = ceil(grid.n_rows / len(grid.plots()))
    return grid.plot[0] + facet_grid(grid.n_rows, columns, scales="fixed")
def plot_weather_data(turnstile_weather):
    '''
    You are passed in a dataframe called turnstile_weather. Use
    turnstile_weather along with ggplot to make a data visualization focused
    on the MTA and weather data we used in assignment #3. You should feel
    free to implement something that we discussed in class (e.g.,
    scatterplots, line plots, or histograms) or attempt to implement
    something more advanced if you'd like.

    Here are some suggestions for things to investigate and illustrate:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    If you'd like to learn more about ggplot and its capabilities, take a
    look at the documentation at:
    https://pypi.python.org/pypi/ggplot/

    You can check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    to see all the columns and data points included in the turnstile_weather
    dataframe. However, due to the limitation of our Amazon EC2 server, we
    are giving you about 1/3 of the actual data in the turnstile_weather
    dataframe.
    '''
    df = turnstile_weather.copy()

    # We will remove national holidays from the data. May 30 is Memorial
    # Day, the only national holiday in our data set. Normally this would be
    # done by passing in the data more elegantly, but since this is a bit
    # more constrained, we will simply hard-code it into the function.
    national_holidays = ['2011-05-30']
    for holiday in national_holidays:
        df = df[df.DATEn != holiday]

    # Add a column to represent the ISO day of the week for each data point.
    df['weekday'] = df['DATEn'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').isoweekday())

    # Now introduce a multiplier variable so that the ENTRIESn_hourly values
    # can be modified when we have multiple data days. For example, if we
    # have 2 Fridays with rain, the multiplier is 1/2 so that summing the
    # modified values gives us the average number of riders entering the
    # subway system on a rainy Friday.
    for day in df.weekday.unique():
        for rain_status in df.rain.unique():
            # Number of unique dates with the same weekday and rain status
            u = df[(df.weekday == day) & (df.rain == rain_status)].DATEn.nunique()
            if u != 0:
                multiplier = 1.0 / u
            else:
                multiplier = 0
            daily_sum = df[(df.weekday == day) & (df.rain == rain_status)].sum()
            entries_sum = daily_sum.ENTRIESn_hourly
            multiplier_index_list = \
                df[(df.weekday == day) & (df.rain == rain_status)].index
            df.loc[multiplier_index_list, 'ENTRIESn_hourly'] = \
                multiplier * entries_sum

    # Now we have a dataframe which is ready to be utilized for making our
    # plot using the data contained within.
    p = ggplot.ggplot(ggplot.aes(x='factor(weekday)',
                                 weight='ENTRIESn_hourly',
                                 fill='weekday'),
                      data=df) + \
        ggplot.geom_bar() + \
        ggplot.facet_grid(x='rain', y='weekday') + \
        ggplot.ggtitle('Average Ridership on Sunny & Rainy ISO Weekdays')
    print(p)
    return p
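# Assumed usage: load the sampled turnstile/weather CSV referenced in the
# docstring (local filename is an assumption) and render the faceted bar
# chart. The function above expects `datetime`, `ggplot`, and pandas to be
# imported at module level.
import pandas as pd

turnstile_weather = pd.read_csv('turnstile_data_master_with_weather.csv')
plot_weather_data(turnstile_weather)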
import ggplot as gg
import ultrasignup as us
import numpy as np

d = us.event_results(299)

p1 = gg.ggplot(gg.aes(x='time_hour', fill='gender'),
               d[(d.distance == '50K') & (d.time_hour > 1.0)]) + \
    gg.facet_grid(x='gender') + \
    gg.geom_bar(stat="bin", binwidth=.5, position="dodge", colour="black") + \
    gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
    gg.ggtitle("50K Finishing Times for All Years")

p2 = gg.ggplot(gg.aes(x='time_hour', fill='gender'),
               d[(d.distance == '11 Miler') & (d.time_hour > 1.0)]) + \
    gg.facet_grid(x='gender') + \
    gg.geom_bar(stat="bin", binwidth=.5, position="dodge", colour="black") + \
    gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
    gg.ggtitle("11M Finishing Times for All Years")
}, index=range(t * len(count_tops),
               t * len(count_tops) + len(count_tops)))
probs_list.append(probs_t)

# Calculate KL divergences
kl_mle_list.append(stats.entropy(true_bins_t, mle_probs_vals))
kl_nn_list.append(stats.entropy(true_bins_t, nn_probs_t))

probs = pd.concat(probs_list)

# In[44]:

probs_tail = probs[probs.Tenor > 360]
gg.ggplot(probs_tail, gg.aes(x='Count Top', weight='Probs True')) + \
    gg.facet_grid('Tenor') + \
    gg.geom_bar() + \
    gg.geom_step(gg.aes(y='Probs MLE', color='red')) + \
    gg.geom_step(gg.aes(y='Probs NN', color='blue')) + \
    gg.scale_x_continuous(limits=(0, len(count_tops)))

# In[57]:

# KL divergences
kl_df = pd.DataFrame({
    'Tenor': range(0, t_end + 1),
    'KL MLE': kl_mle_list,
    'KL NN': kl_nn_list
})
print(kl_df.head())
    'Pi': windowPi(sorted(list(set(vcfdf['window']))))})

# Now try and plot graph
p_MaxMinor = gg.ggplot(gg.aes('window', 'MaxMinor'), data=windowed_df) + \
    gg.geom_point() + \
    gg.theme_bw() + \
    gg.labs(x="Genome Position (bp; windowsize=" + str(windowsize) + ")",
            y="Minor Variant Frequency (%)") + \
    gg.ggtitle(vcfoutput + "\n Valid Minor Variant Sites :" + str(len(minorvar)))

# Plot nucleotide diversity (Pi) along genome
p_pi = gg.ggplot(gg.aes('window', 'Pi'), data=windowed_df) + \
    gg.geom_point() + \
    gg.theme_bw() + \
    gg.labs(x="Genome Position (bp; windowsize=" + str(windowsize) + ")",
            y="Mean nucleotide diversity (" + u"\u03c0" + ")") + \
    gg.scale_y_continuous(expand=(0, 0),
                          limits=(0, windowed_df['Pi'].max(axis=0) + 0.001)) + \
    gg.ggtitle(vcfoutput + "\n Genome-wide Mean Nucleotide Diversity (" +
               u"\u03c0" + ") :" + str(round(gw_Pi, 6)))
# p_pi

# Faceted plot (y-axis labels still not sorted out)
windowed_df_melt = pd.melt(windowed_df, id_vars=['window'])
p_combi = gg.ggplot(gg.aes('window', 'value', colour='variable'),
                    data=windowed_df_melt)
# Colour already comes from the aes mapping above, so geom_point() takes no
# literal colour argument here.
p_combi = p_combi + gg.geom_point() + \
    gg.facet_grid('variable', scales='free_y') + \
    gg.theme_bw() + \
    gg.labs(x="Genome Position (bp; windowsize=" + str(windowsize) + ")")

# Print graphs to .png
p_combi.save(vcfinput + ".MinorVar_combo.png")
p_MaxMinor.save(vcfinput + ".MinorVar.png")
p_pi.save(vcfinput + ".Pi-diversity.png")

# Print full dataframe and minor vars only to separate tab-delimited files
vcfdf.to_csv(vcfinput + ".analysed.tsv", sep='\t', index=False)
minorvar.to_csv(vcfinput + ".minorvars.tsv", sep='\t', index=False)
def density(self, inp1, inp2, inp3):
    # inp1: numeric column to estimate the density of (e.g. 'Fare')
    # inp2: column mapped to colour/fill (e.g. 'Sex')
    # inp3: column to facet on (e.g. 'Survived')
    return gg.ggplot(self.data, gg.aes(x=inp1, color=inp2, fill=inp2)) + \
        gg.geom_density(alpha=0.5, size=5) + \
        gg.facet_grid(inp3) + \
        gg.ggtitle('Density of Fare by Sex and Survival Status') + \
        gg.ylab('Survival Status')
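# Hypothetical call, assuming self.data is a Titanic-style DataFrame with
# 'Fare', 'Sex', and 'Survived' columns (the hard-coded title suggests so).
# `explorer` stands in for an instance of the owning class.
p = explorer.density('Fare', 'Sex', 'Survived')
print(p)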
def test_ndim_1_facet_grid_col(self):
    p = gg.ggplot(gg.aes(x='price'), gg.diamonds) + \
        gg.facet_grid(None, 'clarity')
    nrow, ncol = p.facets.nrow, p.facets.ncol
    self.assertEqual(nrow, 1)
    self.assertEqual(ncol, 8)
def test_ndim_2facet_grid_reverse(self):
    p = gg.ggplot(gg.aes(x='price'), gg.diamonds) + \
        gg.facet_grid('clarity', 'cut')
    nrow, ncol = p.facets.nrow, p.facets.ncol
    self.assertEqual(nrow, 8)
    self.assertEqual(ncol, 5)
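# A quick check of the convention the two tests above assert: facet_grid's
# first argument supplies the rows and the second the columns. In the bundled
# diamonds sample, cut has 5 levels and clarity has 8.
import ggplot as gg

p = gg.ggplot(gg.aes(x='price'), gg.diamonds) + gg.facet_grid('cut', 'clarity')
print(p.facets.nrow, p.facets.ncol)  # expected: 5 rows (cut), 8 columns (clarity)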