def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    x_lbl = ['Observed Volume' for i in xrange(len(x))]
    xt_lbl = ['Overall Trend' for i in xrange(len(x_trend))]
    xs_lbl = ['Repeat Sending Trend' for i in xrange(len(season))]
    col3 = pd.DataFrame(x_lbl + xt_lbl + xs_lbl)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()
    return p
def plot_update_frequency(result):
    import pandas as pd
    import numpy
    import ggplot

    # turn query results into a weekly time series of changes
    d = []
    v = []
    for res in result:
        d.append(pd.Timestamp(res['_id']['timestamp']).to_datetime())
        v.append(res['count'])

    ts = pd.DataFrame(v, index=d, columns=['changes'])
    ts = ts.resample('W', how='sum')
    ts.index.names = ['date']

    # plot the time series of changes
    p = ggplot.ggplot(ts, ggplot.aes(x=ts.index, y=ts['changes'])) + \
        ggplot.geom_point(color='blue') + \
        ggplot.xlab('Period') + \
        ggplot.ylab('Changes') + \
        ggplot.geom_smooth() + \
        ggplot.ylim(low=0) + \
        ggplot.scale_x_date(breaks=ggplot.date_breaks("12 months"),
                            labels=ggplot.date_format('%Y-%m')) + \
        ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
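# A minimal usage sketch for plot_update_frequency, assuming `result` has the
# MongoDB-style aggregation shape the loop above reads (an '_id' dict with a
# 'timestamp' plus a 'count'); the values below are made up for illustration.
sample_result = [
    {'_id': {'timestamp': '2014-01-06T12:00:00'}, 'count': 42},
    {'_id': {'timestamp': '2014-01-13T09:30:00'}, 'count': 17},
]
print(plot_update_frequency(sample_result))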
def main(file_path):
    # Validate raw data path
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return

    # Validate raw data file type
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    with open(file_path, 'rb') as f:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(f)
        LOG_INFO('Found conversation: {}'.format(conversation['conversation_name']))

        df = pd.DataFrame(conversation['messages'])
        df.columns = ['Timestamp', 'Type', 'Participant']
        # df['Datetime'] = pd.to_datetime(df['Timestamp'])
        df['Datetime'] = df['Timestamp'].apply(
            lambda x: datetime.datetime.fromtimestamp(float(x)).toordinal())

        histogram = ggplot.ggplot(df, ggplot.aes(x='Datetime', fill='Participant')) \
            + ggplot.geom_histogram(alpha=0.6, binwidth=2) \
            + ggplot.scale_x_date(labels='%b %Y') \
            + ggplot.ggtitle(conversation['conversation_name']) \
            + ggplot.ylab('Number of messages') \
            + ggplot.xlab('Date')

        print(histogram)
def plot_line(X,y,title=None,labelx=None,labely=None,save=False, colors=None): ''' Show on screen a line plot. Can save to a .pdf file too if specified. X,y - ''' df = pandas.DataFrame() if (title!=None): img_title = title.replace(" ","").replace(".","-") + ".pdf" df['X'] = X for i in range(y.shape[1]): df[str(i)] = y.iloc[:,i].values if colors is None: colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys()) df = df.iloc[0:df.shape[0]-1, :] p = ggplot(df, aes(x='X')) for i in range(y.shape[1]): if colors not in X.columns.values: p = p + geom_line(aes(y=str(i),color = colors[i])) else: p = p + geom_point(aes(y=str(i),color = colors)) p = p + xlab(labelx) + ylab(labely) + ggtitle(title) if(save): p.save(img_title) else: return p
def density_plot(by='dpsi_zscore', categorical=True): if categorical: data_dict = { 'muts increasing AAA': np.array([x[by] for x in variants['increase']]), 'muts decreasing AAA': np.array([x[by] for x in variants['decrease']]), 'muts not changing AAA length': np.array([x[by] for x in variants['constant']]) } else: data_dict = OrderedDict( (change, np.array( [x[by] for x in variants['all'] if x['change'] == change])) for change in aaa_changes if len([x[by] for x in variants['all'] if x['change'] == change]) > 1) plot = ( ggplot(aes(x='value', colour='variable', fill='variable'), data=prepare_data_frame(data_dict)) + ggtitle('Impact of variants affecting poly AAA sequences on %s' % by) + xlab(by) + ylab('Kernel density estimate') + geom_density(alpha=0.6)) return plot
def render(data, bin_width, plot_density=False): if plot_density: plot = ggplot.ggplot(data, ggplot.aes(x='datetime', color='conversationWithName')) \ + ggplot.geom_density() \ + ggplot.scale_x_date(labels='%b %Y') \ + ggplot.ggtitle('Conversation Densities') \ + ggplot.ylab('Density') \ + ggplot.xlab('Date') else: plot = ggplot.ggplot(data, ggplot.aes(x='datetime', fill='conversationWithName')) \ + ggplot.geom_histogram(alpha=0.6, binwidth=bin_width) \ + ggplot.scale_x_date(labels='%b %Y', breaks='6 months') \ + ggplot.ggtitle('Message Breakdown') \ + ggplot.ylab('Number of Messages') \ + ggplot.xlab('Date') print(plot)
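# A minimal usage sketch for render(), assuming `data` is a pandas DataFrame with the
# 'datetime' and 'conversationWithName' columns the aesthetics above refer to; the rows
# below are made-up placeholders.
import pandas as pd
sample = pd.DataFrame({
    'datetime': pd.to_datetime(['2016-01-02', '2016-02-14', '2016-02-15']),
    'conversationWithName': ['Alice', 'Alice', 'Bob'],
})
render(sample, bin_width=30)                      # histogram of message counts
render(sample, bin_width=30, plot_density=True)   # kernel density instead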
def plot_roc(self, experiment_type, to_plot): # turn this to string for categorical colour scheme to_plot.loc[:, "parameter"] = [str(par) for par in to_plot.loc[:, "parameter"]] p = gg.ggplot(data = to_plot, aesthetics = gg.aes(x = "FPR", y = "TPR", colour = "parameter")) + \ gg.geom_line(gg.aes(x = "FPR", y = "TPR", colour = "parameter")) + \ gg.ggtitle(experiment_type) + gg.xlab("FPR") + gg.ylab("TPR") gg.ggsave(filename = self.results_path + experiment_type + "_" + self.mode + ".png", plot = p) return
def plotAverageLatency(self):
    averages = [d.averageLatency() for d in self.data]
    dat = {"device": range(1, len(averages) + 1), "average": averages}
    dataframe = pandas.DataFrame(dat)

    chart = ggplot.ggplot(ggplot.aes(x="device", weight="average"), dataframe) \
        + ggplot.labs(title="Average Latency Per Device") + \
        ggplot.ylab("Average Latency (ms)") + \
        ggplot.xlab("Device Number") + \
        ggplot.geom_bar(stat="identity")
    chart.show()
def plot_weather_data(df):
    df.DATEn = pd.to_datetime(df.DATEn)
    grouped = df.groupby('DATEn', as_index=False).sum()
    grouped.index.name = 'DATEn'

    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly'))
    plot += gp.geom_line()
    plot += gp.ggtitle('Subway Ridership by Day')
    plot += gp.xlab('Date')
    plot += gp.ylab('Exits')
    return plot
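# A minimal usage sketch for plot_weather_data, assuming `df` carries the turnstile-style
# 'DATEn' and 'EXITSn_hourly' columns referenced above; the values here are invented.
import pandas as pd
sample_df = pd.DataFrame({
    'DATEn': ['2011-05-01', '2011-05-01', '2011-05-02'],
    'EXITSn_hourly': [1200, 800, 950],
})
print(plot_weather_data(sample_df))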
def lineplot(hr_year_csv):
    df = pandas.read_csv(hr_year_csv)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR")) +
        gp.geom_point(color="red") +
        gp.geom_line(color="red") +
        gp.ggtitle("Homeruns by Year") +
        gp.xlab("Year") +       # x axis is yearID
        gp.ylab("Homeruns")     # y axis is HR
    )
    return gg
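# A minimal usage sketch, assuming a CSV with 'yearID' and 'HR' columns such as the
# hr_year.csv sample referenced later in this collection; the path is hypothetical.
p = lineplot("hr_year.csv")
print(p)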
def lineplot_compare(filename):
    df = pd.read_csv(filename)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID")) +
        gp.geom_point() +
        gp.geom_line() +
        gp.ggtitle("Homeruns by Year by Team") +
        gp.xlab("Year") +       # x axis is yearID
        gp.ylab("Homeruns")     # y axis is HR
    )
    return gg
def visualize_segmentation(X, var): ''' Prints with ggplot a visualization of the different segments. ''' aux = pandas.DataFrame(index = X.index) aux['fecha'] = X.index.values aux[var] = X[var] aux['Segmento'] = X['segmento'].astype(str) return ggplot(aes(x="fecha", y=var, color="Segmento"), aux) + geom_point() + xlab("Fecha") + ylab(var) + ggtitle("Segmentacion de la variable \"" + var + "\"") + theme(axis_text_x = element_text(color=[0,0,0,0]))
def visualize_clusters(X, var, color = 'cluster'): ''' Prints with ggplot a visualization of the different clusters. ''' aux = pandas.DataFrame() aux['fecha'] = X.index aux.index = X.index aux[var] = X[var] aux['Cluster'] = X[color] return ggplot(aes(x='fecha', y=var, color='Cluster'), aux) + geom_point() + xlab(var) + ylab("Valor") + ggtitle("Clustering de la variable \"" + var + "\"") + theme(axis_text_x = element_text(color=[0,0,0,0]))
def plot(mydata, opts): # number of mutants killed by exactly 0 tests nd = sum(mydata[mydata.ntests == 0].exactly) d = sum(mydata[mydata.ntests != 0].exactly) total = nd + d print("Not detected = ", nd, "/", total) title = opts['title'] + (' ND=%d/%d (Mu: %3.1f%%)' % (nd, total, (1 - nd / total) * 100.0)) p = gg.ggplot(gg.aes(x=opts['x'], y=opts['y']), data=mydata) + gg.geom_point() +\ gg.xlab(opts['x']) + gg.ylab(opts['y']) + gg.ggtitle(title) #+ \ # gg.xlim(0,lim) p.save(opts['file'])
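# A minimal usage sketch for plot(), assuming `mydata` has the 'ntests' and 'exactly'
# columns used above and `opts` carries the keys the function reads ('title', 'x', 'y',
# 'file'); all values below are illustrative placeholders.
import pandas as pd
mydata = pd.DataFrame({'ntests': [0, 1, 2, 3], 'exactly': [5, 12, 7, 3]})
opts = {'title': 'Mutants killed', 'x': 'ntests', 'y': 'exactly', 'file': 'mutants.png'}
plot(mydata, opts)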
def lineplot_compare(filename):  # Cleaner version with string vars
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    p_xlab = "Year"      # x axis is yearID
    p_ylab = "Homeruns"  # y axis is HR
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID")) +
        gp.geom_point() +
        gp.geom_line() +
        gp.ggtitle(p_title) +
        gp.xlab(p_xlab) +
        gp.ylab(p_ylab)
    )
    return gg
def googletrend_command(delta_t, threshold=0.0, inverse=False): """the command to run google trend algorithm. :param delta_t: the upper bound for original delta_t parameter :param threshold: upper bound for the threshold of differentiating two classes :param inverse: whether to inverse the classifier """ ## handle filepath and title based on parameter inverse filename = "googletrend" titlename = "ROC of google trend classifier" if inverse: filename += "_inverse" titlename += " (inverse version)" filepath = "./plots/%s.jpg" % filename ## generate data first data = googletrend.preprocess() ## store classifier evaluation metrics into dict output = {} output['tpr'] = [] output['fpr'] = [] output['plot'] = [] for thre in np.arange(0, threshold + 0.1, 0.1): print "==> threshold: %f, inverse: %s" % (thre, inverse) for i in xrange(1, int(delta_t)): googletrend.algorithm(data, i, thre, inverse) tp_rate, fp_rate = googletrend.evaluate(data) # print "delta_t: %d, TPR: %f, FPR: %f" % (i, tp_rate, fp_rate) output['tpr'].append(tp_rate) output['fpr'].append(fp_rate) output['plot'].append('thre_' + str(thre)) ## plot ROC graph ## add a y=x baseline for comparison output['tpr'].extend([0.0, 1.0]) output['fpr'].extend([0.0, 1.0]) output['plot'].extend(['baseline', 'baseline']) df = pd.DataFrame(output) graph = gg.ggplot(df, gg.aes('fpr', 'tpr', color='plot')) + \ gg.theme_seaborn() + \ gg.ggtitle(titlename) + \ gg.xlab("FPR") + \ gg.ylab("TPR") + \ gg.xlim(0.0, 1.0) + \ gg.ylim(0.0, 1.0) + \ gg.geom_point() + \ gg.geom_line() gg.ggsave(plot=graph, filename=filepath, width=6, height=6, dpi=100)
def data_output(data, chart_title):
    print "Good News! Your data has been returned. I'm happy to show it to you."
    print "Just tell me how you want it - Table or Line Graph?"
    choice = raw_input("Choose table or line > ")
    if choice[0].lower() == "t":
        print "Ok, here's your data."
        print data
    elif choice[0].lower() == "l" or choice[0].lower() == "g":
        import ggplot as gg
        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value") + \
            gg.scale_x_date(breaks=gg.date_breaks('1 month'),
                            labels=gg.date_format("%B"))
        print (plot + gg.theme_xkcd())
def lineplot(hr_year_csv):
    # Assume that we have a pandas dataframe file called hr_year,
    # which contains two columns -- yearID, and HR.
    #
    # The pandas dataframe contains the number of HR hit in
    # Major League Baseball in each year. Can you write a function,
    # lineplot, that creates a chart with points connected by lines, both
    # colored 'red', showing the number of HR by year?
    #
    # You can check out the data loaded into the dataframe at the link below:
    # https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    # your code here
    df = pd.read_csv('hr_year.csv')
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')
    return gg
) ) group by pod_id_location """) qry_job = bqclient.query(qry_str, location='EU', job_config=job_config) #save result as dataframe df = qry_job.to_dataframe() df_long = df.melt(id_vars=['pod_str', 'pod_idx'], value_vars=['p05', 'p25', 'med', 'p75', 'p95'], var_name='yparam', value_name='value') #plots #plt1 = gg.ggplot(df, gg.aes(x='date_UTC',y='no2_ppb'))+gg.geom_line()+gg.xlab('Time')+gg.ylab('NO2 (ppb)')+gg.theme_bw()+gg.facet_wrap('pod_id_location',scales='free_y') #plt1.save(filename = r'.\charts\ulezpodts.png', width=None, height=None, dpi=200) plt2 = gg.ggplot(df_long, gg.aes( x='pod_str', y='value', color='yparam')) + gg.geom_point() + gg.xlab( 'pod') + gg.ylab('NO2 (as % of median)') + gg.theme_bw() + gg.theme( figure_size=(12, 6)) + gg.scale_x_discrete() plt2.save(filename=r'.\charts\ulezpodvar.png', width=10, height=6, dpi=200) #repeat for mobile data using segid instead of podid where N = 10 and N = 40 #repeat for stationary data at mobile times qry_str = (""" with cte0 as ( --all data, ULEZ pods with 6000 hrs select date_UTC, a.pod_id_location, no2_ppb from AQMesh.NO2_scaled_hightimeres_ppb_20180901_20190630 a join AQMesh.NO2_site_metadata_v2_1_20180901_20190630 b on a.pod_id_location=b.pod_id_location where ULEZ = true and no2_ppb <> -999 and a.pod_id_location in --limit to pods with at least 6000 hours
import pandas as pd import numpy as np # from source import view_and_print_output import ggplot as gg df = pd.DataFrame() for num_layers, num_nodes in [(2, 50), (2, 100), (2, 150), (2, 200), (4, 50), (4, 100), (4, 150), (4, 200)]: file_coarse = '../../data/coarse_lambda_dropout_' + str(num_layers) + '_' + str(num_nodes) + '.txt' newdata = pd.read_csv(file_coarse) newdata = newdata.sort_values(by='validation error', ascending=True) newdata['lambda'] = np.log10(newdata['lambda']) newdata['index'] = (np.arange(len(newdata), dtype='float')/len(newdata))**3 newdata['config'] = str(num_layers * 100 + num_nodes) + ' ' + str(num_layers) + ' ' + str(num_nodes) df = df.append(newdata) print(df.sort_values(by='validation error', ascending=False).head(20)) p = gg.ggplot(gg.aes(x='lambda', y='dropout prob', color='index'), data=df) + \ gg.geom_point() + \ gg.xlab('lambda') + \ gg.ylab('dropout prob') + \ gg.scale_x_continuous(limits=(-5, 2)) + \ gg.facet_wrap('config') print(p) # Conclusion: ignore dropout
ax.set_xlabel("Hour (0 is midnight, 12 is noon, 23 is 11pm)") ax.set_xlim(0, 23) turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]] turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining") turnstile_rain.groupby("rain2").describe() turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]] turnstile_rain["ENTRIESn_hourly_log10"] = np.log10(turnstile_rain["ENTRIESn_hourly"] + 1) turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining") set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \ gg.geom_density() + \ gg.facet_wrap("rain2", scales="fixed") + \ gg.scale_colour_manual(values=set1) + \ gg.xlab("log10(entries per hour)") + \ gg.ylab("Number of turnstiles") + \ gg.ggtitle("Entries per hour whilst raining and not raining") plot np.random.seed(42) data = pd.Series(np.random.normal(loc=180, scale=40, size=600)) data.hist() p = turnstile_weather["ENTRIESn_hourly"].hist() pylab.suptitle("Entries per hour across all stations") pylab.xlabel("Entries per hour") pylab.ylabel("Number of occurrences") turnstile_weather["grp"]=turnstile_weather["rain"]+turnstile_weather["fog"] plot=ggplot(aes(y='ENTRIESn_hourly',x='Hour'), data=turnstile_weather)+geom_histogram()+xlab("Hour")+ylab("ENTRIESn_hourly")+ggtitle("T")
#total-based dftmp = df[['n_sub']+brks[:5]].melt(id_vars=['n_sub'],value_vars=brks[:5], var_name = 'stat',value_name = 'value') dftmp['method']=['(Total-Expected Total)/Expected Total']*dftmp['n_sub'].size df_stacked = dftmp #enhancement-based dftmp = df[['n_sub']+brks[5:10]].melt(id_vars=['n_sub'],value_vars=brks[5:10], var_name = 'stat',value_name = 'value') dftmp['method']=['(Enhanc-Expected Enhanc)/Expected Enhanc']*dftmp['n_sub'].size df_stacked = df_stacked.append(dftmp) #enhancements + full sample background dftmp = df[['n_sub']+brks[10:]].melt(id_vars=['n_sub'],value_vars=brks[10:], var_name = 'stat',value_name = 'value') dftmp['method']=['(Enhanc+Expected Backgr-Expected Total)/Expected Total']*dftmp['n_sub'].size df_stacked = df_stacked.append(dftmp) df_stacked['percentile']=['{0}th%'.format(a[1:3]) for a in df_stacked['stat']] #plots #compare all 3 plt1 = gg.ggplot(df_stacked, gg.aes(x='n_sub',y='value',color='percentile'))+gg.geom_line()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.theme_bw()+gg.scale_color_manual(values=colors)+gg.geom_hline(y=[-25,25],linetype="dashed",color="gray")+gg.geom_vline(x=[10,15],linetype="dashed",color="gray")+gg.facet_wrap('method')+gg.ggtitle('Bias comparison {0}'.format(title)) plt1.save(filename = r'..\charts\drivebias_laqn_{0}.png'.format(species), width=None, height=None, dpi=300) #plot total alone for presenation plt2 = gg.ggplot(df_stacked[df_stacked['method']=='(Total-Expected Total)/Expected Total'], gg.aes(x='n_sub',y='value',color='percentile'))+gg.geom_line()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.ylim(-100,100)+gg.scale_color_manual(values=colors)+gg.geom_hline(y=[-25,25],linetype="dashed",color="gray")+gg.geom_vline(x=[10,15],linetype="dashed",color="gray")+gg.ggtitle('Bias comparison {0}'.format(title)) t = gg.theme_bw() t._rcParams['font.size']=16 plt2 = plt2+t plt2.save(filename = r'..\charts\drivebias_laqn_{0}_total.png'.format(species), width=None, height=None, dpi=300) #plot enhancement alone for presenation plt3 = gg.ggplot(df_stacked[df_stacked['method']=='(Enhanc+Expected Backgr-Expected Total)/Expected Total'], gg.aes(x='n_sub',y='value',color='percentile'))+gg.geom_line()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.ylim(-100,100)+gg.scale_color_manual(values=colors)+gg.geom_hline(y=[-25,25],linetype="dashed",color="gray")+gg.geom_vline(x=[10,15],linetype="dashed",color="gray")+gg.ggtitle('Bias comparison {0}'.format(title)) t = gg.theme_bw() t._rcParams['font.size']=16 plt3 = plt3+t plt3.save(filename = r'..\charts\drivebias_laqn_{0}_enhanc.png'.format(species), width=None, height=None, dpi=300)
cov = [(0.2**2, -0.064 / 2), (-0.064 / 2, 0.2**2)] data = pd.DataFrame() data['x'] = array(sorted(append([-3, 0, 3], linspace(-10, 10, 997)))) # for c) # Generate analytical data via uncertainties a_analytical = correlated_values((1, 1), cov) data['Analytical Nom'] = unp.nominal_values(f(data['x'], a_analytical)) data['Analytical Std'] = unp.std_devs(f(data['x'], a_analytical)) # Monte Carlo samples = 10000 a_mc = random.multivariate_normal((1, 1), cov, samples) # a plot visualizing the 2d normal distribution plot = gg.qplot(a_mc[:, 0], a_mc[:, 1]) + gg.xlab('a0') + gg.ylab('a1') plot.save("fig/4b-a.pdf") def std_dev_mc(x_array): return_value = [] for x in x_array: values = [f(x, a) for a in a_mc] nominal = mean(values) std_d = std(values, ddof=1) return_value.append(ufloat(nominal, std_d)) return return_value noms_with_stds_mc = smd.parallel_slice( std_dev_mc, data['x']) # std_dev_mc(list(data['x']))
import ggplot as gg import ultrasignup as us import numpy as np d = us.event_results(299) p1 = gg.ggplot( gg.aes(x='time_hour',fill='gender'),d[(d.distance=='50K')&(d.time_hour>1.0)]) + \ gg.facet_grid(x='gender') + \ gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \ gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \ gg.ggtitle("50K Finishing Times for All Years") p2 = gg.ggplot( gg.aes(x='time_hour',fill='gender'),d[(d.distance=='11 Miler')&(d.time_hour>1.0)]) + \ gg.facet_grid(x='gender') + \ gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \ gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \ gg.ggtitle("11M Finishing Times for All Years")
#Testing results = [] for m in mvals: results.append(test_approx_nn(method = "hashing", traindata=docdata, testdata = testdata, m=m, alpha=1)) for alpha in avals: results.append(test_approx_nn(method = "kdtree" , traindata=docdata, testdata = testdata, m=1, alpha=alpha)) #save results to results folder, with plot and printing to screen. metadata = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode) f = open("results/LSH_vs_KDT_%s.pkl" % metadata, mode = 'w') pkl.dump(obj=results, file=f) logtimes = [math.log(r.avg_time, 2) for r in results] distances = [r.avg_distance for r in results] methods = [r.method[0:3] for r in results] alpha = [r.alpha for r in results] m = [r.m for r in results] results_df = pd.DataFrame(data = {"logtimes" : logtimes, "distances" : distances, "methods" : methods, "m":m, "alpha": alpha}) print results_df p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes", y = "distances", label = "methods")) + \ gg.geom_text() + \ gg.ggtitle("LSH and KD trees: tradeoffs") + \ gg.xlab("Log2 average query time ") + gg.ylab("Average L2 distance from query point)") gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot = p)
def quarterly_queries(keywords, category, cookies, session, domain, throttle, filing_date, ggplot, month_offset=[-12, 12], trends_url=DEFAULT_TRENDS_URL): """Gets interest data (quarterly) for the 12 months before and 12 months after specified date, then gets interest data for the whole period and merges this data. month_offset: [no. month back, no. months forward] to query Returns daily data over the period. """ aw_range = arrow.Arrow.range begin_period = aget(filing_date).replace(months=month_offset[0]) ended_period = aget(filing_date).replace(months=month_offset[1]) # Set up date ranges to iterate queries across start_range = aw_range('month', YYYY_MM(begin_period), YYYY_MM(ended_period)) ended_range = aw_range('month', YYYY_MM(begin_period).replace(months=3), YYYY_MM(ended_period).replace(months=3)) start_range = [r.datetime for r in start_range][::3] ended_range = [r.datetime for r in ended_range][::3] # Fix last date if incomplete quarter (offset -1 week from today) last_week = arrow.utcnow().replace(weeks=-1).datetime start_range = [d for d in start_range if d < last_week] ended_range = [d for d in ended_range if d < last_week] if len(ended_range) < len(start_range): ended_range += [last_week] # Iterate attention queries through each quarter all_data = [] missing_queries = [] # use this to scale IoT later. for start, end in zip(start_range, ended_range): if start > last_week: break print("Querying period: {s} ~ {e}".format(s=start.date(), e=end.date())) throttle_rate(throttle) response_args = {'url': trends_url.format(domain=domain), 'params': _query_parameters(start, end, keywords, category), 'cookies': cookies, 'session': session} query_data = _check_data(keywords, _process_response( _get_response(**response_args))) if all(int(vals)==0 for date,vals in query_data): query_data = [[date, '0'] for date in arrow.Arrow.range('day', start, end)] missing_queries.append('missing') elif len(query_data[0][0]) > 10: missing_queries.append('weekly') else: missing_queries.append('daily') try: if not aligned_weekly(query_data, all_data): ## Workaround: shift filing date q1 = weekly_date(all_data[-1][-1][0]) q2 = weekly_date(query_data[0][0]) if q1 < q2: start = arrow.get(start).replace(months=-1) response_args['params'] = _query_parameters(start, end, keywords, category) ## Do a new 4month query, overlap/replace previous month. query_data = _check_data(keywords, _process_response( _get_response(**response_args))) if all_data[:-1] != []: q2 = weekly_date(query_data[0][0], 'start') all_data[-1] = [d for d in all_data[-1] if q2 > weekly_date(d[0])] elif q1 >= q2: # if q1 > 1st date in query_data, remove the first few entries query_data = [d for d in query_data if q1 < weekly_date(d[0])] except IndexError: pass except: from IPython import embed; embed() finally: all_data.append(query_data) # Get overall long-term trend data across entire queried period s = begin_period.replace(weeks=-2).datetime e1 = arrow.get(ended_range[-1]).replace(months=+1).datetime e2 = arrow.utcnow().replace(weeks=-1).datetime e = min(e1,e2) print("\n=> Merging with overall period: {s} ~ {e}".format(s=s.date(), e=e.date())) response_args = { 'url': trends_url.format(domain=domain), 'params': _query_parameters(s, e, keywords, category), 'cookies': cookies, 'session': session } query_data = _check_data(keywords, _process_response( _get_response(**response_args))) if len(query_data) > 1: # compute changes in IoI (interest over time) per quarter # and merged quarters together after interpolating data # with daily data. 
# We cannot mix quarters as Google normalizes each query all_ioi_delta = [] qdat_interp = [] for quarter_data in all_data: if quarter_data != []: quarter_data = [x for x in quarter_data if x[1] != ''] all_ioi_delta += list(zip(*change_in_ioi(*zip(*quarter_data)))) if ggplot: qdat_interp += interpolate_ioi(*zip(*quarter_data))[1] # for plotting only qdate = [date for date, delta_ioi in all_ioi_delta] delta_ioi = [delta_ioi for date, delta_ioi in all_ioi_delta] ydate = [date[-10:] if len(date) > 10 else date for date, ioi in query_data] try: yIoI = [float(ioi) for date, ioi in query_data] except: # from IPython import embed; embed() yIoI = [float(ioi) for date, ioi in query_data[:-1]] ydate, yIoI = interpolate_ioi(ydate, yIoI) # match quarterly and yearly dates and get correct delta IoI # common_date = [x for x in ydate+qdate if x in ydate and x in qdate] common_date = sorted(set(ydate) & set(qdate)) delta_ioi = [delta_ioi for date,delta_ioi in zip(qdate, delta_ioi) if date in common_date] y_ioi = [y for x,y in zip(ydate, yIoI) if x in common_date] # calculate daily %change in IoI and adjust weekly values adj_IoI = [ioi*mult for ioi,mult in zip(y_ioi, delta_ioi)] adj_all_data = [[str(date.date()), round(ioi, 2)] for date,ioi in zip(common_date, adj_IoI)] else: adj_all_data = [[str(date.date()), int(zero)] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data,[]))))] # from IPython import embed; embed() heading = ["Date", keywords[0].title] querycounts = list(zip((d.date() for d in start_range), missing_queries)) keywords[0].querycounts = querycounts if not ggplot: return [heading] + adj_all_data ## GGplot Only else: # GGPLOT MERGED GTRENDS PLOTS: import pandas as pd from ggplot import ggplot, geom_line, ggtitle, ggsave, scale_colour_manual, ylab, xlab, aes try: ydat = pd.DataFrame(list(zip(common_date, y_ioi)), columns=["Date", 'Weekly series']) mdat = pd.DataFrame(list(zip(common_date, adj_IoI)), columns=['Date', 'Merged series']) qdat = pd.DataFrame(list(zip(common_date, qdat_interp)), columns=['Date', 'Daily series']) ddat = ydat.merge(mdat, on='Date').merge(qdat,on='Date') ddat['Date'] = list(map(pd.to_datetime, ddat['Date'])) ydat['Date'] = list(map(pd.to_datetime, ydat['Date'])) mdat['Date'] = list(map(pd.to_datetime, mdat['Date'])) qdat['Date'] = list(map(pd.to_datetime, qdat['Date'])) except UnboundLocalError as e: raise(UnboundLocalError("No Interest-over-time to plot")) # meltkeys = ['Date','Weekly series','Merged series','Daily series'] # melt = pd.melt(ddat[meltkeys], id_vars='Date') colors = [ '#77bde0', # blue '#b47bc6', # purple '#d55f5f' # red ] entity_type = keywords[0].desc g = ggplot(aes(x='Date', y='Daily series' ), data=ddat) + \ geom_line(aes(x='Date', y='Daily series'), data=qdat, alpha=0.5, color=colors[0]) + \ geom_line(aes(x='Date', y='Merged series'), data=mdat, alpha=0.9, color=colors[1]) + \ geom_line(aes(x='Date', y='Weekly series'), data=ydat, alpha=0.5, color=colors[2], size=1.5) + \ ggtitle("Interest over time for '{}' ({})".format(keywords[0].keyword, entity_type)) + \ ylab("Interest Over Time") + xlab("Date") # from IPython import embed; embed() print(g) # ggsave(BASEDIR + "/iot_{}.png".format(keywords[0].keyword), width=15, height=5) return [heading] + adj_all_data
def test_xlab(self):
    p = gg.ggplot(gg.aes(x='mpg'), gg.mtcars) + gg.geom_histogram() + gg.xlab("TEST")
    self.assertEqual(p.xlab, "TEST")
def plot_weather_data(df):  # older version
    df.DATEn = pd.to_datetime(df.DATEn)
    grouped = df.groupby('DATEn', as_index=False).sum()
    grouped.index.name = 'DATEn'

    p_title = 'Subway Ridership by Day'
    p_xlab = 'Date'
    p_ylab = 'Subway Exits'  # y aesthetic is EXITSn_hourly

    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly')) + \
        gp.geom_line() + gp.ggtitle(p_title) + gp.xlab(p_xlab) + gp.ylab(p_ylab)
    return plot
def plot_vol(dates, x, cp, my_domain): # -------------------- Prepare for Plotting -------------------------- # # Prepare DataFrame objects for graphing #Add a column for the label to show in the legend in the graph #Need to reshape it, from (124,) to (124,1) for exmple, so that it #will concatenate. This gives a df with [date, vol_data, 'Volume'] v = ['Volume' for i in xrange(x.shape[0])] #df_domain = np.concatenate((x, v), axis=1) ndf_vol = np.transpose(np.array([dates, x, v])) df_vol = pd.DataFrame(ndf_vol, columns=['Date', 'Volume', 'Data']) #Create pre-allocated lists for plotting means and cp xmin_list = [0 for i in xrange(len(cp))] #hold lft pt of vol_mean xmax_list = [0 for i in xrange(len(cp))] #hold rt pt of vol_mean yint_list = [0 for i in xrange(len(cp))] #holds vol_means cp_date_list = [0 for i in xrange(len(cp))] #holds date for cp cp_value_list = [0 for i in xrange(len(cp))] #holds cp value ref_idx = 0 #used to keep track of vol_means #collect list data for plotting for i in xrange(len(cp)): cp_idx = cp[i][0] - 1 #-1 b/c 1-indexed (includes cp itself) xmin_list[i] = dates[ref_idx].toordinal() #convert to match ggplot xmax_list[i] = dates[cp_idx].toordinal() #convert to match ggplot yint_list[i] = cp[i][2] #use value from_mean for vol_mean cp_date_list[i] = dates[cp_idx] #date of cp #cp_value_list[i] = x[cp_idx] #value of cp cp_value_list[i] = cp[i][2] ref_idx = cp_idx + 1 #+1 b/c moving to next point #Reform lists into a data frame and attach to df_domains. The first two #lists can be created together since they are both numeric, but if I try #to create all three together all types will be downgraded to strings. #np.concatenate avoids this conversion. The transpose is needed to take #an item from each to form a single row. cp_lbl = ['Change Point' for i in xrange(len(yint_list))] #Need to create a dummy entry to put 'Volume Mean' into legend cp_date_list.append(dates[0]) yint_list.append(x[0]) cp_lbl.append('Volume Mean') ndf_cp = np.transpose(np.array([cp_date_list, yint_list, cp_lbl])) yint_list.pop(-1) cp_date_list.pop(-1) df_cp = pd.DataFrame(ndf_cp, columns=['Date', 'Volume', 'Data']) df_plot = pd.concat((df_vol, df_cp), axis=0) #Need to create a dummy entry to put 'Volume Mean' into legend #dummy = np.array([dates[0], x[0], 'Volume Mean']).reshape(1,-1) #df_cp = np.concatenate( (df_cp, dummy), axis=0) #add to bottom df_cp #df_domain = np.concatenate( (df_domain, df_cp), axis=0 ) #add df_domains #convert final array into a pd.DataFrame for printing and plotting #df_domain = pd.DataFrame(df_domain, columns=['Date','Volume','Data']) #df_domain.to_html(open('out.html','w')) #os.system('sudo cp out.html /usr/local/www/analytics/rwing') margin = 0.10 * (np.max(x) - np.min(x)) p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \ ggplot.geom_line(color='blue',size=2) + \ ggplot.geom_point(x=xmax_list, y=cp_value_list, color='black', \ shape='D', size=50) + \ ggplot.geom_hline(xmin=xmin_list, \ xmax=xmax_list, \ yintercept=yint_list, color="red", size=3) + \ ggplot.scale_x_date(labels = date_format("%Y-%m-%d"), breaks="1 week") + \ ggplot.scale_colour_manual(values = ["black", "blue", "red"]) + \ ggplot.scale_y_continuous(labels='comma') + \ ggplot.ylim(low=np.min(x)-margin/4.0, high=np.max(x)+margin) + \ ggplot.xlab("Week (Marked on Mondays)") + \ ggplot.ylab("Message Vol") + \ ggplot.ggtitle("%s\nMessage Volume by Week" % my_domain) + \ ggplot.theme_seaborn() return p
def quarterly_queries(keywords, category, cookies, session, domain, throttle, filing_date, ggplot, month_offset=[-12, 12], trends_url=DEFAULT_TRENDS_URL): """Gets interest data (quarterly) for the 12 months before and 12 months after specified date, then gets interest data for the whole period and merges this data. month_offset: [no. month back, no. months forward] to query Returns daily data over the period. """ aw_range = arrow.Arrow.range begin_period = aget(filing_date).replace(months=month_offset[0]) ended_period = aget(filing_date).replace(months=month_offset[1]) # Set up date ranges to iterate queries across start_range = aw_range('month', YYYY_MM(begin_period), YYYY_MM(ended_period)) ended_range = aw_range('month', YYYY_MM(begin_period).replace(months=3), YYYY_MM(ended_period).replace(months=3)) start_range = [r.datetime for r in start_range][::3] ended_range = [r.datetime for r in ended_range][::3] # Fix last date if incomplete quarter (offset -1 week from today) last_week = arrow.utcnow().replace(weeks=-1).datetime start_range = [d for d in start_range if d < last_week] ended_range = [d for d in ended_range if d < last_week] if len(ended_range) < len(start_range): ended_range += [last_week] # Iterate attention queries through each quarter all_data = [] missing_queries = [] # use this to scale IoT later. for start, end in zip(start_range, ended_range): if start > last_week: break print("Querying period: {s} ~ {e}".format(s=start.date(), e=end.date())) throttle_rate(throttle) response_args = { 'url': trends_url.format(domain=domain), 'params': _query_parameters(start, end, keywords, category), 'cookies': cookies, 'session': session } query_data = _check_data( keywords, _process_response(_get_response(**response_args))) # from IPython import embed; embed() if query_data[1] == '': query_data = [[date, '0'] for date in arrow.Arrow.range('day', start, end)] missing_queries.append('missing') if all(int(vals) == 0 for date, vals in query_data): query_data = [[date, '0'] for date in arrow.Arrow.range('day', start, end)] missing_queries.append('missing') elif len(query_data[0][0]) > 10: missing_queries.append('weekly') else: missing_queries.append('daily') try: if not aligned_weekly(query_data, all_data): ## Workaround: shift filing date q1 = weekly_date(all_data[-1][-1][0]) q2 = weekly_date(query_data[0][0]) if q1 < q2: start = arrow.get(start).replace(months=-1) response_args['params'] = _query_parameters( start, end, keywords, category) ## Do a new 4month query, overlap/replace previous month. 
query_data = _check_data( keywords, _process_response(_get_response(**response_args))) if all_data[:-1] != []: q2 = weekly_date(query_data[0][0], 'start') all_data[-1] = [ d for d in all_data[-1] if q2 > weekly_date(d[0]) ] elif q1 >= q2: # if q1 > 1st date in query_data, remove the first few entries query_data = [ d for d in query_data if q1 < weekly_date(d[0]) ] except IndexError: pass except: from IPython import embed embed() finally: all_data.append(query_data) # Get overall long-term trend data across entire queried period s = begin_period.replace(weeks=-2).datetime e1 = arrow.get(ended_range[-1]).replace(months=+1).datetime e2 = arrow.utcnow().replace(weeks=-1).datetime e = min(e1, e2) print("\n=> Merging with overall period: {s} ~ {e}".format(s=s.date(), e=e.date())) response_args = { 'url': trends_url.format(domain=domain), 'params': _query_parameters(s, e, keywords, category), 'cookies': cookies, 'session': session } query_data = _check_data(keywords, _process_response(_get_response(**response_args))) if query_data[1] == '': adj_all_data = [[ str(date.date()), int(zero) ] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data, []))))] elif len(query_data) > 1: # compute changes in IoI (interest over time) per quarter # and merged quarters together after interpolating data # with daily data. # We cannot mix quarters as Google normalizes each query all_ioi_delta = [] qdat_interp = [] for quarter_data in all_data: if quarter_data != []: quarter_data = [x for x in quarter_data if x[1] != ''] all_ioi_delta += list(zip(*change_in_ioi(*zip(*quarter_data)))) if ggplot: qdat_interp += interpolate_ioi(*zip(*quarter_data))[1] # for plotting only qdate = [date for date, delta_ioi in all_ioi_delta] delta_ioi = [delta_ioi for date, delta_ioi in all_ioi_delta] try: ydate = [ date[-10:] if len(date) > 10 else date for date, ioi in query_data ] yIoI = [float(ioi) for date, ioi in query_data] except: from IPython import embed embed() yIoI = [float(ioi) for date, ioi in query_data[:-1]] ydate, yIoI = interpolate_ioi(ydate, yIoI) # match quarterly and yearly dates and get correct delta IoI # common_date = [x for x in ydate+qdate if x in ydate and x in qdate] common_date = sorted(set(ydate) & set(qdate)) delta_ioi = [ delta_ioi for date, delta_ioi in zip(qdate, delta_ioi) if date in common_date ] y_ioi = [y for x, y in zip(ydate, yIoI) if x in common_date] # calculate daily %change in IoI and adjust weekly values adj_IoI = [ioi * mult for ioi, mult in zip(y_ioi, delta_ioi)] adj_all_data = [[str(date.date()), round(ioi, 2)] for date, ioi in zip(common_date, adj_IoI)] else: adj_all_data = [[ str(date.date()), int(zero) ] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data, []))))] # from IPython import embed; embed() heading = ["Date", keywords[0].title] querycounts = list(zip((d.date() for d in start_range), missing_queries)) keywords[0].querycounts = querycounts if not ggplot: return [heading] + adj_all_data ## GGplot Only else: # GGPLOT MERGED GTRENDS PLOTS: import pandas as pd from ggplot import ggplot, geom_line, ggtitle, ggsave, scale_colour_manual, ylab, xlab, aes try: ydat = pd.DataFrame(list(zip(common_date, y_ioi)), columns=["Date", 'Weekly series']) mdat = pd.DataFrame(list(zip(common_date, adj_IoI)), columns=['Date', 'Merged series']) qdat = pd.DataFrame(list(zip(common_date, qdat_interp)), columns=['Date', 'Daily series']) ddat = ydat.merge(mdat, on='Date').merge(qdat, on='Date') ddat['Date'] = list(map(pd.to_datetime, ddat['Date'])) ydat['Date'] = list(map(pd.to_datetime, 
ydat['Date'])) mdat['Date'] = list(map(pd.to_datetime, mdat['Date'])) qdat['Date'] = list(map(pd.to_datetime, qdat['Date'])) except UnboundLocalError as e: raise (UnboundLocalError("No Interest-over-time to plot")) # meltkeys = ['Date','Weekly series','Merged series','Daily series'] # melt = pd.melt(ddat[meltkeys], id_vars='Date') colors = [ '#77bde0', # blue '#b47bc6', # purple '#d55f5f' # red ] entity_type = keywords[0].desc g = ggplot(aes(x='Date', y='Daily series' ), data=ddat) + \ geom_line(aes(x='Date', y='Daily series'), data=qdat, alpha=0.5, color=colors[0]) + \ geom_line(aes(x='Date', y='Merged series'), data=mdat, alpha=0.9, color=colors[1]) + \ geom_line(aes(x='Date', y='Weekly series'), data=ydat, alpha=0.5, color=colors[2], size=1.5) + \ ggtitle("Interest over time for '{}' ({})".format(keywords[0].keyword, entity_type)) + \ ylab("Interest Over Time") + xlab("Date") # from IPython import embed; embed() print(g) # ggsave(BASEDIR + "/iot_{}.png".format(keywords[0].keyword), width=15, height=5) return [heading] + adj_all_data
site_list)] #filter to only ULEZ sites if applicable df_along = df_a.melt(id_vars=['site_str', 'n_passes'], value_vars=['p05', 'p25', 'p50', 'p75', 'p95'], var_name='yparam', value_name='value') print(c['name']) #print(df_a) #plots #split percentiles into different charts, all sites #plt1 = gg.ggplot(df_along, gg.aes(x='n_passes',y='value',color='site_str'))+gg.geom_point()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.theme_bw()+gg.xlim(0,100)+gg.facet_wrap('yparam',scales='free_y') #plt1.save(filename = r'..\charts\bias_{0}.png'.format(c['name']), width=None, height=None, dpi=200) #n_segments plt2 = gg.ggplot( df_a, gg.aes(x='n_passes', y='n_segments', color='site_str') ) + gg.geom_line() + gg.xlab('n, number drive periods') + gg.ylab( 'Sample size (number of drive patterns)') + gg.theme_bw() + gg.xlim( 0, 35) + gg.ylim(0, 2000) plt2.save(filename=r'..\charts\n_segments_{0}_{1}.png'.format( c['name'], dtstamp), width=None, height=None, dpi=200) #combine percentiles, split sites plt3 = gg.ggplot( df_along, gg.aes(x='n_passes', y='value', color='yparam') ) + gg.geom_line() + gg.xlab('n, number of drive periods') + gg.ylab( 'Sample error (%)') + gg.theme_bw() + gg.xlim(0, 35) + gg.ylim( -100, 100) + gg.geom_hline( y=25, linetype="dashed", color="gray") + gg.geom_hline( y=-25, linetype="dashed", color="gray") + gg.geom_vline(
#!/usr/bin/python
# -*- coding: utf-8 -*-
### Source: http://nbviewer.ipython.org/gist/wrobstory/1eb8cb704a52d18b9ee8/Up%20and%20Down%20PyData%202014.ipynb

# Import modules
import ggplot as gg
from ggplot import ggplot
import numpy as np
import pandas as pd

df = pd.read_csv('/Users/zhangbo/github/pydatasv2014/USGS_WindTurbine_201307_cleaned.csv')
min_heights = df[df['Rotor Diameter'] > 10]

(ggplot(gg.aes(x='Turbine MW', y='Rotor Swept Area'), data=min_heights[:500]) +
 gg.geom_point(color='#75b5aa', size=75) +
 gg.ggtitle("Rotor Swept Area vs. Power") +
 gg.xlab("Power (MW)") +
 gg.ylab("Rotor Swept Area (m^2)"))
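# The expression above only builds the plot object. A minimal sketch of explicitly
# rendering or saving it, assuming the plot is bound to a name first; the output
# filename below is just an example.
p = ggplot(gg.aes(x='Turbine MW', y='Rotor Swept Area'), data=min_heights[:500]) + \
    gg.geom_point(color='#75b5aa', size=75) + \
    gg.ggtitle("Rotor Swept Area vs. Power")
print(p)                                   # render to screen
p.save('rotor_swept_area_vs_power.png')    # or write to disk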
"not raining") turnstile_rain.groupby("rain2").describe() turnstile_rain = turnstile_weather[[ "rain", "ENTRIESn_hourly", "EXITSn_hourly" ]] turnstile_rain["ENTRIESn_hourly_log10"] = np.log10( turnstile_rain["ENTRIESn_hourly"] + 1) turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining") set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \ gg.geom_density() + \ gg.facet_wrap("rain2", scales="fixed") + \ gg.scale_colour_manual(values=set1) + \ gg.xlab("log10(entries per hour)") + \ gg.ylab("Number of turnstiles") + \ gg.ggtitle("Entries per hour whilst raining and not raining") plot np.random.seed(42) data = pd.Series(np.random.normal(loc=180, scale=40, size=600)) data.hist() p = turnstile_weather["ENTRIESn_hourly"].hist() pylab.suptitle("Entries per hour across all stations") pylab.xlabel("Entries per hour") pylab.ylabel("Number of occurrences") turnstile_weather["grp"] = turnstile_weather["rain"] + turnstile_weather["fog"] plot = ggplot(aes(y='ENTRIESn_hourly', x='Hour'),
count_vect = CountVectorizer() kk = count_vect.fit_transform(subjects_train) analyze = count_vect.build_analyzer() subjects_words_count = subjects_train.apply(lambda x: len(analyze(x))) print(subjects_words_count.describe()) #%% import ggplot as gg df = pd.DataFrame(subjects_words_count, columns = ["count"]) hist = gg.ggplot(df, gg.aes(x = "count")) hist += gg.xlab("# of words") +\ gg.ylab("Frequency") +\ gg.ggtitle("Frequency of words") hist += gg.geom_vline(x = df.mean(), color="red") hist += gg.geom_vline(x = df.median(), color="blue") hist += gg.geom_density(color="green") hist += gg.geom_histogram(binwidth=1, color="grey") hist #%% # 1st attemtp to classify subjects per tag X_raw_train = subjects_train
import sys from pandas.plotting import register_matplotlib_converters register_matplotlib_converters() species = 'no2' df = pd.read_csv(r'.\charts\background_data_melted.csv', index_col='idx', dtype={ 'timestamp': 'str', 'vidperiod': 'str', 'type': 'str', 'param': 'str', 'value': 'float64' }) print(df[:10]) df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S") #plots plt1 = gg.ggplot(df, gg.aes( x='timestamp', y='value', color='type')) + gg.geom_line() + gg.xlab( 'Time') + gg.ylab('Concentration') + gg.theme_bw() + gg.ylim( 0, 100) + gg.facet_wrap('vidperiod', scales='free') + gg.ggtitle( 'Regional background comparison {0}'.format(species)) #+gg.theme(axis_text_x=gg.element_text(angle=20)) plt1.save(filename=r'.\charts\background_{0}_ggtest_{1}.png'.format( species, dt.datetime.today().strftime('%Y%b%d')), width=None, height=None, dpi=300)
# -*- coding: utf-8 -*-
import pandas as pd
from ggplot import ggplot, aes, geom_point, geom_line, ggtitle, xlab, ylab

# Load a frame with 'yearID' and 'HR' columns; hr_year.csv, as used elsewhere in this
# collection, is assumed here (the original left `data` as an empty placeholder list).
data = pd.read_csv('hr_year.csv')

print(ggplot(data, aes(x='yearID', y='HR')) +
      geom_point(color='red') +
      geom_line(color='red') +
      ggtitle('Number of HR by year') +
      xlab('Year') +
      ylab('Number of HR'))
m=1, alpha=alpha)) #save results to results folder, with plot and printing to screen. metadata = datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode) f = open("results/LSH_vs_KDT_%s.pkl" % metadata, mode='w') pkl.dump(obj=results, file=f) logtimes = [math.log(r.avg_time, 2) for r in results] distances = [r.avg_distance for r in results] methods = [r.method[0:3] for r in results] alpha = [r.alpha for r in results] m = [r.m for r in results] results_df = pd.DataFrame( data={ "logtimes": logtimes, "distances": distances, "methods": methods, "m": m, "alpha": alpha }) print results_df p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes", y = "distances", label = "methods")) + \ gg.geom_text() + \ gg.ggtitle("LSH and KD trees: tradeoffs") + \ gg.xlab("Log2 average query time ") + gg.ylab("Average L2 distance from query point)") gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot=p)
pd.value_counts(vehicles.fuelType1)
pd.value_counts(vehicles.trany)
vehicles["trany2"] = vehicles.trany.str[0]
pd.value_counts(vehicles.trany2)

#%% step 1 ~ 4 on Page 202
from ggplot import ggplot, aes, geom_point, xlab, ylab, ggtitle

grouped = vehicles.groupby("year")
averaged = grouped['comb08', 'highway08', 'city08'].agg([np.mean])
averaged.columns = ['comb08_mean', 'highway08_mean', 'city08_mean']
averaged['year'] = averaged.index
print(ggplot(averaged, aes('year', 'comb08_mean')) +
      geom_point(color='steelblue') +
      xlab('Year') + ylab('Average MPG') + ggtitle('All cars'))

#%% step 5
criteria1 = vehicles.fuelType1.isin(['Regular Gasoline',
                                     'Premium Gasoline',
                                     'Midgrade Gasoline'])
criteria2 = vehicles.fuelType2.isnull()
criteria3 = vehicles.atvType != 'Hybrid'
vehicles_non_hybrid = vehicles[criteria1 & criteria2 & criteria3]
len(vehicles_non_hybrid)

#%% step 6
grouped = vehicles_non_hybrid.groupby(['year'])
averaged = grouped['comb08'].agg([np.mean])
print(averaged)
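#%% a possible next step (not one of the book's numbered steps): plot the non-hybrid
# averages with the same ggplot pattern used above, reusing the column-renaming trick
# from the earlier cell. This is a sketch, not part of the original exercise.
averaged.columns = ['comb08_mean']
averaged['year'] = averaged.index
print(ggplot(averaged, aes('year', 'comb08_mean')) +
      geom_point(color='steelblue') +
      xlab('Year') + ylab('Average MPG') + ggtitle('Non-hybrid cars'))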
if (reward == 1): wins_for_player_1[i] += 1.0 elif (reward == 0.5): draw_for_players[i] += 1.0 print(i, wins_for_player_1[i], draw_for_players[i]) data.append({ 'Type': 0, 'Wins': wins_for_player_1[i], 'Training': training_steps * (i - 1) }) data.append({ 'Type': 1, 'Wins': draw_for_players[i], 'Training': training_steps * (i - 1) }) learnitMC(training_steps, epsilon, alpha, n) # learnit(training_steps, epsilon, alpha) # the original learning code. # Pandas gives you the power of R learningdf = pd.DataFrame(data) # I use ggplot when I generate figures in R and would like to use it with Python, HOWEVER: # latest Pandas causes problems for ggplot so I needed these two patches: # https://stackoverflow.com/questions/50591982/importerror-cannot-import-name-timestamp/52378663 # https://github.com/yhat/ggpy/issues/612 p = gg.ggplot(gg.aes(x='Training', y='Wins', group='Type'), data=learningdf)+ gg.xlab('Learning games') + \ gg.ylab('Wins for player 1') + gg.ggtitle("n="+str(n)) + gg.geom_point() + gg.stat_smooth(method='loess') p.make() filename = "experiment_" + str(n) + ".pdf" p.save(filename)
def firms_dynamics_plot(decision): data = pd.read_csv(os.path.join( parameters.OUTPUT_PATH, "temp_general_firms_pop_%s_decision_%s_time_%s.txt" % (parameters.pop_redutor, decision, parameters.final_Time)), sep=",", header=None, decimal=",").astype(float) # renaming the collunms names data.columns = [ 'time', 'total_firms', 'average_output', 'average_age', 'average_size', 'new_firms', 'exit_firms', 'max_size', 'total_effort', 'average_effort' ] #logical test to control the process of burn the initial if parameters.time_to_cut_plots > 0: data = data.loc[( data['time']).astype(int) >= parameters.time_to_cut_plots, :] # variable to add in the plot title title_pop_val = float(parameters.pop_redutor) * 100 # create a list of a years to plot list_of_years_division = list( range(int(data['time'].min()), int(data['time'].max()), 12)) + [data['time'].max() + 1] list_of_years = [int(i / 12) for i in list_of_years_division] # graph paramter variables dpi_var_plot = 700 width_var_plot = 15 height_var_plot = 10 ############################################################################################################### # plotting AGENTS UTILITY # Total firms plot_data = gg.ggplot(data, gg.aes('time', 'total_firms')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \ gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years) +\ gg.ggtitle('Total firms') + gg.xlab('Years') + gg.ylab('Total of Firms')+ gg.theme_bw() # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_general_total_firms_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join(parameters.OUTPUT_PATH, ('temp_general_total_firms_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot gg.ggsave(plot_data, os.path.join(parameters.OUTPUT_PATH, ('temp_general_total_firms_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), width=width_var_plot, height=height_var_plot, units="in") # Average of output plot_data = gg.ggplot(data, gg.aes('time', 'average_output')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \ gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\ +gg.ggtitle('Average of output') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw() # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_general_average_output_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join(parameters.OUTPUT_PATH, ('temp_general_average_output_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot gg.ggsave(plot_data, os.path.join(parameters.OUTPUT_PATH, ('temp_general_average_output_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), width=width_var_plot, height=height_var_plot, units="in") # Average of age plot_data = gg.ggplot(data, gg.aes('time', 'average_age')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \ gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\ +gg.ggtitle('Average of age of firms') + gg.xlab('Years') + gg.ylab('Age of Firms')+ gg.theme_bw() # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_general_average_age_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( 
os.path.join(parameters.OUTPUT_PATH, ('temp_general_average_age_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot gg.ggsave(plot_data, os.path.join(parameters.OUTPUT_PATH, ('temp_general_average_age_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), width=width_var_plot, height=height_var_plot, units="in") # Average of size plot_data = gg.ggplot(data, gg.aes('time', 'average_size')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \ gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\ +gg.ggtitle('Average of size of firms') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw() # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_general_average_size_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join(parameters.OUTPUT_PATH, ('temp_general_average_size_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot gg.ggsave(plot_data, os.path.join(parameters.OUTPUT_PATH, ('temp_general_average_size_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), width=width_var_plot, height=height_var_plot, units="in") # number of new firms plot_data = gg.ggplot(data, gg.aes('time', 'new_firms')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \ gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\ +gg.ggtitle('Number of new firms') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw() # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_general_number_of_new_firms_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join(parameters.OUTPUT_PATH, ('temp_general_number_of_new_firms_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot gg.ggsave(plot_data, os.path.join(parameters.OUTPUT_PATH, ('temp_general_number_of_new_firms_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), width=width_var_plot, height=height_var_plot, units="in") # Number of firms out plot_data = gg.ggplot(data, gg.aes('time', 'exit_firms')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \ gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\ +gg.ggtitle('Number of firms out') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw() # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_general_number_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join(parameters.OUTPUT_PATH, ('temp_general_number_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot gg.ggsave(plot_data, os.path.join(parameters.OUTPUT_PATH, ('temp_general_number_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), width=width_var_plot, height=height_var_plot, units="in") # Average and max size of firms dat_merged = pd.concat([ data.iloc[:, data.columns == 'average_effort'], data.iloc[:, data.columns == 'total_effort'] ], axis=1) plot_data = dat_merged.plot( title='Average and maximum effort of employees') plot_data.set_xlabel('Years') plot_data.set_ylabel('Values units of effort') plot_data.legend(loc='center left', bbox_to_anchor=(1, 0.5)) 
plot_data.set_xticks(list_of_years_division) plot_data.set_xticklabels(list_of_years) plot_data.set_axis_bgcolor('w') fig = plot_data.get_figure() fig.set_size_inches(width_var_plot, height_var_plot) # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_average_and_maximum_effort_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join( parameters.OUTPUT_PATH, ('temp_average_and_maximum_effort_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot fig.savefig(os.path.join( parameters.OUTPUT_PATH, ('temp_average_and_maximum_effort_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), dpi=dpi_var_plot) dat_merged = pd.concat([ data.iloc[:, data.columns == 'average_size'], data.iloc[:, data.columns == 'max_size'] ], axis=1) plot_data = dat_merged.plot(title='Average and maximum size firms') plot_data.set_xlabel('Years') plot_data.set_ylabel('Number of employees') plot_data.legend(loc='center left', bbox_to_anchor=(1, 0.5)) plot_data.set_xticks(list_of_years_division) plot_data.set_xticklabels(list_of_years) plot_data.set_axis_bgcolor('w') fig = plot_data.get_figure() fig.set_size_inches(width_var_plot, height_var_plot) # logical test for presence of plot, if is TRUE is deleted before save the new one if os.path.isfile( os.path.join( parameters.OUTPUT_PATH, ('temp_average_size_and_maximum_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) is True: os.remove( os.path.join( parameters.OUTPUT_PATH, ('temp_average_size_and_maximum_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time)))) # saving the plot fig.savefig(os.path.join( parameters.OUTPUT_PATH, ('temp_average_size_and_maximum_of_firms_out_%s_%s_%s.png' % (decision, title_pop_val, parameters.final_Time))), dpi=dpi_var_plot)