def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    """Build a faceted ggplot of observed volume, overall trend and seasonality.

    Args:
        dates: Values written into the ``Date`` column of the plotted frame.
        ndf_domain: Two-column data turned into a ``Date``/``Volume`` frame.
            NOTE(review): presumably the observed, trend and season rows are
            stacked in that order so they line up with the labels -- confirm
            against the caller.
        x: Observed series; only its length is used here.
        x_trend: Trend series; only its length is used here.
        season: Seasonal series; only its length is used here.
        my_domain: Name interpolated into the plot title.

    Returns:
        The assembled ggplot object (not drawn or saved here).
    """
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    # One legend/facet label per row. List repetition replaces the old
    # "[... for i in xrange(len(...))]" comprehensions, whose loop index was
    # unused and whose xrange call only exists on Python 2.
    labels = (
        ['Observed Volume'] * len(x)
        + ['Overall Trend'] * len(x_trend)
        + ['Repeat Sending Trend'] * len(season)
    )
    col3 = pd.DataFrame(labels)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    p = (
        ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot)
        + ggplot.geom_line(color='blue', size=2)
        + ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week")
        + ggplot.xlab("Week (Marked on Mondays)")
        + ggplot.ylab("Message Vol")
        + ggplot.ggtitle("%s Message Volume by Week" % my_domain)
        # One panel per label, each with its own y scale.
        + ggplot.facet_grid('Data', scales='free_y')
        + ggplot.theme_seaborn()
    )
    return p
def plot_matches(df_in, date, filename_out, x_var='date_time', y_var="shorthand_search_vol"):
    """Draw one line per ``match_id`` and save the chart to ``filename_out``.

    ``df_in`` needs ``time`` and ``match_id`` columns plus the column named by
    ``y_var``. A ``date_time`` column is written into ``df_in`` itself, built
    from ``date`` and each row's ``time``; values that cannot be parsed are
    coerced rather than raising.

    Args:
        df_in: Frame to plot (modified in place, see above).
        date: Date string prefixed to every ``time`` value.
        filename_out: Destination passed to the plot's ``save``.
        x_var: Column mapped to the x axis.
        y_var: Column mapped to the y axis.
    """
    # Combine the calendar date with each row's clock time, then parse.
    stamp_text = date + " " + df_in['time'].astype(str)
    df_in['date_time'] = pd.to_datetime(stamp_text, errors="coerce", infer_datetime_format=True)

    chart = ggplot(aes(x=x_var, y=y_var, group="match_id", color="match_id"), data=df_in)
    content_layers = (
        geom_line(size=2),
        labs(x="time (gmt)", y="search volume (scaled to 100)"),
        scale_x_date(labels=date_format("%H:%M:%S"), date_breaks="30 minutes"),
    )
    for layer in content_layers:
        chart += layer

    # Gray theme with a small monospace font.
    gray = theme_gray()
    gray._rcParams['font.size'] = 8
    gray._rcParams['font.family'] = 'monospace'
    chart += gray

    chart.save(filename_out, width=16, height=8)
def main(file_path):
    """Load a pickled conversation and print a per-participant histogram.

    Logs an error and returns early when ``file_path`` does not exist or does
    not end in ``.pkl``. Otherwise the pickle is expected to hold a mapping
    with ``conversation_name`` and ``messages`` entries; the messages become a
    three-column frame (``Timestamp``, ``Type``, ``Participant``).

    Args:
        file_path: Path of the pickle file to read.
    """
    # Guard clauses: the path must exist and must be a .pkl file.
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    # NOTE: unpickling can execute arbitrary code; only feed this trusted files.
    with open(file_path, 'rb') as f:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(f)

    title = conversation['conversation_name']
    LOG_INFO('Found conversation: {}'.format(title))

    df = pd.DataFrame(conversation['messages'])
    df.columns = ['Timestamp', 'Type', 'Participant']

    def to_ordinal(raw_stamp):
        # Epoch seconds -> local datetime -> proleptic Gregorian day number.
        return datetime.datetime.fromtimestamp(float(raw_stamp)).toordinal()

    df['Datetime'] = df['Timestamp'].apply(to_ordinal)

    histogram = ggplot.ggplot(df, ggplot.aes(x='Datetime', fill='Participant'))
    for layer in (
        ggplot.geom_histogram(alpha=0.6, binwidth=2),
        ggplot.scale_x_date(labels='%b %Y'),
        ggplot.ggtitle(title),
        ggplot.ylab('Number of messages'),
        ggplot.xlab('Date'),
    ):
        histogram = histogram + layer

    print(histogram)
def plot_update_frequency(result):
    """Plot the weekly number of changes from an aggregation result.

    Args:
        result: Iterable of mappings; each item supplies a timestamp at
            ``item['_id']['timestamp']`` and a number at ``item['count']``.
            It is iterated exactly once.

    Returns:
        A ggplot object showing weekly change totals as points with a
        smoothed line.
    """
    import pandas as pd
    import ggplot

    # Turn the query results into a time series of changes.
    stamps = []
    counts = []
    for res in result:
        # Timestamp.to_datetime() was removed from pandas; to_pydatetime()
        # is the documented replacement.
        stamps.append(pd.Timestamp(res['_id']['timestamp']).to_pydatetime())
        counts.append(res['count'])

    ts = pd.DataFrame(counts, index=stamps, columns=['changes'])
    # resample(..., how='sum') was removed from pandas; aggregate on the
    # resampler instead. 'W' buckets the counts by week.
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    # Plot the time series of changes.
    p = (
        ggplot.ggplot(ts, ggplot.aes(x=ts.index, y=ts['changes']))
        + ggplot.geom_point(color='blue')
        + ggplot.xlab('Period')
        + ggplot.ylab('Changes')
        + ggplot.geom_smooth()
        + ggplot.ylim(low=0)
        + ggplot.scale_x_date(breaks=ggplot.date_breaks("12 months"),
                              labels=ggplot.date_format('%Y-%m'))
        + ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    )
    return p
def render(data, bin_width, plot_density=False):
    """Print a chart of messages over time, split by conversation.

    Args:
        data: Frame with ``datetime`` and ``conversationWithName`` columns.
        bin_width: Histogram bin width; ignored when ``plot_density`` is set.
        plot_density: Draw density curves coloured by conversation instead of
            a histogram filled by conversation.
    """
    if plot_density:
        plot = ggplot.ggplot(data, ggplot.aes(x='datetime', color='conversationWithName'))
        layers = (
            ggplot.geom_density(),
            ggplot.scale_x_date(labels='%b %Y'),
            ggplot.ggtitle('Conversation Densities'),
            ggplot.ylab('Density'),
            ggplot.xlab('Date'),
        )
    else:
        plot = ggplot.ggplot(data, ggplot.aes(x='datetime', fill='conversationWithName'))
        layers = (
            ggplot.geom_histogram(alpha=0.6, binwidth=bin_width),
            ggplot.scale_x_date(labels='%b %Y', breaks='6 months'),
            ggplot.ggtitle('Message Breakdown'),
            ggplot.ylab('Number of Messages'),
            ggplot.xlab('Date'),
        )

    # Stack the layers onto the base plot in declaration order.
    for layer in layers:
        plot = plot + layer

    print(plot)
def data_output(data, chart_title):
    """Ask the user how to display ``data`` and show it accordingly.

    The first non-blank character of the answer decides, case-insensitively:
    ``t`` prints ``data`` as a table, ``l`` or ``g`` prints a line graph.
    Any other answer, including an empty one, shows nothing.

    Args:
        data: Data to show; the graph maps its ``Month, Year`` column to x
            and its ``Value`` column to y.
        chart_title: Title placed on the graph.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Good News! You're data has been returned. I'm happy to show it to you.")
    print("Just tell me how you want it - Table or Line Graph?")
    answer = read_line("Choose table or line > ")

    # Slicing instead of indexing avoids an IndexError on an empty answer, and
    # lower-casing once makes "L"/"Line" work like "T"/"Table" and "G"/"Graph".
    choice = answer.strip()[:1].lower()

    if choice == "t":
        print("Ok, here's your data.")
        print(data)
    elif choice in ("l", "g"):
        import ggplot as gg
        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value") + \
            gg.scale_x_date(breaks=gg.date_breaks('1 month'),
                            labels=gg.date_format("%B"))
        # The date scale used to be built as a stand-alone statement and was
        # never added to the plot; it is now part of the layer chain.
        print(plot + gg.theme_xkcd())
def plot_predictions(date_times, actual_values, predictions, match_id, feature_set_in, filename):
    """Plot actual against predicted search volume and save the chart.

    Two series share the same timestamps: the actual values, labelled
    ``"actual" + match_id``, and the predictions, labelled
    ``"predictedby_" + str(feature_set_in) + match_id``.

    Args:
        date_times: Timestamps for both series; unparseable values are coerced.
        actual_values: Observed values, assigned as given.
        predictions: Predicted values; materialised with ``list()`` first.
        match_id: String suffix for both series labels.
        feature_set_in: Feature-set identifier embedded in the predicted label.
        filename: Destination passed to the plot's ``save``.
    """
    parsed_times = pd.to_datetime(date_times, errors="coerce", infer_datetime_format=True)

    def labelled_frame(values, label):
        # Columns are added one at a time, in this order, starting from an
        # empty frame.
        frame = pd.DataFrame()
        frame['date_time'] = parsed_times
        frame['search_vol'] = values
        frame['match_id'] = label
        return frame

    actual_df = labelled_frame(actual_values, "actual" + match_id)
    predict_df = labelled_frame(list(predictions), "predictedby_" + str(feature_set_in) + match_id)
    plotting_df = pd.concat([actual_df, predict_df], axis=0, ignore_index=True)

    chart = ggplot(aes(x='date_time', y='search_vol', group="match_id", color="match_id"), data=plotting_df)
    content_layers = (
        geom_line(size=2),
        labs(x="time (gmt)", y="search volume (scaled to 100)"),
        scale_x_date(labels=date_format("%H:%M:%S"), date_breaks="30 minutes"),
    )
    for layer in content_layers:
        chart += layer

    # Gray theme with a small monospace font.
    gray = theme_gray()
    gray._rcParams['font.size'] = 8
    gray._rcParams['font.family'] = 'monospace'
    chart += gray

    chart.save(filename, width=16, height=8)
# Show the first rows of the loaded review data.
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 前几行")
print(df.head())

# Single-review sentiment experiment, kept for reference. The index given to
# iloc selects which review is analysed, counting from 0.
# text = df.comments.iloc[0]
# s = SnowNLP(text)
# print(s.sentiments)


def get_sentiment_cn(text):
    """Return the ``sentiments`` value SnowNLP computes for ``text``."""
    return SnowNLP(text).sentiments


# Score every review and keep the result next to the text.
df["sentiment"] = df["comments"].apply(get_sentiment_cn)

print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值")
print(df)

# Summary statistics over all reviews.
print("#######################################")
print("重要信息")
print("所有影评的平均值为:", df["sentiment"].mean())
print("所有影评的中位数为:", df["sentiment"].median())

# NOTE(review): the two expressions below are neither assigned nor printed;
# presumably they rely on notebook-style display of expression values -- confirm.
(
    ggplot.ggplot(ggplot.aes(x="date", y="sentiment"), data=df)
    + ggplot.geom_point()
    + ggplot.geom_line(color='blue')
    + ggplot.scale_x_date(labels=ggplot.date_format("%Y-%m-%d"))
)

# The five reviews with the lowest sentiment score.
df.sort_values(['sentiment']).head(5)
def render_png(self, buffer):
    """
    Render timeseries plots as PNG images.

    The plotting backend is selected by ``bucket.tdata['renderer']``:
    ``matplotlib`` (default), ``ggplot`` or ``seaborn``. The figure is
    written to ``buffer``.

    Parameters:
        buffer: Target handed to ``savefig()`` / ``plot.save()``.

    Returns:
        ``None`` after a successful render. For an unknown matplotlib style,
        ggplot theme or renderer the error is logged and the result of
        ``self.request.error_response(bucket, message)`` is returned.
    """
    bucket = self.bucket

    # NOTE(review): the return value is discarded; presumably this only makes
    # matplotlib scan the system fonts up front -- confirm.
    import matplotlib.font_manager
    matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')

    import matplotlib
    try:
        matplotlib.use('agg')
    except Exception:
        # Best effort: keep whatever backend is active. Unlike the previous
        # bare "except:", this lets KeyboardInterrupt/SystemExit propagate.
        pass
    import matplotlib.pyplot as plt

    df = self.dataframe

    # Compute datetime range boundaries
    datetime_min = min(df.time)
    datetime_max = max(df.time)
    datetime_delta = datetime_max - datetime_min

    renderer = bucket.tdata.get('renderer', 'matplotlib')

    if renderer == 'matplotlib':

        # Bring DataFrame into appropriate format
        df = dataframe_index_and_sort(df, 'time')

        # Propagate non-null values forward, otherwise matplotlib would not
        # plot the sparse data frame properly. With time series data, using
        # pad/ffill is extremely common so that the "last known value" is
        # available at every time point.
        # http://pandas.pydata.org/pandas-docs/stable/missing_data.html#filling-missing-values-fillna
        df.fillna(method='pad', inplace=True)

        # Make plots of DataFrame using matplotlib / pylab.
        # http://matplotlib.org/
        # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html
        # https://markthegraph.blogspot.de/2015/05/plotting-time-series-dataframes-in.html
        if 'style' in bucket.tdata and bucket.tdata.style:
            try:
                plt.style.use(bucket.tdata.style)
            except Exception:
                error_message = u'# Unknown style "{style_name}", available styles: {available}'.format(
                    style_name=bucket.tdata.style, available=plt.style.available)
                log.error(error_message)
                return self.request.error_response(bucket, error_message)

        ax = df.plot()
        fig = ax.get_figure()

        # Figure heading
        title = fig.suptitle(bucket.title.human, fontsize=12)

        # Axis and tick labels
        ax.set_xlabel('Time')
        ax.set_ylabel('Value')
        ax.tick_params(axis='x', labelsize='smaller')

        # Grid and legend; the legend is anchored outside the axes.
        # http://matplotlib.org/users/legend_guide.html
        ax.grid(True)
        legend_params = dict(ncol=1, loc='center left', bbox_to_anchor=(1, 0.5),
                             fontsize='small', shadow=True, fancybox=True)
        legend = ax.legend(**legend_params)

        # Compute appropriate locator and formatter. Only the formatter is
        # applied here; the locator is intentionally left unused.
        locator, formatter = matplotlib_locator_formatter(datetime_delta, span=1)
        ax.xaxis.set_major_formatter(formatter)

        # Figure formatting
        fig.autofmt_xdate()

        # Pass title and legend as extra artists so the tight bounding box
        # does not cut off the legend placed outside the axes.
        # http://stackoverflow.com/questions/10101700/moving-matplotlib-legend-outside-of-the-axis-makes-it-cutoff-by-the-figure-box/10154763#10154763
        fig.savefig(buffer, bbox_extra_artists=(title, legend), bbox_inches='tight')

        # TODO: Add annotations
        # https://stackoverflow.com/questions/11067368/annotate-time-series-plot-in-matplotlib
        # https://stackoverflow.com/questions/17891493/annotating-points-from-a-pandas-dataframe-in-matplotlib-plot

    elif renderer == 'ggplot':

        # https://yhat.github.io/ggplot/notebook.html?page=build/docs/examples/Multiple%20Line%20Plot.html
        # https://stackoverflow.com/questions/24478925/is-it-possible-to-plot-multiline-chart-on-python-ggplot/24479513#24479513
        # https://stackoverflow.com/questions/23541497/is-there-a-way-to-plot-a-pandas-series-in-ggplot
        from ggplot import ggplot, aes, geom_line, ggtitle, scale_x_date
        from ggplot import theme_538, theme_bw, theme_gray, theme_xkcd

        # Convert DataFrame from wide to long format, retaining "time" as visible column
        df = dataframe_wide_to_long_indexed(df, 'time')
        dataframe_index_to_column(df, 'time')

        # Compute appropriate locator and formatter
        locator, formatter = matplotlib_locator_formatter(datetime_delta, span=2)

        plot = ggplot(df, aes(x='time', y='value', color='variable'))\
            + geom_line()\
            + scale_x_date(limits=(datetime_min, datetime_max), breaks=locator, labels=formatter)\
            + ggtitle(bucket.title.human)

        # Axis labels
        plot.xlab = 'Time'
        plot.ylab = 'Value'

        theme_name = bucket.tdata.get('theme')

        # TODO: Switching themes will leak some matplotlib/pyplot properties
        #       (e.g. xkcd's 'path.sketch' / 'path.effects' rcParams);
        #       postpone resetting them to future versions.
        if theme_name:
            # A theme name made only of digits, such as 538, arrives as a float.
            if isinstance(theme_name, float):
                theme_name = str(int(theme_name))

            # Security: the theme name is request-supplied, so it is resolved
            # through a fixed whitelist instead of eval('theme_' + name),
            # which would execute arbitrary expressions.
            available_themes = {
                '538': theme_538,
                'bw': theme_bw,
                'gray': theme_gray,
                'xkcd': theme_xkcd,
            }
            try:
                plot += available_themes[theme_name]()
            except Exception:
                error_message = u'# Unknown theme "{theme_name}"'.format(
                    theme_name=theme_name)
                log.error(error_message)
                return self.request.error_response(bucket, error_message)

        plot.save(buffer)

    elif renderer == 'seaborn':

        # TODO: We don't do statistical plotting yet.
        # https://stanford.edu/~mwaskom/software/seaborn/examples/timeseries_from_dataframe.html
        # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.tsplot.html
        import seaborn as sns
        sns.set(style="darkgrid")

        pprint(df)
        sns.tsplot(data=df, time="time")
        plt.savefig(buffer)

    else:
        error_message = u'# Unknown renderer "{renderer_name}"'.format(
            renderer_name=renderer)
        log.error(error_message)
        return self.request.error_response(bucket, error_message)
def plot_vol(dates, x, cp, my_domain):
    """Plot message volume with change points and per-segment mean lines.

    Args:
        dates: Indexable sequence of date objects (``toordinal()`` is called
            on its items), aligned with ``x``.
        x: Volume values exposing ``shape``; passed to ``np.min``/``np.max``.
            NOTE(review): presumably a 1-D numpy array -- confirm.
        cp: Sequence of change points. Item ``[0]`` of each entry is the
            1-based position of the change point in ``dates``; item ``[2]``
            is the mean value drawn for the segment ending there.
        my_domain: Name interpolated into the plot title.

    Returns:
        The assembled ggplot object (not drawn or saved here).
    """
    # -------------------- Prepare for Plotting -------------------------- #
    # One row per observation: [date, volume, 'Volume']; the constant third
    # column is the label shown in the legend.
    v = ['Volume'] * x.shape[0]
    ndf_vol = np.transpose(np.array([dates, x, v]))
    df_vol = pd.DataFrame(ndf_vol, columns=['Date', 'Volume', 'Data'])

    xmin_list = []       # left end of each mean segment (ordinal day)
    xmax_list = []       # right end of each mean segment (ordinal day)
    yint_list = []       # mean value of each segment
    cp_date_list = []    # date of each change point
    cp_value_list = []   # value plotted at each change point

    ref_idx = 0  # index at which the current segment starts
    for change in cp:
        cp_idx = change[0] - 1  # -1 because change[0] is 1-indexed
        # Ordinals are used for the segment ends to match ggplot's date axis.
        xmin_list.append(dates[ref_idx].toordinal())
        xmax_list.append(dates[cp_idx].toordinal())
        yint_list.append(change[2])
        cp_date_list.append(dates[cp_idx])
        cp_value_list.append(change[2])
        ref_idx = cp_idx + 1  # the next segment starts right after this one

    # One 'Change Point' row per change point, plus a dummy row at the first
    # observation so that 'Volume Mean' shows up in the legend. The dummy is
    # concatenated here instead of being appended to and popped from the
    # lists that the geoms below consume.
    cp_lbl = ['Change Point'] * len(yint_list) + ['Volume Mean']
    ndf_cp = np.transpose(np.array([
        cp_date_list + [dates[0]],
        yint_list + [x[0]],
        cp_lbl,
    ]))
    df_cp = pd.DataFrame(ndf_cp, columns=['Date', 'Volume', 'Data'])

    df_plot = pd.concat((df_vol, df_cp), axis=0)

    # Head-room around the data: a quarter margin below, a full margin above.
    margin = 0.10 * (np.max(x) - np.min(x))

    p = (
        ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot)
        + ggplot.geom_line(color='blue', size=2)
        + ggplot.geom_point(x=xmax_list, y=cp_value_list, color='black',
                            shape='D', size=50)
        + ggplot.geom_hline(xmin=xmin_list,
                            xmax=xmax_list,
                            yintercept=yint_list, color="red", size=3)
        + ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week")
        + ggplot.scale_colour_manual(values=["black", "blue", "red"])
        + ggplot.scale_y_continuous(labels='comma')
        + ggplot.ylim(low=np.min(x) - margin / 4.0, high=np.max(x) + margin)
        + ggplot.xlab("Week (Marked on Mondays)")
        + ggplot.ylab("Message Vol")
        + ggplot.ggtitle("%s\nMessage Volume by Week" % my_domain)
        + ggplot.theme_seaborn()
    )
    return p