def create_timeseries_celebrities(start_time):
    """
    :return: json file to get timeseries for celebrities
    """
    # hour_status, celeb_first and celeb_last are module-level globals
    data = pandas.DataFrame(hour_status)
    celebrity_names = [
        celeb_first[ix] + " " + celeb_last[ix]
        for ix in range(len(celeb_first))
    ]
    data.columns = celebrity_names

    # one timestamp per hour, starting from start_time
    timestamp_rows = []
    for i in range(len(hour_status)):
        time = start_time + i * 3600
        timestamp_rows.append(datetime.datetime.fromtimestamp(time))
    idx = pandas.DatetimeIndex(timestamp_rows)
    data = data.set_index(idx)

    match_data = dict(data)  # data converted into a dictionary
    all_matches = pandas.DataFrame(match_data)
    all_matches[all_matches < 0] = 0

    # plotting
    time_chart = vincent.Line(all_matches[470:])
    time_chart.axis_titles(x='Time in hours', y='Tweet Count')
    time_chart.legend(title='Celebrities')
    time_chart.to_json('../Graphs/Question 6/time_chart_celeb.json')
    return all_matches
def create_timeseries_topics(start_time):
    """
    :return: json file to get timeseries for the four topics
    """
    # normalize each count series to [0, 1]
    advertisements = np.array(ads_hour_count) / float(max(ads_hour_count))
    celebrities = np.array(celeb_hour_count) / float(max(celeb_hour_count))
    goals = np.array(goal_hour_count) / float(max(goal_hour_count))
    teams = np.array(team_hour_count) / float(max(team_hour_count))

    data = pandas.DataFrame({
        "Advertisements": advertisements,
        "Celebrities": celebrities,
        "Goal Chatter": goals,
        "Team Chatter": teams
    })
    data[data < 0] = 0

    # one timestamp per hour, starting from start_time
    timestamp_rows = []
    for i in range(len(goal_hour_count)):
        time = start_time + i * 3600
        timestamp_rows.append(datetime.datetime.fromtimestamp(time))
    idx = pandas.DatetimeIndex(timestamp_rows)
    data = data.set_index(idx)

    # data converted into a dictionary
    match_data = dict(data)
    # DataFrame for multiple series
    all_matches = pandas.DataFrame(match_data)

    # plotting
    time_chart = vincent.Line(all_matches[470:])
    time_chart.axis_titles(x='Time in hours', y='Tweet Count')
    time_chart.legend(title='Topic Modelling')
    time_chart.to_json('../Graphs/Question 6/time_chart_topics.json')
def api_vendas_dia():
    import pandas as pd
    resultado = mongo.db.notas_fiscais.aggregate([
        {"$group": {
            "_id": "$nfeProc.NFe.infNFe.ide.dEmi",
            "total": {"$sum": "$nfeProc.NFe.infNFe.total.ICMSTot.vNF"}
        }},
        {"$sort": {"_id": -1}},
        {"$limit": 20}
    ])
    # pymongo >= 3 returns a CommandCursor; older versions returned {'result': [...]}
    resultado = list(resultado)
    resultado = pd.DataFrame.from_records(resultado, index="_id")

    line = vincent.Line(resultado, width=540, height=380)
    line.axis_titles(x='', y='Valor vendas')
    line.y_axis_properties(label_align="right", title_offset=-40, title_size=14)
    return line.to_json()
def create_timeseries_ads(start_time):
    """
    :return: json file to get timeseries for advertisements
    """
    df = pandas.DataFrame(hour_status)
    df.columns = graph_ads
    data = (df - df.mean()) / (df.max() - df.min())  # normalize
    data[data < 0] = 0

    timestamp_rows = []
    for i in range(len(hour_status)):
        time = start_time + i * 3600
        timestamp_rows.append(datetime.datetime.fromtimestamp(time))
    idx = pandas.DatetimeIndex(timestamp_rows)
    data = data.set_index(idx)

    match_data = dict(data)  # all the data together
    all_matches = pandas.DataFrame(match_data)
    all_matches[all_matches < 0] = 0

    # plotting the time series
    time_chart = vincent.Line(all_matches[470:])
    time_chart.axis_titles(x='Time in hours', y='Tweet Count')
    time_chart.legend(title='Advertisement Names')
    time_chart.to_json('../Graphs/Question 6/time_chart_ads.json')
    return all_matches
def make_line_chart_popup(data_row: pd.Series, title: str) -> folium.Popup:
    '''Create a line chart popup from a temporal Series for departements.

    The index of the Series must follow the format
    {year}_median, {year}_decile_1, {year}_decile_9,
    {year+1}_median, {year+1}_decile_1, ...
    The returned popup can be added to map layers.'''
    # filter index names and build three columns from the one Series
    data = {
        'decile_1': data_row.filter(regex=".*decile_1$").values,
        'decile_9': data_row.filter(regex=".*decile_9$").values,
        'median': data_row.filter(like="median").values,
    }
    df_to_display = pd.DataFrame.from_dict(data)
    data_row = data_row.drop("color")
    # create the DataFrame index from the years in the initial data_row index
    df_to_display.index = pd.to_datetime(
        list(dict.fromkeys([int(annee_c[:4]) for annee_c in data_row.index.tolist()])),
        format="%Y")

    line_chart = vincent.Line(df_to_display, width=300, height=200)
    line_chart.axis_titles(x='Année', y='prix m2')
    line_chart.legend(title=title)

    popup = folium.Popup()
    folium.Vega(line_chart, width=400, height=250).add_to(popup)
    return popup
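# A minimal usage sketch for make_line_chart_popup (not from the original source):
# the Series must follow the "{year}_median" / "{year}_decile_1" / "{year}_decile_9"
# index pattern from the docstring, plus the "color" entry the function drops.
# All values and coordinates below are made-up placeholders.
row = pd.Series({
    "2019_median": 3000, "2019_decile_1": 2100, "2019_decile_9": 4800,
    "2020_median": 3200, "2020_decile_1": 2200, "2020_decile_9": 5000,
    "color": "#ff0000",
})
m = folium.Map(location=[48.86, 2.35], zoom_start=6)
folium.Marker([48.86, 2.35], popup=make_line_chart_popup(row, "Paris")).add_to(m)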
def data_multiline():
    period = request.args.get('period', 9)
    print(period)
    print(request.args)
    data = get_data(float(period))
    return vincent.Line(data, width=WIDTH, height=HEIGHT,
                        iter_idx='x').to_json()
def stocks():
    line = vincent.Line(data.price[['MSFT', 'AAPL']], width=WIDTH, height=HEIGHT)
    line.axis_titles(x='Date', y='Price')
    line.legend(title='MSFT vs AAPL')
    return line.to_json()
def main5(fn):
    with open(fn + '.json', 'r') as f:
        count_all = Counter()
        dates = []
        search_hashes = '#WWAT'
        for line in f:
            tweet = json.loads(line)
            tokens = preprocess(tweet['text'])
            terms = terms_single(terms_hash(tokens) + getHashtags(tweet))
            count_all.update(terms)
            time = getTime(tweet)
            if time is not None and search_hashes in terms:
                dates.append(time)
        print(count_all.most_common(10))

        # despite the name, this series is bucketed per day ('1D')
        per_minute = time_series(dates, '1D')
        per_minute.to_csv(fn + '.series.csv', sep='\t', encoding='utf-8')

        # and now the plotting
        time_chart = vincent.Line(per_minute)
        time_chart.axis_titles(x='Time', y='Freq')
        time_chart.to_json(fn + '.count.time_chart.json')
def test_to_json(self):
    '''Test json output.

    This tests that files are written with the correct names,
    not that the json was serialized correctly.'''
    line = vincent.Line()
    line.tabular_data([1, 2, 3, 4, 5])

    from unittest.mock import call, patch, MagicMock
    with patch('builtins.open', create=True) as mock_open:
        # Python 3 has no 'file' built-in to spec against
        mock_open.return_value = MagicMock()
        path = 'test.json'
        data_path = 'test_data.json'
        default_data_path = 'data.json'
        html_path = 'test.html'
        default_html_path = 'vega_template.html'

        # No data splitting / html
        kwargs_default_behavior = [
            {},
            {'split_data': False},
            {'html': False},
            {'data_path': data_path},
            {'html_path': html_path},
        ]
        for kwargs in kwargs_default_behavior:
            line.to_json(path, **kwargs)
            mock_open.assert_called_once_with(path, 'w')
            mock_open.reset_mock()

        line.to_json(path, split_data=True)
        mock_open.assert_has_calls(
            [call(path, 'w'), call(default_data_path, 'w')],
            any_order=True)
        mock_open.reset_mock()

        line.to_json(path, split_data=True, data_path=data_path)
        mock_open.assert_has_calls(
            [call(path, 'w'), call(data_path, 'w')],
            any_order=True)
        mock_open.reset_mock()

        # The HTML option reads a default file that needs a real return
        # value for the template substitution.
        mock_open.return_value.read.return_value = '$path'
        line.to_json(path, html=True)
        mock_open.assert_has_calls(
            [call(path, 'w'), call(default_html_path, 'w')],
            any_order=True)
        mock_open.reset_mock()

        line.to_json(path, html=True, html_path=html_path)
        mock_open.assert_has_calls(
            [call(path, 'w'), call(html_path, 'w')],
            any_order=True)
        mock_open.reset_mock()
def route_popup(schedule, route_id):
    route_df = make_route_df(schedule, route_id)
    vega = vincent.Line(vincent.Data.from_pandas(route_df))
    popup = Vega(vega.to_json(),
                 width=vega.width + 50,
                 height=vega.height + 50)
    return popup
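# Hypothetical usage of route_popup (schedule, make_route_df and the route id
# come from the surrounding project; the coordinates here are placeholders).
# The returned Vega object can be attached to a marker through a folium Popup:
m = folium.Map(location=[40.75, -73.99], zoom_start=12)
marker = folium.Marker([40.75, -73.99])
marker.add_child(folium.Popup().add_child(route_popup(schedule, route_id='1')))
marker.add_to(m)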
def plot_time_series(fname: str, export_fname: str, num_top_terms: int,
                     rule: str, term_to_analyze_fname: str,
                     export_fname_for_trending_terms: str,
                     export_fname_for_non_trending_terms: str):
    terms_to_analyze = read_all_important_terms(term_to_analyze_fname)
    terms_date, count_all = analyze(fname, terms_to_analyze=terms_to_analyze)

    idx_list = []
    keys = []
    match_data = {}
    for term_freq_tuple in count_all.most_common(num_top_terms):
        key = term_freq_tuple[0]
        if key != '':
            value = terms_date[key]
            keys.append(key)
            ones = [1] * len(value)
            idx = pandas.DatetimeIndex(value)
            term_time_series = pandas.Series(ones, index=idx)
            # Resampling / bucketing
            time_bin = term_time_series.resample(rule).sum().fillna(0)
            match_data[key] = time_bin
            idx_list.append(idx)

    all_matches = pandas.DataFrame(data=match_data, index=idx_list[0])
    # Re-sample the combined series with the same rule
    all_matches = all_matches.resample(rule).sum().fillna(0)
    print("Terms plotted:")
    print(keys)

    time_chart = vincent.Line(all_matches[keys], width=1150, height=580)
    time_chart.axis_titles(x='Time', y='Freq')
    time_chart.legend(title='Term Timeseries')
    time_chart.to_json(export_fname)
    print("Term-Timeseries file exported at [%s]." % export_fname)

    export_terms(terms_date, count_all, keys,
                 export_fname_trending=export_fname_for_trending_terms,
                 export_fname_non_trending=export_fname_for_non_trending_terms)
    print("Terms with top trending terms are exported at [%s]" %
          export_fname_for_trending_terms)
    print("Terms without top trending terms are exported at [%s]" %
          export_fname_for_non_trending_terms)
def time_visualization():
    dates = []
    with open(fname, 'r') as f:
        for line in f:
            tweet = json.loads(line)
            # let's focus on hashtags only at the moment
            terms_hash = [
                term for term in preprocess(tweet['text'])
                if term.startswith('#')
            ]
            # track when the hashtag is mentioned
            if '#kaplansba' in terms_hash:
                dates.append(tweet['created_at'])

    # a list of "1" to count the hashtags
    ones = [1] * len(dates)
    # the index of the series
    idx = pandas.DatetimeIndex(dates)
    # the actual series (a series of 1s for the moment)
    my_dates = pandas.Series(ones, index=idx)
    # series is resampled per minute (how='sum' is deprecated; call .sum())
    per_minute = my_dates.resample('1Min').sum().fillna(0)

    time_chart = vincent.Line(per_minute)
    time_chart.axis_titles(x='Time', y='Freq')
    time_chart.to_json('time_chart.json')

    # all the data together; per_minute_i, per_minute_s and per_minute_e are the
    # per-minute series for each tracked hashtag, built as above
    match_data = dict(kaplansba=per_minute_i,
                      amazonstudent=per_minute_s,
                      hello=per_minute_e)
    # we need a DataFrame to accommodate multiple series
    all_matches = pandas.DataFrame(data=match_data, index=per_minute_i.index)
    # Resampling as above
    all_matches = all_matches.resample('1Min').sum().fillna(0)

    # and now the plotting
    time_chart = vincent.Line(
        all_matches[['kaplansba', 'amazonstudent', 'hello']])
    time_chart.axis_titles(x='Time', y='Freq')
    time_chart.legend(title='Matches')
    time_chart.to_json('time_chart.json')
def show_acce(map_data_p, lat0_p, lng0_p, vehicleplate_number_p, num1_p,
              acceleration_data_p, lat_p, lng_p, location_time_p,
              file_out_path_p):
    m = folium.Map([lat0_p, lng0_p], zoom_start=8)
    m.add_child(folium.LatLngPopup())  # show latitude/longitude on the map
    route = folium.PolyLine(  # PolyLine joins the coordinates with line segments
        map_data_p,     # the coordinate points to connect
        weight=3,       # line width
        color='blue',   # line color
        opacity=0.8     # line opacity
    ).add_to(m)         # add the line to the map area m

    for i in range(num1_p):
        if i != 0 and i % 59 == 0:
            y_data = [acceleration_data_p[j] for j in range(i - 59, i + 1)]
            vis = vincent.Line(y_data, width=320, height=150)
            vis.axis_titles(x=location_time_p[i - 59] + '至' +
                            location_time_p[i] + '的加速度变化',
                            y='单位:m/s^2')
            vis_json = vis.to_json()
            tooltip = location_time_p[i - 59] + '至' + location_time_p[i]

            # flag the window if any acceleration exceeds +/-3 m/s^2
            status = 0
            for k in range(i - 59, i + 1):
                if acceleration_data_p[k] > 3 or acceleration_data_p[k] < -3:
                    status = 1

            # place a marker every 60 records; clicking it shows a line chart
            # of the acceleration over the previous 60 records
            if status == 1:
                folium.Marker(
                    location=[lat_p[i], lng_p[i]],
                    popup=folium.Popup(max_width=3250).add_child(
                        folium.Vega(vis_json, width=380, height=200)),
                    icon=folium.Icon(color='red', icon='info-sign'),
                    tooltip=tooltip).add_to(m)
            else:
                folium.Marker(
                    location=[lat_p[i], lng_p[i]],
                    popup=folium.Popup(max_width=3250).add_child(
                        folium.Vega(vis_json, width=380, height=200)),
                    tooltip=tooltip).add_to(m)

    html_path = os.path.join('r', file_out_path_p,
                             vehicleplate_number_p + '_acceleration.html')
    m.save(html_path)  # save the result as HTML
    webbrowser.open(html_path, new=1)
def time_plot(fname, search_word, classified):
    ext = fname.split('.')[1]
    stop = stopwords.words('english')
    punctuation = string.punctuation.replace('#', '')
    # Python 3 replacement for the Python 2 str.translate(None, punctuation)
    remove_punct = str.maketrans('', '', punctuation)
    with open(fname, 'r') as FILE:
        dates = []
        if not ext == 'json':
            next(FILE)  # skip the CSV header
            for line in FILE:
                values = line.split(';')
                text = C.unicode_clean(values[-1])
                text = text.translate(remove_punct).strip()
                text = text.replace('RT ', '')
                terms = [term for term in text.lower().split()
                         if term not in stop]
                if search_word.lower() in terms:
                    dates.append(values[1])
        elif ext == 'json':
            for line in FILE:
                information = json.loads(line)
                # escape non-ASCII characters before cleaning
                text = information['text'].encode('unicode_escape').decode('ascii')
                text = C.unicode_clean(text)
                text = text.translate(remove_punct).strip()
                text = text.replace('RT ', '')
                terms = [term for term in text.lower().split()
                         if term not in stop]
                if search_word.lower() in terms:
                    dates.append(information['created_at'])

    # A list of "1" to count the terms
    ones = [1] * len(dates)
    # The index of the series
    idx = pandas.DatetimeIndex(dates)
    # Resampling / bucketing with the rule passed in `classified`
    bar_time = pandas.Series(ones, index=idx)
    bar_time = bar_time.resample(classified).sum().fillna(0)

    # Creating the chart
    time_chart = vincent.Line(bar_time)
    time_chart.axis_titles(x='Time', y='Freq')
    if search_word[0] == '#':
        hashtag = search_word[1:]
        time_chart.legend(title='#%s' % hashtag)
        time_chart.to_json('Time_hash_%s.json' % hashtag,
                           html_out=True,
                           html_path='Time_hash_%s.html' % hashtag)
    else:
        time_chart.legend(title='%s' % search_word)
        time_chart.to_json('Time_%s.json' % search_word,
                           html_out=True,
                           html_path='Time_%s.html' % search_word)
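# A hypothetical invocation of time_plot (not from the original source): bucket
# mentions of '#python' per minute from a line-delimited JSON dump of tweets.
# The file name is a placeholder.
time_plot('tweets.json', '#python', '1Min')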
def graphLine(estacion, idioma):
    models.dbToCsv(estacion, idioma)
    tweets = pd.read_csv('static/tweets.csv')
    tweets['created_at'] = pd.to_datetime(pd.Series(tweets['created_at']))
    tweets.set_index('created_at', drop=False, inplace=True)
    # count tweets per month
    tweets_pm = tweets['created_at'].resample('M').count()

    line = vincent.Line(tweets_pm)
    line.axis_titles(x='Meses', y='Nº Tweets')
    line.colors(brew='Spectral')
    line.to_json('static/area.json')
    return 0
def draw_line(self, data, coordinates, style, label, mplobj=None):
    import vincent  # only import if VincentRenderer is used
    if coordinates != 'data':
        warnings.warn("Only data coordinates supported. Skipping this")
    linedata = {'x': data[:, 0], 'y': data[:, 1]}
    line = vincent.Line(linedata, iter_idx='x',
                        width=self.figwidth, height=self.figheight)

    # TODO: respect the other style settings
    line.scales['color'].range = [style['color']]

    if self.chart is None:
        self.chart = line
    else:
        warnings.warn("Multiple plot elements not yet supported")
def build_json_plot(df, prd, state):
    # filter by the state argument (the original hard-coded 'LA' here)
    df_r = df[df['LocationAbbr'] == state].set_index('Year')
    dic = pd.concat([prd[[state]], df_r], axis=1)
    dic.index = np.arange(dic.shape[0])
    dic.columns = (['Prediction %s' % state, 'LocationAbbr', 'True Value']
                   + list(dic.columns[3:]))

    line = vincent.Line(dic[['True Value', 'Prediction %s' % state]],
                        columns=['True Value', 'Prediction %s' % state],
                        key_on='idx')
    line.axis_titles(x='Year', y='Diabetes in ' + state)
    line.legend(title='Diabetes')
    line.height = 200
    line.width = 200
    return line.grammar()
def render_stats(url, stats, method):
    import vincent
    txt = ''
    if 'calls' in stats:
        calls = stats['calls']
        data = [c['duration'] for c in calls]
        if data:
            line = vincent.Line(data)
            # use the url argument; the leaked comprehension variable the
            # original relied on is a NameError on Python 3
            line.axis_titles(x='%s %s' % (method, url), y='Duration')
            filepath = STATIC_PATH.join("assets", "%s_%s.json" % (method, url))
            line.to_json(str(filepath))
            txt = """
            <div id="vis_%(url)s_%(method)s" class="vis"></div>
            """ % locals()
    return txt
def createTSMap(pos, timeSeries, zoom_start=4):
    # honor the zoom_start argument (the original hard-coded 4)
    ts_map = folium.Map(location=pos.items(), zoom_start=zoom_start,
                        crs='EPSG4326')
    df = timeSeries
    df.index = df.index.values.astype('M8[D]')

    chart = vincent.Line(df[['evi', 'ndvi']], width=300, height=150)
    chart.legend(title='')
    chart.axis_titles(x='dates', y='')

    popup = folium.Popup(max_width=400)
    folium.Vega(chart.to_json(), height=200, width=450).add_to(popup)
    folium.Marker(pos.items(), popup=popup,
                  icon=folium.Icon(color='green', icon='info-sign')).add_to(ts_map)

    wms = folium.features.WmsTileLayer('https://neo.sci.gsfc.nasa.gov/wms/wms',
                                       name='MODIS Data',
                                       format='image/png',
                                       layers='MOD13A2_M_NDVI')
    wms.add_to(ts_map)
    return ts_map
def create_data_plots_map():
    json_files = {}
    dic = {}
    for name in cases.index:
        coord = get_coordinates(name)
        dic[name] = coord
        df = cases_pT_new.T[name].to_frame(name='cases')
        df['deaths*10'] = deaths_pT_new.T[name] * 10
        # smooth the daily counts with a 7-day centered rolling mean
        line = v.Line(df.rolling(7, center=True, min_periods=1).mean())
        line.axis_titles(x='Date', y='per 100k inhabitants')
        line.legend(name)
        line.width = 350
        line.height = 150
        json_files[name] = str(line.to_json())

    df = pd.DataFrame(dic, index=['lat', 'long']).T
    df2 = pd.DataFrame(json_files, index=['json']).T
    df['json'] = df2
    coord = df.copy()
    coord.to_csv('data/coord.csv')
    return True
def test_datetimeandserial(self):
    '''Test pandas serialization and datetime parsing'''
    # pandas.io.data was removed from pandas; the equivalent now lives in
    # the pandas-datareader package
    import pandas_datareader.data as web

    all_data = {}
    for ticker in ['AAPL', 'GOOG']:
        all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2004', '1/1/2006')
    price = pd.DataFrame(
        {tic: frame['Adj Close'] for tic, frame in all_data.items()})

    scatter = vincent.Scatter()
    scatter.tabular_data(price, columns=['AAPL', 'GOOG'])
    assert scatter.data[0]['values'][0]['x'] == 10.49
    nt.assert_is_none(scatter.data[0]['values'][0]['y'])

    line = vincent.Line()
    line.tabular_data(price, columns=['AAPL'])
    assert line.data[0]['values'][0]['x'] == 1073030400000
def timeDataVisualization(self):
    dates_Search = []
    with open(self.fname, 'r') as f:
        for line in f:
            tweet = json.loads(line)
            terms_only = [
                term for term in self._preprocess(tweet.get('text', 'nil'))
            ]
            if 'search' in terms_only:
                dates_Search.append(tweet['created_at'])

    # a list of "1" per occurrence, indexed by tweet time
    ones = [1] * len(dates_Search)
    idx = pandas.DatetimeIndex(dates_Search)
    Search = pandas.Series(ones, idx)
    # resample per minute (how='sum' is deprecated; call .sum() instead)
    per_minute = Search.resample('1Min').sum().fillna(0)

    time_chart = vincent.Line(per_minute)
    time_chart.axis_titles(x="time", y="Freq")
    time_chart.to_json('time_chart.json')
def main3(fn):
    with open(fn + '.json', 'r') as f:
        count_all = Counter()
        search_hashes = ['#Endomondo', '#MexicoNeedsWWATour']
        # one list of timestamps per tracked hashtag, initialized once
        # (not once per tweet, as in the original)
        datess = [[] for _ in search_hashes]
        for line in f:
            tweet = json.loads(line)
            tokens = preprocess(tweet['text'])
            terms = terms_hash(tokens)
            count_all.update(terms)
            for i in range(len(search_hashes)):
                if search_hashes[i] in terms:
                    datess[i].append(getTime(tweet))
        print(count_all.most_common(10))

        per_minutes = [time_series(dates) for dates in datess]
        keys = search_hashes
        values = per_minutes

        # all the data together
        match_data = dict(zip(keys, values))
        # we need a DataFrame to accommodate multiple series
        all_matches = pandas.DataFrame(data=match_data, index=values[0].index)
        # Resampling as above (how='sum' is deprecated; call .sum())
        all_matches = all_matches.resample('1Min').sum().fillna(0)

        # and now the plotting
        time_chart = vincent.Line(all_matches[keys])
        time_chart.axis_titles(x='Time', y='Freq')
        time_chart.legend(title='Matches')
        time_chart.to_json(fn + '.time_chart.json')
def graphLineIdioma(lang, estacion):
    models.dbToCsv(estacion, lang)
    tweets = pd.read_csv('static/tweets.csv')
    # keep only the tweets written in the requested language (the original
    # iterated row by row, reassigning the whole column on each match)
    tweets = tweets[tweets['lang'] == lang]
    tweets['created_at'] = pd.to_datetime(pd.Series(tweets['created_at']))
    tweets.set_index('created_at', drop=False, inplace=True)
    # count tweets per month
    tweets_pm = tweets['created_at'].resample('M').count()

    line = vincent.Line(tweets_pm)
    line.axis_titles(x='Meses', y='Nº Tweets')
    line.colors(brew='Spectral')
    line.to_json('static/area.json')
    return 0
def graph(id):
    project = Project.query.get_or_404(id)
    logs = project.logs
    log_number = len([1 for _ in logs])
    if project.logs and log_number > 1:
        logs = project.logs.order_by(ProjectLog.log_date)
        x = [
            datetime.combine(log.log_date, datetime.min.time()).timestamp() * 1000
            for log in logs
        ]
        y = [log.previous_score * score_multiplier for log in logs]
        multi_iter = {'x': x, 'data': y}

        line = vincent.Line(multi_iter, iter_idx='x')
        line.scales['x'] = vincent.Scale(name='x', type='time', range='width',
                                         domain=vincent.DataRef(data='table',
                                                                field="data.idx"))
        line.scales['y'] = vincent.Scale(name='y', range='height', nice=True,
                                         domain=[0, score_multiplier])
        line.scales['color'] = vincent.Scale(name='color', range=['#12897D'],
                                             type='ordinal')
        line.axes['y'].ticks = 3
        line.axes['x'].ticks = 7
        if line_style:
            line.marks['group'].marks[0].properties.enter.interpolate = \
                vincent.ValueRef(value=line_style)
        return jsonify({"status": "success", "data": line.grammar()})
    else:
        return failure_response("No history for this project", 404)
def test2():
    # pandas.io.data was removed from pandas; the replacement lives in the
    # pandas-datareader package
    from pandas_datareader import data

    all_data = {}
    for ticker in ['AAPL', 'IBM', 'YHOO', 'MSFT']:
        all_data[ticker] = data.get_data_yahoo(ticker, '1/1/2010', '1/1/2012')
    # avoid shadowing the `data` module inside the comprehension
    price = pd.DataFrame(
        {tic: frame['Adj Close'] for tic, frame in all_data.items()})

    import vincent
    line = vincent.Line(price)
    line.axis_titles(x='Date', y='Price')
    line.legend(title='IBM vs AAPL')
    js = line.to_json('out.json', html_out=True)
def plotdataPopVega(data, vals):
    '''Function to create a data popup, as a time series,
    which can then be added to a marker.'''
    df = data[vals]
    # vincent does not handle missing values, so fill them explicitly
    df.fillna(value='null', inplace=True)

    line = vincent.Line(df)
    line.axis_titles(x="Time", y="Mass Concentration")
    line.legend(title="Values")

    # size the chart to the length of the data
    width = len(df.index)
    if width < 500:
        width = 400
    line.width = width
    line.height = 200

    vega = folium.Vega(json.loads(line.to_json()), width="30%", height="10%")
    popup = folium.Popup(max_width=line.width + 75).add_child(vega)
    return popup
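# Hypothetical usage of plotdataPopVega: 'station_df' and its column names are
# placeholders for the caller's data; the popup attaches to a regular marker.
m = folium.Map(location=[52.52, 13.40], zoom_start=10)
popup = plotdataPopVega(station_df, ['PM10', 'PM2.5'])
folium.Marker(location=[52.52, 13.40], popup=popup).add_to(m)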
def main2(fn):
    with open(fn + '.json', 'r') as f:
        data_samples = []
        doc_lengths = []
        dates = []
        for line in f:
            tweet = json.loads(line)
            sample, length = preprocess_text(getText(tweet))
            data_samples.append(sample)
            doc_lengths.append(length)
            dates.append(getTime(tweet))

        n_features = 1000
        n_topics = 10
        n_top_words = 20
        data_viz, doc_topic_dists = lda_viz(data_samples, doc_lengths,
                                            n_features, n_topics, n_top_words)

        # count a tweet if topic 9 dominates it (probability >= 0.5)
        counts = []
        for i in range(len(dates)):
            count = 1 if doc_topic_dists[i][9] >= 0.5 else 0
            counts.append(count)

        per_minute = time_series(dates, '1D', counts)
        per_minute.to_csv(fn + '.topic.csv', sep='\t', encoding='utf-8')

        # and now the plotting
        time_chart = vincent.Line(per_minute)
        time_chart.axis_titles(x='Time', y='Freq')
        time_chart.to_json(fn + '.topic.time_chart.json')
# f is the file pointer to the JSON data set; count_all (a Counter) and
# dates_trump (a list) are assumed to be initialized earlier in the script
for line in f:  # for every tweet
    if line.strip():
        tweet = json.loads(line)
        # let's focus on hashtags only at the moment
        terms_hash = [term
                      for term in preprocess(tweet['text'].translate(non_bmp_map))
                      if term.startswith('#')]
        # track when the hashtag is mentioned
        count_all.update(terms_hash)
        if '#Trump' in terms_hash:
            dates_trump.append(tweet['created_at'])

# a list of "1" to count the hashtags
ones = [1] * len(dates_trump)
# the index of the series (pandas parses the tweet date format)
idx = pandas.DatetimeIndex(dates_trump)
# the actual series (a series of 1s for the moment)
trump = pandas.Series(ones, index=idx)
# Resampling / bucketing
per_minute = trump.resample('1Min').sum().fillna(0)

# plot the bucketed series (the original passed the raw series of 1s instead)
time_chart = vincent.Line(per_minute)
time_chart.axis_titles(x='Time', y='Freq')
time_chart.to_json('time_chart.json', html_out=True, html_path='chart.html')
# Print the co-occurrences for search_word
print("Co-occurrences for %s" % search_word)
print(count_search.most_common(10))

com_max = []
# For each term, look for the most common co-occurrent terms
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1))
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
# Get the most frequent co-occurrences
term_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)
# print(term_max[:5])  # the five most frequent pairs
# print(count_all.most_common(10))

# Visual representation without time reference
# word_freq = count_all.most_common(20)
# labels, freq = zip(*word_freq)
# data = {'data': freq, 'x': labels}
# bar = vincent.Bar(data, iter_idx='x')
# bar.to_json('term_freq.json')

# Visual representation with time reference (IDE is the bucketed time series
# built earlier in the script)
time_chart = vincent.Line(IDE)
time_chart.axis_titles(x='Time', y='Freq')
time_chart.to_json('time_chart.json')