Пример #1
0
def main(file_path):
    """Print a per-participant histogram of message dates from a pickled conversation.

    An error is logged and the function returns early when ``file_path`` does
    not exist or does not end in ``.pkl``. Otherwise the pickle is loaded, its
    ``'messages'`` entry becomes a dataframe with the columns ``Timestamp``,
    ``Type`` and ``Participant``, and a histogram of the message days, filled
    by participant, is printed.

    NOTE: ``pickle.load`` can execute code embedded in the file, so only
    trusted pickle files should be passed in.
    """
    def to_day_ordinal(raw_timestamp):
        # Timestamp -> datetime -> proleptic Gregorian ordinal (whole days).
        moment = datetime.datetime.fromtimestamp(float(raw_timestamp))
        return moment.toordinal()

    # Guard: the raw data path has to exist.
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return

    # Guard: the raw data has to be a pickle file.
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    with open(file_path, 'rb') as pickle_file:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(pickle_file)

        title = conversation['conversation_name']
        LOG_INFO('Found conversation: {}'.format(title))

        messages = pd.DataFrame(conversation['messages'])
        messages.columns = ['Timestamp', 'Type', 'Participant']
        messages['Datetime'] = messages['Timestamp'].apply(to_day_ordinal)

        # Assemble the plot one component at a time, left to right.
        chart = ggplot.ggplot(messages, ggplot.aes(x='Datetime', fill='Participant'))
        chart = chart + ggplot.geom_histogram(alpha=0.6, binwidth=2)
        chart = chart + ggplot.scale_x_date(labels='%b %Y')
        chart = chart + ggplot.ggtitle(title)
        chart = chart + ggplot.ylab('Number of messages')
        chart = chart + ggplot.xlab('Date')

        print(chart)
Пример #2
0
def plot_deg_distrib(G):
    """Print a histogram of the in- and out-degree distributions of ``G``.

    ``wa.degree_distribution(G)`` supplies three collections (in-degree,
    out-degree, total degree); only the first two are plotted, overlaid in
    one histogram and distinguished by colour.

    Args:
        G: graph object accepted by ``wa.degree_distribution``.
    """
    in_deg, out_deg, _ = wa.degree_distribution(G)
    degrees = pd.DataFrame({
        'in_deg': pd.Series(in_deg),
        'out_deg': pd.Series(out_deg),
    })
    # melt() stacks both columns into the long-form 'variable'/'value'
    # layout that the aesthetics below refer to.
    long_form = pd.melt(degrees)
    # The melted frame built above is the plot data.
    p = gg.ggplot(gg.aes(x='value', color='variable', fill='variable'),
                  data=long_form) + gg.geom_histogram(alpha=0.6, binwidth=1)
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(p)
Пример #3
0
    def histogram(self, dataframe, bins=100, width=None, height=None,
                  palette=None, title='Histogram', values=None,
                  groups=None, legend=True):
        """Return a grouped histogram plot of the ``values`` column.

        ``groups`` drives both the fill and the outline colour, ``bins`` is
        forwarded as the ``breaks`` argument of ``geom_histogram`` and bars
        use ``position="fill"``. When ``palette`` is ``None`` the value
        stored under ``'palette'`` in ``self.__default_options__`` is used.
        ``width``, ``height`` and ``legend`` are accepted but not used by
        this method.
        """
        if palette is None:
            palette = self.__default_options__.get('palette', None)

        # Components are added strictly left to right.
        plot = ggplot(dataframe, aes(x=values, fill=groups, color=groups))
        plot = plot + geom_histogram(alpha=0.6, breaks=bins, position="fill")
        plot = plot + self._palette(palette)
        plot = plot + ggtitle(title)
        plot = plot + scale_y_continuous(name="Count (%s)" % values)
        return plot
Пример #4
0
def plot_bin_dists(df, bin_def="distance_bin <= 500"):
    """Return a histogram of ``R2`` faceted by ``distance_bin``.

    Rows are selected with ``df.query(bin_def)`` and the query string also
    serves as the plot title. The matplotlib ``figure.figsize`` rcParam is
    set globally as a side effect.
    """
    # 65% of a 16 x 12 figure.
    plt.rcParams['figure.figsize'] = np.array([16, 12]) * 0.65

    mapping = gp.aes(x='R2')
    selected_rows = df.query(bin_def)

    plot = gp.ggplot(mapping, data=selected_rows)
    plot = plot + gp.geom_histogram(fill='coral')
    plot = plot + gp.facet_wrap("distance_bin")
    plot = plot + gp.theme_seaborn(context='talk')
    plot = plot + gp.ggtitle(bin_def)
    return plot
Пример #5
0
def plot_deg_distrib(G):
    """Print a histogram of the in- and out-degree distributions of ``G``.

    ``wa.degree_distribution(G)`` supplies three collections (in-degree,
    out-degree, total degree); only the first two are plotted, overlaid in
    one histogram and distinguished by colour.

    Args:
        G: graph object accepted by ``wa.degree_distribution``.
    """
    in_deg, out_deg, _ = wa.degree_distribution(G)
    degrees = pd.DataFrame({
        'in_deg': pd.Series(in_deg),
        'out_deg': pd.Series(out_deg),
    })
    # melt() stacks both columns into the long-form 'variable'/'value'
    # layout that the aesthetics below refer to.
    long_form = pd.melt(degrees)
    # The melted frame built above is the plot data.
    p = gg.ggplot(gg.aes(x='value', color='variable', fill='variable'),
                  data=long_form) + gg.geom_histogram(alpha=0.6, binwidth=1)
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(p)
Пример #6
0
    def hist_chart(self, conn, column, table_chosen, title):
        """Print a histogram of one column fetched through ``dfile.single_selector``.

        The data is requested with the given connection, table and column,
        plotted with the gray theme and ``title``, and printed after two
        timing lines (the current time and its difference to ``a``).
        """
        frame = dfile.single_selector(conn=conn,
                                      table=table_chosen,
                                      column=column)

        chart = ggplot(aes(x=column), data=frame)
        chart = chart + geom_histogram()
        chart = chart + theme_gray()
        chart = chart + labs(title=title)

        finished_at = datetime.datetime.now()
        print(finished_at)
        # NOTE(review): `a` is not defined in this method; presumably a
        # start timestamp set elsewhere -- confirm it is in scope.
        print(finished_at - a)
        print(chart)
Пример #7
0
def render(data, bin_width, plot_density=False):
    """Print either a density plot or a histogram of ``data`` over ``datetime``.

    Args:
        data: frame with ``datetime`` and ``conversationWithName`` columns
            (both are referenced by the aesthetics).
        bin_width: forwarded as ``binwidth`` to the histogram; not used for
            the density plot.
        plot_density: when true, draw coloured density curves instead of a
            filled histogram.
    """
    if plot_density:
        plot = ggplot.ggplot(data, ggplot.aes(x='datetime', color='conversationWithName'))
        plot = plot + ggplot.geom_density()
        plot = plot + ggplot.scale_x_date(labels='%b %Y')
        plot = plot + ggplot.ggtitle('Conversation Densities')
        plot = plot + ggplot.ylab('Density')
        plot = plot + ggplot.xlab('Date')
    else:
        plot = ggplot.ggplot(data, ggplot.aes(x='datetime', fill='conversationWithName'))
        plot = plot + ggplot.geom_histogram(alpha=0.6, binwidth=bin_width)
        plot = plot + ggplot.scale_x_date(labels='%b %Y', breaks='6 months')
        plot = plot + ggplot.ggtitle('Message Breakdown')
        plot = plot + ggplot.ylab('Number of Messages')
        plot = plot + ggplot.xlab('Date')

    print(plot)
Пример #8
0
    def histogram(self,
                  dataframe,
                  bins=100,
                  width=None,
                  height=None,
                  palette=None,
                  title='Histogram',
                  values=None,
                  groups=None,
                  legend=True):
        """Return a grouped histogram plot of the ``values`` column.

        ``groups`` is mapped to both fill and outline colour, ``bins`` is
        passed to ``geom_histogram`` as ``breaks`` and the bars use
        ``position="fill"``. A ``palette`` of ``None`` falls back to the
        ``'palette'`` entry of ``self.__default_options__``. ``width``,
        ``height`` and ``legend`` are accepted but unused here.
        """
        if palette is None:
            palette = self.__default_options__.get('palette', None)

        # Build the plot step by step, preserving left-to-right order.
        chart = ggplot(dataframe, aes(x=values, fill=groups, color=groups))
        chart = chart + geom_histogram(alpha=0.6, breaks=bins, position="fill")
        chart = chart + self._palette(palette)
        chart = chart + ggtitle(title)
        chart = chart + scale_y_continuous(name="Count (%s)" % values)
        return chart
Пример #9
0
import pandas as pd
meat = gp.meat

# 'beef' against 'date': scatter plot, line plot, then both layered together.
p = (gp.ggplot(gp.aes(x='date', y='beef'), data=meat)
     + gp.geom_point(color='red')
     + gp.ggtitle(u'散点图'))
print(p)
p = (gp.ggplot(gp.aes(x='date', y='beef'), data=meat)
     + gp.geom_line(color='blue')
     + gp.ggtitle(u'折线图'))
print(p)
p = (gp.ggplot(gp.aes(x='date', y='beef'), data=meat)
     + gp.geom_point(color='red')
     + gp.geom_line(color='blue')
     + gp.ggtitle(u'散点图+折线图'))
print(p)

# Gather the variables to be plotted into a single column.
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
# meat_lng now holds 'date', 'value' (the variables' values) and
# 'variable' (the variables' names).
p = (gp.ggplot(gp.aes(x='date', y='value', colour='variable'), data=meat_lng)
     + gp.geom_point()
     + gp.geom_line())
print(p)

# Same long-form data, one facet per variable.
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
p = (gp.ggplot(gp.aes(x='date', y='value', colour='variable'), data=meat_lng)
     + gp.geom_point()
     + gp.facet_wrap('variable'))
print(p)

# Histogram of the 'beef' column.
p = gp.ggplot(gp.aes(x='beef'), data=meat) + gp.geom_histogram()
print(p)

# Histograms of 'beef' and 'pork', one facet each.
meat_lng = pd.melt(meat[['date', 'beef', 'pork']], id_vars='date')
p = (gp.ggplot(gp.aes(x='value'), data=meat_lng)
     + gp.facet_wrap('variable')
     + gp.geom_histogram())
print(p)
Пример #10
0
def plot_weather_data(turnstile_weather):
    """Return a bar plot of the average hourly entries per day of the week.

    The weekday of every row is derived from its ``DATEn`` value, the
    ``ENTRIESn_hourly`` values are averaged per weekday, and the averages
    are drawn as bars labelled Monday through Sunday.

    The input dataframe is left unmodified and no pandas option is changed.

    Args:
        turnstile_weather: dataframe with at least the ``DATEn`` and
            ``ENTRIESn_hourly`` columns.

    Returns:
        The ggplot object for the chart.

    See also:
        https://pypi.python.org/pypi/ggplot/
        https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    """
    # Work on a private copy of the two needed columns so the caller's frame
    # is not modified and the chained-assignment warning never arises.
    daily = turnstile_weather[['DATEn', 'ENTRIESn_hourly']].copy()
    # Timestamp.weekday() numbers Monday as 0 and Sunday as 6. Calling the
    # method on each element avoids the removed ``pd.datetime`` alias.
    daily['weekday'] = pd.to_datetime(daily['DATEn']).apply(
        lambda day: day.weekday())

    averageentries_on_weekday = daily.groupby(
        'weekday', as_index=False).ENTRIESn_hourly.mean()
    averageentries_on_weekday = averageentries_on_weekday.rename(
        columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'})

    plot = (
        gg.ggplot(averageentries_on_weekday,
                  aes('weekday', 'avg_ENTRIESn_hourly'))
        + ggtitle('Avg number of Entries by Day of Week')
        + xlab('Day of Week')
        + ylab('avg number of Entries')
        + gg.geom_histogram(stat="bar", position="stack")
        + scale_x_discrete(
            limits=(-1, 7),
            breaks=range(0, 7, 1),
            labels=["Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday"])
    )

    return plot
Пример #11
0
    # Create a new long-form dataframe for the current feature, for clean
    # plotting purposes. The two coefficient collections become two columns,
    # which melt() then stacks into 'variable'/'value' rows.
    values_dict = {
        "significant": coefficients[feature]["significant"],
        # NOTE(review): the source key is spelled "unsignificant" while the
        # label used for plotting is "insignificant" -- confirm the key name.
        "insignificant": coefficients[feature]["unsignificant"]
    }
    # Built row-wise and then transposed; presumably so the two collections
    # may differ in length -- TODO confirm.
    df = pd.DataFrame.from_dict(values_dict, orient='index')
    df = df.transpose()
    df = pd.melt(df)
    # Tag every row with the feature name so the combined frame can be faceted.
    df['feature'] = feature
    dfs_to_concat.append(df)

# Combine the per-feature frames collected in dfs_to_concat into one frame.
master_df = pd.concat(dfs_to_concat)

# histogram: distribution of 'value', filled and outlined by 'variable',
# with one facet per 'feature' (3 columns, free scales)
p = ggplot(aes(x='value', fill='variable', color='variable'), data=master_df)
p += geom_histogram(bins=25, alpha=0.5)
p += scale_x_continuous(limits=(-25, 25))
p += ggtitle("sarimax coefficient magnitude distribution")
p += facet_wrap("feature", ncol=3, scales="free")
# Single-space labels for both axes.
p += labs(x=" ", y=" ")

# visuals: gray theme with a 10pt monospace font
# NOTE(review): _rcParams is an underscore-prefixed (private) attribute of the
# theme object -- confirm it is stable for the ggplot version in use.
t = theme_gray()
t._rcParams['font.size'] = 10
t._rcParams['font.family'] = 'monospace'

p += t
p.save("arima_1/" + "histogram.png")

# boxplot: 'value' per 'variable' (only the base plot is created here)
p = ggplot(aes(x='variable', y='value'), data=master_df)
Пример #12
0
# Print the summary returned by describe() for subjects_words_count.
print(subjects_words_count.describe())

#%%
import ggplot as gg

# Wrap the word counts in a frame with a single "count" column.
df = pd.DataFrame(subjects_words_count, columns = ["count"])

# Base plot of "count" with axis labels and a title.
hist =  gg.ggplot(df, gg.aes(x = "count"))
# NOTE(review): the three label components are added to each other before
# being added to the plot -- confirm that all three end up applied.
hist += gg.xlab("# of words") +\
        gg.ylab("Frequency") +\
        gg.ggtitle("Frequency of words")

# Vertical markers for mean (red) and median (blue).
# NOTE(review): df.mean() / df.median() are taken over the whole one-column
# frame, so they yield a Series rather than a scalar -- confirm geom_vline
# accepts that, or use df["count"].mean() / df["count"].median().
hist += gg.geom_vline(x = df.mean(), color="red")
hist += gg.geom_vline(x = df.median(), color="blue")
hist += gg.geom_density(color="green")
hist += gg.geom_histogram(binwidth=1, color="grey")

# Bare expression: shows the plot when run as an interactive cell.
hist

#%%

# First attempt to classify subjects per tag

# Raw inputs: the train and test subjects.
X_raw_train = subjects_train
X_raw_test = subjects_test

# Labels: the `target` attribute of the raw train and test data sets.
Y_train = raw_data_train.target
Y_test = raw_data_test.target

# Target names as provided by the training data set.
target_names = raw_data_train.target_names
def get_target_name(index):
Пример #13
0
 def test_ggtitle(self):
     """Adding ``ggtitle`` sets the plot's ``title`` attribute."""
     plot = gg.ggplot(gg.aes(x='mpg'), gg.mtcars)
     plot = plot + gg.geom_histogram()
     plot = plot + gg.ggtitle("TEST")
     self.assertEqual(plot.title, "TEST")
Пример #14
0
 def test_ylab(self):
     """Adding ``ylab`` sets the plot's ``ylab`` attribute."""
     plot = gg.ggplot(gg.aes(x='mpg'), gg.mtcars)
     plot = plot + gg.geom_histogram()
     plot = plot + gg.ylab("TEST")
     self.assertEqual(plot.ylab, "TEST")
Пример #15
0
# Upgrade pip first so that installing .whl files does not fail. Note: the
# .whl file name must not be changed, and do not download it with Xunlei.
# pip install --upgrade setuptools

# Install numpy and scipy. On Windows they have to be compiled; prebuilt .whl
# packages can be downloaded from http://www.lfd.uci.edu/~gohlke/pythonlibs/
# pip install .whl

# Windows also needs VC++ 14.0: download "Visual C++ Build Tools 2015" from
# http://landinghub.visualstudio.com/visual-cpp-build-tools

# Install ggplot
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple ggplot

# Line and scatter plots
import ggplot as gp

# Use the sample data set bundled with ggplot.
meat = gp.meat
# x axis: 'date', y axis: 'beef', colour mapped to 'beef', data set: meat.
p = gp.ggplot(gp.aes(x='date', y='beef', color='beef'), data=meat)

# Line plot
p + gp.geom_line()
# Scatter plot
p + gp.geom_point()

# Faceted plot: price against carat, coloured by 'color', one facet per cut.
(gp.ggplot(gp.aes(x='carat', y='price', color='color'), data=gp.diamonds)
 + gp.geom_point()
 + gp.facet_wrap('cut'))

# Histogram of diamond prices.
gp.ggplot(gp.aes(x='price'), data=gp.diamonds) + gp.geom_histogram()
Пример #16
0
 def test_ylab(self):
     """The y-axis label given to ``ylab`` is stored on the plot as ``ylab``."""
     base = gg.ggplot(gg.aes(x='mpg'), gg.mtcars)
     p = base + gg.geom_histogram() + gg.ylab("TEST")
     self.assertEqual(p.ylab, "TEST")
Пример #17
0
 def test_ggtitle(self):
     """The text given to ``ggtitle`` is stored on the plot as ``title``."""
     base = gg.ggplot(gg.aes(x='mpg'), gg.mtcars)
     p = base + gg.geom_histogram() + gg.ggtitle("TEST")
     self.assertEqual(p.title, "TEST")
# Report the mean of DI and the mean absolute difference between DI and DS.
print("Mean of DI: " + str(df_pre.DI.mean()))
# Take abs() before averaging: otherwise positive and negative differences
# cancel and the printed figure is a signed mean, not the MAD the label names.
print("MAD DI-DS: " + str((df_pre["DI"] - df_pre["DS"]).abs().mean()))

# #### Get to know the data set

# In[11]:

# import additional dependencies for plotting
from ggplot import geom_histogram, geom_density
from ggplot import *

# In[12]:

# Distribution of the target variable DI: teal histogram with bins 2 wide.
# Left as a bare expression so an interactive cell displays the plot.
(
    ggplot(aes(x='DI'), data=df_pre)
    + geom_histogram(binwidth=2, alpha=0.6, fill="#008080", color="#20b2aa")
    + xlab("DI")
    + ggtitle("Distribution of DI")
)

# # 2. Outlier detection and handling

# In[13]:

# The distribution may contain negative values:
# show the rows whose DI is negative.
df_pre[df_pre["DI"] < 0]

# In[14]:

# The duration of transportation cannot be negative, so drop the rows with a
# negative DI. The comparison must be parenthesised: unary ~ binds tighter
# than <, so without the parentheses ~ is applied to the DI values themselves
# (a TypeError for float columns) instead of negating the boolean mask.
df_pre = df_pre[~(df_pre["DI"] < 0)]
Пример #19
0
        tile(w_from_figure_wh_ratio, norm(data)),
        '%s-layer-acts-%s-%s-(i=%s)' % (img_desc, layer, show_tuple_tight(data.shape), batch_i),
    )

# Split the network blobs by the rank of their data: 4-D blobs go to
# conv_layers, all others to fc_layers. List comprehensions replace the
# tuple-parameter lambdas, which are a syntax error on Python 3, and produce
# real lists on both Python 2 and Python 3.
conv_layers = [(layer, acts) for layer, acts in net.blobs.items()
               if len(acts.data.shape) == 4]
fc_layers = [(layer, acts) for layer, acts in net.blobs.items()
             if len(acts.data.shape) != 4]

# Plot conv acts: one plot_conv_acts call per (layer, activations) pair.
for layer, acts in conv_layers:
    plot_conv_acts(layer, acts)

# Plot fc acts
# Build one frame per fc layer from the activations of batch item batch_i,
# tag it with the layer name and concatenate; reset_index() adds the
# 'index' column used as the x axis below.
# assumes acts.data[batch_i] is 1-D for these layers -- TODO confirm
df = pd.concat([
    pd.DataFrame({'act': acts.data[batch_i], 'layer': layer}).reset_index()
    for layer, acts in fc_layers
])
# Scatter of activation value against index, one free-scaled facet per layer.
plot_gg(gg_layer(
    gg.ggplot(df, gg.aes(y='act', x='index')),
    gg.geom_point(alpha=.5),
    gg.facet_wrap(x='layer', scales='free'),
    gg.ggtitle('%s layer acts fc/prob points (i=%s)' % (img_desc, batch_i)),
))
# Histogram of activation values per layer (25 bins) on a log y scale with
# the lower y limit set to 0.1.
plot_gg(gg_layer(
    gg.ggplot(df, gg.aes(x='act')),
    gg.geom_histogram(bins=25, size=0),
    gg.facet_wrap(x='layer', scales='free'),
    gg.scale_y_log(),
    gg.ylim(low=0.1),
    gg.ggtitle('%s layer acts fc/prob histo (i=%s)' % (img_desc, batch_i)),
))