示例#1
0
def visualize_all(segment, rows, columns):
    """Draw one count plot per variable of ``segment`` on a subplot grid.

    Iterating ``segment`` must yield the keys used to index it (for a
    pandas DataFrame these are the column labels).  Variables are paired
    with grid cells in order; if there are more variables than
    ``rows * columns`` cells, the surplus variables are silently skipped.

    Args:
        segment: Table-like object whose variables are plotted.
        rows: Number of subplot rows.
        columns: Number of subplot columns.
    """
    # squeeze=False always yields a 2-D array of Axes, so flatten() also
    # works for a 1x1 grid (where subplots would otherwise return a bare Axes).
    _, axes = plt.subplots(rows, columns, figsize=(20, 10), squeeze=False)
    for variable, subplot in zip(segment, axes.flatten()):
        values = segment[variable]
        # Pass the series as ``x=``: recent seaborn releases interpret the
        # first positional argument as ``data``, not as the x variable.
        sns.countplot(x=values, ax=subplot,
                      order=values.value_counts().index)
        for label in subplot.get_xticklabels():
            label.set_rotation(0)
示例#2
0
# Load every bitcoinTweets_*.txt file from the working directory, tag each
# file's rows with a running month number and plot sentiment counts per month.
path = os.getcwd()
# glob returns matches in arbitrary order; sort so that the month number
# assigned to each file is reproducible between runs.
# NOTE(review): the sort is lexicographic ("_10" sorts before "_2") -- confirm
# the file naming makes that the intended chronological order.
allFiles = sorted(glob.glob(os.path.join(path, "bitcoinTweets_*.txt")))
if not allFiles:
    raise FileNotFoundError(
        "no bitcoinTweets_*.txt files found in " + path)

list_ = []
for i, file_ in enumerate(allFiles):
    df = pd.read_csv(file_, sep='::', engine='python', header=None)
    # Month numbering starts at 16 for the first file.
    df['month'] = i + 16
    list_.append(df)
frame = pd.concat(list_)
# Columns are renamed by position, so the appended 'month' column becomes
# 'date'.  Assumes each file has exactly two '::'-separated fields
# (tweet text, sentiment) -- TODO confirm against the data files.
frame.columns = ['tweets', 'sentiment', 'date']

frame['sentiment'] = frame['sentiment'].astype('category')
sns.set()

plt.figure()

# One bar group per sentiment value, split by the per-file month number.
sns.countplot(x='sentiment', hue='date', data=frame, palette="Greens_d")
plt.show()
示例#3
0
def gridSpec(title, df):
    """Visualize segmented data as a dashboard on a 6x4 subplot grid.

    Every column of ``df`` that has a panel defined below is drawn in its
    own grid cell; all other columns are ignored.  The figure is displayed
    with ``plt.show()``.

    ``vPercent`` and ``hPercent`` are helpers defined elsewhere; they are
    called with the panel's Axes and ``df`` after a vertical / horizontal
    count plot (presumably to annotate bars with percentages -- verify).

    Args:
        title: Text used as the figure's super-title.
        df: DataFrame holding the segment to visualize.
    """
    fig = plt.figure(figsize=(10, 15))
    fig.suptitle(title, size=20)
    gs = gridspec.GridSpec(nrows=6,
                           ncols=4,
                           figure=fig,
                           width_ratios=[1, 1, 1, 1],
                           height_ratios=[1, 1, 1, 1, 1, 1],
                           wspace=0.3,
                           hspace=0.7)

    yes_no_colors = {'Y': 'limegreen', 'N': '#FA8072'}

    def age_bars(column, cell):
        # Bar chart of integer value counts; bars are labelled by position
        # (use_index=False), not by the value itself.
        ax = fig.add_subplot(cell)
        df[column].astype('int').value_counts().plot(
            kind='bar', rot=0, use_index=False, ax=ax)
        vPercent(ax, df)
        ax.set_title(column)

    def pie(column, cell):
        # Pie chart of value counts with whole-number percentage labels.
        ax = fig.add_subplot(cell)
        df[column].value_counts().plot(kind='pie', autopct='%.f%%', ax=ax)
        ax.set_xlabel(column)
        ax.set_ylabel(None)

    def counts(column, cell, horizontal=False, xlabel=None, palette=None):
        # Count plot drawn on an explicit Axes.  The series is passed by
        # keyword because recent seaborn treats the first positional
        # argument as ``data``.  ``xlabel=None`` keeps seaborn's own label.
        ax = fig.add_subplot(cell)
        values = df.loc[:, column]
        if horizontal:
            sns.countplot(y=values, ax=ax, palette=palette)
            hPercent(ax, df)
        else:
            sns.countplot(x=values, ax=ax, palette=palette)
            vPercent(ax, df)
        if xlabel is not None:
            ax.set_xlabel(xlabel)
        ax.set_ylabel(None)

    # column name -> (drawing function, grid cell, extra options)
    panels = {
        # row 1
        'age': (age_bars, gs[0, 0:4], {}),
        # rows 2-4
        'net_worth': (pie, gs[1:4, 0:2], {}),
        'household_income': (pie, gs[1:4, 2:4], {}),
        # rows 5 and 6
        'investment_personal': (
            counts, gs[4, 0],
            {'xlabel': 'investment_personal', 'palette': yes_no_colors}),
        'investment_real_estate': (
            counts, gs[5, 0],
            {'xlabel': 'investment_real_estate', 'palette': yes_no_colors}),
        'investment_stocks_bonds': (
            counts, gs[4, 1],
            {'xlabel': 'investment_stocks_bonds', 'palette': yes_no_colors}),
        'life_insurance_policy_owner': (
            counts, gs[5, 1],
            {'xlabel': 'life_insurance_policy_owner',
             'palette': yes_no_colors}),
        'children': (counts, gs[5, 2], {}),
        'household_size': (
            counts, gs[4, 2],
            {'horizontal': True, 'xlabel': 'House Hold Size'}),
        'house_owner': (counts, gs[4, 3], {'xlabel': 'House Owner'}),
        'marital_status': (
            counts, gs[5, 3],
            {'horizontal': True, 'xlabel': 'Marital Status'}),
    }

    for column in df.columns:
        panel = panels.get(column)
        if panel is None:
            continue
        draw, cell, options = panel
        draw(column, cell, **options)

    plt.show()
示例#4
0
def gridSpec2(title, df):
    """Visualize segmented data as a dashboard on a 6x4 subplot grid.

    Every column of ``df`` that has a panel defined below is drawn in its
    own grid cell; all other columns are ignored.  The figure is displayed
    with ``plt.show()``.

    ``vPercent`` and ``hPercent`` are helpers defined elsewhere; they are
    called with the panel's Axes and ``df`` after a vertical / horizontal
    count plot (presumably to annotate bars with percentages -- verify).

    Args:
        title: Text used as the figure's super-title.
        df: DataFrame holding the segment to visualize.
    """
    fig = plt.figure(figsize=(10, 15))
    fig.suptitle(title, size=20)
    gs = gridspec.GridSpec(nrows=6,
                           ncols=4,
                           figure=fig,
                           width_ratios=[1, 1, 1, 1],
                           height_ratios=[1, 1, 1, 1, 1, 1],
                           wspace=0.3,
                           hspace=0.7)

    yes_no_colors = {'Y': 'limegreen', 'N': '#FA8072'}

    def age_bars(column, cell):
        # Bar chart of value counts, kept in their unsorted order.
        ax = fig.add_subplot(cell)
        df[column].value_counts(sort=False).plot(kind='bar', rot=0, ax=ax)
        vPercent(ax, df)
        ax.set_title(column)

    def pie(column, cell):
        # Pie chart of value counts with whole-number percentage labels.
        ax = fig.add_subplot(cell)
        df[column].value_counts().plot(kind='pie', autopct='%.f%%', ax=ax)
        ax.set_xlabel(column)
        ax.set_ylabel(None)

    def counts(column, cell, horizontal=False, by_frequency=False,
               palette=None):
        # Count plot drawn on an explicit Axes.  The series is passed by
        # keyword because recent seaborn treats the first positional
        # argument as ``data``.  ``by_frequency`` orders the categories by
        # descending count instead of seaborn's default order.
        ax = fig.add_subplot(cell)
        values = df.loc[:, column]
        order = df[column].value_counts().index if by_frequency else None
        if horizontal:
            sns.countplot(y=values, ax=ax, order=order, palette=palette)
            hPercent(ax, df)
        else:
            sns.countplot(x=values, ax=ax, order=order, palette=palette)
            vPercent(ax, df)
        ax.set_xlabel(column)
        ax.set_ylabel(None)

    # column name -> (drawing function, grid cell, extra options)
    panels = {
        # row 1
        'age': (age_bars, gs[0:1, 0:4], {}),
        # rows 2-4
        'occupation': (
            counts, gs[1:4, 0:2],
            {'horizontal': True, 'by_frequency': True}),
        'household_income': (pie, gs[1:4, 2:4], {}),
        # rows 5 and 6
        'age_group_psnx': (counts, gs[4:6, 2:4], {'by_frequency': True}),
        'marital_status': (counts, gs[4, 0], {'horizontal': True}),
        'children': (counts, gs[4, 1], {'palette': yes_no_colors}),
        'household_size': (counts, gs[5, 0], {}),
        'house_owner': (counts, gs[5, 1], {'by_frequency': True}),
    }

    for column in df.columns:
        panel = panels.get(column)
        if panel is None:
            continue
        draw, cell, options = panel
        draw(column, cell, **options)

    plt.show()
示例#5
0
文件: titanic.py 项目: tvml/ml1819
# In[535]:

# Alone is 1 when SibSp + Parch is not greater than zero, otherwise 0.
df['Alone'] = np.where((df["SibSp"] + df["Parch"]) > 0, 0, 1)

# In[350]:

categ = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Alone', 'Survived']
conti = ['Fare', 'Age']

#Distribution
# One subplot per variable, three per row; the row count follows from the
# number of variables instead of being hard-coded.
n_cols = 3
n_rows = (len(categ) + len(conti) + n_cols - 1) // n_cols
fig = plt.figure(figsize=(16, 12))

# Categorical variables: count plots in the first grid positions.
for position, col in enumerate(categ, start=1):
    sns.countplot(x=col, data=df, alpha=.7,
                  ax=fig.add_subplot(n_rows, n_cols, position))

# Continuous variables: distribution plots in the positions that follow.
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
# because swapping it for histplot/displot would change the figure.
for position, col in enumerate(conti, start=len(categ) + 1):
    sns.distplot(df[col].dropna(),
                 kde_kws={
                     "lw": 2,
                     "color": colors[8]
                 },
                 hist_kws={"alpha": .5},
                 ax=fig.add_subplot(n_rows, n_cols, position))

plt.show()

# In[373]:
示例#6
0
# Question 1: Which country has won the most prizes in each category?
# The category with the most rows in `data` is reported first; then, for each
# per-category frame, the birth-country counts are printed and plotted.
def _plot_birth_countries(prize_df):
    """Show a horizontal count plot of 'Birth Country', most frequent first.

    Returns the Axes object produced by seaborn.
    """
    plt.figure(figsize=(10, 12))
    country_order = prize_df['Birth Country'].value_counts().index
    axes = sns.countplot(y='Birth Country', data=prize_df,
                         order=country_order, palette='GnBu_d')
    plt.show()
    return axes


# (category, number of non-null rows in that category), in all_cat order.
category_totals = [
    (cat, data.loc[data['Category'] == cat, 'Category'].count())
    for cat in all_cat
]
# max() keeps the first of several equal maxima, i.e. ties go to the
# category that appears earliest in all_cat.
max_cat, max_count = max(category_totals,
                         key=lambda pair: pair[1],
                         default=('', -float('inf')))
print('Which category has won the most prizes?:', max_cat)


print('Chemistry\n', chem_df['Birth Country'].value_counts())
chem_graph = _plot_birth_countries(chem_df)

print('Economics\n', eco_df['Birth Country'].value_counts())
eco_graph = _plot_birth_countries(eco_df)

print('Medicine\n', med_df['Birth Country'].value_counts())
med_graph = _plot_birth_countries(med_df)

print('Physics\n', phy_df['Birth Country'].value_counts())
示例#7
0
def _set_log_yscale(ax):
    """Put the y axis of *ax* on a log scale, clipping non-positive values."""
    try:
        ax.set_yscale("log", nonpositive='clip')
    except (TypeError, ValueError):
        # Older matplotlib releases only accept the previous keyword name.
        ax.set_yscale("log", nonposy='clip')


def _save_candidate_count_page(pdf, candidates):
    """Add one PDF page: how many events have each number of candidates."""
    fig, ax = plt.subplots(figsize=(10, 10))
    # Pass the series as ``x=``: recent seaborn releases interpret the first
    # positional argument as ``data``, not as the x variable.
    sns.countplot(x=candidates, palette='plasma', ax=ax)
    ax.set_xlabel('Number of candidates')
    ax.set_ylabel('Number of events')
    _set_log_yscale(ax)
    pdf.savefig(fig)
    # Close the figure instead of only clearing it, so the figures do not
    # pile up in memory while the report is being built.
    plt.close(fig)


def multi_cand_plotting():
    """Write 'mult_candidates.pdf' with candidate-multiplicity plots.

    Rows of the loaded frame are grouped by (eventNumber, runNumber) and by
    (eventNumber, runNumber, D0_PT); the group sizes are shown as count
    plots with a log-scaled y axis.  Each grouping is plotted for all rows,
    for rows passing the selection, and for rows additionally passing the
    right-sign and clone removal masks.  Every plot is preceded by a
    separation page carrying its description.

    All data access and selection helpers (``gcm``, ``vars``,
    ``extended_selection``, ``selection``, ``remove_right_sign_candidates``,
    ``remove_clones``, ``add_separation_page``) are defined elsewhere.
    """
    df = gcm().get_data([vars.run_num(), vars.evt_num(), vars.pt(gcm().D0)])
    sel = extended_selection.get_complete_selection(True)
    sel &= selection.delta_mass_wide_signal_region()
    passed = remove_right_sign_candidates()
    passed &= remove_clones()

    by_event = ['eventNumber', 'runNumber']
    by_event_and_pt = ['eventNumber', 'runNumber', 'D0_PT']
    sel_and_passed = sel & passed

    # (separation page text, row mask or None for all rows, grouping keys)
    # The texts of pages 4-6 now contain the '. ' separator that was missing
    # between their two concatenated string literals.
    pages = [
        ('Matched on eventNumber and runNumber', None, by_event),
        ('matched on eventnumber and runNumber. '
         'full selection + signal window', sel, by_event),
        ('Matched on eventNumber, runNumber and D0 PT', None,
         by_event_and_pt),
        ('Matched on eventNumber, runNumber and D0 PT. '
         'full selection + signal window', sel, by_event_and_pt),
        ('Matched on eventNumber and runNumber. '
         'full selection + signal window + clones + RS matching',
         sel_and_passed, by_event),
        ('Matched on eventNumber, runNumber and D0 PT. '
         'full selection + signal window + clones + RS matching',
         sel_and_passed, by_event_and_pt),
    ]

    outfile = gcm().get_output_path('selection') + 'mult_candidates.pdf'
    with PdfPages(outfile) as pdf:
        for text, mask, keys in pages:
            add_separation_page(pdf, text)
            rows = df if mask is None else df[mask]
            candidates = rows.groupby(keys).size()
            _save_candidate_count_page(pdf, candidates)