def visualize_all(segment, rows, columns):
    """Draw one count plot per variable of ``segment`` on a grid of subplots.

    Parameters
    ----------
    segment : DataFrame-like
        Iterating it yields the variable names, and ``segment[name]`` must
        support ``value_counts()``.
    rows, columns : int
        Shape of the subplot grid passed to ``plt.subplots``. Variables are
        zipped against the axes, so any variable beyond ``rows * columns``
        is not drawn.

    Bars are ordered by descending frequency (the order of
    ``value_counts()``) and x tick labels are left unrotated.
    """
    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works for a 1x1 grid, where plt.subplots would otherwise return a
    # bare Axes object.
    fig, ax = plt.subplots(rows, columns, figsize=(20, 10), squeeze=False)
    for variable, subplot in zip(segment, ax.flatten()):
        values = segment[variable]
        # Pass the series by keyword: newer seaborn releases treat the first
        # positional argument as `data`, not `x`.
        sns.countplot(x=values, ax=subplot, order=values.value_counts().index)
        for label in subplot.get_xticklabels():
            label.set_rotation(0)
# Load every bitcoinTweets_*.txt file in the working directory, tag each file's
# rows with a per-file number, and plot the sentiment counts per file.
path = os.getcwd()
# glob.glob returns matches in arbitrary order; sort them so the per-file
# number assigned below is reproducible between runs.
allFiles = sorted(glob.glob(os.path.join(path, "bitcoinTweets_*.txt")))

list_ = []
for i, file_ in enumerate(allFiles):
    # Fields are separated by '::'; a multi-character separator needs the
    # python parsing engine.
    df = pd.read_csv(file_, sep='::', engine='python', header=None)
    df['month'] = i + 16
    list_.append(df)
frame = pd.concat(list_)

# Positional rename: the per-file number added above as 'month' ends up in the
# 'date' column. NOTE(review): this assumes every file parses into exactly two
# fields (tweet text, sentiment) -- confirm against the data files.
frame.columns = ['tweets', 'sentiment', 'date']
frame['sentiment'] = frame['sentiment'].astype('category')

sns.set()
plt.figure()
sns.countplot(x='sentiment', hue='date', data=frame, palette="Greens_d")
plt.show()
def gridSpec(title, df):
    """
    Function to Visualize Segmented Data

    Builds a 10x15 inch figure titled ``title`` with a 6-row by 4-column
    grid and draws one panel for each recognised column of ``df``:

    * ``age`` -- bar chart of value counts (values cast to int), top row.
    * ``net_worth``, ``household_income`` -- pie charts, rows 2-4.
    * ``investment_personal``, ``investment_real_estate``,
      ``investment_stocks_bonds``, ``life_insurance_policy_owner`` --
      count plots using a fixed colour per 'Y' / 'N' value.
    * ``children``, ``house_owner`` -- vertical count plots.
    * ``household_size``, ``marital_status`` -- horizontal count plots.

    Columns with any other name are ignored. Panels are created in the
    order the columns appear in ``df`` and the figure is shown with
    ``plt.show()``.

    Parameters
    ----------
    title : str
        Figure super-title.
    df : pandas.DataFrame
        Data to plot.

    Notes
    -----
    ``vPercent`` / ``hPercent`` are defined elsewhere and are called with
    the panel axes and ``df`` after each vertical / horizontal plot;
    presumably they annotate the bars with percentages -- confirm against
    their definitions.
    """
    fig = plt.figure(figsize=(10, 15))
    fig.suptitle(title, size=20)

    # grid spec
    gs = gridspec.GridSpec(nrows=6, ncols=4, figure=fig,
                           width_ratios=[1, 1, 1, 1],
                           height_ratios=[1, 1, 1, 1, 1, 1],
                           wspace=0.3, hspace=0.7)
    yes_no_palette = {'Y': 'limegreen', 'N': '#FA8072'}

    def pie_panel(slot, column):
        # Pie chart of the column's value counts, labelled underneath.
        ax = fig.add_subplot(slot)
        df[column].value_counts().plot(kind='pie', autopct='%.f%%', ax=ax)
        ax.set_xlabel(column)
        ax.set_ylabel(None)

    def count_panel(slot, column, horizontal=False, palette=None, xlabel=None):
        # Count plot of the column. The series is passed by keyword because
        # newer seaborn treats the first positional argument as `data`.
        ax = fig.add_subplot(slot)
        if horizontal:
            sns.countplot(y=df.loc[:, column], ax=ax, palette=palette)
            hPercent(ax, df)
        else:
            sns.countplot(x=df.loc[:, column], ax=ax, palette=palette)
            vPercent(ax, df)
        # xlabel=None keeps the label seaborn derives from the series name.
        if xlabel is not None:
            ax.set_xlabel(xlabel)
        ax.set_ylabel(None)

    for v in df.columns:
        if v == 'age':
            # row 1
            ax = fig.add_subplot(gs[0, 0:4])
            df[v].astype('int').value_counts().plot(
                kind='bar', rot=0, use_index=False, ax=ax)
            vPercent(ax, df)
            ax.set_title(v)
        elif v == 'net_worth':
            # rows 2-4, left half
            pie_panel(gs[1:4, 0:2], v)
        elif v == 'household_income':
            # rows 2-4, right half
            pie_panel(gs[1:4, 2:4], v)
        # rows 5 and 6
        elif v == 'investment_personal':
            count_panel(gs[4, 0], v, palette=yes_no_palette, xlabel=v)
        elif v == 'investment_real_estate':
            count_panel(gs[5, 0], v, palette=yes_no_palette, xlabel=v)
        elif v == 'investment_stocks_bonds':
            count_panel(gs[4, 1], v, palette=yes_no_palette, xlabel=v)
        elif v == 'life_insurance_policy_owner':
            count_panel(gs[5, 1], v, palette=yes_no_palette, xlabel=v)
        elif v == 'children':
            count_panel(gs[5, 2], v)
        elif v == 'household_size':
            count_panel(gs[4, 2], v, horizontal=True,
                        xlabel='House Hold Size')
        elif v == 'house_owner':
            count_panel(gs[4, 3], v, xlabel='House Owner')
        elif v == 'marital_status':
            count_panel(gs[5, 3], v, horizontal=True,
                        xlabel='Marital Status')

    plt.show()
def gridSpec2(title, df):
    """
    Function to Visualize Segmented Data

    Builds a 10x15 inch figure titled ``title`` with a 6-row by 4-column
    grid and draws one panel for each recognised column of ``df``:

    * ``age`` -- bar chart of unsorted value counts, top row.
    * ``occupation`` -- horizontal count plot ordered by frequency.
    * ``household_income`` -- pie chart.
    * ``age_group_psnx``, ``house_owner`` -- vertical count plots ordered
      by frequency.
    * ``marital_status`` -- horizontal count plot.
    * ``children`` -- vertical count plot with a fixed colour per
      'Y' / 'N' value.
    * ``household_size`` -- vertical count plot.

    Columns with any other name are ignored. Panels are created in the
    order the columns appear in ``df`` and the figure is shown with
    ``plt.show()``.

    Parameters
    ----------
    title : str
        Figure super-title.
    df : pandas.DataFrame
        Data to plot.

    Notes
    -----
    ``vPercent`` / ``hPercent`` are defined elsewhere and are called with
    the panel axes and ``df`` after each vertical / horizontal plot;
    presumably they annotate the bars with percentages -- confirm against
    their definitions.
    """
    fig = plt.figure(figsize=(10, 15))
    fig.suptitle(title, size=20)

    # grid spec
    gs = gridspec.GridSpec(nrows=6, ncols=4, figure=fig,
                           width_ratios=[1, 1, 1, 1],
                           height_ratios=[1, 1, 1, 1, 1, 1],
                           wspace=0.3, hspace=0.7)
    yes_no_palette = {'Y': 'limegreen', 'N': '#FA8072'}

    def count_panel(slot, column, horizontal=False, by_frequency=False,
                    palette=None):
        # Count plot of the column. The series is passed by keyword because
        # newer seaborn treats the first positional argument as `data`.
        ax = fig.add_subplot(slot)
        order = df[column].value_counts().index if by_frequency else None
        if horizontal:
            sns.countplot(y=df.loc[:, column], ax=ax, order=order,
                          palette=palette)
            hPercent(ax, df)
        else:
            sns.countplot(x=df.loc[:, column], ax=ax, order=order,
                          palette=palette)
            vPercent(ax, df)
        ax.set_xlabel(column)
        ax.set_ylabel(None)

    for v in df.columns:
        if v == 'age':
            # row 1
            ax = fig.add_subplot(gs[0:1, 0:4])
            df[v].value_counts(sort=False).plot(kind='bar', rot=0, ax=ax)
            vPercent(ax, df)
            ax.set_title(v)
        elif v == 'occupation':
            # rows 2-4, left half
            count_panel(gs[1:4, 0:2], v, horizontal=True, by_frequency=True)
        elif v == 'household_income':
            # rows 2-4, right half
            ax = fig.add_subplot(gs[1:4, 2:4])
            df[v].value_counts().plot(kind='pie', autopct='%.f%%', ax=ax)
            ax.set_xlabel(v)
            ax.set_ylabel(None)
        # rows 5 and 6
        elif v == 'age_group_psnx':
            count_panel(gs[4:6, 2:4], v, by_frequency=True)
        elif v == 'marital_status':
            count_panel(gs[4, 0], v, horizontal=True)
        elif v == 'children':
            count_panel(gs[4, 1], v, palette=yes_no_palette)
        elif v == 'household_size':
            count_panel(gs[5, 0], v)
        elif v == 'house_owner':
            count_panel(gs[5, 1], v, by_frequency=True)

    plt.show()
# In[535]:

# 'Alone' is 0 when SibSp + Parch is greater than 0 and 1 otherwise.
# The sum is kept in a local series instead of a temporary 'TravelBuds'
# column that would have to be dropped again.
travel_buds = df["SibSp"] + df["Parch"]
df['Alone'] = np.where(travel_buds > 0, 0, 1)

# In[350]:

categ = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Alone', 'Survived']
conti = ['Fare', 'Age']

# Distribution: one panel per variable on a 3x3 grid, count plots for the
# categorical columns first, then distribution plots for the continuous ones.
fig = plt.figure(figsize=(16, 12))
for position, col in enumerate(categ, start=1):
    fig.add_subplot(3, 3, position)
    sns.countplot(x=col, data=df, alpha=.7)
# Continue numbering right after the last categorical panel; computing the
# start explicitly avoids relying on a loop variable leaked from above.
for position, col in enumerate(conti, start=len(categ) + 1):
    fig.add_subplot(3, 3, position)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept as-is because switching to histplot/displot changes the rendering.
    sns.distplot(df[col].dropna(),
                 kde_kws={
                     "lw": 2,
                     "color": colors[8]
                 },
                 hist_kws={"alpha": .5})
plt.show()

# In[373]:
# Question 1: Which country has won the most prizes in each category?

# First report the category with the most rows in `data`; on a tie the
# category that appears first in `all_cat` is kept.
max_cat, max_count = '', -float('inf')
for cat in all_cat:
    in_category = data['Category'] == cat
    current = data.loc[in_category, 'Category'].agg(['count']).iloc[0]
    if current > max_count:
        max_cat, max_count = cat, current
print('Which category has won the most prizes?:', max_cat)

# Then, per category frame: print the prize count per birth country and draw
# a horizontal count plot with the countries ordered by that count.
chem_counts = chem_df['Birth Country'].value_counts()
print('Chemistry\n', chem_counts)
plt.figure(figsize=(10, 12))
chem_graph = sns.countplot(y='Birth Country',
                           data=chem_df,
                           order=chem_counts.index,
                           palette='GnBu_d')
plt.show()

eco_counts = eco_df['Birth Country'].value_counts()
print('Economics\n', eco_counts)
plt.figure(figsize=(10, 12))
eco_graph = sns.countplot(y='Birth Country',
                          data=eco_df,
                          order=eco_counts.index,
                          palette='GnBu_d')
plt.show()

med_counts = med_df['Birth Country'].value_counts()
print('Medicine\n', med_counts)
plt.figure(figsize=(10, 12))
med_graph = sns.countplot(y='Birth Country',
                          data=med_df,
                          order=med_counts.index,
                          palette='GnBu_d')
plt.show()

print('Physics\n', phy_df['Birth Country'].value_counts())
def multi_cand_plotting():
    """Write a PDF showing how many candidates share the same event.

    Loads run number, event number and D0 pT through ``gcm().get_data``,
    then writes ``mult_candidates.pdf`` under
    ``gcm().get_output_path('selection')``. For each of six configurations
    a separation page is added, followed by a count plot of the group
    sizes, with the number of events on a logarithmic y axis:

    * grouped by (eventNumber, runNumber) or by
      (eventNumber, runNumber, D0_PT);
    * on all rows, on rows passing the full selection and wide delta-mass
      signal region, or on rows additionally passing the right-sign and
      clone removal masks.

    NOTE(review): the 'eventNumber', 'runNumber' and 'D0_PT' column names
    are assumed to be what ``vars.evt_num()``, ``vars.run_num()`` and
    ``vars.pt(gcm().D0)`` produce -- confirm against those helpers.
    """
    df = gcm().get_data([vars.run_num(), vars.evt_num(), vars.pt(gcm().D0)])
    sel = extended_selection.get_complete_selection(True)
    sel &= selection.delta_mass_wide_signal_region()

    passed = remove_right_sign_candidates()
    passed &= remove_clones()

    event_keys = ['eventNumber', 'runNumber']
    event_pt_keys = ['eventNumber', 'runNumber', 'D0_PT']

    def set_log_y(ax):
        # `nonposy` was renamed to `nonpositive` in matplotlib 3.3 and the
        # old spelling was later removed; fall back for older releases.
        try:
            ax.set_yscale("log", nonpositive='clip')
        except (TypeError, ValueError):
            ax.set_yscale("log", nonposy='clip')

    def add_multiplicity_page(pdf, title, mask, keys):
        # One separation page plus one count plot of candidates per group.
        add_separation_page(pdf, title)
        frame = df if mask is None else df[mask]
        candidates = frame.groupby(keys).size()

        fig, ax = plt.subplots(figsize=(10, 10))
        # Keyword `x=`: newer seaborn treats the first positional argument
        # as `data`.
        sns.countplot(x=candidates, palette='plasma', ax=ax)
        ax.set_xlabel('Number of candidates')
        ax.set_ylabel('Number of events')
        set_log_y(ax)
        pdf.savefig(fig)
        # Close instead of only clearing, so figures do not accumulate.
        plt.close(fig)

    outfile = gcm().get_output_path('selection') + 'mult_candidates.pdf'
    with PdfPages(outfile) as pdf:
        add_multiplicity_page(
            pdf, 'Matched on eventNumber and runNumber', None, event_keys)

        add_multiplicity_page(
            pdf, 'matched on eventnumber and runNumber. '
            'full selection + signal window', sel, event_keys)

        add_multiplicity_page(
            pdf, 'Matched on eventNumber, runNumber and D0 PT',
            None, event_pt_keys)

        # The '. ' separator below was missing in three titles, which glued
        # the two halves together ("D0 PTfull selection ...").
        add_multiplicity_page(
            pdf, 'Matched on eventNumber, runNumber and D0 PT. '
            'full selection + signal window', sel, event_pt_keys)

        add_multiplicity_page(
            pdf, 'Matched on eventNumber and runNumber. '
            'full selection + signal window + clones + RS matching',
            sel & passed, event_keys)

        add_multiplicity_page(
            pdf, 'Matched on eventNumber, runNumber and D0 PT. '
            'full selection + signal window + clones + RS matching',
            sel & passed, event_pt_keys)