def time_on_course(self):
    """
    Plot histograms of estimated total time in course, by certification.

    Per-user totals are read from self.time_spent (looked up by username)
    and histogrammed on a natural-log axis whose ticks are labelled from
    '1 sec' to '100 hrs'. Non-certified participants are drawn first and
    certified participants are overlaid on the same axes.

    Background carried over from the original notes: the totals are
    estimated from self.potime by grouping click events by username,
    sorting by time, and summing time differences >= 10 sec and
    <= 3600 sec. Justification for those cutoffs can be found here:
    Submitted Paper: "eText Use in Blended Introductory Physics Courses:
    Interpreting Meaningful Interactions and the Effects of Course Structure"
    Only users with more than one event have an estimate.

    Parameters (generated during class initialization)
    ----------
    None

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=[12,6])
    ax1 = fig.add_subplot(1,1,1)

    # (certified flag, bar color, legend label); drawing order matters
    # because the second histogram is painted over the first.
    subgroups = [
        (0, 'Silver', "$Non-Certified$"),
        (1, 'Crimson', "$Certified$"),
    ]
    for flag, shade, tag in subgroups:
        members = self.person[self.person.certified==flag].username.dropna().unique()
        log_seconds = self.time_spent[members].apply(np.log)
        log_seconds.hist(ax=ax1,bins=100,range=[0,16],
                         color=shade,edgecolor=None,alpha=0.9,label=tag)

    # Tick positions are log(seconds) so they line up with the logged data.
    tick_seconds = [1,10,60,600,3600,10*3600,100*3600]
    tick_names = ['1 sec','10 sec','1 min','10 min','1 hr','10 hrs','100 hrs']
    ax1.set_xticks([np.log(x) for x in tick_seconds])
    ax1.set_xticklabels(tick_names,rotation=40)
    ax1.legend()

    figsavename = self.figpath+'time_in_course_'+self.nickname.replace('.','_')
    xff.texify(fig,ax1,
               xlabel='Total Time In Course',
               ylabel='Count',
               tic_size=24,
               label_size=24,
               gridb='y',
               figsavename=figsavename+'.png')

    return None
def daily_unique_users(self):
    """
    Plot a daily time series of user counts, split by certification.

    For each certification flag (0, then 1) the usernames with that flag
    are taken from self.person, the matching rows of self.potime are
    cross-tabulated, and the number of positive cells per column is
    plotted against the (date-converted) column labels. The x-range runs
    from two weeks before self.cinfo['start_date'] to four weeks after
    self.cinfo['end_date'], with dashed vertical lines at both dates.

    Parameters (generated during class initialization)
    ----------
    None

    Output
    ------
    Saves the figure through xff.texify as
    <figpath>daily_unique_users_<nickname>.png

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=[20, 6])
    ax1 = fig.add_subplot(1, 1, 1)
    # Index 0 -> non-certified, index 1 -> certified.
    sgcolors = ['Silver', 'Crimson']
    sglabels = ['$Non-Certified$', '$Certified$']
    for sg in [0, 1]:
        users = self.person[self.person.certified == sg].username.dropna().unique()
        # NOTE(review): this crosstab call is incomplete in the file as
        # seen here -- its second argument and closing parenthesis are
        # missing. Presumably usernames are crossed against the event date;
        # restore from version control before running.
        daily = pd.crosstab( self.potime[self.potime.username.isin(users)].username,
        # Count, per column, how many rows have at least one event.
        daily = daily[daily > 0].count().sort_index()
        daily.index = [np.datetime64(d) for d in daily.index]
        daily.plot(ax=ax1, style="-o", ms=6, lw=2, color=sgcolors[sg], rot=0, label=sglabels[sg])
    # Pad the visible window around the course run.
    xmin = (self.cinfo['start_date'] - np.timedelta64(2, 'W')).item().date()
    xmax = (self.cinfo['end_date'] + np.timedelta64(4, 'W')).item().date()
    ax1.set_xlim(xmin, xmax)
    ax1 = xff.timeseries_plot_formatter(ax1, interval=1)
    ax1.legend(loc=1, prop={'size': 24}, frameon=False)
    # Capture the auto-scaled top before drawing the markers so the dashed
    # start/end lines span the full height without rescaling the axis.
    ylim1 = ax1.get_ylim()[1]
    ax1.vlines([ self.cinfo.start_date.item().date(), self.cinfo.end_date.item().date() ], 0, ylim1, colors='Gray', lw=1.5, linestyles='--')
    ax1.set_ylim(0, ylim1)
    figsavename = self.figpath + 'daily_unique_users_' + self.nickname.replace( '.', '_')
    xff.texify(fig, ax1, ylabel='Unique Users', tic_size=20, label_size=24, datefontsize=20, title=self.nickname, figsavename=figsavename + '.png')
    return None
def resource_use(self, category):
    """
    Plot the distribution of how many distinct resources each user touched.

    Resources are the unique index labels of self.caxis whose `category`
    equals the requested one. For every row of self.freq the non-null
    entries among those resource columns are counted, and the counts are
    drawn as normalized histograms for non-certified and certified users
    (looked up by username).

    Parameters (generated during class initialization)
    ----------
    category: ['video','problem', etc.]

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    in_category = self.caxis.category == category
    resources = self.caxis[in_category].index.unique()
    usage = self.freq.filter(resources).count(axis=1)

    # One bin per resource, spanning zero up to the heaviest user.
    hist_options = dict(range=(0, usage.max()),
                        bins=len(resources),
                        cumulative=0,
                        normed=True,
                        alpha=0.85)

    sgcolors = ['Silver', 'Crimson']
    sglabels = ['$Non-Certified$', '$Certified$']

    fig = plt.figure(figsize=[12, 6])
    ax1 = fig.add_subplot(1, 1, 1)

    ### Note, later this could be all four subgroups: explored, certified, etc
    for flag, shade in enumerate(sgcolors):
        is_member = self.person.certified == flag
        subpop = self.person[is_member].username.dropna().unique()
        usage[subpop].hist(edgecolor=shade, color=shade, **hist_options)

    ax1.legend(sglabels, loc=9, prop={'size': 24}, frameon=False)

    figsavename = (self.figpath + 'resource_use_' + category + '_dist_'
                   + self.nickname.replace('.', '_'))
    xff.texify(fig,
               ax1,
               xlabel='Unique %s resources' % (category),
               ylabel='Normalized Count',
               title=self.nickname,
               figsavename=figsavename + '.png')

    return None
def enrollment_plots(self,**kwargs):
    """
    Draw one time-series figure per column of self.enrollment_df.

    Each column is plotted against the dataframe index, formatted with
    xff.timeseries_plot_formatter and saved through xff.texify. A JSON-ready
    record (key, optional 'bar' flag, [epoch-ms, value] pairs) is also
    assembled per column, although writing it to disk is currently
    disabled.

    Parameters
    ----------
    None (keyword arguments are accepted but not used)

    Output
    ------
    Figures and respective formats.

    Returns
    -------
    None
    """
    epoch_start = datetime(1970,1,1)

    def date2epoch(day):
        # Milliseconds between the Unix epoch and midnight of `day`.
        midnight = datetime.combine(day, datetime.min.time())
        return int((midnight - epoch_start).total_seconds()*1000)

    ### For JSON Data Output
    jsondata = []

    for C in self.enrollment_df.columns:
        series = self.enrollment_df[C]

        fig = plt.figure(figsize=(12,6))
        ax1 = fig.add_subplot(1,1,1)
        series.plot(ax=ax1,color=xff.colors['institute'],rot=0,lw=3,label=self.nickname)
        ax1 = xff.timeseries_plot_formatter(ax1)
        ax1.set_yticklabels([r'$%d$' % (y) for y in ax1.get_yticks()])

        ### Generalized Plotting functions
        figsavename = self.figpath+C.replace(' ','_')+'_'+self.nickname.replace('.','_')
        print(figsavename)
        xff.texify(fig,ax1,
                   ylabel=C,
                   datefontsize=20,
                   gridb='y',
                   figsavename=figsavename+'.png')

        ### Append JSON Data
        record = collections.OrderedDict()
        record['key'] = C
        if C == 'Enroll Count':
            record['bar'] = 'true'
        record['values'] = [[date2epoch(d),int(v)] for d,v in series.iteritems()]
        jsondata.append(record)

    print("JSON dump currently commented out.")

    return None
def scatter_bubble_size(self,DF,colx,coly,disc_act,figsave=False):
    """
    Creates scatter plot with x=colx, y=coly. Size of markers always
    proportional to disc_act (discussion activity).

    Both coordinates receive a small random jitter so that overlapping
    points remain visible; missing disc_act values are drawn with size 1.0
    (before scaling). Non-certified rows are drawn first and certified rows
    on top.

    Parameters (generated during class initialization)
    ----------
    DF: dataframe holding colx, coly, disc_act and a 'certified' column
    colx: column to be plotted on x-axis
    coly: column to be plotted on y-axis
    disc_act: column for scaling marker (bubble) size
    figsave: accepted for backward compatibility. NOTE(review): the flag
             has never been consulted -- the figure is always written --
             and that behavior is kept so existing callers still get
             their file. Confirm before wiring it up.

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    ### Data
    data = DF[[colx,coly,disc_act,'certified']].copy()

    # Jitter amplitudes and marker scale.
    Jcolx = 0.75
    Jcoly = 0.01
    bscale = 0.2

    data[colx] = data[colx].apply(lambda x: x + Jcolx*(np.random.sample()-Jcolx))
    data[coly] = data[coly].apply(lambda x: x + Jcoly*(np.random.sample()))
    data[disc_act] = data[disc_act].fillna(1.0)

    fig = plt.figure(figsize=[12,10])
    ax1 = fig.add_subplot(1,1,1)

    # (certified flag, color key); certified last so it is painted on top.
    for flag, color_key in ((0, 'neutral'), (1, 'institute')):
        tmp = data[data.certified==flag]
        ax1.scatter(tmp[colx],tmp[coly],s=bscale*tmp[disc_act],color=xff.colors[color_key])

    ax1.set_xlim(-0.05,)
    ax1.set_ylim(-0.05,1.05)

    ### Generalized Plotting functions
    xff.texify(fig,ax1,xlabel=colx,ylabel=coly,
               title='bubble size proportional to %s' % (disc_act.replace('_',' ')),
               tic_size=20,label_size=24,datefontsize=20)

    figsavename = self.figpath+'scatter_'+colx+'_'+coly+'_disc_size_'+self.nickname.replace('.','_')+'.png'
    fig.savefig(figsavename, bbox_inches='tight', dpi=300)

    return None
def daily_activity(self):
    """
    Creates daily timeseries of discussion activity (only
    posts/comments/votes from forum data).

    Two series are drawn on the same axis, labelled '$Non-Certified$' and
    '$Certified$'. The x-range runs from two weeks before
    self.cinfo['start_date'] to four weeks after self.cinfo['end_date'],
    with dashed vertical lines at both dates.

    Parameters (generated during class initialization)
    ----------
    None

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=[20,6])
    ax1 = fig.add_subplot(1,1,1)

    ### List of Certified Participants
    # Ids are converted to strings; presumably the forum data stores
    # author ids as strings -- TODO confirm against the forum loader.
    certs = self.pc_plus[self.pc_plus.certified==1].user_id.unique()
    certs = [str(u) for u in certs]

    ### Non-Certified
    # NOTE(review): the statement below is incomplete in the file as seen
    # here (the dataframe name, both filter conditions and the lambda body
    # are missing). Restore it from version control before running.
    post_act =[ ( & ( ].created_at.apply(lambda x:
    post_act.plot(ax=ax1,style="-o",ms=6,lw=2,color=xff.colors['neutral'],label='$Non-Certified$')
    #(post_act.cumsum()/10).plot(ax=ax1,style="-",ms=3,color='Orange')

    ### Certified
    # NOTE(review): same truncation as the non-certified statement above.
    post_act =[ ( & ( ].created_at.apply(lambda x:
    post_act.plot(ax=ax1,style="-o",ms=6,lw=2,color=xff.colors['institute'],label='$Certified$')

    # Pad the visible window around the course run.
    xmin = (self.cinfo['start_date'] - np.timedelta64(2,'W')).item().date()
    xmax = (self.cinfo['end_date'] + np.timedelta64(4,'W')).item().date()
    ax1.set_xlim(xmin,xmax)

    # Capture the auto-scaled top first so the dashed start/end markers
    # span the full height without changing the y-scale.
    ylim1 = ax1.get_ylim()[1]
    ax1.vlines([self.cinfo.start_date.item().date(),self.cinfo.end_date.item().date()], 0, ylim1, colors='Gray', lw=1.5, linestyles='--')
    ax1.set_ylim(0,ylim1)

    ax1 = xff.timeseries_plot_formatter(ax1,interval=1)
    ax1.legend(loc=1,prop={'size':24},frameon=False)

    figsavename = self.figpath+'discussion_activity_'+self.nickname.replace('.','_')
    xff.texify(fig,ax1,ylabel='Forum Text Submissions',
               tic_size=20,label_size=24,
               datefontsize=20,
               title=self.nickname,
               figsavename=figsavename+'.png')

    return None
def bubble_heat_rel_week_vs_chapter(self):
    """
    Bubble Heat - Unique users relative week versus chapter accessed

    Unique usernames are counted per (rel_week, module_id) in self.potime,
    joined to self.caxis to pick up each module's `index` and `chapter`,
    and cells with MINUSERS or fewer users are dropped. For every
    (chapter, rel_week) one bubble is drawn at the chapter's smallest
    `index`; both its color and its size follow the largest unique-user
    count found in that cell.

    Parameters (generated during class initialization)
    ----------
    None

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    MINUSERS = 20

    def count_unique(names):
        return len(names.unique())

    ### Data
    per_module = self.potime.groupby(['rel_week','module_id']).username.apply(count_unique)
    per_module = per_module.reset_index().rename(columns={0:'uniqU'})
    per_module = pd.merge(per_module,
                          self.caxis[['index','chapter']],
                          how='left',
                          left_on='module_id',
                          right_index=True).dropna()
    per_module = per_module[per_module.uniqU>MINUSERS]

    # min/max of both the structure index and the user count per cell.
    W = per_module.groupby(['chapter','rel_week'])[['index','uniqU']].agg([min,max]).reset_index()
    peak_users = W[('uniqU','max')]

    fig = plt.figure(figsize=[24,16])
    ax1 = fig.add_subplot(1,1,1)

    sc = ax1.scatter(W[('index','min')],
                     W[('rel_week','')],
                     s=0.95*peak_users,
                     c=peak_users,
                     edgecolors='None',
                     cmap=plt.get_cmap("CMRmap"),
                     alpha=0.95)
    cbar = plt.colorbar(sc)
    cbar.set_label('$N$',rotation=90,fontsize=30)

    ### Generalized Plotting functions
    figsavename = self.figpath+'bubble_heat_rel_week_vs_chapter_'+self.nickname.replace('.','_')
    xff.texify(fig,ax1,
               xlabel='Course Structure Index',
               ylabel='Week Relative to Course Launch',
               title='Unique Users visiting Chapters each Relative Week (min %d users for display)' % (MINUSERS),
               tic_size=30,label_size=30,
               figsavename=figsavename+'.png')

    return None
def activity_distributions(self, columns=None, **kwargs):
    """
    Activity Distributions from person_course collection using Pandas
    built in hist function.

    There are several columns which can be plotted as distributions:
    features = ['ndays_act', 'nevents', 'nforum_events', 'nforum_posts',
                'nplay_video', 'nproblem_check', 'nprogcheck']

    Course Reports focus on nevents, ndays_act, and nchapters

    One figure is produced per requested column that exists in
    self.person; requested columns that do not exist are reported and
    skipped. The caller's list is never modified.

    Parameters
    ----------
    columns : list of column names (None or empty means nothing to plot)
              Typical column names - ['ndays_act', 'nevents',
              'nforum_events', 'nforum_posts', 'nplay_video',
              'nproblem_check', 'nprogcheck']

    Returns
    -------
    None
    """
    # Work on a private copy: the previous implementation removed entries
    # from the caller's list while iterating over it, which both skipped
    # elements and leaked the mutation (including into the shared default).
    requested = list(columns) if columns is not None else []

    available = []
    for col in requested:
        if col in self.person:
            available.append(col)
        else:
            print("Some of the specified columns do not exist in the person_course collection.")
            print(col)

    if not available:
        print("No specified columns available.")
        return None

    for column in available:
        ### Plot
        fig = plt.figure(figsize=(12, 7))
        ax1 = fig.add_subplot(1, 1, 1)
        self.distribution_logic(ax1, column, True)

        ### Generalized Plotting functions
        figsavename = (self.figpath + 'distribution_'
                       + column.replace(' ', '_') + '_'
                       + self.nickname.replace('.', '_'))
        print(figsavename)
        xff.texify(fig, ax1, figsavename=figsavename + '.png')

    return None
def grade_distribution(self, **kwargs):
    '''
    Simple grade distribution.

    Histograms the grades at or above a lower bound (0.1) for non-certified
    and certified participants on a shared axis. Nothing is produced when
    fewer than 50 grades are available.

    Parameters
    ----------
    NVD3 : keyword, default False. If true, an interactive nvd3 HTML chart
           is written in addition to the static PNG.

    Output
    ------
    Figures and respective formats.

    Returns
    -------
    None
    '''
    NVD3 = kwargs.get('NVD3', False)

    if self.person.grade.count() < 50:
        print("Not enough grade information. Return without grade distribution.")
        return None

    bins = 50
    hmin = 0.0
    hmax = 1.0
    glowbound = 0.1

    graded = self.person.grade >= glowbound

    ### Plot
    fig = plt.figure(figsize=(12, 7))
    ax1 = fig.add_subplot(1, 1, 1)

    self.person[graded & (self.person.certified == 0)]['grade'].hist(
        ax=ax1,
        bins=bins,
        range=(hmin, hmax),
        log=False,
        cumulative=0,
        color=xff.colors['neutral'],
        edgecolor=xff.colors['neutral'])
    # Certified bars are slightly transparent because they overlay the
    # non-certified ones.
    self.person[graded & (self.person.certified == 1)]['grade'].hist(
        ax=ax1,
        bins=bins,
        range=(hmin, hmax),
        log=False,
        cumulative=0,
        color=xff.colors['institute'],
        edgecolor=xff.colors['institute'],
        alpha=0.8)

    ax1.set_xlim(0, hmax)
    ax1.set_ylim(1, )
    ax1.legend(['$Non-Certified$', '$Certified$'],
               loc=1,
               prop={'size': 24},
               frameon=False)

    ax1.set_xticklabels([r'$%.1f$' % x for x in ax1.get_xticks()],
                        fontsize=30)
    ax1.set_yticklabels([r'$%d$' % y for y in ax1.get_yticks()],
                        fontsize=30)

    ### Generalized Plotting functions
    figsavename = self.figpath + 'grade_distribution_' + self.nickname.replace(
        '.', '_')
    xff.texify(fig,
               ax1,
               xlabel='Grade (> %.2f)' % (glowbound),
               ylabel='Count',
               gridb='y',
               figsavename=figsavename + '.png')

    #----------------------------------------------------------------
    ### NVD3 Interactive
    if NVD3:
        from nvd3 import multiBarChart

        ### Data
        DATA = self.person[graded]
        Y1, X1 = np.histogram(DATA[DATA.certified == 0].grade.values,
                              bins=bins,
                              range=(hmin, hmax))
        Y2, X2 = np.histogram(DATA[DATA.certified == 1].grade.values,
                              bins=bins,
                              range=(hmin, hmax))

        ### Output File
        figsavename = self.figpath + 'interactive_grade_distribution_' + self.nickname + '.html'
        print(figsavename)

        # NOTE(review): sibling methods read the course id from self._xd;
        # this one uses self._xdata -- confirm which attribute exists.
        title = "Grade Distribution: %s" % self._xdata.course_id
        chart = multiBarChart(name=title, height=400)
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        # np.histogram returns bins+1 edges; each bar is anchored at its
        # left edge.
        extra_serie = {
            "tooltip": {
                "y_start": "",
                "y_end": ""
            },
            "color": xff.colors['neutral']
        }
        chart.add_serie(name="Non-Certified", y=Y1, x=X1[:-1], extra=extra_serie)
        extra_serie = {
            "tooltip": {
                "y_start": "",
                "y_end": ""
            },
            "color": xff.colors['institute']
        }
        chart.add_serie(name="Certified", y=Y2, x=X2[:-1], extra=extra_serie)

        ### Final Output
        chart.buildhtml()
        # The context manager closes the file even if the write fails.
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

    return None
def country_of_origin(self, **kwargs):
    """
    Creates figures for the top "ccnum" of enrolled countries.

    Country codes come from self.person.final_cc. The first series counts
    every row of self.person; the second counts certified rows only and is
    left empty when the number of certified usernames does not exceed
    self.mincerts. Both series are converted to percentages of their own
    (top-ccnum) totals and rounded to one decimal.

    Parameters (generated during class initialization)
    ----------
    ccnum = number of requested countries to be plotted. Max 25 for plotting issues.
    NVD3 = False. If true, nvd3 interactive figure output.

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    NVD3 = kwargs.get('NVD3', False)
    ccnum = kwargs.get('ccnum', 10)

    is_certified = self.person.certified == 1

    cc = self.person.final_cc.value_counts().order(ascending=False)
    if self.person[is_certified].username.count() > self.mincerts:
        certs = self.person[is_certified].final_cc.value_counts()
    else:
        # Too few certificate earners to show: keep an all-NaN column.
        certs = pd.Series(index=cc.index)

    cc = pd.concat([cc, certs],
                   join='inner',
                   axis=1,
                   keys=['$Non-Certified$', '$Certified$'])
    cc = cc.sort('$Non-Certified$', ascending=False)[0:ccnum]
    perc = 100. * cc / cc.sum()
    perc = perc.apply(lambda x: np.round(x, 1))

    fig = plt.figure(figsize=(12, 6))
    ax1 = fig.add_subplot(1, 1, 1)
    perc.plot(
        ax=ax1,
        kind='bar',
        color=[xff.colors['neutral'], xff.colors['institute']],
        rot=40,
    )

    ### Plot Details
    ax1.set_xticklabels([r'$%s$' % x for x in perc.index])
    ax1.set_yticklabels(
        [r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],
        fontsize=30)
    ax1.legend(loc=1, prop={'size': 28}, frameon=False)

    ### Generalized Plotting functions
    figsavename = self.figpath + 'country_geoloc_distribution_' + self.nickname.replace(
        '.', '_')
    print(figsavename)
    xff.texify(fig,
               ax1,
               xlabel='Country Code',
               ylabel=None,
               figsavename=figsavename + '.png')

    #----------------------------------------------------------------
    ### NVD3 Interactive
    if NVD3:
        from nvd3 import multiBarChart

        ### Output File
        figsavename = self.figpath + 'interactive_country_distribution_' + self.nickname + '.html'
        print(figsavename)

        # The header previously read "Education Level Distribution", a
        # leftover from the method this block was copied from.
        title = "Country Distribution: %s" % self._xd.course_id
        charttype = 'multiBarChart'
        chart = multiBarChart(name=charttype,
                              height=350,
                              x_axis_format="",
                              y_axis_format=".1f")
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        X = perc.index
        Y1 = perc['$Non-Certified$'].values
        Y2 = perc['$Certified$'].values

        ### Series 1
        extra_serie1 = {
            "tooltip": {
                "y_start": "",
                "y_end": "%"
            },
            "color": xff.colors['neutral'],
            "format": ".1f"
        }
        chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

        ### Series 2
        extra_serie2 = {
            "tooltip": {
                "y_start": "",
                "y_end": "%"
            },
            "color": xff.colors['institute'],
            "format": ".1f"
        }
        chart.add_serie(name="Certificate Earners",
                        y=Y2,
                        x=X,
                        extra=extra_serie2)

        ### Final Output
        chart.buildhtml()
        # The context manager closes the file even if the write fails.
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

    return None
def gender(self,**kwargs):
    """
    Creates gender distribution figures.

    Only registered rows with user_id > 156633 are considered. The first
    series covers all such rows, the second only certified ones (left
    empty when the number of certified usernames does not exceed
    self.mincerts). Counts are converted to column percentages; only the
    Female and Male rows are plotted.

    Parameters (generated during class initialization)
    ----------
    NVD3 = False. If true, nvd3 interactive figure output.

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    NVD3 = kwargs.get('NVD3',False)

    ### Removes those users not having the option to fill in edX registration data.
    trim_data = self.person[(self.person.registered==1) & (self.person.user_id>156633)]

    ### Data
    gdict = {'f': "$Female$",'m': "$Male$",'o':"$Other$"}
    glist = ['$Female$','$Male$']

    def label_counts(frame):
        # Map raw gender codes to display labels and count each label.
        return frame.gender.dropna().apply(lambda x: gdict[x]).value_counts()

    ### Munge and Plot
    gender = label_counts(trim_data)
    certs = trim_data[trim_data.certified==1]
    if certs.username.count() > self.mincerts:
        certs = label_counts(certs)
    else:
        # Too few certificate earners to show: keep an all-NaN column.
        certs = pd.Series(index=gender.index)

    gender = pd.concat([gender,certs],join='inner',axis=1,keys=['$Non-Certified$','$Certified$'])
    gender = 100.*gender/gender.sum()
    gender = gender.apply(lambda x: np.round(x,1))

    fig = plt.figure(figsize=(12,6))
    ax1 = fig.add_subplot(1,1,1)
    gender.ix[glist,:].plot(ax=ax1,kind='bar',color=[xff.colors['neutral'],xff.colors['institute']],rot=0)

    ### Plot Details
    ax1.set_xticklabels([r'%s' % x for x in glist])
    ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],fontsize=30)
    ax1.legend(loc=2,prop={'size':28},frameon=False)

    ### Generalized Plotting functions
    figsavename = self.figpath+'gender_distribution_'+self.nickname.replace('.','_')
    print(figsavename)
    xff.texify(fig,ax1,xlabel=None,ylabel='Count',figsavename=figsavename+'.png')

    #----------------------------------------------------------------
    ### NVD3 Interactive
    if NVD3:
        from nvd3 import multiBarChart
        from nvd3 import pieChart

        # Labels must follow glist, the same order used to select Y1/Y2.
        # Taking them from gender.index (ordered by count, and including
        # "Other") could attach a label to the wrong bar.
        X = [ x.replace('$','') for x in glist ]
        Y1 = gender.ix[glist,'$Non-Certified$'].values
        Y2 = gender.ix[glist,'$Certified$'].values

        #----------------------------------------------------------------
        ### BAR Chart
        figsavename = self.figpath+'interactive_gender_distribution_'+self.nickname+'.html'
        print(figsavename)

        title = "Gender Distribution: %s" % self._xd.course_id
        charttype = 'multiBarChart'
        chart = multiBarChart(name=charttype, height=350, x_axis_format="", y_axis_format=".1f")
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        ### Series 1
        extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                        "color":xff.colors['neutral'],
                        "format":".1f"
                        }
        chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

        ### Series 2
        extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                        "color":xff.colors['institute'],
                        "format":".1f"
                        }
        chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

        ### Final Output
        chart.buildhtml()
        # The context manager closes the file even if the write fails.
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

        #----------------------------------------------------------------
        ### Pie Chart
        figsavename = self.figpath+'interactive_gender_piechart_'+self.nickname+'.html'
        print(figsavename)

        title = "Gender Pie Chart: %s" % self._xd.course_id
        # NOTE(review): the chart name is still 'multiBarChart' although
        # this is a pie chart; kept unchanged in case generated pages
        # depend on the name.
        charttype = 'multiBarChart'
        chart = pieChart(name=charttype, color_category='category20c', height=400, width=400)
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        extra_serie = {"tooltip": {"y_start": "", "y_end": " certified"}}
        chart.add_serie(y=Y1, x=X, extra=extra_serie)

        ### Final Output
        chart.buildhtml()
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

    return None
def last_day_plots(self, **kwargs):
    """
    Plots each column of self.lastact_df as a time series.

    The x-range shown is derived from the first column: it spans from 21
    days before the first index entry whose value exceeds `min_count` to
    21 days after the last such entry. When no entry exceeds the
    threshold, the whole index is used instead of failing. A JSON-ready
    record is assembled per column, although writing it to disk is
    currently disabled.

    Parameters
    ----------
    min_count : keyword, default 100. Minimum value (exclusive) in the
                first column for a date to count towards the x-range.

    Output
    ------
    Figures and respective formats.

    Returns
    -------
    None
    """
    min_count = kwargs.get('min_count', 100)

    col = self.lastact_df.columns[0]
    reference = self.lastact_df[col]
    busy = reference[reference > min_count]
    if len(busy) == 0:
        # No date clears the threshold: fall back to the full date range
        # rather than raising IndexError on busy.index[0].
        busy = reference
    mindate = busy.index[0] - timedelta(days=21)
    maxdate = busy.index[-1] + timedelta(days=21)

    ### For JSON Data Output
    jsondata = []

    def date2epoch(x):
        # Milliseconds between the Unix epoch and midnight of x.
        return int((datetime.combine(x, datetime.min.time()) -
                    datetime(1970, 1, 1)).total_seconds() * 1000)

    for C in self.lastact_df.columns:
        fig = plt.figure(figsize=(12, 6))
        ax1 = fig.add_subplot(1, 1, 1)
        self.lastact_df[C].plot(ax=ax1,
                                color=xff.colors['institute'],
                                rot=0,
                                lw=3,
                                label=self.nickname)

        ax1.set_xlim(mindate, maxdate)
        ax1 = xff.timeseries_plot_formatter(ax1)
        ax1.set_yticklabels(
            [r'${0}$'.format("%d" % (y)) for y in ax1.get_yticks()])

        ### Generalized Plotting functions
        figsavename = self.figpath + C.replace(
            ' ', '_') + '_' + self.nickname.replace('.', '_')
        print(figsavename)
        xff.texify(
            fig,
            ax1,
            ylabel=C,
            datefontsize=20,
            gridb='y',
            figsavename=figsavename + '.png')

        ### Append JSON Data
        record = collections.OrderedDict()
        record['key'] = C
        if C == 'Last Activity Count':
            record['bar'] = 'true'
        record['values'] = [[date2epoch(d), int(v)]
                            for d, v in self.lastact_df[C].iteritems()]
        jsondata.append(record)

    print("JSON dump currently commented out.")

    return None
def andrew_ho_diagram(self, **kwargs):
    """
    Plot showing the intersection of enrollment populations by
    categories defined in the 2013 (published 2014) course reports.

    Three 0/1 columns -- 'Only Registered', 'Only Viewed' and
    'Only Explored' -- are (re)written on self.person to mark the disjoint
    groups, then nested rectangles and two circles are drawn and annotated
    with each group's size and its share of all registered rows.

    Parameters
    ----------
    None

    Output
    ------
    Figures and respective formats.

    Returns
    -------
    None
    """
    person = self.person

    ### Registration Types
    for flag_col in ('Only Registered', 'Only Viewed', 'Only Explored'):
        person[flag_col] = 0

    def mark(user_ids, flag_col):
        # Set flag_col to 1 on every row whose user_id is in user_ids.
        rows = person[person.user_id.isin(user_ids)].index
        person.loc[rows, flag_col] = 1

    not_certified = person['certified'] == 0

    ### Create disjoint groups
    #Only Registered: A - (B+C+D)
    reg_list = person[(person['registered'] == 1) & (person['viewed'] == 0) &
                      (person['explored'] == 0) & not_certified].user_id
    mark(reg_list, 'Only Registered')

    #Only Viewed: B - (C+D)
    view_list = person[~person.user_id.isin(reg_list) &
                       (person['viewed'] == 1) & (person['explored'] == 0) &
                       not_certified].user_id
    mark(view_list, 'Only Viewed')

    #Only Explored: C - (D)
    exp_list = person[~person.user_id.isin(reg_list) &
                      ~person.user_id.isin(view_list) &
                      (person['explored'] == 1) & not_certified].user_id
    mark(exp_list, 'Only Explored')

    def group_size(column):
        return len(person[person[column] == 1])

    n_registered = group_size('registered')
    n_only_registered = group_size('Only Registered')
    n_only_viewed = group_size('Only Viewed')
    n_only_explored = group_size('Only Explored')
    n_certified = group_size('certified')

    ### Figure
    fig = plt.figure(figsize=(12, 8))
    ax1 = fig.add_subplot(111)

    #Circles: marker areas follow the group's percentage, within bounds.
    SCALE = 1000
    expratio = SCALE * 100. * n_only_explored / n_registered
    if expratio > 17000:
        expratio = 1000
    certratio = SCALE * 100. * n_certified / n_registered
    if certratio < 1000:
        certratio = 1000

    ax1.scatter([0.45], [0.5],
                s=expratio,
                edgecolor='Black',
                lw=2,
                color='white',
                alpha=0.8)
    ax1.scatter([0.50], [0.5],
                s=certratio,
                edgecolor=xff.colors['institute'],
                lw=2,
                color='white',
                alpha=0.8)

    #Rectangles: (lower-left corner, side length), outer then inner.
    for corner, side in (((0, 0), 1), ((0.15, 0.15), 0.7)):
        ax1.add_patch(
            matplotlib.patches.Rectangle(corner,
                                         side,
                                         side,
                                         fill=False,
                                         fc='white',
                                         ec='black',
                                         lw=2))

    FS = 25
    # (x position, y position, text template, group size)
    annotations = [
        (0.05, 0.925, '$Only\ registered:\ %d\ \ (%.1f\%%)$', n_only_registered),
        (0.2, 0.775, '$Only\ viewed:\ %d\ \ (%.1f\%%)$', n_only_viewed),
        (0.175, 0.2, '$Only\ explored:$\n$%d\ \ (%.1f\%%)$', n_only_explored),
        (0.625, 0.35, '$Certified:$\n$%d\ \ (%.1f\%%)$', n_certified),
    ]
    for xpos, ypos, template, count in annotations:
        share = 100.0 * count / n_registered
        ax1.text(xpos, ypos, template % (count, share), fontsize=FS, ha='left')

    ax1.set_xlim([-0.01, 1.01])
    ax1.set_ylim([-0.01, 1.01])

    ## suppress spines
    for key in ax1.spines.keys():
        ax1.spines[key].set_color('none')
    ax1.axes.get_xaxis().set_visible(False)
    ax1.axes.get_yaxis().set_visible(False)

    ### Generalized Plotting functions
    figsavename = self.figpath + 'AHO_Diagram' + '_' + self.nickname.replace(
        '.', '_')
    xff.texify(fig, ax1, figsavename=figsavename + '.png')

    return None
def content_touches_viz(self,horiz_w_data,certified):
    '''
    Draw a two-panel "content touches" figure and save it as a PNG.

    The upper axis (ax1) is labelled 'Course Structure Index' versus
    'Unique Users'; the lower axis (ax2) has its ticks and frame removed
    and shares the upper axis's x-limits.

    Parameters
    ----------
    horiz_w_data: table providing the 'order', 'users', 'scale' and
                  'color' fields read below.
    certified: when True the output file name contains '_certified_'.

    Output
    ------
    Saves <figpath>content_touches_horizontal[_certified]_<nickname>.png

    Returns
    -------
    None
    '''
    fig = plt.figure(figsize=(32,12))
    fig.subplots_adjust(hspace=.1)
    #plt.rcParams.update({'font.size': 20})
    ax1 = fig.add_subplot(2,1,1)
    ax2 = fig.add_subplot(2,1,2)

    #ax1.plot(tmp['order'],tmp['uniqU'],'o')
    # NOTE(review): the start of this statement is missing in the file as
    # seen here; presumably a bar-plot call on ax1 -- restore from version
    # control before running.
    bars1 =['order'],horiz_w_data['users'],3.0,alpha=0.8,edgecolor='none')
    #Colors the bars (this is where reindexing matters)
    for i,b in enumerate(bars1):
        flipi = i#len(tmp.index)-1-i
        # 'Pink' acts as a sentinel: those bars are made invisible.
        if horiz_w_data.color[flipi] != 'Pink':
            bars1[i].set_facecolor(horiz_w_data.color[flipi])
            bars1[i].set_edgecolor(horiz_w_data.color[flipi])
        else:
            bars1[i].set_facecolor('none')
            bars1[i].set_edgecolor('none')

    #ax1.plot(tmp['order'],tmp.uniqU,'-o',color='Silver',alpha=0.8)
    ax1.set_xlabel('Course Structure Index')
    ax1.set_ylabel('Unique Users')
    #ax1.set_xlim(0,2100)

    # NOTE(review): truncated like bars1 above; presumably a bar-plot call
    # on ax2.
    bars2 =['order'],horiz_w_data.scale,2.5,edgecolor='none')
    #Colors the bars (this is where reindexing matters)
    for i,b in enumerate(bars2):
        flipi = i#len(vert.index)-1-i
        bars2[i].set_facecolor(horiz_w_data.color[flipi])

    invert = True # Choose whether to have the CC plot left or right oriented
    ha = 'left'
    if invert == True:
        ax2.invert_yaxis()
        ha = 'right'

    ax2.axes.get_xaxis().set_ticks([])
    #ax2.set_xlim(0,500)
    ax2.axes.get_yaxis().set_ticks([])
    #ax2.set_xlim(0,2100)
    ax2.set_ylim(7,-1)
    #fig.patch.set_visible(False)
    ax2.axis('off')

    #!!!!! x limits must be set together
    # ax1 copies ax2's limits so both panels line up on the same scale.
    ax1.set_xlim(ax2.get_xlim()[0],ax2.get_xlim()[1])
    #ax1.set_ylim(0,7500)

    xff.texify(fig,ax1,tic_size=32,label_size=32)

    dpiset = 300
    if certified==True:
        figsavename = self.figpath+'content_touches_horizontal_certified_'+self.nickname.replace('.','_')+'.png'
    else:
        figsavename = self.figpath+'content_touches_horizontal_'+self.nickname.replace('.','_')+'.png'
    fig.savefig(figsavename, bbox_inches='tight', dpi=dpiset)

    return None
def level_of_education(self,**kwargs):
    """
    Creates distribution of highest level of education attained;
    typically taken from the edX enrollment questionnaire.

    Raw LoE codes are collapsed into five display levels (codes that map
    to no level are ignored). Only registered rows with user_id > 156633
    are considered. The first series covers all such rows, the second
    only certified ones (left empty when the number of certified usernames
    does not exceed self.mincerts). Counts become column percentages
    rounded to one decimal.

    Parameters (generated during class initialization)
    ----------
    NVD3 = False. If true, nvd3 interactive figure output.

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    NVD3 = kwargs.get('NVD3',False)

    ### Level of Education (LoE)
    ### Data
    eddict = {'el': "Less\ than$\n$ Secondary",'jhs': "Less\ than$\n$ Secondary",'none':"Less\ than$\n$ Secondary",
              'hs':"Secondary",'a':"Secondary",
              'b':"Bachelor\'s",
              'm': "Master\'s",
              'p_se': "Doctorate",'p_oth': "Doctorate",'p': "Doctorate",
              'other': None,'NA':None,'nan':None,
              }
    edlist = ["Less\ than$\n$ Secondary","Secondary","Bachelor\'s","Master\'s","Doctorate"]

    def level_counts(frame):
        # Map codes to display levels (unknown codes -> None, dropped by
        # value_counts). reindex keeps all five levels in display order and
        # yields NaN, not an error, for a level nobody selected.
        levels = frame.LoE.apply(lambda x: eddict[str(x)] if x in eddict else None)
        return levels.value_counts().reindex(edlist)

    trim_data = self.person[(self.person.registered==1) & (self.person.user_id>156633)]
    edlevels = level_counts(trim_data)

    certified = trim_data[trim_data.certified==1]
    if certified.username.count() > self.mincerts:
        certs = level_counts(certified)
    else:
        # Too few certificate earners to show: keep an all-NaN column.
        certs = pd.Series(index=edlevels.index)

    edlevels = pd.concat([edlevels,certs],join='inner',axis=1,keys=['$Non-Certified$','$Certified$'])
    edlevels = 100.*edlevels/edlevels.sum()
    edlevels = edlevels.apply(lambda x: np.round(x,1))

    #Plot
    fig = plt.figure(figsize=(12,6))
    ax1 = fig.add_subplot(1,1,1)
    edlevels.plot(ax=ax1,kind='bar',color=[xff.colors['neutral'],xff.colors['institute']],rot=40)

    ### Plot Details
    ax1.set_xticklabels([r'$%s$' % x for x in edlist])
    ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],fontsize=30)
    ax1.legend(loc=2,prop={'size':22},frameon=False)

    ### Generalized Plotting functions
    figsavename = self.figpath+'loe_distribution_'+self.nickname.replace('.','_')
    print(figsavename)
    xff.texify(fig,ax1,xlabel=None,ylabel=None,figsavename=figsavename+'.png')

    #----------------------------------------------------------------
    ### NVD3 Interactive
    if NVD3:
        from nvd3 import multiBarChart

        ### Output File
        figsavename = self.figpath+'interactive_edlevel_distribution_'+self.nickname+'.html'
        print(figsavename)

        title = "Education Level Distribution: %s" % self._xd.course_id
        charttype = 'multiBarChart'
        chart = multiBarChart(name=charttype, height=350, x_axis_format="", y_axis_format=".1f")
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        # Strip the TeX spacing/line-break markup from the labels for HTML.
        X = [ x.replace('\ ',' ').replace('$\n$',' ') for x in edlevels.index ]
        Y1 = edlevels['$Non-Certified$'].values
        Y2 = edlevels['$Certified$'].values

        ### Series 1
        extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                        "color":xff.colors['neutral'],
                        "format":".1f"
                        }
        chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

        ### Series 2
        extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                        "color":xff.colors['institute'],
                        "format":".1f"
                        }
        chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

        ### Final Output
        chart.buildhtml()
        # The context manager closes the file even if the write fails.
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

    return None
def age(self, **kwargs):
    """
    Creates age distribution figures.

    Ages are derived from the ``YoB`` column, counted in nine decade-wide
    bins covering 0-90 and plotted as percentages. The certified series is
    only filled in when the number of certified users exceeds
    ``self.mincerts``; otherwise it is left empty (NaN).

    Parameters (generated during class initialization)
    ----------
    NVD3 = False. If true, nvd3 interactive figure output.
    ref_year = None. Year from which YoB is subtracted to obtain an age.
        Defaults to the current calendar year.

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    import datetime
    import numbers

    NVD3 = kwargs.get('NVD3', False)
    ref_year = kwargs.get('ref_year')
    if ref_year is None:
        # NOTE(review): the original expression was "- x" (a negated birth
        # year), which pushes every value outside the 0-90 histogram range.
        # The reference year appears to have been lost; the current year is
        # used as the default -- confirm against the intended analysis.
        ref_year = datetime.date.today().year

    decades = ['0-9', '10-19', '20-29', '30-39', '40-49',
               '50-59', '60-69', '70-79', '80-89']

    def _age_from_yob(yob):
        # Accept any real number (python or numpy, int or float) but skip
        # NaN / None / strings.
        if isinstance(yob, numbers.Real) and not pd.isnull(yob):
            return ref_year - yob
        return None

    def _decade_counts(ages):
        counts, _ = np.histogram(ages.dropna().values.astype(float),
                                 bins=9, range=(0, 90))
        return pd.Series(data=counts, index=decades)

    ### Removes those users not having the option to fill in edX registration data.
    # .copy() so that adding the 'age' column does not write into a slice of
    # self.person.
    trim_data = self.person[(self.person.registered == 1) &
                            (self.person.user_id > 156633)].copy()

    # Add age column from year_of_birth
    trim_data['age'] = trim_data['YoB'].apply(_age_from_yob)

    # NOTE: the series labelled '$Non-Certified$' is computed from every row
    # of trim_data, certified users included.
    age = _decade_counts(trim_data.age)

    certs = trim_data[trim_data.certified == 1]
    if certs.username.count() > self.mincerts:
        certs = _decade_counts(certs.age)
    else:
        # Too few certificate earners: keep the column but leave it empty.
        certs = pd.Series(index=age.index, dtype=float)

    age = pd.concat([age, certs], join='inner', axis=1,
                    keys=['$Non-Certified$', '$Certified$'])
    age = 100. * age / age.sum()
    age = age.apply(lambda x: np.round(x, 1))

    #----------------------------------------------------------------
    ### Static Matplotlib PNG
    fig = plt.figure(figsize=(12, 6))
    ax1 = fig.add_subplot(1, 1, 1)
    age.plot(ax=ax1, kind='bar',
             color=[xff.colors['neutral'], xff.colors['institute']], rot=40)

    ### Plot Details
    ax1.set_xticklabels([r'$%s$' % x for x in age.index])
    ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],
                        fontsize=30)
    ax1.legend(loc=1, prop={'size': 28}, frameon=False)

    ### Generalized Plotting functions
    figsavename = self.figpath + 'age_distribution_' + self.nickname.replace('.', '_')
    print(figsavename)
    xff.texify(fig, ax1, xlabel='Age', ylabel=None, figsavename=figsavename + '.png')

    #----------------------------------------------------------------
    ### NVD3 Interactive
    if NVD3:
        ### FIGURE
        from nvd3 import multiBarChart

        ### Output File
        figsavename = self.figpath + 'interactive_age_distribution_' + self.nickname + '.html'
        print(figsavename)

        title = "Age Distribution: %s" % self._xd.course_id
        charttype = 'multiBarChart'
        chart = multiBarChart(name=charttype, height=350,
                              x_axis_format="", y_axis_format=".1f")
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        X = age.index
        Y1 = age['$Non-Certified$'].values
        Y2 = age['$Certified$'].values

        ### Series 1
        extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                        "color": xff.colors['neutral'],
                        "format": ".1f"}
        chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

        ### Series 2
        extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                        "color": xff.colors['institute'],
                        "format": ".1f"}
        chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

        ### Final Output
        chart.buildhtml()
        # Build first, then write inside a context manager so the handle is
        # always closed and no empty file is left behind if building fails.
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

    return None
def grade_vs_nchapters(self, **kwargs):
    """
    Scatter plot of final grade versus nchapters accessed. All points are
    jittered for clarity. Text labels are added to indicate subpopulations.

    Requires the 'nchapters', 'grade' and 'certified' columns in
    ``self.person``; if any is missing a message is printed and nothing is
    plotted.

    Parameters
    ----------
    NVD3 = False. If true, nvd3 interactive figure output.

    Output
    ------
    Figures and respective formats.

    Returns
    -------
    None
    """
    NVD3 = kwargs.get('NVD3', False)

    required = ('nchapters', 'grade', 'certified')
    if any(col not in self.person for col in required):
        print("One of the three columns necessary for this plot is missing. Check person_course for: 'nchapters','grade', and 'certified'.")
        return None

    ### Data
    data = self.person[['nchapters', 'grade', 'certified']].copy()

    # Jitter: chapters are spread symmetrically (+/- chap_jmax/2), grades are
    # only nudged upward by at most grade_jmax.
    chap_jmax = 0.75
    grade_jmax = 0.005
    npoints = len(data)
    data['nchapters'] = data['nchapters'] + chap_jmax * (np.random.sample(npoints) - 0.5)
    data['grade'] = data['grade'] + grade_jmax * np.random.sample(npoints)
    data = data.dropna()

    # Lowest grade among certified users marks the certification cutoff line.
    certcut = self.person[self.person['certified'] == 1].grade.min()

    ### Plot
    fig = plt.figure(figsize=(12, 10))
    ax1 = fig.add_subplot(1, 1, 1)

    #Non-Certs
    data[data.certified == 0].plot('nchapters', 'grade', style='.',
                                   color=xff.colors['neutral'],
                                   label=self.nickname, ax=ax1)
    #Certified
    data[data.certified == 1].plot('nchapters', 'grade', style='.',
                                   color=xff.colors['institute'], ax=ax1)

    ### Illustrations (labels)
    # Smallest of the 20 largest chapter counts among certified users
    # (original note: "Funny, but this cuts off staff"). numpy sort is used
    # instead of Series.order(), which newer pandas no longer provides.
    cert_chapters = self.person[self.person.certified == 1].nchapters.dropna().values
    ncmax = np.sort(cert_chapters)[-20:].min()

    ax1.hlines(certcut, 0, ncmax + 1, lw=2)
    ax1.vlines(int(ncmax / 2), 0, certcut, lw=2)
    ax1.text(ncmax / 6, certcut - 0.05, '$Viewed$', fontsize=30, alpha=0.75)
    ax1.text(ncmax / 2 + (ncmax / 4), certcut - 0.05, '$Explored$',
             fontsize=30, alpha=0.75)
    ax1.text(ncmax / 2, 0.8, '$Certified$', fontsize=30, alpha=0.75,
             horizontalalignment='center')

    ### Plot Details
    # Set the limits before freezing tick labels so that the label text is
    # generated from the ticks of the final axis range.
    ax1.set_xlim(0, ncmax + 1)
    ax1.set_ylim(0, 1.01)
    ax1.set_xticklabels([r'$%0.f$' % x for x in ax1.get_xticks()])
    ax1.set_yticklabels([r'$%0.1f$' % x for x in ax1.get_yticks()], fontsize=30)

    ### Generalized Plotting functions
    figsavename = self.figpath + 'scatter_grade_vs_nchapters_' + self.nickname.replace('.', '_')
    xff.texify(fig, ax1,
               xlabel='Chapters Viewed',
               ylabel='Grade',
               gridb='y',
               figsavename=figsavename + '.png')

    #----------------------------------------------------------------
    ### NVD3 Interactive
    if NVD3:
        data = data[(data.nchapters > 0) & (data.grade > 0)].dropna()

        # Down-sample for the interactive chart. Sampling is done without
        # replacement and only when there are more rows than the cap, so
        # small (or empty) data sets are passed through unchanged.
        max_points = 2000
        if len(data) > max_points:
            keep = np.random.choice(len(data), max_points, replace=False)
            data = data.iloc[keep]

        noncert = data[data.certified == 0]
        cert = data[data.certified == 1]
        X1 = noncert.nchapters.values
        Y1 = noncert.grade.values
        X2 = cert.nchapters.values
        Y2 = cert.grade.values

        ### FIGURE
        from nvd3 import scatterChart

        ### Output File
        figsavename = self.figpath + 'interactive_scatter_grade_nchap_' + self.nickname + '.html'
        print(figsavename)

        # NOTE(review): sibling methods read the course id from self._xd;
        # this one uses self._xdata -- confirm which attribute exists.
        title = "Scatter Plot Grade vs Chapters Viewed: %s" % self._xdata.course_id
        chart = scatterChart(name=title, width=850, height=550, x_is_date=False,
                             x_axis_format=".1f", y_axis_format=".1f")
        chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

        marker = {'shape': 'circle', 'size': '3'}
        # NOTE(review): the " calls" tooltip suffix looks copied from another
        # chart; left unchanged.
        extra_serie = {"tooltip": {"y_start": "", "y_end": " calls"}}

        chart.add_serie(name="Participants", y=Y1, x=X1, extra=extra_serie, **marker)
        chart.add_serie(name="Certified", y=Y2, x=X2, extra=extra_serie, **marker)

        ### Final Output
        chart.buildhtml()
        # Build first, then write inside a context manager so the handle is
        # always closed and no empty file is left behind if building fails.
        with open(figsavename, 'w') as output_file:
            output_file.write(chart.htmlcontent)

    return None
def scatter_bubble_size(self, colx, coly, disc_act, figsave=False):
    """
    Creates scatter plot with x=colx, y=coly. Size of markers always
    proportional to disc_act (discussion activity). All three columns are
    jittered; any of them named 'time_in_course' is log-transformed.

    Parameters (generated during class initialization)
    ----------
    colx: column to be plotted on x-axis
    coly: column to be plotted on y-axis
    disc_act: column for scaling marker (bubble) size
    figsave: True/False to allow exploratory analysis without saving fig.
        NOTE(review): this flag is currently not consulted -- the figure
        name is always passed to xff.texify. Left as is so existing callers
        keep getting their output; confirm the intended default.

    Output
    ------
    Saves figures to specified directories.

    Returns
    -------
    None
    """
    time_col = 'time_in_course'

    ### Data
    data = self.person[[colx, coly, disc_act, 'certified']].copy()

    Jcolx = 0.75
    Jcoly = 0.01
    Jcolz = 10
    bscale = 0.2

    npoints = len(data)
    # x jitter is centred on zero (+/- Jcolx/2), matching grade_vs_nchapters;
    # the previous "sample - Jcolx" form shifted every point to the left.
    data[colx] = data[colx] + Jcolx * (np.random.sample(npoints) - 0.5)
    data[coly] = data[coly] + Jcoly * np.random.sample(npoints)
    data[disc_act] = data[disc_act].fillna(1.0) + Jcolz * np.random.sample(npoints)

    ### Take top N discussants and clamp their dot size.
    # NOTE(review): the original comment says the size is set to "the lowest
    # of the set", but the code uses the second-lowest of the top N (index 1
    # in ascending order), so the lowest of the N keeps its own value. That
    # behaviour is preserved here -- confirm the intent.
    Nd = 5
    sizes = data[disc_act].values.copy()
    top_pos = np.argsort(sizes)[-Nd:]
    if len(top_pos) > 1:
        sizes[top_pos] = sizes[top_pos[1]]
        data[disc_act] = sizes

    # Log-transform time once, even if the column is named more than once.
    if time_col in (colx, coly, disc_act):
        data[time_col] = np.log(data[time_col])

    fig = plt.figure(figsize=[12, 10])
    ax1 = fig.add_subplot(1, 1, 1)

    #Non-Certs
    tmp = data[data.certified == 0]
    ax1.scatter(tmp[colx], tmp[coly], s=bscale * tmp[disc_act],
                color=xff.colors['neutral'])
    #Certified
    tmp = data[data.certified == 1]
    ax1.scatter(tmp[colx], tmp[coly], s=bscale * tmp[disc_act],
                color=xff.colors['institute'])

    ### Axis formatting
    # Log-time limits and tick labels only make sense on an axis that
    # actually shows log(time_in_course), so they are applied per axis.
    time_ticks = [np.log(x) for x in [600, 3600, 10 * 3600, 100 * 3600]]
    time_labels = ['10 min', '1 hr', '10 hrs', '100 hrs']

    if coly == time_col:
        ax1.set_ylim(6, 16)
        ax1.set_yticks(time_ticks)
        ax1.set_yticklabels(time_labels)
    else:
        ax1.set_ylim(-0.05, 1.05)

    if colx == time_col:
        ax1.set_xlim(6, 16)
        ax1.set_xticks(time_ticks)
        ax1.set_xticklabels(time_labels, rotation=40)

    ### Generalized Plotting functions
    figsavename = (self.figpath + 'scatter_' + colx + '_' + coly +
                   '_disc_size_' + self.nickname.replace('.', '_'))
    ylabel = coly.replace('_', ' ')
    if ylabel == 'time in course':
        ylabel = 'Total Time In Course'

    xff.texify(fig, ax1,
               xlabel=colx.replace('_', ' '),
               ylabel=ylabel,
               title='bubble size proportional to %s' % (disc_act.replace('_', ' ')),
               tic_size=20, label_size=24, datefontsize=20,
               figsavename=figsavename + '.png')

    return None
