Пример #1
0
    def time_on_course(self):
        """
        Plot the distribution of total time spent in the course.

        Two histograms of log(self.time_spent) are overlaid on a single
        axis: non-certified users are drawn first, certified users on top.
        The x ticks are placed at log(seconds) and labelled with readable
        durations from 1 sec to 100 hrs.

        This method only reads self.time_spent; it does not compute it.
        Per the original notes, the estimate is presumably derived from
        self.potime by grouping click events by username, sorting by time
        and summing time differences >= 10 sec and <= 3600 sec, so only
        users with more than one event have an estimate (TODO confirm
        against the code that fills self.time_spent).
        Justification for those cutoffs can be found here:
        http://cacm.acm.org/magazines/2014/4/173221-who-does-what-in-a-massive-open-online-course/fulltext
        Submitted Paper: "eText Use in Blended Introductory Physics Courses: Interpreting Meaningful Interactions and the Effects of Course Structure"

        Parameters
        ----------
        None

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'time_in_course_' + nickname + '.png') to xff.texify.

        Returns
        -------
        None
        """

        fig = plt.figure(figsize=[12,6])
        ax1 = fig.add_subplot(1,1,1)

        # (certified flag, fill colour, legend label) in drawing order
        groups = [(0, 'Silver', "$Non-Certified$"),
                  (1, 'Crimson', "$Certified$")]
        for flag, shade, legend_label in groups:
            usernames = self.person[self.person.certified==flag].username.dropna().unique()
            log_seconds = self.time_spent[usernames].apply(np.log)
            log_seconds.hist(ax=ax1,bins=100,range=[0,16],
                             color=shade,edgecolor=None,alpha=0.9,label=legend_label)

        # Tick positions are in log(seconds) to match the histogram's x axis.
        tick_seconds = [1,10,60,600,3600,10*3600,100*3600]
        tick_names = ['1 sec','10 sec','1 min','10 min','1 hr','10 hrs','100 hrs']
        ax1.set_xticks([np.log(sec) for sec in tick_seconds])
        ax1.set_xticklabels(tick_names,rotation=40)
        ax1.legend()

        figsavename = self.figpath+'time_in_course_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   xlabel='Total Time In Course',
                   ylabel='Count',
                   tic_size=24,
                   label_size=24,
                   gridb='y',
                   figsavename=figsavename+'.png')

        return None
Пример #2
0
    def daily_unique_users(self):
        """
        Plot the number of unique active users per day, split by certification.

        For each group (certified == 0, then certified == 1) the events in
        self.potime belonging to that group's usernames are cross-tabulated
        as username x date, and for every date the users with at least one
        event are counted. Both daily series are drawn on one axis, with
        dashed vertical lines at the course start and end dates taken from
        self.cinfo. The x range spans from two weeks before the start date
        to four weeks after the end date.

        Parameters
        ----------
        None

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'daily_unique_users_' + nickname + '.png') to
        xff.texify.

        Returns
        -------
        None
        """
        fig = plt.figure(figsize=[20, 6])
        ax1 = fig.add_subplot(1, 1, 1)

        # (certified flag, line colour, legend label) in drawing order
        series_style = [(0, 'Silver', '$Non-Certified$'),
                        (1, 'Crimson', '$Certified$')]

        for flag, shade, legend_label in series_style:
            in_group = self.person.certified == flag
            names = self.person[in_group].username.dropna().unique()
            group_events = self.potime[self.potime.username.isin(names)]
            # Rows: usernames of this group; columns: dates; cells: event counts.
            per_user_day = pd.crosstab(group_events.username, self.potime.date)
            active_users = per_user_day[per_user_day > 0].count().sort_index()
            active_users.index = [np.datetime64(day) for day in active_users.index]
            active_users.plot(ax=ax1, style="-o", ms=6, lw=2,
                              color=shade, rot=0, label=legend_label)

        two_weeks = np.timedelta64(2, 'W')
        four_weeks = np.timedelta64(4, 'W')
        xmin = (self.cinfo['start_date'] - two_weeks).item().date()
        xmax = (self.cinfo['end_date'] + four_weeks).item().date()

        ax1.set_xlim(xmin, xmax)
        ax1 = xff.timeseries_plot_formatter(ax1, interval=1)
        ax1.legend(loc=1, prop={'size': 24}, frameon=False)

        # Course start/end markers span the full current y range.
        top = ax1.get_ylim()[1]
        course_bounds = [self.cinfo.start_date.item().date(),
                         self.cinfo.end_date.item().date()]
        ax1.vlines(course_bounds, 0, top, colors='Gray', lw=1.5, linestyles='--')
        ax1.set_ylim(0, top)

        figsavename = self.figpath + 'daily_unique_users_' + self.nickname.replace('.', '_')
        xff.texify(fig, ax1,
                   ylabel='Unique Users',
                   tic_size=20,
                   label_size=24,
                   datefontsize=20,
                   title=self.nickname,
                   figsavename=figsavename + '.png')

        return None
Пример #3
0
    def resource_use(self, category):
        """
        Plot how many distinct resources of one category each user touched.

        Resources are the index entries of self.caxis whose `category`
        column equals `category`. For every row of self.freq (presumably
        one row per username; the result is indexed by usernames below)
        the non-null cells among those resource columns are counted, and
        the counts are drawn as normalized histograms for non-certified
        and certified users on the same axis.

        Parameters
        ----------
        category : str
            Value of self.caxis.category to select, e.g. 'video', 'problem'.

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'resource_use_' + category + '_dist_' + nickname
        + '.png') to xff.texify. Nothing is plotted when the category has
        no resources.

        Returns
        -------
        None
        """

        resources = self.caxis[self.caxis.category == category].index.unique()
        if len(resources) == 0:
            # One bin per resource is used below; zero bins would make the
            # histogram call raise, so bail out with a notice instead.
            print("No resources found for category: %s" % category)
            return None

        usage = self.freq.filter(resources).count(axis=1)

        bins = len(resources)
        brange = (0, usage.max())
        sgcolors = ['Silver', 'Crimson']
        sglabels = ['$Non-Certified$', '$Certified$']

        fig = plt.figure(figsize=[12, 6])
        ax1 = fig.add_subplot(1, 1, 1)

        ### Note, later this could be all four subgroups: explored, certified, etc
        for sg in [0, 1]:
            subpop = self.person[self.person.certified ==
                                 sg].username.dropna().unique()
            # Draw on ax1 explicitly rather than on pyplot's current axes.
            # NOTE(review): `normed` was replaced by `density` in newer
            # matplotlib releases; kept as-is for the version this file targets.
            usage[subpop].hist(
                ax=ax1,
                range=brange,
                bins=bins,
                cumulative=0,
                normed=True,
                alpha=0.85,
                edgecolor=sgcolors[sg],
                color=sgcolors[sg])

        # Labels are matched to the histograms by drawing order.
        ax1.legend(sglabels, loc=9, prop={'size': 24}, frameon=False)

        figsavename = self.figpath + 'resource_use_' + category + '_dist_' + self.nickname.replace(
            '.', '_')
        xff.texify(fig,
                   ax1,
                   xlabel='Unique %s resources' % (category),
                   ylabel='Normalized Count',
                   title=self.nickname,
                   figsavename=figsavename + '.png')

        return None
Пример #4
0
    def enrollment_plots(self,**kwargs):
        """
        Draw one time-series figure per column of self.enrollment_df.

        Each column is plotted against the frame's index, formatted with
        xff.timeseries_plot_formatter and handed to xff.texify together
        with a target path built from the column name and the course
        nickname. A list of JSON-style records (key / values as
        [epoch-milliseconds, int value] pairs) is assembled alongside, but
        writing it to disk is currently disabled, so the records are
        discarded when the method returns.

        Parameters
        ----------
        **kwargs : accepted but not read by this method.

        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        """

        def date2epoch(x):
            # Milliseconds between 1970-01-01 and midnight of date x.
            midnight = datetime.combine(x, datetime.min.time())
            elapsed = midnight - datetime(1970,1,1)
            return int( elapsed.total_seconds()*1000 )

        ### For JSON Data Output
        jsondata = []

        for C in self.enrollment_df.columns:
            series = self.enrollment_df[C]

            fig = plt.figure(figsize=(12,6))
            ax1 = fig.add_subplot(1,1,1)
            series.plot(ax=ax1,color=xff.colors['institute'],rot=0,lw=3,label=self.nickname)

            ax1 = xff.timeseries_plot_formatter(ax1)
            ylabels = [r'${0}$'.format("%d" % (y)) for y in ax1.get_yticks()]
            ax1.set_yticklabels(ylabels)

            ### Generalized Plotting functions
            figsavename = self.figpath+C.replace(' ','_')+'_'+self.nickname.replace('.','_')
            print(figsavename)
            xff.texify(fig,ax1,
                       ylabel=C,
                       datefontsize=20,
                       gridb='y',
                       figsavename=figsavename+'.png')

            ### Append JSON Data
            record = collections.OrderedDict()
            record['key'] = C
            if C == 'Enroll Count':
                record['bar'] = 'true'
            record['values'] = [[date2epoch(d),int(v)] for d,v in series.iteritems()]
            jsondata.append(record)

        # The JSON dump (self.figpath+'enrollment.json') is intentionally disabled.
        print("JSON dump currently commented out.")

        return None
Пример #5
0
    def scatter_bubble_size(self,DF,colx,coly,disc_act,figsave=False):
        """
        Creates scatter plot with x=colx, y=coly.
        Size of markers always proportional to disc_act (discussion activity).

        A small random jitter is added to both coordinates so that
        overlapping points remain distinguishable. Missing disc_act values
        are replaced by 1.0 before scaling. Non-certified rows are drawn
        first, certified rows on top.

        Parameters
        ----------
        DF: DataFrame holding colx, coly, disc_act and a 'certified' column
        colx: column to be plotted on x-axis
        coly: column to be plotted on y-axis
        disc_act: column for scaling marker (bubble) size
        figsave: True/False, intended to allow exploratory analysis
            without saving fig.
            NOTE(review): this flag is not consulted; the figure is always
            saved. Left unchanged so existing callers keep their output.

        Output
        ------
        Saves the figure to
        self.figpath + 'scatter_<colx>_<coly>_disc_size_<nickname>.png'.

        Returns
        -------
        None
        """

        ### Data
        data = DF[[colx,coly,disc_act,'certified']].copy()
        Jcolx = 0.75   # jitter scale (and offset) applied to the x column
        Jcoly = 0.01   # jitter scale applied to the y column
        bscale = 0.2   # marker area per unit of disc_act

        # One vectorised draw per column instead of a Python call per row.
        npoints = len(data)
        data[colx] = data[colx] + Jcolx*(np.random.sample(npoints)-Jcolx)
        data[coly] = data[coly] + Jcoly*np.random.sample(npoints)
        data[disc_act] = data[disc_act].fillna(1.0)

        fig = plt.figure(figsize=[12,10])
        ax1 = fig.add_subplot(1,1,1)
        #Non-Certs
        tmp = data[data.certified==0]
        ax1.scatter(tmp[colx],tmp[coly],s=bscale*tmp[disc_act],color=xff.colors['neutral'])
        #Certified
        tmp = data[data.certified==1]
        ax1.scatter(tmp[colx],tmp[coly],s=bscale*tmp[disc_act],color=xff.colors['institute'])

        # Only the left x limit is fixed; the right one stays automatic.
        ax1.set_xlim(-0.05,)
        ax1.set_ylim(-0.05,1.05)

        ### Generalized Plotting functions
        xff.texify(fig,ax1,xlabel=colx,ylabel=coly,
                   title='bubble size proportional to %s' % (disc_act.replace('_',' ')),
                   tic_size=20,label_size=24,datefontsize=20)

        figsavename = self.figpath+'scatter_'+colx+'_'+coly+'_disc_size_'+self.nickname.replace('.','_')+'.png'
        fig.savefig(figsavename, bbox_inches='tight', dpi=300)

        return None
Пример #6
0
    def daily_activity(self):
        """
        Creates daily timeseries of discussion activity (only posts/comments/votes from forum data).

        Rows of self.forum with a non-null created_at are counted per
        calendar day, separately for authors who are certified and for all
        other authors. Certified authors are identified by comparing
        author_id with the stringified user_id values of certified rows in
        self.pc_plus. Dashed vertical lines mark the course start and end
        dates from self.cinfo.

        Parameters
        ----------
        None

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'discussion_activity_' + nickname + '.png') to
        xff.texify.

        Returns
        -------
        None
        """

        fig = plt.figure(figsize=[20,6])
        ax1 = fig.add_subplot(1,1,1)

        ### List of Certified Participants (as strings, to match author_id)
        cert_ids = [str(uid) for uid in self.pc_plus[self.pc_plus.certified==1].user_id.unique()]

        has_timestamp = self.forum.created_at.notnull()
        by_certified = self.forum.author_id.isin(cert_ids)

        # (author certified?, line colour, legend label) in drawing order
        groups = [(False, xff.colors['neutral'], '$Non-Certified$'),
                  (True, xff.colors['institute'], '$Certified$')]
        for wanted, shade, legend_label in groups:
            rows = self.forum[ has_timestamp & (by_certified==wanted) ]
            per_day = rows.created_at.apply(lambda stamp: stamp.date()).value_counts().sort_index()
            per_day.plot(ax=ax1,style="-o",ms=6,lw=2,color=shade,label=legend_label)

        two_weeks = np.timedelta64(2,'W')
        four_weeks = np.timedelta64(4,'W')
        xmin = (self.cinfo['start_date'] - two_weeks).item().date()
        xmax = (self.cinfo['end_date'] + four_weeks).item().date()
        ax1.set_xlim(xmin,xmax)

        # Course start/end markers span the full current y range.
        top = ax1.get_ylim()[1]
        course_bounds = [self.cinfo.start_date.item().date(),self.cinfo.end_date.item().date()]
        ax1.vlines(course_bounds, 0, top, colors='Gray', lw=1.5, linestyles='--')
        ax1.set_ylim(0,top)

        ax1 = xff.timeseries_plot_formatter(ax1,interval=1)
        ax1.legend(loc=1,prop={'size':24},frameon=False)

        figsavename = self.figpath+'discussion_activity_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   ylabel='Forum Text Submissions',
                   tic_size=20,
                   label_size=24,
                   datefontsize=20,
                   title=self.nickname,
                   figsavename=figsavename+'.png')

        return None
Пример #7
0
    def bubble_heat_rel_week_vs_chapter(self):
        """
        Bubble Heat - Unique users relative week versus chapter accessed
        Color and bubble size indicate population size.

        Unique usernames are counted per (rel_week, module_id) in
        self.potime, joined with the 'index' and 'chapter' columns of
        self.caxis, and modules with MINUSERS (20) or fewer unique users
        are dropped. Per (chapter, rel_week) the minimum 'index' gives the
        x position, rel_week the y position, and the maximum unique-user
        count sets both marker size and colour.

        Parameters
        ----------
        None

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'bubble_heat_rel_week_vs_chapter_' + nickname
        + '.png') to xff.texify.

        Returns
        -------
        None
        """

        MINUSERS = 20

        ### Data
        per_module = self.potime.groupby(['rel_week','module_id']).username.apply(lambda names: len(names.unique()))
        per_module = per_module.reset_index().rename(columns={0:'uniqU'})
        per_module = pd.merge(per_module,self.caxis[['index','chapter']],how='left',left_on='module_id',right_index=True).dropna()
        per_module = per_module[per_module.uniqU>MINUSERS]

        # Columns become a two-level index: (field, 'min'/'max'); the
        # group keys come back as (name, '').
        summary = per_module.groupby(['chapter','rel_week'])['index','uniqU'].agg([min,max]).reset_index()

        fig = plt.figure(figsize=[24,16])
        ax1 = fig.add_subplot(1,1,1)

        peak_users = summary[('uniqU','max')]
        heat_map = plt.get_cmap("CMRmap")

        bubbles = ax1.scatter(summary[('index','min')],summary[('rel_week','')],
                              s=0.95*peak_users,
                              c=peak_users,edgecolors='None',cmap=heat_map,alpha=0.95)

        cbar = plt.colorbar(bubbles)
        cbar.set_label('$N$',rotation=90,fontsize=30)
        cbar.ax.tick_params(labelsize=30)

        ### Generalized Plotting functions
        figsavename = self.figpath+'bubble_heat_rel_week_vs_chapter_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   xlabel='Course Structure Index',
                   ylabel='Week Relative to Course Launch',
                   title='Unique Users visiting Chapters each Relative Week (min %d users for display)' % (MINUSERS),
                   tic_size=30,
                   label_size=30,
                   figsavename=figsavename+'.png')

        return None
Пример #8
0
    def resource_use(self,category):
        """
        Plot how many distinct resources of one category each user touched.

        Resources are the index entries of self.caxis whose `category`
        column equals `category`. For every row of self.freq (presumably
        one row per username; the result is indexed by usernames below)
        the non-null cells among those resource columns are counted, and
        the counts are drawn as normalized histograms for non-certified
        and certified users on the same axis.

        Parameters
        ----------
        category : str
            Value of self.caxis.category to select, e.g. 'video', 'problem'.

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'resource_use_' + category + '_dist_' + nickname
        + '.png') to xff.texify. Nothing is plotted when the category has
        no resources.

        Returns
        -------
        None
        """

        resources = self.caxis[self.caxis.category==category].index.unique()
        if len(resources) == 0:
            # One bin per resource is used below; zero bins would make the
            # histogram call raise, so bail out with a notice instead.
            print("No resources found for category: %s" % category)
            return None

        usage = self.freq.filter(resources).count(axis=1)

        bins = len(resources)
        brange = (0,usage.max())
        sgcolors = ['Silver','Crimson']
        sglabels = ['$Non-Certified$','$Certified$']

        fig = plt.figure(figsize=[12,6])
        ax1 = fig.add_subplot(1,1,1)

        for sg in [0,1]:  ### Note, later this could be all four subgroups: explored, certified, etc
            subpop = self.person[self.person.certified==sg].username.dropna().unique()
            # Draw on ax1 explicitly rather than on pyplot's current axes.
            # NOTE(review): `normed` was replaced by `density` in newer
            # matplotlib releases; kept as-is for the version this file targets.
            usage[subpop].hist(ax=ax1,range=brange,bins=bins,
                               cumulative=0,
                               normed=True,alpha=0.85,
                               edgecolor=sgcolors[sg],color=sgcolors[sg])

        # Labels are matched to the histograms by drawing order.
        ax1.legend(sglabels,loc=9,prop={'size':24},frameon=False)

        figsavename = self.figpath+'resource_use_'+category+'_dist_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   xlabel='Unique %s resources' % (category),
                   ylabel='Normalized Count',
                   title=self.nickname,
                   figsavename=figsavename+'.png')

        return None
Пример #9
0
    def activity_distributions(self):
        """
        Creates discussion activity distributions for
        users with > 0 activities (at least one action).

        For each of 'upvotes', 'posts', 'comments' and 'total_disc' the
        natural log of the per-user value is histogrammed (zero values are
        left out), non-certified users first and certified users on top,
        with a log-scaled count axis. The x ticks are relabelled with the
        untransformed values 1 ... 10000.

        Parameters
        ----------
        None

        Output
        ------
        Saves one figure per column to
        self.figpath + 'disc_act_distribution_<col>_<nickname>.png'.

        Returns
        -------
        None
        """

        # Explicit copy: the columns are overwritten with their logs below,
        # and that must happen on an independent frame rather than on a
        # slice of self.pc_plus (avoids pandas' SettingWithCopy warning).
        gtzero = self.pc_plus[self.pc_plus.total_disc > 0].copy()
        for col in ['upvotes', 'posts', 'comments', 'total_disc']:
            # Falsy values (0) map to None so they drop out of the histogram.
            gtzero[col] = gtzero[col].apply(lambda x: np.log(x) if x else None)
            fig = plt.figure(figsize=[12, 7])
            ax1 = fig.add_subplot(1, 1, 1)
            ### Non-Certs (drawn on ax1 explicitly, not on the current axes)
            gtzero[gtzero.certified == 0][col].hist(
                ax=ax1,
                bins=50,
                range=[0, 9],
                log=True,
                color=xff.colors['neutral'],
                edgecolor=xff.colors['neutral'])
            ### Certs
            gtzero[gtzero.certified == 1][col].hist(
                ax=ax1,
                bins=50,
                range=[0, 9],
                log=True,
                color=xff.colors['institute'],
                edgecolor=xff.colors['institute'],
                alpha=0.8)
            # Ticks sit at log(value) but show the plain value.
            tics = [1, 10, 100, 1000, 10000]
            ax1.set_xticks([np.log(x) for x in tics])
            ax1.set_xticklabels(tics)
            ax1.set_xlim([0, 8])
            xff.texify(fig, ax1, xlabel=col.replace('_', ' '), ylabel='Count')

            figsavename = self.figpath + 'disc_act_distribution_' + col + '_' + self.nickname + '.png'
            fig.savefig(figsavename, bbox_inches='tight', dpi=300)

        return None
Пример #10
0
    def activity_distributions(self, columns=None, **kwargs):
        """
        Activity Distributions from person_course collection using Pandas built in hist function.

        There are several columns which can be plotted as distributions:
        features = ['ndays_act', 'nevents', 'nforum_events', 'nforum_posts', 'nplay_video', 'nproblem_check', 'nprogcheck']

        Course Reports focus on nevents, ndays_act, and nchapters

        Requested columns that are not present in self.person are reported
        and skipped. The caller's list is never modified.

        Parameters
        ----------
        columns : list of column names, optional
            Defaults to no columns, in which case nothing is plotted.

        Typical column names - ['ndays_act', 'nevents', 'nforum_events',
                                'nforum_posts', 'nplay_video', 'nproblem_check',
                                'nprogcheck']
        **kwargs : accepted but not read by this method.

        Output
        ------
        One figure per available column, drawn by self.distribution_logic
        and handed to xff.texify with the target path
        self.figpath + 'distribution_<column>_<nickname>.png'.

        Returns
        -------
        None
        """

        requested = [] if columns is None else list(columns)

        # Split into available/missing with a fresh list; removing items
        # from a list while iterating over it skips the following element.
        available = []
        for col in requested:
            if col in self.person:
                available.append(col)
            else:
                print("Some of the specified columns do not exist in the person_course collection.")
                print(col)

        if not available:
            print("No specified columns available.")
            return None

        for column in available:
            ### Plot
            fig = plt.figure(figsize=(12, 7))
            ax1 = fig.add_subplot(1, 1, 1)
            self.distribution_logic(ax1, column, True)

            ### Generalized Plotting functions
            figsavename = self.figpath + 'distribution_' + column.replace(
                ' ', '_') + '_' + self.nickname.replace('.', '_')
            print(figsavename)
            xff.texify(fig, ax1, figsavename=figsavename + '.png')

        return None
Пример #11
0
    def activity_distributions(self,columns=None,**kwargs):
        """
        Activity Distributions from person_course collection using Pandas built in hist function.

        There are several columns which can be plotted as distributions:
        features = ['ndays_act', 'nevents', 'nforum_events', 'nforum_posts', 'nplay_video', 'nproblem_check', 'nprogcheck']

        Course Reports focus on nevents, ndays_act, and nchapters

        Requested columns that are not present in self.person are reported
        and skipped. The caller's list is never modified.

        Parameters
        ----------
        columns : list of column names, optional
            Defaults to no columns, in which case nothing is plotted.

        Typical column names - ['ndays_act', 'nevents', 'nforum_events',
                                'nforum_posts', 'nplay_video', 'nproblem_check',
                                'nprogcheck']
        **kwargs : accepted but not read by this method.

        Output
        ------
        One figure per available column, drawn by self.distribution_logic
        and handed to xff.texify with the target path
        self.figpath + 'distribution_<column>_<nickname>.png'.

        Returns
        -------
        None
        """

        requested = [] if columns is None else list(columns)

        # Split into available/missing with a fresh list; removing items
        # from a list while iterating over it skips the following element.
        available = []
        for col in requested:
            if col in self.person:
                available.append(col)
            else:
                print("Some of the specified columns do not exist in the person_course collection.")
                print(col)

        if not available:
            print("No specified columns available.")
            return None

        for column in available:
            ### Plot
            fig = plt.figure(figsize=(12,7))
            ax1 = fig.add_subplot(1,1,1)
            self.distribution_logic(ax1,column,True)

            ### Generalized Plotting functions
            figsavename = self.figpath+'distribution_'+column.replace(' ','_')+'_'+self.nickname.replace('.','_')
            print(figsavename)
            xff.texify(fig,ax1,figsavename=figsavename+'.png')

        return None
Пример #12
0
    def activity_distributions(self):
        """
        Creates discussion activity distributions for
        users with > 0 activities (at least one action).

        For each of 'upvotes', 'posts', 'comments' and 'total_disc' the
        natural log of the per-user value is histogrammed (zero values are
        left out), non-certified users first and certified users on top,
        with a log-scaled count axis. The x ticks are relabelled with the
        untransformed values 1 ... 10000.

        Parameters
        ----------
        None

        Output
        ------
        Saves one figure per column to
        self.figpath + 'disc_act_distribution_<col>_<nickname>.png'.

        Returns
        -------
        None
        """

        # Explicit copy: the columns are overwritten with their logs below,
        # and that must happen on an independent frame rather than on a
        # slice of self.pc_plus (avoids pandas' SettingWithCopy warning).
        gtzero = self.pc_plus[self.pc_plus.total_disc>0].copy()
        for col in ['upvotes','posts','comments','total_disc']:
            # Falsy values (0) map to None so they drop out of the histogram.
            gtzero[col] = gtzero[col].apply(lambda x: np.log(x) if x else None)
            fig = plt.figure(figsize=[12,7])
            ax1 = fig.add_subplot(1,1,1)
            ### Non-Certs (drawn on ax1 explicitly, not on the current axes)
            gtzero[gtzero.certified==0][col].hist(ax=ax1,bins=50,range=[0,9],log=True,
                                                  color=xff.colors['neutral'],
                                                  edgecolor=xff.colors['neutral'])
            ### Certs
            gtzero[gtzero.certified==1][col].hist(ax=ax1,bins=50,range=[0,9],log=True,
                                                  color=xff.colors['institute'],
                                                  edgecolor=xff.colors['institute'],
                                                  alpha=0.8)
            # Ticks sit at log(value) but show the plain value.
            tics = [1,10,100,1000,10000]
            ax1.set_xticks([np.log(x) for x in tics])
            ax1.set_xticklabels(tics)
            ax1.set_xlim([0,8])
            xff.texify(fig,ax1,xlabel=col.replace('_',' '),ylabel='Count')

            figsavename = self.figpath+'disc_act_distribution_'+col+'_'+self.nickname+'.png'
            fig.savefig(figsavename, bbox_inches='tight', dpi=300)

        return None
Пример #13
0
    def daily_unique_users(self):
        """
        Plot the number of unique active users per day, split by certification.

        For each group (certified == 0, then certified == 1) the events in
        self.potime belonging to that group's usernames are cross-tabulated
        as username x date, and for every date the users with at least one
        event are counted. Both daily series are drawn on one axis, with
        dashed vertical lines at the course start and end dates taken from
        self.cinfo. The x range spans from two weeks before the start date
        to four weeks after the end date.

        Parameters
        ----------
        None

        Output
        ------
        Hands the figure and the target path
        (self.figpath + 'daily_unique_users_' + nickname + '.png') to
        xff.texify.

        Returns
        -------
        None
        """
        fig = plt.figure(figsize=[20,6])
        ax1 = fig.add_subplot(1,1,1)

        # (certified flag, line colour, legend label) in drawing order
        series_style = [(0,'Silver','$Non-Certified$'),
                        (1,'Crimson','$Certified$')]

        for flag, shade, legend_label in series_style:
            in_group = self.person.certified==flag
            names = self.person[in_group].username.dropna().unique()
            group_events = self.potime[self.potime.username.isin(names)]
            # Rows: usernames of this group; columns: dates; cells: event counts.
            per_user_day = pd.crosstab(group_events.username,self.potime.date)
            active_users = per_user_day[per_user_day>0].count().sort_index()
            active_users.index = [np.datetime64(day) for day in active_users.index]
            active_users.plot(ax=ax1,style="-o",ms=6,lw=2,color=shade,rot=0,label=legend_label)

        two_weeks = np.timedelta64(2,'W')
        four_weeks = np.timedelta64(4,'W')
        xmin = (self.cinfo['start_date'] - two_weeks).item().date()
        xmax = (self.cinfo['end_date'] + four_weeks).item().date()

        ax1.set_xlim(xmin,xmax)
        ax1 = xff.timeseries_plot_formatter(ax1, interval=1)
        ax1.legend(loc=1,prop={'size':24},frameon=False)

        # Course start/end markers span the full current y range.
        top = ax1.get_ylim()[1]
        course_bounds = [self.cinfo.start_date.item().date(),self.cinfo.end_date.item().date()]
        ax1.vlines(course_bounds, 0, top, colors='Gray', lw=1.5, linestyles='--')
        ax1.set_ylim(0,top)

        figsavename = self.figpath+'daily_unique_users_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   ylabel='Unique Users',
                   tic_size=20,
                   label_size=24,
                   datefontsize=20,
                   title=self.nickname,
                   figsavename=figsavename+'.png')

        return None
Пример #14
0
    def grade_distribution(self, **kwargs):
        """
        Simple grade distribution.

        Histograms the `grade` column of self.person for rows with
        grade >= 0.1, non-certified users first and certified users on
        top, using 50 bins over [0, 1]. Returns early with a notice when
        fewer than 50 grades are available.

        Parameters
        ----------
        NVD3 : bool, optional (keyword only, default False)
            If true, additionally writes an interactive nvd3 bar chart of
            the same two histograms as an HTML file.

        Output
        ------
        Hands the static figure and the target path
        (self.figpath + 'grade_distribution_' + nickname + '.png') to
        xff.texify. With NVD3, writes
        self.figpath + 'interactive_grade_distribution_' + nickname + '.html'.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3', False)

        if self.person.grade.count() < 50:
            print("Not enough grade information. Return without grade distribution.")
            return None

        bins = 50
        hmin = 0.0
        hmax = 1.0
        glowbound = 0.1  # grades below this are excluded from the plot

        graded = self.person[self.person.grade >= glowbound]
        noncert_grades = graded[graded.certified == 0]['grade']
        cert_grades = graded[graded.certified == 1]['grade']

        ### Plot
        fig = plt.figure(figsize=(12, 7))
        ax1 = fig.add_subplot(1, 1, 1)

        noncert_grades.hist(ax=ax1,
                            bins=bins,
                            range=(hmin, hmax),
                            log=False,
                            cumulative=0,
                            color=xff.colors['neutral'],
                            edgecolor=xff.colors['neutral'])

        cert_grades.hist(ax=ax1,
                         bins=bins,
                         range=(hmin, hmax),
                         log=False,
                         cumulative=0,
                         color=xff.colors['institute'],
                         edgecolor=xff.colors['institute'],
                         alpha=0.8)

        ax1.set_xlim(0, hmax)
        # Only the lower y limit is fixed; the upper one stays automatic.
        ax1.set_ylim(1, )
        # Labels are matched to the histograms by drawing order.
        ax1.legend(['$Non-Certified$', '$Certified$'],
                   loc=1,
                   prop={'size': 24},
                   frameon=False)
        ax1.set_xticklabels([r'$%.1f$' % x for x in ax1.get_xticks()],
                            fontsize=30)
        ax1.set_yticklabels([r'$%d$' % y for y in ax1.get_yticks()],
                            fontsize=30)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'grade_distribution_' + self.nickname.replace(
            '.', '_')
        xff.texify(fig,
                   ax1,
                   xlabel='Grade (> %.2f)' % (glowbound),
                   ylabel='Count',
                   gridb='y',
                   figsavename=figsavename + '.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            from nvd3 import multiBarChart

            ### Data: counts per bin (Y) and bin edges (X, one longer than Y)
            Y1, X1 = np.histogram(noncert_grades.values,
                                  bins=bins,
                                  range=(hmin, hmax))
            Y2, X2 = np.histogram(cert_grades.values,
                                  bins=bins,
                                  range=(hmin, hmax))

            ### Output File
            figsavename = self.figpath + 'interactive_grade_distribution_' + self.nickname + '.html'
            print(figsavename)

            title = "Grade Distribution: %s" % self._xdata.course_id
            chart = multiBarChart(name=title, height=400)
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            # Series are named after the groups they show, matching the
            # legend of the static figure.
            extra_serie = {
                "tooltip": {
                    "y_start": "",
                    "y_end": ""
                },
                "color": xff.colors['neutral']
            }
            chart.add_serie(name="Non-Certified", y=Y1, x=X1, extra=extra_serie)
            extra_serie = {
                "tooltip": {
                    "y_start": "",
                    "y_end": ""
                },
                "color": xff.colors['institute']
            }
            chart.add_serie(name="Certified", y=Y2, x=X2, extra=extra_serie)

            ### Final Output
            # Build the chart first and open the file last, inside a context
            # manager, so a failure cannot leak the handle or leave an
            # empty HTML file behind.
            chart.buildhtml()
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #15
0
    def country_of_origin(self, **kwargs):
        """
        Creates figures for the top "ccnum" of enrolled countries.
       
        Parameters (generated during class initialization)
        ----------
        ccnum = number of requested countries to be plotted. Max 25 for plotting issues.
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3', False)

        ccnum = kwargs.get('ccnum', 10)

        cc = self.person.final_cc.value_counts().order(ascending=False)
        if self.person[self.person.certified ==
                       1].username.count() > self.mincerts:
            certs = self.person[self.person.certified ==
                                1].final_cc.value_counts()
        else:
            certs = pd.Series(index=cc.index)

        cc = pd.concat([cc, certs],
                       join='inner',
                       axis=1,
                       keys=['$Non-Certified$', '$Certified$'])
        cc = cc.sort('$Non-Certified$', ascending=False)[0:ccnum]
        perc = 100. * cc / cc.sum()
        perc = perc.apply(lambda x: np.round(x, 1))
        #print perc

        fig = plt.figure(figsize=(12, 6))
        ax1 = fig.add_subplot(1, 1, 1)
        perc.plot(
            ax=ax1,
            kind='bar',
            color=[xff.colors['neutral'], xff.colors['institute']],
            rot=40,
        )

        ### Plot Details
        ax1.set_xticklabels([r'$%s$' % x for x in perc.index])
        ax1.set_yticklabels(
            [r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],
            fontsize=30)
        ax1.legend(loc=1, prop={'size': 28}, frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'country_geoloc_distribution_' + self.nickname.replace(
            '.', '_')
        print figsavename
        xff.texify(fig,
                   ax1,
                   xlabel='Country Code',
                   ylabel=None,
                   figsavename=figsavename + '.png')

        ### Output JSON Records
        #cc.name = 'value'
        #cc = cc.reset_index().rename(columns={'index':'label'})
        #cc.dropna().to_json(figsavename+'.json',orient='records')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath + 'interactive_country_distribution_' + self.nickname + '.html'
            output_file = open(figsavename, 'w')
            print figsavename

            title = "Education Level Distribution: %s" % self._xd.course_id
            charttype = 'multiBarChart'
            chart = multiBarChart(name=charttype,
                                  height=350,
                                  x_axis_format="",
                                  y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")
            nb_element = len(perc)
            X = perc.index  #list(range(nb_element))
            Y1 = perc.ix[:, '$Non-Certified$'].values
            Y2 = perc.ix[:, '$Certified$'].values

            ### Series 1
            extra_serie1 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['neutral'],
                "format": ".1f"
            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['institute'],
                "format": ".1f"
            }
            chart.add_serie(name="Certificate Earners",
                            y=Y2,
                            x=X,
                            extra=extra_serie2)

            ### Final Output
            chart.buildhtml()
            output_file.write(chart.htmlcontent)

            #---------------------------------------

            #close Html file
            output_file.close()

        return None
Пример #16
0
    def gender(self,**kwargs):
        """
        Creates gender distribution figures.

        Gender percentages are computed over the retained registrants
        (column '$Non-Certified$') and, when more than self.mincerts of them
        are certified, over the certificate earners (column '$Certified$').
        Only the '$Female$' and '$Male$' rows are drawn.

        Parameters (read from **kwargs)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories: a static PNG bar chart and,
        with NVD3, an interactive bar chart and an interactive pie chart (HTML).

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3',False)

        ### Removes those users not having the option to fill in edX registration data.
        trim_data = self.person[(self.person.registered==1) & (self.person.user_id>156633)]

        ### Data
        gdict =  {'f': "$Female$",'m': "$Male$",'o':"$Other$"}
        glist = ['$Female$','$Male$']

        ### Munge and Plot
        gender = trim_data.gender.dropna().apply(lambda x: gdict[x]).value_counts()
        certs = trim_data[trim_data.certified==1]
        if certs.username.count() > self.mincerts:
            certs = certs.gender.dropna().apply(lambda x: gdict[x]).value_counts()
        else:
            ### Too few certificate earners: keep an empty (all-NaN) column instead.
            certs = pd.Series(index=gender.index)

        gender = pd.concat([gender,certs],join='inner',axis=1,keys=['$Non-Certified$','$Certified$'])
        ### Convert counts to column-wise percentages, one decimal place.
        gender = 100.*gender/gender.sum()
        gender = gender.apply(lambda x: np.round(x,1))

        fig = plt.figure(figsize=(12,6))
        ax1 = fig.add_subplot(1,1,1)
        gender.ix[glist,:].plot(ax=ax1,kind='bar',color=[xff.colors['neutral'],xff.colors['institute']],rot=0)

        ### Plot Details
        ax1.set_xticklabels([r'%s' % x for x in glist])
        ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],fontsize=30)
        ax1.legend(loc=2,prop={'size':28},frameon=False)

        ### Generalized Plotting functions
        ### NOTE(review): the y axis shows percentages but is labelled 'Count' - confirm the intended label.
        figsavename = self.figpath+'gender_distribution_'+self.nickname.replace('.','_')
        print(figsavename)
        xff.texify(fig,ax1,xlabel=None,ylabel='Count',figsavename=figsavename+'.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/ (pie example: http://nvd3.org/examples/pie.html)
        if NVD3:

            ### Labels and values must follow the same (glist) order. Taking the
            ### labels from gender.index mislabels the series whenever that index
            ### is ordered differently or also contains '$Other$'.
            X = [x.replace('$','') for x in glist]
            Y1 = gender.ix[glist,'$Non-Certified$'].values
            Y2 = gender.ix[glist,'$Certified$'].values

            #----------------------------------------------------------------
            ### BAR Chart
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath+'interactive_gender_distribution_'+self.nickname+'.html'
            print(figsavename)

            title = "Gender Distribution: %s" % self._xd.course_id
            charttype = 'multiBarChart'
            chart = multiBarChart(name=charttype, height=350, x_axis_format="", y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            ### Series 1
            extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['neutral'],
                            "format":".1f"
                            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['institute'],
                            "format":".1f"
                            }
            chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

            ### Final Output (the with-block closes the file even if the write fails)
            chart.buildhtml()
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

            #----------------------------------------------------------------
            ### Pie Chart
            from nvd3 import pieChart

            ### Output File
            figsavename = self.figpath+'interactive_gender_piechart_'+self.nickname+'.html'
            print(figsavename)

            title = "Gender Pie Chart: %s" % self._xd.course_id
            ### NOTE(review): the chart name is still 'multiBarChart' and the tooltip
            ### says ' certified' although Y1 is the '$Non-Certified$' column -
            ### presumably copy-paste leftovers; confirm before changing.
            charttype = 'multiBarChart'
            chart = pieChart(name=charttype, color_category='category20c', height=400, width=400)
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            extra_serie = {"tooltip": {"y_start": "", "y_end": " certified"}}

            chart.add_serie(y=Y1, x=X, extra=extra_serie)

            ### Final Output
            chart.buildhtml()
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #17
0
    def bubble_heat_rel_week_vs_chapter(self, **kwargs):
        """
        Bubble Heat - Unique users relative week versus chapter accessed
        Color and bubble size indicate population size.

        Unique usernames are counted per (rel_week, module_id) in self.potime,
        joined to self.caxis for the course-structure 'index' and 'chapter',
        and then summarised per (chapter, rel_week).

        Parameters (read from **kwargs)
        ----------
        min_users = 20.  A (rel_week, module_id) pair is kept only when it has
                         strictly more unique users than this value.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        min_users = kwargs.get('min_users', 20)

        ### Data: unique users per (relative week, module)
        data = self.potime.groupby(
            ['rel_week',
             'module_id']).username.apply(lambda x: len(x.unique()))
        data = data.reset_index().rename(columns={0: 'uniqU'})

        ### Attach structure index and chapter; rows with any missing value
        ### (e.g. modules absent from self.caxis) are dropped.
        data = pd.merge(data,
                        self.caxis[['index', 'chapter']],
                        how='left',
                        left_on='module_id',
                        right_index=True).dropna()

        data = data[data.uniqU > min_users]

        ### Per (chapter, week): min/max of the structure index and of the user count.
        W = data.groupby(['chapter',
                          'rel_week'])['index',
                                       'uniqU'].agg([min, max]).reset_index()

        fig = plt.figure(figsize=[24, 16])
        ax1 = fig.add_subplot(1, 1, 1)

        ### The largest user count in the group drives both bubble size and colour.
        peak_users = W[('uniqU', 'max')]
        cmhot = plt.get_cmap("CMRmap")

        sc = ax1.scatter(W[('index', 'min')],
                         W[('rel_week', '')],
                         s=0.95 * peak_users,
                         c=peak_users,
                         edgecolors='None',
                         cmap=cmhot,
                         alpha=0.95)

        cbar = plt.colorbar(sc)
        cbar.set_label('$N$', rotation=90, fontsize=30)
        cbar.ax.tick_params(labelsize=30)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'bubble_heat_rel_week_vs_chapter_' + self.nickname.replace(
            '.', '_')
        xff.texify(
            fig,
            ax1,
            xlabel='Course Structure Index',
            ylabel='Week Relative to Course Launch',
            title=
            'Unique Users visiting Chapters each Relative Week (min %d users for display)'
            % (min_users),
            tic_size=30,
            label_size=30,
            figsavename=figsavename + '.png')

        return None
Пример #18
0
    def time_on_course(self):
        """
        Histogram of the time spent in course, split by certification.

        Reads the per-user totals in self.time_spent (computed elsewhere) and
        plots their natural log for non-certified and certified users on one
        axis. The original notes describe the estimate as follows: click
        events are grouped by username and time sorted, and time differences
        >= 10 sec and <= 3600 sec are summed; only users with more than one
        event are estimated. Justification for those cutoffs:
        http://cacm.acm.org/magazines/2014/4/173221-who-does-what-in-a-massive-open-online-course/fulltext
        Submitted Paper: "eText Use in Blended Introductory Physics Courses: Interpreting Meaningful Interactions and the Effects of Course Structure"

        Parameters (generated during class initialization)
        ----------
        None

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        fig = plt.figure(figsize=[12, 6])
        ax1 = fig.add_subplot(1, 1, 1)

        ### (certified flag, bar color, legend label); non-certified is drawn first.
        cohorts = [
            (0, 'Silver', "$Non-Certified$"),
            (1, 'Crimson', "$Certified$"),
        ]
        for flag, shade, legend_label in cohorts:
            usernames = self.person[self.person.certified == flag].username.dropna().unique()
            log_time = self.time_spent[usernames].apply(np.log)
            log_time.hist(ax=ax1,
                          bins=100,
                          range=[0, 16],
                          color=shade,
                          edgecolor=None,
                          alpha=0.9,
                          label=legend_label)

        ### Tick positions are log(value); the labels presume the values are seconds.
        tick_marks = [
            (1, '1 sec'),
            (10, '10 sec'),
            (60, '1 min'),
            (600, '10 min'),
            (3600, '1 hr'),
            (10 * 3600, '10 hrs'),
            (100 * 3600, '100 hrs'),
        ]
        ax1.set_xticks([np.log(value) for value, _ in tick_marks])
        ax1.set_xticklabels([text for _, text in tick_marks], rotation=40)
        ax1.legend()

        ### Generalized Plotting functions
        safe_nickname = self.nickname.replace('.', '_')
        figsavename = self.figpath + 'time_in_course_' + safe_nickname
        xff.texify(fig,
                   ax1,
                   xlabel='Total Time In Course',
                   ylabel='Count',
                   tic_size=24,
                   label_size=24,
                   gridb='y',
                   figsavename=figsavename + '.png')

        return None
Пример #19
0
    def grade_distribution(self,**kwargs):
        '''
        Simple grade distribution.

        Overlays histograms of self.person.grade (50 bins on [0, 1], grades
        >= 0.1 only) for non-certified and certified users. Nothing is
        produced when fewer than 50 grades are available.

        Parameters (read from **kwargs)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        '''

        NVD3 = kwargs.get('NVD3',False)

        if self.person.grade.count() < 50:
            print("Not enough grade information. Return without grade distribution.")
            return None

        bins = 50
        hmin = 0.0 #self.person['grade'].min()
        hmax = 1.0 #self.person['grade'].max()
        glowbound = 0.1

        ### Users at or above the lower grade bound; shared by both outputs.
        graded = self.person[self.person.grade>=glowbound]

        ### Plot
        fig = plt.figure(figsize=(12,7))
        ax1 = fig.add_subplot(1,1,1)

        graded[graded.certified==0]['grade'].hist(ax=ax1,bins=bins,range=(hmin,hmax),log=False,cumulative=0,
                                                  color=xff.colors['neutral'],edgecolor=xff.colors['neutral'])

        graded[graded.certified==1]['grade'].hist(ax=ax1,bins=bins,range=(hmin,hmax),log=False,cumulative=0,
                                                  color=xff.colors['institute'],edgecolor=xff.colors['institute'],alpha=0.8)

        ax1.set_xlim(0,hmax)
        ax1.set_ylim(1,)
        ax1.legend(['$Non-Certified$','$Certified$'],loc=1,prop={'size':24},frameon=False)
        ax1.set_xticklabels([r'$%.1f$' % x for x in ax1.get_xticks()],fontsize=30)
        ax1.set_yticklabels([r'$%d$' % y for y in ax1.get_yticks()],fontsize=30)

        ### Generalized Plotting functions
        figsavename = self.figpath+'grade_distribution_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   xlabel='Grade (> %.2f)' % (glowbound),
                   ylabel='Count',
                   gridb='y',
                   figsavename=figsavename+'.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:

            ### Data: np.histogram returns bins+1 edges for bins counts.
            Y1,X1 = np.histogram(graded[graded.certified==0].grade.values,bins=bins,range=(hmin,hmax))
            Y2,X2 = np.histogram(graded[graded.certified==1].grade.values,bins=bins,range=(hmin,hmax))

            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath+'interactive_grade_distribution_'+self.nickname+'.html'
            print(figsavename)

            ### NOTE(review): other methods read the course id from self._xd; confirm
            ### that self._xdata is the right attribute here.
            title = "Grade Distribution: %s" % self._xdata.course_id
            chart = multiBarChart(name=title, height=400)
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            ### Series are named after the groups they hold (previously "Count" /
            ### "Duration"); x is the left edge of each bin so x and y have equal length.
            extra_serie = {"tooltip": {"y_start": "", "y_end": ""},
                           "color":xff.colors['neutral']
                           }
            chart.add_serie(name="Non-Certified", y=Y1, x=X1[:-1], extra=extra_serie)
            extra_serie = {"tooltip": {"y_start": "", "y_end": ""},
                           "color":xff.colors['institute']
                           }
            chart.add_serie(name="Certified", y=Y2, x=X2[:-1], extra=extra_serie)

            ### Final Output (the with-block closes the file even if the write fails)
            chart.buildhtml()
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #20
0
    def last_day_plots(self, **kwargs):
        """
        Plots every column of self.lastact_df as a time series, one figure
        per column.

        The shared x-range runs from 21 days before the first index entry to
        21 days after the last index entry at which the first column exceeds
        100 (an IndexError is raised when no entry does). A JSON-style record
        is also assembled per column, but writing it out is currently disabled.

        Parameters
        ----------
        None

        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        """

        ### Date window, taken from the busy part of the first column
        first_col = self.lastact_df.columns[0]
        busy = self.lastact_df[first_col][self.lastact_df[first_col] > 100]
        margin = timedelta(days=21)
        mindate = busy.index[0] - margin
        maxdate = busy.index[-1] + margin

        epoch_start = datetime(1970, 1, 1)

        def date2epoch(day):
            ### Milliseconds between 1970-01-01 and midnight of the given day.
            midnight = datetime.combine(day, datetime.min.time())
            return int((midnight - epoch_start).total_seconds() * 1000)

        ### For JSON Data Output
        jsondata = []

        for colname in self.lastact_df.columns:
            series = self.lastact_df[colname]

            fig = plt.figure(figsize=(12, 6))
            ax1 = fig.add_subplot(1, 1, 1)
            series.plot(ax=ax1,
                        color=xff.colors['institute'],
                        rot=0,
                        lw=3,
                        label=self.nickname)

            ax1.set_xlim(mindate, maxdate)
            ax1 = xff.timeseries_plot_formatter(ax1)
            ax1.set_yticklabels([r'$%d$' % tick for tick in ax1.get_yticks()])

            ### Generalized Plotting functions
            figsavename = (self.figpath + colname.replace(' ', '_') + '_' +
                           self.nickname.replace('.', '_'))
            print(figsavename)
            xff.texify(fig,
                       ax1,
                       ylabel=colname,
                       datefontsize=20,
                       gridb='y',
                       figsavename=figsavename + '.png')

            ### Append JSON Data (key order: key, optional bar, values)
            points = [[date2epoch(day), int(count)]
                      for day, count in series.iteritems()]
            record = collections.OrderedDict([('key', colname)])
            if colname == 'Last Activity Count':
                record['bar'] = 'true'
            record['values'] = points
            jsondata.append(record)

        print("JSON dump currently commented out.")
        # str_jsondata = 'var data = '+json.dumps(jsondata)
        # with open(self.figpath+'lastday.json', 'w') as outfile:
        #     outfile.write(str_jsondata)

        return None
Пример #21
0
    def andrew_ho_diagram(self, **kwargs):
        """
        Plot showing the intersection of enrollment populations by categories
        defined in the 2013 (published 2014) course reports.
        http://odl.mit.edu/mitx-working-papers/

        Side effect: adds the 0/1 columns 'Only Registered', 'Only Viewed' and
        'Only Explored' to self.person, marking the disjoint groups. Counts and
        percentages are relative to the number of rows with registered == 1;
        when there are none, a message is printed and no figure is produced.

        Parameters
        ----------
        None

        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        """

        ### Registration Types
        for flag_col in ('Only Registered', 'Only Viewed', 'Only Explored'):
            self.person[flag_col] = 0

        g = self.person  ### Short alias; g and self.person are the same object.

        ### Create disjoint groups
        #Only Registered:  A - (B+C+D)
        reg_list = g[(g['registered'] == 1) & (g['viewed'] == 0) &
                     (g['explored'] == 0) & (g['certified'] == 0)].user_id
        self.person.ix[self.person[self.person.user_id.isin(reg_list)].index,
                       'Only Registered'] = 1

        #Only Viewed:  B - (C+D)
        view_list = g[(g.user_id.isin(reg_list) == False) & (g['viewed'] == 1)
                      & (g['explored'] == 0) & (g['certified'] == 0)].user_id
        self.person.ix[self.person[self.person.user_id.isin(view_list)].index,
                       'Only Viewed'] = 1

        #Only Explored:  C - (D)
        exp_list = g[(g.user_id.isin(reg_list) == False)
                     & (g.user_id.isin(view_list) == False) &
                     (g['explored'] == 1) & (g['certified'] == 0)].user_id
        self.person.ix[self.person[self.person.user_id.isin(exp_list)].index,
                       'Only Explored'] = 1

        ### Group sizes, computed once and reused for circles and labels
        n_registered = len(self.person[self.person['registered'] == 1])
        if n_registered == 0:
            ### Every percentage below divides by this count.
            print("No registered users. Return without enrollment diagram.")
            return None

        n_only_registered = len(self.person[self.person['Only Registered'] == 1])
        n_only_viewed = len(self.person[self.person['Only Viewed'] == 1])
        n_only_explored = len(self.person[self.person['Only Explored'] == 1])
        n_certified = len(self.person[self.person['certified'] == 1])

        ### Figure
        fig = plt.figure(figsize=(12, 8))
        ax1 = fig.add_subplot(111)

        #Circles: marker sizes scale with the share of registrants
        SCALE = 1000
        expratio = SCALE * 100. * n_only_explored / n_registered
        ### NOTE(review): sizes above 17000 are reset to 1000 rather than capped
        ### at 17000 - confirm this is intended.
        if expratio > 17000:
            expratio = 1000

        certratio = SCALE * 100. * n_certified / n_registered
        ### Keep the certified circle from becoming smaller than 1000.
        if certratio < 1000:
            certratio = 1000

        ax1.scatter([0.45], [0.5],
                    s=expratio,
                    edgecolor='Black',
                    lw=2,
                    color='white',
                    alpha=0.8)
        ax1.scatter([0.50], [0.5],
                    s=certratio,
                    edgecolor=xff.colors['institute'],
                    lw=2,
                    color='white',
                    alpha=0.8)

        #Rectangles
        rect1 = matplotlib.patches.Rectangle((0, 0),
                                             1,
                                             1,
                                             fill=False,
                                             fc='white',
                                             ec='black',
                                             lw=2)
        rect2 = matplotlib.patches.Rectangle((0.15, 0.15),
                                             0.7,
                                             0.7,
                                             fill=False,
                                             fc='white',
                                             ec='black',
                                             lw=2)
        ax1.add_patch(rect1)
        ax1.add_patch(rect2)

        ### Labels: (x, y, template, count). Raw strings keep the TeX backslashes
        ### literal; the line breaks are real newlines joined in separately.
        FS = 25
        labels = [
            (0.05, 0.925, r'$Only\ registered:\ %d\ \ (%.1f\%%)$',
             n_only_registered),
            (0.2, 0.775, r'$Only\ viewed:\ %d\ \ (%.1f\%%)$', n_only_viewed),
            (0.175, 0.2,
             r'$Only\ explored:$' + '\n' + r'$%d\ \ (%.1f\%%)$',
             n_only_explored),
            (0.625, 0.35, '$Certified:$\n' + r'$%d\ \ (%.1f\%%)$',
             n_certified),
        ]
        for xpos, ypos, template, count in labels:
            share = 100.0 * count / n_registered
            ax1.text(xpos,
                     ypos,
                     template % (count, share),
                     fontsize=FS,
                     ha='left')

        ax1.set_xlim([-0.01, 1.01])
        ax1.set_ylim([-0.01, 1.01])

        ## suppress spines
        for spine in ax1.spines.values():
            spine.set_color('none')

        ax1.axes.get_xaxis().set_visible(False)
        ax1.axes.get_yaxis().set_visible(False)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'AHO_Diagram' + '_' + self.nickname.replace(
            '.', '_')
        xff.texify(fig, ax1, figsavename=figsavename + '.png')

        return None
Пример #22
0
    def content_touches_viz(self,horiz_w_data,certified):
        '''
        Two stacked bar panels sharing one x-range: the 'users' column on top
        and the 'scale' column underneath, with every bar colored from the
        'color' column.

        Parameters
        ----------
        horiz_w_data: table providing the 'order', 'users', 'scale' and 'color'
                      columns; colors are looked up by bar position (0, 1, ...),
                      which is where the table's indexing matters.
        certified: when equal to True, 'certified_' is inserted into the file name.

        Output
        ------
        Saves the figure as a PNG under self.figpath.

        Returns
        -------
        None
        '''
        fig = plt.figure(figsize=(32,12))
        fig.subplots_adjust(hspace=.1)
        ax1 = fig.add_subplot(2,1,1)
        ax2 = fig.add_subplot(2,1,2)

        ### Upper panel: users per structure position
        user_bars = ax1.bar(horiz_w_data['order'],horiz_w_data['users'],3.0,alpha=0.8,edgecolor='none')
        for pos,bar in enumerate(user_bars):
            shade = horiz_w_data.color[pos]
            ### 'Pink' entries are drawn invisible.
            if shade == 'Pink':
                shade = 'none'
            bar.set_facecolor(shade)
            bar.set_edgecolor(shade)

        ax1.set_xlabel('Course Structure Index')
        ax1.set_ylabel('Unique Users')

        ### Lower panel: structure strip drawn from the 'scale' column
        strip_bars = ax2.bar(horiz_w_data['order'],horiz_w_data.scale,2.5,edgecolor='none')
        for pos,bar in enumerate(strip_bars):
            bar.set_facecolor(horiz_w_data.color[pos])

        ### The strip hangs downwards and carries no ticks or frame.
        ax2.invert_yaxis()
        ax2.axes.get_xaxis().set_ticks([])
        ax2.axes.get_yaxis().set_ticks([])
        ax2.set_ylim(7,-1)
        ax2.axis('off')

        #!!!!! x limits must be set together
        strip_xlim = ax2.get_xlim()
        ax1.set_xlim(strip_xlim[0],strip_xlim[1])

        xff.texify(fig,ax1,tic_size=32,label_size=32)

        stem = 'content_touches_horizontal_'
        if certified==True:
            stem = stem+'certified_'
        figsavename = self.figpath+stem+self.nickname.replace('.','_')+'.png'

        fig.savefig(figsavename, bbox_inches='tight', dpi=300)

        return None
Пример #23
0
    def country_of_origin(self,**kwargs):
        """
        Creates figures for the top "ccnum" of enrolled countries.

        Country counts come from self.person.final_cc. The '$Non-Certified$'
        column covers every row of self.person; the '$Certified$' column is
        filled only when more than self.mincerts users are certified. Values
        are percentages within the plotted countries, rounded to one decimal.

        Parameters (read from **kwargs)
        ----------
        ccnum = 10.  Number of requested countries to be plotted. Max 25 for plotting issues.
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3',False)

        ccnum = kwargs.get('ccnum',10)

        cc = self.person.final_cc.value_counts().order(ascending=False)
        if self.person[self.person.certified==1].username.count() > self.mincerts:
            certs = self.person[self.person.certified==1].final_cc.value_counts()
        else:
            ### Too few certificate earners: keep an empty (all-NaN) column instead.
            certs = pd.Series(index=cc.index)

        cc = pd.concat([cc,certs],join='inner',axis=1,keys=['$Non-Certified$','$Certified$'])
        cc = cc.sort('$Non-Certified$',ascending=False)[0:ccnum]
        perc = 100.*cc/cc.sum()
        perc = perc.apply(lambda x: np.round(x,1))

        fig = plt.figure(figsize=(12,6))
        ax1 = fig.add_subplot(1,1,1)
        perc.plot(ax=ax1,kind='bar',color=[xff.colors['neutral'],xff.colors['institute']],rot=40,)

        ### Plot Details
        ax1.set_xticklabels([r'$%s$' % x for x in perc.index])
        ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],fontsize=30)
        ax1.legend(loc=1,prop={'size':28},frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath+'country_geoloc_distribution_'+self.nickname.replace('.','_')
        print(figsavename)
        xff.texify(fig,ax1,xlabel='Country Code',ylabel=None,figsavename=figsavename+'.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath+'interactive_country_distribution_'+self.nickname+'.html'
            print(figsavename)

            ### The heading previously read "Education Level Distribution".
            title = "Country Distribution: %s" % self._xd.course_id
            charttype = 'multiBarChart'
            chart = multiBarChart(name=charttype, height=350, x_axis_format="", y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")
            X = perc.index
            Y1 = perc.ix[:,'$Non-Certified$'].values
            Y2 = perc.ix[:,'$Certified$'].values

            ### Series 1
            extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['neutral'],
                            "format":".1f"
                            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['institute'],
                            "format":".1f"
                            }
            chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

            ### Final Output (the with-block closes the file even if the write fails)
            chart.buildhtml()
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #24
0
    def level_of_education(self,**kwargs):
        """
        Plot Level of Education Attained; typically taken from the edX
        enrollment questionnaire.

        Rows of self.person with registered==1 and user_id>156633 are binned
        into five education levels via their LoE code. The percentage of users
        in each level is plotted for that whole population next to the
        percentage among its certificate earners. The certified series is left
        empty (NaN) unless the number of certified users exceeds self.mincerts.

        Parameters (generated during class initialization)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3',False)

        ### Level of Education (LoE)
        ### Data
        eddict = {'el': "Less\ than$\n$ Secondary",'jhs': "Less\ than$\n$ Secondary",'none':"Less\ than$\n$ Secondary",
                  'hs':"Secondary",'a':"Secondary",
                  'b':"Bachelor\'s",
                  'm': "Master\'s",
                  'p_se': "Doctorate",'p_oth': "Doctorate",'p': "Doctorate",
                  'other': None,'NA':None,'nan':None,
                  }

        edlist = ["Less\ than$\n$ Secondary","Secondary","Bachelor\'s","Master\'s","Doctorate"]
        trim_data = self.person[(self.person.registered==1) & (self.person.user_id>156633)]

        def _level_counts(frame):
            # Translate raw LoE codes into display labels; unknown codes become
            # None and are therefore ignored by value_counts().
            labels = frame.LoE.apply(lambda x: eddict[str(x)] if x in eddict.keys() else None)
            # reindex (rather than [edlist]) keeps the five levels in display
            # order and yields NaN, not a KeyError, for a level nobody selected.
            return labels.value_counts().reindex(edlist)

        edlevels = _level_counts(trim_data)
        certified = trim_data[trim_data.certified==1]
        if certified.username.count() > self.mincerts:
            certs = _level_counts(certified)
        else:
            # Too few certificate earners: keep the column but leave it empty.
            certs = pd.Series(index=edlevels.index)

        edlevels = pd.concat([edlevels,certs],join='inner',axis=1,keys=['$Non-Certified$','$Certified$'])
        # Convert counts to per-column percentages, rounded to one decimal.
        edlevels = 100.*edlevels/edlevels.sum()
        edlevels = edlevels.apply(lambda x: np.round(x,1))

        #Plot
        fig = plt.figure(figsize=(12,6))
        ax1 = fig.add_subplot(1,1,1)

        edlevels.plot(ax=ax1,kind='bar',color=[xff.colors['neutral'],xff.colors['institute']],rot=40)

        ### Plot Details
        ax1.set_xticklabels([r'$%s$' % x for x in edlist])
        ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],fontsize=30)
        ax1.legend(loc=2,prop={'size':22},frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath+'loe_distribution_'+self.nickname.replace('.','_')
        print(figsavename)
        xff.texify(fig,ax1,xlabel=None,ylabel=None,figsavename=figsavename+'.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath+'interactive_edlevel_distribution_'+self.nickname+'.html'
            print(figsavename)

            title = "Education Level Distribution: %s" % self._xd.course_id
            chart = multiBarChart(name='multiBarChart', height=350, x_axis_format="", y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")
            # Strip the LaTeX spacing/newline markup used for the static figure.
            X = [ x.replace('\ ',' ').replace('$\n$',' ') for x in edlevels.index ]
            Y1 = edlevels['$Non-Certified$'].values
            Y2 = edlevels['$Certified$'].values

            ### Series 1
            extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['neutral'],
                            "format":".1f"
                            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['institute'],
                            "format":".1f"
                            }
            chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

            ### Final Output
            chart.buildhtml()
            # Open the file only once the chart is built, and let the context
            # manager close it even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #25
0
    def age(self,**kwargs):
        """
        Creates age distribution figures.

        Age is derived as (current year - YoB) for rows of self.person with
        registered==1 and user_id>156633, then binned into nine ten-year
        buckets covering 0-89. Percentages per bucket are plotted for that
        whole population next to its certificate earners. The certified series
        is left empty (NaN) unless the number of certified users exceeds
        self.mincerts.

        Parameters (generated during class initialization)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3',False)

        ### Removes those users not having the option to fill in edX registration data.
        # .copy() so the new 'age' column is written to an independent frame
        # and never to (a view of) self.person.
        trim_data = self.person[(self.person.registered==1) & (self.person.user_id>156633)].copy()
        # Add age column from year_of_birth
        # NOTE(review): only values that are instances of int get an age; if YoB
        # is stored as float every age becomes None -- confirm the column dtype.
        this_year = datetime.datetime.now().year
        trim_data['age'] = trim_data['YoB'].apply(lambda x: this_year - x if isinstance(x,int) else None)

        bin_labels = ['0-9','10-19','20-29','30-39','40-49','50-59','60-69','70-79','80-89']

        def _age_histogram(ages):
            # Count the non-missing ages per ten-year bucket.
            counts,_ = np.histogram(ages.dropna().values,bins=len(bin_labels),range=(0,90))
            return pd.Series(data=counts,index=bin_labels)

        age = _age_histogram(trim_data['age'])

        certs = trim_data[trim_data.certified==1]
        if certs.username.count() > self.mincerts:
            certs = _age_histogram(certs['age'])
        else:
            # Too few certificate earners: keep the column but leave it empty.
            certs = pd.Series(index=age.index)

        age = pd.concat([age,certs],join='inner',axis=1,keys=['$Non-Certified$','$Certified$'])
        # Convert counts to per-column percentages, rounded to one decimal.
        age = 100.*age/age.sum()
        age = age.apply(lambda x: np.round(x,1))

        #----------------------------------------------------------------
        ### Static Matplotlib PNG
        fig = plt.figure(figsize=(12,6))
        ax1 = fig.add_subplot(1,1,1)
        age.plot(ax=ax1,kind='bar',color=[xff.colors['neutral'],xff.colors['institute']],rot=40,)

        ### Plot Details
        ax1.set_xticklabels([r'$%s$' % x for x in age.index])
        ax1.set_yticklabels([r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],fontsize=30)
        ax1.legend(loc=1,prop={'size':28},frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath+'age_distribution_'+self.nickname.replace('.','_')
        print(figsavename)
        xff.texify(fig,ax1,xlabel='Age',ylabel=None,figsavename=figsavename+'.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath+'interactive_age_distribution_'+self.nickname+'.html'
            print(figsavename)

            title = "Age Distribution: %s" % self._xd.course_id
            chart = multiBarChart(name='multiBarChart', height=350, x_axis_format="", y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")
            X = age.index
            Y1 = age['$Non-Certified$'].values
            Y2 = age['$Certified$'].values

            ### Series 1
            extra_serie1 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['neutral'],
                            "format":".1f"
                            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {"tooltip": {"y_start": "", "y_end": "%"},
                            "color":xff.colors['institute'],
                            "format":".1f"
                            }
            chart.add_serie(name="Certificate Earners", y=Y2, x=X, extra=extra_serie2)

            ### Final Output
            chart.buildhtml()
            # Open the file only once the chart is built, and let the context
            # manager close it even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #26
0
    def scatter_bubble_size(self, DF, colx, coly, disc_act, figsave=False):
        """
        Creates scatter plot with x=colx, y=coly.
        Size of markers always proportional to disc_act (discussion activity).

        Non-certified and certified rows of DF are drawn as two differently
        colored series; both axes receive a small random jitter first.

        Parameters
        ----------
        DF: DataFrame holding colx, coly, disc_act and a 'certified' column
        colx: column to be plotted on x-axis
        coly: column to be plotted on y-axis
        disc_act: column for scaling marker (bubble) size
        figsave: True/False to allow exploratory analysis without saving fig.
                 NOTE(review): this flag is currently not consulted -- the
                 figure is always written to disk. Left unchanged so existing
                 callers keep getting their PNG.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        ### Data
        data = DF[[colx, coly, disc_act, 'certified']].copy()
        Jcolx = 0.75   # jitter amplitude on x
        Jcoly = 0.01   # jitter amplitude on y
        bscale = 0.2   # marker area per unit of disc_act
        # NOTE(review): the x jitter subtracts Jcolx (0.75) rather than 0.5, so
        # it is skewed towards negative offsets; kept as-is pending confirmation.
        data[colx] = data[colx].apply(lambda x: x + Jcolx *
                                      (np.random.sample() - Jcolx))
        data[coly] = data[coly].apply(lambda x: x + Jcoly *
                                      (np.random.sample()))
        # Rows without recorded activity still get a (minimal) visible marker.
        data[disc_act] = data[disc_act].fillna(1.0)

        fig = plt.figure(figsize=[12, 10])
        ax1 = fig.add_subplot(1, 1, 1)
        # Non-certified first, certified on top.
        for cert_flag, color_key in ((0, 'neutral'), (1, 'institute')):
            tmp = data[data.certified == cert_flag]
            ax1.scatter(tmp[colx],
                        tmp[coly],
                        s=bscale * tmp[disc_act],
                        color=xff.colors[color_key])

        #ax1.legend(loc=5,prop={'size':18},scatterpoints=1,frameon=False)
        ax1.set_xlim(-0.05)
        ax1.set_ylim(-0.05, 1.05)

        ### Generalized Plotting functions
        xff.texify(fig,
                   ax1,
                   xlabel=colx,
                   ylabel=coly,
                   title='bubble size proportional to %s' %
                   (disc_act.replace('_', ' ')),
                   tic_size=20,
                   label_size=24,
                   datefontsize=20)

        figsavename = self.figpath + 'scatter_' + colx + '_' + coly + '_disc_size_' + self.nickname.replace(
            '.', '_') + '.png'
        fig.savefig(figsavename, bbox_inches='tight', dpi=300)

        return None
Пример #27
0
    def grade_vs_nchapters(self, **kwargs):
        """
        Scatter plot of final grade versus nchapters accessed. All points are 
        jittered for clarity. Text labels are added to indicate subpopulations.

        Requires the 'nchapters', 'grade' and 'certified' columns in
        self.person; if any is missing a message is printed and nothing is
        plotted.

        Parameters
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.
        
        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3', False)

        if 'nchapters' not in self.person or 'grade' not in self.person or 'certified' not in self.person:
            print("One of the three columns necessary for this plot is missing. Check person_course for: 'nchapters','grade', and 'certified'.")
            return None

        ### Data
        data = self.person[['nchapters', 'grade', 'certified']].copy()
        chap_jmax = 0.75
        grade_jmax = 0.005
        # Symmetric jitter on chapters, small upward-only jitter on grade.
        data['nchapters'] = data['nchapters'].apply(
            lambda x: x + chap_jmax * (np.random.sample() - 0.5))
        data['grade'] = data['grade'].apply(
            lambda x: x + grade_jmax * (np.random.sample()))
        data = data.dropna()

        # Lowest grade held by any certified user marks the certification cutoff.
        certcut = self.person[self.person['certified'] == 1].grade.min()

        ### Plot
        fig = plt.figure(figsize=(12, 10))
        ax1 = fig.add_subplot(1, 1, 1)
        #Non-Certs
        data[data.certified == 0].plot('nchapters',
                                       'grade',
                                       style='.',
                                       color=xff.colors['neutral'],
                                       label=self.nickname,
                                       ax=ax1)
        #Certified
        data[data.certified == 1].plot('nchapters',
                                       'grade',
                                       style='.',
                                       color=xff.colors['institute'],
                                       ax=ax1)

        ### Illustrations (labels)
        ncmax = self.person[self.person.certified == 1].nchapters.order(
        )[-20::].min()  ### Funny, but this cuts off staff
        ax1.hlines(certcut, 0, ncmax + 1, lw=2)
        ax1.vlines(int(ncmax / 2), 0, certcut, lw=2)
        ax1.text(ncmax / 6,
                 certcut - 0.05,
                 '$Viewed$',
                 fontsize=30,
                 alpha=0.75)
        ax1.text(ncmax / 2 + (ncmax / 4),
                 certcut - 0.05,
                 '$Explored$',
                 fontsize=30,
                 alpha=0.75)
        ax1.text(
            ncmax / 2,
            0.8,
            '$Certified$',
            fontsize=30,
            alpha=0.75,
            horizontalalignment='center',
        )

        ### Plot Details
        # Fix the axis limits BEFORE freezing the tick labels: set_*ticklabels
        # assigns labels positionally, so changing the limits afterwards would
        # relocate the ticks and leave them with stale labels.
        ax1.set_xlim(0, ncmax + 1)
        ax1.set_ylim(0, 1.01)
        ax1.set_xticklabels([r'$%0.f$' % x for x in ax1.get_xticks()])
        ax1.set_yticklabels([r'$%0.1f$' % x for x in ax1.get_yticks()],
                            fontsize=30)
        #ax1.legend(loc=4,prop={'size':28},frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'scatter_grade_vs_nchapters_' + self.nickname.replace(
            '.', '_')
        xff.texify(fig,
                   ax1,
                   xlabel='Chapters Viewed',
                   ylabel='Grade',
                   gridb='y',
                   figsavename=figsavename + '.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:

            data = data[(data.nchapters > 0) & (data.grade > 0)].dropna()
            # Plot at most 2000 distinct rows: sample without replacement and
            # never ask for more rows than exist (the previous with-replacement
            # draw duplicated points and raised on an empty frame).
            nsample = min(2000, len(data))
            if nsample:
                randrows = np.random.choice(data.index.values,
                                            nsample,
                                            replace=False)
                data = data.ix[randrows, :]
            X1 = data[data.certified == 0].nchapters.values
            Y1 = data[data.certified == 0].grade.values
            X2 = data[data.certified == 1].nchapters.values
            Y2 = data[data.certified == 1].grade.values

            ### FIGURE
            from nvd3 import scatterChart

            ### Output File
            figsavename = self.figpath + 'interactive_scatter_grade_nchap_' + self.nickname + '.html'
            print(figsavename)

            title = "Scatter Plot Grade vs Chapters Viewed: %s" % self._xdata.course_id
            chart = scatterChart(name=title,
                                 width=850,
                                 height=550,
                                 x_is_date=False,
                                 x_axis_format=".1f",
                                 y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            marker = {'shape': 'circle', 'size': '3'}

            # NOTE(review): the " calls" tooltip suffix does not describe a
            # grade; kept unchanged -- confirm the intended wording.
            extra_serie = {"tooltip": {"y_start": "", "y_end": " calls"}}

            chart.add_serie(name="Participants",
                            y=Y1,
                            x=X1,
                            extra=extra_serie,
                            **marker)
            chart.add_serie(name="Certified",
                            y=Y2,
                            x=X2,
                            extra=extra_serie,
                            **marker)

            ### Final Output
            chart.buildhtml()
            # Open the file only once the chart is built, and let the context
            # manager close it even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #28
0
    def scatter_bubble_size(self,colx,coly,disc_act,figsave=False):
        """
        Creates scatter plot with x=colx, y=coly.
        Size of markers always proportional to disc_act (discussion activity).

        Data comes from self.person. All three columns are jittered, the
        largest bubbles are clamped so a handful of extreme values do not
        dominate the figure, and any column named 'time_in_course' is
        log-transformed before plotting.

        Parameters (generated during class initialization)
        ----------
        colx: column to be plotted on x-axis
        coly: column to be plotted on y-axis
        disc_act: column for scaling marker (bubble) size
        figsave: True/False to allow exploratory analysis without saving fig.
                 NOTE(review): this flag is currently not consulted -- the
                 figure is always saved. Left unchanged so existing callers
                 keep getting their PNG.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        ### Data
        data = self.person[[colx,coly,disc_act,'certified']].copy()
        Jcolx = 0.75   # jitter amplitude on x
        Jcoly = 0.01   # jitter amplitude on y
        Jcolz = 10     # jitter amplitude on bubble size
        bscale = 0.2   # marker area per unit of disc_act
        # NOTE(review): the x jitter subtracts Jcolx (0.75) rather than 0.5, so
        # it is skewed towards negative offsets; kept as-is pending confirmation.
        data[colx] = data[colx].apply(lambda x: x + Jcolx*(np.random.sample()-Jcolx))
        data[coly] = data[coly].apply(lambda x: x + Jcoly*(np.random.sample()))
        data[disc_act] = data[disc_act].fillna(1.0).apply(lambda x: x + Jcolz*(np.random.sample()))

        ### Take top N discussants, and clamp their dot size to the lowest value of that set.
        Nd = 5
        topN = data[disc_act].order().index[-Nd:]
        # order() sorts ascending, so topN[0] is the smallest of the top Nd
        # (the code previously used topN[1], which also *raised* the Nd-th
        # value and failed with fewer than two rows).
        if len(topN) > 0:
            data.ix[topN,disc_act] = data.ix[topN[0],disc_act]

        # Time columns are shown on a log scale.
        for col in (colx,coly,disc_act):
            if col=='time_in_course':
                data[col] = data[col].apply(np.log)

        fig = plt.figure(figsize=[12,10])
        ax1 = fig.add_subplot(1,1,1)
        # Non-certified first, certified on top.
        for cert_flag,color_key in ((0,'neutral'),(1,'institute')):
            tmp = data[data.certified==cert_flag]
            ax1.scatter(tmp[colx],tmp[coly],s=bscale*tmp[disc_act],color=xff.colors[color_key])

        #ax1.legend(loc=5,prop={'size':18},scatterpoints=1,frameon=False)

        ax1.set_ylim(-0.05,1.05)

        # The log-seconds limits and human-readable tick labels only make sense
        # when the x column actually went through the log transform above.
        if colx=='time_in_course':
            ax1.set_xlim(6,16)
            ax1.set_xticks([np.log(x) for x in [600,3600,10*3600,100*3600]])
            ax1.set_xticklabels(['10 min','1 hr','10 hrs','100 hrs'],rotation=40)

        ### Generalized Plotting functions
        figsavename = self.figpath+'scatter_'+colx+'_'+coly+'_disc_size_'+self.nickname.replace('.','_')
        ylabel = coly.replace('_',' ')
        if ylabel == 'time in course':
            ylabel = 'Total Time In Course'
        xff.texify(fig,ax1,xlabel=colx.replace('_',' '),
                   ylabel=ylabel,
                   title='bubble size proportional to %s' % (disc_act.replace('_',' ')),
                   tic_size=20,label_size=24,datefontsize=20,
                   figsavename=figsavename+'.png')

        return None
Пример #29
0
    def andrew_ho_diagram(self,**kwargs):
        """
        Diagram of how the enrollment population splits into the categories
        used in the 2013 (published 2014) course reports.
        http://odl.mit.edu/mitx-working-papers/ 

        Three 0/1 columns ('Only Registered', 'Only Viewed', 'Only Explored')
        are written onto self.person. The figure shows two nested rectangles
        and two circles annotated with the count and percentage (of all
        registered users) of each group plus the certified group.

        Parameters
        ----------
        None
        
        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        """

        people = self.person   # alias only: every write below lands on self.person

        ### Registration Types
        for category in ('Only Registered','Only Viewed','Only Explored'):
            people[category] = 0

        def _flag(user_ids,category):
            # Set `category` to 1 on every row whose user_id is in user_ids.
            rows = people[people.user_id.isin(user_ids)].index
            people.ix[rows,category] = 1

        def _count(column):
            # Number of rows carrying a 1 in `column`.
            return len(people[people[column]==1])

        ### Create disjoint groups
        not_explored = people['explored']==0
        not_certified = people['certified']==0

        #Only Registered:  A - (B+C+D)
        reg_list = people[ (people['registered']==1) & (people['viewed']==0) & not_explored & not_certified ].user_id
        _flag(reg_list,'Only Registered')

        #Only Viewed:  B - (C+D)
        outside_reg = ~people.user_id.isin(reg_list)
        view_list = people[ outside_reg & (people['viewed']==1) & not_explored & not_certified ].user_id
        _flag(view_list,'Only Viewed')

        #Only Explored:  C - (D)
        outside_view = ~people.user_id.isin(view_list)
        exp_list = people[ outside_reg & outside_view & (people['explored']==1) & not_certified ].user_id
        _flag(exp_list,'Only Explored')

        ### Figure
        fig = plt.figure(figsize=(12,8))
        ax1 = fig.add_subplot(111)

        # Denominator shared by every ratio and percentage below.
        n_registered = _count('registered')

        #Circles: marker areas scale with each group's share of registrants.
        SCALE = 1000
        expratio = SCALE*100.*_count('Only Explored')/n_registered
        if expratio > 17000:
            expratio = 1000

        certratio = SCALE*100.*_count('certified')/n_registered
        if certratio < 1000:
            certratio = 1000

        ax1.scatter([0.45],[0.5],s=expratio,edgecolor='Black',lw=2,color='white',alpha=0.8)
        ax1.scatter([0.50],[0.5],s=certratio,edgecolor=xff.colors['institute'],lw=2,color='white',alpha=0.8)

        #Rectangles: outer = registered, inner = viewed.
        for corner,side in (((0,0),1),((0.15,0.15),0.7)):
            ax1.add_patch(matplotlib.patches.Rectangle(corner, side, side,fill=False,fc='white',ec='black',lw=2))

        ### Count / percentage annotations: (x, y, text template, column)
        FS = 25
        annotations = (
            (0.05,0.925,'$Only\ registered:\ %d\ \ (%.1f\%%)$','Only Registered'),
            (0.2,0.775,'$Only\ viewed:\ %d\ \ (%.1f\%%)$','Only Viewed'),
            (0.175,0.2,'$Only\ explored:$\n$%d\ \ (%.1f\%%)$','Only Explored'),
            (0.625,0.35,'$Certified:$\n$%d\ \ (%.1f\%%)$','certified'),
        )
        for xpos,ypos,template,column in annotations:
            members = _count(column)
            share = 100.0*members/n_registered
            ax1.text(xpos,ypos,template % (members,share),fontsize=FS,ha='left')

        ax1.set_xlim([-0.01, 1.01])
        ax1.set_ylim([-0.01, 1.01])

        ## suppress spines and both axes
        for spine in ax1.spines.values():
            spine.set_color('none')

        ax1.axes.get_xaxis().set_visible(False)
        ax1.axes.get_yaxis().set_visible(False)

        ### Generalized Plotting functions
        figsavename = self.figpath+'AHO_Diagram'+'_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   figsavename=figsavename+'.png')

        # The JSON export of these percentages is currently disabled.

        return None
Пример #30
0
    def grade_vs_nchapters(self,**kwargs):
        """
        Scatter plot of final grade versus nchapters accessed. All points are 
        jittered for clarity. Text labels are added to indicate subpopulations.

        Requires the 'nchapters', 'grade' and 'certified' columns in
        self.person; if any is missing a message is printed and nothing is
        plotted.

        Parameters
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.
        
        Output
        ------
        Figures and respective formats.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3',False)

        if 'nchapters' not in self.person or 'grade' not in self.person or 'certified' not in self.person:
            print("One of the three columns necessary for this plot is missing. Check person_course for: 'nchapters','grade', and 'certified'.")
            return None

        ### Data
        data = self.person[['nchapters','grade','certified']].copy()
        chap_jmax = 0.75
        grade_jmax = 0.005
        # Symmetric jitter on chapters, small upward-only jitter on grade.
        data['nchapters'] = data['nchapters'].apply(lambda x: x + chap_jmax*(np.random.sample()-0.5))
        data['grade'] = data['grade'].apply(lambda x: x + grade_jmax*(np.random.sample()))
        data = data.dropna()

        # Lowest grade held by any certified user marks the certification cutoff.
        certcut = self.person[self.person['certified']==1].grade.min()

        ### Plot
        fig = plt.figure(figsize=(12,10))
        ax1 = fig.add_subplot(1,1,1)
        #Non-Certs
        data[data.certified==0].plot('nchapters','grade',style='.',color=xff.colors['neutral'],label=self.nickname,ax=ax1)
        #Certified
        data[data.certified==1].plot('nchapters','grade',style='.',color=xff.colors['institute'],ax=ax1)

        ### Illustrations (labels)
        ncmax = self.person[self.person.certified==1].nchapters.order()[-20::].min() ### Funny, but this cuts off staff
        ax1.hlines(certcut,0,ncmax+1,lw=2)
        ax1.vlines(int(ncmax/2),0,certcut,lw=2)
        ax1.text(ncmax/6,certcut-0.05,'$Viewed$',fontsize=30,alpha=0.75)
        ax1.text(ncmax/2+(ncmax/4),certcut-0.05,'$Explored$',fontsize=30,alpha=0.75)
        ax1.text(ncmax/2,0.8,'$Certified$',fontsize=30,alpha=0.75,horizontalalignment='center',)

        ### Plot Details
        # Fix the axis limits BEFORE freezing the tick labels: set_*ticklabels
        # assigns labels positionally, so changing the limits afterwards would
        # relocate the ticks and leave them with stale labels.
        ax1.set_xlim(0,ncmax+1)
        ax1.set_ylim(0,1.01)
        ax1.set_xticklabels([r'$%0.f$' % x for x in ax1.get_xticks()])
        ax1.set_yticklabels([r'$%0.1f$' % x for x in ax1.get_yticks()],fontsize=30)
        #ax1.legend(loc=4,prop={'size':28},frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath+'scatter_grade_vs_nchapters_'+self.nickname.replace('.','_')
        xff.texify(fig,ax1,
                   xlabel='Chapters Viewed',
                   ylabel='Grade',
                   gridb='y',
                   figsavename=figsavename+'.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:

            data = data[(data.nchapters>0) & (data.grade>0)].dropna()
            # Plot at most 2000 distinct rows: sample without replacement and
            # never ask for more rows than exist (the previous with-replacement
            # draw duplicated points and raised on an empty frame).
            nsample = min(2000,len(data))
            if nsample:
                randrows = np.random.choice(data.index.values,nsample,replace=False)
                data = data.ix[randrows,:]
            X1 = data[data.certified==0].nchapters.values
            Y1 = data[data.certified==0].grade.values
            X2 = data[data.certified==1].nchapters.values
            Y2 = data[data.certified==1].grade.values

            ### FIGURE
            from nvd3 import scatterChart

            ### Output File
            figsavename = self.figpath+'interactive_scatter_grade_nchap_'+self.nickname+'.html'
            print(figsavename)

            title = "Scatter Plot Grade vs Chapters Viewed: %s" % self._xdata.course_id
            chart = scatterChart(name=title, width=850, height=550, x_is_date=False, x_axis_format=".1f", y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            marker = {'shape': 'circle', 'size': '3'}

            # NOTE(review): the " calls" tooltip suffix does not describe a
            # grade; kept unchanged -- confirm the intended wording.
            extra_serie = {"tooltip": {"y_start": "", "y_end": " calls"}}

            chart.add_serie(name="Participants", y=Y1, x=X1, extra=extra_serie, **marker)
            chart.add_serie(name="Certified", y=Y2, x=X2, extra=extra_serie, **marker)

            ### Final Output
            chart.buildhtml()
            # Open the file only once the chart is built, and let the context
            # manager close it even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #31
0
    def content_touches_viz(self, horiz_w_data, certified):
        '''
        Draw a two-panel bar figure over the course structure and save it as PNG.

        Top panel: bars of horiz_w_data['users'] positioned at
        horiz_w_data['order'], each colored from horiz_w_data.color (entries
        colored 'Pink' are made invisible). Bottom panel: bars of
        horiz_w_data.scale at the same positions, colored the same way, drawn
        with an inverted y axis and no visible axes. Both panels share the
        bottom panel's x limits.

        Parameters
        ----------
        horiz_w_data: table providing 'order', 'users', 'scale' and 'color'
        certified: when equal to True, the output file name carries a
                   'certified' marker

        Returns
        -------
        None
        '''
        fig = plt.figure(figsize=(32, 12))
        fig.subplots_adjust(hspace=.1)
        top = fig.add_subplot(2, 1, 1)
        bottom = fig.add_subplot(2, 1, 2)

        ### Top panel: unique users per structure item
        user_bars = top.bar(horiz_w_data['order'],
                            horiz_w_data['users'],
                            3.0,
                            alpha=0.8,
                            edgecolor='none')
        # Color lookup is by bar position (this is where reindexing matters).
        for pos, bar in enumerate(user_bars):
            shade = horiz_w_data.color[pos]
            if shade == 'Pink':
                shade = 'none'  # hide these entries entirely
            bar.set_facecolor(shade)
            bar.set_edgecolor(shade)

        top.set_xlabel('Course Structure Index')
        top.set_ylabel('Unique Users')

        ### Bottom panel: structure strip
        scale_bars = bottom.bar(horiz_w_data['order'],
                                horiz_w_data.scale,
                                2.5,
                                edgecolor='none')
        for pos, bar in enumerate(scale_bars):
            bar.set_facecolor(horiz_w_data.color[pos])

        # The strip is always drawn with an inverted y axis.
        bottom.invert_yaxis()

        bottom.axes.get_xaxis().set_ticks([])
        bottom.axes.get_yaxis().set_ticks([])
        bottom.set_ylim(7, -1)
        bottom.axis('off')

        #!!!!! x limits must be set together
        top.set_xlim(*bottom.get_xlim())

        xff.texify(fig, top, tic_size=32, label_size=32)

        if certified == True:
            stem = 'content_touches_horizontal_certified_'
        else:
            stem = 'content_touches_horizontal_'
        figsavename = self.figpath + stem + self.nickname.replace('.', '_') + '.png'

        fig.savefig(figsavename, bbox_inches='tight', dpi=300)

        return None
Пример #32
0
    def gender(self, **kwargs):
        """
        Creates gender distribution figures.

        Builds a two-column table of gender percentages, plots the
        Female/Male rows as a bar chart and hands the figure, together with
        the output file name, to xff.texify.

        Parameters (generated during class initialization)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output (a bar chart
        and a pie chart HTML file) is written in addition to the static plot.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3', False)

        ### Removes those users not having the option to fill in edX registration data.
        trim_data = self.person[(self.person.registered == 1)
                                & (self.person.user_id > 156633)]

        ### Data
        gdict = {'f': "$Female$", 'm': "$Male$", 'o': "$Other$"}
        glist = ['$Female$', '$Male$']

        ### Munge and Plot
        # map() turns codes that are missing from gdict into NaN (which
        # value_counts drops) instead of raising KeyError on unexpected data.
        # NOTE(review): this column is computed from all of trim_data
        # (certified users included) although it is labelled "Non-Certified".
        gender = trim_data.gender.map(gdict).value_counts()

        cert_rows = trim_data[trim_data.certified == 1]
        if cert_rows.username.count() > self.mincerts:
            certs = cert_rows.gender.map(gdict).value_counts()
        else:
            # Too few certificate earners: keep the column, but empty.
            certs = pd.Series(np.nan, index=gender.index)

        gender = pd.concat([gender, certs],
                           join='inner',
                           axis=1,
                           keys=['$Non-Certified$', '$Certified$'])
        gender = 100. * gender / gender.sum()
        gender = gender.apply(lambda x: np.round(x, 1))

        # Rows actually shown, in glist order; a label that is absent from
        # the data becomes a NaN row instead of an indexing error.
        shown = gender.reindex(glist)

        fig = plt.figure(figsize=(12, 6))
        ax1 = fig.add_subplot(1, 1, 1)
        shown.plot(ax=ax1,
                   kind='bar',
                   color=[xff.colors['neutral'], xff.colors['institute']],
                   rot=0)

        ### Plot Details
        ax1.set_xticklabels([r'%s' % x for x in glist])
        ax1.set_yticklabels(
            [r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],
            fontsize=30)
        ax1.legend(loc=2, prop={'size': 28}, frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'gender_distribution_' + self.nickname.replace(
            '.', '_')
        print(figsavename)
        # NOTE(review): the y axis shows percentages, yet the label passed
        # here is 'Count' -- confirm which one is intended.
        xff.texify(fig,
                   ax1,
                   xlabel=None,
                   ylabel='Count',
                   figsavename=figsavename + '.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:

            # http://nvd3.org/examples/pie.html

            # Labels must come from the same rows, in the same order, as the
            # values below; gender.index is in value_counts order and may
            # also contain "Other".
            X = [x.replace('$', '') for x in glist]
            Y1 = shown['$Non-Certified$'].values
            Y2 = shown['$Certified$'].values

            #----------------------------------------------------------------
            ### BAR Chart
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath + 'interactive_gender_distribution_' + self.nickname + '.html'
            print(figsavename)

            title = "Gender Distribution: %s" % self._xd.course_id
            charttype = 'multiBarChart'
            chart = multiBarChart(name=charttype,
                                  height=350,
                                  x_axis_format="",
                                  y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            ### Series 1
            extra_serie1 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['neutral'],
                "format": ".1f"
            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['institute'],
                "format": ".1f"
            }
            chart.add_serie(name="Certificate Earners",
                            y=Y2,
                            x=X,
                            extra=extra_serie2)

            ### Final Output
            chart.buildhtml()
            # The context manager closes the file even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

            #----------------------------------------------------------------
            ### Pie Chart
            from nvd3 import pieChart

            ### Output File
            figsavename = self.figpath + 'interactive_gender_piechart_' + self.nickname + '.html'
            print(figsavename)

            title = "Gender Pie Chart: %s" % self._xd.course_id
            # NOTE(review): the chart name is still 'multiBarChart' here,
            # which looks like a copy/paste leftover -- left unchanged.
            charttype = 'multiBarChart'
            chart = pieChart(name=charttype,
                             color_category='category20c',
                             height=400,
                             width=400)
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")

            # NOTE(review): the tooltip says " certified" but the plotted
            # values are the '$Non-Certified$' column -- confirm intent.
            extra_serie = {"tooltip": {"y_start": "", "y_end": " certified"}}

            chart.add_serie(y=Y1, x=X, extra=extra_serie)

            ### Final Output
            chart.buildhtml()
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #33
0
    def scatter_bubble_size(self, colx, coly, disc_act, figsave=False):
        """
        Creates scatter plot with x=colx, y=coly.
        Size of markers always proportional to disc_act (discussion activity).

        Random jitter is added to all three columns before plotting, and any
        of them named 'time_in_course' is log-transformed.  Non-certified and
        certified rows of self.person are drawn as two separate scatter
        layers.

        Parameters
        ----------
        colx: column to be plotted on x-axis
        coly: column to be plotted on y-axis
        disc_act: column for scaling marker (bubble) size
        figsave: accepted for backward compatibility but currently ignored;
                 the output file name is always passed to xff.texify.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        ### Data
        data = self.person[[colx, coly, disc_act, 'certified']].copy()
        Jcolx = 0.75
        Jcoly = 0.01
        Jcolz = 10  # 1.0/sqrt(10000)
        bscale = 0.2

        # Jitter: one uniform draw per row and column, in the same order as
        # the former element-wise apply() calls.
        nrows = len(data)
        data[colx] = data[colx] + Jcolx * (np.random.random_sample(nrows) - Jcolx)
        data[coly] = data[coly] + Jcoly * np.random.random_sample(nrows)
        data[disc_act] = (data[disc_act].fillna(1.0) +
                          Jcolz * np.random.random_sample(nrows))

        ### Take top N discussants, and cap their dot size at the lowest
        ### value of that set so a few heavy posters do not dominate the plot.
        # Done positionally with numpy: no dependence on Series.order()
        # (removed from pandas) or on .ix, and safe for fewer than N rows.
        Nd = 5
        sizes = data[disc_act].values.copy()
        top_positions = np.argsort(sizes)[-Nd:]
        if len(top_positions) > 0:
            sizes[top_positions] = sizes[top_positions[0]]
        data[disc_act] = sizes

        # Time columns are shown on a log scale.
        for col in (colx, coly, disc_act):
            if col == 'time_in_course':
                data[col] = data[col].apply(np.log)

        fig = plt.figure(figsize=[12, 10])
        ax1 = fig.add_subplot(1, 1, 1)
        # Non-certified first, certified on top.
        for flag, color_key in ((0, 'neutral'), (1, 'institute')):
            tmp = data[data.certified == flag]
            ax1.scatter(tmp[colx],
                        tmp[coly],
                        s=bscale * tmp[disc_act],
                        color=xff.colors[color_key])

        #ax1.legend(loc=5,prop={'size':18},scatterpoints=1,frameon=False)

        # The fixed 0..1 window cannot hold log-seconds, so it is skipped
        # when time is on the y axis.
        if coly != 'time_in_course':
            ax1.set_ylim(-0.05, 1.05)

        # The log-seconds limits and tick labels are only meaningful when the
        # x axis really is time in course.
        if colx == 'time_in_course':
            ax1.set_xlim(6, 16)
            ax1.set_xticks(
                [np.log(x) for x in [600, 3600, 10 * 3600, 100 * 3600]])
            ax1.set_xticklabels(['10 min', '1 hr', '10 hrs', '100 hrs'],
                                rotation=40)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'scatter_' + colx + '_' + coly + '_disc_size_' + self.nickname.replace(
            '.', '_')
        ylabel = coly.replace('_', ' ')
        if ylabel == 'time in course':
            ylabel = 'Total Time In Course'
        xff.texify(fig,
                   ax1,
                   xlabel=colx.replace('_', ' '),
                   ylabel=ylabel,
                   title='bubble size proportional to %s' %
                   (disc_act.replace('_', ' ')),
                   tic_size=20,
                   label_size=24,
                   datefontsize=20,
                   figsavename=figsavename + '.png')

        return None
Пример #34
0
    def age(self, **kwargs):
        """
        Creates age distribution figures.

        Ages are derived from the 'YoB' column relative to the current year,
        binned into nine ten-year classes over 0-90 and expressed as
        percentages, once for all trimmed registrants and once for
        certificate earners.

        Parameters (generated during class initialization)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3', False)

        bin_labels = [
            '0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69',
            '70-79', '80-89'
        ]

        ### Removes those users not having the option to fill in edX registration data.
        # copy() so that adding the age column does not write into a view of
        # self.person.
        trim_data = self.person[(self.person.registered == 1)
                                & (self.person.user_id > 156633)].copy()

        # Add age column from year_of_birth.  float() accepts ints, floats
        # and numeric strings; an isinstance(x, int) test would reject every
        # value of a float column (the dtype pandas uses once NaN is present).
        this_year = datetime.datetime.now().year

        def _age_from_yob(yob):
            try:
                return this_year - float(yob)
            except (TypeError, ValueError):
                return np.nan

        trim_data['age'] = trim_data['YoB'].apply(_age_from_yob)

        def _binned(ages):
            # Counts per ten-year bin over the range 0-90.
            counts, _ = np.histogram(ages.dropna().values,
                                     bins=9,
                                     range=(0, 90))
            return pd.Series(data=counts, index=bin_labels)

        # NOTE(review): this column is computed from all of trim_data
        # (certified users included) although it is labelled "Non-Certified".
        age = _binned(trim_data.age)

        cert_rows = trim_data[trim_data.certified == 1]
        if cert_rows.username.count() > self.mincerts:
            certs = _binned(cert_rows.age)
        else:
            # Too few certificate earners: keep the column, but empty.
            certs = pd.Series(np.nan, index=age.index)

        age = pd.concat([age, certs],
                        join='inner',
                        axis=1,
                        keys=['$Non-Certified$', '$Certified$'])
        age = 100. * age / age.sum()
        age = age.apply(lambda x: np.round(x, 1))

        #----------------------------------------------------------------
        ### Static Matplotlib PNG
        fig = plt.figure(figsize=(12, 6))
        ax1 = fig.add_subplot(1, 1, 1)
        age.plot(
            ax=ax1,
            kind='bar',
            color=[xff.colors['neutral'], xff.colors['institute']],
            rot=40,
        )

        ### Plot Details
        ax1.set_xticklabels([r'$%s$' % x for x in age.index])
        ax1.set_yticklabels(
            [r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],
            fontsize=30)
        ax1.legend(loc=1, prop={'size': 28}, frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'age_distribution_' + self.nickname.replace(
            '.', '_')
        print(figsavename)
        xff.texify(fig,
                   ax1,
                   xlabel='Age',
                   ylabel=None,
                   figsavename=figsavename + '.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath + 'interactive_age_distribution_' + self.nickname + '.html'
            print(figsavename)

            title = "Age Distribution: %s" % self._xd.course_id
            charttype = 'multiBarChart'
            chart = multiBarChart(name=charttype,
                                  height=350,
                                  x_axis_format="",
                                  y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")
            X = list(age.index)
            Y1 = age['$Non-Certified$'].values
            Y2 = age['$Certified$'].values

            ### Series 1
            extra_serie1 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['neutral'],
                "format": ".1f"
            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['institute'],
                "format": ".1f"
            }
            chart.add_serie(name="Certificate Earners",
                            y=Y2,
                            x=X,
                            extra=extra_serie2)

            ### Final Output
            chart.buildhtml()
            # The context manager closes the file even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #35
0
    def level_of_education(self, **kwargs):
        """
        Creates distribution of highest level of education attained;
        typically taken from the edX enrollment questionnaire.

        The raw 'LoE' codes are collapsed into five categories, counted for
        all trimmed registrants and for certificate earners, converted to
        percentages and drawn as a bar chart.

        Parameters (generated during class initialization)
        ----------
        NVD3 = False.  If true, nvd3 interactive figure output.

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        NVD3 = kwargs.get('NVD3', False)

        ### Level of Education (LoE)
        ### Data
        # Same text as before, with the backslash escaped explicitly: "\ " is
        # not a valid escape sequence and triggers a warning on newer Pythons.
        less_than_secondary = "Less\\ than$\n$ Secondary"
        eddict = {
            'el': less_than_secondary,
            'jhs': less_than_secondary,
            'none': less_than_secondary,
            'hs': "Secondary",
            'a': "Secondary",
            'b': "Bachelor's",
            'm': "Master's",
            'p_se': "Doctorate",
            'p_oth': "Doctorate",
            'p': "Doctorate",
            'other': None,
            'NA': None,
            'nan': None,
        }

        edlist = [
            less_than_secondary, "Secondary", "Bachelor's", "Master's",
            "Doctorate"
        ]
        trim_data = self.person[(self.person.registered == 1)
                                & (self.person.user_id > 156633)]

        def _level_counts(loe):
            # Codes that are unknown or mapped to None are dropped by
            # value_counts; reindex keeps every category in edlist order and
            # reports an absent category as a count of 0 rather than failing.
            return loe.map(eddict).value_counts().reindex(edlist,
                                                          fill_value=0)

        # NOTE(review): this column is computed from all of trim_data
        # (certified users included) although it is labelled "Non-Certified".
        edlevels = _level_counts(trim_data.LoE)

        cert_rows = trim_data[trim_data.certified == 1]
        if cert_rows.username.count() > self.mincerts:
            certs = _level_counts(cert_rows.LoE)
        else:
            # Too few certificate earners: keep the column, but empty.
            certs = pd.Series(np.nan, index=edlevels.index)

        edlevels = pd.concat([edlevels, certs],
                             join='inner',
                             axis=1,
                             keys=['$Non-Certified$', '$Certified$'])
        edlevels = 100. * edlevels / edlevels.sum()
        edlevels = edlevels.apply(lambda x: np.round(x, 1))

        #Plot
        fig = plt.figure(figsize=(12, 6))
        ax1 = fig.add_subplot(1, 1, 1)

        edlevels.plot(ax=ax1,
                      kind='bar',
                      color=[xff.colors['neutral'], xff.colors['institute']],
                      rot=40)

        ### Plot Details
        ax1.set_xticklabels([r'$%s$' % x for x in edlist])
        ax1.set_yticklabels(
            [r'${0}\%$'.format("%.0f" % (y)) for y in ax1.get_yticks()],
            fontsize=30)
        ax1.legend(loc=2, prop={'size': 22}, frameon=False)

        ### Generalized Plotting functions
        figsavename = self.figpath + 'loe_distribution_' + self.nickname.replace(
            '.', '_')
        print(figsavename)
        xff.texify(fig,
                   ax1,
                   xlabel=None,
                   ylabel=None,
                   figsavename=figsavename + '.png')

        #----------------------------------------------------------------
        ### NVD3 Interactive http://nvd3.org/
        if NVD3:
            ### FIGURE
            from nvd3 import multiBarChart

            ### Output File
            figsavename = self.figpath + 'interactive_edlevel_distribution_' + self.nickname + '.html'
            print(figsavename)

            title = "Education Level Distribution: %s" % self._xd.course_id
            charttype = 'multiBarChart'

            chart = multiBarChart(name=charttype,
                                  height=350,
                                  x_axis_format="",
                                  y_axis_format=".1f")
            chart.set_containerheader("\n\n<h2>" + title + "</h2>\n\n")
            # Strip the TeX spacing/line-break markup from the labels.
            X = [
                x.replace('\\ ', ' ').replace('$\n$', ' ')
                for x in edlevels.index
            ]
            Y1 = edlevels['$Non-Certified$'].values
            Y2 = edlevels['$Certified$'].values

            ### Series 1
            extra_serie1 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['neutral'],
                "format": ".1f"
            }
            chart.add_serie(name="Participants", y=Y1, x=X, extra=extra_serie1)

            ### Series 2
            extra_serie2 = {
                "tooltip": {
                    "y_start": "",
                    "y_end": "%"
                },
                "color": xff.colors['institute'],
                "format": ".1f"
            }
            chart.add_serie(name="Certificate Earners",
                            y=Y2,
                            x=X,
                            extra=extra_serie2)

            ### Final Output
            chart.buildhtml()
            # The context manager closes the file even if the write fails.
            with open(figsavename, 'w') as output_file:
                output_file.write(chart.htmlcontent)

        return None
Пример #36
0
    def daily_activity(self):
        """
        Creates daily timeseries of discussion activity.

        Counts the rows of self.forum per calendar day of 'created_at' and
        draws one line for authors who are not certified and one for
        certified authors.  Dashed vertical lines mark the course start and
        end dates taken from self.cinfo.

        Parameters (generated during class initialization)
        ----------
        None

        Output
        ------
        Saves figures to specified directories.

        Returns
        -------
        None
        """

        fig = plt.figure(figsize=[20, 6])
        ax1 = fig.add_subplot(1, 1, 1)

        ### List of Certified Participants (as strings, to match author_id)
        certified_ids = [
            str(uid)
            for uid in self.pc_plus[self.pc_plus.certified == 1].user_id.unique()
        ]

        forum = self.forum
        has_timestamp = forum.created_at.notnull()
        written_by_cert = forum.author_id.isin(certified_ids)

        ### Non-Certified first, then Certified
        layers = (
            (has_timestamp & ~written_by_cert, 'neutral', '$Non-Certified$'),
            (has_timestamp & written_by_cert, 'institute', '$Certified$'),
        )
        for selection, color_key, legend_label in layers:
            per_day = forum[selection].created_at.apply(lambda stamp: stamp.date())
            per_day = per_day.value_counts().sort_index()
            per_day.plot(ax=ax1,
                         style="-o",
                         ms=6,
                         lw=2,
                         color=xff.colors[color_key],
                         label=legend_label)

        # Show two weeks before the course start and four weeks after its end.
        lead_in = np.timedelta64(2, 'W')
        lead_out = np.timedelta64(4, 'W')
        xmin = (self.cinfo['start_date'] - lead_in).item().date()
        xmax = (self.cinfo['end_date'] + lead_out).item().date()
        ax1.set_xlim(xmin, xmax)

        # Course start/end markers spanning the current y range.
        ylim1 = ax1.get_ylim()[1]
        course_bounds = [
            self.cinfo.start_date.item().date(),
            self.cinfo.end_date.item().date()
        ]
        ax1.vlines(course_bounds,
                   0,
                   ylim1,
                   colors='Gray',
                   lw=1.5,
                   linestyles='--')
        ax1.set_ylim(0, ylim1)

        ax1 = xff.timeseries_plot_formatter(ax1, interval=1)
        ax1.legend(loc=1, prop={'size': 24}, frameon=False)

        figsavename = self.figpath + 'discussion_activity_' + self.nickname.replace(
            '.', '_')
        xff.texify(fig,
                   ax1,
                   ylabel='Forum Text Submissions',
                   tic_size=20,
                   label_size=24,
                   datefontsize=20,
                   title=self.nickname,
                   figsavename=figsavename + '.png')

        return None