Пример #1
0
    def processSQLrow(self,row):
        """Add one SQL result row to every activity-level cohort it reaches.

        The row is ignored when `utils.isBot` flags its `user_id`, or when its
        `rev_year`/`rev_month` pair has no entry in `self.time_stamps_index`.
        Otherwise the non-NULL `add_edits`, `remove_edits` and `noop_edits`
        values are summed, and for every threshold in `self.cohorts` that the
        sum reaches, the 'edits', 'editors', 'added', 'removed' and 'net'
        tables of `self.data` are incremented at [cohort position, time index].

        `len_added` and `len_removed` are treated as nullable, like the edit
        counters: a None value is skipped instead of raising TypeError from
        int(None), and 'net' is only updated when both lengths are present.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])

        time_index = self.time_stamps_index.get(ym,None)
        if time_index is None:
            return

        totaledits = 0
        for column in ('add_edits','remove_edits','noop_edits'):
            if row[column] is not None:
                totaledits += int(row[column])

        # Convert the length columns once, outside the cohort loop; None marks
        # a NULL column that must not be fed to int().
        added = None if row['len_added'] is None else int(row['len_added'])
        removed = None if row['len_removed'] is None else int(row['len_removed'])

        for i,al in enumerate(self.cohorts):
            # al is the minimum number of edits required for activity level i
            if totaledits < al:
                continue

            self.data['edits'][i,time_index] += totaledits
            self.data['editors'][i,time_index] += 1
            if added is not None:
                self.data['added'][i,time_index] += added
            if removed is not None:
                # stored with the sign flipped (presumably len_removed is
                # negative in the query result -- TODO confirm)
                self.data['removed'][i,time_index] += -removed
            if added is not None and removed is not None:
                self.data['net'][i,time_index] += added + removed
Пример #2
0
    def processMongoDocument(self, editor):
        """Bin one editor document into the monthly 'editsHistogram' table.

        Documents without an 'edit_count' field, and editors flagged by
        `utils.isBot`, are ignored. 'edit_count' is walked as
        year -> month -> namespace -> count. For every month that has an entry
        in `self.time_stamps_index`, the counts of the namespaces contained in
        `self.NS` are summed; when the sum is positive, the histogram cell
        [self.getIndex(sum), time index] is incremented by one.

        Args:
            editor: mapping-like document with 'user_id' and (optionally)
                'edit_count'.
        """
        if 'edit_count' not in editor:
            return

        if utils.isBot(editor['user_id']):
            return

        for year, by_month in editor['edit_count'].items():
            for month, by_namespace in by_month.items():
                # key has the form <year><two-digit month>
                month_key = '%s%02d' % (year, int(month))

                time_index = self.time_stamps_index.get(month_key, None)
                if time_index is None:
                    continue

                # the bin depends on the total over all namespaces of interest
                nedits = sum(count
                             for namespace, count in by_namespace.items()
                             if namespace in self.NS)

                if nedits > 0:
                    # Sparse representation: months in which the editor only
                    # edited other namespaces contribute nothing, so editors
                    # with zero edits in a month are never counted.
                    bin_index = self.getIndex(nedits)
                    self.data['editsHistogram'][bin_index, time_index] += 1
Пример #3
0
    def processSQLrow(self, row):
        """Add one SQL result row to every activity-level cohort it reaches.

        The row is ignored when `utils.isBot` flags its `user_id`, or when its
        `rev_year`/`rev_month` pair has no entry in `self.time_stamps_index`.
        Otherwise the non-NULL `add_edits`, `remove_edits` and `noop_edits`
        values are summed, and for every threshold in `self.cohorts` that the
        sum reaches, the 'edits', 'editors', 'added', 'removed' and 'net'
        tables of `self.data` are incremented at [cohort position, time index].

        `len_added` and `len_removed` are treated as nullable, like the edit
        counters: a None value is skipped instead of raising TypeError from
        int(None), and 'net' is only updated when both lengths are present.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        ym = '%d%02d' % (row['rev_year'], row['rev_month'])

        time_index = self.time_stamps_index.get(ym, None)
        if time_index is None:
            return

        totaledits = 0
        for column in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[column] is not None:
                totaledits += int(row[column])

        # Convert the length columns once, outside the cohort loop; None marks
        # a NULL column that must not be fed to int().
        added = None if row['len_added'] is None else int(row['len_added'])
        removed = (None if row['len_removed'] is None
                   else int(row['len_removed']))

        for i, al in enumerate(self.cohorts):
            # al is the minimum number of edits required for activity level i
            if totaledits < al:
                continue

            self.data['edits'][i, time_index] += totaledits
            self.data['editors'][i, time_index] += 1
            if added is not None:
                self.data['added'][i, time_index] += added
            if removed is not None:
                # stored with the sign flipped (presumably len_removed is
                # negative in the query result -- TODO confirm)
                self.data['removed'][i, time_index] += -removed
            if added is not None and removed is not None:
                self.data['net'][i, time_index] += added + removed
Пример #4
0
    def processSQLrow(self,row):
        """Count one row of revert activity for the cohort of its editor.

        All column names except 'reverts' are prefixed with `self.reverttype`
        (e.g. '<reverttype>_user_id'). The row is ignored when the user is
        flagged by `utils.isBot`, when the activity month or the first-edit
        month has no entry in `self.time_stamps_index`, or when 'reverts' is
        below `self.activation`. Otherwise the '<reverttype>_editors' table is
        incremented by one and '<reverttype>_edits' by the number of reverts,
        both at [self.getIndex(first-edit index), time index].

        Args:
            row: mapping-like SQL row providing the prefixed columns and
                'reverts'.
        """
        prefix = self.reverttype

        if utils.isBot(row['%s_user_id'%prefix]):
            return

        activity_year = row['%s_year'%prefix]
        activity_month = row['%s_month'%prefix]

        time_index = self.time_stamps_index.get('%d%02d'%(activity_year,activity_month),None)
        if time_index is None:
            return

        first_edit_key = '%d%02d'%(row['%s_first_edit_year'%prefix],row['%s_first_edit_month'%prefix])
        fe_index = self.time_stamps_index.get(first_edit_key,None)
        if fe_index is None:
            return

        cell = (self.getIndex(fe_index),time_index)

        n_reverts = row['reverts']
        if n_reverts < self.activation:
            # below the activation threshold: the editor is not counted
            return

        self.data['%s_editors'%prefix][cell] += 1
        self.data['%s_edits'%prefix][cell] += n_reverts
Пример #5
0
    def processSQLrow(self,row):
        """Add one SQL row to the cohort defined by the editor's first edit.

        The row is ignored when `utils.isBot` flags its `user_id`, when the
        `rev_year`/`rev_month` pair or the `first_edit_year`/`first_edit_month`
        pair has no entry in `self.time_stamps_index`, or when its namespace
        (compared as a string) is not in `self.NS`. Otherwise the 'added',
        'removed', 'net' and 'edits' tables of `self.data` are incremented at
        [self.getIndex(first-edit index), time index].

        Note: the length and edit columns are passed to int() unguarded, so a
        None value raises TypeError.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        if utils.isBot(row['user_id']):
            return

        rev_year = row['rev_year']
        rev_month = row['rev_month']
        namespace = str(row['namespace'])

        time_index = self.time_stamps_index.get('%d%02d'%(rev_year,rev_month),None)
        if time_index is None:
            return

        first_edit_key = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])
        fe_index = self.time_stamps_index.get(first_edit_key,None)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(fe_index)

        if namespace not in self.NS:
            return

        cell = (cohorts_index,time_index)

        added = int(row['len_added'])
        self.data['added'][cell] += added

        removed = int(row['len_removed'])
        # 'removed' is accumulated with the sign flipped
        self.data['removed'][cell] += -removed

        self.data['net'][cell] += added + removed
        self.data['edits'][cell] += int(row['add_edits'])+int(row['remove_edits'])
Пример #6
0
    def processSQLrow(self, row):
        """Add one SQL row to the cohort defined by the editor's first edit.

        The row is ignored when `utils.isBot` flags its `user_id`, when the
        `rev_year`/`rev_month` pair or the `first_edit_year`/`first_edit_month`
        pair has no entry in `self.time_stamps_index`, or when its namespace
        (compared as a string) is not in `self.NS`. Otherwise the tables of
        `self.data` are updated at [self.getIndex(first-edit index), time
        index]:

        * 'added' / 'removed' when the respective length column is not None;
        * 'net' when both length columns are not None;
        * 'edits' when both `add_edits` and `remove_edits` are not None;
        * 'editors' by one whenever `user_id` differs from
          `self.old_user_id`, which is then updated (presumably rows arrive
          grouped by user -- verify against the query).

        Args:
            row: mapping-like SQL row providing the columns named above.

        Raises:
            Exception: any error raised while processing is re-raised as a
                generic Exception whose message contains the offending row.
        """
        try:
            editor_id = row['user_id']

            if utils.isBot(editor_id):
                return

            year = row['rev_year']
            month = row['rev_month']
            ns = str(row['namespace'])

            ym = '%d%02d' % (year, month)

            time_index = self.time_stamps_index.get(ym, None)
            if time_index is None:
                return

            firstedit = '%d%02d' % (row['first_edit_year'],
                                    row['first_edit_month'])

            fe_index = self.time_stamps_index.get(firstedit, None)
            if fe_index is None:
                return

            cohorts_index = self.getIndex(fe_index)

            if ns not in self.NS:
                return

            len_added = row['len_added']
            len_removed = row['len_removed']
            add_edits = row['add_edits']
            remove_edits = row['remove_edits']

            if len_added is not None:
                self.data['added'][cohorts_index,
                                   time_index] += int(len_added)
            if len_removed is not None:
                self.data['removed'][cohorts_index,
                                     time_index] += -int(len_removed)
            # Each guard must test exactly the columns it converts: checking
            # len_removed twice (or len_removed in place of add_edits) lets a
            # None reach int() and abort the row.
            if len_added is not None and len_removed is not None:
                self.data['net'][cohorts_index, time_index] += (
                    int(len_added) + int(len_removed))
            if add_edits is not None and remove_edits is not None:
                self.data['edits'][cohorts_index, time_index] += (
                    int(add_edits) + int(remove_edits))

            if editor_id != self.old_user_id:
                # counting the editor we encountered
                self.data['editors'][cohorts_index, time_index] += 1

                self.old_user_id = editor_id

        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not converted into a row error.
            raise Exception('row:\n%s' % row)
Пример #7
0
    def processSQLrow(self, row):
        """Add one SQL row to its cohort if the edit count is within bounds.

        The row is ignored when `utils.isBot` flags its `user_id`, or when the
        `rev_year`/`rev_month` pair or the `first_edit_year`/`first_edit_month`
        pair has no entry in `self.time_stamps_index`. The non-NULL
        `add_edits`, `remove_edits` and `noop_edits` values are summed; the row
        is also ignored when that sum is below `self.minedits`, or above
        `self.maxedits` when `self.maxedits` is not None (None means no upper
        bound).

        Otherwise 'editors' is incremented by one, 'edits' by the sum, and
        'added' / 'removed' / 'net' by the non-NULL length columns, all at
        [self.getIndex(time index, first-edit index), time index].

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        ym = '%d%02d' % (row['rev_year'], row['rev_month'])

        time_index = self.time_stamps_index.get(ym, None)
        if time_index is None:
            return

        firstedit = '%d%02d' % (row['first_edit_year'],
                                row['first_edit_month'])

        fe_index = self.time_stamps_index.get(firstedit, None)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(time_index, fe_index)

        edits = 0
        for column in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[column] is not None:
                edits += int(row[column])

        if edits < self.minedits:
            return
        # Test for None *before* comparing: `edits > None` raises TypeError on
        # Python 3, so the None check has to short-circuit the comparison.
        if self.maxedits is not None and edits > self.maxedits:
            return

        self.data['editors'][cohorts_index, time_index] += 1

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.data['added'][cohorts_index, time_index] += int(len_added)
        if len_removed is not None:
            # accumulated with the sign flipped
            self.data['removed'][cohorts_index,
                                 time_index] += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index, time_index] += (
                int(len_added) + int(len_removed))

        self.data['edits'][cohorts_index, time_index] += edits
Пример #8
0
    def processSQLrow(self,row):
        """Add one SQL row to the cohort defined by the editor's first edit.

        The row is ignored when `utils.isBot` flags its `user_id`, when the
        `rev_year`/`rev_month` pair or the `first_edit_year`/`first_edit_month`
        pair has no entry in `self.time_stamps_index`, or when its namespace
        (compared as a string) is not in `self.NS`. Otherwise the tables of
        `self.data` are updated at [self.getIndex(first-edit index), time
        index]:

        * 'added' / 'removed' when the respective length column is not None;
        * 'net' when both length columns are not None;
        * 'edits' when both `add_edits` and `remove_edits` are not None;
        * 'editors' by one whenever `user_id` differs from
          `self.old_user_id`, which is then updated (presumably rows arrive
          grouped by user -- verify against the query).

        Args:
            row: mapping-like SQL row providing the columns named above.

        Raises:
            Exception: any error raised while processing is re-raised as a
                generic Exception whose message contains the offending row.
        """
        try:

            editor_id = row['user_id']

            if utils.isBot(editor_id):
                return

            year = row['rev_year']
            month = row['rev_month']
            ns = str(row['namespace'])

            ym = '%d%02d'%(year,month)

            time_index = self.time_stamps_index.get(ym,None)
            if time_index is None:
                return

            firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])

            fe_index = self.time_stamps_index.get(firstedit,None)
            if fe_index is None:
                return

            cohorts_index = self.getIndex(fe_index)

            if ns not in self.NS:
                return

            len_added = row['len_added']
            len_removed = row['len_removed']
            add_edits = row['add_edits']
            remove_edits = row['remove_edits']

            if len_added is not None:
                self.data['added'][cohorts_index,time_index] += int(len_added)
            if len_removed is not None:
                self.data['removed'][cohorts_index,time_index] += -int(len_removed)
            # Each guard must test exactly the columns it converts: checking
            # len_removed twice (or len_removed in place of add_edits) lets a
            # None reach int() and abort the row.
            if len_added is not None and len_removed is not None:
                self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)
            if add_edits is not None and remove_edits is not None:
                self.data['edits'][cohorts_index,time_index] += int(add_edits)+int(remove_edits)

            if editor_id != self.old_user_id:
                #counting the editor we encountered
                self.data['editors'][cohorts_index,time_index] += 1

                self.old_user_id = editor_id

        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not converted into a row error.
            raise Exception('row:\n%s'%row)
Пример #9
0
    def processSQLrow(self,row):
        """Add one SQL row to its cohort if the edit count is within bounds.

        The row is ignored when `utils.isBot` flags its `user_id`, or when the
        `rev_year`/`rev_month` pair or the `first_edit_year`/`first_edit_month`
        pair has no entry in `self.time_stamps_index`. The non-NULL
        `add_edits`, `remove_edits` and `noop_edits` values are summed; the row
        is also ignored when that sum is below `self.minedits`, or above
        `self.maxedits` when `self.maxedits` is not None (None means no upper
        bound).

        Otherwise 'editors' is incremented by one, 'edits' by the sum, and
        'added' / 'removed' / 'net' by the non-NULL length columns, all at
        [self.getIndex(time index, first-edit index), time index].

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])

        time_index = self.time_stamps_index.get(ym,None)
        if time_index is None:
            return

        firstedit = '%d%02d'%(row['first_edit_year'],row['first_edit_month'])

        fe_index = self.time_stamps_index.get(firstedit,None)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(time_index, fe_index)

        edits = 0
        for column in ('add_edits','remove_edits','noop_edits'):
            if row[column] is not None:
                edits += int(row[column])

        if edits < self.minedits:
            return
        # Test for None *before* comparing: `edits > None` raises TypeError on
        # Python 3, so the None check has to short-circuit the comparison.
        if self.maxedits is not None and edits > self.maxedits:
            return

        self.data['editors'][cohorts_index,time_index] += 1

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.data['added'][cohorts_index,time_index] += int(len_added)
        if len_removed is not None:
            # accumulated with the sign flipped
            self.data['removed'][cohorts_index,time_index] += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index,time_index] += int(len_added) + int(len_removed)

        self.data['edits'][cohorts_index,time_index] += edits
Пример #10
0
    def processSQLrow(self, row):
        """Add one SQL row to the cohort selected by its total edit count.

        The row is ignored when `utils.isBot` flags its `user_id`, or when its
        `rev_year`/`rev_month` pair has no entry in `self.time_stamps_index`.
        The non-NULL `add_edits`, `remove_edits` and `noop_edits` values are
        summed and passed to `self.getIndex` to obtain the cohort index. At
        [cohort index, time index], 'editors' is incremented by one, 'edits' by
        the sum, 'added' / 'removed' by the respective non-NULL length column,
        and 'net' by their sum when both are present.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        if utils.isBot(row['user_id']):
            return

        rev_year = row['rev_year']
        rev_month = row['rev_month']

        time_index = self.time_stamps_index.get(
            '%d%02d' % (rev_year, rev_month), None)
        if time_index is None:
            return

        edits = 0
        for column in ('add_edits', 'remove_edits', 'noop_edits'):
            count = row[column]
            if count is not None:
                edits += int(count)

        cell = (self.getIndex(edits), time_index)

        self.data['editors'][cell] += 1

        len_added = row['len_added']
        if len_added is not None:
            self.data['added'][cell] += int(len_added)

        len_removed = row['len_removed']
        if len_removed is not None:
            # accumulated with the sign flipped
            self.data['removed'][cell] += -int(len_removed)

        if not (len_added is None or len_removed is None):
            self.data['net'][cell] += int(len_added) + int(len_removed)

        self.data['edits'][cell] += edits
Пример #11
0
    def processSQLrow(self, row):
        """Add one per-day SQL row to the '...PerDay' tables of its cohort.

        The row is ignored when `utils.isBot` flags its `user_id`, when the
        `rev_year`/`rev_month`/`rev_day` key or the first-edit key has no entry
        in `self.time_stamps_index`, or when its namespace (compared as a
        string) is not in `self.NS`. Otherwise 'addedPerDay', 'removedPerDay',
        'netPerDay' and 'editsPerDay' are incremented at
        [self.getIndex(time index, first-edit index), time index].

        The first-edit key is the first eight characters of
        str(row['first_edit_year']); presumably that column holds a
        YYYYMMDD... timestamp despite its name -- TODO confirm.

        Note: the length and edit columns are passed to int() unguarded, so a
        None value raises TypeError.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        if utils.isBot(row['user_id']):
            return

        rev_year = row['rev_year']
        rev_month = row['rev_month']
        rev_day = row['rev_day']
        namespace = str(row['namespace'])

        day_key = '%d%02d%02d' % (rev_year, rev_month, rev_day)
        time_index = self.time_stamps_index.get(day_key, None)
        if time_index is None:
            return

        first_edit_key = str(row['first_edit_year'])[:8]
        fe_index = self.time_stamps_index.get(first_edit_key, None)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(time_index, fe_index)

        if namespace not in self.NS:
            return

        cell = (cohorts_index, time_index)

        added = int(row['len_added'])
        self.data['addedPerDay'][cell] += added

        removed = int(row['len_removed'])
        # accumulated with the sign flipped
        self.data['removedPerDay'][cell] += -removed

        self.data['netPerDay'][cell] += added + removed
        self.data['editsPerDay'][cell] += (int(row['add_edits']) +
                                           int(row['remove_edits']))
Пример #12
0
    def processSQLrow(self,row):
        """Add one SQL row to the cohort selected by its total edit count.

        The row is ignored when `utils.isBot` flags its `user_id`, or when its
        `rev_year`/`rev_month` pair has no entry in `self.time_stamps_index`.
        The non-NULL `add_edits`, `remove_edits` and `noop_edits` values are
        summed and passed to `self.getIndex` to obtain the cohort index. At
        [cohort index, time index], 'editors' is incremented by one, 'edits' by
        the sum, 'added' / 'removed' by the respective non-NULL length column,
        and 'net' by their sum when both are present.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        if utils.isBot(row['user_id']):
            return

        rev_year = row['rev_year']
        rev_month = row['rev_month']

        time_index = self.time_stamps_index.get('%d%02d'%(rev_year,rev_month),None)
        if time_index is None:
            return

        edits = 0
        for column in ('add_edits','remove_edits','noop_edits'):
            count = row[column]
            if count is not None:
                edits += int(count)

        cell = (self.getIndex(edits),time_index)

        self.data['editors'][cell] += 1

        len_added = row['len_added']
        if len_added is not None:
            self.data['added'][cell] += int(len_added)

        len_removed = row['len_removed']
        if len_removed is not None:
            # accumulated with the sign flipped
            self.data['removed'][cell] += -int(len_removed)

        if not (len_added is None or len_removed is None):
            self.data['net'][cell] += int(len_added) + int(len_removed)

        self.data['edits'][cell] += edits
Пример #13
0
    def processSQLrow(self,row):
        """Add one per-day SQL row to the '...PerDay' tables of its cohort.

        The row is ignored when `utils.isBot` flags its `user_id`, when the
        `rev_year`/`rev_month`/`rev_day` key or the first-edit key has no entry
        in `self.time_stamps_index`, or when its namespace (compared as a
        string) is not in `self.NS`. Otherwise 'addedPerDay', 'removedPerDay',
        'netPerDay' and 'editsPerDay' are incremented at
        [self.getIndex(time index, first-edit index), time index].

        The first-edit key is the first eight characters of
        str(row['first_edit_year']); presumably that column holds a
        YYYYMMDD... timestamp despite its name -- TODO confirm.

        Note: the length and edit columns are passed to int() unguarded, so a
        None value raises TypeError.

        Args:
            row: mapping-like SQL row providing the columns named above.
        """
        if utils.isBot(row['user_id']):
            return

        rev_year = row['rev_year']
        rev_month = row['rev_month']
        rev_day = row['rev_day']
        namespace = str(row['namespace'])

        day_key = '%d%02d%02d'%(rev_year,rev_month,rev_day)
        time_index = self.time_stamps_index.get(day_key,None)
        if time_index is None:
            return

        first_edit_key = str(row['first_edit_year'])[:8]
        fe_index = self.time_stamps_index.get(first_edit_key,None)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(time_index, fe_index)

        if namespace not in self.NS:
            return

        cell = (cohorts_index,time_index)

        added = int(row['len_added'])
        self.data['addedPerDay'][cell] += added

        removed = int(row['len_removed'])
        # accumulated with the sign flipped
        self.data['removedPerDay'][cell] += -removed

        self.data['netPerDay'][cell] += added + removed
        self.data['editsPerDay'][cell] += int(row['add_edits'])+int(row['remove_edits'])
Пример #14
0
    def processMongoDocument(self,editor):
        """Bin one editor document into the monthly 'editsHistogram' table.

        Documents without an 'edit_count' field, and editors flagged by
        `utils.isBot`, are ignored. 'edit_count' is walked as
        year -> month -> namespace -> count. For every month that has an entry
        in `self.time_stamps_index`, the counts of the namespaces contained in
        `self.NS` are summed; when the sum is positive, the histogram cell
        [self.getIndex(sum), time index] is incremented by one.

        Args:
            editor: mapping-like document with 'user_id' and (optionally)
                'edit_count'.
        """
        if 'edit_count' not in editor:
            return

        if utils.isBot(editor['user_id']):
            return

        for year,by_month in editor['edit_count'].items():
            for month,by_namespace in by_month.items():
                # key has the form <year><two-digit month>
                month_key = '%s%02d'%(year,int(month))

                time_index = self.time_stamps_index.get(month_key,None)
                if time_index is None:
                    continue

                # the bin depends on the total over all namespaces of interest
                nedits = sum(count for namespace,count in by_namespace.items() if namespace in self.NS)

                if nedits > 0:
                    # Sparse representation: months in which the editor only
                    # edited other namespaces contribute nothing, so editors
                    # with zero edits in a month are never counted.
                    bin_index = self.getIndex(nedits)
                    self.data['editsHistogram'][bin_index,time_index] += 1
Пример #15
0
    def processSQLrow(self, row):
        """Accumulate per-editor totals over the editor's first months.

        The method is stateful: `self.old_user_id`, `self.fe_index`,
        `self.firstedit`, `self.lastym`, `self.edits`, `self.added`,
        `self.removed` and `self.net` describe the editor currently being
        aggregated (presumably rows arrive grouped by `user_id` -- verify
        against the query). Rows of users flagged by `utils.isBot` are ignored.

        When `user_id` changes, the totals of the previous editor are written
        to `self.data` at [self.getIndex(total edits), first-edit index] and
        the state is reset for the new editor. Editors whose first-edit month
        has no entry in `self.time_stamps_index` are skipped entirely. Rows
        dated after `self.lastym` (first-edit month plus `self.period` months)
        do not contribute.

        The totals of the last editor in the stream are not written by this
        method; that has to happen elsewhere (not visible here).

        Args:
            row: mapping-like SQL row with `user_id`, `first_edit_year`,
                `first_edit_month`, `rev_year`, `rev_month`, the three
                `*_edits` counters and `len_added` / `len_removed`.
        """
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        if editor_id != self.old_user_id:
            # A new editor starts: flush the previous one, then reset the
            # per-editor state.
            #
            # Flush only if the previous editor had a valid first-edit index.
            # Otherwise self.fe_index is None (indexing with None would
            # address a whole row instead of one cell) and the totals are
            # stale leftovers from an earlier editor.
            if self.old_user_id is not None and self.fe_index is not None:
                # bin of the number of edits the editor has done
                cohorts_index = self.getIndex(self.edits)

                # Every table accumulates over all editors of the cell; plain
                # assignment would keep only the last editor's totals while
                # 'editors' keeps counting all of them.
                self.data['editors'][cohorts_index, self.fe_index] += 1
                self.data['added'][cohorts_index, self.fe_index] += self.added
                self.data['removed'][cohorts_index,
                                     self.fe_index] += self.removed
                self.data['net'][cohorts_index, self.fe_index] += self.net
                self.data['edits'][cohorts_index, self.fe_index] += self.edits

            # initializing info for current editor
            self.old_user_id = editor_id

            year = row['first_edit_year']
            month = row['first_edit_month']

            self.firstedit = '%d%02d' % (year, month)
            self.fe_index = self.time_stamps_index.get(self.firstedit, None)

            if self.fe_index is None:
                return

            # Last month of interest. Working on zero-based months keeps
            # December as month 12 (a plain `% 12` yields month 00), and
            # divmod uses integer division on both Python 2 and 3.
            years_ahead, month0 = divmod(month - 1 + self.period, 12)
            self.lastym = '%d%02d' % (year + years_ahead, month0 + 1)

            self.edits = 0
            self.added = 0
            self.removed = 0
            self.net = 0

        if self.fe_index is None:
            # remaining rows of an editor that was skipped above
            return

        ym = '%d%02d' % (row['rev_year'], row['rev_month'])

        # keys are fixed-width 'YYYYMM' strings, so string order is
        # chronological order
        if ym > self.lastym:
            return

        for column in ('add_edits', 'remove_edits', 'noop_edits'):
            if row[column] is not None:
                self.edits += int(row[column])

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.added += int(len_added)
        if len_removed is not None:
            # accumulated with the sign flipped
            self.removed += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.net += int(len_added) + int(len_removed)
Пример #16
0
    def processSQLrow(self,row):
        """Accumulate per-editor totals over the editor's first months.

        The method is stateful: `self.old_user_id`, `self.fe_index`,
        `self.firstedit`, `self.lastym`, `self.edits`, `self.added`,
        `self.removed` and `self.net` describe the editor currently being
        aggregated (presumably rows arrive grouped by `user_id` -- verify
        against the query). Rows of users flagged by `utils.isBot` are ignored.

        When `user_id` changes, the totals of the previous editor are written
        to `self.data` at [self.getIndex(total edits), first-edit index] and
        the state is reset for the new editor. Editors whose first-edit month
        has no entry in `self.time_stamps_index` are skipped entirely. Rows
        dated after `self.lastym` (first-edit month plus `self.period` months)
        do not contribute.

        The totals of the last editor in the stream are not written by this
        method; that has to happen elsewhere (not visible here).

        Args:
            row: mapping-like SQL row with `user_id`, `first_edit_year`,
                `first_edit_month`, `rev_year`, `rev_month`, the three
                `*_edits` counters and `len_added` / `len_removed`.
        """
        editor_id = row['user_id']

        if utils.isBot(editor_id):
            return

        if editor_id != self.old_user_id:
            # A new editor starts: flush the previous one, then reset the
            # per-editor state.
            #
            # Flush only if the previous editor had a valid first-edit index.
            # Otherwise self.fe_index is None (indexing with None would
            # address a whole row instead of one cell) and the totals are
            # stale leftovers from an earlier editor.
            if self.old_user_id is not None and self.fe_index is not None:
                # bin of the number of edits the editor has done
                cohorts_index = self.getIndex(self.edits)

                # Every table accumulates over all editors of the cell; plain
                # assignment would keep only the last editor's totals while
                # 'editors' keeps counting all of them.
                self.data['editors'][cohorts_index,self.fe_index] += 1
                self.data['added'][cohorts_index,self.fe_index] += self.added
                self.data['removed'][cohorts_index,self.fe_index] += self.removed
                self.data['net'][cohorts_index,self.fe_index] += self.net
                self.data['edits'][cohorts_index,self.fe_index] += self.edits

            # initializing info for current editor
            self.old_user_id = editor_id

            year = row['first_edit_year']
            month = row['first_edit_month']

            self.firstedit = '%d%02d'%(year,month)
            self.fe_index = self.time_stamps_index.get(self.firstedit,None)

            if self.fe_index is None:
                return

            # Last month of interest. Working on zero-based months keeps
            # December as month 12 (a plain `% 12` yields month 00), and
            # divmod uses integer division on both Python 2 and 3.
            years_ahead,month0 = divmod(month-1+self.period,12)
            self.lastym = '%d%02d'%(year+years_ahead,month0+1)

            self.edits = 0
            self.added = 0
            self.removed = 0
            self.net = 0

        if self.fe_index is None:
            # remaining rows of an editor that was skipped above
            return

        ym = '%d%02d'%(row['rev_year'],row['rev_month'])

        # keys are fixed-width 'YYYYMM' strings, so string order is
        # chronological order
        if ym > self.lastym:
            return

        for column in ('add_edits','remove_edits','noop_edits'):
            if row[column] is not None:
                self.edits += int(row[column])

        len_added = row['len_added']
        len_removed = row['len_removed']

        if len_added is not None:
            self.added += int(len_added)
        if len_removed is not None:
            # accumulated with the sign flipped
            self.removed += -int(len_removed)
        if len_added is not None and len_removed is not None:
            self.net += int(len_added) + int(len_removed)