def mapper(self, line):
    """
    Extract explicit enrollment activation and deactivation events.

    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, user_id), (timestamp, event_type)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    # Anything other than an enrollment activation or deactivation is ignored.
    is_enrollment_change = event_type in (DEACTIVATED, ACTIVATED)
    if not is_enrollment_change:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    course_id = payload.get('course_id')
    has_valid_course = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not has_valid_course:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = payload.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    record_key = (course_id, user_id)
    record_value = (timestamp, event_type)
    yield record_key, record_value
def mapper(self, line):
    """
    Extract explicit enrollment events, including synthetic validation events.

    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, user_id), (timestamp, event_type, mode, validation_info)
        where course_id is a UTF-8 encoded string and validation_info is
        either None or a dict of the extra validation properties.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    is_enrollment_event = event_type in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED)
    if not is_enrollment_event:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    raw_course_id = payload.get('course_id')
    course_id = opaque_key_util.normalize_course_id(raw_course_id)
    has_valid_course = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not has_valid_course:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = payload.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    mode = payload.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    # The presence of 'dump_start' marks a synthetic enrollment validation
    # event; only those carry the extra properties collected here.
    validation_info = None
    if 'dump_start' in payload:
        validation_info = {
            field: payload.get(field)
            for field in ('is_active', 'created', 'dump_start', 'dump_end')
        }

    # String members of the key are explicitly encoded; user_id is an int
    # and is passed through untouched.
    encoded_course_id = unicode(course_id).encode('utf-8')
    record_key = (encoded_course_id, user_id)
    yield record_key, (timestamp, event_type, mode, validation_info)
def mapper(self, line):
    """
    Extract explicit enrollment events, including synthetic validation events.

    The course_id is used exactly as it appears in the event data.

    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, user_id), (timestamp, event_type, mode, validation_info)
        where course_id is a UTF-8 encoded string and validation_info is
        either None or a dict of the extra validation properties.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    is_enrollment_event = event_type in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED)
    if not is_enrollment_event:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    course_id = payload.get('course_id')
    has_valid_course = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not has_valid_course:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = payload.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    mode = payload.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    # The presence of 'dump_start' marks a synthetic enrollment validation
    # event; only those carry the extra properties collected here.
    validation_info = None
    if 'dump_start' in payload:
        validation_info = {
            field: payload.get(field)
            for field in ('is_active', 'created', 'dump_start', 'dump_end')
        }

    # String members of the key are explicitly encoded; user_id is an int
    # and is passed through untouched.
    encoded_course_id = unicode(course_id).encode('utf-8')
    record_key = (encoded_course_id, user_id)
    yield record_key, (timestamp, event_type, mode, validation_info)
def mapper(self, line):
    """
    Extract the information needed to locate a user from a single event.

    Events without a usable user_id, timestamp or ip address are discarded.

    Args:
        line: text line from a tracking event log.

    Yields:
        date_string, (timestamp, ip_address, course_id, user_id)
        where user_id has been converted to an int and course_id may be
        empty when the event carries none.
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, date_string = value

    # 'context' can be absent or explicitly null; dict.get() only applies its
    # default in the first case, so normalize both to an empty dict.
    context = event.get('context') or {}
    user_id = context.get('user_id')
    if not user_id:
        return
    try:
        user_id = int(user_id)
    except (TypeError, ValueError):
        # int() raises TypeError rather than ValueError for non-scalar values
        # (e.g. a list or dict), so both are treated as a malformed user_id.
        self.incr_counter('User Location', 'Discard event with malformed user_id', 1)
        return

    # Get timestamp instead of date string, so we get the latest ip
    # address for events on the same day.
    timestamp = eventlog.get_event_time_string(event)
    if not timestamp:
        return

    ip_address = event.get('ip')
    if not ip_address:
        log.warning("No ip_address found for user '%s' on '%s'.", user_id, timestamp)
        return

    # Get the course_id from context, if it happens to be present.
    # It's okay if it isn't.
    # (Open question from the original author: it may be preferable to take
    # the course_id only from explicit events, or to flag explicit vs.
    # implicit events so that the latest explicit event for a course can be
    # preferred, falling back to the latest implicit event and then to the
    # latest event overall.)
    course_id = eventlog.get_course_id(event)

    # For multi-output, we will generate a single file for each key value.
    # When looking at location for user in a course, we don't want to have
    # an output file per course per date, so just use date as the key,
    # and have a single file representing all events on the date.
    yield date_string, (timestamp, ip_address, course_id, user_id)
def mapper(self, line):
    """
    Extract explicit enrollment events, keyed by the date they occurred on.

    Args:
        line: text line from a tracking event log.

    Yields:
        date_string, (course_id, user_id, timestamp, event_type, mode)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    is_enrollment_event = event_type in (DEACTIVATED, ACTIVATED, MODE_CHANGED)
    if not is_enrollment_event:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    raw_course_id = payload.get('course_id')
    course_id = opaque_key_util.normalize_course_id(raw_course_id)
    has_valid_course = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not has_valid_course:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = payload.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    mode = payload.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    record = (course_id, user_id, timestamp, event_type, mode)
    yield date_string, record
def mapper(self, line):
    """
    Extract server-side problem_check events together with their saved tags.

    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _ = value

    # Only problem checks emitted by the server are considered.
    if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server':
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    org_id = opaque_key_util.get_org_id_for_course(course_id)

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    problem_id = event_data.get('problem_id')
    if not problem_id:
        return

    is_correct = event_data.get('success') == 'correct'

    # Walk context -> asides -> tagging_aside defensively: any level may be
    # missing or explicitly null, in which case there are simply no tags.
    # (dict.get() only uses its default when the key is absent, so a null
    # value has to be handled separately.)
    context = event.get('context') or {}
    asides = context.get('asides') or {}
    tagging_aside = asides.get('tagging_aside') or {}
    saved_tags = tagging_aside.get('saved_tags', {})

    yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
def mapper(self, line):
    """
    Extract the information needed to locate a user from a single event.

    Events without a username, timestamp or ip address are discarded.

    Args:
        line: text line from a tracking event log.

    Yields:
        date_string, (timestamp, ip_address, course_id, username)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = eventlog.get_event_username(event)
    if not username:
        return

    # The full timestamp (not just the date) is kept so that the most
    # recent ip address can be picked among events from the same day.
    timestamp = eventlog.get_event_time_string(event)
    if not timestamp:
        return

    ip_address = event.get('ip')
    if not ip_address:
        log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
        return

    # A course_id is taken from the event when available; its absence is
    # acceptable.
    # (Open question from the original author: it may be preferable to take
    # the course_id only from explicit events, or to flag explicit vs.
    # implicit events so that the latest explicit event for a course can be
    # preferred, falling back to the latest implicit event and then to the
    # latest event overall.)
    course_id = eventlog.get_course_id(event)

    # With multi-output, one file is produced per key value.  Keying on the
    # date alone avoids a file per course per date and gives a single file
    # holding all events for that date.
    location_record = (timestamp, ip_address, course_id, username)
    yield date_string, location_record
def mapper(self, line):
    """
    Extract server-side problem_check events together with their saved tags.

    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _ = value

    # Only problem checks emitted by the server are considered.
    if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server':
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    org_id = opaque_key_util.get_org_id_for_course(course_id)

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    problem_id = event_data.get('problem_id')
    if not problem_id:
        return

    is_correct = event_data.get('success') == 'correct'

    # Each level of context -> asides -> tagging_aside may be missing or
    # explicitly null.  dict.get() falls back to its default only for a
    # missing key, so null values are normalized to an empty dict here
    # instead of raising AttributeError on the next lookup.
    saved_tags_holder = event.get('context') or {}
    for level in ('asides', 'tagging_aside'):
        saved_tags_holder = saved_tags_holder.get(level) or {}
    saved_tags = saved_tags_holder.get('saved_tags', {})

    yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
def mapper(self, line):
    """
    Extract video interaction events keyed by user, course and video module.

    Args:
        line: text line from a tracking event log.

    Yields:
        (username, course_id, encoded_module_id),
        (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
        where the key members and youtube_id (when present) are UTF-8 encoded.
    """
    # Cheap substring test first, so unrelated lines are rejected before
    # any parsing takes place.
    if VIDEO_EVENT_MINIMUM_STRING not in line:
        return

    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        # This is the only discard path that increments a counter; the
        # counters for the other paths are disabled.
        self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1)
        return

    if event_type not in VIDEO_EVENT_TYPES:
        return

    # The timestamp was already checked while the event was being fetched,
    # so it is simply read here.
    timestamp = eventlog.get_event_time_string(event)

    # Usernames are stripped because trailing newlines mess up Luigi.
    username = event.get('username', '').strip()
    if not username:
        log.error("Video event without username: %s", event)
        return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.warn('Video event without valid course_id: {0}'.format(line))
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        # This should already have been logged.
        return

    # id values with a leading newline have been seen, hence the strip.
    encoded_module_id = payload.get('id', '').strip()
    if not encoded_module_id:
        log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
        return

    # A missing 'duration' and a null/empty one share the same default.
    video_duration = payload.get('duration') or VIDEO_UNKNOWN_DURATION

    current_time = None
    old_time = None
    youtube_id = None
    if event_type == VIDEO_PLAYED:
        code = payload.get('code')
        if code not in VIDEO_CODES:
            youtube_id = code
        current_time = self._check_time_offset(payload.get('currentTime'), line)
        time_missing = current_time is None
    elif event_type == VIDEO_PAUSED:
        # A pause at the very beginning of a video may carry no currentTime,
        # so zero is used as the default.
        current_time = self._check_time_offset(payload.get('currentTime', 0), line)
        time_missing = current_time is None
    elif event_type == VIDEO_SEEK:
        current_time = self._check_time_offset(payload.get('new_time'), line)
        old_time = self._check_time_offset(payload.get('old_time'), line)
        time_missing = current_time is None or old_time is None
    elif event_type == VIDEO_STOPPED:
        current_time = self._check_time_offset(payload.get('currentTime'), line)
        time_missing = current_time is None
    else:
        # Other video event types are emitted without any time offsets.
        time_missing = False

    if time_missing:
        return

    if youtube_id is not None:
        youtube_id = youtube_id.encode('utf8')

    record_key = (
        username.encode('utf8'),
        course_id.encode('utf8'),
        encoded_module_id.encode('utf8'),
    )
    record_value = (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
    yield (record_key, record_value)
def mapper(self, line):
    """
    Extract video interaction events keyed by user id, course and video module.

    Events that lack a user_id, course_id, module id or a required time
    offset are discarded.  Per-discard counters are deliberately not
    incremented on most paths because doing so was found to be slow.

    Args:
        line: text line from a tracking event log.

    Yields:
        (user_id, course_id, encoded_module_id),
        (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
        where user_id is an int and the string members of the key, as well
        as youtube_id when present, are UTF-8 encoded.
    """
    # Add a filter here to permit quicker rejection of unrelated events.
    if VIDEO_EVENT_MINIMUM_STRING not in line:
        return

    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _date_string = value

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1)
        return

    if event_type not in VIDEO_EVENT_TYPES:
        return

    # This has already been checked when getting the event, so just fetch the value.
    timestamp = eventlog.get_event_time_string(event)

    # 'context' can be absent or explicitly null; dict.get() only applies its
    # default in the first case, so normalize both to an empty dict.
    context = event.get('context') or {}
    user_id = context.get('user_id')
    if not user_id:
        log.error("Video event without user_id in context: %s", event)
        return

    # Convert user_id to int if it arrived as a string.  A malformed value
    # must only discard this event, not abort the whole map task.
    if not isinstance(user_id, int):
        try:
            user_id = int(user_id)
        except (TypeError, ValueError):
            log.error("Video event with malformed user_id in context: %s", event)
            return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.warn('Video event without valid course_id: {0}'.format(line))
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # This should already have been logged.
        return

    # We have seen id values with a leading newline, hence the strip.
    encoded_module_id = event_data.get('id', '').strip()
    if not encoded_module_id:
        log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
        return

    video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION)
    if not video_duration:
        # Events may have a 'duration' value of null, so use the same default for those as well.
        video_duration = VIDEO_UNKNOWN_DURATION

    current_time = None
    old_time = None
    youtube_id = None
    if event_type == VIDEO_PLAYED:
        code = event_data.get('code')
        if code not in VIDEO_CODES:
            youtube_id = code
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            return
    elif event_type == VIDEO_PAUSED:
        # Pause events may have a missing currentTime value if video is paused at the beginning,
        # so provide a default of zero.
        current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
        if current_time is None:
            return
    elif event_type == VIDEO_SEEK:
        current_time = self._check_time_offset(event_data.get('new_time'), line)
        old_time = self._check_time_offset(event_data.get('old_time'), line)
        if current_time is None or old_time is None:
            return
    elif event_type == VIDEO_STOPPED:
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            return

    if youtube_id is not None:
        youtube_id = youtube_id.encode('utf8')

    yield (
        (user_id, course_id.encode('utf8'), encoded_module_id.encode('utf8')),
        (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
    )
def mapper(self, line):
    """
    Extract per-user engagement records (problems, videos, subsections, forum).

    A forum "voted" event produces two records: one for the voting user and
    one for the user who received the vote.

    Args:
        line: text line from a tracking event log.

    Yields:
        (date_grouping_key, course_id, username),
        (entity_id, event_type, JSON-encoded info dict, date_string)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    entity_id = ''
    info = {}
    vote_match = None
    if event_type == 'problem_check':
        # Only server-emitted problem checks are counted.
        if event.get('event_source') != 'server':
            return
        entity_id = payload.get('problem_id')
        if not entity_id:
            return
        answered_correctly = payload.get('success', 'incorrect').lower() == 'correct'
        if answered_correctly:
            info['correct'] = True
    elif event_type == 'play_video':
        entity_id = payload.get('id')
        if not entity_id:
            return
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        # The original path is preserved in info before the event type is
        # replaced by the generic subsection marker.
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER
    elif event_type.startswith('edx.forum'):
        vote_match = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
        if vote_match:
            vote_value = payload.get('vote_value')
            info['vote_value'] = vote_value
            if vote_value not in ['up', 'down']:
                return
            info['undo_vote'] = payload.get('undo_vote', False)

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Group by the end of the week the event falls in, where weeks end
        # on the weekday of the last complete day of the interval.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_parts = date_string.split('-')
        event_date = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
        days_until_end = (last_complete_date.isoweekday() - event_date.isoweekday()) % 7
        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # When gathering all data for a user, the last complete day of the
        # interval is used so the records can be joined with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    info_json = json.dumps(info)
    yield ((date_grouping_key, course_id, username), (entity_id, event_type, info_json, date_string))

    if vote_match:
        # Second record for a "voted" event: credited to the vote's target.
        target_username = payload.get('target_username')
        if not target_username:
            return
        received_event_type = 'edx.forum.{}.vote_received'.format(vote_match.group('post_type'))
        yield (
            (date_grouping_key, course_id, target_username),
            (entity_id, received_event_type, info_json, date_string)
        )
def mapper(self, line):
    """
    Extract video interaction events keyed by user, course and video module.

    Args:
        line: text line from a tracking event log.

    Yields:
        (username, course_id, encoded_module_id),
        (timestamp, event_type, current_time, old_time, youtube_id)
        where the key members and youtube_id (when present) are UTF-8 encoded.
    """
    # Cheap substring test first, so unrelated lines are rejected before
    # any parsing takes place.
    if VIDEO_EVENT_MINIMUM_STRING not in line:
        return

    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    if event_type not in VIDEO_EVENT_TYPES:
        return

    # The timestamp was already checked while the event was being fetched,
    # so it is simply read here.
    timestamp = eventlog.get_event_time_string(event)

    # Usernames are stripped because trailing newlines mess up Luigi.
    username = event.get('username', '').strip()
    if not username:
        log.error("Video event without username: %s", event)
        return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.warn('Video event without valid course_id: {0}'.format(line))
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        # This should already have been logged.
        return

    encoded_module_id = payload.get('id')
    if encoded_module_id is None:
        log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
        return

    current_time = None
    old_time = None
    youtube_id = None
    if event_type == VIDEO_PLAYED:
        code = payload.get('code')
        # Any code other than these two is kept as the youtube id.
        if code not in ('html5', 'mobile'):
            youtube_id = code
        current_time = self._check_time_offset(payload.get('currentTime'), line)
        time_missing = current_time is None
    elif event_type == VIDEO_PAUSED:
        # A pause at the very beginning of a video may carry no currentTime,
        # so zero is used as the default.
        current_time = self._check_time_offset(payload.get('currentTime', 0), line)
        time_missing = current_time is None
    elif event_type == VIDEO_SEEK:
        current_time = self._check_time_offset(payload.get('new_time'), line)
        old_time = self._check_time_offset(payload.get('old_time'), line)
        time_missing = current_time is None or old_time is None
    elif event_type == VIDEO_STOPPED:
        current_time = self._check_time_offset(payload.get('currentTime'), line)
        time_missing = current_time is None
    else:
        # Other video event types are emitted without any time offsets.
        time_missing = False

    if time_missing:
        return

    if youtube_id is not None:
        youtube_id = youtube_id.encode('utf8')

    record_key = (
        username.encode('utf8'),
        course_id.encode('utf8'),
        encoded_module_id.encode('utf8'),
    )
    record_value = (timestamp, event_type, current_time, old_time, youtube_id)
    yield (record_key, record_value)
def mapper(self, line):
    """
    Extract per-user engagement records (problems, videos, subsections).

    Args:
        line: text line from a tracking event log.

    Yields:
        (date_grouping_key, course_id, username),
        (entity_id, event_type, JSON-encoded info dict, date_string)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    entity_id = ''
    info = {}
    if event_type == 'problem_check':
        # Only server-emitted problem checks are counted.
        if event.get('event_source') != 'server':
            return
        entity_id = payload.get('problem_id')
        if not entity_id:
            return
        answered_correctly = payload.get('success', 'incorrect').lower() == 'correct'
        if answered_correctly:
            info['correct'] = True
    elif event_type == 'play_video':
        entity_id = payload.get('id')
        if not entity_id:
            return
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        # The original path is preserved in info before the event type is
        # replaced by the generic subsection marker.
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Group by the end of the week the event falls in, where weeks end
        # on the weekday of the last complete day of the interval.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_parts = date_string.split('-')
        event_date = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
        days_until_end = (last_complete_date.isoweekday() - event_date.isoweekday()) % 7
        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # When gathering all data for a user, the last complete day of the
        # interval is used so the records can be joined with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    record_key = (date_grouping_key, course_id, username)
    record_value = (entity_id, event_type, json.dumps(info), date_string)
    yield (record_key, record_value)
def mapper(self, line):
    """
    Extract per-user engagement records (problems, videos, subsections, forum).

    A forum "voted" event produces two records: one for the voting user and
    one for the user who received the vote.

    Args:
        line: text line from a tracking event log.

    Yields:
        (date_grouping_key, course_id, username),
        (entity_id, event_type, JSON-encoded info dict, date_string)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    entity_id = ''
    info = {}
    vote_match = None
    if event_type == 'problem_check':
        # Only server-emitted problem checks are counted.
        if event.get('event_source') != 'server':
            return
        entity_id = payload.get('problem_id')
        if not entity_id:
            return
        answered_correctly = payload.get('success', 'incorrect').lower() == 'correct'
        if answered_correctly:
            info['correct'] = True
    elif event_type == 'play_video':
        entity_id = payload.get('id')
        if not entity_id:
            return
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        # The original path is preserved in info before the event type is
        # replaced by the generic subsection marker.
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER
    elif event_type.startswith('edx.forum'):
        vote_match = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
        if vote_match:
            vote_value = payload.get('vote_value')
            info['vote_value'] = vote_value
            if vote_value not in ['up', 'down']:
                return
            info['undo_vote'] = payload.get('undo_vote', False)

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Group by the end of the week the event falls in, where weeks end
        # on the weekday of the last complete day of the interval.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_parts = date_string.split('-')
        event_date = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
        days_until_end = (last_complete_date.isoweekday() - event_date.isoweekday()) % 7
        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # When gathering all data for a user, the last complete day of the
        # interval is used so the records can be joined with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    info_json = json.dumps(info)
    yield ((date_grouping_key, course_id, username), (entity_id, event_type, info_json, date_string))

    if vote_match:
        # Second record for a "voted" event: credited to the vote's target.
        target_username = payload.get('target_username')
        if not target_username:
            return
        received_event_type = 'edx.forum.{}.vote_received'.format(vote_match.group('post_type'))
        yield (
            (date_grouping_key, course_id, target_username),
            (entity_id, received_event_type, info_json, date_string)
        )
def mapper(self, line):
    """
    Extract per-user engagement records (problems, videos, subsections).

    Args:
        line: text line from a tracking event log.

    Yields:
        (date_grouping_key, course_id, username),
        (entity_id, event_type, JSON-encoded info dict, date_string)
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    entity_id = ''
    info = {}
    if event_type == 'problem_check':
        # Only server-emitted problem checks are counted.
        if event.get('event_source') != 'server':
            return
        entity_id = payload.get('problem_id')
        if not entity_id:
            return
        answered_correctly = payload.get('success', 'incorrect').lower() == 'correct'
        if answered_correctly:
            info['correct'] = True
    elif event_type == 'play_video':
        entity_id = payload.get('id')
        if not entity_id:
            return
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        # The original path is preserved in info before the event type is
        # replaced by the generic subsection marker.
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Group by the end of the week the event falls in, where weeks end
        # on the weekday of the last complete day of the interval.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_parts = date_string.split('-')
        event_date = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
        days_until_end = (last_complete_date.isoweekday() - event_date.isoweekday()) % 7
        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # When gathering all data for a user, the last complete day of the
        # interval is used so the records can be joined with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    record_key = (date_grouping_key, course_id, username)
    record_value = (entity_id, event_type, json.dumps(info), date_string)
    yield (record_key, record_value)