def test_course_id_from_url_legacy(self):
    """A legacy (slash-separated) course id is parsed from a server event URL."""
    event = {
        'event_source': 'server',
        'context': {},
        'event_type': '/courses/edX/Open_DemoX/edx_demo_course/info'
    }
    # assertEquals is a deprecated alias of assertEqual; use the canonical name.
    self.assertEqual(eventlog.get_course_id(event, from_url=True), 'edX/Open_DemoX/edx_demo_course')
def test_course_id_from_server_url(self):
    """A new-style (course-v1) course id is parsed from a server event URL."""
    event = {
        'event_source': 'server',
        'context': {},
        'event_type': '/courses/course-v1:DemoX+DemoX+T1_2014/about'
    }
    # assertEquals is a deprecated alias of assertEqual; use the canonical name.
    self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
def mapper(self, line):
    """Emit (date, (user_id, course_id, date, label)) for every predicate label on the event."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    user_id = event.get('context', {}).get('user_id')
    if not user_id:
        self.incr_counter('UserActivity', 'Discard Missing User ID', 1)
        log.error("User-Activity: event without user_id in context: %s", event)
        return

    # Course user activity URLs have changed recently with the introduction of
    # micro-frontends (MFEs), so get_course_id() is asked to fall back to URL
    # parsing: it first looks for an explicit course ID in the event context
    # and, failing that, tries both the old-style URL pattern and the new-style
    # micro-frontend courseware URL pattern.
    course_id = eventlog.get_course_id(event, from_url=True)
    if not course_id:
        # Without a course_id the event cannot be attributed; drop it.
        self.incr_counter('UserActivity', 'Discard Missing Course ID', 1)
        return

    for label in self.get_predicate_labels(event):
        yield date_string, self._encode_tuple((str(user_id), course_id, date_string, label))
def test_course_id_from_browser_url(self):
    """A course id is parsed from the 'page' URL of a browser event."""
    event = {
        'event_source': 'browser',
        'context': {},
        'page': 'http://test.edx.org/courses/course-v1:DemoX+DemoX+T1_2014/courseware/interactive_demonstrations'
    }
    # assertEquals is a deprecated alias of assertEqual; use the canonical name.
    self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
def test_course_id_from_xblock_browser_url(self):
    """A course id is derived from an xblock usage-key URL in a browser event."""
    event = {
        'event_source': 'browser',
        'context': {},
        'page': 'https://courses.edx.org/xblock/block-v1:DemoX+DemoX+T1_2014+type@vertical+block@3848270?p1=0&p2=0'
    }
    # assertEquals is a deprecated alias of assertEqual; use the canonical name.
    self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
def mapper(self, line):
    """Key each exportable event line by (date, primary_org_id), utf8-encoded."""
    event, date_string = self.get_event_and_date_string(line) or (None, None)
    if event is None:
        return
    if not self.is_valid_input_file():
        return

    org_id = self.get_org_id(event)
    if org_id not in self.org_id_whitelist:
        log.debug('Unrecognized organization: org_id=%s', org_id or '')
        return

    # The course id does not depend on which org group we emit for, so parse it
    # once here instead of re-parsing the event URL on every loop iteration.
    course_id = eventlog.get_course_id(event, from_url=True)

    # Check to see if the org_id is one that should be grouped with other org_ids.
    org_ids = self.primary_org_ids_for_org_id[org_id]
    for key_org_id in org_ids:
        key = (date_string, key_org_id)

        # Include only requested courses.
        requested_courses = self.courses_for_org_id.get(key_org_id)
        if requested_courses and course_id not in requested_courses:
            continue

        # Enforce a standard encoding for the parts of the key. Without this a part of the key
        # might appear differently in the key string when it is coerced to a string by luigi. For example,
        # if the same org_id appears in two different records, one as a str() type and the other a
        # unicode() then without this change they would appear as u'FooX' and 'FooX' in the final key
        # string. Although python doesn't care about this difference, hadoop does, and will bucket the
        # values separately. Which is not what we want.
        yield tuple([value.encode('utf8') for value in key]), line.strip()
def mapper(self, line):
    """Key each exportable event line by (date, primary_org_id), utf8-encoded."""
    event, date_string = self.get_event_and_date_string(line) or (None, None)
    if event is None:
        return
    if not self.is_valid_input_file():
        return

    org_id = self.get_org_id(event)
    if org_id not in self.org_id_whitelist:
        log.debug('Unrecognized organization: org_id=%s', org_id or '')
        return

    # Hoisted out of the loop below: the course id is loop-invariant, so avoid
    # re-parsing the event URL for every grouped org_id.
    course_id = eventlog.get_course_id(event, from_url=True)

    # Check to see if the org_id is one that should be grouped with other org_ids.
    org_ids = self.primary_org_ids_for_org_id[org_id]
    for key_org_id in org_ids:
        key = (date_string, key_org_id)

        # Include only requested courses.
        requested_courses = self.courses_for_org_id.get(key_org_id)
        if requested_courses and course_id not in requested_courses:
            continue

        # Enforce a standard encoding for the parts of the key. Without this a part of the key
        # might appear differently in the key string when it is coerced to a string by luigi. For example,
        # if the same org_id appears in two different records, one as a str() type and the other a
        # unicode() then without this change they would appear as u'FooX' and 'FooX' in the final key
        # string. Although python doesn't care about this difference, hadoop does, and will bucket the
        # values separately. Which is not what we want.
        yield tuple([value.encode('utf8') for value in key]), line.strip()
def test_course_id_from_invalid_xblock_browser_url(self):
    """An xblock URL whose usage key is malformed yields no course id."""
    event = dict(
        event_source='browser',
        context={},
        page='https://courses.edx.org/xblock/block-v1:DemoX+DemoX+T1_2014?p1=0&p2=0',
    )
    self.assertIsNone(eventlog.get_course_id(event, from_url=True))
def mapper(self, line):
    """Emit ((course_id, date), is_external) for each explicit link_clicked event."""
    # Cheap substring filter first, so unrelated lines skip JSON parsing entirely.
    if LINK_CLICKED not in line:
        return
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    event_type = event.get('event_type')
    if not event_type:
        log.error("encountered event with no event_type: %s", event)
        return
    if event_type != LINK_CLICKED:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        log.error("encountered explicit link_clicked event with no event data: %s", event)
        return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit link_clicked event with invalid course_id: %s", event)
        return

    target_url = event_data.get('target_url')
    if not target_url:
        log.error("encountered explicit link_clicked event with no target_url: %s", event)
        return

    current_url = event_data.get('current_url')
    if not current_url:
        log.error("encountered explicit link_clicked event with no current_url: %s", event)
        return

    # A link is considered "internal" when it does not navigate away from the
    # current host.  Host-less targets (links starting with /) count as internal.
    current_host = urlparse(current_url).netloc
    target_host = urlparse(target_url).netloc
    is_external = 1 if (target_host != "" and target_host != current_host) else 0

    yield (course_id, date_string), is_external
def mapper(self, line):
    """Emit ((course_id, date), is_external) for each explicit link_clicked event."""
    # Reject unrelated lines before paying for JSON parsing.
    if LINK_CLICKED not in line:
        return
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, date_string = value

    event_type = event.get('event_type')
    if not event_type:
        log.error("encountered event with no event_type: %s", event)
        return
    if event_type != LINK_CLICKED:
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        log.error("encountered explicit link_clicked event with no event data: %s", event)
        return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit link_clicked event with invalid course_id: %s", event)
        return

    target_url = payload.get('target_url')
    if not target_url:
        log.error("encountered explicit link_clicked event with no target_url: %s", event)
        return

    current_url = payload.get('current_url')
    if not current_url:
        log.error("encountered explicit link_clicked event with no current_url: %s", event)
        return

    # A link is "internal" when it stays on the current host; some internal
    # links omit the host entirely (they start with /), so an empty target
    # host also counts as internal.
    target_host = urlparse(target_url).netloc
    is_external = int(target_host not in ("", urlparse(current_url).netloc))

    yield (course_id, date_string), is_external
def mapper(self, line):
    """Key each event line by utf8-encoded (date, course_id), filtered to requested courses."""
    event, date_string = self.get_event_and_date_string(line) or (None, None)
    if event is None:
        return

    course_id = eventlog.get_course_id(event, from_url=True)
    if course_id is None:
        return

    # Honor an optional course filter: skip events for courses not requested.
    if self.course_id and course_id not in self.course_id:
        return

    key = (date_string, course_id)
    yield tuple(part.encode('utf8') for part in key), line.strip()
def mapper(self, line):
    """Emit (date, (timestamp, ip_address, course_id, user_id)) for each usable event."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    user_id = event.get('context', {}).get('user_id')
    if not user_id:
        return
    try:
        user_id = int(user_id)
    except ValueError:
        self.incr_counter('User Location', 'Discard event with malformed user_id', 1)
        return

    # Use the full timestamp rather than just the date, so the latest ip
    # address wins for events on the same day.
    timestamp = eventlog.get_event_time_string(event)
    if not timestamp:
        return

    ip_address = event.get('ip')
    if not ip_address:
        log.warning("No ip_address found for user '%s' on '%s'.", user_id, timestamp)
        return

    # Get the course_id from context, if it happens to be present.
    # It's okay if it isn't.
    # (Not sure if there are particular types of course
    # interaction we care about, but we might want to only collect
    # the course_id off of explicit events, and ignore implicit
    # events as not being "real" interactions with course content.
    # Or maybe we add a flag indicating explicit vs. implicit, so
    # that this can be better teased apart.)
    course_id = eventlog.get_course_id(event)

    # Key on date alone: for multi-output we want one file per day, not one
    # file per (course, day).
    yield date_string, (timestamp, ip_address, course_id, user_id)
def mapper(self, line):
    """Emit (encoded (course_id, username, date, label), 1) per predicate label."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    for label in self.get_predicate_labels(event):
        yield self._encode_tuple((course_id, username, date_string, label)), 1
def mapper(self, line):
    """Key each event line by utf8-encoded (date, course_id), honoring the course filter."""
    event, date_string = self.get_event_and_date_string(line) or (None, None)
    if event is None:
        return

    course_id = eventlog.get_course_id(event, from_url=True)
    if course_id is None:
        return

    # Skip events for courses outside the requested set, when one was given.
    if self.course and course_id not in self.course:
        return

    key = (date_string, course_id)
    yield tuple(part.encode('utf8') for part in key), line.strip()
def mapper(self, line):
    """Emit (date, encoded (course_id, username, date, label)) per predicate label."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    for label in self.get_predicate_labels(event):
        yield date_string, self._encode_tuple((course_id, username, date_string, label))
def mapper(self, line):
    """Emit (date, encoded (user_id, course_id, date, label)) per predicate label."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    user_id = event.get('context', {}).get('user_id')
    if not user_id:
        self.incr_counter('UserActivity', 'Discard Missing User ID', 1)
        log.error("User-Activity: event without user_id in context: %s", event)
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    for label in self.get_predicate_labels(event):
        yield date_string, self._encode_tuple((str(user_id), course_id, date_string, label))
def mapper(self, line):
    """
    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _ = value

    if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server':
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    org_id = opaque_key_util.get_org_id_for_course(course_id)

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    problem_id = event_data.get('problem_id')
    if not problem_id:
        return

    is_correct = event_data.get('success') == 'correct'

    # Default the 'context' lookup too: event.get('context') would return None
    # for events without a context, and None.get(...) raises AttributeError.
    saved_tags = event.get('context', {}).get('asides', {}).get('tagging_aside', {}).get('saved_tags', {})

    yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
def mapper(self, line):
    """Emit (record_without_count, 1) for each user action extracted from the event."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = event.get('username', '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    event_source = event.get('event_source')
    entity_id, entity_type, user_actions = self.get_user_actions_from_event(event_data, event_source, event_type)
    if not entity_id or not entity_type:
        return

    # The date is the same for every action, so deserialize it once.
    record_date = DateField().deserialize_from_string(date_string)
    for action in user_actions:
        record = ModuleEngagementRecord(
            course_id=course_id,
            username=username,
            date=record_date,
            entity_type=entity_type,
            entity_id=entity_id,
            event=action,
            count=0,
        )
        # The count is populated by the reducer, so exclude it from the key.
        yield (record.to_string_tuple()[:-1], 1)
def get_event_row_from_document(self, document):
    """
    Return a list of encoded (course_id, username, date, label) rows for the event
    in `document`, or None when the document is unusable (unparseable, missing
    username, or missing course_id).
    """
    event_and_date_string = self.get_event_and_date_string(document)
    if not event_and_date_string:
        return
    event, date_string = event_and_date_string

    username = event.get('username', '').strip()
    if not username:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    # Build the rows with a comprehension rather than a manual append loop.
    return [
        self._encode_tuple((course_id, username, date_string, label))
        for label in self.get_predicate_labels(event)
    ]
def mapper(self, line):
    """Emit (date, (timestamp, ip_address, course_id, username)) for each usable event."""
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    username = eventlog.get_event_username(event)
    if not username:
        return

    # Use the full timestamp rather than just the date, so the latest ip
    # address wins for events on the same day.
    timestamp = eventlog.get_event_time_string(event)
    if not timestamp:
        return

    ip_address = event.get('ip')
    if not ip_address:
        log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
        return

    # Get the course_id from context, if it happens to be present.
    # It's okay if it isn't.
    # (Not sure if there are particular types of course
    # interaction we care about, but we might want to only collect
    # the course_id off of explicit events, and ignore implicit
    # events as not being "real" interactions with course content.
    # Or maybe we add a flag indicating explicit vs. implicit, so
    # that this can be better teased apart.)
    course_id = eventlog.get_course_id(event)

    # Key on date alone: for multi-output we want one file per day, not one
    # file per (course, day).
    yield date_string, (timestamp, ip_address, course_id, username)
def mapper(self, line):
    """
    Args:
        line: text line from a tracking event log.

    Yields:
        (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _ = value

    if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server':
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    org_id = opaque_key_util.get_org_id_for_course(course_id)

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    problem_id = event_data.get('problem_id')
    if not problem_id:
        return

    is_correct = event_data.get('success') == 'correct'

    # Default the 'context' lookup too: event.get('context') would return None
    # for events without a context, and None.get(...) raises AttributeError.
    saved_tags = event.get('context', {}).get('asides', {}).get('tagging_aside', {}).get('saved_tags', {})

    yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
def mapper(self, line):
    """Key each exportable event line by (date, primary_org_id), honoring the '_export' opt-out flag."""
    event, date_string = self.get_event_and_date_string(line) or (None, None)
    if event is None:
        return
    if not self.is_valid_input_file():
        return

    org_id = self.get_org_id(event)
    if org_id not in self.org_id_whitelist:
        log.debug('Unrecognized organization: org_id=%s', org_id or '')
        return

    # Do not export events that have been explicitly flagged as not being for export.
    # Any event without an '_export' key will be sent as part of the export by default,
    # and likewise any event without a falsey value.  The preferred value to not export is 'false'.
    event_data = eventlog.get_event_data(event)
    if event_data and str(event_data.get('_export', 'true')).lower() in ('n', 'f', '0', 'false', 'no'):
        return

    # The course id does not depend on which org group we emit for, so parse it
    # once here instead of re-parsing the event URL on every loop iteration.
    course_id = eventlog.get_course_id(event, from_url=True)

    # Check to see if the org_id is one that should be grouped with other org_ids.
    org_ids = self.primary_org_ids_for_org_id[org_id]
    for key_org_id in org_ids:
        key = (date_string, key_org_id)

        # Include only requested courses.
        requested_courses = self.courses_for_org_id.get(key_org_id)
        if requested_courses and course_id not in requested_courses:
            continue

        # Enforce a standard encoding for the parts of the key. Without this a part of the key
        # might appear differently in the key string when it is coerced to a string by luigi. For example,
        # if the same org_id appears in two different records, one as a str() type and the other a
        # unicode() then without this change they would appear as u'FooX' and 'FooX' in the final key
        # string. Although python doesn't care about this difference, hadoop does, and will bucket the
        # values separately. Which is not what we want.
        yield tuple([value.encode('utf8') for value in key]), line.strip()
def obfuscate_event_entry(self, line):
    """
    Obfuscate a single tracking-log event line.

    Returns the obfuscated event re-serialized as JSON, the original line
    unchanged when it cannot be parsed or attributed to a course, or None
    when the event payload contains 'POST' data and skip_post is configured.
    """
    event = eventlog.parse_json_event(line)
    if event is None:
        # Unexpected here...
        log.error(u"Encountered event entry which failed to parse: %r", line)
        return line
    course_id = eventlog.get_course_id(event, from_url=True)
    if course_id is None:
        # Unexpected here...
        log.error(u"Encountered event entry with no course_id: %r", line)
        return line

    # We cannot use this method as-is, since we need to know what was done to the event, so
    # that it can be transformed back to its original form once cleaned.
    # NOT event_data = eventlog.get_event_data(event)
    event_json_decoded = False
    event_data = event.get('event')
    if event_data is None:
        log.error(u"Encountered event entry with no 'event' payload: %r", line)
    # NOTE(review): a missing payload is logged but not returned early, so the
    # code below still runs with event_data = None — confirm this is intended.
    if event_data == '':
        # Note that this happens with some browser events.  Instead of
        # failing to parse it as a JSON string, just leave as-is.
        pass
    elif isinstance(event_data, basestring):
        # Cjson produces str, while json produces unicode.  Hmm.
        if len(event_data) == 512 and 'POST' in event_data:
            # It's a truncated JSON string.  But we're going to throw it out anyway, so no worries.
            pass
        elif '{' not in event_data and '=' in event_data:
            # It's a key-value pair from a browser event.  Just process as-is, rather than parsing and reassembling.
            pass
        else:
            try:
                event_data = eventlog.decode_json(event_data)
                event_json_decoded = True
            except Exception:
                log.error(u"Encountered event entry with unparseable 'event' payload: %r", line)

    # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
    # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
    # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
    # to get to strings that can be properly interpreted.
    event_user_info = self.get_userinfo_from_event(event, event_data)

    # Drop events carrying raw POST bodies entirely when configured to do so.
    if 'POST' in event_data:
        if self.parameters['skip_post']:
            return None

    updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", event_user_info)
    if updated_event_data is not None:
        event_source = event.get('event_source')
        event_type = event.get('event_type')
        log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

        if event_json_decoded:
            # The payload arrived as a JSON string, so re-serialize it to keep
            # the original representation.
            # TODO: should really use cjson, if that were originally used for decoding the json.
            updated_event_data = json.dumps(updated_event_data)
        event['event'] = updated_event_data

    # TODO: should really use cjson, if that were originally used for decoding the json.
    return json.dumps(event)
def get_problem_check_event(line_or_event):
    """
    Generates output values for explicit problem_check events.

    Args:
        line_or_event: pre-parsed event dict, or text line from a tracking event log

    Returns:
        (problem_id, username), (timestamp, problem_check_info)

        where timestamp is in ISO format, with resolution to the millisecond
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.

        or None if there is no valid problem_check event on the line.

    Example:
        (i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username),
        (2013-09-10T00:01:05.123456, blah)
    """
    # Ensure the given event dict is a problem_check event.
    if isinstance(line_or_event, dict):
        event = line_or_event
        if event.get('event_type') != 'problem_check':
            return None
    # Parse the line into an event dict, if not provided.
    else:
        event = eventlog.parse_json_server_event(line_or_event, 'problem_check')
        if event is None:
            return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # above), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # NOTE(review): from here on, 'event' is rebound to the inner payload dict,
    # shadowing the full event — subsequent log messages print the payload only.
    event = event.get('event', {})
    answers = event.get('answers', {})
    if len(answers) == 0:
        return None

    try:
        _check_answer_ids(answers)
        _check_answer_ids(event.get('submission', {}))
    except (TypeError, ValueError):
        log.error("encountered explicit problem_check event with invalid answers: %s", event)
        return None

    problem_data_json = json.dumps(problem_data)
    # Key by (course, problem, user); value carries the timestamp for ordering
    # plus the serialized problem data.
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value
def _generate_answers(self, event_string, attempt_category):
    """
    Generates a list of answers given a problem_check event.

    Args:
        event_string: a json-encoded string version of an event's data.
        attempt_category: a string that is 'first' for a user's first
            response to a question, 'last' otherwise

    Returns:
        list of answer data tuples.  See docstring for reducer() for more details.
    """
    event = json.loads(event_string)

    # Get context information:
    course_id = eventlog.get_course_id(event)
    timestamp = event.get('timestamp')
    problem_id = event.get('problem_id')
    grade = event.get('grade')
    max_grade = event.get('max_grade')
    # NOTE(review): assumes 'context' is always present on these events —
    # event.get('context') would return None otherwise and the chained .get()
    # would raise.  Confirm against the events reaching this reducer.
    problem_display_name = event.get('context').get('module', {}).get('display_name', None)
    result = []

    def append_submission(answer_id, submission):
        """Convert submission to result to be returned."""
        # First augment submission with problem-level information
        # not found in the submission:
        submission['problem_id'] = problem_id
        submission['problem_display_name'] = problem_display_name
        submission['attempt_category'] = attempt_category
        submission['grade'] = grade
        submission['max_grade'] = max_grade

        # Add the timestamp so that all responses can be sorted in order.
        # We want to use the "latest" values for some fields.
        output_key = (course_id, answer_id)
        output_value = (timestamp, json.dumps(submission))
        result.append((output_key, output_value))

    answers = event.get('answers')
    correct_map = event.get('correct_map', {})
    if 'submission' in event:
        submissions = event.get('submission')
        for answer_id in submissions:
            if not self.is_hidden_answer(answer_id):
                submission = submissions.get(answer_id)
                # But submission doesn't contain moniker value for answer.
                # So we check the raw answers, and see if its value is
                # different.  If so, we assume it's a moniker.
                answer_value = answers[answer_id]
                if answer_value != submission.get('answer'):
                    submission['answer_value_id'] = answer_value

                submission['answer_correct_map'] = correct_map.get(answer_id)
                append_submission(answer_id, submission)
    else:
        # Otherwise, it's an older event with no 'submission'
        # information, so parse it as well as possible.
        for answer_id in answers:
            if not self.is_hidden_answer(answer_id):
                answer_value = answers[answer_id]

                # Argh. It seems that sometimes we're encountering
                # bogus answer_id values.  In particular, one that
                # is including the possible choice values, instead
                # of any actual values selected by the student.
                # For now, let's just dump an error and skip it,
                # so that it becomes the equivalent of a hidden
                # answer.

                # TODO: Eventually treat it explicitly as a hidden
                # answer.
                if answer_id not in correct_map:
                    log.error("Unexpected answer_id %s not in correct_map: %s", answer_id, event)
                    continue
                correctness = correct_map[answer_id].get('correctness') == 'correct'

                variant = event.get('state', {}).get('seed')

                # We do not know the values for 'input_type',
                # 'response_type', or 'question'.  We also don't know if
                # answer_value should be identified as 'answer_value_id' or
                # 'answer', so we choose to use 'answer_value_id' here and
                # never define 'answer'.  This allows disambiguation from
                # events with a submission field, which will always have
                # an 'answer' and only sometimes have an 'answer_value_id'.
                submission = {
                    'answer_value_id': answer_value,
                    'correct': correctness,
                    'variant': variant,
                    'answer_correct_map': correct_map.get(answer_id),
                }
                append_submission(answer_id, submission)

    return result
def test_missing_context(self):
    """An event without a 'context' key yields no course id."""
    event = dict(event_source='server')
    self.assertIsNone(eventlog.get_course_id(event))
def mapper(self, line):
    """
    Emit ((user_id, course_id, encoded_module_id),
          (timestamp, event_type, current_time, old_time, youtube_id, video_duration))
    for each valid video interaction event (play/pause/seek/stop).
    """
    # Add a filter here to permit quicker rejection of unrelated events.
    if VIDEO_EVENT_MINIMUM_STRING not in line:
        # self.incr_counter(self.counter_category_name, 'Discard Missing Video String', 1)
        return

    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _date_string = value
    # self.incr_counter(self.counter_category_name, 'Inputs with Dates', 1)

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1)
        return

    if event_type not in VIDEO_EVENT_TYPES:
        # self.incr_counter(self.counter_category_name, 'Discard Non-Video Event Type', 1)
        return
    # self.incr_counter(self.counter_category_name, 'Input Video Events', 1)

    # This has already been checked when getting the event, so just fetch the value.
    timestamp = eventlog.get_event_time_string(event)

    user_id = event.get('context', {}).get('user_id')
    if not user_id:
        log.error("Video event without user_id in context: %s", event)
        return

    # Convert user_id to int if str.
    # NOTE(review): a non-numeric string user_id would raise ValueError here
    # rather than being discarded — confirm upstream guarantees numeric ids.
    if not isinstance(user_id, int):
        user_id = int(user_id)

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.warn('Video event without valid course_id: {0}'.format(line))
        # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing course_id', 1)
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # This should already have been logged.
        # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Event Data', 1)
        return

    encoded_module_id = event_data.get('id', '').strip()  # we have seen id values with leading newline
    if not encoded_module_id:
        log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
        # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing encoded_module_id', 1)
        return

    video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION)
    if not video_duration:
        # events may have a 'duration' value of null, so use the same default for those as well.
        video_duration = VIDEO_UNKNOWN_DURATION
    # self.incr_counter(self.counter_category_name, 'Video Events Before Time Check', 1)

    # Per-event-type extraction of the playback position(s); any event whose
    # time offset fails validation is discarded.
    current_time = None
    old_time = None
    youtube_id = None
    if event_type == VIDEO_PLAYED:
        code = event_data.get('code')
        # Codes outside VIDEO_CODES are treated as youtube video ids.
        if code not in VIDEO_CODES:
            youtube_id = code
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Play', 1)
            return
        # Slow: self.incr_counter(self.counter_category_name, 'Subset Play', 1)
    elif event_type == VIDEO_PAUSED:
        # Pause events may have a missing currentTime value if video is paused at the beginning,
        # so provide a default of zero.
        current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
        if current_time is None:
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Pause', 1)
            return
        # Slow: self.incr_counter(self.counter_category_name, 'Subset Pause', 1)
    elif event_type == VIDEO_SEEK:
        current_time = self._check_time_offset(event_data.get('new_time'), line)
        old_time = self._check_time_offset(event_data.get('old_time'), line)
        if current_time is None or old_time is None:
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Seek', 1)
            return
        # Slow: self.incr_counter(self.counter_category_name, 'Subset Seek', 1)
    elif event_type == VIDEO_STOPPED:
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Stop', 1)
            return
        # Slow: self.incr_counter(self.counter_category_name, 'Subset Stop', 1)

    if youtube_id is not None:
        youtube_id = youtube_id.encode('utf8')
    # self.incr_counter(self.counter_category_name, 'Output Video Events from Mapper', 1)

    yield (
        (user_id, course_id.encode('utf8'), encoded_module_id.encode('utf8')),
        (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
    )
def mapper(self, line):
    """
    Emit one record per qualifying event, keyed by (date bucket, course_id, username).

    Recognizes problem_check (server-side), play_video, subsection-view URL hits,
    and edx.forum voting events.  Forum "voted" events additionally emit a second
    record attributing a "vote_received" event to the target user.

    Yields:
        ((date_grouping_key, course_id, username),
         (entity_id, event_type, json-encoded info, date_string))
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, date_string = value

    # Use `or ''` so an event carrying an explicit null username does not
    # raise AttributeError on .strip() (dict.get only applies the default
    # when the key is absent, not when its value is None).
    username = (event.get('username') or '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    event_source = event.get('event_source')

    entity_id = ''
    info = {}
    forum_post_voted = None
    if event_type == 'problem_check':
        # Only server-emitted problem_check events carry grading results.
        if event_source != 'server':
            return
        problem_id = event_data.get('problem_id')
        if not problem_id:
            return
        entity_id = problem_id
        if event_data.get('success', 'incorrect').lower() == 'correct':
            info['correct'] = True
    elif event_type == 'play_video':
        encoded_module_id = event_data.get('id')
        if not encoded_module_id:
            return
        entity_id = encoded_module_id
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER
    elif event_type.startswith('edx.forum'):
        forum_post_voted = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
        if forum_post_voted:
            info['vote_value'] = event_data.get('vote_value')
            if info['vote_value'] not in ['up', 'down']:
                return
            info['undo_vote'] = event_data.get('undo_vote', False)

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Bucket each event into the week ending on the interval's last complete weekday.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        last_weekday = last_complete_date.isoweekday()

        split_date = date_string.split('-')
        event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2]))
        event_weekday = event_date.isoweekday()

        days_until_end = last_weekday - event_weekday
        if days_until_end < 0:
            days_until_end += 7

        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # If gathering all data for a given user, use the last complete day of the interval
        # for joining with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))

    if forum_post_voted:
        # We emit two events for each "voted" event - one for the voting user and one for the
        # user receiving the vote.
        username = event_data.get('target_username')
        if not username:
            return
        event_type = 'edx.forum.{}.vote_received'.format(forum_post_voted.group('post_type'))
        yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def mapper(self, line):
    """
    Emit one record per qualifying event, keyed by (date bucket, course_id, username).

    Recognizes problem_check (server-side), play_video, and subsection-view URL hits.

    Yields:
        ((date_grouping_key, course_id, username),
         (entity_id, event_type, json-encoded info, date_string))
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, date_string = value

    # Use `or ''` so an event carrying an explicit null username does not
    # raise AttributeError on .strip() (dict.get only applies the default
    # when the key is absent, not when its value is None).
    username = (event.get('username') or '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    event_source = event.get('event_source')

    entity_id = ''
    info = {}
    if event_type == 'problem_check':
        # Only server-emitted problem_check events carry grading results.
        if event_source != 'server':
            return
        problem_id = event_data.get('problem_id')
        if not problem_id:
            return
        entity_id = problem_id
        if event_data.get('success', 'incorrect').lower() == 'correct':
            info['correct'] = True
    elif event_type == 'play_video':
        encoded_module_id = event_data.get('id')
        if not encoded_module_id:
            return
        entity_id = encoded_module_id
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Bucket each event into the week ending on the interval's last complete weekday.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        last_weekday = last_complete_date.isoweekday()

        split_date = date_string.split('-')
        event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2]))
        event_weekday = event_date.isoweekday()

        days_until_end = last_weekday - event_weekday
        if days_until_end < 0:
            days_until_end += 7

        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # If gathering all data for a given user, use the last complete day of the interval
        # for joining with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def obfuscate_event_entry(self, line):
    """
    Obfuscate a single tracking-log event line, returning the cleaned JSON string.

    Lines that cannot be parsed, lack a course_id, or lack an 'event' payload
    are returned unchanged.  Returns None for truncated POST payloads when
    configured to skip them.
    """
    event = eventlog.parse_json_event(line)
    if event is None:
        # Unexpected here...
        log.error(u"Encountered event entry which failed to parse: %r", line)
        return line
    course_id = eventlog.get_course_id(event, from_url=True)
    if course_id is None:
        # Unexpected here...
        log.error(u"Encountered event entry with no course_id: %r", line)
        return line

    # We cannot use this method as-is, since we need to know what was done to the event, so
    # that it can be transformed back to its original form once cleaned.
    # NOT event_data = eventlog.get_event_data(event)
    event_json_decoded = False
    event_data = event.get('event')

    if event_data is None:
        log.error(u"Encountered event entry with no 'event' payload: %r", line)
        # Bail out like the other unprocessable-line paths: continuing would
        # raise TypeError at the `'POST' in event_data` membership test below.
        return line

    if event_data == '':
        # Note that this happens with some browser events. Instead of
        # failing to parse it as a JSON string, just leave as-is.
        pass
    elif isinstance(event_data, basestring):
        # Cjson produces str, while json produces unicode. Hmm.
        if len(event_data) == 512 and 'POST' in event_data:
            # It's a truncated JSON string. But we're going to throw it out anyway, so no worries.
            pass
        elif '{' not in event_data and '=' in event_data:
            # It's a key-value pair from a browser event. Just process as-is, rather than parsing and reassembling.
            pass
        else:
            try:
                event_data = eventlog.decode_json(event_data)
                event_json_decoded = True
            except Exception:
                log.error(u"Encountered event entry with unparseable 'event' payload: %r", line)

    # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
    # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
    # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
    # to get to strings that can be properly interpreted.
    event_user_info = self.get_userinfo_from_event(event, event_data)

    if 'POST' in event_data:
        if self.parameters['skip_post']:
            return None

    updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", event_user_info)
    if updated_event_data is not None:
        event_source = event.get('event_source')
        event_type = event.get('event_type')
        log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

        if event_json_decoded:
            # TODO: should really use cjson, if that were originally used for decoding the json.
            updated_event_data = json.dumps(updated_event_data)

        event['event'] = updated_event_data

    # TODO: should really use cjson, if that were originally used for decoding the json.
    return json.dumps(event)
def mapper(self, line):
    """
    Parse a tracking-log line and emit video interaction events.

    Yields ((username, course_id, encoded_module_id),
            (timestamp, event_type, current_time, old_time, youtube_id))
    for play/pause/seek/stop video events, discarding malformed events.
    """
    # Add a filter here to permit quicker rejection of unrelated events.
    if VIDEO_EVENT_MINIMUM_STRING not in line:
        return

    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _date_string = value

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    if event_type not in VIDEO_EVENT_TYPES:
        return

    # This has already been checked when getting the event, so just fetch the value.
    timestamp = eventlog.get_event_time_string(event)

    # Strip username to remove trailing newlines that mess up Luigi.
    # Use `or ''` so an explicit null username does not raise AttributeError.
    username = (event.get('username') or '').strip()
    if not username:
        log.error("Video event without username: %s", event)
        return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.warn('Video event without valid course_id: {0}'.format(line))
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # This should already have been logged.
        return

    # Strip the id to match the sibling video mapper: we have seen id values
    # with leading newlines; `or ''` also guards against an explicit null id.
    encoded_module_id = (event_data.get('id') or '').strip()
    if not encoded_module_id:
        log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
        return

    current_time = None
    old_time = None
    youtube_id = None
    if event_type == VIDEO_PLAYED:
        code = event_data.get('code')
        if code not in ('html5', 'mobile'):
            youtube_id = code
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            return
    elif event_type == VIDEO_PAUSED:
        # Pause events may have a missing currentTime value if video is paused at the beginning,
        # so provide a default of zero.
        current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
        if current_time is None:
            return
    elif event_type == VIDEO_SEEK:
        current_time = self._check_time_offset(event_data.get('new_time'), line)
        old_time = self._check_time_offset(event_data.get('old_time'), line)
        if current_time is None or old_time is None:
            return
    elif event_type == VIDEO_STOPPED:
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            return

    if youtube_id is not None:
        youtube_id = youtube_id.encode('utf8')

    yield (
        (username.encode('utf8'), course_id.encode('utf8'), encoded_module_id.encode('utf8')),
        (timestamp, event_type, current_time, old_time, youtube_id)
    )
def mapper(self, line):
    """
    Emit one record per qualifying event, keyed by (date bucket, course_id, username).

    Recognizes problem_check (server-side), play_video, and subsection-view URL hits.

    Yields:
        ((date_grouping_key, course_id, username),
         (entity_id, event_type, json-encoded info, date_string))
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, date_string = value

    # Use `or ''` so an event carrying an explicit null username does not
    # raise AttributeError on .strip() (dict.get only applies the default
    # when the key is absent, not when its value is None).
    username = (event.get('username') or '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    event_source = event.get('event_source')

    entity_id = ''
    info = {}
    if event_type == 'problem_check':
        # Only server-emitted problem_check events carry grading results.
        if event_source != 'server':
            return
        problem_id = event_data.get('problem_id')
        if not problem_id:
            return
        entity_id = problem_id
        if event_data.get('success', 'incorrect').lower() == 'correct':
            info['correct'] = True
    elif event_type == 'play_video':
        encoded_module_id = event_data.get('id')
        if not encoded_module_id:
            return
        entity_id = encoded_module_id
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Bucket each event into the week ending on the interval's last complete weekday.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        last_weekday = last_complete_date.isoweekday()

        split_date = date_string.split('-')
        event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2]))
        event_weekday = event_date.isoweekday()

        days_until_end = last_weekday - event_weekday
        if days_until_end < 0:
            days_until_end += 7

        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # If gathering all data for a given user, use the last complete day of the interval
        # for joining with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
def mapper(self, line):
    """
    Parse a tracking-log line and emit video interaction events with duration.

    Yields ((username, course_id, encoded_module_id),
            (timestamp, event_type, current_time, old_time, youtube_id, video_duration))
    for play/pause/seek/stop video events, discarding malformed events.
    Commented-out incr_counter calls were disabled for performance but are
    retained to document the counters they fed.
    """
    # Add a filter here to permit quicker rejection of unrelated events.
    if VIDEO_EVENT_MINIMUM_STRING not in line:
        # self.incr_counter(self.counter_category_name, 'Discard Missing Video String', 1)
        return

    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, _date_string = value
    # self.incr_counter(self.counter_category_name, 'Inputs with Dates', 1)

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1)
        return

    if event_type not in VIDEO_EVENT_TYPES:
        # self.incr_counter(self.counter_category_name, 'Discard Non-Video Event Type', 1)
        return
    # self.incr_counter(self.counter_category_name, 'Input Video Events', 1)

    # This has already been checked when getting the event, so just fetch the value.
    timestamp = eventlog.get_event_time_string(event)

    # Strip username to remove trailing newlines that mess up Luigi.
    # Use `or ''` so an explicit null username does not raise AttributeError.
    username = (event.get('username') or '').strip()
    if not username:
        log.error("Video event without username: %s", event)
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing username', 1)
        return

    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.warn('Video event without valid course_id: {0}'.format(line))
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing course_id', 1)
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # This should already have been logged.
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Event Data', 1)
        return

    # Use `or ''` to guard an explicit null id (dict.get only applies the
    # default when the key is absent); strip because we have seen id values
    # with leading newlines.
    encoded_module_id = (event_data.get('id') or '').strip()
    if not encoded_module_id:
        log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
        ## self.incr_counter(self.counter_category_name, 'Discard Video Missing encoded_module_id', 1)
        return

    video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION)
    if not video_duration:
        # events may have a 'duration' value of null, so use the same default for those as well.
        video_duration = VIDEO_UNKNOWN_DURATION

    # self.incr_counter(self.counter_category_name, 'Video Events Before Time Check', 1)

    current_time = None
    old_time = None
    youtube_id = None
    if event_type == VIDEO_PLAYED:
        code = event_data.get('code')
        if code not in VIDEO_CODES:
            youtube_id = code
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Play', 1)
            return
        ### self.incr_counter(self.counter_category_name, 'Subset Play', 1)
    elif event_type == VIDEO_PAUSED:
        # Pause events may have a missing currentTime value if video is paused at the beginning,
        # so provide a default of zero.
        current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
        if current_time is None:
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Pause', 1)
            return
        ### self.incr_counter(self.counter_category_name, 'Subset Pause', 1)
    elif event_type == VIDEO_SEEK:
        current_time = self._check_time_offset(event_data.get('new_time'), line)
        old_time = self._check_time_offset(event_data.get('old_time'), line)
        if current_time is None or old_time is None:
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Seek', 1)
            return
        ### self.incr_counter(self.counter_category_name, 'Subset Seek', 1)
    elif event_type == VIDEO_STOPPED:
        current_time = self._check_time_offset(event_data.get('currentTime'), line)
        if current_time is None:
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Stop', 1)
            return
        ### self.incr_counter(self.counter_category_name, 'Subset Stop', 1)

    if youtube_id is not None:
        youtube_id = youtube_id.encode('utf8')

    # self.incr_counter(self.counter_category_name, 'Output Video Events from Mapper', 1)
    yield (
        (username.encode('utf8'), course_id.encode('utf8'), encoded_module_id.encode('utf8')),
        (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
    )
def test_missing_context(self):
    # An event with no 'context' key at all should produce no course_id.
    self.assertIsNone(eventlog.get_course_id({'event_source': 'server'}))
def mapper(self, line):
    """
    Emit one record per qualifying event, keyed by (date bucket, course_id, username).

    Recognizes problem_check (server-side), play_video, subsection-view URL hits,
    and edx.forum voting events.  Forum "voted" events additionally emit a second
    record attributing a "vote_received" event to the target user.

    Yields:
        ((date_grouping_key, course_id, username),
         (entity_id, event_type, json-encoded info, date_string))
    """
    value = self.get_event_and_date_string(line)
    if value is None:
        return
    event, date_string = value

    # Use `or ''` so an event carrying an explicit null username does not
    # raise AttributeError on .strip() (dict.get only applies the default
    # when the key is absent, not when its value is None).
    username = (event.get('username') or '').strip()
    if not username:
        return

    event_type = event.get('event_type')
    if event_type is None:
        return

    course_id = eventlog.get_course_id(event)
    if not course_id:
        return

    event_data = eventlog.get_event_data(event)
    if event_data is None:
        return

    event_source = event.get('event_source')

    entity_id = ''
    info = {}
    forum_post_voted = None
    if event_type == 'problem_check':
        # Only server-emitted problem_check events carry grading results.
        if event_source != 'server':
            return
        problem_id = event_data.get('problem_id')
        if not problem_id:
            return
        entity_id = problem_id
        if event_data.get('success', 'incorrect').lower() == 'correct':
            info['correct'] = True
    elif event_type == 'play_video':
        encoded_module_id = event_data.get('id')
        if not encoded_module_id:
            return
        entity_id = encoded_module_id
    elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return
        info['path'] = event_type
        info['timestamp'] = timestamp
        event_type = SUBSECTION_VIEWED_MARKER
    elif event_type.startswith('edx.forum'):
        forum_post_voted = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
        if forum_post_voted:
            info['vote_value'] = event_data.get('vote_value')
            if info['vote_value'] not in ['up', 'down']:
                return
            info['undo_vote'] = event_data.get('undo_vote', False)

    date_grouping_key = date_string
    if self.interval_type == 'weekly':
        # Bucket each event into the week ending on the interval's last complete weekday.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        last_weekday = last_complete_date.isoweekday()

        split_date = date_string.split('-')
        event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2]))
        event_weekday = event_date.isoweekday()

        days_until_end = last_weekday - event_weekday
        if days_until_end < 0:
            days_until_end += 7

        end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
        date_grouping_key = end_of_week_date.isoformat()
    elif self.interval_type == 'all':
        # If gathering all data for a given user, use the last complete day of the interval
        # for joining with enrollment.
        last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
        date_grouping_key = last_complete_date.isoformat()

    yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))

    if forum_post_voted:
        # We emit two events for each "voted" event - one for the voting user and one for the
        # user receiving the vote.
        username = event_data.get('target_username')
        if not username:
            return
        event_type = 'edx.forum.{}.vote_received'.format(forum_post_voted.group('post_type'))
        yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))