def test_course_id_from_url_legacy(self):
     # This server event has an empty context, so the legacy slash-separated
     # course ID has to be recovered from the URL held in 'event_type'.
     event = {
         'event_source': 'server',
         'context': {},
         'event_type': '/courses/edX/Open_DemoX/edx_demo_course/info'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'edX/Open_DemoX/edx_demo_course')
 def test_course_id_from_server_url(self):
     # This server event has an empty context, so the course-v1 key has to be
     # recovered from the URL held in 'event_type'.
     event = {
         'event_source': 'server',
         'context': {},
         'event_type': '/courses/course-v1:DemoX+DemoX+T1_2014/about'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
    def mapper(self, line):
        """
        Emit one user-activity record per predicate label for a tracking-log line.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, encoded) pairs, where `encoded` is the result of
            self._encode_tuple() applied to (user_id, course_id, date_string, label).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        context = event.get('context', {})
        user_id = context.get('user_id')
        if not user_id:
            self.incr_counter('UserActivity', 'Discard Missing User ID', 1)
            log.error("User-Activity: event without user_id in context: %s", event)
            return

        # Course user activity URLs changed with the introduction of micro-frontends (MFEs).
        # To keep the number of processed events as close as possible to what it was before,
        # get_course_id() is asked (from_url=True) to fall back to the event URL when the
        # context holds no explicit course ID; it tries both the old-style URL pattern and
        # the new-style micro-frontend courseware URL pattern.
        course_id = eventlog.get_course_id(event, from_url=True)
        if not course_id:
            # Without a course ID the event cannot be attributed, so it is counted and dropped.
            self.incr_counter('UserActivity', 'Discard Missing Course ID', 1)
            return

        for label in self.get_predicate_labels(event):
            record = (str(user_id), course_id, date_string, label)
            yield date_string, self._encode_tuple(record)
 def test_course_id_from_browser_url(self):
     # This browser event has an empty context, so the course-v1 key has to be
     # recovered from the URL held in 'page'.
     event = {
         'event_source': 'browser',
         'context': {},
         'page': 'http://test.edx.org/courses/course-v1:DemoX+DemoX+T1_2014/courseware/interactive_demonstrations'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
 def test_course_id_from_xblock_browser_url(self):
     # This browser event points at an /xblock/ URL carrying a full block-v1 usage key;
     # the expected course key is the course-v1 form of the same org+course+run.
     event = {
         'event_source': 'browser',
         'context': {},
         'page': 'https://courses.edx.org/xblock/block-v1:DemoX+DemoX+T1_2014+type@vertical+block@3848270?p1=0&p2=0'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
示例#6
0
    def mapper(self, line):
        """
        Route a tracking-log line to every organization bucket it belongs to.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((date_string, org_id), stripped_line) pairs, with both key parts
            encoded as UTF-8.
        """
        parsed = self.get_event_and_date_string(line)
        if not parsed:
            return
        event, date_string = parsed
        if event is None:
            return

        if not self.is_valid_input_file():
            return

        org_id = self.get_org_id(event)
        if org_id not in self.org_id_whitelist:
            log.debug('Unrecognized organization: org_id=%s', org_id or '')
            return

        # An org_id may be grouped with other org_ids, so one event can be emitted
        # under several primary organizations.
        for key_org_id in self.primary_org_ids_for_org_id[org_id]:
            # When a course list is configured for this org, keep only events of those courses.
            requested_courses = self.courses_for_org_id.get(key_org_id)
            if requested_courses:
                if eventlog.get_course_id(event, from_url=True) not in requested_courses:
                    continue

            # Give every key part the same encoding. Otherwise the same org_id arriving once
            # as str and once as unicode would be rendered as 'FooX' and u'FooX' when luigi
            # coerces the key to a string; python treats them alike but hadoop would bucket
            # them separately.
            encoded_key = tuple(part.encode('utf8') for part in (date_string, key_org_id))
            yield encoded_key, line.strip()
 def test_course_id_from_browser_url(self):
     # This browser event has an empty context, so the course-v1 key has to be
     # recovered from the URL held in 'page'.
     event = {
         'event_source': 'browser',
         'context': {},
         'page': 'http://test.edx.org/courses/course-v1:DemoX+DemoX+T1_2014/courseware/interactive_demonstrations'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
 def test_course_id_from_url_legacy(self):
     # This server event has an empty context, so the legacy slash-separated
     # course ID has to be recovered from the URL held in 'event_type'.
     event = {
         'event_source': 'server',
         'context': {},
         'event_type': '/courses/edX/Open_DemoX/edx_demo_course/info'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'edX/Open_DemoX/edx_demo_course')
 def test_course_id_from_server_url(self):
     # This server event has an empty context, so the course-v1 key has to be
     # recovered from the URL held in 'event_type'.
     event = {
         'event_source': 'server',
         'context': {},
         'event_type': '/courses/course-v1:DemoX+DemoX+T1_2014/about'
     }
     # assertEqual is used because the assertEquals alias is deprecated.
     self.assertEqual(eventlog.get_course_id(event, from_url=True), 'course-v1:DemoX+DemoX+T1_2014')
    def mapper(self, line):
        """
        Route a tracking-log line to every organization bucket it belongs to.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((date_string, org_id), stripped_line) pairs, with both key parts
            encoded as UTF-8.
        """
        event_and_date = self.get_event_and_date_string(line)
        if not event_and_date:
            return
        event, date_string = event_and_date
        if event is None:
            return

        if not self.is_valid_input_file():
            return

        org_id = self.get_org_id(event)
        if org_id not in self.org_id_whitelist:
            log.debug('Unrecognized organization: org_id=%s', org_id or '')
            return

        # The org_id may be one that is grouped with other org_ids; emit once per primary org.
        primary_org_ids = self.primary_org_ids_for_org_id[org_id]
        for key_org_id in primary_org_ids:
            # Restrict to the requested courses, when any are configured for this org.
            requested_courses = self.courses_for_org_id.get(key_org_id)
            if requested_courses:
                if eventlog.get_course_id(event, from_url=True) not in requested_courses:
                    continue

            # Normalize the encoding of each key part: luigi coerces the key to a string, and
            # a str 'FooX' and a unicode u'FooX' would then look different. Python does not
            # care about that difference, but hadoop does and would bucket the values separately.
            key_parts = (date_string, key_org_id)
            yield tuple(part.encode('utf8') for part in key_parts), line.strip()
 def test_course_id_from_invalid_xblock_browser_url(self):
     # The block key in this /xblock/ URL stops after the run (no type@/block@ parts),
     # and the test expects no course ID to be extracted from it.
     page_url = 'https://courses.edx.org/xblock/block-v1:DemoX+DemoX+T1_2014?p1=0&p2=0'
     browser_event = dict(event_source='browser', context={}, page=page_url)
     course_id = eventlog.get_course_id(browser_event, from_url=True)
     self.assertIsNone(course_id)
    def mapper(self, line):
        """
        Classify each explicit link-click event as internal or external.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((course_id, date_string), is_external), where is_external is 1 when the
            target URL names a host different from the current page's host, else 0.
        """
        # Cheap substring test first: skip lines that cannot contain the event of interest.
        if LINK_CLICKED not in line:
            return

        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        event_type = event.get('event_type')
        if not event_type:
            log.error("encountered event with no event_type: %s", event)
            return
        if event_type != LINK_CLICKED:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            log.error("encountered explicit link_clicked event with no event data: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.error("encountered explicit link_clicked event with invalid course_id: %s", event)
            return

        target_url = event_data.get('target_url')
        if not target_url:
            log.error("encountered explicit link_clicked event with no target_url: %s", event)
            return

        current_url = event_data.get('current_url')
        if not current_url:
            log.error("encountered explicit link_clicked event with no current_url: %s", event)
            return

        # A link is "internal" when it stays on the current host. Internal links may omit
        # the host entirely (they start with /), which shows up as an empty netloc.
        current_host = urlparse(current_url).netloc
        target_host = urlparse(target_url).netloc
        is_external = 1 if (target_host != "" and current_host != target_host) else 0

        yield (course_id, date_string), is_external
    def mapper(self, line):
        """
        Classify each explicit link-click event as internal or external.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((course_id, date_string), is_external), where is_external is 1 when the
            target URL names a host different from the current page's host, else 0.
        """
        # Only lines mentioning the event type we care about are worth parsing.
        if LINK_CLICKED not in line:
            return

        event_and_date = self.get_event_and_date_string(line)
        if event_and_date is None:
            return
        event, date_string = event_and_date

        event_type = event.get('event_type')
        if not event_type:
            log.error("encountered event with no event_type: %s", event)
            return
        if event_type != LINK_CLICKED:
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            log.error("encountered explicit link_clicked event with no event data: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.error("encountered explicit link_clicked event with invalid course_id: %s", event)
            return

        target_url = payload.get('target_url')
        if not target_url:
            log.error("encountered explicit link_clicked event with no target_url: %s", event)
            return

        current_url = payload.get('current_url')
        if not current_url:
            log.error("encountered explicit link_clicked event with no current_url: %s", event)
            return

        # A link counts as internal when it does not leave the current host. Host-less
        # targets (starting with /) have an empty netloc and are therefore internal too.
        current_loc = urlparse(current_url).netloc
        target_loc = urlparse(target_url).netloc
        leaves_host = target_loc != "" and current_loc != target_loc

        if leaves_host:
            yield (course_id, date_string), 1
        else:
            yield (course_id, date_string), 0
    def mapper(self, line):
        """
        Key a tracking-log line by (date, course), optionally filtered by self.course_id.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((date_string, course_id), stripped_line), with both key parts encoded as UTF-8.
        """
        parsed = self.get_event_and_date_string(line)
        if not parsed:
            return
        event, date_string = parsed
        if event is None:
            return

        course_id = eventlog.get_course_id(event, from_url=True)
        if course_id is None:
            return

        # An empty/unset self.course_id means "no filter"; otherwise the event's course
        # must be contained in it.
        if self.course_id:
            if course_id not in self.course_id:
                return

        encoded_key = tuple(part.encode('utf8') for part in (date_string, course_id))
        yield encoded_key, line.strip()
    def mapper(self, line):
        """
        Emit the IP address seen for a user on each tracking-log event.

        Events are dropped when they have no user_id in their context, a user_id that
        cannot be converted to an int, no timestamp, or no 'ip' value.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, (timestamp, ip_address, course_id, user_id)), where user_id
            is an int and course_id is whatever eventlog.get_course_id() returned.
        """
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        user_id = event.get('context', {}).get('user_id')
        if not user_id:
            return

        try:
            user_id = int(user_id)
        except (TypeError, ValueError):
            # int() raises ValueError for unparseable strings but TypeError for values
            # such as lists or dicts; both are malformed IDs and are counted and dropped
            # rather than crashing the mapper.
            self.incr_counter('User Location',
                              'Discard event with malformed user_id', 1)
            return

        # Get timestamp instead of date string, so we get the latest ip
        # address for events on the same day.
        timestamp = eventlog.get_event_time_string(event)
        if not timestamp:
            return

        ip_address = event.get('ip')
        if not ip_address:
            log.warning("No ip_address found for user '%s' on '%s'.", user_id,
                        timestamp)
            return

        # Get the course_id from context, if it happens to be present.
        # It's okay if it isn't.

        # (Not sure if there are particular types of course
        # interaction we care about, but we might want to only collect
        # the course_id off of explicit events, and ignore implicit
        # events as not being "real" interactions with course content.
        # Or maybe we add a flag indicating explicit vs. implicit, so
        # that this can be better teased apart.  For example, we could
        # use the latest explicit event for a course, but if there are
        # none, then use the latest implicit event for the course, and
        # if there are none, then use the latest overall event.)
        course_id = eventlog.get_course_id(event)

        # For multi-output, we will generate a single file for each key value.
        # When looking at location for user in a course, we don't want to have
        # an output file per course per date, so just use date as the key,
        # and have a single file representing all events on the date.
        yield date_string, (timestamp, ip_address, course_id, user_id)
    def mapper(self, line):
        """
        Emit a count of 1 per activity label for a (course, user, date).

        Args:
            line: text line from a tracking event log.

        Yields:
            (encoded, 1), where `encoded` is self._encode_tuple() applied to
            (course_id, username, date_string, label).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        # Events without a (non-blank) username cannot be attributed to a user.
        raw_username = event.get('username', '')
        username = raw_username.strip()
        if not username:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        for label in self.get_predicate_labels(event):
            record = (course_id, username, date_string, label)
            yield self._encode_tuple(record), 1
    def mapper(self, line):
        """
        Key a tracking-log line by (date, course), optionally filtered by self.course.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((date_string, course_id), stripped_line), with both key parts encoded as UTF-8.
        """
        event_and_date = self.get_event_and_date_string(line)
        if not event_and_date:
            return
        event, date_string = event_and_date
        if event is None:
            return

        course_id = eventlog.get_course_id(event, from_url=True)
        if course_id is None:
            return

        # An empty/unset self.course means "no filter"; otherwise the event's course
        # must be contained in it.
        if self.course:
            if course_id not in self.course:
                return

        key_parts = (date_string, course_id)
        yield tuple(part.encode('utf8') for part in key_parts), line.strip()
    def mapper(self, line):
        """
        Emit one activity record per predicate label, keyed by date.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, encoded), where `encoded` is self._encode_tuple() applied to
            (course_id, username, date_string, label).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        # Events without a (non-blank) username cannot be attributed to a user.
        raw_username = event.get('username', '')
        username = raw_username.strip()
        if not username:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        for label in self.get_predicate_labels(event):
            record = (course_id, username, date_string, label)
            yield date_string, self._encode_tuple(record)
示例#19
0
    def mapper(self, line):
        """
        Emit one user-activity record per predicate label for a tracking-log line.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, encoded), where `encoded` is self._encode_tuple() applied to
            (user_id, course_id, date_string, label).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        context = event.get('context', {})
        user_id = context.get('user_id')
        if not user_id:
            # Counted and logged so that missing user IDs are visible in job output.
            self.incr_counter('UserActivity', 'Discard Missing User ID', 1)
            log.error("User-Activity: event without user_id in context: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        for label in self.get_predicate_labels(event):
            record = (str(user_id), course_id, date_string, label)
            yield date_string, self._encode_tuple(record)
示例#20
0
    def mapper(self, line):
        """
        Emit one user-activity record per predicate label for a tracking-log line.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, encoded), where `encoded` is self._encode_tuple() applied to
            (user_id, course_id, date_string, label).
        """
        event_and_date = self.get_event_and_date_string(line)
        if event_and_date is None:
            return
        event, date_string = event_and_date

        user_id = event.get('context', {}).get('user_id')
        if not user_id:
            # Counted and logged so that missing user IDs are visible in job output.
            self.incr_counter('UserActivity', 'Discard Missing User ID', 1)
            log.error("User-Activity: event without user_id in context: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        for label in self.get_predicate_labels(event):
            activity = (str(user_id), course_id, date_string, label)
            yield date_string, self._encode_tuple(activity)
示例#21
0
    def mapper(self, line):
        """
        Extract tagging information from server-side problem_check events.

        Args:
            line: text line from a tracking event log.

        Yields:  (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)

        Events are skipped when they are not server 'problem_check' events, or when the
        timestamp, course_id, event data or problem_id is missing.
        """
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _ = value

        if event.get('event_type') != 'problem_check' or event.get(
                'event_source') != 'server':
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        org_id = opaque_key_util.get_org_id_for_course(course_id)

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        problem_id = event_data.get('problem_id')
        if not problem_id:
            return

        # Only the literal 'correct' counts as a correct answer.
        is_correct = event_data.get('success') == 'correct'

        # Every level of this lookup tolerates a missing key; `or {}` also covers a
        # missing or null 'context', which would otherwise raise AttributeError.
        context = event.get('context') or {}
        saved_tags = context.get('asides', {}).get(
            'tagging_aside', {}).get('saved_tags', {})

        yield (course_id, org_id, problem_id), (timestamp, saved_tags,
                                                is_correct)
    def mapper(self, line):
        """
        Emit one engagement record key per user action found in a tracking-log line.

        Args:
            line: text line from a tracking event log.

        Yields:
            (record_fields, 1), where record_fields is the string tuple of a
            ModuleEngagementRecord with its trailing count field removed.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = event.get('username', '').strip()
        if not username:
            return

        event_type = event.get('event_type')
        if event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        entity_id, entity_type, user_actions = self.get_user_actions_from_event(
            event_data, event.get('event_source'), event_type
        )
        if not (entity_id and entity_type):
            return

        for action in user_actions:
            engagement = ModuleEngagementRecord(
                course_id=course_id,
                username=username,
                date=DateField().deserialize_from_string(date_string),
                entity_type=entity_type,
                entity_id=entity_id,
                event=action,
                count=0,
            )
            # The reducer fills in the count, so it must not be part of the key.
            key_fields = engagement.to_string_tuple()[:-1]
            yield key_fields, 1
    def get_event_row_from_document(self, document):
        """
        Build the encoded activity rows for a single document.

        Args:
            document: input passed to self.get_event_and_date_string().

        Returns:
            A list with one self._encode_tuple() result per predicate label, each built
            from (course_id, username, date_string, label); or None when the document
            yields no event, has a blank username, or has no course_id.
        """
        parsed = self.get_event_and_date_string(document)
        if not parsed:
            return None
        event, date_string = parsed

        username = event.get('username', '').strip()
        if not username:
            return None

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return None

        return [
            self._encode_tuple((course_id, username, date_string, label))
            for label in self.get_predicate_labels(event)
        ]
    def mapper(self, line):
        """
        Emit the IP address seen for a username on each tracking-log event.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, (timestamp, ip_address, course_id, username)); course_id is
            whatever eventlog.get_course_id() returned for the event.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = eventlog.get_event_username(event)
        if not username:
            return

        # The full timestamp (not just the date) is carried along so that the latest
        # ip address can be picked among events on the same day.
        timestamp = eventlog.get_event_time_string(event)
        if not timestamp:
            return

        ip_address = event.get('ip')
        if not ip_address:
            log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
            return

        # The course_id is taken from the context if it happens to be present;
        # it's okay if it isn't.
        #
        # (Open question from the original author: we might want to collect the course_id
        # only from explicit events and ignore implicit ones as not being "real"
        # interactions with course content, or add an explicit-vs-implicit flag. We could
        # then use the latest explicit event for a course, falling back to the latest
        # implicit event for the course, and finally to the latest overall event.)
        course_id = eventlog.get_course_id(event)

        # With multi-output a file is generated per key value. Keying by date alone gives
        # a single file for all events on a date instead of one per course per date.
        location_record = (timestamp, ip_address, course_id, username)
        yield date_string, location_record
    def mapper(self, line):
        """
        Extract tagging information from server-side problem_check events.

        Args:
            line: text line from a tracking event log.

        Yields:  (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)

        Events are skipped when they are not server 'problem_check' events, or when the
        timestamp, course_id, event data or problem_id is missing.
        """
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _ = value

        if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server':
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        org_id = opaque_key_util.get_org_id_for_course(course_id)

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        problem_id = event_data.get('problem_id')
        if not problem_id:
            return

        # Only the literal 'correct' counts as a correct answer.
        is_correct = event_data.get('success') == 'correct'

        # Every level of this lookup tolerates a missing key; `or {}` also covers a
        # missing or null 'context', which would otherwise raise AttributeError.
        context = event.get('context') or {}
        saved_tags = context.get('asides', {}).get('tagging_aside', {}).get('saved_tags', {})

        yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
示例#26
0
    def mapper(self, line):
        """
        Route an exportable tracking-log line to every organization bucket it belongs to.

        Args:
            line: text line from a tracking event log.

        Yields:
            ((date_string, org_id), stripped_line) pairs, with both key parts
            encoded as UTF-8.
        """
        parsed = self.get_event_and_date_string(line)
        if not parsed:
            return
        event, date_string = parsed
        if event is None:
            return

        if not self.is_valid_input_file():
            return

        org_id = self.get_org_id(event)
        if org_id not in self.org_id_whitelist:
            log.debug('Unrecognized organization: org_id=%s', org_id or '')
            return

        # Events are exported by default. An event is withheld only when its data carries
        # an '_export' key whose value reads as false; 'false' is the preferred spelling.
        event_data = eventlog.get_event_data(event)
        if event_data:
            export_flag = str(event_data.get('_export', 'true')).lower()
            if export_flag in ('n', 'f', '0', 'false', 'no'):
                return

        # An org_id may be grouped with other org_ids, so one event can be emitted
        # under several primary organizations.
        for key_org_id in self.primary_org_ids_for_org_id[org_id]:
            # When a course list is configured for this org, keep only events of those courses.
            requested_courses = self.courses_for_org_id.get(key_org_id)
            if requested_courses:
                if eventlog.get_course_id(event, from_url=True) not in requested_courses:
                    continue

            # Give every key part the same encoding. Otherwise the same org_id arriving once
            # as str and once as unicode would be rendered as 'FooX' and u'FooX' when luigi
            # coerces the key to a string; python treats them alike but hadoop would bucket
            # them separately.
            encoded_key = tuple(part.encode('utf8') for part in (date_string, key_org_id))
            yield encoded_key, line.strip()
    def obfuscate_event_entry(self, line):
        """
        Obfuscate the 'event' payload of a single tracking-log entry.

        Args:
            line: text line holding one JSON-encoded tracking event.

        Returns:
            - `line` unchanged when it cannot be parsed or no course_id is found;
            - None when the payload contains 'POST' and self.parameters['skip_post'] is set;
            - otherwise the event re-serialized with json.dumps(), with its 'event'
              entry replaced when the obfuscator returned a non-None value.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # Unexpected here...
            log.error(u"Encountered event entry which failed to parse: %r", line)
            return line
        course_id = eventlog.get_course_id(event, from_url=True)
        if course_id is None:
            # Unexpected here...
            log.error(u"Encountered event entry with no course_id: %r", line)
            return line

        # eventlog.get_event_data(event) is deliberately NOT used here: we need to know
        # what was done to the payload so that it can be transformed back to its original
        # form once cleaned.
        event_json_decoded = False
        event_data = event.get('event')

        if event_data is None:
            log.error(u"Encountered event entry with no 'event' payload: %r", line)
        elif event_data == '':
            # Note that this happens with some browser events.  Instead of
            # failing to parse it as a JSON string, just leave as-is.
            pass
        elif isinstance(event_data, basestring):
            # Cjson produces str, while json produces unicode.  Hmm.
            if len(event_data) == 512 and 'POST' in event_data:
                # It's a truncated JSON string.  But we're going to throw it out anyway, so no worries.
                pass
            elif '{' not in event_data and '=' in event_data:
                # It's a key-value pair from a browser event.  Just process as-is, rather than parsing and reassembling.
                pass
            else:
                try:
                    event_data = eventlog.decode_json(event_data)
                    event_json_decoded = True
                except Exception:
                    # Best effort: the undecoded string is still passed on below.
                    log.error(u"Encountered event entry with unparseable 'event' payload: %r", line)

        # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
        # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
        # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
        # to get to strings that can be properly interpreted.
        # NOTE(review): a None payload is still handed to the two helpers below, as before;
        # confirm that they accept None.
        event_user_info = self.get_userinfo_from_event(event, event_data)

        # A missing payload was only logged above, so event_data may still be None here;
        # guard the membership test, which raises TypeError on None.
        if event_data is not None and 'POST' in event_data and self.parameters['skip_post']:
            return None

        updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", event_user_info)

        if updated_event_data is not None:
            event_source = event.get('event_source')
            event_type = event.get('event_type')
            log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

            if event_json_decoded:
                # The payload arrived as a JSON string, so hand it back as one.
                # TODO: should really use cjson, if that were originally used for decoding the json.
                updated_event_data = json.dumps(updated_event_data)

            event['event'] = updated_event_data

        # TODO: should really use cjson, if that were originally used for decoding the json.
        return json.dumps(event)
示例#28
0
def get_problem_check_event(line_or_event):
    """
    Generates output values for explicit problem_check events.

    Args:

        line_or_event: pre-parsed event dict, or text line from a tracking event log

    Returns:

        (course_id, problem_id, username), (timestamp, problem_check_info)

        where timestamp is in ISO format, with resolution to the millisecond
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.

        or None if there is no valid problem_check event on the line
        (wrong event type, unparseable line, missing or invalid course_id,
        missing problem_id, or empty/invalid answers).

    Example:
            (edX/DemoX/Demo_Course, i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username),
            (2013-09-10T00:01:05.123456, blah)

    """
    # Ensure the given event dict is a problem_check event
    if isinstance(line_or_event, dict):
        event = line_or_event
        if event.get('event_type') != 'problem_check':
            return None

    # Parse the line into an event dict, if not provided.
    else:
        event = eventlog.parse_json_server_event(line_or_event, 'problem_check')
        if event is None:
            return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # below), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # Work on the raw payload under its own name, so the full event stays
    # distinguishable from its 'event' field.
    event_data = event.get('event', {})
    answers = event_data.get('answers', {})
    # Truthiness (rather than len()) also discards an explicit null 'answers'
    # value instead of raising a TypeError.
    if not answers:
        return None

    try:
        _check_answer_ids(answers)
        _check_answer_ids(event_data.get('submission', {}))
    except (TypeError, ValueError):
        log.error("encountered explicit problem_check event with invalid answers: %s", event_data)
        return None

    problem_data_json = json.dumps(problem_data)
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value
示例#29
0
    def _generate_answers(self, event_string, attempt_category):
        """
        Generates a list of answers given a problem_check event.

        Args:
            event_string:  a json-encoded string version of an event's data.
            attempt_category: a string that is 'first' for a user's first response to a question, 'last' otherwise

        Returns:
            list of answer data tuples, each of the form
            ((course_id, answer_id), (timestamp, json-encoded submission dict)).
            Hidden answers (per self.is_hidden_answer) are omitted, as are
            answers of submission-less events that have no correct_map entry.

        See docstring for reducer() for more details.
        """
        event = json.loads(event_string)

        # Get context information:
        course_id = eventlog.get_course_id(event)
        timestamp = event.get('timestamp')
        problem_id = event.get('problem_id')
        grade = event.get('grade')
        max_grade = event.get('max_grade')
        # Tolerate a missing or null 'context': the display name is optional
        # and should not abort the whole event with an AttributeError.
        context = event.get('context') or {}
        problem_display_name = context.get('module', {}).get('display_name', None)
        result = []

        def append_submission(answer_id, submission):
            """Convert submission to result to be returned."""
            # First augment submission with problem-level information
            # not found in the submission:
            submission['problem_id'] = problem_id
            submission['problem_display_name'] = problem_display_name
            submission['attempt_category'] = attempt_category
            submission['grade'] = grade
            submission['max_grade'] = max_grade

            # Add the timestamp so that all responses can be sorted in order.
            # We want to use the "latest" values for some fields.
            output_key = (course_id, answer_id)
            output_value = (timestamp, json.dumps(submission))
            result.append((output_key, output_value))

        answers = event.get('answers')
        correct_map = event.get('correct_map', {})
        if 'submission' in event:
            submissions = event.get('submission')
            for answer_id, submission in submissions.items():
                if self.is_hidden_answer(answer_id):
                    continue

                # But submission doesn't contain moniker value for answer.
                # So we check the raw answers, and see if its value is
                # different.  If so, we assume it's a moniker.
                answer_value = answers[answer_id]
                if answer_value != submission.get('answer'):
                    submission['answer_value_id'] = answer_value

                submission['answer_correct_map'] = correct_map.get(answer_id)
                append_submission(answer_id, submission)

        else:
            # Otherwise, it's an older event with no 'submission'
            # information, so parse it as well as possible.
            for answer_id in answers:
                if self.is_hidden_answer(answer_id):
                    continue

                answer_value = answers[answer_id]

                # Argh. It seems that sometimes we're encountering
                # bogus answer_id values.  In particular, one that
                # is including the possible choice values, instead
                # of any actual values selected by the student.
                # For now, let's just dump an error and skip it,
                # so that it becomes the equivalent of a hidden
                # answer.

                # TODO: Eventually treat it explicitly as a hidden
                # answer.
                if answer_id not in correct_map:
                    log.error("Unexpected answer_id %s not in correct_map: %s", answer_id, event)
                    continue
                correctness = correct_map[answer_id].get('correctness') == 'correct'

                variant = event.get('state', {}).get('seed')

                # We do not know the values for 'input_type',
                # 'response_type', or 'question'.  We also don't know if
                # answer_value should be identified as 'answer_value_id' or
                # 'answer', so we choose to use 'answer_value_id' here and
                # never define 'answer'.  This allows disambiguation from
                # events with a submission field, which will always have
                # an 'answer' and only sometimes have an 'answer_value_id'.
                submission = {
                    'answer_value_id': answer_value,
                    'correct': correctness,
                    'variant': variant,
                    'answer_correct_map': correct_map.get(answer_id),
                }
                append_submission(answer_id, submission)

        return result
示例#30
0
 def test_missing_context(self):
     """An event carrying no 'context' entry yields no course_id."""
     event_without_context = {'event_source': 'server'}
     course_id = eventlog.get_course_id(event_without_context)
     self.assertIsNone(course_id)
示例#31
0
    def mapper(self, line):
        """
        Extract a video-interaction record from one tracking-log line.

        Nothing is yielded when the line does not contain
        VIDEO_EVENT_MINIMUM_STRING, cannot be turned into an event, has an
        event_type outside VIDEO_EVENT_TYPES, or lacks a usable user_id,
        course_id, event payload, module id, or required time offset.

        Args:
            line: text of one tracking-log entry.

        Yields:
            ((user_id, course_id, encoded_module_id),
             (timestamp, event_type, current_time, old_time, youtube_id, video_duration))
            where course_id, encoded_module_id and (when present) youtube_id
            are UTF-8 encoded.

        NOTE: apart from 'Discard Missing Event Type', no counters are
        incremented here; the original code carried the other incr_counter()
        calls only as commented-out lines, several of them marked "Slow".
        """
        # Add a filter here to permit quicker rejection of unrelated events.
        if VIDEO_EVENT_MINIMUM_STRING not in line:
            return

        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1)
            return

        if event_type not in VIDEO_EVENT_TYPES:
            return

        # This has already been checked when getting the event, so just fetch the value.
        timestamp = eventlog.get_event_time_string(event)

        user_id = event.get('context', {}).get('user_id')
        if not user_id:
            log.error("Video event without user_id in context: %s", event)
            return
        # Convert user_id to int if str.  A value that cannot be converted is
        # treated like a missing user_id rather than aborting the whole mapper.
        if not isinstance(user_id, int):
            try:
                user_id = int(user_id)
            except (TypeError, ValueError):
                log.error("Video event with non-integer user_id in context: %s", event)
                return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.warning('Video event without valid course_id: %s', line)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            # This should already have been logged.
            return

        # We have seen id values with leading newline, hence the strip().
        # `or ''` also covers an explicit null id, which has no strip().
        encoded_module_id = (event_data.get('id') or '').strip()
        if not encoded_module_id:
            log.warning('Video event without valid encoded_module_id (id): %s', line)
            return

        video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION)
        if not video_duration:
            # events may have a 'duration' value of null, so use the same default for those as well.
            video_duration = VIDEO_UNKNOWN_DURATION

        current_time = None
        old_time = None
        youtube_id = None
        if event_type == VIDEO_PLAYED:
            # Any 'code' that is not one of the known player codes is taken to be a YouTube id.
            code = event_data.get('code')
            if code not in VIDEO_CODES:
                youtube_id = code
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                return
        elif event_type == VIDEO_PAUSED:
            # Pause events may have a missing currentTime value if video is paused at the beginning,
            # so provide a default of zero.
            current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
            if current_time is None:
                return
        elif event_type == VIDEO_SEEK:
            current_time = self._check_time_offset(event_data.get('new_time'), line)
            old_time = self._check_time_offset(event_data.get('old_time'), line)
            if current_time is None or old_time is None:
                return
        elif event_type == VIDEO_STOPPED:
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                return

        if youtube_id is not None:
            youtube_id = youtube_id.encode('utf8')

        yield (
            (user_id, course_id.encode('utf8'), encoded_module_id.encode('utf8')),
            (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
        )
示例#32
0
    def mapper(self, line):
        """
        Turn one tracking-log line into engagement records.

        Yields ((date_grouping_key, course_id, username),
                (entity_id, event_type, json-encoded info, date_string)).

        Lines without a username, event_type, course_id or event payload are
        dropped.  For forum "voted" events a second record is yielded for the
        user named in 'target_username', with an event_type of
        'edx.forum.<post_type>.vote_received'.

        The grouping key is the event date itself, except that for a 'weekly'
        interval it is the first date on or after the event date that shares a
        weekday with the last complete day of the interval, and for an 'all'
        interval it is that last complete day.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = event.get('username', '').strip()
        event_type = event.get('event_type')
        if not username or event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        entity_id = ''
        info = {}
        vote_match = None

        if event_type == 'problem_check':
            # Only server-side problem checks are counted.
            if event.get('event_source') != 'server':
                return

            entity_id = event_data.get('problem_id')
            if not entity_id:
                return

            outcome = event_data.get('success', 'incorrect').lower()
            if outcome == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            entity_id = event_data.get('id')
            if not entity_id:
                return
        elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
            viewed_at = eventlog.get_event_time_string(event)
            if viewed_at is None:
                return
            info['path'] = event_type
            info['timestamp'] = viewed_at
            event_type = SUBSECTION_VIEWED_MARKER
        elif event_type.startswith('edx.forum'):
            vote_match = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
            if vote_match:
                vote_value = event_data.get('vote_value')
                if vote_value not in ['up', 'down']:
                    return
                info['vote_value'] = vote_value
                info['undo_vote'] = event_data.get('undo_vote', False)

        one_day = datetime.timedelta(days=1)
        date_grouping_key = date_string

        if self.interval_type == 'weekly':
            last_complete_date = self.interval.date_b - one_day  # pylint: disable=no-member

            pieces = date_string.split('-')
            event_date = datetime.date(int(pieces[0]), int(pieces[1]), int(pieces[2]))

            # Days from the event forward to the next occurrence of the
            # interval's closing weekday (0 when the event falls on it).
            days_until_end = (last_complete_date.isoweekday() - event_date.isoweekday()) % 7
            date_grouping_key = (event_date + datetime.timedelta(days=days_until_end)).isoformat()
        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            date_grouping_key = (self.interval.date_b - one_day).isoformat()  # pylint: disable=no-member

        yield (
            (date_grouping_key, course_id, username),
            (entity_id, event_type, json.dumps(info), date_string),
        )

        if vote_match:
            # We emit two events for each "voted" event - one for the voting user and one for the
            # user receiving the vote.
            recipient = event_data.get('target_username')
            if recipient:
                received_type = 'edx.forum.{}.vote_received'.format(vote_match.group('post_type'))
                yield (
                    (date_grouping_key, course_id, recipient),
                    (entity_id, received_type, json.dumps(info), date_string),
                )
示例#33
0
    def _generate_answers(self, event_string, attempt_category):
        """
        Generates a list of answers given a problem_check event.

        Args:
            event_string:  a json-encoded string version of an event's data.
            attempt_category: a string that is 'first' for a user's first response to a question, 'last' otherwise

        Returns:
            list of answer data tuples, each of the form
            ((course_id, answer_id), (timestamp, json-encoded submission dict)).
            Hidden answers (per self.is_hidden_answer) are omitted, as are
            answers of submission-less events that have no correct_map entry.

        See docstring for reducer() for more details.
        """
        event = json.loads(event_string)

        # Get context information:
        course_id = eventlog.get_course_id(event)
        timestamp = event.get('timestamp')
        problem_id = event.get('problem_id')
        grade = event.get('grade')
        max_grade = event.get('max_grade')
        # Tolerate a missing or null 'context': the display name is optional
        # and should not abort the whole event with an AttributeError.
        context = event.get('context') or {}
        problem_display_name = context.get('module', {}).get('display_name', None)
        result = []

        def append_submission(answer_id, submission):
            """Convert submission to result to be returned."""
            # First augment submission with problem-level information
            # not found in the submission:
            submission['problem_id'] = problem_id
            submission['problem_display_name'] = problem_display_name
            submission['attempt_category'] = attempt_category
            submission['grade'] = grade
            submission['max_grade'] = max_grade

            # Add the timestamp so that all responses can be sorted in order.
            # We want to use the "latest" values for some fields.
            output_key = (course_id, answer_id)
            output_value = (timestamp, json.dumps(submission))
            result.append((output_key, output_value))

        answers = event.get('answers')
        correct_map = event.get('correct_map', {})
        if 'submission' in event:
            submissions = event.get('submission')
            for answer_id, submission in submissions.items():
                if self.is_hidden_answer(answer_id):
                    continue

                # But submission doesn't contain moniker value for answer.
                # So we check the raw answers, and see if its value is
                # different.  If so, we assume it's a moniker.
                answer_value = answers[answer_id]
                if answer_value != submission.get('answer'):
                    submission['answer_value_id'] = answer_value

                submission['answer_correct_map'] = correct_map.get(answer_id)
                append_submission(answer_id, submission)

        else:
            # Otherwise, it's an older event with no 'submission'
            # information, so parse it as well as possible.
            for answer_id in answers:
                if self.is_hidden_answer(answer_id):
                    continue

                answer_value = answers[answer_id]

                # Argh. It seems that sometimes we're encountering
                # bogus answer_id values.  In particular, one that
                # is including the possible choice values, instead
                # of any actual values selected by the student.
                # For now, let's just dump an error and skip it,
                # so that it becomes the equivalent of a hidden
                # answer.

                # TODO: Eventually treat it explicitly as a hidden
                # answer.
                if answer_id not in correct_map:
                    log.error("Unexpected answer_id %s not in correct_map: %s", answer_id, event)
                    continue
                correctness = correct_map[answer_id].get('correctness') == 'correct'

                variant = event.get('state', {}).get('seed')

                # We do not know the values for 'input_type',
                # 'response_type', or 'question'.  We also don't know if
                # answer_value should be identified as 'answer_value_id' or
                # 'answer', so we choose to use 'answer_value_id' here and
                # never define 'answer'.  This allows disambiguation from
                # events with a submission field, which will always have
                # an 'answer' and only sometimes have an 'answer_value_id'.
                submission = {
                    'answer_value_id': answer_value,
                    'correct': correctness,
                    'variant': variant,
                    'answer_correct_map': correct_map.get(answer_id),
                }
                append_submission(answer_id, submission)

        return result
    def mapper(self, line):
        """
        Turn one tracking-log line into a single engagement record.

        Yields ((date_grouping_key, course_id, username),
                (entity_id, event_type, json-encoded info, date_string)).

        Lines without a username, event_type, course_id or event payload are
        dropped, as are non-server problem_check events and problem/video
        events without an entity id.

        The grouping key is the event date itself, except that for a 'weekly'
        interval it is the first date on or after the event date that shares a
        weekday with the last complete day of the interval, and for an 'all'
        interval it is that last complete day.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = event.get('username', '').strip()
        if not username:
            return

        event_type = event.get('event_type')
        if event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        entity_id = ''
        info = {}
        if event_type == 'problem_check':
            # Only server-side problem checks are counted.
            if event.get('event_source') != 'server':
                return

            entity_id = event_data.get('problem_id')
            if not entity_id:
                return

            answered_correctly = event_data.get('success', 'incorrect').lower() == 'correct'
            if answered_correctly:
                info['correct'] = True
        elif event_type == 'play_video':
            entity_id = event_data.get('id')
            if not entity_id:
                return
        elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
            viewed_at = eventlog.get_event_time_string(event)
            if viewed_at is None:
                return
            info['path'] = event_type
            info['timestamp'] = viewed_at
            event_type = SUBSECTION_VIEWED_MARKER

        interval_type = self.interval_type
        if interval_type == 'weekly':
            closing_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member

            date_parts = date_string.split('-')
            event_date = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))

            # Days from the event forward to the next occurrence of the
            # interval's closing weekday (0 when the event falls on it).
            days_until_end = (closing_date.isoweekday() - event_date.isoweekday()) % 7
            week_end = event_date + datetime.timedelta(days=days_until_end)
            date_grouping_key = week_end.isoformat()
        elif interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            closing_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            date_grouping_key = closing_date.isoformat()
        else:
            date_grouping_key = date_string

        record_key = (date_grouping_key, course_id, username)
        record_value = (entity_id, event_type, json.dumps(info), date_string)
        yield (record_key, record_value)
示例#35
0
    def obfuscate_event_entry(self, line):
        """
        Obfuscate the 'event' payload of a single tracking-log line.

        Args:
            line: text of one tracking-log entry.

        Returns:
            * the original `line`, unchanged, when it fails to parse or no
              course_id can be determined for it;
            * None when the payload contains 'POST' and the 'skip_post'
              parameter is set;
            * otherwise the JSON serialization of the event.  Its 'event'
              field is replaced by the obfuscated payload whenever
              obfuscate_structure() returns something other than None, and is
              re-encoded as a JSON string if it was decoded from one here.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # Unexpected here...
            log.error(u"Encountered event entry which failed to parse: %r", line)
            return line
        course_id = eventlog.get_course_id(event, from_url=True)
        if course_id is None:
            # Unexpected here...
            log.error(u"Encountered event entry with no course_id: %r", line)
            return line

        # We cannot use this method as-is, since we need to know what was done to the event, so
        # that it can be transformed back to its original form once cleaned.
        # NOT event_data = eventlog.get_event_data(event)
        event_json_decoded = False
        event_data = event.get('event')

        if event_data is None:
            log.error(u"Encountered event entry with no 'event' payload: %r", line)
        elif event_data == '':
            # Note that this happens with some browser events.  Instead of
            # failing to parse it as a JSON string, just leave as-is.
            pass
        elif isinstance(event_data, basestring):
            # Cjson produces str, while json produces unicode.  Hmm.
            # A truncated JSON string is going to be thrown out anyway, so it is not parsed.
            is_truncated_post = len(event_data) == 512 and 'POST' in event_data
            # A key-value pair from a browser event is processed as-is, rather than
            # being parsed and reassembled.
            is_key_value_pair = '{' not in event_data and '=' in event_data
            if not (is_truncated_post or is_key_value_pair):
                try:
                    event_data = eventlog.decode_json(event_data)
                    event_json_decoded = True
                except Exception:
                    log.error(u"Encountered event entry with unparseable 'event' payload: %r", line)

        # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
        # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
        # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
        # to get to strings that can be properly interpreted.
        event_user_info = self.get_userinfo_from_event(event, event_data)

        # The payload may be None (logged above) or may have decoded to a JSON
        # scalar; a membership test on those raises TypeError, so treat them as
        # simply not containing 'POST'.
        try:
            contains_post = 'POST' in event_data
        except TypeError:
            contains_post = False

        if contains_post and self.parameters['skip_post']:
            return None

        updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", event_user_info)

        if updated_event_data is not None:
            event_source = event.get('event_source')
            event_type = event.get('event_type')
            log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

            if event_json_decoded:
                # TODO: should really use cjson, if that were originally used for decoding the json.
                updated_event_data = json.dumps(updated_event_data)

            event['event'] = updated_event_data

        # TODO: should really use cjson, if that were originally used for decoding the json.
        return json.dumps(event)
示例#36
0
    def mapper(self, line):
        """
        Extract a video-interaction record from one tracking-log line.

        Nothing is yielded when the line does not contain
        VIDEO_EVENT_MINIMUM_STRING, cannot be turned into an event, has an
        event_type outside VIDEO_EVENT_TYPES, or lacks a username, course_id,
        event payload, module id, or required time offset.

        Yields:
            ((username, course_id, encoded_module_id),
             (timestamp, event_type, current_time, old_time, youtube_id))
            with the key fields and (when present) youtube_id UTF-8 encoded.
        """
        # Cheap substring test first, to permit quicker rejection of unrelated events.
        if VIDEO_EVENT_MINIMUM_STRING not in line:
            return

        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, _unused_date_string = parsed

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return
        if event_type not in VIDEO_EVENT_TYPES:
            return

        # This has already been checked when getting the event, so just fetch the value.
        timestamp = eventlog.get_event_time_string(event)

        # Strip username to remove trailing newlines that mess up Luigi.
        username = event.get('username', '').strip()
        if not username:
            log.error("Video event without username: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.warn('Video event without valid course_id: {0}'.format(line))
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            # This should already have been logged.
            return

        encoded_module_id = event_data.get('id')
        if encoded_module_id is None:
            log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
            return

        youtube_id = None
        old_time = None
        current_time = None
        if event_type == VIDEO_SEEK:
            # Both offsets are checked before either result is inspected.
            current_time = self._check_time_offset(event_data.get('new_time'), line)
            old_time = self._check_time_offset(event_data.get('old_time'), line)
            if current_time is None or old_time is None:
                return
        elif event_type in (VIDEO_PLAYED, VIDEO_PAUSED, VIDEO_STOPPED):
            if event_type == VIDEO_PLAYED:
                # Any 'code' other than these player names is kept as the YouTube id.
                code = event_data.get('code')
                if code not in ('html5', 'mobile'):
                    youtube_id = code

            if event_type == VIDEO_PAUSED:
                # Pause events may have a missing currentTime value if video is paused at the
                # beginning, so provide a default of zero.
                raw_offset = event_data.get('currentTime', 0)
            else:
                raw_offset = event_data.get('currentTime')

            current_time = self._check_time_offset(raw_offset, line)
            if current_time is None:
                return

        if youtube_id is not None:
            youtube_id = youtube_id.encode('utf8')

        output_key = (username.encode('utf8'), course_id.encode('utf8'), encoded_module_id.encode('utf8'))
        output_value = (timestamp, event_type, current_time, old_time, youtube_id)
        yield (output_key, output_value)
示例#37
0
    def mapper(self, line):
        """
        Turn one tracking log line into at most one engagement record.

        Nothing is yielded when the line cannot be parsed into an event, or when the
        event lacks a username, an event type, a course ID or event data.  Otherwise a
        single pair is yielded:

            (date_grouping_key, course_id, username),
            (entity_id, event_type, json.dumps(info), date_string)

        date_grouping_key is the event's own date unless self.interval_type is 'weekly'
        (end of the week containing the event) or 'all' (last complete day of the interval).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = event.get('username', '').strip()
        event_type = event.get('event_type')
        if not username or event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            return

        entity_id = ''
        info = {}
        if event_type == 'problem_check':
            # problem_check events that did not come from the server are ignored.
            if event.get('event_source') != 'server':
                return

            entity_id = payload.get('problem_id')
            if not entity_id:
                return

            if payload.get('success', 'incorrect').lower() == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            entity_id = payload.get('id')
            if not entity_id:
                return
        elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
            viewed_at = eventlog.get_event_time_string(event)
            if viewed_at is None:
                return
            # The URL itself is kept as detail; the emitted event type becomes the marker.
            info = {'path': event_type, 'timestamp': viewed_at}
            event_type = SUBSECTION_VIEWED_MARKER

        if self.interval_type == 'weekly':
            final_day = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            target_weekday = final_day.isoweekday()

            date_parts = date_string.split('-')
            event_day = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))

            # Number of days (0-6) from the event forward to the weekday on which the interval ends.
            days_ahead = (target_weekday - event_day.isoweekday()) % 7
            date_grouping_key = (event_day + datetime.timedelta(days=days_ahead)).isoformat()
        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            final_day = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            date_grouping_key = final_day.isoformat()
        else:
            date_grouping_key = date_string

        record_key = (date_grouping_key, course_id, username)
        record_value = (entity_id, event_type, json.dumps(info), date_string)
        yield (record_key, record_value)
    def mapper(self, line):
        """
        Extract a video interaction record from a raw tracking log line.

        Nothing is yielded when the line does not contain VIDEO_EVENT_MINIMUM_STRING,
        cannot be parsed into an event, has an event type outside VIDEO_EVENT_TYPES, or
        lacks a username, course_id, event data, module id or a usable time offset.

        Yields:
            (username, course_id, encoded_module_id),
            (timestamp, event_type, current_time, old_time, youtube_id, video_duration)

            The three key fields, and youtube_id when present, are UTF-8 encoded.
            old_time is only set for seek events; youtube_id only for play events whose
            'code' is not one of VIDEO_CODES.
        """
        # Add a filter here to permit quicker rejection of unrelated events.
        if VIDEO_EVENT_MINIMUM_STRING not in line:
            return

        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value

        # Only the missing-event-type discard increments a counter; every other
        # discard path below simply returns (some of them after logging).
        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            self.incr_counter(self.counter_category_name,
                              'Discard Missing Event Type', 1)
            return

        if event_type not in VIDEO_EVENT_TYPES:
            return

        # This has already been checked when getting the event, so just fetch the value.
        timestamp = eventlog.get_event_time_string(event)

        # Strip username to remove trailing newlines that mess up Luigi.
        # A null username is treated the same as a missing one instead of raising.
        username = (event.get('username') or '').strip()
        if not username:
            log.error("Video event without username: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.warning('Video event without valid course_id: %s', line)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            # This should already have been logged.
            return

        # We have seen id values with a leading newline, hence the strip.  A null id
        # is treated as missing rather than raising AttributeError on None.
        encoded_module_id = (event_data.get('id') or '').strip()
        if not encoded_module_id:
            log.warning('Video event without valid encoded_module_id (id): %s', line)
            return

        # Events may have a 'duration' value of null (or no 'duration' at all); both
        # fall back to the same default.
        video_duration = event_data.get('duration') or VIDEO_UNKNOWN_DURATION

        current_time = None
        old_time = None
        youtube_id = None
        if event_type == VIDEO_PLAYED:
            code = event_data.get('code')
            if code not in VIDEO_CODES:
                youtube_id = code
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                return
        elif event_type == VIDEO_PAUSED:
            # Pause events may have a missing currentTime value if video is paused at the beginning,
            # so provide a default of zero.
            current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
            if current_time is None:
                return
        elif event_type == VIDEO_SEEK:
            # Both offsets are checked before either result is inspected.
            current_time = self._check_time_offset(event_data.get('new_time'), line)
            old_time = self._check_time_offset(event_data.get('old_time'), line)
            if current_time is None or old_time is None:
                return
        elif event_type == VIDEO_STOPPED:
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                return

        if youtube_id is not None:
            youtube_id = youtube_id.encode('utf8')

        yield ((username.encode('utf8'), course_id.encode('utf8'),
                encoded_module_id.encode('utf8')),
               (timestamp, event_type, current_time, old_time, youtube_id,
                video_duration))
 def test_missing_context(self):
     """An event that has no 'context' entry yields no course ID."""
     self.assertIsNone(eventlog.get_course_id({'event_source': 'server'}))
    def mapper(self, line):
        """
        Turn one tracking log line into engagement records.

        Nothing is yielded when the line cannot be parsed into an event, or when the
        event lacks a username, an event type, a course ID or event data.  Otherwise
        one pair is yielded for the acting user:

            (date_grouping_key, course_id, username),
            (entity_id, event_type, json.dumps(info), date_string)

        For forum "voted" events a second pair is yielded for the event's
        'target_username' (when present) with a "vote_received" event type.

        date_grouping_key is the event's own date unless self.interval_type is 'weekly'
        (end of the week containing the event) or 'all' (last complete day of the interval).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = event.get('username', '').strip()
        event_type = event.get('event_type')
        if not username or event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            return

        entity_id = ''
        info = {}
        vote_match = None
        if event_type == 'problem_check':
            # problem_check events that did not come from the server are ignored.
            if event.get('event_source') != 'server':
                return

            entity_id = payload.get('problem_id')
            if not entity_id:
                return

            if payload.get('success', 'incorrect').lower() == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            entity_id = payload.get('id')
            if not entity_id:
                return
        elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
            viewed_at = eventlog.get_event_time_string(event)
            if viewed_at is None:
                return
            # The URL itself is kept as detail; the emitted event type becomes the marker.
            info = {'path': event_type, 'timestamp': viewed_at}
            event_type = SUBSECTION_VIEWED_MARKER
        elif event_type.startswith('edx.forum'):
            vote_match = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
            if vote_match:
                vote_value = payload.get('vote_value')
                # Votes that are neither 'up' nor 'down' discard the whole event.
                if vote_value not in ('up', 'down'):
                    return
                info['vote_value'] = vote_value
                info['undo_vote'] = payload.get('undo_vote', False)

        if self.interval_type == 'weekly':
            final_day = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            target_weekday = final_day.isoweekday()

            date_parts = date_string.split('-')
            event_day = datetime.date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))

            # Number of days (0-6) from the event forward to the weekday on which the interval ends.
            days_ahead = (target_weekday - event_day.isoweekday()) % 7
            date_grouping_key = (event_day + datetime.timedelta(days=days_ahead)).isoformat()
        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            final_day = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            date_grouping_key = final_day.isoformat()
        else:
            date_grouping_key = date_string

        # info is not modified after this point, so one serialization serves both records.
        info_json = json.dumps(info)
        yield ((date_grouping_key, course_id, username), (entity_id, event_type, info_json, date_string))

        if vote_match:
            # We emit two events for each "voted" event - one for the voting user and one for the
            # user receiving the vote.
            recipient = payload.get('target_username')
            if recipient:
                received_type = 'edx.forum.{}.vote_received'.format(vote_match.group('post_type'))
                yield ((date_grouping_key, course_id, recipient), (entity_id, received_type, info_json, date_string))
示例#41
0
def get_problem_check_event(line_or_event):
    """
    Generates output values for explicit problem_check events.

    Args:

        line_or_event: pre-parsed event dict, or text line from a tracking event log

    Returns:

        (course_id, problem_id, username), (timestamp, problem_check_info)

        where timestamp is in ISO format, with resolution to the millisecond
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.

        or None if there is no valid problem_check event on the line.  That
        includes events without a course_id that passes validation, without a
        problem_id, with empty or missing answers, or whose answers/submission
        fail the answer-id check.

    Example:
            (edX/DemoX/Demo_Course, i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username), (2013-09-10T00:01:05.123456, blah)

    """
    # Ensure the given event dict is a problem_check event
    if isinstance(line_or_event, dict):
        event = line_or_event
        if event.get('event_type') != 'problem_check':
            return None

    # Parse the line into an event dict, if not provided.
    else:
        event = eventlog.parse_json_server_event(line_or_event, 'problem_check')
        if event is None:
            return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # above), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # Work on the raw 'event' payload under its own name rather than rebinding `event`.
    event_payload = event.get('event', {})
    answers = event_payload.get('answers', {})
    # Truthiness (not len()) so that a null 'answers' value is skipped like an
    # empty one instead of raising TypeError outside the try block below.
    if not answers:
        return None

    try:
        _check_answer_ids(answers)
        _check_answer_ids(event_payload.get('submission', {}))
    except (TypeError, ValueError):
        # Only the payload (not the whole event) is logged here.
        log.error("encountered explicit problem_check event with invalid answers: %s", event_payload)
        return None

    problem_data_json = json.dumps(problem_data)
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value