def test_make_tasks_with_events_interleaved_between_users(self):
    '''
    Two users whose 'get task' / 'post task' events alternate in time
    should each end up with their own task period.
    '''
    base_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)

    # Each entry is (user ID, seconds after the base time, event type).
    interleaved_events = (
        (0, 0, 'get task'),
        (1, 1, 'get task'),
        (0, 2, 'post task'),
        (1, 3, 'post task'),
    )
    for user_id, offset_seconds, event_type in interleaved_events:
        create_question_event(
            user_id=user_id,
            time=base_time + datetime.timedelta(seconds=offset_seconds),
            event_type=event_type,
        )

    compute_task_periods(extra_periods=())

    stored_periods = TaskPeriod.select()
    self.assertEqual(stored_periods.count(), 2)

    # Both users must be represented among the stored periods.
    seen_user_ids = [period.user_id for period in TaskPeriod.select()]
    self.assertIn(0, seen_user_ids)
    self.assertIn(1, seen_user_ids)
def test_add_periods_from_extras_even_if_it_matches_discard_pattern(self):
    '''
    A period listed in `extra_periods` is still stored when the same
    user / task pair also appears in `discard_periods`.
    '''
    period_start = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    period_end = period_start + datetime.timedelta(seconds=1)

    # Events that would produce a computed period for user 3, task 4.
    create_question_event(
        user_id=3, question_index=4, time=period_start, event_type='get task')
    create_question_event(
        user_id=3, question_index=4, time=period_end, event_type='post task')

    discard_spec = {'user_id': 3, 'task_index': 4}
    extra_spec = {
        'user_id': 3,
        'task_index': 4,
        'start': datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
        'end': datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
    }
    compute_task_periods(
        discard_periods=(discard_spec,),
        extra_periods=(extra_spec,),
    )

    # Exactly one period is expected to be stored.
    self.assertEqual(TaskPeriod.select().count(), 1)
def test_add_periods_from_extras_even_if_it_matches_discard_pattern(self):
    '''
    Extra periods are appended regardless of the discard list: with a
    discard entry and an extra entry for the same user and task, exactly
    one period is expected.
    '''
    first_event_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    one_second = datetime.timedelta(seconds=1)

    # A 'get task' / 'post task' pair for user 3 on task 4.
    for event_time, event_type in (
        (first_event_time, 'get task'),
        (first_event_time + one_second, 'post task'),
    ):
        create_question_event(
            user_id=3,
            question_index=4,
            time=event_time,
            event_type=event_type,
        )

    compute_task_periods(
        discard_periods=({'user_id': 3, 'task_index': 4},),
        extra_periods=(
            {
                'user_id': 3,
                'task_index': 4,
                'start': datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
                'end': datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
            },
        ),
    )

    stored_periods = TaskPeriod.select()
    self.assertEqual(stored_periods.count(), 1)
def create_location_rating(compute_index, task_compute_index, event, rating, labels):
    '''
    Save a rating for `event` alongside the task period it belongs to.

    The task period is looked up among the periods with compute index
    `task_compute_index`.  If `labels` contains an entry whose 'event_id'
    equals `event.id`, that entry's 'task_index' picks the period for the
    event's user.  Otherwise the period is one for the event's user that
    strictly contains `event.log_date`.

    Returns True if this rating could be matched to an existing task,
    False otherwise.
    '''
    # Check for hand-written task index labels for this event.  A list is built
    # (rather than calling `filter`) so that the emptiness check and indexing
    # below also work on Python 3, where `filter` returns an iterator.
    matching_labels = [label for label in labels if label['event_id'] == event.id]

    if matching_labels:
        task_index = matching_labels[0]['task_index']
        task_periods = (
            TaskPeriod.select()
            .where(
                TaskPeriod.compute_index == task_compute_index,
                TaskPeriod.task_index == task_index,
                TaskPeriod.user_id == event.user_id,
            )
        )
        hand_aligned = True
    # If a hand-written label wasn't found, search for a task that this rating could have
    # occurred within.  If we successfully find a task, then save the rating event.
    else:
        task_periods = (
            TaskPeriod.select()
            .where(
                TaskPeriod.compute_index == task_compute_index,
                TaskPeriod.user_id == event.user_id,
                TaskPeriod.start < event.log_date,
                TaskPeriod.end > event.log_date,
            )
        )
        hand_aligned = False

    # Count once: saving a rating below does not add or remove task periods.
    task_found = task_periods.count() > 0

    # If a matching task has been found, then save the rating alongside that task.
    if task_found:
        task_period = task_periods[0]
        LocationRating.create(
            compute_index=compute_index,
            user_id=event.user_id,
            task_index=task_period.task_index,
            concern_index=task_period.concern_index,
            url=event.url,
            rating=rating,
            title=event.title,
            visit_date=event.visit_date,
            hand_aligned=hand_aligned,
        )

    return task_found
def create_location_rating(compute_index, task_compute_index, event, rating, labels):
    '''
    Store a `LocationRating` for `event` if a task period can be found for it.

    Candidate task periods are restricted to compute index `task_compute_index`
    and to the event's user.  A hand-written label (an entry of `labels` whose
    'event_id' equals `event.id`) selects the period by its 'task_index';
    without one, the period must start before and end after `event.log_date`.

    Returns True if this rating could be matched to an existing task,
    False otherwise.
    '''
    # Check for hand-written task index labels for this event.  Materialize the
    # matches as a list: on Python 3 `filter` yields an iterator, which supports
    # neither `len()` nor indexing.
    matching_labels = [label for label in labels if label['event_id'] == event.id]
    hand_aligned = bool(matching_labels)

    if hand_aligned:
        task_index = matching_labels[0]['task_index']
        task_periods = TaskPeriod.select().where(
            TaskPeriod.compute_index == task_compute_index,
            TaskPeriod.task_index == task_index,
            TaskPeriod.user_id == event.user_id,
        )
    else:
        # If a hand-written label wasn't found, search for a task that this rating
        # could have occurred within.
        task_periods = TaskPeriod.select().where(
            TaskPeriod.compute_index == task_compute_index,
            TaskPeriod.user_id == event.user_id,
            TaskPeriod.start < event.log_date,
            TaskPeriod.end > event.log_date,
        )

    # Nothing to save when no task period matches this event.
    if task_periods.count() == 0:
        return False

    # A matching task has been found: save the rating alongside that task.
    task_period = task_periods[0]
    LocationRating.create(
        compute_index=compute_index,
        user_id=event.user_id,
        task_index=task_period.task_index,
        concern_index=task_period.concern_index,
        url=event.url,
        rating=rating,
        title=event.title,
        visit_date=event.visit_date,
        hand_aligned=hand_aligned,
    )
    return True
def test_skip_task_period_with_unmatching_task_indexes(self):
    '''
    A 'post task' event whose question index differs from that of the
    preceding 'get task' event should not produce a task period.
    '''
    get_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    post_time = get_time + datetime.timedelta(seconds=1)

    # The task is started for question 0 but posted for question 1.
    create_question_event(question_index=0, time=get_time, event_type='get task')
    create_question_event(question_index=1, time=post_time, event_type='post task')

    compute_task_periods(extra_periods=())

    self.assertEqual(TaskPeriod.select().count(), 0)
def test_make_task_only_if_both_events_from_one_user(self):
    '''
    A 'get task' from one user followed by a 'post task' from another
    user should not be paired into a task period.
    '''
    get_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    post_time = get_time + datetime.timedelta(seconds=1)

    # User 0 starts a task; user 1 is the one who posts.
    create_question_event(user_id=0, time=get_time, event_type='get task')
    create_question_event(user_id=1, time=post_time, event_type='post task')

    compute_task_periods(extra_periods=())

    self.assertEqual(TaskPeriod.select().count(), 0)
def test_make_task_only_if_both_events_from_one_user(self):
    '''
    No task period is expected when the start and end events of a task
    come from two different users.
    '''
    base_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)

    # Each entry is (user ID, seconds after the base time, event type).
    for user_id, offset_seconds, event_type in (
        (0, 0, 'get task'),
        (1, 1, 'post task'),
    ):
        create_question_event(
            user_id=user_id,
            time=base_time + datetime.timedelta(seconds=offset_seconds),
            event_type=event_type,
        )

    compute_task_periods(extra_periods=())

    stored_periods = TaskPeriod.select()
    self.assertEqual(stored_periods.count(), 0)
def test_skip_task_period_with_unmatching_task_indexes(self):
    '''
    No task period is expected when the 'get task' and 'post task'
    events refer to different question indexes.
    '''
    base_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)

    # Each entry is (question index, seconds after the base time, event type).
    for question_index, offset_seconds, event_type in (
        (0, 0, 'get task'),
        (1, 1, 'post task'),
    ):
        create_question_event(
            question_index=question_index,
            time=base_time + datetime.timedelta(seconds=offset_seconds),
            event_type=event_type,
        )

    compute_task_periods(extra_periods=())

    stored_periods = TaskPeriod.select()
    self.assertEqual(stored_periods.count(), 0)
def test_make_tasks_with_events_interleaved_between_users(self):
    '''
    When two users' task events overlap in time, one task period should
    be computed for each user.
    '''
    first_event_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)

    def seconds_later(count):
        # Time of an event `count` seconds after the first one.
        return first_event_time + datetime.timedelta(seconds=count)

    # User 1 starts a task while user 0's task is still open.
    create_question_event(user_id=0, time=first_event_time, event_type='get task')
    create_question_event(user_id=1, time=seconds_later(1), event_type='get task')
    create_question_event(user_id=0, time=seconds_later(2), event_type='post task')
    create_question_event(user_id=1, time=seconds_later(3), event_type='post task')

    compute_task_periods(extra_periods=())

    all_periods = TaskPeriod.select()
    self.assertEqual(all_periods.count(), 2)

    owners = [period.user_id for period in TaskPeriod.select()]
    self.assertIn(0, owners)
    self.assertIn(1, owners)
def test_add_periods_with_extras_specification(self):
    '''
    With no question events at all, a single entry in `extra_periods`
    should be stored as a task period carrying the specified fields.
    '''
    extra_period = {
        'user_id': 3,
        'task_index': 4,
        'start': datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
        'end': datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
    }
    compute_task_periods(extra_periods=(extra_period,))

    stored_periods = TaskPeriod.select()
    self.assertEqual(stored_periods.count(), 1)

    # The stored record should mirror the specification field by field.
    stored = stored_periods[0]
    self.assertEqual(stored.user_id, 3)
    self.assertEqual(stored.task_index, 4)
    self.assertEqual(stored.start, datetime.datetime(2000, 1, 1, 12, 0, 1, 0))
    self.assertEqual(stored.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
def compute_location_ratings(labels=HAND_LABELED_EVENTS, task_compute_index=None):
    '''
    Create a `LocationRating` for every rating event that can be matched to a
    task period, under a fresh compute index.

    `labels` is passed through to `create_location_rating` as the hand-written
    event labels.  `task_compute_index` selects which computation of task
    periods the ratings are matched against; when None, the highest existing
    `TaskPeriod.compute_index` is used.

    Returns a list of dictionaries, each with the 'user_id' and 'event_id' of
    a rating event for which no rating could be created.
    '''
    # Create a new index for this computation
    last_compute_index = LocationRating.select(fn.Max(LocationRating.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine what will be the compute index of the task periods that ratings are matched to.
    # This will become the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar()

    # The digits are captured as one group.  With the repetition outside the
    # group, as in `(\d)+`, only the last digit would be captured, so a
    # multi-digit rating would be misread.
    rating_pattern = re.compile(r"^Rating: (\d+)$")

    # Create a list to hold all ratings that couldn't be matched to a task period.
    # At the end, we want to return these, in case it's important for the caller to know
    # which events we couldn't create rating records for.
    unmatched_ratings = []

    for event in LocationEvent.select():

        # Check to see whether this is a rating event
        rating_match = rating_pattern.match(event.event_type)
        if rating_match:

            # If this is a rating event, extract the rating
            rating = int(rating_match.group(1))
            rating_created = create_location_rating(
                compute_index=compute_index,
                task_compute_index=task_compute_index,
                event=event,
                rating=rating,
                labels=labels,
            )

            # If a rating wasn't created, this probably couldn't be matched to a task.
            # Save a record of which event failed to be matched to a task and which user
            # this event happened for.
            if not rating_created:
                unmatched_ratings.append({
                    'user_id': event.user_id,
                    'event_id': event.id,
                })

    return unmatched_ratings
def test_add_periods_with_extras_specification(self):
    '''
    An extra period handed to `compute_task_periods` should be saved with
    the user, task index, start and end that were specified.
    '''
    expected_start = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    expected_end = datetime.datetime(2000, 1, 1, 12, 0, 2, 0)

    compute_task_periods(
        extra_periods=(
            {
                'user_id': 3,
                'task_index': 4,
                'start': expected_start,
                'end': expected_end,
            },
        ),
    )

    saved_periods = TaskPeriod.select()
    self.assertEqual(saved_periods.count(), 1)

    only_period = saved_periods[0]
    self.assertEqual(only_period.user_id, 3)
    self.assertEqual(only_period.task_index, 4)
    self.assertEqual(only_period.start, datetime.datetime(2000, 1, 1, 12, 0, 1, 0))
    self.assertEqual(only_period.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
def test_make_task_period(self):
    '''
    A 'get task' event followed by a 'post task' event should yield one
    task period bounded by the times of those two events.
    '''
    # Setup: create two events bounding a single task
    task_start = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    task_end = task_start + datetime.timedelta(seconds=1)
    create_question_event(time=task_start, event_type='get task')
    create_question_event(time=task_end, event_type='post task')

    # Test: make sure a task has been created with the time bounds of the
    # events that started and ended it.
    compute_task_periods(extra_periods=())

    stored_periods = TaskPeriod.select()
    self.assertEqual(stored_periods.count(), 1)

    stored = stored_periods[0]
    # User 0 is presumably the default of `create_question_event`.
    self.assertEqual(stored.user_id, 0)
    self.assertEqual(stored.start, datetime.datetime(2000, 1, 1, 12, 0, 1, 0))
    self.assertEqual(stored.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
def test_make_task_period(self):
    '''
    One task period should be computed from a single pair of 'get task'
    and 'post task' events, starting and ending at the event times.
    '''
    base_time = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)

    # Setup: two events, one second apart, bounding a single task.
    for offset_seconds, event_type in ((0, 'get task'), (1, 'post task')):
        create_question_event(
            time=base_time + datetime.timedelta(seconds=offset_seconds),
            event_type=event_type,
        )

    # Test: the computed period carries the time bounds of the two events.
    compute_task_periods(extra_periods=())

    computed_periods = TaskPeriod.select()
    self.assertEqual(computed_periods.count(), 1)

    computed = computed_periods[0]
    self.assertEqual(computed.user_id, 0)
    self.assertEqual(computed.start, datetime.datetime(2000, 1, 1, 12, 0, 1, 0))
    self.assertEqual(computed.end, datetime.datetime(2000, 1, 1, 12, 0, 2, 0))
def compute_location_ratings(labels=HAND_LABELED_EVENTS, task_compute_index=None):
    '''
    Scan all location events for rating events and save a rating for each
    one that can be matched to a task period.

    All ratings saved by one call share a new compute index, one greater than
    the highest existing `LocationRating.compute_index`.  Ratings are matched
    against the task periods with compute index `task_compute_index`, which
    defaults to the highest existing `TaskPeriod.compute_index`.  `labels` is
    forwarded unchanged to `create_location_rating`.

    Returns a list with one {'user_id': ..., 'event_id': ...} dictionary for
    each rating event that could not be saved.
    '''
    # Create a new index for this computation
    last_compute_index = LocationRating.select(fn.Max(LocationRating.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine what will be the compute index of the task periods that ratings are matched to.
    # This will become the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar()

    # Raw string so that `\d` is not treated as a string escape.  The `+` sits
    # inside the group so that every digit of the rating is captured, rather
    # than just the last one.
    rating_pattern = re.compile(r"^Rating: (\d+)$")

    # Collect all ratings that couldn't be matched to a task period.  These are
    # returned in case it's important for the caller to know which events we
    # couldn't create rating records for.
    unmatched_ratings = []

    for event in LocationEvent.select():

        # Skip anything that is not a rating event.
        rating_match = rating_pattern.match(event.event_type)
        if rating_match is None:
            continue

        rating_created = create_location_rating(
            compute_index=compute_index,
            task_compute_index=task_compute_index,
            event=event,
            rating=int(rating_match.group(1)),
            labels=labels,
        )

        # If a rating wasn't created, this probably couldn't be matched to a task.
        # Save a record of which event failed to be matched and which user it
        # happened for.
        if not rating_created:
            unmatched_ratings.append({
                'user_id': event.user_id,
                'event_id': event.id,
            })

    return unmatched_ratings
def test_dont_discard_periods_that_dont_match_discard_pattern(self):
    '''
    A computed period is kept when the discard list names the same user
    but a different task index.
    '''
    START_TIME = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    create_question_event(
        user_id=3, question_index=5, time=START_TIME, event_type='get task')
    create_question_event(
        user_id=3,
        question_index=5,
        time=START_TIME + datetime.timedelta(seconds=1),
        event_type='post task',
    )

    compute_task_periods(
        # The discard entry must use the 'task_index' key, which is the key
        # `compute_task_periods` builds its comparison dictionaries with.
        # Under any other key the entry could never match, and this test
        # would pass no matter which task index the events above used.
        discard_periods=({'user_id': 3, 'task_index': 4},),
        extra_periods=(),
    )

    task_periods = TaskPeriod.select()
    self.assertEqual(task_periods.count(), 1)
def test_dont_discard_periods_that_dont_match_discard_pattern(self):
    '''
    Discarding task 4 for user 3 must not remove the period computed for
    task 5 of the same user.
    '''
    START_TIME = datetime.datetime(2000, 1, 1, 12, 0, 1, 0)
    create_question_event(
        user_id=3,
        question_index=5,
        time=START_TIME,
        event_type='get task'
    )
    create_question_event(
        user_id=3,
        question_index=5,
        time=START_TIME + datetime.timedelta(seconds=1),
        event_type='post task'
    )

    # `compute_task_periods` compares discard entries against dictionaries
    # keyed by 'user_id' and 'task_index', so the entry has to use the
    # 'task_index' key to be a near miss rather than an entry that can
    # never match anything.
    compute_task_periods(
        discard_periods=({'user_id': 3, 'task_index': 4},),
        extra_periods=(),
    )

    task_periods = TaskPeriod.select()
    self.assertEqual(task_periods.count(), 1)
def compute_task_periods(discard_periods=DISCARD_TASK_PERIODS, extra_periods=EXTRA_TASK_PERIODS):
    '''
    Compute the period of time each user spent on each task and save the
    periods as `TaskPeriod` records under a new compute index.

    A period is saved for each 'get task' event that is followed by a
    'post task' event with the same question index from the same user,
    unless {'user_id': ..., 'task_index': ...} for that period appears in
    `discard_periods`.  Every entry of `extra_periods` (a dictionary with
    'user_id', 'task_index', 'start' and 'end') is then saved as well,
    without consulting `discard_periods`.
    '''
    # Create a new index for this computation
    previous_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar() or 0
    compute_index = previous_index + 1

    # Compute the ID of the last user to complete the study
    highest_user_id = QuestionEvent.select(fn.Max(QuestionEvent.user_id)).scalar() or 0

    # Compute the time that each user spends in each question
    for user_id in range(highest_user_id + 1):

        user_events = (
            QuestionEvent.select()
            .where(QuestionEvent.user_id == user_id)
            .order_by(QuestionEvent.time.asc())
        )

        opening_event = None
        for current_event in user_events:

            # If the 'task' page has been loaded, remember the event that started it.
            if current_event.event_type == 'get task':
                opening_event = current_event
                continue

            # Only the end of a task is of interest from here on.
            if current_event.event_type != 'post task':
                continue

            # An end-of-task event always clears the remembered start, whether
            # or not a period ends up being saved for it.
            started_by, opening_event = opening_event, None

            # No task was open, so there is nothing to close.
            if started_by is None:
                continue

            # The 'post' must be for the same task index as the event that started it.
            task_index = current_event.question_index
            if task_index != started_by.question_index:
                continue

            # Only save a task period if its user and index are not in the discard list.
            if {'user_id': user_id, 'task_index': task_index} in discard_periods:
                continue

            TaskPeriod.create(
                compute_index=compute_index,
                user_id=user_id,
                task_index=task_index,
                concern_index=_get_concern_index(user_id, task_index),
                start=started_by.time,
                end=current_event.time,
            )

    # The caller may have provided extra task periods to append to the computed results.
    # Add these records in one by one.
    for extra in extra_periods:
        extra_user_id = extra['user_id']
        extra_task_index = extra['task_index']
        TaskPeriod.create(
            compute_index=compute_index,
            user_id=extra_user_id,
            task_index=extra_task_index,
            concern_index=_get_concern_index(extra_user_id, extra_task_index),
            start=extra['start'],
            end=extra['end'],
        )
def compute_task_periods(discard_periods=DISCARD_TASK_PERIODS, extra_periods=EXTRA_TASK_PERIODS):
    '''
    Save a `TaskPeriod` for each task a user started and finished, plus one
    for each entry of `extra_periods`, all under a fresh compute index.

    For each user, question events are visited in time order.  A 'get task'
    event opens a task; the next 'post task' event closes it, and a period is
    saved if both events share a question index and the user / task index
    pair is not listed in `discard_periods`.  Entries of `extra_periods` are
    saved unconditionally from their 'user_id', 'task_index', 'start' and
    'end' values.
    '''
    # Create a new index for this computation
    latest_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar() or 0
    compute_index = latest_index + 1

    # Compute the ID of the last user to complete the study
    last_user_id = QuestionEvent.select(fn.Max(QuestionEvent.user_id)).scalar() or 0

    # Compute the time that each user spends in each question
    for user_id in range(0, last_user_id + 1):

        events_in_order = QuestionEvent.select().where(
            QuestionEvent.user_id == user_id,
        ).order_by(QuestionEvent.time.asc())

        pending_start = None
        for event in events_in_order:
            event_type = event.event_type

            if event_type == 'get task':
                # The 'task' page has been loaded: this event starts a task.
                pending_start = event

            elif event_type == 'post task':
                # A period is saved only when a task is open, the indexes of
                # the start and end events agree, and the user / task pair is
                # not in the discard list.
                should_save = (
                    pending_start is not None
                    and event.question_index == pending_start.question_index
                    and {
                        'user_id': user_id,
                        'task_index': event.question_index,
                    } not in discard_periods
                )
                if should_save:
                    TaskPeriod.create(
                        compute_index=compute_index,
                        user_id=user_id,
                        task_index=event.question_index,
                        concern_index=_get_concern_index(user_id, event.question_index),
                        start=pending_start.time,
                        end=event.time,
                    )

                # Having seen the end of a task, reset state such that no
                # "start task" event has been seen.
                pending_start = None

    # The caller may have provided a list of extra task periods to append to the
    # computed results.  Add these records in one by one.
    for period_spec in extra_periods:
        TaskPeriod.create(
            compute_index=compute_index,
            user_id=period_spec['user_id'],
            task_index=period_spec['task_index'],
            concern_index=_get_concern_index(
                period_spec['user_id'], period_spec['task_index']),
            start=period_spec['start'],
            end=period_spec['end'],
        )
def compute_location_visits(task_compute_index=None):
    '''
    Assemble location "visits" from each user's location events during each
    task, and save them through `create_location_visit` under a new compute
    index.

    `task_compute_index` selects which computation of task periods the visits
    are matched to; when None, the highest existing `TaskPeriod.compute_index`
    is used.  For each user and each task index in `TASK_RANGE`, only the first
    matching task period is considered, and only events whose log date falls
    within that period (bounds included) are inspected.
    '''
    # Create a new index for this computation
    last_compute_index = LocationVisit.select(fn.Max(LocationVisit.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Determine what will be the compute index of the task periods that these visits are
    # matched to.  This will become the latest compute index if it hasn't been specified.
    if task_compute_index is None:
        task_compute_index = TaskPeriod.select(fn.Max(TaskPeriod.compute_index)).scalar()

    # Compute the ID of the last user to complete the study.  Fall back to 0 when
    # there are no location events, in which case the aggregate yields None and
    # `max_user_id + 1` below would otherwise raise a TypeError.
    max_user_id = LocationEvent.select(fn.Max(LocationEvent.user_id)).scalar() or 0

    # Compute the time that each user spends in each question
    for user_id in range(0, max_user_id + 1):

        # Visit all tasks for each user
        for task_index in TASK_RANGE:

            # Fetch the period of time for this task
            task_periods = (
                TaskPeriod.select()
                .where(
                    TaskPeriod.compute_index == task_compute_index,
                    TaskPeriod.task_index == task_index,
                    TaskPeriod.user_id == user_id,
                )
            )
            if task_periods.count() < 1:
                continue
            task_period = task_periods[0]

            # Fetch the events for all locations the user has visited during this task
            location_events = (
                LocationEvent
                .select()
                .where(
                    LocationEvent.user_id == user_id,
                    LocationEvent.log_date >= task_period.start,
                    LocationEvent.log_date <= task_period.end,
                )
                # While we inspect the "log date" when the server received notice of
                # the event, we use the "visit date" when the browser experienced the
                # events to sort them, as we think this will preserve the original
                # ordering much better.  See the notes in the `create_location_visit`
                # method for more details.
                .order_by(LocationEvent.visit_date.asc())
            )

            # In the space below, we assemble "visits" from sequences of events.
            # Two pieces of state are tracked: the ID of the tab that is currently
            # active (None when no tab is active), and the event that most recently
            # set the URL being visited in that tab.
            active_tab_id = None
            active_tab_latest_url_event = None

            for event in location_events:

                # When a new page is loaded in the current tab, this is the end of the
                # last event and the start of a new one (that will be in the same tab).
                if event.event_type in NEW_PAGE_EVENTS:
                    if active_tab_id is not None and event.tab_id == active_tab_id:
                        if event.url != active_tab_latest_url_event.url:
                            create_location_visit(
                                compute_index=compute_index,
                                task_period=task_period,
                                user_id=user_id,
                                activating_event=active_tab_latest_url_event,
                                deactivating_event=event,
                            )
                            active_tab_latest_url_event = event

                # If the window has been deactivated, then end the visit in the current tab
                if event.event_type in DEACTIVATING_EVENTS:
                    if active_tab_id is not None:
                        create_location_visit(
                            compute_index=compute_index,
                            task_period=task_period,
                            user_id=user_id,
                            activating_event=active_tab_latest_url_event,
                            deactivating_event=event,
                        )
                        active_tab_id = None
                        active_tab_latest_url_event = None

                # If a tab or window has been activated, that tab is now active.
                if event.event_type in ACTIVATING_EVENTS:

                    # End any visits in progress for other tabs
                    if active_tab_id is not None:
                        create_location_visit(
                            compute_index=compute_index,
                            task_period=task_period,
                            user_id=user_id,
                            activating_event=active_tab_latest_url_event,
                            deactivating_event=event,
                        )

                    # Set the new active tab
                    active_tab_id = event.tab_id
                    active_tab_latest_url_event = event