def process_segments(mongodb, log_entries): """ For a list of log entries, parse them into a format that makes it easy to construct segments. Indexed by username, each entry in the resulting data structure includes the following: - segments: all segments for this user - entries: all raw log entries """ collection = mongodb[VIDEOS_COL] current_videos = list(collection.find({}, {"video_id": 1}).distinct("video_id")) videos = [] for video in current_videos: videos.append(video) data = {} index = 0 for entry in log_entries: index += 1 if index % 1000 == 0: print ".", # print entry username = get_prop(entry, "USERNAME") # ignore ones that are already processed #if entry["processed"] == 1: # continue # ignore if username is empty if username == "": continue video_id = get_prop(entry, "VIDEO_ID") # non-video player events if video_id == "": # TODO: use this to more accurately capture sessions # and also display before/after destinations continue # video player events else: # if this video is not in the video database, add it if video_id not in videos: print "adding video", video_id register_new_video(mongodb, video_id, entry) videos.append(video_id) if video_id not in data: data[video_id] = {} if username not in data[video_id]: data[video_id][username] = {} data[video_id][username]["segments"] = [] data[video_id][username]["entries"] = [] #TODO: unindent? data[video_id][username]["entries"].append(entry) #TODO: not rely on data? for video_id in data: for username in data[video_id]: #print video_id, username #for entry in data[video_id][username]["entries"]: # print " ", get_prop(entry, "TYPE_EVENT") data[video_id][username]["segments"] = \ construct_segments(data[video_id][username]["entries"]) # print video_id, username, len(data[video_id][username]["segments"]), len(data[video_id][username]["entries"]) del data[video_id][username]["entries"] return data
def send_events_local(): global results client = MongoClient() mongodb = client[MONGODB_NAME] start_time = time.time() valid_events = 0 # Store raw event information for event in results: #entry = {} #for key in event.keys(): # entry[key] = event[key] # flag indicating whether this item has been processed. # entry["processed"] = 0 event["processed"] = 0 collection = mongodb[EVENTS_COL] # get a list of event types to keep: # everything that starts with EVT defined in common.py temp_list = [CONF[key] for key in CONF if key.startswith("EVT")] events_type_list = list(chain(*temp_list)) if get_prop(event, "TYPE_EVENT") in events_type_list: collection.insert(event) valid_events += 1 print "=========== INCOMING EVENTS", len(results), "total,", valid_events, "valid. =============" print sys._getframe().f_code.co_name, "COMPLETED", (time.time() - start_time), "seconds"
def video_interaction_event(mongodb, events): """ Store all video-related events from the tracking log into the database. There are three collections: 1) video_events: raw event information 2) video_segments: watching segments recovered from events 3) video_heatmap: view counts for each second of a video To send events, refer to send_event.py """ valid_events = 0 # Store raw event information for event in events: entry = {} for key in event.keys(): entry[key] = event[key] # flag indicating whether this item has been processed. entry["processed"] = 0 collection = mongodb[EVENTS_COL] # get a list of event types to keep: # everything that starts with EVT defined in common.py temp_list = [CONF[key] for key in CONF if key.startswith("EVT")] events_type_list = list(chain(*temp_list)) if get_prop(event, "TYPE_EVENT") in events_type_list: collection.insert(entry) valid_events += 1 print "=========== INCOMING EVENTS", len(events), "total,", valid_events, "valid. ============="
def register_new_video(mongodb, video_id, entry):
    """
    Insert a record for a previously unseen video into the videos
    collection (mongodb[VIDEOS_COL]).

    The stored record holds the video ID, the host taken from
    CONF["VIDEO_HOST"], the duration obtained from get_video_duration()
    for that ID and host, and the VIDEO_NAME read from the log entry.
    """
    videos = mongodb[VIDEOS_COL]
    host = CONF["VIDEO_HOST"]
    # The duration lookup is the important part of registering a video.
    duration = get_video_duration(video_id, host)
    name = get_prop(entry, "VIDEO_NAME")
    videos.insert({
        "video_id": video_id,
        "host": host,
        "duration": duration,
        "video_name": name,
    })
def construct_segments(log_entries): """ Construct a video-watching segment from a list of video player log entries for a single video. A segment indicates a block of time a student watched a part of a video clip. It is used to create various visualizations of students' interaction with video content. A segment includes time_start: when does this segment start? (in sec) time_end: when does this segment end? (in sec) date_start: when did this watching start? (timestamp) date_end: when did this watching end? (timestamp) """ # TODO: do not assume that entries are time-ordered. # make sure it's sorted by time #sorted_entries = sorted(log_entries, key=lambda e: e["time"]) segments = [] # two items are compared, so start from index 1 for i in range(1, len(log_entries)): entry1 = log_entries[i-1] entry2 = log_entries[i] # print get_prop(entry1, "TYPE_EVENT"), get_prop(entry2, "TYPE_EVENT") try: e1_time = datetime.strptime(get_prop(entry1, "TIMESTAMP"), "%Y-%m-%d %H:%M:%S.%f") except ValueError: try: e1_time = datetime.strptime(get_prop(entry1, "TIMESTAMP"), "%Y-%m-%dT%H:%M:%S") except ValueError, v: if len(v.args) > 0 and v.args[0].startswith('unconverted data remains: '): new_time_string = get_prop(entry1, "TIMESTAMP")[:-(len(v.args[0])-26)] e1_time = datetime.strptime(new_time_string, "%Y-%m-%dT%H:%M:%S") else: e1_time = datetime.strptime(get_prop(entry1, "TIMESTAMP"), "%Y-%m-%d %H:%M:%S") # print "e1_time", e1_time except: print "time format error. 
moving on" continue try: e2_time = datetime.strptime(get_prop(entry2, "TIMESTAMP"), "%Y-%m-%d %H:%M:%S.%f") except ValueError: try: e2_time = datetime.strptime(get_prop(entry2, "TIMESTAMP"), "%Y-%m-%dT%H:%M:%S") except ValueError, v: if len(v.args) > 0 and v.args[0].startswith('unconverted data remains: '): new_time_string = get_prop(entry2, "TIMESTAMP")[:-(len(v.args[0])-26)] e2_time = datetime.strptime(new_time_string, "%Y-%m-%dT%H:%M:%S") else: e2_time = datetime.strptime(get_prop(entry2, "TIMESTAMP"), "%Y-%m-%d %H:%M:%S") # print "e2_time", e2_time except: print "time format error. moving on" continue
continue try: e2_time = datetime.strptime(get_prop(entry2, "TIMESTAMP"), "%Y-%m-%d %H:%M:%S.%f") except ValueError: try: e2_time = datetime.strptime(get_prop(entry2, "TIMESTAMP"), "%Y-%m-%dT%H:%M:%S") except ValueError, v: if len(v.args) > 0 and v.args[0].startswith('unconverted data remains: '): new_time_string = get_prop(entry2, "TIMESTAMP")[:-(len(v.args[0])-26)] e2_time = datetime.strptime(new_time_string, "%Y-%m-%dT%H:%M:%S") except: print "time format error. moving on" continue try: segment = {} if get_prop(entry1, "TYPE_EVENT") not in CONF["EVT_VIDEO_PLAY"]: continue # case 1. play-pause: watch for a while and pause if get_prop(entry2, "TYPE_EVENT") in CONF["EVT_VIDEO_PAUSE"]: # 1) compute time elapsed between play and pause # 2) subtract from the final position to get the starting position # 3) avoid negative time with max(x, 0) # time_diff = time.mktime(e2_time) - time.mktime(e1_time) time_diff = e2_time - e1_time time_diff_secs = time_diff.days * 60 * 60 * 24 + time_diff.seconds try: elapsed_time = float(get_prop(entry2, "VIDEO_TIME")) - time_diff_secs segment["time_start"] = max(elapsed_time, 0) segment["time_end"] = float(get_prop(entry2, "VIDEO_TIME")) except TypeError: print "malformatted field. skipping"