示例#1
0
def RemoveCourseRecords(course_log_path, course_code, mode, cursor):

    # Retrieve all of the course_learner_id
    metadata_path = str(course_log_path + course_code + "/metadata/")
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_id = course_metadata_map["course_id"]

    course_learner_id_set = set()

    sql = "SELECT learner_index.course_learner_id FROM learner_index where learner_index.course_id = '" + course_id + "';"
    cursor.execute(sql)
    results = cursor.fetchall()
    for result in results:
        course_learner_id = result[0]
        course_learner_id_set.add(course_learner_id)

    if mode == "log":
        tables = [
            "video_interaction", "submissions", "assessments", "quiz_sessions",
            "forum_interaction", "forum_sessions", "sessions"
        ]
        for table in tables:
            for course_learner_id in course_learner_id_set:
                sql = "delete from " + table + " where " + table + ".course_learner_id = '" + course_learner_id + "';"
                cursor.execute(sql)

        # Remove all of the temporary files
        files = os.listdir(str(course_log_path + course_code + "/"))
        for file in files:
            if not os.path.isdir(
                    str(course_log_path + course_code + "/" + file)):
                os.remove(str(course_log_path + course_code + "/" + file))

    if mode == "metadata":
        # Quiz_questions table
        sql = "delete quiz_questions from quiz_questions, course_elements where course_elements.course_id = '" + course_id + "' and course_elements.element_id = quiz_questions.question_id;"
        cursor.execute(sql)

        tables = ["courses", "course_elements"]
        for table in tables:
            sql = "delete from " + table + " where " + table + ".course_id = '" + course_id + "';"
            cursor.execute(sql)

        tables = ["course_learner", "learner_demographic", "learner_index"]
        for table in tables:
            for course_learner_id in course_learner_id_set:
                sql = "delete from " + table + " where " + table + ".course_learner_id = '" + course_learner_id + "';"
                cursor.execute(sql)

    if mode == "survey":
        # Survey_descriptions table
        sql = "delete from survey_descriptions where survey_descriptions.course_id = '" + course_id + "';"
        cursor.execute(sql)
        # Survey_responses table
        for course_learner_id in course_learner_id_set:
            sql = "delete from survey_responses where survey_responses.course_learner_id = '" + course_learner_id + "';"
            cursor.execute(sql)
示例#2
0
def quiz_sessions(metadata_path, daily_log_path,
                  remaining_forum_session_log_path, cursor):

    utc = pytz.UTC

    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]

    # Quiz-related events
    quiz_event_types = []

    # Problem check
    quiz_event_types.append("problem_check")  # Server
    quiz_event_types.append("save_problem_check")
    quiz_event_types.append("problem_check_fail")
    quiz_event_types.append("save_problem_check_fail")

    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded success- fully.
    quiz_event_types.append("problem_graded")

    # The server emits problem_rescore events when a problem is successfully rescored.
    quiz_event_types.append("problem_rescore")
    quiz_event_types.append("problem_rescore_fail")

    quiz_event_types.append("problem_reset")  # event_source: serve
    quiz_event_types.append("reset_problem")
    quiz_event_types.append("reset_problem_fail")

    # The server emits problem_save events after a user saves a problem.
    quiz_event_types.append("problem_save")  # event_source: server
    quiz_event_types.append("save_problem_fail")
    quiz_event_types.append("save_problem_success")

    # Show answer
    quiz_event_types.append("problem_show")
    quiz_event_types.append("showanswer")

    quiz_event_types.append("edx.problem.hint.demandhint_displayed")
    quiz_event_types.append("edx.problem.hint.feedback_displayed")

    child_parent_map = course_metadata_map["child_parent_map"]

    learner_logs = {}
    remaining_learner_logs = {}

    quiz_sessions = {}

    # Read remaining event logs
    if os.path.exists(remaining_forum_session_log_path):
        remaining_input_file = open(remaining_forum_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)

    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)

    input_file = open(daily_log_path, "r")
    for line in input_file:

        jsonObject = json.loads(line)

        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue

        # For quiz session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])

        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)

        event_time = jsonObject["time"]

        # Check whether the event record belongs to that day
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue

        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)

        if learner_logs.has_key(course_learner_id):
            learner_logs[course_learner_id].append({
                "event_time": event_time,
                "event_type": event_type
            })
        else:
            learner_logs[course_learner_id] = [{
                "event_time": event_time,
                "event_type": event_type
            }]

    input_file.close()

    # For quiz session separation
    for learner in learner_logs.keys():

        course_learner_id = learner
        event_logs = learner_logs[learner]

        # Sorting
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))

        session_id = ""
        start_time = ""
        end_time = ""

        final_time = ""

        for i in range(len(event_logs)):

            if session_id == "":

                if "problem+block" in event_logs[i][
                        "event_type"] or "_problem;_" in event_logs[i][
                            "event_type"]:

                    event_type_array = event_logs[i]["event_type"].split("/")

                    if "problem+block" in event_logs[i]["event_type"]:
                        question_id = event_type_array[4]

                    if "_problem;_" in event_logs[i]["event_type"]:
                        question_id = event_type_array[6].replace(";_", "/")

                    if question_id in child_parent_map.keys():
                        parent_block_id = child_parent_map[question_id]
                        session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                        start_time = event_logs[i]["event_time"]
                        end_time = event_logs[i]["event_time"]

            else:

                if "problem+block" in event_logs[i][
                        "event_type"] or "_problem;_" in event_logs[i][
                            "event_type"] or event_logs[i][
                                "event_type"] in quiz_event_types:

                    if event_logs[i][
                            "event_time"] > end_time + datetime.timedelta(
                                hours=0.5):

                        if quiz_sessions.has_key(session_id):
                            quiz_sessions[session_id]["time_array"].append({
                                "start_time":
                                start_time,
                                "end_time":
                                end_time
                            })
                        else:
                            quiz_sessions[session_id] = {
                                "course_learner_id":
                                course_learner_id,
                                "time_array": [{
                                    "start_time": start_time,
                                    "end_time": end_time
                                }]
                            }

                        final_time = event_logs[i]["event_time"]

                        if "problem+block" in event_logs[i][
                                "event_type"] or "_problem;_" in event_logs[i][
                                    "event_type"] or event_logs[i][
                                        "event_type"] in quiz_event_types:
                            event_type_array = event_logs[i][
                                "event_type"].split("/")

                            if "problem+block" in event_logs[i]["event_type"]:
                                question_id = event_type_array[4]

                            if "_problem;_" in event_logs[i]["event_type"]:
                                question_id = event_type_array[6].replace(
                                    ";_", "/")

                            if question_id in child_parent_map.keys():
                                parent_block_id = child_parent_map[question_id]
                                session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                            else:
                                session_id = ""
                                start_time = ""
                                end_time = ""
                    else:
                        end_time = event_logs[i]["event_time"]

                else:

                    if event_logs[i][
                            "event_time"] <= end_time + datetime.timedelta(
                                hours=0.5):
                        end_time = event_logs[i]["event_time"]

                    if quiz_sessions.has_key(session_id):
                        quiz_sessions[session_id]["time_array"].append({
                            "start_time":
                            start_time,
                            "end_time":
                            end_time
                        })
                    else:
                        quiz_sessions[session_id] = {
                            "course_learner_id":
                            course_learner_id,
                            "time_array": [{
                                "start_time": start_time,
                                "end_time": end_time
                            }]
                        }

                    final_time = event_logs[i]["event_time"]

                    session_id = ""
                    start_time = ""
                    end_time = ""

        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)

            remaining_learner_logs[course_learner_id] = new_logs

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_forum_session_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_forum_session_log_path)

    # To compress the session event_logs
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:

            start_time = ""
            end_time = ""
            updated_time_array = []

            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i][
                        "start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i][
                        "end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i][
                            "start_time"] > end_time + datetime.timedelta(
                                hours=0.5):
                        updated_time_array.append({
                            "start_time": start_time,
                            "end_time": end_time
                        })
                        start_time = quiz_sessions[session_id]["time_array"][
                            i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]
                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
                    else:
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]

                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })

            quiz_sessions[session_id]["time_array"] = updated_time_array

    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i][
                "start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                duration = process_null((end_time - start_time).days * 24 *
                                        60 * 60 +
                                        (end_time - start_time).seconds)
                final_session_id = session_id + "_" + str(
                    start_time) + "_" + str(end_time)
                if duration > 5:
                    array = (final_session_id, course_learner_id, start_time,
                             end_time, duration)
                    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                    try:
                        cursor.execute(sql, array)
                    except Exception as e:
                        pass
示例#3
0
def survey_mode(metadata_path, survey_path, cursor, pre_id_index, post_id_index):
    
    description_records = []
    response_records = []
    
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_id = course_metadata_map["course_id"]
    
    files = os.listdir(survey_path)
    
    learner_id_map = {}
    
    # Processing ID information
    for file in files:
        if "anon-ids.csv" in file:
            id_file = open(survey_path + file, "r")
            id_reader = csv.reader(id_file)
            id_reader.next()
            for row in id_reader:
                global_id = row[0].replace("\"","")
                anonymized_id = row[1].replace("\"","")
                learner_id_map[anonymized_id] = global_id
    
    # Processing Pre-survey information      
    for file in files:    
        if "pre-survey" in file:
            pre_file = open(survey_path + file, "r")
            pre_reader = csv.reader(pre_file)
            
            question_id_row = pre_reader.next()
            question_description_row = pre_reader.next()
                                               
            for i in range(len(list(question_id_row))):
                question_id = course_id + "_pre_" + question_id_row[i].replace("\"","")
                question_description = question_description_row[i].replace("\'", "\\'")
                array = [question_id, course_id, "pre", question_description]
                description_records.append(array)
                
            for row in pre_reader:
                learner_id = row[pre_id_index]
                print learner_id
                if learner_id in learner_id_map.keys():
                    course_learner_id = course_id + "_" + learner_id_map[learner_id]
                    
                    for i in range(len(list(question_id_row))):
                        question_id = course_id + "_pre_" + question_id_row[i].replace("\"","")
                        response_id = course_learner_id + "_" + "pre" + "_" + question_id_row[i].replace("\"","")
                        answer = row[i]
                        
                        array = [response_id, course_learner_id, question_id, answer]
                        response_records.append(array)           
            pre_file.close()

    # Processing Post-survey information      
    for file in files:    
        if "post-survey" in file:
            post_file = open(survey_path + file, "r")
            post_reader = csv.reader(post_file)
            
            question_id_row = post_reader.next()
            question_description_row = post_reader.next()
                                               
            for i in range(len(list(question_id_row))):
                question_id = course_id + "_post_" + question_id_row[i].replace("\"","")
                question_description = question_description_row[i].replace("\'", "\\'")
                array = [question_id, course_id, "post", question_description]
                description_records.append(array)
                
            for row in post_reader:
                learner_id = row[post_id_index]
                if learner_id in learner_id_map.keys():
                    course_learner_id = course_id + "_" + learner_id_map[learner_id]
                    
                    for i in range(len(list(question_id_row))):
                        question_id = course_id + "_post_" + question_id_row[i].replace("\"","")
                        response_id = course_learner_id + "_post_" + question_id_row[i].replace("\"","")
                        answer = row[i]
                        
                        array = [response_id, course_learner_id, question_id, answer]
                        response_records.append(array)
            post_file.close()
    
    # Database version
    for array in description_records:
        question_id = array[0]
        course_id = array[1]
        question_type = array[2]
        question_description = array[3]
        sql = "insert into survey_descriptions (question_id, course_id, question_type, question_description) values (%s,%s,%s,%s)"
        data = (question_id, course_id, question_type, question_description) 
        cursor.execute(sql, data)
        
    for array in response_records:
        response_id = array[0]
        course_learner_id = array[1]
        question_id = array[2]
        answer = array[3]
        sql = "insert into survey_responses (response_id, course_learner_id, question_id, answer) values (%s,%s,%s,%s)"
        data = (response_id, course_learner_id, question_id, answer)
        cursor.execute(sql, data)
    '''               
示例#4
0
def sessions(metadata_path, daily_log_path, remaining_session_log_path,
             cursor):

    utc = pytz.UTC

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]

    learner_logs = {}
    remaining_learner_logs = {}

    # Read remaining event logs
    if os.path.exists(remaining_session_log_path):
        remaining_input_file = open(remaining_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)

    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)

    input_file = open(daily_log_path, "r")
    for line in input_file:

        jsonObject = json.loads(line)

        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue

        # For session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])

        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)

        event_time = jsonObject["time"]

        # Check whether the event record belongs to that day
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue

        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)

        if course_learner_id in course_learner_id_set:
            learner_logs[course_learner_id].append({
                "event_time": event_time,
                "event_type": event_type
            })
        else:
            learner_logs[course_learner_id] = [{
                "event_time": event_time,
                "event_type": event_type
            }]
            course_learner_id_set.add(course_learner_id)

    input_file.close()

    # For session separation
    for learner in learner_logs.keys():

        course_learner_id = learner
        event_logs = learner_logs[learner]

        # Sorting
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))

        session_id = ""
        start_time = ""
        end_time = ""

        final_time = ""

        for i in range(len(event_logs)):

            if start_time == "":

                # Initialization
                start_time = event_logs[i]["event_time"]
                end_time = event_logs[i]["event_time"]

            else:

                if event_logs[i]["event_time"] > end_time + datetime.timedelta(
                        hours=0.5):

                    session_id = course_learner_id + "_" + str(
                        start_time) + "_" + str(end_time)
                    duration = (end_time - start_time).days * 24 * 60 * 60 + (
                        end_time - start_time).seconds

                    if duration > 5:
                        array = (session_id, course_learner_id, start_time,
                                 end_time, process_null(duration))
                        sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                        try:
                            cursor.execute(sql, array)
                        except Exception as e:
                            pass

                    final_time = event_logs[i]["event_time"]

                    # Re-initialization
                    session_id = ""
                    start_time = event_logs[i]["event_time"]
                    end_time = event_logs[i]["event_time"]

                else:

                    if event_logs[i]["event_type"] == "page_close":

                        end_time = event_logs[i]["event_time"]

                        session_id = course_learner_id + "_" + str(
                            start_time) + "_" + str(end_time)
                        duration = (end_time -
                                    start_time).days * 24 * 60 * 60 + (
                                        end_time - start_time).seconds

                        if duration > 5:
                            array = (session_id, course_learner_id, start_time,
                                     end_time, process_null(duration))
                            sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                            try:
                                cursor.execute(sql, array)
                            except Exception as e:
                                pass

                        # Re-initialization
                        session_id = ""
                        start_time = ""
                        end_time = ""

                        final_time = event_logs[i]["event_time"]

                    else:

                        end_time = event_logs[i]["event_time"]

        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)

            remaining_learner_logs[course_learner_id] = new_logs

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_session_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_session_log_path)
示例#5
0
def learner_mode(metadata_path, course_code, cursor):

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    array = (course_metadata_map["course_id"],
             course_metadata_map["course_name"],
             course_metadata_map["start_time"],
             course_metadata_map["end_time"])
    sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
    cursor.execute(sql, array)

    # Course_element table
    for element_id in course_metadata_map["element_time_map"].keys():

        element_start_time = course_metadata_map["element_time_map"][
            element_id]

        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are count as 1st week.
        # In order to avoid above situation, I use date to replace datetime here.
        week = process_null(
            getDayDiff(course_metadata_map["start_time"].date(),
                       element_start_time.date()) / 7 + 1)

        array = (element_id,
                 course_metadata_map["element_type_map"][element_id], week,
                 course_metadata_map["course_id"])
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)

    # Quiz_question table
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]

    for question_id in quiz_question_map:

        question_due = ""
        question_weight = quiz_question_map[question_id]
        quiz_question_parent = course_metadata_map["child_parent_map"][
            question_id]

        if (question_due == "") and (quiz_question_parent
                                     in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]

        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][
                quiz_question_parent]
            if (question_due == "") and (quiz_question_parent
                                         in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]

        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)

        array = (question_id, quiz_question_type, question_weight,
                 question_due)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)

    files = os.listdir(metadata_path)

    # Learner_demographic table
    learner_mail_map = {}

    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}

    # Enrolled learners set
    enrolled_learner_set = set()

    course_id = ""

    # Processing student_courseenrollment data
    for file in files:
        if "student_courseenrollment" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                time = datetime.datetime.strptime(record[3],
                                                  "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id

                if cmp_datetime(course_metadata_map["end_time"], time):

                    enrolled_learner_set.add(global_learner_id)

                    array = (global_learner_id, course_id, course_learner_id)
                    sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
                    cursor.execute(sql, array)

                    course_learner_map[global_learner_id] = course_learner_id
                    learner_enrollment_time_map[global_learner_id] = time
            input_file.close()

    # Processing auth_user data
    for file in files:
        if "auth_user-" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]
            input_file.close()

    # Processing certificates_generatedcertificate data
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file in files:
        if "certificates_generatedcertificate" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()

            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = process_null(record[3])
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]

                register_time = ""
                if course_learner_map.has_key(global_learner_id):
                    register_time = learner_enrollment_time_map[
                        global_learner_id]
                register_time = process_null(register_time)

                if course_learner_map.has_key(global_learner_id):
                    num_certifiedLearners += 1
                    array = (course_learner_map[global_learner_id],
                             final_grade, enrollment_mode, certificate_status,
                             register_time)
                    sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
                else:
                    num_uncertifiedLearners += 1
            input_file.close()

    # Processing auth_userprofile data
    for file in files:
        if "auth_userprofile" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()

            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                year_of_birth = process_null(process_null(record[9]))
                level_of_education = record[10]
                country = record[13]

                course_learner_id = process_null(course_id + "_" +
                                                 global_learner_id)

                if global_learner_id in enrolled_learner_set:
                    array = (course_learner_id, gender, year_of_birth,
                             level_of_education, country,
                             learner_mail_map[global_learner_id])
                    sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
            input_file.close()

    # Generating forum_interaction records for courses starting before 1T2015
    if "1T2015" in course_code or "2014" in course_code or "2013" in course_code:
        forum_interaction_mongo(metadata_path, cursor)
示例#6
0
def forum_interaction(metadata_path, daily_log_path, cursor):

    course_metadata_map = ExtractCourseInformation(metadata_path)

    # Forum-related events
    forum_event_types = []
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.comment.created")

    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)

        event_type = jsonObject["event_type"]

        if event_type in forum_event_types:

            # Skip records without user_id
            if "user_id" not in jsonObject["context"] or jsonObject["context"][
                    "user_id"] == "" or jsonObject["context"][
                        "user_id"] == None:
                continue

            post_timestamp = jsonObject["time"]

            # Check whether the event record belongs to that day
            log_date = post_timestamp[0:10]
            if log_date not in daily_log_path:
                print "Log not belonging to the day...\t" + log_date
                continue

            post_id = jsonObject["event"]["id"]
            course_learner_id = jsonObject["context"]["course_id"] + "_" + str(
                jsonObject["context"]["user_id"])

            post_type = ""
            if event_type == "edx.forum.thread.created":
                post_type = "Thread_" + jsonObject["event"]["thread_type"]
            if event_type == "edx.forum.response.created":
                post_type = "Response"
            if event_type == "edx.forum.comment.created":
                post_type = "Comment"

            post_title = ""
            if event_type == "edx.forum.thread.created":
                post_title = jsonObject["event"]["title"]

            post_content = jsonObject["event"]["body"]

            post_timestamp = post_timestamp[0:19]
            post_timestamp = post_timestamp.replace("T", " ")
            post_timestamp = datetime.datetime.strptime(
                post_timestamp, "%Y-%m-%d %H:%M:%S")

            post_parent_id = ""
            if event_type == "edx.forum.comment.created":
                post_parent_id = jsonObject["event"]["response"]["id"]

            post_thread_id = ""
            if event_type == "edx.forum.response.created":
                post_thread_id = jsonObject["event"]["discussion"]["id"]

            # Pre-processing title & content
            post_title = post_title.replace("\n", " ")
            post_title = post_title.replace("\\", "\\\\")
            post_title = post_title.replace("\'", "\\'")
            post_content = post_content.replace("\n", " ")
            post_content = post_content.replace("\\", "\\\\")
            post_content = post_content.replace("\'", "\\'")

            post_title = cleanUnicode(post_title)
            post_content = cleanUnicode(post_content)

            if post_timestamp < course_metadata_map["end_time"]:
                array = [
                    post_id, course_learner_id, post_type, post_title,
                    post_content, post_timestamp, post_parent_id,
                    post_thread_id
                ]
                sql = "replace into forum_interaction(post_id, course_learner_id, post_type, post_title, post_content, post_timestamp, post_parent_id, post_thread_id) values (%s,%s,%s,%s,%s,%s,%s,%s);"
                try:
                    cursor.execute(sql, array)
                except Exception as e:
                    pass

    input_file.close()
示例#7
0
def forum_interaction_mongo(metadata_path, cursor):

    course_metadata_map = ExtractCourseInformation(metadata_path)

    files = os.listdir(metadata_path)
    for file in files:
        if ".mongo" in file:
            forum_file = open(str(metadata_path + file), "r")
            for line in forum_file:
                jsonObject = json.loads(line)

                post_id = jsonObject["_id"]["$oid"]
                course_learner_id = jsonObject["course_id"] + "_" + jsonObject[
                    "author_id"]

                post_type = jsonObject["_type"]
                if post_type == "CommentThread":
                    post_type += "_" + jsonObject["thread_type"]
                if "parent_id" in jsonObject and jsonObject["parent_id"] != "":
                    post_type = "Comment_Reply"

                post_title = ""
                if "title" in jsonObject:
                    post_title = jsonObject["title"]

                post_content = jsonObject["body"]

                post_timestamp = jsonObject["created_at"]["$date"]
                if type(post_timestamp) == type(100):
                    post_timestamp = strftime("%Y-%m-%d %H:%M:%S",
                                              gmtime(post_timestamp / 1000))
                    post_timestamp = datetime.datetime.strptime(
                        post_timestamp, "%Y-%m-%d %H:%M:%S")
                if isinstance(post_timestamp, unicode):
                    post_timestamp = post_timestamp[0:19]
                    post_timestamp = post_timestamp.replace("T", " ")
                    post_timestamp = datetime.datetime.strptime(
                        post_timestamp, "%Y-%m-%d %H:%M:%S")

                post_parent_id = ""
                if "parent_id" in jsonObject:
                    post_parent_id = jsonObject["parent_id"]["$oid"]

                post_thread_id = ""
                if "comment_thread_id" in jsonObject:
                    post_thread_id = jsonObject["comment_thread_id"]["$oid"]

                post_title = post_title.replace("\n", " ")
                post_title = post_title.replace("\\", "\\\\")
                post_title = post_title.replace("\'", "\\'")

                post_content = post_content.replace("\n", " ")
                post_content = post_content.replace("\\", "\\\\")
                post_content = post_content.replace("\'", "\\'")

                if post_timestamp < course_metadata_map["end_time"]:

                    array = [
                        post_id, course_learner_id, post_type, post_title,
                        post_content, post_timestamp, post_parent_id,
                        post_thread_id
                    ]
                    sql = "replace into forum_interaction(post_id, course_learner_id, post_type, post_title, post_content, post_timestamp, post_parent_id, post_thread_id) values (%s,%s,%s,%s,%s,%s,%s,%s);"
                    cursor.execute(sql, array)

            forum_file.close()
示例#8
0
def forum_interaction(metadata_path, cursor):

    forum_interaction_records = []

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    # Processing forum data
    files = os.listdir(metadata_path)
    for file in files:
        if ".mongo" in file:
            forum_file = open(metadata_path + file, "r")
            for line in forum_file:
                jsonObject = json.loads(line)

                post_id = jsonObject["_id"]["$oid"]
                course_learner_id = jsonObject["course_id"] + "_" + jsonObject[
                    "author_id"]

                post_type = jsonObject["_type"]
                if post_type == "CommentThread":
                    post_type += "_" + jsonObject["thread_type"]
                if "parent_id" in jsonObject and jsonObject["parent_id"] != "":
                    post_type = "Comment_Reply"

                post_title = ""
                if "title" in jsonObject:
                    post_title = jsonObject["title"]

                post_content = jsonObject["body"]

                post_timestamp = jsonObject["created_at"]["$date"]
                if type(post_timestamp) == type(100):
                    post_timestamp = strftime("%Y-%m-%d %H:%M:%S",
                                              gmtime(post_timestamp / 1000))
                    post_timestamp = datetime.datetime.strptime(
                        post_timestamp, "%Y-%m-%d %H:%M:%S")
                if isinstance(post_timestamp, unicode):
                    post_timestamp = post_timestamp[0:19]
                    post_timestamp = post_timestamp.replace("T", " ")
                    post_timestamp = datetime.datetime.strptime(
                        post_timestamp, "%Y-%m-%d %H:%M:%S")

                post_parent_id = ""
                if "parent_id" in jsonObject:
                    post_parent_id = jsonObject["parent_id"]["$oid"]

                post_thread_id = ""
                if "comment_thread_id" in jsonObject:
                    post_thread_id = jsonObject["comment_thread_id"]["$oid"]

                post_title = post_title.replace("\n", " ")
                post_title = post_title.replace("\\", "\\\\")
                post_title = post_title.replace("\'", "\\'")

                post_content = post_content.replace("\n", " ")
                post_content = post_content.replace("\\", "\\\\")
                post_content = post_content.replace("\'", "\\'")

                if post_timestamp < course_metadata_map["end_time"]:

                    array = [
                        post_id, course_learner_id, post_type, post_title,
                        post_content, post_timestamp, post_parent_id,
                        post_thread_id
                    ]
                    forum_interaction_records.append(array)

            forum_file.close()

    # Database version
    for array in forum_interaction_records:
        post_id = array[0]
        course_learner_id = array[1]
        post_type = array[2]
        post_title = cleanUnicode(array[3])
        post_content = cleanUnicode(array[4])
        post_timestamp = array[5]
        post_parent_id = array[6]
        post_thread_id = array[7]
        sql = "insert into forum_interaction(post_id, course_learner_id, post_type, post_title, post_content, post_timestamp, post_parent_id, post_thread_id) values (%s,%s,%s,%s,%s,%s,%s,%s);"
        data = (post_id, course_learner_id, post_type, post_title,
                post_content, post_timestamp, post_parent_id, post_thread_id)

        cursor.execute(sql, data)

    # File version
    '''
示例#9
0
def forum_sessions(metadata_path, log_path, cursor):

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    start_date = course_metadata_map["start_date"]
    end_date = course_metadata_map["end_date"]

    current_date = start_date
    end_next_date = getNextDay(end_date)

    forum_event_types = []
    forum_event_types.append("edx.forum.comment.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.response.voted")
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.thread.voted")
    forum_event_types.append("edx.forum.searched")

    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}

    forum_sessions_record = []

    log_files = os.listdir(log_path)

    while True:

        if current_date == end_next_date:
            break

        for log_file in log_files:

            if current_date in log_file:

                print log_file
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()

                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)

                log_file = open(log_path + log_file, "r")
                lines = log_file.readlines()

                for line in lines:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    if jsonObject["context"]["user_id"] == "":
                        continue

                    # For forum session separation
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if "/discussion/" in event_type or event_type in forum_event_types:
                        if event_type != "edx.forum.searched":
                            event_type = "forum_activity"

                    if global_learner_id != "":

                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(
                            global_learner_id)

                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(
                            event_time, "%Y-%m-%d %H:%M:%S")

                        # added for relevant elements
                        event_page = ""
                        if jsonObject.has_key("page"):
                            event_page = str(jsonObject["page"])

                        event_path = ""
                        if jsonObject.has_key("path"):
                            event_path = str(jsonObject["path"])

                        event_referer = ""
                        if jsonObject.has_key("referer"):
                            event_referer = str(jsonObject["referer"])

                        if course_learner_id in course_learner_id_set:
                            learner_all_event_logs[course_learner_id].append({
                                "event_time":
                                event_time,
                                "event_type":
                                event_type,
                                "page":
                                event_page,
                                "path":
                                event_path,
                                "referer":
                                event_referer
                            })
                        else:
                            learner_all_event_logs[course_learner_id] = [{
                                "event_time":
                                event_time,
                                "event_type":
                                event_type,
                                "page":
                                event_page,
                                "path":
                                event_path,
                                "referer":
                                event_referer
                            }]
                            course_learner_id_set.add(course_learner_id)

                # For forum session separation
                for learner in learner_all_event_logs.keys():

                    course_learner_id = learner
                    event_logs = learner_all_event_logs[learner]
                    course_id = course_learner_id.split("_")[0]

                    # Sorting
                    event_logs.sort(cmp=cmp_datetime,
                                    key=operator.itemgetter('event_time'))

                    session_id = ""
                    start_time = ""
                    end_time = ""
                    times_search = 0

                    final_time = ""

                    # represent the elements which just before the session.
                    session_rel_element_pre = ""
                    # represent the elements which is mentioned in the session.
                    session_rel_element_cur = ""

                    for i in range(len(event_logs)):

                        rel_element_cur = courseElementsFinder(
                            event_logs[i], course_id)

                        if session_id == "":

                            if event_logs[i]["event_type"] in [
                                    "forum_activity", "edx.forum.searched"
                            ]:
                                # Initialization
                                session_id = "forum_session_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                                if event_logs[i][
                                        "event_type"] == "edx.forum.searched":
                                    times_search += 1
                                # Added for relevant element id
                                session_rel_element_cur = rel_element_cur
                        else:

                            if event_logs[i]["event_type"] in [
                                    "forum_activity", "edx.forum.searched"
                            ]:

                                if event_logs[i][
                                        "event_time"] > end_time + datetime.timedelta(
                                            hours=0.5):

                                    session_id = session_id + "_" + str(
                                        start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time
                                                ).days * 24 * 60 * 60 + (
                                                    end_time -
                                                    start_time).seconds

                                    if duration > 5:
                                        rel_element_id = ""
                                        if session_rel_element_cur != "":
                                            rel_element_id = session_rel_element_cur
                                        else:
                                            rel_element_id = session_rel_element_pre
                                        array = [
                                            session_id, course_learner_id,
                                            times_search, start_time, end_time,
                                            duration, rel_element_id
                                        ]
                                        forum_sessions_record.append(array)

                                    final_time = event_logs[i]["event_time"]

                                    # Re-initialization
                                    session_id = "forum_session_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i][
                                            "event_type"] == "edx.forum.searched":
                                        times_search = 1
                                    # Added for relevant element id
                                    session_rel_element_cur = rel_element_cur

                                else:

                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i][
                                            "event_type"] == "edx.forum.searched":
                                        times_search += 1
                                    if session_rel_element_cur == "":
                                        session_rel_element_cur = rel_element_cur

                            else:

                                if event_logs[i][
                                        "event_time"] <= end_time + datetime.timedelta(
                                            hours=0.5):
                                    end_time = event_logs[i]["event_time"]

                                session_id = session_id + "_" + str(
                                    start_time) + "_" + str(end_time)
                                duration = (end_time -
                                            start_time).days * 24 * 60 * 60 + (
                                                end_time - start_time).seconds

                                if duration > 5:
                                    rel_element_id = ""
                                    if session_rel_element_cur != "":
                                        rel_element_id = session_rel_element_cur
                                    else:
                                        rel_element_id = session_rel_element_pre
                                    array = [
                                        session_id, course_learner_id,
                                        times_search, start_time, end_time,
                                        duration, rel_element_id
                                    ]
                                    forum_sessions_record.append(array)

                                final_time = event_logs[i]["event_time"]

                                # Re-initialization
                                session_id = ""
                                start_time = ""
                                end_time = ""
                                times_search = 0

                        # session_rel_element_pre is used for recording the element id
                        # of the most recent event logs before the session logs
                        if rel_element_cur != "":
                            session_rel_element_pre = rel_element_cur

                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)

                        updated_learner_all_event_logs[
                            course_learner_id] = new_logs

                log_file.close()

        current_date = getNextDay(current_date)

    # Database version
    for array in forum_sessions_record:
        session_id = array[0]
        course_learner_id = array[1]
        times_search = process_null(array[2])
        start_time = array[3]
        end_time = array[4]
        duration = process_null(array[5])
        rel_element_id = array[6]
        sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, times_search, start_time,
                end_time, duration, rel_element_id)
        cursor.execute(sql, data)

    # File version
    '''
示例#10
0
def sessions(metadata_path, log_path, cursor):

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    session_record = []

    log_files = os.listdir(log_path)

    while True:

        if current_date == end_next_date:
            break

        for file in log_files:

            if current_date in file:

                print file

                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()

                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)

                input_file = open(log_path + file, "r")
                lines = input_file.readlines()

                for line in lines:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if global_learner_id != "":
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(
                            global_learner_id)

                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(
                            event_time, "%Y-%m-%d %H:%M:%S")

                        if course_learner_id in course_learner_id_set:
                            learner_all_event_logs[course_learner_id].append({
                                "event_time":
                                event_time,
                                "event_type":
                                event_type
                            })
                        else:
                            learner_all_event_logs[course_learner_id] = [{
                                "event_time":
                                event_time,
                                "event_type":
                                event_type
                            }]
                            course_learner_id_set.add(course_learner_id)

                for course_learner_id in learner_all_event_logs.keys():

                    event_logs = learner_all_event_logs[course_learner_id]

                    # Sorting
                    event_logs.sort(cmp=cmp_datetime,
                                    key=operator.itemgetter('event_time'))

                    session_id = ""
                    start_time = ""
                    end_time = ""

                    final_time = ""

                    for i in range(len(event_logs)):

                        if start_time == "":

                            # Initialization
                            start_time = event_logs[i]["event_time"]
                            end_time = event_logs[i]["event_time"]

                        else:

                            if event_logs[i][
                                    "event_time"] > end_time + datetime.timedelta(
                                        hours=0.5):

                                session_id = course_learner_id + "_" + str(
                                    start_time) + "_" + str(end_time)
                                duration = (end_time -
                                            start_time).days * 24 * 60 * 60 + (
                                                end_time - start_time).seconds

                                if duration > 5:
                                    array = [
                                        session_id, course_learner_id,
                                        start_time, end_time, duration
                                    ]
                                    session_record.append(array)

                                final_time = event_logs[i]["event_time"]

                                # Re-initialization
                                session_id = ""
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]

                            else:

                                if event_logs[i]["event_type"] == "page_close":

                                    end_time = event_logs[i]["event_time"]

                                    session_id = course_learner_id + "_" + str(
                                        start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time
                                                ).days * 24 * 60 * 60 + (
                                                    end_time -
                                                    start_time).seconds

                                    if duration > 5:
                                        array = [
                                            session_id, course_learner_id,
                                            start_time, end_time, duration
                                        ]
                                        session_record.append(array)

                                    # Re-initialization
                                    session_id = ""
                                    start_time = ""
                                    end_time = ""

                                    final_time = event_logs[i]["event_time"]

                                else:

                                    end_time = event_logs[i]["event_time"]

                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)

                        updated_learner_all_event_logs[
                            course_learner_id] = new_logs

        current_date = getNextDay(current_date)

    # Filter duplicated records
    updated_session_record = []
    session_id_set = set()
    for array in session_record:
        session_id = array[0]
        if session_id not in session_id_set:
            session_id_set.add(session_id)
            updated_session_record.append(array)

    session_record = updated_session_record

    # Database version
    for array in session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)

    # File version
    '''
示例#11
0
def learner_mode(metadata_path, cursor):

    course_record = []
    course_element_record = []
    learner_index_record = []
    course_learner_record = []
    learner_demographic_record = []

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_record.append([
        course_metadata_map["course_id"], course_metadata_map["course_name"],
        course_metadata_map["start_time"], course_metadata_map["end_time"]
    ])

    # Course_element table
    for element_id in course_metadata_map["element_time_map"].keys():

        element_start_time = course_metadata_map["element_time_map"][
            element_id]
        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are count as 1st week.
        # In order to avoid above situation, I use date to replace datetime here.
        week = getDayDiff(course_metadata_map["start_time"].date(),
                          element_start_time.date()) / 7 + 1

        array = [
            element_id, course_metadata_map["element_type_map"][element_id],
            week, course_metadata_map["course_id"]
        ]
        course_element_record.append(array)

    files = os.listdir(metadata_path)

    # Learner_demographic table
    learner_mail_map = {}

    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}

    # Enrolled learners set
    enrolled_learner_set = set()

    course_id = ""

    # Processing student_courseenrollment data
    for file in files:
        if "student_courseenrollment" in file:
            input_file = open(metadata_path + file, "r")
            input_file.readline()
            lines = input_file.readlines()

            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                time = datetime.datetime.strptime(record[3],
                                                  "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id

                if cmp_datetime(course_metadata_map["end_time"], time):
                    enrolled_learner_set.add(global_learner_id)

                    array = [global_learner_id, course_id, course_learner_id]
                    learner_index_record.append(array)

                    course_learner_map[global_learner_id] = course_learner_id
                    learner_enrollment_time_map[global_learner_id] = time

            input_file.close()

            print "The number of enrolled learners is: " + str(
                len(enrolled_learner_set)) + "\n"

    # Processing auth_user data
    for file in files:
        if "auth_user-" in file:
            input_file = open(metadata_path + file, "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]
            input_file.close()

    # Processing certificates_generatedcertificate data
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file in files:
        if "certificates_generatedcertificate" in file:
            input_file = open(metadata_path + file, "r")
            input_file.readline()
            lines = input_file.readlines()

            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = record[3]
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]

                register_time = ""
                if course_learner_map.has_key(global_learner_id):
                    register_time = learner_enrollment_time_map[
                        global_learner_id]

                if course_learner_map.has_key(global_learner_id):
                    num_certifiedLearners += 1
                    array = [
                        course_learner_map[global_learner_id], final_grade,
                        enrollment_mode, certificate_status, register_time
                    ]
                    course_learner_record.append(array)
                else:
                    num_uncertifiedLearners += 1

            input_file.close()

            print "The number of uncertified & certified learners is: " + str(
                num_uncertifiedLearners) + "\t" + str(
                    num_certifiedLearners) + "\n"

    # Processing auth_userprofile data
    for file in files:
        if "auth_userprofile" in file:
            input_file = open(metadata_path + file, "r")
            input_file.readline()
            lines = input_file.readlines()

            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                year_of_birth = record[9]
                level_of_education = record[10]
                country = record[13]

                course_learner_id = course_id + "_" + global_learner_id

                if global_learner_id in enrolled_learner_set:
                    array = [
                        course_learner_id, gender, year_of_birth,
                        level_of_education, country,
                        learner_mail_map[global_learner_id]
                    ]
                    learner_demographic_record.append(array)

            input_file.close()

    # Database version
    # Course table
    for array in course_record:
        course_id = course_metadata_map["course_id"]
        course_name = course_metadata_map["course_name"]
        start_time = course_metadata_map["start_time"]
        end_time = course_metadata_map["end_time"]
        sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
        data = (course_id, course_name, start_time, end_time)
        cursor.execute(sql, data)

    for array in course_element_record:
        element_id = array[0]
        element_type = array[1]
        week = process_null(array[2])
        course_id = array[3]
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        data = (element_id, element_type, week, course_id)
        cursor.execute(sql, data)

    # Learner_index table
    for array in learner_index_record:
        global_learner_id = array[0]
        course_id = array[1]
        course_learner_id = array[2]
        sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
        data = (global_learner_id, course_id, course_learner_id)
        cursor.execute(sql, data)

    # Course_learner table
    for array in course_learner_record:
        course_learner_id = array[0]
        final_grade = process_null(array[1])
        enrollment_mode = array[2]
        certificate_status = array[3]
        register_time = process_null(array[4])
        sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
        data = (course_learner_id, final_grade, enrollment_mode,
                certificate_status, register_time)
        cursor.execute(sql, data)

    # Learner_demographic table
    for array in learner_demographic_record:
        course_learner_id = process_null(array[0])
        gender = array[1]
        year_of_birth = process_null(process_null(array[2]))
        level_of_education = array[3]
        country = array[4]
        email = array[5]
        email = email.replace("\'", "")
        sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
        data = (course_learner_id, gender, year_of_birth, level_of_education,
                country, email)
        cursor.execute(sql, data)

    # File version
    '''
示例#12
0
def quiz_sessions(metadata_path, log_path, cursor):

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    # Processing events data
    submission_event_collection = []

    # Problem check
    submission_event_collection.append("problem_check")  # Server
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")

    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded success- fully.
    submission_event_collection.append("problem_graded")

    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")

    submission_event_collection.append("problem_reset")  # event_source: serve
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")

    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")  # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")

    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    log_files = os.listdir(log_path)

    child_parent_map = course_metadata_map["child_parent_map"]

    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    quiz_sessions = {}

    while True:

        if current_date == end_next_date:
            break

        for file in log_files:
            if current_date in file:

                print file

                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()

                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)

                input_file = open(log_path + file, "r")
                lines = input_file.readlines()

                for line in lines:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if global_learner_id != "":

                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(
                            global_learner_id)

                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(
                            event_time, "%Y-%m-%d %H:%M:%S")

                        if learner_all_event_logs.has_key(course_learner_id):
                            learner_all_event_logs[course_learner_id].append({
                                "event_time":
                                event_time,
                                "event_type":
                                event_type
                            })
                        else:
                            learner_all_event_logs[course_learner_id] = [{
                                "event_time":
                                event_time,
                                "event_type":
                                event_type
                            }]

                # For quiz session separation
                for course_learner_id in learner_all_event_logs.keys():

                    event_logs = learner_all_event_logs[course_learner_id]

                    # Sorting
                    event_logs.sort(cmp=cmp_datetime,
                                    key=operator.itemgetter('event_time'))

                    session_id = ""
                    start_time = ""
                    end_time = ""

                    final_time = ""

                    for i in range(len(event_logs)):

                        if session_id == "":

                            if "problem+block" in event_logs[i][
                                    "event_type"] or "_problem;_" in event_logs[
                                        i]["event_type"] or event_logs[i][
                                            "event_type"] in submission_event_collection:

                                event_type_array = event_logs[i][
                                    "event_type"].split("/")

                                if "problem+block" in event_logs[i][
                                        "event_type"]:
                                    question_id = event_type_array[4]

                                if "_problem;_" in event_logs[i]["event_type"]:
                                    question_id = event_type_array[6].replace(
                                        ";_", "/")

                                if question_id in child_parent_map.keys():

                                    parent_block_id = child_parent_map[
                                        question_id]

                                    session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]

                        else:

                            if "problem+block" in event_logs[i][
                                    "event_type"] or "_problem;_" in event_logs[
                                        i]["event_type"] or event_logs[i][
                                            "event_type"] in submission_event_collection:

                                if event_logs[i][
                                        "event_time"] > end_time + datetime.timedelta(
                                            hours=0.5):

                                    if quiz_sessions.has_key(session_id):
                                        quiz_sessions[session_id][
                                            "time_array"].append({
                                                "start_time":
                                                start_time,
                                                "end_time":
                                                end_time
                                            })
                                    else:
                                        quiz_sessions[session_id] = {
                                            "course_learner_id":
                                            course_learner_id,
                                            "time_array": [{
                                                "start_time": start_time,
                                                "end_time": end_time
                                            }]
                                        }

                                    final_time = event_logs[i]["event_time"]

                                    if "problem+block" in event_logs[i][
                                            "event_type"] or "_problem;_" in event_logs[
                                                i]["event_type"] or event_logs[
                                                    i]["event_type"] in submission_event_collection:
                                        event_type_array = event_logs[i][
                                            "event_type"].split("/")

                                        if "problem+block" in event_logs[i][
                                                "event_type"]:
                                            question_id = event_type_array[4]

                                        if "_problem;_" in event_logs[i][
                                                "event_type"]:
                                            question_id = event_type_array[
                                                6].replace(";_", "/")

                                        if question_id in child_parent_map.keys(
                                        ):
                                            parent_block_id = child_parent_map[
                                                question_id]
                                            session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                            start_time = event_logs[i][
                                                "event_time"]
                                            end_time = event_logs[i][
                                                "event_time"]
                                        else:
                                            session_id = ""
                                            start_time = ""
                                            end_time = ""
                                else:
                                    end_time = event_logs[i]["event_time"]

                            else:

                                if event_logs[i][
                                        "event_time"] <= end_time + datetime.timedelta(
                                            hours=0.5):
                                    end_time = event_logs[i]["event_time"]

                                if quiz_sessions.has_key(session_id):
                                    quiz_sessions[session_id][
                                        "time_array"].append({
                                            "start_time":
                                            start_time,
                                            "end_time":
                                            end_time
                                        })
                                else:
                                    quiz_sessions[session_id] = {
                                        "course_learner_id":
                                        course_learner_id,
                                        "time_array": [{
                                            "start_time": start_time,
                                            "end_time": end_time
                                        }]
                                    }

                                final_time = event_logs[i]["event_time"]

                                session_id = ""
                                start_time = ""
                                end_time = ""

                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)

                        updated_learner_all_event_logs[
                            course_learner_id] = new_logs

        current_date = getNextDay(current_date)

    # To compress the session event_logs
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:

            start_time = ""
            end_time = ""
            updated_time_array = []

            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i][
                        "start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i][
                        "end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i][
                            "start_time"] > end_time + datetime.timedelta(
                                hours=0.5):
                        updated_time_array.append({
                            "start_time": start_time,
                            "end_time": end_time
                        })
                        start_time = quiz_sessions[session_id]["time_array"][
                            i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]
                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
                    else:
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]

                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })

            quiz_sessions[session_id]["time_array"] = updated_time_array

    quiz_session_record = []

    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):

            start_time = quiz_sessions[session_id]["time_array"][i][
                "start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                duration = (end_time - start_time).days * 24 * 60 * 60 + (
                    end_time - start_time).seconds
                final_session_id = session_id + "_" + str(
                    start_time) + "_" + str(end_time)

                if duration > 5:
                    array = [
                        final_session_id, course_learner_id, start_time,
                        end_time, duration
                    ]
                    quiz_session_record.append(array)

    # Database version
    for array in quiz_session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)
    ''' 
示例#13
0
def quiz_mode(metadata_path, log_path, cursor):

    # quiz_question_record = []
    # submissions = {}
    # assessments = {}

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]

    for question_id in quiz_question_map:

        question_due = ""

        question_weight = quiz_question_map[question_id]

        quiz_question_parent = course_metadata_map["child_parent_map"][
            question_id]

        if (question_due == "") and (quiz_question_parent
                                     in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]

        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][
                quiz_question_parent]
            if (question_due == "") and (quiz_question_parent
                                         in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]

        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        # array_quiz = [question_id, quiz_question_type, question_weight, question_due]
        # quiz_question_record.append(array_quiz)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        data = (question_id, quiz_question_type, question_weight, question_due)
        cursor.execute(sql, data)

    # Processing events data
    submission_event_collection = []

    # Problem check
    submission_event_collection.append("problem_check")  # Server
    '''
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    
    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded success- fully.
    submission_event_collection.append("problem_graded")
    
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    
    submission_event_collection.append("problem_reset") # event_source: serve
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save") # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    '''

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    log_files = os.listdir(log_path)

    submission_uni_index = 0
    while True:

        if current_date == end_next_date:
            break

        for file in log_files:
            if current_date in file:

                print file

                input_file = open(log_path + file, "r")
                lines = input_file.readlines()

                for line in lines:

                    jsonObject = json.loads(line)

                    if jsonObject["event_type"] in submission_event_collection:

                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue

                        global_learner_id = jsonObject["context"]["user_id"]

                        if global_learner_id != "":

                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(
                                global_learner_id)

                            question_id = ""

                            grade = ""
                            max_grade = ""

                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(
                                event_time, "%Y-%m-%d %H:%M:%S")

                            if isinstance(jsonObject["event"], dict):
                                question_id = jsonObject["event"]["problem_id"]

                                # The fields "grade" and "max_grade" are specific to submission event "problem_check"
                                if jsonObject["event"].has_key(
                                        "grade"
                                ) and jsonObject["event"].has_key("max_grade"):
                                    grade = jsonObject["event"]["grade"]
                                    max_grade = jsonObject["event"][
                                        "max_grade"]

                            if question_id != "":

                                submission_id = course_learner_id + "_" + question_id + "_" + str(
                                    submission_uni_index)
                                submission_uni_index = submission_uni_index + 1

                                # For submissions
                                # array_submission = [submission_id, course_learner_id, question_id, event_time]
                                # submissions[submission_id] = array_submission
                                submission_timestamp = event_time
                                sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values (%s,%s,%s,%s)"
                                data = (submission_id, course_learner_id,
                                        question_id, submission_timestamp)
                                cursor.execute(sql, data)

                                # For assessments
                                if grade != "" and max_grade != "":
                                    # array_assessment = [submission_id, course_learner_id, max_grade, grade]
                                    # assessments[submission_id] = array_assessment
                                    assessment_id = submission_id
                                    sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values (%s,%s,%s,%s)"
                                    data = (assessment_id, course_learner_id,
                                            max_grade, grade)
                                    cursor.execute(sql, data)

        current_date = getNextDay(current_date)

    # submission_record = []
    # assessment_record = []

    # for submission_id in submissions.keys():
    #     submission_record.append(submissions[submission_id])

    # for assessment_id in assessments.keys():
    #     assessment_record.append(assessments[assessment_id])

    # Database version
    # Quiz_question table
    # for array in quiz_question_record:
    #     question_id = array[0]
    #     question_type = array[1]
    #     question_weight = array[2]
    #     question_due = array[3]
    #     sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values"
    #     sql += "('%s','%s','%s','%s');" % (question_id, question_type, question_weight, question_due)
    #     cursor.execute(sql)

    # Submissions table
    # for array in submission_record:
    #     submission_id = array[0]
    #     course_learner_id = array[1]
    #     question_id = array[2]
    #     submission_timestamp = array[3]
    #     sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values"
    #     sql += "('%s','%s','%s','%s');" % (submission_id, course_learner_id, question_id, submission_timestamp)
    #     cursor.execute(sql)

    # Submissions table
    # for array in assessment_record:
    #     assessment_id = array[0]
    #     course_learner_id = array[1]
    #     max_grade = array[2]
    #     grade = array[3]
    #     sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values"
    #     sql += "('%s','%s','%s','%s');" % (assessment_id, course_learner_id, max_grade, grade)
    #     cursor.execute(sql)
    ''' 
示例#14
0
def video_interaction(metadata_path, log_path, cursor):
    
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    
    current_date = course_metadata_map["start_date"]   
    end_next_date = getNextDay(course_metadata_map["end_date"])
    
    video_interaction_map = {}
    
    # Video-related event types
    video_event_types = []

    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    
    video_event_types.append("speed_change_video")
    
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")
    
    '''
    # Navigation-related event types
    navigation_event_types = []
    navigation_event_types.append("page_close")
    navigation_event_types.append("seq_goto")
    navigation_event_types.append("seq_next")
    navigation_event_types.append("seq_prev")
    '''
    
    learner_video_event_logs = {}
    updated_learner_video_event_logs = {}
    
    log_files = os.listdir(log_path)
    
    while True:
        
        if current_date == end_next_date:
            break;
        
        for file in log_files:           
            if current_date in file:
                
                print file

                learner_video_event_logs.clear()
                learner_video_event_logs = updated_learner_video_event_logs.copy()
                updated_learner_video_event_logs.clear()
                
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_video_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                
                input_file = open(log_path + file,"r")
                lines = input_file.readlines()
                        
                for line in lines:
                    
                    jsonObject = json.loads(line)
                    
                    if jsonObject["event_type"] in video_event_types:
                        
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        
                        global_learner_id = jsonObject["context"]["user_id"]
                        
                        if global_learner_id != "":
                            
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            
                            video_id = ""
                        
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                        
                            event_type = jsonObject["event_type"]
                        
                            # For seek event
                            new_time = 0
                            old_time = 0
                        
                            # For speed change event
                            new_speed = 0
                            old_speed = 0
                        
                            # This sub-condition does not exist in log data
                            # if isinstance(jsonObject["event"], dict):
                            #     video_id = jsonObject["event"]["id"]
                        
                            if isinstance(jsonObject["event"], unicode):
                                event_jsonObject = json.loads(jsonObject["event"])
                                video_id = event_jsonObject["id"]
                                
                                video_id = video_id.replace("-", "://", 1)
                                video_id = video_id.replace("-", "/")
                            
                                # For video seek event
                                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                                    new_time = event_jsonObject["new_time"]
                                    old_time = event_jsonObject["old_time"]                                                                      
                                                                                
                                # For video speed change event           
                                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                                    new_speed = event_jsonObject["new_speed"]
                                    old_speed = event_jsonObject["old_speed"]
                        
                            # To record video seek event                
                            if event_type in ["seek_video","edx.video.position.changed"]:
                                if new_time is not None and old_time is not None:
                                    if course_learner_id in course_learner_id_set:
                                        learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time})
                                    else:
                                        learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time}]
                                        course_learner_id_set.add(course_learner_id)
                                continue
                        
                            # To record video speed change event                
                            if event_type in ["speed_change_video"]:
                                if course_learner_id in course_learner_id_set:
                                    learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed})
                                else:
                                    learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed}]
                                    course_learner_id_set.add(course_learner_id)
                                continue                                                                      
                         
                            if course_learner_id in course_learner_id_set:
                                learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id})
                            else:
                                learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id}]
                                course_learner_id_set.add(course_learner_id)
                    
                    # For non-video-related events                                    
                    if jsonObject["event_type"] not in video_event_types:
                        
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        
                        global_learner_id = jsonObject["context"]["user_id"]
                        
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)                                  
                        
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                        
                            event_type = jsonObject["event_type"]                  
                                                      
                            if course_learner_id in course_learner_id_set:
                                learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type})
                            else:
                                learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type}]
                                course_learner_id_set.add(course_learner_id)
                                  
                for course_learner_id in learner_video_event_logs.keys():
                    
                    video_id = ""
                    
                    event_logs = learner_video_event_logs[course_learner_id]
                    
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    
                    video_start_time = ""
                    final_time = ""
                    
                    # For video seek event
                    times_forward_seek = 0
                    duration_forward_seek = 0
                    times_backward_seek = 0
                    duration_backward_seek = 0
                    
                    # For video speed change event
                    speed_change_last_time = ""
                    times_speed_up = 0
                    times_speed_down = 0               
                    
                    # For video pause event                   
                    pause_check = False
                    pause_start_time = ""
                    duration_pause = 0                    
                                      
                    for log in event_logs:
                        
                        if log["event_type"] in ["play_video", "edx.video.played"]:
                            
                            video_start_time = log["event_time"]
                            video_id = log["video_id"]

                            if pause_check:
                                
                                duration_pause = (log["event_time"] - pause_start_time).seconds
                                video_interaction_id = course_learner_id + "_" + video_id + "_" + str(pause_start_time)
                                
                                if duration_pause > 2 and duration_pause < 600:
                                    if video_interaction_id in video_interaction_map.keys():
                                        video_interaction_map[video_interaction_id]["times_pause"] = 1                                        
                                        video_interaction_map[video_interaction_id]["duration_pause"] = duration_pause
                                
                                pause_check = False
                                                        
                            continue 
                        
                        if video_start_time != "":                                                    
                           
                            if log["event_time"] > video_start_time + datetime.timedelta(hours=0.5):
                                
                                video_start_time = ""
                                video_id = ""
                                final_time = log["event_time"]
                                
                            else:                               
                                
                                # 0. Seek
                                if log["event_type"] in ["seek_video", "edx.video.position.changed"] and video_id == log["video_id"]:                                                                       
                                    # Forward seek event
                                    if log["new_time"] > log["old_time"]:
                                        times_forward_seek += 1
                                        duration_forward_seek += log["new_time"] - log["old_time"]
                                    # Backward seek event                                    
                                    if log["new_time"] < log["old_time"]:
                                        times_backward_seek += 1
                                        duration_backward_seek += log["old_time"] - log["new_time"]
                                    continue
                                
                                # 1. Speed change
                                if log["event_type"] == "speed_change_video" and video_id == log["video_id"]:
                                    if speed_change_last_time == "":
                                        speed_change_last_time = log["event_time"]
                                        old_speed = log["old_speed"]
                                        new_speed = log["new_speed"]                                        
                                        if old_speed < new_speed:
                                            times_speed_up += 1
                                        if old_speed > new_speed:
                                            times_speed_down += 1
                                    else:
                                        if (log["event_time"] - speed_change_last_time).seconds > 10:
                                            old_speed = log["old_speed"]
                                            new_speed = log["new_speed"]                                        
                                            if old_speed < new_speed:
                                                times_speed_up += 1
                                            if old_speed > new_speed:
                                                times_speed_down += 1
                                        speed_change_last_time = log["event_time"]
                                    continue
                                
                                # 2. Pause/Stop situation
                                if log["event_type"] in ["pause_video", "edx.video.paused", "stop_video", "edx.video.stopped"] and video_id == log["video_id"]:                                    
                                    
                                    watch_duration = (log["event_time"] - video_start_time).seconds
                                    
                                    video_end_time = log["event_time"]
                                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time)
                                 
                                    if watch_duration > 5:                                        
                                        video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration,
                                                                        "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, 
                                                                        "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek,
                                                                        "times_speed_up":times_speed_up, "times_speed_down":times_speed_down,
                                                                        "start_time":video_start_time, "end_time":video_end_time}

                                    if log["event_type"] in ["pause_video", "edx.video.paused"]:
                                        pause_check = True
                                        pause_start_time = video_end_time
                                    
                                    # For video seek event
                                    times_forward_seek = 0
                                    duration_forward_seek = 0
                                    times_backward_seek = 0
                                    duration_backward_seek = 0
                                    
                                    # For video speed change event
                                    speed_change_last_time = ""
                                    times_speed_up = 0
                                    times_speed_down = 0
                                    
                                    # For video general information                                  
                                    video_start_time =""
                                    video_id = ""
                                    final_time = log["event_time"]
                                    
                                    continue
                                    
                                # 3/4  Page changed/Session closed
                                if log["event_type"] not in video_event_types:
                                    
                                    video_end_time = log["event_time"]
                                    watch_duration = (video_end_time - video_start_time).seconds                
                                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time)
                                
                                    if watch_duration > 5:                                        
                                        video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration,
                                                                        "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, 
                                                                        "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek,
                                                                        "times_speed_up":times_speed_up, "times_speed_down":times_speed_down,
                                                                        "start_time":video_start_time, "end_time":video_end_time}
                                    
                                    # For video seek event
                                    times_forward_seek = 0
                                    duration_forward_seek = 0
                                    times_backward_seek = 0
                                    duration_backward_seek = 0
                                    
                                    # For video speed change event
                                    speed_change_last_time = ""
                                    times_speed_up = 0
                                    times_speed_down = 0
                                    
                                    # For video general information
                                    video_start_time = ""                                    
                                    video_id = ""
                                    final_time = log["event_time"]
                                    
                                    continue
                        
                    if final_time != "":
                        new_logs = []                
                        for log in event_logs:                 
                            if log["event_time"] > final_time:
                                new_logs.append(log)
                                
                        updated_learner_video_event_logs[course_learner_id] = new_logs                
                     
        current_date = getNextDay(current_date)
        
    video_interaction_record = []
    
    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id]["course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = video_interaction_map[interaction_id]["watch_duration"]
        times_forward_seek = video_interaction_map[interaction_id]["times_forward_seek"]
        duration_forward_seek = video_interaction_map[interaction_id]["duration_forward_seek"]
        times_backward_seek = video_interaction_map[interaction_id]["times_backward_seek"]
        duration_backward_seek = video_interaction_map[interaction_id]["duration_backward_seek"]
        times_speed_up = video_interaction_map[interaction_id]["times_speed_up"]
        times_speed_down = video_interaction_map[interaction_id]["times_speed_down"]
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]
        
        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = video_interaction_map[interaction_id]["times_pause"]
            duration_pause = video_interaction_map[interaction_id]["duration_pause"]
        else:
            times_pause = 0
            duration_pause = 0
            
        array = [video_interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time]
        video_interaction_record.append(array)
    
    # Video_interaction table
    # Database version
    for array in video_interaction_record:
        interaction_id = array[0]
        course_learner_id = array[1]
        video_id = array[2]
        duration = process_null(array[3])
        times_forward_seek = process_null(array[4])
        duration_forward_seek = process_null(array[5])
        times_backward_seek = process_null(array[6])
        duration_backward_seek = process_null(array[7])
        times_speed_up = process_null(array[8])
        times_speed_down = process_null(array[9])
        times_pause = process_null(array[10])
        duration_pause = process_null(array[11])
        start_time = array[12]
        end_time = array[13]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        data = (interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time)
        cursor.execute(sql, data)
        
    # File version
    '''
示例#15
0
def video_interaction(metadata_path, daily_log_path,
                      remaining_video_interaction_log_path, cursor):

    utc = pytz.UTC

    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]

    video_interaction_map = {}

    # Video-related event types
    video_event_types = []
    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    video_event_types.append("speed_change_video")
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")

    learner_logs = {}
    remaining_learner_logs = {}

    # Read remaining event logs
    if os.path.exists(remaining_video_interaction_log_path):
        remaining_input_file = open(remaining_video_interaction_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)

    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)

    input_file = open(daily_log_path, "r")
    for line in input_file:

        jsonObject = json.loads(line)

        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue

        global_learner_id = jsonObject["context"]["user_id"]
        event_type = jsonObject["event_type"]

        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)

        event_time = jsonObject["time"]

        # Check whether the event record belongs to that day
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue

        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)

        # For video-related events
        if event_type in video_event_types:

            video_id = ""

            # For seek event
            new_time = 0
            old_time = 0

            # For speed change event
            new_speed = 0
            old_speed = 0

            if isinstance(jsonObject["event"], unicode):
                event_jsonObject = json.loads(jsonObject["event"])
                video_id = event_jsonObject["id"]

                video_id = video_id.replace("-", "://", 1)
                video_id = video_id.replace("-", "/")

                # For video seek event
                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                    new_time = event_jsonObject["new_time"]
                    old_time = event_jsonObject["old_time"]

                # For video speed change event
                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                    new_speed = event_jsonObject["new_speed"]
                    old_speed = event_jsonObject["old_speed"]

            # To record video seek event
            if event_type in ["seek_video", "edx.video.position.changed"]:
                if new_time is not None and old_time is not None:
                    if course_learner_id in course_learner_id_set:
                        learner_logs[course_learner_id].append({
                            "event_time":
                            event_time,
                            "event_type":
                            event_type,
                            "video_id":
                            video_id,
                            "new_time":
                            new_time,
                            "old_time":
                            old_time
                        })
                    else:
                        learner_logs[course_learner_id] = [{
                            "event_time": event_time,
                            "event_type": event_type,
                            "video_id": video_id,
                            "new_time": new_time,
                            "old_time": old_time
                        }]
                        course_learner_id_set.add(course_learner_id)
                continue

            # To record video speed change event
            if event_type in ["speed_change_video"]:
                if course_learner_id in course_learner_id_set:
                    learner_logs[course_learner_id].append({
                        "event_time":
                        event_time,
                        "event_type":
                        event_type,
                        "video_id":
                        video_id,
                        "new_speed":
                        new_speed,
                        "old_speed":
                        old_speed
                    })
                else:
                    learner_logs[course_learner_id] = [{
                        "event_time": event_time,
                        "event_type": event_type,
                        "video_id": video_id,
                        "new_speed": new_speed,
                        "old_speed": old_speed
                    }]
                    course_learner_id_set.add(course_learner_id)
                continue

            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({
                    "event_time": event_time,
                    "event_type": event_type,
                    "video_id": video_id
                })
            else:
                learner_logs[course_learner_id] = [{
                    "event_time": event_time,
                    "event_type": event_type,
                    "video_id": video_id
                }]
                course_learner_id_set.add(course_learner_id)

        # For non-video-related events
        if event_type not in video_event_types:
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({
                    "event_time": event_time,
                    "event_type": event_type
                })
            else:
                learner_logs[course_learner_id] = [{
                    "event_time": event_time,
                    "event_type": event_type
                }]
                course_learner_id_set.add(course_learner_id)

    input_file.close()

    # For video interaction extraction
    for learner in learner_logs.keys():

        course_learner_id = learner
        event_logs = learner_logs[learner]

        # Sorting
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))

        video_id = ""
        video_start_time = ""
        final_time = ""

        # For video seek event
        times_forward_seek = 0
        duration_forward_seek = 0
        times_backward_seek = 0
        duration_backward_seek = 0

        # For video speed change event
        speed_change_last_time = ""
        times_speed_up = 0
        times_speed_down = 0

        # For video pause event
        pause_check = False
        pause_start_time = ""
        duration_pause = 0

        for i in range(len(event_logs)):

            if event_logs[i]["event_type"] in [
                    "play_video", "edx.video.played"
            ]:
                video_start_time = event_logs[i]["event_time"]
                video_id = event_logs[i]["video_id"]

                if pause_check:
                    duration_pause = (event_logs[i]["event_time"] -
                                      pause_start_time).days * 24 * 60 * 60 + (
                                          event_logs[i]["event_time"] -
                                          pause_start_time).seconds
                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                        pause_start_time)
                    if duration_pause > 2 and duration_pause < 600:
                        if video_interaction_id in video_interaction_map.keys(
                        ):
                            video_interaction_map[video_interaction_id][
                                "times_pause"] = 1
                            video_interaction_map[video_interaction_id][
                                "duration_pause"] = duration_pause
                    pause_check = False
                continue

            if video_start_time != "":

                if event_logs[i][
                        "event_time"] > video_start_time + datetime.timedelta(
                            hours=0.5):

                    video_start_time = ""
                    video_id = ""
                    final_time = event_logs[i]["event_time"]

                else:

                    # 0. Seek
                    if event_logs[i]["event_type"] in [
                            "seek_video", "edx.video.position.changed"
                    ] and video_id == event_logs[i]["video_id"]:
                        # Forward seek event
                        if event_logs[i]["new_time"] > event_logs[i][
                                "old_time"]:
                            times_forward_seek += 1
                            duration_forward_seek += event_logs[i][
                                "new_time"] - event_logs[i]["old_time"]
                        # Backward seek event
                        if event_logs[i]["new_time"] < event_logs[i][
                                "old_time"]:
                            times_backward_seek += 1
                            duration_backward_seek += event_logs[i][
                                "old_time"] - event_logs[i]["new_time"]
                        continue

                    # 1. Speed change
                    if event_logs[i][
                            "event_type"] == "speed_change_video" and video_id == event_logs[
                                i]["video_id"]:
                        if speed_change_last_time == "":
                            speed_change_last_time = event_logs[i][
                                "event_time"]
                            old_speed = event_logs[i]["old_speed"]
                            new_speed = event_logs[i]["new_speed"]
                            if old_speed < new_speed:
                                times_speed_up += 1
                            if old_speed > new_speed:
                                times_speed_down += 1
                        else:
                            if (event_logs[i]["event_time"] -
                                    speed_change_last_time).seconds > 10:
                                old_speed = event_logs[i]["old_speed"]
                                new_speed = event_logs[i]["new_speed"]
                                if old_speed < new_speed:
                                    times_speed_up += 1
                                if old_speed > new_speed:
                                    times_speed_down += 1
                            speed_change_last_time = event_logs[i][
                                "event_time"]
                        continue

                    # 2. Pause/Stop situation
                    if event_logs[i]["event_type"] in [
                            "pause_video", "edx.video.paused", "stop_video",
                            "edx.video.stopped"
                    ] and video_id == event_logs[i]["video_id"]:

                        watch_duration = (event_logs[i]["event_time"] -
                                          video_start_time).seconds

                        video_end_time = event_logs[i]["event_time"]
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                            video_start_time) + "_" + str(video_end_time)

                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {
                                "course_learner_id": course_learner_id,
                                "video_id": video_id,
                                "type": "video",
                                "watch_duration": watch_duration,
                                "times_forward_seek": times_forward_seek,
                                "duration_forward_seek": duration_forward_seek,
                                "times_backward_seek": times_backward_seek,
                                "duration_backward_seek":
                                duration_backward_seek,
                                "times_speed_up": times_speed_up,
                                "times_speed_down": times_speed_down,
                                "start_time": video_start_time,
                                "end_time": video_end_time
                            }

                        if event_logs[i]["event_type"] in [
                                "pause_video", "edx.video.paused"
                        ]:
                            pause_check = True
                            pause_start_time = video_end_time

                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0

                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0

                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]

                        continue

                    # 3/4  Page changed/Session closed
                    if event_logs[i]["event_type"] not in video_event_types:

                        video_end_time = event_logs[i]["event_time"]
                        watch_duration = (video_end_time -
                                          video_start_time).seconds
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                            video_start_time) + "_" + str(video_end_time)

                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {
                                "course_learner_id": course_learner_id,
                                "video_id": video_id,
                                "type": "video",
                                "watch_duration": watch_duration,
                                "times_forward_seek": times_forward_seek,
                                "duration_forward_seek": duration_forward_seek,
                                "times_backward_seek": times_backward_seek,
                                "duration_backward_seek":
                                duration_backward_seek,
                                "times_speed_up": times_speed_up,
                                "times_speed_down": times_speed_down,
                                "start_time": video_start_time,
                                "end_time": video_end_time
                            }

                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0

                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0

                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]

                        continue

        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)

            remaining_learner_logs[course_learner_id] = new_logs

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_video_interaction_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_video_interaction_log_path)

    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id][
            "course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = process_null(
            video_interaction_map[interaction_id]["watch_duration"])
        times_forward_seek = process_null(
            video_interaction_map[interaction_id]["times_forward_seek"])
        duration_forward_seek = process_null(
            video_interaction_map[interaction_id]["duration_forward_seek"])
        times_backward_seek = process_null(
            video_interaction_map[interaction_id]["times_backward_seek"])
        duration_backward_seek = process_null(
            video_interaction_map[interaction_id]["duration_backward_seek"])
        times_speed_up = process_null(
            video_interaction_map[interaction_id]["times_speed_up"])
        times_speed_down = process_null(
            video_interaction_map[interaction_id]["times_speed_down"])
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]

        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = process_null(
                video_interaction_map[interaction_id]["times_pause"])
            duration_pause = process_null(
                video_interaction_map[interaction_id]["duration_pause"])
        else:
            times_pause = 0
            duration_pause = 0

        array = [
            video_interaction_id, course_learner_id, video_id, duration,
            times_forward_seek, duration_forward_seek, times_backward_seek,
            duration_backward_seek, times_speed_up, times_speed_down,
            times_pause, duration_pause, start_time, end_time
        ]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute(sql, array)
        except Exception as e:
            pass
示例#16
0
def main(argv):
    
    # Read configs
    config = ConfigParser.ConfigParser()
    config.read(argv[0])
    
    # All the configs are read as string
    course_log_path = config.get("data", "path")
    remove_filtered_logs = config.get("data", "remove_filtered_logs")
    log_update_list = json.loads(config.get("data", "log_update_list"))
    metadata_update_list = json.loads(config.get("data", "metadata_update_list"))
    survey_update_map = json.loads(config.get("data", "survey_update_map"))
    
    user = config.get("mysqld", "user")
    password = config.get("mysqld", "password")
    host = config.get("mysqld", "host")
    database = config.get("mysqld", "database")
        
    # Database
    connection = mysql.connector.connect(user=user, password=password, host=host, database=database, charset='utf8mb4')
    cursor = connection.cursor()

    # Delete relevant records before updating the database
    print "Removing log records..."
    for course_code in log_update_list:
        print str("\t" + course_code)
        RemoveCourseRecords(course_log_path, course_code, "log", cursor)
    print "Removing metadata records..."
    for course_code in metadata_update_list:
        print str("\t" + course_code)
        RemoveCourseRecords(course_log_path, course_code, "metadata", cursor)
    print "Removing survey records..."
    for course_code in survey_update_map.keys():
        print str("\t" + course_code)
        RemoveCourseRecords(course_log_path, course_code, "survey", cursor)
        
    print
    
    folders = os.listdir(course_log_path)
    for folder in folders:
        if folder != "daily_logs":
                            
            # Only for Mac OS
            if folder == ".DS_Store":
                continue
                
            course_code = folder
            
            print "Processing\t" + course_code
                
            # A file named "course_processing_tracker" (JSON format) is created 
            # for each course to keep track of the processing files
            tracker_path = str(course_log_path + course_code + "/course_processing_tracker")
            if not os.path.exists(tracker_path):
                    
                output_file = open(tracker_path, "w")
                tracker_map = {}
                    
                # This value is used to keep track of the processing status for the course' daily log files, 
                # i.e., "False" (not finished yet) and "True" (finished)
                tracker_map["status"] = False
                    
                tracker_map["processed_dates"] = []
                tracker_map["num_processed_dates"] = 0                
                output_file.write(json.dumps(tracker_map))
                output_file.close()
                    
            # Read the "course_processing_tracker" file
            input_file =  open(tracker_path, "r")
            tracker_map = json.loads(input_file.read())
            input_file.close()
            
            metadata_path = str(course_log_path + course_code + "/metadata/")
            
            # Determine whether the course_structure file is present
            mark = False
            files = os.listdir(metadata_path)
            for file in files:
                if "course_structure" in file:
                    mark = True
                    break
            if not mark:
                print "The course structure file is missing.\n"
                continue
            
            # Learner mode
            if course_code in metadata_update_list:
                print "Learner Mode processing..."        
                learner_mode(metadata_path, course_code, cursor)
            
            # Survey mode
            survey_path = str(course_log_path + course_code + "/surveys/")
            if course_code in survey_update_map.keys():
                print "Survey Mode processing..."        
                pre_id_index = int(survey_update_map[course_code][0])
                post_id_index = int(survey_update_map[course_code][1])
                survey_mode(metadata_path, survey_path, cursor, pre_id_index, post_id_index)
                    
            if tracker_map["status"]:
                print
                continue
                            
            # Retrieve the start/end date of the course
            course_metadata_map = ExtractCourseInformation(metadata_path)
            course_id = course_metadata_map["course_id"]
            start_date = course_metadata_map["start_date"]
            end_date = course_metadata_map["end_date"]
                               
            current_date = start_date
            while current_date <= end_date:
                    
                current_date_string = str(current_date)[0:10]
                if current_date_string not in tracker_map["processed_dates"]:                  
                                            
                    daily_log_file = str("delftx-edx-events-" + current_date_string + ".log.gz")                           
                    if os.path.exists(str(course_log_path + "/daily_logs/" + daily_log_file)):
                                                   
                        print daily_log_file
                                                                            
                        # Decompress log files
                        unzip_file_path = str(course_log_path + course_code + "/unzip_daily_logs/")
                        if not os.path.exists(unzip_file_path):
                            os.mkdir(unzip_file_path)
                            
                        output_path = str(unzip_file_path + daily_log_file[0:-3])
                            
                        if not os.path.exists(output_path):                        
                            output_file = open(output_path, 'w')
                            with gzip.open(str(course_log_path + "/daily_logs/" + daily_log_file), 'r') as f:
                                for line in f:
                                    jsonObject = json.loads(line)
                                    if course_id in jsonObject["context"]["course_id"]:
                                        output_file.write(line)                
                            output_file.close()    
                                                  
                        daily_log_path = output_path
                        
                        # Video_interaction table
                        # print "1.\t Video_interaction table processing..."        
                        remaining_video_interaction_log_path = course_log_path + course_code + "/remaining_video_interaction_logs"
                        video_interaction(metadata_path, daily_log_path, remaining_video_interaction_log_path, cursor)
                        
                        # Quiz mode
                        # print "2.\t Quiz mode processing..."  
                        quiz_mode(daily_log_path, cursor)
                        
                        # Quiz_sessions table
                        # print "3.\t Quiz_sessions table processing..."  
                        remaining_quiz_session_log_path = course_log_path + course_code + "/remaining_quiz_session_logs"
                        quiz_sessions(metadata_path, daily_log_path, remaining_quiz_session_log_path, cursor)
                        
                        # Forum_interaction table
                        # print "4.\t Forum_interaction table processing..."  
                        forum_interaction(metadata_path, daily_log_path, cursor)
                            
                        # Forum_sessions table
                        # print "5.\t Forum_sessions table processing..."  
                        remaining_forum_session_log_path = course_log_path + course_code + "/remaining_forum_session_logs"
                        forum_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor)                     
                                        
                        # Sessions table
                        # print "6.\t Sessions table processing..."  
                        remaining_session_log_path = course_log_path + course_code + "/remaining_session_logs"
                        sessions(metadata_path, daily_log_path, remaining_session_log_path, cursor)                    
                                                                            
                        tracker_map["processed_dates"].append(current_date_string)
                            
                current_date = getNextDay(current_date)
                                            
            if len(tracker_map["processed_dates"]) == getDayDiff(start_date, end_date) + 1:
                tracker_map["status"] = True
                    
            if tracker_map["num_processed_dates"] != len(tracker_map["processed_dates"]):                
                tracker_map["num_processed_dates"] = len(tracker_map["processed_dates"])                
                output_file = open(tracker_path, "w")
                output_file.write(json.dumps(tracker_map))
                output_file.close()
            
            # Delete the decompressed files
            if remove_filtered_logs == "1":
                log_files = os.listdir(str(course_log_path + "/daily_logs/"))
                for log_file in log_files:
                    os.remove(str(course_log_path + "/daily_logs/" + log_file))
                    
        print