sys.setdefaultencoding('utf-8') sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))) import json,math,MySQLdb,datetime,time import similary_question as sq from recommend import RecommendQuestion from common.db_fetcher import DataBaseFetcher recommend = RecommendQuestion() # object db_fetcher = DataBaseFetcher() question_list = [] # set delete_sql = "DELETE FROM error_question_log WHERE question_id IN ( select c.question_id from (SELECT a.question_id FROM error_question_log a JOIN neworiental_v3.entity_question b ON a.question_id = b.id WHERE b.subject_id != 4) c)" db_fetcher.commit_sql_cmd(delete_sql, 'mysql_logdata') # update rows = db_fetcher.get_sql_result("select question_id from error_question_log", 'mysql_logdata') for row in rows: qid = row[0] question_list.append(qid) def getQuestionWords(question_list): # question word dict_question_info = {} lSet = int(len(question_list) / 1000) + 2 start, end = 0, 0 for i in range(1, lSet): end = i * 1000 if end > len(question_list): end = len(question_list) str_question = ','.join([str(x) for x in question_list[start:end] ]) str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data, c.difficulty, c.question_type from neworiental_v3.entity_question c left \ join neworiental_v3.entity_question_type t on t.type_id=c.question_type_id where c.id in (%s) and subject_id = 4" % str_question
class Report(object): """docstring for Report""" def __init__(self, exercise_id, update_time='2017-11-01 00:00:00'): # do something inits self.db_fetcher = DataBaseFetcher() self.exercise_id = exercise_id self.dict_content_answer = {} # error content self.curr_time = datetime.datetime.now() self.update_time = update_time # update time self.recommend = RecommendQuestion() self.dict_realtion_quesion, self.dict_question_num = self.getRelationQuestion( ) self.dict_parent_quesion = self.getChildQuestion() self.dict_question_diff = self.getRightDifficulty() self.dict_point, self.dict_topic_point = self.getPoints() self.dict_students = self.getStudentId() self.dict_student_score, self.dict_student_records = {}, { x: [] for x in self.dict_students } self.exam_list_records, self.practice_list_records = self.getExerciseRecord( ) #records # dict self.dict_question_types = { 1: '选择题', 2: '填空题', 4: '简答题', 6: '综合题', 17: '计算题', 44: '证明题' } self.dict_question_difficulty = { 1: '易', 2: '中', 3: '难', 4: '极难' } # 1:易 2:中 3:难 4:极难 # feedback self.dict_students_resource = self.getStudentResource() # resource def getExerciseRecord(self): exam_list_records = [] practice_list_records = [] sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \ neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % ( self.update_time) dict_is_choice = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question, student_id, ret, answer, submit_time, question_type = row student_id = long(student_id) if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len( answer) == 0: continue if question in self.dict_realtion_quesion: exam_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_content_answer['%s_%s' % (question, student_id)] = answer elif question in self.dict_parent_quesion: dict_content = self.dict_parent_quesion[question] parent_question_id = dict_content['parent_id'] rank = dict_content['rank'] link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ parent_question_id] if question_num >= 17 and question_num <= 21: if rank == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 2: self.dict_content_answer['%s_%s' % (parent_question_id, student_id)] = answer elif question_num >= 22 and question_num <= 23: key = '%s_%s' % (parent_question_id, student_id) if rank == 1: if answer == 'A': dict_is_choice[key] = 1 else: dict_is_choice[key] = 0 elif rank == 2: if key in dict_is_choice: if dict_is_choice[key] == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 3: self.dict_content_answer[key] = answer else: practice_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) return exam_list_records, practice_list_records def getStudentId(self, file_name='students.txt'): list_students = [] with open(file_name) as handle_f: for line in handle_f: system_id = long(line.strip()) list_students.append(system_id) set_studends = set(list_students) # students set str_content = ','.join([str(x) for x in set_studends]) sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content dict_students = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, name = row if student_id in set_studends: dict_students[student_id] = name return dict_students def getRightDifficulty(self): dict_question_diff = {} sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_diff def getChildQuestion(self): dict_parent_list = {} dict_parent_question = {} sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \ where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: qid = row[0] parent_id = row[1] if parent_id not in dict_parent_list: dict_parent_list[parent_id] = [] dict_parent_list[parent_id].append(qid) for parent_id in dict_parent_list: sort_question = sorted(dict_parent_list[parent_id], reverse=False) count = 1 for item in sort_question: dict_parent_question[item] = { 'parent_id': parent_id, 'rank': count } count += 1 return dict_parent_question def getRelationQuestion(self): dict_realtion_quesion = {} dict_point_question_num = {} sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id = \ b.id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') dict_parent_quesion = {} link_parent_list = [] for row in rows: link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = row dict_realtion_quesion[ link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type arr_point = link_point_id.strip().split(',') for point in arr_point: if point not in dict_point_question_num: dict_point_question_num[point] = [] dict_point_question_num[point].append([ link_question_id, question_num, difficulty, question_exercise_id, question_type ]) return dict_realtion_quesion, dict_point_question_num def getScore(self, question_num, answer): ascore, score = 0, 0 if question_num >= 1 and question_num <= 16: score = 5 if (answer == 'A' or answer == 'B') else 0 ascore = 5 elif question_num >= 17 and question_num <= 21: if answer == 'A': score = 4 elif answer == 'B': score = 7 elif answer == 'C': score = 9 elif answer == 'D': score = 12 ascore = 12 elif question_num >= 22 and question_num <= 23: if answer == 'A': score = 4 elif answer == 'B': score = 7 elif answer == 'C': score = 9 elif answer == 'D': score = 10 ascore = 10 return ascore, score def statQustionReport(self): dict_question_count, dict_question_right = {}, {} dict_point_count, dict_point_right = {}, {} dict_error_students, dict_right_students = {}, {} for item in self.exam_list_records: question, student_id, ret, answer, submit_time, question_type = item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = link_point_id.strip().split(',') ascore, score = self.getScore(question_num, answer) is_right = (answer == 'A') or (answer == 'B') # 1 point for point in arr_points: if point in self.dict_point: name, ptype, question_type, level, parent_id, link_id = self.dict_point[ point] if name not in dict_point_count: dict_point_count[name] = 0 dict_point_right[name] = 0 dict_point_right[name] += 1 if ( question_num <= 16 and is_right) or ( question_num >= 17 and answer == 'D') else 0 # dict_point_count[name] += 1 if link_question_id not in dict_question_count: dict_question_right[link_question_id] = 0 dict_question_count[link_question_id] = 0 dict_question_right[ link_question_id] += 1 if ret == 1 else 0 # dict_question_count[link_question_id] += 1 # questions if link_question_id not in dict_right_students: dict_right_students[link_question_id] = [] dict_error_students[link_question_id] = [] if (question_num <= 16 and is_right) or (question_num >= 17 and answer == 'D'): dict_right_students[link_question_id].append(student_id) else: dict_error_students[link_question_id].append(student_id) if student_id not in self.dict_student_score: self.dict_student_score[student_id] = 0 self.dict_student_score[student_id] += score dict_point_acc = {} for point_name in dict_point_count: acc = dict_point_right[point_name] / dict_point_count[point_name] dict_point_acc[point_name] = acc #print '%s \t%s' % (point_name, acc) # students dict_students_point_question = self.calStudentPoints(dict_point_acc) return dict_students_point_question def getErrorText(self, answer): if answer == 'C': return '审题时,粗心看错了' elif answer == 'D': return '审题时,题意读不懂' elif answer == 'E': return '析题时,思路模糊不清' elif answer == 'F': return '析题时,思路方向错误' elif answer == 'G': return '解题时,公式记错了' elif answer == 'H': return '解题时,计算出错了' elif answer == 'I': return '未作答(或未完成),时间来不及了' elif answer == 'A' or answer == 'B': return '答题正确' def getQuestionWords(self): dict_question_words = {} str_question = ','.join([ str(self.dict_realtion_quesion[x][0]) for x in self.dict_realtion_quesion ]) str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \ t.type_id=c.question_type_id where c.id in (%s)" % str_question s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata') for row in s_rows: question, subject_id, type_id, struct_id, json_data = row if json_data != '' and json_data != 'None' and json_data != None: keywords, res = sq.predict(question, subject_id, type_id, struct_id, json_data) dict_question_words[question] = keywords, res return dict_question_words def calStudentPoints(self, dict_point_acc): dict_coef = {x: 1.0 for x in self.dict_point} # bate = 0.25 dict_student_points = {} dict_students_point_question = {} # student -> error question for student_id in self.dict_students: if student_id not in dict_student_points: dict_student_points[student_id] = {} records = sorted(self.dict_student_records[student_id], key=lambda x: x[-1], reverse=True) dict_error = { 'C': 0.0, 'D': 0.0, 'E': 0.0, 'F': 0.0, 'G': 0.0, 'H': 0.0, 'I': 0.0 } mass_score = 0 dict_question_point = {} for item in records: question, student_id, ret, answer, submit_time, question_type = item if answer == 'answer': continue if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] if question_exercise_id != self.exercise_id: continue if question_type == '单选题' or question_type == '选择题': qtype = 1 elif question_type == '填空题': qtype = 2 else: qtype = 3 difficulty = self.dict_question_diff[link_question_id] arr_points = link_point_id.strip().split(',') a_score, score = self.getScore(question_num, answer) content = '%s_%s' % (question, student_id) if content in self.dict_content_answer: answer = self.dict_content_answer[content] if answer != 'A' and answer != 'B' and answer != 'answer': dict_error[answer] += a_score - score if answer == 'C' or answer == 'G' or answer == 'H': mass_score += a_score - score for point in arr_points: if point not in dict_question_point: dict_question_point[point] = [] dict_question_point[point].append( (link_question_id, qtype, submit_time)) name, ptype, question_type, level, parent_id, link_id = self.dict_point[ point] acc = dict_point_acc[name] if point not in dict_student_points[student_id]: dict_student_points[student_id][point] = 0.0 dict_student_points[student_id][ point] += dict_coef[point] * ( a_score - score ) * acc / math.log(1 + bate * (level + difficulty)) sort_error = sorted(dict_error.items(), key=lambda x: x[1], reverse=True) if len(dict_student_points[student_id]) > 0: sort_point = sorted(dict_student_points[student_id].items(), key=lambda x: x[1], reverse=True) student_name = self.dict_students[student_id] point_name1, ptype1, question_type1, level1, parent_id1, link_id1 = self.dict_point[ point] point_name2, ptype2, question_type2, level2, parent_id2, link_id2 = self.dict_point[ sort_point[1][0]] point_name3, ptype3, question_type3, level3, parent_id2, link_id3 = self.dict_point[ sort_point[2][0]] is_power = 1 if mass_score > 5 else 0 error1, error1score = sort_error[0][0], sort_error[0][1] error2, error2score = sort_error[1][0], sort_error[1][1] error1_name, error2_name = self.getErrorText( error1), self.getErrorText(error2) dict_students_point_question[ student_id] = sort_point, dict_question_point return dict_students_point_question def getPoint2Question(self): dict_point_org_question = {} for answer_id in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ answer_id] arr_point = link_point_id.strip().split(',') for item in arr_point: if item not in dict_point_org_question: dict_point_org_question[item] = [] dict_point_org_question[item].append(link_question_id) return dict_point_org_question def pointsRecQuestion(self, dict_students_point_question, dict_diff, throld=4): # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty # dict_point_org_question = self.getPoint2Question() # org question point dict_question_text = self.getQuestionWords() dict_question_target = {} dict_student_recommend_question = {} print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( '学生id', '学生名字', '考点', '考点名称', '原题id', '推荐习题id', '关键词', '难度') for student_id in dict_students_point_question: arr_point, dict_question_point_target = dict_students_point_question[ student_id] student_name = self.dict_students[student_id] haveTime = 12 recommend_questions = [] pre_set = set() for item_target in arr_point: if haveTime <= 0 and len(recommend_questions) % 3 == 0: break item_point = item_target[0] arr_question = sorted(dict_question_point_target[item_point], key=lambda x: x[-1], reverse=True) key = '%s-%s' % (student_id, item_point) rateRight, difficulty = dict_diff[ key] if key in dict_diff else (0.0, 1) pre_set = pre_set | set([x[0] for x in arr_question]) # for base_question, qtype, submit_time in arr_question: if haveTime <= 0 and len(recommend_questions) % 3 == 0: break keywords, text = dict_question_text[base_question] text = text.replace('\n', '').strip('\r').strip() # filter base_question_key = '%s-%s' % (base_question, difficulty) if base_question_key in dict_question_target: target_questions = dict_question_target[ base_question_key] else: target_questions = self.recommend.getEsResult( base_question, text, keywords, difficulty, qtype, pre_set, throld) dict_question_target[ base_question_key] = target_questions for rec_question in target_questions: if haveTime <= 0 and len(recommend_questions) % 3 == 0: break if rec_question in pre_set: continue haveTime += -1 recommend_questions.append( (base_question, keywords, rec_question, item_point, difficulty)) pre_set.add(rec_question) for rec_item in recommend_questions: source_question, keywords, rec_question, item_point, difficulty = rec_item point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[ item_point] print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ( student_id, student_name, item_point, point_name, source_question, rec_question, keywords, difficulty) def importDefault(self, is_new, dict_question_topic): defult_list = [(10805815, 0), (10806190, 2), (10805815, 4), (10806184, 6), (10806182, 8), (10806169, 10), (10806104, 12), (10806057, 14), (10805815, 16), (10806068, 18)] update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values" new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values" db_rows = self.db_fetcher.get_sql_result( "select student_id from entity_student_white_list", "mysql_white_list") is_first = 1 for row in db_rows: student_id = row[0] for item in defult_list: question_id, subject_id = item type_id = 2186 if question_id in dict_question_topic: set_topic = dict_question_topic[question_id] if len(set_topic) > 0: type_id = set_topic.pop() if is_first == 1: update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, subject_id, 0) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, subject_id, 0, 1, 3, type_id) is_first = 0 else: update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, subject_id, 0) new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, subject_id, 0, 1, 3, type_id) if is_new == 1: self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list') else: self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list') def import2DataBase(self, flag, is_new=1, fname='student_rec.txt'): dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic if is_new == 1: self.db_fetcher.commit_sql_cmd( "delete from sync_student_recommend_question", 'mysql_v3_white_list') # update else: self.db_fetcher.commit_sql_cmd( "delete from entity_recommend_question_bytopic", 'mysql_white_list') # update # self.importDefault(is_new, dict_question_topic) # import if flag == 1: # 1,3 5 recomend update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values" insert_sql = "insert into entity_question_recommend(student_id,student_name,point,point_name,question_id,recommend_id,keywords,difficulty) values" new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values" insert_score, is_first = 0, 1 with open(fname) as rec_f: for line in rec_f: if insert_score > 1: arr = line.strip().split('\t') student_id, question_id = arr[0], long(arr[5]) type_id = 2186 if question_id in dict_question_topic: set_topic = dict_question_topic[question_id] if len(set_topic) > 0: type_id = set_topic.pop() if is_first == 1: update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, 0, insert_score) insert_sql += "(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % ( arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, 0, insert_score, 1, 3, type_id) is_first = 0 else: update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, question_id, 0, insert_score) insert_sql += ",(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % ( arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) if is_new == 1: new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, question_id, 0, insert_score, 1, 3, type_id) insert_score += 1 # self.db_fetcher.commit_sql_cmd(insert_sql, 'mysql_logdata') if is_new == 1: # update mysql self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list') # new else: self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list') def getPoints(self): dict_point = {} dict_topic_point = {} sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points' # do something rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row point_id = str(point_id) dict_point[ point_id] = name, ptype, question_type, level, parent_id, link_id if ptype == 3: # arr_topic = [long(x) for x in topics_id.strip().split(',')] for topic in arr_topic: if topic not in dict_topic_point: dict_topic_point[topic] = [] dict_topic_point[topic].append(point_id) return dict_point, dict_topic_point def getStudentResource(self): dict_students_resource = {} str_content = ','.join([str(x) for x in self.dict_students]) sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \ a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % ( str_content) rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, resource_id, deadline = row if resource_id > 0: if student_id not in dict_students_resource: dict_students_resource[student_id] = set() dict_students_resource[student_id].add(resource_id) return dict_students_resource def getQuestionBaseInfo(self, question_set): dict_question_difficulty = {} str_content = ','.join([str(x) for x in question_set]) sql = "select b.id, b.difficulty from neworiental_v3.entity_question where b.id in (%s)" % str_content rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_difficulty def updateStuAdapt2Difficult(self, range_throld=10, difficulty_throld=10): dict_exam_question = {} dict_exam_difficulty = {} dict_question_topic = self.recommend.getThisQuestionTopic() # for exam_item in self.exam_list_records: question, student_id, ret, answer, submit_time, question_type = exam_item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_point = link_point_id.strip().split(',') is_right = 1 if answer == 'A' or answer == 'B' else 0 for point in arr_point: key = '%s-%s' % (student_id, point) if key not in dict_exam_question: dict_exam_question[key] = [] dict_exam_difficulty[key] = {} record = (question, is_right, difficulty, submit_time, question_type) dict_exam_question[key].append(record) if difficulty not in dict_exam_difficulty[key]: dict_exam_difficulty[key][difficulty] = [] dict_exam_difficulty[key][difficulty].append(record) question_set = set([x[0] for x in self.practice_list_records]) dict_question_difficulty = self.getQuestionBaseInfo(question_set) for practice_item in self.practice_list_records: question, student_id, ret, answer, submit_time, question_type = practice_item is_right = 1 if ret == 1 else 0 if question in dict_question_difficulty: difficulty = self.dict_question_difficulty[question] if question in dict_question_topic: topic_set = dict_question_topic[question] for topic in topic_set: if topic in self.dict_topic_point: point = self.dict_topic_point[topic] key = '%s-%s' % (student_id, point) if key not in dict_exam_question: dict_exam_question[key] = [] dict_exam_difficulty[key] = {} if difficulty not in dict_exam_difficulty[key]: dict_exam_difficulty[key][difficulty] = [] record = (question, is_right, difficulty, submit_time, question_type) dict_exam_difficulty[key][difficulty].append( record) dict_exam_question[key].append(record) dict_res = {} self.db_fetcher.commit_sql_cmd('delete from entity_student_feature', 'mysql_logdata') for key in dict_exam_question: arr_content = key.split('-') student_id, point = arr_content[0], arr_content[1] question_records = sorted(dict_exam_question[key], key=lambda x: x[-1], reverse=True) right_cnt = 0 throld = min(len(question_records), range_throld) #if throld < 10: continue for record_item in question_records[:throld]: question, is_right, difficulty, submit_time, question_type = record_item right_cnt += is_right rateRight = right_cnt / range_throld adaDiff, pDiff = 0, -1 for difficulty in dict_exam_difficulty[key]: drecords = sorted(dict_exam_difficulty[key][difficulty], key=lambda x: x[-1], reverse=True) difficulty_throld = min(len(drecords), difficulty_throld) #if difficulty_throld < 4: continue dright_cnt = 0 for diff_item in drecords[:difficulty_throld]: question, is_right, difficulty, submit_time, question_type = diff_item dright_cnt += is_right drateRight = dright_cnt / difficulty_throld if drateRight > pDiff: adaDiff = difficulty pDiff = drateRight point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[ point] insert_sql = "insert into entity_student_feature(student_id, point_id, point_name, adapt_difficulty, master) values(%s, %s, \'%s\', %s, %s)" \ % (student_id, point, point_name, adaDiff, rateRight) insert_id = self.db_fetcher.commit_sql_cmd(insert_sql, 'mysql_logdata') dict_res[key] = rateRight, difficulty return dict_res def getStudentScore(self): dict_student_score = {} for exam_item in self.exam_list_records: exam_question, student_id, ret, answer, submit_time, question_type = exam_item if exam_question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ exam_question] a_score, i_score = self.getScore(question_num, answer) exam_key = '%s-%s' % (student_id, question_exercise_id) if exam_key not in dict_student_score: dict_student_score[exam_key] = 0 dict_student_score[exam_key] += i_score return dict_student_score def getFirstPoint(self): # point dict_relation_point13 = {} sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points \ b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point3, name3, point1, name1 = row dict_relation_point13[point3] = name3, point1, name1 return dict_relation_point13 def getExerciseName(self): dict_exercise_name = {} sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \ on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for exercise_row in exercise_rows: exercise_id, resource_name = exercise_row dict_exercise_name[exercise_id] = resource_name return dict_exercise_name def getExamData(self, dict_student_score, dict_relation_point13, dict_exercise_name): for exam_item in self.exam_list_records: # exam question, student_id, ret, answer, submit_time, question_type = exam_item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = [int(x) for x in link_point_id.strip().split(',')] exam_key = '%s-%s' % (student_id, question_exercise_id) str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True for point3 in arr_points: name3, point1, name1 = dict_relation_point13[point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str(name1) is_first = False else: str_point3 += ',%d' % point3 str_name3 += ',%s' % name3 str_point1 += ',%d' % point1 str_name1 += ',%s' % name1 resource_name = dict_exercise_name[question_exercise_id] exercise_score = dict_student_score[ exam_key] if exam_key in dict_student_score else 0 a_score, i_score = self.getScore(question_num, answer) error_text = self.getErrorText(answer) print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, link_question_id, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, resource_name, exercise_score, question_exercise_id, a_score, i_score, difficulty, ret, question_num, error_text, 1) def getPracticeData(self, dict_relation_point13, dict_question_topic, dict_question_base_info): for practice_item in self.practice_list_records: # practice question, student_id, ret, answer, submit_time, question_type = practice_item str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True if question in dict_question_topic: if question in dict_question_topic: topic_set = dict_question_topic[question] for topic in topic_set: if topic in self.dict_topic_point: arr_point3 = [ int(x) for x in self.dict_topic_point[topic] ] for point3 in arr_point3: name3, point1, name1 = dict_relation_point13[ point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str( name1) is_first = False else: str_point3 += ',%s' % point3 str_name3 += ',%s' % name3 str_point1 += ',%s' % point1 str_name1 += ',%s' % name1 a_score, i_score = 0, 0 # 单选题 选择题 填空题 if question_type == '单选题' or question_type == '选择题' or question_type == '填空题': a_score = 5 else: a_score = 12 if ret == 1: i_score = a_score elif ret == 5: i_score = 0.5 * a_score difficulty, qtype = 1, 1 if question in dict_question_base_info: difficulty, qtype = dict_question_base_info[question] errorText = '正确' if ret == 1 else '错误' print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, question, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score, i_score, difficulty, ret, 0, errorText, 0) def getData(self): dict_student_score = self.getStudentScore() dict_relation_point13 = self.getFirstPoint() # point dict_exercise_name = self.getExerciseName() dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic dict_question_base_info = self.recommend.getThisQuestionBaseInfo( ) # question quality print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \ ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型') self.getExamData(dict_student_score, dict_relation_point13, dict_exercise_name) # exam self.getPracticeData(dict_relation_point13, dict_question_topic, dict_question_base_info) # practice def recommendMonday(self, is_new=0, throld=500): dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic d1 = datetime.datetime.now() d3 = d1 + datetime.timedelta(days=-14) str_monday = d3.strftime("%Y-%m-%d 00:00:00") self.db_fetcher.commit_sql_cmd( "delete from entity_recommend_question_bytopic", 'mysql_white_list') # update self.importDefault(0, dict_question_topic) # import is_first, insert_score = 1, 0 new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values" update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id, topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values" dict_student_cnt = {} records = self.exam_list_records #+ self.practice_list_records for item in records: question, student_id, ret, answer, submit_time, question_type = item if submit_time > str_monday: if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] answer_key = '%s_%s' % (question, student_id) if answer_key in self.dict_content_answer: answer = self.dict_content_answer[answer_key] if answer == 'A' or answer == 'B': ret = 1 else: ret = 2 if ret != 1: if student_id not in dict_student_cnt: dict_student_cnt[student_id] = 0 if dict_student_cnt[student_id] < throld: # print '%s\t%s' % (student_id, link_question_id) if is_first == 1: update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, link_question_id, 0, insert_score) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, link_question_id, 0, insert_score, 1, 3, 2186) is_first = 0 else: update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % ( student_id, link_question_id, 0, insert_score) if is_new == 1: new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % ( student_id, link_question_id, 0, insert_score, 1, 3, 2186) dict_student_cnt[student_id] += 1 insert_score += 1 if is_new == 1: self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list') else: self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')
class CzMathGraph(object): """docstring for czMathGraph""" def __init__(self, stage, subject): self.fetcher = DataBaseFetcher() self.topicDict, self.topicImportDict = self.getTopic(stage, subject) self.quesTopicDict, self.topicQuestionDict = self.getQuestionTopic( ) # static # student exercise self.recordList = self.cacheRecord('question_result.txt') self.topicStudentDict = self.getTopicStudent() # sample def cacheRecord(self, fName, maxWin=3000000): recordList = [] pos = 0 with open(fName) as handleFile: for line in handleFile: arrContent = line.strip().split('\t') studyId, questionId, res = int(arrContent[1]), int( arrContent[2]), arrContent[5] isFlag = False if questionId in self.quesTopicDict: topicSet = self.quesTopicDict[questionId] for topic in topicSet: if topic in self.topicDict: isFlag = True break if isFlag == False: continue # normalize data try: res = int(res) except: try: result = 0 decode_json = json.loads(res) for resDict in decode_json: if resDict['result'] == 1: result += 1 else: result -= 1 res = 1 if result > 0 else 2 except: pass if res != 1: continue # 过滤负样本 recordList.append((studyId, questionId, res)) pos += 1 if pos >= maxWin: break return recordList def getTopicStudent(self): topicStudentDict = {} for item in self.recordList: studyId, questionId, res = item if questionId not in self.quesTopicDict: continue for topic in self.quesTopicDict[questionId]: if topic not in topicStudentDict: topicStudentDict[topic] = {} if studyId not in topicStudentDict[topic]: topicStudentDict[topic][studyId] = 0 topicStudentDict[topic][studyId] += 1 return topicStudentDict # 统计条件概率 def staticTopicProb(self, topic, topicList): if topic not in self.topicStudentDict: return 0.0 fenmuSet = set() # 条件student集合 isFirst = True for item in topicList: if item not in self.topicStudentDict: continue itemSet = set([x for x in self.topicStudentDict[item]]) if isFirst: fenmuSet = itemSet else: fenmuSet = fenmuSet & itemSet fenmu, fenzi = 0.0, 0.0 for student in fenmuSet: for item in topicList: fenmu += self.topicStudentDict[item][student] if student in self.topicStudentDict[topic]: fenzi += self.topicStudentDict[topic][student] p = fenzi / fenmu if fenmu > 0 and fenzi < fenmu else 0.0 return p # 获取词典 def getTopic(self, stage, subjectId): topicDict = {} topicImportDict = {} sql = "select id, name, is_important from entity_topic where subject_id= %s and stage_id = %s" % ( subjectId, stage) db_rows = self.fetcher.get_sql_result(sql, "mysql_v3") for row in db_rows: topicId, name, important = row topicDict[topicId] = name topicImportDict[topicId] = important return topicDict, topicImportDict # 获取问题主题 def getQuestionTopic(self): quesTopicDict = {} topicQuestionDict = {} db_rows = self.fetcher.get_sql_result( "select topic_id, question_id from link_question_topic ", "mysql_v3") for row in db_rows: topicId, questionId = row if topicId not in topicQuestionDict: topicQuestionDict[topicId] = [] topicQuestionDict[topicId].append(questionId) if questionId not in quesTopicDict: quesTopicDict[questionId] = [] quesTopicDict[questionId].append(topicId) quesTopicDict = { key: set(value) for key, value in quesTopicDict.items() } # list to set topicQuestionDict = { key: set(value) for key, value in topicQuestionDict.items() } return quesTopicDict, topicQuestionDict # 统计问题正确率 def statisQuestion(self, subject_id): questionDict = {} db_rows = self.fetcher.get_sql_result( "select question_id, sum(answer_num) answernum, sum(right_num) rightnum from entity_trail_question where subject_id = %s GROUP BY question_id " % subject_id, "mysql_v3") for row in db_rows: questionId, answer, right = row questionDict[questionId] = answer, right return questionDict def isLoopGraph(self, pairTopic, iniRelList, childDict): startTopic, endTopic = pairTopic if startTopic == endTopic: return True # loop stackList = [startTopic] missSet, popSet = set(), set() # init set and pop set while len(stackList) > 0: if endTopic in missSet: return True # the graph is unicom topic = stackList[-1] stackList.pop() # pop if (topic not in childDict) or topic in missSet: continue popSet.add(topic) # add pop set childSet = childDict[topic] for child in childSet: stackList.append(child) missSet.add(child) return False # 构造有像图知识网络 def getTopicDAG(self, threshold=30, inDegreeThr=3, outDegreeThr=4): iniRelList = [] relDict = {} topicList = [topic for topic in self.topicDict] for i in xrange(len(topicList)): for j in xrange(i + 1, len(topicList)): if topicList[i] in self.topicQuestionDict and topicList[ j] in self.topicQuestionDict: iSet, jSet = self.topicQuestionDict[ topicList[i]], self.topicQuestionDict[topicList[j]] key = "%s-%s" % (topicList[i], topicList[j]) mLen = len(iSet & jSet) if mLen > threshold: relDict[key] = mLen sortRel = sorted(relDict.items(), key=lambda x: x[1], reverse=True) inDegreeDict = {} # 入度记录字典 outDegreeDict = {} # 出度记录字典 childDict = {} # child for item in sortRel: inDegreeThr, outDegreeThr = 3, 4 arrTopic = [int(x) for x in item[0].split("-")] if len(arrTopic) != 2: continue start, end = arrTopic[0], arrTopic[1] # 判断转移方向 p0 = self.staticTopicProb(start, [end]) p1 = self.staticTopicProb(end, [start]) pairTopic = (start, end) if p1 < p0 else (end, start) start, end = pairTopic # if self.topicImportDict[start] < 1: outDegreeThr = 1 if self.topicImportDict[end] < 1: inDegreeThr = 1 inDegree = inDegreeDict[end] if end in inDegreeDict else 0 outDegree = outDegreeDict[start] if start in outDegreeDict else 0 isLoop = self.isLoopGraph(pairTopic, iniRelList, childDict) bFlag = (isLoop == False) and (inDegree < inDegreeThr) and ( outDegree < outDegreeThr) if bFlag: iniRelList.append(pairTopic) if end not in childDict: childDict[end] = set() childDict[end].add(start) # 出度 and 入度 outDegreeDict[start] = 1 + outDegree inDegreeDict[end] = 1 + inDegree return iniRelList # 构建系数矩阵数组 def getRowsNormalize(self, topicList): rowsData = [] topicLen = len(topicList) topicIndex = {topicList[i]: i for i in xrange(topicLen)} for item in self.recordList: studyId, questionId, res = item arr = [0] * topicLen if questionId in self.quesTopicDict: topicSet = self.quesTopicDict[questionId] for topic in topicSet: if topic in topicIndex: arr[topicIndex[topic]] = 1 sparr = pd.SparseArray(arr) rowsData.append(arr) return rowsData def makeModel(self): # graph structure initRelation = self.getTopicDAG() # 概率图构建 relationList = [] stuModel = BayesianModel() # DAG for edge in initRelation: try: start, end = edge stuModel.add_edge(str(start), str(end)) relationList.append(edge) except: continue # save file with open('model.txt', 'w') as write_f: for item in relationList: write_f.write( '%s,%s\n' % (self.topicDict[item[0]], self.topicDict[item[1]])) # learning from data topicList = [topic for topic in self.topicDict] rowsData = self.getRowsNormalize(topicList) print 'the rows len is:%s' % len(rowsData)
info += "</tr>" for data in datas: if type(data)==list: info += "<tr>" for k in data: info += "<td>%s</td>" % str(k) info += "</tr>" else: info += "<tr style='background-color:#EEE;'><td colspan='%d'><b>%s</b></td></tr>" % (len(title), str(data)) info += "</table>" return info if __name__=="__main__": domain_list = get_domain_list() now_time = datetime.datetime.now() pre_time = now_time + datetime.timedelta(days=-3) pre_date = pre_time.strftime('%Y-%m-%d') title = '异常抓取站点' data_list = [] for domain in domain_list: sel_sql = "select count(*) from news_report where publish_date >= \'%s 00:00:00\' and domain = \'%s\' and type = 0 limit 1" % (pre_date, domain) db_data = db_fetcher.get_sql_result(sel_sql, 'mysql_insight') domain_count = int(db_data[0][0]) if domain_count == 0: data_list.append(domain) data_list = list(set(data_list)) info = create_html_table(data_list, title) print info
if etype == 0: for key in biandong_list: if unicode(key) in title: etype = 3 return etype if __name__ == "__main__": # update news_report company now_time = datetime.datetime.now() yes_time = now_time + datetime.timedelta(days=-1) yes_date = yes_time.strftime('%Y-%m-%d') condition_sql = "publish_date >= \'%s 00:00:00\' and publish_date <= '%s 23:59:59' and type = 0" % ( yes_date, yes_date) db_data = db_fetcher.get_sql_result( 'select id, title, content from news_report where %s' % condition_sql, 'mysql_insight') for pos in range(len(db_data)): kid, title, content = str(db_data[pos][0]), str(db_data[pos][1]), str( db_data[pos][2]) try: topic_coms = comlib.extract_report( title, content.encode('utf8', 'ignore'), '-1') reported_coms = ",".join([x[0] for x in topic_coms]) com_id, is_confident = comlib.extract_report( title, content.encode('utf8', 'ignore')) reported_com = 0 if com_id and com_id != "0" and is_confident: reported_com = int(com_id) event_type = get_event_type(title) update_sql = "update news_report set reported_com= %s, reported_coms = \'%s\',event_type = %s where id = %s" % (
import sys reload(sys) sys.setdefaultencoding('utf-8') sys.path.append( os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))) from common.db_fetcher import DataBaseFetcher db_fetcher = DataBaseFetcher() if __name__ == "__main__": # update news_report company now_time = datetime.datetime.now() yes_time = now_time + datetime.timedelta(days=-1) yes_date = yes_time.strftime('%Y-%m-%d') db_data = db_fetcher.get_sql_result( 'select id, article_json, publish from kr_articles where publish = \'%s\'' % yes_date, 'mysql_insight') for pos in range(len(db_data)): kid = str(db_data[pos][0]) try: article_json = str(db_data[pos][1]) article_obj = json.loads(article_json, strict=False) url = 'http://36kr.com/p/%s.html' % kid title = article_obj['title'] source = "36Kr" publish_date = '%s 00:00:00' % str(db_data[pos][2]) domain = '36kr.com' content_html = article_obj['content'].encode( 'utf-8' ) if 'content' in article_obj and article_obj['content'] else ''
class TextAnalysis(object): media_dict = None entity_dict = None seo_extractor = SeoExtractor('../seo/com_df_idf') def __init__(self, mysql_fetcher=None, handler_redis=None): self.mysql_fetcher = mysql_fetcher if mysql_fetcher is None: self.mysql_fetcher = DataBaseFetcher() self.handler_redis = handler_redis if handler_redis is None: self.handler_redis = redis.Redis( host='1c34f95e1b12494b.m.cnbja.kvstore.aliyuncs.com', password='******', port=6379, db=5) self.load_entity_dict() def set_media_id(self, media_id): self.media_dict = self.get_media_dict(media_id) # 通过media_id 与向量字典关联 def set_media_dict(self, media_id, forward_index_dict): if media_id in forward_index_dict: self.media_dict = forward_index_dict[media_id] else: self.media_dict = self.get_media_dict(media_id) def load_entity_dict(self): self.entity_dict = set() # handle company company_file = '../../company_recognize/com_name2id.txt' with open(company_file) as com_f: for line in com_f: tokens = line.strip().split('\t') if len(tokens) != 2: continue self.entity_dict.add(tokens[0]) # handle industry db_industry = self.mysql_fetcher.get_sql_result( 'select distinct name from dict_industry', 'mysql_crm') for row_industry in db_industry: self.entity_dict.add(row_industry[0]) # handle organization db_organization = self.mysql_fetcher.get_sql_result( 'select distinct name_abbr from organization', 'mysql_crm') for row_organization in db_organization: self.entity_dict.add(row_organization[0]) # 获取文章关键词-权重值列表 def get_article_weight(self, media_id): response_list = self.seo_extractor.extract_article(media_id, title_weight=2, title_coeff=1.4) if response_list is None or len(response_list) == 0: return {} keyword_dict = {} # result dict for keyword, weight in response_list: if keyword in self.entity_dict: weight *= 1.4 keyword_dict[unicode(keyword, 'utf8')] = weight keyword_dict = { key: value for key, value in keyword_dict.items() if value > term_weight_threshold } if len(keyword_dict) < 5: keyword_dict = { key: value for key, value in keyword_dict.items() if value > 0.68 * term_weight_threshold } return keyword_dict # 删除历史记录的文本向量 def delete_redis_text(self, update_time_range): db_data = self.mysql_fetcher.get_sql_result( "select id from kr_articles where publish < \'%s\'" % update_time_range, 'mysql_insight') for i in range(len(db_data)): media_id = db_data[i][0] str_key = '%s_%s' % (kForwardIndexRedisKey, media_id) self.handler_redis.delete(str_key) # 计算两个向量的余弦相似度 def cos_similarity(self, v1, v2): if len(v1) != len(v2): # 向量维数必须相等 return 0.0 fen_divisor = 0.0 # 除数 dividend_v1 = 0.0 # 被除数 dividend_v2 = 0.0 # 被除数 for pos in range(len(v1)): fen_divisor += v1[pos] * v2[pos] dividend_v1 += v1[pos]**2 dividend_v2 += v2[pos]**2 fen_dividend = math.sqrt(dividend_v1) * math.sqrt(dividend_v2) return (fen_divisor / fen_dividend if fen_dividend > 0.0 else 0.0) # 初始化文本向量 def init_vector(self, vector1, dict_weight): if len(vector1) < 1: return [] v1 = [] # 文本向量vector for pos in range(len(vector1)): if vector1[pos] in dict_weight: v1.append(dict_weight[vector1[pos]]) else: v1.append(0.0) return v1 # 重建media文本信息的正排,并更新redis def media_text_to_forward_index(self, media_id): # 获取需要的字符串 dict_term_weight = self.get_article_weight(media_id) # 存入redis 文本关键词数据 key = media_id str_key = '%s_%s' % (kForwardIndexRedisKey, media_id) str_value = json.dumps(dict_term_weight) self.handler_redis.set(str_key, str_value) return dict_term_weight # 重建update_time_range时间之后的media文本信息的倒排,并更新redis def media_text_to_revert_index(self, update_time_range): #self.delete_redis_text(update_time_range) # 更新redis db_data = self.mysql_fetcher.get_sql_result( 'select id from kr_articles where publish >=\'%s\'' % update_time_range, 'mysql_insight') if len(db_data) < 1: return None revert_index_dict = {} # 倒排索引字典 n_count = 1 for i in range(len(db_data)): media_id = db_data[i][0] # self.cache_media_relate(media_id,update_time_range) # 缓存倒排索引 media_dict = self.get_media_dict(media_id) for term, weight in media_dict.items(): if term not in revert_index_dict: revert_index_dict[term] = {} revert_index_dict[term][media_id] = weight str_key = kRevertIndexRedisKey str_value = str(revert_index_dict) self.handler_redis.set(str_key, str_value) return revert_index_dict # 获取正排 def get_media_dict(self, media_id): media_dict = None str_key = '%s_%s' % (kForwardIndexRedisKey, media_id) str_value = self.handler_redis.get(str_key) # 判断redis 数据库中是否包含media_id 信息 if str_value is None: media_dict = self.media_text_to_forward_index(media_id) else: media_dict = json.loads(str_value) return media_dict # 用倒排计算文本相似度 def calc_simi_using_revert_index(self, revert_index_dict, meta_dict): if revert_index_dict is None: return {} dict_molecule = {} # 分子对象 dict_denominator1 = 0.0 # 分母对象1 result_dict = {} # 结果集字典 key = media_id media_list = [] # 计算所有的d 与该文档的相似度的分子分母 for term, weight in self.media_dict.items(): dict_denominator1 += weight**2 if term not in revert_index_dict: continue doc_weight_list = revert_index_dict[term] for media_id in doc_weight_list: # 获取相关列表 media_list.append(media_id) if media_id in dict_molecule: dict_molecule[ media_id] += weight * doc_weight_list[media_id] else: dict_molecule[ media_id] = weight * doc_weight_list[media_id] for media_id in media_list: if media_id not in meta_dict: continue x, y, dict_denominator2 = meta_dict[media_id] # 分母对象2 fen_dividend = math.sqrt(dict_denominator1) * math.sqrt( dict_denominator2) result_dict[media_id] = dict_molecule[ media_id] / fen_dividend if fen_dividend > 0.0 else 0.0 return result_dict # 构造相似向量 def dict_merge2list(self, d_media, d_compare): vec_words = [] b_similarity = False for media_key in d_media: if media_key in d_compare: vec_words.append(media_key) b_similarity = True return vec_words, b_similarity # 用正排计算文本相似度 def calculate_text_weight(self, compare_id, forward_index_dict): # 判断该向量是否存在 if self.media_dict is None: return 0.0 if compare_id in forward_index_dict: compare_dict = forward_index_dict[compare_id] else: compare_dict = self.get_media_dict(compare_id) # 判断比较向量是否存在 if compare_dict is None: return 0.0 vec_words, b_similarity = self.dict_merge2list(self.media_dict, compare_dict) if b_similarity == False: return 0.0 v1 = self.init_vector(vec_words, self.media_dict) v2 = self.init_vector(vec_words, compare_dict) # 计算余弦相似度值 d_result = self.cos_similarity(v1, v2) return d_result
class Report(object): """docstring for Report""" def __init__(self, update_time='2017-11-01 00:00:00'): # do something inits self.db_fetcher = DataBaseFetcher() self.dict_content_answer = {} # error content self.update_time = update_time # update time self.recommend = RecommendQuestion() self.dict_realtion_quesion, self.dict_question_num, self.dict_question_score = self.getRelationQuestion( ) self.dict_parent_quesion = self.getChildQuestion() self.dict_question_diff = self.getRightDifficulty() self.dict_point, self.dict_topic_point = self.getPoints() self.dict_students = self.getStudentId() self.dict_student_score, self.dict_student_records = {}, { x: [] for x in self.dict_students } self.exam_list_records, self.practice_list_records = self.getExerciseRecord( ) #records # dict self.dict_question_types = { 1: '选择题', 2: '填空题', 4: '简答题', 6: '综合题', 17: '计算题', 44: '证明题' } self.dict_question_difficulty = { 1: '易', 2: '中', 3: '难', 4: '极难' } # 1:易 2:中 3:难 4:极难 # feedback self.dict_students_resource = self.getStudentResource() # resource def getExerciseRecord(self): exam_list_records = [] practice_list_records = [] sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \ neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % ( self.update_time) dict_is_choice = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question, student_id, ret, answer, submit_time, question_type = row student_id = long(student_id) if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len( answer) == 0: continue if question in self.dict_realtion_quesion: exam_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ question, student_id, ret, answer, submit_time, question_type ]) self.dict_content_answer['%s_%s' % (question, student_id)] = answer elif question in self.dict_parent_quesion: dict_content = self.dict_parent_quesion[question] parent_question_id = dict_content['parent_id'] rank = dict_content['rank'] link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ parent_question_id] if question_num >= 17 and question_num <= 21: if rank == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 2: self.dict_content_answer['%s_%s' % (parent_question_id, student_id)] = answer elif question_num >= 22 and question_num <= 23: key = '%s_%s' % (parent_question_id, student_id) if rank == 1: if answer == 'A': dict_is_choice[key] = 1 else: dict_is_choice[key] = 0 elif rank == 2: if key in dict_is_choice: if dict_is_choice[key] == 1: exam_list_records.append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) self.dict_student_records[student_id].append([ parent_question_id, student_id, ret, answer, submit_time, question_type ]) elif rank == 3: self.dict_content_answer[key] = answer else: practice_list_records.append([ question, student_id, ret, answer, submit_time, question_type ]) return exam_list_records, practice_list_records def getStudentId(self, file_name='students.txt'): list_students = [] with open(file_name) as handle_f: for line in handle_f: system_id = long(line.strip()) list_students.append(system_id) set_studends = set(list_students) # students set str_content = ','.join([str(x) for x in set_studends]) sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content dict_students = {} rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, name = row if student_id in set_studends: dict_students[student_id] = name return dict_students def getRightDifficulty(self): dict_question_diff = {} sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_diff def getChildQuestion(self): dict_parent_list = {} dict_parent_question = {} sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \ where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: qid = row[0] parent_id = row[1] if parent_id not in dict_parent_list: dict_parent_list[parent_id] = [] dict_parent_list[parent_id].append(qid) for parent_id in dict_parent_list: sort_question = sorted(dict_parent_list[parent_id], reverse=False) count = 1 for item in sort_question: dict_parent_question[item] = { 'parent_id': parent_id, 'rank': count } count += 1 return dict_parent_question def getRelationQuestion(self): dict_realtion_quesion = {} dict_point_question_num = {} dict_question_score = {} sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type, a.score \ from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id = b.id where a.update_time >= \'%s\'" % self.update_time rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') dict_parent_quesion = {} link_parent_list = [] for row in rows: link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type, score = row dict_realtion_quesion[ link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type dict_question_score[link_question_id] = score arr_point = link_point_id.strip().split(',') for point in arr_point: if point not in dict_point_question_num: dict_point_question_num[point] = [] dict_point_question_num[point].append([ link_question_id, question_num, difficulty, question_exercise_id, question_type ]) return dict_realtion_quesion, dict_point_question_num, dict_question_score def getScore(self, question_num, answer, ascore): ascore, score = 0, 0 if question_num >= 1 and question_num <= 16: score = 5 if (answer == 'A' or answer == 'B') else 0 ascore = 5 elif question_num >= 17 and question_num <= 23: if answer == 'A': if ascore == 10: score = 4 elif ascore == 12: score = 4 elif ascore == 13: score = 5 elif ascore == 14: score = 5 elif ascore == 15: score = 6 elif answer == 'B': if ascore == 10: score = 7 elif ascore == 12: score = 8 elif ascore == 13: score = 9 elif ascore == 14: score = 10 elif ascore == 15: score = 11 elif answer == 'C': if ascore == 10: score = 9 elif ascore == 12: score = 11 elif ascore == 13: score = 12 elif ascore == 14: score = 13 elif ascore == 15: score = 15 elif answer == 'D': score = ascore return score def getRecentlyExercise(throld=5): set_recent = set() sql = "select distinct question_exercise_id from link_question_answer order by question_exercise_id desc limit %s" % throld rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: eid = row eid = long(eid) set_recent.add(eid) return set_recent def getAllScore(self, link_question_id, question_num): ascore = 0 if link_question_id in self.dict_question_score: ascore = self.dict_question_score[link_question_id] else: if question_num >= 1 and question_num <= 16: ascore = 5 elif question_num >= 17 and question_num <= 21: ascore = 12 elif question_num >= 22 and question_num <= 23: ascore = 10 return ascore def statQustionReport(self, range_throld=4, range_target=20): dict_students_point_question = {} dict_student_score = {} for item in self.exam_list_records: question, student_id, ret, answer, submit_time, question_type = item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = link_point_id.strip().split(',') if student_id not in dict_student_score: dict_student_score[student_id] = 0 ascore = self.getAllScore(link_question_id, question_num) score = self.getScore(question_num, answer, ascore) dict_student_score[student_id] += score is_right = (answer == 'A') or (answer == 'B') if student_id not in dict_students_point_question: dict_students_point_question[student_id] = {} # student id point_name = '-'.join([str(x) for x in arr_points]) # name if point_name not in dict_students_point_question[student_id]: dict_students_point_question[student_id][point_name] = [] dict_students_point_question[student_id][point_name].append( (link_question_id, is_right, question_type, difficulty, submit_time)) # question dict_res_student_point_question = {} for student_id in dict_students_point_question: target_rate = 0.5 if student_id in dict_student_score: target_rate = (dict_student_score[student_id] * 0.2 * 1.2) / 150 dict_point_student_question = dict_students_point_question[ student_id] dict_res_student_point_question[student_id] = {} for point_name in dict_point_student_question: arr_question = dict_point_student_question[point_name] target_questions = [] sort_question = sorted(arr_question, key=lambda x: x[-1], reverse=True) right_rate, cnt_rate = 0, range_target if len(sort_question) < cnt_rate: cnt_rate = len(sort_question) for item in sort_question[:cnt_rate]: link_question_id, is_right, qtype, difficulty, submit_time = item if is_right == 0: target_questions.append( (link_question_id, qtype, difficulty, submit_time)) else: right_rate += 1 if right_rate / cnt_rate < target_rate and len( target_questions) > 0: dict_res_student_point_question[student_id][ point_name] = target_questions return dict_res_student_point_question def getErrorText(self, answer): if answer == 'C': return '审题时,粗心看错了' elif answer == 'D': return '审题时,题意读不懂' elif answer == 'E': return '析题时,思路模糊不清' elif answer == 'F': return '析题时,思路方向错误' elif answer == 'G': return '解题时,公式记错了' elif answer == 'H': return '解题时,计算出错了' elif answer == 'I': return '未作答(或未完成),时间来不及了' elif answer == 'A' or answer == 'B': return '答题正确' def getQuestionWords(self): dict_question_words = {} str_question = ','.join([ str(self.dict_realtion_quesion[x][0]) for x in self.dict_realtion_quesion ]) str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \ t.type_id=c.question_type_id where c.id in (%s)" % str_question s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata') for row in s_rows: question, subject_id, type_id, struct_id, json_data = row if json_data != '' and json_data != 'None' and json_data != None: keywords, res = sq.predict(question, subject_id, type_id, struct_id, json_data) dict_question_words[question] = keywords, res return dict_question_words def getPoint2Question(self): dict_point_org_question = {} for answer_id in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ answer_id] arr_point = link_point_id.strip().split(',') for item in arr_point: if item not in dict_point_org_question: dict_point_org_question[item] = [] dict_point_org_question[item].append(link_question_id) return dict_point_org_question def pointsRecQuestion(self, dict_students_point_question, throld=4, error_range=3): # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty # dict_point_org_question = self.getPoint2Question() # org question point dict_question_text = self.getQuestionWords() dict_question_base_info = self.recommend.getThisQuestionBaseInfo( ) # question quality dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic # self.recommendMonday(dict_question_base_info) # error questionsss dict_question_target = {} dict_student_recommend_question = {} cid = 1 student_target = open('student_target.txt', 'w') for student_id in dict_students_point_question: sort_student_item = sorted( dict_students_point_question[student_id].items(), key=lambda x: len(x[-1]), reverse=False) student_name = self.dict_students[student_id] haveTime = 60 recommend_questions = [] pre_set = set() for item_target in sort_student_item: if haveTime <= 0: break item_point, arr_question = item_target item_one = item_point.strip().split('-')[0] key = '%s-%s' % (student_id, item_one) pre_set = pre_set | set([x[0] for x in arr_question]) # if len(arr_question) > error_range: arr_question = arr_question[:error_range] for base_question, qtype, difficulty, submit_time in arr_question: if haveTime <= 0: break if qtype == '选择题': qtype = 1 elif qtype == '填空题': qtype = 2 else: qtype = 3 keywords, text = dict_question_text[base_question] text = text.replace('\n', '').strip('\r').strip() # filter base_question_key = '%s-%s' % (base_question, difficulty) if base_question_key in dict_question_target: target_questions = dict_question_target[ base_question_key] else: target_questions = self.recommend.getEsResult( base_question, text, keywords, difficulty, qtype, pre_set, throld) dict_question_target[ base_question_key] = target_questions for rec_question in target_questions: if haveTime <= 0: break if rec_question in pre_set: continue haveTime += -1 recommend_questions.append( (base_question, keywords, rec_question, item_one, difficulty)) pre_set.add(rec_question) insert_score = 0 for rec_item in recommend_questions: source_question, keywords, rec_question, item_point, difficulty = rec_item point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[ item_point] type_id = 2186 if rec_question in dict_question_base_info: difficulty, question_type, upload_id = dict_question_base_info[ rec_question] else: upload_id = 0 if rec_question in dict_question_topic: set_topic = dict_question_topic[rec_question] if len(set_topic) > 0: type_id = set_topic.pop() str_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') print "%s\t%s\t1\t%s\t%s\t针对练习\t重点\t%s\t%s\t%s\t%s\t%s" % ( cid, student_id, rec_question, 4, insert_score, str_time, 1, 3, type_id) student_target.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (cid, student_id, student_name, item_point, point_name, source_question, rec_question, keywords, difficulty)) insert_score += 1 cid += 1 def getPoints(self): dict_point = {} dict_topic_point = {} sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points' # do something rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row point_id = str(point_id) dict_point[ point_id] = name, ptype, question_type, level, parent_id, link_id if ptype == 3: # arr_topic = [long(x) for x in topics_id.strip().split(',')] for topic in arr_topic: if topic not in dict_topic_point: dict_topic_point[topic] = [] dict_topic_point[topic].append(point_id) return dict_point, dict_topic_point def getStudentResource(self): dict_students_resource = {} str_content = ','.join([str(x) for x in self.dict_students]) sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \ a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % ( str_content) rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: student_id, resource_id, deadline = row if resource_id > 0: if student_id not in dict_students_resource: dict_students_resource[student_id] = set() dict_students_resource[student_id].add(resource_id) return dict_students_resource def getQuestionBaseInfo(self, question_set): dict_question_difficulty = {} str_content = ','.join([str(x) for x in question_set]) sql = "select b.id, b.difficulty from neworiental_v3.entity_question where b.id in (%s)" % str_content rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, difficulty = row dict_question_diff[question_id] = int(difficulty) return dict_question_difficulty def getStudentScore(self): dict_student_score = {} for exam_item in self.exam_list_records: exam_question, student_id, ret, answer, submit_time, question_type = exam_item if exam_question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ exam_question] ascore = self.getAllScore(link_question_id, question_num) i_score = self.getScore(question_num, answer, ascore) exam_key = '%s-%s' % (student_id, question_exercise_id) if exam_key not in dict_student_score: dict_student_score[exam_key] = 0 dict_student_score[exam_key] += i_score return dict_student_score def getFirstPoint(self): # point dict_relation_point13 = {} sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points \ b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: point3, name3, point1, name1 = row dict_relation_point13[point3] = name3, point1, name1 return dict_relation_point13 def getExerciseName(self): dict_exercise_name = {} sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \ on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for exercise_row in exercise_rows: exercise_id, resource_name = exercise_row dict_exercise_name[exercise_id] = resource_name return dict_exercise_name def getExamData(self, dict_student_score, dict_relation_point13, dict_exercise_name): for exam_item in self.exam_list_records: # exam question, student_id, ret, answer, submit_time, question_type = exam_item if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] arr_points = [int(x) for x in link_point_id.strip().split(',')] exam_key = '%s-%s' % (student_id, question_exercise_id) str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True for point3 in arr_points: name3, point1, name1 = dict_relation_point13[point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str(name1) is_first = False else: str_point3 += ',%d' % point3 str_name3 += ',%s' % name3 str_point1 += ',%d' % point1 str_name1 += ',%s' % name1 resource_name = dict_exercise_name[question_exercise_id] exercise_score = dict_student_score[ exam_key] if exam_key in dict_student_score else 0 a_score, i_score = self.getScore(question_num, answer) error_text = self.getErrorText(answer) print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, link_question_id, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, resource_name, exercise_score, question_exercise_id, a_score, i_score, difficulty, ret, question_num, error_text, 1) def getPracticeData(self, dict_relation_point13, dict_question_topic, dict_question_base_info): for practice_item in self.practice_list_records: # practice question, student_id, ret, answer, submit_time, question_type = practice_item str_point1, str_name1, str_point3, str_name3 = '', '', '', '' is_first = True if question in dict_question_topic: if question in dict_question_topic: topic_set = dict_question_topic[question] for topic in topic_set: if topic in self.dict_topic_point: arr_point3 = [ int(x) for x in self.dict_topic_point[topic] ] for point3 in arr_point3: name3, point1, name1 = dict_relation_point13[ point3] if is_first == True: str_point3, str_name3, str_point1, str_name1 = str( point3), str(name3), str(point1), str( name1) is_first = False else: str_point3 += ',%s' % point3 str_name3 += ',%s' % name3 str_point1 += ',%s' % point1 str_name1 += ',%s' % name1 a_score, i_score = 0, 0 # 单选题 选择题 填空题 if question_type == '单选题' or question_type == '选择题' or question_type == '填空题': a_score = 5 else: a_score = 12 if ret == 1: i_score = a_score elif ret == 5: i_score = 0.5 * a_score difficulty, qtype = 1, 1 if question in dict_question_base_info: difficulty, qtype, upload_id = dict_question_base_info[ question] errorText = '正确' if ret == 1 else '错误' print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( student_id, question, question_type, str_point1, str_name1, str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score, i_score, difficulty, ret, 0, errorText, 0) def getData(self): dict_student_score = self.getStudentScore() dict_relation_point13 = self.getFirstPoint() # point dict_exercise_name = self.getExerciseName() dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic dict_question_base_info = self.recommend.getThisQuestionBaseInfo( ) # question quality print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \ ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型') self.getExamData(dict_student_score, dict_relation_point13, dict_exercise_name) # exam self.getPracticeData(dict_relation_point13, dict_question_topic, dict_question_base_info) # practice def recommendMonday(self, dict_question_base_info, is_new=1, throld=500): dict_question_topic = self.recommend.getThisQuestionTopic( ) # question topic d1 = datetime.datetime.now() d3 = d1 + datetime.timedelta(days=-7) str_monday = d3.strftime("%Y-%m-%d 00:00:00") is_first, insert_score = 1, 0 dict_student_cnt = {} records = self.exam_list_records #+ self.practice_list_records for item in records: question, student_id, ret, answer, submit_time, question_type = item if submit_time > str_monday: if question in self.dict_realtion_quesion: link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[ question] answer_key = '%s_%s' % (question, student_id) if answer_key in self.dict_content_answer: answer = self.dict_content_answer[answer_key] if answer == 'A' or answer == 'B': ret = 1 else: ret = 2 if ret != 1: if student_id not in dict_student_cnt: dict_student_cnt[student_id] = 0 if dict_student_cnt[student_id] < throld: # print '%s\t%s' % (student_id, link_question_id) if link_question_id in dict_question_base_info: difficulty, question_type, upload_id = dict_question_base_info[ link_question_id] else: upload_id = 0 print "%s\t1\t%s\t%s\t\'最近新错\'\t\'重难点\'\t%s\t%s\t%s\t%s" % ( student_id, link_question_id, 0, insert_score, 1, 3, 2186) dict_student_cnt[student_id] += 1 insert_score += 1
class SimiTags(object): def __init__(self): self.db_fetcher = DataBaseFetcher() self.tag2comlist = self.load_info( 'select cid,tag from crm.company_tag') self.tag_child = self.load_info( 'select tag,parent_tag from crm.tag_tag_relation') self.tag_parent = self.load_info( 'select parent_tag,tag from crm.tag_tag_relation') self.total_num, self.manual_tag_count = self.tag_cid() def load_info(self, cmd): temp_dict = {} db_row = self.db_fetcher.get_sql_result(cmd, 'mysql_readall') for row in db_row: leaf, root = row if root not in temp_dict: temp_dict[root] = [] temp_dict[root].append(leaf) return temp_dict def tag_cid(self): db_row = self.db_fetcher.get_sql_result( 'select count(distinct cid) from crm.company_tag', 'mysql_readall') total_num = db_row[0][0] manual_tag_count = {} db_row = self.db_fetcher.get_sql_result( 'select cid,tag from crm.company_tag where type != 5', 'mysql_readall') for row in db_row: crm_id, tag = row if tag not in manual_tag_count: manual_tag_count[tag] = 0 manual_tag_count[tag] += 1 return total_num, manual_tag_count def get_correlation(self): similarity_tag = {} tag_corre_dict = {} f = 0 for tag1 in self.tag2comlist: tag_corre_dict = {} if tag1.lower() != '人工智能': continue if tag1 not in self.manual_tag_count: continue manual_num_x = self.manual_tag_count[tag1] if manual_num_x < 30: continue l1 = self.tag2comlist[tag1] sum_x = len(l1) if sum_x < 100: continue for tag2 in self.tag2comlist: if tag1 == tag2: continue if tag2 not in self.tag2comlist or tag2 not in self.manual_tag_count: continue manual_num = self.manual_tag_count[tag2] if manual_num < 2: continue l2 = self.tag2comlist[tag2] sum_y = len(l2) if sum_y > sum_x * 1.5: continue xy = len(set(l1) & set(l2)) if xy < 5: continue pf = xy * 1.0 / sum_x if pf > 0.6: continue pc = xy * 1.0 / sum_y pnc = (sum_x - xy + 1) * 1.0 / (self.total_num - sum_y) if pc < 0.01: continue if pnc != 0: value = pc / pnc if value < 1: continue correlation = math.pow(xy, 0.2) * math.pow(value, 0.8) tag_corre_dict[tag2] = correlation tag_items = tag_corre_dict.items() tag_heap = heapq.nlargest(len(tag_items), tag_items, key=lambda x: x[-1]) tag_heap = tag_heap[:40] if len(tag_heap) == 0: continue re = map(lambda x: x[0], tag_heap) if len(re) == 0: continue similarity_tag[tag1] = re print tag1, '\t', "%s" % json.dumps( re, ensure_ascii=False).encode('utf8') return similarity_tag def rm_sub(self, tag_heap): abandon_l = [] for i in range(0, len(tag_heap)): tag1, weight = tag_heap[i] if tag1 not in self.tag2comlist: continue l1 = self.tag2comlist[tag1] sum_x = len(l1) tag1_p, tag1_c = [], [] if tag1 in self.tag_child: tag1_c = self.tag_child[tag1] if tag1 in self.tag_parent: tag1_p = self.tag_parent[tag1] for j in range(i + 1, len(tag_heap) - 1): tag2, weight = tag_heap[j] if tag2 in tag1_p or tag2 in tag1_c: abandon_l.append(j) continue if tag1.find(tag2) != -1 or tag2.find(tag1) != -1: abandon_l.append(j) continue l2 = self.tag2comlist[tag2] sum_y = len(l2) xy = len(set(l1) & set(l2)) if (xy * 1.0 / sum_y) > 0.8: abandon_l.append(j) if (xy * 1.0 / sum_x) > 0.6: abandon_l.append(j) re_l = [] for i in range(0, len(tag_heap) - 1): if i in abandon_l: continue else: re_l.append(tag_heap[i][0]) return re_l
class RecommendQuestion(object): def __init__(self): self.db_fetcher = DataBaseFetcher() # mysql handle self.dict_topic = self.getTopicDict() # topic self.dict_quality = {5: 25, 4: 20, 3: 15, 2: 10, 1: 5} self.dict_question_base_info = self.getQuestionBaseInfo() # base self.dict_question_quality = self.getQuestionQuality() self.dict_question_topic, self.dict_topic_question = self.getQuestionTopic( ) # topic def getThisQuestionTopic(self): return self.dict_question_topic def getThisQuestionBaseInfo(self): return self.dict_question_base_info def getQuestionBaseInfo(self, save_file='question_base.txt'): dict_question_base_info = {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue qid, difficulty, question_type, upload_id = long(arr[0]), int( arr[1]), int(arr[2]), str(arr[3]).strip() try: upload_id = long(upload_id) except: upload_id = 0 dict_question_base_info[ qid] = difficulty, question_type, upload_id max_num = qid sql = "select id, difficulty, question_type, upload_id from neworiental_v3.entity_question where subject_id = 4 and id > %s and state != \'DISABLED\' order by id asc" % max_num rows = self.db_fetcher.get_sql_result(sql, "mysql_logdata") with open(save_file, 'a') as write_f: for row in rows: qid, difficulty, str_question_type, upload_id = row question_type = 1 if str_question_type == '选择题' or str_question_type == '单选题': question_type = 1 elif str_question_type == '填空题': question_type = 2 else: question_type = 3 dict_question_base_info[ qid] = difficulty, question_type, upload_id write_f.write('%s\t%s\t%s\t%s\n' % (qid, difficulty, question_type, upload_id)) return dict_question_base_info def getQuestionQuality(self, save_file='quality.txt'): dict_question_quality = {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue qid, question_id, extra_score = arr[0], long(arr[1]), int( arr[2]) dict_question_quality[question_id] = extra_score max_num = qid sql = "select id, question_id, extra_score from neworiental_v3.entity_question_quality where id > %s" % max_num rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') with open(save_file, 'a') as write_f: for row in rows: qid, question_id, extra_score = row dict_question_quality[question_id] = extra_score write_f.write('%s\t%s\t%s\n' % (qid, question_id, extra_score)) return dict_question_quality def getQuestionTopic(self, save_file='topic.txt'): dict_question_topic, dict_topic_question = {}, {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue tid, question_id, topic_id = arr[0], arr[1], arr[2] question_id = long(question_id) topic_id = long(topic_id) if question_id not in dict_question_topic: dict_question_topic[question_id] = set() if topic_id not in dict_topic_question: dict_topic_question[topic_id] = {} extra_score = 3 if question_id in self.dict_question_quality: extra_score = self.dict_question_quality[question_id] difficulty, question_type = 1, 1 if question_id in self.dict_question_base_info: difficulty, question_type, upload_id = self.dict_question_base_info[ question_id] dict_question_topic[question_id].add(topic_id) # add key = '%s-%s' % (difficulty, question_type) if key not in dict_topic_question[topic_id]: dict_topic_question[topic_id][key] = [] dict_topic_question[topic_id][key].append( (question_id, extra_score)) max_num = tid sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') with open(save_file, 'a') as write_f: for row in rows: tid, question_id, topic_id = row if question_id not in dict_question_topic: dict_question_topic[question_id] = set() if topic_id not in dict_topic_question: dict_topic_question[topic_id] = {} extra_score = 3 if question_id in self.dict_question_quality: extra_score = self.dict_question_quality[question_id] difficulty, question_type = 1, 1 if question_id in self.dict_question_base_info: difficulty, str_question_type, upload_id = self.dict_question_base_info[ question_id] if str_question_type == '选择题' or str_question_type == '单选题': question_type = 1 elif str_question_type == '填空题': question_type = 2 else: question_type = 3 key = '%s-%s' % (difficulty, question_type) dict_question_topic[question_id].add(topic_id) # add if key not in dict_topic_question[topic_id]: dict_topic_question[topic_id][key] = [] dict_topic_question[topic_id][key].append( (question_id, extra_score)) write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id)) for topic in dict_topic_question: # sort for key in dict_topic_question[topic]: dict_topic_question[topic][key] = sorted( dict_topic_question[topic][key], key=lambda x: x[-1], reverse=True) # return dict_question_topic, dict_topic_question def getTopicDict(self): dict_topic = {} sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: topic_id, name = row topic_id = str(topic_id) dict_topic[topic_id] = name return dict_topic def getQuestionRate(self): dict_question_rate = {} sql = "select question_id, rate from question_accuracy" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: question_id, rate = row rate = float(rate) dict_question_rate[question_id] = rate return dict_question_rate def getHeaders(self, teacher_id='eefbac5a4e58432ca317642a2077d362'): return { 'Host': 'jiaoshi.okjiaoyu.cn', 'Connection': 'keep-alive', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'is_new_okay': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36', 'Referer': 'http://jiaoshi.okjiaoyu.cn/quizcenter_vm/quizcenter', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Cookie': '_ga=GA1.2.1647528971.1502935646; _gid=GA1.2.1532869714.1506481969; org_id=113; teacher_id=%s; Hm_lvt_2014de1ca4ec84db492ebee33b1dc46c=1506044442,1506324027,1506407711,1506479104; Hm_lpvt_2014de1ca4ec84db492ebee33b1dc46c=1506492873' % teacher_id } def getUrlContent(self, url): req = urllib2.Request(url, headers=self.getHeaders()) hh = urllib2.HTTPHandler() opener = urllib2.build_opener(hh) reply = opener.open(req, timeout=30) if reply.info().get('Content-Encoding') == 'gzip': buf = StringIO(reply.read()) f = gzip.GzipFile(fileobj=buf) res = f.read() f.close() buf.close() reply.close() else: res = reply.read() return res def normalLeven(self, str1, str2): len_str1 = len(str1) + 1 len_str2 = len(str2) + 1 #create matrix matrix = [0 for n in range(len_str1 * len_str2)] #init x axis for i in range(len_str1): matrix[i] = i #init y axis for j in range(0, len(matrix), len_str1): if j % len_str1 == 0: matrix[j] = j // len_str1 for i in range(1, len_str1): for j in range(1, len_str2): if str1[i - 1] == str2[j - 1]: cost = 0 else: cost = 1 matrix[j * len_str1 + i] = min( matrix[(j - 1) * len_str1 + i] + 1, matrix[j * len_str1 + (i - 1)] + 1, matrix[(j - 1) * len_str1 + (i - 1)] + cost) return matrix[-1] def translate(self, str): line = str.strip().decode('utf-8', 'ignore') p2 = re.compile(ur'[^\u4e00-\u9fa5]') zh = " ".join(p2.split(line)).strip() zh = ",".join(zh.split()) outStr = zh return outStr def calScoreQustion(self, question_id, list_rank): dict_score = {} pos = 0 dict_qid_type = {} set_source = self.dict_question_topic[ question_id] if question_id in self.dict_question_topic else set() for item in list_rank: qid, qtype, topic_list, extra_score = item set_target = self.dict_question_topic[ qid] if qid in self.dict_question_topic else set() score = (1 - numpy.tanh( 0.03 * pos)) * 60 + self.dict_quality[extra_score] if len(set_target & set_source) > 0: score += 15 dict_score[qid] = score dict_qid_type[qid] = qtype pos += 1 sort_res = sorted(dict_score.items(), key=lambda x: x[1], reverse=True) list_res = [] for item in sort_res: qid = item[0] qtype = dict_qid_type[qid] list_res.append((qid, qtype)) return list_res def randomTopicHeader(self, question_id, list_res, pre_set, rank_size, throld_size): # 头部数据处理 if question_id in self.dict_question_topic: topic_set = self.dict_question_topic[question_id] tLen = len(topic_set) if tLen > 0: index = random.randint(0, tLen - 1) topic = list(topic_set)[index] # first if question_id in self.dict_question_base_info: difficulty, qtype, upload_id = self.dict_question_base_info[ question_id] key = '%s-%s' % (difficulty, qtype) if key in self.dict_topic_question[topic]: list_res_question = [] arr_content_question = self.dict_topic_question[topic][ key] for content_item in arr_content_question: res_question, extra_score = content_item if res_question not in pre_set and res_question != question_id: list_res_question.append(res_question) if len(list_res_question) > rank_size: break rnt = throld_size - len(list_res) while rnt > 0 and len(list_res_question) > 0: pos = random.randint(0, len(list_res_question) - 1) list_res.append(list_res_question[pos]) rnt += -1 return list_res def getEsResult(self, question_id, text, keywords, difficulty, bqtype, pre_set, throld_size, throld_page=10, throld_cost=5, rank_size=20): """ questionType: 1 选择题 2 填空题 3 判断题 4 简答题 6 综合题 difficulty : 1:易 2:中 3:难 4:极难 """ list_rank = [] headers = self.getHeaders() # get headers str_pre = '' if bqtype == 1: bqtype = 101 for page in range(1, throld_page): try: url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?keyword=%s&subject_id=4&queryType=4&difficulty=%s&type=%s&page=%s" % ( keywords, difficulty, bqtype, page) # url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?_=1506480814420&teacher_id=ce36ea7e9c864af39d186b8e8b672864&type=&difficulty=%s&page=%s&keyword=%s&subject_id=4" % (difficulty, page, keywords) res = self.getUrlContent(url) res = res.replace('\r', '').replace('\n', '').replace( 'null', '').replace(':,', ':\"\",').strip() dict_result = json.loads(res.encode('utf8')) total_page = dict_result['data']['total_page'] if total_page < throld_page: throld_page = total_page # range update list_result = dict_result['data']['list'] for item in list_result: qid = item['id'] question_body = self.translate(item['question_body']) analysis = str(item['analysis']).strip().replace(' ', '') topic_list = item['topic_list'] if len(topic_list) >= 1: type_name = item['type_name'] if type_name == '选择题': option_list = item['option_list'] if len(option_list) != 4: continue cost = self.normalLeven(str_pre, question_body) if qid in self.dict_question_quality: extra_score = self.dict_question_quality[qid] else: extra_score = 1 if item['type_alias'] == '单选题' or item[ 'type_alias'] == '选择题': qtype = 1 elif item['type_alias'] == '填空题': qtype = 2 else: qtype = 3 bCharge = (cost > throld_cost) and (len(analysis) > 50) if bCharge: list_rank.append( (qid, qtype, item['topic_list'], extra_score)) str_pre = question_body except: continue list_index = self.calScoreQustion(question_id, list_rank) # rank res_size = throld_size - 1 if len( list_index) > throld_size - 1 else len(list_index) list_res = [x[0] for x in list_index[:res_size]] # end output list_res = self.randomTopicHeader( question_id, list_res, pre_set, rank_size, throld_size) # header question random return list_res
class QualityQuestion(object): """docstring for QualityQuestion""" def __init__(self): self.db_fetcher = DataBaseFetcher() # mysql handle self.redis_cache = CacheHelper('127.0.0.1', 6379, 1, '') # redis self.dict_simple_science, self.dict_simple_art = self.getSimpleQustion( ) self.dict_question_topic = self.getQuestionTopic() self.dict_topic = self.getTopicDict() def getQuestionTopic(self, save_file='topic.txt'): dict_question_topic = {} max_num = 0 with open(save_file) as handle_f: for line in handle_f: arr = line.strip().split('\t') if len(arr) != 3: continue tid, question_id, topic_id = arr[0], long(arr[1]), int(arr[2]) if question_id not in dict_question_topic: dict_question_topic[question_id] = set() dict_question_topic[question_id].add(topic_id) max_num = tid sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') with open(save_file, 'a') as write_f: for row in rows: tid, question_id, topic_id = row if question_id not in dict_question_topic: dict_question_topic[question_id] = set() dict_question_topic[question_id].add(topic_id) write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id)) return dict_question_topic def getTopicDict(self): dict_topic = {} sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4" rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: topic_id, name = row topic_id = int(topic_id) dict_topic[topic_id] = name return dict_topic def makeSet2Dict(self, set_simple): dict_simple = {} for item in set_simple: arr = item.split('\t') qid, question_type, json_data = arr[0], arr[1], arr[2] qid = long(qid) dict_simple[qid] = question_type, json_data return dict_simple def getSimpleQustion(self): dict_simple_science = {} dict_simple_art = {} is_redis = self.redis_cache.exists( '62951166961') and self.redis_cache.exists('62951084550') if is_redis: # science set_simple_science = self.redis_cache.smembers('62951166961') dict_simple_science = self.makeSet2Dict(set_simple_science) # art set_simple_art = self.redis_cache.smembers('62951084550') dict_simple_art = self.makeSet2Dict(set_simple_art) else: sql = "SELECT id, question_type, json_data, upload_id from neworiental_v3.entity_question where upload_id in (62951166961, 62951084550) and json_data !=\'\' " rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata') for row in rows: qid, question_type, json_data, upload_id = row upload_id = str(upload_id) json_data = json_data.replace('\t', '') if upload_id == '62951166961': # students majored in science dict_simple_science[qid] = question_type, json_data elif upload_id == '62951084550': dict_simple_art[qid] = question_type, json_data str_content = '%s\t%s\t%s' % (qid, question_type, json_data) self.redis_cache.sadd(upload_id, str_content) return dict_simple_science, dict_simple_art def getTopicSimpleQuestion(self): dict_topic_question = {} for qid in self.dict_simple_art: if qid in self.dict_question_topic: set_topic = self.dict_question_topic[qid] for topic in set_topic: if topic not in dict_topic_question: dict_topic_question[topic] = set() dict_topic_question[topic].add(qid) for topic in dict_topic_question: set_topic = dict_topic_question[topic] str_content = '\t'.join([str(x) for x in set_topic]) print '%s\t%s\t%s' % (topic, self.dict_topic[topic], str_content)