Exemplo n.º 1
0
sys.setdefaultencoding('utf-8')
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
import json,math,MySQLdb,datetime,time
import similary_question as sq

from recommend import RecommendQuestion
from common.db_fetcher import DataBaseFetcher

recommend = RecommendQuestion() # object
db_fetcher = DataBaseFetcher()

question_list = [] # set
delete_sql = "DELETE FROM error_question_log WHERE question_id IN ( select c.question_id from (SELECT a.question_id FROM error_question_log a JOIN neworiental_v3.entity_question b ON a.question_id = b.id WHERE b.subject_id != 4) c)"
db_fetcher.commit_sql_cmd(delete_sql, 'mysql_logdata') # update

rows = db_fetcher.get_sql_result("select question_id from error_question_log", 'mysql_logdata')
for row in rows:
    qid = row[0]
    question_list.append(qid)

def getQuestionWords(question_list): # question word
    dict_question_info = {}
    lSet = int(len(question_list) / 1000) + 2
    start, end = 0, 0
    for i in range(1, lSet):
        end = i * 1000
        if end > len(question_list): end = len(question_list)
        str_question = ','.join([str(x) for x in question_list[start:end] ])
        str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data, c.difficulty, c.question_type from neworiental_v3.entity_question c left \
            join neworiental_v3.entity_question_type t on t.type_id=c.question_type_id where c.id in (%s) and subject_id = 4" % str_question
Exemplo n.º 2
0
class Report(object):
    """docstring for Report"""
    def __init__(self, exercise_id, update_time='2017-11-01 00:00:00'):
        # do something inits
        self.db_fetcher = DataBaseFetcher()
        self.exercise_id = exercise_id
        self.dict_content_answer = {}  # error content
        self.curr_time = datetime.datetime.now()

        self.update_time = update_time  # update time
        self.recommend = RecommendQuestion()

        self.dict_realtion_quesion, self.dict_question_num = self.getRelationQuestion(
        )
        self.dict_parent_quesion = self.getChildQuestion()
        self.dict_question_diff = self.getRightDifficulty()
        self.dict_point, self.dict_topic_point = self.getPoints()

        self.dict_students = self.getStudentId()
        self.dict_student_score, self.dict_student_records = {}, {
            x: []
            for x in self.dict_students
        }

        self.exam_list_records, self.practice_list_records = self.getExerciseRecord(
        )  #records

        # dict
        self.dict_question_types = {
            1: '选择题',
            2: '填空题',
            4: '简答题',
            6: '综合题',
            17: '计算题',
            44: '证明题'
        }
        self.dict_question_difficulty = {
            1: '易',
            2: '中',
            3: '难',
            4: '极难'
        }  # 1:易 2:中 3:难 4:极难

        # feedback
        self.dict_students_resource = self.getStudentResource()  # resource

    def getExerciseRecord(self):
        exam_list_records = []
        practice_list_records = []
        sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \
                neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % (
            self.update_time)

        dict_is_choice = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question, student_id, ret, answer, submit_time, question_type = row
            student_id = long(student_id)

            if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len(
                    answer) == 0:
                continue
            if question in self.dict_realtion_quesion:
                exam_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

                self.dict_student_records[student_id].append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])
                self.dict_content_answer['%s_%s' %
                                         (question, student_id)] = answer
            elif question in self.dict_parent_quesion:
                dict_content = self.dict_parent_quesion[question]
                parent_question_id = dict_content['parent_id']

                rank = dict_content['rank']
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    parent_question_id]
                if question_num >= 17 and question_num <= 21:
                    if rank == 1:
                        exam_list_records.append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                        self.dict_student_records[student_id].append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                    elif rank == 2:
                        self.dict_content_answer['%s_%s' %
                                                 (parent_question_id,
                                                  student_id)] = answer
                elif question_num >= 22 and question_num <= 23:
                    key = '%s_%s' % (parent_question_id, student_id)
                    if rank == 1:
                        if answer == 'A':
                            dict_is_choice[key] = 1
                        else:
                            dict_is_choice[key] = 0
                    elif rank == 2:
                        if key in dict_is_choice:
                            if dict_is_choice[key] == 1:
                                exam_list_records.append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                                self.dict_student_records[student_id].append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                    elif rank == 3:
                        self.dict_content_answer[key] = answer
            else:
                practice_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

        return exam_list_records, practice_list_records

    def getStudentId(self, file_name='students.txt'):
        list_students = []
        with open(file_name) as handle_f:
            for line in handle_f:
                system_id = long(line.strip())
                list_students.append(system_id)

        set_studends = set(list_students)  # students set
        str_content = ','.join([str(x) for x in set_studends])
        sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content
        dict_students = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, name = row
            if student_id in set_studends:
                dict_students[student_id] = name

        return dict_students

    def getRightDifficulty(self):
        dict_question_diff = {}
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on  b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_diff

    def getChildQuestion(self):
        dict_parent_list = {}
        dict_parent_question = {}
        sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \
                where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            qid = row[0]
            parent_id = row[1]
            if parent_id not in dict_parent_list:
                dict_parent_list[parent_id] = []

            dict_parent_list[parent_id].append(qid)

        for parent_id in dict_parent_list:
            sort_question = sorted(dict_parent_list[parent_id], reverse=False)
            count = 1
            for item in sort_question:
                dict_parent_question[item] = {
                    'parent_id': parent_id,
                    'rank': count
                }
                count += 1

        return dict_parent_question

    def getRelationQuestion(self):
        dict_realtion_quesion = {}
        dict_point_question_num = {}
        sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id = \
            b.id  where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        dict_parent_quesion = {}
        link_parent_list = []
        for row in rows:
            link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = row
            dict_realtion_quesion[
                link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type
            arr_point = link_point_id.strip().split(',')
            for point in arr_point:
                if point not in dict_point_question_num:
                    dict_point_question_num[point] = []

                dict_point_question_num[point].append([
                    link_question_id, question_num, difficulty,
                    question_exercise_id, question_type
                ])

        return dict_realtion_quesion, dict_point_question_num

    def getScore(self, question_num, answer):
        ascore, score = 0, 0
        if question_num >= 1 and question_num <= 16:
            score = 5 if (answer == 'A' or answer == 'B') else 0
            ascore = 5
        elif question_num >= 17 and question_num <= 21:
            if answer == 'A':
                score = 4
            elif answer == 'B':
                score = 7
            elif answer == 'C':
                score = 9
            elif answer == 'D':
                score = 12
            ascore = 12
        elif question_num >= 22 and question_num <= 23:
            if answer == 'A':
                score = 4
            elif answer == 'B':
                score = 7
            elif answer == 'C':
                score = 9
            elif answer == 'D':
                score = 10
            ascore = 10

        return ascore, score

    def statQustionReport(self):
        dict_question_count, dict_question_right = {}, {}
        dict_point_count, dict_point_right = {}, {}
        dict_error_students, dict_right_students = {}, {}

        for item in self.exam_list_records:
            question, student_id, ret, answer, submit_time, question_type = item

            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = link_point_id.strip().split(',')
                ascore, score = self.getScore(question_num, answer)
                is_right = (answer == 'A') or (answer == 'B')

                # 1 point
                for point in arr_points:
                    if point in self.dict_point:
                        name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                            point]
                        if name not in dict_point_count:
                            dict_point_count[name] = 0
                            dict_point_right[name] = 0

                        dict_point_right[name] += 1 if (
                            question_num <= 16 and is_right) or (
                                question_num >= 17 and answer == 'D') else 0  #
                        dict_point_count[name] += 1

                if link_question_id not in dict_question_count:
                    dict_question_right[link_question_id] = 0
                    dict_question_count[link_question_id] = 0

                dict_question_right[
                    link_question_id] += 1 if ret == 1 else 0  #
                dict_question_count[link_question_id] += 1

                # questions
                if link_question_id not in dict_right_students:
                    dict_right_students[link_question_id] = []
                    dict_error_students[link_question_id] = []

                if (question_num <= 16 and is_right) or (question_num >= 17
                                                         and answer == 'D'):
                    dict_right_students[link_question_id].append(student_id)
                else:
                    dict_error_students[link_question_id].append(student_id)

                if student_id not in self.dict_student_score:
                    self.dict_student_score[student_id] = 0

                self.dict_student_score[student_id] += score

        dict_point_acc = {}
        for point_name in dict_point_count:
            acc = dict_point_right[point_name] / dict_point_count[point_name]
            dict_point_acc[point_name] = acc
            #print '%s \t%s' % (point_name, acc)

        # students
        dict_students_point_question = self.calStudentPoints(dict_point_acc)
        return dict_students_point_question

    def getErrorText(self, answer):
        if answer == 'C':
            return '审题时,粗心看错了'
        elif answer == 'D':
            return '审题时,题意读不懂'
        elif answer == 'E':
            return '析题时,思路模糊不清'
        elif answer == 'F':
            return '析题时,思路方向错误'
        elif answer == 'G':
            return '解题时,公式记错了'
        elif answer == 'H':
            return '解题时,计算出错了'
        elif answer == 'I':
            return '未作答(或未完成),时间来不及了'
        elif answer == 'A' or answer == 'B':
            return '答题正确'

    def getQuestionWords(self):
        dict_question_words = {}
        str_question = ','.join([
            str(self.dict_realtion_quesion[x][0])
            for x in self.dict_realtion_quesion
        ])
        str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \
                    t.type_id=c.question_type_id where c.id in (%s)" % str_question

        s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata')
        for row in s_rows:
            question, subject_id, type_id, struct_id, json_data = row
            if json_data != '' and json_data != 'None' and json_data != None:
                keywords, res = sq.predict(question, subject_id, type_id,
                                           struct_id, json_data)
                dict_question_words[question] = keywords, res

        return dict_question_words

    def calStudentPoints(self, dict_point_acc):
        dict_coef = {x: 1.0 for x in self.dict_point}  #
        bate = 0.25
        dict_student_points = {}
        dict_students_point_question = {}  # student -> error question
        for student_id in self.dict_students:
            if student_id not in dict_student_points:
                dict_student_points[student_id] = {}

            records = sorted(self.dict_student_records[student_id],
                             key=lambda x: x[-1],
                             reverse=True)
            dict_error = {
                'C': 0.0,
                'D': 0.0,
                'E': 0.0,
                'F': 0.0,
                'G': 0.0,
                'H': 0.0,
                'I': 0.0
            }
            mass_score = 0
            dict_question_point = {}
            for item in records:
                question, student_id, ret, answer, submit_time, question_type = item
                if answer == 'answer': continue
                if question in self.dict_realtion_quesion:
                    link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                        question]
                    if question_exercise_id != self.exercise_id: continue
                    if question_type == '单选题' or question_type == '选择题':
                        qtype = 1
                    elif question_type == '填空题':
                        qtype = 2
                    else:
                        qtype = 3

                    difficulty = self.dict_question_diff[link_question_id]
                    arr_points = link_point_id.strip().split(',')
                    a_score, score = self.getScore(question_num, answer)
                    content = '%s_%s' % (question, student_id)
                    if content in self.dict_content_answer:
                        answer = self.dict_content_answer[content]
                        if answer != 'A' and answer != 'B' and answer != 'answer':
                            dict_error[answer] += a_score - score
                            if answer == 'C' or answer == 'G' or answer == 'H':
                                mass_score += a_score - score

                        for point in arr_points:
                            if point not in dict_question_point:
                                dict_question_point[point] = []

                            dict_question_point[point].append(
                                (link_question_id, qtype, submit_time))
                            name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                                point]
                            acc = dict_point_acc[name]
                            if point not in dict_student_points[student_id]:
                                dict_student_points[student_id][point] = 0.0

                            dict_student_points[student_id][
                                point] += dict_coef[point] * (
                                    a_score - score
                                ) * acc / math.log(1 + bate *
                                                   (level + difficulty))

            sort_error = sorted(dict_error.items(),
                                key=lambda x: x[1],
                                reverse=True)
            if len(dict_student_points[student_id]) > 0:
                sort_point = sorted(dict_student_points[student_id].items(),
                                    key=lambda x: x[1],
                                    reverse=True)
                student_name = self.dict_students[student_id]
                point_name1, ptype1, question_type1, level1, parent_id1, link_id1 = self.dict_point[
                    point]
                point_name2, ptype2, question_type2, level2, parent_id2, link_id2 = self.dict_point[
                    sort_point[1][0]]
                point_name3, ptype3, question_type3, level3, parent_id2, link_id3 = self.dict_point[
                    sort_point[2][0]]

                is_power = 1 if mass_score > 5 else 0
                error1, error1score = sort_error[0][0], sort_error[0][1]
                error2, error2score = sort_error[1][0], sort_error[1][1]
                error1_name, error2_name = self.getErrorText(
                    error1), self.getErrorText(error2)
                dict_students_point_question[
                    student_id] = sort_point, dict_question_point

        return dict_students_point_question

    def getPoint2Question(self):
        dict_point_org_question = {}
        for answer_id in self.dict_realtion_quesion:
            link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                answer_id]
            arr_point = link_point_id.strip().split(',')

            for item in arr_point:
                if item not in dict_point_org_question:
                    dict_point_org_question[item] = []

                dict_point_org_question[item].append(link_question_id)

        return dict_point_org_question

    def pointsRecQuestion(self,
                          dict_students_point_question,
                          dict_diff,
                          throld=4):
        # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty
        # dict_point_org_question = self.getPoint2Question() # org question point
        dict_question_text = self.getQuestionWords()
        dict_question_target = {}
        dict_student_recommend_question = {}
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
            '学生id', '学生名字', '考点', '考点名称', '原题id', '推荐习题id', '关键词', '难度')
        for student_id in dict_students_point_question:

            arr_point, dict_question_point_target = dict_students_point_question[
                student_id]
            student_name = self.dict_students[student_id]
            haveTime = 12
            recommend_questions = []
            pre_set = set()
            for item_target in arr_point:
                if haveTime <= 0 and len(recommend_questions) % 3 == 0: break
                item_point = item_target[0]
                arr_question = sorted(dict_question_point_target[item_point],
                                      key=lambda x: x[-1],
                                      reverse=True)

                key = '%s-%s' % (student_id, item_point)
                rateRight, difficulty = dict_diff[
                    key] if key in dict_diff else (0.0, 1)

                pre_set = pre_set | set([x[0] for x in arr_question])  #
                for base_question, qtype, submit_time in arr_question:
                    if haveTime <= 0 and len(recommend_questions) % 3 == 0:
                        break
                    keywords, text = dict_question_text[base_question]
                    text = text.replace('\n', '').strip('\r').strip()  # filter

                    base_question_key = '%s-%s' % (base_question, difficulty)
                    if base_question_key in dict_question_target:
                        target_questions = dict_question_target[
                            base_question_key]
                    else:
                        target_questions = self.recommend.getEsResult(
                            base_question, text, keywords, difficulty, qtype,
                            pre_set, throld)
                        dict_question_target[
                            base_question_key] = target_questions

                    for rec_question in target_questions:
                        if haveTime <= 0 and len(recommend_questions) % 3 == 0:
                            break
                        if rec_question in pre_set: continue

                        haveTime += -1
                        recommend_questions.append(
                            (base_question, keywords, rec_question, item_point,
                             difficulty))
                        pre_set.add(rec_question)

            for rec_item in recommend_questions:
                source_question, keywords, rec_question, item_point, difficulty = rec_item
                point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                    item_point]
                print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    student_id, student_name, item_point, point_name,
                    source_question, rec_question, keywords, difficulty)

    def importDefault(self, is_new, dict_question_topic):
        defult_list = [(10805815, 0), (10806190, 2), (10805815, 4),
                       (10806184, 6), (10806182, 8), (10806169, 10),
                       (10806104, 12), (10806057, 14), (10805815, 16),
                       (10806068, 18)]
        update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values"
        new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values"
        db_rows = self.db_fetcher.get_sql_result(
            "select student_id from entity_student_white_list",
            "mysql_white_list")
        is_first = 1
        for row in db_rows:
            student_id = row[0]
            for item in defult_list:
                question_id, subject_id = item

                type_id = 2186
                if question_id in dict_question_topic:
                    set_topic = dict_question_topic[question_id]
                    if len(set_topic) > 0:
                        type_id = set_topic.pop()

                if is_first == 1:
                    update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                        student_id, question_id, subject_id, 0)
                    if is_new == 1:
                        new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                            student_id, question_id, subject_id, 0, 1, 3,
                            type_id)
                    is_first = 0
                else:
                    update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                        student_id, question_id, subject_id, 0)
                    new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                        student_id, question_id, subject_id, 0, 1, 3, type_id)

        if is_new == 1:
            self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list')
        else:
            self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')

    def import2DataBase(self, flag, is_new=1, fname='student_rec.txt'):
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        if is_new == 1:
            self.db_fetcher.commit_sql_cmd(
                "delete from sync_student_recommend_question",
                'mysql_v3_white_list')  # update
        else:
            self.db_fetcher.commit_sql_cmd(
                "delete from entity_recommend_question_bytopic",
                'mysql_white_list')  # update

        # self.importDefault(is_new, dict_question_topic) # import
        if flag == 1:  # 1,3 5 recomend
            update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id,topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values"
            insert_sql = "insert into entity_question_recommend(student_id,student_name,point,point_name,question_id,recommend_id,keywords,difficulty) values"

            new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values"
            insert_score, is_first = 0, 1
            with open(fname) as rec_f:
                for line in rec_f:
                    if insert_score > 1:
                        arr = line.strip().split('\t')
                        student_id, question_id = arr[0], long(arr[5])
                        type_id = 2186
                        if question_id in dict_question_topic:
                            set_topic = dict_question_topic[question_id]
                            if len(set_topic) > 0:
                                type_id = set_topic.pop()

                        if is_first == 1:
                            update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, question_id, 0, insert_score)
                            insert_sql += "(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % (
                                arr[0], arr[1], arr[2], arr[3], arr[4], arr[5],
                                arr[6], arr[7])
                            if is_new == 1:
                                new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, question_id, 0, insert_score,
                                    1, 3, type_id)
                            is_first = 0
                        else:
                            update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, question_id, 0, insert_score)
                            insert_sql += ",(%s,\'%s\',%s, \'%s\', %s, %s, \'%s\',%s)" % (
                                arr[0], arr[1], arr[2], arr[3], arr[4], arr[5],
                                arr[6], arr[7])
                            if is_new == 1:
                                new_sql += ",(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, question_id, 0, insert_score,
                                    1, 3, type_id)

                    insert_score += 1

            # self.db_fetcher.commit_sql_cmd(insert_sql, 'mysql_logdata')
            if is_new == 1:  # update mysql
                self.db_fetcher.commit_sql_cmd(new_sql,
                                               'mysql_v3_white_list')  # new
            else:
                self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')

    def getPoints(self):
        dict_point = {}
        dict_topic_point = {}
        sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points'  # do something
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row
            point_id = str(point_id)
            dict_point[
                point_id] = name, ptype, question_type, level, parent_id, link_id
            if ptype == 3:  #
                arr_topic = [long(x) for x in topics_id.strip().split(',')]
                for topic in arr_topic:
                    if topic not in dict_topic_point:
                        dict_topic_point[topic] = []

                    dict_topic_point[topic].append(point_id)

        return dict_point, dict_topic_point

    def getStudentResource(self):
        dict_students_resource = {}
        str_content = ','.join([str(x) for x in self.dict_students])
        sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \
            a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % (
            str_content)

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, resource_id, deadline = row
            if resource_id > 0:
                if student_id not in dict_students_resource:
                    dict_students_resource[student_id] = set()

                dict_students_resource[student_id].add(resource_id)

        return dict_students_resource

    def getQuestionBaseInfo(self, question_set):
        dict_question_difficulty = {}
        str_content = ','.join([str(x) for x in question_set])
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question  where b.id in (%s)" % str_content
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_difficulty

    def updateStuAdapt2Difficult(self, range_throld=10, difficulty_throld=10):
        dict_exam_question = {}
        dict_exam_difficulty = {}
        dict_question_topic = self.recommend.getThisQuestionTopic()  #

        for exam_item in self.exam_list_records:
            question, student_id, ret, answer, submit_time, question_type = exam_item
            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_point = link_point_id.strip().split(',')
                is_right = 1 if answer == 'A' or answer == 'B' else 0

                for point in arr_point:
                    key = '%s-%s' % (student_id, point)
                    if key not in dict_exam_question:
                        dict_exam_question[key] = []
                        dict_exam_difficulty[key] = {}

                    record = (question, is_right, difficulty, submit_time,
                              question_type)
                    dict_exam_question[key].append(record)
                    if difficulty not in dict_exam_difficulty[key]:
                        dict_exam_difficulty[key][difficulty] = []
                    dict_exam_difficulty[key][difficulty].append(record)

        question_set = set([x[0] for x in self.practice_list_records])
        dict_question_difficulty = self.getQuestionBaseInfo(question_set)

        for practice_item in self.practice_list_records:
            question, student_id, ret, answer, submit_time, question_type = practice_item
            is_right = 1 if ret == 1 else 0
            if question in dict_question_difficulty:
                difficulty = self.dict_question_difficulty[question]
                if question in dict_question_topic:
                    topic_set = dict_question_topic[question]
                    for topic in topic_set:
                        if topic in self.dict_topic_point:
                            point = self.dict_topic_point[topic]
                            key = '%s-%s' % (student_id, point)
                            if key not in dict_exam_question:
                                dict_exam_question[key] = []
                                dict_exam_difficulty[key] = {}

                            if difficulty not in dict_exam_difficulty[key]:
                                dict_exam_difficulty[key][difficulty] = []
                            record = (question, is_right, difficulty,
                                      submit_time, question_type)
                            dict_exam_difficulty[key][difficulty].append(
                                record)
                            dict_exam_question[key].append(record)

        dict_res = {}
        self.db_fetcher.commit_sql_cmd('delete from entity_student_feature',
                                       'mysql_logdata')
        for key in dict_exam_question:
            arr_content = key.split('-')
            student_id, point = arr_content[0], arr_content[1]
            question_records = sorted(dict_exam_question[key],
                                      key=lambda x: x[-1],
                                      reverse=True)
            right_cnt = 0
            throld = min(len(question_records), range_throld)
            #if throld < 10: continue
            for record_item in question_records[:throld]:
                question, is_right, difficulty, submit_time, question_type = record_item
                right_cnt += is_right

            rateRight = right_cnt / range_throld
            adaDiff, pDiff = 0, -1
            for difficulty in dict_exam_difficulty[key]:
                drecords = sorted(dict_exam_difficulty[key][difficulty],
                                  key=lambda x: x[-1],
                                  reverse=True)
                difficulty_throld = min(len(drecords), difficulty_throld)
                #if difficulty_throld < 4: continue
                dright_cnt = 0
                for diff_item in drecords[:difficulty_throld]:
                    question, is_right, difficulty, submit_time, question_type = diff_item
                    dright_cnt += is_right

                drateRight = dright_cnt / difficulty_throld
                if drateRight > pDiff:
                    adaDiff = difficulty
                    pDiff = drateRight

            point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                point]
            insert_sql = "insert into entity_student_feature(student_id, point_id, point_name, adapt_difficulty, master) values(%s, %s, \'%s\', %s, %s)" \
                    % (student_id, point, point_name, adaDiff, rateRight)

            insert_id = self.db_fetcher.commit_sql_cmd(insert_sql,
                                                       'mysql_logdata')
            dict_res[key] = rateRight, difficulty

        return dict_res

    def getStudentScore(self):
        dict_student_score = {}
        for exam_item in self.exam_list_records:
            exam_question, student_id, ret, answer, submit_time, question_type = exam_item

            if exam_question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    exam_question]

                a_score, i_score = self.getScore(question_num, answer)
                exam_key = '%s-%s' % (student_id, question_exercise_id)

                if exam_key not in dict_student_score:
                    dict_student_score[exam_key] = 0
                dict_student_score[exam_key] += i_score

        return dict_student_score

    def getFirstPoint(self):  # point
        dict_relation_point13 = {}
        sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points  \
                b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3"

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point3, name3, point1, name1 = row
            dict_relation_point13[point3] = name3, point1, name1

        return dict_relation_point13

    def getExerciseName(self):
        dict_exercise_name = {}
        sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \
                on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time

        exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for exercise_row in exercise_rows:
            exercise_id, resource_name = exercise_row
            dict_exercise_name[exercise_id] = resource_name

        return dict_exercise_name

    def getExamData(self, dict_student_score, dict_relation_point13,
                    dict_exercise_name):
        for exam_item in self.exam_list_records:  # exam
            question, student_id, ret, answer, submit_time, question_type = exam_item
            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = [int(x) for x in link_point_id.strip().split(',')]
                exam_key = '%s-%s' % (student_id, question_exercise_id)
                str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
                is_first = True
                for point3 in arr_points:
                    name3, point1, name1 = dict_relation_point13[point3]
                    if is_first == True:
                        str_point3, str_name3, str_point1, str_name1 = str(
                            point3), str(name3), str(point1), str(name1)
                        is_first = False
                    else:
                        str_point3 += ',%d' % point3
                        str_name3 += ',%s' % name3
                        str_point1 += ',%d' % point1
                        str_name1 += ',%s' % name1

                resource_name = dict_exercise_name[question_exercise_id]
                exercise_score = dict_student_score[
                    exam_key] if exam_key in dict_student_score else 0
                a_score, i_score = self.getScore(question_num, answer)

                error_text = self.getErrorText(answer)
                print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    student_id, link_question_id, question_type, str_point1,
                    str_name1, str_point3, str_name3, submit_time,
                    resource_name, exercise_score, question_exercise_id,
                    a_score, i_score, difficulty, ret, question_num,
                    error_text, 1)

    def getPracticeData(self, dict_relation_point13, dict_question_topic,
                        dict_question_base_info):
        for practice_item in self.practice_list_records:  # practice
            question, student_id, ret, answer, submit_time, question_type = practice_item
            str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
            is_first = True
            if question in dict_question_topic:
                if question in dict_question_topic:
                    topic_set = dict_question_topic[question]
                    for topic in topic_set:
                        if topic in self.dict_topic_point:
                            arr_point3 = [
                                int(x) for x in self.dict_topic_point[topic]
                            ]
                            for point3 in arr_point3:
                                name3, point1, name1 = dict_relation_point13[
                                    point3]
                                if is_first == True:
                                    str_point3, str_name3, str_point1, str_name1 = str(
                                        point3), str(name3), str(point1), str(
                                            name1)
                                    is_first = False
                                else:
                                    str_point3 += ',%s' % point3
                                    str_name3 += ',%s' % name3
                                    str_point1 += ',%s' % point1
                                    str_name1 += ',%s' % name1

            a_score, i_score = 0, 0
            # 单选题 选择题 填空题
            if question_type == '单选题' or question_type == '选择题' or question_type == '填空题':
                a_score = 5
            else:
                a_score = 12

            if ret == 1:
                i_score = a_score
            elif ret == 5:
                i_score = 0.5 * a_score

            difficulty, qtype = 1, 1
            if question in dict_question_base_info:
                difficulty, qtype = dict_question_base_info[question]

            errorText = '正确' if ret == 1 else '错误'
            print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                student_id, question, question_type, str_point1, str_name1,
                str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score,
                i_score, difficulty, ret, 0, errorText, 0)

    def getData(self):
        dict_student_score = self.getStudentScore()
        dict_relation_point13 = self.getFirstPoint()  # point
        dict_exercise_name = self.getExerciseName()

        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        dict_question_base_info = self.recommend.getThisQuestionBaseInfo(
        )  # question quality
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \
            ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型')

        self.getExamData(dict_student_score, dict_relation_point13,
                         dict_exercise_name)  # exam
        self.getPracticeData(dict_relation_point13, dict_question_topic,
                             dict_question_base_info)  # practice

    def recommendMonday(self, is_new=0, throld=500):
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        d1 = datetime.datetime.now()
        d3 = d1 + datetime.timedelta(days=-14)
        str_monday = d3.strftime("%Y-%m-%d 00:00:00")

        self.db_fetcher.commit_sql_cmd(
            "delete from entity_recommend_question_bytopic",
            'mysql_white_list')  # update
        self.importDefault(0, dict_question_topic)  # import
        is_first, insert_score = 1, 0
        new_sql = "insert into sync_student_recommend_question(system_id, resource_type, resource_id, subject_id, tag1, tag2, score, type, type_level, type_id) values"
        update_sql = "insert into entity_recommend_question_bytopic(system_id, type,chapter_id, topic_id, question_id, `master`, duration, important, subject_id, score, school_publish, org_id, org_type) values"
        dict_student_cnt = {}
        records = self.exam_list_records  #+ self.practice_list_records
        for item in records:
            question, student_id, ret, answer, submit_time, question_type = item
            if submit_time > str_monday:
                if question in self.dict_realtion_quesion:
                    link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                        question]

                    answer_key = '%s_%s' % (question, student_id)
                    if answer_key in self.dict_content_answer:
                        answer = self.dict_content_answer[answer_key]
                    if answer == 'A' or answer == 'B':
                        ret = 1
                    else:
                        ret = 2

                if ret != 1:
                    if student_id not in dict_student_cnt:
                        dict_student_cnt[student_id] = 0
                    if dict_student_cnt[student_id] < throld:
                        # print '%s\t%s' % (student_id, link_question_id)
                        if is_first == 1:
                            update_sql += "(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, link_question_id, 0, insert_score)
                            if is_new == 1:
                                new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, link_question_id, 0,
                                    insert_score, 1, 3, 2186)
                            is_first = 0
                        else:
                            update_sql += ",(%s, 2, 0, 0, %s, 2, 2, 1, %s, %s, 0, 113, 2)" % (
                                student_id, link_question_id, 0, insert_score)
                            if is_new == 1:
                                new_sql += "(%s, 1, %s, %s, \'最近新错\', \'重难点\', %s, %s, %s, %s)" % (
                                    student_id, link_question_id, 0,
                                    insert_score, 1, 3, 2186)

                        dict_student_cnt[student_id] += 1
                        insert_score += 1

        if is_new == 1:
            self.db_fetcher.commit_sql_cmd(new_sql, 'mysql_v3_white_list')
        else:
            self.db_fetcher.commit_sql_cmd(update_sql, 'mysql_white_list')
Exemplo n.º 3
0
class CzMathGraph(object):
    """docstring for czMathGraph"""
    def __init__(self, stage, subject):
        self.fetcher = DataBaseFetcher()
        self.topicDict, self.topicImportDict = self.getTopic(stage, subject)
        self.quesTopicDict, self.topicQuestionDict = self.getQuestionTopic(
        )  # static

        # student exercise
        self.recordList = self.cacheRecord('question_result.txt')
        self.topicStudentDict = self.getTopicStudent()

    # sample
    def cacheRecord(self, fName, maxWin=3000000):
        recordList = []
        pos = 0
        with open(fName) as handleFile:
            for line in handleFile:
                arrContent = line.strip().split('\t')
                studyId, questionId, res = int(arrContent[1]), int(
                    arrContent[2]), arrContent[5]
                isFlag = False
                if questionId in self.quesTopicDict:
                    topicSet = self.quesTopicDict[questionId]
                    for topic in topicSet:
                        if topic in self.topicDict:
                            isFlag = True
                            break

                if isFlag == False: continue
                # normalize data
                try:
                    res = int(res)
                except:
                    try:
                        result = 0
                        decode_json = json.loads(res)
                        for resDict in decode_json:
                            if resDict['result'] == 1:
                                result += 1
                            else:
                                result -= 1

                        res = 1 if result > 0 else 2
                    except:
                        pass

                if res != 1: continue  # 过滤负样本
                recordList.append((studyId, questionId, res))
                pos += 1
                if pos >= maxWin:
                    break

        return recordList

    def getTopicStudent(self):
        topicStudentDict = {}
        for item in self.recordList:
            studyId, questionId, res = item

            if questionId not in self.quesTopicDict: continue
            for topic in self.quesTopicDict[questionId]:
                if topic not in topicStudentDict:
                    topicStudentDict[topic] = {}

                if studyId not in topicStudentDict[topic]:
                    topicStudentDict[topic][studyId] = 0

                topicStudentDict[topic][studyId] += 1

        return topicStudentDict

    # 统计条件概率
    def staticTopicProb(self, topic, topicList):
        if topic not in self.topicStudentDict:
            return 0.0

        fenmuSet = set()  # 条件student集合
        isFirst = True
        for item in topicList:
            if item not in self.topicStudentDict: continue
            itemSet = set([x for x in self.topicStudentDict[item]])
            if isFirst:
                fenmuSet = itemSet
            else:
                fenmuSet = fenmuSet & itemSet

        fenmu, fenzi = 0.0, 0.0
        for student in fenmuSet:
            for item in topicList:
                fenmu += self.topicStudentDict[item][student]

            if student in self.topicStudentDict[topic]:
                fenzi += self.topicStudentDict[topic][student]

        p = fenzi / fenmu if fenmu > 0 and fenzi < fenmu else 0.0
        return p

    #  获取词典
    def getTopic(self, stage, subjectId):
        topicDict = {}
        topicImportDict = {}
        sql = "select id, name, is_important from entity_topic where subject_id= %s and stage_id = %s" % (
            subjectId, stage)
        db_rows = self.fetcher.get_sql_result(sql, "mysql_v3")
        for row in db_rows:
            topicId, name, important = row
            topicDict[topicId] = name
            topicImportDict[topicId] = important

        return topicDict, topicImportDict

    # 获取问题主题
    def getQuestionTopic(self):
        quesTopicDict = {}
        topicQuestionDict = {}
        db_rows = self.fetcher.get_sql_result(
            "select topic_id, question_id from link_question_topic ",
            "mysql_v3")
        for row in db_rows:
            topicId, questionId = row
            if topicId not in topicQuestionDict:
                topicQuestionDict[topicId] = []

            topicQuestionDict[topicId].append(questionId)

            if questionId not in quesTopicDict:
                quesTopicDict[questionId] = []

            quesTopicDict[questionId].append(topicId)

        quesTopicDict = {
            key: set(value)
            for key, value in quesTopicDict.items()
        }  # list to set
        topicQuestionDict = {
            key: set(value)
            for key, value in topicQuestionDict.items()
        }

        return quesTopicDict, topicQuestionDict

    # 统计问题正确率
    def statisQuestion(self, subject_id):
        questionDict = {}
        db_rows = self.fetcher.get_sql_result(
            "select question_id, sum(answer_num) answernum, sum(right_num) rightnum from entity_trail_question where subject_id = %s GROUP BY question_id "
            % subject_id, "mysql_v3")
        for row in db_rows:
            questionId, answer, right = row
            questionDict[questionId] = answer, right

        return questionDict

    def isLoopGraph(self, pairTopic, iniRelList, childDict):
        startTopic, endTopic = pairTopic
        if startTopic == endTopic: return True  # loop
        stackList = [startTopic]
        missSet, popSet = set(), set()  # init set and pop set
        while len(stackList) > 0:
            if endTopic in missSet:
                return True  # the graph is unicom

            topic = stackList[-1]
            stackList.pop()  # pop
            if (topic not in childDict) or topic in missSet: continue
            popSet.add(topic)  # add pop set
            childSet = childDict[topic]
            for child in childSet:
                stackList.append(child)
                missSet.add(child)

        return False

    # 构造有像图知识网络
    def getTopicDAG(self, threshold=30, inDegreeThr=3, outDegreeThr=4):
        iniRelList = []
        relDict = {}
        topicList = [topic for topic in self.topicDict]
        for i in xrange(len(topicList)):
            for j in xrange(i + 1, len(topicList)):
                if topicList[i] in self.topicQuestionDict and topicList[
                        j] in self.topicQuestionDict:
                    iSet, jSet = self.topicQuestionDict[
                        topicList[i]], self.topicQuestionDict[topicList[j]]
                    key = "%s-%s" % (topicList[i], topicList[j])
                    mLen = len(iSet & jSet)
                    if mLen > threshold:
                        relDict[key] = mLen

        sortRel = sorted(relDict.items(), key=lambda x: x[1], reverse=True)
        inDegreeDict = {}  # 入度记录字典
        outDegreeDict = {}  #  出度记录字典
        childDict = {}  # child
        for item in sortRel:
            inDegreeThr, outDegreeThr = 3, 4
            arrTopic = [int(x) for x in item[0].split("-")]
            if len(arrTopic) != 2: continue

            start, end = arrTopic[0], arrTopic[1]

            # 判断转移方向
            p0 = self.staticTopicProb(start, [end])
            p1 = self.staticTopicProb(end, [start])
            pairTopic = (start, end) if p1 < p0 else (end, start)
            start, end = pairTopic  #

            if self.topicImportDict[start] < 1:
                outDegreeThr = 1

            if self.topicImportDict[end] < 1:
                inDegreeThr = 1

            inDegree = inDegreeDict[end] if end in inDegreeDict else 0
            outDegree = outDegreeDict[start] if start in outDegreeDict else 0

            isLoop = self.isLoopGraph(pairTopic, iniRelList, childDict)
            bFlag = (isLoop == False) and (inDegree < inDegreeThr) and (
                outDegree < outDegreeThr)
            if bFlag:
                iniRelList.append(pairTopic)
                if end not in childDict:
                    childDict[end] = set()

                childDict[end].add(start)
                # 出度 and 入度
                outDegreeDict[start] = 1 + outDegree
                inDegreeDict[end] = 1 + inDegree

        return iniRelList

    # 构建系数矩阵数组
    def getRowsNormalize(self, topicList):
        rowsData = []
        topicLen = len(topicList)
        topicIndex = {topicList[i]: i for i in xrange(topicLen)}
        for item in self.recordList:
            studyId, questionId, res = item
            arr = [0] * topicLen
            if questionId in self.quesTopicDict:
                topicSet = self.quesTopicDict[questionId]
                for topic in topicSet:
                    if topic in topicIndex:
                        arr[topicIndex[topic]] = 1

            sparr = pd.SparseArray(arr)
            rowsData.append(arr)

        return rowsData

    def makeModel(self):
        # graph structure
        initRelation = self.getTopicDAG()  # 概率图构建
        relationList = []
        stuModel = BayesianModel()  # DAG
        for edge in initRelation:
            try:
                start, end = edge
                stuModel.add_edge(str(start), str(end))
                relationList.append(edge)
            except:
                continue

        # save file
        with open('model.txt', 'w') as write_f:
            for item in relationList:
                write_f.write(
                    '%s,%s\n' %
                    (self.topicDict[item[0]], self.topicDict[item[1]]))

        # learning from data
        topicList = [topic for topic in self.topicDict]
        rowsData = self.getRowsNormalize(topicList)
        print 'the rows len is:%s' % len(rowsData)
Exemplo n.º 4
0
    info += "</tr>"
    for data in datas:
        if type(data)==list:
            info += "<tr>"
            for k in data:
                info += "<td>%s</td>" % str(k)
            info += "</tr>"
        else:
            info += "<tr style='background-color:#EEE;'><td colspan='%d'><b>%s</b></td></tr>" % (len(title), str(data))
    info += "</table>"
    return info

if __name__=="__main__":
    domain_list = get_domain_list()
    now_time = datetime.datetime.now()
    pre_time = now_time + datetime.timedelta(days=-3)
    pre_date = pre_time.strftime('%Y-%m-%d')

    title = '异常抓取站点'
    data_list = []
    for domain in domain_list:
        sel_sql = "select count(*) from news_report where publish_date >= \'%s 00:00:00\' and domain = \'%s\' and type = 0 limit 1" % (pre_date, domain)
        db_data = db_fetcher.get_sql_result(sel_sql, 'mysql_insight')
        domain_count = int(db_data[0][0])

        if domain_count == 0:
            data_list.append(domain)
    data_list = list(set(data_list))
    info = create_html_table(data_list, title)
    print info
Exemplo n.º 5
0
        if etype == 0:
            for key in biandong_list:
                if unicode(key) in title:
                    etype = 3
    return etype


if __name__ == "__main__":
    # update news_report company
    now_time = datetime.datetime.now()
    yes_time = now_time + datetime.timedelta(days=-1)
    yes_date = yes_time.strftime('%Y-%m-%d')
    condition_sql = "publish_date >= \'%s 00:00:00\' and publish_date <= '%s 23:59:59' and type = 0" % (
        yes_date, yes_date)
    db_data = db_fetcher.get_sql_result(
        'select id, title, content from news_report where %s' % condition_sql,
        'mysql_insight')
    for pos in range(len(db_data)):
        kid, title, content = str(db_data[pos][0]), str(db_data[pos][1]), str(
            db_data[pos][2])
        try:
            topic_coms = comlib.extract_report(
                title, content.encode('utf8', 'ignore'), '-1')
            reported_coms = ",".join([x[0] for x in topic_coms])
            com_id, is_confident = comlib.extract_report(
                title, content.encode('utf8', 'ignore'))
            reported_com = 0
            if com_id and com_id != "0" and is_confident:
                reported_com = int(com_id)
            event_type = get_event_type(title)
            update_sql = "update news_report set reported_com= %s, reported_coms = \'%s\',event_type = %s  where id = %s" % (
Exemplo n.º 6
0
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')))
from common.db_fetcher import DataBaseFetcher

db_fetcher = DataBaseFetcher()

if __name__ == "__main__":
    # update news_report company
    now_time = datetime.datetime.now()
    yes_time = now_time + datetime.timedelta(days=-1)
    yes_date = yes_time.strftime('%Y-%m-%d')
    db_data = db_fetcher.get_sql_result(
        'select id, article_json, publish from kr_articles where publish = \'%s\''
        % yes_date, 'mysql_insight')
    for pos in range(len(db_data)):
        kid = str(db_data[pos][0])
        try:
            article_json = str(db_data[pos][1])
            article_obj = json.loads(article_json, strict=False)

            url = 'http://36kr.com/p/%s.html' % kid
            title = article_obj['title']
            source = "36Kr"
            publish_date = '%s 00:00:00' % str(db_data[pos][2])
            domain = '36kr.com'
            content_html = article_obj['content'].encode(
                'utf-8'
            ) if 'content' in article_obj and article_obj['content'] else ''
Exemplo n.º 7
0
class TextAnalysis(object):
    media_dict = None
    entity_dict = None
    seo_extractor = SeoExtractor('../seo/com_df_idf')

    def __init__(self, mysql_fetcher=None, handler_redis=None):
        self.mysql_fetcher = mysql_fetcher
        if mysql_fetcher is None:
            self.mysql_fetcher = DataBaseFetcher()
        self.handler_redis = handler_redis
        if handler_redis is None:
            self.handler_redis = redis.Redis(
                host='1c34f95e1b12494b.m.cnbja.kvstore.aliyuncs.com',
                password='******',
                port=6379,
                db=5)
        self.load_entity_dict()

    def set_media_id(self, media_id):
        self.media_dict = self.get_media_dict(media_id)

    # 通过media_id 与向量字典关联
    def set_media_dict(self, media_id, forward_index_dict):
        if media_id in forward_index_dict:
            self.media_dict = forward_index_dict[media_id]
        else:
            self.media_dict = self.get_media_dict(media_id)

    def load_entity_dict(self):
        self.entity_dict = set()
        # handle company
        company_file = '../../company_recognize/com_name2id.txt'
        with open(company_file) as com_f:
            for line in com_f:
                tokens = line.strip().split('\t')
                if len(tokens) != 2:
                    continue
                self.entity_dict.add(tokens[0])

        # handle industry
        db_industry = self.mysql_fetcher.get_sql_result(
            'select distinct name from dict_industry', 'mysql_crm')
        for row_industry in db_industry:
            self.entity_dict.add(row_industry[0])

        # handle organization
        db_organization = self.mysql_fetcher.get_sql_result(
            'select distinct name_abbr from organization', 'mysql_crm')
        for row_organization in db_organization:
            self.entity_dict.add(row_organization[0])

    # 获取文章关键词-权重值列表
    def get_article_weight(self, media_id):
        response_list = self.seo_extractor.extract_article(media_id,
                                                           title_weight=2,
                                                           title_coeff=1.4)
        if response_list is None or len(response_list) == 0:
            return {}

        keyword_dict = {}  # result dict
        for keyword, weight in response_list:
            if keyword in self.entity_dict:
                weight *= 1.4
            keyword_dict[unicode(keyword, 'utf8')] = weight

        keyword_dict = {
            key: value
            for key, value in keyword_dict.items()
            if value > term_weight_threshold
        }
        if len(keyword_dict) < 5:
            keyword_dict = {
                key: value
                for key, value in keyword_dict.items()
                if value > 0.68 * term_weight_threshold
            }

        return keyword_dict

    # 删除历史记录的文本向量
    def delete_redis_text(self, update_time_range):
        db_data = self.mysql_fetcher.get_sql_result(
            "select id from kr_articles where publish < \'%s\'" %
            update_time_range, 'mysql_insight')
        for i in range(len(db_data)):
            media_id = db_data[i][0]
            str_key = '%s_%s' % (kForwardIndexRedisKey, media_id)
            self.handler_redis.delete(str_key)

    # 计算两个向量的余弦相似度
    def cos_similarity(self, v1, v2):
        if len(v1) != len(v2):  # 向量维数必须相等
            return 0.0

        fen_divisor = 0.0  # 除数
        dividend_v1 = 0.0  # 被除数
        dividend_v2 = 0.0  # 被除数
        for pos in range(len(v1)):
            fen_divisor += v1[pos] * v2[pos]
            dividend_v1 += v1[pos]**2
            dividend_v2 += v2[pos]**2

        fen_dividend = math.sqrt(dividend_v1) * math.sqrt(dividend_v2)
        return (fen_divisor / fen_dividend if fen_dividend > 0.0 else 0.0)

    # 初始化文本向量
    def init_vector(self, vector1, dict_weight):
        if len(vector1) < 1:
            return []

        v1 = []  # 文本向量vector
        for pos in range(len(vector1)):
            if vector1[pos] in dict_weight:
                v1.append(dict_weight[vector1[pos]])
            else:
                v1.append(0.0)

        return v1

    # 重建media文本信息的正排,并更新redis
    def media_text_to_forward_index(self, media_id):
        # 获取需要的字符串
        dict_term_weight = self.get_article_weight(media_id)

        # 存入redis 文本关键词数据 key = media_id
        str_key = '%s_%s' % (kForwardIndexRedisKey, media_id)
        str_value = json.dumps(dict_term_weight)
        self.handler_redis.set(str_key, str_value)

        return dict_term_weight

    # 重建update_time_range时间之后的media文本信息的倒排,并更新redis
    def media_text_to_revert_index(self, update_time_range):
        #self.delete_redis_text(update_time_range)

        # 更新redis
        db_data = self.mysql_fetcher.get_sql_result(
            'select id from kr_articles where publish >=\'%s\'' %
            update_time_range, 'mysql_insight')
        if len(db_data) < 1:
            return None

        revert_index_dict = {}  # 倒排索引字典
        n_count = 1
        for i in range(len(db_data)):
            media_id = db_data[i][0]
            # self.cache_media_relate(media_id,update_time_range)

            # 缓存倒排索引
            media_dict = self.get_media_dict(media_id)
            for term, weight in media_dict.items():
                if term not in revert_index_dict:
                    revert_index_dict[term] = {}
                revert_index_dict[term][media_id] = weight

        str_key = kRevertIndexRedisKey
        str_value = str(revert_index_dict)
        self.handler_redis.set(str_key, str_value)

        return revert_index_dict

    # 获取正排
    def get_media_dict(self, media_id):
        media_dict = None
        str_key = '%s_%s' % (kForwardIndexRedisKey, media_id)
        str_value = self.handler_redis.get(str_key)
        # 判断redis 数据库中是否包含media_id 信息
        if str_value is None:
            media_dict = self.media_text_to_forward_index(media_id)
        else:
            media_dict = json.loads(str_value)
        return media_dict

    # 用倒排计算文本相似度
    def calc_simi_using_revert_index(self, revert_index_dict, meta_dict):
        if revert_index_dict is None:
            return {}

        dict_molecule = {}  # 分子对象
        dict_denominator1 = 0.0  # 分母对象1

        result_dict = {}  # 结果集字典 key = media_id
        media_list = []
        # 计算所有的d 与该文档的相似度的分子分母
        for term, weight in self.media_dict.items():
            dict_denominator1 += weight**2
            if term not in revert_index_dict:
                continue
            doc_weight_list = revert_index_dict[term]
            for media_id in doc_weight_list:
                # 获取相关列表
                media_list.append(media_id)

                if media_id in dict_molecule:
                    dict_molecule[
                        media_id] += weight * doc_weight_list[media_id]
                else:
                    dict_molecule[
                        media_id] = weight * doc_weight_list[media_id]

        for media_id in media_list:
            if media_id not in meta_dict:
                continue

            x, y, dict_denominator2 = meta_dict[media_id]  # 分母对象2
            fen_dividend = math.sqrt(dict_denominator1) * math.sqrt(
                dict_denominator2)
            result_dict[media_id] = dict_molecule[
                media_id] / fen_dividend if fen_dividend > 0.0 else 0.0

        return result_dict

    # 构造相似向量
    def dict_merge2list(self, d_media, d_compare):
        vec_words = []
        b_similarity = False

        for media_key in d_media:
            if media_key in d_compare:
                vec_words.append(media_key)
                b_similarity = True

        return vec_words, b_similarity

    # 用正排计算文本相似度
    def calculate_text_weight(self, compare_id, forward_index_dict):
        # 判断该向量是否存在
        if self.media_dict is None:
            return 0.0

        if compare_id in forward_index_dict:
            compare_dict = forward_index_dict[compare_id]
        else:
            compare_dict = self.get_media_dict(compare_id)

        # 判断比较向量是否存在
        if compare_dict is None:
            return 0.0

        vec_words, b_similarity = self.dict_merge2list(self.media_dict,
                                                       compare_dict)
        if b_similarity == False:
            return 0.0

        v1 = self.init_vector(vec_words, self.media_dict)
        v2 = self.init_vector(vec_words, compare_dict)

        # 计算余弦相似度值
        d_result = self.cos_similarity(v1, v2)
        return d_result
Exemplo n.º 8
0
class Report(object):
    """docstring for Report"""
    def __init__(self, update_time='2017-11-01 00:00:00'):
        # do something inits
        self.db_fetcher = DataBaseFetcher()
        self.dict_content_answer = {}  # error content

        self.update_time = update_time  # update time
        self.recommend = RecommendQuestion()

        self.dict_realtion_quesion, self.dict_question_num, self.dict_question_score = self.getRelationQuestion(
        )
        self.dict_parent_quesion = self.getChildQuestion()
        self.dict_question_diff = self.getRightDifficulty()
        self.dict_point, self.dict_topic_point = self.getPoints()

        self.dict_students = self.getStudentId()
        self.dict_student_score, self.dict_student_records = {}, {
            x: []
            for x in self.dict_students
        }

        self.exam_list_records, self.practice_list_records = self.getExerciseRecord(
        )  #records

        # dict
        self.dict_question_types = {
            1: '选择题',
            2: '填空题',
            4: '简答题',
            6: '综合题',
            17: '计算题',
            44: '证明题'
        }
        self.dict_question_difficulty = {
            1: '易',
            2: '中',
            3: '难',
            4: '极难'
        }  # 1:易 2:中 3:难 4:极难

        # feedback
        self.dict_students_resource = self.getStudentResource()  # resource

    def getExerciseRecord(self):
        exam_list_records = []
        practice_list_records = []
        sql = "select a.question_id, a.student_id, a.ret_num, a.submit_answer, a.submit_time, b.question_type from entity_student_exercise a join \
                neworiental_v3.entity_question b on b.id = a.question_id where a.submit_time > \'%s\' and a.exercise_source = 6" % (
            self.update_time)

        dict_is_choice = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question, student_id, ret, answer, submit_time, question_type = row
            student_id = long(student_id)

            if answer == 'null' or answer == 'None' or answer == '' or answer == 'NULL' or answer == None or len(
                    answer) == 0:
                continue
            if question in self.dict_realtion_quesion:
                exam_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

                self.dict_student_records[student_id].append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])
                self.dict_content_answer['%s_%s' %
                                         (question, student_id)] = answer
            elif question in self.dict_parent_quesion:
                dict_content = self.dict_parent_quesion[question]
                parent_question_id = dict_content['parent_id']

                rank = dict_content['rank']
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    parent_question_id]
                if question_num >= 17 and question_num <= 21:
                    if rank == 1:
                        exam_list_records.append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                        self.dict_student_records[student_id].append([
                            parent_question_id, student_id, ret, answer,
                            submit_time, question_type
                        ])
                    elif rank == 2:
                        self.dict_content_answer['%s_%s' %
                                                 (parent_question_id,
                                                  student_id)] = answer
                elif question_num >= 22 and question_num <= 23:
                    key = '%s_%s' % (parent_question_id, student_id)
                    if rank == 1:
                        if answer == 'A':
                            dict_is_choice[key] = 1
                        else:
                            dict_is_choice[key] = 0
                    elif rank == 2:
                        if key in dict_is_choice:
                            if dict_is_choice[key] == 1:
                                exam_list_records.append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                                self.dict_student_records[student_id].append([
                                    parent_question_id, student_id, ret,
                                    answer, submit_time, question_type
                                ])
                    elif rank == 3:
                        self.dict_content_answer[key] = answer
            else:
                practice_list_records.append([
                    question, student_id, ret, answer, submit_time,
                    question_type
                ])

        return exam_list_records, practice_list_records

    def getStudentId(self, file_name='students.txt'):
        list_students = []
        with open(file_name) as handle_f:
            for line in handle_f:
                system_id = long(line.strip())
                list_students.append(system_id)

        set_studends = set(list_students)  # students set
        str_content = ','.join([str(x) for x in set_studends])
        sql = "select a.`system_id`, a.`name` from neworiental_user.entity_user a where type = 2 and a.system_id in (%s)" % str_content
        dict_students = {}
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, name = row
            if student_id in set_studends:
                dict_students[student_id] = name

        return dict_students

    def getRightDifficulty(self):
        dict_question_diff = {}
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question b join link_question_answer a on  b.id = a.link_question_id where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_diff

    def getChildQuestion(self):
        dict_parent_list = {}
        dict_parent_question = {}
        sql = "select a.id, a.parent_question_id from link_question_answer b join neworiental_v3.entity_question a on a.parent_question_id = b.link_answer_id \
                where b.question_num >= 17 and b.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            qid = row[0]
            parent_id = row[1]
            if parent_id not in dict_parent_list:
                dict_parent_list[parent_id] = []

            dict_parent_list[parent_id].append(qid)

        for parent_id in dict_parent_list:
            sort_question = sorted(dict_parent_list[parent_id], reverse=False)
            count = 1
            for item in sort_question:
                dict_parent_question[item] = {
                    'parent_id': parent_id,
                    'rank': count
                }
                count += 1

        return dict_parent_question

    def getRelationQuestion(self):
        dict_realtion_quesion = {}
        dict_point_question_num = {}
        dict_question_score = {}
        sql = "select a.link_question_id, a.link_answer_id, a.link_point_id, a.question_num, b.difficulty, a.question_exercise_id, b.question_type, a.score \
        from link_question_answer a join neworiental_v3.entity_question b on a.link_question_id =  b.id  where a.update_time >= \'%s\'" % self.update_time
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        dict_parent_quesion = {}
        link_parent_list = []
        for row in rows:
            link_question_id, link_answer_id, link_point_id, question_num, difficulty, question_exercise_id, question_type, score = row
            dict_realtion_quesion[
                link_answer_id] = link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type
            dict_question_score[link_question_id] = score
            arr_point = link_point_id.strip().split(',')
            for point in arr_point:
                if point not in dict_point_question_num:
                    dict_point_question_num[point] = []

                dict_point_question_num[point].append([
                    link_question_id, question_num, difficulty,
                    question_exercise_id, question_type
                ])

        return dict_realtion_quesion, dict_point_question_num, dict_question_score

    def getScore(self, question_num, answer, ascore):
        ascore, score = 0, 0
        if question_num >= 1 and question_num <= 16:
            score = 5 if (answer == 'A' or answer == 'B') else 0
            ascore = 5
        elif question_num >= 17 and question_num <= 23:
            if answer == 'A':
                if ascore == 10:
                    score = 4
                elif ascore == 12:
                    score = 4
                elif ascore == 13:
                    score = 5
                elif ascore == 14:
                    score = 5
                elif ascore == 15:
                    score = 6
            elif answer == 'B':
                if ascore == 10:
                    score = 7
                elif ascore == 12:
                    score = 8
                elif ascore == 13:
                    score = 9
                elif ascore == 14:
                    score = 10
                elif ascore == 15:
                    score = 11
            elif answer == 'C':
                if ascore == 10:
                    score = 9
                elif ascore == 12:
                    score = 11
                elif ascore == 13:
                    score = 12
                elif ascore == 14:
                    score = 13
                elif ascore == 15:
                    score = 15
            elif answer == 'D':
                score = ascore

        return score

    def getRecentlyExercise(throld=5):
        set_recent = set()
        sql = "select distinct question_exercise_id from link_question_answer order by question_exercise_id desc limit %s" % throld
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            eid = row
            eid = long(eid)
            set_recent.add(eid)

        return set_recent

    def getAllScore(self, link_question_id, question_num):
        ascore = 0
        if link_question_id in self.dict_question_score:
            ascore = self.dict_question_score[link_question_id]
        else:
            if question_num >= 1 and question_num <= 16:
                ascore = 5
            elif question_num >= 17 and question_num <= 21:
                ascore = 12
            elif question_num >= 22 and question_num <= 23:
                ascore = 10

        return ascore

    def statQustionReport(self, range_throld=4, range_target=20):
        dict_students_point_question = {}
        dict_student_score = {}
        for item in self.exam_list_records:
            question, student_id, ret, answer, submit_time, question_type = item

            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = link_point_id.strip().split(',')
                if student_id not in dict_student_score:
                    dict_student_score[student_id] = 0
                ascore = self.getAllScore(link_question_id, question_num)
                score = self.getScore(question_num, answer, ascore)

                dict_student_score[student_id] += score

                is_right = (answer == 'A') or (answer == 'B')
                if student_id not in dict_students_point_question:
                    dict_students_point_question[student_id] = {}  # student id

                point_name = '-'.join([str(x) for x in arr_points])  # name
                if point_name not in dict_students_point_question[student_id]:
                    dict_students_point_question[student_id][point_name] = []

                dict_students_point_question[student_id][point_name].append(
                    (link_question_id, is_right, question_type, difficulty,
                     submit_time))  # question

        dict_res_student_point_question = {}
        for student_id in dict_students_point_question:
            target_rate = 0.5
            if student_id in dict_student_score:
                target_rate = (dict_student_score[student_id] * 0.2 *
                               1.2) / 150

            dict_point_student_question = dict_students_point_question[
                student_id]
            dict_res_student_point_question[student_id] = {}
            for point_name in dict_point_student_question:
                arr_question = dict_point_student_question[point_name]
                target_questions = []
                sort_question = sorted(arr_question,
                                       key=lambda x: x[-1],
                                       reverse=True)
                right_rate, cnt_rate = 0, range_target
                if len(sort_question) < cnt_rate: cnt_rate = len(sort_question)

                for item in sort_question[:cnt_rate]:
                    link_question_id, is_right, qtype, difficulty, submit_time = item
                    if is_right == 0:
                        target_questions.append(
                            (link_question_id, qtype, difficulty, submit_time))
                    else:
                        right_rate += 1

                if right_rate / cnt_rate < target_rate and len(
                        target_questions) > 0:
                    dict_res_student_point_question[student_id][
                        point_name] = target_questions

        return dict_res_student_point_question

    def getErrorText(self, answer):
        if answer == 'C':
            return '审题时,粗心看错了'
        elif answer == 'D':
            return '审题时,题意读不懂'
        elif answer == 'E':
            return '析题时,思路模糊不清'
        elif answer == 'F':
            return '析题时,思路方向错误'
        elif answer == 'G':
            return '解题时,公式记错了'
        elif answer == 'H':
            return '解题时,计算出错了'
        elif answer == 'I':
            return '未作答(或未完成),时间来不及了'
        elif answer == 'A' or answer == 'B':
            return '答题正确'

    def getQuestionWords(self):
        dict_question_words = {}
        str_question = ','.join([
            str(self.dict_realtion_quesion[x][0])
            for x in self.dict_realtion_quesion
        ])
        str_sql = "select c.id, c.subject_id, t.type_id,t.struct_id, c.json_data from neworiental_v3.entity_question c left join neworiental_v3.entity_question_type t on \
                    t.type_id=c.question_type_id where c.id in (%s)" % str_question

        s_rows = self.db_fetcher.get_sql_result(str_sql, 'mysql_logdata')
        for row in s_rows:
            question, subject_id, type_id, struct_id, json_data = row
            if json_data != '' and json_data != 'None' and json_data != None:
                keywords, res = sq.predict(question, subject_id, type_id,
                                           struct_id, json_data)
                dict_question_words[question] = keywords, res

        return dict_question_words

    def getPoint2Question(self):
        dict_point_org_question = {}
        for answer_id in self.dict_realtion_quesion:
            link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                answer_id]
            arr_point = link_point_id.strip().split(',')

            for item in arr_point:
                if item not in dict_point_org_question:
                    dict_point_org_question[item] = []

                dict_point_org_question[item].append(link_question_id)

        return dict_point_org_question

    def pointsRecQuestion(self,
                          dict_students_point_question,
                          throld=4,
                          error_range=3):
        # 1,2 choice question 1:easy 2:difficulty 3,4: comprehensive problem 3:easy 4:difficulty
        # dict_point_org_question = self.getPoint2Question() # org question point
        dict_question_text = self.getQuestionWords()
        dict_question_base_info = self.recommend.getThisQuestionBaseInfo(
        )  # question quality
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        # self.recommendMonday(dict_question_base_info) # error questionsss
        dict_question_target = {}
        dict_student_recommend_question = {}
        cid = 1
        student_target = open('student_target.txt', 'w')
        for student_id in dict_students_point_question:
            sort_student_item = sorted(
                dict_students_point_question[student_id].items(),
                key=lambda x: len(x[-1]),
                reverse=False)
            student_name = self.dict_students[student_id]
            haveTime = 60
            recommend_questions = []
            pre_set = set()
            for item_target in sort_student_item:
                if haveTime <= 0: break
                item_point, arr_question = item_target
                item_one = item_point.strip().split('-')[0]

                key = '%s-%s' % (student_id, item_one)
                pre_set = pre_set | set([x[0] for x in arr_question])  #
                if len(arr_question) > error_range:
                    arr_question = arr_question[:error_range]

                for base_question, qtype, difficulty, submit_time in arr_question:
                    if haveTime <= 0: break
                    if qtype == '选择题':
                        qtype = 1
                    elif qtype == '填空题':
                        qtype = 2
                    else:
                        qtype = 3

                    keywords, text = dict_question_text[base_question]
                    text = text.replace('\n', '').strip('\r').strip()  # filter

                    base_question_key = '%s-%s' % (base_question, difficulty)
                    if base_question_key in dict_question_target:
                        target_questions = dict_question_target[
                            base_question_key]
                    else:
                        target_questions = self.recommend.getEsResult(
                            base_question, text, keywords, difficulty, qtype,
                            pre_set, throld)
                        dict_question_target[
                            base_question_key] = target_questions

                    for rec_question in target_questions:
                        if haveTime <= 0: break
                        if rec_question in pre_set: continue

                        haveTime += -1
                        recommend_questions.append(
                            (base_question, keywords, rec_question, item_one,
                             difficulty))
                        pre_set.add(rec_question)

            insert_score = 0
            for rec_item in recommend_questions:
                source_question, keywords, rec_question, item_point, difficulty = rec_item
                point_name, ptype, question_type, level, parent_id, link_id = self.dict_point[
                    item_point]
                type_id = 2186

                if rec_question in dict_question_base_info:
                    difficulty, question_type, upload_id = dict_question_base_info[
                        rec_question]
                else:
                    upload_id = 0

                if rec_question in dict_question_topic:
                    set_topic = dict_question_topic[rec_question]
                    if len(set_topic) > 0: type_id = set_topic.pop()

                str_time = datetime.datetime.strftime(datetime.datetime.now(),
                                                      '%Y-%m-%d %H:%M:%S')
                print "%s\t%s\t1\t%s\t%s\t针对练习\t重点\t%s\t%s\t%s\t%s\t%s" % (
                    cid, student_id, rec_question, 4, insert_score, str_time,
                    1, 3, type_id)
                student_target.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                    (cid, student_id, student_name, item_point, point_name,
                     source_question, rec_question, keywords, difficulty))
                insert_score += 1
                cid += 1

    def getPoints(self):
        dict_point = {}
        dict_topic_point = {}
        sql = 'select id, name, type, question_type, level, parent_id, link_id, topics_id from entity_exam_points'  # do something
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point_id, name, ptype, question_type, level, parent_id, link_id, topics_id = row
            point_id = str(point_id)
            dict_point[
                point_id] = name, ptype, question_type, level, parent_id, link_id
            if ptype == 3:  #
                arr_topic = [long(x) for x in topics_id.strip().split(',')]
                for topic in arr_topic:
                    if topic not in dict_topic_point:
                        dict_topic_point[topic] = []

                    dict_topic_point[topic].append(point_id)

        return dict_point, dict_topic_point

    def getStudentResource(self):
        dict_students_resource = {}
        str_content = ','.join([str(x) for x in self.dict_students])
        sql = "select b.student_id, a.resource_id, b.deadline from neworiental_v3.link_respackage_publish_resource a JOIN neworiental_v3.link_respackage_student b on \
            a.publish_id = b.publish_id where a.resource_type = 2 and b.student_id in (%s)" % (
            str_content)

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            student_id, resource_id, deadline = row
            if resource_id > 0:
                if student_id not in dict_students_resource:
                    dict_students_resource[student_id] = set()

                dict_students_resource[student_id].add(resource_id)

        return dict_students_resource

    def getQuestionBaseInfo(self, question_set):
        dict_question_difficulty = {}
        str_content = ','.join([str(x) for x in question_set])
        sql = "select b.id, b.difficulty from neworiental_v3.entity_question  where b.id in (%s)" % str_content
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, difficulty = row
            dict_question_diff[question_id] = int(difficulty)

        return dict_question_difficulty

    def getStudentScore(self):
        dict_student_score = {}
        for exam_item in self.exam_list_records:
            exam_question, student_id, ret, answer, submit_time, question_type = exam_item

            if exam_question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    exam_question]
                ascore = self.getAllScore(link_question_id, question_num)
                i_score = self.getScore(question_num, answer, ascore)
                exam_key = '%s-%s' % (student_id, question_exercise_id)

                if exam_key not in dict_student_score:
                    dict_student_score[exam_key] = 0
                dict_student_score[exam_key] += i_score

        return dict_student_score

    def getFirstPoint(self):  # point
        dict_relation_point13 = {}
        sql = "select a.id id3, a.`name` name3, c.id id1, c.`name` name1 from entity_exam_points a join entity_exam_points  \
                b on b.id = a.parent_id join entity_exam_points c on c.id = b.parent_id where a.type = 3"

        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            point3, name3, point1, name1 = row
            dict_relation_point13[point3] = name3, point1, name1

        return dict_relation_point13

    def getExerciseName(self):
        dict_exercise_name = {}
        sql = "select DISTINCT a.question_exercise_id, b.resource_name from link_question_answer a join neworiental_v3.entity_exercise b \
                on b.id = a.question_exercise_id where a.update_time >= \'%s\'" % self.update_time

        exercise_rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for exercise_row in exercise_rows:
            exercise_id, resource_name = exercise_row
            dict_exercise_name[exercise_id] = resource_name

        return dict_exercise_name

    def getExamData(self, dict_student_score, dict_relation_point13,
                    dict_exercise_name):
        for exam_item in self.exam_list_records:  # exam
            question, student_id, ret, answer, submit_time, question_type = exam_item
            if question in self.dict_realtion_quesion:
                link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                    question]
                arr_points = [int(x) for x in link_point_id.strip().split(',')]
                exam_key = '%s-%s' % (student_id, question_exercise_id)
                str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
                is_first = True
                for point3 in arr_points:
                    name3, point1, name1 = dict_relation_point13[point3]
                    if is_first == True:
                        str_point3, str_name3, str_point1, str_name1 = str(
                            point3), str(name3), str(point1), str(name1)
                        is_first = False
                    else:
                        str_point3 += ',%d' % point3
                        str_name3 += ',%s' % name3
                        str_point1 += ',%d' % point1
                        str_name1 += ',%s' % name1

                resource_name = dict_exercise_name[question_exercise_id]
                exercise_score = dict_student_score[
                    exam_key] if exam_key in dict_student_score else 0
                a_score, i_score = self.getScore(question_num, answer)

                error_text = self.getErrorText(answer)
                print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    student_id, link_question_id, question_type, str_point1,
                    str_name1, str_point3, str_name3, submit_time,
                    resource_name, exercise_score, question_exercise_id,
                    a_score, i_score, difficulty, ret, question_num,
                    error_text, 1)

    def getPracticeData(self, dict_relation_point13, dict_question_topic,
                        dict_question_base_info):
        for practice_item in self.practice_list_records:  # practice
            question, student_id, ret, answer, submit_time, question_type = practice_item
            str_point1, str_name1, str_point3, str_name3 = '', '', '', ''
            is_first = True
            if question in dict_question_topic:
                if question in dict_question_topic:
                    topic_set = dict_question_topic[question]
                    for topic in topic_set:
                        if topic in self.dict_topic_point:
                            arr_point3 = [
                                int(x) for x in self.dict_topic_point[topic]
                            ]
                            for point3 in arr_point3:
                                name3, point1, name1 = dict_relation_point13[
                                    point3]
                                if is_first == True:
                                    str_point3, str_name3, str_point1, str_name1 = str(
                                        point3), str(name3), str(point1), str(
                                            name1)
                                    is_first = False
                                else:
                                    str_point3 += ',%s' % point3
                                    str_name3 += ',%s' % name3
                                    str_point1 += ',%s' % point1
                                    str_name1 += ',%s' % name1

            a_score, i_score = 0, 0
            # 单选题 选择题 填空题
            if question_type == '单选题' or question_type == '选择题' or question_type == '填空题':
                a_score = 5
            else:
                a_score = 12

            if ret == 1:
                i_score = a_score
            elif ret == 5:
                i_score = 0.5 * a_score

            difficulty, qtype = 1, 1
            if question in dict_question_base_info:
                difficulty, qtype, upload_id = dict_question_base_info[
                    question]

            errorText = '正确' if ret == 1 else '错误'
            print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                student_id, question, question_type, str_point1, str_name1,
                str_point3, str_name3, submit_time, '推送练习', 0, 0, a_score,
                i_score, difficulty, ret, 0, errorText, 0)

    def getData(self):
        dict_student_score = self.getStudentScore()
        dict_relation_point13 = self.getFirstPoint()  # point
        dict_exercise_name = self.getExerciseName()

        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        dict_question_base_info = self.recommend.getThisQuestionBaseInfo(
        )  # question quality
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \
            ('学生id', '题目id', '题目类型', '一级考点id', '一级考点名称', '三级考点id', '三级考点名称', '提交时间', '试卷名称', '成绩', '题集id', '题目满分', '题目得分', '题目难度', '答题结果', '试卷题号', '错误原因', '考练类型')

        self.getExamData(dict_student_score, dict_relation_point13,
                         dict_exercise_name)  # exam
        self.getPracticeData(dict_relation_point13, dict_question_topic,
                             dict_question_base_info)  # practice

    def recommendMonday(self, dict_question_base_info, is_new=1, throld=500):
        dict_question_topic = self.recommend.getThisQuestionTopic(
        )  # question topic
        d1 = datetime.datetime.now()
        d3 = d1 + datetime.timedelta(days=-7)
        str_monday = d3.strftime("%Y-%m-%d 00:00:00")

        is_first, insert_score = 1, 0
        dict_student_cnt = {}
        records = self.exam_list_records  #+ self.practice_list_records
        for item in records:
            question, student_id, ret, answer, submit_time, question_type = item
            if submit_time > str_monday:
                if question in self.dict_realtion_quesion:
                    link_question_id, link_point_id, question_num, difficulty, question_exercise_id, question_type = self.dict_realtion_quesion[
                        question]

                    answer_key = '%s_%s' % (question, student_id)
                    if answer_key in self.dict_content_answer:
                        answer = self.dict_content_answer[answer_key]
                    if answer == 'A' or answer == 'B':
                        ret = 1
                    else:
                        ret = 2

                if ret != 1:
                    if student_id not in dict_student_cnt:
                        dict_student_cnt[student_id] = 0
                    if dict_student_cnt[student_id] < throld:
                        # print '%s\t%s' % (student_id, link_question_id)
                        if link_question_id in dict_question_base_info:
                            difficulty, question_type, upload_id = dict_question_base_info[
                                link_question_id]
                        else:
                            upload_id = 0

                        print "%s\t1\t%s\t%s\t\'最近新错\'\t\'重难点\'\t%s\t%s\t%s\t%s" % (
                            student_id, link_question_id, 0, insert_score, 1,
                            3, 2186)
                        dict_student_cnt[student_id] += 1
                        insert_score += 1
Exemplo n.º 9
0
class SimiTags(object):
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()
        self.tag2comlist = self.load_info(
            'select cid,tag from crm.company_tag')
        self.tag_child = self.load_info(
            'select tag,parent_tag from crm.tag_tag_relation')
        self.tag_parent = self.load_info(
            'select parent_tag,tag from crm.tag_tag_relation')
        self.total_num, self.manual_tag_count = self.tag_cid()

    def load_info(self, cmd):
        temp_dict = {}
        db_row = self.db_fetcher.get_sql_result(cmd, 'mysql_readall')
        for row in db_row:
            leaf, root = row
            if root not in temp_dict: temp_dict[root] = []
            temp_dict[root].append(leaf)
        return temp_dict

    def tag_cid(self):
        db_row = self.db_fetcher.get_sql_result(
            'select count(distinct cid) from crm.company_tag', 'mysql_readall')
        total_num = db_row[0][0]
        manual_tag_count = {}
        db_row = self.db_fetcher.get_sql_result(
            'select cid,tag from crm.company_tag where type != 5',
            'mysql_readall')
        for row in db_row:
            crm_id, tag = row
            if tag not in manual_tag_count: manual_tag_count[tag] = 0
            manual_tag_count[tag] += 1
        return total_num, manual_tag_count

    def get_correlation(self):
        similarity_tag = {}
        tag_corre_dict = {}
        f = 0
        for tag1 in self.tag2comlist:
            tag_corre_dict = {}
            if tag1.lower() != '人工智能': continue
            if tag1 not in self.manual_tag_count: continue
            manual_num_x = self.manual_tag_count[tag1]
            if manual_num_x < 30: continue
            l1 = self.tag2comlist[tag1]
            sum_x = len(l1)
            if sum_x < 100: continue
            for tag2 in self.tag2comlist:
                if tag1 == tag2: continue
                if tag2 not in self.tag2comlist or tag2 not in self.manual_tag_count:
                    continue
                manual_num = self.manual_tag_count[tag2]
                if manual_num < 2: continue
                l2 = self.tag2comlist[tag2]
                sum_y = len(l2)
                if sum_y > sum_x * 1.5: continue
                xy = len(set(l1) & set(l2))
                if xy < 5: continue
                pf = xy * 1.0 / sum_x
                if pf > 0.6: continue
                pc = xy * 1.0 / sum_y
                pnc = (sum_x - xy + 1) * 1.0 / (self.total_num - sum_y)
                if pc < 0.01: continue
                if pnc != 0:
                    value = pc / pnc
                if value < 1: continue
                correlation = math.pow(xy, 0.2) * math.pow(value, 0.8)
                tag_corre_dict[tag2] = correlation

            tag_items = tag_corre_dict.items()
            tag_heap = heapq.nlargest(len(tag_items),
                                      tag_items,
                                      key=lambda x: x[-1])
            tag_heap = tag_heap[:40]
            if len(tag_heap) == 0: continue
            re = map(lambda x: x[0], tag_heap)
            if len(re) == 0: continue
            similarity_tag[tag1] = re
            print tag1, '\t', "%s" % json.dumps(
                re, ensure_ascii=False).encode('utf8')
        return similarity_tag

    def rm_sub(self, tag_heap):
        abandon_l = []
        for i in range(0, len(tag_heap)):
            tag1, weight = tag_heap[i]
            if tag1 not in self.tag2comlist: continue
            l1 = self.tag2comlist[tag1]
            sum_x = len(l1)
            tag1_p, tag1_c = [], []
            if tag1 in self.tag_child: tag1_c = self.tag_child[tag1]
            if tag1 in self.tag_parent: tag1_p = self.tag_parent[tag1]
            for j in range(i + 1, len(tag_heap) - 1):
                tag2, weight = tag_heap[j]
                if tag2 in tag1_p or tag2 in tag1_c:
                    abandon_l.append(j)
                    continue
                if tag1.find(tag2) != -1 or tag2.find(tag1) != -1:
                    abandon_l.append(j)
                    continue
                l2 = self.tag2comlist[tag2]
                sum_y = len(l2)
                xy = len(set(l1) & set(l2))
                if (xy * 1.0 / sum_y) > 0.8: abandon_l.append(j)
                if (xy * 1.0 / sum_x) > 0.6: abandon_l.append(j)
        re_l = []
        for i in range(0, len(tag_heap) - 1):
            if i in abandon_l: continue
            else: re_l.append(tag_heap[i][0])
        return re_l
Exemplo n.º 10
0
class RecommendQuestion(object):
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()  # mysql handle
        self.dict_topic = self.getTopicDict()  # topic
        self.dict_quality = {5: 25, 4: 20, 3: 15, 2: 10, 1: 5}

        self.dict_question_base_info = self.getQuestionBaseInfo()  # base
        self.dict_question_quality = self.getQuestionQuality()
        self.dict_question_topic, self.dict_topic_question = self.getQuestionTopic(
        )  # topic

    def getThisQuestionTopic(self):
        return self.dict_question_topic

    def getThisQuestionBaseInfo(self):
        return self.dict_question_base_info

    def getQuestionBaseInfo(self, save_file='question_base.txt'):
        dict_question_base_info = {}
        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue

                qid, difficulty, question_type, upload_id = long(arr[0]), int(
                    arr[1]), int(arr[2]), str(arr[3]).strip()
                try:
                    upload_id = long(upload_id)
                except:
                    upload_id = 0

                dict_question_base_info[
                    qid] = difficulty, question_type, upload_id
                max_num = qid

        sql = "select id, difficulty, question_type, upload_id from neworiental_v3.entity_question where subject_id = 4 and id > %s and state != \'DISABLED\' order by id asc" % max_num
        rows = self.db_fetcher.get_sql_result(sql, "mysql_logdata")
        with open(save_file, 'a') as write_f:
            for row in rows:
                qid, difficulty, str_question_type, upload_id = row
                question_type = 1
                if str_question_type == '选择题' or str_question_type == '单选题':
                    question_type = 1
                elif str_question_type == '填空题':
                    question_type = 2
                else:
                    question_type = 3

                dict_question_base_info[
                    qid] = difficulty, question_type, upload_id
                write_f.write('%s\t%s\t%s\t%s\n' %
                              (qid, difficulty, question_type, upload_id))

        return dict_question_base_info

    def getQuestionQuality(self, save_file='quality.txt'):
        dict_question_quality = {}

        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue
                qid, question_id, extra_score = arr[0], long(arr[1]), int(
                    arr[2])
                dict_question_quality[question_id] = extra_score
                max_num = qid

        sql = "select id, question_id, extra_score from neworiental_v3.entity_question_quality where id > %s" % max_num
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        with open(save_file, 'a') as write_f:
            for row in rows:
                qid, question_id, extra_score = row

                dict_question_quality[question_id] = extra_score
                write_f.write('%s\t%s\t%s\n' % (qid, question_id, extra_score))

        return dict_question_quality

    def getQuestionTopic(self, save_file='topic.txt'):
        dict_question_topic, dict_topic_question = {}, {}
        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue
                tid, question_id, topic_id = arr[0], arr[1], arr[2]
                question_id = long(question_id)
                topic_id = long(topic_id)

                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()
                if topic_id not in dict_topic_question:
                    dict_topic_question[topic_id] = {}

                extra_score = 3
                if question_id in self.dict_question_quality:
                    extra_score = self.dict_question_quality[question_id]

                difficulty, question_type = 1, 1
                if question_id in self.dict_question_base_info:
                    difficulty, question_type, upload_id = self.dict_question_base_info[
                        question_id]

                dict_question_topic[question_id].add(topic_id)  # add
                key = '%s-%s' % (difficulty, question_type)
                if key not in dict_topic_question[topic_id]:
                    dict_topic_question[topic_id][key] = []

                dict_topic_question[topic_id][key].append(
                    (question_id, extra_score))
                max_num = tid

        sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        with open(save_file, 'a') as write_f:
            for row in rows:
                tid, question_id, topic_id = row
                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()

                if topic_id not in dict_topic_question:
                    dict_topic_question[topic_id] = {}

                extra_score = 3
                if question_id in self.dict_question_quality:
                    extra_score = self.dict_question_quality[question_id]

                difficulty, question_type = 1, 1
                if question_id in self.dict_question_base_info:
                    difficulty, str_question_type, upload_id = self.dict_question_base_info[
                        question_id]
                    if str_question_type == '选择题' or str_question_type == '单选题':
                        question_type = 1
                    elif str_question_type == '填空题':
                        question_type = 2
                    else:
                        question_type = 3

                key = '%s-%s' % (difficulty, question_type)
                dict_question_topic[question_id].add(topic_id)  # add
                if key not in dict_topic_question[topic_id]:
                    dict_topic_question[topic_id][key] = []

                dict_topic_question[topic_id][key].append(
                    (question_id, extra_score))
                write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id))

        for topic in dict_topic_question:  # sort
            for key in dict_topic_question[topic]:
                dict_topic_question[topic][key] = sorted(
                    dict_topic_question[topic][key],
                    key=lambda x: x[-1],
                    reverse=True)  #

        return dict_question_topic, dict_topic_question

    def getTopicDict(self):
        dict_topic = {}
        sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4"
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            topic_id, name = row
            topic_id = str(topic_id)
            dict_topic[topic_id] = name

        return dict_topic

    def getQuestionRate(self):
        dict_question_rate = {}
        sql = "select question_id, rate from question_accuracy"
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            question_id, rate = row
            rate = float(rate)
            dict_question_rate[question_id] = rate

        return dict_question_rate

    def getHeaders(self, teacher_id='eefbac5a4e58432ca317642a2077d362'):
        return {
            'Host':
            'jiaoshi.okjiaoyu.cn',
            'Connection':
            'keep-alive',
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            'X-Requested-With':
            'XMLHttpRequest',
            'is_new_okay':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            'Referer':
            'http://jiaoshi.okjiaoyu.cn/quizcenter_vm/quizcenter',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.8,en;q=0.6',
            'Cookie':
            '_ga=GA1.2.1647528971.1502935646; _gid=GA1.2.1532869714.1506481969; org_id=113; teacher_id=%s; Hm_lvt_2014de1ca4ec84db492ebee33b1dc46c=1506044442,1506324027,1506407711,1506479104; Hm_lpvt_2014de1ca4ec84db492ebee33b1dc46c=1506492873'
            % teacher_id
        }

    def getUrlContent(self, url):
        req = urllib2.Request(url, headers=self.getHeaders())
        hh = urllib2.HTTPHandler()
        opener = urllib2.build_opener(hh)
        reply = opener.open(req, timeout=30)
        if reply.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(reply.read())
            f = gzip.GzipFile(fileobj=buf)
            res = f.read()
            f.close()
            buf.close()
            reply.close()
        else:
            res = reply.read()

        return res

    def normalLeven(self, str1, str2):
        len_str1 = len(str1) + 1
        len_str2 = len(str2) + 1
        #create matrix
        matrix = [0 for n in range(len_str1 * len_str2)]
        #init x axis
        for i in range(len_str1):
            matrix[i] = i

        #init y axis
        for j in range(0, len(matrix), len_str1):
            if j % len_str1 == 0:
                matrix[j] = j // len_str1

        for i in range(1, len_str1):
            for j in range(1, len_str2):
                if str1[i - 1] == str2[j - 1]:
                    cost = 0
                else:
                    cost = 1

                matrix[j * len_str1 + i] = min(
                    matrix[(j - 1) * len_str1 + i] + 1,
                    matrix[j * len_str1 + (i - 1)] + 1,
                    matrix[(j - 1) * len_str1 + (i - 1)] + cost)

        return matrix[-1]

    def translate(self, str):
        line = str.strip().decode('utf-8', 'ignore')
        p2 = re.compile(ur'[^\u4e00-\u9fa5]')
        zh = " ".join(p2.split(line)).strip()
        zh = ",".join(zh.split())
        outStr = zh

        return outStr

    def calScoreQustion(self, question_id, list_rank):
        dict_score = {}
        pos = 0
        dict_qid_type = {}
        set_source = self.dict_question_topic[
            question_id] if question_id in self.dict_question_topic else set()
        for item in list_rank:
            qid, qtype, topic_list, extra_score = item
            set_target = self.dict_question_topic[
                qid] if qid in self.dict_question_topic else set()
            score = (1 - numpy.tanh(
                0.03 * pos)) * 60 + self.dict_quality[extra_score]
            if len(set_target & set_source) > 0:
                score += 15

            dict_score[qid] = score
            dict_qid_type[qid] = qtype
            pos += 1

        sort_res = sorted(dict_score.items(), key=lambda x: x[1], reverse=True)
        list_res = []
        for item in sort_res:
            qid = item[0]
            qtype = dict_qid_type[qid]
            list_res.append((qid, qtype))

        return list_res

    def randomTopicHeader(self, question_id, list_res, pre_set, rank_size,
                          throld_size):
        # 头部数据处理
        if question_id in self.dict_question_topic:
            topic_set = self.dict_question_topic[question_id]
            tLen = len(topic_set)
            if tLen > 0:
                index = random.randint(0, tLen - 1)
                topic = list(topic_set)[index]  # first
                if question_id in self.dict_question_base_info:
                    difficulty, qtype, upload_id = self.dict_question_base_info[
                        question_id]

                    key = '%s-%s' % (difficulty, qtype)
                    if key in self.dict_topic_question[topic]:
                        list_res_question = []

                        arr_content_question = self.dict_topic_question[topic][
                            key]
                        for content_item in arr_content_question:
                            res_question, extra_score = content_item

                            if res_question not in pre_set and res_question != question_id:
                                list_res_question.append(res_question)

                            if len(list_res_question) > rank_size: break

                        rnt = throld_size - len(list_res)
                        while rnt > 0 and len(list_res_question) > 0:
                            pos = random.randint(0, len(list_res_question) - 1)
                            list_res.append(list_res_question[pos])
                            rnt += -1

        return list_res

    def getEsResult(self,
                    question_id,
                    text,
                    keywords,
                    difficulty,
                    bqtype,
                    pre_set,
                    throld_size,
                    throld_page=10,
                    throld_cost=5,
                    rank_size=20):
        """
        questionType:  1   选择题 2   填空题 3   判断题 4   简答题 6  综合题
        difficulty : 1:易 2:中 3:难 4:极难
        """
        list_rank = []
        headers = self.getHeaders()  # get headers
        str_pre = ''
        if bqtype == 1: bqtype = 101
        for page in range(1, throld_page):
            try:
                url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?keyword=%s&subject_id=4&queryType=4&difficulty=%s&type=%s&page=%s" % (
                    keywords, difficulty, bqtype, page)
                # url = "http://jiaoshi.okjiaoyu.cn/teacher-center/search_singlequiz?_=1506480814420&teacher_id=ce36ea7e9c864af39d186b8e8b672864&type=&difficulty=%s&page=%s&keyword=%s&subject_id=4" % (difficulty, page, keywords)
                res = self.getUrlContent(url)
                res = res.replace('\r', '').replace('\n', '').replace(
                    'null', '').replace(':,', ':\"\",').strip()
                dict_result = json.loads(res.encode('utf8'))
                total_page = dict_result['data']['total_page']
                if total_page < throld_page:
                    throld_page = total_page  # range update
                list_result = dict_result['data']['list']
                for item in list_result:
                    qid = item['id']
                    question_body = self.translate(item['question_body'])
                    analysis = str(item['analysis']).strip().replace(' ', '')
                    topic_list = item['topic_list']
                    if len(topic_list) >= 1:
                        type_name = item['type_name']
                        if type_name == '选择题':
                            option_list = item['option_list']
                            if len(option_list) != 4: continue

                        cost = self.normalLeven(str_pre, question_body)
                        if qid in self.dict_question_quality:
                            extra_score = self.dict_question_quality[qid]
                        else:
                            extra_score = 1

                        if item['type_alias'] == '单选题' or item[
                                'type_alias'] == '选择题':
                            qtype = 1
                        elif item['type_alias'] == '填空题':
                            qtype = 2
                        else:
                            qtype = 3

                        bCharge = (cost > throld_cost) and (len(analysis) > 50)
                        if bCharge:
                            list_rank.append(
                                (qid, qtype, item['topic_list'], extra_score))
                            str_pre = question_body
            except:
                continue

        list_index = self.calScoreQustion(question_id, list_rank)  # rank
        res_size = throld_size - 1 if len(
            list_index) > throld_size - 1 else len(list_index)
        list_res = [x[0] for x in list_index[:res_size]]  # end output

        list_res = self.randomTopicHeader(
            question_id, list_res, pre_set, rank_size,
            throld_size)  #  header question random
        return list_res
Exemplo n.º 11
0
class QualityQuestion(object):
    """docstring for QualityQuestion"""
    def __init__(self):
        self.db_fetcher = DataBaseFetcher()  # mysql handle
        self.redis_cache = CacheHelper('127.0.0.1', 6379, 1, '')  # redis
        self.dict_simple_science, self.dict_simple_art = self.getSimpleQustion(
        )

        self.dict_question_topic = self.getQuestionTopic()
        self.dict_topic = self.getTopicDict()

    def getQuestionTopic(self, save_file='topic.txt'):
        dict_question_topic = {}
        max_num = 0
        with open(save_file) as handle_f:
            for line in handle_f:
                arr = line.strip().split('\t')
                if len(arr) != 3: continue
                tid, question_id, topic_id = arr[0], long(arr[1]), int(arr[2])
                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()

                dict_question_topic[question_id].add(topic_id)
                max_num = tid

        sql = "select id, question_id, topic_id from neworiental_v3.link_question_topic where id > %s" % max_num
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        with open(save_file, 'a') as write_f:
            for row in rows:
                tid, question_id, topic_id = row
                if question_id not in dict_question_topic:
                    dict_question_topic[question_id] = set()

                dict_question_topic[question_id].add(topic_id)
                write_f.write('%s\t%s\t%s\n' % (tid, question_id, topic_id))

        return dict_question_topic

    def getTopicDict(self):
        dict_topic = {}
        sql = "select id, `name` from neworiental_v3.entity_topic where subject_id = 4"
        rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
        for row in rows:
            topic_id, name = row
            topic_id = int(topic_id)
            dict_topic[topic_id] = name

        return dict_topic

    def makeSet2Dict(self, set_simple):
        dict_simple = {}
        for item in set_simple:
            arr = item.split('\t')
            qid, question_type, json_data = arr[0], arr[1], arr[2]
            qid = long(qid)
            dict_simple[qid] = question_type, json_data

        return dict_simple

    def getSimpleQustion(self):
        dict_simple_science = {}
        dict_simple_art = {}
        is_redis = self.redis_cache.exists(
            '62951166961') and self.redis_cache.exists('62951084550')
        if is_redis:
            # science
            set_simple_science = self.redis_cache.smembers('62951166961')
            dict_simple_science = self.makeSet2Dict(set_simple_science)
            # art
            set_simple_art = self.redis_cache.smembers('62951084550')
            dict_simple_art = self.makeSet2Dict(set_simple_art)
        else:
            sql = "SELECT id, question_type, json_data, upload_id from neworiental_v3.entity_question where upload_id in (62951166961, 62951084550) and json_data !=\'\' "
            rows = self.db_fetcher.get_sql_result(sql, 'mysql_logdata')
            for row in rows:
                qid, question_type, json_data, upload_id = row
                upload_id = str(upload_id)
                json_data = json_data.replace('\t', '')
                if upload_id == '62951166961':  # students majored in science
                    dict_simple_science[qid] = question_type, json_data
                elif upload_id == '62951084550':
                    dict_simple_art[qid] = question_type, json_data

                str_content = '%s\t%s\t%s' % (qid, question_type, json_data)
                self.redis_cache.sadd(upload_id, str_content)

        return dict_simple_science, dict_simple_art

    def getTopicSimpleQuestion(self):
        dict_topic_question = {}
        for qid in self.dict_simple_art:
            if qid in self.dict_question_topic:
                set_topic = self.dict_question_topic[qid]
                for topic in set_topic:
                    if topic not in dict_topic_question:
                        dict_topic_question[topic] = set()

                    dict_topic_question[topic].add(qid)

        for topic in dict_topic_question:
            set_topic = dict_topic_question[topic]
            str_content = '\t'.join([str(x) for x in set_topic])
            print '%s\t%s\t%s' % (topic, self.dict_topic[topic], str_content)