def _query_class(md5_value): query = "SELECT clsname,day,time,teacher,duration,week,location,students,id FROM ec_classes_" \ + get_semester_code_for_db(xq) + " WHERE id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (md5_value,)) return cursor.fetchall()
def retrieve_english_classes(xq): url = settings.ENGLISH_CLASS_URL s = requests.session() conn = mysql.connector.connect(**settings.MYSQL_CONFIG) cursor = conn.cursor() print('Fetching class information...') req = s.post(url, headers=header_info) json_string = req.content.decode('utf-8') json_string = re_quote_compiled.sub('"', json_string) if settings.DEBUG: predefined.print_formatted_info("json_string:" + json_string, True) print("-----\n\n") json_content = json.loads(json_string) for each_class in json_content: if not isinstance(each_class[2], int): # Exclude first invalid line this_clsname = "英语约课" + each_class[4] this_day = predefined.get_day_for_class(each_class[4][1:3]) this_time = predefined.get_time_for_class(each_class[4][4:7]) # 这里切片实际上是有问题的,"9-10"会切成"9-1",之前被坑过了 this_teacher = each_class[3] if each_class[2][0] == '双': this_duration = "4-16" this_week = "双周" else: this_duration = "3-15" this_week = "单周" this_location = each_class[5] md5 = hashlib.md5() class_str = str(each_class[2]) + str(each_class[3]) + str(each_class[4]) + str(each_class[5]) md5.update(class_str.encode('utf-8')) this_id = md5.hexdigest() del md5 class_dict[each_class[2]] = dict(Clsname=this_clsname, Day=this_day, Time=this_time, Teacher=this_teacher, Duration=this_duration, Week=this_week, ID=this_id) predefined.print_formatted_info(class_dict[each_class[2]]) query = "select * from ec_classes_" + predefined.get_semester_code_for_db(xq) + " where id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (this_id,)) class_fetch_result = cursor.fetchall() if not class_fetch_result: cprint('[Add class]', "blue", attrs=["bold"], end='') query = "INSERT INTO ec_classes_" + predefined.get_semester_code_for_db( xq) + " (clsname, day, time, teacher, duration, week, location, students, id) "" \ ""VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)" predefined.print_formatted_info("SQL: " + query) cursor.execute(query, ( this_clsname, this_day, this_time, this_teacher, this_duration, this_week, this_location, json.dumps(list()), this_id)) conn.commit() else: cprint("[Class already exists]", "green", attrs=["bold"]) print("--\n") cursor.close() conn.close()
def stu_id_prefix(): url = settings.ENGLISH_CLASS_NAMEROLL_URL s = requests.session() conn = mysql.connector.connect(**settings.MYSQL_CONFIG) cursor = conn.cursor() print("Fetching students information...") data = dict(pageNo=1) req = s.get(url, headers=header_info, params=data) # You must use GET method instead of POST here json_string = req.content.decode(encoding='utf-8') json_string = re_quote_compiled.sub('"', json_string) json_content = json.loads(json_string) if settings.DEBUG: print("json_string:\n" + json_string) pages_count = json_content[0][0] records_count = json_content[0][1] this_page = json_content[0][2] if settings.DEBUG: predefined.print_formatted_info(class_dict, True, 'Class_dict') cprint("Fetched %s records in %s pages" % (records_count, pages_count), color='magenta') while this_page <= pages_count: cprint("Now processing page %s:" % this_page, color='cyan') data = dict(pageNo=this_page) req = s.get(url, headers=header_info, params=data) json_string = req.content.decode(encoding='utf-8') json_string = re_quote_compiled.sub('"', json_string) json_content = json.loads(json_string) for each_people in json_content: if not isinstance(each_people[2], int): print(each_people) re_profess_compiled = re.compile(r'[0-9]{4,4}') print("Now processing [%s]%s" % (each_people[6][0:4], re_profess_compiled.split(each_people[4])[0])) # Query in ec_stu_id_prefix table query = "SELECT name FROM ec_stu_id_prefix WHERE prefixes=%s" cursor.execute(query, (each_people[6][0:4], )) fetch_result = cursor.fetchall() if not fetch_result: cprint('[Add]', end='', color='blue', attrs=['bold']) query = "INSERT INTO ec_stu_id_prefix (prefixes, name) values (%s, %s)" cursor.execute( query, (each_people[6][0:4], re_profess_compiled.split(each_people[4])[0])) conn.commit() else: # For unknown reason, there are different majors having same prefixes, therefore you need to handle # this situation. cprint("[Prefix already in table]", color='green', attrs=['bold'], end='') print(fetch_result[0][0]) if re_profess_compiled.split( each_people[4])[0] not in fetch_result[0][0]: query = "UPDATE ec_stu_id_prefix SET name=%s WHERE prefixes=%s" cursor.execute( query, (fetch_result[0][0] + ';' + re_profess_compiled.split(each_people[4])[0], each_people[6][0:4])) conn.commit() print("\n") this_page += 1 cursor.close() conn.close() cprint("Finished!", color='green', attrs=['blink', 'bold'])
def process_english_students(xq): students_list = [] class_list = [] url = settings.ENGLISH_CLASS_NAMEROLL_URL s = requests.session() conn = mysql.connector.connect(**settings.MYSQL_CONFIG) cursor = conn.cursor() print("Fetching students information...") data = dict(pageNo=1) req = s.get(url, headers=header_info, params=data) # You must use GET method instead of POST here json_string = req.content.decode(encoding='utf-8') json_string = re_quote_compiled.sub('"', json_string) json_content = json.loads(json_string) if settings.DEBUG: print("json_string:\n" + json_string) pages_count = json_content[0][0] records_count = json_content[0][1] this_page = json_content[0][2] if settings.DEBUG: predefined.print_formatted_info(class_dict, True, 'Class_dict') cprint("Fetched %s records in %s pages" % (records_count, pages_count), color='magenta') while this_page <= pages_count: cprint("Now processing page %s:" % this_page, color='cyan') data = dict(pageNo=this_page) req = s.get(url, headers=header_info, params=data) json_string = req.content.decode(encoding='utf-8') json_string = re_quote_compiled.sub('"', json_string) json_content = json.loads(json_string) for each_people in json_content: if not isinstance(each_people[2], int): cprint("Now processing [%s]%s in class %s" % (each_people[6], each_people[7], each_people[2]), attrs=['bold']) if settings.DEBUG: predefined.print_formatted_info((class_dict[each_people[2]])) # Query ec_classes table query = "SELECT students FROM ec_classes_" + predefined.get_semester_code_for_db(xq) + " WHERE id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (class_dict[each_people[2]]['ID'],)) class_fetch_result = cursor.fetchall() students_list.clear() students_list = json.loads(class_fetch_result[0][0]) if each_people[6] not in students_list: cprint('[Append student to class]', end='', color='blue', attrs=['bold']) students_list.append(each_people[6]) query = "update ec_classes_" + predefined.get_semester_code_for_db( xq) + " set students=%s where id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (json.dumps(students_list), class_dict[each_people[2]]['ID'])) conn.commit() else: cprint("[Student already in this class]", color='green', attrs=['bold']) # Query ec_students table query = "SELECT classes FROM ec_students_" + predefined.get_semester_code_for_db(xq) + " WHERE xh=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (each_people[6],)) class_fetch_result = cursor.fetchall() class_list = json.loads(class_fetch_result[0][0]) if class_dict[each_people[2]]['ID'] not in class_list: class_list.append(class_dict[each_people[2]]['ID']) query = "UPDATE ec_students_" + predefined.get_semester_code_for_db( xq) + " SET classes=%s WHERE xh=%s" cursor.execute(query, (json.dumps(class_list), each_people[6])) conn.commit() else: cprint("[Class already in this student's record]", color='green', attrs=['bold']) print("\n") this_page += 1 cursor.close() conn.close() cprint("Finished!", color='green', attrs=['blink', 'bold'])
def run(self): global TABLE1_COUNT_ADD, TABLE1_COUNT_PASS, TABLE1_COUNT_UPDATE, TABLE2_COUNT_ADD, TABLE2_COUNT_PASS global TOTAL_COUNT, ADD_NEW_COURSE_COUNT, APPEND_TO_COURSE_COUNT conn = mysql.connector.connect(**settings.MYSQL_CONFIG) cursor = conn.cursor() while True: my_courses_md5 = [] stu = queue.get() if stu is None: print("Thread %s quit." % self.thread_id) cursor.close() break if settings.DEBUG_LEVEL >= 1: print('Processing student: [xh=%s][xs0101id=%s]%s' % (stu['xh'], stu['xs0101id'], stu['xm'])) # Read file file_address = os.path.join('raw_data', stu['xs0101id']) file = open(file_address + '.html', 'r') soup = BeautifulSoup(file, 'html.parser') # ec_students 表检查 db_lock.acquire() query = 'SELECT xh,semesters FROM ec_students WHERE xh=%s' cursor.execute(query, (stu['xh'], )) fetch_result = cursor.fetchall() db_lock.release() if not fetch_result: # 若找不到,则在 ec_students 表中新增学生 db_lock.acquire() query = "INSERT INTO ec_students (xh, semesters, xs0101id, name) VALUES (%s, %s, %s, %s)" cursor.execute(query, (stu['xh'], json.dumps([ settings.SEMESTER, ]), stu['xs0101id'], stu['xm'])) conn.commit() db_lock.release() count_lock.acquire() TABLE1_COUNT_ADD = TABLE1_COUNT_ADD + 1 count_lock.release() else: # 老生检查学期是否在个人记录中已经存在,若不存在则加入 if settings.DEBUG_LEVEL >= 2: print('Existing student: [%s]%s' % (stu['xh'], stu['xm'])) semesters = json.loads(fetch_result[0][1]) if settings.SEMESTER not in semesters: # 当前学期不在数据库中则加入(不判断的话可能重复加入) semesters.append(settings.SEMESTER) db_lock.acquire() query = "UPDATE ec_students SET semesters=%s WHERE xh=%s" cursor.execute(query, (json.dumps(semesters), stu['xh'])) conn.commit() db_lock.release() count_lock.acquire() TABLE1_COUNT_UPDATE = TABLE1_COUNT_UPDATE + 1 count_lock.release() print("Add semester to student [%s]%s 's record" % (stu['xh'], stu['xm'])) else: count_lock.acquire() TABLE1_COUNT_PASS = TABLE1_COUNT_PASS + 1 count_lock.release() # ec_students 学期表中寻找学生 db_lock.acquire() query = 'SELECT * FROM ec_students_' + get_semester_code_for_db( xq) + ' WHERE xh=%s' cursor.execute(query, (stu['xh'], )) result = cursor.fetchall() db_lock.release() # 在数据库中找不到学生,则增加学生 if not result: if settings.DEBUG_LEVEL >= 2: print('[Add student to ec_students_%s]' % get_semester_code_for_db(xq)) for class_time in range(1, 8): for row_number in range(1, 7): query_selector = 'div[id="' + get_row_code( xq, row_number) + '-' + str(class_time) + '-2"] a' for i in soup.select(query_selector): # i 为 a 元素 course_info = {'clsname': i.contents[0], 'teacher': 'None' if not i.select('font[title="老师"]') else \ i.select('font[title="老师"]')[0].string, 'duration': 'None' if not i.select('font[title="周次"]') else \ i.select('font[title="周次"]')[0].string, 'week': 'None' if not i.select('font[title="单双周"]') else \ i.select('font[title="单双周"]')[0].string, 'location': 'None' if not i.select('font[title="上课地点教室"]') else \ i.select('font[title="上课地点教室"]')[0].string} class_str = str(course_info['clsname']) + \ str(course_info['teacher']) + \ str(course_info['duration']) + \ str(course_info['week']) + \ str(course_info['location']) + \ str(class_time) + \ str(row_number) # 生成class_str用于生成课程 MD5识别码 md5 = hashlib.md5() md5.update(class_str.encode('utf-8')) course_info['hash'] = md5.hexdigest() my_courses_md5.append(md5.hexdigest()) db_lock.acquire() # 查询当前课程 class_fetch_result = _query_class( md5.hexdigest(), cursor) # 如果课程不存在,增加课程 if not class_fetch_result: _add_new_course( course_name=str(course_info['clsname']), class_time=class_time, row_number=row_number, teacher=str(course_info['teacher']), duration=str(course_info['duration']), week=str(course_info['week']), location=str(course_info['location']), md5_value=md5.hexdigest(), xh=stu['xh'], cursor=cursor, conn=conn) # 如果课程存在,在课程entry中增加学生 else: _append_student_to_class( existing_students=json.loads( class_fetch_result[0][7]), this_student=stu['xh'], class_id=md5.hexdigest(), cursor=cursor, conn=conn) db_lock.release() del md5 if settings.DEBUG: print(course_info) # 对 my_courses_md5 去重 class_list_final = list(set(my_courses_md5)) class_list_final.sort(key=my_courses_md5.index) if settings.DEBUG_LEVEL >= 3: print('Courses list(%s): %s' % (len(class_list_final), class_list_final)) predefined.print_formatted_info(query) # 在学期表中新增学生 db_lock.acquire() query = "INSERT INTO ec_students_" + get_semester_code_for_db(xq) \ + " (xh, classes) VALUES (%s, %s)" cursor.execute(query, (stu['xh'], json.dumps(class_list_final))) conn.commit() db_lock.release() # 提交数据及清空 list count_lock.acquire() TABLE2_COUNT_ADD = TABLE2_COUNT_ADD + 1 count_lock.release() # 如果学期表中已经存在数据就跳过(学期表只能从零开始,不能更新) else: print('[PASS] student [%s]%s already exists in %s' % (stu['xh'], stu['xm'], "ec_students_" + get_semester_code_for_db(xq))) count_lock.acquire() TABLE2_COUNT_PASS = TABLE2_COUNT_PASS + 1 count_lock.release() # 每100个处理完毕告知一次 count_lock.acquire() TOTAL_COUNT = TOTAL_COUNT + 1 count_lock.release() if TOTAL_COUNT % 100 == 0: cprint('%s finished.' % TOTAL_COUNT, color='green', attrs=["bold"]) queue.task_done()
# 如果课程存在,在课程entry中增加学生 else: _append_student_to_class(json.loads(class_fetch_result[0][7]), stu['xh'], md5.hexdigest()) del md5 print(class_info) class_info.clear() # 对 my_class_list 去重 class_list_final = list(set(my_class_list)) class_list_final.sort(key=my_class_list.index) if settings.DEBUG: print('Class list(%s): %s' % (len(class_list_final), class_list_final)) predefined.print_formatted_info(query) # 在学期表中新增学生 query = "INSERT INTO ec_students_" + get_semester_code_for_db(xq) \ + " (xh, classes) VALUES (%s, %s)" cursor.execute(query, (stu['xh'], json.dumps(class_list_final))) # 提交数据及清空 list conn.commit() table2_count_add = table2_count_add + 1 my_class_list.clear() # 如果学期表中已经存在数据就跳过(学期表只能从零开始,不能更新) else: print('[PASS] student [%s]%s already exists in %s' % (stu['xh'], stu['xm'], "ec_students_" + get_semester_code_for_db(xq))) table2_count_pass = table2_count_pass + 1
def retrieve_and_process(xq): url = settings.ENGLISH_CLASS_URL_17 s = requests.session() conn = mysql.connector.connect(**settings.MYSQL_CONFIG) cursor = conn.cursor() print('Fetching class information...') req = s.post(url, headers=header_info) json_string = req.content.decode('utf-8') # predefined.print_formatted_info("Fetched json_string:" + json_string, True) json_content = json.loads(json_string) for each in json_content["courseInfo"]: print(each) for each in json_content["docs"]: print(each) for class_number in range(len(json_content["courseInfo"])): this_course = json_content["courseInfo"][class_number] this_clsname = "英语约课" + str(class_number + 1) + "班" this_day = predefined.get_day_for_class(this_course["courseTime"][1:3]) this_time = predefined.get_time_for_class( this_course["courseTime"] [4:7]) # 这里切片实际上是有问题的,"9-10"会切成"9-1",之前被坑过了 # Search in docs for teacher name for each in json_content["docs"]: if each["courseNo"] == class_number + 1 + 300: this_teacher = each["teacherName"] break if this_course["courseTime"][0] == '双': this_duration = "4-16" this_week = "双周" else: this_duration = "3-15" this_week = "单周" this_location = this_course["classroom"] # Generate MD5 md5 = hashlib.md5() class_str = str(this_clsname) + \ str(this_teacher) + \ str(this_duration) + \ str(this_week) + \ str(this_location) + \ str(this_day) + \ str(this_time) md5.update(class_str.encode('utf-8')) this_id = md5.hexdigest() del md5 class_dict[class_number] = dict(Clsname=this_clsname, Day=this_day, Time=this_time, Teacher=this_teacher, Duration=this_duration, Week=this_week, ID=this_id) query = "select * from ec_classes_" + predefined.get_semester_code_for_db( xq) + " where id=%s" cursor.execute(query, (this_id, )) class_fetch_result = cursor.fetchall() if not class_fetch_result: # Add new class cprint('[Add class]', "blue", attrs=["bold"], end='') query = "INSERT INTO ec_classes_" + predefined.get_semester_code_for_db( xq ) + " (clsname, day, time, teacher, duration, week, location, students, id) " " \ " "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)" predefined.print_formatted_info("SQL: " + query) cursor.execute(query, (this_clsname, this_day, this_time, this_teacher, this_duration, this_week, this_location, json.dumps(list()), this_id)) conn.commit() else: cprint("[Class already exists]", "green", attrs=["bold"]) predefined.print_formatted_info(class_dict[class_number]) print("--\n") # Process student students_list = [] class_list = [] for each_people in json_content["docs"]: cprint("Now processing [%s]%s in class %s" % (each_people["stuID"], each_people["stuName"], each_people["className"]), attrs=['bold']) # Query ec_classes table query = "SELECT students FROM ec_classes_" + predefined.get_semester_code_for_db( xq) + " WHERE id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (class_dict[each_people["courseNo"] - 300 - 1]['ID'], )) class_fetch_result = cursor.fetchall() students_list.clear() students_list = json.loads(class_fetch_result[0][0]) if each_people["stuID"] not in students_list: # Append student to class cprint('[Append student to class]', end='', color='blue', attrs=['bold']) students_list.append(each_people["stuID"]) query = "update ec_classes_" + predefined.get_semester_code_for_db( xq) + " set students=%s where id=%s" cursor.execute( query, (json.dumps(students_list), class_dict[each_people["courseNo"] - 300 - 1]['ID'])) conn.commit() else: # Student already in this class cprint("[Student already in this class]", color='green', attrs=['bold']) # Query ec_students table query = "SELECT classes FROM ec_students_" + predefined.get_semester_code_for_db( xq) + " WHERE xh=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (each_people["stuID"], )) class_fetch_result = cursor.fetchall() class_list = json.loads(class_fetch_result[0][0]) if class_dict[each_people["courseNo"] - 300 - 1]['ID'] not in class_list: # Append class to student's record class_list.append(class_dict[each_people["courseNo"] - 300 - 1]['ID']) query = "UPDATE ec_students_" + predefined.get_semester_code_for_db( xq) + " SET classes=%s WHERE xh=%s" cursor.execute(query, (json.dumps(class_list), each_people["stuID"])) conn.commit() cprint("[Add class %s to student %s's record]" % (class_dict[each_people["courseNo"] - 300 - 1]['ID'], each_people["stuID"]), color='green', attrs=['bold']) else: # Class already in the student's record cprint("[Class already in this student's record]", color='green', attrs=['bold']) print("\n") cursor.close() conn.close() cprint("Finished!", color='green', attrs=['blink', 'bold'])
def process_data(xq): class_info = {} class_list = [] students_list = [] names_json = open("stu_data.json") names = json.load(names_json) conn = mysql.connector.connect(**settings.MYSQL_CONFIG) cursor = conn.cursor() for stu in names: cprint('Processing student: [%s]%s' % (stu['xh'], stu['xm']), attrs=["bold"]) file_addr = os.path.join('raw_data', stu['xs0101id']) file = open(file_addr + '.html', 'r') soup = BeautifulSoup(file, 'html.parser') query = 'SELECT * FROM ec_students_' + get_semester_code_for_db(xq) + ' WHERE xh=%s' if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (stu['xh'],)) if not cursor.fetchall(): cprint('[ADD STUDENT]', attrs=['bold']) for class_time in range(1, 8): for row_number in range(1, 7): query_selector = 'div[id="' + get_row_code(row_number) + '-' + str( class_time) + '-2"] a' for i in soup.select(query_selector): # i 为 a 元素 class_info['clsname'] = i.contents[0] class_info['teacher'] = 'None' if not i.select('font[title="老师"]') else \ i.select('font[title="老师"]')[0].string class_info['duration'] = 'None' if not i.select('font[title="周次"]') else \ i.select('font[title="周次"]')[0].string class_info['week'] = 'None' if not i.select('font[title="单双周"]') else \ i.select('font[title="单双周"]')[0].string class_info['location'] = 'None' if not i.select('font[title="上课地点教室"]') else \ i.select('font[title="上课地点教室"]')[0].string class_str = str(class_info['clsname']) + str(class_info['teacher']) + str( class_info['duration']) + str(class_info['week']) + str(class_info['location']) + str( class_time) + str(row_number) md5 = hashlib.md5() md5.update(class_str.encode('utf-8')) class_info['hash'] = md5.hexdigest() class_list.append(md5.hexdigest()) query = "SELECT * FROM ec_classes_" + get_semester_code_for_db(xq) + " WHERE id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (md5.hexdigest(),)) class_fetch_result = cursor.fetchall() if not class_fetch_result: cprint('[ADD CLASS]', end='', color="blue", attrs=['bold']) students_list.clear() students_list.append(stu['xh']) query = "INSERT INTO ec_classes_" + get_semester_code_for_db( xq) + "(clsname, day, time, teacher, duration, week, location, students, id) VALUES (" \ "%s, %s, %s, %s, %s, %s, %s, %s, %s) " if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, ( str(class_info['clsname']), class_time, row_number, str(class_info['teacher']), str(class_info['duration']), str(class_info['week']), str(class_info['location']), json.dumps(students_list), md5.hexdigest())) conn.commit() else: students_list.clear() students_list = json.loads(class_fetch_result[0][7]) # For unknown reason, education management system in CSU may show your same classes twice, # hence you need to check whether this happens if stu['xh'] not in students_list: students_list.append(stu['xh']) query = "UPDATE ec_classes_" + get_semester_code_for_db( xq) + " SET students=%s WHERE id=%s" if settings.DEBUG: predefined.print_formatted_info(query) cursor.execute(query, (json.dumps(students_list), md5.hexdigest())) conn.commit() cprint('[APPEND STUDENT]', end='', color='blue', attrs=['bold']) del md5 print(class_info) class_info.clear() query = "INSERT INTO ec_students_" + get_semester_code_for_db( xq) + " (xs0101id, name, xh, classes) VALUES (%s, %s, %s, %s)" # 对 class_list 去重 class_list_final = list(set(class_list)) class_list_final.sort(key=class_list.index) if settings.DEBUG: print('Class list(%s): %s' % (len(class_list_final), class_list_final)) predefined.print_formatted_info(query) cursor.execute(query, (stu['xs0101id'], stu['xm'], stu['xh'], json.dumps(class_list_final))) conn.commit() class_list.clear() class_list_final.clear() else: cprint('[PASS] STUDENT ALREADY EXISTS', color='green', attrs=['bold']) print('\n') cursor.close() conn.close() cprint("Finished!", color='red')