def downloadKnowled(self,
                    select_sql=SQL.SELECT_SUBJECT_RELATION,
                    inser_konw_sql=SQL.INSERT_21CNJY_KNOWLED,
                    know_url=URL.KNOW_URL,
                    know_child_url=URL.KNOW_CHILD_URL):
    '''Download knowledge points for every subject relation row.

    Each row returned by ``select_sql`` is unpacked into
    ``(subject_code, xd, subject_zname, course_21)``.  The JSON fetched
    from ``know_url`` is handed to ``__recursiveKnowled`` and whatever that
    helper returns is written with ``inser_konw_sql``.  An error on one row
    is logged and rolled back, after which the next row is processed.  The
    database connection is always closed.
    '''
    db = PostgreSql()
    try:
        for relation in db.getAll(select_sql):
            try:
                subject_code, xd, subject_zname, course_21 = relation
                root_url = know_url % (
                    xd, course_21, Utils.getCurrMilliSecond())
                root_json = self.session.get(
                    root_url, headers=self.headers).json()
                # The first and last placeholders are re-filled with a
                # literal '%s', which leaves a two-slot template for the
                # helper (presumably completed per child node -- verify in
                # __recursiveKnowled).
                child_url_template = know_child_url % (
                    '%s', xd, course_21, '%s')
                knowled_rows = self.__recursiveKnowled(
                    root_json, 1, subject_code, child_url_template)
                if knowled_rows:
                    db.batchExecute(inser_konw_sql, knowled_rows)
                db.commit()
                logger.info(
                    u'完成二一组卷网(学段:%s,学科:%s,线上学科名称:%s,线上学科代码:%s)知识点的导入,导入知识点数量:%d',
                    xd, course_21, subject_zname, subject_code,
                    len(knowled_rows))
            except Exception:
                logger.exception(
                    u'二一组卷网(学段:%s,学科:%s,线上学科名称:%s,线上学科代码:%s)知识点的导入异常',
                    xd, course_21, subject_zname, subject_code)
                db.rollback()
    finally:
        db.close()
def downloadSubject(self,
                    select_sql=SQL.SELECT_SUBJECT_RELATION,
                    update_sql=SQL.UPDATE_SUBJECT_RELATION,
                    subjects_url=None):
    '''Download the 21cnjy stage/subject table and update the local mapping.

    Rows from ``select_sql`` are unpacked into
    ``(subject_code, xd, subject_zname, course_21)``.  Rows that already
    have a ``course_21`` are skipped.  For the others, every 21cnjy subject
    listed under ``str(xd)`` whose name occurs inside ``subject_zname`` is a
    match (the last match wins), and ``update_sql`` is executed with
    ``(course_21, course_21_name, subject_code)``.

    :param subjects_url: URL of the subject listing.  When ``None`` (the
        default) it is built from ``URL.SUBJECTS_URL`` with the current
        millisecond timestamp at call time.  The timestamp used to be
        evaluated once, when the function was defined, so every call reused
        the same stale value.
    '''
    if subjects_url is None:
        subjects_url = URL.SUBJECTS_URL % Utils.getCurrMilliSecond()
    response = self.session.get(subjects_url, headers=self.headers)
    zj21cnjy_subject = response.json()
    pg = PostgreSql()
    try:
        update_params = []
        for subject_code, xd, subject_zname, course_21 in pg.getAll(
                select_sql):
            if course_21:
                # Already mapped.
                continue
            course_21_name = None
            # .items() works on both Python 2 and 3 (iteritems is 2-only).
            for key, value in zj21cnjy_subject[str(xd)].items():
                # This one 21cnjy subject name is matched under a shorter
                # alias.
                value_temp = u'政治' if value == u'政治思品' else value
                if value_temp in subject_zname:
                    course_21 = key
                    course_21_name = value
            if course_21:
                update_params.append(
                    (course_21, course_21_name, subject_code))
        if update_params:
            pg.batchExecute(update_sql, update_params)
        pg.commit()
        logger.info(u'完成21cnjy与线网学科学段对应关系更新,更新数量:%d',
                    len(update_params))
    except Exception as e:
        # Pass the exception itself: ``e.message`` is deprecated and does
        # not exist on Python 3.
        logger.exception(u'21cnjy与线网学科学段对应关系更新出现异常,异常信息:%s', e)
        pg.rollback()
    finally:
        pg.close()
def execExtract(self,
                select_main_sql=SQL.select_main_sql,
                insert_image_sql=SQL.insert_image_sql,
                update_main_sql=SQL.update_main_sql,
                select_convert_sql=SQL.select_convert_sql):
    '''Extract image URLs from pending questions, batch by batch.

    ``select_main_sql`` is executed repeatedly until it returns no rows.
    For each row, ``rows[0]`` is the sequence number, ``rows[1]`` the
    question id, and the remaining columns are scanned with
    ``JyeooUtil.getJyeooImg``.  The de-duplicated URL list is stored through
    ``insert_image_sql`` and the main table is updated through
    ``update_main_sql`` (status 1 when URLs were found, 2 otherwise).

    :return: ``True`` when every batch was committed; ``False`` as soon as
        a row or a batch fails.
    '''
    postgreSql = PostgreSql()
    count = 0
    rs = True
    try:
        flag = True  # True while the database still returns rows to process
        while flag:
            try:
                flag = False
                insert_image_params = []
                update_main_params = []
                for rows in postgreSql.getAll(select_main_sql):
                    flag = True
                    seq = rows[0]
                    qid = rows[1]
                    try:
                        urls = []
                        for col in rows[2:]:
                            for j_url in JyeooUtil.getJyeooImg(col):
                                if j_url not in urls:
                                    urls.append(j_url)
                        # Create the temporary image files.
                        self.__generateTmpImage(urls, postgreSql,
                                                select_convert_sql)
                        # Row for the image table.
                        insert_image_params.append(
                            (seq, qid, json.dumps(urls), 0 if urls else 2))
                        # New status for the main table.
                        update_main_params.append((1 if urls else 2, qid))
                    except Exception as ex:
                        logger.exception(
                            u"提取图片-----处理qi=%s,创建题目的图片发生异常,异常信息:%s",
                            qid, ex)
                        return False
                if insert_image_params:
                    postgreSql.batchExecute(insert_image_sql,
                                            insert_image_params)
                if update_main_params:
                    postgreSql.batchExecute(update_main_sql,
                                            update_main_params)
                postgreSql.commit()
                count += len(insert_image_params)
                logger.info(u'提取图片-----已成功处理题目数量:%d', count)
            except Exception as e:
                postgreSql.rollback()
                rs = False
                logger.exception(u"提取图片-----批量处理-异常信息:%s", e)
                # The select has no paging cursor, so after a rollback the
                # very same rows would be fetched again.  Stop instead of
                # retrying the failing batch forever.
                break
    finally:
        postgreSql.close()
    return rs
def extractQuesImage(self,
                     rows=1000,
                     select_batch_ques=SQL.SELECT_BATCH_QUES,
                     insert_image_url=SQL.INSERT_IMAGE_URL,
                     update_status=SQL.UPDATE_STATUS):
    '''Analyse questions and extract their image URLs.

    Questions are read in pages: ``select_batch_ques`` is executed with
    ``(0, seq, rows)``, where ``seq`` is ``row[2]`` of the last row seen so
    far.  A row that fails is logged and skipped.  A batch that fails is
    rolled back and logged, and the loop carries on with the next page.

    :param rows: page size passed as the third query parameter.
    '''
    logger.info(u'开始分析提取题目图片')
    seq = 0
    # Created outside the try block: if the constructor raised inside it,
    # the finally clause would hit an unbound ``pg`` and hide the real
    # error.
    pg = PostgreSql()
    try:
        flag = True
        count = 0
        while flag:
            try:
                flag = False
                insert_params = []
                update_params = []
                for row in pg.getAll(select_batch_ques, (0, seq, rows)):
                    flag = True
                    qid = row[0]
                    old_id = row[1]
                    seq = row[2]
                    try:
                        # NOTE(review): this list always has two entries,
                        # so the ``if urls`` tests below are always true
                        # and status 2 is never written -- confirm that is
                        # intended.
                        urls = [row[3], row[4]]
                        for col in row[5:]:
                            if col is None:
                                continue
                            urls.extend(self.__get21cnjyImg(col))
                        # Create the temporary image files.
                        self.__generateTmpImage(urls)
                        # Image table row: status 0 with images, 2 without.
                        insert_params.append(
                            (seq, qid, Utils.toJson(urls),
                             0 if urls else 2))
                        # Main table status: 1 with images, 2 without.
                        update_params.append((1 if urls else 2, qid))
                    except Exception as ex:
                        logger.exception(
                            u"处理qi=%s,old_id=%s,创建题目的图片发生异常,异常信息:%s",
                            qid, old_id, ex)
                if update_params:
                    pg.batchExecute(update_status, update_params)
                if insert_params:
                    pg.batchExecute(insert_image_url, insert_params)
                pg.commit()
                count += len(update_params)
                logger.info(u'已成功处理题目数量:%d', count)
            except Exception as e:
                pg.rollback()
                # Lazy logging arguments: a formatting problem can no
                # longer raise from inside this handler.
                logger.exception("批量处理-异常信息:%s", e)
    finally:
        pg.close()
def findImags(self, subject):
    '''Extract image URLs from the questions of one subject.

    Questions are read in pages: ``self.SELECT_SQL`` is executed with
    ``(subject, seq_num, self.ROWS)``, where ``seq_num`` is ``row[2]`` of
    the last row seen so far.  A row that fails is logged and skipped.  A
    batch that fails is rolled back and logged, and the loop carries on
    with the next page.

    :param subject: value passed as the first query parameter.
    '''
    seq_num = 0
    rows = self.ROWS
    # Created outside the try block: if the constructor raised inside it,
    # the finally clause would hit an unbound ``pg`` and hide the real
    # error.
    pg = PostgreSql()
    try:
        flag = True
        count = 0
        while flag:
            try:
                flag = False
                insert_params = []
                update_params = []
                for row in pg.getAll(self.SELECT_SQL,
                                     (subject, seq_num, rows)):
                    flag = True
                    qid = row[0]
                    old_id = row[1]
                    seq_num = row[2]
                    try:
                        urls = []
                        for col in row[3:]:
                            if col is None:
                                continue
                            urls.extend(self.getZjImg(col))
                        print(urls)
                        # Create the temporary image files.
                        self.generateTmpImage(urls)
                        # Image table row: status 0 with images, 2 without.
                        insert_params.append(
                            (qid, json.dumps(urls), 0 if urls else 2))
                        # Main table status: 1 with images, 2 without.
                        update_params.append((1 if urls else 2, qid))
                    except Exception as ex:
                        logger.exception(
                            u"处理qi=%s,old_id=%s,创建题目的图片发生异常,异常信息:%s",
                            qid, old_id, ex)
                if update_params:
                    pg.batchExecute(self.UPDATE_SQL, update_params)
                if insert_params:
                    pg.batchExecute(self.INSERT_SQL, insert_params)
                pg.commit()
                count += len(update_params)
                logger.info(u'已成功处理题目数量:%d', count)
            except Exception as e:
                pg.rollback()
                # Lazy logging arguments: a formatting problem can no
                # longer raise from inside this handler.
                logger.exception("批量处理-异常信息:%s", e)
    finally:
        pg.close()
def downloadQuestions(self,
                      ques_type='',
                      ques_pg_url=URL.QUES_PG_URL,
                      select_knowled_id=SQL.SELECT_KNOWLED_ID,
                      select_subject_sql=SQL.SELECT_SUBJECT_RELATION,
                      select_params_type=SQL.SELECT_PARAMS_TYPE,
                      update_knowled_downloded=SQL.UPDATE_KNOWLED_DOWNLODED):
    '''Download questions knowledge point by knowledge point.

    First the question-type table is loaded with ``select_params_type`` and
    grouped as ``{subject_code: {code_21cnjy: {...}}}``.  Then every row of
    ``select_subject_sql`` is handed to ``__downloadQuestionsBySubject``
    together with the question types of its subject.

    :raises Exception: the first error raised while processing a subject is
        logged, rolled back and re-raised.  A subject with no entry in the
        question-type table raises ``KeyError`` through the same path.
    '''
    pg = PostgreSql()
    try:
        # Load the question-type information.
        ques_type_dic = {}
        for row_ques_types in pg.getAll(select_params_type,
                                        ('ques_type', 1)):
            subject_code, code_21cnjy, name_21cnjy, code, name = \
                row_ques_types
            # Group by subject; setdefault replaces dict.has_key, which no
            # longer exists on Python 3.
            subject_types = ques_type_dic.setdefault(subject_code, {})
            # Code/name details of one question type of that subject.
            subject_types[code_21cnjy] = {
                'code_21cnjy': code_21cnjy,
                'name_21cnjy': name_21cnjy,
                'code': code,
                'name': name
            }
        # Walk the subject relation rows.
        for row in pg.getAll(select_subject_sql):
            try:
                subject_code, xd, subject_zname, course_21 = row
                self.__downloadQuestionsBySubject(
                    row, pg, ques_pg_url, ques_type_dic[subject_code],
                    ques_type, select_knowled_id, update_knowled_downloded)
            except Exception:
                logger.exception(
                    u'二一组卷网(学段:%s,学科:%s,线上学科名称:%s,线上学科代码:%s)题目导入异常',
                    xd, course_21, subject_zname, subject_code)
                pg.rollback()
                # Bare raise keeps the original traceback; ``raise e``
                # would restart it at this line on Python 2.
                raise
    finally:
        pg.close()
def findImags(subject):
    '''Read the answer text from the first image of each question.

    Questions are read in pages: ``SELECT_SQL`` is executed with
    ``(subject, qid, ROWS)``, where ``qid`` is ``row[0]`` of the last row
    seen so far.  For each row the first URL returned by
    ``getZjImg(row[1])`` is passed to ``getTextByImageUrl2``.  The result is
    stored, as a one-element JSON list, through ``UPDATE_SQL``.  Rows that
    fail -- including rows for which no image URL is found -- are recorded
    through ``UPDATE_STATUS_SQL`` with status ``-1``.

    :param subject: subject code, passed as the first query parameter.
    '''
    rows = ROWS
    # Created outside the try block: if the constructor raised inside it,
    # the finally clause would hit an unbound ``pg`` and hide the real
    # error.
    pg = PostgreSql()
    try:
        flag = True
        count = 0
        qid = '0'
        err_count = 0
        while flag:
            try:
                flag = False
                update_params = []
                update_err_params = []
                for row in pg.getAll(SELECT_SQL, (subject, qid, rows)):
                    flag = True
                    qid = row[0]
                    url = None
                    try:
                        # Inside the per-row handler on purpose: a row
                        # without any image used to raise IndexError at
                        # batch level, which rolled back the whole page
                        # while the cursor had already moved past the rows
                        # processed before it.
                        url = getZjImg(row[1])[0]
                        choice_answer = getTextByImageUrl2(url)
                        update_params.append(
                            (json.dumps([choice_answer],
                                        ensure_ascii=False), qid))
                        count += 1
                    except Exception:
                        err_count += 1
                        update_err_params.append((-1, qid))
                        logger.exception('异常的题目ID:%s,url:%s', qid, url)
                if update_params:
                    pg.batchExecute(UPDATE_SQL, update_params)
                if update_err_params:
                    pg.batchExecute(UPDATE_STATUS_SQL, update_err_params)
                pg.commit()
                logger.info(u'学科编码:%d,已成功处理题目数量:%d,错误数量:%d',
                            subject, count, err_count)
            except Exception as e:
                pg.rollback()
                # Lazy logging arguments: a formatting problem can no
                # longer raise from inside this handler.
                logger.exception("学科编码:%d,批量处理-异常信息:%s", subject, e)
    finally:
        pg.close()
def downloadQueryParams(self,
                        select_sql=SQL.SELECT_SUBJECT_RELATION,
                        insert_sql=SQL.INSERT_21CNJY_TYPE,
                        query_param_url=URL.QUERY_PARAM_URL,
                        ques_query_type=QUES_QUERY_TYPE):
    '''Download the question query parameters (question type, difficulty...).

    For every row of ``select_sql`` the JSON document at
    ``query_param_url`` is fetched.  Only the top-level keys present in
    ``ques_query_type`` are kept.  Each ``code: name`` pair below them is
    inserted through ``insert_sql`` as
    ``(id, ques_query_type[param_type], code, name, subject_code)``, where
    ``id`` is a counter running across all subjects.  An error on one
    subject is logged and rolled back, then the next subject is processed.
    '''
    pg = PostgreSql()
    count = 0
    try:
        for row in pg.getAll(select_sql):
            # Pre-bound so the exception handler can always log them, even
            # when unpacking the row itself is what failed.
            subject_code = xd = subject_zname = course_21 = None
            try:
                subject_code, xd, subject_zname, course_21 = row
                insert_params = []
                response = self.session.get(
                    query_param_url % (xd, course_21,
                                       Utils.getCurrMilliSecond()),
                    headers=self.headers)
                # ``in`` and .items() replace has_key/iteritems, which do
                # not exist on Python 3.
                for param_type, values in response.json().items():
                    if param_type not in ques_query_type:
                        continue
                    for code, name in values.items():
                        count += 1
                        # The running counter doubles as the row id.
                        insert_params.append(
                            (count, ques_query_type[param_type], code,
                             name, subject_code))
                if insert_params:
                    pg.batchExecute(insert_sql, insert_params)
                pg.commit()
                logger.info(
                    u'完成二一组卷网(学段:%s,学科:%s,线上学科名称:%s,线上学科代码:%s)查询参数的导入,导入参数的数量:%d,所有学科处理总数%d',
                    xd, course_21, subject_zname, subject_code,
                    len(insert_params), count)
            except Exception:
                logger.exception(
                    u'二一组卷网(学段:%s,学科:%s,线上学科名称:%s,线上学科代码:%s)查询参数的导入异常',
                    xd, course_21, subject_zname, subject_code)
                pg.rollback()
    finally:
        pg.close()
def main(self, startTime=start_time, root_path=PATH.rootImagPath,
         pic_new_path=PATH.pic_new_path):
    '''Check which question images are downloaded and publish them.

    Rows of ``self.SELECT_SQL_IMG`` are read in pages keyed by the largest
    ``rows[0]`` seen so far.  ``rows[2]`` is a JSON list of image URLs, and
    each URL is mapped to a file under ``root_path``:

    * file missing, or modified at/after the start of this run: the
      question is not complete yet;
    * file modified at/after ``startTime``: it is copied under an
      MD5-based name into ``pic_new_path`` (unless already there) and the
      old-to-new URL pair is queued for ``self.INSERT_SQL_CONVERT``;
    * older file: the new URL is looked up with ``self.SQL_URL``.

    Questions whose images are all available get their URL map written and
    their status set to 3.

    :param startTime: lower modification-time bound for files treated as
        new.  The body used to read the module-level ``start_time``
        instead, so a value passed by the caller was silently ignored.
    :param root_path: directory holding the downloaded images.
    :param pic_new_path: directory receiving the renamed copies.
    '''
    select_sql = self.SELECT_SQL_IMG
    update_sql = self.UPDATE_SQL
    update_sql_img = self.UPDATE_SQL_IMG
    insert_sql = self.INSERT_SQL_CONVERT
    curr_time = time.time()
    curr_time_strft = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(curr_time))
    # Record when this analysis run started.
    logger.info(u'本次分析时间:%s,秒:%.2f', curr_time_strft, curr_time)
    postgreSql = PostgreSql()
    count = 0
    total = 0
    try:
        flag = True  # True while the database still returns rows to process
        last_id = 0  # paging cursor (was ``id``, which shadowed the builtin)
        while flag:
            try:
                flag = False
                update_params = []
                update_image_params = []
                insert_params = []
                for rows in postgreSql.getAll(select_sql % last_id):
                    flag = True
                    total += 1
                    if rows[0] > last_id:
                        last_id = rows[0]
                    qid = rows[1]
                    urls = rows[2]
                    try:
                        isDownloadFinish = True
                        urlMap = {}
                        for url in json.loads(urls):
                            url_path = urlparse.urlsplit(url)
                            fileName = os.path.join(root_path,
                                                    url_path.path[1:])
                            if not os.path.exists(fileName):
                                isDownloadFinish = False
                                continue
                            mtime = os.path.getmtime(fileName)
                            if mtime >= curr_time:
                                # Touched after this run started: treated
                                # as not finished yet.
                                isDownloadFinish = False
                            elif startTime <= mtime:
                                extension = os.path.splitext(fileName)[1]
                                # New file name.
                                file_new_name = "%s%s" % (
                                    Utils.getStrMD5(url + "-mqm"),
                                    extension)
                                # New file name, full path.
                                file_new_name_all = os.path.join(
                                    pic_new_path, file_new_name)
                                # New URL.
                                url_new = image_url + file_new_name
                                urlMap[url] = url_new
                                if not os.path.exists(file_new_name_all):
                                    shutil.copy2(fileName,
                                                 file_new_name_all)
                                    Utils.modifyMD5(file_new_name_all)
                                    insert_params.append((url, url_new))
                            else:
                                # Image handled by an earlier run: reuse
                                # its stored URL.  A missing record raises
                                # here and is logged by the handler below.
                                sql_url = self.SQL_URL
                                urlMap[url] = postgreSql.getOne(
                                    sql_url, (url,))[0]
                        # Everything downloaded: update t_jyeoo_img_url.
                        if isDownloadFinish:
                            update_image_params.append((1, qid))
                            # Store the replacement URLs and set the source
                            # row to status 3 (has images, download done).
                            update_params.append(
                                (json.dumps(urlMap), 3, qid))
                    except Exception as ex:
                        logger.exception(
                            u"处理qi=%s,校验题目的所有图片下载是否完成发生异常,异常信息:%s",
                            qid, ex)
                if update_params:
                    postgreSql.batchExecute(update_sql, update_params)
                if update_image_params:
                    postgreSql.batchExecute(update_sql_img,
                                            update_image_params)
                if insert_params:
                    postgreSql.batchExecute(insert_sql, insert_params)
                postgreSql.commit()
                count += len(update_image_params)
                logger.info(u'已成功处理题目数量:%d,校验题目数量总数:%d',
                            count, total)
            except Exception as e:
                postgreSql.rollback()
                logger.exception("批量处理-异常信息:%s", e)
    finally:
        postgreSql.close()
def execParseImage(self,
                   select_image_sql=SQL.select_image_sql,
                   select_convert_sql=SQL.select_convert_sql,
                   update_main_url_sql=SQL.update_main_url_sql,
                   update_image_sql=SQL.update_image_sql,
                   insert_convert_sql=SQL.insert_convert_sql,
                   picture_path=PATH.picture_path,
                   pic_new_path=PATH.pic_new_path,
                   pic_relative_path=PATH.pic_relative_path,
                   image_url=image_url):
    '''Publish downloaded question images and record their new URLs.

    Rows of ``select_image_sql`` are read in pages keyed by the largest
    ``rows[0]`` seen so far.  ``rows[2]`` is a JSON list of image URLs.  A
    URL whose file exists under ``picture_path`` is copied under an
    MD5-based name (unless the copy already exists) and the old-to-new URL
    pair is queued for ``insert_convert_sql``.  A URL without a local file
    is resolved through ``select_convert_sql``; when that finds nothing the
    question is not complete yet.  Complete questions get their URL map
    written and their status set to 3.

    :return: ``True`` when no row and no batch failed, ``False`` otherwise.
        A failing row stops the run immediately; a failing batch is rolled
        back and the run continues with the next page.
    '''
    pic_new_real_path = os.path.join(pic_new_path, pic_relative_path)
    image_real_url = urlparse.urljoin(image_url, pic_relative_path)
    logger.info(u'进入处理图片流程,原始图片路径:%s,处理后图片存放路径:%s,图片url前缀地址:%s',
                picture_path, pic_new_real_path, image_real_url)
    if not os.path.exists(pic_new_real_path):
        os.makedirs(pic_new_real_path)
    postgreSql = PostgreSql()
    count = 0
    total = 0
    rs = True
    try:
        flag = True  # True while the database still returns rows to process
        last_id = 0  # paging cursor (was ``id``, which shadowed the builtin)
        while flag:
            try:
                flag = False
                update_main_params = []
                update_image_params = []
                insert_convert_params = []
                for rows in postgreSql.getAll(select_image_sql % last_id):
                    flag = True
                    total += 1
                    if rows[0] > last_id:
                        last_id = rows[0]
                    qid = rows[1]
                    urls = rows[2]
                    try:
                        isDownloadFinish = True
                        urlMap = {}
                        for url in json.loads(urls):
                            url_path = urlparse.urlsplit(url)
                            fileName = os.path.join(picture_path,
                                                    url_path.path[1:])
                            if os.path.exists(fileName):
                                extension = os.path.splitext(fileName)[1]
                                # New file name.
                                file_new_name = "%s%s" % (
                                    Utils.getStrMD5(url + "-mqm"),
                                    extension)
                                # New file name, full path.
                                # NOTE(review): the copy goes to
                                # pic_new_path while the URL (and the
                                # directory created above) use
                                # pic_relative_path -- confirm whether
                                # pic_new_real_path was intended.
                                file_new_name_all = os.path.join(
                                    pic_new_path, file_new_name)
                                # New URL.
                                url_new = image_real_url + file_new_name
                                urlMap[url] = url_new
                                if os.path.exists(file_new_name_all):
                                    if not postgreSql.getOne(
                                            select_convert_sql, (url,)):
                                        # append() takes one argument: the
                                        # pair must be a tuple.  Two bare
                                        # arguments raised TypeError.
                                        insert_convert_params.append(
                                            (url, url_new))
                                else:
                                    shutil.copy2(fileName,
                                                 file_new_name_all)
                                    Utils.modifyMD5(file_new_name_all)
                                    insert_convert_params.append(
                                        (url, url_new))
                            else:
                                # No local file: look the URL up.  Kept in
                                # its own variable; it used to be stored in
                                # ``rs`` and so replaced the function's
                                # boolean result with a row or None.
                                converted = postgreSql.getOne(
                                    select_convert_sql, (url,))
                                if converted:
                                    urlMap[url] = converted[0]
                                else:
                                    isDownloadFinish = False
                        # Everything downloaded: update t_jyeoo_img_url.
                        if isDownloadFinish:
                            update_image_params.append((1, qid))
                            # Store the replacement URLs and set the source
                            # row to status 3 (has images, download done).
                            update_main_params.append(
                                (json.dumps(urlMap), 3, qid))
                    except Exception as ex:
                        logger.exception(
                            u"处理图片流程,qi=%s,校验题目的所有图片下载是否完成发生异常,异常信息:%s",
                            qid, ex)
                        return False
                if update_main_params:
                    postgreSql.batchExecute(update_main_url_sql,
                                            update_main_params)
                if update_image_params:
                    postgreSql.batchExecute(update_image_sql,
                                            update_image_params)
                if insert_convert_params:
                    postgreSql.batchExecute(insert_convert_sql,
                                            insert_convert_params)
                postgreSql.commit()
                count += len(update_image_params)
                logger.info(u'处理图片流程,已成功处理题目数量:%d,校验题目数量总数:%d',
                            count, total)
            except Exception as e:
                rs = False
                postgreSql.rollback()
                logger.exception(u"处理图片流程,批量处理-异常信息:%s", e)
    finally:
        postgreSql.close()
    return rs