def updteTotalKnowled(self):
    '''Add the question-count ``total`` field to the knowledge points.

    Walks every entry of ``subjects`` and, for each value listed under its
    ``'xds'`` key, delegates to ``updteTotalKnowledByCid`` with the shared
    knowledge-point collection.
    '''
    mongo = MongoDB()
    knowled_coll = mongo.getCollection(COLL.knowled)
    for _, subject in subjects.items():
        for stage in subject['xds']:
            self.updteTotalKnowledByCid(subject, stage, knowled_coll)
def updateOrInsertLastKnowledPgScrapyUrl(self, subject, xd):
    '''Bring the page-URL collection up to date for one subject and stage.

    Every record yielded by ``generateLastKnowledPgScrapyUrlByCid`` is
    matched against ``COLL.pg_url`` on ``(kid, pg)``:

    * no stored document -> the record is inserted;
    * stored ``total`` differs -> ``total`` and ``rows`` are overwritten
      and the ``status`` field is removed;
    * otherwise the stored document is left untouched.

    The number of inserted and updated documents is logged at the end.

    :param subject: subject descriptor, passed through to the generator.
    :param xd: school-stage value, passed through to the generator.
    '''
    mongo = MongoDB()
    coll = mongo.getCollection(COLL.knowled)
    coll_pg = mongo.getCollection(COLL.pg_url)
    count_add = 0
    count_update = 0
    for kpg in self.generateLastKnowledPgScrapyUrlByCid(subject, xd, coll):
        selector = {'kid': kpg['kid'], 'pg': kpg['pg']}
        doc = coll_pg.find_one(selector)
        if not doc:
            coll_pg.insert_one(kpg)
            count_add += 1
        elif doc['total'] != kpg['total']:
            # $unset needs a document mapping field names to (ignored)
            # values. The previous set literal {'status'} is not a
            # document and cannot be BSON-encoded.
            coll_pg.update_one(selector, {
                '$set': {
                    'total': kpg['total'],
                    'rows': kpg['rows']
                },
                '$unset': {'status': ''}
            })
            count_update += 1
    logger.info(u'新增爬取pg数量:%d,修改爬取的pg数量:%d', count_add, count_update)
def mainSelection(self, c_url, category_url, course):
    '''Download the chapter tree of one course and store it in MongoDB.

    The course page is fetched and its ``JYE_BOOK_TREE_HOLDER`` list is
    walked: every ``li[ek]`` is a textbook and every nested ``li[bk]`` a
    grade. For each grade the category endpoint is POSTed, the returned
    ``JYE_POINT_TREE_HOLDER`` list is handed to ``pareSelection`` and the
    resulting rows are bulk-inserted into ``COLL.SELECTION``.

    A failure for one grade is logged and does not stop the remaining
    grades.

    :param c_url: course page URL template with one ``%s`` for the course
        name.
    :param category_url: category endpoint template with one ``%s`` for
        the course name.
    :param course: sequence describing the course; index 1 is used as the
        course name and the last element is logged next to it.
    '''
    c_name = course[1]
    c_url = c_url % c_name
    category_url = category_url % c_name
    response = self.session.get(c_url)
    root_soup = BeautifulSoup(html_parser.unescape(response.content), "lxml")
    ul_soup = root_soup.find('ul', id='JYE_BOOK_TREE_HOLDER')
    mongo = MongoDB()
    coll = mongo.getCollection(COLL.SELECTION)
    for ek_li_soup in ul_soup.find_all('li', attrs={'ek': True}):
        # Textbook id and textbook name.
        ek_id = ek_li_soup['ek']
        ek_name = ek_li_soup['nm']
        for bk_li_soup in ek_li_soup.find_all('li', attrs={'bk': True}):
            # Grade id and grade name.
            bk_id = bk_li_soup['bk']
            bk_name = bk_li_soup['nm']
            data = {'a': bk_id, 'q': '', 'f': 0, 'cb': '_setQ',
                    'r': random.random()}
            resp = self.session.post(category_url, data=data)
            pk_ul_soup = BeautifulSoup(
                html_parser.unescape(resp.content),
                "lxml").find('ul', id='JYE_POINT_TREE_HOLDER')
            try:
                s_rows = self.pareSelection(pk_ul_soup, bk_id, bk_name,
                                            ek_id, ek_name, course, 1)
                coll.insert_many(s_rows)
                logger.info(
                    u'完成下载菁优章节,教材名称:%s,年级名称:%s,科目名称:%s-%s,科目主url:%s',
                    ek_name, bk_name, c_name, course[-1], c_url)
            except Exception:
                # Subject fields are passed in the same order as in the
                # success message so both logs read identically.
                logger.exception(
                    u'分析下载菁优章节异常,教材名称:%s,年级名称:%s,科目名称:%s-%s,科目主url:%s',
                    ek_name, bk_name, c_name, course[-1], c_url)
def exportQuestionTypeToExecel():
    '''Export the question types of every subject to an Excel workbook.

    Reads all documents of the ``question_channel_type`` collection,
    groups them by each subject code found in their ``cids`` list and
    writes one row per (type, subject) pair to ``zujuan_question_type.xls``
    with the columns: type code, type name, subject name, subject code.

    Raises ``KeyError`` if a subject code is missing from
    ``subjectName_dic``.
    '''
    mongon = MongoDB()
    coll = mongon.getCollection(COLL.question_type['question_channel_type'])

    # subject code -> list of question-type entries for that subject
    question_type_dic = {}
    for doc in coll.find():
        for cid in doc['cids']:
            entry = {
                'id': doc['id'],
                'name': doc['name'],
                'cname': subjectName_dic[cid]
            }
            # setdefault replaces the deprecated dict.has_key() lookup.
            question_type_dic.setdefault(cid, []).append(entry)

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('题目类型', cell_overwrite_ok=True)
    sheet.write(0, 0, u'类型编码')
    sheet.write(0, 1, u'类型名称')
    sheet.write(0, 2, u'学科名称')
    sheet.write(0, 3, u'学科编码')

    row = 1
    for cid, entries in question_type_dic.items():
        for entry in entries:
            sheet.write(row, 0, entry['id'])
            sheet.write(row, 1, entry['name'])
            sheet.write(row, 2, entry['cname'])
            sheet.write(row, 3, cid)
            row += 1
    book.save('zujuan_question_type.xls')
def mainKnowled(self):
    '''Parse the knowledge points of all subjects and store them.

    For every subject, each entry of ``xds`` whose ``'xd'`` value is
    listed in the subject's ``'xds'`` is parsed with ``parseKnowled`` and
    the result is inserted into the knowledge-point collection. A failing
    subject/stage pair is logged and skipped.
    '''
    mongo = MongoDB()
    knowled_coll = mongo.getCollection(COLL.knowled)
    # NOTE: creation of a unique index on "id" is disabled here:
    # knowled_coll.create_index([("id", pymongo.ASCENDING)], unique=True)
    for subject in subjects.values():
        for stage in xds.values():
            if stage['xd'] not in subject['xds']:
                continue
            try:
                parsed = self.parseKnowled(subject, stage)
                knowled_coll.insert(parsed)
            except Exception as err:
                logger.exception(u'处理学科:%s,学段:%s;出现异常,异常信息:%s',
                                 subject['name'], stage['name'], err.message)
def generateLastKnowledPgScrapyUrl(self):
    '''Generate the crawl URLs for the knowledge-point pages.

    For every subject and each of its ``'xds'`` values, the records
    produced by ``generateLastKnowledPgScrapyUrlByCid`` are bulk-inserted
    into ``COLL.pg_url``. A failure for one subject/stage pair is logged
    and the loop continues.
    '''
    mongo = MongoDB()
    knowled_coll = mongo.getCollection(COLL.knowled)
    pg_coll = mongo.getCollection(COLL.pg_url)
    # NOTE: creation of a unique (kid, pg) index is disabled here:
    # pg_coll.create_index([("kid", pymongo.ASCENDING),
    #                       ("pg", pymongo.ASCENDING)], unique=True)
    for subject in subjects.values():
        for stage in subject['xds']:
            try:
                records = self.generateLastKnowledPgScrapyUrlByCid(
                    subject, stage, knowled_coll)
                pg_coll.insert_many(records)
            except Exception as err:
                logger.exception(u'生成知识点url错误,学科:%s,学段:%d,错误信息:%s',
                                 subject['name'], stage, err.message)
def generateQuestions(self, subjectCode):
    '''Build the question collection from downloaded knowledge-point pages.

    Reads every ``COLL.pg_url`` document of the subject with ``status`` 1
    and looks at the questions under ``pgdata.data[0].questions``:

    * unknown ``question_id`` -> a new question document is inserted with
      the page's knowledge point as its only ``kids`` entry;
    * known question already linked to this knowledge point -> counted as
      a repeat and skipped;
    * known question without this knowledge point -> the knowledge point
      is appended to ``kids`` and the document is updated.

    The three counters are logged at the end.

    :param subjectCode: subject code used to select pages and stored as
        ``cid`` on new questions.
    '''
    mongo = MongoDB()
    question_coll = mongo.getCollection(COLL.question)
    # NOTE: creation of a unique index on "question_id" is disabled here.
    pages = mongo.getCollection(COLL.pg_url).find({
        'cid': subjectCode,
        'status': 1
    })
    added = 0
    updated = 0
    repeated = 0
    for page in pages:
        kid = page['kid']
        ktitle = page['ktitle']
        for question in page['pgdata']['data'][0]['questions']:
            existing = question_coll.find_one(
                {'question_id': question['question_id']})
            if not existing:
                question_coll.insert_one({
                    'question_id': question['question_id'],
                    'cid': subjectCode,
                    'old_data': question,
                    'kids': [{
                        'kid': kid,
                        'ktitle': ktitle
                    }]
                })
                added += 1
                continue

            linked = existing['kids']
            if any(entry['kid'] == kid for entry in linked):
                repeated += 1
                continue

            linked.append({'kid': kid, 'ktitle': ktitle})
            question_coll.update_one(
                {'question_id': question['question_id']},
                {'$set': {
                    'kids': linked
                }})
            updated += 1
    logger.info(u'本次生成学科:%d,新增题目:%d,修改kids数量:%d,忽略掉的重复题目:%d',
                subjectCode, added, updated, repeated)
def parseParperPropAll(self, url=URL.paper_url):
    '''Parse the common attributes of papers for every subject and stage.

    Calls ``parsePaperProp`` once per subject and per value in that
    subject's ``'xds'`` list, sharing a single ``MongoDB`` handle.

    :param url: paper URL forwarded to ``parsePaperProp``.
    '''
    mongo = MongoDB()
    # NOTE: creation of the unique "<key>_id" indexes for the COLL.type
    # collections is disabled here.
    for subject in subjects.values():
        for stage in subject['xds']:
            self.parsePaperProp(subject, stage, mongo, url)
def downloadPaper(self, subject, xd):
    '''Download the detail data of every not-yet-processed paper.

    Selects the ``COLL.paper`` documents of the subject that have no
    ``status`` field, fetches each paper URL and extracts the JSON array
    assigned to ``MockDataTestPaper`` in the page. On success the array
    is stored as ``paper_detail`` with ``status`` 1. On failure the error
    is logged; if the fetched page is the site's error page the paper is
    marked with ``status`` -1.

    :param subject: subject descriptor; ``'code'`` and ``'name'`` are read.
    :param xd: school-stage value; the subject code is
        ``xd * 10 + subject['code']``.
    '''
    subjectCode = xd * 10 + subject['code']
    mongo = MongoDB()
    coll = mongo.getCollection(COLL.paper)
    for doc in coll.find({
            'cid': subjectCode,
            'status': {
                '$exists': False
            }
    }):
        # Reset per document: if the request itself fails, the handler
        # below must not inspect the response of an earlier paper.
        response = None
        try:
            response = self.session.get(doc['url'])
            # The regex runs on the raw body; no HTML parsing is needed.
            datastr = re.findall(
                u'var\s*MockDataTestPaper\s*=\s*(\[{.+?}\])\s*;\s*',
                response.content)[0]
            MockDataTestPaper = json.loads(datastr)
            coll.update_one({'paper_id': doc['paper_id']}, {
                "$set": {
                    "paper_detail": MockDataTestPaper,
                    'status': 1
                },
                "$currentDate": {
                    "lastModified": True
                }
            })
        except Exception:
            logger.exception(u'下载试卷失败,试卷Id:%s,试卷url:%s,学科名称:%s,学科编码:%d',
                             doc['paper_id'], doc['url'], subject['name'],
                             subjectCode)
            if response is not None and re.findall(
                    r'sorry!\s*系统出错了!', response.content):
                coll.update_one({'paper_id': doc['paper_id']}, {
                    "$set": {
                        "error": u'sorry! 系统出错了!',
                        'status': -1
                    },
                    "$currentDate": {
                        "lastModified": True
                    }
                })
def exportKnowledToExecel():
    '''Export the knowledge-point hierarchy to ``zujuan_knowled.xls``.

    Writes a header row, then one row per knowledge point whose
    ``hasChild`` is False, ordered by ``cid``. The first two columns are
    the subject name and code; the remaining columns are filled in by
    ``writeKnowldeToSheet``.
    '''
    mongon = MongoDB()
    knowled_coll = mongon.getCollection(COLL.knowled)
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('知识点关系', cell_overwrite_ok=True)

    headers = (
        u'学科名称',
        u'学科编码',
        u'一级知识点名称',
        u'一级知识点编码',
        u'二级知识点名称',
        u'二级知识点编码',
        u'三级知识点名称',
        u'三级知识点编码',
        u'四级知识点名称',
        u'四级知识点编码',
    )
    for column, title in enumerate(headers):
        sheet.write(0, column, title)

    leaves = knowled_coll.find({
        'hasChild': False
    }).sort([('cid', pymongo.ASCENDING)])
    # Data rows start right below the header row.
    for row, doc in enumerate(leaves, 1):
        sheet.write(row, 0, subjectName_dic[doc['cid']])
        sheet.write(row, 1, doc['cid'])
        writeKnowldeToSheet(doc, row, sheet, knowled_coll)
    workbook.save('zujuan_knowled.xls')
def localDataToRemote(self, remote_host='192.168.26.159', remote_port=27017):
    '''Copy downloaded page data from the local database to a remote one.

    Every local ``COLL.pg_url`` document with ``status`` 1 has its
    ``status`` and ``pgdata`` written onto the remote document with the
    same ``(kid, pg)``. The update is issued without upsert, so pages
    that do not exist remotely are not created.

    :param remote_host: host of the target MongoDB server. Defaults to
        the address that was previously hard-coded.
    :param remote_port: port of the target MongoDB server.
    '''
    mongo_local = MongoDB()
    mongo_remote = MongoDB(remote_host, remote_port)
    cursor = mongo_local.getCollection(COLL.pg_url).find({'status': 1})
    coll_remote = mongo_remote.getCollection(COLL.pg_url)
    for doc in cursor:
        coll_remote.update_one({
            'kid': doc['kid'],
            'pg': doc['pg']
        }, {"$set": {
            'status': doc['status'],
            'pgdata': doc['pgdata']
        }})
def downloadPaperIds(self,
                     subject,
                     xd,
                     paper_pg_url=URL.paper_pg_url,
                     baseUrl=URL.rootUrl):
    '''Crawl the basic paper records for each grade/province pair.

    Loads the grades, provinces and paper types whose ``cids`` contain
    the subject code, ensures a unique index on ``paper_id`` and then
    calls ``downloadPaperPgIds`` for every grade/province combination,
    always starting from page 1. A failing combination is logged and the
    loop continues.

    :param subject: subject descriptor; ``'code'``, ``'chid'`` and
        ``'name'`` are read.
    :param xd: school-stage value; the subject code is
        ``xd * 10 + subject['code']``.
    :param paper_pg_url: URL template filled with (chid, xd, page,
        grade id, province id).
    :param baseUrl: forwarded unchanged to ``downloadPaperPgIds``.
    '''
    subjectCode = xd * 10 + subject['code']
    mongo = MongoDB()

    def by_subject():
        # Fresh filter per query: documents listing this subject code.
        return {'cids': {'$all': [subjectCode]}}

    coll_grade = mongo.getCollection(COLL.type['grade'])
    coll_province = mongo.getCollection(COLL.type['province'])
    grades = coll_grade.find(by_subject())
    # Provinces are materialised because they are iterated once per grade.
    provinces = tuple(coll_province.find(by_subject()))

    # Paper-type name -> paper-type id for this subject.
    papertype_dic = {}
    papertype_docs = mongo.getCollection(
        COLL.type['papertype']).find(by_subject())
    for item in papertype_docs:
        papertype_dic[item['papertype_name']] = item['papertype_id']

    # Target collection for the paper records.
    coll_paper = mongo.getCollection(COLL.paper)
    coll_paper.create_index([('paper_id', pymongo.ASCENDING)], unique=True)

    pg = 1
    for doc_grade in grades:
        for doc_province in provinces:
            url = paper_pg_url % (subject['chid'], xd, pg,
                                  doc_grade['grade_id'],
                                  doc_province['province_id'])
            try:
                self.downloadPaperPgIds(url, subjectCode, papertype_dic,
                                        doc_grade, doc_province, coll_paper,
                                        baseUrl)
                logger.info(u'完成学科名称%s,学科编码%d,年级:%s,地区:%s,所有试卷Id等简单信息爬取',
                            subject['name'], subjectCode,
                            doc_grade['grade_name'],
                            doc_province['province_name'])
            except Exception as err:
                logger.exception(
                    u'下载试卷分页信息错误,学科名称%s,学科编码%d,爬取url:%s,年级:%s,地区:%s 异常信息:%s',
                    subject['name'], subjectCode, url,
                    doc_grade['grade_name'], doc_province['province_name'],
                    err.message)
total += 1 if len(params) >= 1000: try: pg.batchExecute(sql, params) pg.commit() count += len(params) params = [] except Exception as e: pg.rollback() print(e.message) print u'处理总数:%d,成功处理数量: %d' % (total, count) if params: try: pg.batchExecute(sql, params) pg.commit() count += len(params) except Exception as e: pg.rollback() print(e.message) print u'处理总数:%d,成功处理数量: %d' % (total, count) if __name__ == '__main__': mongon = MongoDB() pg = PostgreSql() try: exportKnowledToPg(mongon, pg) finally: pg.close() mongon.close()
def parseQuestionAllType(self):
    '''Parse all question types for every subject and stage.

    Calls ``parseQuesiontAllTypeByCid`` once per subject and per value in
    that subject's ``'xds'`` list, sharing a single ``MongoDB`` handle.
    '''
    mongo = MongoDB()
    for subject in subjects.values():
        for stage in subject['xds']:
            self.parseQuesiontAllTypeByCid(subject, stage, mongo)
def updteTotalKnowledToSubject(self, subject, xd):
    '''Update the ``total`` field for one given subject and stage.

    :param subject: subject descriptor forwarded to
        ``updteTotalKnowledByCid``.
    :param xd: school-stage value forwarded to ``updteTotalKnowledByCid``.
    '''
    mongo = MongoDB()
    knowled_coll = mongo.getCollection(COLL.knowled)
    self.updteTotalKnowledByCid(subject, xd, knowled_coll)
def paresToPg(self, cid):
    '''Convert one subject's crawled questions into PostgreSQL rows.

    Processes every ``COLL.question`` document of the subject whose
    ``status`` is 0 or 1, builds one parameter tuple per question and
    hands the tuples to ``batchInsertExecute`` in batches of 1000. The
    tuple order is: (new id, answer JSON, explanation, type code, type
    name, content, options JSON, knowledge points JSON, subject,
    difficulty, status, provider, original id).

    Per-question rules visible in this method:

    * a missing answer sets the Mongo ``status`` to ``None`` and skips
      the question;
    * a choice question (channel type '1' or '2') whose options are a
      plain string sets ``status`` to -1 and skips the question;
    * the explanation is only used when the Mongo ``status`` is 1.

    Any failure while converting a question is logged and the question
    is skipped. Both connections are closed when the method ends.

    :param cid: subject code selecting the questions and the type mapping.
    '''
    mg = MongoDB()
    pg = PostgreSql()
    try:
        coll = mg.getCollection(COLL.question)

        def mark_status(question_id, value):
            # Record the outcome on the Mongo document.
            coll.update_one(
                {'question_id': question_id},
                {'$set': {'status': value},
                 "$currentDate": {"lastModified": True}})

        # Question types: source type code -> local code and name.
        type_dic = {}
        for row in pg.getAll(
                'SELECT code,name,zujuan_code,zujuan_name FROM t_ques_type_zujuan_relation where subject_code = %s',
                (cid,)):
            type_dic[str(row[2])] = {'code': row[0], 'name': row[1]}

        insert_params = []
        self.total = 0
        self.count = 0
        for doc in coll.find({'cid': cid, 'status': {'$in': [0, 1]}}):
            # Reset so the error log never shows the previous question's id.
            old_id = None
            try:
                self.total += 1
                old_id = doc['question_id']  # original question id
                new_data = doc['new_data']
                qid = str(uuid.uuid1())  # new question id
                difficulty = new_data['difficult_index']
                subject = cid  # course id
                provider = '04'  # source
                status = 0

                # Knowledge points attached to the question.
                points = json.dumps([
                    {'code': kid_dic['kid'], 'name': kid_dic['ktitle']}
                    for kid_dic in doc['kids']
                ])

                # Question type.
                question_channel_type = new_data['question_channel_type']
                cate = type_dic[question_channel_type]['code']
                cate_name = type_dic[question_channel_type]['name']

                # Stem and answers. Sub-questions are appended to the stem.
                content = new_data['question_text']
                answer = []
                if new_data.get('list'):
                    for childe_content in new_data['list']:
                        content = '%s <br/> %s' % (
                            content, childe_content['question_text'])
                        if not childe_content['answer']:
                            mark_status(old_id, None)
                            raise Exception('题目答案异常,id:%s' % old_id)
                        answer.append('<img align="top" src="%s" />' %
                                      childe_content['answer'])
                        if childe_content['options']:
                            content = '%s <br/> %s' % (
                                content,
                                json.dumps(childe_content['options'],
                                           ensure_ascii=False))
                else:
                    if not new_data['answer']:
                        mark_status(old_id, None)
                        raise Exception('题目答案异常,id:%s' % old_id)
                    answer.append('<img align="top" src="%s" />' %
                                  new_data['answer'])
                answer = json.dumps(answer, ensure_ascii=False)

                # Options of choice questions.
                options = []
                if question_channel_type in ['1', '2']:
                    raw_options = new_data['options']
                    if isinstance(raw_options, unicode):
                        # Choice question without options: mark it -1 and
                        # skip it explicitly instead of failing on .items().
                        mark_status(old_id, -1)
                        raise Exception(
                            'choice question has no options, id:%s' % old_id)
                    # Dict order is arbitrary, so sort by option letter to
                    # get A, B, C, ... reliably.
                    ordered = sorted(raw_options.items(),
                                     key=lambda item: ord(item[0]))
                    options = [value for _, value in ordered]
                options = json.dumps(options, ensure_ascii=False)

                # Explanation is only exported when the Mongo status is 1.
                analyses = new_data['explanation'] if doc['status'] == 1 else None
                if analyses:
                    analyses = '<img align="top" src="%s" />' % analyses

                insert_params.append(
                    (qid, answer, analyses, cate, cate_name, content,
                     options, points, subject, difficulty, status, provider,
                     old_id))
            except Exception:
                logger.exception(u'处理分析组卷题目失败,题目id-%s', old_id)
            if len(insert_params) >= 1000:
                self.batchInsertExecute(pg, insert_params)
                insert_params = []
        # Flush whatever is left after the last full batch.
        self.batchInsertExecute(pg, insert_params)
    finally:
        mg.close()
        pg.close()