示例#1
0
 def updteTotalKnowled(self):
     '''Add the question-count ``total`` field to the knowledge points.

     Walks every entry of the module-level ``subjects`` mapping and, for each
     stage listed under its ``'xds'`` key, delegates the work to
     ``updteTotalKnowledByCid`` using one shared knowledge collection.
     '''
     mongo = MongoDB()
     knowled_coll = mongo.getCollection(COLL.knowled)
     for subject in subjects.values():
         for stage in subject['xds']:
             self.updteTotalKnowledByCid(subject, stage, knowled_coll)
示例#2
0
 def updateOrInsertLastKnowledPgScrapyUrl(self, subject, xd):
     '''Insert or refresh page-url documents so the pg collection is current.

     Every document yielded by ``generateLastKnowledPgScrapyUrlByCid`` is
     looked up in the pg collection by ``(kid, pg)``:

     * not found -> the document is inserted;
     * found with a different ``total`` -> ``total`` and ``rows`` are
       overwritten and the ``status`` field is removed;
     * found with the same ``total`` -> left untouched.

     The number of inserted and updated documents is logged at the end.

     :param subject: subject entry, passed through to the generator.
     :param xd: stage value, passed through to the generator.
     '''
     mongo = MongoDB()
     coll = mongo.getCollection(COLL.knowled)
     coll_pg = mongo.getCollection(COLL.pg_url)
     count_add = 0
     count_update = 0
     for kpg in self.generateLastKnowledPgScrapyUrlByCid(subject, xd, coll):
         selector = {'kid': kpg['kid'], 'pg': kpg['pg']}
         doc = coll_pg.find_one(selector)
         if not doc:
             coll_pg.insert_one(kpg)
             count_add += 1
         elif doc['total'] != kpg['total']:
             coll_pg.update_one(selector, {
                 '$set': {
                     'total': kpg['total'],
                     'rows': kpg['rows']
                 },
                 # ``$unset`` needs a {field: value} document; a bare set
                 # literal such as {'status'} cannot be BSON-encoded.
                 '$unset': {'status': ''}
             })
             count_update += 1
     logger.info(u'新增爬取pg数量:%d,修改爬取的pg数量:%d', count_add, count_update)
示例#3
0
 def mainSelection(self,c_url,category_url,course):
     '''Download the chapter tree of one course and store it in MongoDB.

     ``c_url`` and ``category_url`` are ``%`` templates that are filled with
     ``course[1]``. The course page is fetched and, for every textbook
     (``ek``) / grade (``bk``) pair found under ``JYE_BOOK_TREE_HOLDER``, one
     POST is sent to the category url. The returned
     ``JYE_POINT_TREE_HOLDER`` tree is converted by ``pareSelection`` and the
     resulting rows are bulk-inserted.

     An exception raised while converting or inserting one pair is logged
     and the loop continues with the next pair.

     :param c_url: ``%s`` template of the course main page.
     :param category_url: ``%s`` template of the chapter-tree endpoint.
     :param course: indexable course record; ``course[1]`` and
         ``course[-1]`` are read here.
     '''
     c_name = course[1]
     c_url = c_url % c_name
     category_url = category_url % c_name
     response = self.session.get(c_url)
     root_soup = BeautifulSoup(html_parser.unescape(response.content), "lxml")
     ul_soup = root_soup.find('ul',id='JYE_BOOK_TREE_HOLDER')
     mongo = MongoDB()
     coll = mongo.getCollection(COLL.SELECTION)
     for ek_li_soup in ul_soup.find_all('li',attrs={'ek':True}):
         # Textbook id and textbook name.
         ek_id = ek_li_soup['ek']
         ek_name = ek_li_soup['nm']
         for bk_li_soup in ek_li_soup.find_all('li',attrs={'bk':True}):
             # Grade id and grade name.
             bk_id = bk_li_soup['bk']
             bk_name = bk_li_soup['nm']
             data = {'a':bk_id,'q':'','f':0,'cb':'_setQ','r':random.random()}
             resp = self.session.post(category_url,data=data)
             resp_soup = BeautifulSoup(html_parser.unescape(resp.content), "lxml")
             pk_ul_soup = resp_soup.find('ul',id='JYE_POINT_TREE_HOLDER')
             try:
                 s_rows = self.pareSelection(pk_ul_soup,bk_id,bk_name,ek_id,ek_name,course,1)
                 coll.insert_many(s_rows)
                 logger.info(u'完成下载菁优章节,教材名称:%s,年级名称:%s,科目名称:%s-%s,科目主url:%s',ek_name,bk_name,c_name,course[-1],c_url)
             except Exception:
                 # Same argument order as the success log above so both
                 # messages render the subject as "<c_name>-<course[-1]>".
                 logger.exception(u'分析下载菁优章节异常,教材名称:%s,年级名称:%s,科目名称:%s-%s,科目主url:%s',ek_name,bk_name,c_name,course[-1],c_url)
示例#4
0
def exportQuestionTypeToExecel():
    '''Export the question types, grouped by subject, to an Excel file.

    Reads every document of the ``question_channel_type`` collection, groups
    its ``id``/``name`` under each subject code listed in ``cids`` (together
    with the subject name from ``subjectName_dic``) and writes one row per
    (subject, question type) pair to ``zujuan_question_type.xls``.
    '''
    mongon = MongoDB()
    coll = mongon.getCollection(COLL.question_type['question_channel_type'])
    question_type_dic = {}
    for doc in coll.find():
        for cid in doc['cids']:
            entry = {
                'id': doc['id'],
                'name': doc['name'],
                'cname': subjectName_dic[cid]
            }
            # setdefault replaces the deprecated dict.has_key() lookup.
            question_type_dic.setdefault(cid, []).append(entry)
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('题目类型', cell_overwrite_ok=True)
    headers = (u'类型编码', u'类型名称', u'学科名称', u'学科编码')
    for col, title in enumerate(headers):
        sheet.write(0, col, title)
    row = 1
    for key, values in question_type_dic.items():
        for value in values:
            cells = (value['id'], value['name'], value['cname'], key)
            for col, cell in enumerate(cells):
                sheet.write(row, col, cell)
            row += 1
    book.save('zujuan_question_type.xls')
示例#5
0
 def mainKnowled(self):
     '''Parse the knowledge points of every subject and store them.

     For each subject in ``subjects`` and each stage in ``xds`` whose ``xd``
     value is listed in the subject's ``'xds'``, the result of
     ``parseKnowled`` is inserted into the knowledge collection. A failure
     for one subject/stage pair is logged and does not stop the others.
     '''
     mongo = MongoDB()
     coll = mongo.getCollection(COLL.knowled)
     # Index creation is currently disabled:
     #coll.create_index([("id",pymongo.ASCENDING)],unique=True)
     for value in subjects.values():
         for xdValue in xds.values():
             if xdValue['xd'] not in value['xds']:
                 continue
             try:
                 coll.insert(self.parseKnowled(value, xdValue))
             except Exception as e:
                 # Log the exception itself: ``e.message`` is deprecated and
                 # empty for exceptions built with several arguments.
                 logger.exception(u'处理学科:%s,学段:%s;出现异常,异常信息:%s',
                                  value['name'], xdValue['name'], e)
示例#6
0
 def generateLastKnowledPgScrapyUrl(self):
     '''Generate the page urls to scrape for the last-level knowledge points.

     For every subject in ``subjects`` and every stage under its ``'xds'``
     key, the documents produced by ``generateLastKnowledPgScrapyUrlByCid``
     are bulk-inserted into the pg-url collection. A failure for one
     subject/stage pair is logged and does not stop the others.
     '''
     mongo = MongoDB()
     coll = mongo.getCollection(COLL.knowled)
     coll_pg = mongo.getCollection(COLL.pg_url)
     # Index creation is currently disabled:
     #coll_pg.create_index([("kid", pymongo.ASCENDING),("pg", pymongo.ASCENDING)], unique=True)
     for value in subjects.values():
         for xd in value['xds']:
             try:
                 coll_pg.insert_many(
                     self.generateLastKnowledPgScrapyUrlByCid(
                         value, xd, coll))
             except Exception as e:
                 # Log the exception itself: ``e.message`` is deprecated and
                 # empty for exceptions built with several arguments.
                 logger.exception(u'生成知识点url错误,学科:%s,学段:%d,错误信息:%s',
                                  value['name'], xd, e)
示例#7
0
 def generateQuestions(self, subjectCode):
     '''Build the question collection from the downloaded pages of a subject.

     Iterates the pg-url documents with ``cid == subjectCode`` and
     ``status == 1``. For each question of a page:

     * unknown ``question_id`` -> a new question document is inserted;
     * known question already linked to the page's ``kid`` -> counted as a
       repeat and skipped;
     * known question not yet linked -> the page's ``kid``/``ktitle`` is
       appended to its ``kids`` list.

     The three counters are logged once all pages are processed.

     :param subjectCode: subject code used to filter pages and stored as
         ``cid`` on new questions.
     '''
     mongo = MongoDB()
     coll_question = mongo.getCollection(COLL.question)
     # coll_question.create_index([("question_id", pymongo.ASCENDING)], unique=True)
     pages = mongo.getCollection(COLL.pg_url).find({
         'cid': subjectCode,
         'status': 1
     })
     added = 0
     updated = 0
     repeated = 0
     for page in pages:
         kid = page['kid']
         ktitle = page['ktitle']
         for question in page['pgdata']['data'][0]['questions']:
             selector = {'question_id': question['question_id']}
             existing = coll_question.find_one(selector)
             if not existing:
                 coll_question.insert_one({
                     'question_id': question['question_id'],
                     'cid': subjectCode,
                     'old_data': question,
                     'kids': [{
                         'kid': kid,
                         'ktitle': ktitle
                     }]
                 })
                 added += 1
                 continue
             linked = existing['kids']
             if any(entry['kid'] == kid for entry in linked):
                 repeated += 1
                 continue
             linked.append({'kid': kid, 'ktitle': ktitle})
             coll_question.update_one(selector, {'$set': {'kids': linked}})
             updated += 1
     logger.info(u'本次生成学科:%d,新增题目:%d,修改kids数量:%d,忽略掉的重复题目:%d', subjectCode,
                 added, updated, repeated)
示例#8
0
 def parseParperPropAll(self, url=URL.paper_url):
     '''Parse the common paper properties for every subject and stage.

     :param url: forwarded unchanged to ``parsePaperProp``; defaults to
         ``URL.paper_url``.
     '''
     mongo = MongoDB()
     # Unique-index creation is kept here but disabled:
     # for key, value in COLL.type.items():
     #     coll = mongo.getCollection(value)
     #     coll.create_index([(key+'_id',pymongo.ASCENDING)],unique=True)
     for subject in subjects.values():
         for stage in subject['xds']:
             self.parsePaperProp(subject, stage, mongo, url)
示例#9
0
 def downloadPaper(self, subject, xd):
     '''Download the detail of every not-yet-processed paper of a subject.

     Papers are selected by ``cid == xd * 10 + subject['code']`` and a
     missing ``status`` field. For each one the page at ``doc['url']`` is
     fetched, the ``MockDataTestPaper`` JSON array is extracted from the
     response body and stored as ``paper_detail`` with ``status`` 1.

     On failure the error is logged; if a response was received and it
     contains the site's error banner, the paper is marked with
     ``status`` -1 so it is not selected again.

     :param subject: subject entry; ``'code'`` and ``'name'`` are read.
     :param xd: stage number combined with the subject code.
     '''
     subjectCode = xd * 10 + subject['code']
     mongo = MongoDB()
     coll = mongo.getCollection(COLL.paper)
     pending = {
         'cid': subjectCode,
         'status': {
             '$exists': False
         }
     }
     for doc in coll.find(pending):
         # Reset per paper so the error handler never inspects an unbound
         # name or the response that belonged to a previous paper.
         response = None
         try:
             response = self.session.get(doc['url'])
             datastr = re.findall(
                 u'var\s*MockDataTestPaper\s*=\s*(\[{.+?}\])\s*;\s*',
                 response.content)[0]
             MockDataTestPaper = json.loads(datastr)
             coll.update_one({'paper_id': doc['paper_id']}, {
                 "$set": {
                     "paper_detail": MockDataTestPaper,
                     'status': 1
                 },
                 "$currentDate": {
                     "lastModified": True
                 }
             })
         except Exception:
             logger.exception(u'下载试卷失败,试卷Id:%s,试卷url:%s,学科名称:%s,学科编码:%d',
                              doc['paper_id'], doc['url'], subject['name'],
                              subjectCode)
             if response is None:
                 # The request itself failed; there is no body to inspect.
                 continue
             if re.findall(r'sorry!\s*系统出错了!', response.content):
                 coll.update_one({'paper_id': doc['paper_id']}, {
                     "$set": {
                         "error": u'sorry! 系统出错了!',
                         'status': -1
                     },
                     "$currentDate": {
                         "lastModified": True
                     }
                 })
示例#10
0
def exportKnowledToExecel():
    '''Export the knowledge-point hierarchy to ``zujuan_knowled.xls``.

    Writes one header row, then one row per knowledge document with
    ``hasChild == False`` (sorted by ``cid`` ascending). The first two
    columns hold the subject name and code; the remaining columns are filled
    by ``writeKnowldeToSheet``.
    '''
    mongon = MongoDB()
    coll = mongon.getCollection(COLL.knowled)
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('知识点关系', cell_overwrite_ok=True)
    titles = (
        u'学科名称',
        u'学科编码',
        u'一级知识点名称',
        u'一级知识点编码',
        u'二级知识点名称',
        u'二级知识点编码',
        u'三级知识点名称',
        u'三级知识点编码',
        u'四级知识点名称',
        u'四级知识点编码',
    )
    for column, title in enumerate(titles):
        sheet.write(0, column, title)
    leaves = coll.find({'hasChild': False}).sort([('cid', pymongo.ASCENDING)])
    # Data rows start right below the header row.
    for row, doc in enumerate(leaves, 1):
        sheet.write(row, 0, subjectName_dic[doc['cid']])
        sheet.write(row, 1, doc['cid'])
        writeKnowldeToSheet(doc, row, sheet, coll)
    book.save('zujuan_knowled.xls')
示例#11
0
 def localDataToRemote(self):
     '''Copy downloaded page data from the local database to the remote one.

     Every local pg-url document with ``status == 1`` has its ``status`` and
     ``pgdata`` written onto the remote document with the same
     ``(kid, pg)`` pair. The remote host and port are hard-coded.
     '''
     mongo_local = MongoDB()
     mongo_remote = MongoDB('192.168.26.159', 27017)
     finished_pages = mongo_local.getCollection(COLL.pg_url).find({'status': 1})
     coll_remote = mongo_remote.getCollection(COLL.pg_url)
     for page in finished_pages:
         selector = {'kid': page['kid'], 'pg': page['pg']}
         changes = {'status': page['status'], 'pgdata': page['pgdata']}
         coll_remote.update_one(selector, {"$set": changes})
示例#12
0
 def downloadPaperIds(self,
                      subject,
                      xd,
                      paper_pg_url=URL.paper_pg_url,
                      baseUrl=URL.rootUrl):
     '''Download the basic paper records of a subject, per grade and province.

     The subject code is ``xd * 10 + subject['code']``. For every grade and
     every province whose ``cids`` contain that code, a first-page url is
     built from ``paper_pg_url`` and handed to ``downloadPaperPgIds``. A
     failure for one grade/province pair is logged and does not stop the
     others.

     :param subject: subject entry; ``'code'``, ``'chid'`` and ``'name'``
         are read.
     :param xd: stage number combined with the subject code.
     :param paper_pg_url: ``%`` template taking
         (chid, xd, page, grade_id, province_id).
     :param baseUrl: forwarded unchanged to ``downloadPaperPgIds``.
     '''
     subjectCode = xd * 10 + subject['code']
     mongo = MongoDB()
     coll_grade = mongo.getCollection(COLL.type['grade'])
     coll_province = mongo.getCollection(COLL.type['province'])
     cursor_grade = coll_grade.find({'cids': {'$all': [subjectCode]}})
     cursor_province = coll_province.find({'cids': {'$all': [subjectCode]}})
     # Materialise the provinces: the cursor is walked once per grade.
     provinces = tuple(cursor_province)
     # Paper-type name -> paper-type id for this subject.
     papertype_dic = {
         doc_papertype['papertype_name']: doc_papertype['papertype_id']
         for doc_papertype in mongo.getCollection(COLL.type['papertype']).find(
             {'cids': {
                 '$all': [subjectCode]
             }})
     }
     # Collection that receives the paper records.
     coll_paper = mongo.getCollection(COLL.paper)
     coll_paper.create_index([('paper_id', pymongo.ASCENDING)], unique=True)
     pg = 1
     for doc_grade in cursor_grade:
         for doc_province in provinces:
             url = paper_pg_url % (subject['chid'], xd, pg,
                                   doc_grade['grade_id'],
                                   doc_province['province_id'])
             try:
                 self.downloadPaperPgIds(url, subjectCode, papertype_dic,
                                         doc_grade, doc_province,
                                         coll_paper, baseUrl)
                 logger.info(u'完成学科名称%s,学科编码%d,年级:%s,地区:%s,所有试卷Id等简单信息爬取',
                             subject['name'], subjectCode,
                             doc_grade['grade_name'],
                             doc_province['province_name'])
             except Exception as e:
                 # Log the exception itself: ``e.message`` is deprecated and
                 # empty for exceptions built with several arguments.
                 logger.exception(
                     u'下载试卷分页信息错误,学科名称%s,学科编码%d,爬取url:%s,年级:%s,地区:%s 异常信息:%s',
                     subject['name'], subjectCode, url,
                     doc_grade['grade_name'], doc_province['province_name'],
                     e)
示例#13
0
        total += 1
        if len(params) >= 1000:
            try:
                pg.batchExecute(sql, params)
                pg.commit()
                count += len(params)
                params = []
            except Exception as e:
                pg.rollback()
                print(e.message)
            print u'处理总数:%d,成功处理数量: %d' % (total, count)
    if params:
        try:
            pg.batchExecute(sql, params)
            pg.commit()
            count += len(params)
        except Exception as e:
            pg.rollback()
            print(e.message)
        print u'处理总数:%d,成功处理数量: %d' % (total, count)


if __name__ == '__main__':
    # Each connection is closed in its own ``finally`` so the MongoDB client
    # is released even when opening or closing the PostgreSQL connection
    # raises.
    mongon = MongoDB()
    try:
        pg = PostgreSql()
        try:
            exportKnowledToPg(mongon, pg)
        finally:
            pg.close()
    finally:
        mongon.close()
示例#14
0
 def parseQuestionAllType(self):
     '''Parse the question types of every subject and stage.

     Delegates to ``parseQuesiontAllTypeByCid`` once per subject/stage pair,
     sharing a single ``MongoDB`` instance.
     '''
     mongo = MongoDB()
     for subject in subjects.values():
         for stage in subject['xds']:
             self.parseQuesiontAllTypeByCid(subject, stage, mongo)
示例#15
0
 def updteTotalKnowledToSubject(self, subject, xd):
     '''Refresh the ``total`` field for one given subject and stage.

     :param subject: subject entry, forwarded to ``updteTotalKnowledByCid``.
     :param xd: stage value, forwarded to ``updteTotalKnowledByCid``.
     '''
     mongo = MongoDB()
     knowled_coll = mongo.getCollection(COLL.knowled)
     self.updteTotalKnowledByCid(subject, xd, knowled_coll)
示例#16
0
    def paresToPg(self,cid):
        '''Convert the MongoDB questions of one subject into PostgreSQL rows.

        Questions with ``cid == cid`` and ``status`` 0 or 1 are read from the
        question collection. Each one is turned into a parameter tuple
        (new uuid, answer, analysis, type code/name, content, options,
        knowledge points, subject, difficulty, status, provider, original id)
        and handed to ``batchInsertExecute`` in batches of 1000.

        A question that cannot be converted is logged and skipped:

        * a missing answer sets its MongoDB ``status`` to ``None``;
        * a choice question (type '1' or '2') whose options are a string
          instead of a mapping sets its MongoDB ``status`` to -1.

        ``self.total`` counts the questions read; ``self.count`` is reset to 0
        here. Both connections are closed when the method returns.

        :param cid: subject code used for both the MongoDB filter and the
            question-type lookup in PostgreSQL.
        '''
        mg = MongoDB()
        pg = PostgreSql()
        try:
            coll = mg.getCollection(COLL.question)
            # Question types: zujuan type code -> local code and name.
            type_dic = {}
            for row in pg.getAll('SELECT code,name,zujuan_code,zujuan_name FROM t_ques_type_zujuan_relation where subject_code = %s',(cid,)):
                type_dic[str(row[2])] = {'code': row[0], 'name': row[1]}
            insert_params = []
            self.total = 0
            self.count = 0
            for doc in coll.find({'cid':cid,'status':{'$in':[0,1]}}):
                # Bound before the try so the error log below never hits an
                # unbound name or reports the id of a previous question.
                old_id = None
                try:
                    self.total += 1
                    old_id = doc['question_id']  # original question id
                    qid = str(uuid.uuid1())  # new question id
                    new_data = doc['new_data']
                    difficulty = new_data['difficult_index']  # difficulty
                    subject = cid  # course id
                    provider = '04'  # source
                    status = 0  # status
                    # Last-level knowledge points of the question.
                    points = json.dumps([
                        {'code': kid_dic['kid'], 'name': kid_dic['ktitle']}
                        for kid_dic in doc['kids']
                    ])
                    # Question type.
                    question_channel_type = new_data['question_channel_type']
                    cate = type_dic[question_channel_type]['code']
                    cate_name = type_dic[question_channel_type]['name']

                    # Stem and answer; sub-questions are appended to the stem.
                    content = new_data['question_text']
                    answer = []
                    if new_data.get('list'):
                        for childe_content in new_data['list']:
                            content = '%s <br/> %s' %(content,childe_content['question_text'])
                            if not childe_content['answer']:
                                coll.update_one({'question_id': old_id},
                                                {'$set': {'status': None},
                                                 "$currentDate": {"lastModified": True}})
                                raise Exception('题目答案异常,id:%s' % old_id)
                            answer.append('<img align="top" src="%s" />' % childe_content['answer'])
                            if childe_content['options']:
                                content = '%s <br/> %s' % (content, json.dumps( childe_content['options'],ensure_ascii=False))
                    else:
                        if not new_data['answer']:
                            coll.update_one({'question_id': old_id},
                                            {'$set': {'status': None},
                                             "$currentDate": {"lastModified": True}})
                            raise Exception('题目答案异常,id:%s' % old_id)
                        answer.append('<img align="top" src="%s" />' % new_data['answer'])
                    answer = json.dumps(answer,ensure_ascii=False)

                    # Options of choice questions.
                    options = []
                    if question_channel_type in ['1','2']:
                        raw_options = new_data['options']
                        if isinstance(raw_options, unicode):
                            # A choice question without an option mapping is
                            # marked -1 and skipped explicitly instead of
                            # relying on a later AttributeError.
                            coll.update_one({'question_id': old_id},
                                            {'$set': {'status': -1}, "$currentDate": {"lastModified": True}})
                            raise Exception('choice question has no options, id:%s' % old_id)
                        # Order by option letter; dict iteration order is
                        # arbitrary, so positional inserts could misorder.
                        ordered = sorted(raw_options.items(),
                                         key=lambda item: ord(item[0]))
                        options = [value for _, value in ordered]
                    options = json.dumps(options,ensure_ascii=False)
                    # The analysis is only exported for status 1.
                    analyses = new_data['explanation'] if doc['status'] == 1 else None
                    if analyses:
                        # Same <img> markup as the answer (no space after '<').
                        analyses = '<img align="top" src="%s" />' % analyses
                    insert_params.append((qid,answer,analyses,cate,cate_name,content,options,points,subject,difficulty,status,provider,old_id))
                except Exception:
                    logger.exception(u'处理分析组卷题目失败,题目id-%s',old_id)
                if len(insert_params)>= 1000:
                    self.batchInsertExecute(pg,insert_params)
                    insert_params = []
            self.batchInsertExecute(pg,insert_params)
        finally:
            mg.close()
            pg.close()