def refresh_level(self):
    """
    Reload the school-level combo box from the database.

    Clears ``comboBox_level`` and refills it with one entry per
    ``ItemStyle.level_name`` group, ordered by ``level_code`` descending.
    The level name is the visible text, the level code the item data.

    :return: None
    """
    self.comboBox_level.clear()
    # A Query is lazy and only hits the database when it is consumed, so the
    # rows are materialised with .all() while the lock is held. ``with``
    # guarantees the lock is released even if the query raises.
    with mutex:
        levels = self.db_connect.session.query(
            ItemStyle.level_name, ItemStyle.level_code).group_by(
            ItemStyle.level_name).order_by(
            ItemStyle.level_code.desc()).all()
    for level_name, level_code in levels:
        self.comboBox_level.addItem(level_name, level_code)
def add_chapter_to_db(self, fieldset_id, detail_page_url, args):
    """
    Store one not-yet-finished ``ItemBankInit`` record.

    :param fieldset_id: value stored in the ``fieldset_id`` field
    :param detail_page_url: value stored in the ``detail_page_url`` field
    :param args: mapping read with ``.get`` for the keys ``url``,
        ``item_style_code`` and ``library_id``
    :return: None
    """
    item_bank_init = {
        'fieldset_id': fieldset_id,
        'detail_page_url': detail_page_url,
        'ques_url': args.get('url'),
        'from_code': self.from_code,
        'item_style_code': args.get('item_style_code'),
        'library_id': args.get('library_id'),
        # 'chaper_id' (sic) is the keyword the model is constructed with.
        'chaper_id': self.chapter_id,
        'is_finish': 0,
    }
    # ``with`` releases the lock even if building or adding the row raises.
    with mutex:
        self.db_connect.add(ItemBankInit(**item_bank_init))
def refresh_grade(self):
    """
    Reload the grade combo box for the currently selected level.

    Clears ``comboBox_grade`` and adds one entry per ``LevelGradeRef`` row
    whose ``level_code`` equals the current data of ``comboBox_level``.
    The grade name is the visible text, the grade code the item data.

    :return: None
    """
    self.comboBox_grade.clear()
    level_code = self.comboBox_level.currentData()
    # Run the (otherwise lazy) query while the lock is held; ``with`` makes
    # sure the lock is released even if the query raises.
    with mutex:
        grades = self.db_connect.session.query(
            LevelGradeRef.grade_name, LevelGradeRef.grade_code).filter(
            LevelGradeRef.level_code == level_code).all()
    for grade_name, grade_code in grades:
        self.comboBox_grade.addItem(grade_name, grade_code)
def refresh_subject(self):
    """
    Reload the subject combo box for the currently selected level.

    Clears ``comboBox_subject`` and adds one entry per ``LevelSubjectsRef``
    row matching the current data of ``comboBox_level``. The item data is
    the subject code with the level code appended, except when the level
    code equals 1, in which case the bare subject code is used.

    :return: None
    """
    self.comboBox_subject.clear()
    level_data = self.comboBox_level.currentData()
    # Run the (otherwise lazy) query while the lock is held; ``with`` makes
    # sure the lock is released even if the query raises.
    with mutex:
        subjects = self.db_connect.session.query(
            LevelSubjectsRef.subject_name,
            LevelSubjectsRef.subject_code).filter(
            LevelSubjectsRef.level_code == level_data).all()
    if not subjects:
        return
    # The suffix is the same for every row, so compute it once. It is only
    # evaluated when there are rows, exactly as before.
    suffix = '' if int(level_data) == 1 else level_data
    for subject_name, subject_code in subjects:
        self.comboBox_subject.addItem(subject_name, subject_code + suffix)
def refresh_from(self):
    """
    Reload the source combo box for the currently selected level.

    Clears ``comboBox_from``, adds the catch-all entry (empty item data)
    first, then one entry per ``ItemFrom`` row whose ``level_code`` equals
    the current data of ``comboBox_level``.

    :return: None
    """
    self.comboBox_from.clear()
    # Reading the widget does not need the DB lock, so do it beforehand.
    level_code = self.comboBox_level.currentData()
    # Run the (otherwise lazy) query while the lock is held; ``with`` makes
    # sure the lock is released even if the query raises.
    with mutex:
        sources = self.db_connect.session.query(
            ItemFrom.from_name, ItemFrom.from_code).filter(
            ItemFrom.level_code == level_code).all()
    # The default entry means "all sources".
    self.comboBox_from.addItem('全部', '')
    for from_name, from_code in sources:
        self.comboBox_from.addItem(from_name, from_code)
def refresh_teaching(self):
    """
    Reload the textbook combo box for the selected grade and subject.

    Clears ``comboBox_teaching`` and adds one entry per ``LibraryEntry`` row
    matching the current grade and subject. A trailing digit on the subject
    item data is removed before it is used in the filter.

    :return: None
    """
    self.comboBox_teaching.clear()
    grade = self.comboBox_grade.currentData()
    subject = self.comboBox_subject.currentData()
    # Guard against a missing/empty subject (e.g. nothing selected yet):
    # indexing it unconditionally would raise TypeError/IndexError.
    if subject and subject[-1].isdigit():
        subject = subject[:-1]
    # Run the (otherwise lazy) query while the lock is held; ``with`` makes
    # sure the lock is released even if the query raises.
    with mutex:
        teachings = self.db_connect.session.query(
            LibraryEntry.style_name, LibraryEntry.id).filter(
            LibraryEntry.grade_code == grade,
            LibraryEntry.subject_code == subject).all()
    for style_name, library_id in teachings:
        self.comboBox_teaching.addItem(style_name, library_id)
def refresh_chapter(self):
    """
    Reload the chapter combo box and chapter tree for the selected textbook.

    Loads the ``LibraryChapter`` rows of the textbook currently selected in
    ``comboBox_teaching``. Every row is added to ``comboBox_chapter``; rows
    whose ``parent_id`` is the empty string become top-level tree items and
    all other rows are attached to their parent's tree item. Each tree item
    carries the name, id and pk in columns 0, 1 and 2. If a row refers to a
    parent that was not loaded, the user is asked whether to crawl the
    chapters again and the tree building stops. Finally the first top-level
    item is made current.

    :return: None
    """
    self.comboBox_chapter.clear()
    self.treeWidget_chapter.clear()
    self.treeWidget_chapter.setColumnCount(1)
    library_id = self.comboBox_teaching.currentData()

    # Fetch the rows while holding the lock, then release it before touching
    # any widget so UI work never runs with the DB lock held. ``with`` also
    # releases the lock if the query raises.
    with mutex:
        chapters = self.db_connect.session.query(
            LibraryChapter.name, LibraryChapter.id,
            LibraryChapter.parent_id, LibraryChapter.pk).filter(
            LibraryChapter.library_id == library_id).all()

    tree_dict = dict()
    for name, chapter_id, parent_id, pk in chapters:
        self.comboBox_chapter.addItem(name, chapter_id)
        if parent_id == '':
            # Top-level chapter: created directly under the tree widget.
            tree_item = QTreeWidgetItem(self.treeWidget_chapter)
        else:
            # Child chapter: attached to its parent in the second pass.
            tree_item = QTreeWidgetItem()
        tree_item.setText(0, name)
        tree_item.setText(1, chapter_id)
        tree_item.setText(2, pk)
        tree_dict[chapter_id] = {'item': tree_item, 'parent_id': parent_id}

    for value in tree_dict.values():
        parent_id = value['parent_id']
        if not parent_id:
            continue
        parent = tree_dict.get(parent_id)
        if not parent:
            result = self.message_box_choice('章节获取错误', '请重新获取此章节')
            if result == QMessageBox.Ok:
                # Crawl the chapters again.
                self.start_chapter()
            # Stop in either case: the parent is missing, so looking it up
            # below would raise KeyError.
            break
        parent['item'].addChild(value['item'])

    # Select the first top-level item by default.
    first_item = self.treeWidget_chapter.topLevelItem(0)
    self.treeWidget_chapter.setCurrentItem(first_item)
def get_item_bank_init_url(self, chapter_id, subject_code):
    """
    Generate the question-bank search URLs used for crawling.

    Iterates the ``LibraryChapter`` rows whose id equals ``chapter_id``. For
    each row an info dict (including the formatted search ``url``) is stored
    under the row id in one shared dict, and that shared dict is yielded.
    When the generator is resumed for the following row, the previous row is
    flagged ``is_finish = 1`` if more than one ``ItemBankInit`` record
    references it, and the change is committed.

    :param chapter_id: id of the chapter to build URLs for
    :param subject_code: value substituted for ``{subject}`` in the URL
    :return: generator yielding the accumulated ``{chapter id: info}`` dict
    """
    re_dict = dict()
    query = self.db_session.query(LibraryChapter).filter(
        LibraryChapter.id == chapter_id)
    url_str = 'http://www.jyeoo.com/{subject}/ques/search?f=0&q={pk}&so={from_code}'
    last_data = None
    # Walk the chapters.
    # NOTE(review): the completion check only runs at the start of the NEXT
    # iteration, so the last row yielded is never flagged - confirm intended.
    for item in query:
        if last_data:
            is_ok_count = self.db_session.query(ItemBankInit).filter(
                ItemBankInit.chaper_id == last_data.id).count()
            if is_ok_count > 1:
                last_data.is_finish = 1
                # ``with`` releases the lock even if the commit raises.
                with mutex:
                    self.db_session.commit()
        last_data = item
        temp_dict = {
            # Subject.
            'subject': subject_code,
            # Textbook id.
            'library_id': item.library_id,
            # Chapter id.
            'chaper_id': item.id,
            # Chapter pk used in the search query.
            'pk': item.pk,
            # Question style.
            'item_style_code': '',
            # Question category.
            'field_code': '',
            # Source.
            'from_code': self.from_code,
        }
        temp_dict['url'] = url_str.format(**temp_dict)
        re_dict[item.id] = temp_dict
        yield re_dict
def library_chapter(self):
    """
    Crawl the chapter tree of the selected textbook and store it.

    Opens the chapter page, waits up to 30 seconds for the edition label to
    become visible, and aborts with a screenshot on timeout. If the edition
    or grade shown on the page differs from ``self.teaching_name`` /
    ``self.level_name`` a warning is emitted and nothing is stored.
    Otherwise every top-level ``<li>`` of the chapter tree is converted to a
    dict (children via ``utils.recursive_get_li``), the result is passed
    through ``utils.split_list`` and written to ``LibraryChapter``. When
    rows already exist for the textbook, new entries with a matching ``pk``
    reuse the existing ids (parent references are remapped accordingly) and
    the old rows are deleted before the new ones are inserted.

    :return: None
    """
    edition_xpath = '//div[@class="tree-head"]/span[@id="spanEdition"]'
    grade_xpath = '//div[@class="tree-head"]/span[@id="spanGrade"]'

    start_url = self.get_chapter_url()
    try:
        self.driver.get(start_url)
        WebDriverWait(self.driver, 30).until(
            ec.visibility_of_element_located((By.XPATH, edition_xpath)))
    except TimeoutException as e:
        self.sinOut.emit('超时!!! %s' % str(e))
        self.driver.get_screenshot_as_file('./error.png')
        return

    # find_element(By.XPATH, ...) works on old and new Selenium releases,
    # whereas find_element_by_xpath was removed in Selenium 4.
    teaching = self.driver.find_element(By.XPATH, edition_xpath).text
    level_name = self.driver.find_element(By.XPATH, grade_xpath).text
    # Strip both the full-width (U+FF1A) and the ASCII colon. The two
    # replace() calls were identical, so only one variant was removed.
    teaching = teaching.replace('\uff1a', '').replace(':', '')
    self.sinOut.emit('进行爬取章节!')
    if self.teaching_name != teaching or self.level_name != level_name:
        self.message_box.emit('警告', "没有数据!")
        return

    et = etree.HTML(self.driver.page_source)
    library_id = self.teaching
    sub_obj = et.xpath('//ul[@id="JYE_POINT_TREE_HOLDER"]/li')
    chapters_list = list()
    total = len(sub_obj)
    for current_count, node in enumerate(sub_obj, 1):
        new_id = str(uuid.uuid1())
        chapters_list.append({
            'id': new_id,
            'pk': node.attrib.get('pk'),
            'parent_id': '',
            'library_id': library_id,
            'name': node.attrib.get('nm'),
            'child': utils.recursive_get_li(new_id, library_id, node),
        })
        self.crawler_chapter_progress.emit(current_count, total)

    self.sinOut.emit('正在解析入库')
    if chapters_list:
        # ``with`` releases the lock even if a DB call raises; previously an
        # exception here left the mutex locked for good.
        with mutex:
            chapters = self.db_connect.session.query(
                LibraryChapter.name, LibraryChapter.id,
                LibraryChapter.pk).filter(
                LibraryChapter.library_id == library_id)
            new_list = utils.split_list(chapters_list)
            if chapters.count() > 0:
                # Chapters already exist: keep their ids for entries with
                # the same pk. Maps freshly generated id -> existing id.
                relational_dict = dict()
                for existing in chapters:
                    for candidate in new_list:
                        if candidate.get('pk') == existing.pk:
                            relational_dict[candidate['id']] = existing.id
                            candidate['id'] = existing.id
                            break
                # Point children at the reused ids of their parents.
                for entry in new_list:
                    parent_id = entry.get('parent_id')
                    if parent_id and relational_dict.get(parent_id):
                        entry['parent_id'] = relational_dict.get(parent_id)
                chapters.delete()
                self.db_connect.session.commit()
        # Insert the new values, taking the lock once per row.
        for entry in new_list:
            with mutex:
                # 'child' is removed before the row is built, as before.
                if 'child' in entry:
                    del entry['child']
                self.db_connect.add(LibraryChapter(**entry))
    self.sinOut.emit('章节爬取完成,重新加载查看')
def item_bank_details(self):
    """
    Crawl every pending question detail page and store the result.

    Emits an error message and does nothing when ``self.chapter_id`` is not
    set. Otherwise each entry from ``self.get_details_url()`` is opened in
    the browser, parsed into a dict and handed to
    ``self.item_bank_deails_and_point_db`` under the DB lock. A page that
    does not contain the expected question block is reported and skipped.
    Progress is emitted after every entry.

    :return: None
    """
    if not self.chapter_id:
        self.sinOut.emit('错误!章节获取失败,可能未选择章节!')
        return

    current_count = 0
    for item in self.get_details_url():
        current_count += 1
        detail_url = item.get('detail_page_url')
        self.sinOut.emit('正在获取详情页 %s' % detail_url)
        self.driver.get(detail_url)
        et = etree.HTML(self.driver.page_source)

        bank_item = self._build_bank_item(item, et)
        if bank_item is None:
            # The question block is missing: report and skip this page
            # instead of aborting the whole run with IndexError.
            self.sinOut.emit('详情页解析失败,已跳过 %s' % detail_url)
        else:
            # Store the record; ``with`` releases the lock even if the
            # write raises.
            with mutex:
                self.item_bank_deails_and_point_db(bank_item)
        # Update the crawl progress.
        self.details_progress.emit(current_count, int(self.crawl_maximum))


def _build_bank_item(self, item, et):
    """
    Build the record dict for one detail page.

    :param item: entry from ``get_details_url()``; read with ``.get``
    :param et: parsed HTML tree of the detail page
    :return: the record dict, or None when the page has no block with the
        entry's ``fieldset_id``
    """
    # 'year_area' now always defaults to '' when nothing is extracted.
    bank_item = {'year_area': '', 'used_times': '', 'exam_times': ''}

    year_html = et.xpath('.//div[@class="pt1"]/a/text()')
    if year_html:
        # Try full-width parentheses (U+FF08/U+FF09) first, then ASCII ones.
        # Both attempts previously used identical arguments, so the fallback
        # could never find anything new.
        year_area = (utils.txt_wrap_by('\uff08', '\uff09', year_html[0])
                     or utils.txt_wrap_by('(', ')', year_html[0]))
        if year_area:
            bank_item['year_code'] = year_area.split('•')[0]
            bank_item['year_area'] = year_area

    fieldset_xpath = '//div[@id="{fieldset_id}"]'.format(
        fieldset_id=item.get('fieldset_id'))
    detail_data = et.xpath(fieldset_xpath)
    if not detail_data:
        return None
    detail = detail_data[0]

    # Question text, stored as the string form of the text-node list.
    bank_item['context'] = str(detail.xpath('.//div[@class="pt1"]/text()'))
    # 'anwser' (sic) receives the full page source.
    bank_item['anwser'] = self.driver.page_source

    fieldtip_left = detail.xpath('.//div[@class="fieldtip-left"]')
    if fieldtip_left:
        # Spans 1-4 hold "label:value" pairs; each one is optional.
        span_fields = ('record_time', 'used_times', 'exam_times',
                       'difficult_code')
        for position, key in enumerate(span_fields, 1):
            texts = fieldtip_left[0].xpath('.//span[%d]/text()' % position)
            if not texts:
                continue
            value = self._value_after_colon(texts[0])
            if value is not None:
                bank_item[key] = value

    bank_item['from_code'] = self.from_code
    bank_item['url'] = item.get('detail_page_url')
    bank_item['chaper_id'] = item.get('chaper_id')
    bank_item['library_id'] = item.get('library_id')
    bank_item['item_style_code'] = item.get('item_style_code')
    bank_item['points'] = self.get_pointcard(
        item.get('fieldset_id'), bank_item, et)
    return bank_item


def _value_after_colon(self, text):
    """
    Return the segment after the first colon of a "label:value" text.

    Full-width colons (U+FF1A) are normalised to ASCII first; the former
    ``replace(":", ":")`` was a no-op.

    :param text: raw span text
    :return: the text between the first and second colon, or None when the
        text contains no colon
    """
    parts = text.replace('\uff1a', ':').split(':')
    return parts[1] if len(parts) > 1 else None