def add_chapter_to_db(self, fieldset_id, detail_page_url, args):
    """Persist one question-bank seed record for later detail crawling."""
    item_bank_init = {
        'fieldset_id': fieldset_id,
        'detail_page_url': detail_page_url,
        'ques_url': args.get('url'),
        'from_code': self.from_code,
        'item_style_code': args.get('item_style_code'),
        'library_id': args.get('library_id'),
        # Note: 'chaper_id' (sic) matches the column name on the model.
        'chaper_id': self.chapter_id,
        'is_finish': 0,
    }
    mutex.acquire()
    try:
        self.db_connect.add(ItemBankInit(**item_bank_init))
    finally:
        mutex.release()
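# Hedged usage sketch (illustrative, not from the source): while paging
# through search results, each question fieldset found on a listing page
# would be seeded roughly like this, where `args` is one params dict from
# the map yielded by get_item_bank_init_url and the concrete values are
# hypothetical:
#
#     self.add_chapter_to_db(
#         fieldset_id='fieldset0',
#         detail_page_url='http://www.jyeoo.com/math/ques/detail/xxxx',
#         args=args,
#     )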
def get_item_bank_init_url(self, chapter_id, subject_code):
    """
    Build the question-bank search URLs used to crawl data.
    :return: generator yielding the accumulated {chapter_id: params} map
    """
    re_dict = dict()
    query = self.db_session.query(LibraryChapter).filter(
        LibraryChapter.id == chapter_id)
    url_str = 'http://www.jyeoo.com/{subject}/ques/search?f=0&q={pk}&so={from_code}'
    last_data = None
    # Iterate over the matched chapters
    for item in query:
        if last_data:
            # Mark the previous chapter finished once it has seed records.
            is_ok_count = self.db_session.query(ItemBankInit).filter(
                ItemBankInit.chaper_id == last_data.id).count()
            if is_ok_count > 1:
                last_data.is_finish = 1
                mutex.acquire()
                try:
                    self.db_session.commit()
                finally:
                    mutex.release()
        last_data = item
        temp_dict = dict()
        # Subject
        temp_dict['subject'] = subject_code
        # Textbook ID
        temp_dict['library_id'] = item.library_id
        # Chapter ID
        temp_dict['chaper_id'] = item.id
        # Chapter key used directly in the search URL
        temp_dict['pk'] = item.pk
        # Question type
        temp_dict['item_style_code'] = ''
        # Question category
        temp_dict['field_code'] = ''
        # Source
        temp_dict['from_code'] = self.from_code
        # format(**) ignores the extra keys; only subject/pk/from_code are used.
        temp_dict['url'] = url_str.format(**temp_dict)
        re_dict[item.id] = temp_dict
        # Re-yields the growing map on every iteration.
        yield re_dict
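# Hedged consumption sketch (assumption: callers iterate the generator and
# read the accumulated {chapter_id: params} map it yields; 'math' is a
# hypothetical subject code):
#
#     for url_map in self.get_item_bank_init_url(chapter_id, 'math'):
#         for chap_id, params in url_map.items():
#             print(params['url'])  # the search URL built from url_str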
def library_chapter(self):
    """
    Crawl the chapter tree for the selected textbook.
    :return:
    """
    start_url = self.get_chapter_url()
    try:
        self.driver.get(start_url)
        WebDriverWait(self.driver, 30).until(
            ec.visibility_of_element_located(
                (By.XPATH, '//div[@class="tree-head"]/span[@id="spanEdition"]')))
    except TimeoutException as e:
        self.sinOut.emit('Timed out!!! %s' % str(e))
        self.driver.get_screenshot_as_file('./error.png')
        return
    teaching = self.driver.find_element(
        By.XPATH, '//div[@class="tree-head"]/span[@id="spanEdition"]').text
    level_name = self.driver.find_element(
        By.XPATH, '//div[@class="tree-head"]/span[@id="spanGrade"]').text
    # Strip both the full-width and ASCII colon.
    teaching = teaching.replace('：', '').replace(':', '')
    self.sinOut.emit('Crawling chapters!')
    if self.teaching_name != teaching or self.level_name != level_name:
        self.message_box.emit('Warning', 'No data!')
        return
    et = etree.HTML(self.driver.page_source)
    library_id = self.teaching
    sub_obj = et.xpath('//ul[@id="JYE_POINT_TREE_HOLDER"]/li')
    chapters_list = list()
    total = len(sub_obj)
    current_count = 0
    for item in sub_obj:
        lc_item = dict()
        lc_item['id'] = str(uuid.uuid1())
        pk = item.attrib.get('pk')
        nm = item.attrib.get('nm')
        child = utils.recursive_get_li(lc_item['id'], library_id, item)
        lc_item['pk'] = pk
        lc_item['parent_id'] = ''
        lc_item['library_id'] = library_id
        lc_item['name'] = nm
        lc_item['child'] = child
        chapters_list.append(lc_item)
        current_count += 1
        self.crawler_chapter_progress.emit(current_count, total)
    self.sinOut.emit('Parsing and saving to the database')
    if chapters_list:
        mutex.acquire()
        chapters = self.db_connect.session.query(
            LibraryChapter.name, LibraryChapter.id, LibraryChapter.pk).filter(
                LibraryChapter.library_id == library_id)
        new_list = utils.split_list(chapters_list)
        if chapters.count() > 0:
            # Chapters already exist: remap the new rows onto the stored IDs.
            relational_dict = dict()
            for item in chapters:
                # new_list = self.update_chapter_pk_id(item.id, item.pk, new_list)
                for item2 in new_list:
                    if item2.get('pk') == item.pk:
                        relational_dict[item2['id']] = item.id
                        item2['id'] = item.id
                        break
            for item3 in new_list:
                if item3.get('parent_id') and relational_dict.get(
                        item3['parent_id']):
                    item3['parent_id'] = relational_dict.get(
                        item3['parent_id'])
            chapters.delete()
            self.db_connect.session.commit()
        # Release outside the inner if so the lock is freed even when
        # no existing chapters were found.
        mutex.release()
        # Insert the new rows
        for item in new_list:
            mutex.acquire()
            if 'child' in item:
                del item['child']
            self.db_connect.add(LibraryChapter(**item))
            mutex.release()
    self.sinOut.emit('Chapter crawl finished; reload to view')
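# For reference, the chapter tree this method parses is assumed to look
# roughly like the following (a hedged sketch of the page markup, not
# captured from the site):
#
#     <ul id="JYE_POINT_TREE_HOLDER">
#         <li pk="..." nm="Chapter 1 ...">
#             <ul><li pk="..." nm="Section 1.1 ...">...</li></ul>
#         </li>
#     </ul>
#
# utils.recursive_get_li is assumed to walk the nested <ul>/<li> levels and
# return the flattened children with parent_id links back to lc_item['id'].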
def item_bank_details(self):
    """
    Crawl each question's detail page.
    :return:
    """
    current_count = 0
    if not self.chapter_id:
        self.sinOut.emit('Error! Failed to get the chapter; none may be selected!')
        return
    start_urls = self.get_details_url()
    for item in start_urls:
        current_count += 1
        bank_item = dict()
        self.sinOut.emit('Fetching detail page %s' % item.get('detail_page_url'))
        self.driver.get(item.get('detail_page_url'))
        et = etree.HTML(self.driver.page_source)
        year_html = et.xpath('.//div[@class="pt1"]/a/text()')
        if year_html:
            year_area = utils.txt_wrap_by('(', ')', year_html[0])
            if not year_area:
                # Fall back to full-width parentheses (assumption: the second
                # lookup originally used '（' and '）').
                year_area = utils.txt_wrap_by('（', '）', year_html[0])
            if year_area:
                bank_item['year_code'] = year_area.split('•')[0]
                bank_item['year_area'] = year_area
        else:
            bank_item['year_area'] = ''
        bank_item['used_times'] = ''
        bank_item['exam_times'] = ''
        fieldset_xpath = '//div[@id="{fieldset_id}"]'.format(
            fieldset_id=item.get('fieldset_id'))
        detail_data = et.xpath(fieldset_xpath)
        # Question body
        bank_item['context'] = str(
            detail_data[0].xpath('.//div[@class="pt1"]/text()'))
        # Note: 'anwser' (sic) matches the model's column name.
        bank_item['anwser'] = self.driver.page_source
        fieldtip_left = detail_data[0].xpath(
            './/div[@class="fieldtip-left"]')
        record_time = fieldtip_left[0].xpath('.//span[1]/text()')
        used_times = fieldtip_left[0].xpath('.//span[2]/text()')
        exam_times = fieldtip_left[0].xpath('.//span[3]/text()')
        difficult_code = fieldtip_left[0].xpath('.//span[4]/text()')
        # Each label reads 'name: value'; normalize the full-width colon,
        # then keep the value part.
        if record_time:
            bank_item['record_time'] = record_time[0].replace(
                '：', ':').split(':')[1]
        if used_times:
            bank_item['used_times'] = used_times[0].replace(
                '：', ':').split(':')[1]
        if exam_times:
            bank_item['exam_times'] = exam_times[0].replace(
                '：', ':').split(':')[1]
        if difficult_code:
            bank_item['difficult_code'] = difficult_code[0].replace(
                '：', ':').split(':')[1]
        bank_item['from_code'] = self.from_code
        bank_item['url'] = item.get('detail_page_url')
        bank_item['chaper_id'] = item.get('chaper_id')
        bank_item['library_id'] = item.get('library_id')
        bank_item['item_style_code'] = item.get('item_style_code')
        point_list = self.get_pointcard(item.get('fieldset_id'),
                                        bank_item, et)
        bank_item['points'] = point_list
        # Save to the database
        mutex.acquire()
        try:
            self.item_bank_deails_and_point_db(bank_item)
        finally:
            mutex.release()
        # Update the crawl progress counter
        self.details_progress.emit(current_count, int(self.crawl_maximum))
    return
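# utils.txt_wrap_by is assumed to return the text between two delimiters.
# A minimal sketch consistent with its use above (hypothetical, since the
# helper's source is not shown here):
#
#     def txt_wrap_by(start_str, end_str, html):
#         start = html.find(start_str)
#         if start >= 0:
#             start += len(start_str)
#             end = html.find(end_str, start)
#             if end >= 0:
#                 return html[start:end].strip()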