def _get_sections(self, url, list_rule, section_rule, book, decode="utf-8"):
    """Fetch the chapter list for a book and bulk-create its sections."""
    res = requests_get(url=url, decode=decode)
    parse_html = html_to_etree(res)
    section_p = parse_html.xpath(list_rule)
    need_add_obj = []
    order = 0
    for i in section_p:
        # Collect each chapter link matched under the list rule
        order += 1
        a = i.xpath(section_rule)
        if a:
            o = a[0]
            href = o.xpath("./@href")[0]
            sec_name = o.text
            need_add_obj.append(
                NovelSection(novel=book, name=sec_name, url=href, order=order))
    # Flush whatever accumulated once the loop is done
    if need_add_obj:
        NovelSection.objects.bulk_create(need_add_obj)
    # Mark the book's section grab as complete
    book.section_complete = True
    book.save()
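# `requests_get` and `html_to_etree` are project helpers assumed to be defined
# elsewhere. A minimal sketch of what they would look like, assuming `requests`
# and `lxml` underneath (signatures inferred from the call sites in this file;
# the `_sketch` names mark these as hypothetical stand-ins, not the real code):
import requests
from lxml import etree

def requests_get_sketch(url, decode="utf-8", j=False):
    """GET a page; return parsed JSON when j=True, else text decoded with `decode`."""
    resp = requests.get(url, timeout=10)
    if j:
        return resp.json()
    resp.encoding = decode
    return resp.text

def html_to_etree_sketch(html_raw):
    """Parse raw HTML into an lxml element tree for the XPath queries used here."""
    return etree.HTML(html_raw)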
def grab_real_info(data, service=None):
    """Grab real-time (GPS) bus information; returns parsed JSON."""
    if service is None:
        service = "http://bm.eyuyao.com/bus/mobile/getGpsInfoCs.php?{data}"
    url = service.format(data=data)
    res = requests_get(url=url, j=True)
    return res
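# A minimal usage sketch for grab_real_info(). The query string below is a
# hypothetical placeholder; in this project the real value is produced by
# grab_ajax_data() (next function).
def _example_grab_real_info():
    return grab_real_info("busid=102&fx=0")  # assumed k=v&k=v payload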
def grab_ajax_data(real_url):
    """Extract the AJAX request data (the query-string payload) from the page at real_url."""
    import re

    res = requests_get(url=real_url)
    # XPath alternative kept for reference:
    # parse_html = html_to_etree(html_raw=res)
    # ajax = parse_html.xpath('/html/head/script[3]')
    m1 = re.findall("data:(.*)", res)
    m2 = re.findall(r"\"([a-zA-Z0-9=&]+)", "".join(m1))
    if m2:
        return m2[0]
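# Sketch of what the two regexes in grab_ajax_data() match, on an assumed
# inline-script fragment of the kind found on the real-time page: the first
# pattern grabs everything after "data:" on its line, the second pulls the
# quoted k=v&k=v payload out of that remainder.
def _example_grab_ajax_data():
    import re
    page = '$.ajax({url: "getGpsInfoCs.php", data: "busid=102&fx=0"});'
    m1 = re.findall("data:(.*)", page)
    m2 = re.findall(r"\"([a-zA-Z0-9=&]+)", "".join(m1))
    return m2[0] if m2 else None  # -> "busid=102&fx=0"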
def list(self, request, *args, **kwargs):
    host = request.query_params.get("host")
    url = request.query_params.get("url")
    book_name = request.query_params.get("book_name")
    if not url:
        return ErrorHR("Missing parameter: url")
    if host:
        self.query_sql &= Q(host__contains=host)
    book = self.get_novel_entry(book_name=book_name)
    if not book:
        return ErrorHR("No such book")
    # Look up the grab rule for the matched host
    rule = GraspRule.objects.filter(self.query_sql).first()
    if not rule:
        return ErrorHR("No grab rule matches this host")
    list_rule = rule.list_rule
    section_rule_p = rule.section_rule_p
    section_rule = rule.section_rule
    decode = rule.decode
    res = requests_get(url=url, decode=decode)
    parse_html = html_to_etree(res)
    # Fetch the chapter list
    section_p = parse_html.xpath(list_rule)
    section_p_obj = None
    need_add_obj = []
    order = 0
    for i in section_p:
        # A node whose class matches section_rule_p is a parent (volume) heading
        if section_rule_p is not None and i.get("class") == section_rule_p:
            order = 0
            # Flush the pending batch before switching parents
            if need_add_obj:
                NovelSection.objects.bulk_create(need_add_obj)
                need_add_obj.clear()
            _name = i.text
            section_p_obj = self.create_section(novel=book, name=_name)
        else:
            # Otherwise it is a chapter entry
            order += 1
            a = i.xpath(section_rule)
            if a:
                o = a[0]
                href = o.xpath("./@href")[0]
                sec_name = o.text
                need_add_obj.append(
                    NovelSection(novel=book, name=sec_name, url=href,
                                 parent=section_p_obj, order=order))
    # Flush whatever is left once the loop is done
    if need_add_obj:
        NovelSection.objects.bulk_create(need_add_obj)
    return SuccessHR("Created successfully")
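# The loop above expects a flat node list in which volume headings and chapter
# entries are siblings. A runnable sketch of that shape, with assumed markup
# and assumed rule values (section_rule_p == "volume", section_rule == "./a"):
def _example_volume_layout():
    from lxml import etree
    html = """<dl>
      <dt class="volume">Book One</dt>
      <dd><a href="/1.html">Chapter 1</a></dd>
      <dd><a href="/2.html">Chapter 2</a></dd>
      <dt class="volume">Book Two</dt>
      <dd><a href="/3.html">Chapter 1</a></dd>
    </dl>"""
    for node in etree.HTML(html).xpath("//dl/dt | //dl/dd"):
        if node.get("class") == "volume":
            print("volume:", node.text)       # would reset order / switch parent
        else:
            a = node.xpath("./a")[0]
            print("  chapter:", a.text, a.xpath("./@href")[0])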
def _get_book(self, url, rule, decode="utf-8"):
    """Return the book at `url`, creating it from the page title if missing."""
    try:
        n = NovelEntry.objects.get(is_active=True, url=url)
    except NovelEntry.DoesNotExist:
        res = requests_get(url=url, decode=decode)
        parse_html = html_to_etree(res)
        book_name_p = parse_html.xpath(rule)
        book_name = ""
        if book_name_p:
            book_name = book_name_p[0].text
        # Create the book entry
        return NovelEntry.objects.create(name=book_name, url=url)
    else:
        return n
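# Usage sketch: look the book up by URL, creating it from the page title on a
# miss. The XPath below is a hypothetical example of a `rule` value.
#
#     book = self._get_book(url="http://example.com/book/1/",
#                           rule="//div[@id='info']/h1")
#
# Note this is a hand-rolled get-or-create; unlike
# NovelEntry.objects.get_or_create(), it only hits the network on a miss.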
def grab_bus_real_url(raw):
    """Resolve the real-time status URL for each bus line in `raw`."""
    host = 'http://bm.eyuyao.com/bus/mobile/'
    result = []
    for i in raw:
        pk = i.get("id")
        grab_url = i.get("url")
        # Scrape the real-time page link from the line's detail page
        res = requests_get(url=host + grab_url)
        parse_html = html_to_etree(html_raw=res)
        real_url = parse_html.xpath('/html/body/header/div[2]/a/@href')
        if real_url:
            result.append({"id": pk, "real_url": real_url[0]})
    return result
def grab_bus_real_info(pk, url):
    """Grab the station list from a line's real-time page."""
    result = []
    # Scrape the stations listed on the real-time page
    res = requests_get(url=url)
    parse_html = html_to_etree(html_raw=res)
    station_list = parse_html.xpath('//*[@id="touchBox"]/li')
    for i in station_list:
        station_id = i.xpath("./@id")
        if station_id:
            result.append({
                "id": pk,
                "station_id": station_id[0],
                "name": i.text
            })
    return result
def grab_base_bus():
    """Grab the base list of bus lines."""
    url = "http://bm.eyuyao.com/bus/mobile/lineList.php?k=pp&q="
    list_rule = "/html/body/div/ul[@class='list borderNone mbNone']/li/a"
    res = requests_get(url=url)
    parse_html = html_to_etree(html_raw=res)
    bus_list = parse_html.xpath(list_rule)
    bus = []
    # TODO: also grab lines running in the reverse direction
    for i in bus_list:
        href_list = i.xpath('./@href')
        if href_list:
            href = href_list[0]
            name = i.text
            bus.append({"name": name, "href": href})
    return bus
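# How the bus helpers above chain together, as a sketch. In the real project
# the list from grab_base_bus() is presumably persisted first, since
# grab_bus_real_url() expects items keyed "id"/"url" rather than the
# "name"/"href" pairs returned here; the key adaptation below is assumed.
def _example_bus_pipeline():
    lines = grab_base_bus()                        # [{"name", "href"}, ...]
    raw = [{"id": n, "url": line["href"]}          # adapt keys for the next step
           for n, line in enumerate(lines)]
    for item in grab_bus_real_url(raw):            # resolve real-time pages
        stations = grab_bus_real_info(item["id"], item["real_url"])
        data = grab_ajax_data(item["real_url"])    # pull the AJAX query string
        if data:
            print(stations, grab_real_info(data))  # live GPS info as JSON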
def parse_content(sections_url, content_rule, decode="utf-8"):
    """Extract the body text of a novel chapter."""
    # TODO: fetch chapter content with multiple threads
    # if isinstance(sections_url, str):
    #     sections_url = [sections_url]
    # assert not isinstance(sections_url, list)
    # Send the request and fetch the page
    res = requests_get(url=sections_url, decode=decode)
    # Parse the page
    parse_html = html_to_etree(res)
    # Select the nodes matched by the content rule
    content_tab = parse_html.xpath(content_rule)
    # Extract the body: each child's tail text holds a run of the chapter text
    if content_tab:
        return "".join([i.tail if i.tail else "\n\n" for i in content_tab[0]])
    # content = content_tab[0].xpath("string(.)")
    # log_common.out(msg=f"content: {content[:10]}")
    return None
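# One way to realise the multi-threaded-fetch TODO above (a sketch, not the
# project's implementation): fan parse_content() out over a thread pool, which
# suits this I/O-bound work. The function name and `workers` parameter are
# assumptions, not existing project API.
def parse_contents_threaded(section_urls, content_rule, decode="utf-8", workers=8):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Results come back in the same order as section_urls.
        return list(pool.map(
            lambda u: parse_content(u, content_rule, decode), section_urls))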