def download_list_page_html(self, url, logger):
    """Download one list-page HTML file and rewrite its pagination links.

    Rewrites the first/prev/next/last ("fpage"/"upage"/"npage"/"epage")
    anchor hrefs so that the pages stored in the same local directory can
    be browsed offline, then rewrites the brand/code data links and writes
    the result to disk as ``pageNNNN.html``.

    :param url: list-page URL; the value of its last ``&...=N`` query
        parameter is taken as the page number (e.g. ``...&p=7`` -> ``"7"``)
    :param logger: logger instance forwarded to ``self.write_file``
    :return: None
    """
    response = requests.get(url, headers=self.headers)
    html_text = response.text
    # Page number string comes from the last query parameter of the URL.
    num_str = url.split("&")[-1].split("=")[-1]
    par = Parse()
    total_page = str(
        par.parse_main_page_get_total_pagenum(html_text, configs["test"]))

    page_num = int(num_str)
    on_first = num_str == "1"
    on_last = num_str == total_page

    # Target href for each nav anchor; "#" disables the link at the edges.
    # Fix: the original checked only one edge per branch, so on a
    # single-page result (page 1 == last page) it still emitted live
    # "next"/"last" links to a nonexistent page0002.html. Testing both
    # edges independently handles that case correctly.
    nav_targets = [
        ("id=\"fpage\" href=\"(.+?)\"",
         "#" if on_first else "./page0001.html"),
        ("id=\"upage\" href=\"(.+?)\"",
         "#" if on_first
         else "./page" + str(page_num - 1).zfill(4) + ".html"),
        ("id=\"npage\" href=\"(.+?)\"",
         "#" if on_last
         else "./page" + str(page_num + 1).zfill(4) + ".html"),
        ("id=\"epage\" href=\"(.+?)\"",
         "#" if on_last else "./page" + total_page.zfill(4) + ".html"),
    ]

    # Collect all matches against the unmodified page first (matching the
    # original's extract-then-replace order), then apply the replacements.
    # Fix: the original crashed with AttributeError when an anchor was
    # missing from the page; a missing anchor is now simply skipped.
    replacements = []
    for pattern, target in nav_targets:
        match = re.search(pattern, html_text)
        if match is not None:
            replacements.append((match.group(1), target))
    for old_href, new_href in replacements:
        html_text = html_text.replace(old_href, new_href)

    # Rewrite brand and code data links to their local equivalents.
    html_text = self.replace_brand_and_code_url(html_text)

    file_name = "page" + num_str.zfill(4) + ".html"
    html_store_dir = make_store_html_dir()
    self.write_file(html_store_dir, file_name, html_text, logger)
"--url", type=str, default= 'https://www.qcsanbao.cn/webqcba/DVMProducerServlet?method=getWhereList&p=1', help="要爬取的网站") args = parser.parse_args() url = args.url base_url = configs["basic_url"] r = get_redis_connect() dl = Download() par = Parse() # 制作列表页的url_list make_url_list( base_url, par.parse_main_page_get_total_pagenum( dl.download_first_page(url, logger), configs["test"])) threading_list = [] # 列表页的解析详情页的数据url,存放在redis中,并且下载列表页html threading_list.extend([ Thread(target=download_and_parse_page, args=("url_list", r, par.parse_main_page_get_detail_page_url, dl.download_first_page, dl.download_list_page_html, lock, logger)) for _ in range(configs["thread_num"]) ]) # 解析详情页的code和name数据url,存放在redis中,并且下载详情页html threading_list.extend([ Thread(target=download_and_parse_page, args=("detail_url_list", r, par.parse_detail_page_get_url,