def err_url_handl(self, url, store_id):
    """
    Handle malformed and unusually-formed URLs; if more e-commerce sites
    need special handling later, extend from here.
    :param url: a malformed address or a new e-commerce site
    :param store_id: store ID
    :return: None
    """
    if len(url) > 0:
        if url.startswith("http"):
            response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
            if response is None:
                print("Invalid URL: " + url)
        # Entries that sort inside the CJK range (i.e. start with a Chinese
        # character) are store names rather than URLs, so skip them.
        elif u'\u4e00' <= url <= u'\u9fff':
            pass
        else:
            # No scheme given: assume http and try again.
            url = "http://" + url
            response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
            if response is None:
                print("Invalid URL: " + url)
def yupoo_url_handl(self, old_url, store_id):
    """
    Normalize a store's address: build the locator URL so requests land on
    the albums tab, and filter out frozen accounts.
    :param old_url: the store's album address
    :param store_id: store ID
    :return: None
    """
    response = request_url(
        old_url,
        headers=self.headers,
        proxies_list=self.ip_pool,
    )
    if response is not None:
        html = etree.HTML(response.text)
        try:
            albums_url = html.xpath(
                "//div[@class='showheader__menus']/a[2]/@href")[0]
        except:
            try:
                # Two addresses pasted together in one field: split them and
                # handle each one on its own.
                if len(re.findall("http|https", old_url)) == 2:
                    res = re.split(r"https", old_url)
                    for url in res:
                        if url:  # skip the empty fragment before the first scheme
                            self.yupoo_url_handl("http" + url, store_id)
                elif not old_url.endswith("albums"):
                    # Trailing junk after /albums: cut the URL down to it.
                    old_url = re.search(r"http.*/albums", old_url).group()
                    self.yupoo_url_handl(old_url, store_id)
            except:
                print("Account frozen: " + old_url)
        else:
            new_url = parse.urljoin(response.url, albums_url)
            self.yupoo_spider(new_url, store_id)
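# A minimal alternative sketch for splitting two concatenated addresses in
# one pass (hypothetical, not the original approach): a lazy match with a
# lookahead keeps each full URL and avoids the empty leading fragment that
# re.split produces.
#
#     re.findall(r"https?://.+?(?=https?://|$)", old_url)
#     # e.g. 'https://a.yupoo.comhttps://b.yupoo.com'
#     #      -> ['https://a.yupoo.com', 'https://b.yupoo.com']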
def get_ypimg_page(self, albums_href, store_id):
    response = request_url(albums_href, headers=self.headers, proxies_list=self.ip_pool)
    try:
        html = etree.HTML(response.text)
    except:
        print("Request failed: " + albums_href)
    else:
        albums_name = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")[0]
        albums_name = re.sub("\"|'", "“", albums_name)
        # albums_count = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[2]/text()")
        # pic_src = html.xpath("//div[@class='showalbum__parent showalbum__nor nor']//img/@src")
        other_msg = html.xpath("//div[@class='showalbumheader__gallerydec']/div[1]/text()")
        other_msg = re.sub("\"|'", "“", str(other_msg))
        data_id = html.xpath(
            "//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/@data-id")
        img_href = list()
        for img_id in data_id:
            img_url = parse_url(albums_href, img_id, "uid=1")  # &tab=min requests the thumbnail version
            img_href.append(img_url)
        img_href = str(img_href)
        albums_info = (albums_name, store_id, albums_href, img_href, str(other_msg))
        insert_albums = "insert into albums (albums_name,store_id,albums_href,img_url,other_msg) values %s on duplicate key update %s;" % (
            str(albums_info),
            """albums_name="%s",store_id=%s,albums_href="%s",img_url="%s",other_msg="%s" """ % albums_info)
        insert_albums = re.sub(r"\\|\n", "", insert_albums)
        print(insert_albums)
        try:
            self.crs.execute(insert_albums)
        except:
            # Connection may have dropped: ping to reconnect, get a fresh cursor, retry.
            self.conn.ping()
            self.crs = self.conn.cursor()
            self.crs.execute(insert_albums)
        self.conn.commit()
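# The upsert above splices values straight into the SQL string, which is why
# quotes and backslashes have to be scrubbed out of the data first. A minimal
# sketch of the same statement with driver-side parameters instead (assumes a
# PyMySQL/MySQLdb-style cursor, which escapes values itself; this is an
# alternative, not the original approach):
def insert_album_safe(crs, albums_name, store_id, albums_href, img_href, other_msg):
    sql = (
        "insert into albums (albums_name, store_id, albums_href, img_url, other_msg) "
        "values (%s, %s, %s, %s, %s) "
        "on duplicate key update albums_name=values(albums_name), "
        "store_id=values(store_id), albums_href=values(albums_href), "
        "img_url=values(img_url), other_msg=values(other_msg);"
    )
    # The driver quotes and escapes each value, so album names containing
    # quotes no longer need the re.sub normalization beforehand.
    crs.execute(sql, (albums_name, store_id, albums_href, img_href, other_msg))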
def sale_date_spider(self, response):
    """
    Collect the release-calendar URLs.
    :param response: source of the index page
    """
    html = etree.HTML(response.text)
    a_list = html.xpath("//div[@class='left pure-u-1']/a/@href")
    saledate_url = parse.urljoin(self.start_url, str(a_list[1]))
    saledate_text = request_url(saledate_url, headers=self.headers, proxies_list=self.ip_pool)
    saledate_html = etree.HTML(saledate_text.text)
    saledate_url_list = list()
    # Collect the URLs of the latest sneaker-release entries.
    for release_href in saledate_html.xpath("//div[@class='release_list']/a/@href"):
        saledate_list_url = parse.urljoin(self.start_url, release_href)
        saledate_detail_text = request_url(saledate_list_url, headers=self.headers, proxies_list=self.ip_pool)
        saledate_detail_html = etree.HTML(saledate_detail_text.text)
        # Collect the related-news URLs (the real target).
        for news_href in saledate_detail_html.xpath("//div[@class='relat_news']/a/@href"):
            saledate_url_list.append(parse.urljoin(self.start_url, news_href))
    self.detail_page_spider(saledate_url_list)
def yupoo_url_handl(self, old_url, store_id):
    response = request_url(old_url, headers=self.headers, proxies_list=self.ip_pool)
    if response is not None:
        html = etree.HTML(response.text)
        try:
            albums_url = html.xpath("//div[@class='showheader__menus']/a[2]/@href")[0]
        except:
            print("User account frozen: " + old_url)
        else:
            new_url = parse.urljoin(response.url, albums_url)
            self.yupoo_spider(new_url, store_id)
    else:
        print("Error page: " + old_url)
def store_url_spider(self, kind_id, kind_url, temp_list=None):
    if temp_list is None:
        temp_list = list()
    response = request_url(kind_url, headers=self.index_headers, proxies_list=self.ip_pool)
    if response is None:
        print("Page request failed: " + kind_url)
        return
    html = etree.HTML(response.text)
    store_html = html.xpath("//div[@class='def_border hot_tj_index c_page']/div[@class='data']/ul/li/a[last()]/@href")
    for store_href in store_html:
        temp_list.append(parse.urljoin(self.start_url, store_href))
    # '下一页' is the site's "next page" link text; recursion stops when it is gone.
    next_page = html.xpath("//div[@class='pagination'][1]/a[text()='下一页']/@href")
    if len(next_page) <= 0:
        return self.get_store_info(temp_list, kind_id)
    else:
        self.store_url_spider(kind_id, parse.urljoin(kind_url, next_page[0]), temp_list)
def start_page(self):
    response = request_url(self.start_url, headers=self.headers, proxies_list=self.ip_pool)
    t1 = threading.Thread(target=self.slider_spider, args=(response,))
    t2 = threading.Thread(target=self.sale_date_spider, args=(response,))
    t3 = threading.Thread(target=self.main_page_spider, args=(response,))
    t4 = threading.Thread(target=self.get_load_api, args=(response,))
    t5 = threading.Thread(target=self.get_msg_to_mysql)
    t_list = [t1, t2, t3, t4, t5]
    for t in t_list:
        t.start()
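# Note: start_page starts the worker threads and returns immediately, so a
# script that calls it can exit before the crawl finishes. A minimal sketch
# for callers that must block, assuming start_page is changed to
# `return t_list` (an assumption, not in the original code):
#
#     threads = spider.start_page()
#     for t in threads:
#         t.join()  # wait until every worker is done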
def other_url_handl(self, url, store_id):
    other_headers = None
    response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
    try:
        shop_id = re.search(r"/shop_detail/(\w\d+)", response.url).group(1)
    except:
        print(response.url)
    else:
        if response is not None:
            cookies, base_url = get_cookies.get_cookies(url, ip_pool)
            url = parse_url(base_url.lower(), self.server_url, "")
            # Only the last cookie in the jar ends up in the headers.
            for item in cookies:
                other_headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                    "cookie": "%s=%s" % (item.name, item.value)
                }
            self.get_other_msg(other_headers, url, store_id, shop_id)
def nav_url_spider(self):
    response = request_url(self.start_url, headers=self.index_headers, proxies_list=self.ip_pool)
    if response is None:
        print("Failed to request the index page!")
        return []
    nav_html = etree.HTML(response.text)
    nav_href = nav_html.xpath("//div[@class='category']/div/h3/a/@href")
    nav_href = [parse.urljoin(self.start_url, href) for href in nav_href]
    nav_kind = nav_html.xpath("//div[@class='category']/div/h3/a/text()")
    for i in range(0, len(nav_kind)):
        kind_insert = """insert into store_kind (kind_name, kind_href) values ('%s', '%s') on duplicate key update kind_name = '%s';""" % (
            nav_kind[i], nav_href[i], nav_kind[i])
        try:
            self.crs.execute(kind_insert)
        except:
            # Connection may have dropped: ping to reconnect, get a fresh cursor, retry.
            self.conn.ping()
            self.crs = self.conn.cursor()
            self.crs.execute(kind_insert)
        self.conn.commit()
    self.crs.execute("select * from store_kind")
    return [(kind[0], kind[2]) for kind in self.crs.fetchall()]
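# The ping/reconnect/retry pattern above also appears in get_ypimg_page. A
# minimal sketch of one shared helper (hypothetical name safe_execute, not in
# the original code; assumes a PyMySQL-style connection, whose
# ping(reconnect=True) re-establishes a dropped connection):
def safe_execute(conn, crs, sql):
    try:
        crs.execute(sql)
    except Exception:
        conn.ping(reconnect=True)  # re-open the connection if it was dropped
        crs = conn.cursor()        # the old cursor is tied to the dead connection
        crs.execute(sql)
    conn.commit()
    return crs                     # callers should keep using the returned cursor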
def other_url_handl(self, url, store_id):
    """
    Extract shop_id, build the address of the JSON info endpoint, and do a
    first pass over the request headers.
    :param url: index-page address
    :param store_id: store ID
    :return: None
    """
    other_headers = None
    response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
    # Requests that land on the error host are retried until they succeed.
    if "b.oijgvrq.cn" in response.url:
        self.other_url_handl(url, store_id)
        return
    # Pull the shop id out of the redirected index URL; it is needed to
    # build the follow-up URLs.
    try:
        shop_id = re.search(r"/shop_detail/(\w\d+)", response.url).group(1)
    except:
        print(response.url)
    else:
        if response is not None:
            cookies = get_cookies.get_cookies(url)
            url = parse_url(response.url.lower(), self.server_url, "")
            # Build the headers from the token cookie the server set on the
            # index request; only the last cookie in the jar is kept.
            for item in cookies:
                other_headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                    "cookie": "%s=%s" % (item.name, item.value)
                }
            # Some stores never get a token cookie from the server, so fall
            # back to a hand-built one.
            if "token" not in other_headers["cookie"]:
                other_headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                    "cookie": "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD;"
                }
            self.get_other_msg(other_headers, url, store_id, shop_id)
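# For reference, the loop above keeps only the last cookie in the jar. A
# minimal sketch of reading the token cookie by name instead, assuming plain
# requests is acceptable here (the original uses the project's get_cookies
# helper, so this is an illustration, not the author's method):
import requests

def fetch_token_header(url, ua):
    session = requests.Session()
    session.get(url, headers={"User-Agent": ua}, timeout=10)
    token = session.cookies.get("token")  # None if the server never set it
    if token is None:
        return None
    return {"User-Agent": ua, "cookie": "token=%s" % token}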
def yupoo_spider(self, yupoo_url, store_id):
    response = request_url(yupoo_url, headers=self.headers, proxies_list=self.ip_pool)
    try:
        html = etree.HTML(response.text)
    except:
        print("Request failed: " + yupoo_url)
    else:
        # The page text '该用户主页暂时关闭' marks a temporarily closed homepage.
        if len(re.findall(r"该用户主页暂时关闭", response.text)) > 0:
            print("User homepage temporarily closed: " + yupoo_url)
        # '相册' is the label of the active "Albums" tab.
        elif html.xpath("//a[@class='showheader__menuslink showheader__active']/text()")[0] == "相册":
            temp_albums_list = html.xpath("//div[@class='showindex__parent']/div/a/@href")
            for albums in temp_albums_list:
                self.get_ypimg_page(parse.urljoin(response.url, albums), store_id)
            # '后一页' is the "next page" button; recursion stops when it is absent.
            next_page = html.xpath("//div[@class='none_select pagination__buttons']/a[@title='后一页']/@href")
            if len(next_page) == 0:
                print("Finished crawling one site!")
            else:
                self.yupoo_spider(parse.urljoin(response.url, next_page[0]), store_id)
        else:
            print("Unknown error: " + yupoo_url)
def yupoo_spider(self, yupoo_url, store_id):
    """
    Recursively walk the pages of the albums tab; each album found is handed
    to get_ypimg_page for detail scraping. The recursion ends when the
    '后一页' ("next page") button carries no link.
    :param yupoo_url:
    :param store_id:
    :return: None
    """
    response = request_url(yupoo_url, headers=self.headers, proxies_list=self.ip_pool)
    try:
        html = etree.HTML(response.text)
    except:
        print("Request failed: " + yupoo_url)
    else:
        # The page text '该用户主页暂时关闭' marks a temporarily closed homepage.
        if len(re.findall(r"该用户主页暂时关闭", response.text)) > 0:
            print("User homepage temporarily closed: " + yupoo_url)
        # '相册' is the label of the active "Albums" tab.
        elif html.xpath(
                "//a[@class='showheader__menuslink showheader__active']/text()"
        )[0] == "相册":
            temp_albums_list = html.xpath(
                "//div[@class='showindex__parent']/div/a/@href")
            for albums in temp_albums_list:
                self.get_ypimg_page(parse.urljoin(response.url, albums), store_id)
            next_page = html.xpath(
                "//div[@class='none_select pagination__buttons']/a[@title='后一页']/@href")
            if len(next_page) == 0:
                pass
            else:
                self.yupoo_spider(parse.urljoin(response.url, next_page[0]), store_id)
        else:
            print("Unknown error: " + yupoo_url)
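# Both yupoo_spider variants recurse once per results page, so a store with
# very many album pages could in principle hit Python's default recursion
# limit (1000 frames). A minimal iterative sketch of the same paging loop
# (hypothetical helper reusing the module's request_url/etree/parse; an
# alternative, not the original design):
def yupoo_pages(self, yupoo_url):
    """Yield (response, html) for every page of the albums tab."""
    while yupoo_url:
        response = request_url(yupoo_url, headers=self.headers, proxies_list=self.ip_pool)
        html = etree.HTML(response.text)
        yield response, html
        next_page = html.xpath(
            "//div[@class='none_select pagination__buttons']/a[@title='后一页']/@href")
        yupoo_url = parse.urljoin(response.url, next_page[0]) if next_page else None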
def load_more_msg(self, path, news_list_pos, params):
    print(news_list_pos)  # progress trace
    load_more_url = path + str(news_list_pos) + params
    content = request_url(load_more_url, headers=self.headers,
                          proxies_list=self.ip_pool).content.decode("unicode-escape")
    try:
        # Pre-flight the JSON; malformed payloads go through dji first.
        json.loads(content, strict=False)
    except:
        content = dji(content)
    finally:
        json_text = json.loads(content, strict=False)
    if json_text["msg"]["count"] == 0:
        print("All detail-page addresses have been collected!")
        return
    else:
        load_more_url_list = list()
        for i in json_text["msg"]["list"]:
            load_more_url_list.append(parse.urljoin(self.start_url, i["url"]))
        self.detail_page_spider(load_more_url_list)
        # The API pages in steps of 30 entries.
        news_list_pos = int(news_list_pos) + 30
        self.load_more_msg(path, news_list_pos, params)
def get_store_info(self, store_href_list, kind_id):

    def text_of(node, path):
        """First match of path under node, stripped; '' if absent."""
        try:
            return node.xpath(path)[0].strip()
        except:
            return ""

    for store_href in store_href_list:
        # time.sleep(random.randint(2, 4))
        response = request_url(store_href, headers=self.index_headers, proxies_list=self.ip_pool)
        if response is None:
            print("Page request failed: " + store_href)
        else:
            try:
                html = etree.HTML(response.text)
            except:
                # Dump unparseable pages for later inspection, then move on.
                with open("./test1/" + store_href[-6:-11:-1], "w", encoding="utf-8") as f:
                    f.write(response.text)
                continue
            dl_list = html.xpath("//div[@class='rows']/dl")
            temp_dict = {"kind_id": kind_id, "store_href": store_href}
            # The <dt> labels below are the page's own Chinese field names.
            for dl in dl_list:
                a = dl.xpath("./dt/text()")[0].strip()
                b = dl.xpath("./dd")[0]
                if a == "商家信誉:":  # merchant rating, taken from the star image
                    try:
                        temp_dict["credit_count"] = re.findall(r'\d', str(b.xpath("./img/@src")[0]))[0] + "星"
                    except:
                        temp_dict["credit_count"] = "6星"
                elif a == "店名:":  # store name
                    temp_dict["store_name"] = text_of(b, "./text()")
                elif a == "网址(1):":  # website 1
                    temp_dict["albums_url1"] = text_of(b, "./a/@href")
                elif a == "网址(2):":  # website 2
                    temp_dict["albums_url2"] = text_of(b, "./a/@href")
                elif a == "搜福一下:":  # soufu search link
                    temp_dict["soufu_url"] = text_of(b, "./a/@href")
                elif a == "QQ(1):":
                    temp_dict["QQ1"] = text_of(b, "./text()")
                elif a == "QQ(2):":
                    temp_dict["QQ2"] = text_of(b, "./text()")
                elif a == "微信:":  # WeChat
                    temp_dict["wechart"] = text_of(b, "./text()")
                elif a == "电话:":  # phone
                    temp_dict["phone"] = text_of(b, "./text()")
                elif a == "地址:":  # address
                    temp_dict["address"] = text_of(b, "./text()")
                elif a == "主营产品:":  # main products
                    temp_dict["product"] = text_of(b, "./text()")
                elif a == "扫一扫:":  # WeChat QR code
                    try:
                        temp_dict["wechart_code"] = parse.urljoin(self.start_url, b.xpath("./img/@src")[0].strip())
                    except:
                        temp_dict["wechart_code"] = ""
                else:
                    print("Unknown field row: " + store_href)
            # Build the upsert: column list, values, and the update clause.
            temp_str = "".join(" %s = '%s', " % (k, v) for k, v in temp_dict.items())
            store_insert = "insert into store %s values %s on duplicate key update %s;" % (
                re.sub(r"'", "", str(tuple(temp_dict))),
                str(tuple(temp_dict.values())),
                temp_str[0:-2])
            self.q.put(store_insert)
def detail_page_spider(self, url_list):
    """
    Collect every field of a detail page into a dict and push it onto the queue.
    :param url_list: detail-page URLs
    Fields:
        <class 'str'>  title
        <class 'str'>  push_time_str
        <class 'str'>  read_count
        <class 'list'> img_list
        <class 'str'>  text_content
    """
    for url in url_list:
        msg_dict = {"text_link": url}
        # Fetch the detail page.
        response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
        html = etree.HTML(response.text)
        # Article title.
        try:
            temp_title = re.sub(r"\\|\n", "", html.xpath("//div[@class='news_title']/h1/text()")[0])
        except:
            print(response.url)
        else:
            temp_title = re.sub("\"|'", "“", temp_title)
            msg_dict["title"] = temp_title
            # Publication time.
            msg_dict["push_time"] = html.xpath("//div[@class='body']/div[1]/text()")[0]
            # Hit the read-count API for the view count.
            count_url_list = re.findall("fetch\(\"(/ajax/news_count/\d+)\"\)\.then\(function\(r\)", response.text)
            count_url = parse.urljoin(self.start_url, count_url_list[0])
            count = request_url(count_url, headers=self.headers, proxies_list=self.ip_pool)
            try:
                msg_dict["read_count"] = count.text
            except:
                print(url)
                print(count_url)
            # All image links.
            img = html.xpath("//div[@class='content']/img")
            if len(img) <= 0:
                img = html.xpath("//div[@class='content']/p/a/img")
            img_list = list()
            for i in img:
                temp_img = str(i.xpath("./@src")[0])
                if temp_img.endswith(".png"):
                    # .png placeholders: lazy-loaded images carry the real URL
                    # in data-original; the placeholder is kept as well when
                    # both are present (original behavior).
                    try:
                        img_list.append(str(i.xpath("./@data-original")[0]))
                    except:
                        pass
                    else:
                        img_list.append(temp_img)
                else:
                    img_list.append(temp_img)
            msg_dict["img_list"] = str(img_list)
            # Body text.
            temp_re_obj = re.compile(r"<div class=\"content\">(.*?)<!-- GA -->", re.S)
            text_list = temp_re_obj.findall(response.text)
            text_content = ""
            for i in text_list:
                text_temp = re.sub(
                    r"<(.*?)>| |²|\u200b|¥| |“|”|·|&|—|▼|\r|\n|\t|\\", "", i)
                text_temp = re.sub("\"|'", "“", text_temp)
                text_content += text_temp
            msg_dict["text_content"] = text_content
            self.q.put(msg_dict)
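# The quote and whitespace re.sub chains above also appear in get_ypimg_page.
# A minimal sketch of one shared cleaner (hypothetical helper, not in the
# original code), matching the substitutions used above:
def clean_text(text):
    """Strip backslashes, newlines, and tabs, and normalize quotes the way
    the spiders do before splicing text into SQL strings."""
    text = re.sub(r"\n|\\|\r|\t", "", text)
    return re.sub("\"|'", "“", text)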
def get_ypimg_page(self, albums_href, store_id, num=1, img_href=None):
    """
    Collect one album's info page by page, clean the data, and put the
    insert statement onto the queue.
    :param albums_href:
    :param store_id:
    :return: None
    """
    if img_href is None:
        img_href = list()
    try:
        # &tab=min requests the thumbnail layout; fall back without it.
        albums_href1 = albums_href + "&tab=min" + "&page=" + str(num)
        response = request_url(albums_href1, headers=self.headers, proxies_list=self.ip_pool)
    except:
        response = request_url(albums_href + "&page=" + str(num),
                               headers=self.headers, proxies_list=self.ip_pool)
    try:
        html = etree.HTML(response.text)
    except:
        print("Request failed: " + albums_href)
    else:
        albums_name = html.xpath(
            "//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")[0]
        albums_name = re.sub("\"|'", "“", albums_name)
        albums_name = re.sub(r"\n|\\|\r|\t|\r\n|\n\r", "", albums_name)
        other_msg = html.xpath(
            "//div[@class='showalbumheader__gallerydec']/div[1]/text()")
        other_msg = re.sub("\"|'", "“", str(other_msg))
        other_msg = re.sub(r"\n|\\|\r|\t|\r\n|\n\r", "", other_msg)
        # The album grid renders in min/nor/max layouts; try each in turn.
        imgs = html.xpath(
            "//div[@class='showalbum__parent showalbum__min min']/div[@class='showalbum__children image__main']/div[@class='image__imagewrap']/img/@src")
        if len(imgs) <= 0:
            imgs = html.xpath(
                "//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/div[@class='image__imagewrap']/img/@src")
        if len(imgs) <= 0:
            imgs = html.xpath(
                "//div[@class='showalbum__parent showalbum__max max']/div[@class='showalbum__children image__main']/div[@class='image__imagewrap']/img/@src")
        if len(imgs) <= 0:
            # No more images: every page has been read, so build the upsert.
            img_href = str(img_href)
            albums_info = (albums_name, store_id, albums_href, img_href, str(other_msg))
            insert_albums = "insert into albums (albums_name,store_id,albums_href,img_url,other_msg) values %s on duplicate key update %s;" % (
                str(albums_info),
                """albums_name="%s",store_id=%s,albums_href="%s",img_url="%s",other_msg="%s" """ % albums_info)
            insert_albums = re.sub(r"\n|\\|\r|\t|\r\n|\n\r", "", insert_albums)
            self.q.put(insert_albums)
        else:
            for img in imgs:
                img = "http:" + img
                img_href.append(img)
            num += 1
            # Recurse to the next page of the same album.
            self.get_ypimg_page(albums_href, store_id, num=num, img_href=img_href)
import json
import re

from retryl_request import request_url
from lxml import etree
from urllib import parse

url = "http://www.flightclub.cn/news/a/sneaker/2019/1029/53700.html"
start_url = "http://www.flightclub.cn/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
}
with open("./ip_pool", "r") as f:
    content = f.read()
ip_pool = json.loads(content)

msg_dict = {"text_link": url}
response = request_url(url, headers=headers, proxies_list=ip_pool)
html = etree.HTML(response.text)
# Article title.
temp_title = re.sub(r"\\|\n", "", html.xpath("//div[@class='news_title']/h1/text()")[0])
temp_title = re.sub("\"|'", "“", temp_title)
msg_dict["title"] = temp_title
# Publication time.
msg_dict["push_time"] = html.xpath("//div[@class='body']/div[1]/text()")[0]
# Hit the read-count API for the view count.
count_url_list = re.findall(
    "fetch\(\"(/ajax/news_count/\d+)\"\)\.then\(function\(r\)", response.text)
count_url = parse.urljoin(start_url, count_url_list[0])
count = request_url(count_url, headers=headers, proxies_list=ip_pool)
try:
    msg_dict["read_count"] = count.text
except:
    # Mirrors the handling in detail_page_spider.
    print(url)
    print(count_url)