Example #1
 def err_url_handl(self, url, store_id):
     """
     对异常和太特殊的url进行处理,如果后面需要扩展,可以从这里入手
     :param url: 一些错误的地址以及新的电商网站
     :param store_id: 商铺ID
     :return: None
     """
     if len(url) > 0:
         if url.startswith("http"):
             response = request_url(url,
                                    headers=self.headers,
                                    proxies_list=self.ip_pool)
             # request_url may return None when every retry fails, so test the response itself
             if response is not None:
                 pass
             else:
                 print("Invalid URL: " + url)
         # entries that start with a Chinese character are plain text, not URLs: skip them
         elif u'\u4e00' <= url[0] <= u'\u9fff':
             pass
         else:
             url = "http://" + url
             response = request_url(url,
                                    headers=self.headers,
                                    proxies_list=self.ip_pool)
             if response is not None:
                 pass
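
All of these examples call a request_url helper (imported from retryl_request in Example #18) that retries a request through a rotating proxy pool; the `if response is None` checks in Examples #7, #10 and #15 imply it returns None once every attempt has failed. Its implementation is not part of these snippets; a minimal sketch consistent with how it is called might look like this (the retry count, timeout, and proxy-pool entry format are assumptions):

import random
import requests


def request_url(url, headers=None, proxies_list=None, retries=3, timeout=10):
    """Hypothetical sketch of the retrying helper the examples rely on."""
    for _ in range(retries):
        proxies = None
        if proxies_list:
            # Assumes each pool entry is a plain proxy address string
            proxy = random.choice(proxies_list)
            proxies = {"http": proxy, "https": proxy}
        try:
            # One attempt through a randomly chosen proxy; fall through to the next retry on failure
            response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            continue
    return None
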
Example #2
 def err_url_handl(self, url, store_dict):
     if len(url) > 0:
         if url.startswith("http"):
             response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
             # request_url may return None when every retry fails, so test the response itself
             if response is not None:
                 pass
             else:
                 print("Invalid URL: " + url)
         # entries that start with a Chinese character are plain text, not URLs: skip them
         elif u'\u4e00' <= url[0] <= u'\u9fff':
             pass
         else:
             url = "http://" + url
             response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
             if response is not None:
                 pass
Example #3
 def yupoo_url_handl(self, old_url, store_id):
     """
     对商铺的地址进行处理,构造定位地址,把相册请求地址定位在相册栏,并对被冻结的用户进行过滤
     :param old_url: 商铺的相册地址
     :param store_id: 商铺ID
     :return: None
     """
     response = request_url(
         old_url,
         headers=self.headers,
         proxies_list=self.ip_pool,
     )
     if response is not None:
         html = etree.HTML(response.text)
         try:
             albums_url = html.xpath(
                 "//div[@class='showheader__menus']/a[2]/@href")[0]
         except IndexError:
             # No albums tab found: the field may hold two concatenated links,
             # or a link with extra text after ".../albums"
             try:
                 if len(re.findall("http|https", old_url)) == 2:
                     res = re.split(r"https?", old_url)
                     for url in res:
                         if url:
                             self.yupoo_url_handl("http" + url, store_id)
                 elif not old_url.endswith("albums"):
                     old_url = re.search(r"http.*/albums", old_url).group()
                     self.yupoo_url_handl(old_url, store_id)
             except:
                 print("Account frozen: " + old_url)
         else:
             new_url = parse.urljoin(response.url, albums_url)
             self.yupoo_spider(new_url, store_id)
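
The branch above that handles two concatenated album links splits on the scheme text, which always leaves an empty first fragment; a quick illustration of the values the recursive call receives (the sample addresses are made up, and note the scheme is rewritten to plain http):

import re

old_url = "https://a.x.yupoo.comhttps://b.x.yupoo.com"   # hypothetical doubled link
parts = re.split(r"https?", old_url)                      # ['', '://a.x.yupoo.com', '://b.x.yupoo.com']
urls = ["http" + part for part in parts if part]          # skip the empty first fragment
print(urls)                                               # ['http://a.x.yupoo.com', 'http://b.x.yupoo.com']
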
Example #4
 def get_ypimg_page(self, albums_href, store_id):
     response = request_url(albums_href, headers=self.headers, proxies_list=self.ip_pool)
     try:
         html = etree.HTML(response.text)
     except:
         # response is None when every retry inside request_url failed
         print("Request failed: " + albums_href)
     else:
         albums_name = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")[0]
         albums_name = re.sub("\"|'", "“", albums_name)
         # albums_count = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[2]/text()")
         # pic_src = html.xpath("//div[@class='showalbum__parent showalbum__nor nor']//img/@src")
         other_msg = html.xpath("//div[@class='showalbumheader__gallerydec']/div[1]/text()")
         other_msg = re.sub("\"|'", "“", str(other_msg))
         data_id = html.xpath(
             "//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/@data-id")
         img_href = list()
         for id in data_id:
             img_url = parse_url(albums_href, id, "uid=1")  # "&tab=min" requests the thumbnail version of the images
             img_href.append(img_url)
         img_href = str(img_href)
         albums_info = (albums_name, store_id, albums_href, img_href, str(other_msg))
         insert_albums = "insert into albums (albums_name,store_id,albums_href,img_url,other_msg) values %s on duplicate key update %s;" % (
         str(albums_info),
         """albums_name="%s",store_id=%s,albums_href="%s",img_url="%s",other_msg="%s" """ % albums_info)
         insert_albums = re.sub(r"\\|\n", "", insert_albums)
         print(insert_albums)
         try:
             self.crs.execute(insert_albums)
         except:
             # The MySQL connection may have timed out: reconnect and retry the insert
             self.conn.ping()
             self.crs = self.conn.cursor()
             self.crs.execute(insert_albums)
         self.conn.commit()
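
The INSERT above is assembled by string formatting, which is why the quotes in albums_name and other_msg have to be rewritten first. As a hedged alternative (assuming the same pymysql-style cursor and the same albums columns), a parameterized statement lets the driver handle all quoting; the helper name below is made up for illustration:

def insert_album(cursor, albums_name, store_id, albums_href, img_href, other_msg):
    # Sketch only: the %s placeholders are bound by the driver, so no manual quote replacement is needed
    sql = (
        "insert into albums (albums_name, store_id, albums_href, img_url, other_msg) "
        "values (%s, %s, %s, %s, %s) "
        "on duplicate key update albums_name=values(albums_name), store_id=values(store_id), "
        "albums_href=values(albums_href), img_url=values(img_url), other_msg=values(other_msg)"
    )
    cursor.execute(sql, (albums_name, store_id, albums_href, str(img_href), str(other_msg)))
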
Example #5
 def sale_date_spider(self, response):
     """
     获取发售日历url
     :param response 首页的源码
     """
     html = etree.HTML(response.text)
     a_list = html.xpath("//div[@class='left pure-u-1']/a/@href")
     saledate_url = parse.urljoin(self.start_url, str(a_list[1]))
     saledate_text = request_url(saledate_url, headers=self.headers, proxies_list=self.ip_pool)
     saledate_html = etree.HTML(saledate_text.text)
     saledate_url_list = list()
     # URLs of the latest sneaker release entries
     for release_href in saledate_html.xpath("//div[@class='release_list']/a/@href"):
         saledate_list_url = parse.urljoin(self.start_url, release_href)
         saledate_detail_text = request_url(saledate_list_url, headers=self.headers, proxies_list=self.ip_pool)
         saledate_detail_html = etree.HTML(saledate_detail_text.text)
         # URLs of the related news articles (the real target)
         for news_href in saledate_detail_html.xpath("//div[@class='relat_news']/a/@href"):
             saledate_url_list.append(parse.urljoin(self.start_url, news_href))
     self.detail_page_spider(saledate_url_list)
Example #6
 def yupoo_url_handl(self, old_url, store_id):
     response = request_url(old_url, headers=self.headers, proxies_list=self.ip_pool)
     if response is not None:
         html = etree.HTML(response.text)
         try:
             albums_url = html.xpath("//div[@class='showheader__menus']/a[2]/@href")[0]
         except IndexError:
             print("User account frozen: " + old_url)
         else:
             new_url = parse.urljoin(response.url, albums_url)
             self.yupoo_spider(new_url, store_id)
     else:
         print("Bad page: " + old_url)
Example #7
 def store_url_spider(self, kind_id, kind_url, temp_list=None):
     if temp_list is None:
         temp_list = list()
     response = request_url(kind_url, headers=self.index_headers, proxies_list=self.ip_pool)
     if response is None:
         print("Page request failed: " + kind_url)
         return
     html = etree.HTML(response.text)
     store_html = html.xpath("//div[@class='def_border hot_tj_index c_page']/div[@class='data']/ul/li/a[last()]/@href")
     for store_href in store_html:
         temp_list.append(parse.urljoin(self.start_url, store_href))
     next_page = html.xpath("//div[@class='pagination'][1]/a[text()='下一页']/@href")
     if len(next_page) <= 0:
         return self.get_store_info(temp_list, kind_id)
     else:
         self.store_url_spider(kind_id, parse.urljoin(kind_url, next_page[0]), temp_list)
Example #8
 def start_page(self):
     response = request_url(self.start_url, headers=self.headers, proxies_list=self.ip_pool)
     t_list = list()
     t1 = threading.Thread(target=self.slider_spider, args=(response,))
     t2 = threading.Thread(target=self.sale_date_spider, args=(response,))
     t3 = threading.Thread(target=self.main_page_spider, args=(response,))
     t4 = threading.Thread(target=self.get_load_api, args=(response,))
     t5 = threading.Thread(target=self.get_msg_to_mysql)
     t_list.append(t1)
     t_list.append(t2)
     t_list.append(t3)
     t_list.append(t4)
     t_list.append(t5)
     for t in t_list:
         t.start()
Example #9
 def other_url_handl(self, url, store_id):
     other_headers = None
     response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
     try:
         shop_id = re.search(r"/shop_detail/(\w\d+)", response.url).group(1)
     except:
         print(response.url)
     else:
         if response is not None:
             cookies, base_url = get_cookies.get_cookies(url, ip_pool)
             url = parse_url(base_url.lower(), self.server_url, "")
             for item in cookies:
                 other_headers = {
                     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                     "cookie": "%s=%s" % (item.name, item.value)
                 }
             self.get_other_msg(other_headers, url, store_id, shop_id)
Example #10
 def nav_url_spider(self):
     response = request_url(self.start_url, headers=self.index_headers, proxies_list=self.ip_pool)
     if response is None:
         print("Failed to request the home page!")
         return []
     nav_html = etree.HTML(response.text)
     nav_href = nav_html.xpath("//div[@class='category']/div/h3/a/@href")
     nav_href = [parse.urljoin(self.start_url, href) for href in nav_href]
     nav_kind = nav_html.xpath("//div[@class='category']/div/h3/a/text()")
     for i in range(0, len(nav_kind)):
         kind_insert = """insert into store_kind (kind_name, kind_href) values ('%s', '%s') on duplicate key update kind_name = '%s';""" % (
             nav_kind[i], nav_href[i], nav_kind[i])
         try:
             self.crs.execute(kind_insert)
         except:
             self.conn.ping()
             self.crs = self.conn.cursor()
             self.crs.execute(kind_insert)
         self.conn.commit()
     self.crs.execute("select * from store_kind")
     return [(kind[0], kind[2]) for kind in self.crs.fetchall()]
Example #11
 def other_url_handl(self, url, store_id):
     """
     获取shop_id,构造json信息的地址,对请求头进行初步处理
     :param url: 首页地址
     :param store_id: 商店ID
     :return: None
     """
     other_headers = None
     response = request_url(url,
                            headers=self.headers,
                            proxies_list=self.ip_pool)
     # The site redirects failed requests to b.oijgvrq.cn, so retry until the redirect goes away
     if "b.oijgvrq.cn" in response.url:
         self.other_url_handl(url, store_id)
         return
     # Pull the shop id out of the redirected home-page URL; it is needed to build later requests
     try:
         shop_id = re.search(r"/shop_detail/(\w\d+)", response.url).group(1)
     except:
         print(response.url)
     else:
         if response is not None:
             cookies = get_cookies.get_cookies(url)
             url = parse_url(response.url.lower(), self.server_url, "")
             # Build the request headers from the token cookie set on the home-page response (note: the loop keeps only the last cookie)
             for item in cookies:
                 other_headers = {
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                     "cookie": "%s=%s" % (item.name, item.value)
                 }
             # Some stores never get a token cookie from the server, so build one by hand
             if "token" not in other_headers["cookie"]:
                 other_headers = {
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                     "cookie":
                     "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD;"
                 }
             self.get_other_msg(other_headers, url, store_id, shop_id)
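
Because the loop above rebuilds other_headers on every iteration, only the last cookie the server set ends up in the header, and a fixed token value is substituted when no token cookie was seen at all. A hedged sketch of the same idea with requests.Session (the function name and timeout are assumptions, not part of the original code) keeps every cookie from the first response:

import requests


def build_other_headers(url, user_agent):
    # Hypothetical helper: let requests collect whatever cookies the server sets,
    # then serialize all of them into a single Cookie header value.
    session = requests.Session()
    session.headers["User-Agent"] = user_agent
    session.get(url, timeout=10)
    cookie_str = "; ".join("%s=%s" % (name, value) for name, value in session.cookies.items())
    return {"User-Agent": user_agent, "cookie": cookie_str}
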
Example #12
 def yupoo_spider(self, yupoo_url, store_id):
     response = request_url(yupoo_url, headers=self.headers, proxies_list=self.ip_pool)
     try:
         html = etree.HTML(response.text)
     except:
         print("Request failed: " + yupoo_url)
     else:
         # The page itself shows this Chinese banner text when the account is closed
         if len(re.findall(r"该用户主页暂时关闭", response.text)) > 0:
             print("User home page temporarily closed: " + yupoo_url)
         elif html.xpath("//a[@class='showheader__menuslink showheader__active']/text()")[0] == "相册":  # active tab is "相册" (Albums)
             temp_albums_list = html.xpath("//div[@class='showindex__parent']/div/a/@href")
             for albums in temp_albums_list:
                 self.get_ypimg_page(parse.urljoin(response.url, albums), store_id)
             if len(html.xpath("//div[@class='none_select pagination__buttons']/a[@title='后一页']/@href")) == 0:
                 print("一个网址爬取完毕!")
             else:
                 next_page = html.xpath("//div[@class='none_select pagination__buttons']/a[@title='后一页']/@href")[0]
                 next_page = parse.urljoin(response.url, next_page)
                 self.yupoo_spider(next_page, store_id)
         else:
             print("未知错误:" + yupoo_url)
Example #13
 def yupoo_spider(self, yupoo_url, store_id):
     """
     用递归函数对定位在相册栏的页面进行翻页循环获取,而对获取的每一个相册对象传递给
     get_ypimg_page方法进行获取信息,递归结束的标志在‘后一页‘没有翻页连接
     :param yupoo_url:
     :param store_id:
     :return: None
     """
     response = request_url(yupoo_url,
                            headers=self.headers,
                            proxies_list=self.ip_pool)
     try:
         html = etree.HTML(response.text)
     except:
         print("Request failed: " + yupoo_url)
     else:
         # The page shows this Chinese banner text when the account is closed
         if len(re.findall(r"该用户主页暂时关闭", response.text)) > 0:
             print("User home page temporarily closed: " + yupoo_url)
         elif html.xpath(
                 "//a[@class='showheader__menuslink showheader__active']/text()"
         )[0] == "相册":  # active tab is "相册" (Albums)
             temp_albums_list = html.xpath(
                 "//div[@class='showindex__parent']/div/a/@href")
             for albums in temp_albums_list:
                 self.get_ypimg_page(parse.urljoin(response.url, albums),
                                     store_id)
             # Stop when there is no "后一页" (next page) link; otherwise recurse into the next page
             next_page = html.xpath(
                 "//div[@class='none_select pagination__buttons']/a[@title='后一页']/@href")
             if next_page:
                 self.yupoo_spider(parse.urljoin(response.url, next_page[0]), store_id)
         else:
             print("未知错误:" + yupoo_url)
Example #14
 def load_more_msg(self, path, news_list_pos, params):
     print(news_list_pos)
     load_more_url = path + str(news_list_pos) + params
     content = request_url(load_more_url, headers=self.headers, proxies_list=self.ip_pool).content.decode(
         "unicode-escape")
     try:
         # Pre-check whether the payload parses as JSON; repair it with dji() if it does not
         json.loads(content, strict=False)
     except ValueError:
         content = dji(content)
     finally:
         json_text = json.loads(content, strict=False)
         if json_text["msg"]["count"] == 0:
             print("已获取到所有详情页的地址!")
             return
         else:
             load_more_url_list = list()
             for i in json_text["msg"]["list"]:
                 load_more_url_list.append(parse.urljoin(self.start_url, i["url"]))
             self.detail_page_spider(load_more_url_list)
         news_list_pos = int(news_list_pos) + 30
         self.load_more_msg(path, news_list_pos, params)
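
load_more_msg calls itself once per 30-item page, so a very long feed grows the call stack one frame at a time. An iterative sketch of the same loop (same assumed request_url and dji helpers, same JSON shape) avoids the recursion:

 def load_more_msg(self, path, news_list_pos, params):
     # Iterative variant: same stop condition (msg.count == 0), no recursion
     pos = int(news_list_pos)
     while True:
         content = request_url(path + str(pos) + params,
                               headers=self.headers,
                               proxies_list=self.ip_pool).content.decode("unicode-escape")
         try:
             json.loads(content, strict=False)
         except ValueError:
             content = dji(content)  # assumed JSON-repair helper, as in the example above
         json_text = json.loads(content, strict=False)
         if json_text["msg"]["count"] == 0:
             print("All detail-page URLs have been collected!")
             return
         self.detail_page_spider(
             [parse.urljoin(self.start_url, item["url"]) for item in json_text["msg"]["list"]])
         pos += 30
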
Example #15
 def get_store_info(self, store_href_list, kind_id):
     for store_href in store_href_list:
         # time.sleep(random.randint(2, 4))
         response = request_url(store_href, headers=self.index_headers, proxies_list=ip_pool)
         if response is None:
             print("页面请求失败:" + store_href)
         else:
             try:
                 html = etree.HTML(response.text)
             except:
                 # Keep the unparseable page for later inspection, then move on to the next store
                 with open("./test1/" + store_href[-6:-11:-1], "w", encoding="utf-8") as f:
                     f.write(response.text)
                 continue
             dl_list = html.xpath("//div[@class='rows']/dl")
             temp_dict = {"kind_id": kind_id, "store_href": store_href}
             for dl in dl_list:
                 a = dl.xpath("./dt/text()")[0].strip()
                 b = dl.xpath("./dd")[0]
                 if a == "商家信誉:":
                     try:
                         temp_dict["credit_count"] = re.findall(r'\d', str(b.xpath("./img/@src")[0]))[0] + "星"
                     except:
                         temp_dict["credit_count"] = "6星"
                 elif a == "店名:":
                     try:
                         temp_dict["store_name"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["store_name"] = ""
                 elif a == "网址(1):":
                     try:
                         temp_dict["albums_url1"] = b.xpath("./a/@href")[0].strip()
                     except:
                         temp_dict["albums_url1"] = ""
                 elif a == "网址(2):":
                     try:
                         temp_dict["albums_url2"] = b.xpath("./a/@href")[0].strip()
                     except:
                         temp_dict["albums_url2"] = ""
                 elif a == "搜福一下:":
                     try:
                         temp_dict["soufu_url"] = b.xpath("./a/@href")[0].strip()
                     except:
                         temp_dict["soufu_url"] = ""
                 elif a == "QQ(1):":
                     try:
                         temp_dict["QQ1"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["QQ1"] = ""
                 elif a == "QQ(2):":
                     try:
                         temp_dict["QQ2"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["QQ2"] = ""
                 elif a == "微信:":
                     try:
                         temp_dict["wechart"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["wechart"] = ""
                 elif a == "电话:":
                     try:
                         temp_dict["phone"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["phone"] = ""
                 elif a == "地址:":
                     try:
                         temp_dict["address"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["address"] = ""
                 elif a == "主营产品:":
                     try:
                         temp_dict["product"] = b.xpath("./text()")[0].strip()
                     except:
                         temp_dict["product"] = ""
                 elif a == "扫一扫:":
                     try:
                         temp_dict["wechart_code"] = parse.urljoin(self.start_url, b.xpath("./img/@src")[0].strip())
                     except:
                         temp_dict["wechart_code"] = ""
                 else:
                     print("未知的条栏:" + store_href)
             temp_str = ""
             for i in tuple(" %s = '%s', " % (k, v) for k, v in temp_dict.items()):
                 temp_str += i
             store_insert = "insert into store %s values %s on duplicate key update %s;" % (re.sub(r"'", "", str(tuple(temp_dict))), str(tuple(temp_dict.values())), temp_str[0:-2])
             self.q.put(store_insert)
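
The long if/elif chain above repeats the same try/except fallback for every label, so the same dispatch can be table-driven. A sketch of the loop body (the table simply restates the label/field pairs used above; the two labels that need extra processing keep their own branches):

 # Label text on the page -> (dict key, xpath relative to the <dd>)
 FIELD_MAP = {
     "店名:": ("store_name", "./text()"),
     "网址(1):": ("albums_url1", "./a/@href"),
     "网址(2):": ("albums_url2", "./a/@href"),
     "搜福一下:": ("soufu_url", "./a/@href"),
     "QQ(1):": ("QQ1", "./text()"),
     "QQ(2):": ("QQ2", "./text()"),
     "微信:": ("wechart", "./text()"),
     "电话:": ("phone", "./text()"),
     "地址:": ("address", "./text()"),
     "主营产品:": ("product", "./text()"),
 }

 for dl in dl_list:
     label = dl.xpath("./dt/text()")[0].strip()
     dd = dl.xpath("./dd")[0]
     if label in FIELD_MAP:
         key, path = FIELD_MAP[label]
         try:
             temp_dict[key] = dd.xpath(path)[0].strip()
         except IndexError:
             temp_dict[key] = ""
     # "商家信誉:" (star rating) and "扫一扫:" (QR-code URL) still need their dedicated branches
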
Example #16
 def detail_page_spider(self, url_list):
     """
     获取详情页所有信息保存到列表然后添加到队列中
     :param response 首页的源码
     <class 'str'>  title
     <class 'str'>  push_time_str
     <class 'str'>  read_count
     <class 'list'>  img_list
     <class 'str'>  text_content
     """
     for url in url_list:
         msg_dict = {"text_link": url}
         # Fetch the detail page
         response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
         html = etree.HTML(response.text)
         # Article title
         try:
             temp_title = re.sub(r"\\|\n", "", html.xpath("//div[@class='news_title']/h1/text()")[0])
         except IndexError:
             print(response.url)
         else:
             temp_title = re.sub("\"|'", "“", temp_title)
             msg_dict["title"] = temp_title
             # Publication time
             msg_dict["push_time"] = html.xpath("//div[@class='body']/div[1]/text()")[0]
             # Call the read-count API to get the view count
             count_url_list = re.findall(r"fetch\(\"(/ajax/news_count/\d+)\"\)\.then\(function\(r\)", response.text)
             count_url = parse.urljoin(self.start_url, count_url_list[0])
             count = request_url(count_url, headers=self.headers, proxies_list=self.ip_pool)
             try:
                 msg_dict["read_count"] = count.text
             except:
                 print(url)
                 print(count_url)
             # All image links
             img = html.xpath("//div[@class='content']/img")
             if len(img) <= 0:
                 img = html.xpath("//div[@class='content']/p/a/img")
             img_list = list()
             for i in img:
                 temp_img = str(i.xpath("./@src")[0])
                 if temp_img.endswith(".png"):
                     # Lazy-loaded images keep the real address in data-original; fall back to src when it is missing
                     try:
                         img_list.append(str(i.xpath("./@data-original")[0]))
                     except IndexError:
                         img_list.append(temp_img)
                 else:
                     img_list.append(temp_img)
             msg_dict["img_list"] = str(img_list)
             # Body text
             temp_re_obj = re.compile(r"<div class=\"content\">(.*?)<!-- GA -->", re.S)
             text_list = temp_re_obj.findall(response.text)
             text_content = ""
             for i in text_list:
                 text_temp = re.sub(
                     r"<(.*?)>| |&sup2;|\u200b|&yen;|&nbsp;|&ldquo;|&rdquo;|&middot;|&amp;|&mdash;|▼|\r|\n|\t|\\", "", i)
                 text_temp = re.sub("\"|'", "“", text_temp)
                 text_content += text_temp
             msg_dict["text_content"] = text_content
             self.q.put(msg_dict)
Example #17
 def get_ypimg_page(self, albums_href, store_id, num=1, img_href=None):
     """
     获取每个相册的信息,并对数据进行清洗放入队列
     :param albums_href:
     :param store_id:
     :return: None
     """
     if img_href is None:
         img_href = list()
     try:
         # "&tab=min" asks for the thumbnail layout of the album page
         albums_href1 = albums_href + "&tab=min" + "&page=" + str(num)
         response = request_url(albums_href1,
                                headers=self.headers,
                                proxies_list=self.ip_pool)
     except:
         response = request_url(albums_href + "&page=" + str(num),
                                headers=self.headers,
                                proxies_list=self.ip_pool)
     try:
         html = etree.HTML(response.text)
     except:
         print("Request failed: " + albums_href)
     else:
         albums_name = html.xpath(
             "//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()"
         )[0]
         albums_name = re.sub("\"|'", "“", albums_name)
         albums_name = re.sub(r"\n|\\|\r|\t|\r\n|\n\r", "", albums_name)
         other_msg = html.xpath(
             "//div[@class='showalbumheader__gallerydec']/div[1]/text()")
         other_msg = re.sub("\"|'", "“", str(other_msg))
         other_msg = re.sub(r"\n|\\|\r|\t|\r\n|\n\r", "", other_msg)
         imgs = html.xpath(
             "//div[@class='showalbum__parent showalbum__min min']/div[@class='showalbum__children image__main']/div[@class='image__imagewrap']/img/@src"
         )
         if len(imgs) <= 0:
             imgs = html.xpath(
                 "//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/div[@class='image__imagewrap']/img/@src"
             )
             if len(imgs) <= 0:
                 imgs = html.xpath(
                     "//div[@class='showalbum__parent showalbum__max max']/div[@class='showalbum__children image__main']/div[@class='image__imagewrap']/img/@src"
                 )
         if len(imgs) <= 0:
             img_href = str(img_href)
             albums_info = (albums_name, store_id, albums_href, img_href,
                            str(other_msg))
             insert_albums = "insert into albums (albums_name,store_id,albums_href,img_url,other_msg) values %s on duplicate key update %s;" % (
                 str(albums_info),
                 """albums_name="%s",store_id=%s,albums_href="%s",img_url="%s",other_msg="%s" """
                 % albums_info)
             insert_albums = re.sub(r"\n|\\|\r|\t|\r\n|\n\r", "",
                                    insert_albums)
             self.q.put(insert_albums)
         else:
             for img in imgs:
                 img = "http:" + img
                 img_href.append(img)
             num += 1
             self.get_ypimg_page(albums_href,
                                 store_id,
                                 num=num,
                                 img_href=img_href)
Example #18
import json
import re

from retryl_request import request_url
from lxml import etree
from urllib import parse

url = "http://www.flightclub.cn/news/a/sneaker/2019/1029/53700.html"
start_url = "http://www.flightclub.cn/"
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
}
with open("./ip_pool", "r") as f:
    content = f.read()
ip_pool = json.loads(content)
msg_dict = {"text_link": url}
response = request_url(url, headers=headers, proxies_list=ip_pool)
html = etree.HTML(response.text)
# Article title
temp_title = re.sub(r"\\|\n", "",
                    html.xpath("//div[@class='news_title']/h1/text()")[0])
temp_title = re.sub("\"|'", "“", temp_title)
msg_dict["title"] = temp_title
# Publication time
msg_dict["push_time"] = html.xpath("//div[@class='body']/div[1]/text()")[0]
# Call the read-count API to get the view count
count_url_list = re.findall(
    r"fetch\(\"(/ajax/news_count/\d+)\"\)\.then\(function\(r\)", response.text)
count_url = parse.urljoin(start_url, count_url_list[0])
count = request_url(count_url, headers=headers, proxies_list=ip_pool)
try:
    msg_dict["read_count"] = count.text
except AttributeError:
    # count is None when the read-count request failed
    print(count_url)