Пример #1
0
 def start_req(self):
     """Seed the article-list crawl for the "数据冰山" column.

     Queues the first-page request plus paged requests with
     offset = page * 10 for pages 1-18; all responses go to
     :meth:`parse_arti_lst`.
     """
     seed = Aio_Req(self.first_req, self.parse_arti_lst)
     task_queue.add_task(seed)
     for page in range(1, 19):
         paged = Aio_Req(self.arti_req.format(offset=page * 10),
                         self.parse_arti_lst)
         task_queue.add_task(paged)
Пример #2
0
 def parse_items(self, response):
     """
     Parse one forum topic-list page.

     :param response: dict carrying the page HTML (``response``), current
         page number (``page``), URL template (``base_link``) and the car
         model name (``car``) forwarded from the request meta.
     :return: None; one record per topic is persisted, then a combined
         comment/view-count request and the next list page are queued.
     """
     text = response["response"]
     page = response["page"]
     base_link = response["base_link"]
     log.log(base_link, "debug")
     car = response["car"]
     try:
         if text.strip():
             # Positional extraction: index i of every list below is
             # assumed to describe the same topic entry -- an IndexError
             # from a mismatched page layout is caught below.
             id_ = re.findall(r'data-topicid="(.*?)">', text, re.S)
             link = re.findall(r' <a href="(.*?)" >', text, re.S)
             title = re.findall(r'<h4>(.*?)</h4>', text, re.S)
             comm = re.findall(r'ass="comment">(.*?)帖</span>', text, re.S)
             time_ = re.findall(r'<time>(.*?)</time>', text, re.S)
             id_lst = []
             for i in range(len(id_)):
                 data = dict()
                 data["topic_id"] = id_[i]
                 id_lst.append(id_[i])
                 data["car"] = car
                 data["link"] = link[i]
                 data["title"] = title[i].strip()
                 data["comm"] = comm[i]
                 data["time"] = time_[i]
                 log.log(str(data), "info")
                 mon.insert("item_data", data)
             if id_lst:
                 print(id_lst)
                 # %2C is a URL-encoded comma: the endpoint takes all topic
                 # ids of this page in a single request.
                 ids = "%2C".join(id_lst)
                 log.log(ids, "debug")
                 # Millisecond timestamp; note this rebinds time_ (the
                 # regex result above is no longer needed at this point).
                 time_ = int(time.time() * 1000)
                 comm_view_link = self.key_link.format(ids=ids, time_=time_)
                 log.log(comm_view_link, "debug")
                 task_queue.add_task(
                     Aio_Req(comm_view_link,
                             callback=self.parse_comment_view))
             task_queue.add_task(
                 Aio_Req(base_link.format(page + 1),
                         self.parse_items,
                         meta={
                             "page": page + 1,
                             "base_link": base_link,
                             "car": car
                         }))
     except Exception as e:
         # Broad catch by design: a malformed page is logged and skipped
         # instead of killing the whole crawl.
         log.log(str(e), "error")
         log.log(base_link + str(page), "error")
         log.log(text, "error")
Пример #3
0
 def parse_kb_detail(self, response):
     """
     Store one page of raw word-of-mouth ("口碑") JSON and queue the next page.

     :param response: dict with the raw page body (``response``) plus
         ``brand``, ``series``, ``page`` and the URL template ``base_link``
         forwarded from the request meta.
     :return: None; pagination stops when the body is <= 50 chars
         (heuristic for an empty/last page).
     """
     brand = response["brand"]
     series = response["series"]
     page = response["page"]
     base_link = response["base_link"]
     text = response["response"]
     try:
         if len(text) > 50:
             data = {}
             data["brand"] = brand
             data["series"] = series
             data["kb_raw"] = text
             log.log(str(data), "info")
             mon.insert("kb_detail", data)
             task_queue.add_task(
                 Aio_Req(base_link.format(page + 1),
                         self.parse_kb_detail,
                         meta={
                             "base_link": base_link,
                             "brand": brand,
                             "series": series,
                             "page": page + 1
                         }))
     # Narrowed from a bare ``except:`` which also swallowed
     # KeyboardInterrupt/SystemExit and discarded the error entirely.
     except Exception as e:
         log.log(str(e), "error")
         log.log(brand + series + base_link + str(page), "error")
Пример #4
0
 def parse_comm(self, response):
     """
     Parse one page of an article's comments and queue the next page.

     :param response: dict with the raw JSON text (``response``) plus the
         article id (``comm_id``) and current page index (``offset``).
     :return: None; the page is persisted and the next page is queued
         until the API reports ``paging.is_end``.
     """
     offset = response["offset"]
     comm_id = response["comm_id"]
     try:
         text = json.loads(response["response"])
         data = {}
         data["comm_id"] = comm_id
         data["arti_data"] = text
         log.log("评论    " + str(data), "info")
         mon.insert("comm_lst", data)
         if not text["paging"]["is_end"]:
             log.log("下一个    ", "debug")
             next_task = Aio_Req(self.comm_req.format(comm_id=comm_id,
                                                      offset=20 *
                                                      (offset + 1)),
                                 self.parse_comm,
                                 meta={
                                     "comm_id": comm_id,
                                     # BUG FIX: was ``offset`` -- the next
                                     # callback then rebuilt the same URL
                                     # (20 * (offset + 1)) and refetched
                                     # the same page forever.
                                     "offset": offset + 1
                                 })
             task_queue.add_task(next_task)
     except Exception as e:
         log.log(str(e), "error")
Пример #5
0
 def start_req(self):
     """Queue page 1 of the rental ("zufang") listing for every city root URL.

     Each entry in ``tasks`` is a city base URL; the paging template and the
     starting page number are forwarded to :meth:`parse_item` via meta.
     """
     for root in tasks:
         template = root + "zufang/pg{}/"
         first_page = 1
         req = Aio_Req(template.format(first_page),
                       self.parse_item,
                       meta={"page": first_page, "base_link": template})
         task_queue.add_task(req)
Пример #6
0
 def parse_item(self, response):
     """
     Parse one page of second-hand ("ershou") housing listings.

     :param response: dict with the page HTML (``response``), the current
         page number (``page``) and the URL template (``base_link``).
     :return: None; each listing is persisted and the next page is queued
         while the reported total page count has not been passed.
     """
     html = etree.HTML(response["response"])
     page = response["page"]
     base_link = response["base_link"]
     items = html.xpath('//ul[@class="sellListContent"]/li')
     next_item = html.xpath('//div[@comp-module="page"]/@page-data')
     if not next_item:
         # No pagination widget found -- dump the HTML for inspection.
         print(response["response"])
     log.log("下一页item   " + str(next_item), "debug")
     if next_item:
         # NOTE(review): eval() on scraped markup is unsafe (arbitrary code
         # execution); the page-data attribute appears to be JSON, so
         # json.loads() should replace this once the format is confirmed.
         next_data = eval(next_item[0])
         if int(next_data["totalPage"]) >= page:
             log.log(("总页数   " + str(next_data["totalPage"]) + "   " +
                      "现在页数   " + str(page)), "debug")
             log.log(str(items), "debug")
             if items:
                 for j in items:
                     data = dict()
                     title = j.xpath('a/img/@alt')
                     data["city"] = j.xpath('//title/text()')
                     data["title"] = title
                     link = j.xpath('a/@href')
                     data["link"] = link
                     name = j.xpath(
                         'div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/text()'
                     )
                     data["name"] = name
                     sty_size = j.xpath(
                         'div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/text()'
                     )
                     data["size"] = sty_size
                     data["uni_price"] = j.xpath(
                         'div[@class="info clear"]/div[@class="priceInfo"]/div[2]/@data-price'
                     )
                     data["price"] = j.xpath(
                         'div[@class="info clear"]/div[@class="priceInfo"]/div[1]/span/text()'
                     )
                     log.log(str(data), "info")
                     mon.insert("ershou", data)
                 task_queue.old_task(base_link.format(page))
                 task_queue.add_task(
                     Aio_Req(base_link.format(page + 1),
                             callback=self.parse_item,
                             meta={
                                 # BUG FIX: was ``page`` -- the callback
                                 # then requested format(page + 1) with the
                                 # same page value, i.e. the same URL,
                                 # forever, and the stop condition never
                                 # advanced.
                                 "page": page + 1,
                                 "base_link": base_link
                             }))
Пример #7
0
 def parse_item(self, response):
     """
     Parse one page of rental ("zufang") housing listings.

     :param response: dict with the page HTML (``response``), the current
         page number (``page``) and the URL template (``base_link``).
     :return: None; each listing is persisted and the next page is queued
         while the reported total page count has not been passed.
     """
     html = etree.HTML(response["response"])
     page = response["page"]
     base_link = response["base_link"]
     items = html.xpath('//ul[@class="house-lst"]/li')
     next_item = html.xpath('//div[@comp-module="page"]/@page-data')
     log.log("下一页item   " + str(next_item), "debug")
     if next_item:
         # NOTE(review): eval() on scraped markup is unsafe (arbitrary code
         # execution); the page-data attribute appears to be JSON, so
         # json.loads() should replace this once the format is confirmed.
         next_data = eval(next_item[0])
         if int(next_data["totalPage"]) >= page:
             log.log(("总页数   " + str(next_data["totalPage"]) + "   " +
                      "现在页数   " + str(page)), "debug")
             log.log(str(items), "debug")
             if items:
                 for j in items:
                     data = dict()
                     title = j.xpath('div[@class="info-panel"]/h2/a/@title')
                     data["title"] = title
                     data["city"] = j.xpath('//title/text()')
                     link = j.xpath('div[@class="pic-panel"]/a/@href')
                     data["link"] = link
                     name = j.xpath(
                         'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/a/span/text()'
                     )
                     if not name:
                         # Fallback layout: region label instead of a link.
                         name = j.xpath(
                             'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/span[@class="region"]/text()'
                         )
                     data["name"] = name
                     sty_size = j.xpath(
                         'div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/span[@class="meters"]/text()'
                     )
                     data["size"] = sty_size
                     data["price"] = j.xpath(
                         'div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span/text()'
                     )
                     data["address"] = j.xpath(
                         'div[@class="info-panel"]/div[@class="col-1"]/div[@class="other"]/div[@class="con"]/a/text()'
                     )
                     log.log(str(data), "info")
                     mon.insert("zufang", data)
                 task_queue.old_task(base_link.format(page))
                 task_queue.add_task(
                     Aio_Req(base_link.format(page + 1),
                             callback=self.parse_item,
                             meta={
                                 # BUG FIX: was ``page`` -- the callback
                                 # then re-requested the same URL forever
                                 # and the stop condition never advanced.
                                 "page": page + 1,
                                 "base_link": base_link
                             }))
Пример #8
0
 def start_req(self):
     """
     Queue page 1 of the forum listing for every car model in ``self.cars``.

     :return: None
     """
     # Iterate items() directly instead of keys() + lookup; the original's
     # unused ``car`` local is gone.
     for car, base_link in self.cars.items():
         page = 1
         task_queue.add_task(
             Aio_Req(base_link.format(page),
                     self.parse_items,
                     meta={
                         "page": page,
                         "base_link": base_link,
                         "car": car
                     }))
Пример #9
0
 def start_req(self):
     """Seed the word-of-mouth ("口碑") crawl.

     For every stored series that exposes a 口碑 page, rewrite its .shtml
     URL into the paged JSON endpoint and queue page 1 for
     :meth:`parse_kb_detail`.
     """
     for doc in mon.data_find("kb"):
         sele_info = doc["sele_info"]
         if "口碑" not in sele_info:
             continue
         kb_link = sele_info["口碑"].replace(
             '.shtml',
             "/list-more.json?number=5&page={}&jsonpCallback=article&_=1528591590986&callback=article"
         )
         meta = {
             "base_link": kb_link,
             "brand": doc["brand"],
             "series": doc["series"][0].strip(),
             "page": 1,
         }
         task_queue.add_task(
             Aio_Req(kb_link.format(1), self.parse_kb_detail, meta=meta))
Пример #10
0
 def parse_brand(self, response):
     """
     Parse the brand index page.

     :param response: dict whose ``response`` key holds the page HTML.
     :return: None; stores one record per brand and queues its series page.
     """
     tree = etree.HTML(response["response"])
     for node in tree.xpath('//div[@class="img_list_bg"]/ul/li'):
         brand_name = node.xpath('h4/a/text()')[0]
         brand_link = urljoin(self.host, node.xpath('h4/a/@href')[0])
         record = {
             "brand_name": brand_name,
             "brand_link": brand_link,
             "brand_id": node.xpath('a/@class')[0],
         }
         log.log(str(record), "info")
         task_queue.add_task(
             Aio_Req(brand_link, self.parse_series, meta={"brand": brand_name}))
         mon.insert("brand_info", record)
Пример #11
0
 def parse_series(self, response):
     """
     Parse a brand's series listing page.

     :param response: dict with the page HTML plus the brand name carried
         over in the request meta.
     :return: None; stores one record per series and queues its review
         ("dianping") page.
     """
     brand = response["brand"]
     tree = etree.HTML(response["response"])
     for node in tree.xpath('//ul[@class="pt_list"]/li'):
         series_name = node.xpath('a/div[@class="info"]/strong/text()')
         series_link = urljoin(self.host, node.xpath('a/@href')[0])
         record = {
             "series_name": series_name,
             "series_link": series_link,
             "brand": brand,
         }
         log.log(str(record), "info")
         # Swap the trailing "?param=..." query for the review page path.
         kb_link = re.sub(r'(\?param=.*?)$', 'dianping.shtml', series_link)
         task_queue.add_task(
             Aio_Req(kb_link,
                     self.parse_kb,
                     meta={"brand": brand, "series": series_name}))
         mon.insert("series_info", record)
Пример #12
0
 def parse_arti_lst(self, response):
     """
     Parse the article-list JSON response.

     :param response: dict whose ``response`` key holds raw JSON text.
     :return: None; queues the first comment-page request (offset 0) for
         every article in the payload.
     """
     try:
         payload = json.loads(response["response"])
         log.log(str(payload), "info")
         for article in payload["data"]:
             log.log(str(article), "info")
             comm_id = article["id"]
             offset = 0
             url = self.comm_req.format(comm_id=comm_id, offset=offset)
             task_queue.add_task(
                 Aio_Req(url,
                         self.parse_comm,
                         meta={"comm_id": comm_id, "offset": offset}))
     except Exception as err:
         log.log(str(err), "error")
Пример #13
0
 def start_req(self):
     """Fire the single seed request for the brand index and block until done."""
     seed = Aio_Req(self.start_link, self.parse_brand)
     loop = asyncio.get_event_loop()
     loop.run_until_complete(asyncio.wait([seed.aio_req()]))
 def start_req(self, total_pages=17069):
     """
     Queue one follower-list request per page.

     :param total_pages: number of pages to enqueue; the previously
         hard-coded 17069 is kept as the default so existing callers are
         unchanged.
     :return: None; each request carries its page index as ``offset`` and
         hits ``self.follow_base_link`` with a record offset of i * 10.
     """
     for i in range(total_pages):
         task_queue.add_task(
             Aio_Req(self.follow_base_link.format(i * 10),
                     self.parse_follow_lst,
                     meta={"offset": i}))