Example #1
    def parse(self, response):
        # Recover the page URL from the response repr ('<200 https://...>')
        start_url = str(response).split(" ")[1].replace(">", "")
        bro = self.login(start_url)
        num = 0
        for i in range(2):
            # Taobao pages by the 's' offset parameter, 44 listings per page
            url = "https://s.taobao.com/search?q=java&s=" + str(num)
            num += 44
            bro.get(url)
            html = bro.page_source

            soup = BeautifulSoup(html, 'lxml')
            data_list = soup.find_all(class_='item J_MouserOnverReq')
            for data in data_list:
                data_soup = BeautifulSoup(str(data), 'lxml')
                # image URL
                img_url = "http:" + data_soup.find(
                    class_='J_ItemPic img')['data-src']
                # price
                price = data_soup.find('strong').string
                # title
                title = data_soup.find(class_='J_ItemPic img')['alt']
                # detail page URL
                detail_url = "https:" + data_soup.find(
                    class_="pic-link J_ClickStat J_ItemPicA")["data-href"]

                bro.get(detail_url)
                time.sleep(1)
                html_second = bro.page_source
                soup = BeautifulSoup(html_second, 'lxml')

                # Monthly sales, review count and Tmall points only appear on
                # some detail pages, so fall back to 0 when the node is missing
                try:
                    svolume = soup.find(
                        class_="tm-ind-item tm-ind-sellCount").text.replace(
                            "月销量", "")
                except AttributeError:
                    svolume = 0

                try:
                    evaluate = soup.find(
                        class_="tm-ind-item tm-ind-reviewCount canClick tm-line3"
                    ).text.replace("累计评价", "")
                except AttributeError:
                    evaluate = 0

                try:
                    integral = soup.find(
                        class_="tm-ind-item tm-ind-emPointCount").text.replace(
                            "送天猫积分", "")
                except AttributeError:
                    integral = 0

                item = TaobaoItem(img_url=img_url,
                                  price=price,
                                  title=title,
                                  svolume=svolume,
                                  evaluate=evaluate,
                                  integral=integral,
                                  detail_url=detail_url)
                yield item
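This example drives the pages with Selenium: self.login() is a spider helper that is not shown but is expected to return a logged-in WebDriver, used above as bro. A minimal sketch, assuming Chrome and a manual QR-code login; the body below is an assumption, not the original implementation:

    def login(self, url):
        # Open the URL in a visible browser and wait for a manual Taobao
        # login (e.g. QR-code scan) before handing the driver back.
        import time
        from selenium import webdriver
        driver = webdriver.Chrome()  # assumes chromedriver is on PATH
        driver.get(url)
        time.sleep(30)  # placeholder: time to complete the login by hand
        return driver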
Example #2
 def loop_rate_parse(self, response):
     # Strip the JSONP wrapper to get at the embedded JSON payload
     body = json.loads(re.findall(r'\((.*)\)', response.text)[0])
     for comment in body['comments']:
         # One fresh TaobaoItem per review so yielded items don't share state
         items = TaobaoItem()
         items['id'] = comment['rateId']
         items['url'] = response.url
         items['platform'] = '淘宝'
         items['viewType'] = '问答'
         items['searchWord'] = response.meta['sw']
         items['crawlTime'] = self.get_localtime()
         items['publishTime'] = comment['date']
         items['level'] = 1
         items['authorName'] = comment['user']['nick']
         items['content'] = comment['content']
         yield items
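The get_localtime() call above is another spider helper that is not shown; a plausible minimal version, assuming it simply stamps the crawl with the current local time:

 def get_localtime(self):
     # Hypothetical body: crawl time formatted as 'YYYY-MM-DD HH:MM:SS'
     import time
     return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())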
Example #3
    def parse_item(self, response):
        # Pull the g_page_config JSON blob out of the page source with a regex
        pattern = re.compile('g_page_config = ({.*?});', re.S)
        json_data = re.search(pattern, response.text).group(1)
        json_l = json.loads(json_data)
        # Total page count, parsed from a fragment like '"totalPage":100'
        page = re.findall(r'"totalPage":\d+', response.text)
        totalPage = int(page[0].split(':')[1])
        for datas in json_l.get('mods').get('itemlist').get('data').get(
                'auctions'):
            item = TaobaoItem()
            item['item_loc'] = datas.get('item_loc')
            item['pic_url'] = 'https:' + datas.get('pic_url')
            item['raw_title'] = datas.get('raw_title')
            item['shop_link'] = datas.get('shopLink')  # the JSON key is 'shopLink'
            item['view_price'] = datas.get('view_price')
            item['view_sales'] = datas.get('view_sales')

            file_names = [
                'item_loc', 'pic_url', 'raw_title', 'view_price', 'view_sales',
                'shop_link'
            ]
            # Fill any missing fields with a placeholder string
            for name in file_names:
                if item[name] is None:
                    item[name] = 'there is no item data'
            yield item
        # Queue the remaining result pages: Taobao's 's' offset grows by 44
        # per page, so it must advance on every pass or all yielded requests
        # would point at the same page (and be dropped by the dupefilter)
        pager = json_l.get('mainInfo').get('modLinks').get('pager')
        data_values = response.meta['data_value']
        for i in range(totalPage - 2):
            print('finished crawling one page')  # debug output
            # Next-page link: the pager template plus the offset and timestamp
            times = self.get_time_stamp()
            data_values += 44
            other_data = {
                'data-key': 's',
                'data-value': str(data_values),
                '_ksTS': times[0]
            }
            next_url = 'https:' + pager + '&' + parse.urlencode(other_data)
            yield Request(url=next_url,
                          meta={'data_value': data_values},
                          cookies=self.cookie,
                          callback=self.parse_item)
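The get_time_stamp() helper used by Examples #3 and #4 is not shown either. Taobao's _ksTS parameter looks like 1511058139704_756 (see the hard-coded value in Example #5's URL), and callers index the result with times[0], so a plausible sketch returning a 1-tuple:

    def get_time_stamp(self):
        # Hypothetical body: a Taobao-style _ksTS value, i.e. a millisecond
        # timestamp plus an underscore and a random 3-digit suffix.
        import random
        import time
        ms = int(time.time() * 1000)
        return ('%d_%d' % (ms, random.randint(100, 999)),)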
Example #4
 def parse(self, response):
     # Pull the g_page_config JSON blob out of the page source with a regex
     pattern = re.compile('g_page_config = ({.*?});', re.S)
     json_data = re.search(pattern, response.text).group(1)
     json_l = json.loads(json_data)
     for datas in json_l.get('mods').get('itemlist').get('data').get(
             'auctions'):
         # A fresh item per listing: sharing one TaobaoItem instance across
         # iterations would let later rows overwrite earlier ones
         item = TaobaoItem()
         item['item_loc'] = datas.get('item_loc')
         item['pic_url'] = 'https:' + datas.get('pic_url')
         item['raw_title'] = datas.get('raw_title')
         item['shop_link'] = 'https:' + datas.get('shopLink')
         item['view_price'] = datas.get('view_price')
         item['view_sales'] = datas.get('view_sales')
         file_names = [
             'item_loc', 'pic_url', 'raw_title', 'view_price', 'view_sales',
             'shop_link'
         ]
         # Fill any missing fields with a placeholder string
         for name in file_names:
             if item[name] is None:
                 item[name] = 'there is no item data'
         yield item
     # Next results page: the pager template from the JSON plus the 's'
     # offset and a _ksTS timestamp, mirroring what Taobao's front end sends
     next_partial_url = json_l.get('mainInfo').get('modLinks').get('pager')
     times = self.get_time_stamp()
     data_value = 44  # offset of page 2 (44 results per page)
     other_data = {
         'data-key': 's',
         'data-value': str(data_value),
         '_ksTS': times[0],
     }
     next_url = 'https:' + next_partial_url + '&' + parse.urlencode(
         other_data)
     # meta carries the offset of the requested page so parse_item() can
     # continue the pagination from there
     yield Request(url=next_url,
                   meta={'data_value': data_value},
                   cookies=self.cookie,
                   callback=self.parse_item)
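Examples #3 and #4 also rely on a self.cookie attribute and an entry-point request that is not part of the snippet. A hypothetical start_requests, assuming a hard-coded search keyword and a cookie dict captured after logging in:

 def start_requests(self):
     # Hypothetical entry point: fetch the first results page with the
     # login cookies; parse() then seeds meta['data_value'] for parse_item()
     url = 'https://s.taobao.com/search?q=java'  # assumed keyword
     yield Request(url=url, cookies=self.cookie, callback=self.parse)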
Example #5
 def parse(self, response):
      # The 12 hot shop categories: every 6th name captured from the page JSON
      shop_type_name = re.findall(r'{"name":"(\S+?)"', response.text)
      # Create the item object
      item = TaobaoItem()
      llist = [n * 6 for n in range(12)]
      for n in range(len(shop_type_name)):
          if n in llist:
              item['商店种类'] = shop_type_name[n]
              url = 'https://shopsearch.taobao.com/search?data-key=s&data-value=0&ajax=true&_ksTS=1511058139704_756&app=shopsearch&spm=a230r.7195193.0.0.9l0Tsy&q=%s' % (
                  item['商店种类'])
              # note: time.sleep() blocks Scrapy's reactor; DOWNLOAD_DELAY is
              # the idiomatic way to throttle requests
              time.sleep(0.01)
              # deepcopy so each request carries its own snapshot of the item
              yield scrapy.Request(url,
                                   callback=self.parse_shop_type_detail,
                                   meta={'meta_1': copy.deepcopy(item)})
              time.sleep(1)
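The parse_shop_type_detail callback targeted above is not included in the example; a hypothetical skeleton showing how the deep-copied item comes back out of meta:

  def parse_shop_type_detail(self, response):
      # Hypothetical skeleton: recover the per-request copy of the item,
      # fill in the remaining per-shop fields, and yield it.
      item = response.meta['meta_1']
      # ... parse the shopsearch AJAX response and populate more fields ...
      yield item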
Example #6
    def next(self, response):
        item = TaobaoItem()
        url = response.url
        # Tell Tmall and Taobao detail pages apart by the URL's subdomain
        pattam_url = 'https://(.*?).com'
        subdomain = re.compile(pattam_url).findall(url)
        if subdomain[0] != 'item.taobao':
            # Tmall page
            title = response.xpath("//div[@class='tb-detail-hd']/h1/text()").extract()[0]
            pattam_price = '"defaultItemPrice":"(.*?)"'
            price = re.compile(pattam_price).findall(response.body.decode('utf-8', 'ignore'))  # Tmall
            pattam_id = 'id=(.*?)&'
            itemID = re.compile(pattam_id).findall(url)[0]
            # salesVolume = response.xpath('//div[@class="tb-sell-counter"]/a/strong/text()').extract()[0]
            itemInfo = response.xpath('//div[@class="attributes"]/div/ul/li/text()').extract()
        else:
            # Taobao page
            title = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()[0]
            price = response.xpath("//em[@class = 'tb-rmb-num']/text()").extract()[0]  # Taobao
            pattam_id = 'id=(.*?)$'
            itemID = re.compile(pattam_id).findall(url)[0]
            # salesVolume = response.xpath('//div[@class="tb-sell-counter"]/a/@title').extract()
            itemInfo = response.xpath('//div[@class="attributes"]/ul/li/text()').extract()
        # The comment count is too fiddly here; reviews are scraped directly
        # elsewhere, so this is skipped. For reference, the disabled approach:
        # # Build the URL of the endpoint that carries the comment count
        # comment_url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=' + str(itemID)
        # # Fetching this URL never raises: even for a wrong itemId the server
        # # still returns a page, so no try/except urllib wrapper is needed.
        # comment_data = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
        # pattam_comment = '"rateTotal":(.*?),"'
        # comment = re.compile(pattam_comment).findall(comment_data)

        item['title'] = title
        item['itemLink'] = response.url
        item['price'] = price
        item['itemID'] = itemID
        # item['salesVolume'] = salesVolume
        item['itemInfo'] = itemInfo
        yield item
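Example #6 fills the TaobaoItem fields below; a minimal items.py matching them (salesVolume and comment appear only in the disabled code paths above, and the other examples would declare extra fields such as view_price or 商店种类 the same way):

import scrapy


class TaobaoItem(scrapy.Item):
    title = scrapy.Field()        # name
    price = scrapy.Field()        # price
    salesVolume = scrapy.Field()  # monthly sales
    comment = scrapy.Field()      # comment count
    itemInfo = scrapy.Field()     # item details
    itemLink = scrapy.Field()     # link
    itemID = scrapy.Field()       # item ID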