Пример #1
0
    def parse_page(self, response):
        """Extract product records from the ``g_page_config`` JSON of a page.

        Yields one ``TaobaoItem`` per entry of
        ``mods.itemlist.data.auctions``.  When the embedded JSON cannot be
        located, a notice is printed and nothing is yielded.
        """
        match = re.search(r"g_page_config = (.+?\}\});", response.text)
        if not match:
            print("没有找到")
            return

        # Keys copied verbatim from every auction record, in this order.
        wanted = ("title", "raw_title", "view_price", "detail_url",
                  "item_loc", "view_sales", "comment_count")
        page_config = json.loads(match.group(1))
        for auction in page_config["mods"]["itemlist"]["data"]["auctions"]:
            yield TaobaoItem(**{key: auction[key] for key in wanted})
Пример #2
0
 def parse(self, response):
     """Schedule a detail request per listed model, then the next list page.

     For each ``div.list-item`` the model name is read, an image directory
     is created through ``self.mkdir`` and a request for the model's page
     is yielded with ``self.parse_mm_page`` as callback.  Entries that
     cannot be processed are skipped.  Afterwards ``self.pageindex`` is
     advanced and the next list page is requested with this method as
     callback.
     """
     for sel in response.xpath('//div[@class="list-item"]'):
         try:
             # Some pages are non-standard and carry no model name.
             name = sel.xpath(
                 './/a[@class="lady-name"]/text()').extract()[0]
             # Single-argument print() emits the same text on Python 2 and 3.
             print(u'美眉姓名: %s' % name)
             self.mkdir('images/full/%s' % (name))
             item = TaobaoItem()
             item['mm_name'] = name
             href = sel.xpath('.//a[@class="lady-avatar"]/@href').extract()
             url = response.urljoin(href[0])
             yield scrapy.Request(url,
                                  meta={
                                      'driver': response.meta['driver'],
                                      'PhantomJS': response.meta['PhantomJS'],
                                      'item': item},
                                  callback=self.parse_mm_page)
             print(u'去美眉图片页抓图:scheduling %s' % url)
         except Exception:
             # Skip this entry and move on to the next one.  ``Exception``
             # (not a bare ``except``) lets GeneratorExit and
             # KeyboardInterrupt propagate out of the ``yield`` above.
             continue

     # Request the next list page.
     self.pageindex += 1
     next_page = self.start_urls[0] + str(self.pageindex)
     yield scrapy.Request(next_page,
                          meta={
                              'driver': response.meta['driver'],
                              'PhantomJS': response.meta['PhantomJS'],
                          },
                          callback=self.parse)
Пример #3
0
    def parse(self, response):
        """Scrape the product cards of the current page, then page forward.

        Each matched card yields a ``TaobaoItem`` with name, price, payment
        count, shop name and shop address.  The last pager link is then
        clicked through ``self.browser`` and the same URL is re-requested
        with the browser's page source attached to the request meta.

        The page counter travels in ``response.meta['i']``; it defaults to
        zero when absent and the method stops once it exceeds 100.
        """
        print("1111111111111----------")
        time.sleep(5)

        # The counter is missing on the first request, so default it rather
        # than incrementing ``None``.
        page_no = (response.meta.get("i") or 0) + 1
        if page_no > 100:
            return

        node_list = response.xpath("//div[@class='item J_MouserOnverReq  ']/div[@class='ctx-box J_MouseEneterLeave J_IconMoreNew']")
        print(node_list)
        for node in node_list:
            item = TaobaoItem()
            item['name'] = node.xpath("./div[@class='row row-2 title']/a[@class='J_ClickStat']/text()[2]").extract()[0].encode("utf-8")
            item['price'] = node.xpath("./div[@class='row row-1 g-clearfix']/div[@class='price g_price g_price-highlight']/strong/text()").extract()[0].encode("utf-8")
            item['payment_num'] = node.xpath("./div[@class='row row-1 g-clearfix']/div[@class='deal-cnt']/text()").extract()[0].encode("utf-8")
            item['shop_name'] = node.xpath("./div[@class='row row-3 g-clearfix']/div[@class='shop']/a/span[2]/text()").extract()[0].encode("utf-8")
            item['shop_address'] = node.xpath("./div[@class='row row-3 g-clearfix']/div[@class='location']/text()").extract()[0].encode("utf-8")
            yield item

        # Click the "next page" link (the last matching pager element).
        buttons = self.browser.find_elements(By.XPATH, '//a[@class="J_Ajax num icon-tag"]')
        if not buttons:
            # No pager link on this page: nothing further to request.
            return
        buttons[-1].click()
        time.sleep(random.random() * 2)
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        html = self.browser.page_source
        # Forward the counter so the 100-page limit is enforced on later pages.
        yield scrapy.Request(url=response.url,
                             callback=self.parse,
                             meta={'html': html, 'i': page_no},
                             dont_filter=True)
Пример #4
0
    def parse(self, response):
        """Yield one ``TaobaoItem`` per auction in the page's ``g_page_config``.

        The JSON assigned to ``g_page_config`` is cut out of the page text
        and decoded.  Every entry of ``mods.itemlist.data.auctions`` becomes
        a freshly created item.  Entries with missing or malformed fields
        are skipped.  When the configuration cannot be found, a warning is
        logged and nothing is yielded.
        """
        import logging

        found = re.findall(r'g_page_config = (.*?) g_srp_loadCss',
                           response.text, re.S)
        if not found:
            logging.getLogger(__name__).warning(
                'g_page_config not found in %s',
                getattr(response, 'url', '<unknown url>'))
            return

        # Drop the trailing ';' that terminates the JavaScript assignment.
        content = json.loads(found[0].strip()[:-1])

        # List of product records.
        data_list = content['mods']['itemlist']['data']['auctions']

        digits = re.compile(r'\d+')
        for data in data_list:
            try:
                # A new item per record, so already-yielded items are never
                # overwritten and no stale fields leak between records.
                item = TaobaoItem()
                item['title'] = data['raw_title']
                item['price'] = float(data['view_price'])
                item['sales'] = int(digits.findall(data['view_sales'])[0])
                item['is_tmall'] = '是' if data['shopcard']['isTmall'] else '否'
                item['shops_loc'] = data['item_loc']
                item['shops_name'] = data['nick']
                item['shops_id'] = data['user_id']
                detail_url = data['detail_url']
                if detail_url.startswith('//'):
                    # Protocol-relative link: add the scheme with its colon.
                    detail_url = 'http:' + detail_url
                item['goods_url'] = detail_url
                item['comment_count'] = int(data['comment_count'])
                item['goods_id'] = data['nid']
            except (KeyError, IndexError, TypeError, ValueError,
                    AttributeError):
                # Incomplete or malformed record: skip it, keep the rest.
                continue
            yield item
Пример #5
0
 def parse(self, response):
     """Yield the auctions of one result page and schedule the next page.

     The ``g_page_config`` JSON is read from the page and every entry of
     ``mods.itemlist.data.auctions`` is yielded as a ``TaobaoItem``.  A
     progress line is printed afterwards.  If the page held fewer than 44
     auctions the spider is closed; otherwise ``self.a`` is advanced and
     the next page is requested with this method as callback.
     """
     p = 'g_page_config = ({.*?});'
     g_page_config = json.loads(response.selector.re(p)[0])
     auctions = g_page_config['mods']['itemlist']['data']['auctions']

     y = 0  # number of auctions seen on this page
     for auction in auctions:
         y += 1
         item = TaobaoItem()
         item['title'] = auction['raw_title']
         item['price'] = auction['view_price']
         item['nick'] = auction['nick']
         item['sales'] = auction['view_sales']
         item['loc'] = auction['item_loc']
         item['detail_url'] = auction['detail_url']
         if item['detail_url'].startswith('//'):
             item['detail_url'] = 'https:' + item['detail_url']
         # Yield every counted auction, not only those whose URL needed a
         # scheme prefix.
         yield item

     num = self.a * 44 + y
     print('淘宝【' + self.keyword + '】: ' + '已采集' + str(self.a + 1) +
           '页' + ',总数据 :' + str(num) + '条')
     if y < 44:
         # A short page is treated as the last one.
         self.crawler.engine.close_spider(self, '已爬取所有信息!')
     else:
         self.a = self.a + 1
         yield scrapy.Request(self.url % (self.keyword, self.a * 44),
                              callback=self.parse)
Пример #6
0
    def next(self, response):
        """Collect title, price, shop and shop-rating fields from a detail page.

        The three rating scores are all set to empty strings when any of
        them cannot be read from the page.

        NOTE(review): the populated item is neither returned nor yielded in
        the visible code — confirm whether the method continues elsewhere.
        """
        item = TaobaoItem()

        item["title"] = response.xpath(
            '//h3[@class="tb-main-title"]/@data-title').extract()[0].encode(
                'utf-8')
        item["link"] = response.url
        item["price"] = response.xpath(
            '//em[@class="tb-rmb-num"]/text()').extract()[0]
        item['shop'] = response.xpath(
            '//*[@id="J_ShopInfo"]//dl/dd/strong/a/text()').extract(
            )[0].encode('utf-8').strip()
        shop_url = 'http:' + response.xpath(
            '//*[@id="J_ShopInfo"]//dl/dd/strong/a/@href').extract()[0]
        item['shopLink'] = shop_url
        try:
            item['describeScore'] = response.xpath(
                '//div[@class="tb-shop-rate"]/dl[1]/dd/a/text()').extract(
                )[0].strip()
            item['serviceScore'] = response.xpath(
                '//div[@class="tb-shop-rate"]/dl[2]/dd/a/text()').extract(
                )[0].strip()
            item['logisticsScore'] = response.xpath(
                '//div[@class="tb-shop-rate"]/dl[3]/dd/a/text()').extract(
                )[0].strip()
        except Exception:
            # The exception object was never used; this spelling is valid
            # on both Python 2.6+ and Python 3.
            item['describeScore'] = ""
            item['serviceScore'] = ""
            item['logisticsScore'] = ""
Пример #7
0
    def parse2(self, response):
        """Build a ``TaobaoItem`` from a detail page plus values in the meta.

        Title and price come from the page; URL, class, sales count and
        area come from ``response.meta``.  The seller name is taken from
        the first lookup that matches exactly one node, else ``'未知'``.
        """
        item = TaobaoItem()
        page = Selector(response)
        # The last four characters of the <title> text are cut off.
        item['title'] = page.xpath('//head/title/text()').extract()[0][:-4]
        item['goods_url'] = response.meta['goods_url']
        item['goods_class'] = response.meta['goods_class']
        item['price'] = page.xpath('//strong[@id="J_StrPrice"]/em[@class="tb-rmb-num"]/text()').extract()[0]
        # The last three characters of the sales text are cut off.
        item['sell_count'] = response.meta['sell_count'][:-3]
        item['area'] = response.meta['area']

        # Candidate seller locations, tried in order.  The former third
        # attempt repeated the second query and could never succeed.
        seller_queries = (
            '//div[@class="tb-shop-name"]/dl/dd/strong/a/@title',
            '//span[@class="shop-name-title"]/@title',
        )
        for query in seller_queries:
            seller = page.xpath(query).extract()
            if len(seller) == 1:
                item['seller'] = seller[0]
                break
        else:
            item['seller'] = '未知'

        yield item
Пример #8
0
    def parse(self, response):
        """Request the detail page of each product card on a list page.

        Increments ``GoodsSpider.count``, logs the URL when no cards are
        found, and for the cards in slice ``[1:59]`` yields a request whose
        meta carries a ``TaobaoItem`` holding the price markup and the
        product URL.
        """
        GoodsSpider.count += 1

        # XPath of the product cards in the result list.
        divs = response.xpath(
            "//*[@id='listsrp-itemlist']/div/div/div[1]/div")
        if not divs:
            # Nothing matched: record the URL of the list page.
            self.log("list page error--%s" % response.url)

        for div in divs[1:59]:
            item = TaobaoItem()
            # Product price.
            item["price"] = div.xpath(
                "div[3]/div[1]/div[1]/strong")[0].extract()
            # Product link.
            pre_goods_url = div.xpath("div[3]/div[2]/a/@href")[0].extract()

            # Prefix the scheme when the link does not contain "https:".
            item["goodsUrl"] = (pre_goods_url if "https:" in pre_goods_url
                                else "https:" + pre_goods_url)

            # Pass the bound method itself; calling it here would run
            # parse_detail immediately without a response.
            yield scrapy.Request(url=item["goodsUrl"],
                                 meta={'item': item},
                                 callback=self.parse_detail,
                                 dont_filter=True)
Пример #9
0
    def next(self, response):
        """Fill a ``TaobaoItem`` from the request meta and the page URL."""
        item = TaobaoItem()
        # (item field, key in response.meta), copied in this order.
        for field, meta_key in (('title', 'name'),
                                ('price', 'price'),
                                ('address', 'address')):
            item[field] = response.meta[meta_key]
        item['link'] = response.url
Пример #10
0
    def parse(self, response):
        """Yield one ``TaobaoItem`` per product found in the page text.

        Product attributes are collected with one regular expression per
        JSON key and combined positionally.  Combination stops at the
        shortest attribute list, so a page with no matches or with uneven
        lists yields fewer (or no) items instead of raising ``IndexError``.
        """
        html = response.text

        def values(key):
            # Every string value stored under *key* in the page text.
            return re.findall('"%s":"(.*?)"' % key, html, re.S)

        title = values('raw_title')
        pic_url = values('pic_url')
        view_price = values('view_price')
        view_fee = values('view_fee')
        item_loc = values('item_loc')
        view_sales = values('view_sales')
        nid = values('nid')
        nick = values('nick')
        # The final "nick" match is discarded — presumably it does not
        # belong to a product entry; verify against a live page.
        if nick:
            nick.pop()

        rows = zip(title, pic_url, view_price, view_fee,
                   item_loc, view_sales, nid, nick)
        for (row_title, row_pic, row_price, row_fee,
             row_loc, row_sales, row_nid, row_nick) in rows:
            item = TaobaoItem()
            item['title'] = row_title
            print(row_title)
            item['pic_url'] = row_pic
            item['view_price'] = row_price
            item['view_fee'] = row_fee
            item['item_loc'] = row_loc
            item['view_sales'] = row_sales
            item['nid'] = 'https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.13c42140RGlAs3&id=' + str(row_nid)
            item['nick'] = row_nick
            print(item)
            yield item
Пример #11
0
    def allnvzhuang(self, response):
        """Yield an item with the ``count5`` emphasis texts and the page URL."""
        item = TaobaoItem()
        emphasised = response.xpath('//div[@class="count count5"]//em')
        texts = emphasised.xpath('text()').extract()
        item['city'] = texts
        item['url'] = response.url

        yield item
Пример #12
0
 def get_product(self, response):
     """Yield one ``TaobaoItem`` per product found in the page text.

     Product attributes are collected with one regular expression per JSON
     key and combined by position, driven by the seller list with its last
     entry removed.  On any failure the page URL is logged at debug level
     and iteration ends.
     """
     URL = response.url
     try:
         html = response.text

         def values(key):
             # Every string value stored under *key* in the page text.
             return re.findall('"%s":"(.*?)"' % key, html, re.S)

         title = values('raw_title')
         pic_url = values('pic_url')
         view_price = values('view_price')
         view_fee = values('view_fee')
         item_loc = values('item_loc')
         view_sales = values('view_sales')
         nid = values('nid')
         nick = values('nick')
         nick.pop(-1)
         for i, seller in enumerate(nick):
             item = TaobaoItem()
             item['title'] = title[i]
             item['pic_url'] = pic_url[i]
             item['view_price'] = view_price[i]
             item['view_fee'] = view_fee[i]
             item['item_loc'] = item_loc[i]
             item['view_sales'] = view_sales[i]
             item['nid'] = 'https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.13c42140RGlAs3&id=' + str(nid[i])
             item['nick'] = seller
             yield item
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that closing the
         # generator (GeneratorExit) or Ctrl-C is not reported as a failure.
         self.logger.debug(URL)
Пример #13
0
 def next(self,response):
     """Assemble a ``TaobaoItem`` for a product detail page.

     Page data (title, attribute list, price fallback, description) is read
     from ``response``; click count, comment count and sold quantity are
     fetched with blocking ``urllib`` requests keyed by the id found in the
     page URL.  The sold-quantity request is attempted up to five times,
     one second apart.  The promotional price from that response is used
     when present, otherwise the price shown on the page.
     """
     import json

     item = TaobaoItem()
     item['key'] = response.meta['key']
     # Product title.
     item["title"] = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()[0]
     intro = response.xpath('//ul[@class="attributes-list"]//li/text()').extract()
     # Strip non-breaking spaces directly, with no eval() round-trip
     # through the list's repr.
     item['intro'] = [text.replace('\xa0', '') for text in intro]
     # URL of the current page.
     item["link"] = response.url
     # The product id is everything after "id=" up to the end of the URL.
     thisid = re.findall(r'id=(.*?)$', response.url)[0]

     clickurl = 'https://count.taobao.com/counter3?callback=jsonp86&keys=ICCP_1_' + str(thisid)
     clickdata = urllib.request.urlopen(clickurl).read().decode("utf-8", "ignore")
     item['click'] = re.findall(r':(\d+)', clickdata)[0]

     # Endpoint (found by traffic inspection) that reports the comment count.
     commenturl = "https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId=" + str(thisid)
     commentdata = urllib.request.urlopen(commenturl).read().decode("utf-8", "ignore")
     item["comment"] = re.findall(r'"count":(.*?)}', commentdata)[0]

     headers = {'Referer': response.url}
     seldurl = 'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId='+str(thisid)+'&modules=soldQuantity,xmpPromotion&callback=onSibRequestSuccess'
     request = urllib.request.Request(seldurl, headers=headers)

     seld_dict = None
     for _ in range(5):
         try:
             raw = urllib.request.urlopen(request).read().decode("utf-8", "ignore")
             # SECURITY: this is remote data.  It is decoded with
             # json.loads instead of eval(), which would execute whatever
             # the server sent.  The JSON is the text between the first
             # '(' and the last ')' of the JSONP wrapper.
             payload = raw[raw.index('(') + 1:raw.rindex(')')]
             seld_dict = json.loads(payload)
             item['seld'] = seld_dict['data']['soldQuantity']['confirmGoodsCount']
             break
         except Exception:
             time.sleep(1)

     try:
         item["price"] = seld_dict['data']['promotion']['promoData']['def'][0]['price']
     except (KeyError, IndexError, TypeError):
         # No usable promotion data (or no response at all): fall back to
         # the original price shown on the page.
         item["price"] = response.xpath("//em[@class='tb-rmb-num']/text()").extract()[0]

     item['extract_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     description = response.xpath('//p[@class="tb-subtitle"]/text()').extract()
     if not description:
         description = response.xpath('//p[@class="newp"]/text()').extract()
     if description:
         item['description'] = description[0]
     yield item
Пример #14
0
    def parse(self, response):
        """Write the raw response body to ``data.html``."""
        item = TaobaoItem()

        # Binary mode: the body is written exactly as received.
        with open('data.html', 'wb') as dump:
            dump.write(response.body)
Пример #15
0
    def parse_page(self, response):
        """Return a ``TaobaoItem`` describing the product page.

        The serial number is the URL part after ``?id=``; images, choices,
        sizes, colours and properties come from the ``extract_*`` helpers.
        """
        item = TaobaoItem()

        url_parts = response.url.split("?id=")
        item['sn'] = url_parts[1]
        item['images'] = self.extract_images(response)
        choices, sizes, colors = self.extract_choice(response)
        item['choices'] = choices
        item['sizes'] = sizes
        item['colors'] = colors
        item['properties'] = self.extract_properties(response)
        return item
Пример #16
0
    def parse_self(self, response):
        """Yield an item with the page's ``<h1>`` text and joined paragraphs."""
        heading = response.xpath('//h1/text()').get()
        paragraphs = response.xpath(
            "//div[@class='fed-arti-content fed-padding']/p/text()").getall()
        body_text = ''.join(paragraphs)

        item = TaobaoItem()
        item['mtitle'] = heading
        item['mcontent'] = body_text
        yield item
Пример #17
0
    def next(self, response):
        """Yield an item with link, title and price for a product page.

        Pages whose host (the part between the scheme and ``.com``) is
        ``item.taobao`` are read with the Taobao selectors.  Every other
        page is treated as Tmall / Tmall supermarket, where the price is
        taken from ``"defaultItemPrice"`` in the page source.  ``title``
        and ``price`` are stored as lists of matches.
        """
        item = TaobaoItem()

        url = response.url
        item["link"] = url  # product link

        # Escaped dot and optional "s": an http:// or unmatched URL falls
        # through to the Tmall branch rather than raising IndexError.
        host = re.findall(r'https?://(.*?)\.com', url)
        if host and host[0] == 'item.taobao':
            # Taobao
            title = response.xpath(
                "//h3[@class='tb-main-title']/@data-title").extract()
            price = response.xpath(
                "//em[@class='tb-rmb-num']/text()").extract()
        else:
            # Tmall or Tmall supermarket
            title = response.xpath(
                "//div[@class='tb-detail-hd']/h1/text()").extract()
            # The price is only present in the page source.
            price = re.findall(
                r'"defaultItemPrice":"(.*?)"',
                response.body.decode('utf-8', 'ignore'))

        # The product id used to be parsed here although nothing used it,
        # and a URL without the expected "id=" form aborted the callback.
        item["title"] = title
        item["price"] = price

        yield item
Пример #18
0
 def parse_item(self,response):
     """Return one ``TaobaoItem`` per ``comments-item`` block on the page.

     The page URL is logged first.  Each item's ``title`` is the stripped
     concatenation of the text nodes under ``div/div[2]/div``.
     """
     self.log(response.url)

     def to_item(node):
         # Join the text fragments of one comment block.
         entry = TaobaoItem()
         fragments = node.xpath("div/div[2]/div/text()").extract()
         entry['title'] = ''.join(fragments).strip()
         return entry

     comment_nodes = response.xpath("//div[@class='comments-item']")
     return [to_item(node) for node in comment_nodes]
Пример #19
0
 def parse(self, response):
     """Return an item holding the product image URLs and titles of a page.

     ``image_urls`` is the list of ``//….jpg`` matches found in the product
     image tags, each prefixed with ``http:``.  ``title`` is the list of
     product title attributes.  A failure is logged and the item is
     returned with whatever was filled in so far.
     """
     import logging

     item = TaobaoItem()
     try:
         urls = response.xpath('//div[@class="productImg-wrap"]/a/img').re(
             r'//.*.jpg')
         # Assign the list in one step: appending to a field that was never
         # set raised KeyError, which was swallowed and also skipped the
         # title.
         item['image_urls'] = ['http:' + url for url in urls]
         item['title'] = response.xpath(
             '//p[@class="productTitle"]/a/@title').extract()
     except Exception:
         # Best effort is kept, but the failure is now visible in the log.
         logging.getLogger(__name__).exception(
             'failed to extract product images/titles')
     return item
Пример #20
0
 def parse_detail(self,response):
     """Request review pages 1 to 10 for the product on this page.

     The id is the text after the last ``=`` in the favourite link.
     ``shopId`` is the value of the fourth ``;``-separated entry of the
     ``microscope-data`` meta tag.  Every request carries the same item in
     its meta and is handled by ``self.parse_detail2``.
     """
     content = response.xpath('//meta[@name="microscope-data"]/@content')[0].extract()
     item = TaobaoItem()
     fav_href = response.xpath('//ul/li[@class="tb-social-fav"]/a/@href')[0].extract()
     auction_id = fav_href.split('=')[-1]
     print('店铺ID:%s'%auction_id)
     fourth_entry = content.split(';')[3]
     item['shopId'] = fourth_entry.split('=')[-1]

     url_template = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId=1986869048&currentPageNum={}&pageSize=20&rateType=&orderType=sort_weight'
     first_page, last_page = 1, 10
     page = first_page
     while page <= last_page:
         user_evaluation_url = url_template.format(auction_id, page)
         print('获取评论第%d页'%page)
         yield scrapy.Request(user_evaluation_url, callback=self.parse_detail2,meta={'items': item})
         print('完成获取评论第%d页' % page)
         page += 1
Пример #21
0
    def parse(self, response):
        """Request the detail page of every product card on a list page.

        Increments ``TmallSpider.count``, logs the URL when no cards are
        found, and for each card yields a request whose meta carries an
        item with price, name and URL.  The product name is printed once
        the generator is resumed.
        """
        TmallSpider.count += 1
        # The inner <div> of each "product" card is what the cards share.
        divs = response.xpath('//div[@id="J_ItemList"]/div[@class="product  "]/div')
        if not divs:
            self.log("List Page error __%s" %response.url)

        for div in divs:
            item = TaobaoItem()
            item["GOODS_PRICE"] = div.xpath('p[@class="productPrice"]/em/@title')[0].extract()
            item["GOODS_NAME"] = div.xpath('p[@class="productTitle"]/a/@title')[0].extract()
            goods_url = div.xpath('p[@class="productTitle"]/a/@href')[0].extract()
            # Prefix the scheme when the link does not contain "http:".
            if "http:" not in goods_url:
                goods_url = "http:" + goods_url
            item["GOODS_URL"] = goods_url
            yield scrapy.Request(url=item["GOODS_URL"],
                                 meta={"item": item},
                                 callback=self.parse_detail,
                                 dont_filter=True)
            print(item["GOODS_NAME"])
Пример #22
0
 def parse_id(self, response):
     """Request the item page of every ``"nid"`` value found in the body.

     Each request carries a ``TaobaoItem`` with its ``id`` set and is
     handled by ``self.parse_good``.
     """
     page_text = response.body.decode()
     found_ids = re.findall('"nid":"(.*?)"', page_text)
     for nid in found_ids:
         target = 'https://item.taobao.com/item.htm?id=' + str(nid)
         item = TaobaoItem()
         item['id'] = nid
         yield Request(target,
                       callback=self.parse_good,
                       meta={'item': item},
                       dont_filter=True)
 def next(self,response):
     """Yield an item with title, link, price and comment count of a page.

     The comment count is fetched with a blocking ``urllib`` request to
     the ``detailCount.do`` endpoint, keyed by the id taken from the end of
     the page URL.  ``title``, ``price`` and ``comment`` are stored as
     lists of matches.
     """
     item = TaobaoItem()
     item["title"] = response.xpath(
         "//h3[@class='tb-main-title']/@data-title").extract()
     item["link"] = response.url
     item["price"] = response.xpath(
         "//em[@class='tb-rmb-num']/text()").extract()

     # Everything after "id=" up to the end of the URL.
     product_id = re.findall('id=(.*?)$', response.url)[0]
     count_endpoint = ("https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId="
                       + str(product_id))
     reply = urllib.request.urlopen(count_endpoint).read()
     reply_text = reply.decode("utf-8", "ignore")
     item["comment"] = re.findall('"count":(.*?)}', reply_text)
     yield item
Пример #24
0
    def parse_news(self, response):
        """Yield the current price and send an e-mail alert when it is low.

        The price text is stored on the item as found (``None`` when the
        node is missing).  An alert is sent when the rounded price is
        below the rounded ``self.wantprice``.  A missing or non-numeric
        price sends no alert and no longer raises.
        """
        t = TaobaoItem()
        price = response.xpath(
            './/dd[@class="price-content big-price"]/span/text()'
        ).extract_first()
        t['price'] = price

        try:
            current = float(price)
        except (TypeError, ValueError):
            # extract_first() returned None or non-numeric text.
            current = None

        if current is not None and round(current) < round(float(self.wantprice)):
            emailSenderClient = emailSender()
            toSendEmailLst = ['*****@*****.**', '*****@*****.**']
            subject = "低价提醒"
            body = "细节:检测到有低于您设置的低价"
            emailSenderClient.sendEmail(toSendEmailLst, subject, body)  # send the e-mail
        yield t
Пример #25
0
 def parse_url(self,response):
     """Yield an item with the first ``spuId`` and its parameter text.

     The parameter text is the matched ``params`` content with the
     ``"name":`` / ``"value":`` labels and all double quotes removed.
     ``self.count`` is incremented and printed for each item.  Nothing is
     yielded when the page has no match.
     """
     item = TaobaoItem()
     pat = re.compile(r'"spuId":"(\d{7})".*?"params":\[(.*?)\],"tag"', re.S)
     cen = pat.findall(response.text)
     if not cen:
         return ''

     pspuid, parameter = cen[0]
     for noise in ('"name":', '"value":', '"'):
         parameter = parameter.replace(noise, '')
     item['pspuid'] = pspuid
     item['parameter'] = parameter  # the detailed parameters
     self.count += 1
     print(self.count)
     yield item
Пример #26
0
 def parse_id(self, response):
     """Yield the item described by a JSON API response, or retry.

     When the first ``ret`` entry starts with ``FAIL_SYS_USER_VALIDATE:``
     the same id is requested again and this response is not processed
     further.  Otherwise, if ``data`` contains ``item``, a ``TaobaoItem``
     with id, raw response text and title is yielded; if not, a "delisted"
     notice is printed.

     NOTE(review): the retry targets a URL that was already requested;
     confirm whether the duplicate filter lets it through.
     """
     item_id = response.meta['id'].strip()
     content = json.loads(response.text)
     # Tolerate a missing or empty "ret" list.
     ret = content.get('ret') or ['']
     if re.match(r'FAIL_SYS_USER_VALIDATE:', ret[0]):
         print('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
         yield scrapy.Request(self.id_url.format(id=item_id), self.parse_id, meta={'id': item_id})
         # A validation failure carries no product data; stop here rather
         # than falling through and reporting the product as delisted.
         return
     if 'item' in (content.get('data') or {}):
         item = TaobaoItem()
         item['id'] = item_id
         item['content'] = response.text
         item['title'] = content['data']['item']['title']
         yield item
     else:
         print('下架===============================================')
Пример #27
0
 def parse(self, response):
     """Yield price, title and shop text for every product node on the page.

     Each field is the stripped concatenation of all text nodes under the
     descendant ``div`` whose class contains the field's name.
     """
     print('收到请求!!!!')

     def joined_text(node, css_fragment):
         # All text below the divs whose class contains *css_fragment*.
         query = './/div[contains(@class, "%s")]//text()' % css_fragment
         return ''.join(node.xpath(query).extract()).strip()

     products = response.xpath(
         '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]'
     )
     for product in products:
         item = TaobaoItem()
         for field in ('price', 'title', 'shop'):
             item[field] = joined_text(product, field)
         yield item
Пример #28
0
    def parse(self, response):
        """Yield one ``TaobaoItem`` per auction in the page's ``g_page_config``.

        The configuration JSON is pulled out with the selector's built-in
        regex support and decoded.  Every entry of
        ``mods.itemlist.data.auctions`` is mapped onto the item's fields.
        """
        raw_config = response.selector.re('g_page_config = ({.*?});')[0]
        page_config = json.loads(raw_config)

        # (item field, auction key), copied in this order.
        field_map = (
            ('price', 'view_price'),
            ('deals', 'view_sales'),
            ('title', 'raw_title'),
            ('shop', 'nick'),
            ('location', 'item_loc'),
            ('detail_url', 'detail_url'),
        )
        for auction in page_config['mods']['itemlist']['data']['auctions']:
            item = TaobaoItem()
            for field, source_key in field_map:
                item[field] = auction[source_key]
            yield item
Пример #29
0
    def parse_next(self, response):
        """Return one ``TaobaoItem`` per product entry in the goods list.

        ``title`` and ``price`` hold the lists of matching text nodes and
        ``url`` is the URL of the list page itself.
        """
        def to_item(node):
            # Map one product <div> onto an item.
            entry = TaobaoItem()
            entry["title"] = node.xpath(
                "div[@class='p-name p-name-type-2']/a/em/text()").extract()
            entry["price"] = node.xpath(
                "div[@class='p-price']/strong/i/text()").extract()
            entry['url'] = response.url
            return entry

        product_nodes = response.xpath(
            "//ul[@class='gl-warp clearfix']/li[@class='gl-item']/div")
        return [to_item(node) for node in product_nodes]
Пример #30
0
    def next(self, response):
        item = TaobaoItem()

        item["title"] = response.xpath(
            '//h3[@class="tb-main-title"]/@data-title').extract()[0].encode(
                'utf-8')
        item["link"] = response.url
        item["price"] = response.xpath(
            '//em[@class="tb-rmb-num"]/text()').extract()[0]
        item['shop'] = response.xpath(
            '//*[@id="J_ShopInfo"]//dl/dd/strong/a/text()').extract(
            )[0].encode('utf-8').strip()
        shop_url = 'http:' + response.xpath(
            '//*[@id="J_ShopInfo"]//dl/dd/strong/a/@href').extract()[0]
        item['shopLink'] = shop_url
        item['describeScore'] = response.xpath(
            '//div[@class="tb-shop-rate"]/dl[1]/dd/a/text()').extract(
            )[0].strip()
        item['serviceScore'] = response.xpath(
            '//div[@class="tb-shop-rate"]/dl[2]/dd/a/text()').extract(
            )[0].strip()
        item['logisticsScore'] = response.xpath(
            '//div[@class="tb-shop-rate"]/dl[3]/dd/a/text()').extract(
            )[0].strip()

        thisid = re.findall('id=(.*?)$', response.url)[0]
        commenturl = "https://rate.tmall.com/list_detail_rate.htm?itemId={}&sellerId=880734502&currentPage=1".format(
            thisid)
        commentdata = urllib2.urlopen(commenturl).read().decode(
            "GBK", "ignore")
        #data = re.findall('"rateList":(.*?}]),',commentdata)[0]
        #try:
        #    t = json.loads(data)
        #    print t[0]['rateContent'].encode('utf-8')
        #except Exception, e:
        #    print "transfer error: %s" % e
        tempdata = re.findall('("commentTime":.*?),"days"', commentdata)
        if len(tempdata) == 0:
            tempdata = re.findall('("rateContent":.*?),"reply"', commentdata)
        item['commentdata'] = ""
        for data in tempdata:
            item['commentdata'] += data.encode('utf-8')

        print item['title']
        print item['link']