def parse_page(self, response):
    resp = response.text
    goods_json = re.search(r"g_page_config = (.+?\}\});", resp)
    if goods_json:
        goods_dict = json.loads(goods_json.group(1))
        goods_list = goods_dict["mods"]["itemlist"]["data"]["auctions"]
        for goods in goods_list:
            title = goods["title"]
            raw_title = goods["raw_title"]
            view_price = goods["view_price"]
            detail_url = goods["detail_url"]
            item_loc = goods["item_loc"]
            view_sales = goods["view_sales"]
            comment_count = goods["comment_count"]
            items = TaobaoItem(title=title, raw_title=raw_title,
                               view_price=view_price, detail_url=detail_url,
                               item_loc=item_loc, view_sales=view_sales,
                               comment_count=comment_count)
            # print(dict(items))
            yield items
    else:
        print("g_page_config not found")
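# The callbacks in this section all fill a TaobaoItem. A minimal sketch of the
# item class parse_page above assumes; the field names come straight from the
# keyword arguments used there, while the module location and anything not
# shown above are assumptions:
import scrapy

class TaobaoItem(scrapy.Item):
    title = scrapy.Field()
    raw_title = scrapy.Field()
    view_price = scrapy.Field()
    detail_url = scrapy.Field()
    item_loc = scrapy.Field()
    view_sales = scrapy.Field()
    comment_count = scrapy.Field()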
def parse(self, response): for sel in response.xpath('//div[@class="list-item"]'): try: # 有些页面不标准,没有mm姓名 name = sel.xpath( './/a[@class="lady-name"]/text()').extract()[0] print u'美眉姓名:', name self.mkdir('images/full/%s' % (name)) item = TaobaoItem() item['mm_name'] = name href = sel.xpath('.//a[@class="lady-avatar"]/@href').extract() url = response.urljoin(href[0]) yield scrapy.Request(url, meta={ 'driver': response.meta['driver'], 'PhantomJS': response.meta['PhantomJS'], #'cookiejar': response.meta['cookiejar'], 'item': item}, callback=self.parse_mm_page) print u'去美眉图片页抓图:scheduling', url except: # 跳过这个mm,继续下一个 continue # request next page self.pageindex += 1 next_page = self.start_urls[0] + str(self.pageindex) yield scrapy.Request(next_page, meta={ 'driver': response.meta['driver'], 'PhantomJS': response.meta['PhantomJS'], #'cookiejar': response.meta['cookiejar'] }, callback=self.parse)
def parse(self, response): # print(response.body) print("1111111111111----------") time.sleep(5) i = response.meta.get("i") # url_i = response.meta.get("url") i +=1 # print("2222222222222----------") if i > 100: return # try: # print("start:----------------------------") node_list = response.xpath("//div[@class='item J_MouserOnverReq ']/div[@class='ctx-box J_MouseEneterLeave J_IconMoreNew']") print(node_list) for node in node_list: item = TaobaoItem() # print("--------------------------------") item['name'] = node.xpath("./div[@class='row row-2 title']/a[@class='J_ClickStat']/text()[2]").extract()[0].encode("utf-8") item['price'] = node.xpath("./div[@class='row row-1 g-clearfix']/div[@class='price g_price g_price-highlight']/strong/text()").extract()[0].encode("utf-8") item['payment_num'] = node.xpath("./div[@class='row row-1 g-clearfix']/div[@class='deal-cnt']/text()").extract()[0].encode("utf-8") item['shop_name'] = node.xpath("./div[@class='row row-3 g-clearfix']/div[@class='shop']/a/span[2]/text()").extract()[0].encode("utf-8") item['shop_address'] = node.xpath("./div[@class='row row-3 g-clearfix']/div[@class='location']/text()").extract()[0].encode("utf-8") yield item #点击下一页 button = self.browser.find_elements(By.XPATH,'//a[@class="J_Ajax num icon-tag"]')[-1] button.click() time.sleep(random.random()*2) self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)") html = self.browser.page_source yield scrapy.Request(url=response.url,callback=self.parse,meta={'html':html},dont_filter=True)
def parse(self, response):
    html = response.text
    content = re.findall(r'g_page_config = (.*?) g_srp_loadCss', html, re.S)[0].strip()[:-1]
    # Parse the embedded JSON
    content = json.loads(content)
    # List of goods entries
    data_list = content['mods']['itemlist']['data']['auctions']
    # Extract the fields
    for data in data_list:
        try:
            # Build a fresh item per entry so yielded items don't share state
            item = TaobaoItem()
            item['title'] = data['raw_title']
            item['price'] = float(data['view_price'])
            pattern = re.compile(r'\d+')
            item['sales'] = int(pattern.findall(data['view_sales'])[0])
            item['is_tmall'] = '是' if data['shopcard']['isTmall'] else '否'  # 是/否 = yes/no
            item['shops_loc'] = data['item_loc']
            item['shops_name'] = data['nick']
            item['shops_id'] = data['user_id']
            # detail_url is protocol-relative ("//item.taobao.com/..."),
            # so prepend "https:" rather than the bare "http"
            item['goods_url'] = 'https:' + data['detail_url']
            item['comment_count'] = int(data['comment_count'])
            item['goods_id'] = data['nid']
            yield item
        except Exception:
            pass
def parse(self, response):
    p = 'g_page_config = ({.*?});'
    g_page_config = response.selector.re(p)[0]
    g_page_config = json.loads(g_page_config)
    auctions = g_page_config['mods']['itemlist']['data']['auctions']
    # totalPage = response.selector.re(r'"totalPage":(.*?),')[0]
    y = 0  # items seen on this page; a full page carries 44 results
    for auction in auctions:
        y = y + 1
        item = TaobaoItem()
        item['title'] = auction['raw_title']
        item['price'] = auction['view_price']
        item['nick'] = auction['nick']
        item['sales'] = auction['view_sales']
        item['loc'] = auction['item_loc']
        item['detail_url'] = auction['detail_url']
        if item['detail_url'].startswith('//'):
            item['detail_url'] = 'https:' + item['detail_url']
        yield item
    num = self.a * 44 + y
    print('Taobao [%s]: crawled %d page(s), %d item(s) in total'
          % (self.keyword, self.a + 1, num))
    if y < 44:
        # A short page means this was the last one.
        self.crawler.engine.close_spider(self, 'all pages crawled')
    else:
        self.a = self.a + 1
        yield scrapy.Request(self.url % (self.keyword, self.a * 44),
                             callback=self.parse)
def next(self, response): item = TaobaoItem() item["title"] = response.xpath( '//h3[@class="tb-main-title"]/@data-title').extract()[0].encode( 'utf-8') item["link"] = response.url item["price"] = response.xpath( '//em[@class="tb-rmb-num"]/text()').extract()[0] item['shop'] = response.xpath( '//*[@id="J_ShopInfo"]//dl/dd/strong/a/text()').extract( )[0].encode('utf-8').strip() shop_url = 'http:' + response.xpath( '//*[@id="J_ShopInfo"]//dl/dd/strong/a/@href').extract()[0] item['shopLink'] = shop_url try: item['describeScore'] = response.xpath( '//div[@class="tb-shop-rate"]/dl[1]/dd/a/text()').extract( )[0].strip() item['serviceScore'] = response.xpath( '//div[@class="tb-shop-rate"]/dl[2]/dd/a/text()').extract( )[0].strip() item['logisticsScore'] = response.xpath( '//div[@class="tb-shop-rate"]/dl[3]/dd/a/text()').extract( )[0].strip() except Exception, e: item['describeScore'] = "" item['serviceScore'] = "" item['logisticsScore'] = ""
def parse2(self, response):
    item = TaobaoItem()
    page = Selector(response)
    item['title'] = page.xpath('//head/title/text()').extract()[0][:-4]
    item['goods_url'] = response.meta['goods_url']
    item['goods_class'] = response.meta['goods_class']
    item['price'] = page.xpath('//strong[@id="J_StrPrice"]/em[@class="tb-rmb-num"]/text()').extract()[0]
    item['sell_count'] = response.meta['sell_count'][:-3]
    item['area'] = response.meta['area']
    # Taobao and Tmall mark the seller differently; try both layouts.
    seller = page.xpath('//div[@class="tb-shop-name"]/dl/dd/strong/a/@title').extract()
    if len(seller) != 1:
        seller = page.xpath('//span[@class="shop-name-title"]/@title').extract()
    item['seller'] = seller[0] if len(seller) == 1 else '未知'  # 未知 = unknown
    yield item
def parse(self, response):
    GoodsSpider.count += 1
    # Goods-list xpath
    divs = response.xpath("//*[@id='listsrp-itemlist']/div/div/div[1]/div")
    if not divs:
        # Nothing matched: record the url for later inspection.
        self.log("list page error--%s" % response.url)
    for div in divs[1:59]:
        item = TaobaoItem()
        # Goods price (take the text node, not the raw <strong> element)
        item["price"] = div.xpath("div[3]/div[1]/div[1]/strong/text()")[0].extract()
        # Goods link; prepend the scheme when it is missing
        pre_goods_url = div.xpath("div[3]/div[2]/a/@href")[0].extract()
        item["goodsUrl"] = pre_goods_url if "https:" in pre_goods_url else ("https:" + pre_goods_url)
        # Pass the method itself as the callback, don't call it here
        yield scrapy.Request(url=item["goodsUrl"], meta={'item': item},
                             callback=self.parse_detail, dont_filter=True)
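# parse_detail itself is not shown in this snippet; a minimal sketch of the
# counterpart callback, assuming it only recovers the partly-filled item from
# meta and completes it (the title xpath is borrowed from other snippets in
# this section, not from this spider):
def parse_detail(self, response):
    item = response.meta['item']
    item['title'] = response.xpath('//h3[@class="tb-main-title"]/@data-title').get()
    yield item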
def next(self, response):
    item = TaobaoItem()
    item['title'] = response.meta['name']
    item['price'] = response.meta['price']
    item['address'] = response.meta['address']
    item['link'] = response.url
    yield item  # the original snippet built the item but never emitted it
def parse(self, response):
    URL = response.url
    html = response.text
    title = re.findall('"raw_title":"(.*?)"', html, re.S)
    pic_url = re.findall('"pic_url":"(.*?)"', html, re.S)
    view_price = re.findall('"view_price":"(.*?)"', html, re.S)
    view_fee = re.findall('"view_fee":"(.*?)"', html, re.S)
    item_loc = re.findall('"item_loc":"(.*?)"', html, re.S)
    view_sales = re.findall('"view_sales":"(.*?)"', html, re.S)
    nid = re.findall('"nid":"(.*?)"', html, re.S)
    nick = re.findall('"nick":"(.*?)"', html, re.S)
    nick.pop(-1)  # the page carries one extra "nick" entry; drop it
    for i, x in enumerate(view_price):
        item = TaobaoItem()
        item['title'] = title[i]
        item['pic_url'] = pic_url[i]
        item['view_price'] = view_price[i]
        item['view_fee'] = view_fee[i]
        item['item_loc'] = item_loc[i]
        item['view_sales'] = view_sales[i]
        item['nid'] = 'https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.13c42140RGlAs3&id=' + str(nid[i])
        item['nick'] = nick[i]
        yield item
def allnvzhuang(self, response):
    item = TaobaoItem()
    x = response.xpath('//div[@class="count count5"]//em')
    item['city'] = x.xpath('text()').extract()
    item['url'] = response.url
    yield item
def get_product(self, response):
    URL = response.url
    try:
        html = response.text
        title = re.findall('"raw_title":"(.*?)"', html, re.S)
        pic_url = re.findall('"pic_url":"(.*?)"', html, re.S)
        view_price = re.findall('"view_price":"(.*?)"', html, re.S)
        view_fee = re.findall('"view_fee":"(.*?)"', html, re.S)
        item_loc = re.findall('"item_loc":"(.*?)"', html, re.S)
        view_sales = re.findall('"view_sales":"(.*?)"', html, re.S)
        nid = re.findall('"nid":"(.*?)"', html, re.S)
        nick = re.findall('"nick":"(.*?)"', html, re.S)
        nick.pop(-1)  # drop the page's extra trailing "nick" entry
        for i, x in enumerate(nick):
            item = TaobaoItem()
            item['title'] = title[i]
            item['pic_url'] = pic_url[i]
            item['view_price'] = view_price[i]
            item['view_fee'] = view_fee[i]
            item['item_loc'] = item_loc[i]
            item['view_sales'] = view_sales[i]
            item['nid'] = 'https://item.taobao.com/item.htm?spm=a219r.lm874.14.1.13c42140RGlAs3&id=' + str(nid[i])
            item['nick'] = nick[i]
            yield item
    except Exception:
        # A list mismatch usually means the page layout changed; log the url.
        self.logger.debug(URL)
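# get_product (and the near-identical parse two snippets up) walks parallel
# regex result lists with a hand-kept index; zipping the columns expresses the
# same pairing and simply stops at the shortest list instead of risking an
# IndexError. A sketch under the same assumptions (same page source in `html`,
# same field names):
import re

def iter_products(html):
    fields = ['raw_title', 'pic_url', 'view_price', 'view_fee',
              'item_loc', 'view_sales', 'nid', 'nick']
    columns = [re.findall('"%s":"(.*?)"' % f, html, re.S) for f in fields]
    for row in zip(*columns):
        yield dict(zip(fields, row))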
def next(self, response):
    item = TaobaoItem()
    item['key'] = response.meta['key']
    # Title
    item["title"] = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()[0]
    # Attribute list, with non-breaking spaces stripped
    intro = response.xpath('//ul[@class="attributes-list"]//li/text()').extract()
    item['intro'] = [s.replace('\xa0', '') for s in intro]
    # Url of the current page
    item["link"] = response.url
    # Pull the goods id out of the url with a regex
    patid = 'id=(.*?)$'
    thisid = re.compile(patid).findall(response.url)[0]
    # Click-counter endpoint
    clickurl = 'https://count.taobao.com/counter3?callback=jsonp86&keys=ICCP_1_' + str(thisid)
    clickdata = urllib.request.urlopen(clickurl).read().decode("utf-8", "ignore")
    click = re.findall(r':(\d+)', clickdata)
    item['click'] = click[0]
    # Comment-count endpoint, captured from the page's own requests
    commenturl = "https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId=" + str(thisid)
    commentdata = urllib.request.urlopen(commenturl).read().decode("utf-8", "ignore")
    pat = '"count":(.*?)}'
    item["comment"] = re.compile(pat).findall(commentdata)[0]
    # Sold-quantity endpoint; it checks the Referer header
    headers = {'Referer': response.url}
    seldurl = ('https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId='
               + str(thisid) + '&modules=soldQuantity,xmpPromotion&callback=onSibRequestSuccess')
    request = urllib.request.Request(seldurl, headers=headers)
    seld_dict = {}
    runtime = 1
    while True:
        if runtime > 5:
            break
        try:
            selddata = urllib.request.urlopen(request).read().decode("utf-8", "ignore")
            # Strip the jsonp wrapper and parse; json.loads handles true/false
            # natively and is safer than eval() on remote data
            selddata = (selddata.replace('\r', '').replace('\n', '')
                        .replace('onSibRequestSuccess(', '').replace(');', ''))
            seld_dict = json.loads(selddata)
            item['seld'] = seld_dict['data']['soldQuantity']['confirmGoodsCount']
            break
        except Exception:
            time.sleep(1)
            runtime += 1
    try:
        item["price"] = seld_dict['data']['promotion']['promoData']['def'][0]['price']
    except Exception:
        # Fall back to the listed price on the page
        item["price"] = response.xpath("//em[@class='tb-rmb-num']/text()").extract()[0]
    item['extract_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    description = response.xpath('//p[@class="tb-subtitle"]/text()').extract()
    if not description:
        description = response.xpath('//p[@class="newp"]/text()').extract()
    if description:
        item['description'] = description[0]
    yield item
def parse(self, response):
    item = TaobaoItem()
    # title_node = response.xpath("//div/text()").extract()
    # print('title info:', title_node)
    # Dump the raw page for offline inspection
    file_name = 'data.html'
    with open(file_name, 'wb') as f:
        f.write(response.body)
def parse_page(self, response):
    item = TaobaoItem()
    # extract
    item['sn'] = response.url.split("?id=")[1]
    item['images'] = self.extract_images(response)
    item['choices'], item['sizes'], item['colors'] = self.extract_choice(response)
    item['properties'] = self.extract_properties(response)
    # print(item)
    return item
def parse_self(self, response):
    mtitle = response.xpath('//h1/text()').get()
    mcontent = response.xpath("//div[@class='fed-arti-content fed-padding']/p/text()").getall()
    mcontent = ''.join(mcontent)
    item = TaobaoItem()
    item['mtitle'] = mtitle
    item['mcontent'] = mcontent
    yield item
def next(self, response):
    item = TaobaoItem()
    url = response.url
    # Goods link
    item["link"] = url
    # Decide by subdomain whether this is a Tmall or a Taobao listing
    pattam_url = 'https://(.*?).com'
    subdomain = re.compile(pattam_url).findall(url)
    if subdomain[0] != 'item.taobao':
        # Tmall / Tmall supermarket
        title = response.xpath("//div[@class='tb-detail-hd']/h1/text()").extract()
        # The price only appears in the page source, so regex it out
        pattam_price = '"defaultItemPrice":"(.*?)"'
        price = re.compile(pattam_price).findall(response.body.decode('utf-8', 'ignore'))
        pattam_id = 'id=(.*?)&'
        this_id = re.compile(pattam_id).findall(url)[0]
    else:
        # Taobao
        title = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()
        price = response.xpath("//em[@class='tb-rmb-num']/text()").extract()
        pattam_id = 'id=(.*?)$'
        this_id = re.compile(pattam_id).findall(url)[0]
    item["title"] = title
    item["price"] = price
    # The comment count could not be read via xpath; the code below fetched it
    # from the list_dsr_info endpoint instead:
    # comment_url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=' + str(this_id)
    # comment_data = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
    # item["comment"] = re.compile('"rateTotal":(.*?),"').findall(comment_data)
    yield item
def parse_item(self, response):
    self.log(response.url)
    # Parse the review entries
    select = response.xpath("//div[@class='comments-item']")
    items = []
    for i in select:
        item = TaobaoItem()
        item['title'] = (''.join(i.xpath("div/div[2]/div/text()").extract())).strip()
        items.append(item)
    return items
def parse(self, response):
    item = TaobaoItem()
    try:
        # Escape the dot so ".jpg" matches literally
        urls = response.xpath('//div[@class="productImg-wrap"]/a/img').re(r'//.*?\.jpg')
        # Assign the whole list; appending to an unset item field raises KeyError
        item['image_urls'] = ['http:' + i for i in urls]
        item['title'] = response.xpath('//p[@class="productTitle"]/a/@title').extract()
    except Exception:
        pass
    return item
def parse_detail(self, response):
    content = response.xpath('//meta[@name="microscope-data"]/@content')[0].extract()
    item = TaobaoItem()
    Id = response.xpath('//ul/li[@class="tb-social-fav"]/a/@href')[0].extract()
    Id = Id.split('=')[-1]
    print('shop id: %s' % Id)
    item['shopId'] = content.split(';')[3].split('=')[-1]
    start_page = 1
    end_page = 10
    for page in range(start_page, end_page + 1):
        # Note "&currentPageNum" (the original source had it mangled to "¤t")
        user_evaluation_url = ('https://rate.taobao.com/feedRateList.htm'
                               '?auctionNumId={}&userNumId=1986869048'
                               '&currentPageNum={}&pageSize=20'
                               '&rateType=&orderType=sort_weight').format(Id, page)
        print('fetching review page %d' % page)
        yield scrapy.Request(user_evaluation_url, callback=self.parse_detail2,
                             meta={'items': item})
        print('scheduled review page %d' % page)
def parse(self, response):
    TmallSpider.count += 1
    # <div class="product-iWrap"> is the element the products share
    divs = response.xpath('//div[@id="J_ItemList"]/div[@class="product "]/div')
    if not divs:
        self.log("List Page error __%s" % response.url)
    for div in divs:
        item = TaobaoItem()
        item["GOODS_PRICE"] = div.xpath('p[@class="productPrice"]/em/@title')[0].extract()
        item["GOODS_NAME"] = div.xpath('p[@class="productTitle"]/a/@title')[0].extract()
        goods_url = div.xpath('p[@class="productTitle"]/a/@href')[0].extract()
        item["GOODS_URL"] = goods_url if "http:" in goods_url else ("http:" + goods_url)
        yield scrapy.Request(url=item["GOODS_URL"], meta={"item": item},
                             callback=self.parse_detail, dont_filter=True)
        print(item["GOODS_NAME"])
def parse_id(self, response):
    body = response.body.decode()
    pat = '"nid":"(.*?)"'
    allid = re.compile(pattern=pat).findall(body)
    for id in allid:
        url = 'https://item.taobao.com/item.htm?id=' + str(id)
        item = TaobaoItem()
        item['id'] = id
        yield Request(url, callback=self.parse_good,
                      meta={'item': item}, dont_filter=True)
def next(self,response): item=TaobaoItem() item["title"]=response.xpath("//h3[@class='tb-main-title']/@data-title").extract() item["link"]=response.url item["price"]=response.xpath("//em[@class='tb-rmb-num']/text()").extract() patid='id=(.*?)$' thisid=re.compile(patid).findall(response.url)[0] commenturl="https://rate.taobao.com/detailCount.do?callback=jsonp100&itemId="+str(thisid) #print(commenturl) commentdata=urllib.request.urlopen(commenturl).read().decode("utf-8","ignore") #print(commentdata) pat='"count":(.*?)}' item["comment"]=re.compile(pat).findall(commentdata) #print(item["comment"]) yield item
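# The jsonp responses above (jsonp100({...})) are picked apart field by field
# with regexes; stripping the callback wrapper once and parsing the remainder
# as JSON is a sturdier alternative. A sketch, assuming the wrapper name is
# always jsonp100 as in the url built above:
import json
import re
import urllib.request

def fetch_comment_count(item_id):
    url = ("https://rate.taobao.com/detailCount.do"
           "?callback=jsonp100&itemId=%s" % item_id)
    raw = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    payload = re.sub(r'^\s*jsonp100\(|\)\s*$', '', raw)  # drop the wrapper
    return json.loads(payload)["count"]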
def parse_news(self, response):
    t = TaobaoItem()
    price = response.xpath('.//dd[@class="price-content big-price"]/span/text()').extract_first()
    t['price'] = price
    if round(float(price)) < round(float(self.wantprice)):
        emailSenderClient = emailSender()
        toSendEmailLst = ['*****@*****.**', '*****@*****.**']
        subject = "low-price alert"
        body = "detail: a price below your configured threshold was detected"
        # Send the alert email
        emailSenderClient.sendEmail(toSendEmailLst, subject, body)
    yield t
def parse_url(self, response):
    try:
        item = TaobaoItem()
        pat = re.compile(r'"spuId":"(\d{7})".*?"params":\[(.*?)\],"tag"', re.S)
        cen = re.findall(pat, response.text)
        pspuid = cen[0][0]
        parameter = cen[0][1].replace('"name":', '').replace('"value":', '').replace('"', '')
        item['pspuid'] = pspuid
        item['parameter'] = parameter  # the concrete spec parameters
        self.count += 1
        print(self.count)
        yield item
    except IndexError:
        return
def parse_id(self, response):
    id = response.meta['id'].strip()
    content = json.loads(response.text)
    ret = content.get('ret')
    if re.match(r'FAIL_SYS_USER_VALIDATE:', ret[0]):
        # Hit the anti-bot check: retry the same id
        yield scrapy.Request(self.id_url.format(id=id), self.parse_id, meta={'id': id})
    if 'item' in content['data'].keys():
        item = TaobaoItem()
        item['id'] = id
        item['content'] = response.text
        item['title'] = content['data']['item']['title']
        yield item
    else:
        print('item delisted')
def parse(self, response):
    print('request received')
    products = response.xpath(
        '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')
    for product in products:
        item = TaobaoItem()
        item['price'] = ''.join(
            product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
        item['title'] = ''.join(
            product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
        item['shop'] = ''.join(
            product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
        yield item
def parse(self, response):
    # Use scrapy's built-in regex support to pull out the embedded config
    p = 'g_page_config = ({.*?});'
    g_page_config = response.selector.re(p)[0]
    g_page_config = json.loads(g_page_config)
    auctions = g_page_config['mods']['itemlist']['data']['auctions']
    for auction in auctions:
        item = TaobaoItem()  # instantiate one item per auction
        item['price'] = auction['view_price']
        item['deals'] = auction['view_sales']
        item['title'] = auction['raw_title']
        item['shop'] = auction['nick']
        item['location'] = auction['item_loc']
        item['detail_url'] = auction['detail_url']
        yield item  # hand the item to the engine
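# Several snippets here (parse_page at the top, and this parse) extract the
# same embedded g_page_config blob; outside Scrapy the extraction is plain
# re + json. A sketch, assuming `html` holds the search-page source; note that
# a lazy {.*?} pattern can stop early on nested braces, which is why parse_page
# above anchors on the closing "}});" instead:
import json
import re

def load_auctions(html):
    m = re.search(r'g_page_config = (.+?\}\});', html)
    if m is None:
        return []
    config = json.loads(m.group(1))
    return config['mods']['itemlist']['data']['auctions']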
def parse_next(self, response):
    items = []
    select = response.xpath("//ul[@class='gl-warp clearfix']/li[@class='gl-item']/div")
    for i in select:
        item = TaobaoItem()
        item["title"] = i.xpath("div[@class='p-name p-name-type-2']/a/em/text()").extract()
        item["price"] = i.xpath("div[@class='p-price']/strong/i/text()").extract()
        item['url'] = response.url
        items.append(item)
    return items
def next(self, response): item = TaobaoItem() item["title"] = response.xpath( '//h3[@class="tb-main-title"]/@data-title').extract()[0].encode( 'utf-8') item["link"] = response.url item["price"] = response.xpath( '//em[@class="tb-rmb-num"]/text()').extract()[0] item['shop'] = response.xpath( '//*[@id="J_ShopInfo"]//dl/dd/strong/a/text()').extract( )[0].encode('utf-8').strip() shop_url = 'http:' + response.xpath( '//*[@id="J_ShopInfo"]//dl/dd/strong/a/@href').extract()[0] item['shopLink'] = shop_url item['describeScore'] = response.xpath( '//div[@class="tb-shop-rate"]/dl[1]/dd/a/text()').extract( )[0].strip() item['serviceScore'] = response.xpath( '//div[@class="tb-shop-rate"]/dl[2]/dd/a/text()').extract( )[0].strip() item['logisticsScore'] = response.xpath( '//div[@class="tb-shop-rate"]/dl[3]/dd/a/text()').extract( )[0].strip() thisid = re.findall('id=(.*?)$', response.url)[0] commenturl = "https://rate.tmall.com/list_detail_rate.htm?itemId={}&sellerId=880734502¤tPage=1".format( thisid) commentdata = urllib2.urlopen(commenturl).read().decode( "GBK", "ignore") #data = re.findall('"rateList":(.*?}]),',commentdata)[0] #try: # t = json.loads(data) # print t[0]['rateContent'].encode('utf-8') #except Exception, e: # print "transfer error: %s" % e tempdata = re.findall('("commentTime":.*?),"days"', commentdata) if len(tempdata) == 0: tempdata = re.findall('("rateContent":.*?),"reply"', commentdata) item['commentdata'] = "" for data in tempdata: item['commentdata'] += data.encode('utf-8') print item['title'] print item['link']