def parse2(self, response):
    """Parse a Douban comment listing page and follow its pagination.

    Yields one DoubanspiderItem per comment, then requests the last
    paginator link (same callback) while one exists.

    BUG FIX: the original created a single item before the loop and
    mutated/yielded it repeatedly — every queued item ends up pointing
    at the last comment scraped. A fresh item is now built per comment.
    """
    meta = response.meta
    for comment in response.xpath('//div[@class="comment"]'):
        text = comment.xpath('./p/text()').extract()
        if not text:
            # Some comment nodes carry no <p> text; skip instead of
            # raising IndexError on extract()[0].
            continue
        item = DoubanspiderItem()
        item["name"] = meta["keyword"]
        item["content"] = text[0].strip()
        yield item
    next_page = response.xpath(
        '//div[@id="paginator"]/a[last()]/@href').extract()
    if next_page:
        yield scrapy.Request(
            meta['prefix'] + next_page[0],
            callback=self.parse2,
            meta={"keyword": meta['keyword'], 'prefix': meta['prefix']})
def parse(self, response):
    """Parse a Douban group search-result table.

    Yields one item per table row whose title contains any configured
    search keyword, then follows the "next" link until self.max_page.

    Fixes: the loop variable `str` shadowed the builtin, and the local
    `time` shadowed the stdlib module name — both renamed.
    """
    # Guard against a misconfigured spider (missing or empty settings).
    # `not self.search_str_list` already covers the None case.
    if self.start_urls is None or self.max_page is None or not self.search_str_list:
        print("参数错误!!!")
        return None
    for row in response.xpath('//*/table[@class="olt"]/tr'):
        url = row.xpath('td[1]/a/@href').extract_first()
        title = row.xpath('td[1]/a/@title').extract_first()
        row_time = row.xpath('td[4][@class="time"]/text()').extract_first()
        # Skip incomplete rows and titles matching none of the keywords.
        if url is None or title is None or row_time is None or not any(
                keyword in title for keyword in self.search_str_list):
            continue
        item = DoubanspiderItem()
        item['url'] = url
        item['title'] = title
        item['time'] = row_time
        self.count += 1
        yield item
    nexturl = response.xpath(
        '//*/span[@class="next"]/a/@href').extract_first()
    cur_page = response.xpath(
        '//*/span[@class="thispage"]/text()').extract_first()
    if nexturl and cur_page and int(cur_page) < int(self.max_page):
        yield scrapy.Request(url=nexturl, callback=self.parse, meta={})
    else:
        return None
def parse(self, response):
    """Parse one page of the Douban Top-250 list and paginate.

    BUG FIX: the parameter was misspelled `resonpse` while the body used
    `response`, so every call raised NameError. Also builds a fresh item
    per movie instead of mutating one shared item across yields.
    """
    for each in response.xpath('//div[@class="info"]'):
        title = each.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()').extract()
        content = each.xpath('div[@class="bd"]/p/text()').extract()
        score = each.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        info = each.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item = DoubanspiderItem()
        item['title'] = title[0]
        # Join all description fragments into one ';'-separated string.
        item['content'] = ';'.join(content)
        item['score'] = score[0]
        item['info'] = info[0]
        yield item
    # Top-250 pages step by 25 up to start=250.
    if self.start <= 225:
        self.start += 25
        yield scrapy.Request(self.url + str(self.start) + self.end,
                             callback=self.parse)
def parse(self, response):
    """Parse a Top-250 page and follow the "next" link.

    BUG FIX: the per-movie XPaths started with '//', which searches the
    whole document rather than the current movie node — every yielded
    item carried the concatenated data of ALL movies on the page. The
    paths are now relative ('.//'). A fresh item is also created per
    movie instead of one shared mutable item.
    """
    selector = Selector(response)
    for eachmovie in selector.xpath('//div[@class="info"]'):
        title_parts = eachmovie.xpath(
            './/div[@class="hd"]/a/span/text()').extract()
        fulltitle = ''.join(title_parts)
        moviesinfo = eachmovie.xpath(
            './/div[@class="bd"]/p/text()').extract()
        quote = eachmovie.xpath(
            './/div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item = DoubanspiderItem()
        item['title'] = fulltitle
        item['movieInfo'] = ';'.join(moviesinfo)
        # Not every movie has a quote; default to empty string.
        item['quote'] = quote[0] if quote else ''
        yield item
    nextlinks = selector.xpath(
        '//span[@class="next"]/link/@href').extract()
    if nextlinks:
        print(nextlinks[0])
        yield Request(self.url + nextlinks[0], callback=self.parse)
def parse(self, response):
    """Yield title and poster link for each movie entry, then advance
    the offset once (the < 25 guard permits a single follow-up page)."""
    item_xpath = ('//div[@id="content"]//div[@class="article"]'
                  '//ol//div[@class="item"]')
    for movie in response.xpath(item_xpath):
        title = movie.xpath(
            './/div[@class="info"]//div[@class="hd"]//span[1]/text()'
        ).extract()[0]
        poster = movie.xpath('.//div[@class="pic"]//img/@src').extract()[0]
        item = DoubanspiderItem()
        item['title'] = title
        item['posterLink'] = poster
        yield item
    if self.offset < 25:
        self.offset += 25
        yield scrapy.Request(self.urls + str(self.offset),
                             callback=self.parse)
def parse(self, response):
    """Map each movie record of the JSON response to a DoubanspiderItem."""
    payload = json.loads(response.text)
    for subject in payload["subjects"]:
        item = DoubanspiderItem()
        item['id'] = subject['id']
        item['title'] = subject['title']
        item['url'] = subject['cover']
        item['rate'] = subject['rate']
        yield item
def parse(self, response):
    # Logs into Douban with the `requests` library (bypassing Scrapy's own
    # downloader), solving the captcha interactively if one is shown, then
    # fetches the "mine" page and hands it to self.parse2.
    # NOTE(review): credentials are hard-coded below; the absolute captcha
    # file path is machine-specific; `raw_input` and the `print x`
    # statements are Python-2-only. Blocking requests inside a Scrapy
    # callback stalls the reactor — confirm this is intentional.
    # print response.body
    item = DoubanspiderItem()
    url = "https://www.douban.com/accounts/login"
    data = {
        'redir': 'https://www.douban.com',
        'form_email': '',
        'form_password': ''
    }
    header = {
        'Host': 'accounts.douban.com',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.douban.com/accounts/login',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Content-Length': '126'
    }
    # Hard-coded account credentials (see review note above).
    data['form_email'] = "15711057804"
    data['form_password'] = "******"
    s = requests.Session()
    text = s.get(url).text
    if '请输入上图中的单词' in text:  # a captcha is present on the login page
        page = etree.HTML(text)
        # URL of the captcha image
        img = page.xpath('//img[@id="captcha_image"]/@src')
        # hidden captcha token value required by the login form
        id = page.xpath(
            '//div[@class="captcha_block"]/input[@type="hidden"]/@value'
        )
        # Download the captcha image to disk so the operator can read it.
        pic = requests.get(img[0])
        with open(
                '/home/hadoop/文档/HiData/青软实训/python/PythonProject/Spider/doubanSpider/checkMa.jpg',
                'wb') as f:
            for chunk in pic.iter_content(1024):
                if chunk:
                    f.write(chunk)
        # Ask the operator to type the captcha text (Python 2 raw_input).
        captcha = raw_input('请输入验证码:')
        print captcha
        data['captcha-solution'] = captcha
        data['captcha-id'] = id[0]
    # Submit the login form (with or without captcha fields).
    p = s.post(url, headers=header, data=data)
    if '的帐号' in p.text:  # response contains "…'s account" → logged in
        print('登录成功')
        print "s.cookies: ", s.cookies
        next_url = "https://www.douban.com/mine/"
        cont = s.get(next_url).text
        print "cont: ", cont
        # NOTE(review): parse2 is called directly with raw HTML + session —
        # signature differs from the Scrapy-callback style parse2 elsewhere
        # in this project; confirm which parse2 this spider defines.
        item = self.parse2(cont, s, item)
        return item
        # yield Request(url=next_url, meta={'item': item}, cookies=cookie, headers=user_agent, callback=self.parse2)
    else:
        print('登录失败')
def parse_Douban(self, response):
    """Parse one Douban Top-250 page into items.

    BUG FIX: the original indexed four independent page-wide `extract()`
    lists with one shared counter and a hard-coded range(25). Movies
    without a quote shift every later index (fields get attached to the
    wrong movie) and short pages raise IndexError. Iterate each movie
    node with relative XPaths instead.
    """
    for movie in response.xpath("//div[@class='item']"):
        item = DoubanspiderItem()
        item["title"] = movie.xpath(
            ".//a/span[1]/text()").extract_first()
        item["bd"] = movie.xpath(
            ".//div[@class='info']/div[@class='bd']/p[1]/text()"
        ).extract_first()
        item["star"] = movie.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        item["quote"] = movie.xpath(".//p[@class='quote']").extract_first()
        yield item
def parse(self, response):
    """Parse a listing page into items, then request the next page.

    BUG FIX: like parse_Douban, the original zipped four page-wide
    `extract()` lists by a shared hard-coded index — misaligned fields
    when a movie lacks a quote and IndexError on pages with < 25 entries.
    Iterate each movie node with relative XPaths instead.
    """
    for movie in response.xpath("//div[@class='item']"):
        item = DoubanspiderItem()
        item["title"] = movie.xpath(
            ".//a/span[1]/text()").extract_first()
        item["bd"] = movie.xpath(
            ".//div[@class='info']/div[@class='bd']/p[1]/text()"
        ).extract_first()
        item["star"] = movie.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        item["quote"] = movie.xpath(".//p[@class='quote']").extract_first()
        yield item
    # NOTE(review): there is no stop condition, so this paginates forever;
    # Douban Top-250 steps by 25 per page, but the step here is 10 —
    # confirm what base_url's query parameter expects.
    self.page += 10
    url = self.base_url + str(self.page)
    yield scrapy.Request(url, callback=self.parse)
def parse3(self, response):
    """Parse short-review snippets and follow the rel="next" link.

    BUG FIX: a single item was created before the loop and mutated across
    yields — every queued item referenced the last snippet. A fresh item
    is now created per snippet, and empty text nodes are skipped instead
    of crashing on extract()[0].
    """
    meta = response.meta
    for content in response.xpath('//div[@class="short-content"]'):
        text = content.xpath('./text()').extract()
        if not text:
            continue
        item = DoubanspiderItem()
        item['name'] = meta['keyword']
        item['content'] = text[0].strip()
        yield item
    next_page = response.xpath('//link[@rel="next"]/@href').extract()
    if next_page:
        yield scrapy.Request(url=meta['prefix'] + next_page[0],
                             callback=self.parse3,
                             meta=meta)
def parse(self, response):
    """Parse a Top-250 page into items and advance the pagination offset.

    BUG FIX: one DoubanspiderItem was shared (mutated + re-yielded) across
    all movies; each movie now gets its own item.
    """
    for movie in response.xpath('//div[@class="info"]'):
        name = movie.xpath('div[@class="hd"]/a/span/text()').extract()
        message = movie.xpath('div[@class="bd"]/p/text()').extract()
        star = movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        number = movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span/text()').extract()
        quote = movie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item = DoubanspiderItem()
        item['title'] = ''.join(name)
        item['info'] = quote[0] if quote else ''
        item['score'] = star[0]
        # Collapse whitespace/newlines in the joined description.
        item['content'] = ';'.join(message).replace(' ', '').replace('\n', '')
        # number[1] is e.g. "123456人评价"; keep only the digits before 人.
        item['number'] = number[1].split('人')[0]
        yield item
    if self.start <= 225:
        self.start += 25
        yield scrapy.Request(self.url + str(self.start) + self.end,
                             callback=self.parse)
def parse(self, response):
    """Yield movie name and link for each Top-250 list entry.

    BUG FIX: one `const_item` was shared and re-yielded across the whole
    loop, so every queued item referenced the last movie; items are now
    per-iteration. The original also parsed year/city/director/actors
    into a local `items` dict that was never used anywhere — dead code
    that could crash on malformed rows — so it has been removed.
    """
    for selector in response.xpath("//*[@id='content']/div/div[1]/ol/li"):
        name = selector.xpath(".//span[@class='title']/text()").extract()[0]
        # Kept as a list (extract() result), matching the original field value.
        link = selector.xpath(".//div[@class ='hd']/a/@href").extract()
        item = DoubanspiderItem()
        item['name'] = name
        item['link'] = link
        yield item
def parse(self, response):
    """Parse a Top-250 page into items and advance the offset.

    BUG FIX: the item was created once before the loop and mutated across
    yields; a fresh item is now created per movie.
    """
    for movie in response.xpath('//div[@class="info"]'):
        item = DoubanspiderItem()
        item['title'] = movie.xpath(
            './div[@class="hd"]/a/span[@class="title"]/text()')[0].extract()
        item['score'] = movie.xpath(
            './div[@class="bd"]/div[@class="star"]/span/text()')[0].extract()
        content = movie.xpath('./div[@class="bd"]/p/text()')
        # Missing description/quote fields fall back to the literal 'NULL'.
        item['content'] = content[0].extract().strip() if content else 'NULL'
        info = movie.xpath('./div[@class="bd"]/p[@class="quote"]/span/text()')
        item['info'] = info[0].extract() if info else 'NULL'
        yield item
    if self.offset <= 225:
        self.offset += 25
        url = self.url + str(self.offset) + self.end
        yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Parse a Top-250 page, then rewrite the URL's start offset (+25)
    and request the next page while the offset is <= 225.

    Improvement: the same `re.search` over response.url was executed
    twice; the match is now computed once and reused.
    """
    print(response.url)
    for movie in response.xpath('//div[@class="info"]'):
        movie_name = movie.xpath(
            './div[@class="hd"]/a/span[1]/text()').extract()[0]
        # First two description text nodes joined as "info1 / info2".
        info_parts = movie.xpath('./div[@class="bd"]/p[1]/text()').extract()
        movie_info = info_parts[0].strip() + ' / ' + info_parts[1].strip()
        movie_rating = movie.xpath(
            './div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        movie_quote = movie.xpath(
            './div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item = DoubanspiderItem()
        item['movie_name'] = movie_name
        item['movie_info'] = movie_info
        item['movie_rating'] = movie_rating
        item['movie_quote'] = movie_quote[0] if movie_quote else ''
        yield item
    # Match e.g. "top250?start=25" once; group(1) is the param prefix
    # (may be empty on the first page), group(2) the numeric offset.
    m = re.search(r'top250\?(\D*)(\d+)', response.url)
    page = int(m.group(2))
    prefix = m.group(1) or 'start='
    if page <= 225:
        url = re.sub(r'top250\?(\D*)(\d+)',
                     'top250?' + prefix + str(page + 25), response.url)
        yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Emit one item carrying a fixed poster URL (placeholder spider)."""
    poster_url = 'https://img3.doubanio.com/view/photo/m/public/p480747492.webp'
    item = DoubanspiderItem()
    item['posterLink'] = poster_url
    yield item