def item_parse(self, response):
    # Parse the film detail/playback page.
    detail = response.meta['detail']
    soup = BeautifulSoup(detail, "lxml")
    values = soup.find('a')
    cover_image = {"img": values['src'], "width": 0, "height": 0}
    video_src_list = response.xpath('//*[@id="playlist1"]/ul/li').extract()

    item = VipfreeItem()
    # Site-unique identifier.
    item['item_id'] = response.url.split('/')[-1].split('.')[-2]
    # Film title.
    item['title'] = values['title']
    # Cover image info (the field has no default, so assign a fresh list).
    item['cover_images'] = [cover_image]
    # Playback page link.
    item['content_url'] = response.url
    # Synopsis.
    item['description'] = response.xpath('//*[@id="list3"]/div/div/text()').extract()[-1]
    # Number of playback sources.
    item['video_src_cnt'] = len(video_src_list)
    # Number of cover images.
    item['cover_img_cnt'] = 1
    logger.info(item['title'])
    logger.info(item['description'])
    yield item
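# The VipfreeItem class is defined elsewhere; a minimal sketch consistent
# with the fields used above (field names come from the code, the rest is
# assumed). A scrapy.Field has no default value, which is why cover_images
# is assigned a fresh list rather than appended to:
import scrapy

class VipfreeItem(scrapy.Item):
    item_id = scrapy.Field()        # site-unique identifier
    title = scrapy.Field()          # film title
    cover_images = scrapy.Field()   # list of {"img", "width", "height"} dicts
    content_url = scrapy.Field()    # playback page URL
    description = scrapy.Field()    # synopsis
    video_src_cnt = scrapy.Field()  # number of playback sources
    cover_img_cnt = scrapy.Field()  # number of cover images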
def parse(self, response):
    logger.info('Start scraping proxy data.')
    ip_list = response.xpath('//*[@id="ip_list"]')
    trs = ip_list[0].xpath('tr')
    items = []
    for ip in trs[1:]:  # skip the table header row
        pre_item = CollectipsIpItem()
        pre_item['IP'] = ip.xpath('td[3]/text()')[0].extract()
        pre_item['PORT'] = ip.xpath('td[4]/text()')[0].extract()
        pre_item['POSITION'] = ip.xpath('string(td[5])')[0].extract().strip()
        pre_item['TYPE'] = ip.xpath('td[7]/text()')[0].extract()
        pre_item['SPEED'] = ip.xpath('td[8]/div[@class="bar"]/@title').re(
            r'\d{0,2}\.\d*')[0]
        pre_item['LAST_CHECK_TIME'] = ip.xpath('td[10]/text()')[0].extract()
        items.append(pre_item)
    return items
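# The CollectipsIpItem class is defined elsewhere; a minimal sketch
# consistent with the fields used above (field names from the code,
# the rest assumed):
import scrapy

class CollectipsIpItem(scrapy.Item):
    IP = scrapy.Field()               # proxy address
    PORT = scrapy.Field()             # proxy port
    POSITION = scrapy.Field()         # geographic location
    TYPE = scrapy.Field()             # proxy type, e.g. HTTP / HTTPS
    SPEED = scrapy.Field()            # speed parsed from the title attribute
    LAST_CHECK_TIME = scrapy.Field()  # last verification time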
def parse(self, response): """ 解析电影列表页 """ logger.info(u'解析电影列表页:%s' % response.url) # 找出所有影片链接 detail_link_list = response.xpath( '//div[@class="index-area clearfix"]/ul/li/a/@href').extract( ) # 获取当前页所有详情链接 detail_list = response.xpath( '//div[@class="index-area clearfix"]/ul/li/a').extract( ) # 获取当前页所有详情信息 for detail_link, detail in zip(detail_link_list, detail_list): yield scrapy.Request(url=self.base_domain + detail_link, headers=headers, meta={'detail': detail}, callback=self.item_parse) # 解析单页页 break # 超出下一页按钮 next_page_list = response.xpath( '//div[@class="page mb clearfix"]/a').extract() # 获取当前页所有详情链接 for button_item in next_page_list: soup = BeautifulSoup(button_item, "lxml") link_list = soup.findAll('a') for tag_a in link_list: if u'>' == tag_a.text: link = self.base_domain + tag_a['href'] logger.info(u"下一页链接: %s" % link) yield scrapy.Request(url=link, callback=self.parse)
def __init__(self):
    super(XiaomaSpider, self).__init__()
    # Initialize start_urls with one movie-list URL per page.
    # %26 is a URL-encoded '&' so the inner query string survives the outer one.
    movie_root = u'http://efx6.cn/movie.php?m=http://www.360kan.com/dianying/list.php?cat=all%26pageno={pageno}'
    for i in range(MAX_PAGE_INDEX):
        url = movie_root.format(pageno=str(i + 1))
        logger.info(u'Initialized list-page URL %s' % url)
        self.start_urls.append(url)
def item_parse(self, response):
    # Parse the film detail/playback page.
    detail = response.meta['detail']
    # Cover image, cast and title come from the list-page snippet.
    soup_meta = BeautifulSoup(detail, "lxml")
    movie_tag = soup_meta.find('a')
    title = movie_tag['title']  # film title
    # The detail href is not needed here; response.url already points at this page.
    img_url = soup_meta.find('img')['src']  # cover image URL
    year = soup_meta.find('span', {'class': 'hint'}).text  # release year
    zhuyan = soup_meta.find('p', {'class': 'star'}).text  # leading actors

    play_item = response.xpath('//p[@class="vspy"]/a').extract()
    video_src_cnt = len(play_item)  # number of playback sources

    tyyp = response.xpath('/html/body/div[1]/section/div[1]/div/div[5]/div/h3[1]/span').extract()[0]
    tyyp = BeautifulSoup(tyyp, "lxml").text.replace(' ', '')  # film category

    desc = response.xpath('//p[@class="item-desc js-close-wrap"]').extract()[0]
    desc = BeautifulSoup(desc, "lxml").text.replace('\n', '')  # synopsis

    cover_image = {"img": img_url, "width": 0, "height": 0}

    item = QuanminItem()
    # Site-unique identifier.
    item['item_id'] = response.url.split('/')[-1].split('.')[-2]
    # Film title.
    item['title'] = title
    # Cover image info.
    item['cover_images'] = [cover_image]
    # Playback page link.
    item['content_url'] = response.url
    # Synopsis.
    item['description'] = desc
    # Number of playback sources.
    item['video_src_cnt'] = video_src_cnt
    # Number of cover images.
    item['cover_img_cnt'] = len(item['cover_images'])
    # Leading actors.
    item['actor_list'] = zhuyan.replace('/', ',').replace(' ', ',')
    # Release year.
    item['show_year'] = year
    # Film category.
    item['item_catagory'] = tyyp
    logger.info(u"Parsed: %s url: %s" % (item['title'], item['content_url']))
    yield item
def process_item(self, item, spider):
    # Drop items with any empty field, otherwise persist to MongoDB.
    for field, value in item.items():
        if not value:
            raise DropItem('Missing {0}!'.format(field))
    self.collection.insert_one(dict(item))  # pymongo 3+; insert() was removed in pymongo 4
    logger.info(u'Item stored in MongoDB')
    return item
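# The pipeline needs a MongoDB collection and must be registered in the
# project settings; a minimal sketch assuming pymongo, with hypothetical
# names (MongoPipeline, database 'movie', collection 'items') that do not
# appear in the source:
import pymongo

class MongoPipeline(object):
    def __init__(self):
        # Hypothetical connection details; a real project would usually
        # read these from the Scrapy settings.
        client = pymongo.MongoClient('localhost', 27017)
        self.collection = client['movie']['items']

# settings.py (sketch): activate the pipeline.
# ITEM_PIPELINES = {'movie_spider.pipelines.MongoPipeline': 300}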
def parse(self, response):
    logger.info(u'Parsing movie list page: %s' % response.url)
    # Collect every film link on the current page.
    detail_link_list = response.xpath(
        '//div[@class="s-tab-main"]/ul/li/a/@href').extract()
    # Collect the matching detail snippets.
    detail_list = response.xpath(
        '//div[@class="s-tab-main"]/ul/li/a').extract()
    for detail_link, detail in zip(detail_link_list, detail_list):
        url = self.base_domain + detail_link[1:]  # strip the leading '/'
        logger.info(u'Requesting playback page %s' % url)
        yield scrapy.Request(url=url,
                             headers=headers,
                             meta={'detail': detail},
                             callback=self.item_parse)  # parse the playback page
def item_parse(self, response):
    logger.info(u'Parsing movie detail page: %s' % response.url)
    detail = response.meta['detail']
    # Cover image, cast and title come from the list-page snippet.
    soup_meta = BeautifulSoup(detail, "lxml")
    movie_tag = soup_meta.find('a')
    title = movie_tag['title']  # film title
    img_url = soup_meta.find('img')['src']  # cover image URL
    score = soup_meta.find('span', {'class': 's2'}).text  # rating
    zhuyan = soup_meta.find('p', {'class': 'star'}).text  # leading actors

    desc = response.xpath('//p[@class="item-desc js-close-wrap"]').extract()[0]
    desc = BeautifulSoup(desc, "lxml").text.replace('\n', '')  # synopsis

    cover_image = {"img": img_url, "width": 0, "height": 0}

    item = XiaomaItem()
    # Site-unique identifier.
    item['item_id'] = response.url.split('/')[-1].split('.')[-2]
    # Film title.
    item['title'] = title
    # Cover image info.
    item['cover_images'] = [cover_image]
    # Playback page link.
    item['content_url'] = response.url
    # Synopsis.
    item['description'] = desc
    # Number of playback sources.
    item['video_src_cnt'] = 1
    # Number of cover images.
    item['cover_img_cnt'] = len(item['cover_images'])
    # Leading actors.
    item['actor_list'] = zhuyan.replace('/', ',').replace(' ', ',')
    # Rating.
    item['score'] = score
    logger.info(u"Parsed: %s url: %s" % (item['title'], item['content_url']))
    yield item
def parse(self, response):
    # Parse a list page reached from start_urls.
    logger.info(u'Parsing page url: %s' % response.url)
    # Collect every film link on the current page.
    detail_link_list = response.xpath(
        '//html/body/section/div[3]/div/div[1]/div/div/ul[2]/li/a/@href').extract()
    # Collect the matching detail snippets.
    detail_list = response.xpath(
        '//html/body/section/div[3]/div/div[1]/div/div/ul[2]/li/a').extract()
    for detail_link, detail in zip(detail_link_list, detail_list):
        yield scrapy.Request(url=detail_link,
                             meta={'detail': detail},
                             headers=headers,
                             callback=self.item_parse)  # parse the detail page

    # Stop paginating once the page cap is reached.
    if self.page_count > MAX_PAGE_INDEX:
        return
    self.page_count += 1

    # The next-page button is always present even when a page contains
    # only ads, so a guard on the number of films could go here:
    # if not detail_link_list:
    #     return

    # Find the next-page button ('下一页' is the site's next-page label).
    next_page_list = response.xpath('//div[@class="paging"]').extract()
    for button_item in next_page_list:
        soup = BeautifulSoup(button_item, "lxml")
        link_list = soup.findAll('a')
        for tag_a in link_list:
            if u'下一页' in tag_a.text:
                link = self.base_domain + tag_a['href']
                logger.info(u"Next page link: %s" % link)
                yield scrapy.Request(url=link, headers=headers, callback=self.parse)
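# The spider class around the parse() above is not shown; a minimal
# sketch of the state it relies on (the class name and cap value are
# hypothetical, the attribute names come from the code):
import scrapy

MAX_PAGE_INDEX = 10  # hypothetical cap on pages to follow

class ZxkkSpider(scrapy.Spider):
    name = 'zxkk'

    def __init__(self, *args, **kwargs):
        super(ZxkkSpider, self).__init__(*args, **kwargs)
        self.page_count = 0  # pages followed so far
        self.base_domain = 'http://example.invalid'  # hypothetical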
def sub_parse(self, response):
    # Parse one list page and follow pagination.
    # Collect every film link on the current page.
    detail_link_list = response.xpath('//div[@class="item"]/ul/div/a/@href').extract()
    # Collect the matching detail snippets.
    detail_list = response.xpath('//div[@class="item"]/ul/div/a').extract()
    for detail_link, detail in zip(detail_link_list, detail_list):
        yield scrapy.Request(url=self.base_domain + detail_link[1:],
                             meta={'detail': detail},
                             headers=headers,
                             callback=self.item_parse)  # parse the detail page

    # Find the next-page button ('下一页' is the site's next-page label).
    next_page_list = response.xpath('/html/body/div[2]/div/div[3]/div[3]/ul/li/a').extract()
    for button_item in next_page_list:
        if u'下一页' in button_item:
            logger.info(u'------------ turning page ------------')
            soup = BeautifulSoup(button_item, "lxml")
            link = self.base_domain + '/' + soup.find('a')['href']
            logger.info(link)
            yield scrapy.Request(url=link, headers=headers, callback=self.sub_parse)
def item_parse(self, response):
    # Parse the film introduction page.
    link_list = response.xpath("(.//div[@class='videourl clearfix'])[1]/ul/li/a/@href").extract()
    title_list = response.xpath("(.//div[@class='videourl clearfix'])[1]/ul/li/a/@title").extract()  # feature episodes
    title = response.xpath(".//dt[@class='name']/text()").extract()[0]
    protagonist = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    # NOTE: the selectors below all duplicate the protagonist xpath; the
    # correct dt index for each field still needs to be filled in.
    mtype = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    director = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    description = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    show_year = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    region = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    lang = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()
    cover_url = response.xpath(".//div[@class='ct-c']/dt[1]/text()").extract()

    logger.info(protagonist)
    logger.info("title %s, link %s" % (str(len(title_list)), str(len(link_list))))
    # Yield one item per playback link (one per episode/source).
    for i in range(len(title_list)):
        logger.info("title index %d" % (i,))
        logger.info("title value %s" % (title_list[i],))
        item = Liliyy123Item()
        item['title'] = title
        item['protagonist'] = protagonist
        item['type'] = mtype
        item['director'] = director
        item['description'] = description
        item['show_year'] = show_year
        item['region'] = region
        item['lang'] = lang
        item['cover_url'] = cover_url
        item['play_url'] = self.base_domain + link_list[i]
        yield item
def item_parse(self, response): """ 解析影片播放详情页 """ logger.info(u'解析影片播放详情页: %s' % response.url) # 解析影片介绍页面 logger.info(u'解析影片介绍页面: %s' % response.url) detail = response.meta['detail'] # 图片, 片名 soup_meta = BeautifulSoup(detail, "lxml") title = soup_meta.find('p', {'class': 'name'}).text #片名 img_url = soup_meta.find('img')['data-original'] # 封面图片地址 actors = soup_meta.findAll('p', {'class': 'actor'}) actor_list = actors[0].text # 演员表 item_catagory = actors[1].text # 影片类型 show_year = actors[2].text.split('/')[0] # 上映年份 region = actors[2].text.split('/')[1] # 所属地区 # -------------------------------------------------- # director = response.xpath( '//div[@class="ct-c"]/dl/dd[1]/a/text()').extract() # 导演 if len(director) == 0: director = '' else: director = director[0] lang = response.xpath( '//div[@class="ct-c"]/dl/dd[4]/text()').extract() # 影片语言 if len(lang) == 0: lang = '' else: lang = lang[0] play_item = response.xpath( '//div[@class="playfrom tab8 clearfix"]/ul/li').extract() video_src_list = len(play_item) # 播放源数量 desc = response.xpath('////div[@name="ee"]').extract()[0] desc = BeautifulSoup(desc, "lxml").text.replace('\n', '') # 简介 cover_image = {"img": img_url, "width": 0, "heigh": 0} item = ZxkkItem() # 站内唯一标识 item['item_id'] = response.url.split('?')[-1].split('.')[-2] # 影片标题 item['title'] = title # 图片信息 item['cover_images'].append(cover_image) # 播放页链接 item['content_url'] = response.url # 介绍 item['description'] = desc # 播放地址数/片源数量 item['video_src_cnt'] = video_src_list # 图片数量 item['cover_img_cnt'] = len(item['cover_images']) # 导员 item['director'] = director # 演员表 item['actor_list'] = actor_list # 上映年份 item['show_year'] = show_year # 影片类型 item['item_catagory'] = item_catagory # 影片类型 item['sub_channel'] = item_catagory # 上映年份 item['region'] = region logger.info(u"成功解析: %s url: %s" % (item['title'], item['content_url'])) yield item
# -*- coding: utf-8 -*-
import os
import sys

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, base_dir)

from scrapy.cmdline import execute
from movie_spider.common import logger
from rest_spider import images_loader

# Lili Video
# execute(['scrapy', 'crawl', 'lili'])
# Vipfree movie site
# execute(['scrapy', 'crawl', 'vipfree'])
# Quanmin Cinema
# execute(['scrapy', 'crawl', 'quanmin'])
# Xiaoma Cinema
# execute(['scrapy', 'crawl', 'xiaoma'])
# Zxkk (watch online)
execute(['scrapy', 'crawl', 'zxkk'])

# NOTE: scrapy.cmdline.execute() calls sys.exit() once the crawl
# finishes, so the two lines below are never reached.
# Download images and rewrite relative links.
images_loader.master_main()
logger.info(u'All tasks finished')
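# If the image-download step must run in the same process after the
# crawl, one option is Scrapy's CrawlerProcess API, which returns
# control after process.start() instead of exiting; a minimal sketch
# under that assumption (spider name 'zxkk' as above):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from movie_spider.common import logger
from rest_spider import images_loader

process = CrawlerProcess(get_project_settings())
process.crawl('zxkk')
process.start()  # blocks until the crawl finishes, then returns

images_loader.master_main()  # now actually reached
logger.info(u'All tasks finished')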
def start_requests(self):
    # Runs once: fetch each start URL with custom headers and route the
    # response to sub_parse.
    logger.info(u'Starting crawl')
    for url in self.start_urls:
        yield scrapy.Request(url, headers=headers, callback=self.sub_parse)
def parse(self, response):
    # Debug stub: log the decoded response body.
    logger.info(response.text)