def parse_66ys_detail(self, response):
    """Parse a detail page and yield a single ``MovieItem``.

    Pages without description text, or without any magnet/ed2k link, yield
    nothing.

    Args:
        response: Detail-page response. ``response.meta['mvclass']`` is read
            and its first element is stored as the item's class.

    Yields:
        MovieItem: The scraped movie record.
    """
    mvClass = response.meta['mvclass']
    mvname = response.xpath(
        "//div[@class='contentinfo']/h1/text()").extract_first()

    description = "".join(
        response.xpath('//td[@id="dede_content"]/p/text()').extract()
    ).strip()
    if not description:
        return

    # Posters are a collection of 2-3 images; usually the first is the main
    # poster and the rest are stills.
    posters = response.xpath("//div[@id='text']//img/@src").extract()

    # Update time. extract_first() returns None when the cell is missing and
    # the text may not contain a date at all; use the placeholder date in
    # both cases instead of raising.
    raw_time = response.xpath(
        "//table[@width='91%']//tr[2]/td/text()").extract_first()
    date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", raw_time or "")
    update_time = date_match.group(0) if date_match else "2018-07-5"

    magnet_urls = response.xpath(
        '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/@href'
    ).extract()
    magnet_names = response.xpath(
        '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/text()'
    ).extract()
    ed2k_urls = response.xpath(
        '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/@href'
    ).extract()
    ed2k_names = response.xpath(
        '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/text()'
    ).extract()

    # A page with neither kind of download link is not worth saving.
    if not magnet_urls and not ed2k_urls:
        return

    # Download list: magnet links first, ed2k links after them.
    downUrlList = magnet_urls + ed2k_urls
    downTitleList = magnet_names + ed2k_names

    Item = MovieItem()
    Item['movClass'] = mvClass[0]
    Item['downLoadName'] = mvname
    Item['downdtitle'] = ','.join(downTitleList)
    Item['downimgurl'] = ",".join(posters)
    Item['downLoadUrl'] = ','.join(downUrlList)
    Item['mvdesc'] = description
    Item['mv_update_time'] = update_time
    print('---------------save', downUrlList)
    yield Item
def parse_mor(self, response):
    """Parse a detail page and yield a single ``MovieItem``.

    The description is taken from the ``<p>`` text inside ``co_content8``;
    when that is blank, all text under the ``Zoom`` div is used instead.
    Pages with no description, or with neither magnet nor table (FTP)
    links, yield nothing.

    Args:
        response: Detail-page response.

    Yields:
        MovieItem: The scraped movie record, filed under '热门推荐'.
    """
    # NOTE(review): this is the full list from extract(), not a single
    # string -- confirm the pipeline expects a list for downLoadName.
    mvname = response.xpath(
        "//div[@class='title_all']//h1/font/text()").extract()

    mvdesc = response.xpath(
        "//div[@class='co_content8']//p/text()").extract()
    if not "".join(mvdesc).strip():
        mvdesc = response.xpath(
            "//div[@class='co_content8']//div[@id='Zoom']//text()").extract()
    description = "".join(mvdesc).strip()
    if not description:
        return

    # Posters are a collection of 2-3 images; usually the first is the main
    # poster and the rest are stills.
    posters = response.xpath(
        "//div[@class='co_content8']//p/img/@src").extract()
    # Magnet download links (may be several).
    magnet_urls = response.xpath(
        "//div[@class='co_content8']//p/a[contains(@href,'magnet')]/@href"
    ).extract()
    # Per-episode FTP download addresses.
    ftp_urls = response.xpath(
        "//div[@class='co_content8']//table//a/@href").extract()

    # Update time. extract_first() returns None when the <ul> is missing and
    # its text may not contain a date; use a placeholder date in both cases
    # instead of raising.
    raw_time = response.xpath(
        "//div[@class='co_content8']//ul/text()").extract_first()
    date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", raw_time or "")
    update_time = date_match.group(0) if date_match else "2018-07-5"

    # Download list: magnet links first, FTP links after them. For series
    # the magnet list may be empty, so FTP links must be kept on their own.
    downUrlList = magnet_urls + ftp_urls
    if not downUrlList:
        return

    Item = MovieItem()
    Item['movClass'] = '热门推荐'
    Item['downLoadName'] = mvname
    Item['downdtitle'] = "磁力下载"
    # Comma-separated so that several poster URLs stay distinguishable.
    Item['downimgurl'] = ",".join(posters)
    Item['downLoadUrl'] = ','.join(downUrlList)
    Item['mvdesc'] = description
    Item['mv_update_time'] = update_time
    yield Item
def parse_detail(self, response):
    """Parse a topic detail page and yield a single ``MovieItem``.

    Nothing is yielded when the page has no description text or carries
    neither magnet nor ed2k links.

    Args:
        response: Detail-page response; ``response.meta['mvclass']`` holds
            the category (topic) name stored on the item.

    Yields:
        MovieItem: The scraped movie record.
    """
    # Category name, i.e. the topic name.
    category = response.meta['mvclass']
    # Movie title.
    title = response.xpath("//div[@class='box']/h1/text()").extract_first()

    paragraphs = response.xpath("//div[@id='endText']//p/text()").extract()
    if not "".join(paragraphs).strip():
        return

    # Posters are a collection of 2-3 images; usually the first is the main
    # poster and the rest are stills.
    poster_urls = response.xpath("//div[@id='endText']/p/img/@src").extract()

    # Update time: meaningless for topic pages, so a fixed date is stored.
    updated_on = "2018-10-28"

    box_links = "//div[@class='box']//a[contains(@href,'%s')]/%s"
    magnet_links = response.xpath(box_links % ("magnet", "@href")).extract()
    magnet_titles = response.xpath(box_links % ("magnet", "text()")).extract()
    ed2k_links = response.xpath(box_links % ("ed2k", "@href")).extract()
    ed2k_titles = response.xpath(box_links % ("ed2k", "text()")).extract()

    # Without any download link there is nothing to save.
    if not (magnet_links or ed2k_links):
        return

    # Download list: magnet links first, ed2k links after them.
    all_links = magnet_links + ed2k_links
    all_titles = magnet_titles + ed2k_titles

    Item = MovieItem()
    Item['movClass'] = category
    Item['downLoadName'] = title
    Item['downdtitle'] = ','.join(all_titles)
    Item['downimgurl'] = str(",".join(poster_urls))
    Item['downLoadUrl'] = ','.join(all_links)
    Item['mvdesc'] = "".join(paragraphs).strip()
    Item['mv_update_time'] = updated_on
    print('---------------save', str(all_links))
    yield Item
def parse_detail(self, response):
    """Parse a detail page and yield a ``MovieItem`` with online-play links.

    Nothing is yielded when the page has no description text, has neither
    magnet nor ed2k links, or has no online-play link.

    Args:
        response: Detail-page response; ``response.meta['mvClass']`` is
            stored as the item's class.

    Yields:
        MovieItem: The scraped movie record, including ``playUrl`` and
        ``playName``.
    """
    mvClass = response.meta['mvClass']
    mvname = response.xpath(
        "//div[@class='mainleft']//h1/text()").extract_first()

    description = "".join(
        response.xpath("//div[@id='post_content']/p/text()").extract()
    ).strip()
    if not description:
        return

    # Posters are a collection of 2-3 images; usually the first is the main
    # poster and the rest are stills.
    posters = response.xpath(
        "//div[@id='post_content']/p/img/@src").extract()
    # Stored as scraped; this is None when the date span is missing.
    mvTime = response.xpath(
        "//span[@class='info_date info_ico']/text()").extract_first()

    magnet_urls = response.xpath(
        "//div[@id='post_content']//a[contains(@href,'magnet')]/@href"
    ).extract()
    magnet_names = response.xpath(
        "//div[@id='post_content']//a[contains(@href,'magnet')]/text()"
    ).extract()
    ed2k_urls = response.xpath(
        "//div[@id='post_content']//a[contains(@href,'ed2k')]/@href"
    ).extract()
    ed2k_names = response.xpath(
        "//div[@id='post_content']//a[contains(@href,'ed2k')]/text()"
    ).extract()

    # A page with neither kind of download link is not worth saving.
    if not magnet_urls and not ed2k_urls:
        return

    # Download list: magnet links first, ed2k links after them.
    downUrlList = magnet_urls + ed2k_urls
    downTitleList = magnet_names + ed2k_names

    # Online playback links.
    mvPlayUrl = response.xpath(
        "//div[@class='widget box row'][2]/a[@class='lBtn']/@href"
    ).extract()
    mvPlayName = response.xpath(
        "//div[@class='widget box row'][2]/a[@class='lBtn']/text()"
    ).extract()

    # Pages without an online-play link are skipped. Return instead of a
    # bare ``yield`` so that no ``None`` is handed to the framework.
    if not mvPlayUrl:
        print("---------------", "无在线播放地址")
        return

    Item = MovieItem()
    Item['movClass'] = mvClass
    Item['downLoadName'] = mvname
    Item['downdtitle'] = ','.join(downTitleList)
    Item['downimgurl'] = ",".join(posters)
    Item['downLoadUrl'] = ','.join(downUrlList)
    Item['mvdesc'] = description
    Item['mv_update_time'] = mvTime
    Item['playUrl'] = ','.join(mvPlayUrl)
    Item['playName'] = ','.join(mvPlayName)
    print('---------------save', ','.join(mvPlayUrl), '-------',
          ','.join(mvPlayName))
    yield Item
def parse_detail(self, response):
    """Parse a detail page and yield a single ``MovieItem`` with a stable id.

    Pages without description text, or without any magnet/ed2k link, yield
    nothing.

    Args:
        response: Detail-page response. ``response.meta['mvclass']`` is read
            and its first element is stored as the item's class;
            ``response.meta['mvUrl']`` is hashed into the item's id.

    Yields:
        MovieItem: The scraped movie record, including ``mv_md5_id``.
    """
    mvClass = response.meta['mvclass']
    # Unique key for the series: the title changes when a show is updated,
    # so the crawler identifies a show by the MD5 of its URL instead.
    mvUrl = response.meta['mvUrl']
    mvId = hashlib.md5(mvUrl.encode(encoding='UTF-8')).hexdigest()
    print('---------------------mvid-------------', mvId)

    # NOTE(review): this is the full list from extract(), not a single
    # string -- confirm the pipeline expects a list for downLoadName.
    mvname = response.xpath('//div[@class="title"]/a/text()').extract()

    description = "".join(
        response.xpath('//td[@id="dede_content"]/p/text()').extract()
    ).strip()
    if not description:
        return

    # Posters are a collection of 2-3 images; usually the first is the main
    # poster and the rest are stills.
    posters = response.xpath(
        '//*[@id="dede_content"]/p/img/@src').extract()

    # Update time. extract_first() returns None when the cell is missing and
    # the text may not contain a date at all; use the placeholder date in
    # both cases instead of raising.
    raw_time = response.xpath(
        "//table[@width='91%']//tr[2]/td/text()").extract_first()
    date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", raw_time or "")
    update_time = date_match.group(0) if date_match else "2018-07-5"

    magnet_urls = response.xpath(
        '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/@href'
    ).extract()
    magnet_names = response.xpath(
        '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/text()'
    ).extract()
    ed2k_urls = response.xpath(
        '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/@href'
    ).extract()
    ed2k_names = response.xpath(
        '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/text()'
    ).extract()

    # A page with neither kind of download link is not worth saving.
    if not magnet_urls and not ed2k_urls:
        return

    # Download list: magnet links first, ed2k links after them.
    downUrlList = magnet_urls + ed2k_urls
    downTitleList = magnet_names + ed2k_names

    Item = MovieItem()
    Item['movClass'] = mvClass[0]
    Item['downLoadName'] = mvname
    Item['downdtitle'] = ','.join(downTitleList)
    Item['downimgurl'] = ",".join(posters)
    Item['downLoadUrl'] = ','.join(downUrlList)
    Item['mvdesc'] = description
    Item['mv_update_time'] = update_time
    Item['mv_md5_id'] = mvId
    print('---------------save', downUrlList)
    yield Item