Exemplo n.º 1
0
    def parse_66ys_detail(self, response):
        """Parse a detail page and yield one ``MovieItem`` describing it.

        Nothing is yielded when the page has no description text or when it
        carries neither magnet nor ed2k links.

        Args:
            response: Page response. ``response.meta['mvclass']`` must be
                indexable; its first element is stored as the item category.

        Yields:
            MovieItem: Item holding the title, description, comma-joined
            poster URLs, comma-joined download URLs/titles and update date.
        """
        mvClass = response.meta['mvclass']
        mvname = response.xpath(
            "//div[@class='contentinfo']/h1/text()").extract_first()
        description = "".join(
            response.xpath('//td[@id="dede_content"]/p/text()').extract()
        ).strip()
        if not description:
            return

        # Posters: usually 2-3 images, the first is the main poster and the
        # rest are stills.
        mvPoster = response.xpath("//div[@id='text']//img/@src").extract()

        # Update time. extract_first() gives None when the cell is missing and
        # the cell text may not contain a date at all; both cases use the same
        # placeholder date the original used for an empty cell.
        mv_time = response.xpath(
            "//table[@width='91%']//tr[2]/td/text()").extract_first()
        date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", mv_time or "")
        update_time = date_match.group(0) if date_match else "2018-07-5"

        mgnetUrl = response.xpath(
            '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/@href'
        ).extract()
        mgnetName = response.xpath(
            '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/text()'
        ).extract()

        ed2k = response.xpath(
            '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/@href'
        ).extract()
        ed2kName = response.xpath(
            '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/text()'
        ).extract()

        # A page without any download link is not worth storing.
        if not mgnetUrl and not ed2k:
            return

        # Download links: magnet links first, then the ed2k links.
        downUrlList = []
        downTitleList = []
        if mgnetUrl:
            downUrlList.extend(mgnetUrl)
            downTitleList.extend(mgnetName)
        if ed2k:
            downUrlList.extend(ed2k)
            downTitleList.extend(ed2kName)

        Item = MovieItem()
        Item['movClass'] = mvClass[0]
        Item['downLoadName'] = mvname
        Item['downdtitle'] = ','.join(downTitleList)
        Item['downimgurl'] = ",".join(mvPoster)
        Item['downLoadUrl'] = ','.join(downUrlList)
        Item['mvdesc'] = description
        Item['mv_update_time'] = update_time
        print('---------------save', downUrlList)
        yield Item
Exemplo n.º 2
0
    def parse_mor(self, response):
        """Parse a "hot recommendation" detail page and yield a ``MovieItem``.

        Nothing is yielded when the page has no description text, no
        recognisable update date, or neither magnet nor table download links.

        Args:
            response: Page response of the detail page.

        Yields:
            MovieItem: Item with the fixed category '热门推荐', the title
            nodes, description, posters, comma-joined download URLs and the
            update date.
        """
        # NOTE(review): extract() returns a list here, unlike the sibling
        # callbacks that use extract_first(); kept as-is for the pipeline.
        mvname = response.xpath("//div[@class='title_all']//h1/font/text()").extract()

        # Description: prefer the <p> text, fall back to the whole Zoom block
        # when the paragraphs are missing or blank.
        mvdesc = response.xpath("//div[@class='co_content8']//p/text()").extract()
        if not "".join(mvdesc).strip():
            mvdesc = response.xpath("//div[@class='co_content8']//div[@id='Zoom']//text()").extract()
        description = "".join(mvdesc).strip()
        if not description:
            return

        # Posters: usually 2-3 images, the first is the main poster and the
        # rest are stills.
        mvPoster = response.xpath("//div[@class='co_content8']//p/img/@src").extract()
        # Magnet download links (may be several).
        mv_magnetUrl = response.xpath("//div[@class='co_content8']//p/a[contains(@href,'magnet')]/@href").extract()
        # Per-episode download addresses listed in the table.
        mv_ftp = response.xpath("//div[@class='co_content8']//table//a/@href").extract()

        # Update time. The original code raised when the node was missing or
        # held no date (so no item was produced); skip the page explicitly.
        mv_time = response.xpath("//div[@class='co_content8']//ul/text()").extract_first()
        date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", mv_time or "")
        if date_match is None:
            return

        if not mv_magnetUrl and not mv_ftp:
            return
        # Magnet links first, then the table links. The table links are
        # appended even when there is no magnet link, so series pages that
        # only offer table downloads no longer end up with an empty URL.
        downUrlList = list(mv_magnetUrl) + list(mv_ftp)

        Item = MovieItem()
        Item['movClass'] = '热门推荐'
        Item['downLoadName'] = mvname
        Item['downdtitle'] = "磁力下载"
        # NOTE(review): posters are joined without a separator here, while
        # the other callbacks join with ","; confirm what the consumer expects.
        Item['downimgurl'] = "".join(mvPoster)
        Item['downLoadUrl'] = ','.join(downUrlList)
        Item['mvdesc'] = description
        Item['mv_update_time'] = date_match.group(0)
        yield Item
Exemplo n.º 3
0
    def parse_detail(self, response):
        """Parse a topic detail page and yield one ``MovieItem`` for it.

        The category (topic name) is read from ``response.meta['mvclass']``.
        Nothing is yielded when the page has no description text or when it
        contains neither magnet nor ed2k links.
        """
        # Category name, i.e. the topic the page belongs to.
        category = response.meta['mvclass']
        # Movie title.
        title = response.xpath("//div[@class='box']/h1/text()").extract_first()

        description = "".join(
            response.xpath("//div[@id='endText']//p/text()").extract()
        ).strip()
        if not description:
            return

        # Posters: usually 2-3 images, the first is the main poster and the
        # rest are stills.
        posters = response.xpath("//div[@id='endText']/p/img/@src").extract()

        magnet_links = response.xpath("//div[@class='box']//a[contains(@href,'magnet')]/@href").extract()
        magnet_titles = response.xpath("//div[@class='box']//a[contains(@href,'magnet')]/text()").extract()
        ed2k_links = response.xpath("//div[@class='box']//a[contains(@href,'ed2k')]/@href").extract()
        ed2k_titles = response.xpath("//div[@class='box']//a[contains(@href,'ed2k')]/text()").extract()

        # Without any download link there is nothing to store.
        if not (magnet_links or ed2k_links):
            return

        # Download links: magnet links first, followed by the ed2k links;
        # titles are only taken for a link kind that is actually present.
        links = []
        titles = []
        for hrefs, labels in ((magnet_links, magnet_titles),
                              (ed2k_links, ed2k_titles)):
            if hrefs:
                links.extend(hrefs)
                titles.extend(labels)

        movie = MovieItem()
        movie['movClass'] = category
        movie['downLoadName'] = title
        movie['downdtitle'] = ','.join(titles)
        movie['downimgurl'] = str(",".join(posters))
        movie['downLoadUrl'] = ','.join(links)
        movie['mvdesc'] = description
        # Update time: meaningless for topic pages, so a fixed date is stored.
        movie['mv_update_time'] = "2018-10-28"
        print('---------------save', str(links))
        yield movie
Exemplo n.º 4
0
    def parse_detail(self, response):
        """Parse a detail page that offers online playback and yield its item.

        Nothing is yielded when the page has no description text, when it has
        neither magnet nor ed2k links, or when it has no online-play link.
        (The original yielded a bare ``None`` in the last case; the callback
        now simply ends.)

        Args:
            response: Page response; ``response.meta['mvClass']`` is stored
                as the item category.

        Yields:
            MovieItem: Item with title, description, posters, download
            URLs/titles, update time and comma-joined play URLs/names.
        """
        mvClass = response.meta['mvClass']
        mvname = response.xpath(
            "//div[@class='mainleft']//h1/text()").extract_first()
        description = "".join(
            response.xpath("//div[@id='post_content']/p/text()").extract()
        ).strip()
        if not description:
            return

        # Posters: usually 2-3 images, the first is the main poster and the
        # rest are stills.
        mvPoster = response.xpath(
            "//div[@id='post_content']/p/img/@src").extract()

        # Stored as extracted; may be None when the date span is missing.
        mvTime = response.xpath(
            "//span[@class='info_date info_ico']/text()").extract_first()

        mgnetUrl = response.xpath(
            "//div[@id='post_content']//a[contains(@href,'magnet')]/@href"
        ).extract()
        mgnetName = response.xpath(
            "//div[@id='post_content']//a[contains(@href,'magnet')]/text()"
        ).extract()
        ed2k = response.xpath(
            "//div[@id='post_content']//a[contains(@href,'ed2k')]/@href"
        ).extract()
        ed2k_name = response.xpath(
            "//div[@id='post_content']//a[contains(@href,'ed2k')]/text()"
        ).extract()

        # A page without any download link is skipped.
        if not mgnetUrl and not ed2k:
            return

        # Download links: magnet links first, then the ed2k links.
        downUrlList = []
        downTitleList = []
        if mgnetUrl:
            downUrlList.extend(mgnetUrl)
            downTitleList.extend(mgnetName)
        if ed2k:
            downUrlList.extend(ed2k)
            downTitleList.extend(ed2k_name)

        # Online-play addresses.
        mvPlayUrl = response.xpath(
            "//div[@class='widget box row'][2]/a[@class='lBtn']/@href"
        ).extract()
        mvPlayName = response.xpath(
            "//div[@class='widget box row'][2]/a[@class='lBtn']/text()"
        ).extract()

        # Only pages with an online-play link are stored; end the generator
        # instead of yielding None.
        if not mvPlayUrl:
            print("---------------", "无在线播放地址")
            return

        Item = MovieItem()
        Item['movClass'] = mvClass
        Item['downLoadName'] = mvname
        Item['downdtitle'] = ','.join(downTitleList)
        Item['downimgurl'] = ",".join(mvPoster)
        Item['downLoadUrl'] = ','.join(downUrlList)
        Item['mvdesc'] = description
        Item['mv_update_time'] = mvTime
        Item['playUrl'] = ','.join(mvPlayUrl)
        Item['playName'] = ','.join(mvPlayName)
        print('---------------save', ','.join(mvPlayUrl), '-------',
              ','.join(mvPlayName))
        yield Item
Exemplo n.º 5
0
    def parse_detail(self, response):
        """Parse a series detail page and yield one ``MovieItem`` for it.

        The item carries an MD5 hex digest of ``response.meta['mvUrl']`` so a
        series can be recognised across updates even when its title changes.
        Nothing is yielded when the page has no description text or when it
        carries neither magnet nor ed2k links.

        Args:
            response: Page response. ``response.meta['mvclass']`` must be
                indexable (its first element becomes the category) and
                ``response.meta['mvUrl']`` must be a string.

        Yields:
            MovieItem: Item with title nodes, description, posters, download
            URLs/titles, update date and the MD5 id.
        """
        mvClass = response.meta['mvclass']
        # Stable key for the series: titles change when new episodes are
        # added, so the crawler identifies a series by the hash of its URL.
        mvUrl = response.meta['mvUrl']
        mvId = hashlib.md5(mvUrl.encode(encoding='UTF-8')).hexdigest()
        print('---------------------mvid-------------', mvId)
        mvname = response.xpath('//div[@class="title"]/a/text()').extract()
        description = "".join(
            response.xpath('//td[@id="dede_content"]/p/text()').extract()
        ).strip()
        if not description:
            return

        # Posters: usually 2-3 images, the first is the main poster and the
        # rest are stills.
        mvPoster = response.xpath(
            '//*[@id="dede_content"]/p/img/@src').extract()

        # Update time. extract_first() gives None when the cell is missing and
        # the cell text may not contain a date at all; both cases use the same
        # placeholder date the original used for an empty cell.
        mv_time = response.xpath(
            "//table[@width='91%']//tr[2]/td/text()").extract_first()
        date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", mv_time or "")
        update_time = date_match.group(0) if date_match else "2018-07-5"

        mgnetUrl = response.xpath(
            '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/@href'
        ).extract()
        mgnetName = response.xpath(
            '//*[@id="dede_content"]/table//a[contains(@href,"magnet")]/text()'
        ).extract()

        ed2k = response.xpath(
            '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/@href'
        ).extract()
        ed2kName = response.xpath(
            '//td[@id="dede_content"]//a[contains(@href,"ed2k")]/text()'
        ).extract()

        # A page without any download link is not worth storing.
        if not mgnetUrl and not ed2k:
            return

        # Download links: magnet links first, then the ed2k links.
        downUrlList = []
        downTitleList = []
        if mgnetUrl:
            downUrlList.extend(mgnetUrl)
            downTitleList.extend(mgnetName)
        if ed2k:
            downUrlList.extend(ed2k)
            downTitleList.extend(ed2kName)

        Item = MovieItem()
        Item['movClass'] = mvClass[0]
        Item['downLoadName'] = mvname
        Item['downdtitle'] = ','.join(downTitleList)
        Item['downimgurl'] = ",".join(mvPoster)
        Item['downLoadUrl'] = ','.join(downUrlList)
        Item['mvdesc'] = description
        Item['mv_update_time'] = update_time
        Item['mv_md5_id'] = mvId
        print('---------------save', downUrlList)
        yield Item