Пример #1
0
 def video_page(self, response: HtmlResponse):
     """Parse a single watch page and yield a download item.

     The available quality list is recovered by executing the player's
     inline JavaScript with js2py; only the best (last) entry is used,
     and anything below 720p is dropped.  When SQL support is enabled,
     duplicates are logged and new records are persisted.
     """
     title = response.css('h1.title').css('span::text').get()
     channel = response.css('div.video-actions-container').css(
         'div.usernameWrap.clearfix').css('a::text').get()
     # Both the inline script and the video id hang off the #player node.
     player = response.css('div.video-wrapper').css('#player')
     raw_js = player.css('script').get()
     video_id = player.css('::attr(data-video-id)').get()
     # Strip the <script> wrapper and everything from the loader call on,
     # leaving only the quality-list definitions.
     snippet = raw_js.split('<script type="text/javascript">')[1]
     snippet = snippet.split('loadScriptUniqueId')[0]
     exec_js = '{0}\nqualityItems_{1};'.format(snippet, video_id)
     wrapped = js2py.eval_js(exec_js)  # type: js2py.base.JsObjectWrapper
     items = wrapped.to_list()  # type: list
     best = items[-1]
     quality = best['text'].split('p')[0]
     if int(quality) < 720:
         return
     video_url = best['url']
     self.logger.info('parse [%s] success, url: %s', title, video_url)
     if self.settings.get('ENABLE_SQL'):
         rows = self.data_base.select_all_by_title_my_follow(title)
         if rows:
             for row in rows:
                 self.logger.error('has duplicate record: %s', row)
         else:
             self.data_base.save_my_follow(title, channel, video_url,
                                           response.url)
     yield PornhubItem(file_urls=video_url,
                       file_name=title,
                       file_channel=channel)
Пример #2
0
 def content(self, response):
     """Parse the flashvars JSON embedded in a video page.

     Yields a single PornhubItem carrying the video title and the URL of
     the last media definition with a non-empty ``videoUrl``.
     """
     print(response)  # NOTE(review): debug output; prefer self.logger
     item = PornhubItem()
     page = Selector(response).extract()
     # Raw strings: '\{' in a plain string is an invalid escape sequence
     # (SyntaxWarning on modern Python).
     info = re.search(r'var flashvars(.*)=(.*?);\n', page).group()
     result = json.loads(re.findall(r'(\{.*?\});', info)[0])
     media_definitions = result.get('mediaDefinitions')
     # Each assignment overwrites the previous one, so the item ends up
     # with the LAST definition that has a non-empty URL (original
     # behavior, kept intentionally).
     for definition in media_definitions:
         video_url = definition['videoUrl']
         if video_url != '':
             item['file_urls'] = [video_url]
     item['name'] = result.get('video_title')
     yield item
Пример #3
0
    def parse_ph_info(self, response):
        """Parse the flashvars JSON from a video page into a PornhubItem.

        Extracted fields: duration, title, thumbnail image URL, canonical
        link URL and the 480p stream URL.
        """
        ph_item = PornhubItem()
        # BUG FIX: response.body is bytes in Scrapy/Python 3, and a str
        # regex pattern cannot be matched against bytes (TypeError); use
        # the decoded response.text instead.
        ph_info = re.findall(r'var flashvars =(.*?),\n', response.text)

        ph_info_json = json.loads(ph_info[0])
        ph_item['video_duration'] = ph_info_json.get('video_duration')
        ph_item['video_title'] = ph_info_json.get('video_title')
        ph_item['image_url'] = ph_info_json.get('image_url')
        ph_item['link_url'] = ph_info_json.get('link_url')
        ph_item['quality_480p'] = ph_info_json.get('quality_480p')
        yield ph_item
Пример #4
0
 def parse(self, response):
     """Yield one item per video card on a category listing page, then
     follow the next-page link until ``self.count`` reaches 20.
     """
     cards = response.css("#videoCategory .wrap")
     for card in cards:
         entry = PornhubItem()
         entry["imageUrl"] = card.css(
             "img::attr(data-thumb_url)").extract()[0]
         entry["linkUrl"] = ("https://www.pornhub.com" +
                             card.css(".title a::attr(href)").extract()[0])
         entry["name"] = card.css(".title a::text").extract()[0]
         entry["playNum"] = card.css(
             ".videoDetailsBlock var::text").extract()[0]
         entry["recommendation"] = card.css(
             ".videoDetailsBlock .value::text").extract()[0]
         entry["time"] = card.css(".duration::text").extract()[0]
         yield entry
     next_href = response.css(".page_next a::attr(href)").extract()[0]
     next_page = "https://www.pornhub.com" + next_href
     # Follow at most 20 additional listing pages.
     if self.count < 20:
         self.count += 1
         yield scrapy.Request(next_page, callback=self.parse)
Пример #5
0
 def parse_ph_info(self, response):
     """Extract the flashvars JSON from a video page and yield a
     PornhubItem with duration, title, image URL, link URL and the
     480p stream URL.
     """
     ph_item = PornhubItem()
     selector = Selector(response)
     # logging.info(selector)
     ph_info = re.findall(r'var flashvars =(.*?),\n', selector.extract())
     logging.debug('PH信息的JSON:')
     logging.debug(ph_info)
     ph_info_json = json.loads(ph_info[0])
     duration = ph_info_json.get('video_duration')
     ph_item['video_duration'] = duration
     title = ph_info_json.get('video_title')
     ph_item['video_title'] = title
     image_url = ph_info_json.get('image_url')
     ph_item['image_url'] = image_url
     link_url = ph_info_json.get('link_url')
     ph_item['link_url'] = link_url
     ph_item['quality_480p'] = ph_info_json.get('quality_480p')
     # BUG FIX: lazy %-style logging args; the original concatenated the
     # values with '+', which raises TypeError whenever the JSON returns
     # a non-str value (e.g. an integer duration).
     logging.info('duration:%s title:%s image_url:%s link_url:%s',
                  duration, title, image_url, link_url)
     yield ph_item
Пример #6
0
 def video_page(self, response: HtmlResponse):
     """Parse a video watch page.

     If a "Watch Full Video" button is present, follow it (unless it is a
     paid "Buy Full Video" button); otherwise resolve the media URL from
     the player's inline JavaScript and yield a PornhubItem, skipping
     low-quality (240p/480p) streams.
     """
     # some video has "Watch Full Video" button
     full_video_button = response.css("#trailerFullLengthDownload")
     video_title = response.css('h1.title').css('span::text').get()
     video_channel = response.css('div.video-actions-container').css(
         'div.usernameWrap.clearfix').css('a::text').get()
     if full_video_button:
         button_title = full_video_button.css('::attr(data-title)').get()
         if button_title != 'Buy Full Video':
             full_url = full_video_button.css('::attr(href)').get()
             self.logger.info('%s detected full video, original name: %s',
                              video_channel, video_title)
             # High priority: fetch the full version ahead of queued pages.
             yield scrapy.Request(full_url,
                                  callback=self.video_page,
                                  priority=100)
         else:
             self.logger.info('%s detected buy video, drop', video_channel)
     else:
         self.logger.info('get model: %s, title: %s', video_channel,
                          video_title)
         player_id_element = response.css('#player')
         js = player_id_element.css('script').get()
         data_video_id = player_id_element.css(
             '::attr(data-video-id)').get()
         # Strip the <script> wrapper and the loader call, keeping only
         # the quality-list definitions for evaluation.
         prepare_js = js.split('<script type="text/javascript">')[1].split(
             'loadScriptUniqueId')[0]
         exec_js = '{0}\nqualityItems_{1};'.format(prepare_js,
                                                   data_video_id)
         js_result = js2py.eval_js(
             exec_js)  # type: js2py.base.JsObjectWrapper
         quality_items = js_result.to_list()  # type: list
         quality = quality_items[-1]['text']
         # BUG FIX: the original `quality != '240p' or quality != '"480p"'`
         # was always True (an `or` of two inequalities, one with stray
         # embedded quotes), so the low-quality filter never fired.
         if quality not in ('240p', '480p'):
             video_url = quality_items[-1]['url']
             yield PornhubItem(file_urls=video_url,
                               file_name=video_title,
                               file_channel=video_channel,
                               parent_url=response.url)
Пример #7
0
    def video_content(self, response):
        """Parse one video page into a PornhubItem.

        Pulls title/view-count/tags/rating/thumbnail via XPath, rebuilds
        the screenshot URL list from the player's "urlPattern", and picks
        the best available stream URL (720p > 480p > 240p).  Extraction
        failures degrade to None fields; a failed stream lookup is
        appended to 'erorr_request.txt'.
        """
        tag, duration = response.meta.get('item')

        item = PornhubItem()

        link_url = response.url

        def first_or_none(query):
            # Defensive wrapper: any extraction failure degrades to None
            # instead of aborting the whole page.  `except Exception`
            # replaces the original bare `except:` so SystemExit /
            # KeyboardInterrupt still propagate.
            try:
                return response.xpath(query).get()
            except Exception:
                return None

        title = first_or_none('//span[@class="inlineFree"]/text()')
        count = first_or_none('//span[@class="count"]/text()')

        try:
            video_tags = ','.join(
                response.xpath(
                    '//div[@class="categoriesWrapper"]/a//text()').getall())
        except Exception:
            video_tags = None

        percent = first_or_none('//span[@class="percent"]/text()')
        img_url = first_or_none('//meta[@property="og:image"]/@content')

        # Screenshot URLs: the player JSON exposes a pattern such as
        # ".../S{N}.jpg" where N is the number of thumbnails; substitute
        # each index.  NOTE(review): range(1, num) skips index `num` —
        # kept as-is pending confirmation of the site's numbering scheme.
        try:
            pattern = re.findall(
                '"urlPattern":"(.*?)","thumbHeig', response.text)[0]
            num = int(re.findall(r'S{(\d+)}', pattern)[0])
            prefix = pattern.split('S{')[0]
            video_screenshot_imgs = [
                prefix + 'S{}.jpg'.format(i) for i in range(1, num)
            ]
        except Exception:
            video_screenshot_imgs = None

        # 1080p requires login cookies (rewrite the middleware to carry
        # them if needed); downloading needs proper headers or a 403 is
        # returned.
        # BUG FIX: video_url was previously left unbound (NameError at the
        # item assignment below) when no quality marker matched and no
        # exception fired; default it to None.
        video_url = None
        try:
            for quality in ('720', '480', '240'):
                marker = '"quality":"%s"' % quality
                if marker in response.text:
                    video_url = re.findall(marker + ',"videoUrl":"(.*?)"},',
                                           response.text, re.S | re.I)[0]
                    break
        except Exception:
            # BUG FIX: format via str() so a None title no longer raises
            # TypeError inside the error handler itself.
            with open('erorr_request.txt', 'a') as f:
                f.write('{},{}\n'.format(title, link_url))

        item['tag'] = tag
        item['duration'] = duration
        item['title'] = title
        item['link_url'] = link_url
        item['count'] = count
        item['video_tags'] = video_tags
        item['percent'] = percent
        item['img_url'] = img_url
        item['video_screenshot_imgs'] = video_screenshot_imgs
        item['video_url'] = video_url

        yield item