Пример #1
0
    def parse(self, response):
        """Fill ``item`` from a Xigua feed response and probe popular videos in the browser.

        ``response.text`` is parsed as JSON; its ``data`` list is walked from
        the third element on, and each element's ``content`` field is itself a
        JSON string describing one video. For every video the shared ``item``
        (taken from ``response.meta['item']``) is overwritten, a canonical
        ixigua URL is derived from the group id found in ``display_url`` and
        its md5 hex digest is stored as ``item['osskey']``.

        When the view count or the comment count reaches the thresholds
        carried in ``item`` and ``Iduoliao.redis_check`` returns ``True``,
        the download URL is opened in ``self.broser`` and the ``src`` of the
        ``//video`` element, if visible, is printed.

        The browser is quit after the loop finishes. Any exception is logged
        together with the line number it was raised on; the browser is not
        quit on that path.
        """
        try:
            day_format = '%Y-%m-%d'
            item = response.meta['item']
            payload = json.loads(response.text)
            entries = payload['data']

            # The first two entries of the feed are skipped.
            for entry in entries[2:]:
                video = json.loads(entry['content'])
                item['id'] = video['group_id']
                display_url = video['display_url']
                item['download_url'] = video['display_url']
                item['like_cnt'] = video['video_like_count']
                item['cmt_cnt'] = video['comment_count']
                item['sha_cnt'] = video['share_count']
                item['view_cnt'] = video['video_detail_info']['video_watch_count']
                item['thumbnails'] = video['large_image_list'][0]['url']
                item['title'] = video['title']

                # 'video_play_info' is a nested JSON string; decode it once.
                first_stream = json.loads(
                    video['video_play_info'])['video_list']['video_1']
                item['video_height'] = first_stream['vheight']
                item['video_width'] = first_stream['vwidth']

                item['spider_time'] = time.strftime(day_format, time.localtime())
                item['from'] = '西瓜视频'
                # Self-assignment: stores nothing new, but the read fails if
                # 'category' was never set on the item.
                item['category'] = item['category']

                group_id = re.search(r'http://toutiao.com/group/(.*)/',
                                     display_url).group(1)
                item['url'] = 'https://www.ixigua.com/i' + group_id + '/'

                # The md5 of the canonical URL is the de-duplication key.
                item['osskey'] = hashlib.md5(
                    str(item['url']).encode()).hexdigest()

                popular_enough = (
                    item['view_cnt'] >= item['view_cnt_compare']
                    or item['cmt_cnt'] >= item['cmt_cnt_compare'])
                if not popular_enough:
                    continue
                if Iduoliao.redis_check(item['osskey']) is not True:
                    continue

                self.broser.get(item['download_url'])
                if self.is_visible('//video') is True:
                    video_src = self.broser.find_element_by_xpath(
                        '//video').get_attribute("src")
                    print(video_src)
            self.broser.quit()

        except Exception as f:
            Print.error(f)
            print('错误所在的行号:', f.__traceback__.tb_lineno)
Пример #2
0
    def parse(self, response):
        """Fetch the Piaoquan long-video list and hand every video to the uploader.

        The category list is requested with a POST whose form data comes from
        ``item['data']`` (``item`` is ``response.meta['item']``). For each
        returned video the shared ``item`` is overwritten, the m3u8 address is
        cut out of ``videoPath``, its md5 hex digest becomes
        ``item['osskey']`` and ``Iduoliao.upload`` is called.

        NOTE: the popularity filter and the redis de-duplication used by the
        sibling spiders are disabled here, so every listed video is uploaded.

        Any error, including a failed HTTP request, is logged through
        ``Print.error`` and ends the processing of this response.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']

        # Piaoquan endpoint returning the videos of one category.
        url = 'https://longvideoapi.qingqu.top/longvideoapi/video/distribute/category/videoList'
        try:
            # The request lives inside the try block so that a network
            # failure is logged like every other error of this spider.
            res = requests.post(
                url,
                headers=pq_headers,
                data=item['data'],
                timeout=30,
            )
            videos = json.loads(res.text)['data']
            for video in videos:
                # Keep everything up to and including the ".m3u8" suffix;
                # the dot is escaped so only a real extension matches.
                item['url'] = re.match(r'https://.*\.m3u8',
                                       video['videoPath']).group()
                item['download_url'] = ''
                item['like_cnt'] = 0
                item['cmt_cnt'] = 0
                item['sha_cnt'] = video['shareCount']
                item['view_cnt'] = video['playCount']
                item['thumbnails'] = video['coverImg']['coverImgPath']
                try:
                    item['title'] = video['title']
                except KeyError:
                    # Some entries only carry a share title.
                    item['title'] = video['shareTitle']

                item['id'] = video['id']
                item['video_height'] = video['height']
                item['video_width'] = video['width']
                item['spider_time'] = time.strftime(
                    isotimeformat, time.localtime(time.time()))
                item['from'] = '票圈长视频'
                # The md5 of the m3u8 address is used as the storage key.
                md = hashlib.md5()
                md.update(str(item['url']).encode())
                item['osskey'] = md.hexdigest()

                print(item)
                Iduoliao.upload(item['url'], item['thumbnails'],
                                item['osskey'], '票圈长视频', item['title'],
                                item['old_type'])
        except Exception as f:
            Print.error(f)
Пример #3
0
    def redis_check(md5_name):
        """Register ``md5_name`` in the ``spider`` sorted set and report whether it was new.

        Args:
            md5_name: De-duplication key (callers pass the md5 hex digest of
                a video URL).

        Returns:
            ``True`` when the key was not in the set and has just been added,
            ``False`` when it was already present or when redis could not be
            reached (the error is logged through ``Print.error``).
        """
        try:
            redis_db = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
            # ZADD with NX only inserts members that are missing and returns
            # how many were inserted, so the membership test and the insert
            # happen in one command instead of a racy ZRANK + ZADD pair.
            added = redis_db.zadd('spider', {md5_name: 10}, nx=True)
        except Exception as f:
            Print.error(f)
            return False

        if added:
            Print.info('添加 {} 到redis当中'.format(md5_name))
            return True
        return False
Пример #4
0
    def parse(self, response):
        """Fetch Xiaoniangao recommended trends and upload the popular, unseen videos.

        A POST built from ``item['data']`` (``item`` is
        ``response.meta['item']``) is sent through ``self.proxies``. For each
        entry of ``data.list`` the shared ``item`` is overwritten and the md5
        hex digest of the video URL is stored as ``item['osskey']``. A video
        is passed to ``Iduoliao.upload`` only when its view count reaches
        ``item['view_cnt_compare']`` and ``Iduoliao.redis_check`` returns
        ``True`` for its key.

        Any exception is logged through ``Print.error`` and ends the
        processing of this response.
        """
        day_format = '%Y-%m-%d'
        item = response.meta['item']
        api_url = 'https://api.xiaoniangao.cn/trends/get_recommend_trends'
        try:
            reply = requests.post(api_url,
                                  headers=xng_zf_headers,
                                  proxies=self.proxies,
                                  data=item['data'],
                                  timeout=30)
            listing = json.loads(reply.text)

            for video in listing['data']['list']:
                video_url = video['v_url']
                item['url'] = video_url
                item['download_url'] = video_url
                item['like_cnt'] = video['favor']['total']
                item['cmt_cnt'] = 0
                item['sha_cnt'] = 0
                item['view_cnt'] = video['views']
                item['thumbnails'] = video['url']
                item['title'] = video['title']
                item['id'] = video['album_id']
                # NOTE(review): height is read from 'vw' and width from 'w';
                # confirm against the API that 'vw' really is the height.
                item['video_height'] = video['vw']
                item['video_width'] = video['w']
                item['spider_time'] = time.strftime(day_format, time.localtime())
                item['from'] = '小年糕祝福'

                # The md5 of the video URL is the de-duplication key.
                item['osskey'] = hashlib.md5(
                    str(item['url']).encode()).hexdigest()

                # Skip videos below the view threshold.
                if not item['view_cnt'] >= item['view_cnt_compare']:
                    continue
                # Skip videos whose key is already registered in redis.
                if Iduoliao.redis_check(item['osskey']) is not True:
                    continue

                Iduoliao.upload(item['url'], item['thumbnails'],
                                item['osskey'], '小年糕祝福', item['title'],
                                item['old_type'])

        except Exception as f:
            Print.error('小年糕祝福爬虫错误:{}'.format(f))
Пример #5
0
    def tangdou(self, response):
        """Fill ``item`` from a Tangdou list response and upload the popular, unseen videos.

        ``response.text`` is parsed as JSON and ``datas.list`` is iterated.
        For each entry the shared ``item`` (``response.meta['item']``) is
        overwritten, the mp4 address is built from ``videourl`` and its md5
        hex digest is stored as ``item['osskey']``. A video is passed to
        ``Iduoliao.upload`` only when its view count reaches
        ``item['view_cnt_compare']`` and ``Iduoliao.redis_check`` returns
        ``True`` for its key.

        JSON decoding happens before the ``try`` block, so a malformed
        response raises to the caller; errors inside the loop are logged
        through ``Print.error``.
        """
        day_format = '%Y-%m-%d'
        item = response.meta['item']
        entries = json.loads(response.text)['datas']['list']
        try:
            for video in entries:
                mp4_url = 'http://aqiniu.tangdou.com/' + video['videourl'] + '-20.mp4'
                item['url'] = mp4_url
                item['download_url'] = mp4_url
                item['like_cnt'] = 0
                item['cmt_cnt'] = 0
                item['sha_cnt'] = 0
                item['view_cnt'] = video['hits_total']
                item['thumbnails'] = 'https://aimg.tangdou.com' + video['pic']
                item['title'] = video['title']
                item['id'] = video['vid']
                item['video_height'] = 0
                item['video_width'] = 0
                item['spider_time'] = time.strftime(day_format, time.localtime())
                item['from'] = '糖豆'
                # Self-assignment: stores nothing new, but the read fails if
                # 'category' was never set on the item.
                item['category'] = item['category']

                # The md5 of the video URL is the de-duplication key.
                item['osskey'] = hashlib.md5(
                    str(item['url']).encode()).hexdigest()

                # Skip videos below the view threshold.
                if not item['view_cnt'] >= item['view_cnt_compare']:
                    continue
                # Skip videos whose key is already registered in redis.
                if Iduoliao.redis_check(item['osskey']) is not True:
                    continue

                Iduoliao.upload(item['url'], item['thumbnails'],
                                item['osskey'], '糖豆', item['title'],
                                item['old_type'])

        except Exception as f:
            Print.error('糖豆爬虫错误:{}'.format(f))
Пример #6
0
    def parse(self, response):
        """Fetch the Haokan feed and upload the popular, unseen videos.

        A POST built from ``item['data']`` (``item`` is
        ``response.meta['item']``) is sent through ``self.proxies``. For each
        entry of ``feed.data.list`` the shared ``item`` is overwritten and the
        md5 hex digest of the download URL is stored as ``item['osskey']``.
        A video is passed to ``Iduoliao.upload`` when its view count or its
        comment count reaches the matching threshold carried in ``item`` and
        ``Iduoliao.redis_check`` returns ``True`` for its key.

        Any error, including a failed or timed-out HTTP request, is logged
        through ``Print.error`` and ends the processing of this response.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']

        url = 'https://sv.baidu.com/haokan/api?tn=1008350o&ctn=1008350o&imei=02B4B04B-2F2E-49DB-AF2D-AFFC79A3B0D2&cuid=3E8B5CD30DC5CF707754338AB6C6B1B408204C669OMPAQEKPQC&os=ios&osbranch=i0&ua=750_1334_326&ut=iPhone8%2C1_12.2&net_type=1&apiv=4.10.3.10&appv=1&version=4.10.3.10&life=1551235144&clife=1551235144&sids=2518_4-2540_1-2583_1-2627_2-2604_2-2635_1-2659_4-2665_2-2673_1-2685_1-2686_2-2691_2-2694_2-2697_2-2704_1-2717_3-2731_2-2732_4-2739_1-2743_2-2745_2-2498_1-2750_1-2753_1-2761_2-2772_1-2776_1-2782_2-2787_1-2796_1-2803_2&idfa=AB9793B9-CEE3-4EB2-9994-6DB2632BF4E6&hid=E0D63A86979B6633AB05F6AE72350416&log=vhk&location=&cmd=feed'

        try:
            # A timeout keeps a stalled connection from blocking the spider
            # forever; the request is inside the try block so that network
            # failures are logged instead of escaping.
            res = requests.post(url, headers=hk_headers, proxies=self.proxies,
                                data=item['data'], timeout=30)
            json_data = json.loads(res.text)
            video_info = json_data['feed']['data']['list']

            for video in video_info:
                content = video['content']
                item['url'] = ''
                item['download_url'] = content['video_src']
                item['like_cnt'] = content['praiseNum']
                item['cmt_cnt'] = content['comment_cnt']
                item['sha_cnt'] = 0
                item['view_cnt'] = content['playcnt']
                item['thumbnails'] = content['thumbnails']
                item['title'] = content['title']
                item['id'] = content['vid']
                item['video_height'] = content['height']
                item['video_width'] = content['width']
                item['spider_time'] = time.strftime(isotimeformat, time.localtime(time.time()))
                item['from'] = '好看视频'
                # item['url'] is empty for this source, so the md5 of the
                # download URL is used as the de-duplication key.
                md = hashlib.md5()
                md.update(str(item['download_url']).encode())
                item['osskey'] = md.hexdigest()

                # The comment threshold is compared with the comment count;
                # the share count is always 0 for this source and could
                # never meet a positive threshold.
                if item['view_cnt'] >= item['view_cnt_compare'] or item['cmt_cnt'] >= item['cmt_cnt_compare']:
                    is_ture = Iduoliao.redis_check(item['osskey'])
                    if is_ture is True:
                        Iduoliao.upload(item['download_url'], item['thumbnails'], item['osskey'], '好看视频', item['title'],
                                        item['old_type'])

        except Exception as f:
            Print.error(f)
Пример #7
0
    def upload(url, img_url, filename, videofrom, title, old_type):
        """Download one video according to its source site and strip its watermark where needed.

        Args:
            url: Address of the video (an m3u8 playlist for "票圈长视频").
            img_url: Address of the cover image.
            filename: Base name for downloaded files; the spiders in this
                file pass the md5 key of the video.
            videofrom: Source site name; selects the branch below. Unknown
                names do nothing.
            title: Video title, used in output file names.
            old_type: Original category, used in output directory names.

        Sources handled:
            * "西瓜视频", "糖豆", "小年糕", "好看视频": download the video and the
              cover, call ``IduoliaoTool.dewatermark`` and delete the
              temporary cover/size files. "糖豆" only processes videos wider
              than they are tall.
            * "票圈长视频": convert the playlist to mp4 with ffmpeg.
            * "开眼视频": stream the file to the ``Z:`` storage share.
            * "UC浏览器", "小年糕祝福": plain download without watermark removal.

        NOTE: the OSS upload that used to follow a successful
        ``IduoliaoTool.dewatermark`` call (``IduoliaoTool.oss_upload``) is
        disabled in this version, so the de-watermarked file is only produced
        locally.
        """
        import subprocess  # local import: only the ffmpeg branch needs it

        def _remove_temp_files(*paths):
            # Delete every scratch file that was actually produced; a failed
            # step leaves a falsy name, which must not reach os.path.exists.
            for path in paths:
                if path and os.path.exists(path):
                    os.remove(path)

        day = time.strftime('%Y-%m-%d', time.localtime(time.time()))

        # The four integers passed to dewatermark after width/height differ
        # per source; presumably they locate the watermark rectangle —
        # confirm against IduoliaoTool.dewatermark.
        if videofrom == "西瓜视频":
            new_filename = IduoliaoTool.video_download(filename, url, title, old_type, videofrom, ifdewatermark=True)
            size_filename, width, height = IduoliaoTool.get_video_size(url)
            img_filename = IduoliaoTool.img_download(img_url, filename)

            # Only run the tool when all three inputs are available.
            if new_filename and size_filename and img_filename:
                IduoliaoTool.dewatermark(width, height, 20, 200, 55, 204, new_filename, title, old_type, videofrom)
            _remove_temp_files(img_filename, size_filename)

        elif videofrom == "票圈长视频":
            target_dir = './{}/{}/{}'.format(videofrom, old_type, day)
            os.makedirs(target_dir, exist_ok=True)

            filename = './{}/{}/{}/{}'.format(videofrom, old_type, day, title) + '.mp4'

            # url and title come from scraped data: pass them as separate
            # arguments so no shell ever interprets them.
            try:
                subprocess.run(['ffmpeg', '-i', url, filename])
            except OSError as err:
                # ffmpeg missing or not executable.
                Print.error(err)

        elif videofrom in ("UC浏览器", "小年糕祝福"):
            IduoliaoTool.video_download(filename, url, title, old_type, videofrom, ifdewatermark=False)

        elif videofrom == "糖豆":
            size_filename, width, height = IduoliaoTool.get_video_size(url)
            img_filename = None
            # Only videos wider than they are tall are processed.
            if int(width) > int(height):
                new_filename = IduoliaoTool.video_download(filename, url, title, old_type, videofrom, ifdewatermark=True)
                img_filename = IduoliaoTool.img_download(img_url, filename)

                if new_filename and size_filename and img_filename:
                    IduoliaoTool.dewatermark(width, height, 10, 100, 50, 110, new_filename, title,
                                             old_type, videofrom)
            _remove_temp_files(img_filename, size_filename)

        elif videofrom == "开眼视频":
            target_dir = 'Z:\\爬虫储存\\爬虫储存1.0\\{}\\{}\\{}'.format(videofrom, old_type, day)
            os.makedirs(target_dir, exist_ok=True)

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.3.2.1000 Chrome/30.0.1599.101 Safari/537.36"}
            filename = 'Z:\\爬虫储存\\爬虫储存1.0\\{}\\{}\\{}\\{}'.format(videofrom, old_type, day, title) + '.mp4'

            # Stream the body in 1 KiB chunks so the whole video is never
            # held in memory; the timeout bounds a stalled connection.
            with closing(requests.get(url, stream=True, headers=headers, timeout=30)) as r:
                with open(filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        f.write(chunk)
            Print.info('下载视频: {}'.format(filename))

        elif videofrom == "小年糕":
            size_filename, width, height = IduoliaoTool.get_video_size(url)
            new_filename = IduoliaoTool.video_download(filename, url, title, old_type, videofrom,
                                                       ifdewatermark=True)
            img_filename = IduoliaoTool.img_download(img_url, filename)

            if new_filename and size_filename and img_filename:
                # The first position argument is measured from the bottom
                # edge of the frame (height - 70).
                IduoliaoTool.dewatermark(width, height, int(height) - 70, 100, 50, 120, new_filename, title,
                                         old_type, videofrom)
            _remove_temp_files(img_filename, size_filename)

        elif videofrom == "好看视频":
            new_filename = IduoliaoTool.video_download(filename, url, title, old_type, videofrom,
                                                       ifdewatermark=True)
            size_filename, width, height = IduoliaoTool.get_video_size(url)
            img_filename = IduoliaoTool.img_download(img_url, filename)

            if new_filename and size_filename and img_filename:
                IduoliaoTool.dewatermark(width, height, 10, 150, 50, 160, new_filename,
                                         title, old_type, videofrom)
            _remove_temp_files(img_filename, size_filename)
Пример #8
0
    def parse(self, response):
        """Store the videos of a UC Browser feed response in the MySQL ``Work`` table.

        ``response.text`` is parsed as JSON. Entries of ``data.items`` whose
        ``id`` is 20 characters long are looked up in ``data.articles``;
        articles whose first video has a truthy ``view_cnt`` are collected
        into ``item['video_datas']`` (``item`` is ``response.meta['item']``).

        Each collected video is inserted as a ``Work`` row unless a row with
        the same ``url_md5`` (md5 hex digest of the thumbnail URL) already
        exists. After each video the ``Url`` rows are marked with status
        ``"1"``.

        ``self.engine``, ``self.session`` and ``self.mySession`` are
        (re)assigned on every call. The session is closed when the method
        finishes, whether it succeeded or failed. Any exception is logged
        through ``Print.error``.
        """
        isotimeformat = '%Y-%m-%d'
        item = response.meta['item']
        db_session = None

        try:
            json_data = json.loads(response.text)
            article_ids = [
                entry['id'] for entry in json_data['data']['items']
                if len(entry['id']) == 20
            ]

            video_datas = []
            for article_id in article_ids:
                article = json_data['data']['articles'][article_id]
                first_video = article['videos'][0]
                # Articles whose first video has no views are skipped.
                if not first_video['view_cnt']:
                    continue
                video_datas.append({
                    'id': article['id'],
                    'url': article['url'],
                    'title': article['title'],
                    # The first category serves both as the category and as
                    # the original type.
                    'category': article['category'][0],
                    'old_type': article['category'][0],
                    'thumbnails': first_video['poster']['url'],
                    'video_width': first_video['video_width'],
                    'video_height': first_video['video_height'],
                    'view_cnt': first_video['view_cnt'],
                    'cmt_cnt': article['cmt_cnt'],
                    'from': 'UC浏览器',
                    'spider_time': time.strftime(isotimeformat, time.localtime(time.time())),
                })

            item['video_datas'] = video_datas
            # NOTE(review): a new engine is created on every call and the
            # credentials are embedded in the source.
            self.engine = create_engine(
                "mysql+pymysql://root:[email protected]/UC?charset=utf8")

            self.session = sessionmaker(self.engine)
            self.mySession = self.session()
            db_session = self.mySession

            for gzh_cids in item['video_datas']:
                # The md5 of the thumbnail URL identifies a stored video.
                url_md5 = hashlib.md5(
                    str(gzh_cids['thumbnails']).encode()).hexdigest()

                result = db_session.query(Work).filter_by(
                    url_md5=url_md5).first()
                if result is None:
                    print('添加视频:{}'.format(gzh_cids['title']))
                    db_session.add(Work(url=gzh_cids['url'],
                                        thumbnails=gzh_cids['thumbnails'],
                                        title=gzh_cids['title'],
                                        url_md5=url_md5,
                                        video_height=gzh_cids['video_height'],
                                        video_width=gzh_cids['video_width'],
                                        status=0))
                    db_session.commit()
                else:
                    pprint('视频已存在')

                db_session.query(Url).filter(Url.id == item['id']).update(
                    {"status": "1"})
                db_session.commit()

                # NOTE(review): this marks every Url row with id < 1000000,
                # not only the one just processed, and it runs once per
                # video — confirm this is intended.
                db_session.query(Url).filter(Url.id < 1000000).update(
                    {"status": "1"})
                db_session.commit()

        except Exception as f:
            Print.error('UC浏览器爬虫错误:{}'.format(f))

        finally:
            # Release the connection (and any unfinished transaction) so
            # repeated calls do not accumulate open sessions.
            if db_session is not None:
                db_session.close()
Пример #9
0
    def parse(self, response):
        """Fill ``item`` from a Xigua feed response and resolve popular videos through the browser page.

        ``response.text`` is parsed as JSON; its ``data`` list is walked from
        the third element on, and each element's ``content`` field is itself a
        JSON string describing one video. For every video the shared ``item``
        (taken from ``response.meta['item']``) is overwritten, a canonical
        ixigua URL is derived from the group id found in ``display_url`` and
        its md5 hex digest is stored as ``item['osskey']``.

        When the view count or the comment count reaches the thresholds
        carried in ``item`` and ``Iduoliao.redis_check`` returns ``True``,
        the ixigua URL is typed into ``self.url_box`` and the page loaded in
        ``self.broser`` is asked to resolve it (presumably a third-party
        link-resolving page — confirm against the spider setup). If a result
        link appears, its ``href`` is passed to ``Iduoliao.upload``.

        The browser is quit after the loop finishes. An exception outside the
        per-video browser step is logged with its line number, after which a
        possibly open dialog is dismissed; the browser is not quit on that
        path.
        """
        try:
            isotimeformat = '%Y-%m-%d'
            item = response.meta['item']
            json_data = json.loads(response.text)
            video_info = json_data['data']

            # The first two entries of the feed are skipped.
            for video in video_info[2:]:
                # 'content' is a JSON string; the loop variable is rebound to
                # the decoded dict.
                video = json.loads(video['content'])
                item['id'] = video['group_id']
                url = video['display_url']
                item['download_url'] = video['display_url']
                item['like_cnt'] = video['video_like_count']
                item['cmt_cnt'] = video['comment_count']
                item['sha_cnt'] = video['share_count']
                item['view_cnt'] = video['video_detail_info']['video_watch_count']
                item['thumbnails'] = video['large_image_list'][0]['url']
                item['title'] = video['title']
                # 'video_play_info' is a nested JSON string.
                item['video_height'] = json.loads(video['video_play_info'])['video_list']['video_1']['vheight']
                item['video_width'] = json.loads(video['video_play_info'])['video_list']['video_1']['vwidth']
                item['spider_time'] = time.strftime(isotimeformat, time.localtime(time.time()))
                item['from'] = '西瓜视频'
                # Self-assignment: stores nothing new, but the read fails if
                # 'category' was never set on the item.
                item['category'] = item['category']
                # Extract the group id from the toutiao display URL.
                rep = re.search(r'http://toutiao.com/group/(.*)/', url).group(1)
                item['url'] = 'https://www.ixigua.com/i' + rep + '/'

                md = hashlib.md5()  # build an md5 hasher
                md.update(str(item['url']).encode())
                item['osskey'] = md.hexdigest()

                if item['view_cnt'] >= item['view_cnt_compare'] or item['cmt_cnt'] >= item['cmt_cnt_compare']:
                    is_ture = Iduoliao.redis_check(item['osskey'])
                    if is_ture is True:
                        try:
                            # Type the address to be resolved
                            self.url_box.send_keys(item['url'])
                            # Click the resolve button
                            click_button = self.broser.find_element_by_css_selector('[class="nya-btn"]')
                            click_button.click()

                            # Check whether a "resolve failed" dialog appeared
                            exists = self.is_visible('//*[@id="__layout"]/div/div[1]/div/div[2]/div[2]/button')
                            if exists is True:
                                # Dismiss the dialog and empty the input box
                                click_button = self.broser.find_element_by_css_selector('[class="vue-dialog-button"]')
                                click_button.click()
                                self.url_box.clear()

                            # Check whether a result was obtained
                            exists = self.is_visible('//*[@id="__layout"]/div/main/div[3]/fieldset[2]/legend/span')
                            if exists is True:
                                url = self.broser.find_element_by_xpath(
                                    '//*[@id="__layout"]/div/main/div[3]/fieldset[2]/div/p/a').get_attribute('href')

                                # Start watermark removal and upload
                                Iduoliao.upload(url, item['thumbnails'], item['osskey'], '西瓜视频', item['title'], item['old_type'])
                            self.url_box.clear()
                        except Exception as f:
                            # A failure of one video's browser step is only
                            # printed; the loop continues with the next video.
                            print(f)
            self.broser.quit()

        except Exception as f:
            Print.error(f)
            print('错误所在的行号:', f.__traceback__.tb_lineno)
            # Check whether a "resolve failed" dialog is showing
            exists = self.is_visible('//*[@id="__layout"]/div/div[2]/div/div[2]/div[1]/div[2]')
            if exists is True:
                click_button = self.broser.find_element_by_css_selector('[class="vue-dialog-button"]')
                click_button.click()
                self.url_box.clear()
            pass