Example #1
class Sina_mobile(Crawler):
    NAME = 'sina_mobile'
    HEADERS = makeUTF8({
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    })
    HEADERS_NO_UTF8 = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }

    def normalize_url(self, link):
        return link

    def unique_id(self, link):
        for item in link.split('/'):
            if item.isdigit():
                return item

    def expand_url(self, num):
        return "https://m.weibo.cn/detail/" + str(num)

    def run(self, content, link):  #content = page.text
        soup = BeautifulSoup(content, "lxml")
        data = str(soup.select("body  script")[0]).split(
            'var $render_data = [')[1].split('][0]')[0]
        status = json.loads(data)["status"]
        html = status["text"]
        soup = BeautifulSoup(html, "lxml")
        a_list = soup.findAll('a')
        text = html_to_plain_text(html)
        HYPERLINK = []
        for a in a_list:
            if 'm.weibo.cn/search?' in (a.get('href') or ''):  # guard: <a> may lack href
                HYPERLINK.append(a.get('href'))
        for url in HYPERLINK:
            text = text.replace('HYPERLINK', url, 1)
        return makeResponseSuccess({
            "unique_id": "weibo:%s" % status["id"],
            'uploadDate': status["created_at"],  # e.g. Tue Feb 18 02:48:31 +0800 2020
            'users': status["user"]["screen_name"],
            'thumbnailURL': status["page_info"]["page_pic"]["url"],
            'title': status["page_info"]["title"],
            #'stream_url_hd': status["page_info"]["media_info"]["stream_url_hd"],
            'desc': text  # plain text with hyperlinks substituted back in
        })

    async def run_async(self, content, link):
        return self.run(self=self, content=content, link=link)
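The 'HYPERLINK' replacement above implies that html_to_plain_text leaves a literal HYPERLINK placeholder wherever an <a> tag appeared. That helper is not shown in these examples; a minimal sketch consistent with the call site, assuming BeautifulSoup is available:

from bs4 import BeautifulSoup

def html_to_plain_text(html):
    # hypothetical reconstruction: replace each <a> tag with the literal
    # placeholder 'HYPERLINK', then flatten the markup to plain text, so the
    # caller can substitute the collected hrefs back in order
    soup = BeautifulSoup(html, "lxml")
    for a in soup.find_all('a'):
        a.replace_with('HYPERLINK')
    return soup.get_text()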
Example #2
class SinaMobile(Crawler):
	NAME = 'weibo-mobile'
	SHORT_PATTERN = r''
	PATTERN = r'^(https:\/\/|http:\/\/)?m\.weibo\.(com|cn)\/detail\/(\d+)'
	HEADERS = makeUTF8({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', })
	HEADERS_NO_UTF8 = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', }

	def normalize_url( self, link ) :
		ret = re.search(self.PATTERN, link)
		vid = ret.group(3)
		return f"https://m.weibo.cn/detail/{vid}"

	def unique_id(self, link):
		ret = re.search(self.PATTERN, link)
		vid = ret.group(3)
		return f'weibo-mobile:{vid}'

	def expand_url(self, num):
		return "https://m.weibo.cn/detail/" + str(num)

	def run(self, content, xpath, link, update_video_detail):  # content = page.text
		soup = BeautifulSoup(content, "lxml")
		data = str(soup.select("body script")[0]).split('var $render_data = [')[1].split('][0]')[0]
		status = json.loads(data)["status"]
		html = status["text"]
		soup = BeautifulSoup(html, "lxml")
		a_list = soup.findAll('a')
		text = html_to_plain_text(html)
		HYPERLINK = []
		for a in a_list:
			if 'm.weibo.cn/search?' in (a.get('href') or ''):  # guard: <a> may lack href
				HYPERLINK.append(a.get('href'))
		for url in HYPERLINK:
			text = text.replace('HYPERLINK', url, 1)
		return makeResponseSuccess({
			"unique_id": self.unique_id(self=self, link=link),
			'uploadDate': parse(status["created_at"]).astimezone(timezone.utc),#Tue Feb 18 02:48:31 +0800 2020
			'thumbnailURL': status["page_info"]["page_pic"]["url"],
			'title': status["page_info"]["title"],
			'site': 'weibo-mobile',
			'desc': text,  # plain text with hyperlinks substituted back in
			'utags': []
			})

	async def unique_id_async( self, link ) :
		return self.unique_id(self = self, link = link)

	async def run_async(self, content, xpath, link, update_video_detail):
		return self.run(self=self, content=content, xpath=xpath, link=link, update_video_detail=update_video_detail)
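A note on the calling convention used throughout these examples: run_async invokes self.run(self=self, ...) and unique_id_async invokes self.unique_id(self=self, ...). That only works if the crawler is registered and used as a class object rather than an instance, because in Python 3 a function looked up on a class is a plain function whose self parameter can be bound by keyword. A minimal sketch under that assumption (the URL is a made-up example):

crawler = SinaMobile  # hypothetical registry entry: the class itself, not an instance
link = 'https://m.weibo.cn/detail/4471875535371823'  # illustrative URL
print(crawler.normalize_url(self=crawler, link=link))
print(crawler.unique_id(self=crawler, link=link))  # weibo-mobile:4471875535371823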
Example #3
class Zcool(Crawler):
    NAME = 'zcool'
    PATTERN = r'^https:\/\/www\.zcool\.com\.cn\/work\/[0-9a-zA-Z=]*\.html'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer':
        'https://www.zcool.com.cn/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.zcool.com.cn/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    DESC_REGEX_OBJ = re.compile(r"share_description\s*=\s*\'(.*)\'\s*;",
                                re.MULTILINE)
    COVER_REGEX_OBJ = re.compile(
        r'share_description_split,\s*title:\s*\".*\",\s*pic:\s*\"(.*)\"',
        re.MULTILINE)
    UID_REGEX_OBJ = re.compile(
        r"^https:\/\/www\.zcool\.com\.cn\/work\/([0-9a-zA-Z=]*)\.html",
        re.MULTILINE)

    def normalize_url(self, link):
        return link

    def expand_url(self, short):
        return short

    def unique_id(self, link):
        return 'zcool:%s' % self.UID_REGEX_OBJ.search(link).group(1)

    def run(self, content, xpath, link, update_video_detail):
        if 'J_prismPlayer0' not in content:
            return makeResponseFailed('NOT_ZCOOL_VIDEO')
        zcool_id = self.UID_REGEX_OBJ.search(link).group(1)
        title = xpath.xpath('//span[@class="fw-bold"]/text()')[0]

        desc = self.DESC_REGEX_OBJ.search(content).group(1)
        desc = desc.replace('<br>', '\n')

        upload_time = xpath.xpath('//p[@class="title-time"]/@title')[0].split(
            ':')[-1]
        upload_time = parse(upload_time) - timedelta(hours=8)  # convert from Beijing time to UTC

        cover = self.COVER_REGEX_OBJ.search(content).group(1)
        cover = cover.split('|')[0].strip().split('@')[0]

        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': title,
            'desc': desc,
            'site': 'zcool',
            'uploadDate': upload_time,
            "unique_id": "zcool:%s" % zcool_id,
            "utags": []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self,
                        content=content,
                        xpath=xpath,
                        link=link,
                        update_video_detail=update_video_detail)
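The fixed eight-hour subtraction above (also used by the bilibili and acfun examples below) assumes the page always reports Beijing time and yields a naive datetime. A timezone-aware alternative sketch, assuming parse is dateutil.parser.parse as the call sites suggest (Python 3.9+ for zoneinfo):

from datetime import timezone
from zoneinfo import ZoneInfo
from dateutil.parser import parse

def beijing_to_utc(ts):
    # attach Asia/Shanghai explicitly, then convert; keeps the result tz-aware
    return parse(ts).replace(tzinfo=ZoneInfo('Asia/Shanghai')).astimezone(timezone.utc)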
Example #4
class Bilibili( Crawler ) :
	NAME = 'bilibili'
	PATTERN = r'^(https:\/\/|http:\/\/)?((www|m)\.)?(bilibili\.com\/video\/[aA][vV][\d]+|b23\.tv\/[aA][vV][\d]+)'
	SHORT_PATTERN = r'^[aA][Vv][\d]+$'
	HEADERS = makeUTF8( { 'Referer' : 'https://www.bilibili.com/', 'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"' } )
	HEADERS_NO_UTF8 = { 'Referer' : 'https://www.bilibili.com/', 'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"' }

	def get_cookie(self) :
		return {
			'SESSDATA' : Config.BILICOOKIE_SESSDATA,
			'bili_jct' : Config.BILICOOKIE_bili_jct
		}

	def normalize_url( self, link ) :
		link = link.lower()
		return "https://www.bilibili.com/video/" + link[link.rfind("av"):]

	def expand_url( self, short ) :
		return "https://www.bilibili.com/video/" + short.lower()

	def unique_id( self, link ) :
		link = link.lower()
		return 'bilibili:%s' % link[link.rfind("av"):]
	
	def run( self, content, xpath, link, update_video_detail ) :
		raise NotImplementedError()

	async def unique_id_async( self, link ) :
		return self.unique_id(self = self, link = link)
		
	async def run_async(self, content, xpath, link, update_video_detail) :
		link = link.lower()
		vidid = link[link.rfind("av"):]
		if False :  # intentionally disabled fallback path
			# use biliplus to try to recover metadata for a deleted video
			api_url = f"https://www.biliplus.com/api/view?id={vidid[2:]}"
			async with aiohttp.ClientSession() as session:
				async with session.get(api_url) as resp:
					if resp.status == 200 :
						apirespond = await resp.text()
			respond_json = loads(apirespond)
			if 'code' in respond_json and respond_json['code'] == -404 :
				raise Exception('Video not found in biliplus, it is gone forever 😭')
			thumbnailURL = respond_json['pic']
			title = respond_json['title']
			desc = respond_json['description']
			uploadDate = parse(respond_json['created_at']) - timedelta(hours = 8) # convert from Beijing time to UTC
			utags = respond_json['tag']
			return makeResponseSuccess({
				'thumbnailURL': thumbnailURL,
				'title' : title,
				'desc' : desc,
				'site': 'bilibili',
				'uploadDate' : uploadDate,
				"unique_id": "bilibili:%s" % vidid,
				"utags": utags
			})
		try :
			thumbnailURL = xpath.xpath( '//meta[@itemprop="thumbnailUrl"]/@content' )[0]
			title = xpath.xpath( '//h1[@class="video-title"]/@title' )[0]
			desc = getInnerText(xpath.xpath( '//div[@class="info open"]/node()' ))
			uploadDate = parse(xpath.xpath( '//meta[@itemprop="uploadDate"]/@content' )[0]) - timedelta(hours = 8) # convert from Beijing time to UTC
			utags = xpath.xpath( '//meta[@itemprop="keywords"]/@content' )[0]
			utags = list(filter(None, utags.split(',')[1: -4]))
		except :
			return makeResponseSuccess({
				'thumbnailURL': '',
				'title' : '已失效视频',
				'desc' : '已失效视频',
				'site': 'bilibili',
				'uploadDate' : datetime.now(),
				"unique_id": "bilibili:%s" % vidid,
				"utags": [],
				"placeholder": True
			})
		return makeResponseSuccess({
			'thumbnailURL': thumbnailURL,
			'title' : title,
			'desc' : desc,
			'site': 'bilibili',
			'uploadDate' : uploadDate,
			"unique_id": "bilibili:%s" % vidid,
			"utags": utags
		})
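getInnerText is used here (and in Example #10) to flatten the description node list, but it is not defined in these examples. A hypothetical reconstruction inferred from the call sites, which pass the result of an xpath('.../node()') query:

def getInnerText(nodes):
    # assumed behavior: flatten a mix of lxml text nodes and elements into
    # plain text, rendering <br> elements as line breaks
    parts = []
    for n in nodes:
        if isinstance(n, str):  # xpath text nodes are (smart) strings
            parts.append(str(n))
        elif getattr(n, 'tag', None) == 'br':
            parts.append('\n')
        elif hasattr(n, 'text_content'):
            parts.append(n.text_content())
    return ''.join(parts)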
Example #5
class Youtube(Crawler):
    NAME = 'youtube'
    PATTERN = r'^((https:\/\/)?(www\.|m\.)?youtube\.com\/watch\?v=[-\w]+|(https:\/\/)?youtu\.be\/(watch\?v=[-\w]+|[-\w]+))'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer':
        'https://www.youtube.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.youtube.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    API_KEYs = os.getenv('GOOGLE_API_KEYs', "").split(',')  # note: unused below; run() and run_async() read Config.YOUTUBE_API_KEYS instead

    def normalize_url(self, link):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]
        return "https://www.youtube.com/watch?v=" + vidid

    def expand_url(self, short):
        return short

    def unique_id(self, link):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]
        return "youtube:%s" % vidid

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    def run(self, content, xpath, link):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]

        for key in Config.YOUTUBE_API_KEYS.split(","):
            api_url = "https://www.googleapis.com/youtube/v3/videos?id=" + vidid + "&key=" + key + "&part=snippet,contentDetails,statistics,status"
            apirespond = requests.get(api_url)  # fetch the API response
            if apirespond.status_code == 200:
                break
            else:
                log_ne(op='youtube_run',
                       level='WARN',
                       obj={
                           'msg': 'FETCH_FAILED',
                           'key': key,
                           'resp': apirespond.content,
                           'url': api_url
                       })

        player_response = apirespond.json()['items'][0]['snippet']
        publishedAt_time = player_response['publishedAt']
        uploadDate = parse(publishedAt_time).astimezone(
            timezone.utc)  # upload time, e.g. 2019-04-27 04:58:45+00:00

        title = player_response['title']  # title
        desc = player_response['description']  # description
        thumbnailURL = player_response['thumbnails']['medium']['url']  # thumbnail URL, size: 320x180
        utags = player_response['tags'] if 'tags' in player_response else []

        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'youtube',
            'uploadDate': uploadDate,
            "unique_id": "youtube:%s" % vidid,
            "utags": utags
        })

    async def run_async(self, content, xpath, link, update_video_detail):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]

        keys = Config.YOUTUBE_API_KEYS.split(",")
        while keys:
            key = random.choice(keys)
            api_url = "https://www.googleapis.com/youtube/v3/videos?id=" + vidid + "&key=" + key + "&part=snippet,contentDetails,statistics,status"
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url,
                                       headers=self.HEADERS_NO_UTF8) as resp:
                    apirespond = await resp.text()
                    if resp.status == 200:
                        break
                    else:
                        log_ne(op='youtube_run_async',
                               level='WARN',
                               obj={
                                   'msg': 'FETCH_FAILED',
                                   'key': key,
                                   'resp': apirespond,
                                   'url': api_url
                               })
            keys.remove(key)

        player_response = loads(apirespond)['items'][0]['snippet']
        publishedAt_time = player_response['publishedAt']
        uploadDate = parse(publishedAt_time).astimezone(timezone.utc)

        title = player_response['title']
        desc = player_response['description']
        thumbnailURL = player_response['thumbnails']['medium']['url']
        utags = player_response['tags'] if 'tags' in player_response else []

        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'youtube',
            'uploadDate': uploadDate,
            "unique_id": "youtube:%s" % vidid,
            "utags": utags
        })
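Both run() and run_async() build the API URL by string concatenation. A sketch of the same URL built with urllib.parse.urlencode, which stays correct if a parameter ever needs escaping:

from urllib.parse import urlencode

def build_videos_url(vidid, key):
    params = {
        'id': vidid,
        'key': key,
        'part': 'snippet,contentDetails,statistics,status',
    }
    return 'https://www.googleapis.com/youtube/v3/videos?' + urlencode(params)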
Example #6
class Acfun(Crawler):
    NAME = 'acfun'
    PATTERN = r'^(https:\/\/|http:\/\/)?(www\.)?acfun\.cn\/v\/[aA][cC][\d]+'
    SHORT_PATTERN = r'^[aA][cC][\d]+$'
    HEADERS = makeUTF8({
        'Referer':
        'https://www.acfun.cn/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.acfun.cn/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    THUMBNAIL_URL = re.compile(r'"coverUrl":"([\d\w:\/\-\.]+)\?')
    #THUMBNAIL_URL = re.compile(r'https:\/\/imgs\.aixifan\.com\/[\w\-]{8,}')
    THUMBNAIL_URL_2 = re.compile(
        r'https:\/\/cdn\.aixifan\.com\/dotnet\/[\/\w]+\.(jpg|png|jpeg)')
    EXTRACT_NUM = re.compile(r'^[\d]+')

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.acfun.cn/v/" + link[link.rfind("ac"):]

    def expand_url(self, short):
        return "https://www.acfun.cn/v/" + short.lower()

    def unique_id(self, link):
        link = link.lower()
        return 'acfun:%s' % link[link.rfind("ac"):]

    def run(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("ac"):]
        thumbnailURL = self.THUMBNAIL_URL.search(content)
        if thumbnailURL:
            thumbnailURL = thumbnailURL.group(1)
            #thumbnailURL = thumbnailURL[0]
        else:
            thumbnailURL = self.THUMBNAIL_URL_2.search(content)
            if thumbnailURL:
                thumbnailURL = thumbnailURL[0]
            else:
                thumbnailURL = ''
        title = xpath.xpath('//h1[@class="title"]/text()')[0]
        desc = try_get_xpath(xpath, [
            '//div[@class="description-container"]/text()',
            '//div[@class="J_description"]/text()',
            '//div[@class="sp1 J_description"]/text()'
        ])[0]
        desc = re.sub(r'<br\s*?\/?>', '\n', desc)
        uploadDate = xpath.xpath('//div[@class="publish-time"]/text()')[0]
        utags = xpath.xpath('//meta[@name="keywords"]/@content')[0]
        utags = list(filter(None, utags.split(',')[1:-4]))
        try:
            uploadDate = parse(uploadDate) - timedelta(hours=8)
        except:
            hrs_prior = self.EXTRACT_NUM.match(uploadDate)
            if hrs_prior:
                hrs_prior = int(hrs_prior.group(0))
            else:
                hrs_prior = 0
            uploadDate = datetime.utcnow() - timedelta(hours=hrs_prior)
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'acfun',
            'uploadDate': uploadDate,
            "unique_id": "acfun:%s" % vidid,
            "utags": utags
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self,
                        content=content,
                        xpath=xpath,
                        link=link,
                        update_video_detail=update_video_detail)
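try_get_xpath is used by this crawler and the nicovideo ones but is not defined in these examples. A hypothetical reconstruction inferred from the call sites, some of which pass a (query, post_process) tuple and some of which expect an empty result rather than an exception when nothing matches:

def try_get_xpath(tree, queries):
    # assumed behavior: try each XPath query in order and return the first
    # non-empty result list; a (query, post_process) tuple runs the raw
    # result list through post_process first
    for q in queries:
        post = None
        if isinstance(q, tuple):
            q, post = q
        ret = tree.xpath(q)
        if ret:
            return post(ret) if post else ret
    return []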
Example #7
class BilibiliAudio(Crawler):
    NAME = 'bilibili_audio'
    PATTERN = r'^(http(s)?:\/\/)?(www\.)?bilibili\.com\/audio\/au(\d+)'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer':
        'https://www.bilibili.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.bilibili.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }

    def normalize_url(self, link):
        url_result = re.search(self.PATTERN, link)
        if url_result:
            return f'https://www.bilibili.com/audio/au{url_result.group(4)}'
        else:
            return 'https://www.bilibili.com/audio/au0'

    def unique_id(self, link):
        url_result = re.search(self.PATTERN, link)
        if url_result:
            return f'bilibili_audio:{url_result.group(4)}'
        else:
            return ''

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        url_result = re.search(self.PATTERN, link)
        if url_result:
            auid = url_result.group(4)
        else:
            raise NotImplementedError()
        api_url = f'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={auid}'
        async with aiohttp.ClientSession() as session:
            async with session.get(api_url) as resp:
                api_resp = await resp.json()
        if api_resp['code'] == 0:
            api_resp = api_resp['data']
            thumbnailURL = api_resp['cover']
            title = api_resp['title']
            desc = api_resp['intro']
            uploadDate = datetime.fromtimestamp(
                api_resp['passtime'], tz=timezone.utc)  # read the unix timestamp as UTC, independent of host timezone
            uid = f'bilibili_audio:{auid}'
            utags = []
            user_space_urls = [f'https://space.bilibili.com/{api_resp["uid"]}']
            return makeResponseSuccess({
                'thumbnailURL': thumbnailURL,
                'title': title,
                'desc': desc,
                'site': 'bilibili_audio',
                'uploadDate': uploadDate,
                "unique_id": uid,
                "utags": utags,
                "user_space_urls": user_space_urls,
                'extra': {
                    'vip_info': api_resp['vipInfo']
                }
            })
        else:
            raise UserError(f'Bilibili API resp code = {api_resp["code"]}')
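datetime.fromtimestamp(ts).astimezone(timezone.utc) first interprets the unix timestamp in the host machine's local timezone and then converts; passing tz= reads it as UTC directly and never produces a naive intermediate. A one-line check with an arbitrary example timestamp:

from datetime import datetime, timezone

ts = 1614899329  # illustrative unix timestamp
print(datetime.fromtimestamp(ts, tz=timezone.utc).isoformat())  # 2021-03-04T23:08:49+00:00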
Example #8
class Twitter(Crawler):
    NAME = 'twitter'
    PATTERN = r'^(https:\/\/)?(www\.|mobile\.)?twitter\.com\/[\w]+\/status\/[\d]+'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  # noqa
        'Accept-Charset':
        'UTF-8,*;q=0.5',
        'Accept-Encoding':
        'gzip,deflate,sdch',
        'Accept-Language':
        'en-US,en;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',  # noqa
    })
    HEADERS_NO_UTF8 = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  # noqa
        'Accept-Charset':
        'UTF-8,*;q=0.5',
        'Accept-Encoding':
        'gzip,deflate,sdch',
        'Accept-Language':
        'en-US,en;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',  # noqa
    }

    def normalize_url(self, link):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link)
        return "https://twitter.com/i/status/" + item_id

    def unique_id(self, link):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link)
        return "twitter:%s" % item_id

    def run(self, content, xpath, link):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        screen_name = r1(r'twitter\.com/([^/]+)', link) or r1(r'data-screen-name="([^"]*)"', content) or \
         r1(r'<meta name="twitter:title" content="([^"]*)"', content)
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link) or r1(r'data-item-id="([^"]*)"', content) or \
         r1(r'<meta name="twitter:site:id" content="([^"]*)"', content)

        authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'

        ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
        ga_content = post_content(ga_url,
                                  headers={'authorization': authorization})
        guest_token = json.loads(ga_content)['guest_token']

        api_url = 'https://api.twitter.com/1.1/statuses/show.json?id=%s' % item_id
        api_content = get_content(api_url,
                                  headers={
                                      'authorization': authorization,
                                      'x-guest-token': guest_token
                                  })

        info = json.loads(api_content)
        if 'extended_entities' not in info:
            return makeResponseFailed('Not a twitter video')
        desc = info['text']
        cover = info['extended_entities']['media'][0]['media_url']
        user_name = info['user']['name']
        screen_name = info['user']['screen_name']
        uploadDate = parse(info['created_at']).astimezone(timezone.utc)

        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': '%s @%s' % (user_name, screen_name),
            'desc': desc,
            'site': 'twitter',
            'uploadDate': uploadDate,
            "unique_id": "twitter:%s" % item_id,
            "url_overwrite":
            f'https://twitter.com/{screen_name}/status/{item_id}',
            "utags": []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        screen_name = r1(r'twitter\.com/([^/]+)', link) or r1(r'data-screen-name="([^"]*)"', content) or \
         r1(r'<meta name="twitter:title" content="([^"]*)"', content)
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link) or r1(r'data-item-id="([^"]*)"', content) or \
         r1(r'<meta name="twitter:site:id" content="([^"]*)"', content)

        authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'

        ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
        async with aiohttp.ClientSession() as session:
            async with session.post(ga_url,
                                    headers={'authorization':
                                             authorization}) as resp:
                ga_content = await resp.text()
            guest_token = json.loads(ga_content)['guest_token']
            api_url = 'https://api.twitter.com/1.1/statuses/show.json?id=%s' % item_id
            async with session.get(api_url,
                                   headers={
                                       'authorization': authorization,
                                       'x-guest-token': guest_token
                                   }) as resp:
                api_content = await resp.text()

        info = json.loads(api_content)
        if 'extended_entities' not in info:
            return makeResponseFailed('Not a twitter video')
        desc = info['text']
        cover = info['extended_entities']['media'][0]['media_url']
        user_name = info['user']['name']
        screen_name = info['user']['screen_name']
        uploadDate = parse(info['created_at']).astimezone(timezone.utc)

        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': f'{user_name} @{screen_name}',
            'desc': desc,
            'site': 'twitter',
            'uploadDate': uploadDate,
            "unique_id": "twitter:%s" % item_id,
            "url_overwrite":
            f'https://twitter.com/{screen_name}/status/{item_id}',
            "utags": []
        })
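r1 and match1 are used here but not defined in these examples; they appear to come from you-get's common helpers. Hypothetical reconstructions matching how they are called above:

import re

def r1(pattern, text):
    # first capture group of the first match, or None
    m = re.search(pattern, text)
    return m.group(1) if m else None

def match1(text, pattern):
    # same contract, with (text, pattern) argument order
    m = re.search(pattern, text)
    return m.group(1) if m else None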
Example #9
class Nicovideo(Crawler):
    NAME = 'nicovideo'
    PATTERN = r'^(https:\/\/|http:\/\/)?(www\.|sp\.)?(nicovideo\.jp\/watch\/(s|n)m[\d]+|nico\.ms\/(s|n)m[\d]+)'
    SHORT_PATTERN = r'^(s|n)m[\d]+$'
    HEADERS = makeUTF8({
        'Referer':
        'https://www.nicovideo.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.nicovideo.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    THUMBNAIL_PATTERN = r'\"(https:\\\\/\\\\/img\.cdn\.nimg\.jp\\\\/s\\\\/nicovideo\\\\/thumbnails\\\\/\d+\\\\/\d+\.\w+\\\\/\w+\?key=\w+)\"'
    USER_ID_MATCHER = r"www\.nicovideo\.jp\\\/user\\\/([\d]+)"

    #def get_cookie(self) :
    #	return {
    #		'user_session': 'user_session_69318161_02257179b85d2430deb42ca8763071423671fbf8f531ddcf43185de2e376f686',
    #		'user_session_secure': 'NjkzMTgxNjE6ZXIwOW4yM29YdXUueHFCY2d0Qk5mZHlvOVNROGpjTjV1emRaWFRHZDJqMQ',
    #	}

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.nicovideo.jp/watch/" + link[link.rfind("m") - 1:]

    def expand_url(self, short):
        return "https://www.nicovideo.jp/watch/" + short

    def unique_id(self, link):
        link = link.lower()
        return "nicovideo:%s" % link[link.rfind("m") - 1:]

    def run(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("m") - 1:]
        thumbnailURL = try_get_xpath(xpath, [
            '//meta[@itemprop="thumbnailUrl"]/@content',
            '//meta[@name="thumbnail"]/@content'
        ])
        if thumbnailURL:
            thumbnailURL = thumbnailURL[0]
        else:
            url_result = re.search(self.THUMBNAIL_PATTERN, content)
            if url_result:
                thumbnailURL = url_result.group(1).replace('\\\\/', '/')
            else:
                thumbnailURL = ''
        title = try_get_xpath(xpath, [
            '//meta[@itemprop="name"]/@content',
            '//meta[@property="og:title"]/@content'
        ])[0]
        jsons = try_get_xpath(xpath,
                              ['//script[@type="application/ld+json"]/text()'])
        desc = None
        for json_str in jsons:
            json_obj = json.loads(json_str)
            if '@type' in json_obj and json_obj['@type'] == 'VideoObject':
                desc = json_obj['description']
                break
        if desc is None:
            desc = try_get_xpath(
                xpath,
                [('//p[@itemprop="description"]',
                  lambda ret: [tostring(ret[0], encoding='UTF-8').decode()]),
                 '//meta[@itemprop="description"]/@content',
                 '//meta[@name="description"]/@content'])[0]
        uploadDate = try_get_xpath(xpath, [
            '//meta[@property="video:release_date"]/@content',
            '//meta[@name="video:release_date"]/@content'
        ])[0]
        desc = re.sub(r'<br\s*?\/?>', '\n', desc)
        soup = BeautifulSoup(desc, features="lxml")
        desc_textonly = ''.join(soup.findAll(text=True))
        uploadDate = parse(uploadDate).astimezone(timezone.utc)
        utags = try_get_xpath(xpath, [
            '//meta[@property="og:video:tag"]/@content',
            '//meta[@itemprop="og:video:tag"]/@content',
            '//meta[@name="og:video:tag"]/@content'
        ])
        user_id = ''
        user_id_match_result = re.search(self.USER_ID_MATCHER, content)
        if user_id_match_result:
            user_id = user_id_match_result.group(1)
        if utags:
            utags = [str(ut) for ut in utags]
        else:
            utags = []
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc_textonly,
            'site': 'nicovideo',
            'uploadDate': uploadDate,
            "unique_id": "nicovideo:%s" % vidid,
            "user_space_urls": [f"https://www.nicovideo.jp/user/{user_id}"] if user_id else [],
            "utags": utags
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self,
                        content=content,
                        xpath=xpath,
                        link=link,
                        update_video_detail=update_video_detail)
Example #10
class Bilibili(Crawler):
    NAME = 'bilibili'
    PATTERN = r'^(https:\/\/|http:\/\/)?((www|m)\.)?(bilibili\.com\/video\/([aA][vV][\d]+|BV[a-zA-Z0-9]+)|b23\.tv\/([aA][vV][\d]+|BV[a-zA-Z0-9]+))'
    SHORT_PATTERN = r'^([aA][Vv][\d]+|BV[a-zA-Z0-9]+)$'
    VID_MATCH_REGEX = r"([aA][Vv][\d]+|BV[a-zA-Z0-9]+)"
    HEADERS = makeUTF8({
        'Referer':
        'https://www.bilibili.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.bilibili.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    BV2AV = _bv2av()

    def get_cookie(self):
        return {
            'SESSDATA': Config.BILICOOKIE_SESSDATA,
            'bili_jct': Config.BILICOOKIE_bili_jct
        }

    def extract_link(self, link):
        ret = re.search(self.VID_MATCH_REGEX, link)
        vid = ret.group(1)
        if vid[:2].lower() == 'av':
            vid = vid.lower()
        if vid[:2].upper() == 'BV':
            vid = 'BV' + vid[2:]
            vid = 'av' + str(self.BV2AV.dec(vid))
        return vid

    def normalize_url(self, link):
        return "https://www.bilibili.com/video/" + self.extract_link(self=self,
                                                                     link=link)

    def expand_url(self, short):
        return "https://www.bilibili.com/video/" + short

    def unique_id(self, link):
        return 'bilibili:%s' % self.extract_link(self=self, link=link)

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        vidid = self.extract_link(self=self, link=link)
        try:
            thumbnailURL = xpath.xpath(
                '//meta[@itemprop="thumbnailUrl"]/@content')[0]
            title = xpath.xpath('//h1[@class="video-title"]/@title')[0]
            desc = getInnerText(
                xpath.xpath('//div[@class="info open"]/node()'))
            uploadDate = parse(
                xpath.xpath('//meta[@itemprop="uploadDate"]/@content')
                [0]) - timedelta(hours=8)  # convert from Beijing time to UTC
            utags = xpath.xpath('//meta[@itemprop="keywords"]/@content')[0]
            utags = list(filter(None, utags.split(',')[1:-4]))
        except:
            return makeResponseSuccess({
                'thumbnailURL': '',
                'title': '已失效视频',
                'desc': '已失效视频',
                'site': 'bilibili',
                'uploadDate': datetime.now(),
                "unique_id": "bilibili:%s" % vidid,
                "utags": [],
                "placeholder": True
            })
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'bilibili',
            'uploadDate': uploadDate,
            "unique_id": "bilibili:%s" % vidid,
            "utags": utags
        })
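_bv2av is referenced here (and in Example #13) but not defined in these examples. A minimal sketch of the widely circulated classic (pre-2021) BV-to-AV conversion; note it does not cover IDs issued after bilibili enlarged the AV space in 2021:

class _bv2av:
    TABLE = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    TR = {c: i for i, c in enumerate(TABLE)}
    S = [11, 10, 3, 8, 4, 6]
    XOR = 177451812
    ADD = 8728348608

    def dec(self, bv):
        # decode a 12-character BV id ('BV' + 10 chars) to its AV number
        r = sum(self.TR[bv[self.S[i]]] * 58 ** i for i in range(6))
        return (r - self.ADD) ^ self.XOR

# example: _bv2av().dec('BV17x411w7KC') == 170001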
Example #11
class Xigua(Crawler):
    NAME = 'Xigua'
    PATTERN = r'^https?\:\/\/(www\.)?ixigua.com/(\d+)'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer':
        'https://www.ixigua.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.ixigua.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    META_MATCH_OBJ = r"<script\s+data-react-helmet=\"true\"\s+type=\"application/ld\+json\">\s*(.*?)\s*</script>"
    USER_ID_MATCH_OBJ = r"\"user_id\":(\d+)"

    def get_cookie(self):
        return {
            'ttwid':
            '1%7Cfjd63xV6vk-PylvXbSpJ6X3A6TA9GDxriyUbQWjDsBs%7C1614899329%7C131ef1e44612efea0743459a6fe967e70a2ca5ece23fda9c9b3983354d3d00fe'
        }

    def normalize_url(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(2)
        return f'https://www.ixigua.com/{vid}'

    def expand_url(self, short):
        return short

    def unique_id(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(2)
        return f'xigua:{vid}'

    def run(self, content, xpath, link, update_video_detail):
        metadata = re.search(self.META_MATCH_OBJ, content)
        user_id = re.search(self.USER_ID_MATCH_OBJ, content)
        if user_id:
            user_id = user_id.group(1)
        if metadata:
            metadata = loads(metadata.group(1))
            title = metadata['name'].removesuffix(' - 西瓜视频')
            desc = metadata['description']
            cover = metadata['thumbnailUrl'][
                0] if 'thumbnailUrl' in metadata else metadata['image'][0]
            upload_time = parse(metadata['datePublished']).astimezone(
                timezone.utc)
        else:
            raise Exception('Cannot find metadata object')

        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': title,
            'desc': desc,
            'site': 'xigua',
            'uploadDate': upload_time,
            "unique_id": self.unique_id(self=self, link=link),
            "user_space_urls": [f"https://www.ixigua.com/home/{user_id}"] if user_id else [],
            "utags": []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self,
                        content=content,
                        xpath=xpath,
                        link=link,
                        update_video_detail=update_video_detail)
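str.removesuffix, used in run() above, requires Python 3.9 or newer. A compatibility sketch for older interpreters:

def removesuffix(s, suffix):
    # equivalent of str.removesuffix for Python < 3.9
    return s[:-len(suffix)] if suffix and s.endswith(suffix) else s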
Example #12
class Nicovideo(Crawler):
    NAME = 'nicovideo'
    PATTERN = r'^(https:\/\/|http:\/\/)?(www\.)?(nicovideo\.jp\/watch\/(s|n)m[\d]+|nico\.ms\/(s|n)m[\d]+)'
    SHORT_PATTERN = r'^(s|n)m[\d]+$'
    HEADERS = makeUTF8({
        'Referer':
        'https://www.nicovideo.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.nicovideo.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.nicovideo.jp/watch/" + link[link.rfind("m") - 1:]

    def expand_url(self, short):
        return "https://www.nicovideo.jp/watch/" + short

    def unique_id(self, link):
        link = link.lower()
        return "nicovideo:%s" % link[link.rfind("m") - 1:]

    def run(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("m") - 1:]
        thumbnailURL = try_get_xpath(xpath, [
            '//meta[@itemprop="thumbnailUrl"]/@content',
            '//meta[@name="thumbnail"]/@content'
        ])[0]
        title = try_get_xpath(xpath, [
            '//meta[@itemprop="name"]/@content',
            '//meta[@property="og:title"]/@content'
        ])[0]
        jsons = try_get_xpath(xpath,
                              ['//script[@type="application/ld+json"]/text()'])
        desc = None
        for json_str in jsons:
            json_obj = json.loads(json_str)
            if '@type' in json_obj and json_obj['@type'] == 'VideoObject':
                desc = json_obj['description']
                break
        if desc is None:
            desc = try_get_xpath(
                xpath,
                [('//p[@itemprop="description"]',
                  lambda ret: [tostring(ret[0], encoding='UTF-8').decode()]),
                 '//meta[@itemprop="description"]/@content',
                 '//meta[@name="description"]/@content'])[0]
        uploadDate = try_get_xpath(xpath, [
            '//meta[@property="video:release_date"]/@content',
            '//meta[@name="video:release_date"]/@content'
        ])[0]
        desc = re.sub(r'<br\s*?\/?>', '\n', desc)
        soup = BeautifulSoup(desc, features="lxml")
        desc_textonly = ''.join(soup.findAll(text=True))
        uploadDate = parse(uploadDate).astimezone(timezone.utc)
        utags = try_get_xpath(xpath, [
            '//meta[@property="og:video:tag"]/@content',
            '//meta[@itemprop="og:video:tag"]/@content',
            '//meta[@name="og:video:tag"]/@content'
        ])
        utags = [str(ut) for ut in utags]
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc_textonly,
            'site': 'nicovideo',
            'uploadDate': uploadDate,
            "unique_id": "nicovideo:%s" % vidid,
            "utags": utags
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self,
                        content=content,
                        xpath=xpath,
                        link=link,
                        update_video_detail=update_video_detail)
Example #13
class Bilibili(Crawler):
    NAME = 'bilibili'
    PATTERN = r'^((https:\/\/|http:\/\/)?((www|m)\.)?(bilibili\.com\/video\/([aA][vV][\d]+|BV[a-zA-Z0-9]+)).*|https:\/\/b23\.tv\/\w+)'
    SHORT_PATTERN = r'^([aA][Vv][\d]+|[Bb][Vv][a-zA-Z0-9]+)$'
    VID_MATCH_REGEX = r"([aA][Vv][\d]+|[Bb][Vv][a-zA-Z0-9]+)"
    AID_MATCH_REGEX = r"__INITIAL_STATE__\s*=\s*{\"aid\"\:(\d+),"
    HEADERS = makeUTF8({
        'Referer':
        'https://www.bilibili.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer':
        'https://www.bilibili.com/',
        'User-Agent':
        '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    BV2AV = _bv2av()

    def get_cookie(self):
        return {
            'SESSDATA': Config.BILICOOKIE_SESSDATA,
            'bili_jct': Config.BILICOOKIE_bili_jct
        }

    # TODO: cannot handle a p number that exceeds the video's actual part count
    def extract_link(self, link):
        ret = re.search(self.VID_MATCH_REGEX, link)
        if ret is None and 'b23.tv' in link:
            return None, None, True
        parsed_link = urlparse(link)
        qs_dict = parse_qs(parsed_link.query)
        p_num = 1
        try:
            p_num = int(qs_dict['p'][0])
        except:
            pass
        vid = ret.group(1)
        if vid[:2].lower() == 'av':
            vid = vid.lower()
        if vid[:2].upper() == 'BV':
            vid = 'BV' + vid[2:]
            vid = 'av' + str(self.BV2AV.dec(vid))
        return vid, p_num, False

    def normalize_url(self, link):
        vidid, p_num, b23vid = self.extract_link(self=self, link=link)
        if b23vid:
            return link
        else:
            return f"https://www.bilibili.com/video/{vidid}?p={p_num}"

    def expand_url(self, short):
        if short[:2].lower() == 'av':
            short = short.lower()
        if short[:2].upper() == 'BV':
            short = 'BV' + short[2:]
            short = 'av' + str(self.BV2AV.dec(short))
        return f"https://www.bilibili.com/video/{short}?p=1"

    def unique_id(self, link):
        vidid, p_num, b23vid = self.extract_link(self=self, link=link)
        if b23vid:
            return ''
        else:
            return 'bilibili:%s-%d' % (vidid, p_num)

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        uid = ''
        new_url = ''
        try:
            aid, p_num, b23vid = self.extract_link(self=self, link=link)
            if b23vid:
                aid_match = re.search(self.AID_MATCH_REGEX, content)
                aid = 'av' + aid_match.group(1)
                new_url = f"https://www.bilibili.com/video/{aid}?p=1"
                p_num = 1
                uid = 'bilibili:%s-1' % aid
            else:
                new_url = link
                uid = self.unique_id(self=self, link=link)
            aid = aid[2:]  # remove 'av'

            api_url = f'http://api.bilibili.com/x/web-interface/view?aid={aid}'
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url) as resp:
                    api_content = await resp.json()
            code = api_content['code']
            if code != 0 or 'data' not in api_content:
                raise Exception(f'api request failed, message:\n{api_content}')
            data = api_content['data']
            thumbnailURL = data['pic']
            title = data['title']
            desc = data['desc']
            uploadDate = datetime.fromtimestamp(
                data['pubdate'], tz=timezone.utc)  # read the unix timestamp as UTC

            api_url = f'http://api.bilibili.com/x/tag/archive/tags?aid={aid}'
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url) as resp:
                    api_content = await resp.json()
            code = api_content['code']
            if code != 0 or 'data' not in api_content:
                utags = []
            else:
                utags = [item['tag_name'] for item in api_content['data']]

            if 'staff' in data:
                user_space_urls = [
                    'https://space.bilibili.com/%d' % x['mid']
                    for x in data['staff']
                ]
            elif 'owner' in data:
                user_space_urls = [
                    'https://space.bilibili.com/%d' % data['owner']['mid']
                ]
            else:
                user_space_urls = []  # avoid a NameError in the success response below

            cid = 0
            async with aiohttp.ClientSession() as session:
                async with session.get(
                        f'https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp'
                ) as resp:
                    api_content = await resp.text()
                    if resp.status == 200:
                        api_obj = loads(api_content)
                        num_parts = len(api_obj['data'])
                        if p_num < 1 or p_num > num_parts:
                            raise UserError(
                                f'P number out of range, should be in [1, {num_parts}]'
                            )
                        part_name = api_obj['data'][p_num - 1]['part']
                        cid = api_obj['data'][p_num - 1]['cid']
                    else:
                        raise Exception(
                            f'api request failed, message:\n{api_content}')
        except UserError as ex:
            raise ex
        except:
            return makeResponseSuccess({
                'thumbnailURL': '',
                'title': '已失效视频',
                'desc': '已失效视频',
                'site': 'bilibili',
                'uploadDate': datetime.now(),
                "unique_id": uid,
                "utags": [],
                "url_overwrite": new_url,
                "placeholder": True
            })
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'bilibili',
            'uploadDate': uploadDate,
            "unique_id": uid,
            "utags": utags,
            "url_overwrite": new_url,
            "user_space_urls": user_space_urls,
            'extra': {
                'part_name': part_name,
                'cid': cid
            }
        })
Example #14
class SinaPC(Crawler):
	NAME = 'weibo-pc'
	SHORT_PATTERN = r''
	PATTERN = r'^(https:\/\/|http:\/\/)?weibo\.(com|cn)\/tv\/v\/(\w+)\?fid=(\d+:\d+)'
	Cookie = 'SINAGLOBAL=4002460776686.9824.1585321155178; UOR=,,m.weibo.cn; YF-V5-G0=125128c5d7f9f51f96971f11468b5a3f; _s_tentry=-; Apache=8703795817895.288.1585556164345; ULV=1585556164370:2:2:1:8703795817895.288.1585556164345:1585321155211; YF-Page-G0=091b90e49b7b3ab2860004fba404a078|1585563210|1585563210; WBStorage=42212210b087ca50|undefined; login_sid_t=52bd5c499b65543341c46965f3d3267b; cross_origin_proto=SSL; Ugrow-G0=7e0e6b57abe2c2f76f677abd9a9ed65d; wb_view_log=2560*14401; WBtopGlobal_register_version=3d5b6de7399dfbdb; crossidccode=CODE-yf-1JiRvU-226WPk-QGmVB9KHtatfm2Ec7ed84; ALF=1617099576; SSOLoginState=1585563577; SUB=_2A25zhbfpDeRhGeBI7lIV9i_IzTuIHXVQ8q4hrDV8PUNbmtANLXDukW9NRpACXBpVshRIjli1oSoWs_HnV-7brere; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhMKKdEqihzDJl7MPTpyF_b5JpX5KzhUgL.FoqcSK5XSo2XSoM2dJLoI7LpUcf.eh.RShqt; SUHB=0WGMeWn5GWqB9T; wvr=6'
	HEADERS = makeUTF8({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
						'cookie': Cookie})
	HEADERS_NO_UTF8 = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
					   'cookie': Cookie}

	def get_cookie(self) :
		return {
			'SINAGLOBAL' : '4002460776686.9824.1585321155178',
			'UOR' : ',,m.weibo.cn',
			'YF-V5-G0' : '125128c5d7f9f51f96971f11468b5a3f',
			'_s_tentry' : '-',
			'Apache' : '8703795817895.288.1585556164345',
			'ULV' : '1585556164370:2:2:1:8703795817895.288.1585556164345:1585321155211',
			'YF-Page-G0' : '091b90e49b7b3ab2860004fba404a078|1585563210|1585563210',
			'WBStorage' : '42212210b087ca50|undefined',
			'login_sid_t' : '52bd5c499b65543341c46965f3d3267b',
			'cross_origin_proto' : 'SSL',
			'Ugrow-G0' : '7e0e6b57abe2c2f76f677abd9a9ed65d',
			'wb_view_log' : '2560*14401',
			'WBtopGlobal_register_version' : '3d5b6de7399dfbdb',
			'crossidccode' : 'CODE-yf-1JiRvU-226WPk-QGmVB9KHtatfm2Ec7ed84',
			'ALF' : '1617099576',
			'SSOLoginState' : '1585563577',
			'SUB' : '_2A25zhbfpDeRhGeBI7lIV9i_IzTuIHXVQ8q4hrDV8PUNbmtANLXDukW9NRpACXBpVshRIjli1oSoWs_HnV-7brere',
			'SUBP' : '0033WrSXqPxfM725Ws9jqgMF55529P9D9WhMKKdEqihzDJl7MPTpyF_b5JpX5KzhUgL.FoqcSK5XSo2XSoM2dJLoI7LpUcf.eh.RShqt',
			'SUHB' : '0WGMeWn5GWqB9T',
			'wvr' : '6'
		}

	def normalize_url( self, link ) :
		ret = re.search(self.PATTERN, link)
		vid = ret.group(4)
		rnd = ret.group(3)
		return f"https://weibo.com/tv/v/{rnd}?fid={vid}"

	def unique_id(self, link):
		ret = re.search(self.PATTERN, link)
		vid = ret.group(4)
		return f'weibo-pc:{vid}'

	def run(self, content, xpath, link, update_video_detail):
		soup = BeautifulSoup(content, "lxml")
		description = soup.find('div', class_='info_txt W_f14')
		description = description.get_text()
		#user = soup.find('span', class_='W_f14 L_autocut bot_name W_fl')
		#user_name = user.get_text()
		add_time = soup.find('div', class_='broad_time W_f12')
		add_time = add_time.get_text()
		vidid = self.unique_id(self, link)
		return makeResponseSuccess({
			'thumbnailURL': '',
			'title': description,
			'desc': description,
			'site': 'weibo-pc',
			'uploadDate': parse(add_time).astimezone(timezone.utc),
			"unique_id": vidid,
			'utags': []
		})

	async def unique_id_async( self, link ) :
		return self.unique_id(self = self, link = link)

	async def run_async(self, content, xpath, link, update_video_detail):
		return self.run(self=self, content=content, xpath=xpath, link=link, update_video_detail=update_video_detail)
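This crawler maintains the same session cookie twice: once as the raw Cookie header string and once as the get_cookie() dict. A minimal sketch of deriving the header from the dict so there is a single source of truth (cookie_header is a hypothetical helper name):

def cookie_header(cookie_dict):
    # serialize a cookie dict into a 'cookie' request-header value
    return '; '.join(f'{k}={v}' for k, v in cookie_dict.items())

# usage sketch: 'cookie': cookie_header(SinaPC.get_cookie(self=SinaPC))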