Пример #1
0
def eastday_video_download(url):
    """Scrape the metadata of an Eastday video page.

    The values are read from inline JavaScript variables in the page
    (``redirect_topic``, ``d_source``, ``global_share_img``, ``mp4``) using
    regular expressions.

    Args:
        url: Address of the video page; its content is obtained through
            ``get_content``.

    Returns:
        dict: Keys ``type``, ``title``, ``source``, ``thumbnail_urls``,
        ``image_urls``, ``video_url``, ``ext`` and ``size``. A value that
        could not be found in the page is left as ``None`` (``source``
        falls back to ``"crawl"``).
    """

    def _with_scheme(link):
        # The page may publish protocol-relative links ("//host/path");
        # prefix those with "http:". Missing values are passed through.
        if link is None or re.match(r"https?:", link):
            return link
        return "http:{}".format(link)

    html = get_content(url, )
    title = match1(html, r'var\s*redirect_topic\s*=\s*[\'|"](.*?)[\'|"];')
    if title is None:
        # Fall back to the <meta name="description"> tag.
        title = match1(
            html,
            r'<meta\s*name=[\'|"]description[\'|"]\s*content=[\'|"](.*?)[\'|"]/>'
        )
    source = match1(html, r'var\s*d_source\s*=\s*[\'|"](.*?)[\'|"];')
    if source is None:
        source = "crawl"
    thumbnail_url = _with_scheme(
        match1(html, r'var\s*global_share_img\s*=\s*[\'|"](.*?)[\'|"];'))
    video_url = _with_scheme(
        match1(html, r'var\s*mp4\s*=\s*[\'|"](.*?)[\'|"];'))

    data = {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }

    return data
Пример #2
0
def baomihua_download_by_id(_id, title, source, img_url, type):
    """Resolve the playable URL and metadata of a Baomihua video by its id.

    Args:
        _id: The ``flvid`` sent to the ``getvideourl.aspx`` endpoint.
        title: Known title, or ``None`` to take the URL-encoded ``title``
            field of the endpoint response.
        source: Publisher name. When ``None`` the function returns ``None``
            without issuing any request.
        img_url: Known thumbnail URL, or ``None`` to take the ``video_img``
            field of the endpoint response.
        type: Value stored unchanged under the ``"type"`` key of the result.

    Returns:
        dict | None: Keys ``type``, ``title``, ``source``,
        ``thumbnail_urls``, ``image_urls``, ``video_url``, ``ext`` and
        ``size`` (megabytes rounded to two decimals, or ``None`` when the
        response carries no file size); ``None`` when ``source`` is ``None``.

    Raises:
        AttributeError: If the response has no ``dir`` field.
    """
    # Nothing is returned without a source, so bail out before the request.
    if source is None:
        return None

    html = get_content(
        'http://play.baomihua.com/getvideourl.aspx?flvid={}&devicetype='
        'phone_app'.format(_id))
    host = match1(html, r'host=([^&]*)')
    _type = match1(html, r'videofiletype=([^&]*)')
    vid = match1(html, r'&stream_name=([^&]*)')
    dir_str = match1(html, r'&dir=([^&]*)').strip()
    video_url = 'http://{}/{}/{}.{}'.format(host, dir_str, vid, _type)
    logging.debug("url is {}".format(video_url))

    if title is None:
        title = match1(html, r'&title=([^&]*)')
        if title is not None:
            title = urllib.parse.unquote(title)
    if img_url is None:
        img_url = match1(html, r'&video_img=([^&]*)')

    # The endpoint reports the size in bytes; expose it in megabytes.
    size = None
    raw_size = match1(html, r'&videofilesize=([^&]*)')
    if raw_size:
        size = float("{:.2f}".format(int(raw_size) / 1024 / 1024))

    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": _type,
        "size": size,
    }

    return data
Пример #3
0
def miaopai_download(url):
    """Scrape the metadata of a Miaopai video from its mobile page.

    The page is requested with ``fake_headers_mobile`` and the values are
    read from quoted ``key: value`` pairs embedded in it.

    Args:
        url: Address of the video page.

    Returns:
        dict: Keys ``type``, ``title``, ``source``, ``thumbnail_urls``,
        ``image_urls``, ``video_url``, ``ext`` and ``size``.

    Raises:
        AttributeError: If one of the expected keys (``status_title`` as the
            last title fallback, ``screen_name``, ``stream_url``,
            ``page_pic``) is not present in the page.
    """
    mobile_page = get_content(url, headers=fake_headers_mobile)
    try:
        title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
    except AttributeError:
        # No "title" key in the page: fall back to "status_title". Only the
        # failed-match case is handled so unrelated errors are not hidden.
        title = re.search(r'([\'"])status_title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
    title = title.replace('\n', '_')
    source = re.search(r'([\'"])screen_name\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
    stream_url = re.search(r'([\'"])stream_url\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
    thumbnail_url = re.search(
        r'[\'"]page_pic[\'"]:[\s\W\S\w]*[\'"]url[\'"]:\s*[\'"](.*?)[\'"],[\s\W\S\w]*},',
        mobile_page
    ).group(1)

    data = {
        "type": news_type(url),
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": [stream_url],
        "ext": 'mp4',
        "size": None,
    }

    return data
Пример #4
0
def ku6_download(url):
    """Scrape the metadata of a Ku6 video page.

    The values are read from inline JavaScript in the page: the
    ``#video-title`` / ``#video-author`` jQuery calls, the player's
    ``poster`` option and the ``this.src({...})`` call.

    Args:
        url: Address of the video page.

    Returns:
        dict: Keys ``type``, ``title``, ``source``, ``thumbnail_urls``,
        ``image_urls``, ``video_url``, ``ext`` and ``size``. Values that
        are not found in the page are ``None``.
    """
    html = get_content(url)
    type = news_type(url)
    title = match1(
        html,
        r"\$\(['\"]#video-title['\"]\)\.text\(['\"]([\s\S\w\W]+?)['\"]\);")
    if title is None:
        # Fall back to the script that assigns document.title.
        title = match1(html,
                       r"document\.title\s*=\s*['\"]([\s\S\w\W]+?)['\"];")
    # Neither pattern may match; only strip an actual string.
    if title is not None:
        title = title.strip()
    source = match1(
        html, r"\$\(['\"]#video-author['\"]\)\.text\(['\"](.*?)['\"]\);")
    img_url = match1(
        html,
        r'[\'|"]poster[\'|"]:\s*[\'|"](.*?)[\'|"],\s*[\'|"]controls[\'|"]:')
    video_url = match1(
        html,
        r'this\.src\(\{type:\s*[\'|"]video/mp4[\'|"], src: [\'|"](.*?)[\'|"]}\);'
    )
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }

    return data
Пример #5
0
def ifeng_download(url,
                   title=None,
                   output_dir=output_dir,
                   merge=True,
                   info_only=False,
                   **kwargs):
    """Find the iFeng video id for ``url`` and delegate to ``ifeng_download_by_id``.

    The id is a UUID. It is looked for in the URL itself first (old
    ``/uuid.shtml`` and newer ``#uuid`` forms) and, failing that, in the page
    content (``var vid="..."`` or a ``"vid": "..."`` pair).

    Args:
        url: Address of the video page.
        title: Forwarded to ``ifeng_download_by_id`` when the id is taken
            from the page content.
        output_dir: Forwarded to ``ifeng_download_by_id``.
        merge: Forwarded to ``ifeng_download_by_id``.
        info_only: Forwarded to ``ifeng_download_by_id``.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        Whatever ``ifeng_download_by_id`` returns.

    Raises:
        AssertionError: If no video id can be found.
    """
    uuid = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'

    video_id = match1(url, '(' + uuid + ')')
    if video_id:
        # NOTE(review): the caller-supplied title is not forwarded on this
        # path (None is passed, as before) - confirm whether intentional.
        return ifeng_download_by_id(video_id,
                                    None,
                                    output_dir=output_dir,
                                    merge=merge,
                                    info_only=info_only)

    html = get_content(url)
    video_id = match1(html, r'var vid="(' + uuid + r')"')
    if video_id is None:
        video_id = match1(html, r'"vid"\s*:\s*"(' + uuid + r')"')
    # Raised explicitly so the check also runs under "python -O".
    if not video_id:
        raise AssertionError("can't find video app")
    return ifeng_download_by_id(video_id,
                                title=title,
                                output_dir=output_dir,
                                merge=merge,
                                info_only=info_only)
Пример #6
0
def acfun_download(url):
    """Scrape the metadata of an AcFun video page.

    Title and uploader are each looked up with a list of patterns that are
    tried in order; the first one yielding a non-empty value wins.

    Args:
        url: Address of the video page.

    Returns:
        dict: Keys ``type``, ``title``, ``source``, ``thumbnail_urls``,
        ``image_urls``, ``video_url`` (always ``None``), ``ext`` and
        ``size``. ``title``, ``source`` and the thumbnail are ``None`` when
        no pattern matches.
    """
    response = get_content(url)

    def _first_group(patterns, flags=re.S):
        # A failed search must fall through to the next pattern instead of
        # raising, otherwise the fallbacks can never be reached.
        for pattern in patterns:
            found = re.search(pattern, response, flags)
            if found and found.group(1):
                return found.group(1)
        return None

    title = _first_group((
        r'data-title="(.*?)"',
        r'<title>(.*?)\s-\sAcFun弹幕视频网.*</title>',
        r'data-proof="(.*?)"',
    ))

    thumbnail_url = _first_group((r'"coverImage":"(.*?)"',), flags=0)

    source = _first_group((
        r'data-uname="(.*?)"',
        r'"username":"(.*?)"',
        r'data-name="(.*?)"',
    ))

    video_url = None
    type = news_type(url)

    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }

    return data
Пример #7
0
def baomihua_download(url):
    """Read the page-level metadata of a Baomihua URL and resolve the video by id.

    Title, publisher, cover image and ``flvid`` are read from inline
    JavaScript in the page and handed to ``baomihua_download_by_id``.

    Args:
        url: Address of the page.

    Returns:
        The result of ``baomihua_download_by_id`` when ``news_type(url)`` is
        ``"video"``; ``None`` otherwise.
    """
    page = get_content(url)
    kind = news_type(url)
    page_title = match1(page, r"var\s*temptitle\s*=\s*'(.*?)';")
    publisher = match1(page, r"var\s*appName\s*=\s*\"(.*?)\";")
    cover = match1(page, r"var\s*pic\s*=\s*\"(.*?)\";")
    flv_id = match1(page, r'flvid\s*=\s*(\d+)')
    if kind == "video":
        return baomihua_download_by_id(flv_id, page_title, publisher, cover,
                                       kind)
    # Anything that is not a video is not handled here.
    return None
Пример #8
0
def wangyi_news_download(url):
    """Scrape title, source, body and image links of a wangyi news article.

    Args:
        url: Address of the article; the page is requested with
            ``charset="GBK"``.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content``, ``thumbnail_urls`` (the first image) and
        ``image_urls``.

    Raises:
        AssertionError: If the title, source, body or images cannot be
            found.
    """
    html = get_content(url, charset="GBK")
    doc = pq(html)
    # Title
    title = doc('div.post_content_main h1').text()
    if not title:
        raise AssertionError("获取文章标题失败")
    # Source
    source = doc(
        'div.post_content_main div.post_time_source a#ne_article_source'
    ).text()
    if not source:
        raise AssertionError("获取文章来源失败")
    # Body: checked before use so a missing node is reported as such rather
    # than being turned into the string "None".
    content = doc('div.post_content_main div.post_body div.post_text').html()
    if not content:
        raise AssertionError("获取文章内容失败")
    # Drop the trailing source/editor box. No count limit: every occurrence
    # is removed (the pattern already matches across lines by itself).
    content = re.sub(
        r"<div\s*class=['|\"]ep-source\s*cDGray['|\"]>[\s\S\w\W]*?</div>",
        '', content)
    content = cleaner(str(content))
    if not content:
        raise AssertionError("获取文章内容失败")
    # Images inside the body; lazy-loaded ones only carry data-original.
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        image_urls = re.findall(r'data-original=[\'|"](.*?)[\'|"]', content,
                                re.S)
    if not image_urls:
        raise AssertionError("获取文章图片失败")
    # Links without an explicit scheme get "http:" prepended.
    image_urls_final = [
        link if link.startswith(('http:', 'https:')) else 'http:' + link
        for link in image_urls
    ]
    # Thumbnail: first image of the article
    thumbnail_urls = [image_urls_final[0]]

    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }

    return data
Пример #9
0
def zaker_news_download(url):
    """Scrape title, source, body and image links of a ZAKER news article.

    Args:
        url: Address of the article.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content``, ``thumbnail_urls`` (the first image) and
        ``image_urls``.

    Raises:
        AssertionError: If the title, source, body or images cannot be
            found.
    """
    html = get_content(url, )
    doc = pq(html)
    # Title
    title = doc('div#article div.article_header h1').text()
    if not title:
        raise AssertionError("获取文章标题失败")
    # Source
    source = doc(
        'div#article div.article_header div.article_tips a span.auther'
    ).text()
    if not source:
        raise AssertionError("获取文章来源失败")
    # Body: a missing node must be reported here; passing it through str()
    # would turn it into the non-empty string "None".
    content = doc('div#article div.article_content div#content').html()
    if content is None:
        raise AssertionError("获取文章内容失败")
    content = cleaner(str(content))
    if not content:
        raise AssertionError("获取文章内容失败")
    # Images inside the body; lazy-loaded ones only carry data-original.
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        image_urls = re.findall(r'data-original=[\'|"](.*?)[\'|"]', content,
                                re.S)
    if not image_urls:
        raise AssertionError("文章中缺少图片")
    # Links without an explicit scheme get "http:" prepended.
    image_urls_final = [
        link if link.startswith(('http:', 'https:')) else 'http:' + link
        for link in image_urls
    ]
    # Thumbnail: first image of the article
    thumbnail_urls = [image_urls_final[0]]

    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }

    return data
Пример #10
0
def ifeng_download_by_id(id,
                         title=None,
                         output_dir=output_dir,
                         merge=True,
                         info_only=False):
    """Look up an iFeng video by UUID and optionally download it.

    The video description is read from the XML document at
    ``vxml.ifengimg.com/video_info_new/...`` that belongs to ``id``.

    Args:
        id: Video UUID (lower-case hex, 8-4-4-4-12).
        title: Title handed to ``download_urls``; when ``None`` the title
            found in the XML document is used instead.
        output_dir: Forwarded to ``download_urls``.
        merge: Forwarded to ``download_urls``.
        info_only: When true, nothing is downloaded.

    Returns:
        dict: Keys ``title``, ``source``, ``thumbnail_urls`` (a single URL
        string) and ``video_url`` (a single URL string).

    Raises:
        AssertionError: If ``id`` does not contain a UUID, or the XML
            document has no ``VideoPlayUrl`` attribute.
    """
    if not match1(
            id,
            r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'):
        raise AssertionError(id)
    url = 'http://vxml.ifengimg.com/video_info_new/{}/{}/{}.xml'.format(
        id[-2], id[-2:], id)
    xml = get_content(url)
    # Title
    title_real = match1(xml, r'Name="([^"]+)"')
    if title_real is not None:
        title_real = unescape(title_real)
    # Source
    source = match1(xml, r'ColumnName="([^"]+)"')
    if source is not None:
        source = unescape(source)
    # Thumbnail
    thumbnail_urls = match1(xml, 'SmallPosterUrl="([^"]+)"')
    # Video download link
    video_url = match1(xml, r'VideoPlayUrl="([^"]+)"')
    if video_url is None:
        raise AssertionError("VideoPlayUrl not found for {}".format(id))
    video_url = video_url.replace('http://wideo.ifeng.com/',
                                  'http://ips.ifeng.com/wideo.ifeng.com/')
    _, ext, size = url_info(video_url)
    data = {
        "title": title_real,
        "source": source,
        "thumbnail_urls": thumbnail_urls,
        "video_url": video_url,
    }
    if not info_only:
        # Prefer the caller's title, but never hand None to the downloader
        # when the page provided one.
        download_urls([video_url],
                      title if title is not None else title_real,
                      ext,
                      size,
                      output_dir,
                      merge=merge,
                      headers=headers)

    return data
Пример #11
0
def btime_news_download(url):
    """Scrape title, source, body and image links of a btime news article.

    Args:
        url: Address of the article.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content``, ``thumbnail_urls`` (the first image) and
        ``image_urls``.

    Raises:
        AssertionError: If the cleaned body is empty or contains no image.
    """
    page = pq(get_content(url, ))
    # Title and source
    headline = page('div.article-container div.article h1#title').text()
    publisher = page('div.content-info span.col.cite').text()
    # Body: the children of the content container, passed through cleaner()
    body_nodes = page('div.content-text div#content-pure').children()
    body = cleaner(str(body_nodes))
    assert body, "获取文章内容失败"
    # Images inside the body
    found_links = re.findall(r'src=[\'|"](.*?)[\'|"]', body, re.S)
    assert found_links, "文章中缺少图片"
    # Links without an explicit scheme get "http:" prepended.
    absolute_links = [
        link if link.startswith(('http:', 'https:')) else 'http:' + link
        for link in found_links
    ]

    return {
        "type": 'news',
        "title": headline,
        "source": publisher,
        "content": body,
        # Thumbnail: first image of the article
        "thumbnail_urls": [absolute_links[0]],
        "image_urls": absolute_links,
    }
Пример #12
0
def bilibili_download(url):
    """Scrape the metadata of a Bilibili video page.

    Title and thumbnail are looked up with several XPath queries that are
    tried in order.

    Args:
        url: Address of the video page.

    Returns:
        dict: Keys ``type``, ``title``, ``source``, ``thumbnail_urls`` (the
        list returned by the first XPath query with results), ``image_urls``,
        ``video_url`` (always ``None``), ``ext`` and ``size``. ``title`` and
        ``source`` are ``None`` when the page does not provide them.
    """
    response = get_content(url)
    html = etree.HTML(response)

    def _first_value(*queries):
        # An empty result list must fall through to the next query instead
        # of raising IndexError, otherwise the fallbacks are unreachable.
        for query in queries:
            hits = html.xpath(query)
            if hits and hits[0]:
                return hits[0]
        return None

    raw_title = _first_value(
        '//title/text()',
        '//meta[@itemprop="name"]/@content',
        '//meta[@property="og:title"]/@content',
    )
    title = None
    if raw_title is not None:
        # Strip the site suffix; keep the raw title when it has none.
        title = match1(raw_title, r'(.*?)_哔哩哔哩') or raw_title

    thumbnail_url = (
        html.xpath('//meta[@itemprop="thumbnailUrl"]/@content')
        or html.xpath('//meta[@itemprop="image"]/@content')
        or html.xpath('//meta[@property="og:image"]/@content')
    )

    source = _first_value('//meta[@itemprop="author"]/@content')
    video_url = None
    type = news_type(url)

    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": thumbnail_url,
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }

    return data
Пример #13
0
def lieqi_news_download(url):
    """Scrape a (possibly multi-page) lieqi news article.

    Page 1 is ``url`` itself; page ``n`` is ``url`` with ``.html`` replaced
    by ``-n.html``. Pages are read until the site's error page is returned
    or 29 pages have been collected.

    Args:
        url: Address of the first page of the article.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content`` (all pages joined and cleaned), ``thumbnail_urls`` (the
        page's ``detail_poster_src`` when present, else the first image) and
        ``image_urls``.

    Raises:
        Exception: If a page cannot be fetched, or title/source are empty.
        AttributeError: If the source cannot be located on the first page.
        AssertionError: If cleaning the body fails or it contains no image.
    """
    content_list = []
    title = None
    source = None
    thumbnail_urls = None
    body_selector = 'div.contentLtopCnt.clearfix div.contentTextCnt'

    # Upper bound on the number of pages followed.
    for page in range(1, 30):
        if page == 1:
            detail_url = url
        else:
            detail_url = url.replace(".html", '-{}.html'.format(page))
        try:
            html = get_content(detail_url, )
        except Exception as err:
            raise Exception("获取文章内容超时") from err
        # The site answers with this error page past the last page.
        if re.search(r"很抱歉!您访问页面被外星人劫持了", html):
            break
        doc = pq(html)
        if page == 1:
            # Title
            title = doc('title').text()
            if not title:
                title = doc("div.contentLtopCnt.clearfix h1.title").text()
            # Thumbnail published by the page itself (optional)
            poster = re.search(
                r'var\s*detail_poster_src\s*=\s*[\'|"](.*?)[\'|"]', html)
            if poster:
                poster_url = poster.group(1)
                if not re.match(r"http[s]?:", poster_url):
                    poster_url = "http:" + poster_url
                thumbnail_urls = [poster_url]
            # Source
            source_nodes = doc(
                'div.contentLtopCnt.clearfix div.sourceShare div.source'
            ).children()
            found = re.search(r"</span>\s*<span>(.*?)</span>",
                              str(source_nodes))
            if found is None:
                raise AttributeError("获取来源失败")
            source = found.group(1)
        # Body of this page; a page without the container contributes
        # nothing (instead of the literal text "None").
        div = doc(body_selector).html()
        if div is not None:
            content_list.append(str(div))

    try:
        content = cleaner(''.join(content_list))
        logging.debug('清洗完成')
    except Exception as err:
        raise AssertionError("获取文章内容失败") from err

    # Images inside the body
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        raise AssertionError("文章中缺少图片")
    # Links without an explicit scheme get "http:" prepended.
    image_urls_final = [
        link if link.startswith(('http:', 'https:')) else 'http:' + link
        for link in image_urls
    ]
    # Thumbnail: fall back to the first image of the article
    if not thumbnail_urls:
        thumbnail_urls = [image_urls_final[0]]

    if not (title and source):
        raise Exception("获取标题和来源失败")

    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }

    return data
Пример #14
0
def eastday_news_download(url):
    """Scrape a (possibly multi-page) Eastday news article.

    Page 1 is ``url`` itself; page ``n`` is ``url`` with ``.html`` replaced
    by ``-n.html``. Pages are read until the site's 404 page is returned or
    29 pages have been collected.

    Args:
        url: Address of the first page of the article.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content`` (all pages joined and cleaned), ``thumbnail_urls`` (the
        first image) and ``image_urls``.

    Raises:
        Exception: If a page cannot be fetched, a page contains the
            ``detail_room`` container, or title/source are empty.
        AttributeError: If the source cannot be located on the first page.
        AssertionError: If cleaning the body fails or it contains no image.
    """
    content_list = []
    title = None
    source = None

    # Upper bound on the number of pages followed.
    for page in range(1, 30):
        if page == 1:
            detail_url = url
        else:
            detail_url = url.replace(".html", '-{}.html'.format(page))
        try:
            html = get_content(detail_url, )
        except Exception as err:
            raise Exception("获取文章内容超时") from err
        # Pages with this container are rejected outright.
        if re.search(r'<div class="detail_room">', html):
            logging.debug('东方号内容,发布失败')
            raise Exception('东方号内容,发布失败')
        # The site answers with this 404 page past the last page.
        if re.search(r"404&nbsp;&nbsp;很抱歉!您访问页面被外星人劫持了", html):
            break
        doc = pq(html)
        if page == 1:
            # Title
            title = doc(
                'div.detail_left_cnt div.J-title_detail.title_detail h1 span'
            ).text()
            # Source: either a plain <i> element or a link
            source_markup = str(doc(
                'div.detail_left_cnt div.J-title_detail.title_detail div.share_cnt_p.clearfix div.fl'
            ).children())
            found = (re.search(r"</i>\s*<i>(.*?)</i>", source_markup)
                     or re.search(r"</i>\s*<a.*>(.*?)</a>", source_markup,
                                  re.S))
            if found is None:
                raise AttributeError("获取来源失败")
            source = found.group(1)
        # Body of this page; a page without the container contributes
        # nothing (instead of the literal text "None").
        div = doc('div#J-contain_detail_cnt').html()
        if div is not None:
            content_list.append(str(div))

    try:
        content = cleaner(''.join(content_list))
        logging.debug('清洗完成')
    except Exception as err:
        raise AssertionError("获取文章内容失败") from err

    # Images inside the body
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        raise AssertionError("文章中缺少图片")
    # Links without an explicit scheme get "http:" prepended.
    image_urls_final = [
        link if link.startswith(('http:', 'https:')) else 'http:' + link
        for link in image_urls
    ]
    # Thumbnail: first image of the article
    thumbnail_urls = [image_urls_final[0]]

    if not (title and source):
        raise Exception("获取标题和来源失败")

    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }

    return data
Пример #15
0
def qq_video_download(url):
    """Scrape the metadata of a Tencent (qq.com) video page.

    Three URL families are recognised:

    * ``new.qq.com/omv/video/<vid>`` - metadata comes from the JSON detail
      endpoint at ``pacaio.match.qq.com``.
    * ``v.qq.com/x/page/`` and ``v.qq.com/x/cover`` - metadata is read from
      the page markup, trying several patterns per field in order.
    * ``sports.qq.com`` - not supported; a message dict is returned.

    Args:
        url: Address of the video page.

    Returns:
        dict: Normally the keys ``type`` (``"video"``), ``title``,
        ``source``, ``thumbnail_urls``, ``image_urls``, ``video_url``
        (always ``None``), ``ext`` and ``size``; fields that cannot be found
        are ``None`` (``source`` falls back to ``"腾讯视频"`` for the two
        supported families). For ``sports.qq.com`` URLs a dict with the
        single key ``message``.
    """
    video_type = "video"
    video_url = None

    def _group1(pattern, text):
        # First capture group of the first match, or None when nothing
        # matches (so callers can fall through to the next pattern).
        found = re.search(pattern, text)
        return found.group(1) if found else None

    if re.search(r"new\.qq\.com/omv/video/", url):
        vid = _group1(r"new\.qq\.com/omv/video/(.*?)$", url)
        detail_url = "{}{}".format(
            "https://pacaio.match.qq.com/vlike/detail?vid=",
            vid,
        )
        info = json.loads(get_content(detail_url))
        detail = info.get("data") or {}
        title = detail.get("title")
        source = detail.get("source") or "腾讯视频"
        imgs = detail.get("imgs") or {}
        thumbnail_url = (imgs.get("228X128") or imgs.get("496X280")
                         or detail.get("img"))

    elif re.search(r"v\.qq\.com/x/page/", url) or re.search(
            r"v\.qq\.com/x/cover", url):
        response = get_content(url)

        title = None
        for pattern in (
                r"<title>(.*?)</title>",
                r'<meta\s*itemprop=[\'|"]name[\'|"]\s*name=[\'|"]title[\'|"]\s*content=[\'|"](.*?)[\'|"]>',
                r'<meta\s*name=[\'|"]twitter:title[\'|"]\s*property=[\'|"]og:title[\'|"]'
                r'\s*content=[\'|"](.*?)[\'|"]\s*/>',
        ):
            title = _group1(pattern, response)
            if title:
                break
        if title:
            # Drop everything from the first underscore on (site suffix).
            title = re.sub(r"_.*$", '', title)

        source = (
            _group1(r'<span\s*class=[\'|"]user_name[\'|"]>(.*?)</span>',
                    response)
            or _group1(
                r'<strong\s*class=[\'|"]player_title[\'|"]>(.*?)</strong>',
                response)
            or "腾讯视频"
        )

        thumbnail_url = (
            _group1(
                r'<meta\s*itemprop=[\'|"]image[\'|"]\s*content=[\'|"](.*?)[\'|"]>',
                response)
            or _group1(
                r'<meta\s*itemprop=[\'|"]thumbnailUrl[\'|"]\s*content=[\'|"](.*?)[\'|"]>',
                response)
        )
        # Without a usable absolute URL, prefer the 640x360 picture from the
        # page's inline data when there is one.
        if not (thumbnail_url and re.match(r"https?:.+", thumbnail_url)):
            thumbnail_url = _group1(
                r'[\'|"]pic_640_360[\'|"]:[\'|"](.*?)[\'|"],',
                response) or thumbnail_url

    elif re.search(r"sports\.qq\.com", url):
        return {"message": "腾讯独家,暂不支持"}

    else:
        title = None
        source = None
        thumbnail_url = None

    data = {
        "type": video_type,
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }

    return data
Пример #16
0
def zhonghua_news_download(url):
    """Scrape a (possibly multi-page) zhonghua (china.com) news article.

    Page 1 is ``url`` itself; page ``n`` is ``url`` with ``.html`` replaced
    by ``_n.html``. Pages are followed while the current page contains a
    "下一页" (next page) link, up to 29 pages.

    Args:
        url: Address of the first page of the article.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content`` (all pages joined and cleaned), ``thumbnail_urls`` (the
        first image) and ``image_urls``.

    Raises:
        Exception: If a page cannot be fetched, or title/source are empty.
        AssertionError: If cleaning the body fails or it contains no image.
    """
    content_list = []
    title = None
    source = None

    # Upper bound on the number of pages followed.
    for page in range(1, 30):
        if page == 1:
            detail_url = url
        else:
            detail_url = url.replace(".html", '_{}.html'.format(page))
        try:
            html = get_content(detail_url, )
        except Exception as err:
            raise Exception("获取文章内容超时") from err
        doc = pq(html)
        if page == 1:
            # Title
            title = doc("div.pleft.mt10 div.article-header h1.title").text()
            # Source
            source = doc(
                'div.pleft.mt10 div.article-header div.info div.left small#article-source'
            ).text()
        # Body of this page; a page without the container contributes
        # nothing (instead of the literal text "None").
        div = doc('div.pleft.mt10 div.viewbox div#main-content').html()
        if div is not None:
            content_list.append(str(div))
        # Stop when the page offers no "next page" link.
        if not re.search(r"下一页</a>", html):
            break

    try:
        content = cleaner(''.join(content_list))
        logging.debug('清洗完成')
    except Exception as err:
        raise AssertionError("获取文章内容失败") from err

    # Images inside the body
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        raise AssertionError("文章中缺少图片")
    # Links without an explicit scheme are treated as paths on the site.
    image_urls_final = [
        link if link.startswith(('http:', 'https:'))
        else 'http://kan.china.com' + link
        for link in image_urls
    ]
    # Thumbnail: first image of the article
    thumbnail_urls = [image_urls_final[0]]

    if not (title and source):
        raise Exception("获取标题和来源失败")

    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }

    return data
Пример #17
0
def _sohu_clean_content(raw_html):
    """Strip Sohu boilerplate from an article body and pass it through ``cleaner``."""
    # Checked up front: a missing body must be reported as such rather than
    # failing inside re.sub().
    if not raw_html:
        raise AssertionError("获取文章内容失败")
    raw_html = re.sub(r"<span\s*class=['|\"]backword['|\"]>.*?</span>", '',
                      raw_html)
    raw_html = re.sub(r"<p\s*data-role=['|\"]editor-name['|\"]>.*</p>", '',
                      raw_html)
    # Copyright notice inserted by the site
    raw_html = re.sub(r'(搜狐.*?独家出品 未经许可严禁转载)', '', raw_html)
    cleaned = cleaner(str(raw_html))
    if not cleaned:
        raise AssertionError("获取文章内容失败")
    return cleaned


def _sohu_image_links(content):
    """Return the image links found in ``content``, each with an explicit scheme."""
    found = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not found:
        raise AssertionError("文章中缺少图片")
    return [
        link if link.startswith(('http:', 'https:')) else 'http:' + link
        for link in found
    ]


def sohu_news_download(url, ):
    """Scrape title, source, body and image links of a Sohu news article.

    Two page layouts are supported, selected by the URL:
    ``www.sohu.com/a/`` and ``sh.focus.cn/zixun/``.

    Args:
        url: Address of the article.

    Returns:
        dict: Keys ``type`` (``'news'``), ``title``, ``source``,
        ``content``, ``thumbnail_urls`` (the first image) and
        ``image_urls``.

    Raises:
        AssertionError: If the URL belongs to neither layout, or the body or
            its images cannot be found.
    """
    html = get_content(url, )
    doc = pq(html)
    if "www.sohu.com/a/" in url:
        # Title
        title = doc('div.text div.text-title h1').text()
        if not title:
            title = doc(
                'div.content.area div.article-box.l h3.article-title').text()
        # Source
        source = doc('div.column.left div.user-info h4 a').text()
        if not source:
            source = doc(
                'div.right-author-info.clearfix div.l.clearfix a.name.l'
            ).text()
        # Raw body
        content = doc('div.text article.article').html()
        if not content:
            content = doc('article.article-text').html()
    elif "sh.focus.cn/zixun/" in url:
        # Title
        title = doc('div.main-content h1').text()
        # Source
        source = doc(
            'div.main-content div.s-pic-info div.info-source span a').text()
        # Raw body
        content = doc('div.main-content div.info-content').html()
    else:
        raise AssertionError("urls检测爬虫异常")

    # Titles flagged as original content carry a "原创" marker; remove it.
    if re.match(r"原创", title):
        title = title.replace("原创", '')

    content = _sohu_clean_content(content)
    image_urls_final = _sohu_image_links(content)
    # Thumbnail: first image of the article
    thumbnail_urls = [image_urls_final[0]]

    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }

    return data
Пример #18
0
def sohu_video_download(url):
    """Resolve the clip URLs and metadata of a Sohu TV video.

    The video id is taken from the ``id`` query parameter for
    ``share.vrs.sohu.com`` URLs, otherwise from the page content. Only
    ``tv.sohu.com`` URLs are resolved further: the ``hot.vrs.sohu.com``
    endpoint is queried first and, when its answer carries no ``data``, the
    ``my.tv.sohu.com`` endpoint is used instead.

    Args:
        url: Address of the video page.

    Returns:
        dict | None: Keys ``type`` (``'video'``), ``title``, ``source``,
        ``thumbnail_urls``, ``image_urls``, ``video_url`` (one entry per
        clip), ``ext`` and ``size`` (sum of the clip sizes); ``None`` when
        ``url`` is not a ``tv.sohu.com`` address.

    Raises:
        AssertionError: If no video id is found, or the clip lists of the
            response differ in length.
        KeyError: If the endpoint response lacks an expected key.
    """
    if re.match(r'http[s]?://share\.vrs\.sohu\.com', url):
        vid = match1(url, r'id=(\d+)')
        source = None
    else:
        html = get_content(url, charset="GBK")
        vid = match1(html, r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?;')
        # Uploader name published as an inline JavaScript variable
        owner = re.search(r"var\s*wm_username='(.*?)';", html)
        source = owner.group(1) if owner else None
    if not vid:
        raise AssertionError("视频vid获取失败,请检查url")

    if not re.match(r'http[s]?://tv\.sohu\.com/', url):
        return None

    info = json.loads(
        get_content(
            'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.format(vid)))
    if info.get("data"):
        # Walk the quality variants from best to worst and switch to the
        # first one whose description contains an 'allot' entry.
        for qtyp in [
                'oriVid', 'superVid', 'highVid', 'norVid', 'relativeId'
        ]:
            if 'data' in info:
                hqvid = info['data'][qtyp]
            else:
                hqvid = info[qtyp]
            if hqvid != 0 and hqvid != vid:
                info = json.loads(
                    get_content(
                        'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.
                        format(hqvid)))
                if 'allot' not in info:
                    continue
                break
    else:
        info = json.loads(
            get_content(
                'http://my.tv.sohu.com/play/videonew.do?vid={}&referer='
                'http://my.tv.sohu.com'.format(vid)))

    # Both entries were required of a usable response before; keep failing
    # with KeyError when one is absent.
    for required in ('allot', 'tvid'):
        if required not in info:
            raise KeyError(required)

    if not source:
        source = (info.get("wm_data") or {}).get("wm_username", "crawl")

    clips = info['data']
    title = clips['tvName']
    thumbnail_url = clips["coverImg"]
    # int() accepts both numeric and string sizes.
    size = sum(map(int, clips['clipsBytes']))
    if not (len(clips['clipsURL']) == len(clips['clipsBytes']) == len(
            clips['su'])):
        raise AssertionError("clip lists differ in length")
    urls = [
        real_url(file_name, key, clips['ch'])
        for file_name, key in zip(clips['su'], clips['ck'])
    ]

    data = {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": urls,
        "ext": None,
        "size": size,
    }

    return data