Example #1
def parse_list_haowai(document, url=None):
    """ 号外新闻需要单独解析, 列表页有下载操作 """
    FIELDS = {
        "url", "title", "publish_time", "publish_site", "author", "abstract",
        "keywords", "comment_id"
    }

    class Fields(object):
        def __init__(self):
            for name in FIELDS:
                self.__dict__[name] = ""
                self.html = ""

        def to_dict(self):
            return dict(self.__dict__)

    def wrap_content(title, content):
        html = """
        <!DOCTYPE html>
        <html lang="en">
        <head><meta charset="UTF-8"><title>{}</title></head>
        <body><div id="content">{}</div></body></html>
        """
        html = html.format(title, content)
        return html

    import json
    url_format = "http://api.myhaowai.com/api/article/get_article_by_aid?aid={}&readFrom=app"
    data = json.loads(document)
    if not data or data["result"].get("code") == "1":
        return list()
    feeds = data.get("contentList", list())
    result = list()
    for feed in feeds:
        aid = feed.get("aid")
        if not aid:
            continue
        detail_url = url_format.format(aid)
        # any download failure propagates to the caller
        doc = http.download_json(url=detail_url)
        doc = doc["article_info"]
        content = http.download_json(doc["content_url"])
        content = content.get("content")
        title = doc["title"]
        if not (content and title):
            continue
        fields = Fields()
        fields.title = title
        fields.url = doc["content_url"]
        fields.publish_site = doc.get("nickname", "")
        fields.publish_time = clean_date_time(doc.get("pubtime", ""))
        fields.html = wrap_content(title=title, content=content)
        result.append(fields.to_dict())
    return result
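
All of these examples lean on a project-local `http` helper that is never shown. The sketch below is a hypothetical reconstruction from usage alone (the `skip` slicing and the `Request` object are inferred from Examples #4 and #11); the real module may well differ.

import json
import requests

def download_html(url=None, **kwargs):
    # Hypothetical: return the plain text body of an HTTP GET.
    return requests.get(url, **kwargs).text

def download_json(url=None, request=None, skip=None, **kwargs):
    # Hypothetical reconstruction of the unshown helper.
    if request is not None:  # Request objects carry url/headers (Example #11)
        resp = requests.get(request.url, headers=getattr(request, "headers", None))
    else:
        resp = requests.get(url, **kwargs)
    text = resp.text
    if skip:
        # e.g. skip=(3, -1) trims a JSONP wrapper such as "fn(...)"
        text = text[skip[0]:skip[1]]
    return json.loads(text)
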
Example #2
def video_zaker_parser(url):
    document = http.download_json(url=url)
    data = document["data"].get("articles", [])
    videos = list()
    for item in data:
        url = item["full_url"]
        req = http.Request(url=url)
        try:
            response = http.download(req)
            doc = response.json()
        except Exception:
            continue
        detail = doc.get("data")
        if not detail:
            continue
        src = detail["video_info"]["url"]
        if src.endswith("m3u8"):
            src = src.replace("m3u8", "mp4")
        label = detail["video_info"]["video_label"].split(":")[::-1]
        duration = 0
        for n, i in enumerate(label):
            duration += pow(60, n) * int(i)
        video = VideoFields()
        video.title = item["title"]
        video.publish_ori_name = item["auther_name"]
        video.publish_ori_url = item["weburl"]
        video.publish_ori_icon = detail["article_group"]["logo"]["url"]
        video.thumbnail = detail["video_info"]["pic_url"]
        video.duration = duration
        video.src = src
        videos.append(video)
    return videos
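
`VideoFields` is referenced throughout but never defined in these excerpts. A minimal sketch consistent with how the parsers use it, assuming it is a plain attribute container with empty defaults:

class VideoFields(object):
    # Hypothetical sketch: a bare record holding whatever the parsers assign.
    def __init__(self):
        self.title = ""
        self.src = ""
        self.thumbnail = ""
        self.duration = 0
        self.tags = ""
        self.publish_time = ""
        self.publish_ori_name = ""
        self.publish_ori_url = ""
        self.publish_ori_icon = ""
        self.n_read = self.n_like = self.n_comment = self.n_repost = 0
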
Example #3
def get_wap_detail(vid):
    meta = {}
    detail_wap = "http://joke.4399pk.com/wap/video-content-id-%s.html" % vid
    # the WAP detail page is HTML, so fetch it as markup rather than JSON
    content = http.download_html(detail_wap)
    soup = BeautifulSoup(content, "lxml")
    meta["name"] = get_tag_attribute(soup, publish_name_config, "text")
    meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
    return meta
Example #4
def get_metas(ids):
    url = "http://dg.xxhh.com/getcnums/?__jsonp__=fn&ids={ids}".format(
        ids=",".join(ids))
    # skip=(3, -1) strips the JSONP wrapper ("fn(" ... ")") before parsing
    document = http.download_json(url=url, skip=(3, -1))
    metas = dict()
    for i, meta in enumerate(document.get("d", [])):
        # each entry is (comment, like, dislike)
        metas[ids[i]] = (int(meta[0]), int(meta[1]), int(meta[2]))
    return metas
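
The `__jsonp__=fn` endpoint returns JSONP rather than bare JSON, which is why the call passes `skip=(3, -1)`: dropping the first three characters and the last one removes the `fn(` prefix and the `)` suffix. An illustration with made-up data:

import json

raw = 'fn({"d": [["12", "34", "5"]]})'  # hypothetical JSONP response body
data = json.loads(raw[3:-1])            # -> {"d": [["12", "34", "5"]]}
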
Example #5
def get_video_src(vid):
    # fetch the video source address
    main_parse_url = "http://www.acfun.tv/video/getVideo.aspx?id=%s" % vid
    info = http.download_json(url=main_parse_url)
    sourceType = info['sourceType']
    if sourceType != 'zhuzhan':
        return []
    encode = info['encode']  # resolving `encode` into a playable URL is left unfinished
    return vid
Example #6
def joke_netease_parser(url):
    document = http.download_json(url=url)
    data = document[u"段子"]
    jokes = list()
    for g in data:
        if g.get("imgsum", 0) == 0:
            joke = JokeFields()
            joke.title = g["title"]
            joke.publish_ori_name = g["source"]
            joke.text = g["digest"]
            joke.n_comment = int(g["replyCount"])
            joke.n_like = int(g["upTimes"])
            joke.n_dislike = int(g["downTimes"])
            # _comment_need = g["docid"]  # 评论需要该字段
            jokes.append(joke)
    return jokes
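
As with `VideoFields`, the `JokeFields` container is never defined in these excerpts. A plausible minimal version, again assuming a plain attribute holder:

class JokeFields(object):
    # Hypothetical sketch mirroring the VideoFields sketch above.
    def __init__(self):
        self.title = ""
        self.text = ""
        self.publish_time = ""
        self.publish_ori_name = ""
        self.publish_ori_icon = ""
        self.n_comment = self.n_like = self.n_dislike = 0
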
Example #7
def joke_neihan_parser(url):
    document = http.download_json(url=url)
    groups = document["data"]["data"]
    jokes = list()
    for g in groups:
        g = g["group"]
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["name"]
        joke.publish_ori_icon = g["user"]["avatar_url"]
        joke.publish_time = format_datetime_string(g["create_time"])
        joke.text = g["text"]
        joke.n_comment = int(g["comment_count"])
        joke.n_like = int(g["digg_count"])
        joke.n_dislike = int(g["bury_count"])
        # _comment_need = g["code"]  # 评论需要该字段
        jokes.append(joke)
    return jokes
Example #8
def video_kuaishou_parser(url):
    documents = http.download_json(url=url)
    data = documents.get("feeds", [])
    videos = list()
    for item in data:
        urls = item.get("main_mv_urls")
        thumbs = item.get("cover_thumbnail_urls")
        avatars = item.get("headurls")
        if not (urls and thumbs and avatars):
            continue
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["timestamp"])
        video.publish_ori_name = item["user_name"]
        video.publish_ori_url = avatars[0]["url"]
        video.src = urls[0]["url"]
        video.thumbnail = thumbs[0]["url"]
        video.duration = int(item["ext_params"].get("video", 0) / 1000.0)  # ms -> s
        videos.append(video)
    return videos
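
Several parsers normalize timestamps through `format_datetime_string`, which is also not shown. A guess at its behavior, assuming numeric inputs are epoch seconds or milliseconds (Example #12 also feeds it a preformatted string, which a fuller version would need to pass through):

from datetime import datetime

def format_datetime_string(value):
    # Hypothetical reconstruction; the real helper may accept more formats.
    ts = float(value)
    if ts > 1e12:  # heuristic: very large values are epoch milliseconds
        ts /= 1000.0
    return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
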
Example #9
def video_meipai_parser(url):
    documents = http.download_json(url=url)
    data = [doc["media"] for doc in documents if doc["type"] == "media"]
    videos = list()
    for item in data:
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["created_at"])
        video.publish_ori_url = item["url"]
        video.publish_ori_name = item["user"]["screen_name"]
        video.publish_ori_icon = item["user"]["avatar"]
        video.src = item["video"]
        video.thumbnail = item["cover_pic"]
        video.duration = int(item.get("time", 0))
        video.n_like = int(item.get("likes_count", 0))
        video.n_comment = int(item.get("comments_count", 0))
        video.n_repost = int(item.get("reposts_count", 0))
        video.tags = g_tags(video.title)  # g_tags: tag extractor defined elsewhere
        videos.append(video)
    return videos
Example #10
def video_duowan_parser(url):
    detail_info_template = "http://video.duowan.com/jsapi/playPageVideoInfo/?vids={vid}"
    detail_url_config = {
        "params": {
            "selector": "a.uiVideo__ori"
        },
        "method": "select"
    }
    video_src_re = re.compile('<video src="(.*?)" id="video"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.uiVideo__item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        detail_url = get_tag_attribute(tag, detail_url_config, "href")
        # strip(".html") removes characters, not a suffix, and can eat the id
        # itself, so cut the extension explicitly
        vid = detail_url.split("/")[-1].rsplit(".", 1)[0]
        m_detail_url = detail_url.replace(".com/", ".cn/")
        detail_json_url = detail_info_template.format(vid=vid)
        json_data = http.download_json(url=detail_json_url)
        video_info = json_data[vid]
        video.title = video_info["video_title"]
        video.n_comment = int(video_info["video_raw_comment_num"])
        video.n_read = int(video_info["video_raw_play_num"])
        video.n_like = int(video_info["video_raw_support"])
        video.tags = ";".join(video_info["video_tags"])
        video.publish_ori_name = video_info["user_nickname"]
        video.publish_ori_icon = video_info["user_avatar"]
        video.publish_time = format_datetime_string(
            video_info["video_upload_time"])
        video.publish_ori_url = video_info["video_url"]
        video.thumbnail = video_info["video_big_cover"]
        video.duration = int(video_info["video_raw_duration"])
        m_detail_content = http.download_html(url=m_detail_url)
        video.src = video_src_re.findall(m_detail_content)[0]
        videos.append(video)
        sleep(0.2)  # throttle successive detail requests
    return videos
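
`get_tag_attribute` and the `*_config` dicts (Examples #3 and #10) suggest a small dispatch helper: the config names a BeautifulSoup method plus its keyword arguments, and the third argument picks either node text or an HTML attribute. A hypothetical reconstruction:

def get_tag_attribute(tag, config, attribute):
    # Hypothetical: run the configured lookup, e.g. tag.select(selector=...).
    nodes = getattr(tag, config["method"])(**config["params"])
    if not nodes:
        return ""
    node = nodes[0]
    if attribute == "text":
        return node.get_text(strip=True)
    return node.get(attribute, "")
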
Example #11
def joke_qiushi_parser(url):
    headers = {
        "User-Agent": "qiushibalke_10.8.1_WIFI_auto_19",
        "Source": "android_10.8.1",
        "Model": "Xiaomi/hydrogen/hydrogen:6.0.1/MMB29M/V7.5.6.0.MBCCNDE:user/release-keys",
        "Uuid": "IMEI_8728c26518fa3ae795a7f787073d375f",
        "Deviceidinfo": ('{"DEVICEID": "862535037295724","SIMNO": "89860112817005617959",'
                         '"IMSI": "460012225499106","ANDROID_ID": "27dafccd6e32bfb2",'
                         '"SDK_INT": 23,"SERIAL": "a882d7f9","MAC": "02:00:00:00:00:00",'
                         '"RANDOM": ""}'),
    }
    req = http.Request(url=url, headers=headers)
    document = http.download_json(request=req)
    data = document["items"]
    jokes = list()
    for g in data:
        if not g.get("user"):
            continue
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["login"]
        avatar = g["user"].get("thumb")
        if not avatar:
            continue
        if avatar.startswith("//"):
            avatar = "http:" + avatar
        joke.publish_ori_icon = avatar
        joke.publish_time = format_datetime_string(g["created_at"])
        joke.text = g["content"]
        joke.n_comment = int(g.get("comments_count", 0))
        if g.get("votes"):
            joke.n_like = int(g["votes"]["up"])
            joke.n_dislike = int(g["votes"]["down"])
        jokes.append(joke)
    return jokes
Example #12
def video_acfun_parser(url):
    # http://www.acfun.cn/list/getlist?channelId=134&sort=0&pageSize=20&pageNo=1

    def get_video_src(vid):
        # fetch the video source address
        main_parse_url = "http://www.acfun.tv/video/getVideo.aspx?id=%s" % vid
        info = http.download_json(url=main_parse_url)
        sourceType = info['sourceType']
        if sourceType != 'zhuzhan':
            return []
        encode = info['encode']  # resolving `encode` into a playable URL is left unfinished
        return vid

    json_data = http.download_json(url=url)
    item_list = json_data["data"]["data"]
    videos = list()
    for item in item_list:
        video = VideoFields()
        video.title = item["title"]
        video.n_comment = int(item["commentCount"])
        video.n_read = int(item["viewCountFormat"])  # assumes the formatted count is plain digits
        video.n_like = None
        video.tags = None
        video.publish_ori_name = item["username"]
        video.publish_ori_icon = item["userAvatar"]
        video.publish_time = format_datetime_string(
            item["contributeTimeFormat"])
        video.publish_ori_url = urljoin(url, item["link"])
        video.thumbnail = item["coverImage"]
        video.duration = int(item["duration"])
        vid = item["videoId"]
        video.src = get_video_src(vid)
        videos.append(video)
        sleep(0.2)
    return videos
Example #13
def get_num_comment(vid):
    n_comment_url = "http://joke.4399pk.com/wap/funnycourse-num-id-%s" % vid
    content = http.download_json(url=n_comment_url)
    n_comment = content["msg"]["vcomment"]
    return int(n_comment)