Example #1
def joke_khdx_parser(url):
    text_config = {"params": {"selector": "dd.content"}, "method": "select"}
    user_config = {"params": {"selector": "p.user > a"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img"}, "method": "select"}
    like_config = {
        "params": {
            "selector": "a.ding > div > i"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "a.cai > div > i"
        },
        "method": "select"
    }
    pb_time_config = {"params": {"selector": "span.fr"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="dl.main-list")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.publish_ori_icon = urljoin(url, joke.publish_ori_icon)
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
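All of the parsers in these examples rely on shared helpers such as `get_tag_attribute` and `get_tag_attribute_int`, which are not defined here. A minimal sketch of how they are assumed to behave, given the `{"method": "select", "params": {...}}` config convention used throughout (the real helpers may differ):

def get_tag_attribute(tag, config, attribute):
    # Run the configured lookup (e.g. tag.select(selector=...)) and read one
    # attribute from the first match; "text" means the stripped text content.
    matches = getattr(tag, config["method"])(**config["params"])
    if not matches:
        return ""
    if attribute == "text":
        return matches[0].get_text(strip=True)
    return matches[0].get(attribute, "")


def get_tag_attribute_int(tag, config, attribute):
    # Same lookup, but keep only the digits and fall back to 0.
    raw = get_tag_attribute(tag, config, attribute)
    digits = "".join(c for c in raw if c.isdigit())
    return int(digits) if digits else 0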
Example #2
def general_list(list_page_info):
    def _request_doc_from_config_channel(config):
        doc = dict()
        doc["channel"] = config["channel"]
        doc["config"] = str(config["_id"])
        doc["form"] = config["form"]
        doc["site"] = config["site"]
        doc["time"] = utc_datetime_now()
        doc["fields"] = dict()
        doc["unique"] = ""
        doc["procedure"] = -1
        return doc

    content = http.download_html(url=list_page_info["url"])
    result = FeedParser(document=content,
                        crawler=list_page_info["crawler"],
                        url=list_page_info["url"])
    ids = list()
    for item in result:
        middle = _request_doc_from_config_channel(list_page_info)
        fields = ListFields()
        fields.url = item["url"]
        fields.title = item.get("title", "")
        fields.publish_time = format_datetime_string(
            item.get("publish_time", ""))
        fields.publish_ori_name = item.get("publish_site") or item.get(
            "author", "")
        fields.abstract = item.get("abstract", "")
        fields.tags = item.get("keywords", "")
        fields.html = item.get("html", "")
        if item.get("thumb"):
            fields.thumbs.append(item["thumb"])
        middle["list_fields"] = fields.to_dict()
        middle["pages"] = [{"url": item["url"], "html": ""}]
        middle["unique"] = item["url"]  # 以 url 作为唯一性约束,避免重复抓取 TODO: 归一化url
        middle["procedure"] = PROCEDURE_LIST_TASK
        try:
            r = db.v1_request.insert_one(middle)  # FIXME: handle insert failures
        except DuplicateKeyError:
            print "DuplicateKeyError"
        except Exception as e:
            print e
        else:
            print "MONGO Insert Success"
            ids.append(str(r.inserted_id))

    next_key = "v1:spider:task:download:id"
    if not ids:
        return
    if isinstance(ids, list):
        redis.sadd(next_key, *ids)
        print "REDIS Add Success"
    elif id:
        redis.sadd(next_key, ids)
        print "REDIS Add Success"
    else:
        print "REDIS Add Faild"
Example #3
def video_gifcool_parser(url):
    # http://www.gifcool.com/xsp/
    def get_like_dislike(id):
        url = "http://www.gifcool.com/plus/digg_ajax_index.php?id=%s" % id
        content = http.download_html(url=url)
        n_like = int(num_like_config.findall(content)[0])
        n_dislike = int(num_dislike_config.findall(content)[0])
        return n_like, n_dislike

    detail_url_config = {
        "params": {
            "selector": "div.title  a"
        },
        "method": "select"
    }
    title_config = {"params": {"selector": "div.title  a"}, "method": "select"}
    publish_time_config = {
        "params": {
            "selector": "span.g9.ml50"
        },
        "method": "select"
    }
    src_config = {"params": {"selector": "video"}, "method": "select"}
    cover_config = {"params": {"selector": "video"}, "method": "select"}
    num_like_config = re.compile(r'<i class="up"></i>(\d+)<s>')
    num_dislike_config = re.compile(r'<i class="down"></i>(\d+)<s>')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.main > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_ori_name = "姐夫酷"
        video.publish_ori_icon = None
        video.publish_time = get_tag_attribute(soup, publish_time_config,
                                               "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.src = get_tag_attribute(tag, src_config, "src")
        video.thumbnail = get_tag_attribute(tag, cover_config, "poster")
        video.thumbnail = urljoin(url, video.thumbnail)
        vid = video.publish_ori_url.split("/")[-1].replace(".html", "")  # e.g. ".../12345.html" -> "12345"
        n_like, n_dislike = get_like_dislike(vid)
        video.n_like = n_like
        video.n_dislike = n_dislike
        videos.append(video)
        sleep(0.2)
    return videos
Example #4
def joke_biedoul_parser(url):
    title_config = {
        "params": {
            "selector": "div.dz-list-con > a > p"
        },
        "method": "select"
    }
    text_config = {
        "params": {
            "selector": "div.dz-list-con > p"
        },
        "method": "select"
    }
    user_config = {
        "params": {
            "selector": "div.dz-username > a"
        },
        "method": "select"
    }
    user_icon_config = {
        "params": {
            "selector": "div.user-portrait > img.avatar"
        },
        "method": "select"
    }
    like_config = {"params": {"selector": "a.zanUp"}, "method": "select"}
    dislike_config = {"params": {"selector": "a.zanDown"}, "method": "select"}
    pb_time_config = {
        "params": {
            "selector": "div.dz-username > span"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div.lcommon.dz-bg > div")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.title = get_tag_attribute(tag, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #5
def run_detail_task(_id):
    """ 解析详情页(非耗时任务) 
    
    :param _id: COL_REQUESTS 表 _id
    :type _id: str
    :return: COL_REQUESTS 表 _id
    :rtype: str
    """
    query = {"_id": ObjectId(_id)}
    collection = db[COL_REQUESTS]
    request = collection.find_one(query)
    pages = request["pages"]
    if request["form"] == FORM_NEWS:
        news = NewsFields()
    elif request["form"] == FORM_ATLAS:
        news = AtlasFields()
    else:
        raise NotSupportError("run detail task not support %s" %
                              request["form"])
    url, html = pages[0]["url"], pages[0]["html"]
    result = DetailParser(url=url, document=html)
    if not result["support"]:
        logging.error("Detail parse error(domain not support): %s" % _id)
        update = {"$set": {"procedure": PROCEDURE_DETAIL_NOT_SUPPORT_DOMAIN}}
    elif result["missing"]:
        logging.warning("Detail parse warn(miss some fields): %s" % _id)
        update = {"$set": {"procedure": PROCEDURE_DETAIL_MISS_FIELD}}
    else:
        news.title = result["title"]
        news.publish_time = format_datetime_string(result["date"])
        news.publish_ori_name = result["source"] or result["author"]
        if result["summary"]:
            news.abstract = result["summary"]
        if result["tags"]:
            news.tags = result["tags"]
        news.content = result["content"]
        news.publish_ori_url = url
        for page in request["pages"][1:]:
            result = DetailParser(url=page["url"], document=page["html"])
            news.content.extend(result["content"])
        update = {
            "$set": {
                "procedure": PROCEDURE_DETAIL_TASK,
                "fields": news.to_dict()
            }
        }
    collection.update_one(query, update=update)
    return _id if update["$set"]["procedure"] == PROCEDURE_DETAIL_TASK else None
Example #6
def joke_neihan_parser(url):
    document = http.download_json(url=url)
    groups = document["data"]["data"]
    jokes = list()
    for g in groups:
        g = g["group"]
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["name"]
        joke.publish_ori_icon = g["user"]["avatar_url"]
        joke.publish_time = format_datetime_string(g["create_time"])
        joke.text = g["text"]
        joke.n_comment = int(g["comment_count"])
        joke.n_like = int(g["digg_count"])
        joke.n_dislike = int(g["bury_count"])
        # _comment_need = g["code"]  # 评论需要该字段
        jokes.append(joke)
    return jokes
Example #7
def video_yingtu_parser(url):
    # https://app.yingtu.co/v1/interaction/topic/video/list  [post]
    # {"data":{"topicId":"861232236534439936","pageId":0},"userId":"1501646183777","source":"h5"}:
    def download_this(url):
        import requests
        from urlparse import urlparse
        from urlparse import parse_qs
        a = urlparse(url)
        query_field = parse_qs(a.query)
        tid = query_field["topicId"][0]
        uid = query_field["userId"][0]
        params = '{"data":{"topicId":"%s","pageId":0},"userId":"%s","source":"h5"}'
        params = params % (tid, uid)
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        url_base = "https://app.yingtu.co/v1/interaction/topic/video/list"
        resp = requests.post(url=url_base, data=params, headers=headers)
        return resp.json()

    def format_duration(d_text):
        # Convert a "HH:MM:SS" or "MM:SS" duration string into seconds,
        # e.g. "01:02:03" -> 3723 and "02:15" -> 135.
        result = 0
        for part in d_text.split(":"):
            result = result * 60 + int(part)
        return result

    json_data = download_this(url)
    item_list = json_data["data"].get("videoList", [])
    videos = list()
    for item in item_list:
        video = VideoFields()
        video.title = item["videoName"]
        video.publish_ori_name = item["creatorName"]
        video.publish_ori_url = item["videoPlayUrl"]
        video.thumbnail = item["videoCoverUrl"]
        video.duration = item["videoDuration"]
        video.duration = format_duration(video.duration)
        video.src = item["videoPlayUrl"]
        video.publish_time = format_datetime_string(item['createTime'])
        video.n_read = int(item["videoPlayCount"])
        video.n_repost = int(item["videoShareCount"])
        video.n_like = int(item["videoFavorCount"])
        videos.append(video)
    return videos
Example #8
def joke_fun48_parser(url):
    def get_full_content(ori_url):
        text_config = {
            "params": {
                "selector": "article.article"
            },
            "method": "select"
        }
        document = http.download_html(url=ori_url)
        soup = BeautifulSoup(document, "lxml")
        text = get_tag_attribute(soup, text_config, "text")
        return text

    title_config = {
        "params": {
            "selector": "div.texttitle > a"
        },
        "method": "select"
    }
    ori_url_config = {
        "params": {
            "selector": "div.texttitle > a"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "div.card-info"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#isonormal > div")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_url = get_tag_attribute(tag, ori_url_config, "href")
        joke.text = get_full_content(joke.publish_ori_url)
        joke.title = get_tag_attribute(tag, title_config, "text")
        joke.text = joke.text.strip("[...]")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #9
def video_ifeng_parser(url):
    # http://v.ifeng.com/vlist/channel/85/showData/first_more.js
    body = http.download_html(url=url)[10:-2]
    detail_url_config = {"params": {"selector": "a"}, "method": "select"}
    video_info_re = re.compile(r"var videoinfo =(.*?);", re.S)
    video_src_re = re.compile(r'"gqSrc":"(.*?)"')
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="ul > li")
    videos = list()

    def get_detail_content(detail_url):
        detail_html = http.download_html(url=detail_url)
        video_info = video_info_re.findall(detail_html)[0]
        video_info = video_info.replace("'", '"')
        video_json = json.loads(video_info)
        return video_json

    def get_video_src(id):
        video_info_url = "http://tv.ifeng.com/h6/{}_/video.json".format(id)
        v_content = http.download_html(url=video_info_url)
        result = video_src_re.findall(v_content)
        print result
        if result:
            return result[0]
        else:
            return None

    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        detail_info = get_detail_content(video.publish_ori_url)
        video.title = detail_info["name"]
        video.publish_time = detail_info["createdate"]
        video.publish_time = format_datetime_string(video.publish_time)
        video.tags = ";".join(detail_info["keywords"].split())
        video.publish_ori_name = "凤凰视频"
        video.publish_ori_icon = None
        video.thumbnail = detail_info["videoLargePoster"]
        video.duration = int(detail_info["duration"])
        id = detail_info["id"]
        video.src = get_video_src(id)
        videos.append(video)
        sleep(0.2)
    return videos
Example #10
def joke_duanzidao_parser(url):
    text_config = {"params": {"selector": "div.article"}, "method": "select"}
    user_config = {
        "params": {
            "selector": "table.author td > ul > li > a"
        },
        "method": "select"
    }
    user_icon_config = {
        "params": {
            "selector": "td.avatar img"
        },
        "method": "select"
    }
    like_config = {
        "params": {
            "selector": "em.good-btn > span"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "em.bad-btn > span"
        },
        "method": "select"
    }
    pb_time_config = {"params": {"selector": "table"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#main > div.panel")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #11
def video_meipai_parser(url):
    documents = http.download_json(url=url)
    data = [doc["media"] for doc in documents if doc["type"] == "media"]
    videos = list()
    for item in data:
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["created_at"])
        video.publish_ori_url = item["url"]
        video.publish_ori_name = item["user"]["screen_name"]
        video.publish_ori_icon = item["user"]["avatar"]
        video.src = item["video"]
        video.thumbnail = item["cover_pic"]
        video.duration = int(item.get("time", 0))
        video.n_like = int(item.get("likes_count", 0))
        video.n_comment = int(item.get("comments_count", 0))
        video.n_repost = int(item.get("reposts_count", 0))
        video.tags = g_tags(video.title)
        videos.append(video)
    return videos
Example #12
def video_kuaishou_parser(url):
    documents = http.download_json(url=url)
    data = documents.get("feeds", [])
    videos = list()
    for item in data:
        urls = item.get("main_mv_urls")
        thumbs = item.get("cover_thumbnail_urls")
        avatars = item.get("headurls")
        if not (urls and thumbs and avatars):
            continue
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["timestamp"])
        video.publish_ori_name = item["user_name"]
        video.publish_ori_icon = avatars[0]["url"]  # headurls are the author's avatar images
        video.src = urls[0]["url"]
        video.thumbnail = thumbs[0]["url"]
        video.duration = int(item["ext_params"].get("video", 0) / 1000.0)
        videos.append(video)
    return videos
Example #13
def joke_caoegg_parser(url):
    text_config = {
        "params": {
            "selector": "div.c > a > span"
        },
        "method": "select"
    }
    like_config = {
        "params": {
            "selector": "div#dateright span.voteyes > font"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "div#dateright span.voteno > font"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "div#dateright"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#wrap_info > div.infobox")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.text = joke.text.strip("What a f*****g day!")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #14
def joke_nbsw_parser(url):
    text_config = {"params": {"selector": "div.ecae > p"}, "method": "select"}
    user_config = {"params": {"selector": "a.local-link"}, "method": "select"}
    user_icon_config = {
        "params": {
            "selector": "img.avatar"
        },
        "method": "select"
    }
    like_config = {"params": {"selector": "div.count-box"}, "method": "select"}
    comment_config = {
        "params": {
            "selector": "span.wppviews"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "span.meta > abbr"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="ul#postlist > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.text = joke.text.strip("[...]")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #15
def video_duowan_parser(url):
    detail_info_template = "http://video.duowan.com/jsapi/playPageVideoInfo/?vids={vid}"
    detail_url_config = {
        "params": {
            "selector": "a.uiVideo__ori"
        },
        "method": "select"
    }
    video_src_re = re.compile('<video src="(.*?)" id="video"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.uiVideo__item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        detail_url = get_tag_attribute(tag, detail_url_config, "href")
        vid = detail_url.split("/")[-1].strip(".html")
        m_detail_url = detail_url.replace(".com/", ".cn/")
        detail_json_url = detail_info_template.format(vid=vid)
        json_data = http.download_json(url=detail_json_url)
        video_info = json_data[vid]
        video.title = video_info["video_title"]
        video.n_comment = int(video_info["video_raw_comment_num"])
        video.n_read = video_info["video_raw_play_num"]
        video.n_like = int(video_info["video_raw_support"])
        video.tags = ";".join(video_info["video_tags"])
        video.publish_ori_name = video_info["user_nickname"]
        video.publish_ori_icon = video_info["user_avatar"]
        video.publish_time = format_datetime_string(
            video_info["video_upload_time"])
        video.publish_ori_url = video_info["video_url"]
        video.thumbnail = video_info["video_big_cover"]
        video.duration = int(video_info["video_raw_duration"])
        m_detail_content = http.download_html(url=m_detail_url)
        video.src = video_src_re.findall(m_detail_content)[0]
        videos.append(video)
        sleep(0.2)
    return videos
Example #16
def joke_helegehe_parser(url):
    text_config = {"params": {"selector": "a.contentHerf"}, "method": "select"}
    user_config = {"params": {"selector": "h2"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img"}, "method": "select"}
    like_config = {
        "params": {
            "selector": "a.output-leftSupport"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "a.output-leftOpposition"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "div.publishedIn"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="article.post")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #17
def joke_qiushi_parser(url):
    headers = {
        "User-Agent": "qiushibalke_10.8.1_WIFI_auto_19",
        "Source": "android_10.8.1",
        "Model": "Xiaomi/hydrogen/hydrogen:6.0.1/MMB29M/V7.5.6.0.MBCCNDE:user/release-keys",
        "Uuid": "IMEI_8728c26518fa3ae795a7f787073d375f",
        "Deviceidinfo": '{"DEVICEID": "862535037295724","SIMNO": "89860112817005617959","IMSI": "460012225499106","ANDROID_ID": "27dafccd6e32bfb2","SDK_INT": 23,"SERIAL"a882d7f9","MAC": "02:00:00:00:00:00","RANDOM": ""}'
    }
    req = http.Request(url=url, headers=headers)
    document = http.download_json(request=req)
    data = document["items"]
    jokes = list()
    for g in data:
        if not g.get("user"):
            continue
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["login"]
        avatar = g["user"].get("thumb")
        if not avatar:
            continue
        if avatar.startswith("//"):
            avatar = "http:" + avatar
        joke.publish_ori_icon = avatar
        joke.publish_time = format_datetime_string(g["created_at"])
        joke.text = g["content"]
        joke.n_comment = int(g.get("comments_count", 0))
        if g.get("votes"):
            joke.n_like = int(g["votes"]["up"])
            joke.n_dislike = int(g["votes"]["down"])
        jokes.append(joke)
    return jokes
Example #18
def video_acfun_parser(url):
    # http://www.acfun.cn/list/getlist?channelId=134&sort=0&pageSize=20&pageNo=1

    def get_video_src(vid):
        # Resolve the playable address for a video id.
        # NOTE: resolving the actual stream url from sourceType/encode is left
        # unimplemented here; the video id is returned as a placeholder.
        main_parse_url = "http://www.acfun.tv/video/getVideo.aspx?id=%s" % vid
        info = http.download_json(url=main_parse_url)
        sourceType = info['sourceType']
        if sourceType != 'zhuzhan':
            return []
        encode = info['encode']  # needed by the unfinished resolution step
        return vid

    json_data = http.download_json(url=url)
    item_list = json_data["data"]["data"]
    videos = list()
    for item in item_list:
        video = VideoFields()
        video.title = item["title"]
        video.n_comment = int(item["commentCount"])
        video.n_read = int(item["viewCountFormat"])
        video.n_like = None
        video.tags = None
        video.publish_ori_name = item["username"]
        video.publish_ori_icon = item["userAvatar"]
        video.publish_time = format_datetime_string(
            item["contributeTimeFormat"])
        video.publish_ori_url = urljoin(url, item["link"])
        video.thumbnail = item["coverImage"]
        video.duration = int(item["duration"])
        vid = item["videoId"]
        video.src = get_video_src(vid)
        videos.append(video)
        sleep(0.2)
    return videos
Example #19
def video_autohome_parser(url):
    body = http.download_html(url=url)
    autohome_vid_re = re.compile(r'vid=(.*?)&|vid: \"(.*?)\"')
    video_info_url_template = "http://p-vp.autohome.com.cn/api/gmi?mid={mid}&useragent=Android"
    title_config = {
        "params": {
            "selector": "div.video-item-tit > a"
        },
        "method": "select"
    }
    detail_config = {
        "params": {
            "selector": "div.video-item-tit > a"
        },
        "method": "select"
    }
    publish_time_config = {
        "params": {
            "selector": "div:nth-of-type(3) span:nth-of-type(3)"
        },
        "method": "select"
    }
    publish_name_config = {
        "params": {
            "selector": "a#author_nickName"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "img#author_headimageurl"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "span.videocom"
        },
        "method": "select"
    }
    read_config = {
        "params": {
            "selector": "span.count-eye"
        },
        "method": "select"
    }
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.video-item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_time = get_tag_attribute(tag, publish_time_config,
                                               "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.n_read = get_tag_attribute_int(tag, read_config, "text")
        detail_url = urljoin(url, get_tag_attribute(tag, detail_config,
                                                    "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            vid_one, vid_two = autohome_vid_re.findall(content)[0]
            vid = vid_one if vid_one else vid_two
            soup = BeautifulSoup(content, "lxml")
            ts = soup.select("div.card-label > a") or soup.select(
                "a.video-label")
            video.tags = ";".join(
                [extract_tag_attribute(t, "text") for t in ts])
            kinenames = ";".join([
                extract_tag_attribute(t, "text")
                for t in soup.select("a.kindname")
            ])
            if kinenames:
                video.tags += ";" + kinenames
            video.publish_ori_name = get_tag_attribute(soup,
                                                       publish_name_config,
                                                       "text")
            video.publish_ori_icon = get_tag_attribute(soup,
                                                       publish_icon_config,
                                                       "src")
            if video.publish_ori_icon:
                _u = urljoin(url, video.publish_ori_icon)
                video.publish_ori_icon = remove_url_query_params(_u)
        except Exception:
            continue
        info_url = video_info_url_template.format(mid=vid)
        try:
            req = http.Request(url=info_url)
            response = http.download(req)
            content = response.body[5:-1]
            info = json.loads(content)
        except Exception as e:
            try:
                content = response.body
                info = json.loads(content)
            except:
                continue
        if int(info["status"]) == 0:
            continue
        video.src = remove_url_query_params(info["copies"][-1]["playurl"])
        video.publish_ori_url = detail_url
        video.thumbnail = info["img"]
        video.duration = int(info["duration"])
        videos.append(video)
        sleep(0.2)
    return videos
Example #20
def joke_budejie_parser(url):
    text_config = {
        "params": {
            "selector": "div.j-r-list-c-desc > a"
        },
        "method": "select"
    }
    user_config = {"params": {"selector": "img.u-logo"}, "method": "select"}
    user_icon_config = {
        "params": {
            "selector": "img.u-logo"
        },
        "method": "select"
    }
    like_config = {
        "params": {
            "selector": "li.j-r-list-tool-l-up"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "li.j-r-list-tool-l-down"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "li.j-comment"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "span.u-time"
        },
        "method": "select"
    }
    repost_config = {
        "params": {
            "selector": "div.j-r-list-tool-ct-share-c"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div.j-r-list > ul > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "alt")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config,
                                                  "data-original")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        joke.n_repost = get_tag_attribute_int(tag, repost_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        jokes.append(joke)
    return jokes
Example #21
def run_list_task(_id, debug=False):
    """  列表页下载解析任务(耗时任务) 
    
    :param _id: thirdparty spider_config 表 _id
    :type _id: str
    :return: 新插入的 COL_REQUESTS 表的 _id 列表 
    :rtype: list of str
    """
    config = db[COL_CONFIGS].find_one({"_id": ObjectId(_id)})
    channel = db[COL_CHANNELS].find_one({"_id": ObjectId(config["channel"])})
    if channel["site"] == "585b6f3f3deaeb61dd2e288b":  # 百度参数需添加 ts 字段
        config["request"]["params"]["ts"] = [int(time.time())]
    elif channel["site"] == "5862342c3deaeb61dd2e2890":  # 号外参数需要添加 lastTime 字段
        config["request"]["params"]["lastTime"] = datetime.now().strftime(
            "%Y%m%d%H%M%S")
    elif channel["site"] == "5875f46e3deaeb61dd2e2898":  # umei.cc needs a timestamp to force a fresh page
        config["request"]["params"]["_"] = [int(time.time())]
    elif channel["site"] == "57a4092eda083a0e80a709c1" and config["channel"] \
            in ["594b9a07921e6d1615df7afb", "594b99b6921e6d1615df7af9",
                "594b9985921e6d1615df7af7", "594b9951921e6d1615df7af5",
                "594b98fd921e6d1615df7af3"]:  # 新浪热点新闻需要添加 top_time 字段
        config["request"]["params"]["top_time"] = datetime.now().strftime(
            "%Y%m%d")
    elif channel["site"] == "579ee39fda083a625d1f4ad5" and config[
            "crawler"] == "toutiaoapp":
        ms = tmsnow()
        s = ms / 1000
        config["request"]["params"]["_rticket"] = ms
        config["request"]["params"]["last_refresh_sub_entrance_interval"] = s
        config["request"]["params"]["min_behot_time"] = s - 7200
    req = request_from_config_request(config["request"])
    response = http.download(req)
    url, content = http.response_url_content(response)
    if channel["site"] == "5862342c3deaeb61dd2e2890":  # 号外列表页有下载
        result = parse_list_haowai(document=content, url=url)
    else:
        result = FeedParser(document=content,
                            crawler=config["crawler"],
                            url=url)
    if len(result) == 0:  # TODO: handle list-page parse failures
        logging.error("List parse error channel: %s config: %s" %
                      (config["channel"], _id))
        return None
    if debug:
        logging.info("List length: %s config: %s" % (len(result), _id))
        return result
    ids = list()
    for item in result:
        middle = _request_doc_from_config_channel(config, channel)
        fields = ListFields()
        fields.url = item["url"]
        fields.title = item.get("title", "")
        fields.publish_time = format_datetime_string(
            item.get("publish_time", ""))
        fields.publish_ori_name = item.get("publish_site") or item.get(
            "author", "")
        fields.abstract = item.get("abstract", "")
        fields.tags = item.get("keywords", "")
        fields.html = item.get("html", "")
        if item.get("thumb"):
            fields.thumbs.append(item["thumb"])
        comment_id = item.get("comment_id", "")
        if comment_id:  # build the comment crawl url for NetEase and Tiantian Kuaibao
            fields.comment = get_comment_url(channel["site"], comment_id)
        middle["list_fields"] = fields.to_dict()
        middle["pages"] = [{"url": item["url"], "html": ""}]
        middle["unique"] = item["url"]  # 以 url 作为唯一性约束,避免重复抓取 TODO: 归一化url
        middle["procedure"] = PROCEDURE_LIST_TASK
        try:
            r = db[COL_REQUESTS].insert_one(middle)  # FIXME: handle insert failures
        except DuplicateKeyError:
            pass
        except Exception as e:
            logging.error(e.message, exc_info=True)
        else:
            ids.append(str(r.inserted_id))
    return ids
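Because the `debug` branch returns the parsed items before anything is written to MongoDB or queued in Redis, it can be used to preview what a config yields. A hypothetical call, with a placeholder spider_config id:

items = run_list_task("<spider_config _id>", debug=True)
for item in items or []:
    print item["url"], item.get("title", "")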
Example #22
def video_pearvideo_parser(url):
    def format_duration(d_text):
        # Convert a "HH:MM:SS" or "MM:SS" duration string into seconds,
        # e.g. "01:02:03" -> 3723 and "02:15" -> 135.
        result = 0
        for part in d_text.split(":"):
            result = result * 60 + int(part)
        return result

    def get_detail_info(url):
        meta = {}
        content = http.download_html(url=url)
        soup = BeautifulSoup(content, "lxml")
        meta["src"] = src_re.findall(content)[0]
        meta["name"] = get_tag_attribute(soup, publish_name_config, "alt")
        meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
        meta["time"] = get_tag_attribute(soup, publish_time_config, "text")
        meta["thumbnail"] = get_tag_attribute(soup, cover_config, "src")
        return meta

    detail_url_config = {
        "params": {
            "selector": "a.vervideo-lilink"
        },
        "method": "select"
    }
    title_config = {
        "params": {
            "selector": "div.vervideo-title"
        },
        "method": "select"
    }
    duration_config = {
        "params": {
            "selector": "div.duration"
        },
        "method": "select"
    }
    num_like_config = {"params": {"selector": "span.fav"}, "method": "select"}
    publish_name_config = {
        "params": {
            "selector": "div.thiscat img"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "div.thiscat img"
        },
        "method": "select"
    }
    cover_config = {
        "params": {
            "selector": "div#poster img"
        },
        "method": "select"
    }
    publish_time_config = {
        "params": {
            "selector": "div.details-content div.date"
        },
        "method": "select"
    }
    src_re = re.compile('dUrl="(.*?)"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="li.categoryem ")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.duration = get_tag_attribute(tag, duration_config, "text")
        video.duration = format_duration(video.duration)
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        meta = get_detail_info(video.publish_ori_url)
        video.publish_ori_name = meta["name"]
        video.publish_ori_icon = meta["icon"]
        video.publish_time = meta["time"]
        video.publish_time = format_datetime_string(video.publish_time)
        video.thumbnail = meta["thumbnail"]
        video.src = meta["src"]
        videos.append(video)
        sleep(0.2)
    return videos