Example #1
def video_zaker_parser(url):
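    # Download the Zaker feed JSON, then each article's detail JSON, and
    # build a VideoFields entry for each article.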
    document = http.download_json(url=url)
    data = document["data"].get("articles", [])
    videos = list()
    for item in data:
        detail_url = item["full_url"]
        req = http.Request(url=detail_url)
        try:
            response = http.download(req)
            doc = response.json()
        except Exception:
            continue
        detail = doc.get("data")
        if not detail:
            continue
        src = detail["video_info"]["url"]
        if src.endswith("m3u8"):
            src = src.replace("m3u8", "mp4")
        # Convert the "HH:MM:SS" / "MM:SS" label to seconds: the parts are
        # reversed so each index is the matching power of 60.
        label = detail["video_info"]["video_label"].split(":")[::-1]
        duration = 0
        for n, i in enumerate(label):
            duration += pow(60, n) * int(i)
        video = VideoFields()
        video.title = item["title"]
        video.publish_ori_name = item["auther_name"]
        video.publish_ori_url = item["weburl"]
        video.publish_ori_icon = detail["article_group"]["logo"]["url"]
        video.thumbnail = detail["video_info"]["pic_url"]
        video.duration = duration
        video.src = src
        videos.append(video)
    return videos
Example #2
def _download(url):
    # Download with a randomized browser User-Agent; fall back to the stable
    # downloader when that request fails.
    req = Request.from_random_browser(url=url)
    try:
        response = http.download(req)
        url, content = response_url_content(response)
    except Exception:
        url, content = http.stable_download_content(url)
    return url, content
Example #3
def post(self, *args, **kwargs):
    url = self.get_body_argument("url")
    refer = self.get_body_argument("refer", None)
    md5 = self.get_body_argument("md5", None)
    if md5 and len(md5) == 32:
        # A 32-character md5 marks an advertisement record: store it once and
        # ignore duplicates.
        document = {"url": url, "referer": refer, "online": True, "md5": md5}
        try:
            db[COL_ADVERTISEMENT].insert_one(document)
        except DuplicateKeyError:
            pass
        self.write({"code": 200})
    else:
        # Otherwise download the image (sending the referer header) and
        # return the processed upload result.
        req = http.Request(url=url, headers={"referer": refer})
        r = http.download(req)
        images = process_image_response(r, TestObjectUploader)
        if not images:
            images = dict()
        self.write(images)
Example #4
def video_autohome_parser(url):
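    # Download the Autohome video list page, then each item's detail page
    # (vid, tags, author) and the gmi info API (play URL, duration).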
    body = http.download_html(url=url)
    autohome_vid_re = re.compile(r'vid=(.*?)&|vid: \"(.*?)\"')
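    # The vid appears either as a query parameter (vid=...&) or as an inline
    # JS assignment (vid: "..."); the regex captures both forms.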
    video_info_url_template = "http://p-vp.autohome.com.cn/api/gmi?mid={mid}&useragent=Android"
    title_config = {
        "params": {
            "selector": "div.video-item-tit > a"
        },
        "method": "select"
    }
    detail_config = {
        "params": {
            "selector": "div.video-item-tit > a"
        },
        "method": "select"
    }
    publish_time_config = {
        "params": {
            "selector": "div:nth-of-type(3) span:nth-of-type(3)"
        },
        "method": "select"
    }
    publish_name_config = {
        "params": {
            "selector": "a#author_nickName"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "img#author_headimageurl"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "span.videocom"
        },
        "method": "select"
    }
    read_config = {
        "params": {
            "selector": "span.count-eye"
        },
        "method": "select"
    }
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.video-item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_time = get_tag_attribute(tag, publish_time_config,
                                               "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.n_read = get_tag_attribute_int(tag, read_config, "text")
        detail_url = urljoin(url, get_tag_attribute(tag, detail_config,
                                                    "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            vid_one, vid_two = autohome_vid_re.findall(content)[0]
            vid = vid_one if vid_one else vid_two
            soup = BeautifulSoup(content, "lxml")
            ts = soup.select("div.card-label > a") or soup.select(
                "a.video-label")
            video.tags = ";".join(
                [extract_tag_attribute(t, "text") for t in ts])
            kinenames = ";".join([
                extract_tag_attribute(t, "text")
                for t in soup.select("a.kindname")
            ])
            if kinenames:
                video.tags += ";" + kinenames
            video.publish_ori_name = get_tag_attribute(soup,
                                                       publish_name_config,
                                                       "text")
            video.publish_ori_icon = get_tag_attribute(soup,
                                                       publish_icon_config,
                                                       "src")
            if video.publish_ori_icon:
                _u = urljoin(url, video.publish_ori_icon)
                video.publish_ori_icon = remove_url_query_params(_u)
        except Exception:
            continue
        info_url = video_info_url_template.format(mid=vid)
        try:
            req = http.Request(url=info_url)
            response = http.download(req)
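            # The gmi response is not plain JSON: strip the 5-character prefix
            # and the trailing character before parsing.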
            content = response.body[5:-1]
            info = json.loads(content)
        except Exception:
            # Fall back to parsing the raw body in case it is plain JSON.
            try:
                content = response.body
                info = json.loads(content)
            except Exception:
                continue
        if int(info["status"]) == 0:
            continue
        video.src = remove_url_query_params(info["copies"][-1]["playurl"])
        video.publish_ori_url = detail_url
        video.thumbnail = info["img"]
        video.duration = int(info["duration"])
        videos.append(video)
        sleep(0.2)
    return videos
Example #5
def video_thepaper_parser(url):
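    # Download The Paper's video list page, then each item's detail page to
    # extract the mp4 source and the author icon.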
    body = http.download_html(url=url)
    thepaper_video_url_re = re.compile(r'source src="(.*?)" type="video/mp4"')
    detail_config = {"params": {"selector": "a"}, "method": "select"}
    title_config = {
        "params": {
            "selector": "div.video_title"
        },
        "method": "select"
    }
    user_name_config = {
        "params": {
            "selector": "div.t_source > a"
        },
        "method": "select"
    }
    thumbnail_config = {
        "params": {
            "selector": "div.video_list_pic > img"
        },
        "method": "select"
    }
    user_icon_config = {
        "params": {
            "selector": "div.video_txt_r_icon img"
        },
        "method": "select"
    }
    duration_config = {
        "params": {
            "selector": "div.video_list_pic > span.p_time"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "div.t_source > span.reply"
        },
        "method": "select"
    }
    description_config = {"params": {"selector": "p"}, "method": "select"}
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector=".video_news")
    videos = list()
    for tag in tags:
        url = urljoin("http://www.thepaper.cn/",
                      get_tag_attribute(tag, detail_config, "href"))
        try:
            req = http.Request(url=url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            video_url = unquote_plus(thepaper_video_url_re.findall(content)[0])
        except Exception:
            continue
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.src = video_url
        video.publish_ori_url = url
        video.publish_ori_name = get_tag_attribute(tag, user_name_config,
                                                   "text")
        # Replace the placeholder author "@所有人" ("@everyone") with
        # "澎湃视频" (The Paper Video).
        video.publish_ori_name = video.publish_ori_name.replace(
            u"@所有人", u"澎湃视频")
        video.thumbnail = get_tag_attribute(tag, thumbnail_config, "src")
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.description = get_tag_attribute(tag, description_config, "text")
        string = get_tag_attribute(tag, duration_config, "text")
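        # The duration label is "MM:SS"; convert it to seconds when present.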
        if string:
            try:
                m, s = string.split(":")
                second = int(m) * 60 + int(s)
            except Exception:
                pass
            else:
                video.duration = second
        detail = BeautifulSoup(content, "lxml")
        video.publish_ori_icon = get_tag_attribute(detail, user_icon_config,
                                                   "src")
        videos.append(video)
    return videos
Example #6
def run_list_task(_id, debug=False):
    """  列表页下载解析任务(耗时任务) 
    
    :param _id: thirdparty spider_config 表 _id
    :type _id: str
    :return: 新插入的 COL_REQUESTS 表的 _id 列表 
    :rtype: list of str
    """
    config = db[COL_CONFIGS].find_one({"_id": ObjectId(_id)})
    channel = db[COL_CHANNELS].find_one({"_id": ObjectId(config["channel"])})
    if channel["site"] == "585b6f3f3deaeb61dd2e288b":  # 百度参数需添加 ts 字段
        config["request"]["params"]["ts"] = [int(time.time())]
    elif channel["site"] == "5862342c3deaeb61dd2e2890":  # 号外参数需要添加 lastTime 字段
        config["request"]["params"]["lastTime"] = datetime.now().strftime(
            "%Y%m%d%H%M%S")
    # umei.cc needs a timestamp parameter so the page is refreshed
    elif channel["site"] == "5875f46e3deaeb61dd2e2898":
        config["request"]["params"]["_"] = [int(time.time())]
    elif channel["site"] == "57a4092eda083a0e80a709c1" and config["channel"] \
            in ["594b9a07921e6d1615df7afb", "594b99b6921e6d1615df7af9",
                "594b9985921e6d1615df7af7", "594b9951921e6d1615df7af5",
                "594b98fd921e6d1615df7af3"]:  # 新浪热点新闻需要添加 top_time 字段
        config["request"]["params"]["top_time"] = datetime.now().strftime(
            "%Y%m%d")
    elif channel["site"] == "579ee39fda083a625d1f4ad5" and config[
            "crawler"] == "toutiaoapp":
        ms = tmsnow()
        s = ms / 1000
        config["request"]["params"]["_rticket"] = ms
        config["request"]["params"]["last_refresh_sub_entrance_interval"] = s
        config["request"]["params"]["min_behot_time"] = s - 7200
    req = request_from_config_request(config["request"])
    response = http.download(req)
    url, content = http.response_url_content(response)
    if channel["site"] == "5862342c3deaeb61dd2e2890":  # 号外列表页有下载
        result = parse_list_haowai(document=content, url=url)
    else:
        result = FeedParser(document=content,
                            crawler=config["crawler"],
                            url=url)
    if len(result) == 0:  # TODO: handle list-page parse failure
        logging.error("List parse error channel: %s config: %s" %
                      (config["channel"], _id))
        return None
    if debug:
        logging.info("List length: %s config: %s" % (len(result), _id))
        return result
    ids = list()
    for item in result:
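        # Each parsed item becomes a pending request keyed by its url.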
        middle = _request_doc_from_config_channel(config, channel)
        fields = ListFields()
        fields.url = item["url"]
        fields.title = item.get("title", "")
        fields.publish_time = format_datetime_string(
            item.get("publish_time", ""))
        fields.publish_ori_name = item.get("publish_site") or item.get(
            "author", "")
        fields.abstract = item.get("abstract", "")
        fields.tags = item.get("keywords", "")
        fields.html = item.get("html", "")
        if item.get("thumb"):
            fields.thumbs.append(item["thumb"])
        comment_id = item.get("comment_id", "")
        if comment_id:  # comment-crawl URL for NetEase and Kuaibao
            fields.comment = get_comment_url(channel["site"], comment_id)
        middle["list_fields"] = fields.to_dict()
        middle["pages"] = [{"url": item["url"], "html": ""}]
        middle["unique"] = item["url"]  # 以 url 作为唯一性约束,避免重复抓取 TODO: 归一化url
        middle["procedure"] = PROCEDURE_LIST_TASK
        try:
            r = db[COL_REQUESTS].insert_one(middle)  # FIXME: insert failure
        except DuplicateKeyError:
            pass
        except Exception as e:
            logging.error(str(e), exc_info=True)
        else:
            ids.append(str(r.inserted_id))
    return ids