def joke_khdx_parser(url):
    text_config = {"params": {"selector": "dd.content"}, "method": "select"}
    user_config = {"params": {"selector": "p.user > a"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img"}, "method": "select"}
    like_config = {"params": {"selector": "a.ding > div > i"}, "method": "select"}
    dislike_config = {"params": {"selector": "a.cai > div > i"}, "method": "select"}
    pb_time_config = {"params": {"selector": "span.fr"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="dl.main-list")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.publish_ori_icon = urljoin(url, joke.publish_ori_icon)
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
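# A minimal sketch of the get_tag_attribute / get_tag_attribute_int helpers
# that every parser in this module relies on, assuming they invoke
# config["method"] (e.g. BeautifulSoup's select) with config["params"] and
# read one attribute -- "text" meaning the node text -- from the first match.
# The real helpers live elsewhere in the project; this is an illustration,
# not their actual implementation.
def _get_tag_attribute_sketch(tag, config, attribute):
    matches = getattr(tag, config["method"])(**config["params"])
    if not matches:
        return ""
    if attribute == "text":
        return matches[0].get_text(strip=True)
    return matches[0].get(attribute, "")


def _get_tag_attribute_int_sketch(tag, config, attribute):
    raw = _get_tag_attribute_sketch(tag, config, attribute)
    digits = re.search(r"\d+", raw or "")
    return int(digits.group()) if digits else 0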
def general_list(list_page_info):
    def _request_doc_from_config_channel(config):
        doc = dict()
        doc["channel"] = config["channel"]
        doc["config"] = str(config["_id"])
        doc["form"] = config["form"]
        doc["site"] = config["site"]
        doc["time"] = utc_datetime_now()
        doc["fields"] = dict()
        doc["unique"] = ""
        doc["procedure"] = -1
        return doc

    content = http.download_html(url=list_page_info["url"])
    result = FeedParser(document=content,
                        crawler=list_page_info["crawler"],
                        url=list_page_info["url"])
    ids = list()
    for item in result:
        middle = _request_doc_from_config_channel(list_page_info)
        fields = ListFields()
        fields.url = item["url"]
        fields.title = item.get("title", "")
        fields.publish_time = format_datetime_string(item.get("publish_time", ""))
        fields.publish_ori_name = item.get("publish_site") or item.get("author", "")
        fields.abstract = item.get("abstract", "")
        fields.tags = item.get("keywords", "")
        fields.html = item.get("html", "")
        if item.get("thumb"):
            fields.thumbs.append(item["thumb"])
        middle["list_fields"] = fields.to_dict()
        middle["pages"] = [{"url": item["url"], "html": ""}]
        # Use the URL as the uniqueness constraint to avoid duplicate
        # crawls. TODO: normalize the URL first.
        middle["unique"] = item["url"]
        middle["procedure"] = PROCEDURE_LIST_TASK
        try:
            r = db.v1_request.insert_one(middle)  # FIXME: handle insert failure
        except DuplicateKeyError:
            print "DuplicateKeyError"
        except Exception as e:
            print e
        else:
            print "MONGO Insert Success"
            ids.append(str(r.inserted_id))

    if not ids:
        return
    next_key = "v1:spider:task:download:id"
    redis.sadd(next_key, *ids)
    print "REDIS Add Success"
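# The DuplicateKeyError handling in general_list only fires if MongoDB
# actually enforces uniqueness on the "unique" field. A sketch of the index
# setup this code appears to assume (where the real project creates it is
# not shown in this file):
def _ensure_unique_index_sketch():
    db.v1_request.create_index("unique", unique=True)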
def video_gifcool_parser(url):
    # http://www.gifcool.com/xsp/
    def get_like_dislike(vid):
        digg_url = "http://www.gifcool.com/plus/digg_ajax_index.php?id=%s" % vid
        content = http.download_html(url=digg_url)
        n_like = int(num_like_config.findall(content)[0])
        n_dislike = int(num_dislike_config.findall(content)[0])
        return n_like, n_dislike

    detail_url_config = {"params": {"selector": "div.title a"}, "method": "select"}
    title_config = {"params": {"selector": "div.title a"}, "method": "select"}
    publish_time_config = {"params": {"selector": "span.g9.ml50"}, "method": "select"}
    src_config = {"params": {"selector": "video"}, "method": "select"}
    cover_config = {"params": {"selector": "video"}, "method": "select"}
    num_like_config = re.compile(r'<i class="up"></i>(\d+)<s>')
    num_dislike_config = re.compile(r'<i class="down"></i>(\d+)<s>')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.main > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config, "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_ori_name = "姐夫酷"
        video.publish_ori_icon = None
        video.publish_time = get_tag_attribute(soup, publish_time_config, "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.src = get_tag_attribute(tag, src_config, "src")
        video.thumbnail = get_tag_attribute(tag, cover_config, "poster")
        video.thumbnail = urljoin(url, video.thumbnail)
        # Cut the ".html" extension explicitly; str.strip(".html") would
        # remove characters, not the suffix.
        vid = video.publish_ori_url.split("/")[-1]
        if vid.endswith(".html"):
            vid = vid[:-len(".html")]
        n_like, n_dislike = get_like_dislike(vid)
        video.n_like = n_like
        video.n_dislike = n_dislike
        videos.append(video)
        sleep(0.2)
    return videos
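# Note on the vid extraction above: str.strip() treats its argument as a
# character *set*, not a suffix, e.g.
#   "123luh.html".strip(".html")  ->  "123lu"
# because the trailing 'h' and 'l' of the name are eaten along with the
# extension. Hence the explicit endswith()/slice used here and in
# video_duowan_parser below.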
def joke_biedoul_parser(url):
    title_config = {"params": {"selector": "div.dz-list-con > a > p"}, "method": "select"}
    text_config = {"params": {"selector": "div.dz-list-con > p"}, "method": "select"}
    user_config = {"params": {"selector": "div.dz-username > a"}, "method": "select"}
    user_icon_config = {"params": {"selector": "div.user-portrait > img.avatar"}, "method": "select"}
    like_config = {"params": {"selector": "a.zanUp"}, "method": "select"}
    dislike_config = {"params": {"selector": "a.zanDown"}, "method": "select"}
    pb_time_config = {"params": {"selector": "div.dz-username > span"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div.lcommon.dz-bg > div")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.title = get_tag_attribute(tag, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def run_detail_task(_id):
    """
    Parse a detail page (non-time-consuming task).

    :param _id: _id of the COL_REQUESTS document
    :type _id: str
    :return: _id of the COL_REQUESTS document, or None on failure
    :rtype: str
    """
    query = {"_id": ObjectId(_id)}
    collection = db[COL_REQUESTS]
    request = collection.find_one(query)
    pages = request["pages"]
    if request["form"] == FORM_NEWS:
        news = NewsFields()
    elif request["form"] == FORM_ATLAS:
        news = AtlasFields()
    else:
        raise NotSupportError("run detail task not support %s" % request["form"])
    url, html = pages[0]["url"], pages[0]["html"]
    result = DetailParser(url=url, document=html)
    if not result["support"]:
        logging.error("Detail parse error(domain not support): %s" % _id)
        update = {"$set": {"procedure": PROCEDURE_DETAIL_NOT_SUPPORT_DOMAIN}}
    elif result["missing"]:
        logging.warning("Detail parse warn(miss some fields): %s" % _id)
        update = {"$set": {"procedure": PROCEDURE_DETAIL_MISS_FIELD}}
    else:
        news.title = result["title"]
        news.publish_time = format_datetime_string(result["date"])
        news.publish_ori_name = result["source"] or result["author"]
        if result["summary"]:
            news.abstract = result["summary"]
        if result["tags"]:
            news.tags = result["tags"]
        news.content = result["content"]
        news.publish_ori_url = url
        # Paginated articles: append the content of the remaining pages.
        for page in request["pages"][1:]:
            result = DetailParser(url=page["url"], document=page["html"])
            news.content.extend(result["content"])
        update = {
            "$set": {
                "procedure": PROCEDURE_DETAIL_TASK,
                "fields": news.to_dict()
            }
        }
    collection.update_one(query, update=update)
    return _id if update["$set"]["procedure"] == PROCEDURE_DETAIL_TASK else None
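# Sketch of the calling convention (an assumption based on the
# "v1:spider:task:download:id" set used by general_list; the actual worker
# wiring lives outside this file):
#
#   task_id = redis.spop("v1:spider:task:download:id")
#   if task_id and run_detail_task(task_id):
#       pass  # success: fields stored, procedure == PROCEDURE_DETAIL_TASK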
def joke_neihan_parser(url):
    document = http.download_json(url=url)
    groups = document["data"]["data"]
    jokes = list()
    for g in groups:
        g = g["group"]
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["name"]
        joke.publish_ori_icon = g["user"]["avatar_url"]
        joke.publish_time = format_datetime_string(g["create_time"])
        joke.text = g["text"]
        joke.n_comment = int(g["comment_count"])
        joke.n_like = int(g["digg_count"])
        joke.n_dislike = int(g["bury_count"])
        # _comment_need = g["code"]  # the comment API needs this field
        jokes.append(joke)
    return jokes
def video_yingtu_parser(url):
    # https://app.yingtu.co/v1/interaction/topic/video/list [post]
    # payload: {"data":{"topicId":"861232236534439936","pageId":0},"userId":"1501646183777","source":"h5"}
    def download_this(url):
        import requests
        from urlparse import urlparse
        from urlparse import parse_qs
        a = urlparse(url)
        query_field = parse_qs(a.query)
        tid = query_field["topicId"][0]
        uid = query_field["userId"][0]
        params = '{"data":{"topicId":"%s","pageId":0},"userId":"%s","source":"h5"}'
        params = params % (tid, uid)
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        url_base = "https://app.yingtu.co/v1/interaction/topic/video/list"
        resp = requests.post(url=url_base, data=params, headers=headers)
        return resp.json()

    def format_duration(d_text):
        # Convert "HH:MM:SS" / "MM:SS" to seconds, weighting the parts by
        # descending powers of 60. Zero parts must be kept: dropping them
        # shifts the weights (e.g. "01:00:30" would collapse to 90 seconds).
        duration = [int(x) for x in d_text.split(":")]
        length = len(duration)
        result = 0
        for i in range(length, 0, -1):
            result += duration[length - i] * pow(60, i - 1)
        return int(result)

    json_data = download_this(url)
    item_list = json_data["data"].get("videoList", [])
    videos = list()
    for item in item_list:
        video = VideoFields()
        video.title = item["videoName"]
        video.publish_ori_name = item["creatorName"]
        video.publish_ori_url = item["videoPlayUrl"]
        video.thumbnail = item["videoCoverUrl"]
        video.duration = item["videoDuration"]
        video.duration = format_duration(video.duration)
        video.src = item["videoPlayUrl"]
        video.publish_time = format_datetime_string(item['createTime'])
        video.n_read = int(item["videoPlayCount"])
        video.n_repost = int(item["videoShareCount"])
        video.n_like = int(item["videoFavorCount"])
        videos.append(video)
    return videos
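# Worked example for format_duration above:
#   "01:02:03" -> 1 * 60**2 + 2 * 60 + 3  = 3723 seconds
#   "01:00:30" -> 1 * 60**2 + 0 * 60 + 30 = 3630 seconds
# Keeping the zero parts is what holds the powers of 60 in place.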
def joke_fun48_parser(url):
    def get_full_content(ori_url):
        text_config = {"params": {"selector": "article.article"}, "method": "select"}
        document = http.download_html(url=ori_url)
        soup = BeautifulSoup(document, "lxml")
        text = get_tag_attribute(soup, text_config, "text")
        return text

    title_config = {"params": {"selector": "div.texttitle > a"}, "method": "select"}
    ori_url_config = {"params": {"selector": "div.texttitle > a"}, "method": "select"}
    pb_time_config = {"params": {"selector": "div.card-info"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#isonormal > div")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_url = get_tag_attribute(tag, ori_url_config, "href")
        joke.text = get_full_content(joke.publish_ori_url)
        joke.title = get_tag_attribute(tag, title_config, "text")
        # Cut the teaser suffix with endswith()/slice rather than str.strip,
        # which would also eat legitimate trailing dots and brackets.
        if joke.text.endswith("[...]"):
            joke.text = joke.text[:-len("[...]")]
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def video_ifeng_parser(url):
    # http://v.ifeng.com/vlist/channel/85/showData/first_more.js
    # The feed is a JS assignment; slice off the fixed-size wrapper to get
    # the embedded markup.
    body = http.download_html(url=url)[10:-2]
    detail_url_config = {"params": {"selector": "a"}, "method": "select"}
    video_info_re = re.compile(r"var videoinfo =(.*?);", re.S)
    video_src_re = re.compile(r'"gqSrc":"(.*?)"')
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="ul > li")
    videos = list()

    def get_detail_content(detail_url):
        detail_html = http.download_html(url=detail_url)
        video_info = video_info_re.findall(detail_html)[0]
        video_info = video_info.replace("'", '"')
        video_json = json.loads(video_info)
        return video_json

    def get_video_src(vid):
        video_info_url = "http://tv.ifeng.com/h6/{}_/video.json".format(vid)
        v_content = http.download_html(url=video_info_url)
        result = video_src_re.findall(v_content)
        if result:
            return result[0]
        return None

    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config, "href")
        detail_info = get_detail_content(video.publish_ori_url)
        video.title = detail_info["name"]
        video.publish_time = detail_info["createdate"]
        video.publish_time = format_datetime_string(video.publish_time)
        video.tags = ";".join(detail_info["keywords"].split())
        video.publish_ori_name = "凤凰视频"
        video.publish_ori_icon = None
        video.thumbnail = detail_info["videoLargePoster"]
        video.duration = int(detail_info["duration"])
        video.src = get_video_src(detail_info["id"])
        videos.append(video)
        sleep(0.2)
    return videos
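# A less brittle alternative to the fixed [10:-2] slice above (a sketch; the
# exact first_more.js wrapper text is an assumption -- something of the form
# `var xxx = "...";`):
def _strip_js_assignment_sketch(body):
    # Keep whatever sits between the first '=' and the last ';'.
    start = body.find("=") + 1
    end = body.rfind(";")
    if 0 < start < end:
        return body[start:end].strip().strip('"')
    return body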
def joke_duanzidao_parser(url):
    text_config = {"params": {"selector": "div.article"}, "method": "select"}
    user_config = {"params": {"selector": "table.author td > ul > li > a"}, "method": "select"}
    user_icon_config = {"params": {"selector": "td.avatar img"}, "method": "select"}
    like_config = {"params": {"selector": "em.good-btn > span"}, "method": "select"}
    dislike_config = {"params": {"selector": "em.bad-btn > span"}, "method": "select"}
    pb_time_config = {"params": {"selector": "table"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#main > div.panel")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def video_meipai_parser(url):
    documents = http.download_json(url=url)
    data = [doc["media"] for doc in documents if doc["type"] == "media"]
    videos = list()
    for item in data:
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["created_at"])
        video.publish_ori_url = item["url"]
        video.publish_ori_name = item["user"]["screen_name"]
        video.publish_ori_icon = item["user"]["avatar"]
        video.src = item["video"]
        video.thumbnail = item["cover_pic"]
        video.duration = int(item.get("time", 0))
        video.n_like = int(item.get("likes_count", 0))
        video.n_comment = int(item.get("comments_count", 0))
        video.n_repost = int(item.get("reposts_count", 0))
        video.tags = g_tags(video.title)
        videos.append(video)
    return videos
def video_kuaishou_parser(url):
    documents = http.download_json(url=url)
    data = documents.get("feeds", [])
    videos = list()
    for item in data:
        urls = item.get("main_mv_urls")
        thumbs = item.get("cover_thumbnail_urls")
        avatars = item.get("headurls")
        if not (urls and thumbs and avatars):
            continue
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["timestamp"])
        video.publish_ori_name = item["user_name"]
        # headurls are the uploader's avatars, so they belong on the icon field.
        video.publish_ori_icon = avatars[0]["url"]
        video.src = urls[0]["url"]
        video.thumbnail = thumbs[0]["url"]
        # ext_params.video carries the duration in milliseconds.
        video.duration = int(item["ext_params"].get("video", 0) / 1000.0)
        videos.append(video)
    return videos
def joke_caoegg_parser(url):
    text_config = {"params": {"selector": "div.c > a > span"}, "method": "select"}
    like_config = {"params": {"selector": "div#dateright span.voteyes > font"}, "method": "select"}
    dislike_config = {"params": {"selector": "div#dateright span.voteno > font"}, "method": "select"}
    pb_time_config = {"params": {"selector": "div#dateright"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#wrap_info > div.infobox")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.text = get_tag_attribute(tag, text_config, "text")
        # Remove the site's boilerplate tagline with replace(); str.strip
        # treats its argument as a character set and would chew real text.
        joke.text = joke.text.replace("What a f*****g day!", "").strip()
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def joke_nbsw_parser(url):
    text_config = {"params": {"selector": "div.ecae > p"}, "method": "select"}
    user_config = {"params": {"selector": "a.local-link"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img.avatar"}, "method": "select"}
    like_config = {"params": {"selector": "div.count-box"}, "method": "select"}
    comment_config = {"params": {"selector": "span.wppviews"}, "method": "select"}
    pb_time_config = {"params": {"selector": "span.meta > abbr"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="ul#postlist > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        # Cut the teaser suffix, if any (see the str.strip note above).
        if joke.text.endswith("[...]"):
            joke.text = joke.text[:-len("[...]")]
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def video_duowan_parser(url):
    detail_info_template = "http://video.duowan.com/jsapi/playPageVideoInfo/?vids={vid}"
    detail_url_config = {"params": {"selector": "a.uiVideo__ori"}, "method": "select"}
    video_src_re = re.compile('<video src="(.*?)" id="video"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.uiVideo__item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        detail_url = get_tag_attribute(tag, detail_url_config, "href")
        # Cut the ".html" extension explicitly (str.strip would remove
        # characters, not the suffix).
        vid = detail_url.split("/")[-1]
        if vid.endswith(".html"):
            vid = vid[:-len(".html")]
        # The mobile page (video.duowan.cn) embeds a plain <video> tag.
        m_detail_url = detail_url.replace(".com/", ".cn/")
        detail_json_url = detail_info_template.format(vid=vid)
        json_data = http.download_json(url=detail_json_url)
        video_info = json_data[vid]
        video.title = video_info["video_title"]
        video.n_comment = int(video_info["video_raw_comment_num"])
        video.n_read = video_info["video_raw_play_num"]
        video.n_like = int(video_info["video_raw_support"])
        video.tags = ";".join(video_info["video_tags"])
        video.publish_ori_name = video_info["user_nickname"]
        video.publish_ori_icon = video_info["user_avatar"]
        video.publish_time = format_datetime_string(video_info["video_upload_time"])
        video.publish_ori_url = video_info["video_url"]
        video.thumbnail = video_info["video_big_cover"]
        video.duration = int(video_info["video_raw_duration"])
        m_detail_content = http.download_html(url=m_detail_url)
        video.src = video_src_re.findall(m_detail_content)[0]
        videos.append(video)
        sleep(0.2)
    return videos
def joke_helegehe_parser(url):
    text_config = {"params": {"selector": "a.contentHerf"}, "method": "select"}
    user_config = {"params": {"selector": "h2"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img"}, "method": "select"}
    like_config = {"params": {"selector": "a.output-leftSupport"}, "method": "select"}
    dislike_config = {"params": {"selector": "a.output-leftOpposition"}, "method": "select"}
    pb_time_config = {"params": {"selector": "div.publishedIn"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="article.post")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def joke_qiushi_parser(url):
    headers = {
        "User-Agent": "qiushibalke_10.8.1_WIFI_auto_19",
        "Source": "android_10.8.1",
        "Model": "Xiaomi/hydrogen/hydrogen:6.0.1/MMB29M/V7.5.6.0.MBCCNDE:user/release-keys",
        "Uuid": "IMEI_8728c26518fa3ae795a7f787073d375f",
        "Deviceidinfo": ('{"DEVICEID": "862535037295724",'
                         '"SIMNO": "89860112817005617959",'
                         '"IMSI": "460012225499106",'
                         '"ANDROID_ID": "27dafccd6e32bfb2",'
                         '"SDK_INT": 23,'
                         '"SERIAL": "a882d7f9",'
                         '"MAC": "02:00:00:00:00:00",'
                         '"RANDOM": ""}'),
    }
    req = http.Request(url=url, headers=headers)
    document = http.download_json(request=req)
    data = document["items"]
    jokes = list()
    for g in data:
        if not g.get("user"):
            continue
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["login"]
        avatar = g["user"].get("thumb")
        if not avatar:
            continue
        if avatar.startswith("//"):
            avatar = "http:" + avatar
        joke.publish_ori_icon = avatar
        joke.publish_time = format_datetime_string(g["created_at"])
        joke.text = g["content"]
        joke.n_comment = int(g.get("comments_count", 0))
        if g.get("votes"):
            joke.n_like = int(g["votes"]["up"])
            joke.n_dislike = int(g["votes"]["down"])
        jokes.append(joke)
    return jokes
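# The "//" branch above normalizes protocol-relative avatar URLs, e.g.
#   "//pic.qiushibaike.com/system/avtnew/xxxx.jpg"
# becomes
#   "http://pic.qiushibaike.com/system/avtnew/xxxx.jpg"
# (host and path here are illustrative, not taken from a real response).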
def video_acfun_parser(url):
    # http://www.acfun.cn/list/getlist?channelId=134&sort=0&pageSize=20&pageNo=1
    def get_video_src(vid):
        # Resolve a playable address. NOTE: unfinished -- for "zhuzhan"
        # sources the encode info is fetched but never turned into a real
        # stream URL, so the function falls through and returns the bare vid.
        main_parse_url = "http://www.acfun.tv/video/getVideo.aspx?id=%s" % vid
        info = http.download_json(url=main_parse_url)
        source_type = info['sourceType']
        if source_type != 'zhuzhan':
            return []
        encode = info['encode']  # TODO: map the encode info to a stream URL
        return vid

    json_data = http.download_json(url=url)
    item_list = json_data["data"]["data"]
    videos = list()
    for item in item_list:
        video = VideoFields()
        video.title = item["title"]
        video.n_comment = int(item["commentCount"])
        video.n_read = int(item["viewCountFormat"])
        video.n_like = None
        video.tags = None
        video.publish_ori_name = item["username"]
        video.publish_ori_icon = item["userAvatar"]
        video.publish_time = format_datetime_string(item["contributeTimeFormat"])
        video.publish_ori_url = urljoin(url, item["link"])
        video.thumbnail = item["coverImage"]
        video.duration = int(item["duration"])
        video.src = get_video_src(item["videoId"])
        videos.append(video)
        sleep(0.2)
    return videos
def video_autohome_parser(url):
    body = http.download_html(url=url)
    autohome_vid_re = re.compile(r'vid=(.*?)&|vid: \"(.*?)\"')
    video_info_url_template = "http://p-vp.autohome.com.cn/api/gmi?mid={mid}&useragent=Android"
    title_config = {"params": {"selector": "div.video-item-tit > a"}, "method": "select"}
    detail_config = {"params": {"selector": "div.video-item-tit > a"}, "method": "select"}
    publish_time_config = {"params": {"selector": "div:nth-of-type(3) span:nth-of-type(3)"}, "method": "select"}
    publish_name_config = {"params": {"selector": "a#author_nickName"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "img#author_headimageurl"}, "method": "select"}
    comment_config = {"params": {"selector": "span.videocom"}, "method": "select"}
    read_config = {"params": {"selector": "span.count-eye"}, "method": "select"}
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.video-item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_time = get_tag_attribute(tag, publish_time_config, "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.n_read = get_tag_attribute_int(tag, read_config, "text")
        detail_url = urljoin(url, get_tag_attribute(tag, detail_config, "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            # The vid sits either in a query string or in inline JS; the
            # alternation fills exactly one of the two groups.
            vid_one, vid_two = autohome_vid_re.findall(content)[0]
            vid = vid_one if vid_one else vid_two
            soup = BeautifulSoup(content, "lxml")
            ts = soup.select("div.card-label > a") or soup.select("a.video-label")
            video.tags = ";".join([extract_tag_attribute(t, "text") for t in ts])
            kindnames = ";".join([
                extract_tag_attribute(t, "text")
                for t in soup.select("a.kindname")
            ])
            if kindnames:
                video.tags += ";" + kindnames
            video.publish_ori_name = get_tag_attribute(soup, publish_name_config, "text")
            video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config, "src")
            if video.publish_ori_icon:
                _u = urljoin(url, video.publish_ori_icon)
                video.publish_ori_icon = remove_url_query_params(_u)
        except Exception:
            continue
        info_url = video_info_url_template.format(mid=vid)
        try:
            req = http.Request(url=info_url)
            response = http.download(req)
        except Exception:
            continue
        # The gmi API may wrap the JSON in a short shell; try the stripped
        # body first, then the raw body.
        try:
            info = json.loads(response.body[5:-1])
        except Exception:
            try:
                info = json.loads(response.body)
            except Exception:
                continue
        if int(info["status"]) == 0:
            continue
        video.src = remove_url_query_params(info["copies"][-1]["playurl"])
        video.publish_ori_url = detail_url
        video.thumbnail = info["img"]
        video.duration = int(info["duration"])
        videos.append(video)
        sleep(0.2)
    return videos
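# findall() on an alternation of two groups returns 2-tuples with the
# unmatched side empty, e.g.:
#   autohome_vid_re.findall('vid=ABC123&')    ->  [('ABC123', '')]
#   autohome_vid_re.findall('vid: "XYZ789"')  ->  [('', 'XYZ789')]
# which is why the loop above picks whichever group is non-empty.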
def joke_budejie_parser(url):
    text_config = {"params": {"selector": "div.j-r-list-c-desc > a"}, "method": "select"}
    user_config = {"params": {"selector": "img.u-logo"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img.u-logo"}, "method": "select"}
    like_config = {"params": {"selector": "li.j-r-list-tool-l-up"}, "method": "select"}
    dislike_config = {"params": {"selector": "li.j-r-list-tool-l-down"}, "method": "select"}
    comment_config = {"params": {"selector": "li.j-comment"}, "method": "select"}
    pb_time_config = {"params": {"selector": "span.u-time"}, "method": "select"}
    repost_config = {"params": {"selector": "div.j-r-list-tool-ct-share-c"}, "method": "select"}
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div.j-r-list > ul > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "alt")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "data-original")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        joke.n_repost = get_tag_attribute_int(tag, repost_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        jokes.append(joke)
    return jokes
def run_list_task(_id, debug=False):
    """
    Download and parse a list page (time-consuming task).

    :param _id: _id of the thirdparty spider_config document
    :type _id: str
    :return: _ids of the documents newly inserted into COL_REQUESTS
    :rtype: list of str
    """
    config = db[COL_CONFIGS].find_one({"_id": ObjectId(_id)})
    channel = db[COL_CHANNELS].find_one({"_id": ObjectId(config["channel"])})
    if channel["site"] == "585b6f3f3deaeb61dd2e288b":
        # Baidu requests need a ts param.
        config["request"]["params"]["ts"] = [int(time.time())]
    elif channel["site"] == "5862342c3deaeb61dd2e2890":
        # Haowai requests need a lastTime param.
        config["request"]["params"]["lastTime"] = datetime.now().strftime("%Y%m%d%H%M%S")
    elif channel["site"] == "5875f46e3deaeb61dd2e2898":
        # umei.cc needs a timestamp to force a fresh page.
        config["request"]["params"]["_"] = [int(time.time())]
    elif channel["site"] == "57a4092eda083a0e80a709c1" and config["channel"] \
            in ["594b9a07921e6d1615df7afb", "594b99b6921e6d1615df7af9",
                "594b9985921e6d1615df7af7", "594b9951921e6d1615df7af5",
                "594b98fd921e6d1615df7af3"]:
        # Sina hot news needs a top_time param.
        config["request"]["params"]["top_time"] = datetime.now().strftime("%Y%m%d")
    elif channel["site"] == "579ee39fda083a625d1f4ad5" and config["crawler"] == "toutiaoapp":
        # Toutiao app feeds want millisecond/second timestamps.
        ms = tmsnow()
        s = ms / 1000
        config["request"]["params"]["_rticket"] = ms
        config["request"]["params"]["last_refresh_sub_entrance_interval"] = s
        config["request"]["params"]["min_behot_time"] = s - 7200
    req = request_from_config_request(config["request"])
    response = http.download(req)
    url, content = http.response_url_content(response)
    if channel["site"] == "5862342c3deaeb61dd2e2890":
        # Haowai list pages are parsed with a dedicated parser.
        result = parse_list_haowai(document=content, url=url)
    else:
        result = FeedParser(document=content, crawler=config["crawler"], url=url)
    if len(result) == 0:
        # TODO: handle list pages that fail to parse
        logging.error("List parse error channel: %s config: %s" % (config["channel"], _id))
        return None
    if debug:
        logging.info("List length: %s config: %s" % (len(result), _id))
        return result
    ids = list()
    for item in result:
        middle = _request_doc_from_config_channel(config, channel)
        fields = ListFields()
        fields.url = item["url"]
        fields.title = item.get("title", "")
        fields.publish_time = format_datetime_string(item.get("publish_time", ""))
        fields.publish_ori_name = item.get("publish_site") or item.get("author", "")
        fields.abstract = item.get("abstract", "")
        fields.tags = item.get("keywords", "")
        fields.html = item.get("html", "")
        if item.get("thumb"):
            fields.thumbs.append(item["thumb"])
        comment_id = item.get("comment_id", "")
        if comment_id:
            # Build the comment-crawl URL for NetEase and Tiantian Kuaibao.
            fields.comment = get_comment_url(channel["site"], comment_id)
        middle["list_fields"] = fields.to_dict()
        middle["pages"] = [{"url": item["url"], "html": ""}]
        # Use the URL as the uniqueness constraint to avoid duplicate
        # crawls. TODO: normalize the URL first.
        middle["unique"] = item["url"]
        middle["procedure"] = PROCEDURE_LIST_TASK
        try:
            r = db[COL_REQUESTS].insert_one(middle)  # FIXME: handle insert failure
        except DuplicateKeyError:
            pass
        except Exception as e:
            logging.error(e.message, exc_info=True)
        else:
            ids.append(str(r.inserted_id))
    return ids
def video_pearvideo_parser(url):
    def format_duration(d_text):
        # "MM:SS" / "HH:MM:SS" to seconds; keep zero parts so the powers of
        # 60 stay aligned (see video_yingtu_parser above).
        duration = [int(x) for x in d_text.split(":")]
        length = len(duration)
        result = 0
        for i in range(length, 0, -1):
            result += duration[length - i] * pow(60, i - 1)
        return int(result)

    def get_detail_info(detail_url):
        meta = {}
        content = http.download_html(url=detail_url)
        soup = BeautifulSoup(content, "lxml")
        meta["src"] = src_re.findall(content)[0]
        meta["name"] = get_tag_attribute(soup, publish_name_config, "alt")
        meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
        meta["time"] = get_tag_attribute(soup, publish_time_config, "text")
        meta["thumbnail"] = get_tag_attribute(soup, cover_config, "src")
        return meta

    detail_url_config = {"params": {"selector": "a.vervideo-lilink"}, "method": "select"}
    title_config = {"params": {"selector": "div.vervideo-title"}, "method": "select"}
    duration_config = {"params": {"selector": "div.duration"}, "method": "select"}
    num_like_config = {"params": {"selector": "span.fav"}, "method": "select"}
    publish_name_config = {"params": {"selector": "div.thiscat img"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "div.thiscat img"}, "method": "select"}
    cover_config = {"params": {"selector": "div#poster img"}, "method": "select"}
    publish_time_config = {"params": {"selector": "div.details-content div.date"}, "method": "select"}
    src_re = re.compile('dUrl="(.*?)"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="li.categoryem")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config, "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.duration = get_tag_attribute(tag, duration_config, "text")
        video.duration = format_duration(video.duration)
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        meta = get_detail_info(video.publish_ori_url)
        video.publish_ori_name = meta["name"]
        video.publish_ori_icon = meta["icon"]
        video.publish_time = meta["time"]
        video.publish_time = format_datetime_string(video.publish_time)
        video.thumbnail = meta["thumbnail"]
        video.src = meta["src"]
        videos.append(video)
        sleep(0.2)
    return videos
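# Illustration of the src_re extraction in video_pearvideo_parser, on a
# made-up detail-page fragment (the surrounding JS shape is an assumption
# based on the regex):
#   snippet = 'var contId="123",dUrl="http://video.pearvideo.com/a.mp4";'
#   src_re.findall(snippet)  ->  ['http://video.pearvideo.com/a.mp4']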