def joke_360wa_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    The page is fetched with ``http.download_html``; each element matching
    ``div#recent > div.p1`` yields one joke with a title, a text and a
    like count.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    title_config = _css("div.p_left > p.title1 > a")
    text_config = _css("div.p_left > p:nth-of-type(2)")
    like_config = _css("p.p_ding span")

    def _build(node):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_config, "text")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node)
            for node in page.select(selector="div#recent > div.p1")]
def joke_khdx_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``dl.main-list`` element yields one joke with publisher name and
    icon (icon resolved against ``url``), text, like/dislike counts and a
    publish time run through ``format_datetime_string``.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    text_config = _css("dd.content")
    user_config = _css("p.user > a")
    user_icon_config = _css("img")
    like_config = _css("a.ding > div > i")
    dislike_config = _css("a.cai > div > i")
    pb_time_config = _css("span.fr")

    def _build(node):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        icon = get_tag_attribute(node, user_icon_config, "src")
        item.publish_ori_icon = urljoin(url, icon)
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        raw_time = get_tag_attribute(node, pb_time_config, "text")
        item.publish_time = format_datetime_string(raw_time)
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node) for node in page.select(selector="dl.main-list")]
def get_video_src(id):
    """Return the first ``video_src_re`` match in a video's JSON info.

    Downloads ``http://tv.ifeng.com/h6/<id>_/video.json`` and searches the
    response with ``video_src_re`` (resolved from the surrounding scope).
    The list of matches is printed as a debugging aid.

    :param id: video identifier interpolated into the info URL.
    :return: the first match, or ``None`` when nothing matches.
    """
    info_url = "http://tv.ifeng.com/h6/{}_/video.json".format(id)
    matches = video_src_re.findall(http.download_html(url=info_url))
    # Single-argument form prints the same under the Python 2 print statement.
    print(matches)
    return matches[0] if matches else None
def general_list(list_page_info):
    """Crawl one list page, store its items and hand their ids to Redis.

    The page at ``list_page_info["url"]`` is downloaded and fed to
    ``FeedParser``. For every parsed item a request document is built and
    inserted into ``db.v1_request``; the ids of the documents that were
    inserted successfully are added to the Redis set
    ``v1:spider:task:download:id``. Progress and failures are printed.

    :param list_page_info: config mapping; the keys read here are ``url``,
        ``crawler``, ``channel``, ``_id``, ``form`` and ``site``.
    :return: ``None``.
    """

    def _request_doc_from_config_channel(config):
        # Skeleton request document carrying the channel configuration.
        doc = dict()
        doc["channel"] = config["channel"]
        doc["config"] = str(config["_id"])
        doc["form"] = config["form"]
        doc["site"] = config["site"]
        doc["time"] = utc_datetime_now()
        doc["fields"] = dict()
        doc["unique"] = ""
        doc["procedure"] = -1
        return doc

    content = http.download_html(url=list_page_info["url"])
    result = FeedParser(document=content,
                        crawler=list_page_info["crawler"],
                        url=list_page_info["url"])
    ids = list()
    for item in result:
        middle = _request_doc_from_config_channel(list_page_info)
        fields = ListFields()
        fields.url = item["url"]
        fields.title = item.get("title", "")
        fields.publish_time = format_datetime_string(
            item.get("publish_time", ""))
        fields.publish_ori_name = item.get("publish_site") or item.get(
            "author", "")
        fields.abstract = item.get("abstract", "")
        fields.tags = item.get("keywords", "")
        fields.html = item.get("html", "")
        if item.get("thumb"):
            fields.thumbs.append(item["thumb"])
        middle["list_fields"] = fields.to_dict()
        middle["pages"] = [{"url": item["url"], "html": ""}]
        # The url is the uniqueness constraint that prevents re-crawling.
        # TODO: normalize the url.
        middle["unique"] = item["url"]
        middle["procedure"] = PROCEDURE_LIST_TASK
        try:
            r = db.v1_request.insert_one(middle)  # FIXME: insert failure
        except DuplicateKeyError:
            print("DuplicateKeyError")
        except Exception as e:
            # Best effort: report the failure and move on to the next item.
            print(e)
        else:
            print("MONGO Insert Success")
            ids.append(str(r.inserted_id))

    if not ids:
        return
    # ``ids`` is always a non-empty list here. The former ``elif id:`` /
    # ``else`` branches tested the builtin ``id`` and could never run.
    next_key = "v1:spider:task:download:id"
    redis.sadd(next_key, *ids)
    print("REDIS Add Success")
def get_detail_info(url):
    """Collect source, publisher and cover details from a detail page.

    ``src_re`` and the ``*_config`` selectors are resolved from the
    surrounding scope.

    :param url: address of the detail page to download.
    :return: dict with the keys ``src``, ``name``, ``icon``, ``time`` and
        ``thumbnail``.
    :raises IndexError: when ``src_re`` finds no match in the page.
    """
    content = http.download_html(url=url)
    page = BeautifulSoup(content, "lxml")
    return {
        "src": src_re.findall(content)[0],
        "name": get_tag_attribute(page, publish_name_config, "alt"),
        "icon": get_tag_attribute(page, publish_icon_config, "src"),
        "time": get_tag_attribute(page, publish_time_config, "text"),
        "thumbnail": get_tag_attribute(page, cover_config, "src"),
    }
def get_full_content(ori_url):
    """Download ``ori_url`` and return the text of ``article.article``.

    :param ori_url: address of the page holding the full joke text.
    :return: whatever ``get_tag_attribute`` extracts for the ``text``
        attribute of the selected element.
    """
    article_config = {
        "params": {"selector": "article.article"},
        "method": "select",
    }
    page = BeautifulSoup(http.download_html(url=ori_url), "lxml")
    return get_tag_attribute(page, article_config, "text")
def video_gifcool_parser(url):  # http://www.gifcool.com/xsp/
    """Parse a gifcool video list page into ``VideoFields`` objects.

    Each ``div.main > ul > li`` element yields one video. Like/dislike
    counts come from an extra request per video, so the loop sleeps
    briefly between items.

    :param url: address of the list page to download.
    :return: list of ``VideoFields``, one per matched element.
    :raises IndexError: when the digg endpoint response lacks a counter.
    """
    html_suffix = ".html"

    def get_like_dislike(id):
        # Fetch the digg counters for one video id.
        digg_url = "http://www.gifcool.com/plus/digg_ajax_index.php?id=%s" % id
        content = http.download_html(url=digg_url)
        n_like = int(num_like_config.findall(content)[0])
        n_dislike = int(num_dislike_config.findall(content)[0])
        return n_like, n_dislike

    detail_url_config = {
        "params": {"selector": "div.title a"},
        "method": "select",
    }
    title_config = {"params": {"selector": "div.title a"}, "method": "select"}
    publish_time_config = {
        "params": {"selector": "span.g9.ml50"},
        "method": "select",
    }
    src_config = {"params": {"selector": "video"}, "method": "select"}
    cover_config = {"params": {"selector": "video"}, "method": "select"}
    num_like_config = re.compile(r'<i class="up"></i>(\d+)<s>')
    num_dislike_config = re.compile(r'<i class="down"></i>(\d+)<s>')

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.main > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_ori_name = "姐夫酷"
        video.publish_ori_icon = None
        # NOTE(review): this selects from the whole page, not from ``tag``,
        # so every video receives the same time -- confirm that is intended.
        video.publish_time = get_tag_attribute(soup, publish_time_config,
                                               "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.src = get_tag_attribute(tag, src_config, "src")
        video.thumbnail = get_tag_attribute(tag, cover_config, "poster")
        video.thumbnail = urljoin(url, video.thumbnail)
        # Remove the literal ".html" suffix. str.strip(".html") would strip
        # any of the characters '.', 'h', 't', 'm', 'l' from both ends.
        vid = video.publish_ori_url.split("/")[-1]
        if vid.endswith(html_suffix):
            vid = vid[:-len(html_suffix)]
        n_like, n_dislike = get_like_dislike(vid)
        video.n_like = n_like
        video.n_dislike = n_dislike
        videos.append(video)
        sleep(0.2)
    return videos
def video_duowan_parser(url):
    """Parse a duowan video list page into ``VideoFields`` objects.

    For every ``div.uiVideo__item`` element the video id is taken from the
    detail link, the metadata is read from the ``playPageVideoInfo`` JSON
    API, and the stream address is scraped from the ``.cn`` variant of the
    detail page.

    :param url: address of the list page to download.
    :return: list of ``VideoFields``, one per matched element.
    :raises KeyError: when the JSON API response lacks the video id or an
        expected field.
    :raises IndexError: when the detail page has no ``<video>`` source.
    """
    detail_info_template = "http://video.duowan.com/jsapi/playPageVideoInfo/?vids={vid}"
    html_suffix = ".html"
    detail_url_config = {
        "params": {"selector": "a.uiVideo__ori"},
        "method": "select",
    }
    video_src_re = re.compile('<video src="(.*?)" id="video"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.uiVideo__item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        detail_url = get_tag_attribute(tag, detail_url_config, "href")
        # Remove the literal ".html" suffix. str.strip(".html") would strip
        # any of the characters '.', 'h', 't', 'm', 'l' from both ends.
        vid = detail_url.split("/")[-1]
        if vid.endswith(html_suffix):
            vid = vid[:-len(html_suffix)]
        m_detail_url = detail_url.replace(".com/", ".cn/")
        detail_json_url = detail_info_template.format(vid=vid)
        jsond_data = http.download_json(url=detail_json_url)
        video_info = jsond_data[vid]
        video.title = video_info["video_title"]
        video.n_comment = int(video_info["video_raw_comment_num"])
        video.n_read = video_info["video_raw_play_num"]
        video.n_like = int(video_info["video_raw_support"])
        video.tags = ";".join(video_info["video_tags"])
        video.publish_ori_name = video_info["user_nickname"]
        video.publish_ori_icon = video_info["user_avatar"]
        video.publish_time = format_datetime_string(
            video_info["video_upload_time"])
        video.publish_ori_url = video_info["video_url"]
        video.thumbnail = video_info["video_big_cover"]
        video.duration = int(video_info["video_raw_duration"])
        m_detail_content = http.download_html(url=m_detail_url)
        video.src = video_src_re.findall(m_detail_content)[0]
        videos.append(video)
        sleep(0.2)
    return videos
def joke_pengfu_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div.list-item`` element yields one joke with title, publisher
    name and icon, text and comment/like/dislike counts.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    # Not used by the parsing below.
    id_config = {"method": "select", "attribute": "id"}
    title_config = _css("h1.dp-b > a")
    text_config = _css("div.content-img")
    user_config = _css("p.user_name_list > a")
    user_icon_config = _css("a.mem-header > img")
    like_config = _css("span.ding em")
    dislike_config = _css("span.cai em")
    comment_config = _css("span.commentClick em")

    def _build(node):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_config, "text")
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        item.publish_ori_icon = get_tag_attribute(node, user_icon_config,
                                                  "src")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_comment = get_tag_attribute_int(node, comment_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node) for node in page.select(selector="div.list-item")]
def joke_biedoul_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div.lcommon.dz-bg > div`` element yields one joke with title,
    publisher name and icon, text, like/dislike counts and publish time.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    title_config = _css("div.dz-list-con > a > p")
    text_config = _css("div.dz-list-con > p")
    user_config = _css("div.dz-username > a")
    user_icon_config = _css("div.user-portrait > img.avatar")
    like_config = _css("a.zanUp")
    dislike_config = _css("a.zanDown")
    pb_time_config = _css("div.dz-username > span")

    def _build(node):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_config, "text")
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        item.publish_ori_icon = get_tag_attribute(node, user_icon_config,
                                                  "src")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        raw_time = get_tag_attribute(node, pb_time_config, "text")
        item.publish_time = format_datetime_string(raw_time)
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node)
            for node in page.select(selector="div.lcommon.dz-bg > div")]
def joke_xiha_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Jokes are read from ``div.min > div.section`` elements. Their
    comment/like/dislike counters are then fetched in one batch request
    keyed by joke id; the temporary ``id`` attribute is removed again
    before returning.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    :raises KeyError: when the counter response has no entry for a joke.
    """

    def get_metas(ids):
        # One request for all ids; the response lists counters in id order.
        counter_url = "http://dg.xxhh.com/getcnums/?__jsonp__=fn&ids={ids}".format(
            ids=",".join(ids))
        payload = http.download_json(url=counter_url, skip=(3, -1))
        counters = dict()
        for position, row in enumerate(payload.get("d", [])):
            # (comment, like, dislike)
            triple = (int(row[0]), int(row[1]), int(row[2]))
            counters[ids[position]] = triple
        return counters

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    user_config = _css("div.user-info-username > a")
    user_icon_config = _css("div.user-avatar40 > a > img")
    text_config = _css("div.article > pre")
    id_config = _css("div.comment")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for node in page.select(selector="div.min > div.section"):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        item.publish_ori_icon = get_tag_attribute(node, user_icon_config,
                                                  "src")
        item.text = get_tag_attribute(node, text_config, "text")
        raw_id = get_tag_attribute(node, id_config, "id")
        # Temporary attribute: needed below to look up the counters.
        item.id = raw_id.replace("comment-", "")
        jokes.append(item)

    metas = get_metas([item.id for item in jokes])
    for item in jokes:
        item.n_comment, item.n_like, item.n_dislike = metas[item.id]
        del item.id
    return jokes
def joke_waduanzi_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div.post-item`` element yields one joke with title, publisher
    name, text and like/dislike counts. The publisher icon is not
    collected.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    title_config = _css("h2.item-title > a")
    text_config = _css("div.item-content")
    user_config = _css("div.post-author > a")
    like_config = _css("div.item-toolbar > ul > li:nth-of-type(1) > a")
    dislike_config = _css("div.item-toolbar > ul > li:nth-of-type(2) > a")

    def _build(node):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_config, "text")
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node) for node in page.select(selector="div.post-item")]
def joke_fun48_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div#isonormal > div`` element yields one joke. The joke text
    is downloaded from the linked detail page, and a leading or trailing
    ``[...]`` marker is removed from it.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """
    marker = "[...]"

    def get_full_content(ori_url):
        # Fetch the detail page and return the text of its article element.
        text_config = {
            "params": {"selector": "article.article"},
            "method": "select",
        }
        document = http.download_html(url=ori_url)
        soup = BeautifulSoup(document, "lxml")
        return get_tag_attribute(soup, text_config, "text")

    def _drop_marker(text):
        # Remove only the literal marker. str.strip("[...]") would strip
        # any leading/trailing '[', '.' or ']' characters, e.g. a real
        # trailing ellipsis.
        if not text:
            return text
        if text.startswith(marker):
            text = text[len(marker):]
        if text.endswith(marker):
            text = text[:-len(marker)]
        return text

    title_config = {
        "params": {"selector": "div.texttitle > a"},
        "method": "select",
    }
    ori_url_config = {
        "params": {"selector": "div.texttitle > a"},
        "method": "select",
    }
    pb_time_config = {
        "params": {"selector": "div.card-info"},
        "method": "select",
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#isonormal > div")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_url = get_tag_attribute(tag, ori_url_config, "href")
        joke.text = _drop_marker(get_full_content(joke.publish_ori_url))
        joke.title = get_tag_attribute(tag, title_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def video_ifeng_parser(url):
    # http://v.ifeng.com/vlist/channel/85/showData/first_more.js
    """Parse an ifeng video list script into ``VideoFields`` objects.

    The downloaded body is trimmed (first 10 and last 2 characters
    dropped) before being parsed as HTML. Each ``ul > li`` element links
    to a detail page whose embedded ``videoinfo`` object supplies the
    metadata; the stream address comes from a further JSON request.

    :param url: address of the list script to download.
    :return: list of ``VideoFields``, one per matched element.
    :raises IndexError: when a detail page has no ``videoinfo`` object.
    """
    trimmed = http.download_html(url=url)[10:-2]
    detail_url_config = {"params": {"selector": "a"}, "method": "select"}
    video_info_re = re.compile(r"var videoinfo =(.*?);", re.S)
    video_src_re = re.compile(r'"gqSrc":"(.*?)"')

    def get_detail_content(detail_url):
        # Pull the videoinfo literal out of the page and load it as JSON
        # after swapping single quotes for double quotes.
        page_html = http.download_html(url=detail_url)
        literal = video_info_re.findall(page_html)[0]
        return json.loads(literal.replace("'", '"'))

    def get_video_src(video_id):
        # First "gqSrc" value in the video's JSON info, or None.
        info_url = "http://tv.ifeng.com/h6/{}_/video.json".format(video_id)
        matches = video_src_re.findall(http.download_html(url=info_url))
        # Single-argument form prints the same under Python 2.
        print(matches)
        return matches[0] if matches else None

    videos = []
    for node in BeautifulSoup(trimmed, "lxml").select(selector="ul > li"):
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(node, detail_url_config,
                                                  "href")
        info = get_detail_content(video.publish_ori_url)
        video.title = info["name"]
        video.publish_time = format_datetime_string(info["createdate"])
        video.tags = ";".join(info["keywords"].split())
        video.publish_ori_name = "凤凰视频"
        video.publish_ori_icon = None
        video.thumbnail = info["videoLargePoster"]
        video.duration = int(info["duration"])
        video.src = get_video_src(info["id"])
        videos.append(video)
        sleep(0.2)
    return videos
def joke_duanzidao_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div#main > div.panel`` element yields one joke with publisher
    name and icon, text, like/dislike counts and publish time.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    text_config = _css("div.article")
    user_config = _css("table.author td > ul > li > a")
    user_icon_config = _css("td.avatar img")
    like_config = _css("em.good-btn > span")
    dislike_config = _css("em.bad-btn > span")
    pb_time_config = _css("table")

    def _build(node):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        item.publish_ori_icon = get_tag_attribute(node, user_icon_config,
                                                  "src")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        raw_time = get_tag_attribute(node, pb_time_config, "text")
        item.publish_time = format_datetime_string(raw_time)
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node)
            for node in page.select(selector="div#main > div.panel")]
def joke_3jy_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div#zb > div.xh`` element yields one joke with title, text,
    like/dislike counts and publisher name.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    title_config = _css("h2 > a")
    text_config = _css("div.c")
    user_config = _css("a.u_name")
    like_config = _css("p.zan")
    dislike_config = _css("p.bs")

    def _build(node):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_config, "text")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node) for node in page.select(selector="div#zb > div.xh")]
def joke_caoegg_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div#wrap_info > div.infobox`` element yields one joke with
    text, like/dislike counts and publish time. The phrase
    ``What a f*****g day!`` is removed from the start and end of the text.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """
    slogan = "What a f*****g day!"

    def _drop_slogan(text):
        # Remove the literal phrase only. str.strip(slogan) would treat the
        # argument as a character set and also eat any leading/trailing
        # letters of the joke that occur in the phrase.
        if not text:
            return text
        text = text.strip()
        if text.startswith(slogan):
            text = text[len(slogan):]
        if text.endswith(slogan):
            text = text[:-len(slogan)]
        return text.strip()

    text_config = {
        "params": {"selector": "div.c > a > span"},
        "method": "select",
    }
    like_config = {
        "params": {"selector": "div#dateright span.voteyes > font"},
        "method": "select",
    }
    dislike_config = {
        "params": {"selector": "div#dateright span.voteno > font"},
        "method": "select",
    }
    pb_time_config = {
        "params": {"selector": "div#dateright"},
        "method": "select",
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#wrap_info > div.infobox")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.text = _drop_slogan(get_tag_attribute(tag, text_config, "text"))
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def joke_nbsw_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``ul#postlist > li`` element yields one joke with publisher name
    and icon, text, like and comment counts and publish time. A leading
    or trailing ``[...]`` marker is removed from the text.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """
    marker = "[...]"

    def _drop_marker(text):
        # Remove only the literal marker. str.strip("[...]") would strip
        # any leading/trailing '[', '.' or ']' characters, e.g. a real
        # trailing ellipsis.
        if not text:
            return text
        if text.startswith(marker):
            text = text[len(marker):]
        if text.endswith(marker):
            text = text[:-len(marker)]
        return text

    text_config = {"params": {"selector": "div.ecae > p"}, "method": "select"}
    user_config = {"params": {"selector": "a.local-link"}, "method": "select"}
    user_icon_config = {
        "params": {"selector": "img.avatar"},
        "method": "select",
    }
    like_config = {"params": {"selector": "div.count-box"}, "method": "select"}
    comment_config = {
        "params": {"selector": "span.wppviews"},
        "method": "select",
    }
    pb_time_config = {
        "params": {"selector": "span.meta > abbr"},
        "method": "select",
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="ul#postlist > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = _drop_marker(get_tag_attribute(tag, text_config, "text"))
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def joke_helegehe_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``article.post`` element yields one joke with publisher name and
    icon, text, like/dislike counts and publish time.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    text_config = _css("a.contentHerf")
    user_config = _css("h2")
    user_icon_config = _css("img")
    like_config = _css("a.output-leftSupport")
    dislike_config = _css("a.output-leftOpposition")
    pb_time_config = _css("div.publishedIn")

    def _build(node):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, user_config, "text")
        item.publish_ori_icon = get_tag_attribute(node, user_icon_config,
                                                  "src")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        raw_time = get_tag_attribute(node, pb_time_config, "text")
        item.publish_time = format_datetime_string(raw_time)
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node) for node in page.select(selector="article.post")]
def video_thepaper_parser(url):
    """Parse a thepaper.cn video list page into ``VideoFields`` objects.

    Each ``.video_news`` element links to a detail page that is
    downloaded to find the mp4 source. Entries whose detail page cannot
    be fetched or has no mp4 source are skipped.

    :param url: address of the list page to download.
    :return: list of ``VideoFields`` for the entries that had a source.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    listing = http.download_html(url=url)
    thepaper_video_url_re = re.compile(r'source src="(.*?)" type="video/mp4"')
    detail_config = _css("a")
    title_config = _css("div.video_title")
    user_name_config = _css("div.t_source > a")
    thumbnail_config = _css("div.video_list_pic > img")
    user_icon_config = _css("div.video_txt_r_icon img")
    duration_config = _css("div.video_list_pic > span.p_time")
    comment_config = _css("div.t_source > span.reply")
    description_config = _css("p")

    videos = []
    for node in BeautifulSoup(listing, "lxml").select(selector=".video_news"):
        detail_url = urljoin("http://www.thepaper.cn/",
                             get_tag_attribute(node, detail_config, "href"))
        try:
            response = http.download(http.Request(url=detail_url))
            _, content = http.response_url_content(response)
            video_url = unquote_plus(
                thepaper_video_url_re.findall(content)[0])
        except Exception:
            # No usable detail page or mp4 source: skip this entry.
            continue

        video = VideoFields()
        video.title = get_tag_attribute(node, title_config, "text")
        video.src = video_url
        video.publish_ori_url = detail_url
        publisher = get_tag_attribute(node, user_name_config, "text")
        video.publish_ori_name = publisher.replace(u"@所有人", u"澎湃视频")
        video.thumbnail = get_tag_attribute(node, thumbnail_config, "src")
        video.n_comment = get_tag_attribute_int(node, comment_config, "text")
        video.description = get_tag_attribute(node, description_config,
                                              "text")

        clock = get_tag_attribute(node, duration_config, "text")
        if clock:
            # "mm:ss" -> seconds; duration is left unset if unparseable.
            try:
                minutes, seconds = clock.split(":")
                total = int(minutes) * 60 + int(seconds)
            except Exception:
                pass
            else:
                video.duration = total

        detail_page = BeautifulSoup(content, "lxml")
        video.publish_ori_icon = get_tag_attribute(detail_page,
                                                   user_icon_config, "src")
        videos.append(video)
    return videos
def get_detail_content(detail_url):
    """Return the ``videoinfo`` object embedded in a detail page.

    The first ``video_info_re`` match (pattern resolved from the
    surrounding scope) has its single quotes replaced by double quotes
    and is then decoded as JSON.

    :param detail_url: address of the detail page to download.
    :return: the decoded JSON value.
    :raises IndexError: when the pattern does not match the page.
    """
    page_html = http.download_html(url=detail_url)
    literal = video_info_re.findall(page_html)[0]
    return json.loads(literal.replace("'", '"'))
def joke_budejie_parser(url):
    """Parse the joke list page at ``url`` into ``JokeFields`` objects.

    Each ``div.j-r-list > ul > li`` element yields one joke with publisher
    name and icon (both read from the ``img.u-logo`` element), text,
    like/dislike/repost/comment counts and publish time.

    :param url: address of the list page to download.
    :return: list of ``JokeFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    text_config = _css("div.j-r-list-c-desc > a")
    user_config = _css("img.u-logo")
    user_icon_config = _css("img.u-logo")
    like_config = _css("li.j-r-list-tool-l-up")
    dislike_config = _css("li.j-r-list-tool-l-down")
    comment_config = _css("li.j-comment")
    pb_time_config = _css("span.u-time")
    repost_config = _css("div.j-r-list-tool-ct-share-c")

    def _build(node):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, user_config, "alt")
        item.publish_ori_icon = get_tag_attribute(node, user_icon_config,
                                                  "data-original")
        item.text = get_tag_attribute(node, text_config, "text")
        item.n_like = get_tag_attribute_int(node, like_config, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_config, "text")
        raw_time = get_tag_attribute(node, pb_time_config, "text")
        item.publish_time = format_datetime_string(raw_time)
        item.n_repost = get_tag_attribute_int(node, repost_config, "text")
        item.n_comment = get_tag_attribute_int(node, comment_config, "text")
        return item

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    return [_build(node)
            for node in page.select(selector="div.j-r-list > ul > li")]
def video_miaopai_parser(url):
    # Crawl the video list of one Miaopai account.
    """Parse a Miaopai account page into ``VideoFields`` objects.

    Each ``div.contentLeft > div.videoCont`` element yields one video.
    The stream and detail addresses are built from the ``data-scid``
    value found in the element's markup. Publisher name and icon are read
    from the page as a whole.

    :param url: address of the account page to download.
    :return: list of ``VideoFields``, one per matched element.
    :raises IndexError: when an element lacks ``data-scid`` or
        ``data-img``.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    body = http.download_html(url=url)
    video_url_template = "http://gslb.miaopai.com/stream/{id}.mp4"
    detail_url_template = "http://www.miaopai.com/show/{id}.htm"
    vid_re = re.compile('data-scid="(.*?)"')
    cover_re = re.compile('data-img="(.*?)"')
    title_config = _css("div.viedoAbout > p")
    publish_name_config = _css("p.personalDataN")
    publish_icon_config = _css("a.pic > img")
    read_config = _css("p.personalDataT > span.red")
    tag_config = _css("div.viedoAbout > p.orange")
    num_like_config = _css("ul.commentLike > li > a")
    num_comment_config = _css("ul.commentLike a.commentIco")

    page = BeautifulSoup(body, "lxml")
    videos = []
    for node in page.select(selector="div.contentLeft > div.videoCont"):
        video = VideoFields()
        markup = str(node)
        scid = vid_re.findall(markup)[0]
        video.title = get_tag_attribute(node, title_config, "text")
        video.n_comment = get_tag_attribute_int(node, num_comment_config,
                                                "text")
        video.n_read = get_tag_attribute_int(node, read_config, "text")
        video.n_like = get_tag_attribute_int(node, num_like_config, "text")
        # "#a #b" -> "a;b": split on '#', trim, drop empty pieces.
        raw_tags = get_tag_attribute(node, tag_config, "text")
        trimmed = [piece.strip() for piece in raw_tags.split("#")]
        video.tags = ";".join([piece for piece in trimmed if piece != ""])
        video.publish_ori_name = get_tag_attribute(page, publish_name_config,
                                                   "text")
        video.publish_ori_icon = get_tag_attribute(page, publish_icon_config,
                                                   "src")
        video.src = video_url_template.format(id=scid)
        video.publish_ori_url = detail_url_template.format(id=scid)
        video.thumbnail = cover_re.findall(markup)[0]
        videos.append(video)
        sleep(0.2)
    return videos
def video_budejie_parser(url):
    """Parse a budejie video list page into ``VideoFields`` objects.

    Each ``div.j-r-list > ul > li`` element yields one video. Publisher
    name, icon and publish time are read from the list item itself, as
    ``joke_budejie_parser`` does for the same list markup. Selecting them
    from the whole page gave every video the first entry's values.

    :param url: address of the list page to download.
    :return: list of ``VideoFields``, one per matched element.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    detail_url_config = _css("div.j-r-list-c-desc > a")
    title_config = _css("div.j-r-list-c-desc > a")
    publish_name_config = _css("div.u-txt > a")
    publish_icon_config = _css("div.u-img img")
    publish_time_config = _css("div.u-txt > span")
    src_config = _css("div.j-video-c > div.j-video")
    cover_config = _css("div.j-video-c > div.j-video")
    duration_config = _css("div.j-r-list-c > div.j-video-c")
    num_like_config = _css("li.j-r-list-tool-l-up > span")
    num_dislike_config = _css("li.j-r-list-tool-l-down > span")
    num_comment_config = _css("span.comment-counts")
    num_repost_config = _css("div.j-r-list-tool-ct-share-c > span")

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.j-r-list > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        # Per-item publisher data: select inside ``tag``, not the page.
        video.publish_ori_name = get_tag_attribute(tag, publish_name_config,
                                                   "text")
        video.publish_ori_icon = get_tag_attribute(tag, publish_icon_config,
                                                   "src")
        pb_time = get_tag_attribute(tag, publish_time_config, "text")
        # Normalized like every other parser in this module.
        video.publish_time = format_datetime_string(pb_time)
        video.src = get_tag_attribute(tag, src_config, "data-mp4")
        video.thumbnail = get_tag_attribute(tag, cover_config, "data-poster")
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        video.n_dislike = get_tag_attribute_int(tag, num_dislike_config,
                                                "text")
        video.n_comment = get_tag_attribute_int(tag, num_comment_config,
                                                "text")
        video.n_repost = get_tag_attribute_int(tag, num_repost_config, "text")
        # NOTE(review): stored as the raw attribute value, while other
        # parsers store an int -- confirm what consumers expect.
        video.duration = get_tag_attribute(tag, duration_config,
                                           "data-videoMlen")
        videos.append(video)
        sleep(0.2)
    return videos
def get_like_dislike(id):
    """Fetch the like and dislike counters of one gifcool video.

    ``num_like_config`` and ``num_dislike_config`` are compiled patterns
    resolved from the surrounding scope.

    :param id: video identifier interpolated into the digg URL.
    :return: ``(n_like, n_dislike)`` as ints.
    :raises IndexError: when a counter is missing from the response.
    """
    digg_url = "http://www.gifcool.com/plus/digg_ajax_index.php?id=%s" % id
    response_text = http.download_html(url=digg_url)
    likes = int(num_like_config.findall(response_text)[0])
    dislikes = int(num_dislike_config.findall(response_text)[0])
    return likes, dislikes
def video_autohome_parser(url):
    """Parse an autohome video list page into ``VideoFields`` objects.

    Each ``div.video-item`` element yields one video. The detail page
    supplies the video id, tags and publisher; a separate info endpoint
    supplies the stream address, cover and duration. Entries whose detail
    page or info payload cannot be obtained, or whose info ``status`` is
    0, are skipped.

    :param url: address of the list page to download.
    :return: list of ``VideoFields`` for the entries that were resolved.
    """

    def _css(selector):
        # Config dict in the shape this module passes to get_tag_attribute.
        return {"params": {"selector": selector}, "method": "select"}

    body = http.download_html(url=url)
    autohome_vid_re = re.compile(r'vid=(.*?)&|vid: \"(.*?)\"')
    video_info_url_template = "http://p-vp.autohome.com.cn/api/gmi?mid={mid}&useragent=Android"
    title_config = _css("div.video-item-tit > a")
    detail_config = _css("div.video-item-tit > a")
    publish_time_config = _css("div:nth-of-type(3) span:nth-of-type(3)")
    publish_name_config = _css("a#author_nickName")
    publish_icon_config = _css("img#author_headimageurl")
    comment_config = _css("span.videocom")
    read_config = _css("span.count-eye")

    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.video-item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_time = get_tag_attribute(tag, publish_time_config,
                                               "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.n_read = get_tag_attribute_int(tag, read_config, "text")
        detail_url = urljoin(url,
                             get_tag_attribute(tag, detail_config, "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            vid_one, vid_two = autohome_vid_re.findall(content)[0]
            vid = vid_one if vid_one else vid_two
            # Separate name so the list-page soup is not rebound mid-loop.
            detail_soup = BeautifulSoup(content, "lxml")
            ts = detail_soup.select("div.card-label > a") or \
                detail_soup.select("a.video-label")
            video.tags = ";".join(
                [extract_tag_attribute(t, "text") for t in ts])
            kinenames = ";".join([
                extract_tag_attribute(t, "text")
                for t in detail_soup.select("a.kindname")
            ])
            if kinenames:
                video.tags += ";" + kinenames
            video.publish_ori_name = get_tag_attribute(
                detail_soup, publish_name_config, "text")
            video.publish_ori_icon = get_tag_attribute(
                detail_soup, publish_icon_config, "src")
            if video.publish_ori_icon:
                _u = urljoin(url, video.publish_ori_icon)
                video.publish_ori_icon = remove_url_query_params(_u)
        except Exception:
            # Best effort: an unusable detail page just drops this entry.
            continue

        info_url = video_info_url_template.format(mid=vid)
        try:
            info_response = http.download(http.Request(url=info_url))
        except Exception:
            continue
        try:
            # The payload is normally wrapped: drop 5 leading characters
            # and 1 trailing character before decoding.
            info = json.loads(info_response.body[5:-1])
        except Exception:
            try:
                # Fall back to treating the body as bare JSON.
                info = json.loads(info_response.body)
            except Exception:
                continue
        if int(info["status"]) == 0:
            continue
        video.src = remove_url_query_params(info["copies"][-1]["playurl"])
        video.publish_ori_url = detail_url
        video.thumbnail = info["img"]
        video.duration = int(info["duration"])
        videos.append(video)
        sleep(0.2)
    return videos
def video_4399pk_parser(url):
    # http://joke.4399pk.com/video/find.html#
    """Parse a 4399pk video list page into ``VideoFields`` objects.

    Each ``div.piclist > ul > li`` element yields one video with detail
    URL, title, like count and a comment count fetched per video.
    Publisher name and icon are read from the list page as a whole.

    :param url: address of the list page to download.
    :return: list of ``VideoFields``, one per matched element.
    """

    def get_num_comment(id):
        # Comment count for one video id, from the wap JSON endpoint.
        n_comment_url = "http://joke.4399pk.com/wap/funnycourse-num-id-%s" % id
        content = http.download_json(url=n_comment_url)
        n_comment = content["msg"]["vcomment"]
        return int(n_comment)

    def get_wap_detail(video_id):
        # Not called yet. Uses its own argument instead of reading the
        # loop variable ``vid`` from the enclosing scope.
        # NOTE(review): the page is fetched with download_json but parsed
        # as HTML -- confirm which downloader is intended before use.
        meta = {}
        detail_wap = "http://joke.4399pk.com/wap/video-content-id-%s.html" % video_id
        content = http.download_json(detail_wap)
        soup = BeautifulSoup(content, "lxml")
        meta["name"] = get_tag_attribute(soup, publish_name_config, "text")
        meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
        return meta

    def get_video_inf(id):
        # Placeholder, not implemented.
        pass

    detail_url_config = {"params": {"selector": "a.img"}, "method": "select"}
    title_config = {"params": {"selector": "div.tit"}, "method": "select"}
    num_like_config = {
        "params": {"selector": "div.info > span.fr > em"},
        "method": "select",
    }
    publish_name_config = {
        "params": {"selector": "div.kind-user.cf > div.fl > p"},
        "method": "select",
    }
    publish_icon_config = {
        "params": {"selector": "div.kind-user.cf img"},
        "method": "select",
    }
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.piclist > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.title = get_tag_attribute(tag, title_config, "text")
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        # File name of the detail URL without its extension.
        vid = video.publish_ori_url.split("/")[-1].split(".")[0]
        video.n_comment = get_num_comment(vid)
        video.publish_ori_name = get_tag_attribute(soup, publish_name_config,
                                                   "text")
        video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config,
                                                   "src")
        videos.append(video)
        sleep(0.2)
    return videos
def video_pearvideo_parser(url):
    """Parse a pearvideo listing page into a list of VideoFields.

    Every ``li.categoryem`` entry on the listing page supplies the detail
    URL, title, duration and like count. The detail page of each video is
    then downloaded to obtain the playable source URL, the publisher name
    and icon, the publish time and the thumbnail.

    :param url: URL of the listing page to download.
    :return: list of populated VideoFields objects.
    :raises IndexError: if a detail page contains no ``dUrl="..."`` entry.
    """

    def format_duration(d_text):
        """Convert "SS", "MM:SS" or "HH:MM:SS" text into whole seconds.

        Returns 0 when the text is empty or missing.
        """
        if not d_text:
            return 0
        seconds = 0
        # Each component is worth 60 times the one to its right. Zero
        # components have to stay in place: dropping them would shift the
        # remaining parts into the wrong unit ("01:00" would become 1s).
        for part in d_text.split(":"):
            seconds = seconds * 60 + int(part)
        return seconds

    def get_detail_info(detail_url):
        """Download a video detail page and extract its metadata fields."""
        content = http.download_html(url=detail_url)
        detail_soup = BeautifulSoup(content, "lxml")
        return {
            # NOTE(review): raises IndexError when the page source holds no
            # dUrl="..." entry, which aborts the whole listing -- confirm
            # whether such videos should be skipped instead.
            "src": src_re.findall(content)[0],
            "name": get_tag_attribute(
                detail_soup, publish_name_config, "alt"),
            "icon": get_tag_attribute(
                detail_soup, publish_icon_config, "src"),
            "time": get_tag_attribute(
                detail_soup, publish_time_config, "text"),
            "thumbnail": get_tag_attribute(detail_soup, cover_config, "src"),
        }

    detail_url_config = {
        "params": {"selector": "a.vervideo-lilink"},
        "method": "select",
    }
    title_config = {
        "params": {"selector": "div.vervideo-title"},
        "method": "select",
    }
    duration_config = {
        "params": {"selector": "div.duration"},
        "method": "select",
    }
    num_like_config = {"params": {"selector": "span.fav"}, "method": "select"}
    # Publisher name and icon come from the same <img>: its "alt" attribute
    # carries the name and its "src" attribute the icon URL.
    publish_name_config = {
        "params": {"selector": "div.thiscat img"},
        "method": "select",
    }
    publish_icon_config = {
        "params": {"selector": "div.thiscat img"},
        "method": "select",
    }
    cover_config = {
        "params": {"selector": "div#poster img"},
        "method": "select",
    }
    publish_time_config = {
        "params": {"selector": "div.details-content div.date"},
        "method": "select",
    }
    src_re = re.compile('dUrl="(.*?)"')

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="li.categoryem ")

    videos = []
    for tag in tags:
        video = VideoFields()
        detail_href = get_tag_attribute(tag, detail_url_config, "href")
        # Listing links may be relative; resolve them against the page URL.
        video.publish_ori_url = urljoin(url, detail_href)
        video.title = get_tag_attribute(tag, title_config, "text")
        duration_text = get_tag_attribute(tag, duration_config, "text")
        video.duration = format_duration(duration_text)
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")

        meta = get_detail_info(video.publish_ori_url)
        video.publish_ori_name = meta["name"]
        video.publish_ori_icon = meta["icon"]
        video.publish_time = format_datetime_string(meta["time"])
        video.thumbnail = meta["thumbnail"]
        video.src = meta["src"]
        videos.append(video)
        # Short pause between detail-page downloads to throttle requests.
        sleep(0.2)
    return videos