def get_article(url, mode=None): returnee = {} now = time.localtime() if not mode: agent = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/1A542a Safari/419.3" structure = requests.get(url, headers={"User-Agent": agent}, timeout=5.0) else: structure = mode charset = structure.encoding tree = html.fromstring(structure.text) body = tree.cssselect("div#ct")[0] title = body.cssselect("div.end_tt h2")[0] title.remove(title.cssselect("a")[0]) returnee["title"] = st.refine_text(html.tostring(title), encoding=charset) returnee["name"] = st.refine_text(html.tostring(body.cssselect("div.end_tt p span a")[0]), encoding=charset) date = datetime.datetime.now() try: date = DATE.parse(st.refine_text(html.tostring(body.cssselect("div.end_tt p span.s_tm")[0]), encoding=charset)) except Exception, e: pass
def get_article(url, mode=None):
    """Scrape a single blog post page and return its fields as a dict.

    Args:
        url: Address of the post. It is requested only when ``mode`` is
            falsy; everything after its last "/" is used as the post id.
        mode: Optional already-fetched response object exposing ``.text``
            and ``.encoding``. When truthy it is parsed instead of
            requesting ``url``.

    Returns:
        A dict with the keys ``title``, ``name``, ``date``, ``content``,
        ``images`` and ``post_id``.

    Raises:
        IndexError: if ``div.wrap_posting``, the title link,
            ``span.owner_info``, ``span.datetime`` or ``div.area_content``
            is missing from the page.
    """
    if not mode:
        agent = (
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) "
            "AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 "
            "Mobile/1A542a Safari/419.3"
        )
        structure = requests.get(url, headers={"User-Agent": agent}, timeout=5.0)
    else:
        structure = mode
    charset = structure.encoding
    tree = html.fromstring(structure.text)
    body = tree.cssselect("div.wrap_posting")[0]

    returnee = {}
    title_link = body.cssselect("div.area_tit h2 a")[0]
    returnee["title"] = st.refine_text(
        html.tostring(title_link, encoding=charset, method="text"))

    # The owner block mixes the author name with the date, separator bars and
    # category labels; strip everything but the name before reading its text.
    owner_info = body.cssselect("span.owner_info")[0]
    date = owner_info.cssselect("span.datetime")[0]
    # cssselect matches descendants at any depth, so always detach a node via
    # its actual parent: calling remove() on an ancestor that is not the
    # direct parent raises ValueError.
    date.getparent().remove(date)
    for selector in ("span.txt_bar", "span.category_info"):
        for node in owner_info.cssselect(selector):
            node.getparent().remove(node)
    returnee["name"] = st.refine_text(
        html.tostring(owner_info, encoding=charset, method="text"))
    returnee["date"] = DATE.parse(
        st.refine_text(html.tostring(date), encoding=charset))

    # Drop scripts, the writing section and the share widgets from the body
    # before serialising it.
    article = body.cssselect("div.area_content")[0]
    for selector in ("script", "div.section_writing", "div.sns"):
        for node in article.cssselect(selector):
            node.getparent().remove(node)

    # The decode/encode round trip discards byte sequences that are not valid
    # UTF-8 (presumably refine_text returns a byte string here).
    content = st.refine_text(html.tostring(article), encoding=charset)
    returnee["content"] = content.decode("utf8", "ignore").encode("utf8")
    returnee["images"] = get_images(article)
    returnee["post_id"] = url.rsplit("/", 1)[-1]
    return returnee
def _extract_post_id(url, page_text):
    """Return the post id taken from ``url``, else from the page's og:url tag.

    The id is the value of the ``articleno=`` parameter in the last path
    segment of ``url``. When that parameter is absent or empty, the last path
    segment of the ``og:url`` meta tag found in ``page_text`` is used instead.
    An empty string is returned when neither source yields an id.
    """
    last_segment = url.rsplit("/", 1)[-1]
    _, found, after = last_segment.partition("articleno=")
    # partition() keeps the whole remainder when no "&" follows, so an id at
    # the very end of the URL is not truncated.
    post_id = after.partition("&")[0] if found else ""
    if post_id:
        return post_id

    og_marker = "<meta property=\"og:url\" content=\""
    _, found, after = page_text.partition(og_marker)
    if not found:
        return ""
    canonical_url = after.partition("\"")[0]
    return canonical_url.rsplit("/", 1)[-1]


def get_article(url, mode=None):
    """Scrape a single blog post page and return its fields as a dict.

    Args:
        url: Address of the post. It is requested only when ``mode`` is
            falsy, and it is the first place the post id is looked for.
        mode: Optional already-fetched response object exposing ``.text``
            and ``.encoding``. When truthy it is parsed instead of
            requesting ``url``.

    Returns:
        A dict with the keys ``title``, ``name``, ``date``, ``content``,
        ``images`` and ``post_id``, or an empty dict when the page has no
        ``div#daumContent`` container.

    Raises:
        IndexError: if ``p.title``, ``span.nick``, ``span.date`` or
            ``div#article`` is missing inside the container.
    """
    returnee = {}
    if not mode:
        structure = requests.get(url, headers={"User-Agent": UserAgent}, timeout=5.0)
    else:
        structure = mode
    charset = structure.encoding
    tree = html.fromstring(structure.text)

    daum_content = tree.cssselect("div#daumContent")
    if not daum_content:
        return returnee
    body = daum_content[0]

    returnee["title"] = st.refine_text(
        html.tostring(body.cssselect("p.title")[0]), encoding=charset)
    returnee["name"] = st.refine_text(
        html.tostring(body.cssselect("span.nick")[0]), encoding=charset)
    returnee["date"] = DATE.parse(st.refine_text(
        html.tostring(body.cssselect("span.date")[0]), encoding=charset))

    # Strip the navigation bar and the related-articles box from the article
    # before serialising it.
    article = body.cssselect("div#article")[0]
    for selector in ("div.articleNavi", "div.relation_article"):
        for node in article.cssselect(selector):
            node.getparent().remove(node)

    # NOTE(review): unlike the fields above, the content is refined without
    # an explicit encoding -- confirm that this is intentional.
    returnee["content"] = st.refine_text(html.tostring(article))
    returnee["images"] = get_images(article)
    returnee["post_id"] = _extract_post_id(url, structure.text)
    return returnee
returnee["title"] = st.refine_text(html.tostring(title), encoding=charset) returnee["name"] = st.refine_text(html.tostring(body.cssselect("div.end_tt p span a")[0]), encoding=charset) date = datetime.datetime.now() try: date = DATE.parse(st.refine_text(html.tostring(body.cssselect("div.end_tt p span.s_tm")[0]), encoding=charset)) except Exception, e: pass returnee["date"] = date article = body.cssselect("div.post_tx")[0] article.remove(article.cssselect("span.ut_txt")[0]) returnee["content"] = st.refine_text(html.tostring(article), encoding=charset) returnee["images"] = get_images(article) returnee["post_id"] = url[url.rfind("/")+1:] return returnee def get_article_list(host, lp=None): import re returnee = [] flag, page = 1, 1 if host.find("http://m.") == -1: host = host.replace("http://", "http://m.")