def test_hackernews(self):
    # Smoke test: fetch the Hacker News front page, run it through Parser
    # with the rule set assembled below, and print every parsed pair.
    # Nothing is asserted; the test only fails if fetching/parsing raises.
    from sasoup.baserules import (
        xpath, xpaths, xpathz, search, dpath, base, addon, fields, which,
    )

    # Intermediate field: the nodes matched by the page's table-row XPath,
    # stored under the name 'news' for the xpathz rules to refer to.
    row_fields = {
        'news': xpaths("//body/center/table/tr", evalx="result"),
    }

    # Final results. The xpathz entries name the 'news' field and carry a
    # relative XPath (presumably applied per 'news' node -- confirm in
    # sasoup.baserules).
    extractors = {
        'title': xpath("//span[@class='pagetop']/b/a"),
        'google': search(r'\>(Google.+?)\<'),
        'html5': search(r'\>(HTML5.+?)\<'),
        'facebook': xpath("//a[contains(text(),'Facebook')]"),
        'titles': xpathz(
            'news', xpaths(".//td[@class='title']/a")),
        'tsmps': xpathz(
            'news', xpaths(".//td[@class='subtext']/node()[4]")),
        'points': xpathz(
            'news', xpaths(".//td[@class='subtext']/span")),
    }

    # Post-filters for the three per-row results; the second tuple member
    # is an expression string handed to sasoup as-is.
    post_filters = {
        'titles': (None, "result[2]"),
        'tsmps': (None, "result[2]"),
        'points': (None, "result[2]"),
    }

    hacker_news = {
        'fields_rules': row_fields,
        'result_rules': extractors,
        'result_filters': post_filters,
    }

    url = 'https://news.ycombinator.com/'
    page = url_get(url)
    parsed = dict(Parser(page, hacker_news, 'utf-8').parse())
    for name, value in parsed.items():
        # Single parenthesised argument: prints the same text on Python 2.
        print('{} : {}'.format(name, value))
"shopUrl": which( xpath("//a[contains(@class, 'seller-name')]/@href"), xpath("//a[contains(@class,'enter-shop')]/@href") ), "itemImg": xpath("//img[@id='J_ImgBooth']/@data-src"), "itemTitle": fields("itemViewed", dpath("['title']")), "initPrice": fields("itemViewed", dpath("['price']")), "promoInfo": None, "postageInfo": None, "monthlyTrade": ajax("apiItemInfo", dpath("['quantity']['quanity']")), "itemRate": None, "bonus": None, "favNum": ajax("saveCounts", dpath("['{apiItemCollectsKey}']")), "totalSoldOut": None, "attrList": xpaths( "//ul[@class='attributes-list']/li", # evalx="re.split(u'[:\uff1a]', _strip(result.text))", evalx="result.text.partition(':' if result.text.partition(':')[0] < result.text.partition(u'\uff1a')[0] else u'\uff1a')[::2]", ), "reviewCount": ajax("saveCounts", dpath("['{apiItemViewsKey}']")), "starts": addon("starts"), "ends": addon("ends"), "userTag": None, "cid": None, "location": ajax("wholeSibUrl", dpath("['location']")), "brand": None, "gradeAvg": None, "peopleNum": None, "periodSoldQuantity": None, "rateTotal": None, "spuId": None, "totalSoldQuantity": None,
# encoding: utf8
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType

# Rule set for a live-blog style page.
#   fields_rules: 'items' holds the <div> children of every element whose
#                 id contains 'liveblog-entry-'.
#   result_rules: each entry names the 'items' field and carries an XPath
#                 relative to it (presumably applied per item -- confirm
#                 in sasoup.baserules).
appsorules = {
    'fields_rules': {
        'items': xpaths(
            "//div[contains(@id, 'liveblog-entry-')]/div",
            evalx="result",
        ),
    },
    'result_rules': {
        # Text of the <strong> in the first paragraph.
        'tsmp': xpathz(
            'items',
            xpaths(".//p[1]/strong", evalx="result.text"),
        ),
        # Text of the linked <strong> in the second paragraph.
        'title': xpathz(
            'items',
            xpaths(".//p[2]/a/strong", evalx="result.text"),
        ),
        # href of that same second-paragraph link.
        'link': xpathz(
            'items',
            xpaths(".//p[2]/a/@href"),
        ),
        # Text of the third and fourth paragraphs.
        'desc': xpathz(
            'items',
            xpaths(".//p[3]", evalx="result.text"),
        ),
        'intro': xpathz(
            'items',
            xpaths(".//p[4]", evalx="result.text"),
        ),
    },
}
# encoding: utf8
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, next, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

# Scraping rules for the page at 'url'.
# NOTE: `next` below is the rule constructor imported from sasoup.baserules;
# it shadows the Python builtin of the same name inside this module.
rules = {
    'url': 'http://www.etao.com',
    'fields_rules': {
        # Every <div> carrying an id attribute inside the J_FeedList block.
        'feedList': xpaths("//div[@id='J_FeedList']//div[@id]", evalx="result"),
    },
    'result_rules': {
        # Three rules over 'feedList': link title attribute, the <strong>
        # text inside the title link, and the description paragraph text.
        'feed': (
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/@title", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/strong/text()", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//div[@class='feed-desc']/p/text()", evalx="_strip(result)")),
        ),
        # Two-step rule: select the <li> children of the J_PCMain <div>,
        # then read the heading link text relative to each of them.
        # The predicate must close before the '/li' step; the earlier form
        # "[contains(@class,'J_PCMain')/li]" applied '/li' to the boolean
        # returned by contains(), which is not a valid node-set path.
        'cats': next(
            xpaths("//div[contains(@class,'J_PCMain')]/li", evalx="result"),
            xpath(".//h3/a/text()", evalx="result"),
        ),
    },
}
# Rebind the name to whatever init_rules returns for the raw rule dict.
rules = init_rules(rules)
# encoding: utf8 from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType """ jdrules: cat: 顶级类目 electrontics_cat: 家电通讯类目列表 electrontics_items: 家电通讯类目特价商品 digitals_cat: 电脑数码类目列表 digitals_items: 电脑数码类目特价商品 """ jdrules = { 'fields_rules': { 'catItems': xpaths("//div[@id='_JD_ALLSORT']/div", evalx="result"), 'cat1Cats': xpaths("//div[@id='electronics']/div[contains(@class,'catalogue')]//ul/li", evalx="result"), 'cat1Items': xpaths("//div[@id='electronics']/div[contains(@class,'plist')]/div[2]//li", evalx="result"), 'cat2Cats': xpaths("//div[@id='digitals']/div[contains(@class,'catalogue')]//ul/li", evalx="result"), 'cat2Items': xpaths("//div[@id='digitals']/div[contains(@class,'plist')]/div[2]//li", evalx="result"), }, 'result_rules': { 'cat': xpathz( 'catItems', xpaths(".//h3/a", evalx="result.text")), 'electrontics_cat': xpathz( 'cat1Cats', xpath(".//a", evalx="result.text")), 'electrontics_items': xpathz( 'cat1Items', xpaths("./div[@class='p-name']/a|./div[@class='p-price']/span", evalx="result.text")), 'digitals_cat': xpathz( 'cat2Cats',