def test_api():
    """Parse ``/news?p=1`` of news.ycombinator.com with ``with_ajax`` set to False.

    Two items, ``Post`` and ``Page``, are registered on an ``Api`` configured
    through a local ``Settings`` subclass. The test makes no assertions; it
    only requires the parse call to complete without raising.
    """

    class MySettings(Settings):
        # Web options handed to the Api; with_ajax is switched off here.
        web = {"with_ajax": False}

    class Post(Item):
        # Fields taken from the first "storylink" anchor under each source node.
        url = XPath('//a[@class="storylink"][1]/@href')
        title = XPath('//a[@class="storylink"][1]/text()')

        class Meta:
            # Each <tr class="athing"> row is a source node for this item.
            source = XPath('//tr[@class="athing"]')
            route = {'/all?page=:page': '/news?p=:page'}

    class Page(Item):
        # href of the "morelink" anchor.
        next_page = XPath('//a[@class="morelink"]/@href')

        class Meta:
            source = None
            route = {'/all?page=:page': '/news?p=:page'}

        def clean_next_page(self, next_page):
            """Return ``next_page`` as text, prefixed with the local server address."""
            link_text = str(next_page)
            return "http://127.0.0.1:5000/" + link_text

    api = Api('https://news.ycombinator.com/', settings=MySettings)
    # Registration order is kept: Post first, then Page.
    for item_cls in (Post, Page):
        api.register(item_cls)
    api.parse('/news?p=1')
def test_api_with_ajax():
    """Parse ``/news?p=1`` of news.ycombinator.com with ``with_ajax=True``.

    Registers a ``Post`` item and a ``Page`` item, parses the path and prints
    whatever ``api.parse`` returns. The test makes no assertions.
    """
    from toapi import XPath, Item, Api

    api = Api('https://news.ycombinator.com/', with_ajax=True)

    class Post(Item):
        # Fields taken from the first "storylink" anchor under each source node.
        url = XPath('//a[@class="storylink"][1]/@href')
        title = XPath('//a[@class="storylink"][1]/text()')

        class Meta:
            # Each <tr class="athing"> row is a source node for this item.
            source = XPath('//tr[@class="athing"]')
            # Raw string: "\?" and "\d" are regex escapes, not string escapes.
            # The runtime value is unchanged, but the invalid-escape warning
            # raised by the non-raw literal goes away.
            route = r'/news\?p=\d+'

    class Page(Item):
        # href of the "morelink" anchor.
        next_page = XPath('//a[@class="morelink"]/@href')

        class Meta:
            source = None
            route = r'/news\?p=\d+'

        def clean_next_page(self, next_page):
            """Return ``next_page`` as text, prefixed with the local server address."""
            # str() keeps the concatenation from raising TypeError when the
            # selector result is not a plain string (presumably possible when
            # no "morelink" anchor matched -- confirm against toapi).
            return "http://127.0.0.1:5000/" + str(next_page)

    api.register(Post)
    api.register(Page)
    print(api.parse('/news?p=1'))
def test_api_with_ajax():
    """Call ``api.parse('/news?p=1')`` on an ``Api`` built with ``with_ajax=True``.

    ``Post`` and ``Page`` are declared but not registered on the api before
    parsing. The test makes no assertions.
    """
    from toapi import XPath, Item, Api

    api = Api('https://news.ycombinator.com/', with_ajax=True)

    class Post(Item):
        # Fields taken from the first "storylink" anchor under each source node.
        url = XPath('//a[@class="storylink"][1]/@href')
        title = XPath('//a[@class="storylink"][1]/text()')

        class Meta:
            # Each <tr class="athing"> row is a source node for this item.
            source = XPath('//tr[@class="athing"]')
            # Raw string: "\?" and "\d" are regex escapes, not string escapes.
            # The runtime value is unchanged, but the invalid-escape warning
            # raised by the non-raw literal goes away.
            route = r'/news\?p=\d+'

    class Page(Item):
        # href of the "morelink" anchor.
        next_page = XPath('//a[@class="morelink"]/@href')

        class Meta:
            source = None
            route = r'/news\?p=\d+'

    # NOTE(review): neither Post nor Page is passed to api.register() here.
    # Left as is because it may be deliberate (parsing with no registered
    # items) -- confirm whether the register calls are missing.
    api.parse('/news?p=1')
def test_api_with_ajax():
    """Parse the root path of news.ycombinator.com with ``with_ajax=True``.

    A single ``Post`` item routed at ``/`` is registered, and the value
    returned by ``api.parse('/')`` is printed. The test makes no assertions.
    """
    from toapi import XPath, Item, Api

    class Post(Item):
        # Fields taken from the first "storylink" anchor under each source node.
        url = XPath('//a[@class="storylink"][1]/@href')
        title = XPath('//a[@class="storylink"][1]/text()')

        class Meta:
            # Each <tr class="athing"> row is a source node for this item.
            source = XPath('//tr[@class="athing"]')
            route = '/'

    api = Api('https://news.ycombinator.com/', with_ajax=True)
    api.register(Post)

    parsed = api.parse('/')
    print(parsed)
else: return ''.join( [i.text.strip().replace(u'\xa0', '') for i in title]) def clean_url(self, value): return value api.register(Post) if __name__ == '__main__': headers = { 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)" } print(api.parse('/250/?start=25', headers=headers)) api.serve() # Visit http://127.0.0.1:5000/250/ # http://127.0.0.1:5000/250/?start=25 # http://127.0.0.1:5000/250/?start=50 # ... """ { "post": [ { "title": "肖申克的救赎/The Shawshank Redemption", "url": "https://movie.douban.com/subject/1292052/" }, { "title": "霸王别姬", "url": "https://movie.douban.com/subject/1291546/"
def clean_title(self, title): if isinstance(title, unicode): return title.replace(u'\xa0', '') else: return ''.join( [i.text.strip().replace(u'\xa0', '') for i in title]) api.register(Post) if __name__ == '__main__': headers = { 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)" } pprint(api.parse('/', headers=headers)) api.serve() # Visit http://127.0.0.1:5000/ # http://127.0.0.1:5000/?start=25 # http://127.0.0.1:5000/?start=50 # ... """ { "post": [ { "title": "肖申克的救赎/The Shawshank Redemption", "url": "https://movie.douban.com/subject/1292052/" }, { "title": "霸王别姬", "url": "https://movie.douban.com/subject/1291546/"
for node in title[0].itertext(): text += node title = text.strip() return title class Meta: source = Css('li.b_algo') route = {'/:wd': '/search?q=:wd&ensearch=1'} class Baidu(Bing): __name__ = 'baidu' __base_url__ = 'http://www.baidu.com' url = Css('h3.t a', attr='href') title = Css('h3.t a') class Meta: source = Css('div.result') route = {'/:wd': '/s?wd=:wd&ie=utf-8&vf_bl=1'} api.register(Baidu) api.register(Bing) if __name__ == '__main__': print(api.parse('/python')) api.serve() # Visit http://127.0.0.1:5000/python
route = '/' def clean_title(self, title): if isinstance(title, unicode): return title.replace(u'\xa0', '') else: return ''.join([i.text.strip().replace(u'\xa0', '') for i in title]) api.register(Post) if __name__ == '__main__': headers = { 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)" } api.parse('/', headers=headers) api.serve() # Visit http://127.0.0.1:5000/ # http://127.0.0.1:5000/?start=25 # http://127.0.0.1:5000/?start=50 # ... """ { "post": [ { "title": "肖申克的救赎/The Shawshank Redemption", "url": "https://movie.douban.com/subject/1292052/" }, { "title": "霸王别姬",