class Page(Item):
    """Pagination fields selected from ``.pagination`` on allitebooks pages."""

    __base_url__ = "http://www.allitebooks.com"

    pages = Css('.pagination>.pages')
    current_page = Css('.pagination>.current')
    next_page = Css('.pagination>a', attr='href')

    def clean_next_page(self, value):
        """Return the pagination links as a list of local-host URLs.

        A list is treated as nodes exposing ``get('href')``; the remote site
        prefix of each href is replaced by the local ``/allitebooks/`` prefix.
        Any other value is appended to the local host root and returned as a
        one-element list.
        """
        if not isinstance(value, list):
            return ['http://127.0.0.1:5000/' + value]

        remote_prefix = 'http://www.allitebooks.com/'
        local_prefix = 'http://127.0.0.1:5000/allitebooks/'
        links = []
        for anchor in value:
            links.append(anchor.get('href').replace(remote_prefix, local_prefix))
        return links

    class Meta:
        source = None
        # Local route pattern -> path requested on the remote site.
        route = {
            '/allitebooks/': '/',
            '/allitebooks/?s=:keyword': '/?s=:keyword',
            '/allitebooks/:keyword/': '/:keyword/',
            '/allitebooks/page/:path': '/page/:path',
        }
class Post(Item):
    """Post fields: the ``div.hd>a`` href and the ``span.title`` text."""

    url = Css('div.hd>a', attr='href')
    title = Css('span.title')

    class Meta:
        # Selector used as the item source, and the single route served.
        source = Css('div.item', attr='target')
        route = '/'
class IndexArticle(Item):
    """Article section (``.fp-one-articulo``) of the wufazhuce.com index page."""

    __base_url__ = "http://wufazhuce.com"

    # Article selectors
    one_article_index = Css(".fp-one-articulo p.one-titulo")
    one_article_title = Css(".fp-one-articulo p.one-articulo-titulo a")
    one_article_url = Css(".fp-one-articulo p.one-articulo-titulo a", attr='href')
    one_article_list = Css(".fp-one-articulo ul li")

    def clean_one_article_index(self, one_article_index):
        """Strip surrounding whitespace from the index text."""
        return one_article_index.strip()

    def clean_one_article_title(self, one_article_title):
        """Strip surrounding whitespace from the title text."""
        return one_article_title.strip()

    def clean_one_article_url(self, one_article_url):
        """Rewrite the remote article URL into a local ``one/...`` URL."""
        local_path = one_article_url.replace("http://wufazhuce.com", "one")
        return 'http://127.0.0.1:5000/' + local_path

    def clean_one_article_list(self, one_article_list):
        """Turn each list node into a dict of index, title and local URL.

        Each node is queried with ``cssselect`` for its first ``span`` and
        first ``a`` element.
        """
        article_list = []
        for entry in one_article_list:
            index_text = entry.cssselect('span')[0].text.strip()
            title_text = entry.cssselect('a')[0].text.strip()
            href = entry.cssselect('a')[0].get('href')
            article_list.append({
                'one_index': index_text,
                'one_title': title_text,
                'one_article_url': 'http://127.0.0.1:5000/' + href.replace(
                    "http://wufazhuce.com", "one"),
            })
        return article_list

    class Meta:
        source = None
        route = {'/one/': '/'}
class IndexQuestion(Item):
    """Question section (``.fp-one-cuestion``) of the wufazhuce.com index page."""

    __base_url__ = "http://wufazhuce.com"

    # Question selectors
    one_question_index = Css(".fp-one-cuestion p.one-titulo")
    one_question_title = Css(".fp-one-cuestion p.one-cuestion-titulo a")
    one_question_url = Css(".fp-one-cuestion p.one-cuestion-titulo a", attr='href')
    one_question_list = Css(".fp-one-cuestion ul li")

    def clean_one_question_index(self, one_question_index):
        """Strip surrounding whitespace from the index text."""
        return one_question_index.strip()

    def clean_one_question_title(self, one_question_title):
        """Strip surrounding whitespace from the title text."""
        return one_question_title.strip()

    def clean_one_question_url(self, one_question_url):
        """Rewrite the remote question URL into a local ``one/...`` URL."""
        local_path = one_question_url.replace("http://wufazhuce.com", "one")
        return 'http://127.0.0.1:5000/' + local_path

    def clean_one_question_list(self, one_question_list):
        """Turn each list node into a dict of index, title and local URL.

        Each node is queried with ``cssselect`` for its first ``span`` and
        first ``a`` element.
        """
        question_list = []
        for entry in one_question_list:
            index_text = entry.cssselect('span')[0].text.strip()
            title_text = entry.cssselect('a')[0].text.strip()
            href = entry.cssselect('a')[0].get('href')
            question_list.append({
                'one_index': index_text,
                'one_title': title_text,
                'one_question_url': 'http://127.0.0.1:5000/' + href.replace(
                    "http://wufazhuce.com", "one"),
            })
        return question_list

    class Meta:
        source = None
        route = {'/one/': '/'}
class Question(Item):
    """Title, editor and content of a wufazhuce.com question page."""

    __base_url__ = "http://wufazhuce.com"

    title = Css("div.one-cuestion > h4")
    editor = Css("div.one-cuestion p.cuestion-editor")
    content = Css("div.one-cuestion div.cuestion-contenido")

    def clean_title(self, title):
        """Return the title as one stripped string.

        A list is treated as nodes whose stripped ``.text`` values are
        concatenated; any other value is stripped directly.
        """
        if not isinstance(title, list):
            return title.strip()
        return ''.join(node.text.strip() for node in title)

    def clean_content(self, abstract):
        """Return the content as a list of strings.

        For a list, each node contributes one string made of all its text
        fragments, each stripped and concatenated without a separator. Any
        other value is stripped and wrapped in a one-element list.
        """
        if not isinstance(abstract, list):
            return [abstract.strip()]
        return [
            ''.join(fragment.strip() for fragment in node.itertext())
            for node in abstract
        ]

    class Meta:
        source = None
        route = {'/one/question/:path': '/question/:path'}
class Baidu(Bing):
    """Search-result item for baidu.com; inherits the cleaners from ``Bing``."""

    __name__ = 'baidu'
    __base_url__ = 'http://www.baidu.com'

    url = Css('h3.t a', attr='href')
    title = Css('h3.t a')

    class Meta:
        source = Css('div.result')
        # ``:wd`` from the local path is substituted into the remote query.
        route = {
            '/:wd': '/s?wd=:wd&ie=utf-8&vf_bl=1',
        }
class Post(Item):
    """Post fields: the ``div.hd>a`` href and the ``span.title`` text."""

    url = Css('div.hd>a', attr='href')
    title = Css('span.title')

    class Meta:
        source = Css('div.item', attr='target')
        route = '/'

    def clean_title(self, title):
        """Return the title with non-breaking spaces (U+00A0) removed.

        A list is treated as nodes whose ``.text`` values are stripped,
        cleaned and concatenated; any other value is cleaned as a string.

        The branch is chosen by testing for ``list`` rather than for the
        Python 2 ``unicode`` type, which does not exist on Python 3 and
        raised ``NameError`` there.
        """
        if isinstance(title, list):
            return ''.join(
                node.text.strip().replace(u'\xa0', '') for node in title)
        return title.replace(u'\xa0', '')
class Recipe(Item):
    """Recipe list entry: link, name and cover image."""

    url = Css('div.recipe > a', attr='href')
    name = Css('div.recipe > div.info > p.name > a')
    cover = Css('div.recipe > a > div.cover > img', attr='data-src')

    def clean_name(self, name):
        """Split the name on single spaces and return the resulting list."""
        parts = name.split(' ')
        return parts

    class Meta:
        source = XPath(
            '//div[contains(@class, "main-panel")]//div[@class="normal-recipe-list"]/ul[@class="list"]/li'
        )
        # Local route pattern -> path requested on the remote site.
        route = {
            '/category/:cat/': '/category/:cat/',
            '/category/:cat/?page=:page': '/category/:cat/?page=:page',
            '/search/:keyword': '/search/?keyword=:keyword&cat=1001',
        }
class Book(Item):
    """Book links found in the article headers of an allitebooks listing."""

    __base_url__ = "http://www.allitebooks.com"

    book_list = Css('article>div.entry-body>header>.entry-title>a', attr='href')

    def clean_book_list(self, book_list):
        """Return the books as a list of dicts.

        For a list of nodes, each dict holds the position as a string
        (``id``), the node text (``name``) and the href with the remote
        prefix replaced by the local ``/allitebooks-info/`` prefix (``url``).
        Any other value yields a single dict with an empty name.
        """
        if not isinstance(book_list, list):
            # NOTE(review): this branch uses the key 'href' while the list
            # branch uses 'url' - confirm which one consumers expect.
            return [{
                'id': '0',
                'name': '',
                'href': 'http://127.0.0.1:5000/' + book_list,
            }]

        books = []
        for position, anchor in enumerate(book_list):
            books.append({
                'id': str(position),
                "name": anchor.text,
                "url": anchor.get('href').replace(
                    'http://www.allitebooks.com/',
                    'http://127.0.0.1:5000/allitebooks-info/'),
            })
        return books

    class Meta:
        source = None
        # Shares the routing table declared on Page.Meta.
        route = Page.Meta.route
class Category(Item):
    """Category tree built from ``div.cates-list`` nodes."""

    categories = Css('div.cates-list')

    def clean_categories(self, nodes):
        """Convert category nodes into nested dicts.

        Each node yields ``{'name': ..., 'list': [...]}``. The list has one
        entry per ``h4`` under the node's third ``div``; entry ``idx`` takes
        its ``types`` from the ``li/a`` links of the ``ul`` at the same index.
        """
        categories = []
        for node in nodes:
            topic_name = node.findtext('div/h3').strip()
            cates_list = node.find('div[3]')
            h4_nodes = cates_list.findall('h4')
            ul_nodes = cates_list.findall('ul')

            groups = []
            for idx, heading in enumerate(h4_nodes):
                group_name = heading.text.strip()
                links = [
                    {'name': a.text.strip(), 'link': a.get('href', '#')}
                    for a in ul_nodes[idx].findall('li/a')
                ]
                groups.append({'name': group_name, 'types': links})

            categories.append({'name': topic_name, 'list': groups})
        return categories

    class Meta:
        source = Css('div.category-container > div')
        route = {'/category/': '/category/'}
class IndexOne(Item):
    """Carousel items (``div#carousel-one div.item``) of the wufazhuce.com index."""

    __base_url__ = "http://wufazhuce.com"

    # One
    one_item_list = Css("div#carousel-one div.item")

    def clean_one_item_list(self, one_item):
        """Convert each carousel node into a dict.

        Keys: ``one_index``, ``one_type``, ``one_url`` (rewritten to the local
        host), ``one_abstract`` (text fragments of the link joined by single
        spaces) and ``date`` (the ``p.dom`` and ``p.may`` texts joined by a
        space).
        """
        item_list = []
        for item in one_item:
            one_index = item.cssselect(
                'div.fp-one-titulo-pubdate p.titulo')[0].text.strip()
            one_type = item.cssselect(
                'div.fp-one-imagen-footer')[0].text.strip()

            # The same anchor supplies both the URL and the abstract text.
            anchor = item.cssselect('div.fp-one-cita a')[0]
            one_url = 'http://127.0.0.1:5000/' + anchor.get('href').replace(
                "http://wufazhuce.com", "one")
            one_abstract = ' '.join(
                fragment.strip() for fragment in anchor.itertext()).strip()

            item_list.append({
                'one_index': one_index,
                'one_type': one_type,
                'one_url': one_url,
                'one_abstract': one_abstract,
                'date': item.cssselect('div.fp-one-titulo-pubdate p.dom')[0].text
                + " "
                + item.cssselect('div.fp-one-titulo-pubdate p.may')[0].text,
            })
        return item_list

    class Meta:
        source = None
        route = {'/one/': '/'}
class Meta:
    """
    URL: http://127.0.0.1:5000/250/
    Des: Douban Top 250 movies API
    Params:
        start: eg: http://127.0.0.1:5000/250/?start=25
    """
    source = Css('div.item', attr='target')
    # (local route pattern, remote path) pairs.
    route = (
        ('/250/?start=:start', '/?start=:start'),
        ('/250/', '/'),
    )
class Detail(Item):
    """Detail fields of a single allitebooks book page."""

    __base_url__ = "http://www.allitebooks.com"

    title = Css('.single-title')
    abstract = Css('.entry-header>h4')
    cover = Css('.entry-body-thumbnail>a>img', attr='src')
    description = Css('.entry-content')
    pdf_url = Css('span.download-links>a', attr='href')

    def clean_pdf_url(self, pdf_url):
        """Return the href of the first node for a list, else the value itself."""
        if not isinstance(pdf_url, list):
            return pdf_url
        first_link = pdf_url[0]
        return first_link.get('href')

    class Meta:
        source = None
        route = {
            '/allitebooks-info/:keyword': '/:keyword/',
        }
class ImageInfo(Item):
    """Image metadata read from ``<meta property=...>`` tags in the page head."""

    image_url = Css('head > meta[property="og:image"]', attr='content')
    description = Css('head > meta[property="og:description"]', attr='content')
    source_url = Css('head > meta[property="og:url"]', attr='content')
    user_id = Css('head > meta[property="instapp:owner_user_id"]', attr='content')
    user_info_url = Css('head > meta[property="instapp:owner_user_id"]', attr='content')

    def clean_user_info_url(self, user_info_url):
        """Format the user-info API URL from the first element of the value."""
        template = "https://i.instagram.com/api/v1/users/{}/info/"
        first = user_info_url[0]
        return template.format(first)

    class Meta:
        source = None
        route = '/p/.*?'
        web = {
            "with_ajax": False,
        }
class Page(Item):
    """Pager item exposing the href of the ``a.next`` link."""

    next = Css('a.next', attr='href')

    class Meta:
        source = XPath('//div[@class="pager"]')
        # Local route pattern -> path requested on the remote site.
        route = {
            '/category/:cat/': '/category/:cat/',
            '/category/:cat/?page=:page': '/category/:cat/?page=:page',
            '/search/:keyword': '/search/?keyword=:keyword&cat=1001',
        }
class Article(Item):
    """Title, author, abstract and content of a wufazhuce.com article page."""

    __base_url__ = "http://wufazhuce.com"

    title = Css("h2.articulo-titulo")
    author = Css("p.articulo-autor")
    abstract = Css("div.comilla-cerrar")
    content = Css("div.articulo-contenido")

    def clean_title(self, title):
        """Strip surrounding whitespace from the title."""
        stripped = title.strip()
        return stripped

    def clean_author(self, author):
        """Strip surrounding whitespace from the author."""
        stripped = author.strip()
        return stripped

    def clean_abstract(self, abstract):
        """Strip surrounding whitespace from the abstract."""
        stripped = abstract.strip()
        return stripped

    class Meta:
        source = None
        route = {
            '/one/article/:path': '/article/:path',
        }
def test_css():
    """Check ``Css.parse`` for a plain rule and for a rule with ``attr='html'``."""
    title_field = Css(rule="head title", attr=None)
    html_field = Css(rule="p.p1", attr='html')

    title_value = title_field.parse(html)
    html_value = html_field.parse(html)

    # The 'html' attribute variant is expected to parse to a list.
    assert isinstance(html_value, list)
    assert title_value == "toapi"
class One(Item):
    """Index, image, abstract, type and date of a wufazhuce.com "one" page."""

    __base_url__ = "http://wufazhuce.com"

    index = Css("div.tab-content div.one-titulo")
    image = Css("div.tab-content div.one-imagen img", attr='src')
    abstract = Css("div.tab-content div.one-cita")
    type = Css("div.tab-content div.one-imagen-leyenda")
    date = Css("div.tab-content div.one-pubdate p")

    def clean_index(self, index):
        """Strip surrounding whitespace from the index text."""
        return index.strip()

    def clean_abstract(self, abstract):
        """Strip surrounding whitespace from the abstract text."""
        return abstract.strip()

    def clean_date(self, date):
        """Return the publication date as a single string.

        A list is treated as nodes whose stripped ``.text`` values are joined
        with single spaces. A plain string is stripped and returned instead of
        being silently dropped. Any other value yields ``None``, as before.
        """
        if isinstance(date, list):
            return ' '.join(node.text.strip() for node in date)
        if isinstance(date, str):
            # Previously this fell through and lost the value.
            return date.strip()
        return None

    class Meta:
        source = None
        route = {'/one/one/:path': '/one/:path'}
class Bing(Item):
    """Search-result item for bing.com: result link and title."""

    __name__ = 'bing'
    __base_url__ = 'https://www.bing.com'

    url = Css('h2 a', attr='href')
    title = Css('h2 a')

    def clean_url(self, url):
        """Return the href of the first node for a non-empty list.

        Any other value is returned unchanged; falsy results become ``''``.
        """
        if isinstance(url, list) and url:
            url = url[0].get('href')
        return url or ''

    def clean_title(self, title):
        """Return the stripped text of the first node for a non-empty list.

        Any other value is returned unchanged; falsy results become ``''``.
        """
        if isinstance(title, list) and title:
            # Concatenate every text fragment below the first node.
            title = ''.join(title[0].itertext()).strip()
        return title or ''

    class Meta:
        source = Css('li.b_algo')
        route = {
            '/:wd': '/search?q=:wd&ensearch=1',
        }
class Post(Item):
    """Post fields: the ``div.hd>a`` href and the ``span.title`` text."""

    url = Css('div.hd>a', attr='href')
    title = Css('span.title')

    class Meta:
        """
        URL: http://127.0.0.1:5000/250/
        Des: Douban Top 250 movies API
        Params:
            start: eg: http://127.0.0.1:5000/250/?start=25
        """
        source = Css('div.item', attr='target')
        route = (('/250/?start=:start', '/?start=:start'), ('/250/', '/'))

    def clean_title(self, title):
        """Return the title with non-breaking spaces (U+00A0) removed.

        A list is treated as nodes whose ``.text`` values are stripped,
        cleaned and concatenated; any other value is cleaned as a string.

        The branch is chosen by testing for ``list`` rather than for the
        Python 2 ``unicode`` type, which does not exist on Python 3 and
        raised ``NameError`` there.
        """
        if isinstance(title, list):
            return ''.join(
                node.text.strip().replace(u'\xa0', '') for node in title)
        return title.replace(u'\xa0', '')

    def clean_url(self, value):
        """Return the URL unchanged."""
        return value
class Meta:
    # Selector used as the item source.
    source = Css('div.g')
    route = {'/:wd': '/search?hl=en&q=:wd&btnG=Search&gbv=1'}
    # Request settings: AJAX disabled, fixed User-Agent, HTTP/HTTPS proxies.
    web = {
        "with_ajax": False,
        "request_config": {
            'headers': {
                'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            },
            'proxies': {
                'http': '0.0.0.0:8118',
                'https': '0.0.0.0:8118',
            },
        },
        "headers": None,
    }
class Content(Item):
    """Detail fields of a recipe page: name, cover, grade, materials, steps, tip."""

    name = Css('h1.page-title[itemprop="name"]')
    cover = Css('div.recipe-show > div.cover > img', attr='src')
    grade = Css(
        'div.recipe-show > div.container > div.stats > div.score > span.number'
    )
    materials = Css('div.recipe-show > div.ings > table tr')
    steps = Css('div.steps > ol li', attr='html')
    tip = Css('div.tip')

    def clean_name(self, name):
        """Strip surrounding whitespace from the recipe name."""
        return name.strip()

    def clean_materials(self, nodes):
        """Convert table-row nodes into ``{'name': ..., 'unit': ...}`` dicts.

        The name is the text of ``td.name``, falling back to the text of the
        ``a`` inside it when that text is blank. ``findtext`` returns ``None``
        when nothing matches, so missing cells are treated as empty strings
        instead of raising ``AttributeError`` on ``.strip()``.
        """
        materials = []
        for node in nodes:
            name = ((node.findtext('td[@class="name"]') or '').strip()
                    or (node.findtext('td[@class="name"]/a') or '').strip())
            unit = (node.findtext('td[@class="unit"]') or '').strip()
            materials.append({'name': name, 'unit': unit})
        return materials

    def clean_steps(self, nodes):
        """Convert step nodes into ``{'step', 'desc', 'img'}`` dicts.

        ``step`` is the 1-based position. ``desc`` is the serialized ``<p>``
        child with the ``<p>`` tags removed and ``<br>`` tags turned into
        newlines. ``img`` is the ``src`` of the ``<img>`` child, or ``''``
        when there is none.
        """
        # Raw strings keep ``\s`` a regex escape rather than an invalid
        # string escape; the compiled patterns are unchanged.
        # HTML tag <p/>
        re_p = re.compile(r'</?p[^>]*>')
        # HTML tag <br/>
        re_br = re.compile(r'<br\s*?/?>')

        steps = []
        for idx, node in enumerate(nodes):
            paragraph = etree.tounicode(node.find('p')).strip()
            img = node.find('img')
            steps.append({
                'step': idx + 1,
                'desc': re_br.sub('\n', re_p.sub('', paragraph)).strip(),
                'img': img.get('src') if img is not None else '',
            })
        return steps

    def clean_tip(self, tip):
        """Strip surrounding whitespace from the tip text."""
        return tip.strip()

    class Meta:
        source = XPath('//div[contains(@class,"main-panel")]/div[1]')
        route = {'/recipe/:no/': '/recipe/:no/'}
class Meta:
    # Selector used as the item source, and the single category route.
    source = Css('div.category-container > div')
    route = {'/category/': '/category/'}
class Meta:
    # Selector used as the item source.
    source = Css('li.b_algo')
    # ``:wd`` from the local path is substituted into the remote query.
    route = {
        '/:wd': '/search?q=:wd&ensearch=1',
    }
class Meta:
    # Selector used as the item source.
    source = Css('div.item', attr='target')
    # (local route pattern, remote path) pairs.
    route = (
        ('/250/?start=:start', '/?start=:start'),
        ('/250/', '/'),
    )
class Meta:
    # Selector used as the item source.
    source = Css('div.item', attr='target')
    # Local route pattern -> remote path.
    route = {
        '/250/?start=:start': '/?start=:start',
        '/250/': '/',
    }
class Google(Item):
    """Search-result item for google.com: result link and title."""

    __name__ = 'google'
    __base_url__ = 'https://www.google.com'

    url = Css('h3.r > a', attr='href')
    title = Css('h3.r > a')

    def clean_url(self, url):
        """Return the filtered result link, or ``''`` when there is none.

        For a non-empty list the href of the first node is used. The value is
        passed through :meth:`filter_link`, whose ``None`` result is
        normalized to ``''`` so this method always returns a string.
        """
        if isinstance(url, list) and len(url):
            url = url[0].get('href')
        if not url:
            return ''
        return self.filter_link(link=url) or ''

    def clean_title(self, title):
        """Return the stripped text of the first node for a non-empty list.

        Any other value is returned unchanged; falsy results become ``''``.
        """
        if isinstance(title, list) and len(title):
            # Concatenate every text fragment below the first node.
            title = ''.join(title[0].itertext()).strip()
        return title if title else ''

    @classmethod
    def filter_link(cls, link):
        """Return *link* if it resolves to an absolute URL.

        Taken from https://github.com/MarioVilas/google

        :param link: raw href taken from a result anchor
        :return: the link itself when it has a network location; the decoded
            ``q`` parameter when the link is a ``/url?`` redirect whose target
            has a network location; ``None`` when neither applies; ``''`` if
            any exception is raised while parsing.
        """
        try:
            # Absolute URLs (non-empty netloc) are returned as they are.
            o = urlparse(link, 'http')
            if o.netloc:
                return link
            # Decode hidden URLs.
            if link.startswith('/url?'):
                link = parse_qs(o.query)['q'][0]
                # The decoded target must itself be an absolute URL.
                o = urlparse(link, 'http')
                if o.netloc:
                    return link
        except Exception:
            # Best effort: a link that cannot be parsed counts as no result.
            return ''
        return None

    class Meta:
        source = Css('div.g')
        route = {
            '/:wd': '/search?hl=en&q=:wd&btnG=Search&gbv=1',
        }
        web = {
            "with_ajax": False,
            "request_config": {
                'headers': {
                    'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)"
                },
                'proxies': {
                    'http': '0.0.0.0:8118',
                    'https': '0.0.0.0:8118'
                }
            },
            "headers": None
        }
class Meta:
    # Selector used as the item source, and the single route served.
    source = Css(
        'div.item',
        attr='target',
    )
    route = '/'
def test_css_attr():
    """Check that ``Css`` with ``attr='href'`` parses to the link target."""
    link_field = Css(rule="p a.test_link", attr='href')
    parsed = link_field.parse(html)
    assert parsed == "https://github.com/gaojiuli/toapi"
class Content(Item):
    """Detail fields of a recipe page: name, cover, stats, materials, steps, tip."""

    name = Css('h1.page-title[itemprop="name"]')
    cover = Css('div.recipe-show > div.cover > img', attr='src')
    grade = Css('div.recipe-show > div.container > div.stats > div.score > span.number')
    cooked = Css("div.recipe-show > div.container > div.stats > div.cooked > span.number")
    materials = Css('div.recipe-show > div.ings > table tr')
    steps = Css('div.steps > ol li', attr='html')
    tip = Css('div.tip')

    def clean_name(self, name):
        """Strip surrounding whitespace from the recipe name."""
        assert name is not None
        return name.strip()

    def clean_materials(self, nodes):
        """Convert table-row nodes into ``{'name': ..., 'unit': ...}`` dicts.

        The name is the text of ``td.name``, falling back to the text of the
        ``a`` inside it when that text is blank. Rows that have neither name
        text nor a unit cell are skipped.
        """
        assert nodes is not None
        materials = []
        for node in nodes:
            name1 = node.findtext('td[@class="name"]')
            name2 = node.findtext('td[@class="name"]/a')
            unit = node.findtext('td[@class="unit"]')
            if (name1 is None and name2 is None) or unit is None:
                continue
            name = (name1 or "").strip() or (name2 or "").strip()
            # The key is the literal "unit" and the value is the cell text;
            # the previous code had the two swapped.
            materials.append({"name": name, "unit": unit.strip()})
        return materials

    def clean_steps(self, nodes):
        """Convert step nodes into ``{'step', 'desc', 'img'}`` dicts.

        ``step`` is the 1-based position. ``desc`` is the serialized ``<p>``
        child with the ``<p>`` tags removed and ``<br>`` tags turned into
        newlines. ``img`` is the ``src`` of the ``<img>`` child, or ``''``
        when there is none.
        """
        # Raw strings keep ``\s`` a regex escape rather than an invalid
        # string escape; the compiled patterns are unchanged.
        # HTML tag <p/>
        re_p = re.compile(r'</?p[^>]*>')
        # HTML tag <br/>
        re_br = re.compile(r'<br\s*?/?>')

        steps = []
        for idx, node in enumerate(nodes):
            paragraph = etree.tounicode(node.find('p')).strip()
            img = node.find('img')
            steps.append({
                'step': idx + 1,
                'desc': re_br.sub('\n', re_p.sub('', paragraph)).strip(),
                'img': img.get('src') if img is not None else '',
            })
        return steps

    def clean_tip(self, tip):
        """Return the stripped tip text, or ``''`` when the value is not a str."""
        if not isinstance(tip, str):
            tip = ""
        return tip.strip()

    class Meta:
        source = XPath('//div[contains(@class,"main-panel")]/div[1]')
        route = {
            '/recipe/:no/': '/recipe/:no/'
        }