class RankingItem(Item):
    """Declarative item describing one ranking panel matched by ``.rank-list``.

    Each class attribute is a field with its own CSS selector; the
    ``clean_<field>`` coroutines post-process the value extracted for the
    field of the same name.
    NOTE(review): how ``target_item`` is used to split the page into items is
    defined by ``Item`` and is not visible here - confirm against the framework.
    """

    target_item = TextField(css_select='.rank-list')
    # Reads the ``html`` attribute of the ``h3.wrap-title`` match.
    ranking_title = AttrField(css_select='h3.wrap-title', attr='html')
    # ``href`` of the "more" link inside the panel heading.
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    async def clean_ranking_title(self, ranking_title):
        """Normalise the extracted ranking title.

        :param ranking_title: value produced by the ``ranking_title`` field;
            either a list of element-like objects exposing ``.text`` or an
            already-scalar value.
        :return: the ``.text`` of the first element when a list is given,
            otherwise the value unchanged.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        # Pass non-list values through instead of implicitly returning None,
        # which would silently discard a title that was already extracted.
        return ranking_title

    async def clean_more(self, more):
        """Prefix the extracted ``href`` with the ``http:`` scheme.

        :param more: the ``href`` string extracted by the ``more`` field
            (presumably protocol-relative, e.g. ``//host/path`` - confirm).
        :return: ``"http:"`` concatenated with ``more``.
        """
        return "http:" + more
class HackerNewsItem(Item):
    """Declarative item for one ``tr.athing`` row.

    Captures the text and the ``href`` of the ``a.storylink`` anchor.
    """

    target_item = TextField(css_select='tr.athing')
    # Text content of the story link.
    title = TextField(css_select='a.storylink')
    # Destination of the same story link.
    url = AttrField(css_select='a.storylink', attr='href')

    async def clean_title(self, value):
        """Return the extracted title unchanged (identity clean hook).

        :param value: value extracted by the ``title`` field.
        :return: ``value`` as received.
        """
        return value
class DoubanTargetItem(Item):
    """Declarative item for one ``div.item`` entry.

    Fields: title text, cover image ``src`` and the ``span.inq`` abstract.
    """

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    # ``src`` attribute of the image nested under ``div.pic > a``.
    cover = AttrField(css_select='div.pic>a>img', attr='src')
    abstract = TextField(css_select='span.inq')

    async def clean_title(self, title):
        """Collapse the extracted title into a single string.

        :param title: either a ``str`` or an iterable of element-like objects
            exposing ``.text``.
        :return: ``title`` itself when it is a ``str``; otherwise the
            concatenation of every element's stripped text with ``\\xa0``
            characters removed.
        """
        if not isinstance(title, str):
            # Several matches: clean each fragment, then glue them together.
            fragments = (
                node.text.strip().replace('\xa0', '') for node in title
            )
            return ''.join(fragments)
        return title
class JianshuItem(Item):
    """Declarative item for one ``ul.list > li`` entry.

    Both author fields read from the same ``a.name`` anchor: its text and its
    ``href`` attribute.
    """

    target_item = TextField(css_select='ul.list>li')
    # Anchor text.
    author_name = TextField(css_select='a.name')
    # Anchor destination.
    author_url = AttrField(css_select='a.name', attr='href')
def test_attr_field():
    """Check that ``AttrField`` pulls the ``href`` of ``p a.test_link``.

    Uses the module-level ``html`` fixture as the document to extract from.
    """
    link_field = AttrField(css_select="p a.test_link", attr='href')
    expected_url = "https://github.com/howie6879/aspider"
    assert link_field.extract_value(html) == expected_url
class RankingItem(Item):
    """Declarative item for one ``div.rank_i_p_list`` ranking panel.

    No clean hooks are defined; every field is used as extracted.
    """

    target_item = TextField(css_select='div.rank_i_p_list')
    # Panel heading text.
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # ``href`` of the link inside ``div.rank_i_more``.
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # Entries directly under the panel container.
    book_list = TextField(css_select='div.rank_i_p_list>div.rank_i_li')